Include full contents of all nested repositories

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-27 16:25:02 +01:00
parent 14ff8fd54c
commit 2401ed446f
7271 changed files with 1310112 additions and 6 deletions

View File

@@ -0,0 +1,55 @@
# Changelog
## 2026.2.26
### Changes
- Version alignment with core OpenClaw release numbers.
## 2026.2.25
### Changes
- Version alignment with core OpenClaw release numbers.
## 2026.2.24
### Changes
- Version alignment with core OpenClaw release numbers.
## 2026.2.22
### Changes
- Version alignment with core OpenClaw release numbers.
## 2026.1.26
### Changes
- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deepmerges with core).
- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
- Ngrok free-tier bypass renamed to `tunnel.allowNgrokFreeTierLoopbackBypass` and gated to loopback + `tunnel.provider="ngrok"`.
## 0.1.0
### Highlights
- First public release of the @openclaw/voice-call plugin.
### Features
- Providers: Twilio (Programmable Voice + Media Streams), Telnyx (Call Control v2), and mock provider for local dev.
- Call flows: outbound notify vs. conversation modes, configurable autohangup, and multiturn continuation.
- Inbound handling: policy controls (disabled/allowlist/open), allowlist matching, and inbound greeting.
- Webhooks: builtin server with configurable bind/port/path plus `publicUrl` override.
- Exposure helpers: ngrok + Tailscale serve/funnel; devonly signature bypass for ngrok free tier.
- Streaming: OpenAI Realtime STT over media WebSocket with partial + final transcripts.
- Speech: OpenAI TTS (model/voice/instructions) with Twilio `<Say>` fallback.
- Tooling: `voice_call` tool actions for initiate/continue/speak/end/status.
- Gateway RPC: `voicecall.initiate|continue|speak|end|status` (+ legacy `voicecall.start`).
- CLI: `openclaw voicecall` commands (call/start/continue/speak/end/status/tail/expose).
- Observability: JSONL call logs and `voicecall tail` for live inspection.
- Response controls: `responseModel`, `responseSystemPrompt`, and `responseTimeoutMs` for autoresponses.

View File

@@ -0,0 +1,181 @@
# @openclaw/voice-call
Official Voice Call plugin for **OpenClaw**.
Providers:
- **Twilio** (Programmable Voice + Media Streams)
- **Telnyx** (Call Control v2)
- **Plivo** (Voice API + XML transfer + GetInput speech)
- **Mock** (dev/no network)
Docs: `https://docs.openclaw.ai/plugins/voice-call`
Plugin system: `https://docs.openclaw.ai/plugin`
## Install (local dev)
### Option A: install via OpenClaw (recommended)
```bash
openclaw plugins install @openclaw/voice-call
```
Restart the Gateway afterwards.
### Option B: copy into your global extensions folder (dev)
```bash
mkdir -p ~/.openclaw/extensions
cp -R extensions/voice-call ~/.openclaw/extensions/voice-call
cd ~/.openclaw/extensions/voice-call && pnpm install
```
## Config
Put under `plugins.entries.voice-call.config`:
```json5
{
provider: "twilio", // or "telnyx" | "plivo" | "mock"
fromNumber: "+15550001234",
toNumber: "+15550005678",
twilio: {
accountSid: "ACxxxxxxxx",
authToken: "your_token",
},
telnyx: {
apiKey: "KEYxxxx",
connectionId: "CONNxxxx",
// Telnyx webhook public key from the Telnyx Mission Control Portal
// (Base64 string; can also be set via TELNYX_PUBLIC_KEY).
publicKey: "...",
},
plivo: {
authId: "MAxxxxxxxxxxxxxxxxxxxx",
authToken: "your_token",
},
// Webhook server
serve: {
port: 3334,
path: "/voice/webhook",
},
// Public exposure (pick one):
// publicUrl: "https://example.ngrok.app/voice/webhook",
// tunnel: { provider: "ngrok" },
// tailscale: { mode: "funnel", path: "/voice/webhook" }
outbound: {
defaultMode: "notify", // or "conversation"
},
streaming: {
enabled: true,
streamPath: "/voice/stream",
preStartTimeoutMs: 5000,
maxPendingConnections: 32,
maxPendingConnectionsPerIp: 4,
maxConnections: 128,
},
}
```
Notes:
- Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
- `mock` is a local dev provider (no network calls).
- Telnyx requires `telnyx.publicKey` (or `TELNYX_PUBLIC_KEY`) unless `skipSignatureVerification` is true.
- `tunnel.allowNgrokFreeTierLoopbackBypass: true` allows Twilio webhooks with invalid signatures **only** when `tunnel.provider="ngrok"` and `serve.bind` is loopback (ngrok local agent). Use for local dev only.
Streaming security defaults:
- `streaming.preStartTimeoutMs` closes sockets that never send a valid `start` frame.
- `streaming.maxPendingConnections` caps total unauthenticated pre-start sockets.
- `streaming.maxPendingConnectionsPerIp` caps unauthenticated pre-start sockets per source IP.
- `streaming.maxConnections` caps total open media stream sockets (pending + active).
## Stale call reaper
Use `staleCallReaperSeconds` to end calls that never receive a terminal webhook
(for example, notify-mode calls that never complete). The default is `0`
(disabled).
Recommended ranges:
- **Production:** `120`–`300` seconds for notify-style flows.
- Keep this value **higher than `maxDurationSeconds`** so normal calls can
finish. A good starting point is `maxDurationSeconds` + 30–60 seconds.
Example:
```json5
{
staleCallReaperSeconds: 360,
}
```
## TTS for calls
Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
streaming speech on calls. You can override it under the plugin config with the
same shape — overrides deep-merge with `messages.tts`.
```json5
{
tts: {
provider: "openai",
openai: {
voice: "alloy",
},
},
}
```
Notes:
- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to provider native voices.
## CLI
```bash
openclaw voicecall call --to "+15555550123" --message "Hello from OpenClaw"
openclaw voicecall continue --call-id <id> --message "Any questions?"
openclaw voicecall speak --call-id <id> --message "One moment"
openclaw voicecall end --call-id <id>
openclaw voicecall status --call-id <id>
openclaw voicecall tail
openclaw voicecall expose --mode funnel
```
## Tool
Tool name: `voice_call`
Actions:
- `initiate_call` (message, to?, mode?)
- `continue_call` (callId, message)
- `speak_to_user` (callId, message)
- `end_call` (callId)
- `get_status` (callId)
## Gateway RPC
- `voicecall.initiate` (to?, message, mode?)
- `voicecall.continue` (callId, message)
- `voicecall.speak` (callId, message)
- `voicecall.end` (callId)
- `voicecall.status` (callId)
## Notes
- Uses webhook signature verification for Twilio/Telnyx/Plivo.
- Adds replay protection for Twilio and Plivo webhooks (valid duplicate callbacks are ignored safely).
- Twilio speech turns include a per-turn token so stale/replayed callbacks cannot complete a newer turn.
- `responseModel` / `responseSystemPrompt` control AI auto-responses.
- Media streaming requires `ws` and OpenAI Realtime API key.

View File

@@ -0,0 +1,512 @@
import { Type } from "@sinclair/typebox";
import type { GatewayRequestHandlerOptions, OpenClawPluginApi } from "openclaw/plugin-sdk";
import { registerVoiceCallCli } from "./src/cli.js";
import {
VoiceCallConfigSchema,
resolveVoiceCallConfig,
validateProviderConfig,
type VoiceCallConfig,
} from "./src/config.js";
import type { CoreConfig } from "./src/core-bridge.js";
import { createVoiceCallRuntime, type VoiceCallRuntime } from "./src/runtime.js";
/**
 * Config schema adapter for the voice-call plugin.
 *
 * `parse` normalizes raw plugin config before validating it with the zod
 * `VoiceCallConfigSchema`:
 * - non-object input falls back to `{}`
 * - legacy `twilio.from` is mapped to `fromNumber`
 * - legacy provider `"log"` is mapped to `"mock"`
 * - `enabled` defaults to `true`; when no provider is set and the plugin is
 *   enabled, `"mock"` is assumed so local dev works without credentials
 *
 * `uiHints` drives config-UI rendering: labels, placeholders, and
 * sensitive/advanced flags keyed by dotted config path.
 */
const voiceCallConfigSchema = {
  parse(value: unknown): VoiceCallConfig {
    const raw =
      value && typeof value === "object" && !Array.isArray(value)
        ? (value as Record<string, unknown>)
        : {};
    const twilio = raw.twilio as Record<string, unknown> | undefined;
    const legacyFrom = typeof twilio?.from === "string" ? twilio.from : undefined;
    const enabled = typeof raw.enabled === "boolean" ? raw.enabled : true;
    // "log" was the historical name of the mock provider.
    const providerRaw = raw.provider === "log" ? "mock" : raw.provider;
    const provider = providerRaw ?? (enabled ? "mock" : undefined);
    return VoiceCallConfigSchema.parse({
      ...raw,
      enabled,
      provider,
      fromNumber: raw.fromNumber ?? legacyFrom,
    });
  },
  uiHints: {
    provider: {
      label: "Provider",
      // Fixed: previously omitted "plivo" even though the provider enum and
      // plugin description include it.
      help: "Use twilio, telnyx, plivo, or mock for dev/no-network.",
    },
    fromNumber: { label: "From Number", placeholder: "+15550001234" },
    toNumber: { label: "Default To Number", placeholder: "+15550001234" },
    inboundPolicy: { label: "Inbound Policy" },
    allowFrom: { label: "Inbound Allowlist" },
    inboundGreeting: { label: "Inbound Greeting", advanced: true },
    "telnyx.apiKey": { label: "Telnyx API Key", sensitive: true },
    "telnyx.connectionId": { label: "Telnyx Connection ID" },
    "telnyx.publicKey": { label: "Telnyx Public Key", sensitive: true },
    "twilio.accountSid": { label: "Twilio Account SID" },
    "twilio.authToken": { label: "Twilio Auth Token", sensitive: true },
    // Added: Plivo hints were missing while every other provider had them.
    "plivo.authId": { label: "Plivo Auth ID" },
    "plivo.authToken": { label: "Plivo Auth Token", sensitive: true },
    "outbound.defaultMode": { label: "Default Call Mode" },
    "outbound.notifyHangupDelaySec": {
      label: "Notify Hangup Delay (sec)",
      advanced: true,
    },
    "serve.port": { label: "Webhook Port" },
    "serve.bind": { label: "Webhook Bind" },
    "serve.path": { label: "Webhook Path" },
    "tailscale.mode": { label: "Tailscale Mode", advanced: true },
    "tailscale.path": { label: "Tailscale Path", advanced: true },
    "tunnel.provider": { label: "Tunnel Provider", advanced: true },
    "tunnel.ngrokAuthToken": {
      label: "ngrok Auth Token",
      sensitive: true,
      advanced: true,
    },
    "tunnel.ngrokDomain": { label: "ngrok Domain", advanced: true },
    "tunnel.allowNgrokFreeTierLoopbackBypass": {
      label: "Allow ngrok Free Tier (Loopback Bypass)",
      advanced: true,
    },
    "streaming.enabled": { label: "Enable Streaming", advanced: true },
    "streaming.openaiApiKey": {
      label: "OpenAI Realtime API Key",
      sensitive: true,
      advanced: true,
    },
    "streaming.sttModel": { label: "Realtime STT Model", advanced: true },
    "streaming.streamPath": { label: "Media Stream Path", advanced: true },
    "tts.provider": {
      label: "TTS Provider Override",
      help: "Deep-merges with messages.tts (Edge is ignored for calls).",
      advanced: true,
    },
    "tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
    "tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
    "tts.openai.apiKey": {
      label: "OpenAI API Key",
      sensitive: true,
      advanced: true,
    },
    "tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
    "tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
    "tts.elevenlabs.apiKey": {
      label: "ElevenLabs API Key",
      sensitive: true,
      advanced: true,
    },
    "tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
    publicUrl: { label: "Public Webhook URL", advanced: true },
    skipSignatureVerification: {
      label: "Skip Signature Verification",
      advanced: true,
    },
    store: { label: "Call Log Store Path", advanced: true },
    responseModel: { label: "Response Model", advanced: true },
    responseSystemPrompt: { label: "Response System Prompt", advanced: true },
    responseTimeoutMs: { label: "Response Timeout (ms)", advanced: true },
  },
};
// Parameter schema for the `voice_call` tool.
// The union covers the five explicit actions plus a final, action-less legacy
// shape (`mode`/`to`/`sid`/`message`) kept for backward compatibility with
// older callers of the tool.
const VoiceCallToolSchema = Type.Union([
  Type.Object({
    action: Type.Literal("initiate_call"),
    to: Type.Optional(Type.String({ description: "Call target" })),
    message: Type.String({ description: "Intro message" }),
    mode: Type.Optional(Type.Union([Type.Literal("notify"), Type.Literal("conversation")])),
  }),
  Type.Object({
    action: Type.Literal("continue_call"),
    callId: Type.String({ description: "Call ID" }),
    message: Type.String({ description: "Follow-up message" }),
  }),
  Type.Object({
    action: Type.Literal("speak_to_user"),
    callId: Type.String({ description: "Call ID" }),
    message: Type.String({ description: "Message to speak" }),
  }),
  Type.Object({
    action: Type.Literal("end_call"),
    callId: Type.String({ description: "Call ID" }),
  }),
  Type.Object({
    action: Type.Literal("get_status"),
    callId: Type.String({ description: "Call ID" }),
  }),
  // Legacy action-less shape: `mode` defaults to "call"; "status" requires `sid`.
  Type.Object({
    mode: Type.Optional(Type.Union([Type.Literal("call"), Type.Literal("status")])),
    to: Type.Optional(Type.String({ description: "Call target" })),
    sid: Type.Optional(Type.String({ description: "Call SID" })),
    message: Type.Optional(Type.String({ description: "Optional intro message" })),
  }),
]);
/**
 * OpenClaw plugin definition for the voice-call extension.
 *
 * `register` wires up:
 * - Gateway RPC methods: voicecall.initiate|continue|speak|end|status
 *   (plus legacy voicecall.start)
 * - the `voice_call` agent tool
 * - the `voicecall` CLI command group
 * - a background service that owns the runtime lifecycle
 *
 * The runtime (webhook server, provider client, call manager) is created
 * lazily on first use and torn down when the service stops.
 */
const voiceCallPlugin = {
  id: "voice-call",
  name: "Voice Call",
  description: "Voice-call plugin with Telnyx/Twilio/Plivo providers",
  configSchema: voiceCallConfigSchema,
  register(api: OpenClawPluginApi) {
    const config = resolveVoiceCallConfig(voiceCallConfigSchema.parse(api.pluginConfig));
    const validation = validateProviderConfig(config);
    // Advisory deprecation warnings for legacy config shapes; parse() has
    // already normalized them, so nothing else depends on these checks.
    if (api.pluginConfig && typeof api.pluginConfig === "object") {
      const raw = api.pluginConfig as Record<string, unknown>;
      const twilio = raw.twilio as Record<string, unknown> | undefined;
      if (raw.provider === "log") {
        api.logger.warn('[voice-call] provider "log" is deprecated; use "mock" instead');
      }
      if (typeof twilio?.from === "string") {
        api.logger.warn("[voice-call] twilio.from is deprecated; use fromNumber instead");
      }
    }
    let runtimePromise: Promise<VoiceCallRuntime> | null = null;
    let runtime: VoiceCallRuntime | null = null;
    /**
     * Lazily create and memoize the runtime; concurrent callers share one
     * in-flight promise. Fix: the previous version cached a rejected promise
     * forever, so a transient init failure (e.g. a busy webhook port) could
     * never be retried — the cached rejection is now cleared on failure.
     */
    const ensureRuntime = async () => {
      if (!config.enabled) {
        throw new Error("Voice call disabled in plugin config");
      }
      if (!validation.valid) {
        throw new Error(validation.errors.join("; "));
      }
      if (runtime) {
        return runtime;
      }
      if (!runtimePromise) {
        runtimePromise = createVoiceCallRuntime({
          config,
          coreConfig: api.config as CoreConfig,
          ttsRuntime: api.runtime.tts,
          logger: api.logger,
        });
      }
      const pending = runtimePromise;
      try {
        runtime = await pending;
      } catch (err) {
        // Only clear if no newer attempt has replaced the failed promise.
        if (runtimePromise === pending) {
          runtimePromise = null;
        }
        throw err;
      }
      return runtime;
    };
    // Uniform error envelope for gateway responses.
    const sendError = (respond: (ok: boolean, payload?: unknown) => void, err: unknown) => {
      respond(false, { error: err instanceof Error ? err.message : String(err) });
    };
    // voicecall.initiate: start an outbound call; `to` falls back to config.toNumber.
    api.registerGatewayMethod(
      "voicecall.initiate",
      async ({ params, respond }: GatewayRequestHandlerOptions) => {
        try {
          const message = typeof params?.message === "string" ? params.message.trim() : "";
          if (!message) {
            respond(false, { error: "message required" });
            return;
          }
          const rt = await ensureRuntime();
          const to =
            typeof params?.to === "string" && params.to.trim()
              ? params.to.trim()
              : rt.config.toNumber;
          if (!to) {
            respond(false, { error: "to required" });
            return;
          }
          const mode =
            params?.mode === "notify" || params?.mode === "conversation" ? params.mode : undefined;
          const result = await rt.manager.initiateCall(to, undefined, {
            message,
            mode,
          });
          if (!result.success) {
            respond(false, { error: result.error || "initiate failed" });
            return;
          }
          respond(true, { callId: result.callId, initiated: true });
        } catch (err) {
          sendError(respond, err);
        }
      },
    );
    // voicecall.continue: send a follow-up turn on an active conversation call.
    api.registerGatewayMethod(
      "voicecall.continue",
      async ({ params, respond }: GatewayRequestHandlerOptions) => {
        try {
          const callId = typeof params?.callId === "string" ? params.callId.trim() : "";
          const message = typeof params?.message === "string" ? params.message.trim() : "";
          if (!callId || !message) {
            respond(false, { error: "callId and message required" });
            return;
          }
          const rt = await ensureRuntime();
          const result = await rt.manager.continueCall(callId, message);
          if (!result.success) {
            respond(false, { error: result.error || "continue failed" });
            return;
          }
          respond(true, { success: true, transcript: result.transcript });
        } catch (err) {
          sendError(respond, err);
        }
      },
    );
    // voicecall.speak: speak a message on an active call without waiting for a reply.
    api.registerGatewayMethod(
      "voicecall.speak",
      async ({ params, respond }: GatewayRequestHandlerOptions) => {
        try {
          const callId = typeof params?.callId === "string" ? params.callId.trim() : "";
          const message = typeof params?.message === "string" ? params.message.trim() : "";
          if (!callId || !message) {
            respond(false, { error: "callId and message required" });
            return;
          }
          const rt = await ensureRuntime();
          const result = await rt.manager.speak(callId, message);
          if (!result.success) {
            respond(false, { error: result.error || "speak failed" });
            return;
          }
          respond(true, { success: true });
        } catch (err) {
          sendError(respond, err);
        }
      },
    );
    // voicecall.end: hang up an active call.
    api.registerGatewayMethod(
      "voicecall.end",
      async ({ params, respond }: GatewayRequestHandlerOptions) => {
        try {
          const callId = typeof params?.callId === "string" ? params.callId.trim() : "";
          if (!callId) {
            respond(false, { error: "callId required" });
            return;
          }
          const rt = await ensureRuntime();
          const result = await rt.manager.endCall(callId);
          if (!result.success) {
            respond(false, { error: result.error || "end failed" });
            return;
          }
          respond(true, { success: true });
        } catch (err) {
          sendError(respond, err);
        }
      },
    );
    // voicecall.status: look up a call by internal id or provider call id
    // (accepts legacy `sid` as an alias for `callId`).
    api.registerGatewayMethod(
      "voicecall.status",
      async ({ params, respond }: GatewayRequestHandlerOptions) => {
        try {
          const raw =
            typeof params?.callId === "string"
              ? params.callId.trim()
              : typeof params?.sid === "string"
                ? params.sid.trim()
                : "";
          if (!raw) {
            respond(false, { error: "callId required" });
            return;
          }
          const rt = await ensureRuntime();
          const call = rt.manager.getCall(raw) || rt.manager.getCallByProviderCallId(raw);
          if (!call) {
            respond(true, { found: false });
            return;
          }
          respond(true, { found: true, call });
        } catch (err) {
          sendError(respond, err);
        }
      },
    );
    // voicecall.start: legacy alias for initiate; `message` is optional here.
    api.registerGatewayMethod(
      "voicecall.start",
      async ({ params, respond }: GatewayRequestHandlerOptions) => {
        try {
          const to = typeof params?.to === "string" ? params.to.trim() : "";
          const message = typeof params?.message === "string" ? params.message.trim() : "";
          if (!to) {
            respond(false, { error: "to required" });
            return;
          }
          const rt = await ensureRuntime();
          const result = await rt.manager.initiateCall(to, undefined, {
            message: message || undefined,
          });
          if (!result.success) {
            respond(false, { error: result.error || "initiate failed" });
            return;
          }
          respond(true, { callId: result.callId, initiated: true });
        } catch (err) {
          sendError(respond, err);
        }
      },
    );
    api.registerTool({
      name: "voice_call",
      label: "Voice Call",
      description: "Make phone calls and have voice conversations via the voice-call plugin.",
      parameters: VoiceCallToolSchema,
      async execute(_toolCallId, params) {
        // Tool results carry both a human-readable JSON text part and the raw
        // payload in `details`.
        const json = (payload: unknown) => ({
          content: [{ type: "text" as const, text: JSON.stringify(payload, null, 2) }],
          details: payload,
        });
        try {
          const rt = await ensureRuntime();
          if (typeof params?.action === "string") {
            switch (params.action) {
              case "initiate_call": {
                const message = String(params.message || "").trim();
                if (!message) {
                  throw new Error("message required");
                }
                const to =
                  typeof params.to === "string" && params.to.trim()
                    ? params.to.trim()
                    : rt.config.toNumber;
                if (!to) {
                  throw new Error("to required");
                }
                const result = await rt.manager.initiateCall(to, undefined, {
                  message,
                  mode:
                    params.mode === "notify" || params.mode === "conversation"
                      ? params.mode
                      : undefined,
                });
                if (!result.success) {
                  throw new Error(result.error || "initiate failed");
                }
                return json({ callId: result.callId, initiated: true });
              }
              case "continue_call": {
                const callId = String(params.callId || "").trim();
                const message = String(params.message || "").trim();
                if (!callId || !message) {
                  throw new Error("callId and message required");
                }
                const result = await rt.manager.continueCall(callId, message);
                if (!result.success) {
                  throw new Error(result.error || "continue failed");
                }
                return json({ success: true, transcript: result.transcript });
              }
              case "speak_to_user": {
                const callId = String(params.callId || "").trim();
                const message = String(params.message || "").trim();
                if (!callId || !message) {
                  throw new Error("callId and message required");
                }
                const result = await rt.manager.speak(callId, message);
                if (!result.success) {
                  throw new Error(result.error || "speak failed");
                }
                return json({ success: true });
              }
              case "end_call": {
                const callId = String(params.callId || "").trim();
                if (!callId) {
                  throw new Error("callId required");
                }
                const result = await rt.manager.endCall(callId);
                if (!result.success) {
                  throw new Error(result.error || "end failed");
                }
                return json({ success: true });
              }
              case "get_status": {
                const callId = String(params.callId || "").trim();
                if (!callId) {
                  throw new Error("callId required");
                }
                const call =
                  rt.manager.getCall(callId) || rt.manager.getCallByProviderCallId(callId);
                return json(call ? { found: true, call } : { found: false });
              }
            }
          }
          // Legacy action-less shape. Fix: use optional chaining consistently —
          // the old code read `params.sid`/`params.to` after only a `params?.`
          // check and would throw a TypeError on undefined params.
          const mode = params?.mode ?? "call";
          if (mode === "status") {
            const sid = typeof params?.sid === "string" ? params.sid.trim() : "";
            if (!sid) {
              throw new Error("sid required for status");
            }
            const call = rt.manager.getCall(sid) || rt.manager.getCallByProviderCallId(sid);
            return json(call ? { found: true, call } : { found: false });
          }
          const to =
            typeof params?.to === "string" && params.to.trim()
              ? params.to.trim()
              : rt.config.toNumber;
          if (!to) {
            throw new Error("to required for call");
          }
          const result = await rt.manager.initiateCall(to, undefined, {
            message:
              typeof params?.message === "string" && params.message.trim()
                ? params.message.trim()
                : undefined,
          });
          if (!result.success) {
            throw new Error(result.error || "initiate failed");
          }
          return json({ callId: result.callId, initiated: true });
        } catch (err) {
          // Tool errors are returned as payloads rather than thrown so the
          // agent sees a structured failure.
          return json({
            error: err instanceof Error ? err.message : String(err),
          });
        }
      },
    });
    api.registerCli(
      ({ program }) =>
        registerVoiceCallCli({
          program,
          config,
          ensureRuntime,
          logger: api.logger,
        }),
      { commands: ["voicecall"] },
    );
    api.registerService({
      id: "voicecall",
      // Eagerly start the runtime when enabled; failures are logged, not fatal,
      // so the gateway still boots (and ensureRuntime can retry later).
      start: async () => {
        if (!config.enabled) {
          return;
        }
        try {
          await ensureRuntime();
        } catch (err) {
          api.logger.error(
            `[voice-call] Failed to start runtime: ${
              err instanceof Error ? err.message : String(err)
            }`,
          );
        }
      },
      // Stop the runtime if one was (or is being) created; always clear the
      // cached references so a later start creates a fresh runtime.
      stop: async () => {
        if (!runtimePromise) {
          return;
        }
        try {
          const rt = await runtimePromise;
          await rt.stop();
        } finally {
          runtimePromise = null;
          runtime = null;
        }
      },
    });
  },
};
export default voiceCallPlugin;

View File

@@ -0,0 +1,559 @@
{
"id": "voice-call",
"uiHints": {
"provider": {
"label": "Provider",
      "help": "Use twilio, telnyx, plivo, or mock for dev/no-network."
},
"fromNumber": {
"label": "From Number",
"placeholder": "+15550001234"
},
"toNumber": {
"label": "Default To Number",
"placeholder": "+15550001234"
},
"inboundPolicy": {
"label": "Inbound Policy"
},
"allowFrom": {
"label": "Inbound Allowlist"
},
"inboundGreeting": {
"label": "Inbound Greeting",
"advanced": true
},
"telnyx.apiKey": {
"label": "Telnyx API Key",
"sensitive": true
},
"telnyx.connectionId": {
"label": "Telnyx Connection ID"
},
"telnyx.publicKey": {
"label": "Telnyx Public Key",
"sensitive": true
},
"twilio.accountSid": {
"label": "Twilio Account SID"
},
"twilio.authToken": {
"label": "Twilio Auth Token",
"sensitive": true
},
"outbound.defaultMode": {
"label": "Default Call Mode"
},
"outbound.notifyHangupDelaySec": {
"label": "Notify Hangup Delay (sec)",
"advanced": true
},
"serve.port": {
"label": "Webhook Port"
},
"serve.bind": {
"label": "Webhook Bind"
},
"serve.path": {
"label": "Webhook Path"
},
"tailscale.mode": {
"label": "Tailscale Mode",
"advanced": true
},
"tailscale.path": {
"label": "Tailscale Path",
"advanced": true
},
"tunnel.provider": {
"label": "Tunnel Provider",
"advanced": true
},
"tunnel.ngrokAuthToken": {
"label": "ngrok Auth Token",
"sensitive": true,
"advanced": true
},
"tunnel.ngrokDomain": {
"label": "ngrok Domain",
"advanced": true
},
"tunnel.allowNgrokFreeTierLoopbackBypass": {
"label": "Allow ngrok Free Tier (Loopback Bypass)",
"advanced": true
},
"streaming.enabled": {
"label": "Enable Streaming",
"advanced": true
},
"streaming.openaiApiKey": {
"label": "OpenAI Realtime API Key",
"sensitive": true,
"advanced": true
},
"streaming.sttModel": {
"label": "Realtime STT Model",
"advanced": true
},
"streaming.streamPath": {
"label": "Media Stream Path",
"advanced": true
},
"tts.provider": {
"label": "TTS Provider Override",
"help": "Deep-merges with messages.tts (Edge is ignored for calls).",
"advanced": true
},
"tts.openai.model": {
"label": "OpenAI TTS Model",
"advanced": true
},
"tts.openai.voice": {
"label": "OpenAI TTS Voice",
"advanced": true
},
"tts.openai.apiKey": {
"label": "OpenAI API Key",
"sensitive": true,
"advanced": true
},
"tts.elevenlabs.modelId": {
"label": "ElevenLabs Model ID",
"advanced": true
},
"tts.elevenlabs.voiceId": {
"label": "ElevenLabs Voice ID",
"advanced": true
},
"tts.elevenlabs.apiKey": {
"label": "ElevenLabs API Key",
"sensitive": true,
"advanced": true
},
"tts.elevenlabs.baseUrl": {
"label": "ElevenLabs Base URL",
"advanced": true
},
"publicUrl": {
"label": "Public Webhook URL",
"advanced": true
},
"skipSignatureVerification": {
"label": "Skip Signature Verification",
"advanced": true
},
"store": {
"label": "Call Log Store Path",
"advanced": true
},
"responseModel": {
"label": "Response Model",
"advanced": true
},
"responseSystemPrompt": {
"label": "Response System Prompt",
"advanced": true
},
"responseTimeoutMs": {
"label": "Response Timeout (ms)",
"advanced": true
}
},
"configSchema": {
"type": "object",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean"
},
"provider": {
"type": "string",
"enum": ["telnyx", "twilio", "plivo", "mock"]
},
"telnyx": {
"type": "object",
"additionalProperties": false,
"properties": {
"apiKey": {
"type": "string"
},
"connectionId": {
"type": "string"
},
"publicKey": {
"type": "string"
}
}
},
"twilio": {
"type": "object",
"additionalProperties": false,
"properties": {
"accountSid": {
"type": "string"
},
"authToken": {
"type": "string"
}
}
},
"plivo": {
"type": "object",
"additionalProperties": false,
"properties": {
"authId": {
"type": "string"
},
"authToken": {
"type": "string"
}
}
},
"fromNumber": {
"type": "string",
"pattern": "^\\+[1-9]\\d{1,14}$"
},
"toNumber": {
"type": "string",
"pattern": "^\\+[1-9]\\d{1,14}$"
},
"inboundPolicy": {
"type": "string",
"enum": ["disabled", "allowlist", "pairing", "open"]
},
"allowFrom": {
"type": "array",
"items": {
"type": "string",
"pattern": "^\\+[1-9]\\d{1,14}$"
}
},
"inboundGreeting": {
"type": "string"
},
"outbound": {
"type": "object",
"additionalProperties": false,
"properties": {
"defaultMode": {
"type": "string",
"enum": ["notify", "conversation"]
},
"notifyHangupDelaySec": {
"type": "integer",
"minimum": 0
}
}
},
"maxDurationSeconds": {
"type": "integer",
"minimum": 1
},
"silenceTimeoutMs": {
"type": "integer",
"minimum": 1
},
"transcriptTimeoutMs": {
"type": "integer",
"minimum": 1
},
"ringTimeoutMs": {
"type": "integer",
"minimum": 1
},
"maxConcurrentCalls": {
"type": "integer",
"minimum": 1
},
"serve": {
"type": "object",
"additionalProperties": false,
"properties": {
"port": {
"type": "integer",
"minimum": 1
},
"bind": {
"type": "string"
},
"path": {
"type": "string"
}
}
},
"tailscale": {
"type": "object",
"additionalProperties": false,
"properties": {
"mode": {
"type": "string",
"enum": ["off", "serve", "funnel"]
},
"path": {
"type": "string"
}
}
},
"tunnel": {
"type": "object",
"additionalProperties": false,
"properties": {
"provider": {
"type": "string",
"enum": ["none", "ngrok", "tailscale-serve", "tailscale-funnel"]
},
"ngrokAuthToken": {
"type": "string"
},
"ngrokDomain": {
"type": "string"
},
"allowNgrokFreeTierLoopbackBypass": {
"type": "boolean"
}
}
},
"streaming": {
"type": "object",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean"
},
"sttProvider": {
"type": "string",
"enum": ["openai-realtime"]
},
"openaiApiKey": {
"type": "string"
},
"sttModel": {
"type": "string"
},
"silenceDurationMs": {
"type": "integer",
"minimum": 1
},
"vadThreshold": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"streamPath": {
"type": "string"
}
}
},
"publicUrl": {
"type": "string"
},
"skipSignatureVerification": {
"type": "boolean"
},
"stt": {
"type": "object",
"additionalProperties": false,
"properties": {
"provider": {
"type": "string",
"enum": ["openai"]
},
"model": {
"type": "string"
}
}
},
"tts": {
"type": "object",
"additionalProperties": false,
"properties": {
"auto": {
"type": "string",
"enum": ["off", "always", "inbound", "tagged"]
},
"enabled": {
"type": "boolean"
},
"mode": {
"type": "string",
"enum": ["final", "all"]
},
"provider": {
"type": "string",
"enum": ["openai", "elevenlabs", "edge"]
},
"summaryModel": {
"type": "string"
},
"modelOverrides": {
"type": "object",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean"
},
"allowText": {
"type": "boolean"
},
"allowProvider": {
"type": "boolean"
},
"allowVoice": {
"type": "boolean"
},
"allowModelId": {
"type": "boolean"
},
"allowVoiceSettings": {
"type": "boolean"
},
"allowNormalization": {
"type": "boolean"
},
"allowSeed": {
"type": "boolean"
}
}
},
"elevenlabs": {
"type": "object",
"additionalProperties": false,
"properties": {
"apiKey": {
"type": "string"
},
"baseUrl": {
"type": "string"
},
"voiceId": {
"type": "string"
},
"modelId": {
"type": "string"
},
"seed": {
"type": "integer",
"minimum": 0,
"maximum": 4294967295
},
"applyTextNormalization": {
"type": "string",
"enum": ["auto", "on", "off"]
},
"languageCode": {
"type": "string"
},
"voiceSettings": {
"type": "object",
"additionalProperties": false,
"properties": {
"stability": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"similarityBoost": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"style": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"useSpeakerBoost": {
"type": "boolean"
},
"speed": {
"type": "number",
"minimum": 0.5,
"maximum": 2
}
}
}
}
},
"openai": {
"type": "object",
"additionalProperties": false,
"properties": {
"apiKey": {
"type": "string"
},
"model": {
"type": "string"
},
"voice": {
"type": "string"
}
}
},
"edge": {
"type": "object",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean"
},
"voice": {
"type": "string"
},
"lang": {
"type": "string"
},
"outputFormat": {
"type": "string"
},
"pitch": {
"type": "string"
},
"rate": {
"type": "string"
},
"volume": {
"type": "string"
},
"saveSubtitles": {
"type": "boolean"
},
"proxy": {
"type": "string"
},
"timeoutMs": {
"type": "integer",
"minimum": 1000,
"maximum": 120000
}
}
},
"prefsPath": {
"type": "string"
},
"maxTextLength": {
"type": "integer",
"minimum": 1
},
"timeoutMs": {
"type": "integer",
"minimum": 1000,
"maximum": 120000
}
}
},
"store": {
"type": "string"
},
"responseModel": {
"type": "string"
},
"responseSystemPrompt": {
"type": "string"
},
"responseTimeoutMs": {
"type": "integer",
"minimum": 1
}
}
}
}

View File

@@ -0,0 +1,16 @@
{
"name": "@openclaw/voice-call",
"version": "2026.2.26",
"description": "OpenClaw voice-call plugin",
"type": "module",
"dependencies": {
"@sinclair/typebox": "0.34.48",
"ws": "^8.19.0",
"zod": "^4.3.6"
},
"openclaw": {
"extensions": [
"./index.ts"
]
}
}

View File

@@ -0,0 +1,19 @@
/**
 * Reduce a phone number to its digits only (strips "+", spaces, punctuation).
 * Missing or empty input yields the empty string.
 */
export function normalizePhoneNumber(input?: string): string {
  if (input === undefined || input === "") {
    return "";
  }
  const digits = input.match(/\d/g);
  return digits === null ? "" : digits.join("");
}
/**
 * Check whether a digits-only caller number matches any allowlist entry,
 * normalizing each entry the same way. An empty caller never matches, and
 * entries that normalize to "" are ignored.
 */
export function isAllowlistedCaller(
  normalizedFrom: string,
  allowFrom: string[] | undefined,
): boolean {
  if (normalizedFrom === "") {
    return false;
  }
  for (const entry of allowFrom ?? []) {
    const candidate = normalizePhoneNumber(entry);
    if (candidate.length > 0 && candidate === normalizedFrom) {
      return true;
    }
  }
  return false;
}

View File

@@ -0,0 +1,380 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import type { Command } from "commander";
import { sleep } from "openclaw/plugin-sdk";
import type { VoiceCallConfig } from "./config.js";
import type { VoiceCallRuntime } from "./runtime.js";
import { resolveUserPath } from "./utils.js";
import {
cleanupTailscaleExposureRoute,
getTailscaleSelfInfo,
setupTailscaleExposureRoute,
} from "./webhook.js";
// Minimal logging surface the CLI commands need; shaped to match the plugin
// API logger passed in by registerVoiceCallCli's caller.
type Logger = {
  info: (message: string) => void;
  warn: (message: string) => void;
  error: (message: string) => void;
};
/**
 * Normalize a user-supplied exposure mode (case/whitespace-insensitive).
 * Anything other than "serve" or "off" resolves to "funnel".
 */
function resolveMode(input: string): "off" | "serve" | "funnel" {
  switch (input.trim().toLowerCase()) {
    case "serve":
      return "serve";
    case "off":
      return "off";
    default:
      return "funnel";
  }
}
/**
 * Resolve the path to the JSONL call log file.
 *
 * Uses `config.store` when set (non-blank); otherwise falls back to
 * `~/.openclaw/voice-calls`. Returns the full path to `calls.jsonl` inside
 * that directory.
 *
 * Note: the previous implementation probed `[resolvedPreferred].find(...) ??
 * resolvedPreferred` — with a single candidate the expression always yielded
 * `resolvedPreferred` regardless of the filesystem checks, so the dead probe
 * has been removed without changing behavior.
 */
function resolveDefaultStorePath(config: VoiceCallConfig): string {
  const fallback = resolveUserPath(path.join(os.homedir(), ".openclaw", "voice-calls"));
  const base = config.store?.trim() ? resolveUserPath(config.store) : fallback;
  return path.join(base, "calls.jsonl");
}
/**
 * Nearest-rank percentile of an unsorted numeric series (p in 0..100).
 * Returns 0 for an empty series. The input array is not mutated.
 */
function percentile(values: number[], p: number): number {
  if (!values.length) {
    return 0;
  }
  const ordered = values.slice().sort((a, b) => a - b);
  const rank = Math.ceil((p / 100) * ordered.length) - 1;
  const clamped = Math.min(ordered.length - 1, Math.max(0, rank));
  return ordered[clamped] ?? 0;
}
/**
 * Aggregate a latency series into count/min/max/avg plus p50 and p95
 * (nearest-rank, via `percentile`). An empty series yields all-zero stats.
 */
function summarizeSeries(values: number[]): {
  count: number;
  minMs: number;
  maxMs: number;
  avgMs: number;
  p50Ms: number;
  p95Ms: number;
} {
  if (values.length === 0) {
    return { count: 0, minMs: 0, maxMs: 0, avgMs: 0, p50Ms: 0, p95Ms: 0 };
  }
  // Single pass for min/max/sum; avoids spreading large arrays into Math.min/max.
  let minMs = Number.POSITIVE_INFINITY;
  let maxMs = Number.NEGATIVE_INFINITY;
  let total = 0;
  for (const value of values) {
    if (value < minMs) {
      minMs = value;
    }
    if (value > maxMs) {
      maxMs = value;
    }
    total += value;
  }
  return {
    count: values.length,
    minMs,
    maxMs,
    avgMs: total / values.length,
    p50Ms: percentile(values, 50),
    p95Ms: percentile(values, 95),
  };
}
/** Narrow an arbitrary string to a known call mode; unknown values become undefined. */
function resolveCallMode(mode?: string): "notify" | "conversation" | undefined {
  switch (mode) {
    case "notify":
    case "conversation":
      return mode;
    default:
      return undefined;
  }
}
/**
 * Start an outbound call through the runtime's call manager and print the
 * resulting call id as pretty-printed JSON. Throws when the provider reports
 * a failure so the CLI exits non-zero.
 */
async function initiateCallAndPrintId(params: {
  runtime: VoiceCallRuntime;
  to: string;
  message?: string;
  mode?: string;
}) {
  const { runtime, to, message, mode } = params;
  const outcome = await runtime.manager.initiateCall(to, undefined, {
    message,
    mode: resolveCallMode(mode),
  });
  if (!outcome.success) {
    throw new Error(outcome.error || "initiate failed");
  }
  // eslint-disable-next-line no-console
  console.log(JSON.stringify({ callId: outcome.callId }, null, 2));
}
/**
 * Register the `openclaw voicecall` command tree on the given commander
 * program.
 *
 * Each action constructs the runtime lazily via `ensureRuntime()`, so plain
 * `--help` invocations and argument errors never start providers or the
 * webhook server.
 */
export function registerVoiceCallCli(params: {
  program: Command;
  config: VoiceCallConfig;
  ensureRuntime: () => Promise<VoiceCallRuntime>;
  logger: Logger;
}) {
  const { program, config, ensureRuntime, logger } = params;
  const root = program
    .command("voicecall")
    .description("Voice call utilities")
    .addHelpText("after", () => `\nDocs: https://docs.openclaw.ai/cli/voicecall\n`);
  // voicecall call: primary outbound entry point; message required,
  // destination falls back to the runtime's configured toNumber.
  root
    .command("call")
    .description("Initiate an outbound voice call")
    .requiredOption("-m, --message <text>", "Message to speak when call connects")
    .option(
      "-t, --to <phone>",
      "Phone number to call (E.164 format, uses config toNumber if not set)",
    )
    .option(
      "--mode <mode>",
      "Call mode: notify (hangup after message) or conversation (stay open)",
      "conversation",
    )
    .action(async (options: { message: string; to?: string; mode?: string }) => {
      const rt = await ensureRuntime();
      const to = options.to ?? rt.config.toNumber;
      if (!to) {
        throw new Error("Missing --to and no toNumber configured");
      }
      await initiateCallAndPrintId({
        runtime: rt,
        to,
        message: options.message,
        mode: options.mode,
      });
    });
  // voicecall start: alias of `call` with --to required and message optional.
  root
    .command("start")
    .description("Alias for voicecall call")
    .requiredOption("--to <phone>", "Phone number to call")
    .option("--message <text>", "Message to speak when call connects")
    .option(
      "--mode <mode>",
      "Call mode: notify (hangup after message) or conversation (stay open)",
      "conversation",
    )
    .action(async (options: { to: string; message?: string; mode?: string }) => {
      const rt = await ensureRuntime();
      await initiateCallAndPrintId({
        runtime: rt,
        to: options.to,
        message: options.message,
        mode: options.mode,
      });
    });
  // voicecall continue: speak on an active call and block for the reply turn.
  root
    .command("continue")
    .description("Speak a message and wait for a response")
    .requiredOption("--call-id <id>", "Call ID")
    .requiredOption("--message <text>", "Message to speak")
    .action(async (options: { callId: string; message: string }) => {
      const rt = await ensureRuntime();
      const result = await rt.manager.continueCall(options.callId, options.message);
      if (!result.success) {
        throw new Error(result.error || "continue failed");
      }
      // eslint-disable-next-line no-console
      console.log(JSON.stringify(result, null, 2));
    });
  // voicecall speak: fire-and-forget TTS on an active call (no reply wait).
  root
    .command("speak")
    .description("Speak a message without waiting for response")
    .requiredOption("--call-id <id>", "Call ID")
    .requiredOption("--message <text>", "Message to speak")
    .action(async (options: { callId: string; message: string }) => {
      const rt = await ensureRuntime();
      const result = await rt.manager.speak(options.callId, options.message);
      if (!result.success) {
        throw new Error(result.error || "speak failed");
      }
      // eslint-disable-next-line no-console
      console.log(JSON.stringify(result, null, 2));
    });
  // voicecall end: hang up.
  root
    .command("end")
    .description("Hang up an active call")
    .requiredOption("--call-id <id>", "Call ID")
    .action(async (options: { callId: string }) => {
      const rt = await ensureRuntime();
      const result = await rt.manager.endCall(options.callId);
      if (!result.success) {
        throw new Error(result.error || "end failed");
      }
      // eslint-disable-next-line no-console
      console.log(JSON.stringify(result, null, 2));
    });
  // voicecall status: print the manager's view of one call (or {found:false}).
  root
    .command("status")
    .description("Show call status")
    .requiredOption("--call-id <id>", "Call ID")
    .action(async (options: { callId: string }) => {
      const rt = await ensureRuntime();
      const call = rt.manager.getCall(options.callId);
      // eslint-disable-next-line no-console
      console.log(JSON.stringify(call ?? { found: false }, null, 2));
    });
  // voicecall tail: follow the JSONL call log by polling file size.
  // Note: the --file default is computed eagerly at registration time.
  root
    .command("tail")
    .description("Tail voice-call JSONL logs (prints new lines; useful during provider tests)")
    .option("--file <path>", "Path to calls.jsonl", resolveDefaultStorePath(config))
    .option("--since <n>", "Print last N lines first", "25")
    .option("--poll <ms>", "Poll interval in ms", "250")
    .action(async (options: { file: string; since?: string; poll?: string }) => {
      const file = options.file;
      const since = Math.max(0, Number(options.since ?? 0));
      const pollMs = Math.max(50, Number(options.poll ?? 250));
      if (!fs.existsSync(file)) {
        logger.error(`No log file at ${file}`);
        process.exit(1);
      }
      // Seed output with the last `since` lines, then follow from EOF.
      const initial = fs.readFileSync(file, "utf8");
      const lines = initial.split("\n").filter(Boolean);
      for (const line of lines.slice(Math.max(0, lines.length - since))) {
        // eslint-disable-next-line no-console
        console.log(line);
      }
      let offset = Buffer.byteLength(initial, "utf8");
      // Poll loop runs until the process is interrupted (intentional).
      for (;;) {
        try {
          const stat = fs.statSync(file);
          if (stat.size < offset) {
            // File was truncated/rotated: restart from the beginning.
            offset = 0;
          }
          if (stat.size > offset) {
            const fd = fs.openSync(file, "r");
            try {
              const buf = Buffer.alloc(stat.size - offset);
              fs.readSync(fd, buf, 0, buf.length, offset);
              offset = stat.size;
              const text = buf.toString("utf8");
              for (const line of text.split("\n").filter(Boolean)) {
                // eslint-disable-next-line no-console
                console.log(line);
              }
            } finally {
              fs.closeSync(fd);
            }
          }
        } catch {
          // ignore and retry
        }
        await sleep(pollMs);
      }
    });
  // voicecall latency: offline summary of per-turn latency metrics from the log.
  root
    .command("latency")
    .description("Summarize turn latency metrics from voice-call JSONL logs")
    .option("--file <path>", "Path to calls.jsonl", resolveDefaultStorePath(config))
    .option("--last <n>", "Analyze last N records", "200")
    .action(async (options: { file: string; last?: string }) => {
      const file = options.file;
      const last = Math.max(1, Number(options.last ?? 200));
      if (!fs.existsSync(file)) {
        throw new Error("No log file at " + file);
      }
      const content = fs.readFileSync(file, "utf8");
      const lines = content.split("\n").filter(Boolean).slice(-last);
      const turnLatencyMs: number[] = [];
      const listenWaitMs: number[] = [];
      for (const line of lines) {
        try {
          const parsed = JSON.parse(line) as {
            metadata?: { lastTurnLatencyMs?: unknown; lastTurnListenWaitMs?: unknown };
          };
          const latency = parsed.metadata?.lastTurnLatencyMs;
          const listenWait = parsed.metadata?.lastTurnListenWaitMs;
          if (typeof latency === "number" && Number.isFinite(latency)) {
            turnLatencyMs.push(latency);
          }
          if (typeof listenWait === "number" && Number.isFinite(listenWait)) {
            listenWaitMs.push(listenWait);
          }
        } catch {
          // ignore malformed JSON lines
        }
      }
      // eslint-disable-next-line no-console
      console.log(
        JSON.stringify(
          {
            recordsScanned: lines.length,
            turnLatency: summarizeSeries(turnLatencyMs),
            listenWait: summarizeSeries(listenWaitMs),
          },
          null,
          2,
        ),
      );
    });
  // voicecall expose: manage Tailscale serve/funnel routing for the webhook.
  root
    .command("expose")
    .description("Enable/disable Tailscale serve/funnel for the webhook")
    .option("--mode <mode>", "off | serve (tailnet) | funnel (public)", "funnel")
    .option("--path <path>", "Tailscale path to expose (recommend matching serve.path)")
    .option("--port <port>", "Local webhook port")
    .option("--serve-path <path>", "Local webhook path")
    .action(
      async (options: { mode?: string; port?: string; path?: string; servePath?: string }) => {
        const mode = resolveMode(options.mode ?? "funnel");
        const servePort = Number(options.port ?? config.serve.port ?? 3334);
        const servePath = String(options.servePath ?? config.serve.path ?? "/voice/webhook");
        const tsPath = String(options.path ?? config.tailscale?.path ?? servePath);
        const localUrl = `http://127.0.0.1:${servePort}`;
        if (mode === "off") {
          // Tear down both variants so a mode switch never leaves stale routes.
          await cleanupTailscaleExposureRoute({ mode: "serve", path: tsPath });
          await cleanupTailscaleExposureRoute({ mode: "funnel", path: tsPath });
          // eslint-disable-next-line no-console
          console.log(JSON.stringify({ ok: true, mode: "off", path: tsPath }, null, 2));
          return;
        }
        const publicUrl = await setupTailscaleExposureRoute({
          mode,
          path: tsPath,
          localUrl,
        });
        // On failure, look up the node id so we can point at the admin
        // enablement page for serve/funnel.
        const tsInfo = publicUrl ? null : await getTailscaleSelfInfo();
        const enableUrl = tsInfo?.nodeId
          ? `https://login.tailscale.com/f/${mode}?node=${tsInfo.nodeId}`
          : null;
        // eslint-disable-next-line no-console
        console.log(
          JSON.stringify(
            {
              ok: Boolean(publicUrl),
              mode,
              path: tsPath,
              localUrl,
              publicUrl,
              hint: publicUrl
                ? undefined
                : {
                    note: "Tailscale serve/funnel may be disabled on this tailnet (or require admin enable).",
                    enableUrl,
                  },
            },
            null,
            2,
          ),
        );
      },
    );
}

View File

@@ -0,0 +1,208 @@
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { validateProviderConfig, resolveVoiceCallConfig, type VoiceCallConfig } from "./config.js";
/**
 * Build a fully-populated, valid baseline VoiceCallConfig for the given
 * provider with no credentials set. Tests mutate the returned object to add
 * credentials or flip flags; each call returns a fresh object.
 */
function createBaseConfig(provider: "telnyx" | "twilio" | "plivo" | "mock"): VoiceCallConfig {
  return {
    enabled: true,
    provider,
    fromNumber: "+15550001234",
    inboundPolicy: "disabled",
    allowFrom: [],
    outbound: { defaultMode: "notify", notifyHangupDelaySec: 3 },
    maxDurationSeconds: 300,
    staleCallReaperSeconds: 600,
    silenceTimeoutMs: 800,
    transcriptTimeoutMs: 180000,
    ringTimeoutMs: 30000,
    maxConcurrentCalls: 1,
    serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
    tailscale: { mode: "off", path: "/voice/webhook" },
    tunnel: { provider: "none", allowNgrokFreeTierLoopbackBypass: false },
    webhookSecurity: {
      allowedHosts: [],
      trustForwardingHeaders: false,
      trustedProxyIPs: [],
    },
    streaming: {
      enabled: false,
      sttProvider: "openai-realtime",
      sttModel: "gpt-4o-transcribe",
      silenceDurationMs: 800,
      vadThreshold: 0.5,
      streamPath: "/voice/stream",
      preStartTimeoutMs: 5000,
      maxPendingConnections: 32,
      maxPendingConnectionsPerIp: 4,
      maxConnections: 128,
    },
    skipSignatureVerification: false,
    stt: { provider: "openai", model: "whisper-1" },
    tts: {
      provider: "openai",
      openai: { model: "gpt-4o-mini-tts", voice: "coral" },
    },
    responseModel: "openai/gpt-4o-mini",
    responseTimeoutMs: 30000,
  };
}
describe("validateProviderConfig", () => {
  // Snapshot of the environment at suite setup; afterEach restores from it so
  // env mutations cannot leak between tests.
  const originalEnv = { ...originalEnv };
  const clearProviderEnv = () => {
    delete process.env.TWILIO_ACCOUNT_SID;
    delete process.env.TWILIO_AUTH_TOKEN;
    delete process.env.TELNYX_API_KEY;
    delete process.env.TELNYX_CONNECTION_ID;
    delete process.env.TELNYX_PUBLIC_KEY;
    delete process.env.PLIVO_AUTH_ID;
    delete process.env.PLIVO_AUTH_TOKEN;
  };
  beforeEach(() => {
    clearProviderEnv();
  });
  afterEach(() => {
    // Restore original env.
    // NOTE(review): reassigning process.env replaces the object wholesale;
    // Node permits this, but per-key restoration would be gentler.
    process.env = { ...originalEnv };
  });
  describe("provider credential sources", () => {
    it("passes validation when credentials come from config or environment", () => {
      // For each real provider, verify both credential paths: explicit config
      // fields, then the provider-specific env vars via resolveVoiceCallConfig.
      for (const provider of ["twilio", "telnyx", "plivo"] as const) {
        clearProviderEnv();
        const fromConfig = createBaseConfig(provider);
        if (provider === "twilio") {
          fromConfig.twilio = { accountSid: "AC123", authToken: "secret" };
        } else if (provider === "telnyx") {
          fromConfig.telnyx = {
            apiKey: "KEY123",
            connectionId: "CONN456",
            publicKey: "public-key",
          };
        } else {
          fromConfig.plivo = { authId: "MA123", authToken: "secret" };
        }
        expect(validateProviderConfig(fromConfig)).toMatchObject({ valid: true, errors: [] });
        clearProviderEnv();
        if (provider === "twilio") {
          process.env.TWILIO_ACCOUNT_SID = "AC123";
          process.env.TWILIO_AUTH_TOKEN = "secret";
        } else if (provider === "telnyx") {
          process.env.TELNYX_API_KEY = "KEY123";
          process.env.TELNYX_CONNECTION_ID = "CONN456";
          process.env.TELNYX_PUBLIC_KEY = "public-key";
        } else {
          process.env.PLIVO_AUTH_ID = "MA123";
          process.env.PLIVO_AUTH_TOKEN = "secret";
        }
        const fromEnv = resolveVoiceCallConfig(createBaseConfig(provider));
        expect(validateProviderConfig(fromEnv)).toMatchObject({ valid: true, errors: [] });
      }
    });
  });
  describe("twilio provider", () => {
    it("passes validation with mixed config and env vars", () => {
      // accountSid from config, authToken from env: resolution should merge.
      process.env.TWILIO_AUTH_TOKEN = "secret";
      let config = createBaseConfig("twilio");
      config.twilio = { accountSid: "AC123" };
      config = resolveVoiceCallConfig(config);
      const result = validateProviderConfig(config);
      expect(result.valid).toBe(true);
      expect(result.errors).toEqual([]);
    });
    it("fails validation when required twilio credentials are missing", () => {
      process.env.TWILIO_AUTH_TOKEN = "secret";
      const missingSid = validateProviderConfig(resolveVoiceCallConfig(createBaseConfig("twilio")));
      expect(missingSid.valid).toBe(false);
      expect(missingSid.errors).toContain(
        "plugins.entries.voice-call.config.twilio.accountSid is required (or set TWILIO_ACCOUNT_SID env)",
      );
      delete process.env.TWILIO_AUTH_TOKEN;
      process.env.TWILIO_ACCOUNT_SID = "AC123";
      const missingToken = validateProviderConfig(
        resolveVoiceCallConfig(createBaseConfig("twilio")),
      );
      expect(missingToken.valid).toBe(false);
      expect(missingToken.errors).toContain(
        "plugins.entries.voice-call.config.twilio.authToken is required (or set TWILIO_AUTH_TOKEN env)",
      );
    });
  });
  describe("telnyx provider", () => {
    it("fails validation when apiKey is missing everywhere", () => {
      process.env.TELNYX_CONNECTION_ID = "CONN456";
      let config = createBaseConfig("telnyx");
      config = resolveVoiceCallConfig(config);
      const result = validateProviderConfig(config);
      expect(result.valid).toBe(false);
      expect(result.errors).toContain(
        "plugins.entries.voice-call.config.telnyx.apiKey is required (or set TELNYX_API_KEY env)",
      );
    });
    it("requires a public key unless signature verification is skipped", () => {
      // publicKey is only needed for webhook signature checks, so skipping
      // verification must lift the requirement.
      const missingPublicKey = createBaseConfig("telnyx");
      missingPublicKey.inboundPolicy = "allowlist";
      missingPublicKey.telnyx = { apiKey: "KEY123", connectionId: "CONN456" };
      const missingPublicKeyResult = validateProviderConfig(missingPublicKey);
      expect(missingPublicKeyResult.valid).toBe(false);
      expect(missingPublicKeyResult.errors).toContain(
        "plugins.entries.voice-call.config.telnyx.publicKey is required (or set TELNYX_PUBLIC_KEY env)",
      );
      const withPublicKey = createBaseConfig("telnyx");
      withPublicKey.inboundPolicy = "allowlist";
      withPublicKey.telnyx = {
        apiKey: "KEY123",
        connectionId: "CONN456",
        publicKey: "public-key",
      };
      expect(validateProviderConfig(withPublicKey)).toMatchObject({ valid: true, errors: [] });
      const skippedVerification = createBaseConfig("telnyx");
      skippedVerification.skipSignatureVerification = true;
      skippedVerification.telnyx = { apiKey: "KEY123", connectionId: "CONN456" };
      expect(validateProviderConfig(skippedVerification)).toMatchObject({
        valid: true,
        errors: [],
      });
    });
  });
  describe("plivo provider", () => {
    it("fails validation when authId is missing everywhere", () => {
      process.env.PLIVO_AUTH_TOKEN = "secret";
      let config = createBaseConfig("plivo");
      config = resolveVoiceCallConfig(config);
      const result = validateProviderConfig(config);
      expect(result.valid).toBe(false);
      expect(result.errors).toContain(
        "plugins.entries.voice-call.config.plivo.authId is required (or set PLIVO_AUTH_ID env)",
      );
    });
  });
  describe("disabled config", () => {
    it("skips validation when enabled is false", () => {
      const config = createBaseConfig("twilio");
      config.enabled = false;
      const result = validateProviderConfig(config);
      expect(result.valid).toBe(true);
      expect(result.errors).toEqual([]);
    });
  });
});

View File

@@ -0,0 +1,477 @@
import {
TtsAutoSchema,
TtsConfigSchema,
TtsModeSchema,
TtsProviderSchema,
} from "openclaw/plugin-sdk";
import { z } from "zod";
// -----------------------------------------------------------------------------
// Phone Number Validation
// -----------------------------------------------------------------------------
/**
 * E.164 phone number format: +[country code][number]
 * (at most 15 digits total, leading digit non-zero per ITU-T E.164).
 * Examples use 555 prefix (reserved for fictional numbers)
 */
export const E164Schema = z
  .string()
  .regex(/^\+[1-9]\d{1,14}$/, "Expected E.164 format, e.g. +15550001234");
// -----------------------------------------------------------------------------
// Inbound Policy
// -----------------------------------------------------------------------------
/**
 * Controls how inbound calls are handled:
 * - "disabled": Block all inbound calls (outbound only)
 * - "allowlist": Only accept calls from numbers in allowFrom
 * - "pairing": Unknown callers can request pairing (future)
 * - "open": Accept all inbound calls (dangerous!)
 */
export const InboundPolicySchema = z.enum(["disabled", "allowlist", "pairing", "open"]);
export type InboundPolicy = z.infer<typeof InboundPolicySchema>;
// -----------------------------------------------------------------------------
// Provider-Specific Configuration
// -----------------------------------------------------------------------------
// Provider credential schemas. All fields are optional at the schema level
// because resolveVoiceCallConfig may fill them from environment variables;
// validateProviderConfig enforces presence for the active provider.
export const TelnyxConfigSchema = z
  .object({
    /** Telnyx API v2 key */
    apiKey: z.string().min(1).optional(),
    /** Telnyx connection ID (from Call Control app) */
    connectionId: z.string().min(1).optional(),
    /** Public key for webhook signature verification */
    publicKey: z.string().min(1).optional(),
  })
  .strict();
export type TelnyxConfig = z.infer<typeof TelnyxConfigSchema>;
export const TwilioConfigSchema = z
  .object({
    /** Twilio Account SID */
    accountSid: z.string().min(1).optional(),
    /** Twilio Auth Token */
    authToken: z.string().min(1).optional(),
  })
  .strict();
export type TwilioConfig = z.infer<typeof TwilioConfigSchema>;
export const PlivoConfigSchema = z
  .object({
    /** Plivo Auth ID (starts with MA/SA) */
    authId: z.string().min(1).optional(),
    /** Plivo Auth Token */
    authToken: z.string().min(1).optional(),
  })
  .strict();
export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;
// -----------------------------------------------------------------------------
// STT/TTS Configuration
// -----------------------------------------------------------------------------
export const SttConfigSchema = z
  .object({
    /** STT provider (currently only OpenAI supported) */
    provider: z.literal("openai").default("openai"),
    /** Whisper model to use */
    model: z.string().min(1).default("whisper-1"),
  })
  .strict()
  .default({ provider: "openai", model: "whisper-1" });
export type SttConfig = z.infer<typeof SttConfigSchema>;
// TTS schemas come from the core plugin SDK and are re-exported so plugin
// consumers import a single module for voice-call config types.
export { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema };
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
// -----------------------------------------------------------------------------
// Webhook Server Configuration
// -----------------------------------------------------------------------------
export const VoiceCallServeConfigSchema = z
  .object({
    /** Port to listen on */
    port: z.number().int().positive().default(3334),
    /** Bind address (loopback by default; exposure is handled by tunnels) */
    bind: z.string().default("127.0.0.1"),
    /** Webhook path */
    path: z.string().min(1).default("/voice/webhook"),
  })
  .strict()
  .default({ port: 3334, bind: "127.0.0.1", path: "/voice/webhook" });
export type VoiceCallServeConfig = z.infer<typeof VoiceCallServeConfigSchema>;
export const VoiceCallTailscaleConfigSchema = z
  .object({
    /**
     * Tailscale exposure mode:
     * - "off": No Tailscale exposure
     * - "serve": Tailscale serve (private to tailnet)
     * - "funnel": Tailscale funnel (public HTTPS)
     */
    mode: z.enum(["off", "serve", "funnel"]).default("off"),
    /** Path for Tailscale serve/funnel (should usually match serve.path) */
    path: z.string().min(1).default("/voice/webhook"),
  })
  .strict()
  .default({ mode: "off", path: "/voice/webhook" });
export type VoiceCallTailscaleConfig = z.infer<typeof VoiceCallTailscaleConfigSchema>;
// -----------------------------------------------------------------------------
// Tunnel Configuration (unified ngrok/tailscale)
// -----------------------------------------------------------------------------
export const VoiceCallTunnelConfigSchema = z
  .object({
    /**
     * Tunnel provider:
     * - "none": No tunnel (use publicUrl if set, or manual setup)
     * - "ngrok": Use ngrok for public HTTPS tunnel
     * - "tailscale-serve": Tailscale serve (private to tailnet)
     * - "tailscale-funnel": Tailscale funnel (public HTTPS)
     */
    provider: z.enum(["none", "ngrok", "tailscale-serve", "tailscale-funnel"]).default("none"),
    /** ngrok auth token (optional, enables longer sessions and more features) */
    ngrokAuthToken: z.string().min(1).optional(),
    /** ngrok custom domain (paid feature, e.g., "myapp.ngrok.io") */
    ngrokDomain: z.string().min(1).optional(),
    /**
     * Allow ngrok free tier compatibility mode.
     * When true, forwarded headers may be trusted for loopback requests
     * to reconstruct the public ngrok URL used for signing.
     *
     * IMPORTANT: This does NOT bypass signature verification.
     */
    allowNgrokFreeTierLoopbackBypass: z.boolean().default(false),
  })
  .strict()
  .default({ provider: "none", allowNgrokFreeTierLoopbackBypass: false });
export type VoiceCallTunnelConfig = z.infer<typeof VoiceCallTunnelConfigSchema>;
// -----------------------------------------------------------------------------
// Webhook Security Configuration
// -----------------------------------------------------------------------------
export const VoiceCallWebhookSecurityConfigSchema = z
  .object({
    /**
     * Allowed hostnames for webhook URL reconstruction.
     * Only these hosts are accepted from forwarding headers.
     */
    allowedHosts: z.array(z.string().min(1)).default([]),
    /**
     * Trust X-Forwarded-* headers without a hostname allowlist.
     * WARNING: Only enable if you trust your proxy configuration.
     */
    trustForwardingHeaders: z.boolean().default(false),
    /**
     * Trusted proxy IP addresses. Forwarded headers are only trusted when
     * the remote IP matches one of these addresses.
     */
    trustedProxyIPs: z.array(z.string().min(1)).default([]),
  })
  .strict()
  .default({ allowedHosts: [], trustForwardingHeaders: false, trustedProxyIPs: [] });
export type WebhookSecurityConfig = z.infer<typeof VoiceCallWebhookSecurityConfigSchema>;
// -----------------------------------------------------------------------------
// Outbound Call Configuration
// -----------------------------------------------------------------------------
/**
 * Call mode determines how outbound calls behave:
 * - "notify": Deliver message and auto-hangup after delay (one-way notification)
 * - "conversation": Stay open for back-and-forth until explicit end or timeout
 */
export const CallModeSchema = z.enum(["notify", "conversation"]);
export type CallMode = z.infer<typeof CallModeSchema>;
export const OutboundConfigSchema = z
  .object({
    /** Default call mode for outbound calls (overridable per call) */
    defaultMode: CallModeSchema.default("notify"),
    /** Seconds to wait after TTS before auto-hangup in notify mode */
    notifyHangupDelaySec: z.number().int().nonnegative().default(3),
  })
  .strict()
  .default({ defaultMode: "notify", notifyHangupDelaySec: 3 });
export type OutboundConfig = z.infer<typeof OutboundConfigSchema>;
// -----------------------------------------------------------------------------
// Streaming Configuration (OpenAI Realtime STT)
// -----------------------------------------------------------------------------
// Real-time media streaming settings. The connection limits below protect the
// WebSocket endpoint from pre-authentication resource-exhaustion attacks.
export const VoiceCallStreamingConfigSchema = z
  .object({
    /** Enable real-time audio streaming (requires WebSocket support) */
    enabled: z.boolean().default(false),
    /** STT provider for real-time transcription */
    sttProvider: z.enum(["openai-realtime"]).default("openai-realtime"),
    /** OpenAI API key for Realtime API (uses OPENAI_API_KEY env if not set) */
    openaiApiKey: z.string().min(1).optional(),
    /** OpenAI transcription model (default: gpt-4o-transcribe) */
    sttModel: z.string().min(1).default("gpt-4o-transcribe"),
    /** VAD silence duration in ms before considering speech ended */
    silenceDurationMs: z.number().int().positive().default(800),
    /** VAD threshold 0-1 (higher = less sensitive) */
    vadThreshold: z.number().min(0).max(1).default(0.5),
    /** WebSocket path for media stream connections */
    streamPath: z.string().min(1).default("/voice/stream"),
    /**
     * Close unauthenticated media stream sockets if no valid `start` frame arrives in time.
     * Protects against pre-auth idle connection hold attacks.
     */
    preStartTimeoutMs: z.number().int().positive().default(5000),
    /** Maximum number of concurrently pending (pre-start) media stream sockets. */
    maxPendingConnections: z.number().int().positive().default(32),
    /** Maximum pending media stream sockets per source IP. */
    maxPendingConnectionsPerIp: z.number().int().positive().default(4),
    /** Hard cap for all open media stream sockets (pending + active). */
    maxConnections: z.number().int().positive().default(128),
  })
  .strict()
  .default({
    enabled: false,
    sttProvider: "openai-realtime",
    sttModel: "gpt-4o-transcribe",
    silenceDurationMs: 800,
    vadThreshold: 0.5,
    streamPath: "/voice/stream",
    preStartTimeoutMs: 5000,
    maxPendingConnections: 32,
    maxPendingConnectionsPerIp: 4,
    maxConnections: 128,
  });
export type VoiceCallStreamingConfig = z.infer<typeof VoiceCallStreamingConfigSchema>;
// -----------------------------------------------------------------------------
// Main Voice Call Configuration
// -----------------------------------------------------------------------------
// Top-level plugin configuration. `provider` is optional here so a disabled
// plugin can omit it; validateProviderConfig enforces it when enabled=true.
export const VoiceCallConfigSchema = z
  .object({
    /** Enable voice call functionality */
    enabled: z.boolean().default(false),
    /** Active provider (telnyx, twilio, plivo, or mock) */
    provider: z.enum(["telnyx", "twilio", "plivo", "mock"]).optional(),
    /** Telnyx-specific configuration */
    telnyx: TelnyxConfigSchema.optional(),
    /** Twilio-specific configuration */
    twilio: TwilioConfigSchema.optional(),
    /** Plivo-specific configuration */
    plivo: PlivoConfigSchema.optional(),
    /** Phone number to call from (E.164); required unless provider is "mock" */
    fromNumber: E164Schema.optional(),
    /** Default phone number to call (E.164) */
    toNumber: E164Schema.optional(),
    /** Inbound call policy */
    inboundPolicy: InboundPolicySchema.default("disabled"),
    /** Allowlist of phone numbers for inbound calls (E.164) */
    allowFrom: z.array(E164Schema).default([]),
    /** Greeting message for inbound calls */
    inboundGreeting: z.string().optional(),
    /** Outbound call configuration */
    outbound: OutboundConfigSchema,
    /** Maximum call duration in seconds */
    maxDurationSeconds: z.number().int().positive().default(300),
    /**
     * Maximum age of a call in seconds before it is automatically reaped.
     * Catches calls stuck in unexpected states (e.g., notify-mode calls that
     * never receive a terminal webhook). Set to 0 to disable.
     * Default: 0 (disabled). Recommended: 120-300 for production.
     */
    staleCallReaperSeconds: z.number().int().nonnegative().default(0),
    /** Silence timeout for end-of-speech detection (ms) */
    silenceTimeoutMs: z.number().int().positive().default(800),
    /** Timeout for user transcript (ms) */
    transcriptTimeoutMs: z.number().int().positive().default(180000),
    /** Ring timeout for outbound calls (ms) */
    ringTimeoutMs: z.number().int().positive().default(30000),
    /** Maximum concurrent calls */
    maxConcurrentCalls: z.number().int().positive().default(1),
    /** Webhook server configuration */
    serve: VoiceCallServeConfigSchema,
    /** Tailscale exposure configuration (legacy, prefer tunnel config) */
    tailscale: VoiceCallTailscaleConfigSchema,
    /** Tunnel configuration (unified ngrok/tailscale) */
    tunnel: VoiceCallTunnelConfigSchema,
    /** Webhook signature reconstruction and proxy trust configuration */
    webhookSecurity: VoiceCallWebhookSecurityConfigSchema,
    /** Real-time audio streaming configuration */
    streaming: VoiceCallStreamingConfigSchema,
    /** Public webhook URL override (if set, bypasses tunnel auto-detection) */
    publicUrl: z.string().url().optional(),
    /** Skip webhook signature verification (development only, NOT for production) */
    skipSignatureVerification: z.boolean().default(false),
    /** STT configuration */
    stt: SttConfigSchema,
    /** TTS override (deep-merges with core messages.tts) */
    tts: TtsConfigSchema,
    /** Store path for call logs */
    store: z.string().optional(),
    /** Model for generating voice responses (e.g., "anthropic/claude-sonnet-4", "openai/gpt-4o") */
    responseModel: z.string().default("openai/gpt-4o-mini"),
    /** System prompt for voice responses */
    responseSystemPrompt: z.string().optional(),
    /** Timeout for response generation in ms (default 30s) */
    responseTimeoutMs: z.number().int().positive().default(30000),
  })
  .strict();
export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
// -----------------------------------------------------------------------------
// Configuration Helpers
// -----------------------------------------------------------------------------
/**
* Resolves the configuration by merging environment variables into missing fields.
* Returns a new configuration object with environment variables applied.
*/
/**
 * Resolves the configuration by merging environment variables into missing fields.
 * Returns a new configuration object with environment variables applied.
 * Explicit config values always win over environment variables.
 */
export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig {
  // Deep-copy via JSON round-trip so env merging never mutates the caller's
  // object. NOTE(review): this drops undefined-valued keys and functions —
  // harmless for this plain-data schema, but confirm if fields change.
  const resolved = JSON.parse(JSON.stringify(config)) as VoiceCallConfig;
  // Telnyx: only consulted when it is the active provider.
  if (resolved.provider === "telnyx") {
    resolved.telnyx = resolved.telnyx ?? {};
    resolved.telnyx.apiKey = resolved.telnyx.apiKey ?? process.env.TELNYX_API_KEY;
    resolved.telnyx.connectionId = resolved.telnyx.connectionId ?? process.env.TELNYX_CONNECTION_ID;
    resolved.telnyx.publicKey = resolved.telnyx.publicKey ?? process.env.TELNYX_PUBLIC_KEY;
  }
  // Twilio
  if (resolved.provider === "twilio") {
    resolved.twilio = resolved.twilio ?? {};
    resolved.twilio.accountSid = resolved.twilio.accountSid ?? process.env.TWILIO_ACCOUNT_SID;
    resolved.twilio.authToken = resolved.twilio.authToken ?? process.env.TWILIO_AUTH_TOKEN;
  }
  // Plivo
  if (resolved.provider === "plivo") {
    resolved.plivo = resolved.plivo ?? {};
    resolved.plivo.authId = resolved.plivo.authId ?? process.env.PLIVO_AUTH_ID;
    resolved.plivo.authToken = resolved.plivo.authToken ?? process.env.PLIVO_AUTH_TOKEN;
  }
  // Tunnel Config: backfill defaults in case the input bypassed schema parsing.
  resolved.tunnel = resolved.tunnel ?? {
    provider: "none",
    allowNgrokFreeTierLoopbackBypass: false,
  };
  resolved.tunnel.allowNgrokFreeTierLoopbackBypass =
    resolved.tunnel.allowNgrokFreeTierLoopbackBypass ?? false;
  resolved.tunnel.ngrokAuthToken = resolved.tunnel.ngrokAuthToken ?? process.env.NGROK_AUTHTOKEN;
  resolved.tunnel.ngrokDomain = resolved.tunnel.ngrokDomain ?? process.env.NGROK_DOMAIN;
  // Webhook Security Config: same defensive backfill.
  resolved.webhookSecurity = resolved.webhookSecurity ?? {
    allowedHosts: [],
    trustForwardingHeaders: false,
    trustedProxyIPs: [],
  };
  resolved.webhookSecurity.allowedHosts = resolved.webhookSecurity.allowedHosts ?? [];
  resolved.webhookSecurity.trustForwardingHeaders =
    resolved.webhookSecurity.trustForwardingHeaders ?? false;
  resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
  return resolved;
}
/**
* Validate that the configuration has all required fields for the selected provider.
*/
/**
 * Validate that the configuration has all required fields for the selected
 * provider. Returns an aggregate result rather than throwing so callers can
 * surface every missing field at once. When the plugin is disabled, nothing
 * is checked and the config is considered valid.
 */
export function validateProviderConfig(config: VoiceCallConfig): {
  valid: boolean;
  errors: string[];
} {
  if (!config.enabled) {
    return { valid: true, errors: [] };
  }
  const errors: string[] = [];
  const prefix = "plugins.entries.voice-call.config";
  if (!config.provider) {
    errors.push(`${prefix}.provider is required`);
  }
  // The mock provider places no real calls, so it needs no fromNumber.
  if (!config.fromNumber && config.provider !== "mock") {
    errors.push(`${prefix}.fromNumber is required`);
  }
  switch (config.provider) {
    case "telnyx": {
      if (!config.telnyx?.apiKey) {
        errors.push(`${prefix}.telnyx.apiKey is required (or set TELNYX_API_KEY env)`);
      }
      if (!config.telnyx?.connectionId) {
        errors.push(`${prefix}.telnyx.connectionId is required (or set TELNYX_CONNECTION_ID env)`);
      }
      // publicKey only matters for webhook signature checks.
      if (!config.skipSignatureVerification && !config.telnyx?.publicKey) {
        errors.push(`${prefix}.telnyx.publicKey is required (or set TELNYX_PUBLIC_KEY env)`);
      }
      break;
    }
    case "twilio": {
      if (!config.twilio?.accountSid) {
        errors.push(`${prefix}.twilio.accountSid is required (or set TWILIO_ACCOUNT_SID env)`);
      }
      if (!config.twilio?.authToken) {
        errors.push(`${prefix}.twilio.authToken is required (or set TWILIO_AUTH_TOKEN env)`);
      }
      break;
    }
    case "plivo": {
      if (!config.plivo?.authId) {
        errors.push(`${prefix}.plivo.authId is required (or set PLIVO_AUTH_ID env)`);
      }
      if (!config.plivo?.authToken) {
        errors.push(`${prefix}.plivo.authToken is required (or set PLIVO_AUTH_TOKEN env)`);
      }
      break;
    }
    default:
      break;
  }
  return { valid: errors.length === 0, errors };
}

View File

@@ -0,0 +1,159 @@
import fs from "node:fs";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";
import type { VoiceCallTtsConfig } from "./config.js";
/**
 * Minimal view of the core OpenClaw configuration object that this plugin
 * reads. Any other keys are preserved by the index signature and passed
 * through to core untouched.
 */
export type CoreConfig = {
  session?: {
    // Session-store location/identifier; resolved by core's resolveStorePath.
    store?: string;
  };
  messages?: {
    // Core-level TTS settings; per the changelog, the plugin's TTS config is
    // deep-merged with this for voice calls.
    tts?: VoiceCallTtsConfig;
  };
  [key: string]: unknown;
};
/**
 * Functions and constants provided by the core `extensionAPI` build artifact
 * (see importCoreExtensionAPI). Typed locally so the plugin can compile
 * without a static build-time dependency on the core package.
 */
type CoreAgentDeps = {
  // Path resolution for an agent's config dir and workspace dir.
  resolveAgentDir: (cfg: CoreConfig, agentId: string) => string;
  resolveAgentWorkspaceDir: (cfg: CoreConfig, agentId: string) => string;
  resolveAgentIdentity: (
    cfg: CoreConfig,
    agentId: string,
  ) => { name?: string | null } | null | undefined;
  resolveThinkingDefault: (params: {
    cfg: CoreConfig;
    provider?: string;
    model?: string;
  }) => string;
  // Runs one embedded agent turn; the voice plugin uses this to answer calls.
  runEmbeddedPiAgent: (params: {
    sessionId: string;
    sessionKey?: string;
    messageProvider?: string;
    sessionFile: string;
    workspaceDir: string;
    config?: CoreConfig;
    prompt: string;
    provider?: string;
    model?: string;
    thinkLevel?: string;
    verboseLevel?: string;
    timeoutMs: number;
    runId: string;
    lane?: string;
    extraSystemPrompt?: string;
    agentDir?: string;
  }) => Promise<{
    payloads?: Array<{ text?: string; isError?: boolean }>;
    meta?: { aborted?: boolean };
  }>;
  resolveAgentTimeoutMs: (opts: { cfg: CoreConfig }) => number;
  ensureAgentWorkspace: (params?: { dir: string }) => Promise<void>;
  // Session-store persistence helpers (load/save plus path resolution).
  resolveStorePath: (store?: string, opts?: { agentId?: string }) => string;
  loadSessionStore: (storePath: string) => Record<string, unknown>;
  saveSessionStore: (storePath: string, store: Record<string, unknown>) => Promise<void>;
  resolveSessionFilePath: (
    sessionId: string,
    entry: unknown,
    opts?: { agentId?: string },
  ) => string;
  DEFAULT_MODEL: string;
  DEFAULT_PROVIDER: string;
};
// Memoized core package root (filled by resolveOpenClawRoot on first call).
let coreRootCache: string | null = null;
// Memoized in-flight/completed import of core deps (see loadCoreAgentDeps).
let coreDepsPromise: Promise<CoreAgentDeps> | null = null;
/**
 * Walk upward from `startDir` looking for a package.json whose "name" field
 * equals `name`. Returns the directory containing that manifest, or null once
 * the filesystem root is reached without a match.
 */
function findPackageRoot(startDir: string, name: string): string | null {
  let current = startDir;
  while (true) {
    const manifestPath = path.join(current, "package.json");
    try {
      if (fs.existsSync(manifestPath)) {
        const manifest = JSON.parse(fs.readFileSync(manifestPath, "utf8")) as { name?: string };
        if (manifest.name === name) {
          return current;
        }
      }
    } catch {
      // Unreadable or malformed package.json: treat as a non-match and keep climbing.
    }
    const parentDir = path.dirname(current);
    if (parentDir === current) {
      // dirname is a fixed point only at the filesystem root.
      return null;
    }
    current = parentDir;
  }
}
/**
 * Locate the root directory of the installed "openclaw" core package.
 *
 * Resolution order: memoized value, OPENCLAW_ROOT env override, then an
 * upward package.json search starting from the entry script's directory, the
 * current working directory, and this module's own directory.
 *
 * @throws When no candidate directory contains the core package.
 */
function resolveOpenClawRoot(): string {
  if (coreRootCache) {
    return coreRootCache;
  }
  // Explicit override always wins and is cached as-is.
  const envRoot = process.env.OPENCLAW_ROOT?.trim();
  if (envRoot) {
    coreRootCache = envRoot;
    return envRoot;
  }
  const startDirs = new Set<string>();
  const entryScript = process.argv[1];
  if (entryScript) {
    startDirs.add(path.dirname(entryScript));
  }
  startDirs.add(process.cwd());
  try {
    startDirs.add(path.dirname(fileURLToPath(import.meta.url)));
  } catch {
    // Not running from a file: URL (e.g. bundled/eval); skip this candidate.
  }
  for (const startDir of startDirs) {
    for (const pkgName of ["openclaw"]) {
      const root = findPackageRoot(startDir, pkgName);
      if (root) {
        coreRootCache = root;
        return root;
      }
    }
  }
  throw new Error("Unable to resolve core root. Set OPENCLAW_ROOT to the package root.");
}
/**
 * Dynamically import the built core extension API from
 * `<core root>/dist/extensionAPI.js`.
 *
 * @throws When the built artifact does not exist (core not built or the
 *   package is not installed).
 */
async function importCoreExtensionAPI(): Promise<{
  resolveAgentDir: CoreAgentDeps["resolveAgentDir"];
  resolveAgentWorkspaceDir: CoreAgentDeps["resolveAgentWorkspaceDir"];
  DEFAULT_MODEL: string;
  DEFAULT_PROVIDER: string;
  resolveAgentIdentity: CoreAgentDeps["resolveAgentIdentity"];
  resolveThinkingDefault: CoreAgentDeps["resolveThinkingDefault"];
  runEmbeddedPiAgent: CoreAgentDeps["runEmbeddedPiAgent"];
  resolveAgentTimeoutMs: CoreAgentDeps["resolveAgentTimeoutMs"];
  ensureAgentWorkspace: CoreAgentDeps["ensureAgentWorkspace"];
  resolveStorePath: CoreAgentDeps["resolveStorePath"];
  loadSessionStore: CoreAgentDeps["loadSessionStore"];
  saveSessionStore: CoreAgentDeps["saveSessionStore"];
  resolveSessionFilePath: CoreAgentDeps["resolveSessionFilePath"];
}> {
  // Intentionally the ONLY coupling to core: a runtime dynamic import of its
  // built artifact by file URL. Do not add static imports of core modules
  // here (presumably to avoid a build-time dependency cycle — confirm before
  // changing).
  const distPath = path.join(resolveOpenClawRoot(), "dist", "extensionAPI.js");
  if (!fs.existsSync(distPath)) {
    throw new Error(
      `Missing core module at ${distPath}. Run \`pnpm build\` or install the official package.`,
    );
  }
  return await import(pathToFileURL(distPath).href);
}
/**
 * Load (and memoize) the core agent dependencies.
 *
 * A successful import is cached for the process lifetime. A failed import
 * clears the cache before re-throwing, so a later call can retry — e.g. after
 * the core package has been built. The original code cached the rejected
 * promise forever, turning one transient failure into a permanent one.
 */
export async function loadCoreAgentDeps(): Promise<CoreAgentDeps> {
  if (!coreDepsPromise) {
    coreDepsPromise = importCoreExtensionAPI().catch((err: unknown) => {
      // Do not cache failures: allow subsequent calls to retry the import.
      coreDepsPromise = null;
      throw err;
    });
  }
  return coreDepsPromise;
}

View File

@@ -0,0 +1,16 @@
import { describe, expect, it } from "vitest";
import { getHeader } from "./http-headers.js";
// getHeader must tolerate Node HTTP header-map quirks: repeated headers
// arrive as arrays, and different stacks report keys with different casing.
describe("getHeader", () => {
  it("returns first value when header is an array", () => {
    expect(getHeader({ "x-test": ["first", "second"] }, "x-test")).toBe("first");
  });
  it("matches headers case-insensitively", () => {
    expect(getHeader({ "X-Twilio-Signature": "sig-1" }, "x-twilio-signature")).toBe("sig-1");
  });
  it("returns undefined for missing header", () => {
    expect(getHeader({ host: "example.com" }, "x-missing")).toBeUndefined();
  });
});

View File

@@ -0,0 +1,12 @@
/** Header map as exposed by Node HTTP APIs: values may be single, repeated (array), or absent. */
export type HttpHeaderMap = Record<string, string | string[] | undefined>;

/**
 * Case-insensitive header lookup.
 *
 * Repeated headers (array values) yield their first entry; absent headers
 * yield undefined. A fast exact-key probe is tried before the linear scan.
 */
export function getHeader(headers: HttpHeaderMap, name: string): string | undefined {
  const wanted = name.toLowerCase();
  let found = headers[wanted];
  if (found == null) {
    // Slow path: the key may be stored with different casing.
    const match = Object.entries(headers).find(([key]) => key.toLowerCase() === wanted);
    found = match?.[1];
  }
  return Array.isArray(found) ? found[0] : found;
}

View File

@@ -0,0 +1,467 @@
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { VoiceCallConfigSchema } from "./config.js";
import { CallManager } from "./manager.js";
import type { VoiceCallProvider } from "./providers/base.js";
import type {
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
PlayTtsInput,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookVerificationResult,
} from "./types.js";
/**
 * In-memory VoiceCallProvider double. Every outbound provider action is
 * recorded on a public array so tests can assert on exactly what the
 * CallManager asked the provider to do.
 */
class FakeProvider implements VoiceCallProvider {
  readonly name: "plivo" | "twilio";
  readonly playTtsCalls: PlayTtsInput[] = [];
  readonly hangupCalls: HangupCallInput[] = [];
  readonly startListeningCalls: StartListeningInput[] = [];
  readonly stopListeningCalls: StopListeningInput[] = [];

  constructor(name: "plivo" | "twilio" = "plivo") {
    this.name = name;
  }

  // Every call is "accepted" with a fixed request UUID, mirroring providers
  // that return a request ID before the real call UUID is known.
  async initiateCall(_input: InitiateCallInput): Promise<InitiateCallResult> {
    return { providerCallId: "request-uuid", status: "initiated" };
  }

  async hangupCall(input: HangupCallInput): Promise<void> {
    this.hangupCalls.push(input);
  }

  async playTts(input: PlayTtsInput): Promise<void> {
    this.playTtsCalls.push(input);
  }

  async startListening(input: StartListeningInput): Promise<void> {
    this.startListeningCalls.push(input);
  }

  async stopListening(input: StopListeningInput): Promise<void> {
    this.stopListeningCalls.push(input);
  }

  // Webhook verification always succeeds for the fake.
  verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult {
    return { ok: true };
  }

  // Fake webhooks never produce events.
  parseWebhookEvent(_ctx: WebhookContext): ProviderWebhookParseResult {
    return { events: [], statusCode: 200 };
  }
}
// Monotonic counter so two paths created in the same millisecond still differ.
let storeSeq = 0;

/** Build a fresh, unique temp-directory path for a CallManager store. */
function createTestStorePath(): string {
  storeSeq += 1;
  const unique = ["openclaw-voice-call-test", Date.now(), storeSeq].join("-");
  return path.join(os.tmpdir(), unique);
}
/**
 * Build a CallManager wired to a FakeProvider and a throwaway store.
 * `configOverrides` are merged over a minimal enabled plivo config.
 */
function createManagerHarness(
  configOverrides: Record<string, unknown> = {},
  provider = new FakeProvider(),
): {
  manager: CallManager;
  provider: FakeProvider;
} {
  const baseConfig = {
    enabled: true,
    provider: "plivo",
    fromNumber: "+15550000000",
  };
  const config = VoiceCallConfigSchema.parse({ ...baseConfig, ...configOverrides });
  const manager = new CallManager(config, createTestStorePath());
  manager.initialize(provider, "https://example.com/voice/webhook");
  return { manager, provider };
}
/** Drive a call into the answered state via a synthetic provider event. */
function markCallAnswered(manager: CallManager, callId: string, eventId: string): void {
  const answeredEvent = {
    id: eventId,
    type: "call.answered" as const,
    callId,
    providerCallId: "request-uuid",
    timestamp: Date.now(),
  };
  manager.processEvent(answeredEvent);
}
// CallManager behavior tests, driven entirely by synthetic NormalizedEvents
// against the FakeProvider — no network, no real telephony.
describe("CallManager", () => {
  it("upgrades providerCallId mapping when provider ID changes", async () => {
    const { manager } = createManagerHarness();
    const { callId, success, error } = await manager.initiateCall("+15550000001");
    expect(success).toBe(true);
    expect(error).toBeUndefined();
    // The provider returned a request UUID as the initial providerCallId.
    expect(manager.getCall(callId)?.providerCallId).toBe("request-uuid");
    expect(manager.getCallByProviderCallId("request-uuid")?.callId).toBe(callId);
    // Provider later reports the actual call UUID.
    manager.processEvent({
      id: "evt-1",
      type: "call.answered",
      callId,
      providerCallId: "call-uuid",
      timestamp: Date.now(),
    });
    expect(manager.getCall(callId)?.providerCallId).toBe("call-uuid");
    expect(manager.getCallByProviderCallId("call-uuid")?.callId).toBe(callId);
    expect(manager.getCallByProviderCallId("request-uuid")).toBeUndefined();
  });
  it("speaks initial message on answered for notify mode (non-Twilio)", async () => {
    const { manager, provider } = createManagerHarness();
    const { callId, success } = await manager.initiateCall("+15550000002", undefined, {
      message: "Hello there",
      mode: "notify",
    });
    expect(success).toBe(true);
    manager.processEvent({
      id: "evt-2",
      type: "call.answered",
      callId,
      providerCallId: "call-uuid",
      timestamp: Date.now(),
    });
    // TTS is kicked off asynchronously; yield a tick before asserting.
    await new Promise((resolve) => setTimeout(resolve, 0));
    expect(provider.playTtsCalls).toHaveLength(1);
    expect(provider.playTtsCalls[0]?.text).toBe("Hello there");
  });
  // Allowlist policy: calls with absent, anonymous, or merely suffix-matching
  // caller IDs must all be rejected with exactly one hangup.
  it("rejects inbound calls with missing caller ID when allowlist enabled", () => {
    const { manager, provider } = createManagerHarness({
      inboundPolicy: "allowlist",
      allowFrom: ["+15550001234"],
    });
    manager.processEvent({
      id: "evt-allowlist-missing",
      type: "call.initiated",
      callId: "call-missing",
      providerCallId: "provider-missing",
      timestamp: Date.now(),
      direction: "inbound",
      to: "+15550000000",
    });
    expect(manager.getCallByProviderCallId("provider-missing")).toBeUndefined();
    expect(provider.hangupCalls).toHaveLength(1);
    expect(provider.hangupCalls[0]?.providerCallId).toBe("provider-missing");
  });
  it("rejects inbound calls with anonymous caller ID when allowlist enabled", () => {
    const { manager, provider } = createManagerHarness({
      inboundPolicy: "allowlist",
      allowFrom: ["+15550001234"],
    });
    manager.processEvent({
      id: "evt-allowlist-anon",
      type: "call.initiated",
      callId: "call-anon",
      providerCallId: "provider-anon",
      timestamp: Date.now(),
      direction: "inbound",
      from: "anonymous",
      to: "+15550000000",
    });
    expect(manager.getCallByProviderCallId("provider-anon")).toBeUndefined();
    expect(provider.hangupCalls).toHaveLength(1);
    expect(provider.hangupCalls[0]?.providerCallId).toBe("provider-anon");
  });
  it("rejects inbound calls that only match allowlist suffixes", () => {
    const { manager, provider } = createManagerHarness({
      inboundPolicy: "allowlist",
      allowFrom: ["+15550001234"],
    });
    manager.processEvent({
      id: "evt-allowlist-suffix",
      type: "call.initiated",
      callId: "call-suffix",
      providerCallId: "provider-suffix",
      timestamp: Date.now(),
      direction: "inbound",
      from: "+99915550001234",
      to: "+15550000000",
    });
    expect(manager.getCallByProviderCallId("provider-suffix")).toBeUndefined();
    expect(provider.hangupCalls).toHaveLength(1);
    expect(provider.hangupCalls[0]?.providerCallId).toBe("provider-suffix");
  });
  it("rejects duplicate inbound events with a single hangup call", () => {
    const { manager, provider } = createManagerHarness({
      inboundPolicy: "disabled",
    });
    manager.processEvent({
      id: "evt-reject-init",
      type: "call.initiated",
      callId: "provider-dup",
      providerCallId: "provider-dup",
      timestamp: Date.now(),
      direction: "inbound",
      from: "+15552222222",
      to: "+15550000000",
    });
    manager.processEvent({
      id: "evt-reject-ring",
      type: "call.ringing",
      callId: "provider-dup",
      providerCallId: "provider-dup",
      timestamp: Date.now(),
      direction: "inbound",
      from: "+15552222222",
      to: "+15550000000",
    });
    expect(manager.getCallByProviderCallId("provider-dup")).toBeUndefined();
    expect(provider.hangupCalls).toHaveLength(1);
    expect(provider.hangupCalls[0]?.providerCallId).toBe("provider-dup");
  });
  it("accepts inbound calls that exactly match the allowlist", () => {
    const { manager } = createManagerHarness({
      inboundPolicy: "allowlist",
      allowFrom: ["+15550001234"],
    });
    manager.processEvent({
      id: "evt-allowlist-exact",
      type: "call.initiated",
      callId: "call-exact",
      providerCallId: "provider-exact",
      timestamp: Date.now(),
      direction: "inbound",
      from: "+15550001234",
      to: "+15550000000",
    });
    expect(manager.getCallByProviderCallId("provider-exact")).toBeDefined();
  });
  // Closed-loop turn: speak a prompt, wait for a final transcript event, and
  // verify transcript accumulation plus latency metadata bookkeeping.
  it("completes a closed-loop turn without live audio", async () => {
    const { manager, provider } = createManagerHarness({
      transcriptTimeoutMs: 5000,
    });
    const started = await manager.initiateCall("+15550000003");
    expect(started.success).toBe(true);
    markCallAnswered(manager, started.callId, "evt-closed-loop-answered");
    const turnPromise = manager.continueCall(started.callId, "How can I help?");
    await new Promise((resolve) => setTimeout(resolve, 0));
    manager.processEvent({
      id: "evt-closed-loop-speech",
      type: "call.speech",
      callId: started.callId,
      providerCallId: "request-uuid",
      timestamp: Date.now(),
      transcript: "Please check status",
      isFinal: true,
    });
    const turn = await turnPromise;
    expect(turn.success).toBe(true);
    expect(turn.transcript).toBe("Please check status");
    expect(provider.startListeningCalls).toHaveLength(1);
    expect(provider.stopListeningCalls).toHaveLength(1);
    const call = manager.getCall(started.callId);
    expect(call?.transcript.map((entry) => entry.text)).toEqual([
      "How can I help?",
      "Please check status",
    ]);
    const metadata = (call?.metadata ?? {}) as Record<string, unknown>;
    expect(typeof metadata.lastTurnLatencyMs).toBe("number");
    expect(typeof metadata.lastTurnListenWaitMs).toBe("number");
    expect(metadata.turnCount).toBe(1);
  });
  it("rejects overlapping continueCall requests for the same call", async () => {
    const { manager, provider } = createManagerHarness({
      transcriptTimeoutMs: 5000,
    });
    const started = await manager.initiateCall("+15550000004");
    expect(started.success).toBe(true);
    markCallAnswered(manager, started.callId, "evt-overlap-answered");
    const first = manager.continueCall(started.callId, "First prompt");
    const second = await manager.continueCall(started.callId, "Second prompt");
    expect(second.success).toBe(false);
    expect(second.error).toBe("Already waiting for transcript");
    manager.processEvent({
      id: "evt-overlap-speech",
      type: "call.speech",
      callId: started.callId,
      providerCallId: "request-uuid",
      timestamp: Date.now(),
      transcript: "Done",
      isFinal: true,
    });
    const firstResult = await first;
    expect(firstResult.success).toBe(true);
    expect(firstResult.transcript).toBe("Done");
    expect(provider.startListeningCalls).toHaveLength(1);
    expect(provider.stopListeningCalls).toHaveLength(1);
  });
  it("ignores speech events with mismatched turnToken while waiting for transcript", async () => {
    const { manager, provider } = createManagerHarness(
      {
        transcriptTimeoutMs: 5000,
      },
      new FakeProvider("twilio"),
    );
    const started = await manager.initiateCall("+15550000004");
    expect(started.success).toBe(true);
    markCallAnswered(manager, started.callId, "evt-turn-token-answered");
    const turnPromise = manager.continueCall(started.callId, "Prompt");
    await new Promise((resolve) => setTimeout(resolve, 0));
    const expectedTurnToken = provider.startListeningCalls[0]?.turnToken;
    expect(typeof expectedTurnToken).toBe("string");
    // A stale replay with the wrong token must NOT resolve the waiter.
    manager.processEvent({
      id: "evt-turn-token-bad",
      type: "call.speech",
      callId: started.callId,
      providerCallId: "request-uuid",
      timestamp: Date.now(),
      transcript: "stale replay",
      isFinal: true,
      turnToken: "wrong-token",
    });
    const pendingState = await Promise.race([
      turnPromise.then(() => "resolved"),
      new Promise<"pending">((resolve) => setTimeout(() => resolve("pending"), 0)),
    ]);
    expect(pendingState).toBe("pending");
    manager.processEvent({
      id: "evt-turn-token-good",
      type: "call.speech",
      callId: started.callId,
      providerCallId: "request-uuid",
      timestamp: Date.now(),
      transcript: "final answer",
      isFinal: true,
      turnToken: expectedTurnToken,
    });
    const turnResult = await turnPromise;
    expect(turnResult.success).toBe(true);
    expect(turnResult.transcript).toBe("final answer");
    const call = manager.getCall(started.callId);
    expect(call?.transcript.map((entry) => entry.text)).toEqual(["Prompt", "final answer"]);
  });
  it("tracks latency metadata across multiple closed-loop turns", async () => {
    const { manager, provider } = createManagerHarness({
      transcriptTimeoutMs: 5000,
    });
    const started = await manager.initiateCall("+15550000005");
    expect(started.success).toBe(true);
    markCallAnswered(manager, started.callId, "evt-multi-answered");
    const firstTurn = manager.continueCall(started.callId, "First question");
    await new Promise((resolve) => setTimeout(resolve, 0));
    manager.processEvent({
      id: "evt-multi-speech-1",
      type: "call.speech",
      callId: started.callId,
      providerCallId: "request-uuid",
      timestamp: Date.now(),
      transcript: "First answer",
      isFinal: true,
    });
    await firstTurn;
    const secondTurn = manager.continueCall(started.callId, "Second question");
    await new Promise((resolve) => setTimeout(resolve, 0));
    manager.processEvent({
      id: "evt-multi-speech-2",
      type: "call.speech",
      callId: started.callId,
      providerCallId: "request-uuid",
      timestamp: Date.now(),
      transcript: "Second answer",
      isFinal: true,
    });
    const secondResult = await secondTurn;
    expect(secondResult.success).toBe(true);
    const call = manager.getCall(started.callId);
    expect(call?.transcript.map((entry) => entry.text)).toEqual([
      "First question",
      "First answer",
      "Second question",
      "Second answer",
    ]);
    const metadata = (call?.metadata ?? {}) as Record<string, unknown>;
    expect(metadata.turnCount).toBe(2);
    expect(typeof metadata.lastTurnLatencyMs).toBe("number");
    expect(typeof metadata.lastTurnListenWaitMs).toBe("number");
    expect(provider.startListeningCalls).toHaveLength(2);
    expect(provider.stopListeningCalls).toHaveLength(2);
  });
  it("handles repeated closed-loop turns without waiter churn", async () => {
    const { manager, provider } = createManagerHarness({
      transcriptTimeoutMs: 5000,
    });
    const started = await manager.initiateCall("+15550000006");
    expect(started.success).toBe(true);
    markCallAnswered(manager, started.callId, "evt-loop-answered");
    for (let i = 1; i <= 5; i++) {
      const turnPromise = manager.continueCall(started.callId, `Prompt ${i}`);
      await new Promise((resolve) => setTimeout(resolve, 0));
      manager.processEvent({
        id: `evt-loop-speech-${i}`,
        type: "call.speech",
        callId: started.callId,
        providerCallId: "request-uuid",
        timestamp: Date.now(),
        transcript: `Answer ${i}`,
        isFinal: true,
      });
      const result = await turnPromise;
      expect(result.success).toBe(true);
      expect(result.transcript).toBe(`Answer ${i}`);
    }
    const call = manager.getCall(started.callId);
    const metadata = (call?.metadata ?? {}) as Record<string, unknown>;
    expect(metadata.turnCount).toBe(5);
    expect(provider.startListeningCalls).toHaveLength(5);
    expect(provider.stopListeningCalls).toHaveLength(5);
  });
});

View File

@@ -0,0 +1,209 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import type { VoiceCallConfig } from "./config.js";
import type { CallManagerContext } from "./manager/context.js";
import { processEvent as processManagerEvent } from "./manager/events.js";
import { getCallByProviderCallId as getCallByProviderCallIdFromMaps } from "./manager/lookup.js";
import {
continueCall as continueCallWithContext,
endCall as endCallWithContext,
initiateCall as initiateCallWithContext,
speak as speakWithContext,
speakInitialMessage as speakInitialMessageWithContext,
} from "./manager/outbound.js";
import { getCallHistoryFromStore, loadActiveCallsFromStore } from "./manager/store.js";
import type { VoiceCallProvider } from "./providers/base.js";
import type { CallId, CallRecord, NormalizedEvent, OutboundCallOptions } from "./types.js";
import { resolveUserPath } from "./utils.js";
/**
 * Decide where call records are persisted. An explicit override (constructor
 * argument or config.store) wins; otherwise the default
 * ~/.openclaw/voice-calls directory is used, preferring a candidate that
 * already exists or contains data.
 */
function resolveDefaultStoreBase(config: VoiceCallConfig, storePath?: string): string {
  const override = storePath?.trim() || config.store?.trim();
  if (override) {
    return resolveUserPath(override);
  }
  const preferred = path.join(os.homedir(), ".openclaw", "voice-calls");
  const candidates = [preferred].map((dir) => resolveUserPath(dir));
  for (const dir of candidates) {
    try {
      // Prefer a directory that already holds call data (or at least exists).
      if (fs.existsSync(path.join(dir, "calls.jsonl")) || fs.existsSync(dir)) {
        return dir;
      }
    } catch {
      // Treat unreadable candidates as missing and keep looking.
    }
  }
  return resolveUserPath(preferred);
}
/**
* Manages voice calls: state ownership and delegation to manager helper modules.
*/
/**
 * Manages voice calls: owns all runtime state (active calls, ID maps, dedupe
 * sets, waiters, timers) and delegates the actual behavior to the functional
 * helper modules under ./manager/*, handing them a snapshot context built by
 * getContext().
 */
export class CallManager {
  // Live calls keyed by our internal call ID.
  private activeCalls = new Map<CallId, CallRecord>();
  // Provider-side ID (e.g. Twilio CallSid) -> internal call ID.
  private providerCallIdMap = new Map<string, CallId>();
  // Event IDs / dedupe keys already handled; prevents double-processing webhooks.
  private processedEventIds = new Set<string>();
  // Provider call IDs already rejected with a hangup; avoids duplicate hangups.
  private rejectedProviderCallIds = new Set<string>();
  private provider: VoiceCallProvider | null = null;
  private config: VoiceCallConfig;
  private storePath: string;
  private webhookUrl: string | null = null;
  // Calls currently inside a continueCall turn (guards against overlap).
  private activeTurnCalls = new Set<CallId>();
  // Per-call pending transcript waiters created by continueCall.
  private transcriptWaiters = new Map<
    CallId,
    {
      resolve: (text: string) => void;
      reject: (err: Error) => void;
      timeout: NodeJS.Timeout;
    }
  >();
  // Per-call timers that force-end calls exceeding the max duration.
  private maxDurationTimers = new Map<CallId, NodeJS.Timeout>();
  constructor(config: VoiceCallConfig, storePath?: string) {
    this.config = config;
    this.storePath = resolveDefaultStoreBase(config, storePath);
  }
  /**
   * Initialize the call manager with a provider.
   * Creates the store directory and rehydrates persisted state (active calls,
   * ID maps, processed/rejected sets) from disk.
   */
  initialize(provider: VoiceCallProvider, webhookUrl: string): void {
    this.provider = provider;
    this.webhookUrl = webhookUrl;
    fs.mkdirSync(this.storePath, { recursive: true });
    const persisted = loadActiveCallsFromStore(this.storePath);
    this.activeCalls = persisted.activeCalls;
    this.providerCallIdMap = persisted.providerCallIdMap;
    this.processedEventIds = persisted.processedEventIds;
    this.rejectedProviderCallIds = persisted.rejectedProviderCallIds;
  }
  /**
   * Get the current provider (null until initialize() has been called).
   */
  getProvider(): VoiceCallProvider | null {
    return this.provider;
  }
  /**
   * Initiate an outbound call.
   * `options` may be an OutboundCallOptions object or a plain message string.
   */
  async initiateCall(
    to: string,
    sessionKey?: string,
    options?: OutboundCallOptions | string,
  ): Promise<{ callId: CallId; success: boolean; error?: string }> {
    return initiateCallWithContext(this.getContext(), to, sessionKey, options);
  }
  /**
   * Speak to user in an active call.
   */
  async speak(callId: CallId, text: string): Promise<{ success: boolean; error?: string }> {
    return speakWithContext(this.getContext(), callId, text);
  }
  /**
   * Speak the initial message for a call (called when media stream connects).
   */
  async speakInitialMessage(providerCallId: string): Promise<void> {
    return speakInitialMessageWithContext(this.getContext(), providerCallId);
  }
  /**
   * Continue call: speak prompt, then wait for user's final transcript.
   */
  async continueCall(
    callId: CallId,
    prompt: string,
  ): Promise<{ success: boolean; transcript?: string; error?: string }> {
    return continueCallWithContext(this.getContext(), callId, prompt);
  }
  /**
   * End an active call.
   */
  async endCall(callId: CallId): Promise<{ success: boolean; error?: string }> {
    return endCallWithContext(this.getContext(), callId);
  }
  // Build a fresh context snapshot for the helper modules. The maps/sets are
  // shared by reference, so helpers mutate the manager's own state.
  private getContext(): CallManagerContext {
    return {
      activeCalls: this.activeCalls,
      providerCallIdMap: this.providerCallIdMap,
      processedEventIds: this.processedEventIds,
      rejectedProviderCallIds: this.rejectedProviderCallIds,
      provider: this.provider,
      config: this.config,
      storePath: this.storePath,
      webhookUrl: this.webhookUrl,
      activeTurnCalls: this.activeTurnCalls,
      transcriptWaiters: this.transcriptWaiters,
      maxDurationTimers: this.maxDurationTimers,
      onCallAnswered: (call) => {
        this.maybeSpeakInitialMessageOnAnswered(call);
      },
    };
  }
  /**
   * Process a webhook event.
   */
  processEvent(event: NormalizedEvent): void {
    processManagerEvent(this.getContext(), event);
  }
  // Fire-and-forget the configured initial message once a call is answered,
  // for non-Twilio providers only.
  private maybeSpeakInitialMessageOnAnswered(call: CallRecord): void {
    const initialMessage =
      typeof call.metadata?.initialMessage === "string" ? call.metadata.initialMessage.trim() : "";
    if (!initialMessage) {
      return;
    }
    if (!this.provider || !call.providerCallId) {
      return;
    }
    // Twilio has provider-specific state for speaking (<Say> fallback) and can
    // fail for inbound calls; keep existing Twilio behavior unchanged.
    if (this.provider.name === "twilio") {
      return;
    }
    void this.speakInitialMessage(call.providerCallId);
  }
  /**
   * Get an active call by ID.
   */
  getCall(callId: CallId): CallRecord | undefined {
    return this.activeCalls.get(callId);
  }
  /**
   * Get an active call by provider call ID (e.g., Twilio CallSid).
   */
  getCallByProviderCallId(providerCallId: string): CallRecord | undefined {
    return getCallByProviderCallIdFromMaps({
      activeCalls: this.activeCalls,
      providerCallIdMap: this.providerCallIdMap,
      providerCallId,
    });
  }
  /**
   * Get all active calls.
   */
  getActiveCalls(): CallRecord[] {
    return Array.from(this.activeCalls.values());
  }
  /**
   * Get call history (from persisted logs).
   */
  async getCallHistory(limit = 50): Promise<CallRecord[]> {
    return getCallHistoryFromStore(this.storePath, limit);
  }
}

View File

@@ -0,0 +1,41 @@
import type { VoiceCallConfig } from "../config.js";
import type { VoiceCallProvider } from "../providers/base.js";
import type { CallId, CallRecord } from "../types.js";
/** A pending continueCall turn: resolved/rejected by transcript events, bounded by a timeout. */
export type TranscriptWaiter = {
  resolve: (text: string) => void;
  reject: (err: Error) => void;
  timeout: NodeJS.Timeout;
  // Token identifying the turn; speech events with a different token are ignored.
  turnToken?: string;
};
/** Persistent call state shared (by reference) between CallManager and its helper modules. */
export type CallManagerRuntimeState = {
  activeCalls: Map<CallId, CallRecord>;
  providerCallIdMap: Map<string, CallId>;
  processedEventIds: Set<string>;
  /** Provider call IDs we already sent a reject hangup for; avoids duplicate hangup calls. */
  rejectedProviderCallIds: Set<string>;
};
/** Immutable-per-snapshot dependencies handed to helper modules. */
export type CallManagerRuntimeDeps = {
  provider: VoiceCallProvider | null;
  config: VoiceCallConfig;
  storePath: string;
  webhookUrl: string | null;
};
/** In-memory-only state (timers/waiters) that is never persisted to the store. */
export type CallManagerTransientState = {
  activeTurnCalls: Set<CallId>;
  transcriptWaiters: Map<CallId, TranscriptWaiter>;
  maxDurationTimers: Map<CallId, NodeJS.Timeout>;
};
export type CallManagerHooks = {
  /** Optional runtime hook invoked after an event transitions a call into answered state. */
  onCallAnswered?: (call: CallRecord) => void;
};
/** Full context consumed by the functional manager helpers (events/outbound/timers/...). */
export type CallManagerContext = CallManagerRuntimeState &
  CallManagerRuntimeDeps &
  CallManagerTransientState &
  CallManagerHooks;

View File

@@ -0,0 +1,282 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { VoiceCallConfigSchema } from "../config.js";
import type { VoiceCallProvider } from "../providers/base.js";
import type { HangupCallInput, NormalizedEvent } from "../types.js";
import type { CallManagerContext } from "./context.js";
import { processEvent } from "./events.js";
/** Build a minimal CallManagerContext for driving processEvent directly in tests. */
function createContext(overrides: Partial<CallManagerContext> = {}): CallManagerContext {
  const storePath = path.join(os.tmpdir(), `openclaw-voice-call-events-test-${Date.now()}`);
  fs.mkdirSync(storePath, { recursive: true });
  const base: CallManagerContext = {
    activeCalls: new Map(),
    providerCallIdMap: new Map(),
    processedEventIds: new Set(),
    rejectedProviderCallIds: new Set(),
    provider: null,
    config: VoiceCallConfigSchema.parse({
      enabled: true,
      provider: "plivo",
      fromNumber: "+15550000000",
    }),
    storePath,
    webhookUrl: null,
    activeTurnCalls: new Set(),
    transcriptWaiters: new Map(),
    maxDurationTimers: new Map(),
  };
  return { ...base, ...overrides };
}
/** Stub VoiceCallProvider whose hooks can be selectively overridden per test. */
function createProvider(overrides: Partial<VoiceCallProvider> = {}): VoiceCallProvider {
  const defaults: VoiceCallProvider = {
    name: "plivo",
    verifyWebhook: () => ({ ok: true }),
    parseWebhookEvent: () => ({ events: [] }),
    initiateCall: async () => ({ providerCallId: "provider-call-id", status: "initiated" }),
    hangupCall: async () => {},
    playTts: async () => {},
    startListening: async () => {},
    stopListening: async () => {},
  };
  return { ...defaults, ...overrides };
}
/** Parse a minimal config whose inbound policy rejects every incoming call. */
function createInboundDisabledConfig() {
  const raw = {
    enabled: true,
    provider: "plivo",
    fromNumber: "+15550000000",
    inboundPolicy: "disabled",
  };
  return VoiceCallConfigSchema.parse(raw);
}
/** Synthesize an inbound call.initiated event targeting the plugin's own number. */
function createInboundInitiatedEvent(params: {
  id: string;
  providerCallId: string;
  from: string;
}): NormalizedEvent {
  const { id, providerCallId, from } = params;
  return {
    id,
    type: "call.initiated",
    callId: providerCallId,
    providerCallId,
    timestamp: Date.now(),
    direction: "inbound",
    from,
    to: "+15550000000",
  };
}
/**
 * Build a context whose config rejects all inbound calls, with a provider
 * that records every hangup it is asked to perform.
 */
function createRejectingInboundContext(): {
  ctx: CallManagerContext;
  hangupCalls: HangupCallInput[];
} {
  const hangupCalls: HangupCallInput[] = [];
  const recordingProvider = createProvider({
    hangupCall: async (input: HangupCallInput): Promise<void> => {
      hangupCalls.push(input);
    },
  });
  const ctx = createContext({
    config: createInboundDisabledConfig(),
    provider: recordingProvider,
  });
  return { ctx, hangupCalls };
}
// Tests for the functional processEvent helper, driven through hand-built
// CallManagerContext objects (no CallManager instance involved).
describe("processEvent (functional)", () => {
  it("calls provider hangup when rejecting inbound call", () => {
    const { ctx, hangupCalls } = createRejectingInboundContext();
    const event = createInboundInitiatedEvent({
      id: "evt-1",
      providerCallId: "prov-1",
      from: "+15559999999",
    });
    processEvent(ctx, event);
    expect(ctx.activeCalls.size).toBe(0);
    expect(hangupCalls).toHaveLength(1);
    expect(hangupCalls[0]).toEqual({
      callId: "prov-1",
      providerCallId: "prov-1",
      reason: "hangup-bot",
    });
  });
  it("does not call hangup when provider is null", () => {
    const ctx = createContext({
      config: createInboundDisabledConfig(),
      provider: null,
    });
    const event = createInboundInitiatedEvent({
      id: "evt-2",
      providerCallId: "prov-2",
      from: "+15551111111",
    });
    processEvent(ctx, event);
    expect(ctx.activeCalls.size).toBe(0);
  });
  it("calls hangup only once for duplicate events for same rejected call", () => {
    const { ctx, hangupCalls } = createRejectingInboundContext();
    const event1 = createInboundInitiatedEvent({
      id: "evt-init",
      providerCallId: "prov-dup",
      from: "+15552222222",
    });
    // Same call, later lifecycle stage — must not trigger a second hangup.
    const event2: NormalizedEvent = {
      id: "evt-ring",
      type: "call.ringing",
      callId: "prov-dup",
      providerCallId: "prov-dup",
      timestamp: Date.now(),
      direction: "inbound",
      from: "+15552222222",
      to: "+15550000000",
    };
    processEvent(ctx, event1);
    processEvent(ctx, event2);
    expect(ctx.activeCalls.size).toBe(0);
    expect(hangupCalls).toHaveLength(1);
    expect(hangupCalls[0]?.providerCallId).toBe("prov-dup");
  });
  it("updates providerCallId map when provider ID changes", () => {
    const now = Date.now();
    const ctx = createContext();
    ctx.activeCalls.set("call-1", {
      callId: "call-1",
      providerCallId: "request-uuid",
      provider: "plivo",
      direction: "outbound",
      state: "initiated",
      from: "+15550000000",
      to: "+15550000001",
      startedAt: now,
      transcript: [],
      processedEventIds: [],
      metadata: {},
    });
    ctx.providerCallIdMap.set("request-uuid", "call-1");
    processEvent(ctx, {
      id: "evt-provider-id-change",
      type: "call.answered",
      callId: "call-1",
      providerCallId: "call-uuid",
      timestamp: now + 1,
    });
    // Old mapping is dropped, new provider ID points at the same call.
    expect(ctx.activeCalls.get("call-1")?.providerCallId).toBe("call-uuid");
    expect(ctx.providerCallIdMap.get("call-uuid")).toBe("call-1");
    expect(ctx.providerCallIdMap.has("request-uuid")).toBe(false);
  });
  it("invokes onCallAnswered hook for answered events", () => {
    const now = Date.now();
    let answeredCallId: string | null = null;
    const ctx = createContext({
      onCallAnswered: (call) => {
        answeredCallId = call.callId;
      },
    });
    ctx.activeCalls.set("call-2", {
      callId: "call-2",
      providerCallId: "call-2-provider",
      provider: "plivo",
      direction: "inbound",
      state: "ringing",
      from: "+15550000002",
      to: "+15550000000",
      startedAt: now,
      transcript: [],
      processedEventIds: [],
      metadata: {},
    });
    ctx.providerCallIdMap.set("call-2-provider", "call-2");
    processEvent(ctx, {
      id: "evt-answered-hook",
      type: "call.answered",
      callId: "call-2",
      providerCallId: "call-2-provider",
      timestamp: now + 1,
    });
    expect(answeredCallId).toBe("call-2");
  });
  it("when hangup throws, logs and does not throw", () => {
    const provider = createProvider({
      hangupCall: async (): Promise<void> => {
        throw new Error("provider down");
      },
    });
    const ctx = createContext({
      config: createInboundDisabledConfig(),
      provider,
    });
    const event = createInboundInitiatedEvent({
      id: "evt-fail",
      providerCallId: "prov-fail",
      from: "+15553333333",
    });
    // Provider failures during rejection must not propagate to the caller.
    expect(() => processEvent(ctx, event)).not.toThrow();
    expect(ctx.activeCalls.size).toBe(0);
  });
  it("deduplicates by dedupeKey even when event IDs differ", () => {
    const now = Date.now();
    const ctx = createContext();
    ctx.activeCalls.set("call-dedupe", {
      callId: "call-dedupe",
      providerCallId: "provider-dedupe",
      provider: "plivo",
      direction: "outbound",
      state: "answered",
      from: "+15550000000",
      to: "+15550000001",
      startedAt: now,
      transcript: [],
      processedEventIds: [],
      metadata: {},
    });
    ctx.providerCallIdMap.set("provider-dedupe", "call-dedupe");
    processEvent(ctx, {
      id: "evt-1",
      dedupeKey: "stable-key-1",
      type: "call.speech",
      callId: "call-dedupe",
      providerCallId: "provider-dedupe",
      timestamp: now + 1,
      transcript: "hello",
      isFinal: true,
    });
    // Second event has a new ID but the same dedupeKey: must be ignored.
    processEvent(ctx, {
      id: "evt-2",
      dedupeKey: "stable-key-1",
      type: "call.speech",
      callId: "call-dedupe",
      providerCallId: "provider-dedupe",
      timestamp: now + 2,
      transcript: "hello",
      isFinal: true,
    });
    const call = ctx.activeCalls.get("call-dedupe");
    expect(call?.transcript).toHaveLength(1);
    expect(Array.from(ctx.processedEventIds)).toEqual(["stable-key-1"]);
  });
});

View File

@@ -0,0 +1,242 @@
import crypto from "node:crypto";
import { isAllowlistedCaller, normalizePhoneNumber } from "../allowlist.js";
import type { CallRecord, CallState, NormalizedEvent } from "../types.js";
import type { CallManagerContext } from "./context.js";
import { findCall } from "./lookup.js";
import { endCall } from "./outbound.js";
import { addTranscriptEntry, transitionState } from "./state.js";
import { persistCallRecord } from "./store.js";
import {
clearMaxDurationTimer,
rejectTranscriptWaiter,
resolveTranscriptWaiter,
startMaxDurationTimer,
} from "./timers.js";
/**
 * Narrowed view of CallManagerContext used by webhook/event processing.
 * Declaring only the fields that are touched keeps tests small and makes
 * the dependencies of event handling explicit.
 */
type EventContext = Pick<
  CallManagerContext,
  | "activeCalls"
  | "providerCallIdMap"
  | "processedEventIds"
  | "rejectedProviderCallIds"
  | "provider"
  | "config"
  | "storePath"
  | "transcriptWaiters"
  | "maxDurationTimers"
  | "onCallAnswered"
>;
/**
 * Decide whether an inbound call should be accepted under the configured
 * inbound policy ("disabled" | "open" | "allowlist" | "pairing").
 * Allowlist-style policies require a caller ID that normalizes successfully
 * and matches `allowFrom`; every decision is logged for observability.
 * Unknown policy values fail closed.
 */
function shouldAcceptInbound(config: EventContext["config"], from: string | undefined): boolean {
  const policy = config.inboundPolicy;
  if (policy === "disabled") {
    console.log("[voice-call] Inbound call rejected: policy is disabled");
    return false;
  }
  if (policy === "open") {
    console.log("[voice-call] Inbound call accepted: policy is open");
    return true;
  }
  if (policy === "allowlist" || policy === "pairing") {
    const normalized = normalizePhoneNumber(from);
    if (!normalized) {
      console.log("[voice-call] Inbound call rejected: missing caller ID");
      return false;
    }
    const allowed = isAllowlistedCaller(normalized, config.allowFrom);
    const status = allowed ? "accepted" : "rejected";
    console.log(
      `[voice-call] Inbound call ${status}: ${from} ${allowed ? "is in" : "not in"} allowlist`,
    );
    return allowed;
  }
  return false;
}
/**
 * Register a brand-new inbound call: build the CallRecord (state "ringing"),
 * index it under both the internal and provider call IDs, and persist it.
 * The configured inbound greeting (or a default) is stashed in
 * metadata.initialMessage so it can be spoken once media connects.
 */
function createInboundCall(params: {
  ctx: EventContext;
  providerCallId: string;
  from: string;
  to: string;
}): CallRecord {
  const { ctx, providerCallId, from, to } = params;
  const greeting = ctx.config.inboundGreeting || "Hello! How can I help you today?";
  const record: CallRecord = {
    callId: crypto.randomUUID(),
    providerCallId,
    provider: ctx.provider?.name || "twilio",
    direction: "inbound",
    state: "ringing",
    from,
    to,
    startedAt: Date.now(),
    transcript: [],
    processedEventIds: [],
    metadata: { initialMessage: greeting },
  };
  ctx.activeCalls.set(record.callId, record);
  ctx.providerCallIdMap.set(providerCallId, record.callId);
  persistCallRecord(ctx.storePath, record);
  console.log(`[voice-call] Created inbound call record: ${record.callId} from ${from}`);
  return record;
}
/**
 * Central event reducer: applies one normalized provider event to in-memory
 * call state.
 *
 * Order of operations:
 *  1. Deduplicate by `dedupeKey` (falls back to the event id).
 *  2. Resolve the call by internal or provider call ID.
 *  3. Unknown inbound calls are either rejected per policy (best-effort
 *     hangup) or get a fresh call record.
 *  4. Re-index when the provider changes its call ID mid-call.
 *  5. Apply the per-event-type transition and persist the record.
 */
export function processEvent(ctx: EventContext, event: NormalizedEvent): void {
  const dedupeKey = event.dedupeKey || event.id;
  if (ctx.processedEventIds.has(dedupeKey)) {
    return;
  }
  ctx.processedEventIds.add(dedupeKey);
  let call = findCall({
    activeCalls: ctx.activeCalls,
    providerCallIdMap: ctx.providerCallIdMap,
    callIdOrProviderCallId: event.callId,
  });
  if (!call && event.direction === "inbound" && event.providerCallId) {
    if (!shouldAcceptInbound(ctx.config, event.from)) {
      const pid = event.providerCallId;
      if (!ctx.provider) {
        // Nothing we can do without a provider handle; the provider side
        // will eventually time the call out.
        console.warn(
          `[voice-call] Inbound call rejected by policy but no provider to hang up (providerCallId: ${pid}, from: ${event.from}); call will time out on provider side.`,
        );
        return;
      }
      // Only issue one hangup per provider call, however many events arrive.
      if (ctx.rejectedProviderCallIds.has(pid)) {
        return;
      }
      ctx.rejectedProviderCallIds.add(pid);
      const callId = event.callId ?? pid;
      console.log(`[voice-call] Rejecting inbound call by policy: ${pid}`);
      void ctx.provider
        .hangupCall({
          callId,
          providerCallId: pid,
          reason: "hangup-bot",
        })
        .catch((err) => {
          // Best-effort: a failed hangup is logged, never thrown.
          const message = err instanceof Error ? err.message : String(err);
          console.warn(`[voice-call] Failed to reject inbound call ${pid}:`, message);
        });
      return;
    }
    call = createInboundCall({
      ctx,
      providerCallId: event.providerCallId,
      from: event.from || "unknown",
      to: event.to || ctx.config.fromNumber || "unknown",
    });
    // Normalize event to internal ID for downstream consumers.
    event.callId = call.callId;
  }
  if (!call) {
    return;
  }
  // Provider re-identified the call (e.g. request UUID -> call UUID):
  // re-point the index and drop the stale mapping if we still own it.
  if (event.providerCallId && event.providerCallId !== call.providerCallId) {
    const previousProviderCallId = call.providerCallId;
    call.providerCallId = event.providerCallId;
    ctx.providerCallIdMap.set(event.providerCallId, call.callId);
    if (previousProviderCallId) {
      const mapped = ctx.providerCallIdMap.get(previousProviderCallId);
      if (mapped === call.callId) {
        ctx.providerCallIdMap.delete(previousProviderCallId);
      }
    }
  }
  call.processedEventIds.push(dedupeKey);
  switch (event.type) {
    case "call.initiated":
      transitionState(call, "initiated");
      break;
    case "call.ringing":
      transitionState(call, "ringing");
      break;
    case "call.answered":
      call.answeredAt = event.timestamp;
      transitionState(call, "answered");
      // The max-duration watchdog starts counting from answer, not dial.
      startMaxDurationTimer({
        ctx,
        callId: call.callId,
        onTimeout: async (callId) => {
          await endCall(ctx, callId);
        },
      });
      ctx.onCallAnswered?.(call);
      break;
    case "call.active":
      transitionState(call, "active");
      break;
    case "call.speaking":
      transitionState(call, "speaking");
      break;
    case "call.speech":
      if (event.isFinal) {
        const hadWaiter = ctx.transcriptWaiters.has(call.callId);
        const resolved = resolveTranscriptWaiter(
          ctx,
          call.callId,
          event.transcript,
          event.turnToken,
        );
        // A waiter that did not resolve means the turn token mismatched:
        // this is stale speech from an earlier turn, so drop it entirely.
        if (hadWaiter && !resolved) {
          console.warn(
            `[voice-call] Ignoring speech event with mismatched turn token for ${call.callId}`,
          );
          break;
        }
        addTranscriptEntry(call, "user", event.transcript);
      }
      transitionState(call, "listening");
      break;
    case "call.ended":
      call.endedAt = event.timestamp;
      call.endReason = event.reason;
      // The end reason doubles as the terminal state name.
      transitionState(call, event.reason as CallState);
      clearMaxDurationTimer(ctx, call.callId);
      rejectTranscriptWaiter(ctx, call.callId, `Call ended: ${event.reason}`);
      ctx.activeCalls.delete(call.callId);
      if (call.providerCallId) {
        ctx.providerCallIdMap.delete(call.providerCallId);
      }
      break;
    case "call.error":
      // Retryable errors keep the call alive; fatal ones tear it down.
      if (!event.retryable) {
        call.endedAt = event.timestamp;
        call.endReason = "error";
        transitionState(call, "error");
        clearMaxDurationTimer(ctx, call.callId);
        rejectTranscriptWaiter(ctx, call.callId, `Call error: ${event.error}`);
        ctx.activeCalls.delete(call.callId);
        if (call.providerCallId) {
          ctx.providerCallIdMap.delete(call.providerCallId);
        }
      }
      break;
  }
  persistCallRecord(ctx.storePath, call);
}

View File

@@ -0,0 +1,35 @@
import type { CallId, CallRecord } from "../types.js";
/**
 * Resolve an active call by its provider-assigned call ID.
 * Fast path: the providerCallId -> callId index. Fallback: linear scan of
 * the active calls (covers records whose index entry is missing).
 * Returns undefined when nothing matches.
 */
export function getCallByProviderCallId(params: {
  activeCalls: Map<CallId, CallRecord>;
  providerCallIdMap: Map<string, CallId>;
  providerCallId: string;
}): CallRecord | undefined {
  const { activeCalls, providerCallIdMap, providerCallId } = params;
  const indexed = providerCallIdMap.get(providerCallId);
  if (indexed !== undefined) {
    return activeCalls.get(indexed);
  }
  let match: CallRecord | undefined;
  for (const candidate of activeCalls.values()) {
    if (candidate.providerCallId === providerCallId) {
      match = candidate;
      break;
    }
  }
  return match;
}
/**
 * Find an active call when the caller has either the internal call ID or
 * the provider call ID. Internal ID wins; otherwise falls back to the
 * provider-ID lookup.
 */
export function findCall(params: {
  activeCalls: Map<CallId, CallRecord>;
  providerCallIdMap: Map<string, CallId>;
  callIdOrProviderCallId: string;
}): CallRecord | undefined {
  const byInternalId = params.activeCalls.get(params.callIdOrProviderCallId);
  return (
    byInternalId ??
    getCallByProviderCallId({
      activeCalls: params.activeCalls,
      providerCallIdMap: params.providerCallIdMap,
      providerCallId: params.callIdOrProviderCallId,
    })
  );
}

View File

@@ -0,0 +1,380 @@
import crypto from "node:crypto";
import type { CallMode } from "../config.js";
import {
TerminalStates,
type CallId,
type CallRecord,
type OutboundCallOptions,
} from "../types.js";
import { mapVoiceToPolly } from "../voice-mapping.js";
import type { CallManagerContext } from "./context.js";
import { getCallByProviderCallId } from "./lookup.js";
import { addTranscriptEntry, transitionState } from "./state.js";
import { persistCallRecord } from "./store.js";
import {
clearMaxDurationTimer,
clearTranscriptWaiter,
rejectTranscriptWaiter,
waitForFinalTranscript,
} from "./timers.js";
import { generateNotifyTwiml } from "./twiml.js";
// Narrowed views of CallManagerContext: each operation declares only the
// context fields it actually touches, keeping dependencies explicit.

/** Context required to initiate an outbound call. */
type InitiateContext = Pick<
  CallManagerContext,
  "activeCalls" | "providerCallIdMap" | "provider" | "config" | "storePath" | "webhookUrl"
>;
/** Context required to speak TTS on an active call. */
type SpeakContext = Pick<
  CallManagerContext,
  "activeCalls" | "providerCallIdMap" | "provider" | "config" | "storePath"
>;
/** Context required for multi-turn conversation (speak + listen + timers). */
type ConversationContext = Pick<
  CallManagerContext,
  | "activeCalls"
  | "providerCallIdMap"
  | "provider"
  | "config"
  | "storePath"
  | "activeTurnCalls"
  | "transcriptWaiters"
  | "maxDurationTimers"
>;
/** Context required to hang up a call and tear down timers/waiters. */
type EndCallContext = Pick<
  CallManagerContext,
  | "activeCalls"
  | "providerCallIdMap"
  | "provider"
  | "storePath"
  | "transcriptWaiters"
  | "maxDurationTimers"
>;
/** Minimal context for resolving a live, provider-connected call. */
type ConnectedCallContext = Pick<CallManagerContext, "activeCalls" | "provider">;
/** Three-way lookup result: unknown/unconnected, already ended, or live. */
type ConnectedCallLookup =
  | { kind: "error"; error: string }
  | { kind: "ended"; call: CallRecord }
  | {
      kind: "ok";
      call: CallRecord;
      providerCallId: string;
      provider: NonNullable<ConnectedCallContext["provider"]>;
    };
/** Flattened variant of ConnectedCallLookup where "ended" is also a failure. */
type ConnectedCallResolution =
  | { ok: false; error: string }
  | {
      ok: true;
      call: CallRecord;
      providerCallId: string;
      provider: NonNullable<ConnectedCallContext["provider"]>;
    };
/**
 * Classify a call's connectivity: "error" when unknown or not yet connected
 * to the provider, "ended" when in a terminal state, otherwise "ok" with
 * the non-null provider handle and provider call ID.
 */
function lookupConnectedCall(ctx: ConnectedCallContext, callId: CallId): ConnectedCallLookup {
  const call = ctx.activeCalls.get(callId);
  if (call === undefined) {
    return { kind: "error", error: "Call not found" };
  }
  const provider = ctx.provider;
  const providerCallId = call.providerCallId;
  if (!provider || !providerCallId) {
    return { kind: "error", error: "Call not connected" };
  }
  return TerminalStates.has(call.state)
    ? { kind: "ended", call }
    : { kind: "ok", call, providerCallId, provider };
}
/**
 * Like lookupConnectedCall, but treats an ended call as a failure so callers
 * that need a live call get a uniform { ok, error } result.
 */
function requireConnectedCall(ctx: ConnectedCallContext, callId: CallId): ConnectedCallResolution {
  const lookup = lookupConnectedCall(ctx, callId);
  switch (lookup.kind) {
    case "error":
      return { ok: false, error: lookup.error };
    case "ended":
      return { ok: false, error: "Call has ended" };
    default:
      return {
        ok: true,
        call: lookup.call,
        providerCallId: lookup.providerCallId,
        provider: lookup.provider,
      };
  }
}
/**
 * Initiate an outbound call through the configured provider.
 *
 * @param ctx - Narrowed call-manager context (provider, config, store, webhook).
 * @param to - Destination number to dial.
 * @param sessionKey - Optional agent session to associate with the call.
 * @param options - Outbound options; a bare string is accepted as the initial
 *   message for backward compatibility.
 * @returns `{ callId, success }`; on failure `error` describes the cause and
 *   any created call record is marked "failed" and removed from active state.
 */
export async function initiateCall(
  ctx: InitiateContext,
  to: string,
  sessionKey?: string,
  options?: OutboundCallOptions | string,
): Promise<{ callId: CallId; success: boolean; error?: string }> {
  // Legacy signature: a plain string is the initial message.
  const opts: OutboundCallOptions =
    typeof options === "string" ? { message: options } : (options ?? {});
  const initialMessage = opts.message;
  const mode = opts.mode ?? ctx.config.outbound.defaultMode;
  if (!ctx.provider) {
    return { callId: "", success: false, error: "Provider not initialized" };
  }
  if (!ctx.webhookUrl) {
    return { callId: "", success: false, error: "Webhook URL not configured" };
  }
  if (ctx.activeCalls.size >= ctx.config.maxConcurrentCalls) {
    return {
      callId: "",
      success: false,
      error: `Maximum concurrent calls (${ctx.config.maxConcurrentCalls}) reached`,
    };
  }
  const callId = crypto.randomUUID();
  // The mock provider gets a placeholder caller ID so local dev needs no
  // config. (ctx.provider is already guarded non-null above, so no optional
  // chaining here.)
  const from =
    ctx.config.fromNumber || (ctx.provider.name === "mock" ? "+15550000000" : undefined);
  if (!from) {
    return { callId: "", success: false, error: "fromNumber not configured" };
  }
  const callRecord: CallRecord = {
    callId,
    provider: ctx.provider.name,
    direction: "outbound",
    state: "initiated",
    from,
    to,
    sessionKey,
    startedAt: Date.now(),
    transcript: [],
    processedEventIds: [],
    metadata: {
      ...(initialMessage && { initialMessage }),
      mode,
    },
  };
  ctx.activeCalls.set(callId, callRecord);
  persistCallRecord(ctx.storePath, callRecord);
  try {
    // For notify mode with a message, use inline TwiML with <Say>.
    let inlineTwiml: string | undefined;
    if (mode === "notify" && initialMessage) {
      const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
      inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
      console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
    }
    const result = await ctx.provider.initiateCall({
      callId,
      from,
      to,
      webhookUrl: ctx.webhookUrl,
      inlineTwiml,
    });
    callRecord.providerCallId = result.providerCallId;
    ctx.providerCallIdMap.set(result.providerCallId, callId);
    persistCallRecord(ctx.storePath, callRecord);
    return { callId, success: true };
  } catch (err) {
    // Roll back: mark the record failed, persist it for history, and drop it
    // from the active maps so it no longer counts against maxConcurrentCalls.
    callRecord.state = "failed";
    callRecord.endedAt = Date.now();
    callRecord.endReason = "failed";
    persistCallRecord(ctx.storePath, callRecord);
    ctx.activeCalls.delete(callId);
    if (callRecord.providerCallId) {
      ctx.providerCallIdMap.delete(callRecord.providerCallId);
    }
    return {
      callId,
      success: false,
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
/**
 * Speak `text` on a live call via the provider's TTS, recording it in the
 * transcript as a bot turn and transitioning the call to "speaking" first.
 * Provider failures are returned as { success:false, error }, not thrown.
 */
export async function speak(
  ctx: SpeakContext,
  callId: CallId,
  text: string,
): Promise<{ success: boolean; error?: string }> {
  const resolved = requireConnectedCall(ctx, callId);
  if (!resolved.ok) {
    return { success: false, error: resolved.error };
  }
  try {
    transitionState(resolved.call, "speaking");
    persistCallRecord(ctx.storePath, resolved.call);
    addTranscriptEntry(resolved.call, "bot", text);
    // Only Twilio consumes the configured OpenAI voice here.
    const voice = resolved.provider.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
    await resolved.provider.playTts({
      callId,
      providerCallId: resolved.providerCallId,
      text,
      voice,
    });
    return { success: true };
  } catch (err) {
    return { success: false, error: err instanceof Error ? err.message : String(err) };
  }
}
/**
 * Speak the stored `metadata.initialMessage` once the provider's media
 * stream for `providerCallId` is up. In "notify" mode, additionally
 * schedules an automatic hangup `notifyHangupDelaySec` seconds after the
 * message is spoken. No-op when the call is unknown or has no pending
 * initial message.
 */
export async function speakInitialMessage(
  ctx: ConversationContext,
  providerCallId: string,
): Promise<void> {
  const call = getCallByProviderCallId({
    activeCalls: ctx.activeCalls,
    providerCallIdMap: ctx.providerCallIdMap,
    providerCallId,
  });
  if (!call) {
    console.warn(`[voice-call] speakInitialMessage: no call found for ${providerCallId}`);
    return;
  }
  const initialMessage = call.metadata?.initialMessage as string | undefined;
  const mode = (call.metadata?.mode as CallMode) ?? "conversation";
  if (!initialMessage) {
    console.log(`[voice-call] speakInitialMessage: no initial message for ${call.callId}`);
    return;
  }
  // Clear so we don't speak it again if the provider reconnects.
  if (call.metadata) {
    delete call.metadata.initialMessage;
    persistCallRecord(ctx.storePath, call);
  }
  console.log(`[voice-call] Speaking initial message for call ${call.callId} (mode: ${mode})`);
  const result = await speak(ctx, call.callId, initialMessage);
  if (!result.success) {
    console.warn(`[voice-call] Failed to speak initial message: ${result.error}`);
    return;
  }
  if (mode === "notify") {
    const delaySec = ctx.config.outbound.notifyHangupDelaySec;
    console.log(`[voice-call] Notify mode: auto-hangup in ${delaySec}s for call ${call.callId}`);
    setTimeout(async () => {
      // Re-check liveness at fire time: the caller may have hung up already.
      const currentCall = ctx.activeCalls.get(call.callId);
      if (currentCall && !TerminalStates.has(currentCall.state)) {
        console.log(`[voice-call] Notify mode: hanging up call ${call.callId}`);
        await endCall(ctx, call.callId);
      }
    }, delaySec * 1000);
  }
}
/**
 * Run one conversational turn on a live call: speak `prompt`, start the
 * provider listening, and wait for the caller's final transcript.
 *
 * Only one turn may be in flight per call (guarded by activeTurnCalls and
 * transcriptWaiters). Per-turn latency metrics are stored in call.metadata.
 * The turn lock and any leftover waiter are always released in `finally`.
 */
export async function continueCall(
  ctx: ConversationContext,
  callId: CallId,
  prompt: string,
): Promise<{ success: boolean; transcript?: string; error?: string }> {
  const connected = requireConnectedCall(ctx, callId);
  if (!connected.ok) {
    return { success: false, error: connected.error };
  }
  const { call, providerCallId, provider } = connected;
  if (ctx.activeTurnCalls.has(callId) || ctx.transcriptWaiters.has(callId)) {
    return { success: false, error: "Already waiting for transcript" };
  }
  ctx.activeTurnCalls.add(callId);
  const turnStartedAt = Date.now();
  // Twilio correlates speech results back to this specific turn via a token.
  const turnToken = provider.name === "twilio" ? crypto.randomUUID() : undefined;
  try {
    await speak(ctx, callId, prompt);
    transitionState(call, "listening");
    persistCallRecord(ctx.storePath, call);
    const listenStartedAt = Date.now();
    await provider.startListening({ callId, providerCallId, turnToken });
    const transcript = await waitForFinalTranscript(ctx, callId, turnToken);
    const transcriptReceivedAt = Date.now();
    // Best-effort: stop listening after final transcript.
    await provider.stopListening({ callId, providerCallId });
    const lastTurnLatencyMs = transcriptReceivedAt - turnStartedAt;
    const lastTurnListenWaitMs = transcriptReceivedAt - listenStartedAt;
    const turnCount =
      call.metadata && typeof call.metadata.turnCount === "number"
        ? call.metadata.turnCount + 1
        : 1;
    call.metadata = {
      ...(call.metadata ?? {}),
      turnCount,
      lastTurnLatencyMs,
      lastTurnListenWaitMs,
      lastTurnCompletedAt: transcriptReceivedAt,
    };
    persistCallRecord(ctx.storePath, call);
    console.log(
      "[voice-call] continueCall latency call=" +
        call.callId +
        " totalMs=" +
        String(lastTurnLatencyMs) +
        " listenWaitMs=" +
        String(lastTurnListenWaitMs),
    );
    return { success: true, transcript };
  } catch (err) {
    return { success: false, error: err instanceof Error ? err.message : String(err) };
  } finally {
    ctx.activeTurnCalls.delete(callId);
    clearTranscriptWaiter(ctx, callId);
  }
}
/**
 * Hang up a call from the bot side and tear down its in-memory state
 * (watchdog timer, transcript waiter, indexes). Idempotent: a call that
 * already ended reports success; an unknown call reports an error.
 */
export async function endCall(
  ctx: EndCallContext,
  callId: CallId,
): Promise<{ success: boolean; error?: string }> {
  const lookup = lookupConnectedCall(ctx, callId);
  if (lookup.kind === "error") {
    return { success: false, error: lookup.error };
  }
  if (lookup.kind === "ended") {
    return { success: true };
  }
  const { call, providerCallId, provider } = lookup;
  try {
    await provider.hangupCall({ callId, providerCallId, reason: "hangup-bot" });
  } catch (err) {
    return { success: false, error: err instanceof Error ? err.message : String(err) };
  }
  call.state = "hangup-bot";
  call.endedAt = Date.now();
  call.endReason = "hangup-bot";
  persistCallRecord(ctx.storePath, call);
  clearMaxDurationTimer(ctx, callId);
  rejectTranscriptWaiter(ctx, callId, "Call ended: hangup-bot");
  ctx.activeCalls.delete(callId);
  ctx.providerCallIdMap.delete(providerCallId);
  return { success: true };
}

View File

@@ -0,0 +1,48 @@
import { TerminalStates, type CallRecord, type CallState, type TranscriptEntry } from "../types.js";
// States that may freely cycle between each other during a multi-turn
// conversation (bot speaking <-> listening for the caller).
const ConversationStates = new Set<CallState>(["speaking", "listening"]);
// Canonical forward progression of a call; transitions may only move
// rightward through this list (terminal states are handled separately).
const StateOrder: readonly CallState[] = [
  "initiated",
  "ringing",
  "answered",
  "active",
  "speaking",
  "listening",
];
/**
 * Apply a state transition under these rules:
 *  - same-state and transitions out of a terminal state are ignored;
 *  - any terminal state is reachable from any non-terminal state;
 *  - speaking <-> listening may cycle freely (multi-turn conversation);
 *  - otherwise only forward movement along StateOrder is allowed.
 */
export function transitionState(call: CallRecord, newState: CallState): void {
  const current = call.state;
  if (current === newState || TerminalStates.has(current)) {
    return;
  }
  if (TerminalStates.has(newState)) {
    call.state = newState;
    return;
  }
  if (ConversationStates.has(current) && ConversationStates.has(newState)) {
    call.state = newState;
    return;
  }
  if (StateOrder.indexOf(newState) > StateOrder.indexOf(current)) {
    call.state = newState;
  }
}
/**
 * Append a finalized transcript line for the given speaker, timestamped now.
 */
export function addTranscriptEntry(call: CallRecord, speaker: "bot" | "user", text: string): void {
  call.transcript.push({
    timestamp: Date.now(),
    speaker,
    text,
    isFinal: true,
  });
}

View File

@@ -0,0 +1,94 @@
import fs from "node:fs";
import fsp from "node:fs/promises";
import path from "node:path";
import { CallRecordSchema, TerminalStates, type CallId, type CallRecord } from "../types.js";
/**
 * Append a snapshot of the call record to <storePath>/calls.jsonl.
 * The write is fire-and-forget so event processing never blocks on disk;
 * failures are logged and otherwise ignored.
 */
export function persistCallRecord(storePath: string, call: CallRecord): void {
  const target = path.join(storePath, "calls.jsonl");
  void fsp.appendFile(target, `${JSON.stringify(call)}\n`).catch((err) => {
    console.error("[voice-call] Failed to persist call record:", err);
  });
}
/**
 * Rebuild in-memory call state from the append-only calls.jsonl log.
 *
 * The log may contain many snapshots per call; later lines win because the
 * map is overwritten in file order. Calls whose latest snapshot is terminal
 * are dropped; survivors are re-indexed by provider call ID and their
 * processed event IDs are merged into the global dedupe set.
 * `rejectedProviderCallIds` is intentionally not persisted and starts empty.
 */
export function loadActiveCallsFromStore(storePath: string): {
  activeCalls: Map<CallId, CallRecord>;
  providerCallIdMap: Map<string, CallId>;
  processedEventIds: Set<string>;
  rejectedProviderCallIds: Set<string>;
} {
  const logPath = path.join(storePath, "calls.jsonl");
  if (!fs.existsSync(logPath)) {
    return {
      activeCalls: new Map(),
      providerCallIdMap: new Map(),
      processedEventIds: new Set(),
      rejectedProviderCallIds: new Set(),
    };
  }
  const content = fs.readFileSync(logPath, "utf-8");
  const lines = content.split("\n");
  const callMap = new Map<CallId, CallRecord>();
  for (const line of lines) {
    if (!line.trim()) {
      continue;
    }
    try {
      const call = CallRecordSchema.parse(JSON.parse(line));
      callMap.set(call.callId, call);
    } catch {
      // Skip invalid lines.
    }
  }
  const activeCalls = new Map<CallId, CallRecord>();
  const providerCallIdMap = new Map<string, CallId>();
  const processedEventIds = new Set<string>();
  const rejectedProviderCallIds = new Set<string>();
  for (const [callId, call] of callMap) {
    if (TerminalStates.has(call.state)) {
      continue;
    }
    activeCalls.set(callId, call);
    if (call.providerCallId) {
      providerCallIdMap.set(call.providerCallId, callId);
    }
    for (const eventId of call.processedEventIds) {
      processedEventIds.add(eventId);
    }
  }
  return { activeCalls, providerCallIdMap, processedEventIds, rejectedProviderCallIds };
}
/**
 * Read up to `limit` of the most recent call snapshots from calls.jsonl,
 * in file (chronological) order. Invalid lines are skipped; a missing log
 * yields an empty history.
 */
export async function getCallHistoryFromStore(
  storePath: string,
  limit = 50,
): Promise<CallRecord[]> {
  const logPath = path.join(storePath, "calls.jsonl");
  try {
    await fsp.access(logPath);
  } catch {
    return [];
  }
  const raw = await fsp.readFile(logPath, "utf-8");
  const recent = raw.trim().split("\n").filter(Boolean).slice(-limit);
  const history: CallRecord[] = [];
  for (const line of recent) {
    try {
      history.push(CallRecordSchema.parse(JSON.parse(line)));
    } catch {
      // Skip invalid lines.
    }
  }
  return history;
}

View File

@@ -0,0 +1,112 @@
import { TerminalStates, type CallId } from "../types.js";
import type { CallManagerContext } from "./context.js";
import { persistCallRecord } from "./store.js";
/** Context needed by both the max-duration timer and transcript waiters. */
type TimerContext = Pick<
  CallManagerContext,
  "activeCalls" | "maxDurationTimers" | "config" | "storePath" | "transcriptWaiters"
>;
/** Subset used by the per-call max-duration watchdog. */
type MaxDurationTimerContext = Pick<
  TimerContext,
  "activeCalls" | "maxDurationTimers" | "config" | "storePath"
>;
/** Subset used by the pending-transcript waiter helpers. */
type TranscriptWaiterContext = Pick<TimerContext, "transcriptWaiters">;
/**
 * Cancel and forget the max-duration timer for a call, if one is pending.
 */
export function clearMaxDurationTimer(
  ctx: Pick<MaxDurationTimerContext, "maxDurationTimers">,
  callId: CallId,
): void {
  const pending = ctx.maxDurationTimers.get(callId);
  if (!pending) {
    return;
  }
  clearTimeout(pending);
  ctx.maxDurationTimers.delete(callId);
}
/**
 * Arm (or re-arm) the per-call max-duration timer. When it fires while the
 * call is still live, the call is marked with endReason "timeout",
 * persisted, and handed to `onTimeout` (typically endCall).
 */
export function startMaxDurationTimer(params: {
  ctx: MaxDurationTimerContext;
  callId: CallId;
  onTimeout: (callId: CallId) => Promise<void>;
}): void {
  const { ctx, callId, onTimeout } = params;
  clearMaxDurationTimer(ctx, callId);
  const seconds = ctx.config.maxDurationSeconds;
  console.log(
    `[voice-call] Starting max duration timer (${seconds}s) for call ${callId}`,
  );
  const handle = setTimeout(async () => {
    ctx.maxDurationTimers.delete(callId);
    const call = ctx.activeCalls.get(callId);
    if (!call || TerminalStates.has(call.state)) {
      return;
    }
    console.log(
      `[voice-call] Max duration reached (${seconds}s), ending call ${callId}`,
    );
    call.endReason = "timeout";
    persistCallRecord(ctx.storePath, call);
    await onTimeout(callId);
  }, seconds * 1000);
  ctx.maxDurationTimers.set(callId, handle);
}
/**
 * Drop a pending transcript waiter and cancel its timeout without settling
 * the promise. Callers that need to settle it use resolve/rejectTranscriptWaiter.
 */
export function clearTranscriptWaiter(ctx: TranscriptWaiterContext, callId: CallId): void {
  const pending = ctx.transcriptWaiters.get(callId);
  if (pending) {
    clearTimeout(pending.timeout);
    ctx.transcriptWaiters.delete(callId);
  }
}
/**
 * Reject a pending transcript waiter with `reason` (e.g. call ended or
 * errored), clearing its timeout first. No-op when no waiter is registered.
 */
export function rejectTranscriptWaiter(
  ctx: TranscriptWaiterContext,
  callId: CallId,
  reason: string,
): void {
  const pending = ctx.transcriptWaiters.get(callId);
  if (!pending) {
    return;
  }
  clearTranscriptWaiter(ctx, callId);
  pending.reject(new Error(reason));
}
/**
 * Resolve a pending transcript waiter with the final transcript. When the
 * waiter was registered with a turn token, a mismatching token is ignored
 * (stale speech from a previous turn). Returns true when a waiter resolved.
 */
export function resolveTranscriptWaiter(
  ctx: TranscriptWaiterContext,
  callId: CallId,
  transcript: string,
  turnToken?: string,
): boolean {
  const pending = ctx.transcriptWaiters.get(callId);
  if (!pending || (pending.turnToken && pending.turnToken !== turnToken)) {
    return false;
  }
  clearTranscriptWaiter(ctx, callId);
  pending.resolve(transcript);
  return true;
}
/**
 * Register a waiter for the next final transcript on a call and return a
 * promise that resolves with it, or rejects after config.transcriptTimeoutMs.
 * Only one waiter per call may be outstanding at a time.
 */
export function waitForFinalTranscript(
  ctx: TimerContext,
  callId: CallId,
  turnToken?: string,
): Promise<string> {
  if (ctx.transcriptWaiters.has(callId)) {
    return Promise.reject(new Error("Already waiting for transcript"));
  }
  const timeoutMs = ctx.config.transcriptTimeoutMs;
  return new Promise<string>((resolve, reject) => {
    const timeout = setTimeout(() => {
      ctx.transcriptWaiters.delete(callId);
      reject(new Error(`Timed out waiting for transcript after ${timeoutMs}ms`));
    }, timeoutMs);
    const waiter = { resolve, reject, timeout, turnToken };
    ctx.transcriptWaiters.set(callId, waiter);
  });
}

View File

@@ -0,0 +1,9 @@
import { escapeXml } from "../voice-mapping.js";
/**
 * Build self-contained TwiML for notify-mode calls: speak `message` with the
 * given Polly voice, then hang up.
 *
 * Both the message body and the voice attribute are XML-escaped. The voice
 * normally comes from mapVoiceToPolly, but escaping it too keeps the
 * document well-formed even if an unexpected value slips through.
 */
export function generateNotifyTwiml(message: string, voice: string): string {
  return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Say voice="${escapeXml(voice)}">${escapeXml(message)}</Say>
  <Hangup/>
</Response>`;
}

View File

@@ -0,0 +1,271 @@
import { once } from "node:events";
import http from "node:http";
import { describe, expect, it } from "vitest";
import { WebSocket } from "ws";
import { MediaStreamHandler } from "./media-stream.js";
import type {
OpenAIRealtimeSTTProvider,
RealtimeSTTSession,
} from "./providers/stt-openai-realtime.js";
/** Minimal no-op RealtimeSTTSession for tests that never exercise STT. */
const createStubSession = (): RealtimeSTTSession => {
  return {
    connect: async () => {},
    sendAudio: () => {},
    waitForTranscript: async () => "",
    onPartial: () => {},
    onTranscript: () => {},
    onSpeechStart: () => {},
    close: () => {},
    isConnected: () => true,
  };
};
/** STT provider stub whose sessions are inert (see createStubSession). */
const createStubSttProvider = (): OpenAIRealtimeSTTProvider => {
  const stub = { createSession: () => createStubSession() };
  return stub as unknown as OpenAIRealtimeSTTProvider;
};
/** Yield one macrotask so already-queued callbacks get a chance to run. */
const flush = (): Promise<void> => new Promise<void>((resolve) => setTimeout(resolve, 0));
/** Resolve once the signal aborts (immediately when already aborted). */
const waitForAbort = (signal: AbortSignal): Promise<void> => {
  if (signal.aborted) {
    return Promise.resolve();
  }
  return new Promise<void>((resolve) => {
    signal.addEventListener("abort", () => resolve(), { once: true });
  });
};
/**
 * Race `promise` against a deadline; the timer is always cleared afterwards
 * so tests never leak pending timeouts.
 */
const withTimeout = async <T>(promise: Promise<T>, timeoutMs = 2000): Promise<T> => {
  let timer: ReturnType<typeof setTimeout> | null = null;
  const deadline = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error(`Timed out after ${timeoutMs}ms`)), timeoutMs);
  });
  try {
    return await Promise.race([promise, deadline]);
  } finally {
    if (timer !== null) {
      clearTimeout(timer);
    }
  }
};
/**
 * Spin up a bare HTTP server on an ephemeral loopback port that delegates
 * WebSocket upgrades to the handler under test. Returns the ws:// URL and
 * an async closer.
 */
const startWsServer = async (
  handler: MediaStreamHandler,
): Promise<{
  url: string;
  close: () => Promise<void>;
}> => {
  const server = http.createServer();
  server.on("upgrade", (request, socket, head) => {
    handler.handleUpgrade(request, socket, head);
  });
  await new Promise<void>((resolve) => {
    server.listen(0, "127.0.0.1", resolve);
  });
  const address = server.address();
  if (!address || typeof address === "string") {
    throw new Error("Failed to resolve test server address");
  }
  const close = (): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      server.close((err) => (err ? reject(err) : resolve()));
    });
  return { url: `ws://127.0.0.1:${address.port}/voice/stream`, close };
};
/** Open a WebSocket to `url` and wait (bounded) for the open event. */
const connectWs = async (url: string): Promise<WebSocket> => {
  const socket = new WebSocket(url);
  await withTimeout(once(socket, "open") as Promise<[unknown]>);
  return socket;
};
/** Wait (bounded) for a socket to close; normalize the close code/reason. */
const waitForClose = async (
  ws: WebSocket,
): Promise<{
  code: number;
  reason: string;
}> => {
  const closeArgs = (await withTimeout(once(ws, "close") as Promise<[number, Buffer]>)) ?? [];
  const [code, rawReason] = closeArgs;
  return {
    code,
    reason: Buffer.isBuffer(rawReason) ? rawReason.toString() : String(rawReason || ""),
  };
};
describe("MediaStreamHandler TTS queue", () => {
it("serializes TTS playback and resolves in order", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
});
const started: number[] = [];
const finished: number[] = [];
let resolveFirst!: () => void;
const firstGate = new Promise<void>((resolve) => {
resolveFirst = resolve;
});
const first = handler.queueTts("stream-1", async () => {
started.push(1);
await firstGate;
finished.push(1);
});
const second = handler.queueTts("stream-1", async () => {
started.push(2);
finished.push(2);
});
await flush();
expect(started).toEqual([1]);
resolveFirst();
await first;
await second;
expect(started).toEqual([1, 2]);
expect(finished).toEqual([1, 2]);
});
it("cancels active playback and clears queued items", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
});
let queuedRan = false;
const started: string[] = [];
const active = handler.queueTts("stream-1", async (signal) => {
started.push("active");
await waitForAbort(signal);
});
void handler.queueTts("stream-1", async () => {
queuedRan = true;
});
await flush();
expect(started).toEqual(["active"]);
handler.clearTtsQueue("stream-1");
await active;
await flush();
expect(queuedRan).toBe(false);
});
});
describe("MediaStreamHandler security hardening", () => {
it("closes idle pre-start connections after timeout", async () => {
const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
[];
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
preStartTimeoutMs: 40,
shouldAcceptStream: (params) => {
shouldAcceptStreamCalls.push(params);
return true;
},
});
const server = await startWsServer(handler);
try {
const ws = await connectWs(server.url);
const closed = await waitForClose(ws);
expect(closed.code).toBe(1008);
expect(closed.reason).toBe("Start timeout");
expect(shouldAcceptStreamCalls).toEqual([]);
} finally {
await server.close();
}
});
it("enforces pending connection limits", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
preStartTimeoutMs: 5_000,
maxPendingConnections: 1,
maxPendingConnectionsPerIp: 1,
});
const server = await startWsServer(handler);
try {
const first = await connectWs(server.url);
const second = await connectWs(server.url);
const secondClosed = await waitForClose(second);
expect(secondClosed.code).toBe(1013);
expect(secondClosed.reason).toContain("Too many pending");
expect(first.readyState).toBe(WebSocket.OPEN);
first.close();
await waitForClose(first);
} finally {
await server.close();
}
});
it("rejects upgrades when max connection cap is reached", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
preStartTimeoutMs: 5_000,
maxConnections: 1,
maxPendingConnections: 10,
maxPendingConnectionsPerIp: 10,
});
const server = await startWsServer(handler);
try {
const first = await connectWs(server.url);
const secondError = await withTimeout(
new Promise<Error>((resolve) => {
const ws = new WebSocket(server.url);
ws.once("error", (err) => resolve(err as Error));
}),
);
expect(secondError.message).toContain("Unexpected server response: 503");
first.close();
await waitForClose(first);
} finally {
await server.close();
}
});
it("clears pending state after valid start", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
preStartTimeoutMs: 40,
shouldAcceptStream: () => true,
});
const server = await startWsServer(handler);
try {
const ws = await connectWs(server.url);
ws.send(
JSON.stringify({
event: "start",
streamSid: "MZ123",
start: { callSid: "CA123", customParameters: { token: "token-123" } },
}),
);
await new Promise((resolve) => setTimeout(resolve, 80));
expect(ws.readyState).toBe(WebSocket.OPEN);
ws.close();
await waitForClose(ws);
} finally {
await server.close();
}
});
});

View File

@@ -0,0 +1,527 @@
/**
* Media Stream Handler
*
* Handles bidirectional audio streaming between Twilio and the AI services.
* - Receives mu-law audio from Twilio via WebSocket
* - Forwards to OpenAI Realtime STT for transcription
* - Sends TTS audio back to Twilio
*/
import type { IncomingMessage } from "node:http";
import type { Duplex } from "node:stream";
import { WebSocket, WebSocketServer } from "ws";
import type {
OpenAIRealtimeSTTProvider,
RealtimeSTTSession,
} from "./providers/stt-openai-realtime.js";
/**
 * Configuration for the media stream handler.
 *
 * All limits are optional and fall back to the DEFAULT_* constants defined
 * below. Every callback receives the internal call ID (the Twilio CallSid).
 */
export interface MediaStreamConfig {
  /** STT provider for transcription */
  sttProvider: OpenAIRealtimeSTTProvider;
  /** Close sockets that never send a valid `start` frame within this window. */
  preStartTimeoutMs?: number;
  /** Max concurrent pre-start sockets. */
  maxPendingConnections?: number;
  /** Max concurrent pre-start sockets from a single source IP. */
  maxPendingConnectionsPerIp?: number;
  /** Max total open sockets (pending + active sessions). */
  maxConnections?: number;
  /**
   * Validate whether to accept a media stream for the given call ID.
   * `token` comes from TwiML <Parameter> custom parameters, falling back to
   * the `token` query parameter of the upgrade URL.
   */
  shouldAcceptStream?: (params: { callId: string; streamSid: string; token?: string }) => boolean;
  /** Callback when a final transcript is received */
  onTranscript?: (callId: string, transcript: string) => void;
  /** Callback for partial transcripts (streaming UI) */
  onPartialTranscript?: (callId: string, partial: string) => void;
  /** Callback when a stream connects (fired before the STT connect attempt) */
  onConnect?: (callId: string, streamSid: string) => void;
  /** Callback when speech starts (barge-in) */
  onSpeechStart?: (callId: string) => void;
  /** Callback when a stream disconnects */
  onDisconnect?: (callId: string) => void;
}
/**
 * Active media stream session (one per accepted Twilio media stream).
 */
interface StreamSession {
  // Internal call ID (Twilio CallSid).
  callId: string;
  // Twilio media stream SID; primary key in the handler's `sessions` map.
  streamSid: string;
  // Twilio-facing WebSocket carrying media frames both ways.
  ws: WebSocket;
  // Realtime STT session receiving the caller's audio.
  sttSession: RealtimeSTTSession;
}

// One queued TTS playback unit; `resolve`/`reject` settle the promise that
// queueTts() returned to its caller.
type TtsQueueEntry = {
  playFn: (signal: AbortSignal) => Promise<void>;
  controller: AbortController;
  resolve: () => void;
  reject: (error: unknown) => void;
};

// Book-keeping for a socket that upgraded but has not yet sent an accepted
// `start` frame.
type PendingConnection = {
  ip: string;
  timeout: ReturnType<typeof setTimeout>;
};

// Defaults for the pre-auth / connection limits (see MediaStreamConfig).
const DEFAULT_PRE_START_TIMEOUT_MS = 5000;
const DEFAULT_MAX_PENDING_CONNECTIONS = 32;
const DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP = 4;
const DEFAULT_MAX_CONNECTIONS = 128;
/**
 * Manages WebSocket connections for Twilio media streams.
 *
 * Lifecycle: sockets are "pending" from upgrade until an accepted `start`
 * frame arrives (bounded by preStartTimeoutMs plus the pending limits), then
 * become sessions keyed by streamSid until a `stop` frame or socket close.
 */
export class MediaStreamHandler {
  private wss: WebSocketServer | null = null;
  private sessions = new Map<string, StreamSession>();
  private config: MediaStreamConfig;
  /** Pending sockets that have upgraded but not yet sent an accepted `start` frame. */
  private pendingConnections = new Map<WebSocket, PendingConnection>();
  /** Pending socket count per remote IP for pre-auth throttling. */
  private pendingByIp = new Map<string, number>();
  private preStartTimeoutMs: number;
  private maxPendingConnections: number;
  private maxPendingConnectionsPerIp: number;
  private maxConnections: number;
  /** TTS playback queues per stream (serialize audio to prevent overlap) */
  private ttsQueues = new Map<string, TtsQueueEntry[]>();
  /** Whether TTS is currently playing per stream */
  private ttsPlaying = new Map<string, boolean>();
  /** Active TTS playback controllers per stream */
  private ttsActiveControllers = new Map<string, AbortController>();

  constructor(config: MediaStreamConfig) {
    this.config = config;
    this.preStartTimeoutMs = config.preStartTimeoutMs ?? DEFAULT_PRE_START_TIMEOUT_MS;
    this.maxPendingConnections = config.maxPendingConnections ?? DEFAULT_MAX_PENDING_CONNECTIONS;
    this.maxPendingConnectionsPerIp =
      config.maxPendingConnectionsPerIp ?? DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP;
    this.maxConnections = config.maxConnections ?? DEFAULT_MAX_CONNECTIONS;
  }

  /**
   * Handle WebSocket upgrade for media stream connections.
   * Lazily creates the WebSocketServer and enforces the total connection cap
   * with a plain HTTP 503 before completing the upgrade handshake.
   */
  handleUpgrade(request: IncomingMessage, socket: Duplex, head: Buffer): void {
    if (!this.wss) {
      this.wss = new WebSocketServer({ noServer: true });
      this.wss.on("connection", (ws, req) => this.handleConnection(ws, req));
    }
    const currentConnections = this.wss.clients.size;
    if (currentConnections >= this.maxConnections) {
      this.rejectUpgrade(socket, 503, "Too many media stream connections");
      return;
    }
    this.wss.handleUpgrade(request, socket, head, (ws) => {
      this.wss?.emit("connection", ws, request);
    });
  }

  /**
   * Handle new WebSocket connection from Twilio.
   * Registers the socket as pending until a `start` frame is accepted.
   */
  private async handleConnection(ws: WebSocket, _request: IncomingMessage): Promise<void> {
    let session: StreamSession | null = null;
    const streamToken = this.getStreamToken(_request);
    const ip = this.getClientIp(_request);
    if (!this.registerPendingConnection(ws, ip)) {
      // 1013 = "try again later"
      ws.close(1013, "Too many pending media stream connections");
      return;
    }
    ws.on("message", async (data: Buffer) => {
      try {
        const message = JSON.parse(data.toString()) as TwilioMediaMessage;
        switch (message.event) {
          case "connected":
            console.log("[MediaStream] Twilio connected");
            break;
          case "start":
            session = await this.handleStart(ws, message, streamToken);
            if (session) {
              this.clearPendingConnection(ws);
            }
            break;
          case "media":
            if (session && message.media?.payload) {
              // Forward audio to STT
              const audioBuffer = Buffer.from(message.media.payload, "base64");
              session.sttSession.sendAudio(audioBuffer);
            }
            break;
          case "stop":
            if (session) {
              this.handleStop(session);
              session = null;
            }
            break;
        }
      } catch (error) {
        console.error("[MediaStream] Error processing message:", error);
      }
    });
    ws.on("close", () => {
      this.clearPendingConnection(ws);
      if (session) {
        this.handleStop(session);
      }
    });
    ws.on("error", (error) => {
      console.error("[MediaStream] WebSocket error:", error);
    });
  }

  /**
   * Handle stream start event: validate the call, create the STT session, and
   * register the session. Returns null (and closes the socket) on rejection.
   */
  private async handleStart(
    ws: WebSocket,
    message: TwilioMediaMessage,
    streamToken?: string,
  ): Promise<StreamSession | null> {
    const streamSid = message.streamSid || "";
    const callSid = message.start?.callSid || "";
    // Prefer token from start message customParameters (set via TwiML <Parameter>),
    // falling back to query string token. Twilio strips query params from WebSocket
    // URLs but reliably delivers <Parameter> values in customParameters.
    const effectiveToken = message.start?.customParameters?.token ?? streamToken;
    console.log(`[MediaStream] Stream started: ${streamSid} (call: ${callSid})`);
    if (!callSid) {
      console.warn("[MediaStream] Missing callSid; closing stream");
      ws.close(1008, "Missing callSid");
      return null;
    }
    if (
      this.config.shouldAcceptStream &&
      !this.config.shouldAcceptStream({ callId: callSid, streamSid, token: effectiveToken })
    ) {
      console.warn(`[MediaStream] Rejecting stream for unknown call: ${callSid}`);
      ws.close(1008, "Unknown call");
      return null;
    }
    // Create STT session
    const sttSession = this.config.sttProvider.createSession();
    // Set up transcript callbacks
    sttSession.onPartial((partial) => {
      this.config.onPartialTranscript?.(callSid, partial);
    });
    sttSession.onTranscript((transcript) => {
      this.config.onTranscript?.(callSid, transcript);
    });
    sttSession.onSpeechStart(() => {
      this.config.onSpeechStart?.(callSid);
    });
    const session: StreamSession = {
      callId: callSid,
      streamSid,
      ws,
      sttSession,
    };
    this.sessions.set(streamSid, session);
    // Notify connection BEFORE STT connect so TTS can work even if STT fails
    this.config.onConnect?.(callSid, streamSid);
    // Connect to OpenAI STT (non-blocking, log errors but don't fail the call)
    sttSession.connect().catch((err) => {
      console.warn(`[MediaStream] STT connection failed (TTS still works):`, err.message);
    });
    return session;
  }

  /**
   * Handle stream stop event: tear down TTS state, close STT, and notify.
   */
  private handleStop(session: StreamSession): void {
    console.log(`[MediaStream] Stream stopped: ${session.streamSid}`);
    this.clearTtsState(session.streamSid);
    session.sttSession.close();
    this.sessions.delete(session.streamSid);
    this.config.onDisconnect?.(session.callId);
  }

  /** Extract the `token` query parameter from the upgrade request URL, if any. */
  private getStreamToken(request: IncomingMessage): string | undefined {
    if (!request.url || !request.headers.host) {
      return undefined;
    }
    try {
      const url = new URL(request.url, `http://${request.headers.host}`);
      return url.searchParams.get("token") ?? undefined;
    } catch {
      return undefined;
    }
  }

  /** Remote socket address used for pre-auth per-IP throttling. */
  private getClientIp(request: IncomingMessage): string {
    return request.socket.remoteAddress || "unknown";
  }

  /**
   * Track a socket as pending and arm its pre-start timeout.
   * Returns false when either the global or per-IP pending limit is hit.
   */
  private registerPendingConnection(ws: WebSocket, ip: string): boolean {
    if (this.pendingConnections.size >= this.maxPendingConnections) {
      console.warn("[MediaStream] Rejecting connection: pending connection limit reached");
      return false;
    }
    const pendingForIp = this.pendingByIp.get(ip) ?? 0;
    if (pendingForIp >= this.maxPendingConnectionsPerIp) {
      console.warn(`[MediaStream] Rejecting connection: pending per-IP limit reached (${ip})`);
      return false;
    }
    const timeout = setTimeout(() => {
      if (!this.pendingConnections.has(ws)) {
        return;
      }
      console.warn(
        `[MediaStream] Closing pre-start idle connection after ${this.preStartTimeoutMs}ms (${ip})`,
      );
      ws.close(1008, "Start timeout");
    }, this.preStartTimeoutMs);
    // Don't keep the process alive just for this timer.
    timeout.unref?.();
    this.pendingConnections.set(ws, { ip, timeout });
    this.pendingByIp.set(ip, pendingForIp + 1);
    return true;
  }

  /** Remove a socket from the pending pool and release its per-IP slot. */
  private clearPendingConnection(ws: WebSocket): void {
    const pending = this.pendingConnections.get(ws);
    if (!pending) {
      return;
    }
    clearTimeout(pending.timeout);
    this.pendingConnections.delete(ws);
    const current = this.pendingByIp.get(pending.ip) ?? 0;
    if (current <= 1) {
      this.pendingByIp.delete(pending.ip);
      return;
    }
    this.pendingByIp.set(pending.ip, current - 1);
  }

  /** Answer a not-yet-upgraded socket with a plain HTTP error and close it. */
  private rejectUpgrade(socket: Duplex, statusCode: 429 | 503, message: string): void {
    const statusText = statusCode === 429 ? "Too Many Requests" : "Service Unavailable";
    const body = `${message}\n`;
    socket.write(
      `HTTP/1.1 ${statusCode} ${statusText}\r\n` +
        "Connection: close\r\n" +
        "Content-Type: text/plain; charset=utf-8\r\n" +
        `Content-Length: ${Buffer.byteLength(body)}\r\n` +
        "\r\n" +
        body,
    );
    socket.destroy();
  }

  /**
   * Get an active session with an open WebSocket, or undefined if unavailable.
   */
  private getOpenSession(streamSid: string): StreamSession | undefined {
    const session = this.sessions.get(streamSid);
    return session?.ws.readyState === WebSocket.OPEN ? session : undefined;
  }

  /**
   * Send a message to a stream's WebSocket if available.
   */
  private sendToStream(streamSid: string, message: unknown): void {
    const session = this.getOpenSession(streamSid);
    session?.ws.send(JSON.stringify(message));
  }

  /**
   * Send audio to a specific stream (for TTS playback).
   * Audio should be mu-law encoded at 8kHz mono.
   */
  sendAudio(streamSid: string, muLawAudio: Buffer): void {
    this.sendToStream(streamSid, {
      event: "media",
      streamSid,
      media: { payload: muLawAudio.toString("base64") },
    });
  }

  /**
   * Send a mark event to track audio playback position.
   */
  sendMark(streamSid: string, name: string): void {
    this.sendToStream(streamSid, {
      event: "mark",
      streamSid,
      mark: { name },
    });
  }

  /**
   * Clear audio buffer (interrupt playback).
   */
  clearAudio(streamSid: string): void {
    this.sendToStream(streamSid, { event: "clear", streamSid });
  }

  /**
   * Queue a TTS operation for sequential playback.
   * Only one TTS operation plays at a time per stream to prevent overlap.
   * The returned promise settles when the entry finishes playing, is aborted,
   * or is dropped by clearTtsQueue()/clearTtsState().
   */
  async queueTts(streamSid: string, playFn: (signal: AbortSignal) => Promise<void>): Promise<void> {
    const queue = this.getTtsQueue(streamSid);
    let resolveEntry: () => void;
    let rejectEntry: (error: unknown) => void;
    const promise = new Promise<void>((resolve, reject) => {
      resolveEntry = resolve;
      rejectEntry = reject;
    });
    queue.push({
      playFn,
      controller: new AbortController(),
      resolve: resolveEntry!,
      reject: rejectEntry!,
    });
    if (!this.ttsPlaying.get(streamSid)) {
      void this.processQueue(streamSid);
    }
    return promise;
  }

  /**
   * Clear TTS queue and interrupt current playback (barge-in).
   *
   * Queued-but-unplayed entries are aborted and resolved so that callers
   * awaiting queueTts() never hang; this mirrors how processQueue resolves an
   * entry whose playback was aborted mid-flight.
   */
  clearTtsQueue(streamSid: string): void {
    this.drainTtsQueue(streamSid);
    this.ttsActiveControllers.get(streamSid)?.abort();
    this.clearAudio(streamSid);
  }

  /**
   * Get active session by call ID.
   */
  getSessionByCallId(callId: string): StreamSession | undefined {
    return [...this.sessions.values()].find((session) => session.callId === callId);
  }

  /**
   * Close all sessions.
   */
  closeAll(): void {
    for (const session of this.sessions.values()) {
      this.clearTtsState(session.streamSid);
      session.sttSession.close();
      session.ws.close();
    }
    this.sessions.clear();
  }

  /** Get (or lazily create) the TTS queue for a stream. */
  private getTtsQueue(streamSid: string): TtsQueueEntry[] {
    const existing = this.ttsQueues.get(streamSid);
    if (existing) {
      return existing;
    }
    const queue: TtsQueueEntry[] = [];
    this.ttsQueues.set(streamSid, queue);
    return queue;
  }

  /**
   * Process the TTS queue for a stream.
   * Uses iterative approach to avoid stack accumulation from recursion.
   */
  private async processQueue(streamSid: string): Promise<void> {
    this.ttsPlaying.set(streamSid, true);
    while (true) {
      const queue = this.ttsQueues.get(streamSid);
      if (!queue || queue.length === 0) {
        this.ttsPlaying.set(streamSid, false);
        this.ttsActiveControllers.delete(streamSid);
        return;
      }
      const entry = queue.shift()!;
      this.ttsActiveControllers.set(streamSid, entry.controller);
      try {
        await entry.playFn(entry.controller.signal);
        entry.resolve();
      } catch (error) {
        if (entry.controller.signal.aborted) {
          // Barge-in abort is expected; treat as a completed (cancelled) play.
          entry.resolve();
        } else {
          console.error("[MediaStream] TTS playback error:", error);
          entry.reject(error);
        }
      } finally {
        if (this.ttsActiveControllers.get(streamSid) === entry.controller) {
          this.ttsActiveControllers.delete(streamSid);
        }
      }
    }
  }

  /**
   * Remove every queued (not yet playing) TTS entry for a stream, aborting and
   * resolving each one. The previous implementation truncated the queue with
   * `queue.length = 0`, which left each dropped entry's queueTts() promise
   * pending forever.
   */
  private drainTtsQueue(streamSid: string): void {
    const queue = this.ttsQueues.get(streamSid);
    if (!queue) {
      return;
    }
    for (const entry of queue.splice(0, queue.length)) {
      entry.controller.abort();
      entry.resolve();
    }
  }

  /** Tear down all TTS state for a stream (used on stop/close). */
  private clearTtsState(streamSid: string): void {
    this.drainTtsQueue(streamSid);
    this.ttsActiveControllers.get(streamSid)?.abort();
    this.ttsActiveControllers.delete(streamSid);
    this.ttsPlaying.delete(streamSid);
    this.ttsQueues.delete(streamSid);
  }
}
/**
 * Twilio Media Stream message format (frames received over the media
 * WebSocket; see the Twilio Media Streams protocol).
 */
interface TwilioMediaMessage {
  // Frame type; only connected/start/media/stop are consumed by this handler.
  event: "connected" | "start" | "media" | "stop" | "mark" | "clear";
  sequenceNumber?: string;
  streamSid?: string;
  // Present on `start` frames only.
  start?: {
    streamSid: string;
    accountSid: string;
    callSid: string;
    tracks: string[];
    // Values from TwiML <Parameter> elements (e.g. the auth token).
    customParameters?: Record<string, string>;
    mediaFormat: {
      encoding: string;
      sampleRate: number;
      channels: number;
    };
  };
  // Present on `media` frames; `payload` is base64-encoded audio.
  media?: {
    track?: string;
    chunk?: string;
    timestamp?: string;
    payload?: string;
  };
  mark?: {
    name: string;
  };
}

View File

@@ -0,0 +1,68 @@
import type {
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
PlayTtsInput,
ProviderName,
WebhookParseOptions,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookVerificationResult,
} from "../types.js";
/**
 * Abstract base interface for voice call providers.
 *
 * Each provider (Telnyx, Twilio, etc.) implements this interface to provide
 * a consistent API for the call manager.
 *
 * Responsibilities:
 * - Webhook verification and event parsing
 * - Outbound call initiation and hangup
 * - Media control (TTS playback, STT listening)
 */
export interface VoiceCallProvider {
  /** Provider identifier */
  readonly name: ProviderName;
  /**
   * Verify webhook signature/HMAC before processing.
   * Must be called before parseWebhookEvent.
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult;
  /**
   * Parse provider-specific webhook payload into normalized events.
   * Returns events and optional response to send back to provider.
   * `options.verifiedRequestKey` (when present) comes from verifyWebhook and
   * may be used for event deduplication.
   */
  parseWebhookEvent(ctx: WebhookContext, options?: WebhookParseOptions): ProviderWebhookParseResult;
  /**
   * Initiate an outbound call.
   * @returns Provider call ID and status
   */
  initiateCall(input: InitiateCallInput): Promise<InitiateCallResult>;
  /**
   * Hang up an active call. Implementations should tolerate calls that have
   * already ended.
   */
  hangupCall(input: HangupCallInput): Promise<void>;
  /**
   * Play TTS audio to the caller.
   * The provider should handle streaming if supported.
   */
  playTts(input: PlayTtsInput): Promise<void>;
  /**
   * Start listening for user speech (activate STT).
   */
  startListening(input: StartListeningInput): Promise<void>;
  /**
   * Stop listening for user speech (deactivate STT).
   */
  stopListening(input: StopListeningInput): Promise<void>;
}

View File

@@ -0,0 +1,10 @@
// Provider contract shared by all concrete implementations.
export type { VoiceCallProvider } from "./base.js";
// Local-dev provider driven entirely by webhook-posted events.
export { MockProvider } from "./mock.js";
// Streaming speech-to-text over the OpenAI Realtime API.
export {
  OpenAIRealtimeSTTProvider,
  type RealtimeSTTConfig,
  type RealtimeSTTSession,
} from "./stt-openai-realtime.js";
// Telephony providers.
export { TelnyxProvider } from "./telnyx.js";
export { TwilioProvider } from "./twilio.js";
export { PlivoProvider } from "./plivo.js";

View File

@@ -0,0 +1,169 @@
import crypto from "node:crypto";
import type {
EndReason,
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
NormalizedEvent,
PlayTtsInput,
WebhookParseOptions,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookVerificationResult,
} from "../types.js";
import type { VoiceCallProvider } from "./base.js";
/**
 * Mock voice call provider for local testing.
 *
 * Events are driven via webhook POST with JSON body:
 * - { events: NormalizedEvent[] } for bulk events
 * - { event: NormalizedEvent } for single event
 */
export class MockProvider implements VoiceCallProvider {
  readonly name = "mock" as const;

  /** The mock provider performs no signature verification. */
  verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult {
    return { ok: true };
  }

  /**
   * Parse a JSON webhook body into normalized events.
   *
   * Returns 400 for malformed JSON (or a JSON `null` body) and 200 otherwise,
   * even when no recognizable events are present.
   */
  parseWebhookEvent(
    ctx: WebhookContext,
    _options?: WebhookParseOptions,
  ): ProviderWebhookParseResult {
    // Parse as `unknown` and narrow instead of treating the result as `any`.
    let payload: unknown;
    try {
      payload = JSON.parse(ctx.rawBody);
    } catch {
      return { events: [], statusCode: 400 };
    }
    // JSON.parse never yields `undefined`, but `null` is possible; property
    // access on it previously threw (-> 400), so keep rejecting it explicitly.
    if (payload == null) {
      return { events: [], statusCode: 400 };
    }
    const body = payload as { events?: unknown; event?: unknown };
    const events: NormalizedEvent[] = [];
    if (Array.isArray(body.events)) {
      for (const evt of body.events) {
        const normalized = this.normalizeEvent(evt as Partial<NormalizedEvent>);
        if (normalized) {
          events.push(normalized);
        }
      }
    } else if (body.event) {
      const normalized = this.normalizeEvent(body.event as Partial<NormalizedEvent>);
      if (normalized) {
        events.push(normalized);
      }
    }
    return { events, statusCode: 200 };
  }

  /**
   * Convert a loosely-typed event payload into a NormalizedEvent.
   * Returns null when `type` or `callId` is missing or the type is unknown.
   * Missing `id`/`timestamp` are filled with a random UUID / now.
   */
  private normalizeEvent(evt: Partial<NormalizedEvent>): NormalizedEvent | null {
    if (!evt.type || !evt.callId) {
      return null;
    }
    const base = {
      id: evt.id || crypto.randomUUID(),
      callId: evt.callId,
      providerCallId: evt.providerCallId,
      timestamp: evt.timestamp || Date.now(),
    };
    switch (evt.type) {
      case "call.initiated":
      case "call.ringing":
      case "call.answered":
      case "call.active":
        return { ...base, type: evt.type };
      case "call.speaking": {
        const payload = evt as Partial<NormalizedEvent & { text?: string }>;
        return {
          ...base,
          type: evt.type,
          text: payload.text || "",
        };
      }
      case "call.speech": {
        const payload = evt as Partial<
          NormalizedEvent & {
            transcript?: string;
            isFinal?: boolean;
            confidence?: number;
          }
        >;
        return {
          ...base,
          type: evt.type,
          transcript: payload.transcript || "",
          // Default to final so simple test payloads behave like end-of-speech.
          isFinal: payload.isFinal ?? true,
          confidence: payload.confidence,
        };
      }
      case "call.silence": {
        const payload = evt as Partial<NormalizedEvent & { durationMs?: number }>;
        return {
          ...base,
          type: evt.type,
          durationMs: payload.durationMs || 0,
        };
      }
      case "call.dtmf": {
        const payload = evt as Partial<NormalizedEvent & { digits?: string }>;
        return {
          ...base,
          type: evt.type,
          digits: payload.digits || "",
        };
      }
      case "call.ended": {
        const payload = evt as Partial<NormalizedEvent & { reason?: EndReason }>;
        return {
          ...base,
          type: evt.type,
          reason: payload.reason || "completed",
        };
      }
      case "call.error": {
        const payload = evt as Partial<NormalizedEvent & { error?: string; retryable?: boolean }>;
        return {
          ...base,
          type: evt.type,
          error: payload.error || "unknown error",
          retryable: payload.retryable,
        };
      }
      default:
        return null;
    }
  }

  /** Pretend the call was placed; the provider ID just mirrors our call ID. */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    return {
      providerCallId: `mock-${input.callId}`,
      status: "initiated",
    };
  }

  async hangupCall(_input: HangupCallInput): Promise<void> {
    // No-op for mock
  }

  async playTts(_input: PlayTtsInput): Promise<void> {
    // No-op for mock
  }

  async startListening(_input: StartListeningInput): Promise<void> {
    // No-op for mock
  }

  async stopListening(_input: StopListeningInput): Promise<void> {
    // No-op for mock
  }
}

View File

@@ -0,0 +1,49 @@
import { describe, expect, it } from "vitest";
import { PlivoProvider } from "./plivo.js";
describe("PlivoProvider", () => {
it("parses answer callback into call.answered and returns keep-alive XML", () => {
const provider = new PlivoProvider({
authId: "MA000000000000000000",
authToken: "test-token",
});
const result = provider.parseWebhookEvent({
headers: { host: "example.com" },
rawBody:
"CallUUID=call-uuid&CallStatus=in-progress&Direction=outbound&From=%2B15550000000&To=%2B15550000001&Event=StartApp",
url: "https://example.com/voice/webhook?provider=plivo&flow=answer&callId=internal-call-id",
method: "POST",
query: { provider: "plivo", flow: "answer", callId: "internal-call-id" },
});
expect(result.events).toHaveLength(1);
expect(result.events[0]?.type).toBe("call.answered");
expect(result.events[0]?.callId).toBe("internal-call-id");
expect(result.events[0]?.providerCallId).toBe("call-uuid");
expect(result.providerResponseBody).toContain("<Wait");
expect(result.providerResponseBody).toContain('length="300"');
});
it("uses verified request key when provided", () => {
const provider = new PlivoProvider({
authId: "MA000000000000000000",
authToken: "test-token",
});
const result = provider.parseWebhookEvent(
{
headers: { host: "example.com", "x-plivo-signature-v3-nonce": "nonce-1" },
rawBody:
"CallUUID=call-uuid&CallStatus=in-progress&Direction=outbound&From=%2B15550000000&To=%2B15550000001&Event=StartApp",
url: "https://example.com/voice/webhook?provider=plivo&flow=answer&callId=internal-call-id",
method: "POST",
query: { provider: "plivo", flow: "answer", callId: "internal-call-id" },
},
{ verifiedRequestKey: "plivo:v3:verified" },
);
expect(result.events).toHaveLength(1);
expect(result.events[0]?.dedupeKey).toBe("plivo:v3:verified");
});
});

View File

@@ -0,0 +1,556 @@
import crypto from "node:crypto";
import type { PlivoConfig, WebhookSecurityConfig } from "../config.js";
import { getHeader } from "../http-headers.js";
import type {
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
NormalizedEvent,
PlayTtsInput,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookParseOptions,
WebhookVerificationResult,
} from "../types.js";
import { escapeXml } from "../voice-mapping.js";
import { reconstructWebhookUrl, verifyPlivoWebhook } from "../webhook-security.js";
import type { VoiceCallProvider } from "./base.js";
import { guardedJsonApiRequest } from "./shared/guarded-json-api.js";
/**
 * Optional tuning and security knobs for PlivoProvider; all fields have
 * sensible defaults.
 */
export interface PlivoProviderOptions {
  /** Override public URL origin for signature verification */
  publicUrl?: string;
  /** Skip webhook signature verification (development only) */
  skipVerification?: boolean;
  /** Outbound ring timeout in seconds (default 30) */
  ringTimeoutSec?: number;
  /** Webhook security options (forwarded headers/allowlist) */
  webhookSecurity?: WebhookSecurityConfig;
}
// Utterance awaiting pickup by the xml-speak flow.
type PendingSpeak = { text: string; locale?: string };
// Listen parameters awaiting pickup by the xml-listen flow.
type PendingListen = { language?: string };

/**
 * Build a stable dedupe key for an incoming Plivo webhook request.
 * Prefers the v3 signature nonce, then v2, then a SHA-256 of the raw body.
 */
function createPlivoRequestDedupeKey(ctx: WebhookContext): string {
  for (const version of ["v3", "v2"] as const) {
    const nonce = getHeader(ctx.headers, `x-plivo-signature-${version}-nonce`);
    if (nonce) {
      return `plivo:${version}:${nonce}`;
    }
  }
  const bodyHash = crypto.createHash("sha256").update(ctx.rawBody).digest("hex");
  return `plivo:fallback:${bodyHash}`;
}
export class PlivoProvider implements VoiceCallProvider {
readonly name = "plivo" as const;
private readonly authId: string;
private readonly authToken: string;
private readonly baseUrl: string;
private readonly options: PlivoProviderOptions;
private readonly apiHost: string;
// Best-effort mapping between create-call request UUID and call UUID.
private requestUuidToCallUuid = new Map<string, string>();
// Used for transfer URLs and GetInput action URLs.
private callIdToWebhookUrl = new Map<string, string>();
private callUuidToWebhookUrl = new Map<string, string>();
private pendingSpeakByCallId = new Map<string, PendingSpeak>();
private pendingListenByCallId = new Map<string, PendingListen>();
/**
 * @param config Plivo credentials (Auth ID + Auth Token); both required.
 * @param options Optional verification / timeout overrides.
 * @throws Error when authId or authToken is missing (fail fast at startup).
 */
constructor(config: PlivoConfig, options: PlivoProviderOptions = {}) {
  if (!config.authId) {
    throw new Error("Plivo Auth ID is required");
  }
  if (!config.authToken) {
    throw new Error("Plivo Auth Token is required");
  }
  this.authId = config.authId;
  this.authToken = config.authToken;
  // All REST calls are scoped to this account's API root.
  this.baseUrl = `https://api.plivo.com/v1/Account/${this.authId}`;
  // Hostname pin used to restrict outbound API requests.
  this.apiHost = new URL(this.baseUrl).hostname;
  this.options = options;
}
/**
 * Perform an authenticated Plivo REST call and return the parsed JSON body.
 *
 * Uses HTTP Basic auth (authId:authToken) and restricts outbound requests to
 * the pinned Plivo API hostname via `allowedHostnames`.
 *
 * @param params.allowNotFound When true, a 404 is tolerated (used for
 *   best-effort hangup/cancel of calls that may already be gone).
 */
private async apiRequest<T = unknown>(params: {
  method: "GET" | "POST" | "DELETE";
  endpoint: string;
  body?: Record<string, unknown>;
  allowNotFound?: boolean;
}): Promise<T> {
  const { method, endpoint, body, allowNotFound } = params;
  return await guardedJsonApiRequest<T>({
    url: `${this.baseUrl}${endpoint}`,
    method,
    headers: {
      Authorization: `Basic ${Buffer.from(`${this.authId}:${this.authToken}`).toString("base64")}`,
      "Content-Type": "application/json",
    },
    body,
    allowNotFound,
    allowedHostnames: [this.apiHost],
    auditContext: "voice-call.plivo.api",
    errorPrefix: "Plivo API error",
  });
}
/**
 * Verify a Plivo webhook signature before any event parsing.
 *
 * Delegates to verifyPlivoWebhook with the configured public-URL override,
 * forwarded-header trust settings, and the (dev-only) skip flag. Failures
 * are logged with the reason and surfaced to the caller.
 */
verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
  const result = verifyPlivoWebhook(ctx, this.authToken, {
    publicUrl: this.options.publicUrl,
    skipVerification: this.options.skipVerification,
    allowedHosts: this.options.webhookSecurity?.allowedHosts,
    trustForwardingHeaders: this.options.webhookSecurity?.trustForwardingHeaders,
    trustedProxyIPs: this.options.webhookSecurity?.trustedProxyIPs,
    remoteIP: ctx.remoteAddress,
  });
  if (!result.ok) {
    console.warn(`[plivo] Webhook verification failed: ${result.reason}`);
  }
  return {
    ok: result.ok,
    reason: result.reason,
    isReplay: result.isReplay,
    verifiedRequestKey: result.verifiedRequestKey,
  };
}
/**
 * Turn a Plivo webhook request into normalized events and/or a Plivo XML
 * response document.
 *
 * The `flow` query parameter distinguishes:
 * - "xml-speak" / "xml-listen": XML-only endpoints fetched after a call
 *   transfer; they consume pending speak/listen state and emit no events.
 * - "answer" / "getinput": normal event flows answered with keep-alive XML.
 * - anything else: normal event flows answered with an empty XML document.
 */
parseWebhookEvent(
  ctx: WebhookContext,
  options?: WebhookParseOptions,
): ProviderWebhookParseResult {
  const flow = typeof ctx.query?.flow === "string" ? ctx.query.flow.trim() : "";
  const parsed = this.parseBody(ctx.rawBody);
  if (!parsed) {
    return { events: [], statusCode: 400 };
  }
  // Keep providerCallId mapping for later call control.
  const callUuid = parsed.get("CallUUID") || undefined;
  if (callUuid) {
    const webhookBase = this.baseWebhookUrlFromCtx(ctx);
    if (webhookBase) {
      this.callUuidToWebhookUrl.set(callUuid, webhookBase);
    }
  }
  // Special flows that exist only to return Plivo XML (no events).
  if (flow === "xml-speak") {
    const callId = this.getCallIdFromQuery(ctx);
    const pending = callId ? this.pendingSpeakByCallId.get(callId) : undefined;
    if (callId) {
      this.pendingSpeakByCallId.delete(callId);
    }
    // Pending text is consumed exactly once; a repeat fetch just keeps the
    // call alive instead of re-speaking.
    const xml = pending
      ? PlivoProvider.xmlSpeak(pending.text, pending.locale)
      : PlivoProvider.xmlKeepAlive();
    return {
      events: [],
      providerResponseBody: xml,
      providerResponseHeaders: { "Content-Type": "text/xml" },
      statusCode: 200,
    };
  }
  if (flow === "xml-listen") {
    const callId = this.getCallIdFromQuery(ctx);
    const pending = callId ? this.pendingListenByCallId.get(callId) : undefined;
    if (callId) {
      this.pendingListenByCallId.delete(callId);
    }
    const actionUrl = this.buildActionUrl(ctx, {
      flow: "getinput",
      callId,
    });
    // Without an action URL (or call ID) we cannot collect input; just park.
    const xml =
      actionUrl && callId
        ? PlivoProvider.xmlGetInputSpeech({
            actionUrl,
            language: pending?.language,
          })
        : PlivoProvider.xmlKeepAlive();
    return {
      events: [],
      providerResponseBody: xml,
      providerResponseHeaders: { "Content-Type": "text/xml" },
      statusCode: 200,
    };
  }
  // Normal events.
  const callIdFromQuery = this.getCallIdFromQuery(ctx);
  // Prefer the verifier-supplied key (signature nonce) for deduplication.
  const dedupeKey = options?.verifiedRequestKey ?? createPlivoRequestDedupeKey(ctx);
  const event = this.normalizeEvent(parsed, callIdFromQuery, dedupeKey);
  return {
    events: event ? [event] : [],
    providerResponseBody:
      flow === "answer" || flow === "getinput"
        ? PlivoProvider.xmlKeepAlive()
        : PlivoProvider.xmlEmpty(),
    providerResponseHeaders: { "Content-Type": "text/xml" },
    statusCode: 200,
  };
}
/**
 * Map Plivo callback form parameters to a single NormalizedEvent.
 *
 * Precedence: DTMF digits, then a speech transcript, then CallStatus
 * lifecycle transitions, then the StartApp answer fallback. Returns null
 * when nothing recognizable is present.
 */
private normalizeEvent(
  params: URLSearchParams,
  callIdOverride?: string,
  dedupeKey?: string,
): NormalizedEvent | null {
  const callUuid = params.get("CallUUID") || "";
  const requestUuid = params.get("RequestUUID") || "";
  // Learn the RequestUUID -> CallUUID mapping for later call control.
  if (requestUuid && callUuid) {
    this.requestUuidToCallUuid.set(requestUuid, callUuid);
  }
  const direction = params.get("Direction");
  const from = params.get("From") || undefined;
  const to = params.get("To") || undefined;
  const callStatus = params.get("CallStatus");
  const baseEvent = {
    id: crypto.randomUUID(),
    dedupeKey,
    callId: callIdOverride || callUuid || requestUuid,
    providerCallId: callUuid || requestUuid || undefined,
    timestamp: Date.now(),
    direction:
      direction === "inbound"
        ? ("inbound" as const)
        : direction === "outbound"
          ? ("outbound" as const)
          : undefined,
    from,
    to,
  };
  const digits = params.get("Digits");
  if (digits) {
    return { ...baseEvent, type: "call.dtmf", digits };
  }
  const transcript = PlivoProvider.extractTranscript(params);
  if (transcript) {
    // GetInput speech results arrive only once complete, hence isFinal.
    return {
      ...baseEvent,
      type: "call.speech",
      transcript,
      isFinal: true,
    };
  }
  // Call lifecycle.
  if (callStatus === "ringing") {
    return { ...baseEvent, type: "call.ringing" };
  }
  if (callStatus === "in-progress") {
    return { ...baseEvent, type: "call.answered" };
  }
  if (
    callStatus === "completed" ||
    callStatus === "busy" ||
    callStatus === "no-answer" ||
    callStatus === "failed"
  ) {
    return {
      ...baseEvent,
      type: "call.ended",
      reason:
        callStatus === "completed"
          ? "completed"
          : callStatus === "busy"
            ? "busy"
            : callStatus === "no-answer"
              ? "no-answer"
              : "failed",
    };
  }
  // Plivo will call our answer_url when the call is answered; if we don't have
  // a CallStatus for some reason, treat it as answered so the call can proceed.
  if (params.get("Event") === "StartApp" && callUuid) {
    return { ...baseEvent, type: "call.answered" };
  }
  return null;
}
/**
 * Place an outbound call via the Plivo Call API.
 *
 * Tags the answer/hangup webhook URLs with provider/callId/flow query
 * parameters and remembers the webhook base for later call control.
 * Returns the create-call request UUID as providerCallId; later webhook
 * events map it to the live CallUUID.
 */
async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
  const webhookUrl = new URL(input.webhookUrl);
  webhookUrl.searchParams.set("provider", "plivo");
  webhookUrl.searchParams.set("callId", input.callId);
  const answerUrl = new URL(webhookUrl);
  answerUrl.searchParams.set("flow", "answer");
  const hangupUrl = new URL(webhookUrl);
  hangupUrl.searchParams.set("flow", "hangup");
  this.callIdToWebhookUrl.set(input.callId, input.webhookUrl);
  const ringTimeoutSec = this.options.ringTimeoutSec ?? 30;
  const result = await this.apiRequest<PlivoCreateCallResponse>({
    method: "POST",
    endpoint: "/Call/",
    body: {
      from: PlivoProvider.normalizeNumber(input.from),
      to: PlivoProvider.normalizeNumber(input.to),
      answer_url: answerUrl.toString(),
      answer_method: "POST",
      hangup_url: hangupUrl.toString(),
      hangup_method: "POST",
      // Plivo's API uses `hangup_on_ring` for outbound ring timeout.
      // NOTE(review): Plivo also documents `ring_timeout` for unanswered-call
      // timeouts — confirm `hangup_on_ring` gives the intended semantics here.
      hangup_on_ring: ringTimeoutSec,
    },
  });
  // Plivo may return request_uuid as a string or a single-element array.
  const requestUuid = Array.isArray(result.request_uuid)
    ? result.request_uuid[0]
    : result.request_uuid;
  if (!requestUuid) {
    throw new Error("Plivo call create returned no request_uuid");
  }
  return { providerCallId: requestUuid, status: "initiated" };
}
/**
 * Hang up (or cancel) a call.
 *
 * When the create-call request UUID was already mapped to a live CallUUID,
 * delete that call. Otherwise we cannot tell which kind of ID we hold, so we
 * try it as a CallUUID (hangup) and then as a RequestUUID (cancel). 404s are
 * tolerated because the call may have ended already.
 */
async hangupCall(input: HangupCallInput): Promise<void> {
  const callUuid = this.requestUuidToCallUuid.get(input.providerCallId);
  if (callUuid) {
    await this.apiRequest({
      method: "DELETE",
      endpoint: `/Call/${callUuid}/`,
      allowNotFound: true,
    });
    return;
  }
  // Best-effort: try hangup (call UUID), then cancel (request UUID).
  await this.apiRequest({
    method: "DELETE",
    endpoint: `/Call/${input.providerCallId}/`,
    allowNotFound: true,
  });
  await this.apiRequest({
    method: "DELETE",
    endpoint: `/Request/${input.providerCallId}/`,
    allowNotFound: true,
  });
}
/**
 * Resolve the live CallUUID and webhook base URL needed for call control.
 * Throws when the provider state for this call is missing.
 */
private resolveCallContext(params: {
  providerCallId: string;
  callId: string;
  operation: string;
}): {
  callUuid: string;
  webhookBase: string;
} {
  // A create-call request UUID may have been mapped to the live CallUUID;
  // otherwise assume the caller already holds the CallUUID itself.
  const mapped = this.requestUuidToCallUuid.get(params.providerCallId);
  const callUuid = mapped ?? params.providerCallId;
  const webhookBase =
    this.callUuidToWebhookUrl.get(callUuid) || this.callIdToWebhookUrl.get(params.callId);
  if (!webhookBase) {
    throw new Error("Missing webhook URL for this call (provider state missing)");
  }
  if (!callUuid) {
    throw new Error(`Missing Plivo CallUUID for ${params.operation}`);
  }
  return { callUuid, webhookBase };
}
private async transferCallLeg(params: {
callUuid: string;
webhookBase: string;
callId: string;
flow: "xml-speak" | "xml-listen";
}): Promise<void> {
const transferUrl = new URL(params.webhookBase);
transferUrl.searchParams.set("provider", "plivo");
transferUrl.searchParams.set("flow", params.flow);
transferUrl.searchParams.set("callId", params.callId);
await this.apiRequest({
method: "POST",
endpoint: `/Call/${params.callUuid}/`,
body: {
legs: "aleg",
aleg_url: transferUrl.toString(),
aleg_method: "POST",
},
});
}
/**
 * Speak `input.text` on the call by stashing it as pending and transferring
 * the call leg to the xml-speak flow, which consumes it on the next fetch.
 */
async playTts(input: PlayTtsInput): Promise<void> {
  const ctx = this.resolveCallContext({
    providerCallId: input.providerCallId,
    callId: input.callId,
    operation: "playTts",
  });
  this.pendingSpeakByCallId.set(input.callId, { text: input.text, locale: input.locale });
  await this.transferCallLeg({
    callUuid: ctx.callUuid,
    webhookBase: ctx.webhookBase,
    callId: input.callId,
    flow: "xml-speak",
  });
}
async startListening(input: StartListeningInput): Promise<void> {
const { callUuid, webhookBase } = this.resolveCallContext({
providerCallId: input.providerCallId,
callId: input.callId,
operation: "startListening",
});
this.pendingListenByCallId.set(input.callId, {
language: input.language,
});
await this.transferCallLeg({
callUuid,
webhookBase,
callId: input.callId,
flow: "xml-listen",
});
}
  // Intentional no-op: Plivo's <GetInput> speech capture terminates on its
  // own once the caller stops speaking, so there is nothing to cancel.
  async stopListening(_input: StopListeningInput): Promise<void> {
    // GetInput ends automatically when speech ends.
  }
private static normalizeNumber(numberOrSip: string): string {
const trimmed = numberOrSip.trim();
if (trimmed.toLowerCase().startsWith("sip:")) {
return trimmed;
}
return trimmed.replace(/[^\d+]/g, "");
}
  // Minimal valid Plivo XML response: acknowledges a webhook without
  // queueing any call action.
  private static xmlEmpty(): string {
    return `<?xml version="1.0" encoding="UTF-8"?><Response></Response>`;
  }
  // Keeps the call leg open via <Wait length="300"> (300s) so a later
  // transferCallLeg can redirect it instead of the call hanging up.
  private static xmlKeepAlive(): string {
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Wait length="300" />
</Response>`;
  }
  // Renders a <Speak> element for the given text (XML-escaped), followed by a
  // <Wait> that keeps the leg alive after speech finishes. Locale defaults to
  // en-US when not provided.
  private static xmlSpeak(text: string, locale?: string): string {
    const language = locale || "en-US";
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Speak language="${escapeXml(language)}">${escapeXml(text)}</Speak>
  <Wait length="300" />
</Response>`;
  }
  // Renders a <GetInput> speech capture: results are POSTed to actionUrl,
  // capture runs at most 30s (executionTimeout) and ends after 1s of silence
  // (speechEndTimeout). redirect="false" lets execution fall through to the
  // trailing <Wait>, keeping the leg alive after input completes.
  private static xmlGetInputSpeech(params: { actionUrl: string; language?: string }): string {
    const language = params.language || "en-US";
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <GetInput inputType="speech" method="POST" action="${escapeXml(params.actionUrl)}" language="${escapeXml(language)}" executionTimeout="30" speechEndTimeout="1" redirect="false">
  </GetInput>
  <Wait length="300" />
</Response>`;
  }
private getCallIdFromQuery(ctx: WebhookContext): string | undefined {
const callId =
typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
? ctx.query.callId.trim()
: undefined;
return callId || undefined;
}
private buildActionUrl(
ctx: WebhookContext,
opts: { flow: string; callId?: string },
): string | null {
const base = this.baseWebhookUrlFromCtx(ctx);
if (!base) {
return null;
}
const u = new URL(base);
u.searchParams.set("provider", "plivo");
u.searchParams.set("flow", opts.flow);
if (opts.callId) {
u.searchParams.set("callId", opts.callId);
}
return u.toString();
}
private baseWebhookUrlFromCtx(ctx: WebhookContext): string | null {
try {
const u = new URL(
reconstructWebhookUrl(ctx, {
allowedHosts: this.options.webhookSecurity?.allowedHosts,
trustForwardingHeaders: this.options.webhookSecurity?.trustForwardingHeaders,
trustedProxyIPs: this.options.webhookSecurity?.trustedProxyIPs,
remoteIP: ctx.remoteAddress,
}),
);
return `${u.origin}${u.pathname}`;
} catch {
return null;
}
}
private parseBody(rawBody: string): URLSearchParams | null {
try {
return new URLSearchParams(rawBody);
} catch {
return null;
}
}
private static extractTranscript(params: URLSearchParams): string | null {
const candidates = [
"Speech",
"Transcription",
"TranscriptionText",
"SpeechResult",
"RecognizedSpeech",
"Text",
] as const;
for (const key of candidates) {
const value = params.get(key);
if (value && value.trim()) {
return value.trim();
}
}
return null;
}
}
// Response body of Plivo's outbound call API (POST /Call/).
type PlivoCreateCallResponse = {
  api_id?: string;
  message?: string;
  // Plivo returns a single request UUID, or an array when the request fans
  // out to multiple destinations.
  request_uuid?: string | string[];
};

View File

@@ -0,0 +1,42 @@
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk";
/** Parameters for {@link guardedJsonApiRequest}. */
type GuardedJsonApiRequestParams = {
  /** Absolute URL of the JSON endpoint. */
  url: string;
  method: "GET" | "POST" | "DELETE" | "PUT" | "PATCH";
  /** Sent as-is; callers are responsible for auth/content-type headers. */
  headers: Record<string, string>;
  /** JSON-serialized into the request body when present. */
  body?: Record<string, unknown>;
  /** When true, a 404 response resolves to undefined instead of throwing. */
  allowNotFound?: boolean;
  /** Hostname allow-list enforced by the SSRF guard. */
  allowedHostnames: string[];
  /** Label recorded by the SSRF guard's audit log. */
  auditContext: string;
  /** Prefix for error messages thrown on non-2xx responses. */
  errorPrefix: string;
};
/**
 * Perform a JSON API request through the SSRF guard and decode the response.
 *
 * Non-2xx responses throw `"<errorPrefix>: <status> <body>"`, except that a
 * 404 resolves to undefined when `allowNotFound` is set. An empty response
 * body also resolves to undefined. The guard's release() is always awaited.
 */
export async function guardedJsonApiRequest<T = unknown>(
  params: GuardedJsonApiRequestParams,
): Promise<T> {
  const { response, release } = await fetchWithSsrFGuard({
    url: params.url,
    init: {
      method: params.method,
      headers: params.headers,
      body: params.body ? JSON.stringify(params.body) : undefined,
    },
    policy: { allowedHostnames: params.allowedHostnames },
    auditContext: params.auditContext,
  });
  try {
    if (response.ok) {
      const payload = await response.text();
      return payload ? (JSON.parse(payload) as T) : (undefined as T);
    }
    if (params.allowNotFound && response.status === 404) {
      return undefined as T;
    }
    const detail = await response.text();
    throw new Error(`${params.errorPrefix}: ${response.status} ${detail}`);
  } finally {
    await release();
  }
}

View File

@@ -0,0 +1,311 @@
/**
* OpenAI Realtime STT Provider
*
* Uses the OpenAI Realtime API for streaming transcription with:
* - Direct mu-law audio support (no conversion needed)
* - Built-in server-side VAD for turn detection
* - Low-latency streaming transcription
* - Partial transcript callbacks for real-time UI updates
*/
import WebSocket from "ws";
/**
* Configuration for OpenAI Realtime STT.
*/
export interface RealtimeSTTConfig {
/** OpenAI API key */
apiKey: string;
/** Model to use (default: gpt-4o-transcribe) */
model?: string;
/** Silence duration in ms before considering speech ended (default: 800) */
silenceDurationMs?: number;
/** VAD threshold 0-1 (default: 0.5) */
vadThreshold?: number;
}
/**
* Session for streaming audio and receiving transcripts.
*/
export interface RealtimeSTTSession {
/** Connect to the transcription service */
connect(): Promise<void>;
/** Send mu-law audio data (8kHz mono) */
sendAudio(audio: Buffer): void;
/** Wait for next complete transcript (after VAD detects end of speech) */
waitForTranscript(timeoutMs?: number): Promise<string>;
/** Set callback for partial transcripts (streaming) */
onPartial(callback: (partial: string) => void): void;
/** Set callback for final transcripts */
onTranscript(callback: (transcript: string) => void): void;
/** Set callback when speech starts (VAD) */
onSpeechStart(callback: () => void): void;
/** Close the session */
close(): void;
/** Check if session is connected */
isConnected(): boolean;
}
/**
* Provider factory for OpenAI Realtime STT sessions.
*/
export class OpenAIRealtimeSTTProvider {
  readonly name = "openai-realtime";
  private apiKey: string;
  private model: string;
  private silenceDurationMs: number;
  private vadThreshold: number;

  /**
   * @param config - API key is mandatory; model, silence window, and VAD
   *   threshold fall back to defaults when omitted.
   * @throws Error when no API key is provided.
   */
  constructor(config: RealtimeSTTConfig) {
    if (!config.apiKey) {
      throw new Error("OpenAI API key required for Realtime STT");
    }
    this.apiKey = config.apiKey;
    this.model = config.model || "gpt-4o-transcribe";
    // Use ?? (not ||) for the numeric options: vadThreshold is documented as
    // 0-1, so an explicit 0 (and silenceDurationMs: 0) must not be clobbered
    // by the default the way falsy-coalescing || would.
    this.silenceDurationMs = config.silenceDurationMs ?? 800;
    this.vadThreshold = config.vadThreshold ?? 0.5;
  }

  /**
   * Create a new realtime transcription session using this provider's
   * credentials and tuning parameters.
   */
  createSession(): RealtimeSTTSession {
    return new OpenAIRealtimeSTTSession(
      this.apiKey,
      this.model,
      this.silenceDurationMs,
      this.vadThreshold,
    );
  }
}
/**
* WebSocket-based session for real-time speech-to-text.
*/
class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
  private static readonly RECONNECT_DELAY_MS = 1000;
  private ws: WebSocket | null = null;
  private connected = false;
  // Set by close(); suppresses the automatic reconnect loop.
  private closed = false;
  private reconnectAttempts = 0;
  // Accumulates transcription deltas for the current utterance.
  private pendingTranscript = "";
  private onTranscriptCallback: ((transcript: string) => void) | null = null;
  private onPartialCallback: ((partial: string) => void) | null = null;
  private onSpeechStartCallback: (() => void) | null = null;
  constructor(
    private readonly apiKey: string,
    private readonly model: string,
    private readonly silenceDurationMs: number,
    private readonly vadThreshold: number,
  ) {}
  /** Open the realtime WebSocket; resets close/reconnect state first. */
  async connect(): Promise<void> {
    this.closed = false;
    this.reconnectAttempts = 0;
    return this.doConnect();
  }
  // Establishes the WebSocket and configures the transcription session.
  // Resolves on "open"; rejects on a pre-open error or after a 10s timeout.
  private async doConnect(): Promise<void> {
    return new Promise((resolve, reject) => {
      const url = "wss://api.openai.com/v1/realtime?intent=transcription";
      this.ws = new WebSocket(url, {
        headers: {
          Authorization: `Bearer ${this.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      });
      this.ws.on("open", () => {
        console.log("[RealtimeSTT] WebSocket connected");
        this.connected = true;
        this.reconnectAttempts = 0;
        // Configure the transcription session: mu-law telephony audio plus
        // server-side VAD tuned by the constructor parameters.
        this.sendEvent({
          type: "transcription_session.update",
          session: {
            input_audio_format: "g711_ulaw",
            input_audio_transcription: {
              model: this.model,
            },
            turn_detection: {
              type: "server_vad",
              threshold: this.vadThreshold,
              prefix_padding_ms: 300,
              silence_duration_ms: this.silenceDurationMs,
            },
          },
        });
        resolve();
      });
      this.ws.on("message", (data: Buffer) => {
        try {
          const event = JSON.parse(data.toString());
          this.handleEvent(event);
        } catch (e) {
          console.error("[RealtimeSTT] Failed to parse event:", e);
        }
      });
      this.ws.on("error", (error) => {
        console.error("[RealtimeSTT] WebSocket error:", error);
        // Only reject before "open"; post-open errors surface via "close".
        if (!this.connected) {
          reject(error);
        }
      });
      this.ws.on("close", (code, reason) => {
        console.log(
          `[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
        );
        this.connected = false;
        // Attempt reconnection if not intentionally closed
        if (!this.closed) {
          void this.attemptReconnect();
        }
      });
      // NOTE(review): this timer is never cleared. After a successful connect
      // it still fires at 10s; the `!this.connected` guard makes the late
      // reject a no-op on the already-settled promise unless the socket has
      // since dropped — confirm this is acceptable, or clear it on "open".
      setTimeout(() => {
        if (!this.connected) {
          reject(new Error("Realtime STT connection timeout"));
        }
      }, 10000);
    });
  }
  // Reconnect with exponential backoff: 1s, 2s, 4s, ... up to 5 attempts.
  // Gives up silently (log only) once the attempt budget is exhausted.
  private async attemptReconnect(): Promise<void> {
    if (this.closed) {
      return;
    }
    if (this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS) {
      console.error(
        `[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
      );
      return;
    }
    this.reconnectAttempts++;
    const delay = OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
    console.log(
      `[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
    );
    await new Promise((resolve) => setTimeout(resolve, delay));
    if (this.closed) {
      return;
    }
    try {
      await this.doConnect();
      console.log("[RealtimeSTT] Reconnected successfully");
    } catch (error) {
      console.error("[RealtimeSTT] Reconnect failed:", error);
    }
  }
  // Dispatch a parsed server event: accumulate deltas into pendingTranscript,
  // fire the final-transcript callback on completion, and reset the buffer
  // whenever new speech starts.
  private handleEvent(event: {
    type: string;
    delta?: string;
    transcript?: string;
    error?: unknown;
  }): void {
    switch (event.type) {
      case "transcription_session.created":
      case "transcription_session.updated":
      case "input_audio_buffer.speech_stopped":
      case "input_audio_buffer.committed":
        console.log(`[RealtimeSTT] ${event.type}`);
        break;
      case "conversation.item.input_audio_transcription.delta":
        if (event.delta) {
          this.pendingTranscript += event.delta;
          this.onPartialCallback?.(this.pendingTranscript);
        }
        break;
      case "conversation.item.input_audio_transcription.completed":
        if (event.transcript) {
          console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
          this.onTranscriptCallback?.(event.transcript);
        }
        this.pendingTranscript = "";
        break;
      case "input_audio_buffer.speech_started":
        console.log("[RealtimeSTT] Speech started");
        this.pendingTranscript = "";
        this.onSpeechStartCallback?.();
        break;
      case "error":
        console.error("[RealtimeSTT] Error:", event.error);
        break;
    }
  }
  // Serialize and send an event; silently dropped when the socket isn't open.
  private sendEvent(event: unknown): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(event));
    }
  }
  // Forward mu-law audio to the input buffer; dropped while disconnected.
  sendAudio(muLawData: Buffer): void {
    if (!this.connected) {
      return;
    }
    this.sendEvent({
      type: "input_audio_buffer.append",
      audio: muLawData.toString("base64"),
    });
  }
  onPartial(callback: (partial: string) => void): void {
    this.onPartialCallback = callback;
  }
  onTranscript(callback: (transcript: string) => void): void {
    this.onTranscriptCallback = callback;
  }
  onSpeechStart(callback: () => void): void {
    this.onSpeechStartCallback = callback;
  }
  // NOTE(review): this installs its own transcript handler, clobbering any
  // callback registered via onTranscript(), and leaves it null afterwards —
  // callers should use either the callback API or waitForTranscript, not
  // both. Confirm that's the intended contract.
  async waitForTranscript(timeoutMs = 30000): Promise<string> {
    return new Promise((resolve, reject) => {
      const timeout = setTimeout(() => {
        this.onTranscriptCallback = null;
        reject(new Error("Transcript timeout"));
      }, timeoutMs);
      this.onTranscriptCallback = (transcript) => {
        clearTimeout(timeout);
        this.onTranscriptCallback = null;
        resolve(transcript);
      };
    });
  }
  // Permanently close the session; disables the reconnect loop via `closed`.
  close(): void {
    this.closed = true;
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.connected = false;
  }
  isConnected(): boolean {
    return this.connected;
  }
}

View File

@@ -0,0 +1,166 @@
import crypto from "node:crypto";
import { describe, expect, it } from "vitest";
import type { WebhookContext } from "../types.js";
import { TelnyxProvider } from "./telnyx.js";
/** Build a WebhookContext with sensible defaults, overridable per test. */
function createCtx(params?: Partial<WebhookContext>): WebhookContext {
  const defaults: WebhookContext = {
    headers: {},
    rawBody: "{}",
    url: "http://localhost/voice/webhook",
    method: "POST",
    query: {},
    remoteAddress: "127.0.0.1",
  };
  return { ...defaults, ...params };
}
/**
 * Decode a base64url string (RFC 4648 §5) into a Buffer.
 *
 * Node's Buffer understands the base64url alphabet and missing padding
 * natively, so the previous manual -/_ substitution and re-padding were
 * redundant.
 */
function decodeBase64Url(input: string): Buffer {
  return Buffer.from(input, "base64url");
}
/**
 * Sign a sample webhook payload with the given Ed25519 private key and
 * assert that a provider configured with the matching public key accepts it.
 */
function expectWebhookVerificationSucceeds(params: {
  publicKey: string;
  privateKey: crypto.KeyObject;
}) {
  const provider = new TelnyxProvider(
    { apiKey: "KEY123", connectionId: "CONN456", publicKey: params.publicKey },
    { skipVerification: false },
  );
  const rawBody = JSON.stringify({
    event_type: "call.initiated",
    payload: { call_control_id: "x" },
  });
  const timestamp = `${Math.floor(Date.now() / 1000)}`;
  // Telnyx signs `${timestamp}|${body}` with Ed25519.
  const signature = crypto
    .sign(null, Buffer.from(`${timestamp}|${rawBody}`), params.privateKey)
    .toString("base64");
  const ctx = createCtx({
    rawBody,
    headers: {
      "telnyx-signature-ed25519": signature,
      "telnyx-timestamp": timestamp,
    },
  });
  expect(provider.verifyWebhook(ctx).ok).toBe(true);
}
describe("TelnyxProvider.verifyWebhook", () => {
  // Security default: no public key and no explicit skip must reject.
  it("fails closed when public key is missing and skipVerification is false", () => {
    const provider = new TelnyxProvider(
      { apiKey: "KEY123", connectionId: "CONN456", publicKey: undefined },
      { skipVerification: false },
    );
    const result = provider.verifyWebhook(createCtx());
    expect(result.ok).toBe(false);
  });
  it("allows requests when skipVerification is true (development only)", () => {
    const provider = new TelnyxProvider(
      { apiKey: "KEY123", connectionId: "CONN456", publicKey: undefined },
      { skipVerification: true },
    );
    const result = provider.verifyWebhook(createCtx());
    expect(result.ok).toBe(true);
  });
  it("fails when signature headers are missing (with public key configured)", () => {
    const provider = new TelnyxProvider(
      { apiKey: "KEY123", connectionId: "CONN456", publicKey: "public-key" },
      { skipVerification: false },
    );
    const result = provider.verifyWebhook(createCtx({ headers: {} }));
    expect(result.ok).toBe(false);
  });
  // The provider accepts both raw 32-byte Ed25519 keys and DER SPKI keys.
  it("verifies a valid signature with a raw Ed25519 public key (Base64)", () => {
    const { publicKey, privateKey } = crypto.generateKeyPairSync("ed25519");
    // Extract the raw public key bytes via the JWK export ("x" is base64url).
    const jwk = publicKey.export({ format: "jwk" }) as JsonWebKey;
    expect(jwk.kty).toBe("OKP");
    expect(jwk.crv).toBe("Ed25519");
    expect(typeof jwk.x).toBe("string");
    const rawPublicKey = decodeBase64Url(jwk.x as string);
    const rawPublicKeyBase64 = rawPublicKey.toString("base64");
    expectWebhookVerificationSucceeds({ publicKey: rawPublicKeyBase64, privateKey });
  });
  it("verifies a valid signature with a DER SPKI public key (Base64)", () => {
    const { publicKey, privateKey } = crypto.generateKeyPairSync("ed25519");
    const spkiDer = publicKey.export({ format: "der", type: "spki" }) as Buffer;
    const spkiDerBase64 = spkiDer.toString("base64");
    expectWebhookVerificationSucceeds({ publicKey: spkiDerBase64, privateKey });
  });
  // Replay protection: a re-delivered identical request still verifies but is
  // flagged with isReplay and the same verifiedRequestKey for dedupe.
  it("returns replay status when the same signed request is seen twice", () => {
    const { publicKey, privateKey } = crypto.generateKeyPairSync("ed25519");
    const spkiDer = publicKey.export({ format: "der", type: "spki" }) as Buffer;
    const provider = new TelnyxProvider(
      { apiKey: "KEY123", connectionId: "CONN456", publicKey: spkiDer.toString("base64") },
      { skipVerification: false },
    );
    const rawBody = JSON.stringify({
      event_type: "call.initiated",
      payload: { call_control_id: "call-replay-test" },
      nonce: crypto.randomUUID(),
    });
    const timestamp = String(Math.floor(Date.now() / 1000));
    const signedPayload = `${timestamp}|${rawBody}`;
    const signature = crypto.sign(null, Buffer.from(signedPayload), privateKey).toString("base64");
    const ctx = createCtx({
      rawBody,
      headers: {
        "telnyx-signature-ed25519": signature,
        "telnyx-timestamp": timestamp,
      },
    });
    const first = provider.verifyWebhook(ctx);
    const second = provider.verifyWebhook(ctx);
    expect(first.ok).toBe(true);
    expect(first.isReplay).toBeFalsy();
    expect(first.verifiedRequestKey).toBeTruthy();
    expect(second.ok).toBe(true);
    expect(second.isReplay).toBe(true);
    expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
  });
});
describe("TelnyxProvider.parseWebhookEvent", () => {
  // The dedupeKey on emitted events must come from the verified request key
  // supplied by webhook verification, not from the event payload itself.
  it("uses verified request key for manager dedupe", () => {
    const provider = new TelnyxProvider({
      apiKey: "KEY123",
      connectionId: "CONN456",
      publicKey: undefined,
    });
    const result = provider.parseWebhookEvent(
      createCtx({
        rawBody: JSON.stringify({
          data: {
            id: "evt-123",
            event_type: "call.initiated",
            payload: { call_control_id: "call-1" },
          },
        }),
      }),
      { verifiedRequestKey: "telnyx:req:abc" },
    );
    expect(result.events).toHaveLength(1);
    expect(result.events[0]?.dedupeKey).toBe("telnyx:req:abc");
  });
});

View File

@@ -0,0 +1,324 @@
import crypto from "node:crypto";
import type { TelnyxConfig } from "../config.js";
import type {
EndReason,
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
NormalizedEvent,
PlayTtsInput,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookParseOptions,
WebhookVerificationResult,
} from "../types.js";
import { verifyTelnyxWebhook } from "../webhook-security.js";
import type { VoiceCallProvider } from "./base.js";
import { guardedJsonApiRequest } from "./shared/guarded-json-api.js";
/**
* Telnyx Voice API provider implementation.
*
* Uses Telnyx Call Control API v2 for managing calls.
* @see https://developers.telnyx.com/docs/api/v2/call-control
*/
export interface TelnyxProviderOptions {
/** Skip webhook signature verification (development only, NOT for production) */
skipVerification?: boolean;
}
export class TelnyxProvider implements VoiceCallProvider {
  readonly name = "telnyx" as const;
  private readonly apiKey: string;
  private readonly connectionId: string;
  // Ed25519 public key for webhook verification; verification fails closed
  // when absent unless options.skipVerification is set.
  private readonly publicKey: string | undefined;
  private readonly options: TelnyxProviderOptions;
  private readonly baseUrl = "https://api.telnyx.com/v2";
  // Hostname allow-list entry handed to the SSRF guard.
  private readonly apiHost = "api.telnyx.com";
  constructor(config: TelnyxConfig, options: TelnyxProviderOptions = {}) {
    if (!config.apiKey) {
      throw new Error("Telnyx API key is required");
    }
    if (!config.connectionId) {
      throw new Error("Telnyx connection ID is required");
    }
    this.apiKey = config.apiKey;
    this.connectionId = config.connectionId;
    this.publicKey = config.publicKey;
    this.options = options;
  }
  /**
   * Make an authenticated request to the Telnyx API.
   *
   * All Call Control commands are POSTs with a JSON body; requests go through
   * the SSRF-guarded fetch helper restricted to api.telnyx.com. With
   * allowNotFound, a 404 resolves to undefined instead of throwing.
   */
  private async apiRequest<T = unknown>(
    endpoint: string,
    body: Record<string, unknown>,
    options?: { allowNotFound?: boolean },
  ): Promise<T> {
    return await guardedJsonApiRequest<T>({
      url: `${this.baseUrl}${endpoint}`,
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body,
      allowNotFound: options?.allowNotFound,
      allowedHostnames: [this.apiHost],
      auditContext: "voice-call.telnyx.api",
      errorPrefix: "Telnyx API error",
    });
  }
  /**
   * Verify Telnyx webhook signature using Ed25519.
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    const result = verifyTelnyxWebhook(ctx, this.publicKey, {
      skipVerification: this.options.skipVerification,
    });
    return {
      ok: result.ok,
      reason: result.reason,
      isReplay: result.isReplay,
      verifiedRequestKey: result.verifiedRequestKey,
    };
  }
  /**
   * Parse Telnyx webhook event into normalized format.
   *
   * Returns 200 with zero events for bodies that parse but carry no
   * event_type (or an unrecognized one), and 400 when the body is not JSON.
   */
  parseWebhookEvent(
    ctx: WebhookContext,
    options?: WebhookParseOptions,
  ): ProviderWebhookParseResult {
    try {
      const payload = JSON.parse(ctx.rawBody);
      const data = payload.data;
      if (!data || !data.event_type) {
        return { events: [], statusCode: 200 };
      }
      const event = this.normalizeEvent(data, options?.verifiedRequestKey);
      return {
        events: event ? [event] : [],
        statusCode: 200,
      };
    } catch {
      return { events: [], statusCode: 400 };
    }
  }
  /**
   * Convert Telnyx event to normalized event format.
   *
   * Returns null for event types we don't map. The normalized callId prefers
   * our own id round-tripped through client_state, falling back to the
   * Telnyx call_control_id.
   */
  private normalizeEvent(data: TelnyxEvent, dedupeKey?: string): NormalizedEvent | null {
    // Decode client_state from Base64 (we encode it in initiateCall)
    let callId = "";
    if (data.payload?.client_state) {
      try {
        callId = Buffer.from(data.payload.client_state, "base64").toString("utf8");
      } catch {
        // Fallback if not valid Base64
        // NOTE(review): Buffer.from(..., "base64") does not throw on malformed
        // input (invalid characters are skipped), so this catch is effectively
        // unreachable and a non-base64 client_state decodes to garbage rather
        // than falling back — confirm all writers base64-encode client_state.
        callId = data.payload.client_state;
      }
    }
    if (!callId) {
      callId = data.payload?.call_control_id || "";
    }
    const baseEvent = {
      id: data.id || crypto.randomUUID(),
      dedupeKey,
      callId,
      providerCallId: data.payload?.call_control_id,
      timestamp: Date.now(),
    };
    switch (data.event_type) {
      case "call.initiated":
        return { ...baseEvent, type: "call.initiated" };
      case "call.ringing":
        return { ...baseEvent, type: "call.ringing" };
      case "call.answered":
        return { ...baseEvent, type: "call.answered" };
      case "call.bridged":
        return { ...baseEvent, type: "call.active" };
      case "call.speak.started":
        return {
          ...baseEvent,
          type: "call.speaking",
          text: data.payload?.text || "",
        };
      case "call.transcription":
        return {
          ...baseEvent,
          type: "call.speech",
          transcript: data.payload?.transcription || "",
          // Telnyx may omit is_final; treat missing as final.
          isFinal: data.payload?.is_final ?? true,
          confidence: data.payload?.confidence,
        };
      case "call.hangup":
        return {
          ...baseEvent,
          type: "call.ended",
          reason: this.mapHangupCause(data.payload?.hangup_cause),
        };
      case "call.dtmf.received":
        return {
          ...baseEvent,
          type: "call.dtmf",
          digits: data.payload?.digit || "",
        };
      default:
        return null;
    }
  }
  /**
   * Map Telnyx hangup cause to normalized end reason.
   * Unknown causes are logged and mapped to "completed".
   * @see https://developers.telnyx.com/docs/api/v2/call-control/Call-Commands#hangup-causes
   */
  private mapHangupCause(cause?: string): EndReason {
    switch (cause) {
      case "normal_clearing":
      case "normal_unspecified":
        return "completed";
      case "originator_cancel":
        return "hangup-bot";
      case "call_rejected":
      case "user_busy":
        return "busy";
      case "no_answer":
      case "no_user_response":
        return "no-answer";
      case "destination_out_of_order":
      case "network_out_of_order":
      case "service_unavailable":
      case "recovery_on_timer_expire":
        return "failed";
      case "machine_detected":
      case "fax_detected":
        return "voicemail";
      case "user_hangup":
      case "subscriber_absent":
        return "hangup-user";
      default:
        // Unknown cause - log it for debugging and return completed
        if (cause) {
          console.warn(`[telnyx] Unknown hangup cause: ${cause}`);
        }
        return "completed";
    }
  }
  /**
   * Initiate an outbound call via Telnyx API.
   *
   * Our internal callId travels round-trip through client_state
   * (base64-encoded), which normalizeEvent decodes on each webhook.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const result = await this.apiRequest<TelnyxCallResponse>("/calls", {
      connection_id: this.connectionId,
      to: input.to,
      from: input.from,
      webhook_url: input.webhookUrl,
      webhook_url_method: "POST",
      client_state: Buffer.from(input.callId).toString("base64"),
      // Seconds to wait for an answer before the attempt times out.
      timeout_secs: 30,
    });
    return {
      providerCallId: result.data.call_control_id,
      status: "initiated",
    };
  }
  /**
   * Hang up a call via Telnyx API. Tolerates 404 (call already gone).
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    await this.apiRequest(
      `/calls/${input.providerCallId}/actions/hangup`,
      { command_id: crypto.randomUUID() },
      { allowNotFound: true },
    );
  }
  /**
   * Play TTS audio via Telnyx speak action.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    await this.apiRequest(`/calls/${input.providerCallId}/actions/speak`, {
      command_id: crypto.randomUUID(),
      voice: input.voice || "female",
      payload: input.text,
      language: input.locale || "en-US",
    });
  }
  /**
   * Start transcription (STT) via Telnyx.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    await this.apiRequest(`/calls/${input.providerCallId}/actions/transcription_start`, {
      command_id: crypto.randomUUID(),
      language: input.language || "en",
    });
  }
  /**
   * Stop transcription via Telnyx. Tolerates 404 (call already gone).
   */
  async stopListening(input: StopListeningInput): Promise<void> {
    await this.apiRequest(
      `/calls/${input.providerCallId}/actions/transcription_stop`,
      { command_id: crypto.randomUUID() },
      { allowNotFound: true },
    );
  }
}
// -----------------------------------------------------------------------------
// Telnyx-specific types
// -----------------------------------------------------------------------------
// Raw shape of the `data` object in a Telnyx webhook body.
interface TelnyxEvent {
  id?: string;
  // e.g. "call.initiated", "call.answered", "call.transcription", "call.hangup"
  event_type: string;
  payload?: {
    call_control_id?: string;
    // Base64-encoded copy of our internal callId (set in initiateCall).
    client_state?: string;
    text?: string;
    transcription?: string;
    is_final?: boolean;
    confidence?: number;
    hangup_cause?: string;
    digit?: string;
    [key: string]: unknown;
  };
}
// Response body of Telnyx POST /calls (Dial command).
interface TelnyxCallResponse {
  data: {
    // Handle used for all subsequent Call Control commands.
    call_control_id: string;
    call_leg_id: string;
    call_session_id: string;
    is_alive: boolean;
    record_type: string;
  };
}

View File

@@ -0,0 +1,259 @@
/**
* OpenAI TTS Provider
*
* Generates speech audio using OpenAI's text-to-speech API.
* Handles audio format conversion for telephony (mu-law 8kHz).
*
* Best practices from OpenAI docs:
* - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
* - Use tts-1 for lower latency, tts-1-hd for higher quality
* - Use marin or cedar voices for best quality
* - Use pcm or wav format for fastest response times
*
* @see https://platform.openai.com/docs/guides/text-to-speech
*/
/**
* OpenAI TTS configuration.
*/
export interface OpenAITTSConfig {
/** OpenAI API key (uses OPENAI_API_KEY env if not set) */
apiKey?: string;
/**
* TTS model:
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
* - tts-1: lower latency
* - tts-1-hd: higher quality
*/
model?: string;
/**
* Voice to use. For best quality, use marin or cedar.
* All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
* Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
*/
voice?: string;
/** Speed multiplier (0.25 to 4.0) */
speed?: number;
/**
* Instructions for speech style (only works with gpt-4o-mini-tts model).
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
*/
instructions?: string;
}
/**
* Supported OpenAI TTS voices (all 13 built-in voices).
* For best quality, use marin or cedar.
* Note: tts-1 and tts-1-hd support a smaller set.
*/
/**
 * Supported OpenAI TTS voices (all 13 built-in voices).
 * For best quality, use marin or cedar.
 * Note: tts-1 and tts-1-hd support a smaller set.
 */
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "coral",
  "echo",
  "fable",
  "nova",
  "onyx",
  "sage",
  "shimmer",
  "verse",
  "marin",
  "cedar",
] as const;
// Union of the literal voice names above ("alloy" | ... | "cedar").
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
/**
* OpenAI TTS Provider for generating speech audio.
*/
export class OpenAITTSProvider {
  private apiKey: string;
  private model: string;
  private voice: OpenAITTSVoice;
  private speed: number;
  private instructions?: string;

  constructor(config: OpenAITTSConfig = {}) {
    this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
    if (!this.apiKey) {
      throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
    }
    // gpt-4o-mini-tts is the default model: it supports style instructions.
    this.model = config.model || "gpt-4o-mini-tts";
    // coral balances quality and natural tone.
    this.voice = (config.voice as OpenAITTSVoice) || "coral";
    this.speed = config.speed || 1.0;
    this.instructions = config.instructions;
  }

  /**
   * Generate speech audio from text.
   * Returns raw PCM audio data (24kHz, mono, 16-bit signed LE).
   *
   * @param text - Text to synthesize.
   * @param instructions - Per-call style instructions; overrides the
   *   constructor-level instructions. Only sent for gpt-4o-mini-tts models.
   * @throws Error when the OpenAI API responds with a non-2xx status.
   */
  async synthesize(text: string, instructions?: string): Promise<Buffer> {
    const effectiveInstructions = instructions || this.instructions;
    const includeInstructions =
      Boolean(effectiveInstructions) && this.model.includes("gpt-4o-mini-tts");
    const requestBody: Record<string, unknown> = {
      model: this.model,
      input: text,
      voice: this.voice,
      response_format: "pcm", // raw PCM: 24kHz mono 16-bit signed LE, fastest format
      speed: this.speed,
      ...(includeInstructions ? { instructions: effectiveInstructions } : {}),
    };
    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(requestBody),
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
    }
    return Buffer.from(await response.arrayBuffer());
  }

  /**
   * Generate speech and convert it to 8kHz mono mu-law, the format Twilio
   * Media Streams expect.
   */
  async synthesizeForTwilio(text: string): Promise<Buffer> {
    const wideband = await this.synthesize(text); // 24kHz PCM from OpenAI
    const narrowband = resample24kTo8k(wideband); // down to 8kHz PCM
    return pcmToMulaw(narrowband);
  }
}
/**
* Resample 24kHz PCM to 8kHz using linear interpolation.
* Input/output: 16-bit signed little-endian mono.
*/
/**
 * Resample 24kHz PCM to 8kHz by 3:1 decimation (every third sample).
 * Input/output: 16-bit signed little-endian mono.
 *
 * The previous version carried a linear-interpolation branch, but the source
 * position was always a whole sample (i * 3), so the interpolation factor
 * (`srcPos % 1`) was always 0 and both the interpolation and the tail-sample
 * fallback were dead code — plain decimation is exactly equivalent.
 *
 * NOTE(review): no low-pass filter is applied before decimation, so content
 * above 4kHz will alias. Acceptable for telephony speech; confirm before
 * reusing for wideband audio.
 */
function resample24kTo8k(input: Buffer): Buffer {
  const outputSamples = Math.floor(input.length / 6); // 2 bytes/sample, 3:1 ratio
  const output = Buffer.alloc(outputSamples * 2);
  for (let i = 0; i < outputSamples; i++) {
    output.writeInt16LE(input.readInt16LE(i * 6), i * 2);
  }
  return output;
}
/**
* Clamp value to 16-bit signed integer range.
*/
/** Clamp a number into the 16-bit signed integer range [-32768, 32767]. */
function clamp16(value: number): number {
  if (value > 32767) {
    return 32767;
  }
  if (value < -32768) {
    return -32768;
  }
  return value;
}
/**
* Convert 16-bit PCM to 8-bit mu-law.
* Standard G.711 mu-law encoding for telephony.
*/
/**
 * Encode 16-bit signed LE PCM to 8-bit G.711 mu-law, one byte per sample.
 */
function pcmToMulaw(pcm: Buffer): Buffer {
  const sampleCount = pcm.length >> 1;
  const encoded = Buffer.alloc(sampleCount);
  for (let offset = 0; offset < sampleCount; offset++) {
    encoded[offset] = linearToMulaw(pcm.readInt16LE(offset * 2));
  }
  return encoded;
}
/**
* Convert a single 16-bit linear sample to 8-bit mu-law.
* Implements ITU-T G.711 mu-law encoding.
*/
/**
 * Encode one 16-bit linear sample as an 8-bit mu-law byte (ITU-T G.711).
 * The result is bit-inverted, as required for transmission.
 */
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;
  const sign = sample < 0 ? 0x80 : 0;
  // Biased magnitude, clipped to avoid overflow.
  let magnitude = Math.min(Math.abs(sample), CLIP) + BIAS;
  // Locate the segment: highest set bit from 0x4000 downward.
  let exponent = 7;
  let probe = 0x4000;
  while (exponent > 0 && (magnitude & probe) === 0) {
    exponent--;
    probe >>= 1;
  }
  const mantissa = (magnitude >> (exponent + 3)) & 0x0f;
  return ~(sign | (exponent << 4) | mantissa) & 0xff;
}
/**
* Convert 8-bit mu-law to 16-bit linear PCM.
* Useful for decoding incoming audio.
*/
/**
 * Decode one 8-bit mu-law byte (ITU-T G.711) back to a 16-bit linear sample.
 * Useful for decoding incoming telephony audio.
 */
export function mulawToLinear(mulaw: number): number {
  const inverted = ~mulaw & 0xff; // wire format is bit-inverted
  const isNegative = (inverted & 0x80) !== 0;
  const exponent = (inverted >> 4) & 0x07;
  const mantissa = inverted & 0x0f;
  const magnitude = (((mantissa << 3) + 132) << exponent) - 132;
  return isNegative ? -magnitude : magnitude;
}
/**
* Chunk audio buffer into 20ms frames for streaming.
* At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
*/
/**
 * Lazily yield fixed-size frames of an audio buffer for streaming.
 * At 8kHz mono mu-law, the default 160 bytes equals one 20ms frame; the
 * final chunk may be shorter.
 */
export function* chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, void, unknown> {
  let offset = 0;
  while (offset < audio.length) {
    const end = Math.min(offset + chunkSize, audio.length);
    yield audio.subarray(offset, end);
    offset = end;
  }
}

View File

@@ -0,0 +1,117 @@
import { describe, expect, it } from "vitest";
import type { WebhookContext } from "../types.js";
import { TwilioProvider } from "./twilio.js";
const STREAM_URL = "wss://example.ngrok.app/voice/stream";
/** Construct a TwilioProvider wired to the test ngrok URL and stream path. */
function createProvider(): TwilioProvider {
  const credentials = { accountSid: "AC123", authToken: "secret" };
  const endpoints = { publicUrl: "https://example.ngrok.app", streamPath: "/voice/stream" };
  return new TwilioProvider(credentials, endpoints);
}
/** Build a minimal POST WebhookContext for the given body and query. */
function createContext(rawBody: string, query?: WebhookContext["query"]): WebhookContext {
  return {
    method: "POST",
    url: "https://example.ngrok.app/voice/twilio",
    headers: {},
    rawBody,
    query,
  };
}
// Webhook parsing behavior of TwilioProvider: TwiML selection per call phase
// and dedupe-key derivation for retried/duplicate webhook deliveries.
describe("TwilioProvider", () => {
  // Outbound conversation calls must get streaming TwiML on the very first
  // webhook (CallStatus=initiated), not wait for "in-progress".
  it("returns streaming TwiML for outbound conversation calls before in-progress", () => {
    const provider = createProvider();
    const ctx = createContext("CallStatus=initiated&Direction=outbound-api&CallSid=CA123", {
      callId: "call-1",
    });
    const result = provider.parseWebhookEvent(ctx);
    expect(result.providerResponseBody).toContain(STREAM_URL);
    expect(result.providerResponseBody).toContain('<Parameter name="token" value="');
    expect(result.providerResponseBody).toContain("<Connect>");
  });
  // Status callbacks (query type=status) must not receive call-control TwiML.
  it("returns empty TwiML for status callbacks", () => {
    const provider = createProvider();
    const ctx = createContext("CallStatus=ringing&Direction=outbound-api", {
      callId: "call-1",
      type: "status",
    });
    const result = provider.parseWebhookEvent(ctx);
    expect(result.providerResponseBody).toBe(
      '<?xml version="1.0" encoding="UTF-8"?><Response></Response>',
    );
  });
  // Inbound calls are answered immediately with a <Connect><Stream> response.
  it("returns streaming TwiML for inbound calls", () => {
    const provider = createProvider();
    const ctx = createContext("CallStatus=ringing&Direction=inbound&CallSid=CA456");
    const result = provider.parseWebhookEvent(ctx);
    expect(result.providerResponseBody).toContain(STREAM_URL);
    expect(result.providerResponseBody).toContain('<Parameter name="token" value="');
    expect(result.providerResponseBody).toContain("<Connect>");
  });
  // Without a verified request key, identical payloads must hash to the same
  // fallback dedupeKey even though event ids are random per parse.
  it("uses a stable fallback dedupeKey for identical request payloads", () => {
    const provider = createProvider();
    const rawBody = "CallSid=CA789&Direction=inbound&SpeechResult=hello";
    const ctxA = {
      ...createContext(rawBody, { callId: "call-1", turnToken: "turn-1" }),
      headers: { "i-twilio-idempotency-token": "idem-123" },
    };
    const ctxB = {
      ...createContext(rawBody, { callId: "call-1", turnToken: "turn-1" }),
      headers: { "i-twilio-idempotency-token": "idem-123" },
    };
    const eventA = provider.parseWebhookEvent(ctxA).events[0];
    const eventB = provider.parseWebhookEvent(ctxB).events[0];
    expect(eventA).toBeDefined();
    expect(eventB).toBeDefined();
    expect(eventA?.id).not.toBe(eventB?.id);
    expect(eventA?.dedupeKey).toContain("twilio:fallback:");
    expect(eventA?.dedupeKey).toBe(eventB?.dedupeKey);
  });
  // When verification supplies a request key, it wins over any fallback
  // hashing — header differences must not change the dedupeKey.
  it("uses verified request key for dedupe and ignores idempotency header changes", () => {
    const provider = createProvider();
    const rawBody = "CallSid=CA790&Direction=inbound&SpeechResult=hello";
    const ctxA = {
      ...createContext(rawBody, { callId: "call-1", turnToken: "turn-1" }),
      headers: { "i-twilio-idempotency-token": "idem-a" },
    };
    const ctxB = {
      ...createContext(rawBody, { callId: "call-1", turnToken: "turn-1" }),
      headers: { "i-twilio-idempotency-token": "idem-b" },
    };
    const eventA = provider.parseWebhookEvent(ctxA, { verifiedRequestKey: "twilio:req:abc" })
      .events[0];
    const eventB = provider.parseWebhookEvent(ctxB, { verifiedRequestKey: "twilio:req:abc" })
      .events[0];
    expect(eventA?.dedupeKey).toBe("twilio:req:abc");
    expect(eventB?.dedupeKey).toBe("twilio:req:abc");
  });
  // turnToken is carried through from the query string onto speech events.
  it("keeps turnToken from query on speech events", () => {
    const provider = createProvider();
    const ctx = createContext("CallSid=CA222&Direction=inbound&SpeechResult=hello", {
      callId: "call-2",
      turnToken: "turn-xyz",
    });
    const event = provider.parseWebhookEvent(ctx).events[0];
    expect(event?.type).toBe("call.speech");
    expect(event?.turnToken).toBe("turn-xyz");
  });
});

View File

@@ -0,0 +1,687 @@
import crypto from "node:crypto";
import type { TwilioConfig, WebhookSecurityConfig } from "../config.js";
import { getHeader } from "../http-headers.js";
import type { MediaStreamHandler } from "../media-stream.js";
import { chunkAudio } from "../telephony-audio.js";
import type { TelephonyTtsProvider } from "../telephony-tts.js";
import type {
HangupCallInput,
InitiateCallInput,
InitiateCallResult,
NormalizedEvent,
PlayTtsInput,
ProviderWebhookParseResult,
StartListeningInput,
StopListeningInput,
WebhookContext,
WebhookParseOptions,
WebhookVerificationResult,
} from "../types.js";
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
import type { VoiceCallProvider } from "./base.js";
import { twilioApiRequest } from "./twilio/api.js";
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
/**
 * Derive a stable dedupe key for a Twilio webhook request.
 *
 * Prefers the key produced by signature verification; otherwise hashes the
 * stable request identifiers plus the raw body so byte-identical retries
 * collapse to the same key.
 */
function createTwilioRequestDedupeKey(ctx: WebhookContext, verifiedRequestKey?: string): string {
  if (verifiedRequestKey) {
    return verifiedRequestKey;
  }
  const body = new URLSearchParams(ctx.rawBody);
  const callId = typeof ctx.query?.callId === "string" ? ctx.query.callId.trim() : "";
  const flow = typeof ctx.query?.flow === "string" ? ctx.query.flow.trim() : "";
  const turnToken = typeof ctx.query?.turnToken === "string" ? ctx.query.turnToken.trim() : "";
  // Order matters: must stay stable across releases or dedupe breaks on upgrade.
  const fingerprint = [
    getHeader(ctx.headers, "x-twilio-signature") ?? "",
    body.get("CallSid") ?? "",
    body.get("CallStatus") ?? "",
    body.get("Direction") ?? "",
    callId,
    flow,
    turnToken,
    ctx.rawBody,
  ].join("\n");
  const digest = crypto.createHash("sha256").update(fingerprint).digest("hex");
  return `twilio:fallback:${digest}`;
}
/**
 * Construction options for the Twilio Voice API provider.
 *
 * The provider implements Twilio Programmable Voice with Media Streams for
 * real-time bidirectional audio streaming.
 *
 * @see https://www.twilio.com/docs/voice
 * @see https://www.twilio.com/docs/voice/media-streams
 */
export interface TwilioProviderOptions {
  /** Allow ngrok free tier compatibility mode (loopback only, less secure) */
  allowNgrokFreeTierLoopbackBypass?: boolean;
  /** Override public URL for signature verification */
  publicUrl?: string;
  /** Path for media stream WebSocket (e.g., /voice/stream) */
  streamPath?: string;
  /** Skip webhook signature verification (development only) */
  skipVerification?: boolean;
  /** Webhook security options (forwarded headers/allowlist) */
  webhookSecurity?: WebhookSecurityConfig;
}
export class TwilioProvider implements VoiceCallProvider {
  readonly name = "twilio" as const;
  /** Twilio Account SID (AC...) used for REST auth and base URL construction. */
  private readonly accountSid: string;
  /** Twilio Auth Token used for REST auth and webhook signature verification. */
  private readonly authToken: string;
  /** Account-scoped base URL for the 2010-04-01 REST API. */
  private readonly baseUrl: string;
  /** Provider call SID -> webhook URL (with callId query) used for that call. */
  private readonly callWebhookUrls = new Map<string, string>();
  /** Construction options (public URL, stream path, security toggles). */
  private readonly options: TwilioProviderOptions;
  /** Current public webhook URL (set when tunnel starts or from config) */
  private currentPublicUrl: string | null = null;
  /** Optional telephony TTS provider for streaming TTS */
  private ttsProvider: TelephonyTtsProvider | null = null;
  /** Optional media stream handler for sending audio */
  private mediaStreamHandler: MediaStreamHandler | null = null;
  /** Map of call SID to stream SID for media streams */
  private callStreamMap = new Map<string, string>();
  /** Per-call tokens for media stream authentication */
  private streamAuthTokens = new Map<string, string>();
  /** Storage for TwiML content (for notify mode with URL-based TwiML) */
  private readonly twimlStorage = new Map<string, string>();
  /** Track notify-mode calls to avoid streaming on follow-up callbacks */
  private readonly notifyCalls = new Set<string>();
  /**
   * Delete stored TwiML for a given `callId`.
   *
   * We keep TwiML in-memory only long enough to satisfy the initial Twilio
   * webhook request (notify mode). Subsequent webhooks should not reuse it.
   */
  private deleteStoredTwiml(callId: string): void {
    this.twimlStorage.delete(callId);
    this.notifyCalls.delete(callId);
  }
  /**
   * Delete stored TwiML for a call, addressed by Twilio's provider call SID.
   *
   * This is used when we only have `providerCallId` (e.g. hangup).
   */
  private deleteStoredTwimlForProviderCall(providerCallId: string): void {
    const webhookUrl = this.callWebhookUrls.get(providerCallId);
    if (!webhookUrl) {
      return;
    }
    // Parse the stored URL instead of regex-matching so a percent-encoded
    // callId decodes back to the exact key used in `twimlStorage`/`notifyCalls`
    // (initiateCall writes callId via URL.searchParams.set, which encodes it).
    const callId = new URL(webhookUrl).searchParams.get("callId");
    if (!callId) {
      return;
    }
    this.deleteStoredTwiml(callId);
    this.streamAuthTokens.delete(providerCallId);
  }
  /**
   * @param config - Twilio credentials; Account SID and Auth Token are required.
   * @param options - Webhook/stream behavior overrides.
   * @throws Error when either credential is missing.
   */
  constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
    if (!config.accountSid) {
      throw new Error("Twilio Account SID is required");
    }
    if (!config.authToken) {
      throw new Error("Twilio Auth Token is required");
    }
    this.accountSid = config.accountSid;
    this.authToken = config.authToken;
    this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${this.accountSid}`;
    this.options = options;
    if (options.publicUrl) {
      this.currentPublicUrl = options.publicUrl;
    }
  }
  /** Update the public webhook URL (e.g. after a tunnel comes up). */
  setPublicUrl(url: string): void {
    this.currentPublicUrl = url;
  }
  /** Current public webhook URL, or null when none is configured yet. */
  getPublicUrl(): string | null {
    return this.currentPublicUrl;
  }
  /** Attach a telephony TTS provider used for streaming TTS over media streams. */
  setTTSProvider(provider: TelephonyTtsProvider): void {
    this.ttsProvider = provider;
  }
  /** Attach the media stream handler used to send audio frames to Twilio. */
  setMediaStreamHandler(handler: MediaStreamHandler): void {
    this.mediaStreamHandler = handler;
  }
  /** Record the media stream SID for a call (from the stream "start" message). */
  registerCallStream(callSid: string, streamSid: string): void {
    this.callStreamMap.set(callSid, streamSid);
  }
  /** Forget the media stream mapping for a call (stream closed). */
  unregisterCallStream(callSid: string): void {
    this.callStreamMap.delete(callSid);
  }
  /**
   * Validate a media-stream auth token for a call in constant time.
   * Performs a dummy comparison on length mismatch so timing does not leak
   * whether a token exists or how long it is.
   */
  isValidStreamToken(callSid: string, token?: string): boolean {
    const expected = this.streamAuthTokens.get(callSid);
    if (!expected || !token) {
      return false;
    }
    if (expected.length !== token.length) {
      const dummy = Buffer.from(expected);
      crypto.timingSafeEqual(dummy, dummy);
      return false;
    }
    return crypto.timingSafeEqual(Buffer.from(expected), Buffer.from(token));
  }
  /**
   * Clear TTS queue for a call (barge-in).
   * Used when user starts speaking to interrupt current TTS playback.
   */
  clearTtsQueue(callSid: string): void {
    const streamSid = this.callStreamMap.get(callSid);
    if (streamSid && this.mediaStreamHandler) {
      this.mediaStreamHandler.clearTtsQueue(streamSid);
    }
  }
  /**
   * Make an authenticated request to the Twilio API.
   */
  private async apiRequest<T = unknown>(
    endpoint: string,
    params: Record<string, string | string[]>,
    options?: { allowNotFound?: boolean },
  ): Promise<T> {
    return await twilioApiRequest<T>({
      baseUrl: this.baseUrl,
      accountSid: this.accountSid,
      authToken: this.authToken,
      endpoint,
      body: params,
      allowNotFound: options?.allowNotFound,
    });
  }
  /**
   * Verify Twilio webhook signature using HMAC-SHA1.
   *
   * Handles reverse proxy scenarios (Tailscale, nginx, ngrok) by reconstructing
   * the public URL from forwarding headers.
   *
   * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    return verifyTwilioProviderWebhook({
      ctx,
      authToken: this.authToken,
      currentPublicUrl: this.currentPublicUrl,
      options: this.options,
    });
  }
  /**
   * Parse Twilio webhook event into normalized format.
   *
   * Always responds with TwiML (Twilio requires an XML body); malformed
   * payloads yield an empty event list with HTTP 400.
   */
  parseWebhookEvent(
    ctx: WebhookContext,
    options?: WebhookParseOptions,
  ): ProviderWebhookParseResult {
    try {
      const params = new URLSearchParams(ctx.rawBody);
      const callIdFromQuery =
        typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
          ? ctx.query.callId.trim()
          : undefined;
      const turnTokenFromQuery =
        typeof ctx.query?.turnToken === "string" && ctx.query.turnToken.trim()
          ? ctx.query.turnToken.trim()
          : undefined;
      const dedupeKey = createTwilioRequestDedupeKey(ctx, options?.verifiedRequestKey);
      const event = this.normalizeEvent(params, {
        callIdOverride: callIdFromQuery,
        dedupeKey,
        turnToken: turnTokenFromQuery,
      });
      // For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
      // so the webhook response is typically a pause to keep the call alive.
      const twiml = this.generateTwimlResponse(ctx);
      return {
        events: event ? [event] : [],
        providerResponseBody: twiml,
        providerResponseHeaders: { "Content-Type": "application/xml" },
        statusCode: 200,
      };
    } catch {
      return { events: [], statusCode: 400 };
    }
  }
  /**
   * Parse Twilio direction to normalized format.
   */
  private static parseDirection(direction: string | null): "inbound" | "outbound" | undefined {
    if (direction === "inbound") {
      return "inbound";
    }
    if (direction === "outbound-api" || direction === "outbound-dial") {
      return "outbound";
    }
    return undefined;
  }
  /**
   * Convert Twilio webhook params to normalized event format.
   * Returns null for statuses we do not surface as events.
   */
  private normalizeEvent(
    params: URLSearchParams,
    options?: {
      callIdOverride?: string;
      dedupeKey?: string;
      turnToken?: string;
    },
  ): NormalizedEvent | null {
    const callSid = params.get("CallSid") || "";
    const callIdOverride = options?.callIdOverride;
    const baseEvent = {
      id: crypto.randomUUID(),
      dedupeKey: options?.dedupeKey,
      callId: callIdOverride || callSid,
      providerCallId: callSid,
      timestamp: Date.now(),
      turnToken: options?.turnToken,
      direction: TwilioProvider.parseDirection(params.get("Direction")),
      from: params.get("From") || undefined,
      to: params.get("To") || undefined,
    };
    // Handle speech result (from <Gather>)
    const speechResult = params.get("SpeechResult");
    if (speechResult) {
      return {
        ...baseEvent,
        type: "call.speech",
        transcript: speechResult,
        isFinal: true,
        // Twilio omits Confidence on some payloads; default to 0.9.
        confidence: parseFloat(params.get("Confidence") || "0.9"),
      };
    }
    // Handle DTMF
    const digits = params.get("Digits");
    if (digits) {
      return { ...baseEvent, type: "call.dtmf", digits };
    }
    // Handle call status changes
    const callStatus = params.get("CallStatus");
    switch (callStatus) {
      case "initiated":
        return { ...baseEvent, type: "call.initiated" };
      case "ringing":
        return { ...baseEvent, type: "call.ringing" };
      case "in-progress":
        return { ...baseEvent, type: "call.answered" };
      case "completed":
      case "busy":
      case "no-answer":
      case "failed":
        // Terminal states: release per-call secrets and one-shot TwiML.
        this.streamAuthTokens.delete(callSid);
        if (callIdOverride) {
          this.deleteStoredTwiml(callIdOverride);
        }
        return { ...baseEvent, type: "call.ended", reason: callStatus };
      case "canceled":
        this.streamAuthTokens.delete(callSid);
        if (callIdOverride) {
          this.deleteStoredTwiml(callIdOverride);
        }
        return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
      default:
        return null;
    }
  }
  private static readonly EMPTY_TWIML =
    '<?xml version="1.0" encoding="UTF-8"?><Response></Response>';
  private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Pause length="30"/>
</Response>`;
  /**
   * Generate TwiML response for webhook.
   * When a call is answered, connects to media stream for bidirectional audio.
   */
  private generateTwimlResponse(ctx?: WebhookContext): string {
    if (!ctx) {
      return TwilioProvider.EMPTY_TWIML;
    }
    const params = new URLSearchParams(ctx.rawBody);
    const type = typeof ctx.query?.type === "string" ? ctx.query.type.trim() : undefined;
    const isStatusCallback = type === "status";
    const callStatus = params.get("CallStatus");
    const direction = params.get("Direction");
    const isOutbound = direction?.startsWith("outbound") ?? false;
    const callSid = params.get("CallSid") || undefined;
    const callIdFromQuery =
      typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
        ? ctx.query.callId.trim()
        : undefined;
    // Avoid logging webhook params/TwiML (may contain PII).
    // Handle initial TwiML request (when Twilio first initiates the call)
    // Check if we have stored TwiML for this call (notify mode)
    if (callIdFromQuery && !isStatusCallback) {
      const storedTwiml = this.twimlStorage.get(callIdFromQuery);
      if (storedTwiml) {
        // Clean up after serving (one-time use)
        this.deleteStoredTwiml(callIdFromQuery);
        return storedTwiml;
      }
      if (this.notifyCalls.has(callIdFromQuery)) {
        return TwilioProvider.EMPTY_TWIML;
      }
      // Conversation mode: return streaming TwiML immediately for outbound calls.
      if (isOutbound) {
        const streamUrl = callSid ? this.getStreamUrlForCall(callSid) : null;
        return streamUrl ? this.getStreamConnectXml(streamUrl) : TwilioProvider.PAUSE_TWIML;
      }
    }
    // Status callbacks should not receive TwiML.
    if (isStatusCallback) {
      return TwilioProvider.EMPTY_TWIML;
    }
    // Handle subsequent webhook requests (status callbacks, etc.)
    // For inbound calls, answer immediately with stream
    if (direction === "inbound") {
      const streamUrl = callSid ? this.getStreamUrlForCall(callSid) : null;
      return streamUrl ? this.getStreamConnectXml(streamUrl) : TwilioProvider.PAUSE_TWIML;
    }
    // For outbound calls, only connect to stream when call is in-progress
    if (callStatus !== "in-progress") {
      return TwilioProvider.EMPTY_TWIML;
    }
    const streamUrl = callSid ? this.getStreamUrlForCall(callSid) : null;
    return streamUrl ? this.getStreamConnectXml(streamUrl) : TwilioProvider.PAUSE_TWIML;
  }
  /**
   * Get the WebSocket URL for media streaming.
   * Derives from the public URL origin + stream path.
   */
  private getStreamUrl(): string | null {
    if (!this.currentPublicUrl || !this.options.streamPath) {
      return null;
    }
    // Extract just the origin (host) from the public URL, ignoring any path
    const url = new URL(this.currentPublicUrl);
    const origin = url.origin;
    // Convert https:// to wss:// for WebSocket
    const wsOrigin = origin.replace(/^https:\/\//, "wss://").replace(/^http:\/\//, "ws://");
    // Append the stream path
    const path = this.options.streamPath.startsWith("/")
      ? this.options.streamPath
      : `/${this.options.streamPath}`;
    return `${wsOrigin}${path}`;
  }
  /** Get (or lazily mint) the per-call media-stream auth token. */
  private getStreamAuthToken(callSid: string): string {
    const existing = this.streamAuthTokens.get(callSid);
    if (existing) {
      return existing;
    }
    const token = crypto.randomBytes(16).toString("base64url");
    this.streamAuthTokens.set(callSid, token);
    return token;
  }
  /** Build the stream URL for a call with its auth token as a query param. */
  private getStreamUrlForCall(callSid: string): string | null {
    const baseUrl = this.getStreamUrl();
    if (!baseUrl) {
      return null;
    }
    const token = this.getStreamAuthToken(callSid);
    const url = new URL(baseUrl);
    url.searchParams.set("token", token);
    return url.toString();
  }
  /**
   * Generate TwiML to connect a call to a WebSocket media stream.
   * This enables bidirectional audio streaming for real-time STT/TTS.
   *
   * @param streamUrl - WebSocket URL (wss://...) for the media stream
   */
  getStreamConnectXml(streamUrl: string): string {
    // Extract token from URL and pass via <Parameter> instead of query string.
    // Twilio strips query params from WebSocket URLs, but delivers <Parameter>
    // values in the "start" message's customParameters field.
    const parsed = new URL(streamUrl);
    const token = parsed.searchParams.get("token");
    parsed.searchParams.delete("token");
    const cleanUrl = parsed.toString();
    const paramXml = token ? `\n      <Parameter name="token" value="${escapeXml(token)}" />` : "";
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Connect>
    <Stream url="${escapeXml(cleanUrl)}">${paramXml}
    </Stream>
  </Connect>
</Response>`;
  }
  /**
   * Initiate an outbound call via Twilio API.
   * If inlineTwiml is provided, uses that directly (for notify mode).
   * Otherwise, uses webhook URL for dynamic TwiML.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const url = new URL(input.webhookUrl);
    url.searchParams.set("callId", input.callId);
    // Create separate URL for status callbacks (required by Twilio)
    const statusUrl = new URL(input.webhookUrl);
    statusUrl.searchParams.set("callId", input.callId);
    statusUrl.searchParams.set("type", "status"); // Differentiate from TwiML requests
    // Store TwiML content if provided (for notify mode)
    // We now serve it from the webhook endpoint instead of sending inline
    if (input.inlineTwiml) {
      this.twimlStorage.set(input.callId, input.inlineTwiml);
      this.notifyCalls.add(input.callId);
    }
    // Build request params - always use URL-based TwiML.
    // Twilio silently ignores `StatusCallback` when using the inline `Twiml` parameter.
    const params: Record<string, string | string[]> = {
      To: input.to,
      From: input.from,
      Url: url.toString(), // TwiML serving endpoint
      StatusCallback: statusUrl.toString(), // Separate status callback endpoint
      StatusCallbackEvent: ["initiated", "ringing", "answered", "completed"],
      Timeout: "30",
    };
    const result = await this.apiRequest<TwilioCallResponse>("/Calls.json", params);
    this.callWebhookUrls.set(result.sid, url.toString());
    return {
      providerCallId: result.sid,
      status: result.status === "queued" ? "queued" : "initiated",
    };
  }
  /**
   * Hang up a call via Twilio API.
   * Cleans up per-call state first; a 404 from Twilio (already ended) is OK.
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    this.deleteStoredTwimlForProviderCall(input.providerCallId);
    this.callWebhookUrls.delete(input.providerCallId);
    this.streamAuthTokens.delete(input.providerCallId);
    await this.apiRequest(
      `/Calls/${input.providerCallId}.json`,
      { Status: "completed" },
      { allowNotFound: true },
    );
  }
  /**
   * Play TTS audio via Twilio.
   *
   * Two modes:
   * 1. Core TTS + Media Streams: If TTS provider and media stream are available,
   *    generates audio via core TTS and streams it through WebSocket (preferred).
   * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
   *    Note: This may not work on all Twilio accounts.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    // Try telephony TTS via media stream first (if configured)
    const streamSid = this.callStreamMap.get(input.providerCallId);
    if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
      try {
        await this.playTtsViaStream(input.text, streamSid);
        return;
      } catch (err) {
        console.warn(
          `[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
          err instanceof Error ? err.message : err,
        );
        // Fall through to TwiML <Say> fallback
      }
    }
    // Fall back to TwiML <Say> (may not work on all accounts)
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error("Missing webhook URL for this call (provider state not initialized)");
    }
    console.warn(
      "[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
    );
    const pollyVoice = mapVoiceToPolly(input.voice);
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Say voice="${pollyVoice}" language="${input.locale || "en-US"}">${escapeXml(input.text)}</Say>
  <Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
    <Say>.</Say>
  </Gather>
</Response>`;
    await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
      Twiml: twiml,
    });
  }
  /**
   * Play TTS via core TTS and Twilio Media Streams.
   * Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
   * Uses a queue to serialize playback and prevent overlapping audio.
   */
  private async playTtsViaStream(text: string, streamSid: string): Promise<void> {
    if (!this.ttsProvider || !this.mediaStreamHandler) {
      throw new Error("TTS provider and media stream handler required");
    }
    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;
    const CHUNK_DELAY_MS = 20;
    const handler = this.mediaStreamHandler;
    const ttsProvider = this.ttsProvider;
    await handler.queueTts(streamSid, async (signal) => {
      // Generate audio with core TTS (returns mu-law at 8kHz)
      const muLawAudio = await ttsProvider.synthesizeForTelephony(text);
      for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
        if (signal.aborted) {
          break;
        }
        handler.sendAudio(streamSid, chunk);
        // Pace the audio to match real-time playback
        await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
        if (signal.aborted) {
          break;
        }
      }
      if (!signal.aborted) {
        // Send a mark to track when audio finishes
        handler.sendMark(streamSid, `tts-${Date.now()}`);
      }
    });
  }
  /**
   * Start listening for speech via Twilio <Gather>.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error("Missing webhook URL for this call (provider state not initialized)");
    }
    const actionUrl = new URL(webhookUrl);
    if (input.turnToken) {
      actionUrl.searchParams.set("turnToken", input.turnToken);
    }
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Gather input="speech" speechTimeout="auto" language="${input.language || "en-US"}" action="${escapeXml(actionUrl.toString())}" method="POST">
  </Gather>
</Response>`;
    await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
      Twiml: twiml,
    });
  }
  /**
   * Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
   */
  async stopListening(_input: StopListeningInput): Promise<void> {
    // Twilio's <Gather> automatically stops on speech end
    // No explicit action needed
  }
}
// -----------------------------------------------------------------------------
// Twilio-specific types
// -----------------------------------------------------------------------------
/** Subset of Twilio's Call resource we consume from the Calls API response. */
interface TwilioCallResponse {
  /** Provider call SID (CA...) identifying the call */
  sid: string;
  /** Twilio call status (e.g. "queued", "in-progress") */
  status: string;
  /** Twilio direction string (e.g. "outbound-api") */
  direction: string;
  from: string;
  to: string;
  /** Resource URI relative to the API base */
  uri: string;
}

View File

@@ -0,0 +1,42 @@
/**
 * POST a form-encoded request to the Twilio REST API with basic auth.
 *
 * Array values are expanded into repeated form keys (Twilio's convention for
 * multi-value fields). When `allowNotFound` is set, a 404 resolves to
 * `undefined` instead of throwing. Empty response bodies also resolve to
 * `undefined`.
 *
 * @throws Error on any other non-2xx response.
 */
export async function twilioApiRequest<T = unknown>(params: {
  baseUrl: string;
  accountSid: string;
  authToken: string;
  endpoint: string;
  body: URLSearchParams | Record<string, string | string[]>;
  allowNotFound?: boolean;
}): Promise<T> {
  let form: URLSearchParams;
  if (params.body instanceof URLSearchParams) {
    form = params.body;
  } else {
    form = new URLSearchParams();
    for (const [key, value] of Object.entries(params.body)) {
      if (Array.isArray(value)) {
        for (const item of value) {
          form.append(key, item);
        }
      } else if (typeof value === "string") {
        form.append(key, value);
      }
    }
  }
  const credentials = Buffer.from(`${params.accountSid}:${params.authToken}`).toString("base64");
  const response = await fetch(`${params.baseUrl}${params.endpoint}`, {
    method: "POST",
    headers: {
      Authorization: `Basic ${credentials}`,
      "Content-Type": "application/x-www-form-urlencoded",
    },
    body: form,
  });
  if (!response.ok) {
    // e.g. hanging up a call that already ended.
    if (params.allowNotFound && response.status === 404) {
      return undefined as T;
    }
    const errorText = await response.text();
    throw new Error(`Twilio API error: ${response.status} ${errorText}`);
  }
  const text = await response.text();
  return text ? (JSON.parse(text) as T) : (undefined as T);
}

View File

@@ -0,0 +1,34 @@
import type { WebhookContext, WebhookVerificationResult } from "../../types.js";
import { verifyTwilioWebhook } from "../../webhook-security.js";
import type { TwilioProviderOptions } from "../twilio.js";
/**
 * Verify a Twilio webhook request on behalf of the provider, translating
 * provider options into the shared `verifyTwilioWebhook` option shape and
 * logging (but not throwing) on failure.
 */
export function verifyTwilioProviderWebhook(params: {
  ctx: WebhookContext;
  authToken: string;
  currentPublicUrl?: string | null;
  options: TwilioProviderOptions;
}): WebhookVerificationResult {
  const { ctx, authToken, currentPublicUrl, options } = params;
  const security = options.webhookSecurity;
  const result = verifyTwilioWebhook(ctx, authToken, {
    publicUrl: currentPublicUrl || undefined,
    allowNgrokFreeTierLoopbackBypass: options.allowNgrokFreeTierLoopbackBypass ?? false,
    skipVerification: options.skipVerification,
    allowedHosts: security?.allowedHosts,
    trustForwardingHeaders: security?.trustForwardingHeaders,
    trustedProxyIPs: security?.trustedProxyIPs,
    remoteIP: ctx.remoteAddress,
  });
  if (!result.ok) {
    // Log enough to debug signature mismatches without dumping the payload.
    console.warn(`[twilio] Webhook verification failed: ${result.reason}`);
    if (result.verificationUrl) {
      console.warn(`[twilio] Verification URL: ${result.verificationUrl}`);
    }
  }
  const { ok, reason, isReplay, verifiedRequestKey } = result;
  return { ok, reason, isReplay, verifiedRequestKey };
}

View File

@@ -0,0 +1,158 @@
/**
* Voice call response generator - uses the embedded Pi agent for tool support.
* Routes voice responses through the same agent infrastructure as messaging.
*/
import crypto from "node:crypto";
import type { VoiceCallConfig } from "./config.js";
import { loadCoreAgentDeps, type CoreConfig } from "./core-bridge.js";
/** Inputs required to generate one voice response turn for an active call. */
export type VoiceResponseParams = {
  /** Voice call config */
  voiceConfig: VoiceCallConfig;
  /** Core OpenClaw config */
  coreConfig: CoreConfig;
  /** Call ID for session tracking */
  callId: string;
  /** Caller's phone number */
  from: string;
  /** Conversation transcript */
  transcript: Array<{ speaker: "user" | "bot"; text: string }>;
  /** Latest user message */
  userMessage: string;
};
/** Outcome of a response-generation attempt: text to speak, or null + error. */
export type VoiceResponseResult = {
  text: string | null;
  error?: string;
};
/** Persistent session-store record keyed by `voice:<normalized phone>`. */
type SessionEntry = {
  // Stable agent session id reused across calls from the same number
  sessionId: string;
  // Epoch ms of last write to this entry
  updatedAt: number;
};
/**
 * Generate a voice response using the embedded Pi agent with full tool support.
 * Uses the same agent infrastructure as messaging for consistent behavior.
 *
 * Sessions are keyed by the caller's phone number (digits only), so repeat
 * callers resume the same agent session across calls.
 *
 * @returns `{ text }` with the agent's reply, or `{ text: null, error }` when
 *   dependencies are missing, the run aborts, or the agent throws.
 */
export async function generateVoiceResponse(
  params: VoiceResponseParams,
): Promise<VoiceResponseResult> {
  const { voiceConfig, callId, from, transcript, userMessage, coreConfig } = params;
  if (!coreConfig) {
    return { text: null, error: "Core config unavailable for voice response" };
  }
  // Core agent deps are loaded lazily; failures are reported, not thrown.
  let deps: Awaited<ReturnType<typeof loadCoreAgentDeps>>;
  try {
    deps = await loadCoreAgentDeps();
  } catch (err) {
    return {
      text: null,
      error: err instanceof Error ? err.message : "Unable to load core agent dependencies",
    };
  }
  const cfg = coreConfig;
  // Build voice-specific session key based on phone number
  const normalizedPhone = from.replace(/\D/g, "");
  const sessionKey = `voice:${normalizedPhone}`;
  const agentId = "main";
  // Resolve paths
  const storePath = deps.resolveStorePath(cfg.session?.store, { agentId });
  const agentDir = deps.resolveAgentDir(cfg, agentId);
  const workspaceDir = deps.resolveAgentWorkspaceDir(cfg, agentId);
  // Ensure workspace exists
  await deps.ensureAgentWorkspace({ dir: workspaceDir });
  // Load or create session entry
  const sessionStore = deps.loadSessionStore(storePath);
  const now = Date.now();
  let sessionEntry = sessionStore[sessionKey] as SessionEntry | undefined;
  if (!sessionEntry) {
    sessionEntry = {
      sessionId: crypto.randomUUID(),
      updatedAt: now,
    };
    sessionStore[sessionKey] = sessionEntry;
    // NOTE(review): the store is only persisted when a new entry is created;
    // updatedAt is never refreshed on reuse — confirm intended.
    await deps.saveSessionStore(storePath, sessionStore);
  }
  const sessionId = sessionEntry.sessionId;
  const sessionFile = deps.resolveSessionFilePath(sessionId, sessionEntry, {
    agentId,
  });
  // Resolve model from config
  const modelRef = voiceConfig.responseModel || `${deps.DEFAULT_PROVIDER}/${deps.DEFAULT_MODEL}`;
  // Split "provider/model"; a bare name means the default provider.
  const slashIndex = modelRef.indexOf("/");
  const provider = slashIndex === -1 ? deps.DEFAULT_PROVIDER : modelRef.slice(0, slashIndex);
  const model = slashIndex === -1 ? modelRef : modelRef.slice(slashIndex + 1);
  // Resolve thinking level
  const thinkLevel = deps.resolveThinkingDefault({ cfg, provider, model });
  // Resolve agent identity for personalized prompt
  const identity = deps.resolveAgentIdentity(cfg, agentId);
  const agentName = identity?.name?.trim() || "assistant";
  // Build system prompt with conversation history
  const basePrompt =
    voiceConfig.responseSystemPrompt ??
    `You are ${agentName}, a helpful voice assistant on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. The caller's phone number is ${from}. You have access to tools - use them when helpful.`;
  let extraSystemPrompt = basePrompt;
  if (transcript.length > 0) {
    // Prior turns are embedded in the system prompt rather than as messages.
    const history = transcript
      .map((entry) => `${entry.speaker === "bot" ? "You" : "Caller"}: ${entry.text}`)
      .join("\n");
    extraSystemPrompt = `${basePrompt}\n\nConversation so far:\n${history}`;
  }
  // Resolve timeout
  const timeoutMs = voiceConfig.responseTimeoutMs ?? deps.resolveAgentTimeoutMs({ cfg });
  const runId = `voice:${callId}:${Date.now()}`;
  try {
    const result = await deps.runEmbeddedPiAgent({
      sessionId,
      sessionKey,
      messageProvider: "voice",
      sessionFile,
      workspaceDir,
      config: cfg,
      prompt: userMessage,
      provider,
      model,
      thinkLevel,
      verboseLevel: "off",
      timeoutMs,
      runId,
      lane: "voice",
      extraSystemPrompt,
      agentDir,
    });
    // Extract text from payloads, dropping error payloads; join multiple
    // payloads into one utterance.
    const texts = (result.payloads ?? [])
      .filter((p) => p.text && !p.isError)
      .map((p) => p.text?.trim())
      .filter(Boolean);
    const text = texts.join(" ") || null;
    if (!text && result.meta?.aborted) {
      return { text: null, error: "Response generation was aborted" };
    }
    return { text };
  } catch (err) {
    console.error(`[voice-call] Response generation failed:`, err);
    return { text: null, error: String(err) };
  }
}

View File

@@ -0,0 +1,217 @@
import type { VoiceCallConfig } from "./config.js";
import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js";
import type { CoreConfig } from "./core-bridge.js";
import { CallManager } from "./manager.js";
import type { VoiceCallProvider } from "./providers/base.js";
import { MockProvider } from "./providers/mock.js";
import { PlivoProvider } from "./providers/plivo.js";
import { TelnyxProvider } from "./providers/telnyx.js";
import { TwilioProvider } from "./providers/twilio.js";
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
import { startTunnel, type TunnelResult } from "./tunnel.js";
import {
cleanupTailscaleExposure,
setupTailscaleExposure,
VoiceCallWebhookServer,
} from "./webhook.js";
/** Fully-wired voice-call runtime: provider, call manager, and webhook server. */
export type VoiceCallRuntime = {
  // Resolved (defaults applied) plugin config
  config: VoiceCallConfig;
  // Active telephony provider (twilio/telnyx/plivo/mock)
  provider: VoiceCallProvider;
  manager: CallManager;
  webhookServer: VoiceCallWebhookServer;
  // Full webhook endpoint URL handed to the provider
  webhookUrl: string;
  // Publicly reachable base URL, or null when not exposed
  publicUrl: string | null;
  // Tear down server/tunnel/exposure resources
  stop: () => Promise<void>;
};
/** Minimal logger contract; `debug` is optional. */
type Logger = {
  info: (message: string) => void;
  warn: (message: string) => void;
  error: (message: string) => void;
  debug?: (message: string) => void;
};
/**
 * True when the webhook server bind address is a loopback alias.
 * Anything else — including undefined, "" and 0.0.0.0 — is treated as public.
 */
function isLoopbackBind(bind: string | undefined): boolean {
  switch (bind) {
    case "127.0.0.1":
    case "::1":
    case "localhost":
      return true;
    default:
      return false;
  }
}
/**
 * Instantiate the telephony provider selected by config.
 *
 * The ngrok free-tier signature bypass is only honored when the tunnel
 * provider is ngrok, the webhook server binds to loopback, and the config
 * explicitly opts in.
 *
 * @throws Error for an unrecognized provider name.
 */
function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
  const allowNgrokFreeTierLoopbackBypass =
    (config.tunnel?.allowNgrokFreeTierLoopbackBypass ?? false) &&
    config.tunnel?.provider === "ngrok" &&
    isLoopbackBind(config.serve?.bind);
  switch (config.provider) {
    case "telnyx": {
      return new TelnyxProvider(
        {
          apiKey: config.telnyx?.apiKey,
          connectionId: config.telnyx?.connectionId,
          publicKey: config.telnyx?.publicKey,
        },
        { skipVerification: config.skipSignatureVerification },
      );
    }
    case "twilio": {
      // Only hand the provider a stream path when streaming is enabled.
      const streamPath = config.streaming?.enabled ? config.streaming.streamPath : undefined;
      return new TwilioProvider(
        { accountSid: config.twilio?.accountSid, authToken: config.twilio?.authToken },
        {
          allowNgrokFreeTierLoopbackBypass,
          publicUrl: config.publicUrl,
          skipVerification: config.skipSignatureVerification,
          streamPath,
          webhookSecurity: config.webhookSecurity,
        },
      );
    }
    case "plivo": {
      // Plivo takes whole seconds; clamp to at least 1s.
      const ringTimeoutSec = Math.max(1, Math.floor(config.ringTimeoutMs / 1000));
      return new PlivoProvider(
        { authId: config.plivo?.authId, authToken: config.plivo?.authToken },
        {
          publicUrl: config.publicUrl,
          skipVerification: config.skipSignatureVerification,
          ringTimeoutSec,
          webhookSecurity: config.webhookSecurity,
        },
      );
    }
    case "mock":
      return new MockProvider();
    default:
      throw new Error(`Unsupported voice-call provider: ${String(config.provider)}`);
  }
}
/**
 * Boot the voice-call runtime: validate config, construct the provider,
 * start the webhook server, expose it publicly (priority: config.publicUrl >
 * tunnel > legacy tailscale), and wire optional streaming TTS and media
 * handlers for Twilio.
 *
 * Returns a handle whose `stop()` tears down the tunnel, tailscale exposure,
 * and the webhook server, in that order.
 *
 * Throws when the plugin is disabled or provider config is invalid.
 */
export async function createVoiceCallRuntime(params: {
  config: VoiceCallConfig;
  coreConfig: CoreConfig;
  ttsRuntime?: TelephonyTtsRuntime;
  logger?: Logger;
}): Promise<VoiceCallRuntime> {
  const { config: rawConfig, coreConfig, ttsRuntime, logger } = params;
  // Fall back to console logging when the host does not inject a logger.
  const log = logger ?? {
    info: console.log,
    warn: console.warn,
    error: console.error,
    debug: console.debug,
  };
  const config = resolveVoiceCallConfig(rawConfig);
  if (!config.enabled) {
    throw new Error("Voice call disabled. Enable the plugin entry in config.");
  }
  if (config.skipSignatureVerification) {
    log.warn(
      "[voice-call] SECURITY WARNING: skipSignatureVerification=true disables webhook signature verification (development only). Do not use in production.",
    );
  }
  const validation = validateProviderConfig(config);
  if (!validation.valid) {
    throw new Error(`Invalid voice-call config: ${validation.errors.join("; ")}`);
  }
  const provider = resolveProvider(config);
  const manager = new CallManager(config);
  const webhookServer = new VoiceCallWebhookServer(config, manager, provider, coreConfig);
  const localUrl = await webhookServer.start();
  // Determine public URL - priority: config.publicUrl > tunnel > legacy tailscale
  let publicUrl: string | null = config.publicUrl ?? null;
  let tunnelResult: TunnelResult | null = null;
  if (!publicUrl && config.tunnel?.provider && config.tunnel.provider !== "none") {
    try {
      tunnelResult = await startTunnel({
        provider: config.tunnel.provider,
        port: config.serve.port,
        path: config.serve.path,
        ngrokAuthToken: config.tunnel.ngrokAuthToken,
        ngrokDomain: config.tunnel.ngrokDomain,
      });
      publicUrl = tunnelResult?.publicUrl ?? null;
    } catch (err) {
      // Tunnel failure is non-fatal: the runtime still works with local URLs.
      log.error(
        `[voice-call] Tunnel setup failed: ${err instanceof Error ? err.message : String(err)}`,
      );
    }
  }
  if (!publicUrl && config.tailscale?.mode !== "off") {
    publicUrl = await setupTailscaleExposure(config);
  }
  const webhookUrl = publicUrl ?? localUrl;
  if (publicUrl && provider.name === "twilio") {
    (provider as TwilioProvider).setPublicUrl(publicUrl);
  }
  // Twilio media streaming: wire telephony TTS and the media-stream handler.
  if (provider.name === "twilio" && config.streaming?.enabled) {
    const twilioProvider = provider as TwilioProvider;
    if (ttsRuntime?.textToSpeechTelephony) {
      try {
        const ttsProvider = createTelephonyTtsProvider({
          coreConfig,
          ttsOverride: config.tts,
          runtime: ttsRuntime,
        });
        twilioProvider.setTTSProvider(ttsProvider);
        log.info("[voice-call] Telephony TTS provider configured");
      } catch (err) {
        // TTS init failure degrades gracefully rather than aborting boot.
        log.warn(
          `[voice-call] Failed to initialize telephony TTS: ${
            err instanceof Error ? err.message : String(err)
          }`,
        );
      }
    } else {
      log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
    }
    const mediaHandler = webhookServer.getMediaStreamHandler();
    if (mediaHandler) {
      twilioProvider.setMediaStreamHandler(mediaHandler);
      log.info("[voice-call] Media stream handler wired to provider");
    }
  }
  manager.initialize(provider, webhookUrl);
  // Teardown order: tunnel first, then tailscale exposure, then the server.
  const stop = async () => {
    if (tunnelResult) {
      await tunnelResult.stop();
    }
    await cleanupTailscaleExposure(config);
    await webhookServer.stop();
  };
  log.info("[voice-call] Runtime initialized");
  log.info(`[voice-call] Webhook URL: ${webhookUrl}`);
  if (publicUrl) {
    log.info(`[voice-call] Public URL: ${publicUrl}`);
  }
  return {
    config,
    provider,
    manager,
    webhookServer,
    webhookUrl,
    publicUrl,
    stop,
  };
}

View File

@@ -0,0 +1,90 @@
// Target sample rate for telephony media (G.711 operates at 8 kHz).
const TELEPHONY_SAMPLE_RATE = 8000;
/** Clamp a number into the signed 16-bit PCM sample range. */
function clamp16(value: number): number {
  if (value > 32767) {
    return 32767;
  }
  if (value < -32768) {
    return -32768;
  }
  return value;
}
/**
 * Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation
 * between neighboring source samples. Input is returned untouched when it is
 * already at 8 kHz; empty input yields an empty buffer.
 */
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
  if (inputSampleRate === TELEPHONY_SAMPLE_RATE) {
    return input;
  }
  const totalInputSamples = Math.floor(input.length / 2);
  if (totalInputSamples === 0) {
    return Buffer.alloc(0);
  }
  const step = inputSampleRate / TELEPHONY_SAMPLE_RATE;
  const totalOutputSamples = Math.floor(totalInputSamples / step);
  const output = Buffer.alloc(totalOutputSamples * 2);
  for (let outIndex = 0; outIndex < totalOutputSamples; outIndex++) {
    const position = outIndex * step;
    const leftIndex = Math.floor(position);
    // Clamp the right neighbor so the last sample interpolates with itself.
    const rightIndex = Math.min(leftIndex + 1, totalInputSamples - 1);
    const weight = position - leftIndex;
    const left = input.readInt16LE(leftIndex * 2);
    const right = input.readInt16LE(rightIndex * 2);
    const interpolated = Math.round(left + weight * (right - left));
    output.writeInt16LE(clamp16(interpolated), outIndex * 2);
  }
  return output;
}
/**
 * Convert 16-bit PCM to 8-bit mu-law (G.711), one output byte per sample.
 */
export function pcmToMulaw(pcm: Buffer): Buffer {
  const sampleCount = Math.floor(pcm.length / 2);
  const encoded = Buffer.alloc(sampleCount);
  for (let index = 0; index < sampleCount; index++) {
    encoded[index] = linearToMulaw(pcm.readInt16LE(index * 2));
  }
  return encoded;
}
/** Downsample PCM to 8 kHz and mu-law encode it in one step. */
export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
  const downsampled = resamplePcmTo8k(pcm, inputSampleRate);
  return pcmToMulaw(downsampled);
}
/**
 * Chunk audio into fixed-size frames for streaming. The default 160-byte
 * chunk is 20ms of 8kHz mono mu-law; the final chunk may be shorter.
 *
 * Written as a generator function directly instead of the original
 * wrap-an-IIFE-generator pattern — same lazy semantics, less indirection.
 */
export function* chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, void, unknown> {
  for (let offset = 0; offset < audio.length; offset += chunkSize) {
    yield audio.subarray(offset, Math.min(offset + chunkSize, audio.length));
  }
}
/**
 * G.711 mu-law compand a single signed 16-bit sample into one byte.
 * Standard bias/clip constants; result is bitwise-inverted per spec.
 */
function linearToMulaw(sample: number): number {
  const MULAW_BIAS = 132;
  const MULAW_CLIP = 32635;
  const signBit = sample < 0 ? 0x80 : 0;
  let magnitude = Math.abs(sample);
  if (magnitude > MULAW_CLIP) {
    magnitude = MULAW_CLIP;
  }
  magnitude += MULAW_BIAS;
  // Segment = position of the highest set bit, scanned down from bit 14.
  let segment = 7;
  let probe = 0x4000;
  while (segment > 0 && (magnitude & probe) === 0) {
    probe >>= 1;
    segment--;
  }
  const mantissa = (magnitude >> (segment + 3)) & 0x0f;
  return ~(signBit | (segment << 4) | mantissa) & 0xff;
}

View File

@@ -0,0 +1,75 @@
import { afterEach, describe, expect, it } from "vitest";
import type { VoiceCallTtsConfig } from "./config.js";
import type { CoreConfig } from "./core-bridge.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
/** Build a baseline core config with OpenAI TTS settings for the tests. */
function createCoreConfig(): CoreConfig {
  const openai = {
    model: "gpt-4o-mini-tts",
    voice: "alloy",
  };
  const tts: VoiceCallTtsConfig = { provider: "openai", openai };
  return { messages: { tts } };
}
/**
 * Run createTelephonyTtsProvider with the given override and capture the
 * config it forwards to the runtime TTS hook, returning the merged
 * `messages.tts` section for assertions.
 */
async function mergeOverride(override: unknown): Promise<Record<string, unknown>> {
  let mergedConfig: CoreConfig | undefined;
  const provider = createTelephonyTtsProvider({
    coreConfig: createCoreConfig(),
    ttsOverride: override as VoiceCallTtsConfig,
    runtime: {
      // Stub runtime: record the merged config and return a tiny PCM buffer
      // so synthesizeForTelephony completes without real TTS.
      textToSpeechTelephony: async ({ cfg }) => {
        mergedConfig = cfg;
        return {
          success: true,
          audioBuffer: Buffer.alloc(2),
          sampleRate: 8000,
        };
      },
    },
  });
  await provider.synthesizeForTelephony("hello");
  expect(mergedConfig?.messages?.tts).toBeDefined();
  return mergedConfig?.messages?.tts as Record<string, unknown>;
}
// Remove any prototype pollution a test may have caused so cases stay isolated.
afterEach(() => {
  delete (Object.prototype as Record<string, unknown>).polluted;
});
// Guards against prototype pollution via plugin-supplied TTS overrides:
// deepMerge must drop __proto__/prototype/constructor keys at any depth.
describe("createTelephonyTtsProvider deepMerge hardening", () => {
  it("merges safe nested overrides", async () => {
    const tts = await mergeOverride({
      openai: { voice: "coral" },
    });
    const openai = tts.openai as Record<string, unknown>;
    expect(openai.voice).toBe("coral");
    expect(openai.model).toBe("gpt-4o-mini-tts");
  });
  it("blocks top-level __proto__ keys", async () => {
    // JSON.parse is used because a literal __proto__ key in source would set
    // the prototype rather than create an own property.
    const tts = await mergeOverride(
      JSON.parse('{"__proto__":{"polluted":"top"},"openai":{"voice":"coral"}}'),
    );
    const openai = tts.openai as Record<string, unknown>;
    expect((Object.prototype as Record<string, unknown>).polluted).toBeUndefined();
    expect(tts.polluted).toBeUndefined();
    expect(openai.voice).toBe("coral");
  });
  it("blocks nested __proto__ keys", async () => {
    const tts = await mergeOverride(
      JSON.parse('{"openai":{"model":"safe","__proto__":{"polluted":"nested"}}}'),
    );
    const openai = tts.openai as Record<string, unknown>;
    expect((Object.prototype as Record<string, unknown>).polluted).toBeUndefined();
    expect(openai.polluted).toBeUndefined();
    expect(openai.model).toBe("safe");
  });
});

View File

@@ -0,0 +1,106 @@
import type { VoiceCallTtsConfig } from "./config.js";
import type { CoreConfig } from "./core-bridge.js";
import { convertPcmToMulaw8k } from "./telephony-audio.js";
/**
 * Minimal surface of the core runtime's telephony TTS entry point that this
 * plugin depends on. The hook converts text to PCM audio and reports the
 * sample rate so callers can resample for telephony.
 */
export type TelephonyTtsRuntime = {
  textToSpeechTelephony: (params: {
    text: string;
    cfg: CoreConfig;
    prefsPath?: string;
  }) => Promise<{
    success: boolean;
    audioBuffer?: Buffer;
    sampleRate?: number;
    provider?: string;
    error?: string;
  }>;
};
/** Synthesizes text into 8 kHz mu-law audio ready for a telephony stream. */
export type TelephonyTtsProvider = {
  synthesizeForTelephony: (text: string) => Promise<Buffer>;
};
// Keys that must never be copied from overrides (prototype-pollution guard).
const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
/**
 * Build a telephony TTS provider that synthesizes text via the core runtime
 * and converts the resulting PCM to 8 kHz mu-law for telephony media.
 * The plugin's TTS override is deep-merged over the core `messages.tts`.
 */
export function createTelephonyTtsProvider(params: {
  coreConfig: CoreConfig;
  ttsOverride?: VoiceCallTtsConfig;
  runtime: TelephonyTtsRuntime;
}): TelephonyTtsProvider {
  const effectiveConfig = applyTtsOverride(params.coreConfig, params.ttsOverride);
  const synthesizeForTelephony = async (text: string): Promise<Buffer> => {
    const outcome = await params.runtime.textToSpeechTelephony({
      text,
      cfg: effectiveConfig,
    });
    if (!outcome.success || !outcome.audioBuffer || !outcome.sampleRate) {
      throw new Error(outcome.error ?? "TTS conversion failed");
    }
    return convertPcmToMulaw8k(outcome.audioBuffer, outcome.sampleRate);
  };
  return { synthesizeForTelephony };
}
/**
 * Produce a CoreConfig whose `messages.tts` reflects the plugin override
 * deep-merged over the core settings. Returns the input unchanged when there
 * is nothing to merge.
 */
function applyTtsOverride(coreConfig: CoreConfig, override?: VoiceCallTtsConfig): CoreConfig {
  if (!override) {
    return coreConfig;
  }
  const merged = mergeTtsConfig(coreConfig.messages?.tts, override);
  if (!merged) {
    return coreConfig;
  }
  const messages = { ...coreConfig.messages, tts: merged };
  return { ...coreConfig, messages };
}
/** Combine core TTS config with the plugin override; either may be absent. */
function mergeTtsConfig(
  base?: VoiceCallTtsConfig,
  override?: VoiceCallTtsConfig,
): VoiceCallTtsConfig | undefined {
  if (base && override) {
    return deepMerge(base, override);
  }
  return override ?? base;
}
/**
 * Recursively merge `override` into `base`, returning a new object.
 *
 * - `undefined` override values are ignored (base value is kept).
 * - Prototype-polluting keys (__proto__/prototype/constructor) are dropped.
 * - Non-object values (including arrays) replace rather than merge.
 */
function deepMerge<T>(base: T, override: T): T {
  if (!isPlainObject(base) || !isPlainObject(override)) {
    return override;
  }
  const merged: Record<string, unknown> = { ...base };
  for (const [key, incoming] of Object.entries(override)) {
    if (incoming === undefined || BLOCKED_MERGE_KEYS.has(key)) {
      continue;
    }
    const current = (base as Record<string, unknown>)[key];
    merged[key] =
      isPlainObject(current) && isPlainObject(incoming) ? deepMerge(current, incoming) : incoming;
  }
  return merged as T;
}
/** True for non-null, non-array objects — the only shapes we deep-merge. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

View File

@@ -0,0 +1,314 @@
import { spawn } from "node:child_process";
import { getTailscaleDnsName } from "./webhook.js";
/**
 * Tunnel configuration for exposing the webhook server to the internet.
 */
export interface TunnelConfig {
  /** Tunnel provider: ngrok, tailscale-serve, tailscale-funnel, or none */
  provider: "ngrok" | "tailscale-serve" | "tailscale-funnel" | "none";
  /** Local port to tunnel */
  port: number;
  /** Path prefix for the tunnel (e.g., /voice/webhook) */
  path: string;
  /** ngrok auth token (optional, enables longer sessions) */
  ngrokAuthToken?: string;
  /** ngrok custom domain (paid feature) */
  ngrokDomain?: string;
}
/**
 * Result of starting a tunnel.
 */
export interface TunnelResult {
  /** The public URL, including the configured path prefix */
  publicUrl: string;
  /** Function to stop the tunnel (safe to call once) */
  stop: () => Promise<void>;
  /** Tunnel provider name (e.g. "ngrok", "tailscale-serve") */
  provider: string;
}
/**
* Start an ngrok tunnel to expose the local webhook server.
*
* Uses the ngrok CLI which must be installed: https://ngrok.com/download
*
* @example
* const tunnel = await startNgrokTunnel({ port: 3334, path: '/voice/webhook' });
* console.log('Public URL:', tunnel.publicUrl);
* // Later: await tunnel.stop();
*/
export async function startNgrokTunnel(config: {
  port: number;
  path: string;
  authToken?: string;
  domain?: string;
}): Promise<TunnelResult> {
  // Set auth token if provided (persists it into the ngrok config file).
  if (config.authToken) {
    await runNgrokCommand(["config", "add-authtoken", config.authToken]);
  }
  // Build ngrok command args; JSON logs on stdout let us parse the tunnel URL.
  const args = ["http", String(config.port), "--log", "stdout", "--log-format", "json"];
  // Add custom domain if provided (paid ngrok feature)
  if (config.domain) {
    args.push("--domain", config.domain);
  }
  return new Promise((resolve, reject) => {
    const proc = spawn("ngrok", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
    // Guard so resolve/reject fires exactly once across all event paths.
    let resolved = false;
    let publicUrl: string | null = null;
    let outputBuffer = "";
    const timeout = setTimeout(() => {
      if (!resolved) {
        resolved = true;
        proc.kill("SIGTERM");
        reject(new Error("ngrok startup timed out (30s)"));
      }
    }, 30000);
    // Parse one JSON log line; resolve once a public URL has been seen.
    const processLine = (line: string) => {
      try {
        const log = JSON.parse(line);
        // ngrok logs the public URL in a 'started tunnel' message
        if (log.msg === "started tunnel" && log.url) {
          publicUrl = log.url;
        }
        // Also check for the URL field directly
        if (log.addr && log.url && !publicUrl) {
          publicUrl = log.url;
        }
        // Check for ready state
        if (publicUrl && !resolved) {
          resolved = true;
          clearTimeout(timeout);
          // Add path to the public URL
          const fullUrl = publicUrl + config.path;
          console.log(`[voice-call] ngrok tunnel active: ${fullUrl}`);
          resolve({
            publicUrl: fullUrl,
            provider: "ngrok",
            stop: async () => {
              proc.kill("SIGTERM");
              await new Promise<void>((res) => {
                proc.on("close", () => res());
                setTimeout(res, 2000); // Fallback timeout
              });
            },
          });
        }
      } catch {
        // Not JSON, might be startup message
      }
    };
    // Re-chunk stdout into complete lines; keep the trailing partial line.
    proc.stdout.on("data", (data: Buffer) => {
      outputBuffer += data.toString();
      const lines = outputBuffer.split("\n");
      outputBuffer = lines.pop() || "";
      for (const line of lines) {
        if (line.trim()) {
          processLine(line);
        }
      }
    });
    proc.stderr.on("data", (data: Buffer) => {
      const msg = data.toString();
      // Check for common errors (ngrok error codes start with ERR_NGROK).
      if (msg.includes("ERR_NGROK")) {
        if (!resolved) {
          resolved = true;
          clearTimeout(timeout);
          reject(new Error(`ngrok error: ${msg}`));
        }
      }
    });
    proc.on("error", (err) => {
      if (!resolved) {
        resolved = true;
        clearTimeout(timeout);
        reject(new Error(`Failed to start ngrok: ${err.message}`));
      }
    });
    proc.on("close", (code) => {
      if (!resolved) {
        resolved = true;
        clearTimeout(timeout);
        reject(new Error(`ngrok exited unexpectedly with code ${code}`));
      }
    });
  });
}
/**
 * Run a one-shot ngrok CLI command, resolving with stdout on success and
 * rejecting (with stderr, falling back to stdout) on a nonzero exit code.
 */
async function runNgrokCommand(args: string[]): Promise<string> {
  return new Promise((resolve, reject) => {
    const child = spawn("ngrok", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
    const outChunks: string[] = [];
    const errChunks: string[] = [];
    child.stdout.on("data", (data) => {
      outChunks.push(data.toString());
    });
    child.stderr.on("data", (data) => {
      errChunks.push(data.toString());
    });
    child.on("close", (code) => {
      const stdout = outChunks.join("");
      const stderr = errChunks.join("");
      if (code === 0) {
        resolve(stdout);
      } else {
        reject(new Error(`ngrok command failed: ${stderr || stdout}`));
      }
    });
    child.on("error", reject);
  });
}
/**
 * Check if ngrok is installed and available by invoking `ngrok version`.
 * Resolves false when the binary is missing or exits nonzero; never rejects.
 */
export async function isNgrokAvailable(): Promise<boolean> {
  return new Promise((resolve) => {
    const child = spawn("ngrok", ["version"], {
      stdio: ["ignore", "pipe", "pipe"],
    });
    child.on("error", () => {
      resolve(false);
    });
    child.on("close", (exitCode) => {
      resolve(exitCode === 0);
    });
  });
}
/**
 * Start a Tailscale serve/funnel tunnel.
 *
 * Runs `tailscale <mode> --bg --yes --set-path <path> <localUrl>` and derives
 * the public URL from the machine's Tailscale DNS name. `serve` exposes to
 * the tailnet only; `funnel` exposes to the public internet.
 */
export async function startTailscaleTunnel(config: {
  mode: "serve" | "funnel";
  port: number;
  path: string;
}): Promise<TunnelResult> {
  // Get Tailscale DNS name
  const dnsName = await getTailscaleDnsName();
  if (!dnsName) {
    throw new Error("Could not get Tailscale DNS name. Is Tailscale running?");
  }
  // Normalize to a leading-slash path for both --set-path and the public URL.
  const path = config.path.startsWith("/") ? config.path : `/${config.path}`;
  const localUrl = `http://127.0.0.1:${config.port}${path}`;
  return new Promise((resolve, reject) => {
    const proc = spawn("tailscale", [config.mode, "--bg", "--yes", "--set-path", path, localUrl], {
      stdio: ["ignore", "pipe", "pipe"],
    });
    // `--bg` makes the CLI exit once the mapping is registered; 10s guard.
    const timeout = setTimeout(() => {
      proc.kill("SIGKILL");
      reject(new Error(`Tailscale ${config.mode} timed out`));
    }, 10000);
    proc.on("close", (code) => {
      clearTimeout(timeout);
      if (code === 0) {
        const publicUrl = `https://${dnsName}${path}`;
        console.log(`[voice-call] Tailscale ${config.mode} active: ${publicUrl}`);
        resolve({
          publicUrl,
          provider: `tailscale-${config.mode}`,
          stop: async () => {
            await stopTailscaleTunnel(config.mode, path);
          },
        });
      } else {
        reject(new Error(`Tailscale ${config.mode} failed with code ${code}`));
      }
    });
    proc.on("error", (err) => {
      clearTimeout(timeout);
      reject(err);
    });
  });
}
/**
 * Stop a Tailscale serve/funnel mapping for the given path.
 *
 * Best-effort teardown: resolves even when the command fails, exceeds the
 * 5-second guard, or the `tailscale` binary is missing entirely.
 */
async function stopTailscaleTunnel(mode: "serve" | "funnel", path: string): Promise<void> {
  return new Promise((resolve) => {
    const proc = spawn("tailscale", [mode, "off", path], {
      stdio: "ignore",
    });
    const timeout = setTimeout(() => {
      proc.kill("SIGKILL");
      resolve();
    }, 5000);
    // Without an 'error' listener, a missing tailscale binary would emit an
    // unhandled 'error' event and crash the process during shutdown.
    proc.on("error", () => {
      clearTimeout(timeout);
      resolve();
    });
    proc.on("close", () => {
      clearTimeout(timeout);
      resolve();
    });
  });
}
/**
 * Start a tunnel for the configured provider; returns null for "none"
 * (or any unrecognized provider value).
 */
export async function startTunnel(config: TunnelConfig): Promise<TunnelResult | null> {
  if (config.provider === "ngrok") {
    return startNgrokTunnel({
      port: config.port,
      path: config.path,
      authToken: config.ngrokAuthToken,
      domain: config.ngrokDomain,
    });
  }
  if (config.provider === "tailscale-serve" || config.provider === "tailscale-funnel") {
    const mode = config.provider === "tailscale-serve" ? "serve" : "funnel";
    return startTailscaleTunnel({
      mode,
      port: config.port,
      path: config.path,
    });
  }
  return null;
}

View File

@@ -0,0 +1,287 @@
import { z } from "zod";
import type { CallMode } from "./config.js";
// -----------------------------------------------------------------------------
// Provider Identifiers
// -----------------------------------------------------------------------------
/** Telephony backends supported by the plugin ("mock" is for local dev). */
export const ProviderNameSchema = z.enum(["telnyx", "twilio", "plivo", "mock"]);
export type ProviderName = z.infer<typeof ProviderNameSchema>;
// -----------------------------------------------------------------------------
// Core Call Identifiers
// -----------------------------------------------------------------------------
/** Internal call identifier (UUID), assigned by this plugin */
export type CallId = string;
/** Provider-specific call identifier (e.g. Twilio CallSid) */
export type ProviderCallId = string;
// -----------------------------------------------------------------------------
// Call Lifecycle States
// -----------------------------------------------------------------------------
/** Full call lifecycle; the terminal subset is mirrored in TerminalStates. */
export const CallStateSchema = z.enum([
  // Non-terminal states
  "initiated",
  "ringing",
  "answered",
  "active",
  "speaking",
  "listening",
  // Terminal states
  "completed",
  "hangup-user",
  "hangup-bot",
  "timeout",
  "error",
  "failed",
  "no-answer",
  "busy",
  "voicemail",
]);
export type CallState = z.infer<typeof CallStateSchema>;
/** States from which a call can never transition again. */
export const TerminalStates = new Set<CallState>([
  "completed",
  "hangup-user",
  "hangup-bot",
  "timeout",
  "error",
  "failed",
  "no-answer",
  "busy",
  "voicemail",
]);
/** Reason a call ended; matches the terminal subset of CallStateSchema. */
export const EndReasonSchema = z.enum([
  "completed",
  "hangup-user",
  "hangup-bot",
  "timeout",
  "error",
  "failed",
  "no-answer",
  "busy",
  "voicemail",
]);
export type EndReason = z.infer<typeof EndReasonSchema>;
// -----------------------------------------------------------------------------
// Normalized Call Events
// -----------------------------------------------------------------------------
/** Fields shared by every normalized provider event. */
const BaseEventSchema = z.object({
  id: z.string(),
  // Stable provider-derived key for idempotency/replay dedupe.
  dedupeKey: z.string().optional(),
  callId: z.string(),
  providerCallId: z.string().optional(),
  timestamp: z.number(),
  // Optional per-turn nonce for speech events (Twilio <Gather> replay hardening).
  turnToken: z.string().optional(),
  // Optional fields for inbound call detection
  direction: z.enum(["inbound", "outbound"]).optional(),
  from: z.string().optional(),
  to: z.string().optional(),
});
/**
 * Provider-agnostic call events, discriminated on `type`. Providers map
 * their native webhook payloads into this shape.
 */
export const NormalizedEventSchema = z.discriminatedUnion("type", [
  BaseEventSchema.extend({
    type: z.literal("call.initiated"),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.ringing"),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.answered"),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.active"),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.speaking"),
    text: z.string(),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.speech"),
    transcript: z.string(),
    isFinal: z.boolean(),
    confidence: z.number().min(0).max(1).optional(),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.silence"),
    durationMs: z.number(),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.dtmf"),
    digits: z.string(),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.ended"),
    reason: EndReasonSchema,
  }),
  BaseEventSchema.extend({
    type: z.literal("call.error"),
    error: z.string(),
    retryable: z.boolean().optional(),
  }),
]);
export type NormalizedEvent = z.infer<typeof NormalizedEventSchema>;
// -----------------------------------------------------------------------------
// Call Direction
// -----------------------------------------------------------------------------
export const CallDirectionSchema = z.enum(["outbound", "inbound"]);
export type CallDirection = z.infer<typeof CallDirectionSchema>;
// -----------------------------------------------------------------------------
// Call Record
// -----------------------------------------------------------------------------
/** One utterance in a call transcript, attributed to bot or user. */
export const TranscriptEntrySchema = z.object({
  timestamp: z.number(),
  speaker: z.enum(["bot", "user"]),
  text: z.string(),
  isFinal: z.boolean().default(true),
});
export type TranscriptEntry = z.infer<typeof TranscriptEntrySchema>;
/**
 * Persistent record of a single call: identity, lifecycle timestamps,
 * transcript, and processed event ids (for webhook replay dedupe).
 */
export const CallRecordSchema = z.object({
  callId: z.string(),
  providerCallId: z.string().optional(),
  provider: ProviderNameSchema,
  direction: CallDirectionSchema,
  state: CallStateSchema,
  from: z.string(),
  to: z.string(),
  sessionKey: z.string().optional(),
  startedAt: z.number(),
  answeredAt: z.number().optional(),
  endedAt: z.number().optional(),
  endReason: EndReasonSchema.optional(),
  transcript: z.array(TranscriptEntrySchema).default([]),
  processedEventIds: z.array(z.string()).default([]),
  metadata: z.record(z.string(), z.unknown()).optional(),
});
export type CallRecord = z.infer<typeof CallRecordSchema>;
// -----------------------------------------------------------------------------
// Webhook Types
// -----------------------------------------------------------------------------
/** Outcome of provider webhook signature verification. */
export type WebhookVerificationResult = {
  ok: boolean;
  reason?: string;
  /** Signature is valid, but request was seen before within replay window. */
  isReplay?: boolean;
  /** Stable key derived from authenticated request material. */
  verifiedRequestKey?: string;
};
export type WebhookParseOptions = {
  /** Stable request key from verifyWebhook. */
  verifiedRequestKey?: string;
};
/** Raw inbound webhook request material handed to provider adapters. */
export type WebhookContext = {
  headers: Record<string, string | string[] | undefined>;
  rawBody: string;
  url: string;
  method: "GET" | "POST" | "PUT" | "DELETE" | "PATCH";
  query?: Record<string, string | string[] | undefined>;
  remoteAddress?: string;
};
/** Normalized events plus the provider-specific HTTP response to return. */
export type ProviderWebhookParseResult = {
  events: NormalizedEvent[];
  providerResponseBody?: string;
  providerResponseHeaders?: Record<string, string>;
  statusCode?: number;
};
// -----------------------------------------------------------------------------
// Provider Method Types
// -----------------------------------------------------------------------------
/** Parameters for placing an outbound call through a provider. */
export type InitiateCallInput = {
  callId: CallId;
  from: string;
  to: string;
  webhookUrl: string;
  clientState?: Record<string, string>;
  /** Inline TwiML to execute (skips webhook, used for notify mode) */
  inlineTwiml?: string;
};
export type InitiateCallResult = {
  providerCallId: ProviderCallId;
  status: "initiated" | "queued";
};
/** Parameters for terminating an in-progress call. */
export type HangupCallInput = {
  callId: CallId;
  providerCallId: ProviderCallId;
  reason: EndReason;
};
/** Parameters for speaking text to the remote party. */
export type PlayTtsInput = {
  callId: CallId;
  providerCallId: ProviderCallId;
  text: string;
  voice?: string;
  locale?: string;
};
/** Parameters for starting speech capture on a call. */
export type StartListeningInput = {
  callId: CallId;
  providerCallId: ProviderCallId;
  language?: string;
  /** Optional per-turn nonce for provider callbacks (replay hardening). */
  turnToken?: string;
};
export type StopListeningInput = {
  callId: CallId;
  providerCallId: ProviderCallId;
};
// -----------------------------------------------------------------------------
// Outbound Call Options
// -----------------------------------------------------------------------------
export type OutboundCallOptions = {
  /** Message to speak when call connects */
  message?: string;
  /** Call mode (overrides config default) */
  mode?: CallMode;
};
// -----------------------------------------------------------------------------
// Tool Result Types
// -----------------------------------------------------------------------------
/** Result of the voice_call tool's "initiate" action. */
export type InitiateCallToolResult = {
  success: boolean;
  callId?: string;
  status?: "initiated" | "queued" | "no-answer" | "busy" | "failed";
  error?: string;
};
/** Result of the "continue" action; transcript is the user's reply. */
export type ContinueCallToolResult = {
  success: boolean;
  transcript?: string;
  error?: string;
};
export type SpeakToUserToolResult = {
  success: boolean;
  error?: string;
};
export type EndCallToolResult = {
  success: boolean;
  error?: string;
};

View File

@@ -0,0 +1,14 @@
import os from "node:os";
import path from "node:path";
/**
 * Expand a leading `~` to the user's home directory and resolve the result
 * to an absolute path. Whitespace-only input is returned trimmed (empty).
 *
 * Only a bare `~` or `~/...` (and `~\...`) is expanded; `~user` forms are
 * resolved as ordinary relative paths.
 */
export function resolveUserPath(input: string): string {
  const trimmed = input.trim();
  if (!trimmed) {
    return trimmed;
  }
  if (trimmed.startsWith("~")) {
    // Use a replacer function so `$`-sequences in the home directory path
    // (e.g. "$&") are not interpreted as replacement patterns.
    const expanded = trimmed.replace(/^~(?=$|[\\/])/, () => os.homedir());
    return path.resolve(expanded);
  }
  return path.resolve(trimmed);
}

View File

@@ -0,0 +1,67 @@
/**
* Voice mapping and XML utilities for voice call providers.
*/
/**
 * Escape XML special characters for TwiML and other XML responses.
 * Handles the five mandatory entities: & < > " '
 */
export function escapeXml(text: string): string {
  const entities: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };
  return text.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
}
/**
 * OpenAI voice names mapped to similar Twilio Polly voices.
 *
 * A Map is used (rather than a plain object) so lookups can never hit
 * inherited Object.prototype members: with the previous object + `in`
 * lookup, isOpenAiVoice("constructor") was true and mapVoiceToPolly could
 * return a non-string prototype member.
 */
const OPENAI_TO_POLLY = new Map<string, string>([
  ["alloy", "Polly.Joanna"], // neutral, warm
  ["echo", "Polly.Matthew"], // male, warm
  ["fable", "Polly.Amy"], // British, expressive
  ["onyx", "Polly.Brian"], // deep male
  ["nova", "Polly.Salli"], // female, friendly
  ["shimmer", "Polly.Kimberly"], // female, clear
]);
/**
 * Default Polly voice when no mapping is found.
 */
export const DEFAULT_POLLY_VOICE = "Polly.Joanna";
/**
 * Map OpenAI voice names to Twilio Polly equivalents.
 * Falls through if already a valid Polly/Google voice.
 *
 * @param voice - OpenAI voice name (alloy, echo, etc.) or Polly voice name
 * @returns Polly voice name suitable for Twilio TwiML
 */
export function mapVoiceToPolly(voice: string | undefined): string {
  if (!voice) {
    return DEFAULT_POLLY_VOICE;
  }
  // Already a Polly/Google voice - pass through
  if (voice.startsWith("Polly.") || voice.startsWith("Google.")) {
    return voice;
  }
  // Map OpenAI voices to Polly equivalents; unknown names fall back.
  return OPENAI_TO_POLLY.get(voice.toLowerCase()) ?? DEFAULT_POLLY_VOICE;
}
/**
 * Check if a voice name is a known OpenAI voice (case-insensitive).
 */
export function isOpenAiVoice(voice: string): boolean {
  return OPENAI_TO_POLLY.has(voice.toLowerCase());
}
/**
 * Get all supported OpenAI voice names.
 */
export function getOpenAiVoiceNames(): string[] {
  return [...OPENAI_TO_POLLY.keys()];
}

View File

@@ -0,0 +1,622 @@
import crypto from "node:crypto";
import { describe, expect, it } from "vitest";
import {
verifyPlivoWebhook,
verifyTelnyxWebhook,
verifyTwilioWebhook,
} from "./webhook-security.js";
// Normalize base64 (padding/alternate encodings) by decoding and re-encoding,
// so signatures are always compared in one canonical form.
function canonicalizeBase64(input: string): string {
  const decoded = Buffer.from(input, "base64");
  return decoded.toString("base64");
}
// Mirror Plivo's V2 scheme: HMAC-SHA256 over (URL without query + nonce),
// base64-encoded and canonicalized.
function plivoV2Signature(params: {
  authToken: string;
  urlNoQuery: string;
  nonce: string;
}): string {
  const hmac = crypto.createHmac("sha256", params.authToken);
  hmac.update(`${params.urlNoQuery}${params.nonce}`);
  return canonicalizeBase64(hmac.digest("base64"));
}
/**
 * Recreate Plivo's V3 signature for tests.
 *
 * Canonical form: base URL (no query), then "?" + query params sorted by key
 * (then value) joined with "&" when query or POST params exist, a "."
 * separator when BOTH exist, then POST params sorted by key (then value)
 * appended as bare `key`+`value` pairs. HMAC-SHA256 is taken over
 * `${baseUrl}.${nonce}` and base64-canonicalized.
 */
function plivoV3Signature(params: {
  authToken: string;
  urlWithQuery: string;
  postBody: string;
  nonce: string;
}): string {
  const u = new URL(params.urlWithQuery);
  const baseNoQuery = `${u.protocol}//${u.host}${u.pathname}`;
  // Collect query params, grouping repeated keys so values sort per key.
  const queryPairs: Array<[string, string]> = [];
  for (const [k, v] of u.searchParams.entries()) {
    queryPairs.push([k, v]);
  }
  const queryMap = new Map<string, string[]>();
  for (const [k, v] of queryPairs) {
    queryMap.set(k, (queryMap.get(k) ?? []).concat(v));
  }
  const sortedQuery = Array.from(queryMap.keys())
    .toSorted()
    .flatMap((k) => [...(queryMap.get(k) ?? [])].toSorted().map((v) => `${k}=${v}`))
    .join("&");
  // POST params use the same grouping but are joined as key+value, no "=".
  const postParams = new URLSearchParams(params.postBody);
  const postMap = new Map<string, string[]>();
  for (const [k, v] of postParams.entries()) {
    postMap.set(k, (postMap.get(k) ?? []).concat(v));
  }
  const sortedPost = Array.from(postMap.keys())
    .toSorted()
    .flatMap((k) => [...(postMap.get(k) ?? [])].toSorted().map((v) => `${k}${v}`))
    .join("");
  const hasPost = sortedPost.length > 0;
  let baseUrl = baseNoQuery;
  if (sortedQuery.length > 0 || hasPost) {
    baseUrl = `${baseNoQuery}?${sortedQuery}`;
  }
  // "." only separates query from POST when both are present.
  if (sortedQuery.length > 0 && hasPost) {
    baseUrl = `${baseUrl}.`;
  }
  baseUrl = `${baseUrl}${sortedPost}`;
  const digest = crypto
    .createHmac("sha256", params.authToken)
    .update(`${baseUrl}.${params.nonce}`)
    .digest("base64");
  return canonicalizeBase64(digest);
}
// Recreate Twilio's X-Twilio-Signature: HMAC-SHA1 over the full URL followed
// by the POST params sorted by key, each appended as key+value.
function twilioSignature(params: { authToken: string; url: string; postBody: string }): string {
  const pairs = Array.from(new URLSearchParams(params.postBody).entries()).sort((a, b) =>
    a[0].localeCompare(b[0]),
  );
  const signedData = pairs.reduce((acc, [key, value]) => acc + key + value, params.url);
  return crypto.createHmac("sha1", params.authToken).update(signedData).digest("base64");
}
describe("verifyPlivoWebhook", () => {
  it("accepts valid V2 signature", () => {
    const authToken = "test-auth-token";
    const nonce = "nonce-123";
    // Context URL differs from the verification URL: the verifier must
    // reconstruct the external URL from host/x-forwarded-proto headers.
    const ctxUrl = "http://local/voice/webhook?flow=answer&callId=abc";
    const verificationUrl = "https://example.com/voice/webhook";
    const signature = plivoV2Signature({
      authToken,
      urlNoQuery: verificationUrl,
      nonce,
    });
    const result = verifyPlivoWebhook(
      {
        headers: {
          host: "example.com",
          "x-forwarded-proto": "https",
          "x-plivo-signature-v2": signature,
          "x-plivo-signature-v2-nonce": nonce,
        },
        rawBody: "CallUUID=uuid&CallStatus=in-progress",
        url: ctxUrl,
        method: "POST",
        query: { flow: "answer", callId: "abc" },
      },
      authToken,
    );
    expect(result.ok).toBe(true);
    expect(result.version).toBe("v2");
  });
  it("accepts valid V3 signature (including multi-signature header)", () => {
    const authToken = "test-auth-token";
    const nonce = "nonce-456";
    const urlWithQuery = "https://example.com/voice/webhook?flow=answer&callId=abc";
    const postBody = "CallUUID=uuid&CallStatus=in-progress&From=%2B15550000000";
    const good = plivoV3Signature({
      authToken,
      urlWithQuery,
      postBody,
      nonce,
    });
    // Plivo may send multiple comma-separated signatures; any valid one passes.
    const result = verifyPlivoWebhook(
      {
        headers: {
          host: "example.com",
          "x-forwarded-proto": "https",
          "x-plivo-signature-v3": `bad, ${good}`,
          "x-plivo-signature-v3-nonce": nonce,
        },
        rawBody: postBody,
        url: urlWithQuery,
        method: "POST",
        query: { flow: "answer", callId: "abc" },
      },
      authToken,
    );
    expect(result.ok).toBe(true);
    expect(result.version).toBe("v3");
  });
  it("rejects missing signatures", () => {
    const result = verifyPlivoWebhook(
      {
        headers: { host: "example.com", "x-forwarded-proto": "https" },
        rawBody: "",
        url: "https://example.com/voice/webhook",
        method: "POST",
      },
      "token",
    );
    expect(result.ok).toBe(false);
    expect(result.reason).toMatch(/Missing Plivo signature headers/);
  });
  it("marks replayed valid V3 requests as replay without failing auth", () => {
    const authToken = "test-auth-token";
    const nonce = "nonce-replay-v3";
    const urlWithQuery = "https://example.com/voice/webhook?flow=answer&callId=abc";
    const postBody = "CallUUID=uuid&CallStatus=in-progress&From=%2B15550000000";
    const signature = plivoV3Signature({
      authToken,
      urlWithQuery,
      postBody,
      nonce,
    });
    const ctx = {
      headers: {
        host: "example.com",
        "x-forwarded-proto": "https",
        "x-plivo-signature-v3": signature,
        "x-plivo-signature-v3-nonce": nonce,
      },
      rawBody: postBody,
      url: urlWithQuery,
      method: "POST" as const,
      query: { flow: "answer", callId: "abc" },
    };
    // Second identical delivery must stay authenticated but flag isReplay,
    // with the same stable request key as the first delivery.
    const first = verifyPlivoWebhook(ctx, authToken);
    const second = verifyPlivoWebhook(ctx, authToken);
    expect(first.ok).toBe(true);
    expect(first.isReplay).toBeFalsy();
    expect(first.verifiedRequestKey).toBeTruthy();
    expect(second.ok).toBe(true);
    expect(second.isReplay).toBe(true);
    expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
  });
  it("returns a stable request key when verification is skipped", () => {
    const ctx = {
      headers: {},
      rawBody: "CallUUID=uuid&CallStatus=in-progress",
      url: "https://example.com/voice/webhook",
      method: "POST" as const,
    };
    // Even in dev skip mode, replay detection keys must be deterministic.
    const first = verifyPlivoWebhook(ctx, "token", { skipVerification: true });
    const second = verifyPlivoWebhook(ctx, "token", { skipVerification: true });
    expect(first.ok).toBe(true);
    expect(first.verifiedRequestKey).toMatch(/^plivo:skip:/);
    expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
    expect(second.isReplay).toBe(true);
  });
});
describe("verifyTelnyxWebhook", () => {
it("marks replayed valid requests as replay without failing auth", () => {
const { publicKey, privateKey } = crypto.generateKeyPairSync("ed25519");
const pemPublicKey = publicKey.export({ format: "pem", type: "spki" }).toString();
const timestamp = String(Math.floor(Date.now() / 1000));
const rawBody = JSON.stringify({
data: { event_type: "call.initiated", payload: { call_control_id: "call-1" } },
nonce: crypto.randomUUID(),
});
const signedPayload = `${timestamp}|${rawBody}`;
const signature = crypto.sign(null, Buffer.from(signedPayload), privateKey).toString("base64");
const ctx = {
headers: {
"telnyx-signature-ed25519": signature,
"telnyx-timestamp": timestamp,
},
rawBody,
url: "https://example.com/voice/webhook",
method: "POST" as const,
};
const first = verifyTelnyxWebhook(ctx, pemPublicKey);
const second = verifyTelnyxWebhook(ctx, pemPublicKey);
expect(first.ok).toBe(true);
expect(first.isReplay).toBeFalsy();
expect(first.verifiedRequestKey).toBeTruthy();
expect(second.ok).toBe(true);
expect(second.isReplay).toBe(true);
expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
});
it("returns a stable request key when verification is skipped", () => {
const ctx = {
headers: {},
rawBody: JSON.stringify({ data: { event_type: "call.initiated" } }),
url: "https://example.com/voice/webhook",
method: "POST" as const,
};
const first = verifyTelnyxWebhook(ctx, undefined, { skipVerification: true });
const second = verifyTelnyxWebhook(ctx, undefined, { skipVerification: true });
expect(first.ok).toBe(true);
expect(first.verifiedRequestKey).toMatch(/^telnyx:skip:/);
expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
expect(second.isReplay).toBe(true);
});
});
describe("verifyTwilioWebhook", () => {
it("uses request query when publicUrl omits it", () => {
const authToken = "test-auth-token";
const publicUrl = "https://example.com/voice/webhook";
const urlWithQuery = `${publicUrl}?callId=abc`;
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
const signature = twilioSignature({
authToken,
url: urlWithQuery,
postBody,
});
const result = verifyTwilioWebhook(
{
headers: {
host: "example.com",
"x-forwarded-proto": "https",
"x-twilio-signature": signature,
},
rawBody: postBody,
url: "http://local/voice/webhook?callId=abc",
method: "POST",
query: { callId: "abc" },
},
authToken,
{ publicUrl },
);
expect(result.ok).toBe(true);
});
it("marks replayed valid requests as replay without failing auth", () => {
const authToken = "test-auth-token";
const publicUrl = "https://example.com/voice/webhook";
const urlWithQuery = `${publicUrl}?callId=abc`;
const postBody = "CallSid=CS777&CallStatus=completed&From=%2B15550000000";
const signature = twilioSignature({ authToken, url: urlWithQuery, postBody });
const headers = {
host: "example.com",
"x-forwarded-proto": "https",
"x-twilio-signature": signature,
"i-twilio-idempotency-token": "idem-replay-1",
};
const first = verifyTwilioWebhook(
{
headers,
rawBody: postBody,
url: "http://local/voice/webhook?callId=abc",
method: "POST",
query: { callId: "abc" },
},
authToken,
{ publicUrl },
);
const second = verifyTwilioWebhook(
{
headers,
rawBody: postBody,
url: "http://local/voice/webhook?callId=abc",
method: "POST",
query: { callId: "abc" },
},
authToken,
{ publicUrl },
);
expect(first.ok).toBe(true);
expect(first.isReplay).toBeFalsy();
expect(first.verifiedRequestKey).toBeTruthy();
expect(second.ok).toBe(true);
expect(second.isReplay).toBe(true);
expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
});
it("treats changed idempotency header as replay for identical signed requests", () => {
const authToken = "test-auth-token";
const publicUrl = "https://example.com/voice/webhook";
const urlWithQuery = `${publicUrl}?callId=abc`;
const postBody = "CallSid=CS778&CallStatus=completed&From=%2B15550000000";
const signature = twilioSignature({ authToken, url: urlWithQuery, postBody });
const first = verifyTwilioWebhook(
{
headers: {
host: "example.com",
"x-forwarded-proto": "https",
"x-twilio-signature": signature,
"i-twilio-idempotency-token": "idem-replay-a",
},
rawBody: postBody,
url: "http://local/voice/webhook?callId=abc",
method: "POST",
query: { callId: "abc" },
},
authToken,
{ publicUrl },
);
const second = verifyTwilioWebhook(
{
headers: {
host: "example.com",
"x-forwarded-proto": "https",
"x-twilio-signature": signature,
"i-twilio-idempotency-token": "idem-replay-b",
},
rawBody: postBody,
url: "http://local/voice/webhook?callId=abc",
method: "POST",
query: { callId: "abc" },
},
authToken,
{ publicUrl },
);
expect(first.ok).toBe(true);
expect(first.isReplay).toBe(false);
expect(first.verifiedRequestKey).toBeTruthy();
expect(second.ok).toBe(true);
expect(second.isReplay).toBe(true);
expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
});
it("rejects invalid signatures even when attacker injects forwarded host", () => {
const authToken = "test-auth-token";
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
const result = verifyTwilioWebhook(
{
headers: {
host: "127.0.0.1:3334",
"x-forwarded-proto": "https",
"x-forwarded-host": "attacker.ngrok-free.app",
"x-twilio-signature": "invalid",
},
rawBody: postBody,
url: "http://127.0.0.1:3334/voice/webhook",
method: "POST",
},
authToken,
);
expect(result.ok).toBe(false);
// X-Forwarded-Host is ignored by default, so URL uses Host header
expect(result.isNgrokFreeTier).toBe(false);
expect(result.reason).toMatch(/Invalid signature/);
});
it("accepts valid signatures for ngrok free tier on loopback when compatibility mode is enabled", () => {
const authToken = "test-auth-token";
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
const webhookUrl = "https://local.ngrok-free.app/voice/webhook";
const signature = twilioSignature({
authToken,
url: webhookUrl,
postBody,
});
const result = verifyTwilioWebhook(
{
headers: {
host: "127.0.0.1:3334",
"x-forwarded-proto": "https",
"x-forwarded-host": "local.ngrok-free.app",
"x-twilio-signature": signature,
},
rawBody: postBody,
url: "http://127.0.0.1:3334/voice/webhook",
method: "POST",
remoteAddress: "127.0.0.1",
},
authToken,
{ allowNgrokFreeTierLoopbackBypass: true },
);
expect(result.ok).toBe(true);
expect(result.verificationUrl).toBe(webhookUrl);
});
it("does not allow invalid signatures for ngrok free tier on loopback", () => {
const authToken = "test-auth-token";
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
const result = verifyTwilioWebhook(
{
headers: {
host: "127.0.0.1:3334",
"x-forwarded-proto": "https",
"x-forwarded-host": "local.ngrok-free.app",
"x-twilio-signature": "invalid",
},
rawBody: postBody,
url: "http://127.0.0.1:3334/voice/webhook",
method: "POST",
remoteAddress: "127.0.0.1",
},
authToken,
{ allowNgrokFreeTierLoopbackBypass: true },
);
expect(result.ok).toBe(false);
expect(result.reason).toMatch(/Invalid signature/);
expect(result.isNgrokFreeTier).toBe(true);
});
it("ignores attacker X-Forwarded-Host without allowedHosts or trustForwardingHeaders", () => {
const authToken = "test-auth-token";
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
// Attacker tries to inject their host - should be ignored
const result = verifyTwilioWebhook(
{
headers: {
host: "legitimate.example.com",
"x-forwarded-host": "attacker.evil.com",
"x-twilio-signature": "invalid",
},
rawBody: postBody,
url: "http://localhost:3000/voice/webhook",
method: "POST",
},
authToken,
);
expect(result.ok).toBe(false);
// Attacker's host is ignored - uses Host header instead
expect(result.verificationUrl).toBe("https://legitimate.example.com/voice/webhook");
});
it("uses X-Forwarded-Host when allowedHosts whitelist is provided", () => {
const authToken = "test-auth-token";
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
const webhookUrl = "https://myapp.ngrok.io/voice/webhook";
const signature = twilioSignature({ authToken, url: webhookUrl, postBody });
const result = verifyTwilioWebhook(
{
headers: {
host: "localhost:3000",
"x-forwarded-proto": "https",
"x-forwarded-host": "myapp.ngrok.io",
"x-twilio-signature": signature,
},
rawBody: postBody,
url: "http://localhost:3000/voice/webhook",
method: "POST",
},
authToken,
{ allowedHosts: ["myapp.ngrok.io"] },
);
expect(result.ok).toBe(true);
expect(result.verificationUrl).toBe(webhookUrl);
});
it("rejects X-Forwarded-Host not in allowedHosts whitelist", () => {
const authToken = "test-auth-token";
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
const result = verifyTwilioWebhook(
{
headers: {
host: "localhost:3000",
"x-forwarded-host": "attacker.evil.com",
"x-twilio-signature": "invalid",
},
rawBody: postBody,
url: "http://localhost:3000/voice/webhook",
method: "POST",
},
authToken,
{ allowedHosts: ["myapp.ngrok.io", "webhook.example.com"] },
);
expect(result.ok).toBe(false);
// Attacker's host not in whitelist, falls back to Host header
expect(result.verificationUrl).toBe("https://localhost/voice/webhook");
});
it("trusts forwarding headers only from trusted proxy IPs", () => {
const authToken = "test-auth-token";
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
const webhookUrl = "https://proxy.example.com/voice/webhook";
const signature = twilioSignature({ authToken, url: webhookUrl, postBody });
const result = verifyTwilioWebhook(
{
headers: {
host: "localhost:3000",
"x-forwarded-proto": "https",
"x-forwarded-host": "proxy.example.com",
"x-twilio-signature": signature,
},
rawBody: postBody,
url: "http://localhost:3000/voice/webhook",
method: "POST",
remoteAddress: "203.0.113.10",
},
authToken,
{ trustForwardingHeaders: true, trustedProxyIPs: ["203.0.113.10"] },
);
expect(result.ok).toBe(true);
expect(result.verificationUrl).toBe(webhookUrl);
});
it("ignores forwarding headers when trustedProxyIPs are set but remote IP is missing", () => {
const authToken = "test-auth-token";
const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
const result = verifyTwilioWebhook(
{
headers: {
host: "legitimate.example.com",
"x-forwarded-proto": "https",
"x-forwarded-host": "proxy.example.com",
"x-twilio-signature": "invalid",
},
rawBody: postBody,
url: "http://localhost:3000/voice/webhook",
method: "POST",
},
authToken,
{ trustForwardingHeaders: true, trustedProxyIPs: ["203.0.113.10"] },
);
expect(result.ok).toBe(false);
expect(result.verificationUrl).toBe("https://legitimate.example.com/voice/webhook");
});
it("returns a stable request key when verification is skipped", () => {
const ctx = {
headers: {},
rawBody: "CallSid=CS123&CallStatus=completed",
url: "https://example.com/voice/webhook",
method: "POST" as const,
};
const first = verifyTwilioWebhook(ctx, "token", { skipVerification: true });
const second = verifyTwilioWebhook(ctx, "token", { skipVerification: true });
expect(first.ok).toBe(true);
expect(first.verifiedRequestKey).toMatch(/^twilio:skip:/);
expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
expect(second.isReplay).toBe(true);
});
});

View File

@@ -0,0 +1,906 @@
import crypto from "node:crypto";
import { getHeader } from "./http-headers.js";
import type { WebhookContext } from "./types.js";
// Window during which a previously verified request is reported as a replay.
const REPLAY_WINDOW_MS = 10 * 60 * 1000;
// Hard cap on cached request keys; oldest entries are evicted beyond this.
const REPLAY_CACHE_MAX_ENTRIES = 10_000;
// Expired entries are swept every N markReplay() calls (amortized cleanup).
const REPLAY_CACHE_PRUNE_INTERVAL = 64;
// Per-provider replay bookkeeping: request key -> expiry timestamp (ms since
// epoch), plus a call counter used to trigger periodic pruning.
type ReplayCache = {
  seenUntil: Map<string, number>;
  calls: number;
};
const twilioReplayCache: ReplayCache = {
  seenUntil: new Map<string, number>(),
  calls: 0,
};
const plivoReplayCache: ReplayCache = {
  seenUntil: new Map<string, number>(),
  calls: 0,
};
const telnyxReplayCache: ReplayCache = {
  seenUntil: new Map<string, number>(),
  calls: 0,
};
/** Hex-encoded SHA-256 digest of `input` (UTF-8). */
function sha256Hex(input: string): string {
  const hasher = crypto.createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
/**
 * Build a deterministic replay key for requests whose signature check was
 * skipped (dev mode), derived from method, URL, and raw body.
 */
function createSkippedVerificationReplayKey(provider: string, ctx: WebhookContext): string {
  const digest = sha256Hex([ctx.method, ctx.url, ctx.rawBody].join("\n"));
  return `${provider}:skip:${digest}`;
}
/**
 * Remove expired entries from a replay cache, then evict oldest entries
 * (Map insertion order) until the cache is within its size cap.
 */
function pruneReplayCache(cache: ReplayCache, now: number): void {
  // Drop everything whose replay window has elapsed.
  for (const [key, expiresAt] of cache.seenUntil) {
    if (expiresAt <= now) {
      cache.seenUntil.delete(key);
    }
  }
  // Enforce the size cap by evicting in insertion order (oldest first).
  while (cache.seenUntil.size > REPLAY_CACHE_MAX_ENTRIES) {
    const oldest = cache.seenUntil.keys().next().value;
    // Compare explicitly against undefined: a truthiness check would abort
    // eviction early if an empty-string key were ever cached.
    if (oldest === undefined) {
      break;
    }
    cache.seenUntil.delete(oldest);
  }
}
/**
 * Record `replayKey` in `cache` and report whether it was already seen
 * within the replay window. Performs amortized cache maintenance.
 */
function markReplay(cache: ReplayCache, replayKey: string): boolean {
  const now = Date.now();
  cache.calls += 1;
  // Periodic sweep keeps expired entries from accumulating between hits.
  if (cache.calls % REPLAY_CACHE_PRUNE_INTERVAL === 0) {
    pruneReplayCache(cache, now);
  }
  const expiresAt = cache.seenUntil.get(replayKey);
  if (expiresAt !== undefined && expiresAt > now) {
    return true;
  }
  cache.seenUntil.set(replayKey, now + REPLAY_WINDOW_MS);
  // If this insertion pushed us over the cap, trim immediately.
  if (cache.seenUntil.size > REPLAY_CACHE_MAX_ENTRIES) {
    pruneReplayCache(cache, now);
  }
  return false;
}
/**
 * Validate Twilio webhook signature using HMAC-SHA1.
 *
 * Twilio signs requests by concatenating the URL with sorted POST params,
 * then computing HMAC-SHA1 with the auth token.
 *
 * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
 */
export function validateTwilioSignature(
  authToken: string,
  signature: string | undefined,
  url: string,
  params: URLSearchParams,
): boolean {
  if (!signature) {
    return false;
  }
  // Recompute the signature Twilio would have produced for this request.
  const payload = buildTwilioDataToSign(url, params);
  const expected = crypto.createHmac("sha1", authToken).update(payload).digest("base64");
  // Constant-time comparison guards against timing side channels.
  return timingSafeEqual(signature, expected);
}
/** Twilio signing payload: the URL followed by POST params sorted by key, concatenated as key+value. */
function buildTwilioDataToSign(url: string, params: URLSearchParams): string {
  const pairs = [...params.entries()];
  pairs.sort(([keyA], [keyB]) => (keyA < keyB ? -1 : keyA > keyB ? 1 : 0));
  return pairs.reduce((acc, [key, value]) => acc + key + value, url);
}
/** Canonical `k=v&k2=v2` rendering of params, sorted by key (used for replay keys). */
function buildCanonicalTwilioParamString(params: URLSearchParams): string {
  const entries = [...params.entries()];
  entries.sort(([keyA], [keyB]) => (keyA < keyB ? -1 : keyA > keyB ? 1 : 0));
  const rendered = entries.map(([key, value]) => `${key}=${value}`);
  return rendered.join("&");
}
/**
 * Timing-safe string comparison to prevent timing attacks.
 *
 * Compares UTF-8 byte buffers. Buffer lengths (not JS string lengths) are
 * compared first: multibyte inputs can have equal character counts but
 * different byte counts, which would make crypto.timingSafeEqual throw a
 * RangeError if we only compared string lengths.
 */
function timingSafeEqual(a: string, b: string): boolean {
  const bufA = Buffer.from(a);
  const bufB = Buffer.from(b);
  if (bufA.length !== bufB.length) {
    // Still do a comparison to keep timing roughly constant.
    crypto.timingSafeEqual(bufA, bufA);
    return false;
  }
  return crypto.timingSafeEqual(bufA, bufB);
}
/**
 * Configuration for secure URL reconstruction.
 *
 * Controls when X-Forwarded-* style headers are trusted while rebuilding
 * the public webhook URL used for signature verification.
 */
export interface WebhookUrlOptions {
  /**
   * Whitelist of allowed hostnames. If provided, only these hosts will be
   * accepted from forwarding headers. This prevents host header injection attacks.
   *
   * SECURITY: You must provide this OR set trustForwardingHeaders=true to use
   * X-Forwarded-Host headers. Without either, forwarding headers are ignored.
   */
  allowedHosts?: string[];
  /**
   * Explicitly trust X-Forwarded-* headers without a whitelist.
   * WARNING: Only set this to true if you trust your proxy configuration
   * and understand the security implications.
   *
   * @default false
   */
  trustForwardingHeaders?: boolean;
  /**
   * List of trusted proxy IP addresses. X-Forwarded-* headers will only be
   * trusted if the request comes from one of these IPs.
   * Requires remoteIP to be set for validation.
   */
  trustedProxyIPs?: string[];
  /**
   * The IP address of the incoming request (for proxy validation).
   * Falls back to the webhook context's remoteAddress when omitted.
   */
  remoteIP?: string;
}
/**
 * Validate that a hostname matches RFC 1123 format.
 * Prevents injection of malformed hostnames.
 */
function isValidHostname(hostname: string): boolean {
  if (!hostname || hostname.length > 253) {
    return false;
  }
  // Labels: alphanumeric with interior hyphens, max 63 chars, dot-separated.
  // This also covers ngrok/tunnel subdomains.
  const label = "[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?";
  const pattern = new RegExp(`^(${label}\\.)*${label}$`);
  return pattern.test(hostname);
}
/**
 * Safely extract hostname from a host header value.
 * Handles IPv6 addresses and prevents injection via malformed values.
 */
function extractHostname(hostHeader: string): string | null {
  if (!hostHeader) {
    return null;
  }
  // Bracketed IPv6 literal, e.g. "[::1]:8080" -> "::1".
  if (hostHeader.startsWith("[")) {
    const closing = hostHeader.indexOf("]");
    if (closing === -1) {
      return null; // Malformed IPv6
    }
    return hostHeader.slice(1, closing).toLowerCase();
  }
  // "user@host" shapes are a classic parser-confusion injection
  // (e.g. attacker.com:80@legitimate.com); reject outright.
  if (hostHeader.includes("@")) {
    return null;
  }
  // Strip an optional :port suffix, then validate per RFC 1123.
  const candidate = hostHeader.split(":")[0];
  return isValidHostname(candidate) ? candidate.toLowerCase() : null;
}
/** Extract the hostname from a possibly comma-separated forwarded header (first entry wins). */
function extractHostnameFromHeader(headerValue: string): string | null {
  const [firstEntry] = headerValue.split(",");
  const trimmed = firstEntry?.trim();
  return trimmed ? extractHostname(trimmed) : null;
}
/** Normalize a host whitelist into a lowercase hostname set; null when effectively empty. */
function normalizeAllowedHosts(allowedHosts?: string[]): Set<string> | null {
  if (!allowedHosts?.length) {
    return null;
  }
  const normalized = new Set<string>();
  for (const entry of allowedHosts) {
    const hostname = extractHostname(entry.trim());
    if (hostname) {
      normalized.add(hostname);
    }
  }
  return normalized.size > 0 ? normalized : null;
}
/**
 * Reconstruct the public webhook URL from request headers.
 *
 * SECURITY: This function validates host headers to prevent host header
 * injection attacks. When using forwarding headers (X-Forwarded-Host, etc.),
 * always provide allowedHosts to whitelist valid hostnames.
 *
 * When behind a reverse proxy (Tailscale, nginx, ngrok), the original URL
 * used by Twilio differs from the local request URL. We use standard
 * forwarding headers to reconstruct it.
 *
 * Priority order:
 * 1. X-Forwarded-Proto + X-Forwarded-Host (standard proxy headers)
 * 2. X-Original-Host (nginx)
 * 3. Ngrok-Forwarded-Host (ngrok specific)
 * 4. Host header (direct connection)
 *
 * @param ctx Incoming webhook request (headers, url, rawBody, optional remoteAddress).
 * @param options Controls when forwarding headers may be trusted.
 * @returns `proto://host/path?query` — defaults to https when no trusted proxy proto.
 */
export function reconstructWebhookUrl(ctx: WebhookContext, options?: WebhookUrlOptions): string {
  const { headers } = ctx;
  // SECURITY: Only trust forwarding headers if explicitly configured.
  // Either allowedHosts must be set (for whitelist validation) or
  // trustForwardingHeaders must be true (explicit opt-in to trust).
  const allowedHosts = normalizeAllowedHosts(options?.allowedHosts);
  const hasAllowedHosts = allowedHosts !== null;
  const explicitlyTrusted = options?.trustForwardingHeaders === true;
  // Also check trusted proxy IPs if configured
  const trustedProxyIPs = options?.trustedProxyIPs?.filter(Boolean) ?? [];
  const hasTrustedProxyIPs = trustedProxyIPs.length > 0;
  const remoteIP = options?.remoteIP ?? ctx.remoteAddress;
  // With no trustedProxyIPs configured, any sender passes this gate; with
  // them configured, a missing remote IP fails closed (headers ignored).
  const fromTrustedProxy =
    !hasTrustedProxyIPs || (remoteIP ? trustedProxyIPs.includes(remoteIP) : false);
  // Only trust forwarding headers if: (has whitelist OR explicitly trusted) AND from trusted proxy
  const shouldTrustForwardingHeaders = (hasAllowedHosts || explicitlyTrusted) && fromTrustedProxy;
  const isAllowedForwardedHost = (host: string): boolean => !allowedHosts || allowedHosts.has(host);
  // Determine protocol - only trust X-Forwarded-Proto from trusted proxies
  let proto = "https";
  if (shouldTrustForwardingHeaders) {
    const forwardedProto = getHeader(headers, "x-forwarded-proto");
    if (forwardedProto === "http" || forwardedProto === "https") {
      proto = forwardedProto;
    }
  }
  // Determine host - with security validation
  let host: string | null = null;
  if (shouldTrustForwardingHeaders) {
    // Try forwarding headers in priority order
    const forwardingHeaders = ["x-forwarded-host", "x-original-host", "ngrok-forwarded-host"];
    for (const headerName of forwardingHeaders) {
      const headerValue = getHeader(headers, headerName);
      if (headerValue) {
        const extracted = extractHostnameFromHeader(headerValue);
        if (extracted && isAllowedForwardedHost(extracted)) {
          host = extracted;
          break;
        }
      }
    }
  }
  // Fallback to Host header if no valid forwarding header found
  if (!host) {
    const hostHeader = getHeader(headers, "host");
    if (hostHeader) {
      const extracted = extractHostnameFromHeader(hostHeader);
      if (extracted) {
        host = extracted;
      }
    }
  }
  // Last resort: try to extract from ctx.url
  if (!host) {
    try {
      const parsed = new URL(ctx.url);
      const extracted = extractHostname(parsed.host);
      if (extracted) {
        host = extracted;
      }
    } catch {
      // URL parsing failed - use empty string (will result in invalid URL)
      host = "";
    }
  }
  if (!host) {
    host = "";
  }
  // Extract path from the context URL (fallback to "/" on parse failure)
  let path = "/";
  try {
    const parsed = new URL(ctx.url);
    path = parsed.pathname + parsed.search;
  } catch {
    // URL parsing failed
  }
  return `${proto}://${host}${path}`;
}
/**
 * Resolve the URL Twilio signed: the configured public URL (with the
 * request's path and query grafted on), or a header-reconstructed URL.
 */
function buildTwilioVerificationUrl(
  ctx: WebhookContext,
  publicUrl?: string,
  urlOptions?: WebhookUrlOptions,
): string {
  if (!publicUrl) {
    return reconstructWebhookUrl(ctx, urlOptions);
  }
  try {
    const resolved = new URL(publicUrl);
    const incoming = new URL(ctx.url);
    resolved.pathname = incoming.pathname;
    resolved.search = incoming.search;
    return resolved.toString();
  } catch {
    // Unparseable URL(s): fall back to the configured public URL verbatim.
    return publicUrl;
  }
}
/** True for IPv4/IPv6 loopback addresses, including IPv4-mapped IPv6 forms. */
function isLoopbackAddress(address?: string): boolean {
  if (!address) {
    return false;
  }
  return address === "127.0.0.1" || address === "::1" || address.startsWith("::ffff:127.");
}
/**
 * Result of Twilio webhook verification with detailed info.
 */
export interface TwilioVerificationResult {
  ok: boolean;
  reason?: string;
  /** The URL that was used for verification (for debugging) */
  verificationUrl?: string;
  /** Whether we're running behind ngrok free tier */
  isNgrokFreeTier?: boolean;
  /** Request is cryptographically valid but was already processed recently. */
  isReplay?: boolean;
  /** Stable request identity derived from signed Twilio material. */
  verifiedRequestKey?: string;
}
/**
 * Result of Telnyx webhook verification.
 */
export interface TelnyxVerificationResult {
  ok: boolean;
  reason?: string;
  /** Request is cryptographically valid but was already processed recently. */
  isReplay?: boolean;
  /** Stable request identity derived from signed Telnyx material. */
  verifiedRequestKey?: string;
}
/**
 * Stable identity for a verified Twilio request, derived from the signed
 * URL, canonicalized POST params, and the presented signature.
 */
function createTwilioReplayKey(params: {
  verificationUrl: string;
  signature: string;
  requestParams: URLSearchParams;
}): string {
  const canonical = buildCanonicalTwilioParamString(params.requestParams);
  const digest = sha256Hex([params.verificationUrl, canonical, params.signature].join("\n"));
  return `twilio:req:${digest}`;
}
/** Decode Base64 or Base64URL input (Telnyx docs say Base64; some tooling emits Base64URL). */
function decodeBase64OrBase64Url(input: string): Buffer {
  // Map the URL-safe alphabet back to standard Base64, then restore padding.
  const standard = input.replace(/-/g, "+").replace(/_/g, "/");
  const remainder = standard.length % 4;
  const padding = remainder === 0 ? "" : "=".repeat(4 - remainder);
  return Buffer.from(standard + padding, "base64");
}
/** Base64URL-encode a buffer without padding (as required for JWK fields). */
function base64UrlEncode(buf: Buffer): string {
  // Node's built-in "base64url" encoding is unpadded and uses the -/_ alphabet,
  // matching the manual replace-based approach exactly.
  return buf.toString("base64url");
}
/**
 * Accept an Ed25519 public key as PEM (SPKI), Base64 raw 32-byte key, or
 * Base64 DER SPKI, returning a value crypto.verify can consume.
 */
function importEd25519PublicKey(publicKey: string): crypto.KeyObject | string {
  const trimmed = publicKey.trim();
  if (trimmed.startsWith("-----BEGIN")) {
    // Already PEM (spki); crypto.verify accepts the string directly.
    return trimmed;
  }
  const decoded = decodeBase64OrBase64Url(trimmed);
  if (decoded.length !== 32) {
    // Base64-encoded DER SPKI key.
    return crypto.createPublicKey({
      key: decoded,
      format: "der",
      type: "spki",
    });
  }
  // Raw 32-byte Ed25519 key: JWK is the easiest portable import path in
  // Node crypto.
  return crypto.createPublicKey({
    key: { kty: "OKP", crv: "Ed25519", x: base64UrlEncode(decoded) },
    format: "jwk",
  });
}
/**
 * Verify Telnyx webhook signature using Ed25519.
 *
 * Telnyx signs `timestamp|payload` and provides:
 * - `telnyx-signature-ed25519` (Base64 signature)
 * - `telnyx-timestamp` (Unix seconds)
 */
export function verifyTelnyxWebhook(
  ctx: WebhookContext,
  publicKey: string | undefined,
  options?: {
    /** Skip verification entirely (only for development) */
    skipVerification?: boolean;
    /** Maximum allowed clock skew (ms). Defaults to 5 minutes. */
    maxSkewMs?: number;
  },
): TelnyxVerificationResult {
  // Dev-mode escape hatch: no crypto, but still track replays.
  if (options?.skipVerification) {
    const replayKey = createSkippedVerificationReplayKey("telnyx", ctx);
    return {
      ok: true,
      reason: "verification skipped (dev mode)",
      isReplay: markReplay(telnyxReplayCache, replayKey),
      verifiedRequestKey: replayKey,
    };
  }
  if (!publicKey) {
    return { ok: false, reason: "Missing telnyx.publicKey (configure to verify webhooks)" };
  }
  const signatureHeader = getHeader(ctx.headers, "telnyx-signature-ed25519");
  const timestampHeader = getHeader(ctx.headers, "telnyx-timestamp");
  if (!signatureHeader || !timestampHeader) {
    return { ok: false, reason: "Missing signature or timestamp header" };
  }
  const timestampSec = parseInt(timestampHeader, 10);
  if (!Number.isFinite(timestampSec)) {
    return { ok: false, reason: "Invalid timestamp header" };
  }
  try {
    // Telnyx signs "<timestamp>|<raw body>" with its Ed25519 private key.
    const message = Buffer.from(`${timestampHeader}|${ctx.rawBody}`);
    const key = importEd25519PublicKey(publicKey);
    const signatureOk = crypto.verify(null, message, key, decodeBase64OrBase64Url(signatureHeader));
    if (!signatureOk) {
      return { ok: false, reason: "Invalid signature" };
    }
    // Reject stale (or far-future) timestamps once the signature checks out.
    const maxSkewMs = options?.maxSkewMs ?? 5 * 60 * 1000;
    if (Math.abs(Date.now() - timestampSec * 1000) > maxSkewMs) {
      return { ok: false, reason: "Timestamp too old" };
    }
    const replayKey = `telnyx:${sha256Hex(`${timestampHeader}\n${signatureHeader}\n${ctx.rawBody}`)}`;
    return {
      ok: true,
      isReplay: markReplay(telnyxReplayCache, replayKey),
      verifiedRequestKey: replayKey,
    };
  } catch (err) {
    return {
      ok: false,
      reason: `Verification error: ${err instanceof Error ? err.message : String(err)}`,
    };
  }
}
/**
 * Verify Twilio webhook with full context and detailed result.
 *
 * Reconstructs the URL Twilio signed (config publicUrl, or headers subject
 * to the host-trust options), validates the HMAC-SHA1 signature over
 * URL + sorted POST params, and tracks replays of verified requests.
 */
export function verifyTwilioWebhook(
  ctx: WebhookContext,
  authToken: string,
  options?: {
    /** Override the public URL (e.g., from config) */
    publicUrl?: string;
    /**
     * Allow ngrok free tier compatibility mode (loopback only).
     *
     * IMPORTANT: This does NOT bypass signature verification.
     * It only enables trusting forwarded headers on loopback so we can
     * reconstruct the public ngrok URL that Twilio used for signing.
     */
    allowNgrokFreeTierLoopbackBypass?: boolean;
    /** Skip verification entirely (only for development) */
    skipVerification?: boolean;
    /**
     * Whitelist of allowed hostnames for host header validation.
     * Prevents host header injection attacks.
     */
    allowedHosts?: string[];
    /**
     * Explicitly trust X-Forwarded-* headers without a whitelist.
     * WARNING: Only enable if you trust your proxy configuration.
     * @default false
     */
    trustForwardingHeaders?: boolean;
    /**
     * List of trusted proxy IP addresses. X-Forwarded-* headers will only
     * be trusted from these IPs.
     */
    trustedProxyIPs?: string[];
    /**
     * The remote IP address of the request (for proxy validation).
     */
    remoteIP?: string;
  },
): TwilioVerificationResult {
  // Allow skipping verification for development/testing
  if (options?.skipVerification) {
    const replayKey = createSkippedVerificationReplayKey("twilio", ctx);
    const isReplay = markReplay(twilioReplayCache, replayKey);
    return {
      ok: true,
      reason: "verification skipped (dev mode)",
      isReplay,
      verifiedRequestKey: replayKey,
    };
  }
  const signature = getHeader(ctx.headers, "x-twilio-signature");
  if (!signature) {
    return { ok: false, reason: "Missing X-Twilio-Signature header" };
  }
  // Ngrok compatibility mode is gated to loopback senders only.
  const isLoopback = isLoopbackAddress(options?.remoteIP ?? ctx.remoteAddress);
  const allowLoopbackForwarding = options?.allowNgrokFreeTierLoopbackBypass && isLoopback;
  // Reconstruct the URL Twilio used
  const verificationUrl = buildTwilioVerificationUrl(ctx, options?.publicUrl, {
    allowedHosts: options?.allowedHosts,
    trustForwardingHeaders: options?.trustForwardingHeaders || allowLoopbackForwarding,
    trustedProxyIPs: options?.trustedProxyIPs,
    remoteIP: options?.remoteIP,
  });
  // Parse the body as URL-encoded params
  const params = new URLSearchParams(ctx.rawBody);
  const isValid = validateTwilioSignature(authToken, signature, verificationUrl, params);
  if (isValid) {
    // Replay identity uses only signed material (URL, params, signature),
    // so unsigned headers cannot defeat replay detection.
    const replayKey = createTwilioReplayKey({
      verificationUrl,
      signature,
      requestParams: params,
    });
    const isReplay = markReplay(twilioReplayCache, replayKey);
    return { ok: true, verificationUrl, isReplay, verifiedRequestKey: replayKey };
  }
  // Check if this is ngrok free tier - the URL might have different format
  const isNgrokFreeTier =
    verificationUrl.includes(".ngrok-free.app") || verificationUrl.includes(".ngrok.io");
  return {
    ok: false,
    reason: `Invalid signature for URL: ${verificationUrl}`,
    verificationUrl,
    isNgrokFreeTier,
  };
}
// -----------------------------------------------------------------------------
// Plivo webhook verification
// -----------------------------------------------------------------------------
/**
 * Result of Plivo webhook verification with detailed info.
 */
export interface PlivoVerificationResult {
  ok: boolean;
  reason?: string;
  /** The URL that was used for verification (for debugging). */
  verificationUrl?: string;
  /** Signature version used for verification */
  version?: "v3" | "v2";
  /** Request is cryptographically valid but was already processed recently. */
  isReplay?: boolean;
  /** Stable request identity derived from signed Plivo material. */
  verifiedRequestKey?: string;
}
/** Canonicalize base64 (decode then re-encode) to match Plivo SDK comparison behavior. */
function normalizeSignatureBase64(input: string): string {
  const decoded = Buffer.from(input, "base64");
  return decoded.toString("base64");
}
/** Strip query and fragment, keeping `scheme://host[:port]/path`. */
function getBaseUrlNoQuery(url: string): string {
  const parsed = new URL(url);
  return `${parsed.protocol}//${parsed.host}${parsed.pathname}`;
}
/**
 * Timing-safe string comparison (Plivo variant).
 *
 * Compares UTF-8 byte buffers; buffer lengths (not JS string lengths) are
 * compared first so multibyte inputs with equal character counts but
 * different byte counts cannot make crypto.timingSafeEqual throw.
 */
function timingSafeEqualString(a: string, b: string): boolean {
  const bufA = Buffer.from(a);
  const bufB = Buffer.from(b);
  if (bufA.length !== bufB.length) {
    // Burn a comparison anyway to keep timing roughly constant.
    crypto.timingSafeEqual(bufA, bufA);
    return false;
  }
  return crypto.timingSafeEqual(bufA, bufB);
}
/** Check a Plivo V2 signature: HMAC-SHA256 over (URL-without-query + nonce), base64. */
function validatePlivoV2Signature(params: {
  authToken: string;
  signature: string;
  nonce: string;
  url: string;
}): boolean {
  const signedMaterial = getBaseUrlNoQuery(params.url) + params.nonce;
  const computed = crypto
    .createHmac("sha256", params.authToken)
    .update(signedMaterial)
    .digest("base64");
  // Canonicalize both sides before the constant-time compare.
  return timingSafeEqualString(
    normalizeSignatureBase64(computed),
    normalizeSignatureBase64(params.signature),
  );
}
// Multimap of parameter names to their (possibly repeated) values.
type PlivoParamMap = Record<string, string[]>;
/** Group URLSearchParams entries into a name -> values multimap, preserving order. */
function toParamMapFromSearchParams(sp: URLSearchParams): PlivoParamMap {
  const map: PlivoParamMap = {};
  for (const [key, value] of sp.entries()) {
    (map[key] ??= []).push(value);
  }
  return map;
}
/** Render a param map as "k=v&..." with keys and per-key values sorted (Plivo V3 canon). */
function sortedQueryString(params: PlivoParamMap): string {
  return Object.keys(params)
    .toSorted()
    .flatMap((key) => [...params[key]].toSorted().map((value) => `${key}=${value}`))
    .join("&");
}
/**
 * Concatenate params as key+value with no separators (Plivo V3 canonical
 * POST-param form), using the same deterministic ordering as sortedQueryString.
 */
function sortedParamsString(params: Record<string, string[]>): string {
  return Object.keys(params)
    .sort()
    .flatMap((key) => [...params[key]].sort().map((value) => `${key}${value}`))
    .join("");
}
/**
 * Build the Plivo V3 canonical base URL. The query portion is always sorted;
 * a '?' is appended whenever there is a query OR POST params, and a '.'
 * separates query from POST params only when both are present. POST params
 * (sorted, unseparated) are appended for POST requests only.
 */
function constructPlivoV3BaseUrl(params: {
  method: "GET" | "POST";
  url: string;
  postParams: PlivoParamMap;
}): string {
  const parsed = new URL(params.url);
  const withoutQuery = `${parsed.protocol}//${parsed.host}${parsed.pathname}`;
  const queryString = sortedQueryString(toParamMapFromSearchParams(parsed.searchParams));
  const hasPostParams = Object.keys(params.postParams).length > 0;

  let baseUrl = withoutQuery;
  if (queryString.length > 0 || hasPostParams) {
    baseUrl = `${withoutQuery}?${queryString}`;
  }
  if (queryString.length > 0 && hasPostParams) {
    baseUrl = `${baseUrl}.`;
  }
  if (params.method === "GET") {
    return baseUrl;
  }
  return baseUrl + sortedParamsString(params.postParams);
}
/**
 * Validate a Plivo V3 signature: HMAC-SHA256 over `${canonicalBaseUrl}.${nonce}`.
 * The header may carry several comma-separated candidate signatures; the
 * request is accepted if any one of them matches in constant time.
 */
function validatePlivoV3Signature(params: {
  authToken: string;
  signatureHeader: string;
  nonce: string;
  method: "GET" | "POST";
  url: string;
  postParams: PlivoParamMap;
}): boolean {
  const baseUrl = constructPlivoV3BaseUrl({
    method: params.method,
    url: params.url,
    postParams: params.postParams,
  });
  const expected = normalizeSignatureBase64(
    crypto
      .createHmac("sha256", params.authToken)
      .update(`${baseUrl}.${params.nonce}`)
      .digest("base64"),
  );
  return params.signatureHeader
    .split(",")
    .map((candidate) => candidate.trim())
    .filter(Boolean)
    .some((candidate) => timingSafeEqualString(expected, normalizeSignatureBase64(candidate)));
}
/**
 * Verify Plivo webhooks using V3 signature if present; fall back to V2.
 *
 * Header names (case-insensitive; Node provides lower-case keys):
 * - V3: X-Plivo-Signature-V3 / X-Plivo-Signature-V3-Nonce
 * - V2: X-Plivo-Signature-V2 / X-Plivo-Signature-V2-Nonce
 */
export function verifyPlivoWebhook(
  ctx: WebhookContext,
  authToken: string,
  options?: {
    /** Override the public URL origin (host) used for verification */
    publicUrl?: string;
    /** Skip verification entirely (only for development) */
    skipVerification?: boolean;
    /**
     * Whitelist of allowed hostnames for host header validation.
     * Prevents host header injection attacks.
     */
    allowedHosts?: string[];
    /**
     * Explicitly trust X-Forwarded-* headers without a whitelist.
     * WARNING: Only enable if you trust your proxy configuration.
     * @default false
     */
    trustForwardingHeaders?: boolean;
    /**
     * List of trusted proxy IP addresses. X-Forwarded-* headers will only
     * be trusted from these IPs.
     */
    trustedProxyIPs?: string[];
    /**
     * The remote IP address of the request (for proxy validation).
     */
    remoteIP?: string;
  },
): PlivoVerificationResult {
  // Dev-mode escape hatch: trust the request, but still track replays so
  // repeated deliveries are flagged.
  if (options?.skipVerification) {
    const replayKey = createSkippedVerificationReplayKey("plivo", ctx);
    return {
      ok: true,
      reason: "verification skipped (dev mode)",
      isReplay: markReplay(plivoReplayCache, replayKey),
      verifiedRequestKey: replayKey,
    };
  }

  const reconstructed = reconstructWebhookUrl(ctx, {
    allowedHosts: options?.allowedHosts,
    trustForwardingHeaders: options?.trustForwardingHeaders,
    trustedProxyIPs: options?.trustedProxyIPs,
    remoteIP: options?.remoteIP,
  });

  // With a configured public URL, keep the request's path/query but swap in
  // the configured origin; fall back to the reconstructed URL on parse errors.
  let verificationUrl = reconstructed;
  if (options?.publicUrl) {
    try {
      const requestUrl = new URL(reconstructed);
      const overrideUrl = new URL(options.publicUrl);
      overrideUrl.pathname = requestUrl.pathname;
      overrideUrl.search = requestUrl.search;
      verificationUrl = overrideUrl.toString();
    } catch {
      verificationUrl = reconstructed;
    }
  }

  // Prefer the stronger V3 scheme whenever both of its header pieces arrived.
  const signatureV3 = getHeader(ctx.headers, "x-plivo-signature-v3");
  const nonceV3 = getHeader(ctx.headers, "x-plivo-signature-v3-nonce");
  if (signatureV3 && nonceV3) {
    const method = ctx.method === "GET" || ctx.method === "POST" ? ctx.method : null;
    if (!method) {
      return {
        ok: false,
        version: "v3",
        verificationUrl,
        reason: `Unsupported HTTP method for Plivo V3 signature: ${ctx.method}`,
      };
    }
    const valid = validatePlivoV3Signature({
      authToken,
      signatureHeader: signatureV3,
      nonce: nonceV3,
      method,
      url: verificationUrl,
      postParams: toParamMapFromSearchParams(new URLSearchParams(ctx.rawBody)),
    });
    if (!valid) {
      return {
        ok: false,
        version: "v3",
        verificationUrl,
        reason: "Invalid Plivo V3 signature",
      };
    }
    const replayKey = `plivo:v3:${sha256Hex(`${verificationUrl}\n${nonceV3}`)}`;
    return {
      ok: true,
      version: "v3",
      verificationUrl,
      isReplay: markReplay(plivoReplayCache, replayKey),
      verifiedRequestKey: replayKey,
    };
  }

  // V2 fallback.
  const signatureV2 = getHeader(ctx.headers, "x-plivo-signature-v2");
  const nonceV2 = getHeader(ctx.headers, "x-plivo-signature-v2-nonce");
  if (signatureV2 && nonceV2) {
    const valid = validatePlivoV2Signature({
      authToken,
      signature: signatureV2,
      nonce: nonceV2,
      url: verificationUrl,
    });
    if (!valid) {
      return {
        ok: false,
        version: "v2",
        verificationUrl,
        reason: "Invalid Plivo V2 signature",
      };
    }
    const replayKey = `plivo:v2:${sha256Hex(`${verificationUrl}\n${nonceV2}`)}`;
    return {
      ok: true,
      version: "v2",
      verificationUrl,
      isReplay: markReplay(plivoReplayCache, replayKey),
      verifiedRequestKey: replayKey,
    };
  }

  return {
    ok: false,
    reason: "Missing Plivo signature headers (V3 or V2)",
    verificationUrl,
  };
}

View File

@@ -0,0 +1,253 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { VoiceCallConfigSchema, type VoiceCallConfig } from "./config.js";
import type { CallManager } from "./manager.js";
import type { VoiceCallProvider } from "./providers/base.js";
import type { CallRecord } from "./types.js";
import { VoiceCallWebhookServer } from "./webhook.js";
// Minimal always-verifying mock provider; individual tests spread over it to
// override verifyWebhook/parseWebhookEvent behavior.
const provider: VoiceCallProvider = {
  name: "mock",
  verifyWebhook() {
    return { ok: true, verifiedRequestKey: "mock:req:base" };
  },
  parseWebhookEvent() {
    return { events: [] };
  },
  async initiateCall() {
    return { providerCallId: "provider-call", status: "initiated" };
  },
  async hangupCall() {},
  async playTts() {},
  async startListening() {},
  async stopListening() {},
};
// Build a config from schema defaults (on an ephemeral port), layering shallow
// overrides and deep-merging the `serve` section so partial overrides keep defaults.
const createConfig = (overrides: Partial<VoiceCallConfig> = {}): VoiceCallConfig => {
  const defaults = VoiceCallConfigSchema.parse({});
  defaults.serve.port = 0;
  const merged: VoiceCallConfig = {
    ...defaults,
    ...overrides,
    serve: { ...defaults.serve, ...(overrides.serve ?? {}) },
  };
  return merged;
};
// Outbound call fixture in its initial state, started at the given timestamp.
const createCall = (startedAt: number): CallRecord => {
  return {
    callId: "call-1",
    providerCallId: "provider-call-1",
    provider: "mock",
    direction: "outbound",
    state: "initiated",
    from: "+15550001234",
    to: "+15550005678",
    startedAt,
    transcript: [],
    processedEventIds: [],
  };
};
// Stub manager exposing only what the webhook server touches, with spies
// returned alongside so tests can assert on calls.
const createManager = (calls: CallRecord[]) => {
  const endCall = vi.fn(async () => ({ success: true }));
  const processEvent = vi.fn();
  const managerStub = {
    getActiveCalls: () => calls,
    endCall,
    processEvent,
  };
  return { manager: managerStub as unknown as CallManager, endCall, processEvent };
};
describe("VoiceCallWebhookServer stale call reaper", () => {
  // Fake timers let the tests advance the reaper's periodic sweep instantly.
  beforeEach(() => {
    vi.useFakeTimers();
  });
  afterEach(() => {
    vi.useRealTimers();
  });
  it("ends calls older than staleCallReaperSeconds", async () => {
    const now = new Date("2026-02-16T00:00:00Z");
    vi.setSystemTime(now);
    // Started 120s ago against a 60s threshold -> stale, must be reaped.
    const call = createCall(now.getTime() - 120_000);
    const { manager, endCall } = createManager([call]);
    const config = createConfig({ staleCallReaperSeconds: 60 });
    const server = new VoiceCallWebhookServer(config, manager, provider);
    try {
      await server.start();
      // Advance one 30s sweep interval to trigger the first check.
      await vi.advanceTimersByTimeAsync(30_000);
      expect(endCall).toHaveBeenCalledWith(call.callId);
    } finally {
      await server.stop();
    }
  });
  it("skips calls that are younger than the threshold", async () => {
    const now = new Date("2026-02-16T00:00:00Z");
    vi.setSystemTime(now);
    // Only 10s old against a 60s threshold -> must be left alone.
    const call = createCall(now.getTime() - 10_000);
    const { manager, endCall } = createManager([call]);
    const config = createConfig({ staleCallReaperSeconds: 60 });
    const server = new VoiceCallWebhookServer(config, manager, provider);
    try {
      await server.start();
      await vi.advanceTimersByTimeAsync(30_000);
      expect(endCall).not.toHaveBeenCalled();
    } finally {
      await server.stop();
    }
  });
  it("does not run when staleCallReaperSeconds is disabled", async () => {
    const now = new Date("2026-02-16T00:00:00Z");
    vi.setSystemTime(now);
    // Call is stale by age, but the reaper is disabled (threshold 0),
    // so it must never be ended even after two sweep intervals.
    const call = createCall(now.getTime() - 120_000);
    const { manager, endCall } = createManager([call]);
    const config = createConfig({ staleCallReaperSeconds: 0 });
    const server = new VoiceCallWebhookServer(config, manager, provider);
    try {
      await server.start();
      await vi.advanceTimersByTimeAsync(60_000);
      expect(endCall).not.toHaveBeenCalled();
    } finally {
      await server.stop();
    }
  });
});
describe("VoiceCallWebhookServer replay handling", () => {
  it("acknowledges replayed webhook requests and skips event side effects", async () => {
    // Provider flags the request as a replay even though parsing would yield
    // a speech event; the server must ACK (200) but not process the event.
    const replayProvider: VoiceCallProvider = {
      ...provider,
      verifyWebhook: () => ({ ok: true, isReplay: true, verifiedRequestKey: "mock:req:replay" }),
      parseWebhookEvent: () => ({
        events: [
          {
            id: "evt-replay",
            dedupeKey: "stable-replay",
            type: "call.speech",
            callId: "call-1",
            providerCallId: "provider-call-1",
            timestamp: Date.now(),
            transcript: "hello",
            isFinal: true,
          },
        ],
        statusCode: 200,
      }),
    };
    const { manager, processEvent } = createManager([]);
    const config = createConfig({ serve: { port: 0, bind: "127.0.0.1", path: "/voice/webhook" } });
    const server = new VoiceCallWebhookServer(config, manager, replayProvider);
    try {
      const baseUrl = await server.start();
      // Port 0 means the OS picked one; read it back off the bound socket.
      const address = (
        server as unknown as { server?: { address?: () => unknown } }
      ).server?.address?.();
      const requestUrl = new URL(baseUrl);
      if (address && typeof address === "object" && "port" in address && address.port) {
        requestUrl.port = String(address.port);
      }
      const response = await fetch(requestUrl.toString(), {
        method: "POST",
        headers: { "content-type": "application/x-www-form-urlencoded" },
        body: "CallSid=CA123&SpeechResult=hello",
      });
      expect(response.status).toBe(200);
      expect(processEvent).not.toHaveBeenCalled();
    } finally {
      await server.stop();
    }
  });
  it("passes verified request key from verifyWebhook into parseWebhookEvent", async () => {
    // Spy parser echoes the verified key into the event's dedupeKey so we can
    // assert the key flowed from verification through parsing to processing.
    const parseWebhookEvent = vi.fn((_ctx: unknown, options?: { verifiedRequestKey?: string }) => ({
      events: [
        {
          id: "evt-verified",
          dedupeKey: options?.verifiedRequestKey,
          type: "call.speech" as const,
          callId: "call-1",
          providerCallId: "provider-call-1",
          timestamp: Date.now(),
          transcript: "hello",
          isFinal: true,
        },
      ],
      statusCode: 200,
    }));
    const verifiedProvider: VoiceCallProvider = {
      ...provider,
      verifyWebhook: () => ({ ok: true, verifiedRequestKey: "verified:req:123" }),
      parseWebhookEvent,
    };
    const { manager, processEvent } = createManager([]);
    const config = createConfig({ serve: { port: 0, bind: "127.0.0.1", path: "/voice/webhook" } });
    const server = new VoiceCallWebhookServer(config, manager, verifiedProvider);
    try {
      const baseUrl = await server.start();
      // Resolve the OS-assigned ephemeral port (serve.port was 0).
      const address = (
        server as unknown as { server?: { address?: () => unknown } }
      ).server?.address?.();
      const requestUrl = new URL(baseUrl);
      if (address && typeof address === "object" && "port" in address && address.port) {
        requestUrl.port = String(address.port);
      }
      const response = await fetch(requestUrl.toString(), {
        method: "POST",
        headers: { "content-type": "application/x-www-form-urlencoded" },
        body: "CallSid=CA123&SpeechResult=hello",
      });
      expect(response.status).toBe(200);
      expect(parseWebhookEvent).toHaveBeenCalledTimes(1);
      expect(parseWebhookEvent.mock.calls[0]?.[1]).toEqual({
        verifiedRequestKey: "verified:req:123",
      });
      expect(processEvent).toHaveBeenCalledTimes(1);
      expect(processEvent.mock.calls[0]?.[0]?.dedupeKey).toBe("verified:req:123");
    } finally {
      await server.stop();
    }
  });
  it("rejects requests when verification succeeds without a request key", async () => {
    // ok:true but no verifiedRequestKey -> server must 401 and never parse.
    const parseWebhookEvent = vi.fn(() => ({ events: [], statusCode: 200 }));
    const badProvider: VoiceCallProvider = {
      ...provider,
      verifyWebhook: () => ({ ok: true }),
      parseWebhookEvent,
    };
    const { manager } = createManager([]);
    const config = createConfig({ serve: { port: 0, bind: "127.0.0.1", path: "/voice/webhook" } });
    const server = new VoiceCallWebhookServer(config, manager, badProvider);
    try {
      const baseUrl = await server.start();
      // Resolve the OS-assigned ephemeral port (serve.port was 0).
      const address = (
        server as unknown as { server?: { address?: () => unknown } }
      ).server?.address?.();
      const requestUrl = new URL(baseUrl);
      if (address && typeof address === "object" && "port" in address && address.port) {
        requestUrl.port = String(address.port);
      }
      const response = await fetch(requestUrl.toString(), {
        method: "POST",
        headers: { "content-type": "application/x-www-form-urlencoded" },
        body: "CallSid=CA123&SpeechResult=hello",
      });
      expect(response.status).toBe(401);
      expect(parseWebhookEvent).not.toHaveBeenCalled();
    } finally {
      await server.stop();
    }
  });
});

View File

@@ -0,0 +1,538 @@
import { spawn } from "node:child_process";
import http from "node:http";
import { URL } from "node:url";
import {
isRequestBodyLimitError,
readRequestBodyWithLimit,
requestBodyErrorToText,
} from "openclaw/plugin-sdk";
import type { VoiceCallConfig } from "./config.js";
import type { CoreConfig } from "./core-bridge.js";
import type { CallManager } from "./manager.js";
import type { MediaStreamConfig } from "./media-stream.js";
import { MediaStreamHandler } from "./media-stream.js";
import type { VoiceCallProvider } from "./providers/base.js";
import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js";
import type { TwilioProvider } from "./providers/twilio.js";
import type { NormalizedEvent, WebhookContext } from "./types.js";
import { startStaleCallReaper } from "./webhook/stale-call-reaper.js";
// Reject webhook bodies larger than 1 MiB to bound memory usage per request.
const MAX_WEBHOOK_BODY_BYTES = 1024 * 1024;
/**
 * HTTP server for receiving voice call webhooks from providers.
 * Supports WebSocket upgrades for media streams when streaming is enabled.
 */
export class VoiceCallWebhookServer {
  private server: http.Server | null = null;
  private config: VoiceCallConfig;
  private manager: CallManager;
  private provider: VoiceCallProvider;
  // Optional bridge to core config; auto-responses are skipped when null.
  private coreConfig: CoreConfig | null;
  // Cleanup callback from startStaleCallReaper; null while the reaper is off.
  private stopStaleCallReaper: (() => void) | null = null;
  /** Media stream handler for bidirectional audio (when streaming enabled) */
  private mediaStreamHandler: MediaStreamHandler | null = null;

  constructor(
    config: VoiceCallConfig,
    manager: CallManager,
    provider: VoiceCallProvider,
    coreConfig?: CoreConfig,
  ) {
    this.config = config;
    this.manager = manager;
    this.provider = provider;
    this.coreConfig = coreConfig ?? null;
    // Initialize media stream handler if streaming is enabled
    if (config.streaming?.enabled) {
      this.initializeMediaStreaming();
    }
  }

  /**
   * Get the media stream handler (for wiring to provider).
   */
  getMediaStreamHandler(): MediaStreamHandler | null {
    return this.mediaStreamHandler;
  }

  /**
   * Initialize media streaming with OpenAI Realtime STT.
   * Becomes a no-op (with a warning) when no OpenAI API key is available
   * from config or the OPENAI_API_KEY environment variable.
   */
  private initializeMediaStreaming(): void {
    const apiKey = this.config.streaming?.openaiApiKey || process.env.OPENAI_API_KEY;
    if (!apiKey) {
      console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
      return;
    }
    const sttProvider = new OpenAIRealtimeSTTProvider({
      apiKey,
      model: this.config.streaming?.sttModel,
      silenceDurationMs: this.config.streaming?.silenceDurationMs,
      vadThreshold: this.config.streaming?.vadThreshold,
    });
    const streamConfig: MediaStreamConfig = {
      sttProvider,
      preStartTimeoutMs: this.config.streaming?.preStartTimeoutMs,
      maxPendingConnections: this.config.streaming?.maxPendingConnections,
      maxPendingConnectionsPerIp: this.config.streaming?.maxPendingConnectionsPerIp,
      maxConnections: this.config.streaming?.maxConnections,
      // Accept a stream only for a known call; for Twilio, additionally
      // require a valid per-call stream token.
      shouldAcceptStream: ({ callId, token }) => {
        const call = this.manager.getCallByProviderCallId(callId);
        if (!call) {
          return false;
        }
        if (this.provider.name === "twilio") {
          const twilio = this.provider as TwilioProvider;
          if (!twilio.isValidStreamToken(callId, token)) {
            console.warn(`[voice-call] Rejecting media stream: invalid token for ${callId}`);
            return false;
          }
        }
        return true;
      },
      onTranscript: (providerCallId, transcript) => {
        console.log(`[voice-call] Transcript for ${providerCallId}: ${transcript}`);
        // Clear TTS queue on barge-in (user started speaking, interrupt current playback)
        if (this.provider.name === "twilio") {
          (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
        }
        // Look up our internal call ID from the provider call ID
        const call = this.manager.getCallByProviderCallId(providerCallId);
        if (!call) {
          console.warn(`[voice-call] No active call found for provider ID: ${providerCallId}`);
          return;
        }
        // Create a speech event and process it through the manager
        const event: NormalizedEvent = {
          id: `stream-transcript-${Date.now()}`,
          type: "call.speech",
          callId: call.callId,
          providerCallId,
          timestamp: Date.now(),
          transcript,
          isFinal: true,
        };
        this.manager.processEvent(event);
        // Auto-respond in conversation mode (inbound always, outbound if mode is conversation)
        const callMode = call.metadata?.mode as string | undefined;
        const shouldRespond = call.direction === "inbound" || callMode === "conversation";
        if (shouldRespond) {
          // Fire-and-forget: response failures are logged, not propagated.
          this.handleInboundResponse(call.callId, transcript).catch((err) => {
            console.warn(`[voice-call] Failed to auto-respond:`, err);
          });
        }
      },
      // Interrupt queued TTS as soon as the caller starts speaking (barge-in).
      onSpeechStart: (providerCallId) => {
        if (this.provider.name === "twilio") {
          (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
        }
      },
      onPartialTranscript: (callId, partial) => {
        console.log(`[voice-call] Partial for ${callId}: ${partial}`);
      },
      onConnect: (callId, streamSid) => {
        console.log(`[voice-call] Media stream connected: ${callId} -> ${streamSid}`);
        // Register stream with provider for TTS routing
        if (this.provider.name === "twilio") {
          (this.provider as TwilioProvider).registerCallStream(callId, streamSid);
        }
        // Speak initial message if one was provided when call was initiated
        // Use setTimeout to allow stream setup to complete
        setTimeout(() => {
          this.manager.speakInitialMessage(callId).catch((err) => {
            console.warn(`[voice-call] Failed to speak initial message:`, err);
          });
        }, 500);
      },
      onDisconnect: (callId) => {
        console.log(`[voice-call] Media stream disconnected: ${callId}`);
        // Auto-end call when media stream disconnects to prevent stuck calls.
        // Without this, calls can remain active indefinitely after the stream closes.
        const disconnectedCall = this.manager.getCallByProviderCallId(callId);
        if (disconnectedCall) {
          console.log(
            `[voice-call] Auto-ending call ${disconnectedCall.callId} on stream disconnect`,
          );
          void this.manager.endCall(disconnectedCall.callId).catch((err) => {
            console.warn(`[voice-call] Failed to auto-end call ${disconnectedCall.callId}:`, err);
          });
        }
        if (this.provider.name === "twilio") {
          (this.provider as TwilioProvider).unregisterCallStream(callId);
        }
      },
    };
    this.mediaStreamHandler = new MediaStreamHandler(streamConfig);
    console.log("[voice-call] Media streaming initialized");
  }

  /**
   * Start the webhook server.
   * Resolves with the local webhook URL once the socket is listening.
   * NOTE(review): when serve.port is 0, the OS assigns an ephemeral port but
   * the resolved URL echoes the configured value (0) — callers appear to
   * re-resolve via server.address(); confirm that is the intended contract.
   */
  async start(): Promise<string> {
    const { port, bind, path: webhookPath } = this.config.serve;
    const streamPath = this.config.streaming?.streamPath || "/voice/stream";
    return new Promise((resolve, reject) => {
      this.server = http.createServer((req, res) => {
        // Last-resort error barrier: any rejection from request handling
        // becomes a 500 instead of an unhandled rejection.
        this.handleRequest(req, res, webhookPath).catch((err) => {
          console.error("[voice-call] Webhook error:", err);
          res.statusCode = 500;
          res.end("Internal Server Error");
        });
      });
      // Handle WebSocket upgrades for media streams
      if (this.mediaStreamHandler) {
        this.server.on("upgrade", (request, socket, head) => {
          const path = this.getUpgradePathname(request);
          if (path === streamPath) {
            console.log("[voice-call] WebSocket upgrade for media stream");
            this.mediaStreamHandler?.handleUpgrade(request, socket, head);
          } else {
            // Unknown upgrade path: drop the socket without a response.
            socket.destroy();
          }
        });
      }
      // NOTE(review): this handler stays attached after listen succeeds, so a
      // later runtime "error" event calls reject on a settled promise (no-op).
      this.server.on("error", reject);
      this.server.listen(port, bind, () => {
        const url = `http://${bind}:${port}${webhookPath}`;
        console.log(`[voice-call] Webhook server listening on ${url}`);
        if (this.mediaStreamHandler) {
          console.log(`[voice-call] Media stream WebSocket on ws://${bind}:${port}${streamPath}`);
        }
        resolve(url);
        // Start the stale call reaper if configured
        this.stopStaleCallReaper = startStaleCallReaper({
          manager: this.manager,
          staleCallReaperSeconds: this.config.staleCallReaperSeconds,
        });
      });
    });
  }

  /**
   * Stop the webhook server.
   * Cancels the stale-call reaper first, then resolves once the socket is
   * closed (immediately if the server was never started).
   */
  async stop(): Promise<void> {
    if (this.stopStaleCallReaper) {
      this.stopStaleCallReaper();
      this.stopStaleCallReaper = null;
    }
    return new Promise((resolve) => {
      if (this.server) {
        this.server.close(() => {
          this.server = null;
          resolve();
        });
      } else {
        resolve();
      }
    });
  }

  // Extract the pathname from a WebSocket upgrade request; null if it cannot
  // be parsed as a URL.
  private getUpgradePathname(request: http.IncomingMessage): string | null {
    try {
      const host = request.headers.host || "localhost";
      return new URL(request.url || "/", `http://${host}`).pathname;
    } catch {
      return null;
    }
  }

  /**
   * Handle incoming HTTP request.
   * Pipeline: path/method checks -> bounded body read -> provider signature
   * verification -> event parsing -> per-event processing -> provider response.
   */
  private async handleRequest(
    req: http.IncomingMessage,
    res: http.ServerResponse,
    webhookPath: string,
  ): Promise<void> {
    const url = new URL(req.url || "/", `http://${req.headers.host}`);
    // Check path
    // NOTE(review): prefix match also accepts e.g. `${webhookPath}-extra`;
    // confirm whether sub-path routing is intended or exact/boundary match is safer.
    if (!url.pathname.startsWith(webhookPath)) {
      res.statusCode = 404;
      res.end("Not Found");
      return;
    }
    // Only accept POST
    if (req.method !== "POST") {
      res.statusCode = 405;
      res.end("Method Not Allowed");
      return;
    }
    // Read body (bounded by MAX_WEBHOOK_BODY_BYTES, with a read timeout)
    let body = "";
    try {
      body = await this.readBody(req, MAX_WEBHOOK_BODY_BYTES);
    } catch (err) {
      if (isRequestBodyLimitError(err, "PAYLOAD_TOO_LARGE")) {
        res.statusCode = 413;
        res.end("Payload Too Large");
        return;
      }
      if (isRequestBodyLimitError(err, "REQUEST_BODY_TIMEOUT")) {
        res.statusCode = 408;
        res.end(requestBodyErrorToText("REQUEST_BODY_TIMEOUT"));
        return;
      }
      throw err;
    }
    // Build webhook context
    const ctx: WebhookContext = {
      headers: req.headers as Record<string, string | string[] | undefined>,
      rawBody: body,
      url: `http://${req.headers.host}${req.url}`,
      method: "POST",
      query: Object.fromEntries(url.searchParams),
      remoteAddress: req.socket.remoteAddress ?? undefined,
    };
    // Verify signature
    const verification = this.provider.verifyWebhook(ctx);
    if (!verification.ok) {
      console.warn(`[voice-call] Webhook verification failed: ${verification.reason}`);
      res.statusCode = 401;
      res.end("Unauthorized");
      return;
    }
    // A verified request must also carry a stable identity key (used for
    // dedupe downstream); treat its absence as a verification failure.
    if (!verification.verifiedRequestKey) {
      console.warn("[voice-call] Webhook verification succeeded without request identity key");
      res.statusCode = 401;
      res.end("Unauthorized");
      return;
    }
    // Parse events
    const result = this.provider.parseWebhookEvent(ctx, {
      verifiedRequestKey: verification.verifiedRequestKey,
    });
    // Process each event — but for replays skip all side effects while still
    // acknowledging the request below.
    if (verification.isReplay) {
      console.warn("[voice-call] Replay detected; skipping event side effects");
    } else {
      for (const event of result.events) {
        try {
          this.manager.processEvent(event);
        } catch (err) {
          // One bad event must not prevent the rest from being processed.
          console.error(`[voice-call] Error processing event ${event.type}:`, err);
        }
      }
    }
    // Send response (provider may dictate status, headers and body, e.g. TwiML)
    res.statusCode = result.statusCode || 200;
    if (result.providerResponseHeaders) {
      for (const [key, value] of Object.entries(result.providerResponseHeaders)) {
        res.setHeader(key, value);
      }
    }
    res.end(result.providerResponseBody || "OK");
  }

  /**
   * Read request body as string with timeout protection.
   */
  private readBody(
    req: http.IncomingMessage,
    maxBytes: number,
    timeoutMs = 30_000,
  ): Promise<string> {
    return readRequestBodyWithLimit(req, { maxBytes, timeoutMs });
  }

  /**
   * Handle auto-response for inbound calls using the agent system.
   * Supports tool calling for richer voice interactions.
   * Errors are logged, never thrown — a failed auto-response must not break
   * webhook handling.
   */
  private async handleInboundResponse(callId: string, userMessage: string): Promise<void> {
    console.log(`[voice-call] Auto-responding to inbound call ${callId}: "${userMessage}"`);
    // Get call context for conversation history
    const call = this.manager.getCall(callId);
    if (!call) {
      console.warn(`[voice-call] Call ${callId} not found for auto-response`);
      return;
    }
    if (!this.coreConfig) {
      console.warn("[voice-call] Core config missing; skipping auto-response");
      return;
    }
    try {
      // Lazy import keeps the response generator out of the startup path.
      const { generateVoiceResponse } = await import("./response-generator.js");
      const result = await generateVoiceResponse({
        voiceConfig: this.config,
        coreConfig: this.coreConfig,
        callId,
        from: call.from,
        transcript: call.transcript,
        userMessage,
      });
      if (result.error) {
        console.error(`[voice-call] Response generation error: ${result.error}`);
        return;
      }
      if (result.text) {
        console.log(`[voice-call] AI response: "${result.text}"`);
        await this.manager.speak(callId, result.text);
      }
    } catch (err) {
      console.error(`[voice-call] Auto-response error:`, err);
    }
  }
}
/**
 * Subset of `tailscale status --json` Self info used to resolve the current
 * machine's Tailscale identity.
 */
export type TailscaleSelfInfo = {
  // MagicDNS name of this node (trailing dot stripped), or null if unknown.
  dnsName: string | null;
  // Tailscale node ID, or null if unknown.
  nodeId: string | null;
};
/**
* Run a tailscale command with timeout, collecting stdout.
*/
function runTailscaleCommand(
args: string[],
timeoutMs = 2500,
): Promise<{ code: number; stdout: string }> {
return new Promise((resolve) => {
const proc = spawn("tailscale", args, {
stdio: ["ignore", "pipe", "pipe"],
});
let stdout = "";
proc.stdout.on("data", (data) => {
stdout += data;
});
const timer = setTimeout(() => {
proc.kill("SIGKILL");
resolve({ code: -1, stdout: "" });
}, timeoutMs);
proc.on("close", (code) => {
clearTimeout(timer);
resolve({ code: code ?? -1, stdout });
});
});
}
/**
 * Query `tailscale status --json` and extract this node's DNS name and ID.
 * Returns null when the command fails or its output is not valid JSON.
 */
export async function getTailscaleSelfInfo(): Promise<TailscaleSelfInfo | null> {
  const result = await runTailscaleCommand(["status", "--json"]);
  if (result.code !== 0) {
    return null;
  }
  try {
    const status = JSON.parse(result.stdout);
    const rawDnsName = status.Self?.DNSName;
    return {
      // MagicDNS names come back with a trailing dot; strip it.
      dnsName: rawDnsName?.replace(/\.$/, "") || null,
      nodeId: status.Self?.ID || null,
    };
  } catch {
    return null;
  }
}
/** Convenience wrapper returning only this node's Tailscale DNS name (or null). */
export async function getTailscaleDnsName(): Promise<string | null> {
  const self = await getTailscaleSelfInfo();
  if (!self) {
    return null;
  }
  return self.dnsName;
}
/**
 * Mount a local URL at a path via `tailscale serve` or `tailscale funnel`.
 * Returns the resulting public URL, or null on failure (no DNS name, or the
 * tailscale command exited non-zero).
 */
export async function setupTailscaleExposureRoute(opts: {
  mode: "serve" | "funnel";
  path: string;
  localUrl: string;
}): Promise<string | null> {
  const dnsName = await getTailscaleDnsName();
  if (!dnsName) {
    console.warn("[voice-call] Could not get Tailscale DNS name");
    return null;
  }
  // Run in the background (--bg) and auto-confirm prompts (--yes).
  const args = [opts.mode, "--bg", "--yes", "--set-path", opts.path, opts.localUrl];
  const result = await runTailscaleCommand(args);
  if (result.code !== 0) {
    console.warn(`[voice-call] Tailscale ${opts.mode} failed`);
    return null;
  }
  const publicUrl = `https://${dnsName}${opts.path}`;
  console.log(`[voice-call] Tailscale ${opts.mode} active: ${publicUrl}`);
  return publicUrl;
}
/** Tear down a serve/funnel route; best-effort, exit code intentionally ignored. */
export async function cleanupTailscaleExposureRoute(opts: {
  mode: "serve" | "funnel";
  path: string;
}): Promise<void> {
  const { mode, path } = opts;
  await runTailscaleCommand([mode, "off", path]);
}
/**
 * Setup Tailscale serve/funnel for the webhook server.
 * This is a helper that shells out to `tailscale serve` or `tailscale funnel`.
 */
export async function setupTailscaleExposure(config: VoiceCallConfig): Promise<string | null> {
  const tsMode = config.tailscale.mode;
  if (tsMode === "off") {
    return null;
  }
  // Include the path suffix so tailscale forwards to the correct endpoint
  // (tailscale strips the mount path prefix when proxying)
  return setupTailscaleExposureRoute({
    mode: tsMode === "funnel" ? "funnel" : "serve",
    path: config.tailscale.path,
    localUrl: `http://127.0.0.1:${config.serve.port}${config.serve.path}`,
  });
}
/**
 * Cleanup Tailscale serve/funnel.
 */
export async function cleanupTailscaleExposure(config: VoiceCallConfig): Promise<void> {
  const tsMode = config.tailscale.mode;
  if (tsMode === "off") {
    return;
  }
  await cleanupTailscaleExposureRoute({
    mode: tsMode === "funnel" ? "funnel" : "serve",
    path: config.tailscale.path,
  });
}

View File

@@ -0,0 +1,33 @@
import type { CallManager } from "../manager.js";
// How often the reaper scans active calls for staleness (30 seconds).
const CHECK_INTERVAL_MS = 30_000;
/**
 * Periodically end calls that have been active longer than the configured
 * threshold. Returns a cleanup function that stops the reaper, or null when
 * the threshold is missing, zero, or negative (reaper disabled).
 */
export function startStaleCallReaper(params: {
  manager: CallManager;
  staleCallReaperSeconds?: number;
}): (() => void) | null {
  const { manager, staleCallReaperSeconds } = params;
  if (!staleCallReaperSeconds || staleCallReaperSeconds <= 0) {
    return null;
  }
  const maxAgeMs = staleCallReaperSeconds * 1000;
  const sweep = () => {
    const now = Date.now();
    for (const call of manager.getActiveCalls()) {
      const age = now - call.startedAt;
      if (age <= maxAgeMs) {
        continue;
      }
      console.log(
        `[voice-call] Reaping stale call ${call.callId} (age: ${Math.round(age / 1000)}s, state: ${call.state})`,
      );
      // Fire-and-forget: a failed hangup is logged but does not stop the sweep.
      void manager.endCall(call.callId).catch((err) => {
        console.warn(`[voice-call] Reaper failed to end call ${call.callId}:`, err);
      });
    }
  };
  const timer = setInterval(sweep, CHECK_INTERVAL_MS);
  return () => {
    clearInterval(timer);
  };
}