Include full contents of all nested repositories
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
68
openclaw/extensions/voice-call/src/providers/base.ts
Normal file
68
openclaw/extensions/voice-call/src/providers/base.ts
Normal file
@@ -0,0 +1,68 @@
|
||||
import type {
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
PlayTtsInput,
|
||||
ProviderName,
|
||||
WebhookParseOptions,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
|
||||
/**
 * Contract implemented by every voice call provider (Telnyx, Twilio, etc.)
 * so the call manager can drive calls through one consistent API.
 *
 * An implementation is responsible for:
 * - verifying webhooks and parsing them into events
 * - initiating and hanging up outbound calls
 * - media control (TTS playback, STT listening)
 */
export interface VoiceCallProvider {
  /** Identifier of this provider. */
  readonly name: ProviderName;

  /**
   * Check the webhook signature/HMAC before the request is processed.
   * Callers must invoke this before parseWebhookEvent.
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult;

  /**
   * Translate a provider-specific webhook payload into normalized events.
   * The result carries the events plus an optional response to send back
   * to the provider.
   */
  parseWebhookEvent(ctx: WebhookContext, options?: WebhookParseOptions): ProviderWebhookParseResult;

  /**
   * Place an outbound call.
   * @returns Provider call ID and status
   */
  initiateCall(input: InitiateCallInput): Promise<InitiateCallResult>;

  /** Terminate an active call. */
  hangupCall(input: HangupCallInput): Promise<void>;

  /**
   * Speak text to the caller via TTS.
   * Streaming, where supported, is the provider's concern.
   */
  playTts(input: PlayTtsInput): Promise<void>;

  /** Activate STT: begin listening for user speech. */
  startListening(input: StartListeningInput): Promise<void>;

  /** Deactivate STT: stop listening for user speech. */
  stopListening(input: StopListeningInput): Promise<void>;
}
|
||||
10
openclaw/extensions/voice-call/src/providers/index.ts
Normal file
10
openclaw/extensions/voice-call/src/providers/index.ts
Normal file
@@ -0,0 +1,10 @@
|
||||
export type { VoiceCallProvider } from "./base.js";
|
||||
export { MockProvider } from "./mock.js";
|
||||
export {
|
||||
OpenAIRealtimeSTTProvider,
|
||||
type RealtimeSTTConfig,
|
||||
type RealtimeSTTSession,
|
||||
} from "./stt-openai-realtime.js";
|
||||
export { TelnyxProvider } from "./telnyx.js";
|
||||
export { TwilioProvider } from "./twilio.js";
|
||||
export { PlivoProvider } from "./plivo.js";
|
||||
169
openclaw/extensions/voice-call/src/providers/mock.ts
Normal file
169
openclaw/extensions/voice-call/src/providers/mock.ts
Normal file
@@ -0,0 +1,169 @@
|
||||
import crypto from "node:crypto";
|
||||
import type {
|
||||
EndReason,
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
WebhookParseOptions,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
|
||||
/**
 * Mock voice call provider for local testing.
 *
 * Call-control methods are no-ops and events are injected through the
 * webhook as a JSON POST body:
 * - `{ events: NormalizedEvent[] }` for bulk events
 * - `{ event: NormalizedEvent }` for a single event
 */
export class MockProvider implements VoiceCallProvider {
  readonly name = "mock" as const;

  /** Mock webhooks are not signed, so verification always succeeds. */
  verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult {
    return { ok: true };
  }

  /**
   * Decode the JSON webhook body into normalized events.
   *
   * Individual entries that are null, lack `type`/`callId`, or carry an
   * unrecognized `type` are skipped without rejecting the rest of the batch.
   *
   * @returns the accepted events with status 200, or no events with status
   *   400 when the body cannot be processed (for example invalid JSON).
   */
  parseWebhookEvent(
    ctx: WebhookContext,
    _options?: WebhookParseOptions,
  ): ProviderWebhookParseResult {
    try {
      const payload = JSON.parse(ctx.rawBody);

      // Bulk form wins over the single-event form when both are present.
      const candidates: unknown[] = Array.isArray(payload.events)
        ? payload.events
        : payload.event
          ? [payload.event]
          : [];

      const events: NormalizedEvent[] = [];
      for (const candidate of candidates) {
        const normalized = this.normalizeEvent(candidate as Partial<NormalizedEvent> | null);
        if (normalized) {
          events.push(normalized);
        }
      }

      return { events, statusCode: 200 };
    } catch {
      return { events: [], statusCode: 400 };
    }
  }

  /**
   * Fill in defaults for one injected event.
   *
   * @param evt Raw event object from the webhook body; may be null/undefined
   *   because the body is untyped JSON.
   * @returns the completed event, or null when the entry is unusable
   *   (nullish, missing `type`/`callId`, or unknown `type`).
   */
  private normalizeEvent(
    evt: Partial<NormalizedEvent> | null | undefined,
  ): NormalizedEvent | null {
    // Guard against null array entries: dereferencing them would throw and
    // turn the whole batch into a 400.
    if (evt == null || !evt.type || !evt.callId) {
      return null;
    }

    const base = {
      id: evt.id || crypto.randomUUID(),
      callId: evt.callId,
      providerCallId: evt.providerCallId,
      // Keep any numeric timestamp, including 0; default only when absent.
      timestamp: typeof evt.timestamp === "number" ? evt.timestamp : Date.now(),
    };

    switch (evt.type) {
      case "call.initiated":
      case "call.ringing":
      case "call.answered":
      case "call.active":
        return { ...base, type: evt.type };

      case "call.speaking": {
        const payload = evt as Partial<NormalizedEvent & { text?: string }>;
        return {
          ...base,
          type: evt.type,
          text: payload.text || "",
        };
      }

      case "call.speech": {
        const payload = evt as Partial<
          NormalizedEvent & {
            transcript?: string;
            isFinal?: boolean;
            confidence?: number;
          }
        >;
        return {
          ...base,
          type: evt.type,
          transcript: payload.transcript || "",
          // Injected speech is treated as final unless stated otherwise.
          isFinal: payload.isFinal ?? true,
          confidence: payload.confidence,
        };
      }

      case "call.silence": {
        const payload = evt as Partial<NormalizedEvent & { durationMs?: number }>;
        return {
          ...base,
          type: evt.type,
          durationMs: payload.durationMs || 0,
        };
      }

      case "call.dtmf": {
        const payload = evt as Partial<NormalizedEvent & { digits?: string }>;
        return {
          ...base,
          type: evt.type,
          digits: payload.digits || "",
        };
      }

      case "call.ended": {
        const payload = evt as Partial<NormalizedEvent & { reason?: EndReason }>;
        return {
          ...base,
          type: evt.type,
          reason: payload.reason || "completed",
        };
      }

      case "call.error": {
        const payload = evt as Partial<NormalizedEvent & { error?: string; retryable?: boolean }>;
        return {
          ...base,
          type: evt.type,
          error: payload.error || "unknown error",
          retryable: payload.retryable,
        };
      }

      default:
        return null;
    }
  }

  /** Pretend to place a call; the provider call ID is `mock-<callId>`. */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    return {
      providerCallId: `mock-${input.callId}`,
      status: "initiated",
    };
  }

  async hangupCall(_input: HangupCallInput): Promise<void> {
    // No-op for mock
  }

  async playTts(_input: PlayTtsInput): Promise<void> {
    // No-op for mock
  }

  async startListening(_input: StartListeningInput): Promise<void> {
    // No-op for mock
  }

  async stopListening(_input: StopListeningInput): Promise<void> {
    // No-op for mock
  }
}
|
||||
49
openclaw/extensions/voice-call/src/providers/plivo.test.ts
Normal file
49
openclaw/extensions/voice-call/src/providers/plivo.test.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { PlivoProvider } from "./plivo.js";
|
||||
|
||||
describe("PlivoProvider", () => {
  // Placeholder credentials shared by every case.
  const makeProvider = (): PlivoProvider =>
    new PlivoProvider({
      authId: "MA000000000000000000",
      authToken: "test-token",
    });

  it("parses answer callback into call.answered and returns keep-alive XML", () => {
    const parsed = makeProvider().parseWebhookEvent({
      headers: { host: "example.com" },
      rawBody:
        "CallUUID=call-uuid&CallStatus=in-progress&Direction=outbound&From=%2B15550000000&To=%2B15550000001&Event=StartApp",
      url: "https://example.com/voice/webhook?provider=plivo&flow=answer&callId=internal-call-id",
      method: "POST",
      query: { provider: "plivo", flow: "answer", callId: "internal-call-id" },
    });

    // Exactly one event, keyed by the internal call id from the query string.
    expect(parsed.events).toHaveLength(1);
    const answered = parsed.events[0];
    expect(answered?.type).toBe("call.answered");
    expect(answered?.callId).toBe("internal-call-id");
    expect(answered?.providerCallId).toBe("call-uuid");

    // The answer flow responds with a long <Wait> element.
    expect(parsed.providerResponseBody).toContain("<Wait");
    expect(parsed.providerResponseBody).toContain('length="300"');
  });

  it("uses verified request key when provided", () => {
    const parsed = makeProvider().parseWebhookEvent(
      {
        headers: { host: "example.com", "x-plivo-signature-v3-nonce": "nonce-1" },
        rawBody:
          "CallUUID=call-uuid&CallStatus=in-progress&Direction=outbound&From=%2B15550000000&To=%2B15550000001&Event=StartApp",
        url: "https://example.com/voice/webhook?provider=plivo&flow=answer&callId=internal-call-id",
        method: "POST",
        query: { provider: "plivo", flow: "answer", callId: "internal-call-id" },
      },
      { verifiedRequestKey: "plivo:v3:verified" },
    );

    // The explicit key takes precedence over the nonce header in the request.
    expect(parsed.events).toHaveLength(1);
    expect(parsed.events[0]?.dedupeKey).toBe("plivo:v3:verified");
  });
});
|
||||
556
openclaw/extensions/voice-call/src/providers/plivo.ts
Normal file
556
openclaw/extensions/voice-call/src/providers/plivo.ts
Normal file
@@ -0,0 +1,556 @@
|
||||
import crypto from "node:crypto";
|
||||
import type { PlivoConfig, WebhookSecurityConfig } from "../config.js";
|
||||
import { getHeader } from "../http-headers.js";
|
||||
import type {
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookParseOptions,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import { escapeXml } from "../voice-mapping.js";
|
||||
import { reconstructWebhookUrl, verifyPlivoWebhook } from "../webhook-security.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
import { guardedJsonApiRequest } from "./shared/guarded-json-api.js";
|
||||
|
||||
/** Optional behaviour switches for the Plivo provider. */
export interface PlivoProviderOptions {
  /** Public URL origin to use instead of the request's own when verifying signatures. */
  publicUrl?: string;

  /** Disable webhook signature verification. Intended for development only. */
  skipVerification?: boolean;

  /** How long an outbound call may ring, in seconds. */
  ringTimeoutSec?: number;

  /** Webhook security settings (forwarded headers / host allowlist). */
  webhookSecurity?: WebhookSecurityConfig;
}
|
||||
|
||||
/** Text (and optional locale) queued to be spoken on a call. */
type PendingSpeak = {
  text: string;
  locale?: string;
};

/** Speech-recognition settings queued for a call's next listen request. */
type PendingListen = {
  language?: string;
};
|
||||
|
||||
/**
 * Derive the dedupe key for a Plivo webhook request.
 *
 * Order of preference: the V3 signature nonce header, then the V2 nonce
 * header, and finally a SHA-256 hex digest of the raw request body.
 */
function createPlivoRequestDedupeKey(ctx: WebhookContext): string {
  const nonceSources = [
    { prefix: "plivo:v3:", header: "x-plivo-signature-v3-nonce" },
    { prefix: "plivo:v2:", header: "x-plivo-signature-v2-nonce" },
  ];

  for (const { prefix, header } of nonceSources) {
    const nonce = getHeader(ctx.headers, header);
    if (nonce) {
      return `${prefix}${nonce}`;
    }
  }

  // No nonce header present: fall back to hashing the body.
  const bodyDigest = crypto.createHash("sha256").update(ctx.rawBody).digest("hex");
  return `plivo:fallback:${bodyDigest}`;
}
|
||||
|
||||
/**
 * Voice call provider backed by the Plivo REST API and Plivo XML webhooks.
 *
 * Call control works by transferring the call's A leg to this service's own
 * webhook URL with a `flow` query parameter; when Plivo then requests that
 * URL, parseWebhookEvent answers with the matching XML (`xml-speak` for TTS,
 * `xml-listen` for speech input).
 *
 * The provider keeps per-call lookup state in memory. That state is released
 * when a terminal `call.ended` event is parsed, so it does not accumulate
 * for the lifetime of the process.
 */
export class PlivoProvider implements VoiceCallProvider {
  readonly name = "plivo" as const;

  private readonly authId: string;
  private readonly authToken: string;
  private readonly baseUrl: string;
  private readonly options: PlivoProviderOptions;
  private readonly apiHost: string;

  // Best-effort mapping between create-call request UUID and call UUID.
  private requestUuidToCallUuid = new Map<string, string>();

  // Used for transfer URLs and GetInput action URLs.
  private callIdToWebhookUrl = new Map<string, string>();
  private callUuidToWebhookUrl = new Map<string, string>();

  // Payloads waiting to be served on the next xml-speak / xml-listen request.
  private pendingSpeakByCallId = new Map<string, PendingSpeak>();
  private pendingListenByCallId = new Map<string, PendingListen>();

  /**
   * @param config Plivo credentials; `authId` and `authToken` are mandatory.
   * @param options Optional verification and timeout settings.
   * @throws Error when either credential is missing.
   */
  constructor(config: PlivoConfig, options: PlivoProviderOptions = {}) {
    if (!config.authId) {
      throw new Error("Plivo Auth ID is required");
    }
    if (!config.authToken) {
      throw new Error("Plivo Auth Token is required");
    }

    this.authId = config.authId;
    this.authToken = config.authToken;
    this.baseUrl = `https://api.plivo.com/v1/Account/${this.authId}`;
    this.apiHost = new URL(this.baseUrl).hostname;
    this.options = options;
  }

  /**
   * Issue an authenticated JSON request against the account's API base URL.
   * Requests are restricted to the Plivo API hostname.
   *
   * @param params.endpoint Path appended to the account base URL.
   * @param params.allowNotFound Passed through to the request helper so a
   *   404 response is tolerated instead of raised.
   */
  private async apiRequest<T = unknown>(params: {
    method: "GET" | "POST" | "DELETE";
    endpoint: string;
    body?: Record<string, unknown>;
    allowNotFound?: boolean;
  }): Promise<T> {
    const { method, endpoint, body, allowNotFound } = params;
    return await guardedJsonApiRequest<T>({
      url: `${this.baseUrl}${endpoint}`,
      method,
      headers: {
        Authorization: `Basic ${Buffer.from(`${this.authId}:${this.authToken}`).toString("base64")}`,
        "Content-Type": "application/json",
      },
      body,
      allowNotFound,
      allowedHostnames: [this.apiHost],
      auditContext: "voice-call.plivo.api",
      errorPrefix: "Plivo API error",
    });
  }

  /**
   * Verify the webhook signature using the account auth token and the
   * configured webhook security options. Failures are logged as warnings.
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    const result = verifyPlivoWebhook(ctx, this.authToken, {
      publicUrl: this.options.publicUrl,
      skipVerification: this.options.skipVerification,
      allowedHosts: this.options.webhookSecurity?.allowedHosts,
      trustForwardingHeaders: this.options.webhookSecurity?.trustForwardingHeaders,
      trustedProxyIPs: this.options.webhookSecurity?.trustedProxyIPs,
      remoteIP: ctx.remoteAddress,
    });

    if (!result.ok) {
      console.warn(`[plivo] Webhook verification failed: ${result.reason}`);
    }

    return {
      ok: result.ok,
      reason: result.reason,
      isReplay: result.isReplay,
      verifiedRequestKey: result.verifiedRequestKey,
    };
  }

  /**
   * Handle one Plivo webhook request.
   *
   * The `flow` query parameter selects the behaviour:
   * - `xml-speak` / `xml-listen`: return the queued Speak / GetInput XML and
   *   emit no events (keep-alive XML when nothing is queued).
   * - anything else: normalize the form-encoded body into at most one event.
   *   `answer` and `getinput` respond with keep-alive XML, other flows with
   *   an empty response document.
   *
   * When the resulting event is `call.ended`, the in-memory state kept for
   * that call is released.
   *
   * @param options.verifiedRequestKey Used as the event dedupe key when
   *   provided; otherwise a key is derived from the request.
   */
  parseWebhookEvent(
    ctx: WebhookContext,
    options?: WebhookParseOptions,
  ): ProviderWebhookParseResult {
    const flow = typeof ctx.query?.flow === "string" ? ctx.query.flow.trim() : "";

    const parsed = this.parseBody(ctx.rawBody);
    if (!parsed) {
      return { events: [], statusCode: 400 };
    }

    // Keep providerCallId mapping for later call control.
    const callUuid = parsed.get("CallUUID") || undefined;
    if (callUuid) {
      const webhookBase = this.baseWebhookUrlFromCtx(ctx);
      if (webhookBase) {
        this.callUuidToWebhookUrl.set(callUuid, webhookBase);
      }
    }

    // Special flows that exist only to return Plivo XML (no events).
    if (flow === "xml-speak") {
      const callId = this.getCallIdFromQuery(ctx);
      const pending = callId ? this.pendingSpeakByCallId.get(callId) : undefined;
      if (callId) {
        // Each queued utterance is served once.
        this.pendingSpeakByCallId.delete(callId);
      }

      const xml = pending
        ? PlivoProvider.xmlSpeak(pending.text, pending.locale)
        : PlivoProvider.xmlKeepAlive();
      return {
        events: [],
        providerResponseBody: xml,
        providerResponseHeaders: { "Content-Type": "text/xml" },
        statusCode: 200,
      };
    }

    if (flow === "xml-listen") {
      const callId = this.getCallIdFromQuery(ctx);
      const pending = callId ? this.pendingListenByCallId.get(callId) : undefined;
      if (callId) {
        this.pendingListenByCallId.delete(callId);
      }

      // Recognized speech is posted back to this webhook with flow=getinput.
      const actionUrl = this.buildActionUrl(ctx, {
        flow: "getinput",
        callId,
      });

      const xml =
        actionUrl && callId
          ? PlivoProvider.xmlGetInputSpeech({
              actionUrl,
              language: pending?.language,
            })
          : PlivoProvider.xmlKeepAlive();

      return {
        events: [],
        providerResponseBody: xml,
        providerResponseHeaders: { "Content-Type": "text/xml" },
        statusCode: 200,
      };
    }

    // Normal events.
    const callIdFromQuery = this.getCallIdFromQuery(ctx);
    const dedupeKey = options?.verifiedRequestKey ?? createPlivoRequestDedupeKey(ctx);
    const event = this.normalizeEvent(parsed, callIdFromQuery, dedupeKey);

    if (event?.type === "call.ended") {
      // The call is over: drop its bookkeeping so the maps stay bounded.
      this.releaseCallState({
        callId: event.callId,
        callUuid,
        requestUuid: parsed.get("RequestUUID") || undefined,
      });
    }

    return {
      events: event ? [event] : [],
      providerResponseBody:
        flow === "answer" || flow === "getinput"
          ? PlivoProvider.xmlKeepAlive()
          : PlivoProvider.xmlEmpty(),
      providerResponseHeaders: { "Content-Type": "text/xml" },
      statusCode: 200,
    };
  }

  /**
   * Forget everything remembered about a finished call.
   *
   * Each identifier is optional because not every webhook carries all of
   * them; whatever is known is used to prune the corresponding maps.
   */
  private releaseCallState(ids: {
    callId?: string;
    callUuid?: string;
    requestUuid?: string;
  }): void {
    const { callId, callUuid, requestUuid } = ids;

    if (callId) {
      this.callIdToWebhookUrl.delete(callId);
      this.pendingSpeakByCallId.delete(callId);
      this.pendingListenByCallId.delete(callId);
    }

    if (requestUuid) {
      this.requestUuidToCallUuid.delete(requestUuid);
    }

    if (callUuid) {
      this.callUuidToWebhookUrl.delete(callUuid);
      // The terminal webhook may omit RequestUUID, so also prune by value.
      for (const [knownRequestUuid, mappedCallUuid] of this.requestUuidToCallUuid) {
        if (mappedCallUuid === callUuid) {
          this.requestUuidToCallUuid.delete(knownRequestUuid);
        }
      }
    }
  }

  /**
   * Convert Plivo's form parameters into a single normalized event.
   *
   * Checked in order: DTMF digits, a speech transcript, then the call
   * lifecycle derived from `CallStatus`. Also records the request UUID to
   * call UUID mapping when both are present.
   *
   * @param callIdOverride Internal call ID (from the query string); takes
   *   precedence over the Plivo identifiers for the event's `callId`.
   * @returns the event, or null when the parameters map to nothing.
   */
  private normalizeEvent(
    params: URLSearchParams,
    callIdOverride?: string,
    dedupeKey?: string,
  ): NormalizedEvent | null {
    const callUuid = params.get("CallUUID") || "";
    const requestUuid = params.get("RequestUUID") || "";

    if (requestUuid && callUuid) {
      this.requestUuidToCallUuid.set(requestUuid, callUuid);
    }

    const direction = params.get("Direction");
    const from = params.get("From") || undefined;
    const to = params.get("To") || undefined;
    const callStatus = params.get("CallStatus");

    const baseEvent = {
      id: crypto.randomUUID(),
      dedupeKey,
      callId: callIdOverride || callUuid || requestUuid,
      providerCallId: callUuid || requestUuid || undefined,
      timestamp: Date.now(),
      direction:
        direction === "inbound"
          ? ("inbound" as const)
          : direction === "outbound"
            ? ("outbound" as const)
            : undefined,
      from,
      to,
    };

    const digits = params.get("Digits");
    if (digits) {
      return { ...baseEvent, type: "call.dtmf", digits };
    }

    const transcript = PlivoProvider.extractTranscript(params);
    if (transcript) {
      return {
        ...baseEvent,
        type: "call.speech",
        transcript,
        isFinal: true,
      };
    }

    // Call lifecycle.
    if (callStatus === "ringing") {
      return { ...baseEvent, type: "call.ringing" };
    }

    if (callStatus === "in-progress") {
      return { ...baseEvent, type: "call.answered" };
    }

    if (
      callStatus === "completed" ||
      callStatus === "busy" ||
      callStatus === "no-answer" ||
      callStatus === "failed"
    ) {
      return {
        ...baseEvent,
        type: "call.ended",
        reason:
          callStatus === "completed"
            ? "completed"
            : callStatus === "busy"
              ? "busy"
              : callStatus === "no-answer"
                ? "no-answer"
                : "failed",
      };
    }

    // Plivo will call our answer_url when the call is answered; if we don't have
    // a CallStatus for some reason, treat it as answered so the call can proceed.
    if (params.get("Event") === "StartApp" && callUuid) {
      return { ...baseEvent, type: "call.answered" };
    }

    return null;
  }

  /**
   * Create an outbound call through the Plivo API.
   *
   * The answer and hangup callbacks both point at `input.webhookUrl`, tagged
   * with `provider`, `callId` and a `flow` query parameter.
   *
   * @returns the Plivo request UUID as `providerCallId`.
   * @throws Error when the API response contains no `request_uuid`.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const webhookUrl = new URL(input.webhookUrl);
    webhookUrl.searchParams.set("provider", "plivo");
    webhookUrl.searchParams.set("callId", input.callId);

    const answerUrl = new URL(webhookUrl);
    answerUrl.searchParams.set("flow", "answer");

    const hangupUrl = new URL(webhookUrl);
    hangupUrl.searchParams.set("flow", "hangup");

    // Remembered so later call control can build transfer URLs.
    this.callIdToWebhookUrl.set(input.callId, input.webhookUrl);

    const ringTimeoutSec = this.options.ringTimeoutSec ?? 30;

    const result = await this.apiRequest<PlivoCreateCallResponse>({
      method: "POST",
      endpoint: "/Call/",
      body: {
        from: PlivoProvider.normalizeNumber(input.from),
        to: PlivoProvider.normalizeNumber(input.to),
        answer_url: answerUrl.toString(),
        answer_method: "POST",
        hangup_url: hangupUrl.toString(),
        hangup_method: "POST",
        // Plivo's API uses `hangup_on_ring` for outbound ring timeout.
        // NOTE(review): Plivo's "Make a call" reference also lists a
        // `ring_timeout` parameter for unanswered calls - confirm that
        // `hangup_on_ring` is the intended one here.
        hangup_on_ring: ringTimeoutSec,
      },
    });

    // The API may return the UUID as a single value or as a list.
    const requestUuid = Array.isArray(result.request_uuid)
      ? result.request_uuid[0]
      : result.request_uuid;
    if (!requestUuid) {
      throw new Error("Plivo call create returned no request_uuid");
    }

    return { providerCallId: requestUuid, status: "initiated" };
  }

  /**
   * End a call. When the call UUID for `providerCallId` is known, only that
   * call is deleted; otherwise the ID is tried both as a call UUID and as a
   * request UUID. "Not found" responses are tolerated in every case.
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    const callUuid = this.requestUuidToCallUuid.get(input.providerCallId);
    if (callUuid) {
      await this.apiRequest({
        method: "DELETE",
        endpoint: `/Call/${callUuid}/`,
        allowNotFound: true,
      });
      return;
    }

    // Best-effort: try hangup (call UUID), then cancel (request UUID).
    await this.apiRequest({
      method: "DELETE",
      endpoint: `/Call/${input.providerCallId}/`,
      allowNotFound: true,
    });
    await this.apiRequest({
      method: "DELETE",
      endpoint: `/Request/${input.providerCallId}/`,
      allowNotFound: true,
    });
  }

  /**
   * Resolve the Plivo call UUID and the webhook base URL for a call.
   *
   * @throws Error when no webhook URL is remembered for the call, or when no
   *   call UUID can be determined.
   */
  private resolveCallContext(params: {
    providerCallId: string;
    callId: string;
    operation: string;
  }): {
    callUuid: string;
    webhookBase: string;
  } {
    // providerCallId may be a request UUID; translate it when a mapping exists.
    const callUuid = this.requestUuidToCallUuid.get(params.providerCallId) ?? params.providerCallId;
    const webhookBase =
      this.callUuidToWebhookUrl.get(callUuid) || this.callIdToWebhookUrl.get(params.callId);
    if (!webhookBase) {
      throw new Error("Missing webhook URL for this call (provider state missing)");
    }
    if (!callUuid) {
      throw new Error(`Missing Plivo CallUUID for ${params.operation}`);
    }
    return { callUuid, webhookBase };
  }

  /** Transfer the call's A leg to this webhook with the given XML flow. */
  private async transferCallLeg(params: {
    callUuid: string;
    webhookBase: string;
    callId: string;
    flow: "xml-speak" | "xml-listen";
  }): Promise<void> {
    const transferUrl = new URL(params.webhookBase);
    transferUrl.searchParams.set("provider", "plivo");
    transferUrl.searchParams.set("flow", params.flow);
    transferUrl.searchParams.set("callId", params.callId);

    await this.apiRequest({
      method: "POST",
      endpoint: `/Call/${params.callUuid}/`,
      body: {
        legs: "aleg",
        aleg_url: transferUrl.toString(),
        aleg_method: "POST",
      },
    });
  }

  /**
   * Queue text for the call and transfer it to the `xml-speak` flow, which
   * serves the queued text as Speak XML.
   *
   * @throws Error when the call context cannot be resolved or the API call fails.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    const { callUuid, webhookBase } = this.resolveCallContext({
      providerCallId: input.providerCallId,
      callId: input.callId,
      operation: "playTts",
    });

    this.pendingSpeakByCallId.set(input.callId, {
      text: input.text,
      locale: input.locale,
    });

    await this.transferCallLeg({
      callUuid,
      webhookBase,
      callId: input.callId,
      flow: "xml-speak",
    });
  }

  /**
   * Queue listen settings for the call and transfer it to the `xml-listen`
   * flow, which serves GetInput XML for speech input.
   *
   * @throws Error when the call context cannot be resolved or the API call fails.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    const { callUuid, webhookBase } = this.resolveCallContext({
      providerCallId: input.providerCallId,
      callId: input.callId,
      operation: "startListening",
    });

    this.pendingListenByCallId.set(input.callId, {
      language: input.language,
    });

    await this.transferCallLeg({
      callUuid,
      webhookBase,
      callId: input.callId,
      flow: "xml-listen",
    });
  }

  async stopListening(_input: StopListeningInput): Promise<void> {
    // GetInput ends automatically when speech ends.
  }

  /**
   * Trim the input; SIP URIs are returned as-is, anything else is reduced
   * to digits and `+`.
   */
  private static normalizeNumber(numberOrSip: string): string {
    const trimmed = numberOrSip.trim();
    if (trimmed.toLowerCase().startsWith("sip:")) {
      return trimmed;
    }
    return trimmed.replace(/[^\d+]/g, "");
  }

  /** Response document with no elements. */
  private static xmlEmpty(): string {
    return `<?xml version="1.0" encoding="UTF-8"?><Response></Response>`;
  }

  /** Response document containing only a 300-length Wait element. */
  private static xmlKeepAlive(): string {
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Wait length="300" />
</Response>`;
  }

  /** Speak XML for `text` (language defaults to en-US), followed by a Wait. */
  private static xmlSpeak(text: string, locale?: string): string {
    const language = locale || "en-US";
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Speak language="${escapeXml(language)}">${escapeXml(text)}</Speak>
<Wait length="300" />
</Response>`;
  }

  /**
   * GetInput XML that collects speech and posts the result to `actionUrl`
   * (language defaults to en-US), followed by a Wait.
   */
  private static xmlGetInputSpeech(params: { actionUrl: string; language?: string }): string {
    const language = params.language || "en-US";
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<GetInput inputType="speech" method="POST" action="${escapeXml(params.actionUrl)}" language="${escapeXml(language)}" executionTimeout="30" speechEndTimeout="1" redirect="false">
</GetInput>
<Wait length="300" />
</Response>`;
  }

  /** Trimmed `callId` query parameter, or undefined when absent or blank. */
  private getCallIdFromQuery(ctx: WebhookContext): string | undefined {
    const raw = ctx.query?.callId;
    if (typeof raw !== "string") {
      return undefined;
    }
    const callId = raw.trim();
    return callId ? callId : undefined;
  }

  /**
   * Build a callback URL on this webhook for the given flow.
   * @returns null when the webhook base URL cannot be reconstructed.
   */
  private buildActionUrl(
    ctx: WebhookContext,
    opts: { flow: string; callId?: string },
  ): string | null {
    const base = this.baseWebhookUrlFromCtx(ctx);
    if (!base) {
      return null;
    }

    const u = new URL(base);
    u.searchParams.set("provider", "plivo");
    u.searchParams.set("flow", opts.flow);
    if (opts.callId) {
      u.searchParams.set("callId", opts.callId);
    }
    return u.toString();
  }

  /**
   * Origin plus path of the webhook URL for this request, without query
   * string. Returns null when the URL cannot be reconstructed or parsed.
   */
  private baseWebhookUrlFromCtx(ctx: WebhookContext): string | null {
    try {
      const u = new URL(
        reconstructWebhookUrl(ctx, {
          allowedHosts: this.options.webhookSecurity?.allowedHosts,
          trustForwardingHeaders: this.options.webhookSecurity?.trustForwardingHeaders,
          trustedProxyIPs: this.options.webhookSecurity?.trustedProxyIPs,
          remoteIP: ctx.remoteAddress,
        }),
      );
      return `${u.origin}${u.pathname}`;
    } catch {
      return null;
    }
  }

  /** Parse the form-encoded body; null if parsing throws. */
  private parseBody(rawBody: string): URLSearchParams | null {
    try {
      return new URLSearchParams(rawBody);
    } catch {
      return null;
    }
  }

  /**
   * First non-blank value among the parameter names that may carry a speech
   * transcript, trimmed; null when none is present.
   */
  private static extractTranscript(params: URLSearchParams): string | null {
    const candidates = [
      "Speech",
      "Transcription",
      "TranscriptionText",
      "SpeechResult",
      "RecognizedSpeech",
      "Text",
    ] as const;

    for (const key of candidates) {
      const value = params.get(key);
      if (value && value.trim()) {
        return value.trim();
      }
    }
    return null;
  }
}
|
||||
|
||||
/**
 * Fields read from Plivo's create-call response. Every field is optional;
 * `request_uuid` may be a single value or a list.
 */
type PlivoCreateCallResponse = {
  api_id?: string;
  message?: string;
  request_uuid?: string | string[];
};
|
||||
@@ -0,0 +1,42 @@
|
||||
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk";
|
||||
|
||||
/** Inputs for guardedJsonApiRequest. */
type GuardedJsonApiRequestParams = {
  /** Absolute URL to request. */
  url: string;
  /** HTTP method. */
  method: "GET" | "POST" | "DELETE" | "PUT" | "PATCH";
  /** Request headers, sent as given. */
  headers: Record<string, string>;
  /** Optional payload; serialized with JSON.stringify when present. */
  body?: Record<string, unknown>;
  /** When true, a 404 response resolves to undefined instead of throwing. */
  allowNotFound?: boolean;
  /** Hostnames handed to the fetch guard's policy. */
  allowedHostnames: string[];
  /** Label handed to the fetch guard as its audit context. */
  auditContext: string;
  /** Text placed at the start of thrown error messages. */
  errorPrefix: string;
};
|
||||
|
||||
/**
 * Perform a JSON API request through the SSRF-guarded fetch helper.
 *
 * @param params Request description; see GuardedJsonApiRequestParams.
 * @returns the parsed JSON body, or undefined when the response body is
 *   empty or when a 404 is received with `allowNotFound` set.
 * @throws Error prefixed with `errorPrefix` (status and response text
 *   appended) for any other non-OK response.
 */
export async function guardedJsonApiRequest<T = unknown>(
  params: GuardedJsonApiRequestParams,
): Promise<T> {
  const { url, method, headers, allowedHostnames, auditContext } = params;
  const serializedBody = params.body ? JSON.stringify(params.body) : undefined;

  const { response, release } = await fetchWithSsrFGuard({
    url,
    init: { method, headers, body: serializedBody },
    policy: { allowedHostnames },
    auditContext,
  });

  try {
    if (response.ok) {
      const payload = await response.text();
      if (!payload) {
        return undefined as T;
      }
      return JSON.parse(payload) as T;
    }

    if (response.status === 404 && params.allowNotFound) {
      return undefined as T;
    }

    const errorText = await response.text();
    throw new Error(`${params.errorPrefix}: ${response.status} ${errorText}`);
  } finally {
    // Always release the guard, on success and on failure alike.
    await release();
  }
}
|
||||
@@ -0,0 +1,311 @@
|
||||
/**
|
||||
* OpenAI Realtime STT Provider
|
||||
*
|
||||
* Uses the OpenAI Realtime API for streaming transcription with:
|
||||
* - Direct mu-law audio support (no conversion needed)
|
||||
* - Built-in server-side VAD for turn detection
|
||||
* - Low-latency streaming transcription
|
||||
* - Partial transcript callbacks for real-time UI updates
|
||||
*/
|
||||
|
||||
import WebSocket from "ws";
|
||||
|
||||
/**
 * Settings accepted by the OpenAI Realtime STT provider.
 */
export interface RealtimeSTTConfig {
  /** API key for OpenAI. */
  apiKey: string;

  /** Transcription model; defaults to gpt-4o-transcribe. */
  model?: string;

  /** Milliseconds of silence before speech is considered ended; defaults to 800. */
  silenceDurationMs?: number;

  /** VAD threshold in the range 0-1; defaults to 0.5. */
  vadThreshold?: number;
}
|
||||
|
||||
/**
 * A streaming transcription session: audio goes in, transcripts come out.
 */
export interface RealtimeSTTSession {
  /** Open the connection to the transcription service. */
  connect(): Promise<void>;

  /** Push a chunk of mu-law audio (8kHz mono). */
  sendAudio(audio: Buffer): void;

  /** Resolve with the next complete transcript, i.e. once VAD detects end of speech. */
  waitForTranscript(timeoutMs?: number): Promise<string>;

  /** Register the callback for partial (streaming) transcripts. */
  onPartial(callback: (partial: string) => void): void;

  /** Register the callback for final transcripts. */
  onTranscript(callback: (transcript: string) => void): void;

  /** Register the callback fired when VAD detects the start of speech. */
  onSpeechStart(callback: () => void): void;

  /** End the session. */
  close(): void;

  /** Whether the session is currently connected. */
  isConnected(): boolean;
}
|
||||
|
||||
/**
 * Provider factory for OpenAI Realtime STT sessions.
 *
 * Holds the resolved configuration and hands it to every session it creates.
 */
export class OpenAIRealtimeSTTProvider {
  readonly name = "openai-realtime";
  private apiKey: string;
  private model: string;
  private silenceDurationMs: number;
  private vadThreshold: number;

  /**
   * @param config Provider settings; `apiKey` is mandatory.
   * @throws Error when `apiKey` is missing or empty.
   */
  constructor(config: RealtimeSTTConfig) {
    if (!config.apiKey) {
      throw new Error("OpenAI API key required for Realtime STT");
    }
    this.apiKey = config.apiKey;
    // An empty model name is not usable, so `||` is intentional here.
    this.model = config.model || "gpt-4o-transcribe";
    // `??` rather than `||`: an explicit 0 is a caller-supplied value (0 is
    // inside the documented 0-1 VAD range) and must not be replaced.
    this.silenceDurationMs = config.silenceDurationMs ?? 800;
    this.vadThreshold = config.vadThreshold ?? 0.5;
  }

  /**
   * Create a new realtime transcription session using this provider's
   * API key, model, silence duration and VAD threshold.
   */
  createSession(): RealtimeSTTSession {
    return new OpenAIRealtimeSTTSession(
      this.apiKey,
      this.model,
      this.silenceDurationMs,
      this.vadThreshold,
    );
  }
}
|
||||
|
||||
/**
 * WebSocket-based session for real-time speech-to-text.
 *
 * Lifecycle as implemented below:
 * - `connect()` opens the socket and, once open, sends the transcription
 *   session configuration (mu-law input, server-side VAD).
 * - A close that was not requested through `close()` triggers up to
 *   MAX_RECONNECT_ATTEMPTS reconnects with exponential backoff.
 * - `close()` marks the session as intentionally closed and shuts the socket.
 */
class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
  private static readonly RECONNECT_DELAY_MS = 1000;
  /** How long doConnect() waits for the socket to open before rejecting. */
  private static readonly CONNECT_TIMEOUT_MS = 10000;

  private ws: WebSocket | null = null;
  private connected = false;
  private closed = false;
  private reconnectAttempts = 0;
  /** Accumulates transcript deltas for the utterance currently in progress. */
  private pendingTranscript = "";
  private onTranscriptCallback: ((transcript: string) => void) | null = null;
  private onPartialCallback: ((partial: string) => void) | null = null;
  private onSpeechStartCallback: (() => void) | null = null;
  /**
   * One-shot resolvers registered by waitForTranscript(). Kept separate from
   * onTranscriptCallback so that waiting never displaces the persistent listener.
   */
  private readonly transcriptWaiters = new Set<(transcript: string) => void>();

  constructor(
    private readonly apiKey: string,
    private readonly model: string,
    private readonly silenceDurationMs: number,
    private readonly vadThreshold: number,
  ) {}

  /**
   * Open the session. Resets the closed flag and the reconnect counter so a
   * previously closed session object can be connected again.
   *
   * @throws Rejects when the socket errors before opening or does not open in time.
   */
  async connect(): Promise<void> {
    this.closed = false;
    this.reconnectAttempts = 0;
    return this.doConnect();
  }

  /**
   * Create the WebSocket, wire its handlers and resolve once it is open.
   * Used both for the initial connect and for every reconnect attempt.
   */
  private async doConnect(): Promise<void> {
    return new Promise((resolve, reject) => {
      const url = "wss://api.openai.com/v1/realtime?intent=transcription";

      const socket = new WebSocket(url, {
        headers: {
          Authorization: `Bearer ${this.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      });
      this.ws = socket;

      // Reject if the socket has not opened in time. The timer is cleared as soon
      // as the attempt settles so it does not linger for the full timeout.
      const connectTimer = setTimeout(() => {
        if (!this.connected) {
          reject(new Error("Realtime STT connection timeout"));
        }
      }, OpenAIRealtimeSTTSession.CONNECT_TIMEOUT_MS);

      socket.on("open", () => {
        clearTimeout(connectTimer);
        console.log("[RealtimeSTT] WebSocket connected");
        this.connected = true;
        this.reconnectAttempts = 0;

        // Configure the transcription session
        this.sendEvent({
          type: "transcription_session.update",
          session: {
            input_audio_format: "g711_ulaw",
            input_audio_transcription: {
              model: this.model,
            },
            turn_detection: {
              type: "server_vad",
              threshold: this.vadThreshold,
              prefix_padding_ms: 300,
              silence_duration_ms: this.silenceDurationMs,
            },
          },
        });

        resolve();
      });

      socket.on("message", (data: Buffer) => {
        try {
          const event = JSON.parse(data.toString());
          this.handleEvent(event);
        } catch (e) {
          console.error("[RealtimeSTT] Failed to parse event:", e);
        }
      });

      socket.on("error", (error) => {
        console.error("[RealtimeSTT] WebSocket error:", error);
        // Only an error before the socket opened fails the connect promise;
        // later errors are logged and surface through the close handler.
        if (!this.connected) {
          clearTimeout(connectTimer);
          reject(error);
        }
      });

      socket.on("close", (code, reason) => {
        console.log(
          `[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
        );
        this.connected = false;

        // Attempt reconnection if not intentionally closed
        if (!this.closed) {
          void this.attemptReconnect();
        }
      });
    });
  }

  /**
   * Reconnect with exponential backoff (RECONNECT_DELAY_MS * 2^(attempt-1)).
   * Gives up after MAX_RECONNECT_ATTEMPTS or as soon as close() has been called.
   * Failures are logged, never thrown, because this runs detached from any caller.
   */
  private async attemptReconnect(): Promise<void> {
    if (this.closed) {
      return;
    }

    if (this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS) {
      console.error(
        `[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
      );
      return;
    }

    this.reconnectAttempts++;
    const delay = OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
    console.log(
      `[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
    );

    await new Promise((resolve) => setTimeout(resolve, delay));

    // close() may have been called while we were waiting.
    if (this.closed) {
      return;
    }

    try {
      await this.doConnect();
      console.log("[RealtimeSTT] Reconnected successfully");
    } catch (error) {
      console.error("[RealtimeSTT] Reconnect failed:", error);
    }
  }

  /**
   * Dispatch one decoded server event to the matching callback.
   * Event types not listed in the switch are ignored.
   */
  private handleEvent(event: {
    type: string;
    delta?: string;
    transcript?: string;
    error?: unknown;
  }): void {
    switch (event.type) {
      case "transcription_session.created":
      case "transcription_session.updated":
      case "input_audio_buffer.speech_stopped":
      case "input_audio_buffer.committed":
        console.log(`[RealtimeSTT] ${event.type}`);
        break;

      case "conversation.item.input_audio_transcription.delta":
        if (event.delta) {
          // Partial callbacks receive the accumulated text, not just the new delta.
          this.pendingTranscript += event.delta;
          this.onPartialCallback?.(this.pendingTranscript);
        }
        break;

      case "conversation.item.input_audio_transcription.completed":
        if (event.transcript) {
          console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
          this.onTranscriptCallback?.(event.transcript);
          // Iterate over a copy: each waiter removes itself from the set when called.
          for (const waiter of [...this.transcriptWaiters]) {
            waiter(event.transcript);
          }
        }
        this.pendingTranscript = "";
        break;

      case "input_audio_buffer.speech_started":
        console.log("[RealtimeSTT] Speech started");
        this.pendingTranscript = "";
        this.onSpeechStartCallback?.();
        break;

      case "error":
        console.error("[RealtimeSTT] Error:", event.error);
        break;
    }
  }

  /** Serialize and send an event; silently dropped unless the socket is OPEN. */
  private sendEvent(event: unknown): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(event));
    }
  }

  /**
   * Append mu-law audio to the server-side input buffer.
   * Audio sent while disconnected is dropped, not queued.
   */
  sendAudio(muLawData: Buffer): void {
    if (!this.connected) {
      return;
    }
    this.sendEvent({
      type: "input_audio_buffer.append",
      audio: muLawData.toString("base64"),
    });
  }

  /** Set (replace) the callback that receives accumulated partial transcripts. */
  onPartial(callback: (partial: string) => void): void {
    this.onPartialCallback = callback;
  }

  /** Set (replace) the callback that receives final transcripts. */
  onTranscript(callback: (transcript: string) => void): void {
    this.onTranscriptCallback = callback;
  }

  /** Set (replace) the callback invoked when the server reports speech start. */
  onSpeechStart(callback: () => void): void {
    this.onSpeechStartCallback = callback;
  }

  /**
   * Wait for the next completed transcript.
   *
   * The waiter is tracked separately from the `onTranscript` callback, so a
   * listener registered through `onTranscript` keeps firing during and after the wait.
   *
   * @param timeoutMs - Maximum time to wait (default 30000 ms).
   * @returns The next non-empty final transcript.
   * @throws Rejects with "Transcript timeout" when nothing arrives in time.
   */
  async waitForTranscript(timeoutMs = 30000): Promise<string> {
    return new Promise((resolve, reject) => {
      const waiter = (transcript: string): void => {
        clearTimeout(timeout);
        this.transcriptWaiters.delete(waiter);
        resolve(transcript);
      };

      const timeout = setTimeout(() => {
        this.transcriptWaiters.delete(waiter);
        reject(new Error("Transcript timeout"));
      }, timeoutMs);

      this.transcriptWaiters.add(waiter);
    });
  }

  /** Close the session and suppress any further reconnect attempts. */
  close(): void {
    this.closed = true;
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.connected = false;
  }

  /** True between a successful socket open and the next close. */
  isConnected(): boolean {
    return this.connected;
  }
}
|
||||
166
openclaw/extensions/voice-call/src/providers/telnyx.test.ts
Normal file
166
openclaw/extensions/voice-call/src/providers/telnyx.test.ts
Normal file
@@ -0,0 +1,166 @@
|
||||
import crypto from "node:crypto";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { WebhookContext } from "../types.js";
|
||||
import { TelnyxProvider } from "./telnyx.js";
|
||||
|
||||
/**
 * Build a webhook context for tests: a local POST with an empty JSON body,
 * with any field overridable through `params`.
 */
function createCtx(params?: Partial<WebhookContext>): WebhookContext {
  const defaults: WebhookContext = {
    headers: {},
    rawBody: "{}",
    url: "http://localhost/voice/webhook",
    method: "POST",
    query: {},
    remoteAddress: "127.0.0.1",
  };
  return { ...defaults, ...params };
}
|
||||
|
||||
/**
 * Decode a base64url string (as found in a JWK `x` field) into raw bytes by
 * mapping it back to the standard alphabet and restoring the `=` padding.
 */
function decodeBase64Url(input: string): Buffer {
  const standardAlphabet = input.replace(/[-_]/g, (ch) => (ch === "-" ? "+" : "/"));
  // Pad up to the next multiple of four characters.
  const paddedLength = Math.ceil(standardAlphabet.length / 4) * 4;
  return Buffer.from(standardAlphabet.padEnd(paddedLength, "="), "base64");
}
|
||||
|
||||
/**
 * Sign a sample `call.initiated` payload with `privateKey` and assert that a
 * provider configured with `publicKey` accepts it.
 *
 * The signed message is `<timestamp>|<rawBody>`, sent base64-encoded in the
 * `telnyx-signature-ed25519` header next to `telnyx-timestamp`.
 */
function expectWebhookVerificationSucceeds(params: {
  publicKey: string;
  privateKey: crypto.KeyObject;
}) {
  const { publicKey, privateKey } = params;
  const telnyx = new TelnyxProvider(
    { apiKey: "KEY123", connectionId: "CONN456", publicKey },
    { skipVerification: false },
  );

  const rawBody = JSON.stringify({
    event_type: "call.initiated",
    payload: { call_control_id: "x" },
  });
  const timestamp = String(Math.floor(Date.now() / 1000));
  const signatureBytes = crypto.sign(null, Buffer.from(`${timestamp}|${rawBody}`), privateKey);

  const headers = {
    "telnyx-signature-ed25519": signatureBytes.toString("base64"),
    "telnyx-timestamp": timestamp,
  };
  const outcome = telnyx.verifyWebhook(createCtx({ rawBody, headers }));
  expect(outcome.ok).toBe(true);
}
|
||||
|
||||
describe("TelnyxProvider.verifyWebhook", () => {
  // Every case uses the same credentials; only the key and the skip flag vary.
  const buildProvider = (publicKey: string | undefined, skipVerification: boolean): TelnyxProvider =>
    new TelnyxProvider(
      { apiKey: "KEY123", connectionId: "CONN456", publicKey },
      { skipVerification },
    );

  it("fails closed when public key is missing and skipVerification is false", () => {
    const verdict = buildProvider(undefined, false).verifyWebhook(createCtx());
    expect(verdict.ok).toBe(false);
  });

  it("allows requests when skipVerification is true (development only)", () => {
    const verdict = buildProvider(undefined, true).verifyWebhook(createCtx());
    expect(verdict.ok).toBe(true);
  });

  it("fails when signature headers are missing (with public key configured)", () => {
    const verdict = buildProvider("public-key", false).verifyWebhook(createCtx({ headers: {} }));
    expect(verdict.ok).toBe(false);
  });

  it("verifies a valid signature with a raw Ed25519 public key (Base64)", () => {
    const keyPair = crypto.generateKeyPairSync("ed25519");

    // The JWK `x` member carries the raw public key bytes, base64url-encoded.
    const jwk = keyPair.publicKey.export({ format: "jwk" }) as JsonWebKey;
    expect(jwk.kty).toBe("OKP");
    expect(jwk.crv).toBe("Ed25519");
    expect(typeof jwk.x).toBe("string");

    const rawKeyBase64 = decodeBase64Url(jwk.x as string).toString("base64");
    expectWebhookVerificationSucceeds({
      publicKey: rawKeyBase64,
      privateKey: keyPair.privateKey,
    });
  });

  it("verifies a valid signature with a DER SPKI public key (Base64)", () => {
    const keyPair = crypto.generateKeyPairSync("ed25519");
    const der = keyPair.publicKey.export({ format: "der", type: "spki" }) as Buffer;
    expectWebhookVerificationSucceeds({
      publicKey: der.toString("base64"),
      privateKey: keyPair.privateKey,
    });
  });

  it("returns replay status when the same signed request is seen twice", () => {
    const keyPair = crypto.generateKeyPairSync("ed25519");
    const der = keyPair.publicKey.export({ format: "der", type: "spki" }) as Buffer;
    const provider = buildProvider(der.toString("base64"), false);

    // A random nonce makes the body different on every test run.
    const rawBody = JSON.stringify({
      event_type: "call.initiated",
      payload: { call_control_id: "call-replay-test" },
      nonce: crypto.randomUUID(),
    });
    const timestamp = String(Math.floor(Date.now() / 1000));
    const signature = crypto
      .sign(null, Buffer.from(`${timestamp}|${rawBody}`), keyPair.privateKey)
      .toString("base64");
    const ctx = createCtx({
      rawBody,
      headers: {
        "telnyx-signature-ed25519": signature,
        "telnyx-timestamp": timestamp,
      },
    });

    // Submit the identical request twice to the same provider instance.
    const first = provider.verifyWebhook(ctx);
    const second = provider.verifyWebhook(ctx);

    expect(first.ok).toBe(true);
    expect(first.isReplay).toBeFalsy();
    expect(first.verifiedRequestKey).toBeTruthy();
    expect(second.ok).toBe(true);
    expect(second.isReplay).toBe(true);
    expect(second.verifiedRequestKey).toBe(first.verifiedRequestKey);
  });
});
|
||||
|
||||
describe("TelnyxProvider.parseWebhookEvent", () => {
  it("uses verified request key for manager dedupe", () => {
    const provider = new TelnyxProvider({
      apiKey: "KEY123",
      connectionId: "CONN456",
      publicKey: undefined,
    });

    const webhookBody = {
      data: {
        id: "evt-123",
        event_type: "call.initiated",
        payload: { call_control_id: "call-1" },
      },
    };
    const ctx = createCtx({ rawBody: JSON.stringify(webhookBody) });

    // The key supplied by verification must win over the event's own id.
    const parsed = provider.parseWebhookEvent(ctx, { verifiedRequestKey: "telnyx:req:abc" });

    expect(parsed.events).toHaveLength(1);
    expect(parsed.events[0]?.dedupeKey).toBe("telnyx:req:abc");
  });
});
|
||||
324
openclaw/extensions/voice-call/src/providers/telnyx.ts
Normal file
324
openclaw/extensions/voice-call/src/providers/telnyx.ts
Normal file
@@ -0,0 +1,324 @@
|
||||
import crypto from "node:crypto";
|
||||
import type { TelnyxConfig } from "../config.js";
|
||||
import type {
|
||||
EndReason,
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookParseOptions,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import { verifyTelnyxWebhook } from "../webhook-security.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
import { guardedJsonApiRequest } from "./shared/guarded-json-api.js";
|
||||
|
||||
/**
 * Telnyx Voice API provider implementation.
 *
 * Uses Telnyx Call Control API v2 for managing calls.
 * @see https://developers.telnyx.com/docs/api/v2/call-control
 */

/**
 * Construction-time switches for the Telnyx provider.
 */
export interface TelnyxProviderOptions {
  /** Bypass webhook signature verification. Development only, NOT for production. */
  skipVerification?: boolean;
}
|
||||
|
||||
/**
 * Voice call provider backed by the Telnyx Call Control API v2.
 *
 * Covers webhook verification and parsing, outbound call initiation and
 * hangup, and TTS / transcription control on an active call.
 */
export class TelnyxProvider implements VoiceCallProvider {
  readonly name = "telnyx" as const;

  private readonly apiKey: string;
  private readonly connectionId: string;
  private readonly publicKey: string | undefined;
  private readonly options: TelnyxProviderOptions;
  private readonly baseUrl = "https://api.telnyx.com/v2";
  private readonly apiHost = "api.telnyx.com";

  /**
   * @param config - Telnyx credentials; `apiKey` and `connectionId` are required.
   * @param options - Behaviour switches (e.g. skipping webhook verification).
   * @throws Error when the API key or the connection ID is missing.
   */
  constructor(config: TelnyxConfig, options: TelnyxProviderOptions = {}) {
    if (!config.apiKey) {
      throw new Error("Telnyx API key is required");
    }
    if (!config.connectionId) {
      throw new Error("Telnyx connection ID is required");
    }

    this.apiKey = config.apiKey;
    this.connectionId = config.connectionId;
    this.publicKey = config.publicKey;
    this.options = options;
  }

  /**
   * Make an authenticated POST request to the Telnyx API.
   *
   * @param endpoint - Path appended to the v2 base URL (must start with "/").
   * @param body - JSON body of the request.
   * @param options - `allowNotFound` is forwarded to the guarded request helper.
   */
  private async apiRequest<T = unknown>(
    endpoint: string,
    body: Record<string, unknown>,
    options?: { allowNotFound?: boolean },
  ): Promise<T> {
    return await guardedJsonApiRequest<T>({
      url: `${this.baseUrl}${endpoint}`,
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body,
      allowNotFound: options?.allowNotFound,
      allowedHostnames: [this.apiHost],
      auditContext: "voice-call.telnyx.api",
      errorPrefix: "Telnyx API error",
    });
  }

  /**
   * Verify Telnyx webhook signature using Ed25519.
   *
   * Delegates to `verifyTelnyxWebhook` and returns only the fields that belong
   * to the provider-neutral verification result.
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    const result = verifyTelnyxWebhook(ctx, this.publicKey, {
      skipVerification: this.options.skipVerification,
    });

    return {
      ok: result.ok,
      reason: result.reason,
      isReplay: result.isReplay,
      verifiedRequestKey: result.verifiedRequestKey,
    };
  }

  /**
   * Parse Telnyx webhook event into normalized format.
   *
   * @returns Status 200 with zero or one event for well-formed bodies (bodies
   *   without `data.event_type` and unmapped event types yield no events),
   *   status 400 when the body cannot be parsed.
   */
  parseWebhookEvent(
    ctx: WebhookContext,
    options?: WebhookParseOptions,
  ): ProviderWebhookParseResult {
    try {
      const payload = JSON.parse(ctx.rawBody);
      const data = payload.data;

      if (!data || !data.event_type) {
        return { events: [], statusCode: 200 };
      }

      const event = this.normalizeEvent(data, options?.verifiedRequestKey);
      return {
        events: event ? [event] : [],
        statusCode: 200,
      };
    } catch {
      return { events: [], statusCode: 400 };
    }
  }

  /**
   * Recover the call ID that initiateCall() stored in `client_state`.
   *
   * `Buffer.from(value, "base64")` does not throw on malformed input, so a
   * try/catch cannot detect it. The value is therefore accepted as Base64 only
   * if re-encoding the decoded bytes reproduces it (ignoring padding and
   * whitespace); otherwise the raw value is returned unchanged.
   */
  private decodeClientState(clientState: string): string {
    const decoded = Buffer.from(clientState, "base64");
    const stripPadding = (value: string): string => value.replace(/\s+/g, "").replace(/=+$/, "");
    const isCanonicalBase64 =
      decoded.length > 0 && stripPadding(decoded.toString("base64")) === stripPadding(clientState);
    return isCanonicalBase64 ? decoded.toString("utf8") : clientState;
  }

  /**
   * Convert Telnyx event to normalized event format.
   *
   * @param data - The `data` object of a Telnyx webhook body.
   * @param dedupeKey - Verified request key, passed through for manager-side dedupe.
   * @returns The normalized event, or null for event types that are not mapped.
   */
  private normalizeEvent(data: TelnyxEvent, dedupeKey?: string): NormalizedEvent | null {
    // Prefer our own call ID (round-tripped through client_state); fall back to
    // the provider's call control ID when no client state is present.
    let callId = "";
    if (data.payload?.client_state) {
      callId = this.decodeClientState(data.payload.client_state);
    }
    if (!callId) {
      callId = data.payload?.call_control_id || "";
    }

    const baseEvent = {
      id: data.id || crypto.randomUUID(),
      dedupeKey,
      callId,
      providerCallId: data.payload?.call_control_id,
      timestamp: Date.now(),
    };

    switch (data.event_type) {
      case "call.initiated":
        return { ...baseEvent, type: "call.initiated" };

      case "call.ringing":
        return { ...baseEvent, type: "call.ringing" };

      case "call.answered":
        return { ...baseEvent, type: "call.answered" };

      case "call.bridged":
        return { ...baseEvent, type: "call.active" };

      case "call.speak.started":
        return {
          ...baseEvent,
          type: "call.speaking",
          text: data.payload?.text || "",
        };

      case "call.transcription":
        return {
          ...baseEvent,
          type: "call.speech",
          transcript: data.payload?.transcription || "",
          // A missing is_final flag is treated as a final transcript.
          isFinal: data.payload?.is_final ?? true,
          confidence: data.payload?.confidence,
        };

      case "call.hangup":
        return {
          ...baseEvent,
          type: "call.ended",
          reason: this.mapHangupCause(data.payload?.hangup_cause),
        };

      case "call.dtmf.received":
        return {
          ...baseEvent,
          type: "call.dtmf",
          digits: data.payload?.digit || "",
        };

      default:
        return null;
    }
  }

  /**
   * Map Telnyx hangup cause to normalized end reason.
   *
   * Unknown or missing causes map to "completed"; unknown non-empty causes are
   * additionally logged as a warning.
   * @see https://developers.telnyx.com/docs/api/v2/call-control/Call-Commands#hangup-causes
   */
  private mapHangupCause(cause?: string): EndReason {
    switch (cause) {
      case "normal_clearing":
      case "normal_unspecified":
        return "completed";
      case "originator_cancel":
        return "hangup-bot";
      case "call_rejected":
      case "user_busy":
        return "busy";
      case "no_answer":
      case "no_user_response":
        return "no-answer";
      case "destination_out_of_order":
      case "network_out_of_order":
      case "service_unavailable":
      case "recovery_on_timer_expire":
        return "failed";
      case "machine_detected":
      case "fax_detected":
        return "voicemail";
      case "user_hangup":
      case "subscriber_absent":
        return "hangup-user";
      default:
        // Unknown cause - log it for debugging and return completed
        if (cause) {
          console.warn(`[telnyx] Unknown hangup cause: ${cause}`);
        }
        return "completed";
    }
  }

  /**
   * Initiate an outbound call via Telnyx API.
   *
   * The internal call ID is sent Base64-encoded as `client_state`;
   * normalizeEvent() decodes it again from incoming webhook events.
   *
   * @returns The Telnyx call control ID together with status "initiated".
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const result = await this.apiRequest<TelnyxCallResponse>("/calls", {
      connection_id: this.connectionId,
      to: input.to,
      from: input.from,
      webhook_url: input.webhookUrl,
      webhook_url_method: "POST",
      client_state: Buffer.from(input.callId).toString("base64"),
      timeout_secs: 30,
    });

    return {
      providerCallId: result.data.call_control_id,
      status: "initiated",
    };
  }

  /**
   * Hang up a call via Telnyx API.
   * The request is issued with `allowNotFound` set.
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    await this.apiRequest(
      `/calls/${input.providerCallId}/actions/hangup`,
      { command_id: crypto.randomUUID() },
      { allowNotFound: true },
    );
  }

  /**
   * Play TTS audio via Telnyx speak action.
   * Voice defaults to "female" and language to "en-US" when not supplied.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    await this.apiRequest(`/calls/${input.providerCallId}/actions/speak`, {
      command_id: crypto.randomUUID(),
      payload: input.text,
      voice: input.voice || "female",
      language: input.locale || "en-US",
    });
  }

  /**
   * Start transcription (STT) via Telnyx.
   * Language defaults to "en" when not supplied.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    await this.apiRequest(`/calls/${input.providerCallId}/actions/transcription_start`, {
      command_id: crypto.randomUUID(),
      language: input.language || "en",
    });
  }

  /**
   * Stop transcription via Telnyx.
   * The request is issued with `allowNotFound` set.
   */
  async stopListening(input: StopListeningInput): Promise<void> {
    await this.apiRequest(
      `/calls/${input.providerCallId}/actions/transcription_stop`,
      { command_id: crypto.randomUUID() },
      { allowNotFound: true },
    );
  }
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Telnyx-specific types
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Shape of the `data` object in a Telnyx webhook body, limited to the fields
 * this provider reads. All payload fields are optional because each event
 * type carries a different subset.
 */
interface TelnyxEvent {
  /** Event ID from Telnyx; a random UUID is substituted when absent. */
  id?: string;
  /** Telnyx event name, e.g. "call.initiated" or "call.hangup". */
  event_type: string;
  payload?: {
    /** Telnyx identifier of the call; used as provider call ID. */
    call_control_id?: string;
    /** Client state echoed back by Telnyx; holds our Base64-encoded call ID. */
    client_state?: string;
    /** Spoken text, read for "call.speak.started". */
    text?: string;
    /** Recognized speech, read for "call.transcription". */
    transcription?: string;
    /** Whether the transcription is final; treated as true when missing. */
    is_final?: boolean;
    /** Confidence reported with a transcription. */
    confidence?: number;
    /** Hangup cause, read for "call.hangup". */
    hangup_cause?: string;
    /** DTMF digit, read for "call.dtmf.received". */
    digit?: string;
    /** Any other payload fields are tolerated but not interpreted. */
    [key: string]: unknown;
  };
}
|
||||
|
||||
/**
 * Response body of the Telnyx "create call" request as consumed here.
 * Only `data.call_control_id` is read by this provider.
 */
interface TelnyxCallResponse {
  data: {
    /** Identifier used for all follow-up call control actions. */
    call_control_id: string;
    call_leg_id: string;
    call_session_id: string;
    is_alive: boolean;
    record_type: string;
  };
}
|
||||
259
openclaw/extensions/voice-call/src/providers/tts-openai.ts
Normal file
259
openclaw/extensions/voice-call/src/providers/tts-openai.ts
Normal file
@@ -0,0 +1,259 @@
|
||||
/**
|
||||
* OpenAI TTS Provider
|
||||
*
|
||||
* Generates speech audio using OpenAI's text-to-speech API.
|
||||
* Handles audio format conversion for telephony (mu-law 8kHz).
|
||||
*
|
||||
* Best practices from OpenAI docs:
|
||||
* - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
|
||||
* - Use tts-1 for lower latency, tts-1-hd for higher quality
|
||||
* - Use marin or cedar voices for best quality
|
||||
* - Use pcm or wav format for fastest response times
|
||||
*
|
||||
* @see https://platform.openai.com/docs/guides/text-to-speech
|
||||
*/
|
||||
|
||||
/**
 * Settings for the OpenAI text-to-speech provider. Every field is optional.
 */
export interface OpenAITTSConfig {
  /** OpenAI API key; the OPENAI_API_KEY environment variable is used when unset. */
  apiKey?: string;
  /**
   * TTS model to request:
   * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
   * - tts-1: lower latency
   * - tts-1-hd: higher quality
   */
  model?: string;
  /**
   * Voice name. marin or cedar give the best quality.
   * All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
   * Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
   */
  voice?: string;
  /** Playback speed multiplier (0.25 to 4.0). */
  speed?: number;
  /**
   * Speech style instructions; only sent for the gpt-4o-mini-tts model.
   * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
   */
  instructions?: string;
}
|
||||
|
||||
/**
 * The 13 built-in OpenAI TTS voices.
 * marin and cedar give the best quality.
 * Note: tts-1 and tts-1-hd support a smaller set.
 */
// prettier-ignore
export const OPENAI_TTS_VOICES = [
  "alloy", "ash", "ballad", "coral", "echo", "fable", "nova",
  "onyx", "sage", "shimmer", "verse", "marin", "cedar",
] as const;

/** Union of the voice names listed in OPENAI_TTS_VOICES. */
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
/**
 * Speech synthesis through OpenAI's `/v1/audio/speech` endpoint, with a helper
 * that converts the result to telephony-grade mu-law.
 */
export class OpenAITTSProvider {
  private apiKey: string;
  private model: string;
  private voice: OpenAITTSVoice;
  private speed: number;
  private instructions?: string;

  /**
   * @param config - Optional settings. Defaults: model "gpt-4o-mini-tts",
   *   voice "coral", speed 1.0.
   * @throws Error when neither `config.apiKey` nor OPENAI_API_KEY provides a key.
   */
  constructor(config: OpenAITTSConfig = {}) {
    const { apiKey, model, voice, speed, instructions } = config;

    const resolvedKey = apiKey || process.env.OPENAI_API_KEY || "";
    if (!resolvedKey) {
      throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
    }

    this.apiKey = resolvedKey;
    // gpt-4o-mini-tts is the default because it targets realtime applications.
    this.model = model || "gpt-4o-mini-tts";
    // coral is the default voice: a balance of quality and natural tone.
    this.voice = (voice as OpenAITTSVoice) || "coral";
    this.speed = speed || 1.0;
    this.instructions = instructions;
  }

  /**
   * Generate speech audio from text.
   *
   * @param text - Text to speak.
   * @param instructions - Per-call style instructions; overrides the configured
   *   ones. Only sent when the model name contains "gpt-4o-mini-tts".
   * @returns Raw PCM audio data (24kHz, mono, 16-bit).
   * @throws Error carrying status and response text when the API call fails.
   */
  async synthesize(text: string, instructions?: string): Promise<Buffer> {
    const styleHint = instructions || this.instructions;
    const modelTakesInstructions = this.model.includes("gpt-4o-mini-tts");

    const payload: Record<string, unknown> = {
      model: this.model,
      input: text,
      voice: this.voice,
      response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
      speed: this.speed,
    };
    if (styleHint && modelTakesInstructions) {
      payload.instructions = styleHint;
    }

    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
    }

    return Buffer.from(await response.arrayBuffer());
  }

  /**
   * Generate speech and convert to mu-law format for Twilio.
   * Twilio Media Streams expect 8kHz mono mu-law audio.
   *
   * Pipeline: 24kHz PCM from OpenAI -> 8kHz PCM -> mu-law bytes.
   */
  async synthesizeForTwilio(text: string): Promise<Buffer> {
    const pcm24k = await this.synthesize(text);
    const pcm8k = resample24kTo8k(pcm24k);
    return pcmToMulaw(pcm8k);
  }
}
|
||||
|
||||
/**
 * Downsample 24kHz PCM to 8kHz by keeping every third sample.
 * Input/output: 16-bit signed little-endian mono.
 *
 * This is plain 3:1 decimation. Because the ratio is an exact integer, every
 * output sample coincides with an input sample, so there is nothing to
 * interpolate. No low-pass filter is applied beforehand.
 *
 * NOTE(review): without a filter, content above 4kHz can alias into the
 * output — confirm this is acceptable for the voices in use.
 *
 * @param input - PCM buffer at 24kHz; trailing bytes that do not make up a
 *   full group of three samples are ignored.
 * @returns PCM buffer at 8kHz holding floor(input.length / 6) samples.
 */
function resample24kTo8k(input: Buffer): Buffer {
  const BYTES_PER_SAMPLE = 2;
  const DECIMATION_FACTOR = 3;
  const inputStride = BYTES_PER_SAMPLE * DECIMATION_FACTOR;

  const outputSamples = Math.floor(input.length / inputStride);
  const output = Buffer.alloc(outputSamples * BYTES_PER_SAMPLE);

  for (let i = 0; i < outputSamples; i++) {
    output.writeInt16LE(input.readInt16LE(i * inputStride), i * BYTES_PER_SAMPLE);
  }

  return output;
}
|
||||
|
||||
/**
 * Convert 16-bit PCM to 8-bit mu-law.
 * Standard G.711 mu-law encoding for telephony.
 *
 * @param pcm - 16-bit signed little-endian samples. A trailing odd byte (an
 *   incomplete sample) is ignored rather than read past the end of the buffer.
 * @returns One mu-law byte per complete input sample.
 */
function pcmToMulaw(pcm: Buffer): Buffer {
  // Floor so an odd-length buffer yields an integral sample count.
  const samples = Math.floor(pcm.length / 2);
  const mulaw = Buffer.alloc(samples);

  for (let i = 0; i < samples; i++) {
    const sample = pcm.readInt16LE(i * 2);
    mulaw[i] = linearToMulaw(sample);
  }

  return mulaw;
}
|
||||
|
||||
/**
 * Encode one 16-bit linear PCM sample as an 8-bit mu-law byte
 * (ITU-T G.711 mu-law).
 *
 * Steps: split sign and magnitude, clip, add the bias, locate the segment
 * (exponent) from the highest set bit, take four mantissa bits below it, and
 * invert the assembled byte.
 */
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;

  const signBit = sample < 0 ? 0x80 : 0;
  const magnitude = sample < 0 ? -sample : sample;

  // Clip before biasing so the biased value stays within 15 bits.
  const biased = (magnitude > CLIP ? CLIP : magnitude) + BIAS;

  // Walk down from bit 14 until a set bit is found; that position is the segment.
  let exponent = 7;
  let probe = 0x4000;
  while (exponent > 0 && (biased & probe) === 0) {
    exponent -= 1;
    probe >>= 1;
  }

  const mantissa = (biased >> (exponent + 3)) & 0x0f;

  // mu-law bytes are transmitted bit-inverted.
  return ~(signBit | (exponent << 4) | mantissa) & 0xff;
}
|
||||
|
||||
/**
 * Decode one 8-bit mu-law byte into a 16-bit linear PCM sample.
 * Useful for decoding incoming audio.
 *
 * @param mulaw - mu-law byte as transmitted (bit-inverted).
 * @returns The linear sample value.
 */
export function mulawToLinear(mulaw: number): number {
  const BIAS = 132;

  // Undo the transmission inversion before splitting the fields.
  const code = ~mulaw & 0xff;

  const isNegative = (code & 0x80) !== 0;
  const exponent = (code >> 4) & 0x07;
  const mantissa = code & 0x0f;

  const magnitude = (((mantissa << 3) + BIAS) << exponent) - BIAS;

  return isNegative ? -magnitude : magnitude;
}
|
||||
|
||||
/**
 * Chunk an audio buffer into fixed-size frames for streaming.
 * At 8kHz mono mu-law, 20ms = 160 samples = 160 bytes, hence the default.
 *
 * Chunks are views onto `audio` (via `subarray`), not copies; the final chunk
 * may be shorter than `chunkSize`.
 *
 * @param audio - Audio bytes to split.
 * @param chunkSize - Frame size in bytes; must be greater than zero.
 * @returns A generator yielding consecutive frames of `audio`.
 * @throws RangeError if `chunkSize` is zero, negative, or NaN.
 */
export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, void, unknown> {
  // Validate eagerly (before the lazy generator runs): a non-positive step
  // never advances the index and would otherwise yield chunks forever.
  if (!(chunkSize > 0)) {
    throw new RangeError(`chunkSize must be greater than 0, received ${chunkSize}`);
  }

  return (function* () {
    for (let i = 0; i < audio.length; i += chunkSize) {
      yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
    }
  })();
}
|
||||
117
openclaw/extensions/voice-call/src/providers/twilio.test.ts
Normal file
117
openclaw/extensions/voice-call/src/providers/twilio.test.ts
Normal file
@@ -0,0 +1,117 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { WebhookContext } from "../types.js";
|
||||
import { TwilioProvider } from "./twilio.js";
|
||||
|
||||
const STREAM_URL = "wss://example.ngrok.app/voice/stream";
|
||||
|
||||
/** Build a TwilioProvider with dummy credentials and a public URL + stream path. */
function createProvider(): TwilioProvider {
  const credentials = { accountSid: "AC123", authToken: "secret" };
  const providerOptions = {
    publicUrl: "https://example.ngrok.app",
    streamPath: "/voice/stream",
  };
  return new TwilioProvider(credentials, providerOptions);
}
|
||||
|
||||
/**
 * Build a POST webhook context for the Twilio webhook URL with empty headers.
 *
 * @param rawBody - Form-encoded request body.
 * @param query - Optional parsed query parameters.
 */
function createContext(rawBody: string, query?: WebhookContext["query"]): WebhookContext {
  const context: WebhookContext = {
    headers: {},
    rawBody,
    url: "https://example.ngrok.app/voice/twilio",
    method: "POST",
    query,
  };
  return context;
}
|
||||
|
||||
describe("TwilioProvider", () => {
  const EMPTY_TWIML = '<?xml version="1.0" encoding="UTF-8"?><Response></Response>';

  /** Assert that a TwiML body connects the call to the media stream with a token parameter. */
  const expectStreamingTwiml = (twiml: unknown): void => {
    expect(twiml).toContain(STREAM_URL);
    expect(twiml).toContain('<Parameter name="token" value="');
    expect(twiml).toContain("<Connect>");
  };

  /** Speech webhook context for `call-1` / `turn-1` carrying the given idempotency header. */
  const speechContext = (callSid: string, idempotencyToken: string): WebhookContext => ({
    ...createContext(`CallSid=${callSid}&Direction=inbound&SpeechResult=hello`, {
      callId: "call-1",
      turnToken: "turn-1",
    }),
    headers: { "i-twilio-idempotency-token": idempotencyToken },
  });

  it("returns streaming TwiML for outbound conversation calls before in-progress", () => {
    const ctx = createContext("CallStatus=initiated&Direction=outbound-api&CallSid=CA123", {
      callId: "call-1",
    });

    const { providerResponseBody } = createProvider().parseWebhookEvent(ctx);

    expectStreamingTwiml(providerResponseBody);
  });

  it("returns empty TwiML for status callbacks", () => {
    const ctx = createContext("CallStatus=ringing&Direction=outbound-api", {
      callId: "call-1",
      type: "status",
    });

    const { providerResponseBody } = createProvider().parseWebhookEvent(ctx);

    expect(providerResponseBody).toBe(EMPTY_TWIML);
  });

  it("returns streaming TwiML for inbound calls", () => {
    const ctx = createContext("CallStatus=ringing&Direction=inbound&CallSid=CA456");

    const { providerResponseBody } = createProvider().parseWebhookEvent(ctx);

    expectStreamingTwiml(providerResponseBody);
  });

  it("uses a stable fallback dedupeKey for identical request payloads", () => {
    const provider = createProvider();

    // Two separately built but identical requests.
    const [eventA, eventB] = [
      speechContext("CA789", "idem-123"),
      speechContext("CA789", "idem-123"),
    ].map((ctx) => provider.parseWebhookEvent(ctx).events[0]);

    expect(eventA).toBeDefined();
    expect(eventB).toBeDefined();
    // Event ids are unique per parse; only the dedupe key is stable.
    expect(eventA?.id).not.toBe(eventB?.id);
    expect(eventA?.dedupeKey).toContain("twilio:fallback:");
    expect(eventA?.dedupeKey).toBe(eventB?.dedupeKey);
  });

  it("uses verified request key for dedupe and ignores idempotency header changes", () => {
    const provider = createProvider();
    const parseOptions = { verifiedRequestKey: "twilio:req:abc" };

    const eventA = provider.parseWebhookEvent(speechContext("CA790", "idem-a"), parseOptions)
      .events[0];
    const eventB = provider.parseWebhookEvent(speechContext("CA790", "idem-b"), parseOptions)
      .events[0];

    expect(eventA?.dedupeKey).toBe("twilio:req:abc");
    expect(eventB?.dedupeKey).toBe("twilio:req:abc");
  });

  it("keeps turnToken from query on speech events", () => {
    const ctx = createContext("CallSid=CA222&Direction=inbound&SpeechResult=hello", {
      callId: "call-2",
      turnToken: "turn-xyz",
    });

    const [event] = createProvider().parseWebhookEvent(ctx).events;

    expect(event?.type).toBe("call.speech");
    expect(event?.turnToken).toBe("turn-xyz");
  });
});
|
||||
687
openclaw/extensions/voice-call/src/providers/twilio.ts
Normal file
687
openclaw/extensions/voice-call/src/providers/twilio.ts
Normal file
@@ -0,0 +1,687 @@
|
||||
import crypto from "node:crypto";
|
||||
import type { TwilioConfig, WebhookSecurityConfig } from "../config.js";
|
||||
import { getHeader } from "../http-headers.js";
|
||||
import type { MediaStreamHandler } from "../media-stream.js";
|
||||
import { chunkAudio } from "../telephony-audio.js";
|
||||
import type { TelephonyTtsProvider } from "../telephony-tts.js";
|
||||
import type {
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookParseOptions,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
import { twilioApiRequest } from "./twilio/api.js";
|
||||
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
|
||||
|
||||
/**
 * Produce the dedupe key for a Twilio webhook request.
 *
 * When the caller supplies a verified request key it is returned unchanged.
 * Otherwise a fallback key is derived by SHA-256 hashing the request's
 * `x-twilio-signature` header, selected form fields (`CallSid`, `CallStatus`,
 * `Direction`), selected query values (`callId`, `flow`, `turnToken`), and the
 * raw body, so identical requests map to the same key.
 *
 * @param ctx - Incoming webhook context.
 * @param verifiedRequestKey - Key established during verification, if any.
 * @returns The verified key, or `twilio:fallback:<sha256 hex>`.
 */
function createTwilioRequestDedupeKey(ctx: WebhookContext, verifiedRequestKey?: string): string {
  if (verifiedRequestKey) {
    return verifiedRequestKey;
  }

  const form = new URLSearchParams(ctx.rawBody);
  const trimmedOrEmpty = (raw: unknown): string => (typeof raw === "string" ? raw.trim() : "");

  const fields = [
    getHeader(ctx.headers, "x-twilio-signature") ?? "",
    form.get("CallSid") ?? "",
    form.get("CallStatus") ?? "",
    form.get("Direction") ?? "",
    trimmedOrEmpty(ctx.query?.callId),
    trimmedOrEmpty(ctx.query?.flow),
    trimmedOrEmpty(ctx.query?.turnToken),
  ];
  const fingerprint = `${fields.join("\n")}\n${ctx.rawBody}`;

  const digest = crypto.createHash("sha256").update(fingerprint).digest("hex");
  return `twilio:fallback:${digest}`;
}
|
||||
|
||||
/**
 * Construction options for the Twilio Voice API provider, which uses Twilio
 * Programmable Voice with Media Streams for real-time bidirectional audio.
 *
 * Every option is optional.
 *
 * @see https://www.twilio.com/docs/voice
 * @see https://www.twilio.com/docs/voice/media-streams
 */
export interface TwilioProviderOptions {
  /** Enable the ngrok free-tier compatibility mode (loopback only, less secure). */
  allowNgrokFreeTierLoopbackBypass?: boolean;

  /** Public URL override used for signature verification. */
  publicUrl?: string;

  /** Path of the media stream WebSocket endpoint (e.g. `/voice/stream`). */
  streamPath?: string;

  /** Skip webhook signature verification. Development only. */
  skipVerification?: boolean;

  /** Webhook security settings (forwarded headers / allowlist). */
  webhookSecurity?: WebhookSecurityConfig;
}
|
||||
|
||||
/**
 * Twilio implementation of the voice call provider interface.
 *
 * Calls are created and updated through the Twilio REST API
 * (`twilioApiRequest`), webhooks are answered with TwiML, and audio can
 * optionally flow over a Media Streams WebSocket when a stream path and public
 * URL are configured.
 *
 * All per-call state (webhook URLs, stream SIDs, stream auth tokens, stored
 * TwiML) is kept in in-memory maps on this instance.
 *
 * @see https://www.twilio.com/docs/voice
 * @see https://www.twilio.com/docs/voice/media-streams
 */
export class TwilioProvider implements VoiceCallProvider {
  readonly name = "twilio" as const;

  private readonly accountSid: string;
  private readonly authToken: string;
  private readonly baseUrl: string;
  /** Provider call SID -> webhook URL (including the `callId` query) used to initiate the call. */
  private readonly callWebhookUrls = new Map<string, string>();
  private readonly options: TwilioProviderOptions;

  /** Current public webhook URL (set when tunnel starts or from config) */
  private currentPublicUrl: string | null = null;

  /** Optional telephony TTS provider for streaming TTS */
  private ttsProvider: TelephonyTtsProvider | null = null;

  /** Optional media stream handler for sending audio */
  private mediaStreamHandler: MediaStreamHandler | null = null;

  /** Map of call SID to stream SID for media streams */
  private callStreamMap = new Map<string, string>();
  /** Per-call tokens for media stream authentication */
  private streamAuthTokens = new Map<string, string>();

  /** Storage for TwiML content (for notify mode with URL-based TwiML) */
  private readonly twimlStorage = new Map<string, string>();
  /** Track notify-mode calls to avoid streaming on follow-up callbacks */
  private readonly notifyCalls = new Set<string>();

  /**
   * Delete stored TwiML for a given `callId`.
   *
   * We keep TwiML in-memory only long enough to satisfy the initial Twilio
   * webhook request (notify mode). Subsequent webhooks should not reuse it.
   * The notify-mode marker for the call is removed at the same time.
   */
  private deleteStoredTwiml(callId: string): void {
    this.twimlStorage.delete(callId);
    this.notifyCalls.delete(callId);
  }

  /**
   * Delete stored TwiML for a call, addressed by Twilio's provider call SID.
   *
   * This is used when we only have `providerCallId` (e.g. hangup). The
   * internal `callId` is recovered from the `callId` query parameter of the
   * webhook URL recorded when the call was initiated. Nothing happens when no
   * URL (or no `callId` in it) is known for the call.
   */
  private deleteStoredTwimlForProviderCall(providerCallId: string): void {
    const webhookUrl = this.callWebhookUrls.get(providerCallId);
    if (!webhookUrl) {
      return;
    }

    // Read the parameter through URL parsing so the value is percent-decoded:
    // it was written with `searchParams.set`, which encodes it, and the
    // storage maps are keyed by the decoded callId.
    let callId: string | null;
    try {
      callId = new URL(webhookUrl).searchParams.get("callId");
    } catch {
      return;
    }
    if (!callId) {
      return;
    }

    this.deleteStoredTwiml(callId);
    this.streamAuthTokens.delete(providerCallId);
  }

  /**
   * @param config - Twilio credentials; `accountSid` and `authToken` are required.
   * @param options - Optional provider behavior (public URL, stream path, verification).
   * @throws Error if the account SID or auth token is missing.
   */
  constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
    if (!config.accountSid) {
      throw new Error("Twilio Account SID is required");
    }
    if (!config.authToken) {
      throw new Error("Twilio Auth Token is required");
    }

    this.accountSid = config.accountSid;
    this.authToken = config.authToken;
    this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${this.accountSid}`;
    this.options = options;

    if (options.publicUrl) {
      this.currentPublicUrl = options.publicUrl;
    }
  }

  /** Set the public webhook URL (used for verification and stream URL derivation). */
  setPublicUrl(url: string): void {
    this.currentPublicUrl = url;
  }

  /** Get the current public webhook URL, or `null` if none is set. */
  getPublicUrl(): string | null {
    return this.currentPublicUrl;
  }

  /** Set the telephony TTS provider used for streaming TTS playback. */
  setTTSProvider(provider: TelephonyTtsProvider): void {
    this.ttsProvider = provider;
  }

  /** Set the media stream handler used to send audio to active streams. */
  setMediaStreamHandler(handler: MediaStreamHandler): void {
    this.mediaStreamHandler = handler;
  }

  /** Record the media stream SID that belongs to a call SID. */
  registerCallStream(callSid: string, streamSid: string): void {
    this.callStreamMap.set(callSid, streamSid);
  }

  /** Forget the media stream SID recorded for a call SID. */
  unregisterCallStream(callSid: string): void {
    this.callStreamMap.delete(callSid);
  }

  /**
   * Check a media stream auth token against the one issued for the call.
   *
   * @param callSid - Twilio call SID the stream claims to belong to.
   * @param token - Token presented by the stream, if any.
   * @returns `true` only when a token was issued for the call and it matches.
   */
  isValidStreamToken(callSid: string, token?: string): boolean {
    const expected = this.streamAuthTokens.get(callSid);
    if (!expected || !token) {
      return false;
    }

    // Compare BYTE lengths, not string lengths: timingSafeEqual throws when
    // its inputs differ in byte length, and a token containing non-ASCII
    // characters can match the expected string length while encoding to a
    // different number of bytes.
    const expectedBytes = Buffer.from(expected);
    const tokenBytes = Buffer.from(token);
    if (expectedBytes.length !== tokenBytes.length) {
      // Still perform one comparison on the mismatch path, as before.
      crypto.timingSafeEqual(expectedBytes, expectedBytes);
      return false;
    }
    return crypto.timingSafeEqual(expectedBytes, tokenBytes);
  }

  /**
   * Clear TTS queue for a call (barge-in).
   * Used when user starts speaking to interrupt current TTS playback.
   * No-op when the call has no registered stream or no handler is set.
   */
  clearTtsQueue(callSid: string): void {
    const streamSid = this.callStreamMap.get(callSid);
    if (streamSid && this.mediaStreamHandler) {
      this.mediaStreamHandler.clearTtsQueue(streamSid);
    }
  }

  /**
   * Make an authenticated request to the Twilio API.
   *
   * @param endpoint - Path appended to the account base URL (e.g. `/Calls.json`).
   * @param params - Form parameters; array values are sent as repeated keys.
   * @param options - `allowNotFound` is forwarded to `twilioApiRequest`.
   */
  private async apiRequest<T = unknown>(
    endpoint: string,
    params: Record<string, string | string[]>,
    options?: { allowNotFound?: boolean },
  ): Promise<T> {
    return await twilioApiRequest<T>({
      baseUrl: this.baseUrl,
      accountSid: this.accountSid,
      authToken: this.authToken,
      endpoint,
      body: params,
      allowNotFound: options?.allowNotFound,
    });
  }

  /**
   * Verify Twilio webhook signature using HMAC-SHA1.
   *
   * Handles reverse proxy scenarios (Tailscale, nginx, ngrok) by reconstructing
   * the public URL from forwarding headers.
   *
   * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    return verifyTwilioProviderWebhook({
      ctx,
      authToken: this.authToken,
      currentPublicUrl: this.currentPublicUrl,
      options: this.options,
    });
  }

  /**
   * Parse Twilio webhook event into normalized format.
   *
   * @param ctx - Incoming webhook context (form-encoded body, parsed query).
   * @param options - Optional `verifiedRequestKey` used as the event dedupe key.
   * @returns At most one normalized event plus the TwiML to answer with, or a
   *   400 result with no events if parsing or TwiML generation throws.
   */
  parseWebhookEvent(
    ctx: WebhookContext,
    options?: WebhookParseOptions,
  ): ProviderWebhookParseResult {
    try {
      const params = new URLSearchParams(ctx.rawBody);
      const callIdFromQuery =
        typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
          ? ctx.query.callId.trim()
          : undefined;
      const turnTokenFromQuery =
        typeof ctx.query?.turnToken === "string" && ctx.query.turnToken.trim()
          ? ctx.query.turnToken.trim()
          : undefined;
      const dedupeKey = createTwilioRequestDedupeKey(ctx, options?.verifiedRequestKey);
      const event = this.normalizeEvent(params, {
        callIdOverride: callIdFromQuery,
        dedupeKey,
        turnToken: turnTokenFromQuery,
      });

      // For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
      // so the webhook response is typically a pause to keep the call alive.
      const twiml = this.generateTwimlResponse(ctx);

      return {
        events: event ? [event] : [],
        providerResponseBody: twiml,
        providerResponseHeaders: { "Content-Type": "application/xml" },
        statusCode: 200,
      };
    } catch {
      return { events: [], statusCode: 400 };
    }
  }

  /**
   * Parse Twilio direction to normalized format.
   * `outbound-api` and `outbound-dial` both map to `"outbound"`; anything
   * other than those and `inbound` yields `undefined`.
   */
  private static parseDirection(direction: string | null): "inbound" | "outbound" | undefined {
    if (direction === "inbound") {
      return "inbound";
    }
    if (direction === "outbound-api" || direction === "outbound-dial") {
      return "outbound";
    }
    return undefined;
  }

  /**
   * Convert Twilio webhook params to normalized event format.
   *
   * Precedence: a `SpeechResult` produces a speech event, else `Digits`
   * produces a DTMF event, else `CallStatus` is mapped to a lifecycle event.
   * Terminal statuses also drop the call's stream auth token and, when a
   * `callIdOverride` is given, its stored TwiML.
   *
   * @returns The normalized event, or `null` for an unrecognized status.
   */
  private normalizeEvent(
    params: URLSearchParams,
    options?: {
      callIdOverride?: string;
      dedupeKey?: string;
      turnToken?: string;
    },
  ): NormalizedEvent | null {
    const callSid = params.get("CallSid") || "";
    const callIdOverride = options?.callIdOverride;

    const baseEvent = {
      id: crypto.randomUUID(),
      dedupeKey: options?.dedupeKey,
      callId: callIdOverride || callSid,
      providerCallId: callSid,
      timestamp: Date.now(),
      turnToken: options?.turnToken,
      direction: TwilioProvider.parseDirection(params.get("Direction")),
      from: params.get("From") || undefined,
      to: params.get("To") || undefined,
    };

    // Handle speech result (from <Gather>)
    const speechResult = params.get("SpeechResult");
    if (speechResult) {
      // Fall back to 0.9 when Confidence is absent OR not a finite number, so
      // a malformed value never surfaces as NaN.
      const parsedConfidence = Number.parseFloat(params.get("Confidence") ?? "");
      return {
        ...baseEvent,
        type: "call.speech",
        transcript: speechResult,
        isFinal: true,
        confidence: Number.isFinite(parsedConfidence) ? parsedConfidence : 0.9,
      };
    }

    // Handle DTMF
    const digits = params.get("Digits");
    if (digits) {
      return { ...baseEvent, type: "call.dtmf", digits };
    }

    // Handle call status changes
    const callStatus = params.get("CallStatus");
    switch (callStatus) {
      case "initiated":
        return { ...baseEvent, type: "call.initiated" };
      case "ringing":
        return { ...baseEvent, type: "call.ringing" };
      case "in-progress":
        return { ...baseEvent, type: "call.answered" };
      case "completed":
      case "busy":
      case "no-answer":
      case "failed":
        this.streamAuthTokens.delete(callSid);
        if (callIdOverride) {
          this.deleteStoredTwiml(callIdOverride);
        }
        return { ...baseEvent, type: "call.ended", reason: callStatus };
      case "canceled":
        this.streamAuthTokens.delete(callSid);
        if (callIdOverride) {
          this.deleteStoredTwiml(callIdOverride);
        }
        return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
      default:
        return null;
    }
  }

  private static readonly EMPTY_TWIML =
    '<?xml version="1.0" encoding="UTF-8"?><Response></Response>';

  private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Pause length="30"/>
</Response>`;

  /**
   * TwiML that connects the call to its media stream, or the pause TwiML when
   * no call SID is available or no stream URL can be derived.
   */
  private getStreamTwimlForCall(callSid: string | undefined): string {
    const streamUrl = callSid ? this.getStreamUrlForCall(callSid) : null;
    return streamUrl ? this.getStreamConnectXml(streamUrl) : TwilioProvider.PAUSE_TWIML;
  }

  /**
   * Generate TwiML response for webhook.
   * When a call is answered, connects to media stream for bidirectional audio.
   *
   * Decision order:
   * 1. Non-status request with a `callId`: serve stored (notify mode) TwiML
   *    once; otherwise stream immediately for outbound calls.
   * 2. Status callbacks get empty TwiML.
   * 3. Inbound calls are answered with the stream.
   * 4. Anything else streams only once the call is `in-progress`.
   */
  private generateTwimlResponse(ctx?: WebhookContext): string {
    if (!ctx) {
      return TwilioProvider.EMPTY_TWIML;
    }

    const params = new URLSearchParams(ctx.rawBody);
    const type = typeof ctx.query?.type === "string" ? ctx.query.type.trim() : undefined;
    const isStatusCallback = type === "status";
    const callStatus = params.get("CallStatus");
    const direction = params.get("Direction");
    const isOutbound = direction?.startsWith("outbound") ?? false;
    const callSid = params.get("CallSid") || undefined;
    const callIdFromQuery =
      typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
        ? ctx.query.callId.trim()
        : undefined;

    // Avoid logging webhook params/TwiML (may contain PII).

    // Handle initial TwiML request (when Twilio first initiates the call)
    // Check if we have stored TwiML for this call (notify mode)
    if (callIdFromQuery && !isStatusCallback) {
      const storedTwiml = this.twimlStorage.get(callIdFromQuery);
      if (storedTwiml) {
        // Clean up after serving (one-time use)
        this.deleteStoredTwiml(callIdFromQuery);
        return storedTwiml;
      }
      // NOTE(review): deleteStoredTwiml also clears the notifyCalls entry, so
      // after the stored TwiML has been served this check no longer matches
      // for that call - confirm this is the intended follow-up behavior.
      if (this.notifyCalls.has(callIdFromQuery)) {
        return TwilioProvider.EMPTY_TWIML;
      }

      // Conversation mode: return streaming TwiML immediately for outbound calls.
      if (isOutbound) {
        return this.getStreamTwimlForCall(callSid);
      }
    }

    // Status callbacks should not receive TwiML.
    if (isStatusCallback) {
      return TwilioProvider.EMPTY_TWIML;
    }

    // Handle subsequent webhook requests (status callbacks, etc.)
    // For inbound calls, answer immediately with stream
    if (direction === "inbound") {
      return this.getStreamTwimlForCall(callSid);
    }

    // For outbound calls, only connect to stream when call is in-progress
    if (callStatus !== "in-progress") {
      return TwilioProvider.EMPTY_TWIML;
    }

    return this.getStreamTwimlForCall(callSid);
  }

  /**
   * Get the WebSocket URL for media streaming.
   * Derives from the public URL origin + stream path.
   *
   * @returns The `ws(s)://` URL, or `null` when the public URL or stream path is not set.
   */
  private getStreamUrl(): string | null {
    if (!this.currentPublicUrl || !this.options.streamPath) {
      return null;
    }

    // Extract just the origin (host) from the public URL, ignoring any path
    const url = new URL(this.currentPublicUrl);
    const origin = url.origin;

    // Convert https:// to wss:// for WebSocket
    const wsOrigin = origin.replace(/^https:\/\//, "wss://").replace(/^http:\/\//, "ws://");

    // Append the stream path
    const path = this.options.streamPath.startsWith("/")
      ? this.options.streamPath
      : `/${this.options.streamPath}`;

    return `${wsOrigin}${path}`;
  }

  /** Return the stream auth token for a call, generating and storing one on first use. */
  private getStreamAuthToken(callSid: string): string {
    const existing = this.streamAuthTokens.get(callSid);
    if (existing) {
      return existing;
    }
    const token = crypto.randomBytes(16).toString("base64url");
    this.streamAuthTokens.set(callSid, token);
    return token;
  }

  /**
   * Build the stream URL for a call with its auth token as a `token` query
   * parameter, or `null` when no base stream URL is available.
   */
  private getStreamUrlForCall(callSid: string): string | null {
    const baseUrl = this.getStreamUrl();
    if (!baseUrl) {
      return null;
    }
    const token = this.getStreamAuthToken(callSid);
    const url = new URL(baseUrl);
    url.searchParams.set("token", token);
    return url.toString();
  }

  /**
   * Generate TwiML to connect a call to a WebSocket media stream.
   * This enables bidirectional audio streaming for real-time STT/TTS.
   *
   * @param streamUrl - WebSocket URL (wss://...) for the media stream
   */
  getStreamConnectXml(streamUrl: string): string {
    // Extract token from URL and pass via <Parameter> instead of query string.
    // Twilio strips query params from WebSocket URLs, but delivers <Parameter>
    // values in the "start" message's customParameters field.
    const parsed = new URL(streamUrl);
    const token = parsed.searchParams.get("token");
    parsed.searchParams.delete("token");
    const cleanUrl = parsed.toString();

    const paramXml = token ? `\n <Parameter name="token" value="${escapeXml(token)}" />` : "";

    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Connect>
    <Stream url="${escapeXml(cleanUrl)}">${paramXml}
    </Stream>
  </Connect>
</Response>`;
  }

  /**
   * Initiate an outbound call via Twilio API.
   *
   * The call always uses URL-based TwiML: if `inlineTwiml` is provided (notify
   * mode) it is stored and served once from the webhook endpoint rather than
   * being sent inline.
   *
   * @returns The Twilio call SID and a `queued` / `initiated` status.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const url = new URL(input.webhookUrl);
    url.searchParams.set("callId", input.callId);

    // Create separate URL for status callbacks (required by Twilio)
    const statusUrl = new URL(input.webhookUrl);
    statusUrl.searchParams.set("callId", input.callId);
    statusUrl.searchParams.set("type", "status"); // Differentiate from TwiML requests

    // Store TwiML content if provided (for notify mode)
    // We now serve it from the webhook endpoint instead of sending inline
    if (input.inlineTwiml) {
      this.twimlStorage.set(input.callId, input.inlineTwiml);
      this.notifyCalls.add(input.callId);
    }

    // Build request params - always use URL-based TwiML.
    // Twilio silently ignores `StatusCallback` when using the inline `Twiml` parameter.
    const params: Record<string, string | string[]> = {
      To: input.to,
      From: input.from,
      Url: url.toString(), // TwiML serving endpoint
      StatusCallback: statusUrl.toString(), // Separate status callback endpoint
      StatusCallbackEvent: ["initiated", "ringing", "answered", "completed"],
      Timeout: "30",
    };

    const result = await this.apiRequest<TwilioCallResponse>("/Calls.json", params);

    this.callWebhookUrls.set(result.sid, url.toString());

    return {
      providerCallId: result.sid,
      status: result.status === "queued" ? "queued" : "initiated",
    };
  }

  /**
   * Hang up a call via Twilio API.
   *
   * Local per-call state is released first; the API request tolerates a
   * not-found response (`allowNotFound`).
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    this.deleteStoredTwimlForProviderCall(input.providerCallId);

    this.callWebhookUrls.delete(input.providerCallId);
    this.streamAuthTokens.delete(input.providerCallId);

    await this.apiRequest(
      `/Calls/${input.providerCallId}.json`,
      { Status: "completed" },
      { allowNotFound: true },
    );
  }

  /**
   * Play TTS audio via Twilio.
   *
   * Two modes:
   * 1. Core TTS + Media Streams: If TTS provider and media stream are available,
   *    generates audio via core TTS and streams it through WebSocket (preferred).
   * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
   *    Note: This may not work on all Twilio accounts.
   *
   * @throws Error if the fallback is needed but no webhook URL is known for the call.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    // Try telephony TTS via media stream first (if configured)
    const streamSid = this.callStreamMap.get(input.providerCallId);
    if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
      try {
        await this.playTtsViaStream(input.text, streamSid);
        return;
      } catch (err) {
        console.warn(
          `[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
          err instanceof Error ? err.message : err,
        );
        // Fall through to TwiML <Say> fallback
      }
    }

    // Fall back to TwiML <Say> (may not work on all accounts)
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error("Missing webhook URL for this call (provider state not initialized)");
    }

    console.warn(
      "[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
    );

    const pollyVoice = mapVoiceToPolly(input.voice);
    // The locale is caller-supplied, so it is escaped before being placed in
    // an XML attribute (like the text and the action URL).
    const language = escapeXml(input.locale || "en-US");
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Say voice="${pollyVoice}" language="${language}">${escapeXml(input.text)}</Say>
  <Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
    <Say>.</Say>
  </Gather>
</Response>`;

    await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
      Twiml: twiml,
    });
  }

  /**
   * Play TTS via core TTS and Twilio Media Streams.
   * Generates audio with core TTS (mu-law at 8kHz) and streams it via WebSocket.
   * Playback is submitted through the handler's TTS queue, which supplies an
   * abort signal that stops the stream early.
   *
   * @throws Error if the TTS provider or media stream handler is not set.
   */
  private async playTtsViaStream(text: string, streamSid: string): Promise<void> {
    if (!this.ttsProvider || !this.mediaStreamHandler) {
      throw new Error("TTS provider and media stream handler required");
    }

    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;
    const CHUNK_DELAY_MS = 20;

    const handler = this.mediaStreamHandler;
    const ttsProvider = this.ttsProvider;
    await handler.queueTts(streamSid, async (signal) => {
      // Generate audio with core TTS (returns mu-law at 8kHz)
      const muLawAudio = await ttsProvider.synthesizeForTelephony(text);
      for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
        if (signal.aborted) {
          break;
        }
        handler.sendAudio(streamSid, chunk);

        // Pace the audio to match real-time playback
        await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
        if (signal.aborted) {
          break;
        }
      }

      if (!signal.aborted) {
        // Send a mark to track when audio finishes
        handler.sendMark(streamSid, `tts-${Date.now()}`);
      }
    });
  }

  /**
   * Start listening for speech via Twilio <Gather>.
   *
   * The gather result is posted back to the call's webhook URL, with the
   * `turnToken` (when given) added as a query parameter.
   *
   * @throws Error if no webhook URL is known for the call.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error("Missing webhook URL for this call (provider state not initialized)");
    }

    const actionUrl = new URL(webhookUrl);
    if (input.turnToken) {
      actionUrl.searchParams.set("turnToken", input.turnToken);
    }

    // The language is caller-supplied, so it is escaped before being placed
    // in an XML attribute (like the action URL).
    const language = escapeXml(input.language || "en-US");
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Gather input="speech" speechTimeout="auto" language="${language}" action="${escapeXml(actionUrl.toString())}" method="POST">
  </Gather>
</Response>`;

    await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
      Twiml: twiml,
    });
  }

  /**
   * Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
   */
  async stopListening(_input: StopListeningInput): Promise<void> {
    // Twilio's <Gather> automatically stops on speech end
    // No explicit action needed
  }
}
|
||||
|
||||
// -----------------------------------------------------------------------------
// Twilio-specific types
// -----------------------------------------------------------------------------

/**
 * Fields read from the Twilio call resource returned when a call is created.
 * All fields are strings as delivered in the JSON response.
 */
interface TwilioCallResponse {
  /** Twilio call SID; used as the provider call ID. */
  sid: string;
  /** Call status reported by Twilio (e.g. "queued"). */
  status: string;
  direction: string;
  from: string;
  to: string;
  uri: string;
}
|
||||
42
openclaw/extensions/voice-call/src/providers/twilio/api.ts
Normal file
42
openclaw/extensions/voice-call/src/providers/twilio/api.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
/**
 * POST a form-encoded request to the Twilio REST API using HTTP Basic auth.
 *
 * @param params.baseUrl - Account-scoped API base URL.
 * @param params.accountSid - Account SID (Basic auth user).
 * @param params.authToken - Auth token (Basic auth password).
 * @param params.endpoint - Path appended to `baseUrl`.
 * @param params.body - Form fields; for a plain record, array values are sent
 *   as repeated keys and non-string scalar values are skipped.
 * @param params.allowNotFound - When true, a 404 response resolves to
 *   `undefined` instead of throwing.
 * @returns The parsed JSON body, or `undefined` for an empty body.
 * @throws Error with the status and response text for any other non-OK response.
 */
export async function twilioApiRequest<T = unknown>(params: {
  baseUrl: string;
  accountSid: string;
  authToken: string;
  endpoint: string;
  body: URLSearchParams | Record<string, string | string[]>;
  allowNotFound?: boolean;
}): Promise<T> {
  const { baseUrl, accountSid, authToken, endpoint, body, allowNotFound } = params;

  // Normalize the body into URLSearchParams.
  let form: URLSearchParams;
  if (body instanceof URLSearchParams) {
    form = body;
  } else {
    form = new URLSearchParams();
    for (const [key, value] of Object.entries(body)) {
      if (Array.isArray(value)) {
        for (const entry of value) {
          form.append(key, entry);
        }
      } else if (typeof value === "string") {
        form.append(key, value);
      }
    }
  }

  const basicCredentials = Buffer.from(`${accountSid}:${authToken}`).toString("base64");
  const response = await fetch(`${baseUrl}${endpoint}`, {
    method: "POST",
    headers: {
      Authorization: `Basic ${basicCredentials}`,
      "Content-Type": "application/x-www-form-urlencoded",
    },
    body: form,
  });

  if (response.ok) {
    const payload = await response.text();
    return payload ? (JSON.parse(payload) as T) : (undefined as T);
  }

  if (allowNotFound && response.status === 404) {
    return undefined as T;
  }

  const errorText = await response.text();
  throw new Error(`Twilio API error: ${response.status} ${errorText}`);
}
|
||||
@@ -0,0 +1,34 @@
|
||||
import type { WebhookContext, WebhookVerificationResult } from "../../types.js";
|
||||
import { verifyTwilioWebhook } from "../../webhook-security.js";
|
||||
import type { TwilioProviderOptions } from "../twilio.js";
|
||||
|
||||
/**
 * Verify an incoming Twilio webhook on behalf of the Twilio provider.
 *
 * Maps the provider options onto the option bag accepted by
 * `verifyTwilioWebhook`, logs a warning when verification fails, and returns
 * the result narrowed to the `WebhookVerificationResult` fields.
 *
 * @param params.ctx - Webhook request context; its `remoteAddress` is passed
 *   along as the remote IP.
 * @param params.authToken - Auth token handed to the verifier.
 * @param params.currentPublicUrl - Public URL override; any falsy value
 *   (including `null` and the empty string) is passed as `undefined`.
 * @param params.options - Provider options supplying the security settings.
 * @returns `ok`, `reason`, `isReplay` and `verifiedRequestKey` copied from the
 *   verifier result. The verification URL is logged but not returned.
 */
export function verifyTwilioProviderWebhook(params: {
  ctx: WebhookContext;
  authToken: string;
  currentPublicUrl?: string | null;
  options: TwilioProviderOptions;
}): WebhookVerificationResult {
  const { ctx, authToken, currentPublicUrl, options } = params;
  const security = options.webhookSecurity;

  const outcome = verifyTwilioWebhook(ctx, authToken, {
    publicUrl: currentPublicUrl || undefined,
    // The bypass stays off unless the options enable it explicitly.
    allowNgrokFreeTierLoopbackBypass: options.allowNgrokFreeTierLoopbackBypass ?? false,
    skipVerification: options.skipVerification,
    allowedHosts: security?.allowedHosts,
    trustForwardingHeaders: security?.trustForwardingHeaders,
    trustedProxyIPs: security?.trustedProxyIPs,
    remoteIP: ctx.remoteAddress,
  });

  if (!outcome.ok) {
    console.warn(`[twilio] Webhook verification failed: ${outcome.reason}`);
    // The URL is only logged when the verifier supplied one.
    if (outcome.verificationUrl) {
      console.warn(`[twilio] Verification URL: ${outcome.verificationUrl}`);
    }
  }

  const { ok, reason, isReplay, verifiedRequestKey } = outcome;
  return { ok, reason, isReplay, verifiedRequestKey };
}
|
||||
Reference in New Issue
Block a user