// make_call.ts • 13.4 kB
import Fastify from 'fastify';
import WebSocket from 'ws';
import dotenv from 'dotenv';
import fastifyFormBody from '@fastify/formbody';
import fastifyWs from '@fastify/websocket';
import twilio from 'twilio';
// Populate process.env from the local .env file before any setting is read.
dotenv.config();

/** Prints a fatal configuration message and ends the process with exit code 1. */
function abortStartup(reason: string): never {
  console.error(reason);
  process.exit(1);
}

// Secrets and phone numbers used throughout this module.
const {
  OPENAI_API_KEY,
  TWILIO_ACCOUNT_SID,
  TWILIO_AUTH_TOKEN,
  TWILIO_PHONE_NUMBER,
  TO_PHONE_NUMBER,
} = process.env;

// The OpenAI key is validated first, so its message wins when several values are missing.
if (!OPENAI_API_KEY) {
  abortStartup('Missing OpenAI API key. Please set it in the .env file.');
}

// All four Twilio settings must be present (non-empty) to continue.
if (!(TWILIO_ACCOUNT_SID && TWILIO_AUTH_TOKEN && TWILIO_PHONE_NUMBER && TO_PHONE_NUMBER)) {
  abortStartup('Missing Twilio credentials. Please set TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN, TWILIO_PHONE_NUMBER, and TO_PHONE_NUMBER in the .env file.');
}

// Twilio REST client used to place outbound calls.
const twilioClient = twilio(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN);

// HTTP server instance, with form-body parsing and WebSocket support registered in that order.
const fastify = Fastify();
fastify.register(fastifyFormBody);
fastify.register(fastifyWs);
// Constants

/** Instructions sent to the model; scripts the assistant's side of the restaurant call. */
const SYSTEM_MESSAGE = `
You are an AI assistant calling a restaurant to order food for someone. Your job is to choose a meal based on their likes and dislikes.
Steps:
1. Say: "Hi, I'm calling to place a takeout order. What's on your menu today?"
2. Listen to what food is available. Ask for examples if needed.
3. Choose a dish that matches these preferences:
- Likes: spicy food, chicken, noodles, garlic, crispy texture
- Dislikes: mushrooms, seafood, dairy, sweet items, soggy texture
4. Ask questions if you're not sure what's in a dish.
5. Pick 1–2 good options, and ask for changes if needed (e.g. no mushrooms).
6. Confirm the order and pickup time.
7. End the call politely: "Thanks! That’s everything. Have a great day."
Be friendly and helpful. Make sure the food fits the customer's taste.
`;

/** Voice name placed in the session configuration sent to OpenAI. */
const VOICE = 'alloy';

/** Port used when the PORT environment variable is unset, empty, or unusable. */
const DEFAULT_PORT = 5050;

/**
 * Turns a raw PORT setting into a usable TCP port number.
 *
 * @param raw - Raw value, typically `process.env.PORT`; unset or empty means "use the fallback".
 * @param fallback - Port returned when `raw` is missing or does not parse to an integer in 0..65535.
 * @returns The parsed port, or `fallback` (after a warning) when `raw` is invalid.
 */
function resolvePort(raw: string | undefined, fallback: number = DEFAULT_PORT): number {
  const parsed = parseInt(raw || String(fallback), 10);
  if (Number.isInteger(parsed) && parsed >= 0 && parsed <= 65535) {
    return parsed;
  }
  // Without this check a malformed PORT would reach listen() as NaN or an out-of-range number.
  console.warn(`Invalid PORT value "${raw}"; falling back to ${fallback}.`);
  return fallback;
}

const PORT = resolvePort(process.env.PORT); // Allow dynamic port assignment

// List of Event Types to log to the console. See the OpenAI Realtime API Documentation: https://platform.openai.com/docs/api-reference/realtime
const LOG_EVENT_TYPES = [
  'error',
  'response.content.done',
  'rate_limits.updated',
  'response.done',
  'input_audio_buffer.committed',
  'input_audio_buffer.speech_stopped',
  'input_audio_buffer.speech_started',
  'session.created'
];

// Show AI response elapsed timing calculations
const SHOW_TIMING_MATH = false;
/**
 * Places an outbound phone call through Twilio and points it at this server's webhooks.
 *
 * Twilio is given `<serverUrl>/incoming-call` as the call URL and `<serverUrl>/call-status`
 * as the status callback (POST) for the 'initiated', 'ringing', 'answered' and 'completed' events.
 *
 * @param toNumber - Number to dial; when omitted or empty, TO_PHONE_NUMBER from the environment is used.
 * @param fromNumber - Twilio number to call from; when omitted or empty, TWILIO_PHONE_NUMBER from the environment is used.
 * @param serverUrl - Base URL of this server as reachable by Twilio; when omitted or empty, a hard-coded
 *   ngrok tunnel URL is used. Trailing slashes are ignored.
 * @returns The call object resolved by `twilioClient.calls.create` (its `sid` and `status` are logged here).
 * @throws Re-throws, after logging, any error raised while creating the call.
 */
export async function makeAICall(
  toNumber?: string,
  fromNumber?: string,
  serverUrl?: string
): Promise<any> {
  try {
    // `||` (not `??`) on purpose: an empty string should also fall back to the configured value.
    const targetNumber = toNumber || TO_PHONE_NUMBER!;
    const sourceNumber = fromNumber || TWILIO_PHONE_NUMBER!;

    // NOTE(review): the default looks like a temporary tunnel address — confirm it is still valid.
    // Trailing slashes are stripped so the webhook paths never become `//incoming-call`.
    const baseUrl = (serverUrl || `https://0ff574eb82cf.ngrok-free.app`).replace(/\/+$/, '');

    console.log(`🤖 Initiating AI call to ${targetNumber} from ${sourceNumber}`);
    console.log(`📞 Server URL: ${baseUrl}`);

    const call = await twilioClient.calls.create({
      from: sourceNumber,
      to: targetNumber,
      url: `${baseUrl}/incoming-call`,
      statusCallback: `${baseUrl}/call-status`,
      statusCallbackEvent: ['initiated', 'ringing', 'answered', 'completed'],
      statusCallbackMethod: 'POST'
    });

    console.log(`✅ Call initiated successfully! Call SID: ${call.sid}`);
    console.log(`📱 Call status: ${call.status}`);
    return call;
  } catch (error: unknown) {
    console.error('❌ Error making AI call:', error);
    throw error;
  }
}
/**
 * Starts the HTTP/WebSocket server that handles AI calls.
 *
 * If the underlying server is already bound (this module also calls `fastify.listen` when it is
 * loaded), the function logs a warning and returns instead of attempting a second listen.
 * On any listen failure the error is logged and the process exits with code 1.
 *
 * @param port - Port to listen on; defaults to PORT when omitted. An explicit 0 is passed through.
 */
export async function startAIServer(port?: number): Promise<void> {
  // `??` instead of `||` so that an explicit 0 is not silently replaced by the default.
  const requestedPort = port ?? PORT;

  // A server that is already listening cannot be bound again; treat that as "already started".
  if (fastify.server.listening) {
    console.warn('⚠️ AI Call Server is already listening; ignoring additional start request.');
    return;
  }

  try {
    await fastify.listen({ port: requestedPort });

    // Report the port that was actually bound, which may differ from the request (e.g. port 0).
    const bound = fastify.server.address();
    const serverPort = bound !== null && typeof bound === 'object' ? bound.port : requestedPort;

    console.log(`🚀 AI Call Server is running on port ${serverPort}`);
    console.log(`📞 Webhook URL: http://localhost:${serverPort}/incoming-call`);
    console.log(`🔗 Media Stream URL: ws://localhost:${serverPort}/media-stream`);
  } catch (err: unknown) {
    console.error('❌ Error starting server:', err);
    process.exit(1);
  }
}
// Root route: answers with a small JSON document listing the server's endpoints.
fastify.get('/', async (_request, reply) => {
  const endpoints = {
    incomingCall: '/incoming-call',
    mediaStream: '/media-stream',
    callStatus: '/call-status'
  };
  const payload = {
    message: 'AI Call Server is running!',
    endpoints
  };
  reply.send(payload);
});
/** Escapes the five XML special characters so a value can be placed inside an attribute. */
const escapeXmlAttribute = (value: string): string =>
  value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');

// Route for Twilio to handle incoming calls.
// Responds with TwiML that connects the call's audio to this server's /media-stream WebSocket,
// using the Host header of the request to build the wss:// URL.
fastify.all('/incoming-call', async (request, reply) => {
  const host = request.headers.host;
  if (!host) {
    // Without a Host header the stream URL would read "wss://undefined/media-stream".
    reply.code(400).type('text/plain').send('Missing Host header');
    return;
  }

  // The Host header is client-supplied, so it is escaped before being embedded in the XML.
  const twimlResponse = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<Response>',
    '<Connect>',
    `<Stream url="wss://${escapeXmlAttribute(host)}/media-stream" />`,
    '</Connect>',
    '</Response>'
  ].join('\n');

  reply.type('text/xml').send(twimlResponse);
});
// Route to handle call status updates.
// Logs the CallSid and CallStatus fields of the posted body and acknowledges with { received: true }.
fastify.post('/call-status', async (request, reply) => {
  // A request without a parsed body must not crash the handler, so fall back to an empty object.
  const { CallSid, CallStatus } = (request.body ?? {}) as { CallSid?: string; CallStatus?: string };
  console.log(`📞 Call ${CallSid} status: ${CallStatus}`);
  reply.send({ received: true });
});
// WebSocket route for media-stream.
// Each client connection (the Twilio media stream) gets its own WebSocket to the OpenAI
// Realtime API. Audio payloads from the client are appended to OpenAI's input buffer, and
// audio deltas from OpenAI are sent back to the client as 'media' events.
fastify.register(async (fastify) => {
  fastify.get('/media-stream', { websocket: true }, (connection, req) => {
    console.log('Client connected');

    // Connection-specific state
    let streamSid: string | null = null; // set by the 'start' event
    let latestMediaTimestamp = 0; // timestamp of the most recent 'media' event
    let lastAssistantItem: string | null = null; // item_id of the response currently being played
    let markQueue: string[] = []; // marks sent to the client and not yet echoed back
    let responseStartTimestampTwilio: number | null = null; // media timestamp at the first delta of a response

    const openAiWs = new WebSocket('wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2025-06-03', {
      headers: {
        Authorization: `Bearer ${OPENAI_API_KEY}`,
        "OpenAI-Beta": "realtime=v1"
      }
    });

    // Control initial session with OpenAI: audio formats, voice, instructions and turn detection.
    const initializeSession = () => {
      const sessionUpdate = {
        type: 'session.update',
        session: {
          turn_detection: { type: 'server_vad' },
          input_audio_format: 'g711_ulaw',
          output_audio_format: 'g711_ulaw',
          voice: VOICE,
          instructions: SYSTEM_MESSAGE,
          modalities: ["text", "audio"],
          temperature: 0.8,
        }
      };
      console.log('Sending session update:', JSON.stringify(sessionUpdate));
      openAiWs.send(JSON.stringify(sessionUpdate));
      // Uncomment the following line to have AI speak first:
      // sendInitialConversationItem();
    };

    // Handle interruption when the caller's speech starts: truncate the assistant item at the
    // elapsed playback time, tell the client to clear its buffered audio, and reset tracking state.
    // Nothing happens unless marks are outstanding and a response start time was recorded.
    const handleSpeechStartedEvent = () => {
      if (markQueue.length > 0 && responseStartTimestampTwilio != null) {
        const elapsedTime = latestMediaTimestamp - responseStartTimestampTwilio;
        if (SHOW_TIMING_MATH) console.log(`Calculating elapsed time for truncation: ${latestMediaTimestamp} - ${responseStartTimestampTwilio} = ${elapsedTime}ms`);

        if (lastAssistantItem) {
          const truncateEvent = {
            type: 'conversation.item.truncate',
            item_id: lastAssistantItem,
            content_index: 0,
            audio_end_ms: elapsedTime
          };
          if (SHOW_TIMING_MATH) console.log('Sending truncation event:', JSON.stringify(truncateEvent));
          openAiWs.send(JSON.stringify(truncateEvent));
        }

        connection.send(JSON.stringify({
          event: 'clear',
          streamSid: streamSid
        }));

        // Reset
        markQueue = [];
        lastAssistantItem = null;
        responseStartTimestampTwilio = null;
      }
    };

    // Send mark messages to Media Streams so we know if and when AI response playback is finished.
    // Skipped until a streamSid is known.
    const sendMark = () => {
      if (streamSid) {
        const markEvent = {
          event: 'mark',
          streamSid: streamSid,
          mark: { name: 'responsePart' }
        };
        connection.send(JSON.stringify(markEvent));
        markQueue.push('responsePart');
      }
    };

    // Open event for OpenAI WebSocket: configure the session shortly after connecting.
    openAiWs.on('open', () => {
      console.log('Connected to the OpenAI Realtime API');
      setTimeout(initializeSession, 100);
    });

    // Listen for messages from the OpenAI WebSocket (and send to Twilio if necessary)
    openAiWs.on('message', (data) => {
      try {
        const response = JSON.parse(data.toString());

        if (LOG_EVENT_TYPES.includes(response.type)) {
          console.log(`Received event: ${response.type}`, response);
        }

        if (response.type === 'response.audio.delta' && response.delta) {
          const audioDelta = {
            event: 'media',
            streamSid: streamSid,
            media: { payload: response.delta }
          };
          connection.send(JSON.stringify(audioDelta));

          // First delta from a new response starts the elapsed time counter.
          // Compared against null explicitly: a start time of 0 is valid and must not be overwritten.
          if (responseStartTimestampTwilio === null) {
            responseStartTimestampTwilio = latestMediaTimestamp;
            if (SHOW_TIMING_MATH) console.log(`Setting start timestamp for new response: ${responseStartTimestampTwilio}ms`);
          }

          if (response.item_id) {
            lastAssistantItem = response.item_id;
          }

          sendMark();
        }

        if (response.type === 'input_audio_buffer.speech_started') {
          handleSpeechStartedEvent();
        }
      } catch (error: unknown) {
        console.error('Error processing OpenAI message:', error, 'Raw message:', data);
      }
    });

    // Handle incoming messages from Twilio
    connection.on('message', (message) => {
      try {
        const data = JSON.parse(message.toString());

        switch (data.event) {
          case 'media':
            // Coerce to a number: the value comes straight from JSON and may be a numeric string.
            latestMediaTimestamp = Number(data.media.timestamp);
            if (SHOW_TIMING_MATH) console.log(`Received media message with timestamp: ${latestMediaTimestamp}ms`);
            // Audio is forwarded only while the OpenAI socket is open; otherwise it is dropped.
            if (openAiWs.readyState === WebSocket.OPEN) {
              const audioAppend = {
                type: 'input_audio_buffer.append',
                audio: data.media.payload
              };
              openAiWs.send(JSON.stringify(audioAppend));
            }
            break;
          case 'start':
            streamSid = data.start.streamSid;
            console.log('Incoming stream has started', streamSid);

            // Reset start and media timestamp on a new stream
            responseStartTimestampTwilio = null;
            latestMediaTimestamp = 0;
            break;
          case 'mark':
            // The client echoed a mark back: one queued response part is accounted for.
            if (markQueue.length > 0) {
              markQueue.shift();
            }
            break;
          default:
            console.log('Received non-media event:', data.event);
            break;
        }
      } catch (error: unknown) {
        console.error('Error parsing message:', error, 'Message:', message);
      }
    });

    // Handle connection close.
    // The OpenAI socket is closed whether it is open or still connecting; otherwise a client that
    // disconnects during the handshake would leave an orphaned OpenAI session behind.
    connection.on('close', () => {
      if (openAiWs.readyState === WebSocket.OPEN || openAiWs.readyState === WebSocket.CONNECTING) {
        openAiWs.close();
      }
      console.log('Client disconnected.');
    });

    // Handle WebSocket close and errors
    openAiWs.on('close', () => {
      console.log('Disconnected from the OpenAI Realtime API');
    });

    openAiWs.on('error', (error) => {
      console.error('Error in the OpenAI WebSocket:', error);
    });
  });
});
// Begin listening on PORT as soon as this module is loaded.
// NOTE(review): startAIServer() also calls fastify.listen() on this same instance — confirm that
// both are not expected to run in the same process.
const reportListenResult = (err: Error | null): void => {
  if (!err) {
    console.log(`Server is listening on port ${PORT}`);
    return;
  }
  // Startup failure: print the error and stop the process.
  console.error(err);
  process.exit(1);
};

fastify.listen({ port: PORT }, reportListenResult);