Skip to main content
Glama
claude-mcp-integration.test.ts (28.7 kB)
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import Anthropic from '@anthropic-ai/sdk';
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

/**
 * Minimal Claude + MCP Integration Tests
 *
 * PURPOSE: Test ONLY what's unique to LLM integration:
 * 1. Can Claude understand user intent and select appropriate tools?
 * 2. Can Claude handle multi-step reasoning?
 * 3. Can Claude handle ambiguous requests appropriately?
 *
 * NOT TESTED HERE (covered in direct-integration.test.ts):
 * - Tool functionality
 * - Conflict detection
 * - Calendar operations
 * - Error handling
 * - Performance
 */

/** A single tool invocation requested by Claude. */
interface ToolCall {
  name: string;
  arguments: Record<string, unknown>;
}

/** Outcome of forwarding one of Claude's tool calls to the MCP server. */
interface ExecutedToolResult {
  toolCall: ToolCall;
  /** Raw MCP result, or `{ error: string }` when the call threw. */
  result: unknown;
  /** False when the call threw or the result was flagged with `isError: true`. */
  success: boolean;
}

/** Everything a test needs to judge one Claude turn. */
interface LLMResponse {
  /** Concatenation of all text blocks in Claude's reply. */
  content: string;
  toolCalls: ToolCall[];
  executedResults: ExecutedToolResult[];
}

/** Price of one million tokens, in USD. */
interface ModelPricing {
  input: number;
  output: number;
}

// Pricing per million tokens (as of 2025)
const MODEL_PRICING: Record<string, ModelPricing> = {
  'claude-haiku-4-5-20251001': { input: 1.00, output: 5.00 },
  'claude-sonnet-4-5-20250929': { input: 3.00, output: 15.00 },
};

// Used when the configured model has no entry in MODEL_PRICING
// (same rates as the Sonnet entry above).
const FALLBACK_PRICING: ModelPricing = { input: 3.00, output: 15.00 };

// Cache writes are billed at 125% of the input price, cache reads at 10%.
const CACHE_WRITE_MULTIPLIER = 1.25;
const CACHE_READ_MULTIPLIER = 0.10;

// Default model for tests - using Haiku 4.5 for cost efficiency
const DEFAULT_MODEL = 'claude-haiku-4-5-20251001';

interface CostTracker {
  model: string;
  totalInputTokens: number;
  totalOutputTokens: number;
  totalCacheWriteTokens: number;
  totalCacheReadTokens: number;
  requestCount: number;
  perTestCosts: Array<{ testName: string; inputTokens: number; outputTokens: number; cost: number }>;
}

/**
 * Returns true when a tool result is an object carrying `isError: true`.
 * Such a call did not throw, but the server flagged it as failed.
 */
function isToolError(result: unknown): boolean {
  return (
    typeof result === 'object' &&
    result !== null &&
    'isError' in result &&
    result.isError === true
  );
}

/** True when Claude called at least one of the named tools. */
function usedAnyTool(response: LLMResponse, toolNames: readonly string[]): boolean {
  return response.toolCalls.some(tc => toolNames.includes(tc.name));
}

/**
 * True when Claude's text reply contains at least one of the phrases.
 * The reply is lower-cased first, so `phrases` must be given in lower case.
 */
function mentionsAny(response: LLMResponse, phrases: readonly string[]): boolean {
  const text = response.content.toLowerCase();
  return phrases.some(phrase => text.includes(phrase));
}

/**
 * Thin bridge between the Anthropic Messages API and an MCP client.
 *
 * For each prompt it advertises the MCP server's tools to Claude, executes
 * whatever tool calls Claude makes (a single round, results are not fed back
 * to the model), and accumulates token usage so the suite can print a cost
 * summary at the end.
 */
class ClaudeMCPClient {
  private readonly anthropic: Anthropic;
  private readonly mcpClient: Client;
  private readonly costTracker: CostTracker;
  private currentTestName = '';

  /**
   * @param apiKey    Anthropic API key.
   * @param mcpClient An already-connected MCP client.
   */
  constructor(apiKey: string, mcpClient: Client) {
    this.anthropic = new Anthropic({ apiKey });
    this.mcpClient = mcpClient;
    this.costTracker = {
      // Resolved once so requests and cost reporting always agree on the model.
      model: process.env.ANTHROPIC_MODEL ?? DEFAULT_MODEL,
      totalInputTokens: 0,
      totalOutputTokens: 0,
      totalCacheWriteTokens: 0,
      totalCacheReadTokens: 0,
      requestCount: 0,
      perTestCosts: []
    };
  }

  /** Sets the test name that subsequent requests are billed to. */
  setCurrentTest(testName: string) {
    this.currentTestName = testName;
  }

  /** Pricing for the tracked model, or the fallback when the model is unknown. */
  private getPricing(): ModelPricing {
    return MODEL_PRICING[this.costTracker.model] ?? FALLBACK_PRICING;
  }

  /**
   * Computes the USD cost of the given token counts for the tracked model.
   *
   * @returns Sum of input, output, cache-write and cache-read cost.
   */
  private calculateCost(
    inputTokens: number,
    outputTokens: number,
    cacheWriteTokens: number = 0,
    cacheReadTokens: number = 0
  ): number {
    const pricing = this.getPricing();
    const perMillion = (tokens: number, price: number): number => (tokens / 1_000_000) * price;

    // Cache write costs 25% more, cache read costs 90% less (10% of base)
    const inputCost = perMillion(inputTokens, pricing.input);
    const outputCost = perMillion(outputTokens, pricing.output);
    const cacheWriteCost = perMillion(cacheWriteTokens, pricing.input) * CACHE_WRITE_MULTIPLIER;
    const cacheReadCost = perMillion(cacheReadTokens, pricing.input) * CACHE_READ_MULTIPLIER;

    return inputCost + outputCost + cacheWriteCost + cacheReadCost;
  }

  /** Adds one request's token usage to the running totals and the current test's entry. */
  private recordUsage(usage: {
    input_tokens?: number | null;
    output_tokens?: number | null;
    cache_creation_input_tokens?: number | null;
    cache_read_input_tokens?: number | null;
  }): void {
    const inputTokens = usage.input_tokens ?? 0;
    const outputTokens = usage.output_tokens ?? 0;
    const cacheWriteTokens = usage.cache_creation_input_tokens ?? 0;
    const cacheReadTokens = usage.cache_read_input_tokens ?? 0;

    this.costTracker.totalInputTokens += inputTokens;
    this.costTracker.totalOutputTokens += outputTokens;
    this.costTracker.totalCacheWriteTokens += cacheWriteTokens;
    this.costTracker.totalCacheReadTokens += cacheReadTokens;
    this.costTracker.requestCount++;

    // Track per-test costs
    if (!this.currentTestName) {
      return;
    }
    const cost = this.calculateCost(inputTokens, outputTokens, cacheWriteTokens, cacheReadTokens);
    const existing = this.costTracker.perTestCosts.find(t => t.testName === this.currentTestName);
    if (existing) {
      existing.inputTokens += inputTokens;
      existing.outputTokens += outputTokens;
      existing.cost += cost;
    } else {
      this.costTracker.perTestCosts.push({
        testName: this.currentTestName,
        inputTokens,
        outputTokens,
        cost
      });
    }
  }

  /**
   * Runs each tool call against the MCP server, in order.
   * A call that throws is captured as a failed result rather than aborting the turn.
   */
  private async executeToolCalls(toolCalls: readonly ToolCall[]): Promise<ExecutedToolResult[]> {
    const executedResults: ExecutedToolResult[] = [];
    for (const toolCall of toolCalls) {
      try {
        const result: unknown = await this.mcpClient.callTool({
          name: toolCall.name,
          arguments: toolCall.arguments
        });
        // A resolved call can still be a failure if the result is flagged isError.
        executedResults.push({ toolCall, result, success: !isToolError(result) });
      } catch (error) {
        executedResults.push({ toolCall, result: { error: String(error) }, success: false });
      }
    }
    return executedResults;
  }

  /**
   * Sends one user prompt to Claude with the MCP tools attached and executes
   * any tool calls contained in the reply.
   *
   * @param prompt The user message.
   * @returns Claude's text, the tool calls it requested, and their results.
   */
  async sendMessage(prompt: string): Promise<LLMResponse> {
    // Get available tools from MCP server
    const availableTools = await this.mcpClient.listTools();

    // Convert MCP tools to Claude format with prompt caching
    // Adding cache_control to the LAST tool caches ALL tools as a prefix
    const lastIndex = availableTools.tools.length - 1;
    const claudeTools = availableTools.tools.map((tool, index) => {
      const baseTool = {
        name: tool.name,
        description: tool.description,
        input_schema: tool.inputSchema
      };
      return index === lastIndex
        ? { ...baseTool, cache_control: { type: 'ephemeral' as const } }
        : baseTool;
    });

    // Send to Claude
    const message = await this.anthropic.messages.create({
      model: this.costTracker.model,
      max_tokens: 2500,
      tools: claudeTools,
      messages: [{ role: 'user' as const, content: prompt }]
    });

    // Track token usage (including cache metrics)
    this.recordUsage(message.usage);

    // Split the reply into plain text and tool-use requests
    const toolCalls: ToolCall[] = [];
    let textContent = '';
    for (const block of message.content) {
      if (block.type === 'text') {
        textContent += block.text;
      } else if (block.type === 'tool_use') {
        toolCalls.push({
          name: block.name,
          arguments: block.input as Record<string, unknown>
        });
      }
    }

    const executedResults = await this.executeToolCalls(toolCalls);

    return { content: textContent, toolCalls, executedResults };
  }

  /**
   * Builds a printable report of token usage and cost across all requests,
   * including a per-test breakdown and the estimated savings from prompt caching.
   */
  getCostSummary(): string {
    const tracker = this.costTracker;
    const pricing = this.getPricing();
    const totalCost = this.calculateCost(
      tracker.totalInputTokens,
      tracker.totalOutputTokens,
      tracker.totalCacheWriteTokens,
      tracker.totalCacheReadTokens
    );

    // Calculate what cost would have been without caching:
    // every cached token would have been billed as a regular input token.
    const cachedTokens = tracker.totalCacheWriteTokens + tracker.totalCacheReadTokens;
    const totalTokensIfNoCaching = tracker.totalInputTokens + cachedTokens;
    const costWithoutCaching = this.calculateCost(totalTokensIfNoCaching, tracker.totalOutputTokens);
    const savings = costWithoutCaching - totalCost;
    const savingsPercent = costWithoutCaching > 0 ? (savings / costWithoutCaching) * 100 : 0;

    const heavyRule = '='.repeat(70);
    const lightRule = '-'.repeat(70);

    let summary = '\n' + heavyRule + '\n';
    summary += ' CLAUDE API COST SUMMARY\n';
    summary += heavyRule + '\n\n';
    summary += `Model: ${tracker.model}\n`;
    summary += `Pricing: $${pricing.input.toFixed(2)}/MTok input, $${pricing.output.toFixed(2)}/MTok output\n\n`;
    summary += `Total Requests: ${tracker.requestCount}\n`;
    summary += `Input Tokens (uncached): ${tracker.totalInputTokens.toLocaleString()}\n`;
    summary += `Output Tokens: ${tracker.totalOutputTokens.toLocaleString()}\n`;

    // Cache statistics
    summary += '\n--- Cache Statistics ---\n';
    summary += `Cache Write Tokens: ${tracker.totalCacheWriteTokens.toLocaleString()}\n`;
    summary += `Cache Read Tokens: ${tracker.totalCacheReadTokens.toLocaleString()}\n`;
    // Share of cached tokens that were served from cache rather than written to it.
    const cacheHitRate = cachedTokens > 0 ? (tracker.totalCacheReadTokens / cachedTokens) * 100 : 0;
    summary += `Cache Hit Rate: ${cacheHitRate.toFixed(1)}%\n`;

    summary += '\n' + lightRule + '\n';
    summary += 'Per-Test Breakdown:\n';
    summary += lightRule + '\n';

    for (const test of tracker.perTestCosts) {
      // Truncate long names so the cost column stays aligned.
      const shortName = test.testName.length > 50
        ? test.testName.substring(0, 47) + '...'
        : test.testName;
      summary += `${shortName.padEnd(52)} $${test.cost.toFixed(4)}\n`;
    }

    summary += lightRule + '\n';
    summary += `${'TOTAL COST'.padEnd(52)} $${totalCost.toFixed(4)}\n`;
    if (savings > 0) {
      summary += `${'Cost without caching'.padEnd(52)} $${costWithoutCaching.toFixed(4)}\n`;
      summary += `${'SAVINGS FROM CACHING'.padEnd(52)} $${savings.toFixed(4)} (${savingsPercent.toFixed(1)}%)\n`;
    }
    summary += heavyRule + '\n';

    return summary;
  }
}

describe('Claude + MCP Essential Tests', () => {
  let mcpClient: Client;
  let claudeClient: ClaudeMCPClient;

  beforeAll(async () => {
    // Fail fast on a missing key, before a server process is spawned.
    const apiKey = process.env.CLAUDE_API_KEY;
    if (!apiKey) {
      throw new Error('CLAUDE_API_KEY not set');
    }

    // Start MCP server with the current environment (undefined entries dropped)
    const cleanEnv = Object.fromEntries(
      Object.entries(process.env).filter(([_, value]) => value !== undefined)
    ) as Record<string, string>;
    cleanEnv.NODE_ENV = 'test';

    // Create MCP client
    mcpClient = new Client({
      name: "minimal-test-client",
      version: "1.0.0"
    }, {
      capabilities: { tools: {} }
    });

    // Connect to server
    const transport = new StdioClientTransport({
      command: 'node',
      args: ['build/index.js'],
      env: cleanEnv
    });
    await mcpClient.connect(transport);

    // Initialize Claude client
    claudeClient = new ClaudeMCPClient(apiKey, mcpClient);

    // Verify connection
    const tools = await mcpClient.listTools();
    console.log(`Connected to MCP with ${tools.tools.length} tools available`);
  }, 30000);

  beforeEach((context) => {
    if (claudeClient) {
      claudeClient.setCurrentTest(context.task.name);
    }
  });

  afterAll(async () => {
    try {
      if (claudeClient) {
        console.log(claudeClient.getCostSummary());
      }
    } finally {
      // Always shut the server down, even if building the summary failed.
      if (mcpClient) await mcpClient.close();
    }
  }, 10000);

  describe('Core LLM Capabilities', () => {
    it('should select appropriate tools for user intent', async () => {
      // Synonyms to make intent detection more robust against LLM response variation
      const intentSynonyms: Record<string, string[]> = {
        'create': ['create', 'schedule', 'book', 'add', 'set up', 'meeting', 'event'],
        'search': ['search', 'find', 'look', 'meetings', 'sarah', 'results'],
        'availability': ['availability', 'free', 'busy', 'available', 'afternoon', 'schedule']
      };

      const testCases = [
        {
          intent: 'create',
          prompt: 'Schedule a meeting tomorrow at 3 PM',
          expectedTools: ['create-event', 'get-current-time']
        },
        {
          intent: 'search',
          prompt: 'Find my meetings with Sarah',
          expectedTools: ['search-events', 'list-events', 'get-current-time']
        },
        {
          intent: 'availability',
          prompt: 'Am I free tomorrow afternoon?',
          expectedTools: ['get-freebusy', 'list-events', 'get-current-time']
        }
      ];

      for (const test of testCases) {
        const response = await claudeClient.sendMessage(test.prompt);

        // Check if Claude used one of the expected tools
        const usedExpectedTool = usedAnyTool(response, test.expectedTools);

        // Or at least understood the intent in its response (check synonyms)
        const synonyms = intentSynonyms[test.intent] ?? [test.intent];
        const mentionedIntent = mentionsAny(response, synonyms);

        const understoodIntent = usedExpectedTool || mentionedIntent;
        expect(understoodIntent, `prompt: "${test.prompt}"`).toBe(true);
      }
    }, 60000);

    it('should handle multi-step requests', async () => {
      const response = await claudeClient.sendMessage(
        'What time is it now, and do I have any meetings in the next 2 hours?'
      );

      // This requires multiple tool calls or understanding multiple parts.
      // Calling both get-current-time and list-events is already covered by
      // the "more than one tool call" check.
      const usedMultipleTools = response.toolCalls.length > 1;
      const addressedBothParts =
        mentionsAny(response, ['time']) && mentionsAny(response, ['meeting']);

      const handledMultiStep = usedMultipleTools || addressedBothParts;
      expect(handledMultiStep).toBe(true);
    }, 30000);

    it('should handle ambiguous requests gracefully', async () => {
      const response = await claudeClient.sendMessage(
        'Set up the usual'
      );

      // Claude should either:
      // 1. Ask for clarification
      // 2. Make a reasonable attempt with available context
      // 3. Explain what information is needed
      const handledGracefully =
        mentionsAny(response, ['what', 'specify', 'usual', 'more']) ||
        response.toolCalls.length > 0; // Or attempts something

      expect(handledGracefully).toBe(true);
    }, 30000);
  });

  describe('Tool Selection Accuracy', () => {
    it('should distinguish between list and search operations', async () => {
      // Specific search should use search-events
      const searchResponse = await claudeClient.sendMessage(
        'Find meetings about project alpha'
      );
      const usedSearch =
        usedAnyTool(searchResponse, ['search-events']) ||
        mentionsAny(searchResponse, ['search']);

      // General list should use list-events
      const listResponse = await claudeClient.sendMessage(
        'Show me tomorrow\'s schedule'
      );
      const usedList =
        usedAnyTool(listResponse, ['list-events']) ||
        mentionsAny(listResponse, ['tomorrow']);

      // At least one should be correct
      expect(usedSearch || usedList).toBe(true);
    }, 30000);

    it('should understand when NOT to use tools', async () => {
      const response = await claudeClient.sendMessage(
        'How does Google Calendar handle recurring events?'
      );

      // This is a question about calendars, not a calendar operation
      // Claude should either:
      // 1. Not use tools and explain
      // 2. Use minimal tools (like list-calendars) to provide context
      const usedNoTools = response.toolCalls.length === 0;
      const onlyListedCalendars =
        response.toolCalls.length === 1 &&
        response.toolCalls[0]?.name === 'list-calendars';
      const explainedRecurring = mentionsAny(response, ['recurring']);

      const appropriateResponse = usedNoTools || onlyListedCalendars || explainedRecurring;
      expect(appropriateResponse).toBe(true);
    }, 30000);
  });

  describe('Context Understanding', () => {
    it('should understand relative time expressions', async () => {
      const testPhrases = [
        'tomorrow at 2 PM',
        'next Monday',
        'in 30 minutes'
      ];

      for (const phrase of testPhrases) {
        const response = await claudeClient.sendMessage(
          `Schedule a meeting ${phrase}`
        );

        // First word of the phrase ("tomorrow", "next", "in") is the text cue.
        const [firstWord = phrase] = phrase.split(' ');

        // Claude should either get current time or attempt to create an event
        const understoodTime =
          usedAnyTool(response, ['get-current-time', 'create-event']) ||
          mentionsAny(response, [firstWord]); // References the time

        expect(understoodTime, `phrase: "${phrase}"`).toBe(true);
      }
    }, 60000);
  });

  describe('Multi-Calendar Coordination', () => {
    // These tests verify Claude's ability to handle realistic multi-calendar scenarios
    // that require coordinating across multiple calendars and accounts

    it('should coordinate availability across work and personal calendars', async () => {
      const response = await claudeClient.sendMessage(
        `I need to find a 2-hour block tomorrow where I'm free on both my work calendar and my personal calendar. Can you check my availability across both?`
      );

      // Claude should understand this requires checking multiple calendars
      const understoodMultiCalendar =
        usedAnyTool(response, ['get-freebusy', 'list-events', 'list-calendars']) ||
        mentionsAny(response, ['calendar', 'availability']);

      // Should reference checking multiple sources or ask which calendars
      const handledMultipleCalendars =
        understoodMultiCalendar ||
        mentionsAny(response, ['both', 'work', 'personal']);

      expect(handledMultipleCalendars).toBe(true);
    }, 45000);

    it('should find mutual availability for group meeting', async () => {
      const response = await claudeClient.sendMessage(
        `I need to schedule a team meeting for tomorrow afternoon. Can you find a 1-hour slot where all team calendars are free? Check the primary calendars from both my work and personal accounts.`
      );

      // Claude should use freebusy or list-events to check availability
      const usedAvailabilityTools = usedAnyTool(response, ['get-freebusy', 'list-events']);

      // Or understood the multi-person coordination need
      const understoodCoordination =
        usedAvailabilityTools ||
        mentionsAny(response, ['availability', 'free', 'slot', 'team']);

      expect(understoodCoordination).toBe(true);
    }, 45000);

    it('should handle cross-calendar conflict detection', async () => {
      const response = await claudeClient.sendMessage(
        `I want to schedule a dentist appointment on my personal calendar next Tuesday at 10 AM, but first check if I have any conflicts on my work calendar at that time.`
      );

      // Claude should check for conflicts before creating:
      // should use conflict detection or availability checking tools
      const checkedConflicts =
        usedAnyTool(response, ['get-freebusy', 'list-events']) ||
        mentionsAny(response, ['conflict', 'check']);

      expect(checkedConflicts).toBe(true);
    }, 45000);

    it('should understand calendar-specific event creation', async () => {
      const response = await claudeClient.sendMessage(
        `Schedule "Doctor Visit" on my personal calendar (not work) for next Friday at 2 PM. Make it a 1-hour appointment.`
      );

      // Claude should understand to target a specific calendar
      const attemptedCreate = usedAnyTool(response, ['create-event', 'list-calendars']);

      // Or asked for clarification about which calendar
      const understoodCalendarTarget =
        attemptedCreate ||
        mentionsAny(response, ['personal', 'calendar', 'which']);

      expect(understoodCalendarTarget).toBe(true);
    }, 45000);

    it('should plan complex multi-step scheduling workflow', async () => {
      const response = await claudeClient.sendMessage(
        `I need to block off time for a trip next week. Please: 1. First show me what meetings I have scheduled next week across all calendars 2. Then identify any meetings I'll need to reschedule 3. Create an "Out of Office" event on my primary calendar for Monday-Wednesday`
      );

      // This requires multi-step reasoning.
      // Should attempt to list events or understand the multi-step nature
      const handledMultiStep =
        usedAnyTool(response, ['list-events', 'list-calendars']) ||
        mentionsAny(response, ['step', 'first', 'next week']) ||
        response.toolCalls.length >= 1;

      expect(handledMultiStep).toBe(true);
    }, 45000);

    it('should suggest using account parameter for disambiguation', async () => {
      const response = await claudeClient.sendMessage(
        `I have multiple Google accounts connected. Schedule a meeting on the calendar from my work account specifically, not my personal one.`
      );

      // Claude should understand account disambiguation
      const usedAccountAwareTool = response.toolCalls.some(tc =>
        tc.arguments?.account !== undefined ||
        tc.name === 'list-calendars'
      );
      const understoodAccountContext =
        usedAccountAwareTool ||
        mentionsAny(response, ['account', 'work', 'which']);

      expect(understoodAccountContext).toBe(true);
    }, 45000);

    it('should check shared calendar availability', async () => {
      const response = await claudeClient.sendMessage(
        `Check if the team shared calendar has any events scheduled for tomorrow afternoon. I want to book the conference room if it's available.`
      );

      // Should query events or freebusy for shared calendar
      const checkedSharedCalendar =
        usedAnyTool(response, ['list-events', 'get-freebusy', 'list-calendars']) ||
        mentionsAny(response, ['shared', 'team', 'available']);

      expect(checkedSharedCalendar).toBe(true);
    }, 45000);

    it('should handle "block time on all calendars" request', async () => {
      const response = await claudeClient.sendMessage(
        `I need to block off 9 AM to 12 PM tomorrow for deep work. Create "Focus Time - Do Not Disturb" events on ALL my calendars so people see I'm busy regardless of which calendar they check.`
      );

      // Should understand the need to create on multiple calendars
      const understoodMultiCalendarCreate =
        usedAnyTool(response, ['create-event', 'list-calendars']) ||
        mentionsAny(response, ['all', 'multiple', 'calendars']);

      expect(understoodMultiCalendarCreate).toBe(true);
    }, 45000);
  });

  describe('Real-World Scheduling Scenarios', () => {
    // These test realistic prompts that users might actually send

    it('should handle "when can we meet" style requests', async () => {
      const response = await claudeClient.sendMessage(
        `When am I free for a 30-minute call sometime this week? Prefer afternoon slots.`
      );

      const handledAvailabilityRequest =
        usedAnyTool(response, ['get-freebusy', 'list-events', 'get-current-time']) ||
        mentionsAny(response, ['free', 'available', 'afternoon']);

      expect(handledAvailabilityRequest).toBe(true);
    }, 45000);

    it('should handle rescheduling with constraints', async () => {
      const response = await claudeClient.sendMessage(
        `I need to move my 2 PM meeting today to sometime tomorrow, but it can't conflict with my existing appointments. Find me an open slot.`
      );

      // Should check availability and understand rescheduling
      const handledReschedule =
        usedAnyTool(response, [
          'list-events',
          'get-freebusy',
          'search-events',
          'get-current-time',
          'update-event'
        ]) ||
        mentionsAny(response, ['reschedul', 'move', 'conflict', 'meeting', 'tomorrow', 'slot']);

      expect(handledReschedule).toBe(true);
    }, 45000);

    it('should understand recurring meeting context', async () => {
      const response = await claudeClient.sendMessage(
        `I have a weekly team standup that I need to skip this week only. Can you help me handle just this occurrence without affecting future meetings?`
      );

      // Should understand recurring event modification scope
      const understoodRecurring =
        usedAnyTool(response, ['search-events', 'list-events', 'update-event', 'delete-event']) ||
        mentionsAny(response, ['recurring', 'this week only', 'occurrence', 'instance']);

      expect(understoodRecurring).toBe(true);
    }, 45000);

    it('should coordinate external attendee scheduling', async () => {
      const response = await claudeClient.sendMessage(
        `Schedule a meeting with an external client next week. I need to check my availability first, then create the meeting with them as an attendee. Their email is client@example.com.`
      );

      // Should handle multi-step external attendee flow
      const handledExternalAttendee =
        usedAnyTool(response, ['list-events', 'get-freebusy', 'create-event']) ||
        mentionsAny(response, ['attendee', 'client', 'availability']);

      expect(handledExternalAttendee).toBe(true);
    }, 45000);
  });
});

/**
 * Test Categories:
 *
 * Core LLM Capabilities:
 * ✅ Tool selection for different intents (create, search, availability)
 * ✅ Multi-step request handling (LLM reasoning)
 * ✅ Ambiguous request handling (LLM robustness)
 * ✅ Context understanding - relative time expressions
 * ✅ Knowing when NOT to use tools (LLM judgment)
 *
 * Multi-Calendar Coordination (NEW):
 * ✅ Cross-calendar availability checking (work + personal)
 * ✅ Mutual availability for group meetings
 * ✅ Cross-calendar conflict detection
 * ✅ Calendar-specific event creation
 * ✅ Multi-step scheduling workflows
 * ✅ Account parameter disambiguation
 * ✅ Shared calendar operations
 * ✅ Block time across all calendars
 *
 * Real-World Scenarios (NEW):
 * ✅ "When can we meet" style requests
 * ✅ Rescheduling with constraints
 * ✅ Recurring meeting modifications (single occurrence)
 * ✅ External attendee coordination
 *
 * What's NOT tested here (covered in direct-integration.test.ts):
 * ✂️ Tool functionality and response formats
 * ✂️ API error handling
 * ✂️ Performance benchmarks
 * ✂️ Data validation
 */

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nspady/google-calendar-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.