Code Executor MCP Server

sampling-executor-integration.test.ts•19.8 KiB

/**
 * Integration tests for LLM sampling (`llm.ask` / `llm.think`) exposed to
 * sandboxed TypeScript and Python code.
 *
 * Provider HTTP endpoints are mocked with nock; no real network traffic is
 * expected. Environment variables touched by these tests are snapshotted
 * before each test and restored afterwards so that one test's provider
 * selection cannot leak into the next.
 */
import { describe, it, expect, beforeAll, beforeEach, afterEach, vi } from 'vitest';
import { executeTypescriptInSandbox } from '../src/executors/sandbox-executor.js';
import { executePythonInSandbox } from '../src/executors/pyodide-executor.js';
import { MCPClientPool } from '../src/mcp/client-pool.js';
import { initConfig } from '../src/config/loader.js';
import nock from 'nock';

let anthropicScope: nock.Scope;

/** Environment variables that tests in this file set or delete. */
const MANAGED_ENV_KEYS = [
  'ANTHROPIC_API_KEY',
  'GEMINI_API_KEY',
  'OPENAI_API_KEY',
  'CODE_EXECUTOR_SAMPLING_ENABLED',
  'CODE_EXECUTOR_AI_PROVIDER',
] as const;

/** Values of MANAGED_ENV_KEYS captured at the start of the current test. */
let envSnapshot = new Map<string, string | undefined>();

/** Captures the current value (or absence) of every managed env variable. */
function snapshotManagedEnv(): Map<string, string | undefined> {
  const snapshot = new Map<string, string | undefined>();
  for (const key of MANAGED_ENV_KEYS) {
    snapshot.set(key, process.env[key]);
  }
  return snapshot;
}

/** Restores every managed env variable to the value recorded in `snapshot`. */
function restoreManagedEnv(snapshot: Map<string, string | undefined>): void {
  for (const key of MANAGED_ENV_KEYS) {
    const value = snapshot.get(key);
    if (value === undefined) {
      // Assigning `undefined` would store the string "undefined"; delete instead.
      delete process.env[key];
    } else {
      process.env[key] = value;
    }
  }
}

/**
 * Runs `code` in the TypeScript sandbox with sampling enabled, using a freshly
 * initialized client pool that is always disconnected, even when execution throws.
 */
async function executeWithInitializedPool(code: string) {
  const pool = new MCPClientPool();
  await pool.initialize();
  try {
    return await executeTypescriptInSandbox(
      {
        code,
        allowedTools: [],
        timeoutMs: 10000,
        permissions: {},
        enableSampling: true,
        maxSamplingRounds: 5,
        maxSamplingTokens: 1000,
      },
      pool,
      null
    );
  } finally {
    await pool.disconnect();
  }
}

// Initialize config before all tests
beforeAll(async () => {
  await initConfig();
});

// Setup fake timers and HTTP mocking for integration tests
beforeEach(() => {
  vi.useFakeTimers();

  // Snapshot before mutating so afterEach can undo every change made below
  // and inside individual tests.
  envSnapshot = snapshotManagedEnv();

  // Set ANTHROPIC_API_KEY for fallback mode
  process.env.ANTHROPIC_API_KEY = 'test-key-for-integration-tests';

  // Mock Anthropic API HTTP endpoint (for when sampling falls back to direct API)
  anthropicScope = nock('https://api.anthropic.com')
    .persist()
    .post('/v1/messages')
    .reply(200, {
      id: 'msg_integration_test',
      type: 'message',
      role: 'assistant',
      content: [
        {
          type: 'text',
          text: 'Mock Claude response for integration test',
        },
      ],
      model: 'claude-3-5-haiku-20241022',
      stop_reason: 'end_turn',
      usage: {
        input_tokens: 15,
        output_tokens: 25,
      },
    });
});

afterEach(() => {
  vi.useRealTimers();
  vi.clearAllMocks();

  // Clean up nock mocks
  nock.cleanAll();

  // Undo provider/API-key changes so they cannot affect later tests
  restoreManagedEnv(envSnapshot);
});

describe('Sampling Executor Integration', () => {
  let mcpClientPool: MCPClientPool;

  beforeEach(() => {
    mcpClientPool = new MCPClientPool();
  });

  describe('TypeScript Sampling', () => {
    it('should_throwError_when_samplingDisabledAndLlmAskCalled', async () => {
      // RED: This test will fail until TypeScript sampling integration is implemented
      const code = `
        try {
          const result = await llm.ask("Hello, world!");
          console.log(result);
        } catch (error) {
          console.error(error.message);
          throw error;
        }
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 5000,
          enableSampling: false,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      // Should fail because sampling is disabled
      expect(result.success).toBe(false);
      expect(result.error).toContain('Sampling not enabled');
    });

    it('should_returnClaudeResponse_when_llmAskCalled', async () => {
      const code = `
        const response = await llm.ask("What is the capital of France?");
        console.log("Response:", response);
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          enableSampling: true,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      expect(result.success).toBe(true);
      expect(result).toHaveProperty('samplingCalls');
      expect(result.samplingCalls).toBeDefined();
      expect(result.samplingCalls!.length).toBeGreaterThanOrEqual(1);
      expect(result.samplingCalls![0]).toHaveProperty('response');
      expect(result.samplingCalls![0].response.content[0].text).toBe(
        'Mock Claude response for integration test'
      );
    });

    it('should_supportMultiTurn_when_llmThinkCalledWithMessages', async () => {
      const code = `
        const messages = [
          { role: 'user', content: 'Hello' },
          { role: 'assistant', content: 'Hi there!' },
          { role: 'user', content: 'How are you?' }
        ];
        const response = await llm.think({ messages });
        console.log("Multi-turn response:", response);
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          enableSampling: true,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      expect(result.success).toBe(true);
      expect(result.samplingCalls).toBeDefined();
      expect(result.samplingCalls!.length).toBeGreaterThanOrEqual(1);
      expect(result.samplingCalls![0].messages).toHaveLength(3);
      expect(result.samplingCalls![0].response.content[0].text).toBe(
        'Mock Claude response for integration test'
      );
    });

    it('should_enforceRateLimits_when_multipleCallsMade', async () => {
      // 12 calls against a limit of 10 rounds must trip the limiter
      const code = `
        try {
          for (let i = 0; i < 12; i++) {
            const response = await llm.ask(\`Question \${i}\`);
            console.log(\`Call \${i}:\`, response);
          }
        } catch (error) {
          console.error(error.message);
          throw error;
        }
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 30000,
          enableSampling: true,
          maxSamplingRounds: 10,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      // Should fail due to rate limit exceeded
      expect(result.success).toBe(false);
      expect(result.error).toMatch(/Rate limit exceeded/);
    });
  });

  describe('Multi-Provider Model Selection', () => {
    it('should_useGeminiModel_when_providerIsGemini', async () => {
      // Set Gemini provider (restored by the global afterEach)
      process.env.CODE_EXECUTOR_SAMPLING_ENABLED = 'true';
      process.env.CODE_EXECUTOR_AI_PROVIDER = 'gemini';
      process.env.GEMINI_API_KEY = 'test-gemini-key';
      delete process.env.ANTHROPIC_API_KEY;

      // Mock Gemini API endpoint
      const geminiScope = nock('https://generativelanguage.googleapis.com')
        .persist()
        .post(/\/v1beta\/models\/.*:generateContent/)
        .reply(200, {
          candidates: [
            {
              content: { parts: [{ text: 'Gemini response' }] },
              finishReason: 'STOP',
            },
          ],
          usageMetadata: { promptTokenCount: 10, candidatesTokenCount: 5 },
        });

      const code = `
        const response = await llm.ask("Test");
        console.log("Response:", response);
      `;

      const result = await executeWithInitializedPool(code);

      expect(result.success).toBe(true);
      expect(result.samplingCalls).toBeDefined();
      expect(result.samplingCalls?.[0]?.model).toMatch(/gemini/i);

      // Asserts the mocked Gemini endpoint was actually hit
      geminiScope.done();
      nock.cleanAll();
    });

    it('should_useOpenAIModel_when_providerIsOpenAI', async () => {
      // Set OpenAI provider (restored by the global afterEach)
      process.env.CODE_EXECUTOR_SAMPLING_ENABLED = 'true';
      process.env.CODE_EXECUTOR_AI_PROVIDER = 'openai';
      process.env.OPENAI_API_KEY = 'test-openai-key';
      delete process.env.ANTHROPIC_API_KEY;

      // Mock OpenAI API endpoint
      const openaiScope = nock('https://api.openai.com')
        .persist()
        .post('/v1/chat/completions')
        .reply(200, {
          id: 'chatcmpl-test',
          object: 'chat.completion',
          created: Date.now(),
          model: 'gpt-4o-mini',
          choices: [
            {
              index: 0,
              message: { role: 'assistant', content: 'OpenAI response' },
              finish_reason: 'stop',
            },
          ],
          usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 },
        });

      const code = `
        const response = await llm.ask("Test");
        console.log("Response:", response);
      `;

      const result = await executeWithInitializedPool(code);

      expect(result.success).toBe(true);
      expect(result.samplingCalls).toBeDefined();
      expect(result.samplingCalls?.[0]?.model).toMatch(/gpt-4o-mini/i);

      // Asserts the mocked OpenAI endpoint was actually hit
      openaiScope.done();
      nock.cleanAll();
    });

    it('should_notSendModelParam_when_llmAskCalledWithoutModel', async () => {
      // Test that llm.ask doesn't send a model parameter to sampling bridge
      // This allows the bridge to choose provider-specific default
      const code = `
        const response = await llm.ask("Test");
        console.log("Response:", response);
      `;

      const result = await executeWithInitializedPool(code);

      // If llm.ask hardcoded a model, it would fail with Gemini/OpenAI
      // Success means the model parameter was omitted and provider-specific model was used
      expect(result.success).toBe(true);
    });
  });

  describe('Python Sampling', () => {
    // Python tests need real timers (Pyodide async operations don't work with fake timers)
    beforeEach(() => {
      vi.useRealTimers();
    });

    afterEach(() => {
      vi.useFakeTimers(); // Restore fake timers for other tests
    });

    // NOTE: Python snippets start at column 0 so their indentation is valid Python.
    it('should_throwError_when_samplingDisabledAndLlmAskCalled', async () => {
      const code = `
try:
    result = await llm.ask("Hello, world!")
    print(result)
except Exception as error:
    print(f"Error: {error}")
    raise error
`;

      const result = await executePythonInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 5000,
          enableSampling: false,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      // Should fail because sampling is disabled
      expect(result.success).toBe(false);
      expect(result.error).toContain('Sampling not enabled');
    });

    it('should_returnClaudeResponse_when_llmAskCalled', async () => {
      const code = `
response = await llm.ask("What is the capital of France?")
print(f"Response: {response}")
`;

      const result = await executePythonInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          enableSampling: true,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      expect(result.success).toBe(true);
      expect(result).toHaveProperty('samplingCalls');
      expect(result.samplingCalls).toBeDefined();
      expect(result.samplingCalls!.length).toBeGreaterThanOrEqual(1);
      expect(result.samplingCalls![0]).toHaveProperty('response');
      expect(result.samplingCalls![0].response.content[0].text).toBe(
        'Mock Claude response for integration test'
      );
    });

    it('should_supportMultiTurn_when_llmThinkCalledWithMessages', async () => {
      const code = `
messages = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there!"},
    {"role": "user", "content": "How are you?"}
]
response = await llm.think(messages=messages)
print(f"Multi-turn response: {response}")
`;

      const result = await executePythonInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          enableSampling: true,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      expect(result.success).toBe(true);
      expect(result.samplingCalls).toBeDefined();
      expect(result.samplingCalls!.length).toBeGreaterThanOrEqual(1);
      expect(result.samplingCalls![0].messages).toHaveLength(3);
      expect(result.samplingCalls![0].response.content[0].text).toBe(
        'Mock Claude response for integration test'
      );
    });
  });

  describe('Sampling Metadata', () => {
    it('should_returnSamplingMetrics_when_executionCompletes', async () => {
      const code = `
        const response1 = await llm.ask("First question");
        const response2 = await llm.ask("Second question");
        console.log("Completed 2 sampling calls");
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          enableSampling: true,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      expect(result.success).toBe(true);
      expect(result).toHaveProperty('samplingMetrics');
      expect(result.samplingMetrics).toBeDefined();
      expect(result.samplingMetrics!.totalRounds).toBe(2);
      expect(result.samplingMetrics!.totalTokens).toBeGreaterThan(0);
      expect(result.samplingMetrics!.averageTokensPerRound).toBeGreaterThan(0);
    });

    it('should_streamChunks_when_streamingEnabled', async () => {
      // Note: Streaming support will be added in T061
      const code = `
        const response = await llm.ask("Test streaming");
        console.log(response);
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          enableSampling: true,
          streaming: true,
          permissions: { read: [], write: [], net: [] },
        },
        mcpClientPool
      );

      // For now, verify basic functionality works
      // Streaming test will be enhanced when SSE is implemented
      expect(result.success).toBe(true);
      expect(result.samplingCalls).toBeDefined();
    });
  });

  // Additional integration test stubs will be added as implementation progresses

  describe('T085: Sampling Metrics in Execution Result', () => {
    it('should_returnSamplingMetrics_when_executionCompletes', async () => {
      const code = `
        const result = await llm.ask('What is 2+2?');
        console.log('Result:', result);
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          permissions: { read: [], write: [], net: [] },
          enableSampling: true,
          maxSamplingRounds: 5,
          maxSamplingTokens: 5000,
        },
        mcpClientPool
      );

      // Expected to have samplingCalls array
      expect(result.samplingCalls).toBeDefined();
      expect(Array.isArray(result.samplingCalls)).toBe(true);

      // Expected to have samplingMetrics
      expect(result.samplingMetrics).toBeDefined();
      expect(result.samplingMetrics).toHaveProperty('totalRounds');
      expect(result.samplingMetrics).toHaveProperty('totalTokens');
      expect(result.samplingMetrics).toHaveProperty('totalDurationMs');
      expect(result.samplingMetrics).toHaveProperty('averageTokensPerRound');
      expect(result.samplingMetrics).toHaveProperty('quotaRemaining');
    });

    it('should_includeSamplingCallDetails_when_llmInvoked', async () => {
      const code = `
        const result1 = await llm.ask('First question');
        const result2 = await llm.ask('Second question');
        console.log('Done');
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          permissions: { read: [], write: [], net: [] },
          enableSampling: true,
        },
        mcpClientPool
      );

      expect(result.samplingCalls).toBeDefined();
      expect(result.samplingCalls?.length).toBeGreaterThanOrEqual(2);

      // Each sampling call should have required fields
      result.samplingCalls?.forEach(call => {
        expect(call).toHaveProperty('model');
        expect(call).toHaveProperty('messages');
        expect(call).toHaveProperty('response');
        expect(call).toHaveProperty('durationMs');
        expect(call).toHaveProperty('tokensUsed');
        expect(call).toHaveProperty('timestamp');
      });
    });

    it('should_calculateQuotaRemaining_when_metricsReturned', async () => {
      const code = `
        await llm.ask('Test question');
      `;
      const maxRounds = 10;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          permissions: { read: [], write: [], net: [] },
          enableSampling: true,
          maxSamplingRounds: maxRounds,
        },
        mcpClientPool
      );

      expect(result.samplingMetrics).toBeDefined();
      expect(result.samplingMetrics?.totalRounds).toBeLessThanOrEqual(maxRounds);
      expect(result.samplingMetrics?.quotaRemaining.rounds).toBeGreaterThanOrEqual(0);
      expect(result.samplingMetrics?.quotaRemaining.rounds).toBeLessThanOrEqual(maxRounds);
    });

    it('should_omitSamplingMetrics_when_samplingNotUsed', async () => {
      const code = `
        console.log('No LLM calls');
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          permissions: { read: [], write: [], net: [] },
          enableSampling: true,
        },
        mcpClientPool
      );

      // If no sampling calls made, metrics should be undefined or empty
      if (result.samplingMetrics) {
        expect(result.samplingMetrics.totalRounds).toBe(0);
      }
    });
  });

  describe('T086: Docker Detection and Bridge URL', () => {
    it('should_useHostDockerInternal_when_dockerDetected', async () => {
      // Simulate Docker environment
      const originalEnv = process.env.DOCKER_CONTAINER;
      process.env.DOCKER_CONTAINER = 'true';

      const code = `
        // Bridge URL should use host.docker.internal in Docker
        console.log('Running in Docker');
      `;

      try {
        const result = await executeTypescriptInSandbox(
          {
            code,
            allowedTools: [],
            timeoutMs: 10000,
            permissions: { read: [], write: [], net: [] },
            enableSampling: true,
          },
          mcpClientPool
        );

        // Verify execution succeeds in Docker environment
        expect(result.success).toBe(true);
        // Bridge URL should contain host.docker.internal
        // (Implementation will verify this internally)
      } finally {
        // Restore env
        if (originalEnv === undefined) {
          delete process.env.DOCKER_CONTAINER;
        } else {
          process.env.DOCKER_CONTAINER = originalEnv;
        }
      }
    });

    it('should_useLocalhost_when_dockerNotDetected', async () => {
      // Ensure Docker env vars are not set
      const originalContainer = process.env.DOCKER_CONTAINER;
      delete process.env.DOCKER_CONTAINER;

      const code = `
        console.log('Running on host');
      `;

      try {
        const result = await executeTypescriptInSandbox(
          {
            code,
            allowedTools: [],
            timeoutMs: 10000,
            permissions: { read: [], write: [], net: [] },
            enableSampling: true,
          },
          mcpClientPool
        );

        expect(result.success).toBe(true);
        // Bridge URL should use localhost (default)
      } finally {
        // Restore env
        if (originalContainer !== undefined) {
          process.env.DOCKER_CONTAINER = originalContainer;
        }
      }
    });

    it('should_detectDockerEnvFile_when_dotDockerenvExists', async () => {
      // Test simulates checking for /.dockerenv file
      // Actual implementation will check fs.existsSync('/.dockerenv')
      const code = `
        console.log('Docker detection test');
      `;

      const result = await executeTypescriptInSandbox(
        {
          code,
          allowedTools: [],
          timeoutMs: 10000,
          permissions: { read: [], write: [], net: [] },
          enableSampling: true,
        },
        mcpClientPool
      );

      expect(result.success).toBe(true);
    });
  });
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/aberemia24/code-executor-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

sampling-executor-integration.test.ts•19.8 KiB