index.ts (7.01 kB)
import { dirname } from "node:path";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { Metadata } from "@superglue/shared";
import { initializeAIModel } from "@superglue/shared/utils";
import { config } from "dotenv";
import { PlaywrightFetchingStrategy } from "../../packages/core/documentation/strategies/fetching-playwright.js";
import { closeAllPools } from "../../packages/core/tools/strategies/postgres/postgres.js";
import { shutdownSharedHtmlMarkdownPool } from "../../packages/core/utils/html-markdown-pool.js";
import { logMessage } from "../../packages/core/utils/logs.js";
import { loadConfig } from "../tool-evals/config/config-loader.js";
import { ConsoleReporter } from "../tool-evals/reporters/console-reporter.js";
import { CsvReporter } from "../tool-evals/reporters/csv-reporter.js";
import { JsonReporter } from "../tool-evals/reporters/json-reporter.js";
import { MetricsCalculator } from "../tool-evals/services/metrics-calculator.js";
import type { IntegrationConfig } from "../tool-evals/types.js";
import { LlmToolRunner } from "./services/llm-tool-runner.js";

// Load environment variables: use the repo-root .env when running from
// packages/core, otherwise the .env in the current working directory.
const envPath = process.cwd().endsWith('packages/core')
    ? path.join(process.cwd(), '../../.env')
    : path.join(process.cwd(), '.env');
config({ path: envPath });

const PROVIDERS = [
    { name: 'gpt-4.1', envProvider: 'openai', envModel: 'gpt-4.1' },
    { name: 'gpt-5', envProvider: 'openai', envModel: 'gpt-5' },
    { name: 'claude-sonnet-4-5', envProvider: 'anthropic', envModel: 'claude-sonnet-4-5' },
    { name: 'claude-sonnet-4-20250514', envProvider: 'anthropic', envModel: 'claude-sonnet-4-20250514' },
    { name: 'gemini-2.5-flash-lite', envProvider: 'gemini', envModel: 'gemini-2.5-flash-lite' },
    { name: 'gemini-3-pro-preview', envProvider: 'gemini', envModel: 'gemini-3-pro-preview' },
    { name: 'claude-opus-4-5-20251101', envProvider: 'anthropic', envModel: 'claude-opus-4-5-20251101' },
];

function applyEnvironmentVariablesToCredentials(
    integrations: IntegrationConfig[],
    metadata: Metadata
): void {
    for (const integration of integrations) {
        if (!integration.credentials || !integration.id) {
            continue;
        }
        for (const key of Object.keys(integration.credentials)) {
            // Naming convention: "<INTEGRATION_ID>_<KEY>", uppercased, hyphens -> underscores.
            const expectedEnvVarName = `${integration.id.toUpperCase().replace(/-/g, '_')}_${key.toUpperCase()}`;
            const envValue = process.env[expectedEnvVarName];
            if (envValue) {
                integration.credentials[key] = envValue;
            } else {
                logMessage('warn', `Missing credential: ${integration.id}.${key} (${expectedEnvVarName})`, metadata);
            }
        }
        // Special handling for postgres-lego: replace placeholders in urlHost
        if (integration.id === "postgres-lego") {
            integration.urlHost = integration.urlHost
                .replace("<<username>>", integration.credentials.username)
                .replace("<<password>>", integration.credentials.password)
                .replace("<<host>>", integration.credentials.host)
                .replace("<<port>>", integration.credentials.port)
                .replace("<<database>>", integration.credentials.database);
        }
    }
}

async function main(): Promise<void> {
    const metadata = { orgId: "llm-benchmark", userId: "system" };
    logMessage("info", "Starting LLM Benchmark Evaluation...", metadata);

    try {
        const evalConfig = await loadConfig("../../llm-benchmark-evals/llm-benchmark-config.json");
        const enabledTools = evalConfig.enabledTools === 'all'
            ? evalConfig.tools
            : evalConfig.tools.filter(tool => evalConfig.enabledTools.includes(tool.id));

        // Filter integrations to only those used by enabled tools
        const usedIntegrationIds = new Set(
            enabledTools.flatMap(tool => tool.integrationIds)
        );
        const integrations = evalConfig.integrations.filter(integration =>
            usedIntegrationIds.has(integration.id)
        );

        // Apply environment variables to credentials
        applyEnvironmentVariablesToCredentials(integrations, metadata);
        logMessage("info", `Loaded ${integrations.length} integrations, ${enabledTools.length} enabled tools`, metadata);

        const baseDir = dirname(fileURLToPath(import.meta.url));

        for (const provider of PROVIDERS) {
            const providerStartTime = Date.now();
            logMessage("info", `Running evaluation for provider: ${provider.name}`, metadata);

            // Set environment variables for this provider, remembering the
            // originals so they can be restored after the run.
            const originalProvider = process.env.LLM_PROVIDER;
            const originalModel = process.env[`${provider.envProvider.toUpperCase()}_MODEL`];
            process.env.LLM_PROVIDER = provider.envProvider;
            process.env[`${provider.envProvider.toUpperCase()}_MODEL`] = provider.envModel;

            const providerModel = initializeAIModel({ providerEnvVar: 'LLM_PROVIDER', defaultModel: provider.envModel });

            const runner = new LlmToolRunner(metadata, evalConfig.validationLlmConfig);
            const toolAttempts = await runner.runToolsForProvider(
                providerModel,
                provider.name,
                enabledTools,
                integrations
            );

            const metricsCalculator = new MetricsCalculator();
            const metrics = metricsCalculator.calculateMetrics(toolAttempts);

            // Filesystem-safe timestamp at seconds precision, e.g. "2024-01-01T12-00-00"
            const timestamp = new Date().toISOString().split('.')[0].replace(/[:.]/g, '-');
            const providerSafeKey = provider.name.replace(/[^a-zA-Z0-9]/g, '-');

            const csvReporter = new CsvReporter(baseDir, metadata);
            csvReporter.report(`${timestamp}-${providerSafeKey}`, metrics);

            const jsonReporter = new JsonReporter(baseDir, metadata, 1);
            jsonReporter.reportAttempts(`${timestamp}-${providerSafeKey}`, toolAttempts, evalConfig);

            const duration = Date.now() - providerStartTime;
            logMessage("info", `Provider ${provider.name} completed in ${(duration / 1000).toFixed(1)}s`, metadata);

            await new Promise(resolve => setTimeout(resolve, 1000));
            ConsoleReporter.report(metrics, `${timestamp}-${providerSafeKey}`, baseDir);

            // Restore environment variables
            if (originalProvider !== undefined) {
                process.env.LLM_PROVIDER = originalProvider;
            } else {
                // Assigning undefined would store the literal string "undefined";
                // delete the entry instead.
                delete process.env.LLM_PROVIDER;
            }
            if (originalModel !== undefined) {
                process.env[`${provider.envProvider.toUpperCase()}_MODEL`] = originalModel;
            } else {
                delete process.env[`${provider.envProvider.toUpperCase()}_MODEL`];
            }
        }

        logMessage("info", "LLM Benchmark Evaluation Completed", metadata);
    } catch (error) {
        const message = error instanceof Error ? error.stack || error.message : String(error);
        console.error("LLM Benchmark Eval failed:", message);
        logMessage("error", `LLM Benchmark Eval failed: ${message}`, metadata);
        process.exitCode = 1;
    } finally {
        await closeAllPools();
        await shutdownSharedHtmlMarkdownPool();
        await PlaywrightFetchingStrategy.closeBrowser();
    }
}

main().catch(error => {
    console.error('Fatal error:', error);
    process.exit(1);
});
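For reference, the credential loader above derives each environment variable name by uppercasing the integration id, replacing hyphens with underscores, and appending the uppercased credential key. A minimal standalone sketch of that convention follows; the integration id and key are taken from the postgres-lego case in the file, while the local variable names are illustrative:

// Mirrors the lookup in applyEnvironmentVariablesToCredentials:
// integration id "postgres-lego" + credential key "username"
// resolves to process.env.POSTGRES_LEGO_USERNAME.
const integrationId = "postgres-lego";
const credentialKey = "username";
const envVarName = `${integrationId.toUpperCase().replace(/-/g, "_")}_${credentialKey.toUpperCase()}`;
console.log(envVarName); // => "POSTGRES_LEGO_USERNAME"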
