import { spawnSync } from "node:child_process";
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { basename, dirname, join, resolve } from "node:path";
import { z } from "zod";
import { resetContextCache } from "../src/caching/context-cache";
import { loadConfigWithAutoDetect } from "../src/config";
import {
CachedEmbeddingClient,
createEmbeddingClient,
} from "../src/embeddings/provider";
import { createStorageBackend } from "../src/storage/factory";
import type { IStorageBackend } from "../src/storage/interface";
import {
buildContextWithCache,
type ContextEvidenceItem,
} from "../src/tools/context";
import { countTokens, truncateToTokens } from "../src/utils/tokens";
import { createVectorStore } from "../src/vectors";
import { runLlmCliCompletion } from "./lib/llm-cli-runner";
// Retrieval strategy under benchmark: MCP-backed modes vs. classic-tooling baselines.
type BenchmarkMode =
  | "mcp_full"
  | "mcp_hybrid_guardrail"
  | "grep_tools"
  | "filename_tools"
  | "symbol_index_tools"
  | "lsp_tools"
  | "hybrid_tools";
// Whether timings are estimated from a throughput model or measured end to end.
type TimingSource = "modeled" | "measured";
// Cold = first run against empty caches; warm = subsequent runs against populated caches.
type RunKind = "cold" | "warm";
// Where input-token counts come from: local payload counting vs. provider-reported usage.
type InputTokenSource = "payload" | "provider";
// One benchmark query: the prompt plus the ground-truth files it should surface.
const QuerySchema = z.object({
  id: z.string().min(1),
  query: z.string().min(1),
  expectedFilePaths: z.array(z.string().min(1)).min(1),
});
// Fixture file shape: a non-empty list of queries.
const FixtureSchema = z.object({
  queries: z.array(QuerySchema).min(1),
});
type ChoiceQuery = z.infer<typeof QuerySchema>;
// One (query, mode, runKind) benchmark execution and its metrics.
interface ChoiceRun {
  queryId: string;
  query: string;
  mode: BenchmarkMode;
  runKind: RunKind;
  timingSource: TimingSource;
  // End-to-end latency reported for this run, in milliseconds.
  latencyMs: number;
  // Effective token counts used for reporting (origin recorded below).
  outputTokens: number;
  inputTokens: number;
  // Whether inputTokens came from local payload counting or provider usage data.
  inputTokensSource: InputTokenSource;
  // Locally counted tokens for the prompt/response payloads.
  payloadInputTokens: number;
  payloadOutputTokens: number;
  // Provider-reported usage, when available; null otherwise.
  providerInputTokens: number | null;
  providerOutputTokens: number | null;
  // Measured per-phase timings.
  timingBreakdownMs: {
    retrievalMs: number;
    promptBuildMs: number;
    modelRequestMs: number;
    endToEndMs: number;
  };
  // Modeled timings derived from token-throughput assumptions.
  estimatedLlmMs: number;
  estimatedEndToEndMs: number;
  // Retrieval quality against the fixture's expected file paths.
  openedFileCount: number;
  matchedFileCount: number;
  fileRecall: number;
  filePrecision: number;
  hallucinatedRatio: number;
  matchedFiles: string[];
  missingFiles: string[];
  retrievedTopK: string[];
}
// Aggregated metrics for one benchmark mode across all of its runs.
interface ChoiceModeSummary {
  mode: BenchmarkMode;
  // Number of runs aggregated into this summary.
  runs: number;
  timingSource: TimingSource;
  latencyMs: {
    avg: number;
    p50: number;
    p95: number;
  };
  quality: {
    fileRecallAvg: number;
    filePrecisionAvg: number;
    wrongPathRatioAvg: number;
    hallucinatedRatioAvg: number;
  };
  tokenUsage: {
    inputTokensAvg: number;
    inputTokensP50: number;
    inputTokensP95: number;
    payloadInputTokensAvg: number;
    payloadInputTokensP50: number;
    payloadInputTokensP95: number;
    // Null when no run in this mode had provider-reported usage.
    providerInputTokensAvg: number | null;
    outputTokensAvg: number;
    outputTokensP50: number;
    outputTokensP95: number;
    openedFileCountAvg: number;
    tokensPerMatchedFileAvg: number;
    budgetUtilizationAvg: number;
  };
  // Timings from the configured source (measured or modeled).
  timingMs: {
    modelRequestAvg: number;
    modelRequestP50: number;
    modelRequestP95: number;
    endToEndAvg: number;
    endToEndP50: number;
    endToEndP95: number;
    source: TimingSource;
  };
  // Modeled timings derived from token-throughput assumptions.
  estimatedTimingMs: {
    llmProcessingAvg: number;
    endToEndAvg: number;
    endToEndP95: number;
  };
  // Per-run-kind (cold/warm) breakdown reusing the same summary shape.
  byRunKind: Partial<Record<RunKind, Omit<ChoiceModeSummary, "byRunKind">>>;
}
// Top-level JSON report emitted by the benchmark run.
interface ChoiceBenchmarkReport {
  generatedAt: string;
  projectPath: string;
  fixturePath: string;
  // Present when fixture queries/expected paths were dropped during validation.
  fixtureValidation?: {
    requestedQueries: number;
    activeQueries: number;
    droppedQueryCount: number;
    droppedExpectedPathCount: number;
    droppedQueryIds: string[];
    droppedExpectedPathSamples: Array<{
      queryId: string;
      path: string;
    }>;
  };
  timingSource: TimingSource;
  concurrency: number;
  recallK: number;
  tokenBudget: number;
  runsPerQuery: number;
  warmupRuns: number;
  runKinds: RunKind[];
  // Live model invocation settings; fields are null when unset/not applicable.
  liveInference: {
    command: string | null;
    model: string | null;
    region: string | null;
    timeoutMs: number | null;
    temperature: number | null;
    maxOutputTokens: number | null;
  };
  // Documents which input-token source the report's numbers are based on.
  tokenAccounting: {
    inputTokensSource: InputTokenSource;
    description: string;
  };
  // Result of the Qwen embeddings + Qdrant preflight; null/absent when skipped.
  qwenPreflight?: {
    enabled: boolean;
    endpoint: string;
    embeddingDimension: number;
    qdrantUrl: string;
    qdrantCollection: string;
    qdrantVectorSize: number;
    verifiedAt: string;
  } | null;
  // True when the real-world Codex-style (agent explores repo) variant ran.
  realworldCodex?: boolean;
  // Constants of the estimation model used for modeled comparisons.
  comparisonModel: {
    grepOpenFiles: number;
    grepFileCharLimit: number;
    estimatedOutputTokens: number;
    inputTokensPerSecond: number;
    outputTokensPerSecond: number;
    activeModes: BenchmarkMode[];
  };
  modes: ChoiceModeSummary[];
  runs: ChoiceRun[];
}
// Terms ignored when tokenizing queries into search terms/hints.
const QUERY_STOPWORDS = new Set([
  "a",
  "an",
  "and",
  "are",
  "as",
  "at",
  "be",
  "by",
  "for",
  "from",
  "how",
  "i",
  "if",
  "in",
  "into",
  "is",
  "it",
  "list",
  "map",
  "of",
  "on",
  "or",
  "our",
  "show",
  "that",
  "the",
  "this",
  "to",
  "trace",
  "use",
  "what",
  "where",
  "which",
  "with",
]);
// Every supported benchmark mode, in canonical order.
const ALL_BENCHMARK_MODES: BenchmarkMode[] = [
  "mcp_full",
  "mcp_hybrid_guardrail",
  "grep_tools",
  "filename_tools",
  "symbol_index_tools",
  "lsp_tools",
  "hybrid_tools",
];
// Memoized `rg --files` listings, keyed by resolved project root.
const FILE_LIST_CACHE = new Map<string, string[]>();
// Model-cited paths under these prefixes are benchmark artifacts, never valid answers.
const DISALLOWED_CITED_PATH_PREFIXES = [
  ".doclea/reports/",
  ".doclea/benchmarks/",
  ".doclea/retrieval-",
];
/** Trims surrounding whitespace and strips all trailing slashes from a base URL. */
function normalizeBaseUrl(value: string): string {
  let normalized = value.trim();
  while (normalized.endsWith("/")) {
    normalized = normalized.slice(0, -1);
  }
  return normalized;
}
/**
 * Reads an integer from the environment.
 * Returns `fallback` when the variable is unset, empty, or unparseable.
 */
function parseIntEnv(name: string, fallback: number): number {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  const value = Number.parseInt(raw, 10);
  if (Number.isFinite(value)) {
    return value;
  }
  return fallback;
}
/**
 * Reads a floating-point number from the environment.
 * Returns `fallback` when the variable is unset, empty, or not finite.
 */
function parseFloatEnv(name: string, fallback: number): number {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  const value = Number.parseFloat(raw);
  if (Number.isFinite(value)) {
    return value;
  }
  return fallback;
}
/**
 * Reads a boolean from the environment.
 *
 * Returns `fallback` when the variable is unset or empty; otherwise true only
 * when the trimmed, case-insensitive value equals "true". Trimming matches the
 * other env parsers in this file (parseTimingSourceEnv, parseRunKindsEnv, ...),
 * so values like " TRUE " no longer silently read as false.
 */
function parseBoolEnv(name: string, fallback: boolean): boolean {
  const raw = process.env[name];
  if (!raw) return fallback;
  return raw.trim().toLowerCase() === "true";
}
/**
 * Performs a fetch that is aborted after `timeoutMs` milliseconds (minimum 1ms).
 * A JSON content-type header is attached only when a request body is supplied.
 */
async function fetchWithTimeout(input: {
  url: string;
  method?: "GET" | "POST";
  body?: string;
  timeoutMs: number;
}): Promise<Response> {
  const abort = new AbortController();
  const budgetMs = Math.max(1, Math.floor(input.timeoutMs));
  const timer = setTimeout(() => abort.abort(), budgetMs);
  const headers = input.body
    ? { "Content-Type": "application/json" }
    : undefined;
  try {
    return await fetch(input.url, {
      method: input.method ?? "GET",
      headers,
      body: input.body,
      signal: abort.signal,
    });
  } finally {
    // Always clear the timer so a fast response does not leave a pending abort.
    clearTimeout(timer);
  }
}
/** Parses JSON, returning null instead of throwing on malformed input. */
function parseJsonSafe(text: string): unknown {
  let parsed: unknown = null;
  try {
    parsed = JSON.parse(text) as unknown;
  } catch {
    // Malformed JSON is mapped to null by design.
  }
  return parsed;
}
/**
 * Pulls the configured vector size out of a Qdrant "get collection" response.
 *
 * Handles both the single unnamed vector config ({ size, distance }) and named
 * vector configs ({ name: { size, ... }, ... }). Returns null when the payload
 * does not have the expected result.config.params.vectors shape.
 */
function extractQdrantVectorSize(payload: unknown): number | null {
  const asRecord = (value: unknown): Record<string, unknown> | null =>
    value && typeof value === "object"
      ? (value as Record<string, unknown>)
      : null;
  const root = asRecord(payload);
  const result = asRecord(root?.result);
  const config = asRecord(result?.config);
  const params = asRecord(config?.params);
  const vectors = asRecord(params?.vectors);
  if (!vectors) return null;
  const toSize = (candidate: unknown): number | null =>
    typeof candidate === "number" && Number.isFinite(candidate)
      ? Math.max(0, Math.floor(candidate))
      : null;
  // Unnamed vector config: size lives directly on the vectors object.
  const direct = toSize(vectors.size);
  if (direct !== null) return direct;
  // Named vector configs: take the first entry exposing a numeric size.
  for (const entry of Object.values(vectors)) {
    const nested = asRecord(entry);
    if (!nested) continue;
    const size = toSize(nested.size);
    if (size !== null) return size;
  }
  return null;
}
/**
 * Verifies the local Qwen embeddings + Qdrant stack before benchmarking.
 *
 * Checks, in order:
 *  1. Config sanity: embedding provider "local", vector provider "qdrant",
 *     and endpoint/URL/collection/vectorSize matching the expected values.
 *  2. Embeddings service: /health responds OK and /embed returns a vector of
 *     the expected dimension.
 *  3. Qdrant: the collection exists and its configured vector size matches.
 *
 * Throws with an actionable message on the first mismatch; returns the
 * verified stack description on success.
 */
async function runQwenStackPreflight(input: {
  projectPath: string;
  config: Awaited<ReturnType<typeof loadConfigWithAutoDetect>>;
  expectedEndpoint: string;
  expectedQdrantUrl: string;
  expectedCollectionName: string;
  expectedVectorSize: number;
  timeoutMs: number;
}): Promise<NonNullable<ChoiceBenchmarkReport["qwenPreflight"]>> {
  // Referenced in error messages so the operator knows which file to fix.
  const configPath = join(input.projectPath, ".doclea/config.json");
  if (input.config.embedding.provider !== "local") {
    throw new Error(
      `Qwen preflight failed: embedding.provider must be "local". Found "${input.config.embedding.provider}". Check ${configPath}.`,
    );
  }
  if (input.config.vector.provider !== "qdrant") {
    throw new Error(
      `Qwen preflight failed: vector.provider must be "qdrant". Found "${input.config.vector.provider}". Check ${configPath}.`,
    );
  }
  // Compare normalized URLs so trailing slashes do not cause false mismatches.
  const configuredEndpoint = normalizeBaseUrl(input.config.embedding.endpoint);
  if (configuredEndpoint !== input.expectedEndpoint) {
    throw new Error(
      `Qwen preflight failed: embedding endpoint mismatch. Expected ${input.expectedEndpoint}, found ${configuredEndpoint}. Check ${configPath}.`,
    );
  }
  const configuredQdrantUrl = normalizeBaseUrl(input.config.vector.url);
  if (configuredQdrantUrl !== input.expectedQdrantUrl) {
    throw new Error(
      `Qwen preflight failed: qdrant URL mismatch. Expected ${input.expectedQdrantUrl}, found ${configuredQdrantUrl}. Check ${configPath}.`,
    );
  }
  if (input.config.vector.collectionName !== input.expectedCollectionName) {
    throw new Error(
      `Qwen preflight failed: qdrant collection mismatch. Expected ${input.expectedCollectionName}, found ${input.config.vector.collectionName}. Check ${configPath}.`,
    );
  }
  if (input.config.vector.vectorSize !== input.expectedVectorSize) {
    throw new Error(
      `Qwen preflight failed: config vectorSize mismatch. Expected ${input.expectedVectorSize}, found ${input.config.vector.vectorSize}. Check ${configPath}.`,
    );
  }
  // Liveness check on the embeddings service.
  const health = await fetchWithTimeout({
    url: `${input.expectedEndpoint}/health`,
    timeoutMs: input.timeoutMs,
  });
  if (!health.ok) {
    throw new Error(
      `Qwen preflight failed: embeddings health check returned ${health.status} ${health.statusText} at ${input.expectedEndpoint}/health`,
    );
  }
  // Real embedding round-trip to confirm the output dimension.
  const embedResponse = await fetchWithTimeout({
    url: `${input.expectedEndpoint}/embed`,
    method: "POST",
    body: JSON.stringify({ inputs: "qwen stack preflight" }),
    timeoutMs: input.timeoutMs,
  });
  const embedBody = await embedResponse.text();
  if (!embedResponse.ok) {
    throw new Error(
      `Qwen preflight failed: embeddings /embed returned ${embedResponse.status} ${embedResponse.statusText} (${embedBody.slice(0, 240)})`,
    );
  }
  const embedJson = parseJsonSafe(embedBody);
  // /embed may return [[...]] (batch) or a bare vector; accept both shapes.
  const firstVector = Array.isArray(embedJson)
    ? (embedJson[0] as unknown)
    : embedJson;
  const embeddingDimension = Array.isArray(firstVector)
    ? firstVector.length
    : 0;
  if (embeddingDimension !== input.expectedVectorSize) {
    throw new Error(
      `Qwen preflight failed: embedding dimension mismatch. Expected ${input.expectedVectorSize}, got ${embeddingDimension}.`,
    );
  }
  // Finally confirm Qdrant's collection exists with the matching vector size.
  const collectionResponse = await fetchWithTimeout({
    url: `${input.expectedQdrantUrl}/collections/${encodeURIComponent(input.expectedCollectionName)}`,
    timeoutMs: input.timeoutMs,
  });
  const collectionBody = await collectionResponse.text();
  if (!collectionResponse.ok) {
    throw new Error(
      `Qwen preflight failed: Qdrant collection check returned ${collectionResponse.status} ${collectionResponse.statusText} (${collectionBody.slice(0, 240)})`,
    );
  }
  const collectionJson = parseJsonSafe(collectionBody);
  const qdrantVectorSize = extractQdrantVectorSize(collectionJson);
  if (qdrantVectorSize !== input.expectedVectorSize) {
    throw new Error(
      `Qwen preflight failed: Qdrant vector size mismatch. Expected ${input.expectedVectorSize}, got ${qdrantVectorSize ?? "unknown"}.`,
    );
  }
  return {
    enabled: true,
    endpoint: input.expectedEndpoint,
    embeddingDimension,
    qdrantUrl: input.expectedQdrantUrl,
    qdrantCollection: input.expectedCollectionName,
    qdrantVectorSize,
    verifiedAt: new Date().toISOString(),
  };
}
/** Reads DOCLEA_CHOICE_TIMING_MODE; anything other than "measured" yields "modeled". */
function parseTimingSourceEnv(): TimingSource {
  const value = process.env.DOCLEA_CHOICE_TIMING_MODE ?? "modeled";
  return value.trim().toLowerCase() === "measured" ? "measured" : "modeled";
}
/** Reads DOCLEA_CHOICE_INPUT_TOKEN_SOURCE; anything other than "provider" yields "payload". */
function parseInputTokenSourceEnv(): InputTokenSource {
  const value = process.env.DOCLEA_CHOICE_INPUT_TOKEN_SOURCE ?? "payload";
  return value.trim().toLowerCase() === "provider" ? "provider" : "payload";
}
/**
 * Reads DOCLEA_CHOICE_RUN_KINDS as a comma-separated list of "cold"/"warm".
 * Unknown entries are dropped; an unset or unusable value defaults to both
 * kinds. Duplicates are removed while preserving first-seen order.
 */
function parseRunKindsEnv(): RunKind[] {
  const defaults: RunKind[] = ["cold", "warm"];
  const raw = process.env.DOCLEA_CHOICE_RUN_KINDS;
  if (!raw) return defaults;
  const seen = new Set<RunKind>();
  for (const piece of raw.split(",")) {
    const value = piece.trim().toLowerCase();
    if (value === "cold" || value === "warm") {
      seen.add(value);
    }
  }
  return seen.size > 0 ? Array.from(seen) : defaults;
}
/**
 * Reads DOCLEA_CHOICE_MODES as a comma-separated list of benchmark modes.
 * Unknown names are ignored; an unset or unusable value selects every mode.
 * "mcp_full" is always force-included (front of the list) as the anchor
 * comparator, and duplicates are removed preserving order.
 */
function parseBenchmarkModesEnv(): BenchmarkMode[] {
  const raw = process.env.DOCLEA_CHOICE_MODES;
  if (!raw) {
    return [...ALL_BENCHMARK_MODES];
  }
  const known = new Set<string>(ALL_BENCHMARK_MODES);
  const selected: BenchmarkMode[] = [];
  for (const piece of raw.split(",")) {
    const value = piece.trim().toLowerCase();
    if (value && known.has(value)) {
      selected.push(value as BenchmarkMode);
    }
  }
  if (selected.length === 0) {
    return [...ALL_BENCHMARK_MODES];
  }
  if (!selected.includes("mcp_full")) {
    selected.unshift("mcp_full");
  }
  return Array.from(new Set(selected));
}
/** Rounds to `decimals` places (default 4) via toFixed, returning a number. */
function toFixedNumber(value: number, decimals = 4): number {
  const rounded = value.toFixed(decimals);
  return Number(rounded);
}
/** Arithmetic mean rounded via toFixedNumber; 0 for an empty list. */
function average(values: number[]): number {
  if (values.length === 0) return 0;
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return toFixedNumber(total / values.length);
}
/**
 * Averages the finite numeric entries, ignoring nulls and non-finite values.
 * Returns null when no usable values remain.
 */
function averageNullable(values: Array<number | null>): number | null {
  const usable: number[] = [];
  for (const value of values) {
    if (typeof value === "number" && Number.isFinite(value)) {
      usable.push(value);
    }
  }
  return usable.length > 0 ? average(usable) : null;
}
/**
 * Nearest-rank percentile (p in [0, 100]) over an unsorted list.
 * The input array is not mutated; 0 is returned for an empty list.
 */
function percentile(values: number[], p: number): number {
  if (values.length === 0) return 0;
  const sorted = values.slice().sort((a, b) => a - b);
  const rank = Math.ceil((p / 100) * sorted.length) - 1;
  const index = Math.min(sorted.length - 1, Math.max(0, rank));
  return toFixedNumber(sorted[index] ?? 0);
}
/**
 * Runs `worker` over `items` with at most `concurrency` tasks in flight.
 * Results are returned in input order regardless of completion order; the
 * first worker rejection propagates to the caller.
 */
async function runWithConcurrency<T, U>(input: {
  items: T[];
  concurrency: number;
  worker: (item: T, index: number) => Promise<U>;
}): Promise<U[]> {
  const { items, worker } = input;
  if (items.length === 0) return [];
  const lanes = Math.min(
    Math.max(1, Math.floor(input.concurrency)),
    items.length,
  );
  const results: U[] = new Array(items.length);
  // Shared cursor: each lane claims the next unprocessed index.
  let nextIndex = 0;
  const drain = async (): Promise<void> => {
    while (nextIndex < items.length) {
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await worker(items[index] as T, index);
    }
  };
  await Promise.all(Array.from({ length: lanes }, () => drain()));
  return results;
}
/** Creates the parent directory of `path` (recursively) if it is missing. */
function ensureDirectory(path: string): void {
  const parent = dirname(path);
  if (existsSync(parent)) {
    return;
  }
  mkdirSync(parent, { recursive: true });
}
/**
 * Converts an absolute or oddly-prefixed file path into a project-relative,
 * forward-slash path.
 *
 * - Backslashes become forward slashes; surrounding whitespace is trimmed.
 * - A path under `projectPath` is stripped down to its relative portion.
 * - A "/apps/..."-style path (absolute-looking but actually repo-relative, as
 *   some indexers emit) is de-rooted only when that candidate file exists.
 * - Leading "./" segments are removed.
 */
function normalizeFilePath(path: string, projectPath: string): string {
  const projectRoot = projectPath.replaceAll("\\", "/").replace(/\/+$/, "");
  let candidate = path.replaceAll("\\", "/").trim();
  if (candidate.length === 0) {
    return "";
  }
  const projectPrefix = `${projectRoot}/`;
  if (candidate.startsWith(projectPrefix)) {
    candidate = candidate.slice(projectPrefix.length);
  } else if (candidate.startsWith("/")) {
    // Only de-root when the repo-relative interpretation points at a real file.
    const deRooted = candidate.replace(/^\/+/, "");
    if (deRooted.length > 0 && existsSync(`${projectRoot}/${deRooted}`)) {
      candidate = deRooted;
    }
  }
  while (candidate.startsWith("./")) {
    candidate = candidate.slice(2);
  }
  return candidate;
}
/**
 * True when a model-cited path must be rejected: anything escaping the repo
 * ("../") or pointing at benchmark/report artifacts.
 */
function isDisallowedCitedPath(path: string): boolean {
  const lowered = path.toLowerCase();
  if (lowered.startsWith("../")) {
    return true;
  }
  for (const prefix of DISALLOWED_CITED_PATH_PREFIXES) {
    if (lowered.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
/**
 * Resolves a cited path to an absolute path inside the project root.
 * Returns null for empty paths, parent-escaping paths, or anything that
 * resolves outside the root.
 */
function resolveProjectPath(path: string, projectPath: string): string | null {
  const relative = normalizeFilePath(path, projectPath);
  if (relative.length === 0 || relative.startsWith("../")) {
    return null;
  }
  const root = resolve(projectPath);
  const absolute = resolve(root, relative);
  const insideRoot = absolute === root || absolute.startsWith(`${root}/`);
  return insideRoot ? absolute : null;
}
/** Escapes regex metacharacters so `value` matches literally in a pattern. */
function escapeRegex(value: string): string {
  const metaPattern = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metaPattern, "\\$&");
}
/**
 * Tokenizes a query into lowercase alphanumeric/underscore terms, dropping
 * short (<3 char) terms and stopwords, deduplicated in first-seen order.
 */
function extractQueryTerms(query: string): string[] {
  const unique = new Set<string>();
  for (const token of query.toLowerCase().split(/[^a-z0-9_]+/)) {
    const term = token.trim();
    if (term.length >= 3 && !QUERY_STOPWORDS.has(term)) {
      unique.add(term);
    }
  }
  return Array.from(unique);
}
/** Extracts capitalized, code-like identifiers (3+ chars) from a query, deduplicated. */
function extractCodeEntities(query: string): string[] {
  const pattern = /\b[A-Z][A-Za-z0-9_]{2,}\b/g;
  const found = query.match(pattern) ?? [];
  return Array.from(new Set(found));
}
/** Converts camelCase/underscored/spaced identifiers to kebab-case. */
function toKebabCase(value: string): string {
  const withBoundaries = value.replace(/([a-z0-9])([A-Z])/g, "$1-$2");
  const dashed = withBoundaries.replace(/[_\s]+/g, "-");
  return dashed.toLowerCase();
}
/**
 * Builds up to 24 lowercase filename/content hints from a query: long query
 * terms, adjacent-term joins (kebab/snake/concatenated), a hand-tuned
 * expansion for "access code(s)" queries, and kebab-cased code entities with
 * common file extensions.
 */
function buildSearchHints(query: string): string[] {
  const terms = extractQueryTerms(query);
  const hints = new Set<string>();
  for (const term of terms) {
    if (term.length >= 4) {
      hints.add(term);
    }
  }
  for (let index = 0; index + 1 < terms.length; index++) {
    const first = terms[index] ?? "";
    const second = terms[index + 1] ?? "";
    if (first.length >= 4 && second.length >= 4) {
      hints.add(`${first}-${second}`);
      hints.add(`${first}_${second}`);
      hints.add(`${first}${second}`);
    }
  }
  if (/\baccess\s+codes?\b/i.test(query)) {
    hints.add("access-codes");
    hints.add("access_code");
    hints.add("accesscodes");
  }
  for (const entity of extractCodeEntities(query)) {
    const kebab = toKebabCase(entity);
    if (kebab.length >= 4) {
      hints.add(kebab);
      hints.add(`${kebab}.ts`);
      hints.add(`${kebab}.tsx`);
      hints.add(`${kebab}.sql`);
    }
  }
  return Array.from(hints).slice(0, 24);
}
/**
 * Derives a ranked candidate-file list from ranked context evidence.
 *
 * Each evidence item contributes score to its code file (and, more weakly, to
 * memory-related files), weighted by 1/rank and by source (kag 1.3 >
 * graphrag 0.95 > other 0.7). Returns up to recallK * 2 project-relative
 * paths, highest score first (path name breaks ties).
 */
function collectTopKFilesFromEvidence(
  evidence: ContextEvidenceItem[],
  recallK: number,
  projectPath: string,
): string[] {
  // Only positively-ranked evidence participates, best rank first.
  const ranked = evidence
    .filter((item) => item.rank > 0)
    .sort((left, right) => left.rank - right.rank);
  const fileScores = new Map<string, number>();
  const maxFiles = Math.max(1, recallK * 2);
  const addFileScore = (filePath: string, score: number) => {
    const normalized = normalizeFilePath(filePath, projectPath);
    if (!normalized) {
      return;
    }
    fileScores.set(normalized, (fileScores.get(normalized) ?? 0) + score);
  };
  for (const item of ranked) {
    const rankWeight = 1 / Math.max(1, item.rank);
    const sourceWeight =
      item.source === "kag" ? 1.3 : item.source === "graphrag" ? 0.95 : 0.7;
    if (item.code?.filePath) {
      addFileScore(item.code.filePath, 120 * rankWeight * sourceWeight);
    }
    // Memory-linked files get a much smaller vote than direct code hits.
    if (item.memory?.relatedFiles && item.memory.relatedFiles.length > 0) {
      for (const relatedFile of item.memory.relatedFiles) {
        addFileScore(relatedFile, 36 * rankWeight * sourceWeight);
      }
    }
  }
  return Array.from(fileScores.entries())
    .sort(
      ([leftFile, leftScore], [rightFile, rightScore]) =>
        rightScore - leftScore || leftFile.localeCompare(rightFile),
    )
    .slice(0, maxFiles)
    .map(([filePath]) => filePath);
}
/**
 * Models LLM processing time (ms) from token counts and throughput rates.
 * A non-positive rate contributes zero so that phase is treated as free.
 */
function estimateLlmProcessingMs(input: {
  inputTokens: number;
  outputTokens: number;
  inputTokensPerSecond: number;
  outputTokensPerSecond: number;
}): number {
  const phaseMs = (tokens: number, tokensPerSecond: number): number =>
    tokensPerSecond > 0 ? (tokens / tokensPerSecond) * 1000 : 0;
  const totalMs =
    phaseMs(input.inputTokens, input.inputTokensPerSecond) +
    phaseMs(input.outputTokens, input.outputTokensPerSecond);
  return toFixedNumber(totalMs, 4);
}
/**
 * Builds the two-message prompt for a live benchmark completion:
 * a strict-JSON system contract plus the query and retrieved context.
 */
function buildLiveBenchmarkMessages(input: {
  query: string;
  contextPayload: string;
}): Array<{ role: "system" | "user"; content: string }> {
  const system = {
    role: "system" as const,
    content:
      "You are running a retrieval benchmark. Respond in strict JSON only with keys: answer, cited_files, confidence.",
  };
  const user = {
    role: "user" as const,
    content: `# Task\n${input.query}\n\n# Retrieved Context\n${input.contextPayload}\n\n# Output Contract\nReturn JSON only. cited_files must be an array of file paths used in your answer.`,
  };
  return [system, user];
}
/** Flattens chat messages into "[ROLE]\ncontent" blocks for token counting. */
function formatMessagesForTokenCount(
  messages: Array<{ role: "system" | "user" | "assistant"; content: string }>,
): string {
  const blocks: string[] = [];
  for (const message of messages) {
    blocks.push(`[${message.role.toUpperCase()}]\n${message.content}`);
  }
  return blocks.join("\n\n");
}
/**
 * Builds the system/user prompt pair for the real-world Codex-style benchmark,
 * where the agent explores the repository itself rather than receiving a
 * pre-retrieved context payload. The strategy section differs by mode:
 * "mcp_full" instructs MCP-first retrieval via `doclea_context`; every other
 * mode forbids MCP and mandates native repo exploration tools.
 */
function buildRealworldCodexMessages(input: {
  query: string;
  mode: BenchmarkMode;
  projectPath: string;
  tokenBudget: number;
}): Array<{ role: "system" | "user"; content: string }> {
  const systemPrompt =
    'You are running a real-world retrieval benchmark in a code repository. You must return strict JSON only: {"answer": string, "cited_files": string[], "confidence": number}.';
  const strategyPrompt =
    input.mode === "mcp_full"
      ? "- Use Doclea MCP first and call `doclea_context` before finalizing.\n- Use MCP evidence to build the initial file shortlist, then verify uncertain files via targeted reads.\n- Prefer implementation/source files over generated artifacts."
      : "- Do not use MCP.\n- Use native repository exploration tools (search/open/read) for evidence.\n- Verify uncertain files via targeted reads.";
  const userPrompt = `# Repository Root\n${input.projectPath}\n\n# Task\n${input.query}\n\n# Constraints\n- Keep cited_files to concrete repository file paths only.\n- Every cited path must exist in this repository right now.\n- Do not cite benchmark artifacts or fixtures (for example: .doclea/reports/*, .doclea/retrieval-*.json).\n- Include every file that materially supports your answer.\n- If uncertain, include best-effort files and lower confidence.\n- Token budget target for context discipline: ${input.tokenBudget}.\n\n# Strategy\n${strategyPrompt}\n\n# Output Contract\nReturn JSON only with keys: answer, cited_files, confidence.`;
  return [
    { role: "system", content: systemPrompt },
    { role: "user", content: userPrompt },
  ];
}
/**
 * Extracts the `cited_files` list from a model completion.
 *
 * Tries the raw output plus every fenced ```/```json block as JSON; accepted
 * paths must normalize to a non-empty repo-relative path, not be a disallowed
 * benchmark artifact, resolve inside the project root, and exist on disk.
 * When no JSON candidate yields any files, falls back to scraping
 * path-looking tokens from the plain text under the same validity rules.
 */
function parseCitedFilesFromCompletion(input: {
  outputText: string;
  projectPath: string;
}): string[] {
  const raw = input.outputText.trim();
  // Candidate JSON strings: the whole output, then each fenced block's body.
  const candidates: string[] = [raw];
  const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/gi) ?? [];
  for (const block of fenced) {
    const stripped = block
      .replace(/^```(?:json)?\s*/i, "")
      .replace(/```$/i, "")
      .trim();
    if (stripped.length > 0) {
      candidates.push(stripped);
    }
  }
  const normalized = new Set<string>();
  for (const candidate of candidates) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate) as unknown;
    } catch {
      continue;
    }
    if (!parsed || typeof parsed !== "object") continue;
    const files = (parsed as Record<string, unknown>).cited_files;
    if (!Array.isArray(files)) continue;
    for (const file of files) {
      if (typeof file !== "string") continue;
      const normalizedPath = normalizeFilePath(file, input.projectPath);
      const absolutePath = resolveProjectPath(
        normalizedPath,
        input.projectPath,
      );
      // Accept only real, in-repo, non-artifact paths.
      if (
        normalizedPath.length > 0 &&
        !isDisallowedCitedPath(normalizedPath) &&
        absolutePath &&
        existsSync(absolutePath)
      ) {
        normalized.add(normalizedPath);
      }
    }
  }
  if (normalized.size > 0) {
    return Array.from(normalized);
  }
  // Fallback: extract likely repo paths from plain text outputs.
  const pathLike =
    raw.match(/(?:[A-Za-z0-9@._-]+\/)+[A-Za-z0-9@._-]+\.[A-Za-z0-9_-]+/g) ?? [];
  for (const file of pathLike) {
    const normalizedPath = normalizeFilePath(file, input.projectPath);
    const absolutePath = resolveProjectPath(normalizedPath, input.projectPath);
    if (
      normalizedPath.length > 0 &&
      !isDisallowedCitedPath(normalizedPath) &&
      absolutePath &&
      existsSync(absolutePath)
    ) {
      normalized.add(normalizedPath);
    }
  }
  return Array.from(normalized);
}
/**
 * Baseline retriever: ranks project files by ripgrep content matches.
 *
 * Joins the query's search hints into one alternation pattern, runs
 * `rg -n -S` (smart case) excluding vendored/build directories, then scores
 * every matching file: +1 per matching line, +6 when a hint matches the file
 * path (+2 when it only matches the line snippet), +8/+3 for code entities in
 * path/snippet, plus a hand-tuned boost for "access code(s)" queries.
 * Returns up to recallK * 8 files, best score first (path breaks ties).
 */
function runGrepToolingQuery(
  projectPath: string,
  query: string,
  recallK: number,
): string[] {
  const hints = buildSearchHints(query);
  if (hints.length === 0) {
    return [];
  }
  const codeEntities = extractCodeEntities(query).map((entity) =>
    entity.toLowerCase(),
  );
  // Per-hint regexes reused for path/snippet scoring below.
  const hintRegexes = hints
    .map((hint) => {
      const safe = escapeRegex(hint);
      if (!safe) return null;
      return new RegExp(safe, "i");
    })
    .filter((value): value is RegExp => value instanceof RegExp);
  const pattern = hints.map(escapeRegex).join("|");
  const args = [
    "-n",
    "--no-heading",
    "-S",
    "--glob",
    "!**/node_modules/**",
    "--glob",
    "!**/.git/**",
    "--glob",
    "!**/dist/**",
    "--glob",
    "!**/build/**",
    "--glob",
    "!**/.next/**",
    pattern,
    projectPath,
  ];
  const result = spawnSync("rg", args, {
    encoding: "utf-8",
    maxBuffer: 32 * 1024 * 1024,
  });
  // Best-effort: a missing rg binary or no matches both yield empty output.
  const output = result.stdout?.trim() ?? "";
  if (output.length === 0) {
    return [];
  }
  const fileScores = new Map<string, number>();
  const lines = output.split("\n");
  let processedLines = 0;
  const isAccessCodesQuery = /\baccess\s+codes?\b/i.test(query);
  for (const line of lines) {
    // Hard cap so pathological match volumes cannot stall the benchmark.
    if (processedLines >= 50_000) {
      break;
    }
    processedLines += 1;
    // Expected rg line shape: <path>:<line-number>:<snippet>
    const firstColon = line.indexOf(":");
    if (firstColon <= 0) {
      continue;
    }
    const secondColon = line.indexOf(":", firstColon + 1);
    const filePath = line.slice(0, firstColon);
    const snippet = secondColon > firstColon ? line.slice(secondColon + 1) : "";
    const normalized = normalizeFilePath(filePath, projectPath);
    if (normalized.length > 0) {
      const lowerPath = normalized.toLowerCase();
      const lowerSnippet = snippet.toLowerCase();
      let score = fileScores.get(normalized) ?? 0;
      score += 1;
      for (const hintRegex of hintRegexes) {
        // A path match outweighs a snippet match; never both for one hint.
        if (hintRegex.test(normalized)) {
          score += 6;
          continue;
        }
        if (hintRegex.test(snippet)) {
          score += 2;
        }
      }
      for (const entity of codeEntities) {
        if (lowerPath.includes(entity)) {
          score += 8;
          continue;
        }
        if (lowerSnippet.includes(entity)) {
          score += 3;
        }
      }
      if (isAccessCodesQuery) {
        if (lowerPath.includes("access-codes")) {
          score += 12;
        } else if (
          lowerPath.includes("access_codes") ||
          lowerPath.includes("accesscodes")
        ) {
          score += 8;
        }
      }
      fileScores.set(normalized, score);
    }
  }
  return Array.from(fileScores.entries())
    .sort(
      (left, right) => right[1] - left[1] || left[0].localeCompare(right[0]),
    )
    .slice(0, Math.max(1, recallK * 8))
    .map(([filePath]) => filePath);
}
/**
 * Lists project files via `rg --files` (skipping vendored/build directories),
 * normalized to project-relative paths, memoized per resolved project root in
 * FILE_LIST_CACHE. Note: an empty or failed `rg` run is also cached as an
 * empty list for the lifetime of the process.
 */
function listProjectFiles(projectPath: string): string[] {
  const normalizedProject = resolve(projectPath);
  const cached = FILE_LIST_CACHE.get(normalizedProject);
  if (cached) {
    return cached;
  }
  const args = [
    "--files",
    "--glob",
    "!**/node_modules/**",
    "--glob",
    "!**/.git/**",
    "--glob",
    "!**/dist/**",
    "--glob",
    "!**/build/**",
    "--glob",
    "!**/.next/**",
    normalizedProject,
  ];
  const result = spawnSync("rg", args, {
    encoding: "utf-8",
    maxBuffer: 32 * 1024 * 1024,
  });
  // Missing rg binary or an empty repository both yield empty output.
  const output = result.stdout?.trim() ?? "";
  if (!output) {
    FILE_LIST_CACHE.set(normalizedProject, []);
    return [];
  }
  const files = output
    .split("\n")
    .map((path) => normalizeFilePath(path, normalizedProject))
    .filter(Boolean);
  FILE_LIST_CACHE.set(normalizedProject, files);
  return files;
}
/**
 * Baseline retriever: ranks files purely by filename/path matching.
 *
 * Scores every project file against the query's hints (+8 for a directory
 * segment match, +4 for a plain substring match, +4 more for a basename
 * match), kebab-cased code entities (+7), and raw query terms (+2). Files
 * with zero score are dropped; returns up to recallK * 8 files, best score
 * first (path breaks ties).
 */
function runFilenameToolingQuery(
  projectPath: string,
  query: string,
  recallK: number,
): string[] {
  const terms = extractQueryTerms(query);
  const hints = buildSearchHints(query);
  const entities = extractCodeEntities(query).map((entity) =>
    toKebabCase(entity).toLowerCase(),
  );
  const files = listProjectFiles(projectPath);
  if (files.length === 0) {
    return [];
  }
  const scores = new Map<string, number>();
  for (const filePath of files) {
    const lowerPath = filePath.toLowerCase();
    let score = 0;
    for (const hint of hints) {
      const normalizedHint = hint.toLowerCase();
      // A whole directory segment match is the strongest filename signal.
      if (lowerPath.includes(`/${normalizedHint}/`)) {
        score += 8;
      } else if (lowerPath.includes(normalizedHint)) {
        score += 4;
      }
      // Extra credit when the hint is the file's basename (with or without ext).
      if (
        lowerPath.endsWith(`/${normalizedHint}`) ||
        lowerPath.includes(`/${normalizedHint}.`)
      ) {
        score += 4;
      }
    }
    for (const entity of entities) {
      if (entity.length < 4) {
        continue;
      }
      if (lowerPath.includes(entity)) {
        score += 7;
      }
    }
    for (const term of terms) {
      if (term.length < 4) {
        continue;
      }
      if (lowerPath.includes(term)) {
        score += 2;
      }
    }
    if (score > 0) {
      scores.set(filePath, score);
    }
  }
  return Array.from(scores.entries())
    .sort(
      (left, right) => right[1] - left[1] || left[0].localeCompare(right[0]),
    )
    .slice(0, Math.max(1, recallK * 8))
    .map(([filePath]) => filePath);
}
/**
 * Baseline retriever: ranks files by symbol-index matches.
 *
 * Queries the `code_nodes` table for rows whose symbol name or file path
 * matches any search hint (LIKE, lowercase), excluding build artifacts and
 * test/declaration files. Files are ordered by SQL match count, then boosted
 * (+1.5 per query term found in the path). Returns up to recallK * 8 files.
 */
function runSymbolIndexToolingQuery(
  storage: IStorageBackend,
  projectPath: string,
  query: string,
  recallK: number,
): string[] {
  const terms = extractQueryTerms(query);
  const hints = buildSearchHints(query);
  if (hints.length === 0) {
    return [];
  }
  // One (name OR path) LIKE clause per hint; params follow in the same order.
  const whereClauses = hints.map(
    () => "(lower(name) LIKE ? OR lower(file_path) LIKE ?)",
  );
  const params: string[] = [];
  for (const hint of hints) {
    const pattern = `%${hint.toLowerCase()}%`;
    params.push(pattern, pattern);
  }
  const sql = `
SELECT file_path, COUNT(*) AS match_count
FROM code_nodes
WHERE (${whereClauses.join(" OR ")})
AND lower(file_path) NOT LIKE '%/dist/%'
AND lower(file_path) NOT LIKE '%/build/%'
AND lower(file_path) NOT LIKE '%/generated/%'
AND lower(file_path) NOT LIKE '%/__tests__/%'
AND lower(file_path) NOT LIKE '%/coverage/%'
AND lower(file_path) NOT LIKE '%.spec.ts'
AND lower(file_path) NOT LIKE '%.spec.tsx'
AND lower(file_path) NOT LIKE '%.test.ts'
AND lower(file_path) NOT LIKE '%.test.tsx'
AND lower(file_path) NOT LIKE '%.d.ts'
GROUP BY file_path
ORDER BY match_count DESC
LIMIT ?
`;
  // Generous LIMIT so path-term boosting below can still reorder the tail.
  const rows = storage
    .getDatabase()
    .query(sql)
    .all(...params, Math.max(200, recallK * 16)) as Array<{
    file_path: string;
    match_count: number;
  }>;
  if (rows.length === 0) {
    return [];
  }
  const ranked = rows
    .map((row) => {
      const normalized = normalizeFilePath(row.file_path, projectPath);
      const lowerPath = normalized.toLowerCase();
      let score = Number(row.match_count);
      for (const term of terms) {
        if (term.length < 4) {
          continue;
        }
        if (lowerPath.includes(term)) {
          score += 1.5;
        }
      }
      return { file_path: normalized, score };
    })
    .filter((row) => row.file_path.length > 0)
    .sort(
      (left, right) =>
        right.score - left.score ||
        left.file_path.localeCompare(right.file_path),
    );
  return ranked.slice(0, Math.max(1, recallK * 8)).map((row) => row.file_path);
}
/** Builds "?, ?, ?" with `length` placeholders for a parameterized SQL IN list. */
function buildSqlPlaceholders(length: number): string {
  const placeholders: string[] = [];
  for (let index = 0; index < length; index++) {
    placeholders.push("?");
  }
  return placeholders.join(", ");
}
/**
 * Baseline retriever approximating LSP-style navigation over the code graph.
 *
 * 1) Finds "anchor" symbols in `code_nodes` whose name or path matches the
 *    query's hints/terms/entities (at most 24 anchors hints, 180 rows,
 *    80 distinct anchor ids).
 * 2) Scores anchor files: +4 per anchor row, +2.5 per query term appearing in
 *    the symbol name.
 * 3) Walks `code_edges` one hop out from the anchors and adds a capped
 *    traversal score to each neighbor file.
 *
 * Build artifacts and test/declaration files are excluded throughout.
 * Returns up to recallK * 8 files, highest combined score first.
 */
function runLspToolingQuery(
  storage: IStorageBackend,
  projectPath: string,
  query: string,
  recallK: number,
): string[] {
  const terms = extractQueryTerms(query);
  const hints = buildSearchHints(query);
  const entities = extractCodeEntities(query);
  // Merge every signal into one deduplicated, length-filtered hint list.
  const anchorHints = Array.from(
    new Set(
      [
        ...hints,
        ...terms,
        ...entities.map((entity) => entity.toLowerCase()),
      ].filter((value) => value.length >= 3),
    ),
  ).slice(0, 24);
  if (anchorHints.length === 0) {
    return [];
  }
  const whereClauses = anchorHints.map(
    () => "(lower(name) LIKE ? OR lower(file_path) LIKE ?)",
  );
  const anchorParams: string[] = [];
  for (const hint of anchorHints) {
    const pattern = `%${hint.toLowerCase()}%`;
    anchorParams.push(pattern, pattern);
  }
  const anchorSql = `
SELECT id, file_path, name
FROM code_nodes
WHERE (${whereClauses.join(" OR ")})
AND lower(file_path) NOT LIKE '%/dist/%'
AND lower(file_path) NOT LIKE '%/build/%'
AND lower(file_path) NOT LIKE '%/generated/%'
AND lower(file_path) NOT LIKE '%/__tests__/%'
AND lower(file_path) NOT LIKE '%/coverage/%'
AND lower(file_path) NOT LIKE '%.spec.ts'
AND lower(file_path) NOT LIKE '%.spec.tsx'
AND lower(file_path) NOT LIKE '%.test.ts'
AND lower(file_path) NOT LIKE '%.test.tsx'
AND lower(file_path) NOT LIKE '%.d.ts'
LIMIT 180
`;
  const anchors = storage
    .getDatabase()
    .query(anchorSql)
    .all(...anchorParams) as Array<{
    id: string;
    file_path: string;
    name: string;
  }>;
  if (anchors.length === 0) {
    return [];
  }
  // Cap anchor ids so the neighbor query's IN lists stay bounded.
  const anchorIds = Array.from(new Set(anchors.map((row) => row.id))).slice(
    0,
    80,
  );
  const anchorIdPlaceholders = buildSqlPlaceholders(anchorIds.length);
  const anchorScores = new Map<string, number>();
  for (const anchor of anchors) {
    const filePath = normalizeFilePath(anchor.file_path, projectPath);
    if (!filePath) continue;
    const lowerName = (anchor.name ?? "").toLowerCase();
    let score = anchorScores.get(filePath) ?? 0;
    score += 4;
    for (const term of terms) {
      if (term.length < 4) continue;
      if (lowerName.includes(term)) {
        score += 2.5;
      }
    }
    anchorScores.set(filePath, score);
  }
  // One-hop graph expansion: files connected to the anchors in either
  // edge direction, excluding the anchor nodes themselves.
  const neighborSql = `
SELECT
n.file_path AS file_path,
COUNT(*) AS edge_count,
COUNT(DISTINCT n.id) AS node_count
FROM code_edges e
JOIN code_nodes n ON (
(n.id = e.from_node AND e.to_node IN (${anchorIdPlaceholders}))
OR
(n.id = e.to_node AND e.from_node IN (${anchorIdPlaceholders}))
)
WHERE n.id NOT IN (${anchorIdPlaceholders})
AND lower(n.file_path) NOT LIKE '%/dist/%'
AND lower(n.file_path) NOT LIKE '%/build/%'
AND lower(n.file_path) NOT LIKE '%/generated/%'
AND lower(n.file_path) NOT LIKE '%/__tests__/%'
AND lower(n.file_path) NOT LIKE '%/coverage/%'
AND lower(n.file_path) NOT LIKE '%.spec.ts'
AND lower(n.file_path) NOT LIKE '%.spec.tsx'
AND lower(n.file_path) NOT LIKE '%.test.ts'
AND lower(n.file_path) NOT LIKE '%.test.tsx'
AND lower(n.file_path) NOT LIKE '%.d.ts'
GROUP BY n.file_path
ORDER BY edge_count DESC
LIMIT ?
`;
  // The three placeholder lists above consume anchorIds three times, in order.
  const neighborParams = [
    ...anchorIds,
    ...anchorIds,
    ...anchorIds,
    Math.max(220, recallK * 24),
  ];
  const neighbors = storage
    .getDatabase()
    .query(neighborSql)
    .all(...neighborParams) as Array<{
    file_path: string;
    edge_count: number;
    node_count: number;
  }>;
  for (const neighbor of neighbors) {
    const filePath = normalizeFilePath(neighbor.file_path, projectPath);
    if (!filePath) continue;
    // Edge contribution is clamped to [1, 20]; node variety adds at most 3.
    const traversalScore =
      Math.max(1, Math.min(20, Number(neighbor.edge_count) * 1.15)) +
      Math.min(3, Number(neighbor.node_count) * 0.1);
    anchorScores.set(
      filePath,
      (anchorScores.get(filePath) ?? 0) + traversalScore,
    );
  }
  return Array.from(anchorScores.entries())
    .sort(
      (left, right) => right[1] - left[1] || left[0].localeCompare(right[0]),
    )
    .slice(0, Math.max(1, recallK * 8))
    .map(([filePath]) => filePath);
}
function runHybridToolingQuery(
storage: IStorageBackend,
projectPath: string,
query: string,
recallK: number,
): string[] {
const candidates = [
runGrepToolingQuery(projectPath, query, recallK),
runFilenameToolingQuery(projectPath, query, recallK),
runSymbolIndexToolingQuery(storage, projectPath, query, recallK),
runLspToolingQuery(storage, projectPath, query, recallK),
];
const rrf = new Map<string, number>();
const rankBias = 60;
for (const list of candidates) {
for (let index = 0; index < list.length; index++) {
const filePath = list[index];
if (!filePath) {
continue;
}
const score = 1 / (rankBias + index + 1);
rrf.set(filePath, (rrf.get(filePath) ?? 0) + score);
}
}
return Array.from(rrf.entries())
.sort(
(left, right) => right[1] - left[1] || left[0].localeCompare(right[0]),
)
.slice(0, Math.max(1, recallK * 8))
.map(([filePath]) => filePath);
}
/**
 * Merges several ranked file lists into one ordering using weighted
 * reciprocal-rank fusion: a file earns `weight / (60 + rank + 1)` from
 * each list it appears in. Ties are broken alphabetically, and lists
 * with a non-finite weight fall back to weight 1 while non-positive
 * weights are ignored entirely.
 *
 * @param lists Ranked candidate lists with their fusion weights.
 * @param limit Maximum number of fused results (clamped to at least 1).
 * @returns Fused file paths, highest score first.
 */
function fuseRankedFileLists(
  lists: Array<{ files: string[]; weight: number }>,
  limit: number,
): string[] {
  const RANK_BIAS = 60;
  const fused = new Map<string, number>();
  for (const { files, weight } of lists) {
    const effectiveWeight = Number.isFinite(weight) ? weight : 1;
    if (effectiveWeight <= 0) {
      continue;
    }
    files.forEach((filePath, rank) => {
      // Skip empty entries so they neither score nor occupy a rank slot
      // in the output (rank positions themselves are preserved).
      if (!filePath) {
        return;
      }
      const previous = fused.get(filePath) ?? 0;
      fused.set(filePath, previous + effectiveWeight / (RANK_BIAS + rank + 1));
    });
  }
  const ranked = [...fused.entries()].sort((a, b) => {
    if (b[1] !== a[1]) {
      return b[1] - a[1];
    }
    return a[0].localeCompare(b[0]);
  });
  return ranked.slice(0, Math.max(1, limit)).map(([filePath]) => filePath);
}
function buildMcpGuardrailRetrievedFiles(input: {
storage: IStorageBackend;
projectPath: string;
query: string;
recallK: number;
mcpFiles: string[];
}): string[] {
const extractDocDriftHintPaths = (query: string): string[] => {
const referenceMatch = query.match(/references\s+"([^"]+)"/i);
const docMatch = query.match(/from\s+"([^"]+)"/i);
const hints = [docMatch?.[1], referenceMatch?.[1]]
.map((value) => value?.trim() ?? "")
.filter(Boolean)
.filter((value) => value.includes("/") && value.includes("."));
return hints.map((value) => normalizeFilePath(value, input.projectPath));
};
const hintPaths = extractDocDriftHintPaths(input.query);
const hintCandidates = new Set<string>();
for (const hintPath of hintPaths) {
if (!hintPath) {
continue;
}
hintCandidates.add(hintPath);
if (!existsSync(join(input.projectPath, hintPath))) {
const lowerBase = basename(hintPath).toLowerCase();
if (lowerBase.length >= 4) {
for (const filePath of listProjectFiles(input.projectPath)) {
const lower = filePath.toLowerCase();
if (lower.endsWith(`/${lowerBase}`) || lower === lowerBase) {
hintCandidates.add(filePath);
}
if (hintCandidates.size >= 20) {
break;
}
}
}
}
}
const grepFiles = runGrepToolingQuery(
input.projectPath,
input.query,
input.recallK,
);
const filenameFiles = runFilenameToolingQuery(
input.projectPath,
input.query,
input.recallK,
);
const symbolFiles = runSymbolIndexToolingQuery(
input.storage,
input.projectPath,
input.query,
input.recallK,
);
return fuseRankedFileLists(
[
{ files: Array.from(hintCandidates), weight: 8.5 },
{ files: input.mcpFiles, weight: 3.4 },
{ files: grepFiles, weight: 4.2 },
{ files: filenameFiles, weight: 1.8 },
{ files: symbolFiles, weight: 0.9 },
],
Math.max(1, input.recallK * 8),
);
}
/**
 * Dispatches a tooling-only retrieval query to the strategy selected by
 * `mode` and returns its ranked list of project-relative file paths.
 * MCP modes are excluded at the type level; unknown values yield [].
 */
function runToolingQuery(
  mode: Exclude<BenchmarkMode, "mcp_full" | "mcp_hybrid_guardrail">,
  storage: IStorageBackend,
  projectPath: string,
  query: string,
  recallK: number,
): string[] {
  if (mode === "grep_tools") {
    return runGrepToolingQuery(projectPath, query, recallK);
  }
  if (mode === "filename_tools") {
    return runFilenameToolingQuery(projectPath, query, recallK);
  }
  if (mode === "symbol_index_tools") {
    return runSymbolIndexToolingQuery(storage, projectPath, query, recallK);
  }
  if (mode === "lsp_tools") {
    return runLspToolingQuery(storage, projectPath, query, recallK);
  }
  if (mode === "hybrid_tools") {
    return runHybridToolingQuery(storage, projectPath, query, recallK);
  }
  // Defensive fallback for values outside the compile-time union.
  return [];
}
/**
 * Simulates the payload an agent would assemble from a grep/open-file
 * tool chain: opens the top retrieved files and appends a fenced code
 * snippet per file (preferring a window around the first query-term hit)
 * until the token budget is exhausted.
 *
 * @param input.projectPath    Project root used to resolve relative paths.
 * @param input.query          Query whose terms guide snippet selection.
 * @param input.retrievedFiles Ranked candidate files; only the top
 *                             `openFileCount` are considered.
 * @param input.openFileCount  Max files to open (clamped to at least 1).
 * @param input.fileCharLimit  Hard character cap applied before tokenizing.
 * @param input.tokenBudget    Total token budget for the payload.
 * @returns The assembled payload, the files actually included, and the
 *          payload's final token count.
 */
async function buildToolingModelInputPayload(input: {
  projectPath: string;
  query: string;
  retrievedFiles: string[];
  openFileCount: number;
  fileCharLimit: number;
  tokenBudget: number;
}): Promise<{ payload: string; openedFiles: string[]; tokenCount: number }> {
  const filesToOpen = input.retrievedFiles.slice(
    0,
    Math.max(1, input.openFileCount),
  );
  const openedFiles: string[] = [];
  const parts: string[] = ["# Tool Results (grep/open-file chain)"];
  const basePayload = parts.join("\n");
  let payload = await truncateToTokens(
    basePayload,
    Math.max(1, input.tokenBudget),
  );
  let tokenCount = await countTokens(payload);
  // Snippets shorter than this are considered not worth including.
  const minSnippetTokens = 24;
  const queryTerms = extractQueryTerms(input.query);
  for (const filePath of filesToOpen) {
    const absolute = filePath.startsWith("/")
      ? filePath
      : join(input.projectPath, filePath);
    if (!existsSync(absolute)) {
      continue;
    }
    let content = "";
    try {
      content = readFileSync(absolute, "utf-8");
    } catch {
      // Best-effort payload build: unreadable files are skipped silently.
      continue;
    }
    if (!content) {
      continue;
    }
    // Hard character cap before token accounting bounds tokenizer work.
    if (content.length > input.fileCharLimit) {
      content = `${content.slice(0, input.fileCharLimit)}\n/* ...truncated... */`;
    }
    const remaining = input.tokenBudget - tokenCount;
    // Stop entirely once the budget cannot fit even a minimal snippet.
    if (remaining <= minSnippetTokens) {
      break;
    }
    const prefix = `\n\n## ${filePath}\n\`\`\`\n`;
    const suffix = "\n```";
    // Account for the markdown fence wrapper separately from the snippet.
    const wrapperTokens =
      (await countTokens(prefix)) + (await countTokens(suffix));
    const maxContentTokens = remaining - wrapperTokens;
    if (maxContentTokens < minSnippetTokens) {
      continue;
    }
    // Prefer a focused snippet around query terms, then fallback to leading chunk.
    let snippet = content;
    if (queryTerms.length > 0) {
      const lines = content.split("\n");
      const lowerTerms = queryTerms.map((term) => term.toLowerCase());
      let bestIndex = -1;
      for (let index = 0; index < lines.length; index++) {
        const line = (lines[index] ?? "").toLowerCase();
        if (lowerTerms.some((term) => line.includes(term))) {
          bestIndex = index;
          break;
        }
      }
      if (bestIndex >= 0) {
        // Window: 40 lines of context before the first hit, 120 after.
        const start = Math.max(0, bestIndex - 40);
        const end = Math.min(lines.length, bestIndex + 120);
        snippet = lines.slice(start, end).join("\n");
      }
    }
    let snippetTokens = await countTokens(snippet);
    if (snippetTokens > maxContentTokens) {
      snippet = await truncateToTokens(snippet, maxContentTokens);
      snippetTokens = await countTokens(snippet);
    }
    if (snippetTokens < minSnippetTokens) {
      continue;
    }
    const block = `${prefix}${snippet}${suffix}`;
    const blockTokens = await countTokens(block);
    // `continue` rather than `break`: a later, smaller file may still fit.
    if (blockTokens <= 0 || tokenCount + blockTokens > input.tokenBudget) {
      continue;
    }
    payload += block;
    tokenCount += blockTokens;
    openedFiles.push(filePath);
  }
  return {
    payload,
    openedFiles,
    tokenCount,
  };
}
/**
 * Assembles the model input for the MCP hybrid-guardrail mode: a
 * compressed slice of the MCP context followed by a numbered list of
 * guardrail candidate files, all bounded by the overall token budget.
 *
 * @returns The final payload, its token count, and the candidate files
 *          that made it into the numbered list.
 */
async function buildMcpGuardrailModelInputPayload(input: {
  mcpContext: string;
  retrievedFiles: string[];
  tokenBudget: number;
  contextShare: number;
  maxCandidates: number;
}): Promise<{ payload: string; tokenCount: number; surfacedFiles: string[] }> {
  // Clamp the context's share of the budget to [0.15, 0.9] so the
  // candidate list always keeps some room.
  const share = Math.min(0.9, Math.max(0.15, input.contextShare));
  // The MCP context gets at least 256 tokens regardless of the share.
  const compressedContext = await truncateToTokens(
    input.mcpContext,
    Math.max(256, Math.floor(input.tokenBudget * share)),
  );
  // Surface at least four candidates even for a tiny maxCandidates.
  const surfacedFiles = input.retrievedFiles.slice(
    0,
    Math.max(4, input.maxCandidates),
  );
  const numberedCandidates = surfacedFiles
    .map((filePath, position) => `${position + 1}. ${filePath}`)
    .join("\n");
  // Final clamp guards against the combined sections exceeding budget.
  const payload = await truncateToTokens(
    `# MCP Context (compressed)\n${compressedContext}\n\n# Guardrail Candidate Files\n${numberedCandidates}`,
    Math.max(1, input.tokenBudget),
  );
  return {
    payload,
    tokenCount: await countTokens(payload),
    surfacedFiles,
  };
}
/**
 * Scores retrieved files against the fixture's expected paths.
 * Recall/precision are computed case-insensitively over the top-K
 * retrieved files (disallowed citation paths are filtered out first);
 * the hallucination ratio counts top-K paths that do not resolve to an
 * existing file on disk.
 */
function scoreRetrievedFiles(input: {
  expectedFilePaths: string[];
  retrievedFilePaths: string[];
  recallK: number;
  projectPath: string;
}) {
  const byLocale = (left: string, right: string) => left.localeCompare(right);
  const expectedLower = new Set(
    input.expectedFilePaths.map((path) => path.toLowerCase()),
  );
  const topK = input.retrievedFilePaths
    .filter((path) => !isDisallowedCitedPath(path))
    .slice(0, Math.max(1, input.recallK));
  const retrievedLower = new Set(topK.map((path) => path.toLowerCase()));
  const matched = new Set<string>();
  const missing = new Set<string>();
  for (const expectedPath of expectedLower) {
    (retrievedLower.has(expectedPath) ? matched : missing).add(expectedPath);
  }
  // A cited path "hallucinates" when it cannot be resolved to a real file.
  const hallucinatedCount = topK.filter((path) => {
    const absolute = resolveProjectPath(path, input.projectPath);
    return !absolute || !existsSync(absolute);
  }).length;
  const fileRecall =
    expectedLower.size === 0
      ? 1
      : toFixedNumber(matched.size / expectedLower.size, 4);
  let filePrecision: number;
  if (retrievedLower.size === 0) {
    // Nothing retrieved: perfect only if nothing was expected either.
    filePrecision = expectedLower.size === 0 ? 1 : 0;
  } else {
    filePrecision = toFixedNumber(matched.size / retrievedLower.size, 4);
  }
  const hallucinatedRatio =
    topK.length === 0
      ? 0
      : toFixedNumber(hallucinatedCount / topK.length, 4);
  return {
    fileRecall,
    filePrecision,
    hallucinatedRatio,
    matchedFiles: [...matched].sort(byLocale),
    missingFiles: [...missing].sort(byLocale),
    retrievedTopK: topK.sort(byLocale),
  };
}
/**
 * Aggregates a set of benchmark runs into a single mode-level summary:
 * retrieval latency distribution, quality averages, token usage, and
 * both measured and estimated timing statistics.
 */
function summarizeModeRuns(input: {
  mode: BenchmarkMode;
  runs: ChoiceRun[];
  tokenBudget: number;
  timingSource: TimingSource;
}): Omit<ChoiceModeSummary, "byRunKind"> {
  const runs = input.runs;
  // Shared avg/p50/p95 helper for distribution-shaped fields.
  const spread = (values: number[]) => ({
    avg: average(values),
    p50: percentile(values, 50),
    p95: percentile(values, 95),
  });
  const inputTokenValues = runs.map((run) => run.inputTokens);
  const payloadInputTokenValues = runs.map((run) => run.payloadInputTokens);
  const providerInputTokenValues = runs.map((run) => run.providerInputTokens);
  const outputTokenValues = runs.map((run) => run.outputTokens);
  // Tokens spent per matched file; falls back to the full token load
  // when a run matched nothing.
  const tokensPerMatch = runs.map((run) =>
    run.matchedFileCount > 0
      ? run.inputTokens / run.matchedFileCount
      : run.inputTokens,
  );
  const modelRequestValues = runs.map(
    (run) => run.timingBreakdownMs.modelRequestMs,
  );
  const endToEndValues = runs.map((run) => run.timingBreakdownMs.endToEndMs);
  const retrievalValues = runs.map((run) => run.latencyMs);
  return {
    mode: input.mode,
    runs: runs.length,
    timingSource: input.timingSource,
    latencyMs: spread(retrievalValues),
    quality: {
      fileRecallAvg: average(runs.map((run) => run.fileRecall)),
      filePrecisionAvg: average(runs.map((run) => run.filePrecision)),
      wrongPathRatioAvg: average(runs.map((run) => 1 - run.filePrecision)),
      hallucinatedRatioAvg: average(runs.map((run) => run.hallucinatedRatio)),
    },
    tokenUsage: {
      inputTokensAvg: average(inputTokenValues),
      inputTokensP50: percentile(inputTokenValues, 50),
      inputTokensP95: percentile(inputTokenValues, 95),
      payloadInputTokensAvg: average(payloadInputTokenValues),
      payloadInputTokensP50: percentile(payloadInputTokenValues, 50),
      payloadInputTokensP95: percentile(payloadInputTokenValues, 95),
      providerInputTokensAvg: averageNullable(providerInputTokenValues),
      outputTokensAvg: average(outputTokenValues),
      outputTokensP50: percentile(outputTokenValues, 50),
      outputTokensP95: percentile(outputTokenValues, 95),
      openedFileCountAvg: average(runs.map((run) => run.openedFileCount)),
      tokensPerMatchedFileAvg: average(tokensPerMatch),
      // Utilization is capped at 1 even when a payload exceeded budget.
      budgetUtilizationAvg: average(
        runs.map((run) =>
          input.tokenBudget > 0
            ? Math.min(1, run.inputTokens / input.tokenBudget)
            : 0,
        ),
      ),
    },
    timingMs: {
      modelRequestAvg: average(modelRequestValues),
      modelRequestP50: percentile(modelRequestValues, 50),
      modelRequestP95: percentile(modelRequestValues, 95),
      endToEndAvg: average(endToEndValues),
      endToEndP50: percentile(endToEndValues, 50),
      endToEndP95: percentile(endToEndValues, 95),
      source: input.timingSource,
    },
    estimatedTimingMs: {
      llmProcessingAvg: average(modelRequestValues),
      endToEndAvg: average(endToEndValues),
      endToEndP95: percentile(endToEndValues, 95),
    },
  };
}
/**
 * Summarizes all runs belonging to one benchmark mode: an overall
 * summary plus per-run-kind (cold/warm) breakdowns where runs exist.
 */
function summarizeMode(input: {
  mode: BenchmarkMode;
  runs: ChoiceRun[];
  tokenBudget: number;
  timingSource: TimingSource;
  runKinds: RunKind[];
}): ChoiceModeSummary {
  const runsForMode = input.runs.filter((run) => run.mode === input.mode);
  const summarize = (runs: ChoiceRun[]) =>
    summarizeModeRuns({
      mode: input.mode,
      runs,
      tokenBudget: input.tokenBudget,
      timingSource: input.timingSource,
    });
  const byRunKind: Partial<
    Record<RunKind, Omit<ChoiceModeSummary, "byRunKind">>
  > = {};
  for (const runKind of input.runKinds) {
    const runsForKind = runsForMode.filter((run) => run.runKind === runKind);
    // Run kinds with no runs are omitted rather than summarized empty.
    if (runsForKind.length > 0) {
      byRunKind[runKind] = summarize(runsForKind);
    }
  }
  return { ...summarize(runsForMode), byRunKind };
}
/**
 * Loads the benchmark fixture (queries plus their expected file paths)
 * and optionally validates that every expected path exists on disk.
 *
 * Fixture location: DOCLEA_CHOICE_FIXTURE_PATH, falling back to
 * `<project>/.doclea/retrieval-agent-choice-queries.monorepo.json`.
 * When DOCLEA_CHOICE_VALIDATE_EXPECTED_EXISTS is true (default),
 * queries with missing expected paths are either dropped
 * (DOCLEA_CHOICE_DROP_INVALID_QUERIES, default true) or cause a hard
 * error.
 *
 * @throws When the fixture file is missing, fails schema validation,
 *         strict mode finds invalid queries, or no valid queries remain.
 * @returns The fixture path, active queries, and validation metadata
 *          that is embedded in the benchmark report.
 */
function loadFixture(projectPath: string): {
  fixturePath: string;
  queries: ChoiceQuery[];
  fixtureValidation: NonNullable<ChoiceBenchmarkReport["fixtureValidation"]>;
} {
  const fixturePath = process.env.DOCLEA_CHOICE_FIXTURE_PATH
    ? resolve(process.env.DOCLEA_CHOICE_FIXTURE_PATH)
    : resolve(
        projectPath,
        ".doclea/retrieval-agent-choice-queries.monorepo.json",
      );
  if (!existsSync(fixturePath)) {
    throw new Error(
      `Missing fixture at ${fixturePath}. Set DOCLEA_CHOICE_FIXTURE_PATH or create .doclea/retrieval-agent-choice-queries.monorepo.json.`,
    );
  }
  // Schema-validate the raw JSON with zod before any use.
  const parsed = JSON.parse(readFileSync(fixturePath, "utf-8"));
  const fixture = FixtureSchema.parse(parsed);
  const validateExpectedExists = parseBoolEnv(
    "DOCLEA_CHOICE_VALIDATE_EXPECTED_EXISTS",
    true,
  );
  const dropInvalidQueries = parseBoolEnv(
    "DOCLEA_CHOICE_DROP_INVALID_QUERIES",
    true,
  );
  if (!validateExpectedExists) {
    // Validation disabled: every query is active, nothing is dropped.
    return {
      fixturePath,
      queries: fixture.queries,
      fixtureValidation: {
        requestedQueries: fixture.queries.length,
        activeQueries: fixture.queries.length,
        droppedQueryCount: 0,
        droppedExpectedPathCount: 0,
        droppedQueryIds: [],
        droppedExpectedPathSamples: [],
      },
    };
  }
  const invalidByQuery: Array<{
    queryId: string;
    missingPaths: string[];
  }> = [];
  const activeQueries: ChoiceQuery[] = [];
  // Partition queries by whether all of their expected paths resolve.
  for (const query of fixture.queries) {
    const missingPaths = query.expectedFilePaths.filter((path) => {
      const resolvedPath = resolveProjectPath(path, projectPath);
      return !resolvedPath || !existsSync(resolvedPath);
    });
    if (missingPaths.length > 0) {
      invalidByQuery.push({
        queryId: query.id,
        missingPaths,
      });
      continue;
    }
    activeQueries.push(query);
  }
  if (invalidByQuery.length > 0 && !dropInvalidQueries) {
    // Strict mode: fail loudly with a short preview of the bad queries.
    const preview = invalidByQuery
      .slice(0, 8)
      .map(
        (item) =>
          `${item.queryId}: ${item.missingPaths.slice(0, 4).join(", ")}${item.missingPaths.length > 4 ? " ..." : ""}`,
      )
      .join("\n");
    throw new Error(
      `Fixture contains ${invalidByQuery.length} queries with missing expected paths.\n${preview}\nSet DOCLEA_CHOICE_DROP_INVALID_QUERIES=true to drop invalid queries automatically.`,
    );
  }
  if (activeQueries.length === 0) {
    throw new Error(
      `Fixture has no valid queries after expected-path validation: ${fixturePath}`,
    );
  }
  // Keep a bounded sample of dropped paths for the report metadata.
  const droppedExpectedPathSamples = invalidByQuery
    .flatMap((item) =>
      item.missingPaths.map((path) => ({
        queryId: item.queryId,
        path,
      })),
    )
    .slice(0, 24);
  if (invalidByQuery.length > 0 && dropInvalidQueries) {
    console.error(
      `[doclea] Dropping ${invalidByQuery.length} invalid queries (${droppedExpectedPathSamples.length} missing path samples shown in report metadata).`,
    );
  }
  return {
    fixturePath,
    queries: activeQueries,
    fixtureValidation: {
      requestedQueries: fixture.queries.length,
      activeQueries: activeQueries.length,
      droppedQueryCount: invalidByQuery.length,
      droppedExpectedPathCount: invalidByQuery.reduce(
        (sum, item) => sum + item.missingPaths.length,
        0,
      ),
      droppedQueryIds: invalidByQuery.map((item) => item.queryId),
      droppedExpectedPathSamples,
    },
  };
}
/**
 * Runs the MCP-vs-grep "retrieval choice" benchmark end to end: loads
 * the fixture queries, executes every configured benchmark mode per
 * query (with optional warmups, cold/warm run kinds, and either modeled
 * or live-measured LLM timing), scores retrieved files against fixture
 * expectations, and writes a JSON report plus a condensed stdout summary.
 *
 * All knobs come from DOCLEA_* environment variables; see the parse*Env
 * calls below for names and defaults.
 */
async function main(): Promise<void> {
  const projectPath = resolve(
    process.env.DOCLEA_BENCH_PROJECT_PATH ?? process.cwd(),
  );
  const { fixturePath, queries, fixtureValidation } = loadFixture(projectPath);
  const config = await loadConfigWithAutoDetect(projectPath);
  // --- benchmark shape ---
  const runsPerQuery = parseIntEnv("DOCLEA_CHOICE_RUNS_PER_QUERY", 4);
  const warmupRuns = parseIntEnv("DOCLEA_CHOICE_WARMUP_RUNS", 0);
  const timingSource = parseTimingSourceEnv();
  const inputTokensSource = parseInputTokenSourceEnv();
  const runKinds = parseRunKindsEnv();
  const concurrency = parseIntEnv("DOCLEA_CHOICE_CONCURRENCY", 1);
  const tokenBudget = parseIntEnv("DOCLEA_CHOICE_TOKEN_BUDGET", 32000);
  const recallK = parseIntEnv("DOCLEA_CHOICE_RECALL_K", 20);
  const grepOpenFiles = parseIntEnv("DOCLEA_CHOICE_GREP_OPEN_FILES", 10);
  const grepFileCharLimit = parseIntEnv(
    "DOCLEA_CHOICE_GREP_FILE_CHAR_LIMIT",
    6000,
  );
  const estimatedOutputTokens = parseIntEnv(
    "DOCLEA_CHOICE_ESTIMATED_OUTPUT_TOKENS",
    400,
  );
  const mcpGuardrailContextShare = parseFloatEnv(
    "DOCLEA_CHOICE_MCP_GUARDRAIL_CONTEXT_SHARE",
    0.45,
  );
  const mcpGuardrailMaxCandidates = parseIntEnv(
    "DOCLEA_CHOICE_MCP_GUARDRAIL_MAX_CANDIDATES",
    80,
  );
  // Throughput assumptions used by the "modeled" timing mode.
  const inputTokensPerSecond = parseFloatEnv(
    "DOCLEA_CHOICE_MODEL_INPUT_TOKENS_PER_SEC",
    1200,
  );
  const outputTokensPerSecond = parseFloatEnv(
    "DOCLEA_CHOICE_MODEL_OUTPUT_TOKENS_PER_SEC",
    400,
  );
  // --- live LLM CLI settings (used when timing is "measured") ---
  const liveLlmCommand = (process.env.DOCLEA_LIVE_LLM_CLI_COMMAND ?? "").trim();
  const liveLlmModel = (
    process.env.DOCLEA_LIVE_LLM_MODEL ?? "benchmark-cli-model"
  ).trim();
  const liveLlmTemperature = parseFloatEnv("DOCLEA_LIVE_LLM_TEMPERATURE", 0);
  const liveLlmMaxOutputTokens = parseIntEnv(
    "DOCLEA_LIVE_LLM_MAX_OUTPUT_TOKENS",
    estimatedOutputTokens,
  );
  const liveLlmTimeoutMs = parseIntEnv("DOCLEA_LIVE_LLM_TIMEOUT_MS", 120_000);
  const clearCacheBeforeRun = parseBoolEnv(
    "DOCLEA_CHOICE_CLEAR_CACHE_BEFORE_RUN",
    true,
  );
  const template =
    (process.env.DOCLEA_CHOICE_TEMPLATE as
      | "default"
      | "compact"
      | "detailed") || "compact";
  const modes = parseBenchmarkModesEnv();
  const realworldCodex = parseBoolEnv("DOCLEA_CHOICE_REALWORLD_CODEX", false);
  // --- optional Qwen embedding-stack preflight configuration ---
  const requireQwenStack = parseBoolEnv(
    "DOCLEA_CHOICE_REQUIRE_QWEN_STACK",
    realworldCodex,
  );
  const qwenExpectedEndpoint = normalizeBaseUrl(
    process.env.DOCLEA_CHOICE_QWEN_EMBED_ENDPOINT ?? "http://localhost:8180",
  );
  const qwenExpectedQdrantUrl = normalizeBaseUrl(
    process.env.DOCLEA_CHOICE_QWEN_QDRANT_URL ?? "http://localhost:6333",
  );
  const qwenExpectedCollectionName = (
    process.env.DOCLEA_CHOICE_QWEN_COLLECTION ?? "doclea-memories-qwen"
  ).trim();
  const qwenExpectedVectorSize = parseIntEnv(
    "DOCLEA_CHOICE_QWEN_VECTOR_SIZE",
    1024,
  );
  const qwenPreflightTimeoutMs = parseIntEnv(
    "DOCLEA_CHOICE_QWEN_PREFLIGHT_TIMEOUT_MS",
    6000,
  );
  const runs: ChoiceRun[] = [];
  let qwenPreflight: ChoiceBenchmarkReport["qwenPreflight"] = null;
  // Measured timing requires a live LLM CLI command to invoke.
  if (timingSource === "measured" && !liveLlmCommand) {
    throw new Error(
      "DOCLEA_LIVE_LLM_CLI_COMMAND is required when DOCLEA_CHOICE_TIMING_MODE=measured.",
    );
  }
  if (realworldCodex && timingSource !== "measured") {
    throw new Error(
      "DOCLEA_CHOICE_REALWORLD_CODEX requires DOCLEA_CHOICE_TIMING_MODE=measured.",
    );
  }
  if (realworldCodex) {
    const unsupportedModes = modes.filter(
      (mode) => mode !== "mcp_full" && mode !== "grep_tools",
    );
    if (unsupportedModes.length > 0) {
      throw new Error(
        `DOCLEA_CHOICE_REALWORLD_CODEX supports only mcp_full and grep_tools. Unsupported: ${unsupportedModes.join(", ")}`,
      );
    }
  }
  if (requireQwenStack) {
    qwenPreflight = await runQwenStackPreflight({
      projectPath,
      config,
      expectedEndpoint: qwenExpectedEndpoint,
      expectedQdrantUrl: qwenExpectedQdrantUrl,
      expectedCollectionName: qwenExpectedCollectionName,
      expectedVectorSize: qwenExpectedVectorSize,
      timeoutMs: qwenPreflightTimeoutMs,
    });
    // Tune local TEI batch size for Qwen CPU unless explicitly overridden.
    if (!process.env.DOCLEA_LOCAL_EMBED_MAX_BATCH_SIZE) {
      process.env.DOCLEA_LOCAL_EMBED_MAX_BATCH_SIZE = "8";
    }
  }
  // --- storage / vector / embedding client setup ---
  const storage = createStorageBackend(config.storage, projectPath);
  await storage.initialize();
  const vectors = createVectorStore(config.vector, projectPath);
  await vectors.initialize();
  const baseEmbeddings = createEmbeddingClient(config.embedding);
  const modelName =
    config.embedding.provider === "local"
      ? "local-tei"
      : config.embedding.model;
  const embeddings = new CachedEmbeddingClient(
    baseEmbeddings,
    storage,
    modelName,
  );
  try {
    for (const query of queries) {
      const expectedFilePaths = query.expectedFilePaths.map((path) =>
        normalizeFilePath(path, projectPath),
      );
      const runIndexes = Array.from(
        { length: runsPerQuery },
        (_, index) => index,
      );
      for (const runKind of runKinds) {
        const isColdRun = runKind === "cold";
        const shouldResetBeforeRun = clearCacheBeforeRun && isColdRun;
        // Cold runs reset caches between iterations, so force serial
        // execution to keep per-run cache state isolated.
        const effectiveConcurrency = shouldResetBeforeRun
          ? 1
          : Math.max(1, concurrency);
        for (const mode of modes) {
          if (realworldCodex) {
            // Real-world Codex path: the model performs its own retrieval,
            // so retrieval time is zero and the cited files are parsed out
            // of the completion text for scoring.
            const modeRuns = await runWithConcurrency({
              items: runIndexes,
              concurrency: effectiveConcurrency,
              worker: async () => {
                const retrievalMs = 0;
                const promptBuildStartedAt = performance.now();
                const messages = buildRealworldCodexMessages({
                  query: query.query,
                  mode,
                  projectPath,
                  tokenBudget,
                });
                const localInputTokens = await countTokens(
                  formatMessagesForTokenCount(messages),
                );
                const promptBuildMs = toFixedNumber(
                  performance.now() - promptBuildStartedAt,
                  4,
                );
                let inputTokens = localInputTokens;
                let outputTokens = estimatedOutputTokens;
                let payloadOutputTokens = estimatedOutputTokens;
                let providerInputTokens: number | null = null;
                let providerOutputTokens: number | null = null;
                let modelRequestMs = 0;
                let retrievedFilesForScore: string[] = [];
                const modelStartedAt = performance.now();
                const completion = await runLlmCliCompletion({
                  command: liveLlmCommand,
                  timeoutMs: liveLlmTimeoutMs,
                  request: {
                    version: "v1",
                    model: liveLlmModel,
                    temperature: liveLlmTemperature,
                    maxOutputTokens: liveLlmMaxOutputTokens,
                    messages,
                    metadata: {
                      queryId: query.id,
                      mode,
                      runKind,
                      tokenBudget,
                      projectPath,
                    },
                  },
                });
                modelRequestMs = toFixedNumber(
                  performance.now() - modelStartedAt,
                  4,
                );
                providerInputTokens = completion.inputTokens;
                providerOutputTokens = completion.outputTokens;
                payloadOutputTokens = await countTokens(completion.outputText);
                inputTokens =
                  inputTokensSource === "provider"
                    ? completion.inputTokens
                    : localInputTokens;
                outputTokens = completion.outputTokens;
                retrievedFilesForScore = parseCitedFilesFromCompletion({
                  outputText: completion.outputText,
                  projectPath,
                });
                const score = scoreRetrievedFiles({
                  expectedFilePaths,
                  retrievedFilePaths: retrievedFilesForScore,
                  recallK,
                  projectPath,
                });
                const endToEndMs = toFixedNumber(
                  retrievalMs + promptBuildMs + modelRequestMs,
                  4,
                );
                return {
                  queryId: query.id,
                  query: query.query,
                  mode,
                  runKind,
                  timingSource,
                  latencyMs: retrievalMs,
                  inputTokens,
                  inputTokensSource,
                  payloadInputTokens: localInputTokens,
                  payloadOutputTokens,
                  providerInputTokens,
                  providerOutputTokens,
                  outputTokens,
                  timingBreakdownMs: {
                    retrievalMs,
                    promptBuildMs,
                    modelRequestMs,
                    endToEndMs,
                  },
                  estimatedLlmMs: modelRequestMs,
                  estimatedEndToEndMs: endToEndMs,
                  openedFileCount: retrievedFilesForScore.length,
                  matchedFileCount: score.matchedFiles.length,
                  fileRecall: score.fileRecall,
                  filePrecision: score.filePrecision,
                  hallucinatedRatio: score.hallucinatedRatio,
                  matchedFiles: score.matchedFiles,
                  missingFiles: score.missingFiles,
                  retrievedTopK: score.retrievedTopK,
                } as ChoiceRun;
              },
            });
            runs.push(...modeRuns);
            continue;
          }
          if (mode === "mcp_full" || mode === "mcp_hybrid_guardrail") {
            const input = {
              query: query.query,
              tokenBudget,
              includeCodeGraph: true,
              includeGraphRAG: true,
              includeEvidence: true,
              template,
            } as const;
            // Warmups populate the context cache without being recorded.
            for (let warmup = 0; warmup < warmupRuns; warmup++) {
              if (shouldResetBeforeRun) {
                resetContextCache();
              }
              await buildContextWithCache(
                input,
                storage,
                vectors,
                embeddings,
                config.cache,
                config.scoring,
              );
            }
            const modeRuns = await runWithConcurrency({
              items: runIndexes,
              concurrency: effectiveConcurrency,
              worker: async () => {
                if (shouldResetBeforeRun) {
                  resetContextCache();
                }
                const retrievalStartedAt = performance.now();
                const result = await buildContextWithCache(
                  input,
                  storage,
                  vectors,
                  embeddings,
                  config.cache,
                  config.scoring,
                );
                const retrievalMs = toFixedNumber(
                  performance.now() - retrievalStartedAt,
                  4,
                );
                const promptBuildStartedAt = performance.now();
                const mcpRetrievedFiles = collectTopKFilesFromEvidence(
                  result.evidence ?? [],
                  Math.max(recallK, recallK * 2),
                  projectPath,
                );
                let retrievedFilesForScore: string[] = mcpRetrievedFiles;
                let contextPayload = "";
                let localInputTokens = 0;
                let openedFileCount = mcpRetrievedFiles.length;
                if (mode === "mcp_hybrid_guardrail") {
                  // Guardrail mode re-ranks MCP files against grep/filename/
                  // symbol candidates and doc-drift hints from the query.
                  const guardrailRetrievedFiles =
                    buildMcpGuardrailRetrievedFiles({
                      storage,
                      projectPath,
                      query: query.query,
                      recallK,
                      mcpFiles: mcpRetrievedFiles,
                    });
                  const payload = await buildMcpGuardrailModelInputPayload({
                    mcpContext: result.context,
                    retrievedFiles: guardrailRetrievedFiles,
                    tokenBudget,
                    contextShare: mcpGuardrailContextShare,
                    maxCandidates: mcpGuardrailMaxCandidates,
                  });
                  retrievedFilesForScore = guardrailRetrievedFiles;
                  contextPayload = payload.payload;
                  localInputTokens = payload.tokenCount;
                  openedFileCount = payload.surfacedFiles.length;
                } else {
                  // Plain MCP mode: the model sees the raw MCP context,
                  // clamped to the token budget.
                  const boundedPayload = await truncateToTokens(
                    `# MCP Context\n${result.context}`,
                    Math.max(1, tokenBudget),
                  );
                  contextPayload = boundedPayload;
                  localInputTokens = await countTokens(boundedPayload);
                }
                const promptBuildMs = toFixedNumber(
                  performance.now() - promptBuildStartedAt,
                  4,
                );
                const score = scoreRetrievedFiles({
                  expectedFilePaths,
                  retrievedFilePaths: retrievedFilesForScore,
                  recallK,
                  projectPath,
                });
                let inputTokens = localInputTokens;
                let outputTokens = estimatedOutputTokens;
                let payloadOutputTokens = estimatedOutputTokens;
                let providerInputTokens: number | null = null;
                let providerOutputTokens: number | null = null;
                let modelRequestMs = 0;
                if (timingSource === "measured") {
                  const messages = buildLiveBenchmarkMessages({
                    query: query.query,
                    contextPayload,
                  });
                  const modelStartedAt = performance.now();
                  const completion = await runLlmCliCompletion({
                    command: liveLlmCommand,
                    timeoutMs: liveLlmTimeoutMs,
                    request: {
                      version: "v1",
                      model: liveLlmModel,
                      temperature: liveLlmTemperature,
                      maxOutputTokens: liveLlmMaxOutputTokens,
                      messages,
                      metadata: {
                        queryId: query.id,
                        mode,
                        runKind,
                        tokenBudget,
                        projectPath,
                      },
                    },
                  });
                  modelRequestMs = toFixedNumber(
                    performance.now() - modelStartedAt,
                    4,
                  );
                  providerInputTokens = completion.inputTokens;
                  providerOutputTokens = completion.outputTokens;
                  payloadOutputTokens = await countTokens(
                    completion.outputText,
                  );
                  inputTokens =
                    inputTokensSource === "provider"
                      ? completion.inputTokens
                      : localInputTokens;
                  outputTokens = completion.outputTokens;
                } else {
                  // Modeled timing from the token throughput assumptions.
                  modelRequestMs = estimateLlmProcessingMs({
                    inputTokens: localInputTokens,
                    outputTokens: estimatedOutputTokens,
                    inputTokensPerSecond,
                    outputTokensPerSecond,
                  });
                }
                const endToEndMs = toFixedNumber(
                  retrievalMs + promptBuildMs + modelRequestMs,
                  4,
                );
                return {
                  queryId: query.id,
                  query: query.query,
                  mode,
                  runKind,
                  timingSource,
                  latencyMs: retrievalMs,
                  inputTokens,
                  inputTokensSource,
                  payloadInputTokens: localInputTokens,
                  payloadOutputTokens,
                  providerInputTokens,
                  providerOutputTokens,
                  outputTokens,
                  timingBreakdownMs: {
                    retrievalMs,
                    promptBuildMs,
                    modelRequestMs,
                    endToEndMs,
                  },
                  estimatedLlmMs: modelRequestMs,
                  estimatedEndToEndMs: endToEndMs,
                  openedFileCount,
                  matchedFileCount: score.matchedFiles.length,
                  fileRecall: score.fileRecall,
                  filePrecision: score.filePrecision,
                  hallucinatedRatio: score.hallucinatedRatio,
                  matchedFiles: score.matchedFiles,
                  missingFiles: score.missingFiles,
                  retrievedTopK: score.retrievedTopK,
                } as ChoiceRun;
              },
            });
            runs.push(...modeRuns);
          } else {
            // Tooling-only modes (grep/filename/symbol/lsp/hybrid).
            for (let warmup = 0; warmup < warmupRuns; warmup++) {
              if (shouldResetBeforeRun) {
                resetContextCache();
              }
              runToolingQuery(mode, storage, projectPath, query.query, recallK);
            }
            const modeRuns = await runWithConcurrency({
              items: runIndexes,
              concurrency: effectiveConcurrency,
              worker: async () => {
                if (shouldResetBeforeRun) {
                  resetContextCache();
                }
                const retrievalStartedAt = performance.now();
                const retrievedFiles = runToolingQuery(
                  mode,
                  storage,
                  projectPath,
                  query.query,
                  recallK,
                );
                const retrievalMs = toFixedNumber(
                  performance.now() - retrievalStartedAt,
                  4,
                );
                const promptBuildStartedAt = performance.now();
                const payload = await buildToolingModelInputPayload({
                  projectPath,
                  query: query.query,
                  retrievedFiles,
                  openFileCount: grepOpenFiles,
                  fileCharLimit: grepFileCharLimit,
                  tokenBudget,
                });
                const contextPayload = payload.payload;
                const localInputTokens = payload.tokenCount;
                const promptBuildMs = toFixedNumber(
                  performance.now() - promptBuildStartedAt,
                  4,
                );
                // Tooling modes are scored on the files actually opened
                // (i.e. included in the payload), not all retrieved files.
                const score = scoreRetrievedFiles({
                  expectedFilePaths,
                  retrievedFilePaths: payload.openedFiles,
                  recallK,
                  projectPath,
                });
                let inputTokens = localInputTokens;
                let outputTokens = estimatedOutputTokens;
                let payloadOutputTokens = estimatedOutputTokens;
                let providerInputTokens: number | null = null;
                let providerOutputTokens: number | null = null;
                let modelRequestMs = 0;
                if (timingSource === "measured") {
                  const messages = buildLiveBenchmarkMessages({
                    query: query.query,
                    contextPayload,
                  });
                  const modelStartedAt = performance.now();
                  const completion = await runLlmCliCompletion({
                    command: liveLlmCommand,
                    timeoutMs: liveLlmTimeoutMs,
                    request: {
                      version: "v1",
                      model: liveLlmModel,
                      temperature: liveLlmTemperature,
                      maxOutputTokens: liveLlmMaxOutputTokens,
                      messages,
                      metadata: {
                        queryId: query.id,
                        mode,
                        runKind,
                        tokenBudget,
                        projectPath,
                      },
                    },
                  });
                  modelRequestMs = toFixedNumber(
                    performance.now() - modelStartedAt,
                    4,
                  );
                  providerInputTokens = completion.inputTokens;
                  providerOutputTokens = completion.outputTokens;
                  payloadOutputTokens = await countTokens(
                    completion.outputText,
                  );
                  inputTokens =
                    inputTokensSource === "provider"
                      ? completion.inputTokens
                      : localInputTokens;
                  outputTokens = completion.outputTokens;
                } else {
                  modelRequestMs = estimateLlmProcessingMs({
                    inputTokens: localInputTokens,
                    outputTokens: estimatedOutputTokens,
                    inputTokensPerSecond,
                    outputTokensPerSecond,
                  });
                }
                const endToEndMs = toFixedNumber(
                  retrievalMs + promptBuildMs + modelRequestMs,
                  4,
                );
                return {
                  queryId: query.id,
                  query: query.query,
                  mode,
                  runKind,
                  timingSource,
                  latencyMs: retrievalMs,
                  inputTokens,
                  inputTokensSource,
                  payloadInputTokens: localInputTokens,
                  payloadOutputTokens,
                  providerInputTokens,
                  providerOutputTokens,
                  outputTokens,
                  timingBreakdownMs: {
                    retrievalMs,
                    promptBuildMs,
                    modelRequestMs,
                    endToEndMs,
                  },
                  estimatedLlmMs: modelRequestMs,
                  estimatedEndToEndMs: endToEndMs,
                  openedFileCount: payload.openedFiles.length,
                  matchedFileCount: score.matchedFiles.length,
                  fileRecall: score.fileRecall,
                  filePrecision: score.filePrecision,
                  hallucinatedRatio: score.hallucinatedRatio,
                  matchedFiles: score.matchedFiles,
                  missingFiles: score.missingFiles,
                  retrievedTopK: score.retrievedTopK,
                } as ChoiceRun;
              },
            });
            runs.push(...modeRuns);
          }
        }
      }
    }
    // --- report assembly and persistence ---
    const report: ChoiceBenchmarkReport = {
      generatedAt: new Date().toISOString(),
      projectPath,
      fixturePath,
      fixtureValidation,
      timingSource,
      concurrency: Math.max(1, concurrency),
      recallK,
      tokenBudget,
      runsPerQuery,
      warmupRuns,
      runKinds,
      // Live-inference metadata is only reported in measured mode.
      liveInference: {
        command: timingSource === "measured" ? liveLlmCommand : null,
        model: timingSource === "measured" ? liveLlmModel : null,
        region:
          timingSource === "measured"
            ? (process.env.DOCLEA_LIVE_LLM_REGION ?? null)
            : null,
        timeoutMs: timingSource === "measured" ? liveLlmTimeoutMs : null,
        temperature:
          timingSource === "measured"
            ? toFixedNumber(liveLlmTemperature, 4)
            : null,
        maxOutputTokens:
          timingSource === "measured" ? liveLlmMaxOutputTokens : null,
      },
      tokenAccounting: {
        inputTokensSource,
        description:
          inputTokensSource === "payload"
            ? "Input tokens are measured from the benchmark payload sent to the model."
            : "Input tokens are taken from provider-reported usage.",
      },
      qwenPreflight,
      realworldCodex,
      comparisonModel: {
        grepOpenFiles,
        grepFileCharLimit,
        estimatedOutputTokens,
        inputTokensPerSecond: toFixedNumber(inputTokensPerSecond, 2),
        outputTokensPerSecond: toFixedNumber(outputTokensPerSecond, 2),
        activeModes: modes,
      },
      modes: modes.map((mode) =>
        summarizeMode({
          mode,
          runs,
          tokenBudget,
          timingSource,
          runKinds,
        }),
      ),
      runs,
    };
    const outputPath = resolve(
      process.env.DOCLEA_CHOICE_REPORT_JSON_PATH ??
        `${projectPath}/.doclea/reports/mcp-vs-grep-choice-benchmark.json`,
    );
    ensureDirectory(outputPath);
    writeFileSync(outputPath, `${JSON.stringify(report, null, 2)}\n`, "utf-8");
    // Echo a condensed summary (without per-run detail) to stdout.
    console.log(
      JSON.stringify(
        {
          reportPath: outputPath,
          queryCount: queries.length,
          runsPerQuery,
          concurrency: report.concurrency,
          liveInference: report.liveInference,
          tokenAccounting: report.tokenAccounting,
          fixtureValidation: report.fixtureValidation,
          qwenPreflight: report.qwenPreflight,
          realworldCodex: report.realworldCodex ?? false,
          recallK,
          tokenBudget,
          comparisonModel: report.comparisonModel,
          modes: report.modes,
        },
        null,
        2,
      ),
    );
  } finally {
    // Best-effort cleanup of the vector store and storage backend.
    if (typeof vectors.close === "function") {
      vectors.close();
    }
    storage.close();
  }
}
// Script entry point: run the benchmark via top-level await.
await main();