Vector Memory MCP Server

runner.ts•7.14 KiB

/** * Benchmark Runner * * Executes benchmark queries against a loaded dataset and calculates * retrieval quality metrics. */ import { mkdtempSync, rmSync } from "fs"; import { join } from "path"; import { tmpdir } from "os"; import { connectToDatabase } from "../../src/db/connection"; import { MemoryRepository } from "../../src/db/memory.repository"; import { EmbeddingsService } from "../../src/services/embeddings.service"; import { MemoryService } from "../../src/services/memory.service"; import type { BenchmarkDataset, BenchmarkResults, CategoryMetrics, QueryCategory, QueryResult, } from "./types"; import type { SearchIntent } from "../../src/types/memory"; import { precisionAtK, recallAtK, reciprocalRank, ndcgAtK, buildRelevanceScores, } from "./metrics"; const MODEL_NAME = "Xenova/all-MiniLM-L6-v2"; const MODEL_DIMENSION = 384; /** * Thresholds for pass/fail by query category. */ /** * Thresholds adjusted for hybrid search with intent-based scoring and jitter. * Hybrid search trades some top-1 precision for better overall recall and * noise-robustness via controlled randomness. * Lowered to account for cross-platform embedding variance in CI. */ const CATEGORY_THRESHOLDS: Record< QueryCategory, { minMRR?: number; minPrecision1?: number; minRecall5?: number } > = { exact_match: { minMRR: 0.65, minPrecision1: 0.45 }, // Adjusted for hybrid + jitter + CI variance semantic: { minMRR: 0.45 }, related_concept: { minRecall5: 0.4 }, negative: {}, // Special handling - no false positives edge_case: { minMRR: 0.3 }, }; export class BenchmarkRunner { private service: MemoryService | null = null; private tmpDir: string | null = null; /** Maps dataset memory IDs to actual stored memory IDs */ private memoryIdMap: Map<string, string> = new Map(); /** * Initialize the benchmark environment. * Creates a temporary database and initializes services. */ async setup(): Promise<void> { this.tmpDir = mkdtempSync(join(tmpdir(), "vector-memory-benchmark-")); const dbPath = join(this.tmpDir, "benchmark.lancedb"); const db = await connectToDatabase(dbPath); const repository = new MemoryRepository(db); const embeddings = new EmbeddingsService(MODEL_NAME, MODEL_DIMENSION); this.service = new MemoryService(repository, embeddings); } /** * Clean up temporary files. */ async teardown(): Promise<void> { if (this.tmpDir) { rmSync(this.tmpDir, { recursive: true, force: true }); this.tmpDir = null; } this.service = null; this.memoryIdMap.clear(); } /** * Load a dataset into the database. * Stores all memories and tracks ID mappings. */ async loadDataset(dataset: BenchmarkDataset): Promise<void> { if (!this.service) { throw new Error("Call setup() first"); } this.memoryIdMap.clear(); for (const mem of dataset.memories) { const stored = await this.service.store(mem.content, mem.metadata ?? {}); this.memoryIdMap.set(mem.id, stored.id); } } /** * Run the benchmark against all queries in the dataset. */ async runBenchmark(dataset: BenchmarkDataset): Promise<BenchmarkResults> { if (!this.service) { throw new Error("Call setup() first"); } const queryResults: QueryResult[] = []; const categoryResults = new Map<QueryCategory, QueryResult[]>(); for (const query of dataset.queries) { // Run search - fetch more than needed to measure recall // Use "fact_check" intent for benchmarks as it emphasizes relevance const intent: SearchIntent = "fact_check"; const results = await this.service.search(query.query, intent, 10); const retrievedIds = results.map((m) => m.id); // Map expected IDs to actual stored IDs const expectedActualIds = query.relevantMemoryIds .map((id) => this.memoryIdMap.get(id)) .filter((id): id is string => id !== undefined); const partialActualIds = (query.partiallyRelevantIds ?? []) .map((id) => this.memoryIdMap.get(id)) .filter((id): id is string => id !== undefined); // Calculate metrics const relevantSet = new Set(expectedActualIds); const relevanceScores = buildRelevanceScores( expectedActualIds, partialActualIds ); const result: QueryResult = { queryId: query.id, query: query.query, category: query.category, retrievedIds, expectedIds: expectedActualIds, precision1: precisionAtK(retrievedIds, relevantSet, 1), precision5: precisionAtK(retrievedIds, relevantSet, 5), recall5: recallAtK(retrievedIds, relevantSet, 5), reciprocalRank: reciprocalRank(retrievedIds, relevantSet), ndcg5: ndcgAtK(retrievedIds, relevanceScores, 5), passed: false, // Set by threshold check }; result.passed = this.meetsThreshold(result); queryResults.push(result); // Group by category if (!categoryResults.has(query.category)) { categoryResults.set(query.category, []); } categoryResults.get(query.category)!.push(result); } // Aggregate metrics by category const byCategory = new Map<QueryCategory, CategoryMetrics>(); for (const [category, results] of categoryResults) { byCategory.set(category, this.aggregateMetrics(results)); } return { byCategory, overall: this.aggregateMetrics(queryResults), queryResults, }; } /** * Check if a query result meets its category threshold. */ private meetsThreshold(result: QueryResult): boolean { const thresholds = CATEGORY_THRESHOLDS[result.category]; // Special handling for negative queries if (result.category === "negative") { // For negative tests, we don't have relevant items defined // The test passes if we don't have false positives // Since we can't easily measure this without relevance labels, // we consider negative tests as passed (informational only) return true; } if ( thresholds.minMRR !== undefined && result.reciprocalRank < thresholds.minMRR ) { return false; } if ( thresholds.minPrecision1 !== undefined && result.precision1 < thresholds.minPrecision1 ) { return false; } if ( thresholds.minRecall5 !== undefined && result.recall5 < thresholds.minRecall5 ) { return false; } return true; } /** * Aggregate metrics across multiple query results. */ private aggregateMetrics(results: QueryResult[]): CategoryMetrics { const n = results.length; if (n === 0) { return { meanPrecisionAt1: 0, meanPrecisionAt5: 0, meanRecallAt5: 0, meanReciprocalRank: 0, meanNDCGAt5: 0, queryCount: 0, }; } return { meanPrecisionAt1: results.reduce((s, r) => s + r.precision1, 0) / n, meanPrecisionAt5: results.reduce((s, r) => s + r.precision5, 0) / n, meanRecallAt5: results.reduce((s, r) => s + r.recall5, 0) / n, meanReciprocalRank: results.reduce((s, r) => s + r.reciprocalRank, 0) / n, meanNDCGAt5: results.reduce((s, r) => s + r.ndcg5, 0) / n, queryCount: n, }; } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/AerionDyseti/vector-memory-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

runner.ts•7.14 KiB