MCP Router

embedding.provider.ts•7.44 KiB

import { injectable, inject } from 'inversify';
import { TYPES } from '@main/core/types';
import type { ILogger } from '@main/core/interfaces';

/**
 * Embedding provider interface for generating vector embeddings.
 */
export interface IEmbeddingProvider {
  /** Generate embedding vector for text */
  embed(text: string): Promise<number[]>;

  /** Batch embed multiple texts */
  embedBatch(texts: string[]): Promise<number[][]>;

  /** Calculate cosine similarity between two vectors */
  similarity(a: number[], b: number[]): number;

  /** Get the embedding dimension */
  getDimension(): number;

  /** Check if the provider is ready */
  isReady(): boolean;
}

/**
 * Configuration for local embedding generation.
 */
export interface LocalEmbeddingConfig {
  /** Embedding dimension, i.e. the number of hash buckets (default: 384 for MiniLM) */
  dimension: number;

  /**
   * Vocabulary size for hashing (default: 10000).
   * Note: LocalEmbeddingProvider does not currently read this value; features
   * are bucketed modulo `dimension`.
   */
  vocabSize: number;

  /** Inclusive [min, max] word n-gram sizes used for feature extraction */
  ngramRange: [number, number];
}

/**
 * Local embedding provider using TF-IDF style hashing.
 *
 * This is a lightweight, local-only embedding solution that doesn't require
 * external APIs or ML models. It uses:
 * - Character and word n-grams for feature extraction
 * - Hash-based vectorization (feature hashing)
 * - TF-IDF weighting for better semantic representation
 *
 * While not as sophisticated as neural embeddings, it's fast, private,
 * and works well for basic semantic search.
 */
@injectable()
export class LocalEmbeddingProvider implements IEmbeddingProvider {
  private readonly config: LocalEmbeddingConfig;

  /** Bucket index -> IDF weight, populated by {@link updateIdfWeights}. */
  private readonly idfWeights: Map<number, number> = new Map();

  /** Size of the corpus passed to the most recent updateIdfWeights call. */
  private documentCount = 0;

  private readonly stopWords: Set<string>;

  constructor(@inject(TYPES.Logger) private readonly logger: ILogger) {
    this.config = {
      dimension: 384,
      vocabSize: 10000,
      ngramRange: [1, 3],
    };

    // Common English stop words to filter out
    this.stopWords = new Set([
      'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
      'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
      'to', 'was', 'were', 'will', 'with', 'this', 'but', 'they',
      'have', 'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how',
    ]);

    this.logger.debug('LocalEmbeddingProvider initialized', {
      dimension: this.config.dimension,
    });
  }

  /**
   * Generate embedding for a single text.
   *
   * @param text - Raw input text.
   * @returns A vector of length `getDimension()`, L2-normalized unless the
   *   text yields no features, in which case it is all zeros.
   */
  async embed(text: string): Promise<number[]> {
    const features = this.extractFeatures(text);
    const vector = this.hashFeatures(features);
    return this.normalize(vector);
  }

  /**
   * Generate embeddings for multiple texts in batch.
   *
   * @param texts - Input texts; the result preserves their order.
   */
  async embedBatch(texts: string[]): Promise<number[][]> {
    return Promise.all(texts.map(text => this.embed(text)));
  }

  /**
   * Calculate cosine similarity between two vectors.
   * Returns value between -1 and 1 (1 = identical, 0 = orthogonal, -1 = opposite).
   *
   * @returns 0 when either vector has zero magnitude.
   * @throws Error when the vectors have different lengths.
   */
  similarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
      // Lengths were checked above, so both indices are in range.
      const aVal = a[i]!;
      const bVal = b[i]!;
      dotProduct += aVal * bVal;
      normA += aVal * aVal;
      normB += bVal * bVal;
    }

    const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
    return magnitude === 0 ? 0 : dotProduct / magnitude;
  }

  /**
   * Get the embedding dimension.
   */
  getDimension(): number {
    return this.config.dimension;
  }

  /**
   * Check if provider is ready (always true for local).
   */
  isReady(): boolean {
    return true;
  }

  /**
   * Update IDF weights from a corpus of texts.
   * Call this periodically to improve embedding quality.
   *
   * Previously stored weights are discarded and replaced by weights computed
   * from `texts` alone.
   *
   * @param texts - The corpus; each entry counts as one document.
   */
  updateIdfWeights(texts: string[]): void {
    // Bucket index -> number of documents containing at least one feature
    // that hashes to that bucket.
    const featureDocs = new Map<number, number>();

    for (const text of texts) {
      const uniqueHashes = new Set<number>();
      for (const feature of this.extractFeatures(text)) {
        uniqueHashes.add(this.hashFeature(feature));
      }
      for (const hash of uniqueHashes) {
        featureDocs.set(hash, (featureDocs.get(hash) ?? 0) + 1);
      }
    }

    this.documentCount = texts.length;
    this.idfWeights.clear();

    for (const [hash, docFreq] of featureDocs) {
      // IDF = log(N / df) + 1 (smoothed)
      const idf = Math.log(this.documentCount / docFreq) + 1;
      this.idfWeights.set(hash, idf);
    }

    this.logger.debug('IDF weights updated', {
      documentCount: this.documentCount,
      uniqueFeatures: this.idfWeights.size,
    });
  }

  /**
   * Extract features (n-grams) from text.
   *
   * Word n-grams are prefixed with `w:` and character trigrams with `c:` so
   * the two kinds of feature hash as distinct strings.
   */
  private extractFeatures(text: string): string[] {
    const features: string[] = [];
    const normalized = this.normalizeText(text);
    const words = this.tokenize(normalized);
    const [minN, maxN] = this.config.ngramRange;

    // Word n-grams
    for (let n = minN; n <= maxN; n++) {
      for (let i = 0; i <= words.length - n; i++) {
        const ngram = words.slice(i, i + n).join(' ');
        features.push(`w:${ngram}`);
      }
    }

    // Character n-grams (for handling typos and morphological variations)
    for (const word of words) {
      if (word.length > 3) {
        for (let i = 0; i <= word.length - 3; i++) {
          features.push(`c:${word.slice(i, i + 3)}`);
        }
      }
    }

    return features;
  }

  /**
   * Normalize text for processing: lower-case, replace punctuation with
   * spaces, collapse whitespace runs and trim.
   *
   * Letters, combining marks and digits from any script are kept. An
   * ASCII-only `\w` class would turn every non-ASCII character into a space,
   * mangling accented words and reducing non-Latin text to an empty string
   * (and therefore to an all-zero embedding). ASCII input is unaffected by
   * the Unicode-aware class.
   */
  private normalizeText(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ') // Replace punctuation with spaces
      .replace(/\s+/g, ' ') // Collapse multiple spaces
      .trim();
  }

  /**
   * Tokenize text into words, dropping tokens shorter than two characters
   * and stop words.
   */
  private tokenize(text: string): string[] {
    return text
      .split(/\s+/)
      .filter(word => word.length >= 2 && !this.stopWords.has(word));
  }

  /**
   * Hash a single feature to a bucket index in `[0, dimension)`.
   */
  private hashFeature(feature: string): number {
    // FNV-1a hash for good distribution (applied to UTF-16 code units)
    let hash = 2166136261;
    for (let i = 0; i < feature.length; i++) {
      hash ^= feature.charCodeAt(i);
      hash = Math.imul(hash, 16777619);
    }
    // Math.imul yields a signed 32-bit value; fold it to non-negative before
    // taking the bucket. Kept as-is so bucket assignment stays stable.
    return Math.abs(hash) % this.config.dimension;
  }

  /**
   * Hash features into a dense vector using feature hashing.
   *
   * @param features - Feature strings as produced by extractFeatures.
   * @returns An un-normalized vector of length `dimension`.
   */
  private hashFeatures(features: string[]): number[] {
    const vector = new Array<number>(this.config.dimension).fill(0);
    const termFreq = new Map<number, number>();

    // Count term frequencies (per bucket, so colliding features add up)
    for (const feature of features) {
      const hash = this.hashFeature(feature);
      termFreq.set(hash, (termFreq.get(hash) ?? 0) + 1);
    }

    // Apply TF-IDF weighting
    const maxTf = Math.max(...termFreq.values(), 1);
    const tfDenominator = 1 + Math.log(maxTf);

    for (const [hash, tf] of termFreq) {
      // Sublinear TF scaling: 1 + log(tf)
      const tfWeight = 1 + Math.log(tf);
      // Normalized TF
      const tfNorm = tfWeight / tfDenominator;
      // IDF weight (default to 1 if not in corpus)
      const idfWeight = this.idfWeights.get(hash) ?? 1;

      // Each bucket occurs once as a map key and the vector starts at zero,
      // so a plain assignment is equivalent to accumulating.
      vector[hash] = tfNorm * idfWeight;
    }

    return vector;
  }

  /**
   * L2 normalize a vector to unit length.
   *
   * A zero-magnitude vector is returned unchanged (same array instance).
   */
  private normalize(vector: number[]): number[] {
    const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
    if (magnitude === 0) {
      return vector;
    }
    return vector.map(val => val / magnitude);
  }
}

export default LocalEmbeddingProvider;

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/justinpreston/mcp-router'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

embedding.provider.ts•7.44 KiB