Skip to main content
Glama
semantic.ts (12.3 kB)
/**
 * Phase 4: Semantic Code Search with pgvector
 * Vector embeddings for intelligent code search
 */
import { Pool } from 'pg';
import { callLLM } from '../tools/llm/index.js';
import { getModelsByLayer } from '../config/models.js';
import { logger } from '../logging/logger.js';

/** A stored code chunk together with its embedding vector. */
export interface CodeEmbedding {
  id: string;
  filePath: string;
  codeChunk: string;
  embedding: number[];
  language: string;
  chunkType: 'function' | 'class' | 'interface' | 'type' | 'module';
  metadata: Record<string, unknown>;
  createdAt: Date;
}

/** One hit returned by a semantic search or a knowledge-pack load. */
export interface SearchResult {
  id: string;
  filePath: string;
  codeChunk: string;
  similarity: number;
  chunkType: string;
  metadata: Record<string, unknown>;
}

/** Length of the vectors produced by the simulated embedding. */
const EMBEDDING_DIMENSIONS = 384;

/** Fallback row limit, matching the default of `SemanticSearch.search`. */
const DEFAULT_SEARCH_LIMIT = 10;

/** A piece of source code extracted for indexing. */
interface CodeChunk {
  code: string;
  type: CodeEmbedding['chunkType'];
  metadata: Record<string, unknown>;
}

/** Columns of `code_embeddings` that are read back by this module. */
interface CodeEmbeddingRow {
  id: string;
  file_path: string;
  code_chunk: string;
  chunk_type: string;
  metadata: Record<string, unknown>;
  similarity?: string | number;
}

/** Columns of `knowledge_packs` that are read back by this module. */
interface KnowledgePackRow {
  id: string;
  name: string;
  description: string;
  files: string[];
  tags: string[];
  embedding_ids: string[];
  created_at: Date;
  updated_at: Date;
}

/** Convert a `code_embeddings` row into the public `SearchResult` shape. */
function toSearchResult(row: CodeEmbeddingRow, similarity: number): SearchResult {
  return {
    id: row.id,
    filePath: row.file_path,
    codeChunk: row.code_chunk,
    similarity,
    chunkType: row.chunk_type,
    metadata: row.metadata,
  };
}

/** Convert a `knowledge_packs` row into the public `KnowledgePack` shape. */
function toKnowledgePack(row: KnowledgePackRow): KnowledgePack {
  return {
    id: row.id,
    name: row.name,
    description: row.description,
    files: row.files,
    tags: row.tags,
    embeddings: row.embedding_ids,
    createdAt: row.created_at,
    updatedAt: row.updated_at,
  };
}

/**
 * Return the index of the `}` that closes the `{` at `openIndex`, or -1 when
 * the braces never balance.
 *
 * This is a plain depth counter: braces that appear inside string literals or
 * comments are counted too, so unbalanced braces in such text can shift the
 * reported end.
 */
function findMatchingBrace(source: string, openIndex: number): number {
  let depth = 0;
  for (let i = openIndex; i < source.length; i++) {
    const ch = source[i];
    if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }
  return -1;
}

/**
 * Escape the LIKE wildcards (`%`, `_`) and the escape character itself so the
 * value is matched literally. Backslash is PostgreSQL's default LIKE escape.
 */
function escapeLikePattern(value: string): string {
  return value.replace(/[\\%_]/g, '\\$&');
}

/**
 * Semantic Code Search Engine
 */
export class SemanticSearch {
  constructor(private db: Pool) {}

  /**
   * Generate an embedding vector for `text`.
   *
   * Currently a placeholder: the vector comes from `simulateEmbedding`, not
   * from a real embedding model (e.g. OpenAI text-embedding-ada-002).
   *
   * @param text - Text to embed.
   * @returns A vector of `EMBEDDING_DIMENSIONS` numbers.
   */
  async generateEmbedding(text: string): Promise<number[]> {
    logger.info('Generating embedding', {
      textLength: text.length,
    });

    // Placeholder: in production, call an embedding API here instead.
    return this.simulateEmbedding(text);
  }

  /**
   * Simulate an embedding (replace with a real embedding API).
   *
   * Character codes are accumulated into a fixed-size vector which is then
   * scaled to unit length. Empty text yields the all-zero vector; dividing by
   * its zero magnitude would otherwise fill the vector with NaN.
   */
  private simulateEmbedding(text: string): number[] {
    const embedding = new Array<number>(EMBEDDING_DIMENSIONS).fill(0);

    // Simple hash-based simulation: fold each character into a slot.
    for (let i = 0; i < text.length; i++) {
      const slot = i % EMBEDDING_DIMENSIONS;
      embedding[slot] = (embedding[slot] ?? 0) + text.charCodeAt(i) / 1000;
    }

    const magnitude = Math.sqrt(
      embedding.reduce((sum, val) => sum + val * val, 0),
    );
    if (magnitude === 0) {
      return embedding;
    }
    return embedding.map((val) => val / magnitude);
  }

  /**
   * Index a code file for semantic search.
   *
   * The code is split into chunks and each chunk is upserted into
   * `code_embeddings`, keyed on (file_path, chunk_type, code_chunk). Blank
   * input is skipped because there is nothing meaningful to embed.
   *
   * NOTE(review): chunks that were indexed earlier for this file but no longer
   * occur in `code` are not removed here; call `deleteFileEmbeddings` first if
   * stale chunks must not survive. Confirm whether knowledge packs rely on
   * stable row IDs before automating that.
   *
   * @param filePath - Path stored with every chunk.
   * @param code - Full source text of the file.
   * @param language - Language label stored with every chunk.
   */
  async indexCodeFile(
    filePath: string,
    code: string,
    language: string,
  ): Promise<void> {
    logger.info('Indexing code file', { filePath, language });

    if (code.trim().length === 0) {
      logger.info('Skipping empty code file', { filePath });
      return;
    }

    // Split code into chunks (functions, classes, etc.)
    const chunks = await this.splitIntoChunks(code, language);

    for (const chunk of chunks) {
      const embedding = await this.generateEmbedding(chunk.code);

      // EXCLUDED refers to the row proposed for insertion, so a conflict
      // refreshes every derived column rather than only the embedding.
      await this.db.query(
        `INSERT INTO code_embeddings (
          file_path,
          code_chunk,
          embedding,
          language,
          chunk_type,
          metadata,
          created_at
        ) VALUES ($1, $2, $3, $4, $5, $6, NOW())
        ON CONFLICT (file_path, chunk_type, code_chunk)
        DO UPDATE SET
          embedding = EXCLUDED.embedding,
          language = EXCLUDED.language,
          metadata = EXCLUDED.metadata,
          updated_at = NOW()`,
        [
          filePath,
          chunk.code,
          JSON.stringify(embedding),
          language,
          chunk.type,
          JSON.stringify(chunk.metadata),
        ],
      );
    }

    logger.info(`Indexed ${chunks.length} chunks for ${filePath}`);
  }

  /**
   * Split code into semantic chunks.
   *
   * Named `function` declarations and `class` declarations are located by
   * their header, then extended to the matching closing brace so that bodies
   * containing nested blocks are captured whole. Declarations nested inside an
   * extracted chunk of the same kind are not emitted separately. When nothing
   * is found the whole input becomes a single `module` chunk.
   *
   * Known limits of the header patterns: parameter lists containing `)`
   * (e.g. callback types) and return types containing `{` are not recognised.
   */
  private async splitIntoChunks(
    code: string,
    language: string,
  ): Promise<CodeChunk[]> {
    const chunks: CodeChunk[] = [];

    const collect = (
      headerPattern: RegExp,
      type: CodeEmbedding['chunkType'],
    ): void => {
      let match: RegExpExecArray | null;
      while ((match = headerPattern.exec(code)) !== null) {
        // Every header pattern ends with the opening brace of the body.
        const openIndex = match.index + match[0].length - 1;
        const closeIndex = findMatchingBrace(code, openIndex);
        if (closeIndex === -1) {
          // Unbalanced body: skip this header and keep scanning after it.
          continue;
        }

        chunks.push({
          code: code.slice(match.index, closeIndex + 1),
          type,
          metadata: { name: match[1], language },
        });
        headerPattern.lastIndex = closeIndex + 1;
      }
    };

    // Extract functions (optional export/default/async, generics, return type)
    collect(
      /\b(?:export\s+(?:default\s+)?)?(?:async\s+)?function\s+(\w+)\s*(?:<[^>(]*>\s*)?\([^)]*\)\s*(?::\s*[^{;]+?)?\s*\{/g,
      'function',
    );

    // Extract classes (optional export/default/abstract, generics, heritage)
    collect(
      /\b(?:export\s+(?:default\s+)?)?(?:abstract\s+)?class\s+(\w+)(?:\s*<[^>{]*>)?(?:\s+extends\s+[\w.]+(?:\s*<[^>{]*>)?)?(?:\s+implements\s+[\w.,<>\s]+?)?\s*\{/g,
      'class',
    );

    // If no functions/classes found, treat as module
    if (chunks.length === 0) {
      chunks.push({
        code,
        type: 'module',
        metadata: { language },
      });
    }

    return chunks;
  }

  /**
   * Search for semantically similar code.
   *
   * @param query - Free text or code to search for. A blank query returns no
   *   results.
   * @param limit - Maximum number of rows; fractional values are truncated,
   *   negative values are treated as 0 and non-finite values fall back to 10.
   * @param filters - Optional exact-match filters on language and chunk type,
   *   and a literal substring filter on the file path.
   * @returns Matches ordered by ascending cosine distance, with
   *   `similarity = 1 - distance`.
   */
  async search(
    query: string,
    limit = 10,
    filters?: {
      language?: string;
      chunkType?: CodeEmbedding['chunkType'];
      filePath?: string;
    },
  ): Promise<SearchResult[]> {
    logger.info('Semantic search', { query, limit });

    if (query.trim().length === 0) {
      return [];
    }

    const rowLimit = Number.isFinite(limit)
      ? Math.max(0, Math.trunc(limit))
      : DEFAULT_SEARCH_LIMIT;

    // Generate query embedding
    const queryEmbedding = await this.generateEmbedding(query);

    // $1 is the query vector and $2 the row limit; filters follow.
    const params: unknown[] = [JSON.stringify(queryEmbedding), rowLimit];
    const conditions: string[] = [];
    const addCondition = (
      toSql: (placeholder: string) => string,
      value: unknown,
    ): void => {
      params.push(value);
      conditions.push(toSql(`$${params.length}`));
    };

    if (filters?.language) {
      addCondition((p) => `language = ${p}`, filters.language);
    }
    if (filters?.chunkType) {
      addCondition((p) => `chunk_type = ${p}`, filters.chunkType);
    }
    if (filters?.filePath) {
      // Escape wildcards so e.g. "my_file" does not match "myXfile".
      addCondition(
        (p) => `file_path LIKE ${p}`,
        `%${escapeLikePattern(filters.filePath)}%`,
      );
    }

    const whereClause =
      conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';

    // Cosine similarity search using pgvector
    const querySql = `
      SELECT
        id,
        file_path,
        code_chunk,
        chunk_type,
        metadata,
        1 - (embedding <=> $1::vector) as similarity
      FROM code_embeddings
      ${whereClause}
      ORDER BY embedding <=> $1::vector
      LIMIT $2
    `;

    const result = await this.db.query(querySql, params);

    return result.rows.map((row: CodeEmbeddingRow) =>
      toSearchResult(row, parseFloat(String(row.similarity))),
    );
  }

  /**
   * Find code similar to the given snippet; delegates to `search`.
   */
  async findSimilar(
    code: string,
    limit = 5,
  ): Promise<SearchResult[]> {
    return this.search(code, limit);
  }

  /**
   * Delete all embeddings stored for a file path.
   */
  async deleteFileEmbeddings(filePath: string): Promise<void> {
    await this.db.query(
      'DELETE FROM code_embeddings WHERE file_path = $1',
      [filePath],
    );

    logger.info('Deleted embeddings', { filePath });
  }

  /**
   * Get embedding statistics: total chunk count plus counts per language and
   * per chunk type. Both maps are empty when the table has no rows.
   */
  async getStatistics(): Promise<{
    totalChunks: number;
    byLanguage: Record<string, number>;
    byType: Record<string, number>;
  }> {
    // Group first so each key is aggregated into the JSON object exactly once.
    const result = await this.db.query(`
      SELECT
        (SELECT COUNT(*) FROM code_embeddings) AS total,
        (
          SELECT jsonb_object_agg(language, chunk_count)
          FROM (
            SELECT language, COUNT(*) AS chunk_count
            FROM code_embeddings
            GROUP BY language
          ) AS per_language
        ) AS by_language,
        (
          SELECT jsonb_object_agg(chunk_type, chunk_count)
          FROM (
            SELECT chunk_type, COUNT(*) AS chunk_count
            FROM code_embeddings
            GROUP BY chunk_type
          ) AS per_type
        ) AS by_type
    `);

    const row = result.rows[0];

    return {
      totalChunks: parseInt(row.total, 10),
      byLanguage: row.by_language ?? {},
      byType: row.by_type ?? {},
    };
  }
}

/**
 * Knowledge Pack - Reusable context bundles
 */
export interface KnowledgePack {
  id: string;
  name: string;
  description: string;
  files: string[];
  tags: string[];
  embeddings: string[]; // Embedding IDs
  createdAt: Date;
  updatedAt: Date;
}

/** Creates, loads and looks up knowledge packs stored in `knowledge_packs`. */
export class KnowledgePackManager {
  constructor(private db: Pool, private search: SemanticSearch) {}

  /**
   * Create a knowledge pack from files.
   *
   * The pack records the IDs of the embeddings that exist for `files` at
   * creation time.
   *
   * @param name - Pack name.
   * @param description - Pack description.
   * @param files - File paths whose embeddings belong to the pack.
   * @param tags - Optional tags used by `searchByTags`.
   * @returns The stored pack as returned by the database.
   */
  async createPack(
    name: string,
    description: string,
    files: string[],
    tags: string[] = [],
  ): Promise<KnowledgePack> {
    logger.info('Creating knowledge pack', { name, fileCount: files.length });

    // Get embedding IDs for files
    const embeddingIds = await this.getEmbeddingIdsForFiles(files);

    const result = await this.db.query(
      `INSERT INTO knowledge_packs (
        name,
        description,
        files,
        tags,
        embedding_ids,
        created_at,
        updated_at
      ) VALUES ($1, $2, $3, $4, $5, NOW(), NOW())
      RETURNING *`,
      [
        name,
        description,
        JSON.stringify(files),
        JSON.stringify(tags),
        JSON.stringify(embeddingIds),
      ],
    );

    return toKnowledgePack(result.rows[0]);
  }

  /**
   * Get the IDs of all embeddings whose file path is in `files`.
   */
  private async getEmbeddingIdsForFiles(files: string[]): Promise<string[]> {
    if (files.length === 0) {
      return [];
    }

    const result = await this.db.query(
      'SELECT id FROM code_embeddings WHERE file_path = ANY($1)',
      [files],
    );

    return result.rows.map((row: { id: string }) => row.id);
  }

  /**
   * Load a knowledge pack together with the code chunks it references.
   *
   * Pack members are not ranked, so every returned chunk carries a
   * similarity of 1.0.
   *
   * @param packId - ID of the pack to load.
   * @throws Error when no pack with `packId` exists.
   */
  async loadPack(packId: string): Promise<{
    pack: KnowledgePack;
    embeddings: SearchResult[];
  }> {
    const packResult = await this.db.query(
      'SELECT * FROM knowledge_packs WHERE id = $1',
      [packId],
    );

    if (packResult.rows.length === 0) {
      throw new Error(`Knowledge pack not found: ${packId}`);
    }

    const pack: KnowledgePackRow = packResult.rows[0];

    // A pack without recorded embedding IDs simply has no chunks.
    const embeddingsResult = await this.db.query(
      'SELECT * FROM code_embeddings WHERE id = ANY($1)',
      [pack.embedding_ids ?? []],
    );

    const embeddings: SearchResult[] = embeddingsResult.rows.map(
      (row: CodeEmbeddingRow) => toSearchResult(row, 1.0),
    );

    return {
      pack: toKnowledgePack(pack),
      embeddings,
    };
  }

  /**
   * Search knowledge packs by tag.
   *
   * Uses jsonb containment (`tags @> $1`), so a pack matches only when it
   * carries every tag in `tags`. Results are ordered newest first.
   */
  async searchByTags(tags: string[]): Promise<KnowledgePack[]> {
    const result = await this.db.query(
      `SELECT * FROM knowledge_packs
       WHERE tags @> $1::jsonb
       ORDER BY created_at DESC`,
      [JSON.stringify(tags)],
    );

    return result.rows.map((row: KnowledgePackRow) => toKnowledgePack(row));
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/babasida246/ai-mcp-gateway'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.