Skip to main content
Glama
joelmnz

Article Manager MCP Server

by joelmnz
databaseEmbedding.ts17.2 kB
import { database } from './database.js'; import { databaseArticleService, ArticleMetadata } from './databaseArticles.js'; import { generateEmbedding, cosineSimilarity } from './embedding.js'; import { chunkMarkdown, Chunk, calculateContentHash } from './chunking.js'; import { createHash } from 'crypto'; import { handleDatabaseError, DatabaseServiceError, DatabaseErrorType, logDatabaseError } from './databaseErrors.js'; import { databaseConstraintService } from './databaseConstraints.js'; // Database-specific embedding interfaces export interface DatabaseEmbedding { id: number; articleId: number; chunkId: string; chunkIndex: number; headingPath: string[]; textContent: string; contentHash: string; vector: number[] | null; createdAt: Date; } export interface SearchResult { chunk: Chunk; score: number; snippet: string; articleMetadata: ArticleMetadata; } export interface IndexStats { totalChunks: number; totalArticles: number; indexedArticles: number; unindexedArticles: number; } /** * Database-backed embedding service for semantic search */ export class DatabaseEmbeddingService { /** * Check if vector extension is available */ private async hasVectorExtension(): Promise<boolean> { try { const result = await database.query(` SELECT 1 FROM pg_extension WHERE extname = 'vector' `); return result.rows.length > 0; } catch (error) { return false; } } /** * Check if vector column exists in embeddings table */ private async hasVectorColumn(): Promise<boolean> { try { const result = await database.query(` SELECT 1 FROM information_schema.columns WHERE table_name = 'embeddings' AND column_name = 'vector' `); return result.rows.length > 0; } catch (error) { return false; } } /** * Store or update embeddings for an article */ async upsertArticleEmbeddings(articleId: number, chunks: Chunk[]): Promise<void> { try { const hasVector = await this.hasVectorExtension() && await this.hasVectorColumn(); // Delete existing embeddings for this article await this.deleteArticleEmbeddings(articleId); // Insert new embeddings for (const chunk of chunks) { try { // Validate embedding data using constraint service await databaseConstraintService.validateEmbeddingData({ articleId, chunkId: chunk.id, chunkIndex: chunk.chunkIndex, textContent: chunk.text }); // Generate embedding const vector = await generateEmbedding(chunk.text); const contentHash = calculateContentHash(chunk.text); // Additional vector validation await databaseConstraintService.validateEmbeddingData({ vector }); if (hasVector) { // Use native vector type await database.query( `INSERT INTO embeddings (article_id, chunk_id, chunk_index, heading_path, text_content, content_hash, vector, created_at) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, [ articleId, chunk.id, chunk.chunkIndex, chunk.headingPath, chunk.text, contentHash, `[${vector.join(',')}]`, // PostgreSQL vector format new Date() ] ); } else { // Fallback to JSONB storage await database.query( `INSERT INTO embeddings (article_id, chunk_id, chunk_index, heading_path, text_content, content_hash, vector_data, created_at) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, [ articleId, chunk.id, chunk.chunkIndex, chunk.headingPath, chunk.text, contentHash, JSON.stringify(vector), new Date() ] ); } } catch (error) { if (error instanceof DatabaseServiceError) { logDatabaseError(error, `Embedding Chunk ${chunk.id}`); throw error; // Don't continue if validation fails } const dbError = handleDatabaseError(error); logDatabaseError(dbError, `Embedding Chunk ${chunk.id}`); console.error(`Error storing embedding for chunk ${chunk.id}:`, dbError.message); // Continue with other chunks for non-validation errors } } console.log(`Stored ${chunks.length} embeddings for article ID ${articleId}`); } catch (error) { if (error instanceof DatabaseServiceError) { throw error; } const dbError = handleDatabaseError(error); logDatabaseError(dbError, 'Upsert Article Embeddings'); throw dbError; } } /** * Store embeddings by article slug */ async upsertArticleEmbeddingsBySlug(slug: string, chunks: Chunk[]): Promise<void> { const articleId = await databaseArticleService.getArticleId(slug); if (!articleId) { throw new Error(`Article with slug '${slug}' not found`); } await this.upsertArticleEmbeddings(articleId, chunks); } /** * Delete all embeddings for an article */ async deleteArticleEmbeddings(articleId: number): Promise<void> { const result = await database.query( 'DELETE FROM embeddings WHERE article_id = $1', [articleId] ); console.log(`Deleted ${result.rowCount} embeddings for article ID ${articleId}`); } /** * Delete embeddings by article slug */ async deleteArticleEmbeddingsBySlug(slug: string): Promise<void> { const articleId = await databaseArticleService.getArticleId(slug); if (!articleId) { throw new Error(`Article with slug '${slug}' not found`); } await this.deleteArticleEmbeddings(articleId); } /** * Generate snippet from text */ private generateSnippet(text: string, maxLength: number = 200): string { if (text.length <= maxLength) { return text; } const snippet = text.substring(0, maxLength); const lastSpace = snippet.lastIndexOf(' '); if (lastSpace > maxLength * 0.8) { return snippet.substring(0, lastSpace) + '...'; } return snippet + '...'; } /** * Perform semantic search using database vector operations */ async semanticSearch(query: string, k: number = 5, folder?: string): Promise<SearchResult[]> { const hasVector = await this.hasVectorExtension() && await this.hasVectorColumn(); // Generate query embedding const queryVector = await generateEmbedding(query); let sql: string; let params: any[]; if (hasVector) { // Use native vector similarity search sql = ` SELECT e.chunk_id, e.chunk_index, e.heading_path, e.text_content, e.created_at, a.slug, a.title, a.folder, a.is_public, a.created_at as article_created, a.updated_at, (e.vector <=> $1::vector) as distance FROM embeddings e JOIN articles a ON e.article_id = a.id `; params = [`[${queryVector.join(',')}]`]; if (folder !== undefined) { const normalizedFolder = folder === '/' ? '' : folder; sql += ' WHERE a.folder = $2'; params.push(normalizedFolder); } sql += ' ORDER BY e.vector <=> $1::vector LIMIT $' + (params.length + 1); params.push(k * 2); // Get more results for deduplication } else { // Fallback: retrieve all embeddings and calculate similarity in memory sql = ` SELECT e.chunk_id, e.chunk_index, e.heading_path, e.text_content, e.vector_data, e.created_at, a.slug, a.title, a.folder, a.is_public, a.created_at as article_created, a.updated_at FROM embeddings e JOIN articles a ON e.article_id = a.id `; params = []; if (folder !== undefined) { const normalizedFolder = folder === '/' ? '' : folder; sql += ' WHERE a.folder = $1'; params.push(normalizedFolder); } } const result = await database.query(sql, params); let searchResults: SearchResult[]; if (hasVector) { // Convert distance to similarity score (1 - distance for cosine distance) searchResults = result.rows.map(row => ({ chunk: { id: row.chunk_id, filename: `${row.slug}.md`, // Maintain compatibility with filename-based interface title: row.title, headingPath: row.heading_path, chunkIndex: row.chunk_index, text: row.text_content, created: row.article_created.toISOString(), modified: row.updated_at.toISOString() }, score: Math.max(0, 1 - row.distance), // Convert distance to similarity snippet: this.generateSnippet(row.text_content), articleMetadata: { slug: row.slug, title: row.title, folder: row.folder, created: row.article_created.toISOString(), modified: row.updated_at.toISOString(), isPublic: row.is_public } })); } else { // Calculate similarity in memory const resultsWithScores = result.rows.map(row => { const vector = JSON.parse(row.vector_data); const score = cosineSimilarity(queryVector, vector); return { chunk: { id: row.chunk_id, filename: `${row.slug}.md`, title: row.title, headingPath: row.heading_path, chunkIndex: row.chunk_index, text: row.text_content, created: row.article_created.toISOString(), modified: row.updated_at.toISOString() }, score, snippet: this.generateSnippet(row.text_content), articleMetadata: { slug: row.slug, title: row.title, folder: row.folder, created: row.article_created.toISOString(), modified: row.updated_at.toISOString(), isPublic: row.is_public } }; }); // Sort by score and take top results resultsWithScores.sort((a, b) => b.score - a.score); searchResults = resultsWithScores.slice(0, k * 2); } // Deduplicate by article (keep highest scoring chunk per article) const bestBySlug = new Map<string, SearchResult>(); for (const result of searchResults) { const slug = result.articleMetadata.slug; const existing = bestBySlug.get(slug); if (!existing || result.score > existing.score) { bestBySlug.set(slug, result); } } // Return top k unique articles, sorted by score return Array.from(bestBySlug.values()) .sort((a, b) => b.score - a.score) .slice(0, k); } /** * Hybrid search combining title search and semantic search */ async hybridSearch(query: string, k: number = 5, folder?: string): Promise<SearchResult[]> { // Get semantic search results const semanticResults = await this.semanticSearch(query, k * 2, folder); // Get title search results const titleMatches = await databaseArticleService.searchArticles(query, folder); // Create boost map for title matches const titleMatchBoost = new Map<string, number>(); titleMatches.forEach((article, index) => { const boost = 0.3 * (1 - index / titleMatches.length); titleMatchBoost.set(article.slug, boost); }); // Boost semantic results that also match titles const boostedResults = semanticResults.map(result => { const boost = titleMatchBoost.get(result.articleMetadata.slug) || 0; return { ...result, score: Math.min(1.0, result.score + boost) }; }); // Re-sort by boosted scores boostedResults.sort((a, b) => b.score - a.score); return boostedResults.slice(0, k); } /** * Rebuild the entire embedding index */ async rebuildIndex(): Promise<void> { console.log('Rebuilding embedding index...'); // Get all articles const articles = await databaseArticleService.listArticles(); let processed = 0; let failed = 0; for (const articleMeta of articles) { try { console.log(`Processing ${articleMeta.slug}...`); // Read full article const article = await databaseArticleService.readArticle(articleMeta.slug); if (!article) { console.log(`Skipping ${articleMeta.slug} - not found`); failed++; continue; } // Get article ID const articleId = await databaseArticleService.getArticleId(articleMeta.slug); if (!articleId) { console.log(`Skipping ${articleMeta.slug} - ID not found`); failed++; continue; } // Generate chunks const chunks = chunkMarkdown( `${article.slug}.md`, // Maintain filename compatibility article.title, article.content, article.created, articleMeta.modified ); // Store embeddings await this.upsertArticleEmbeddings(articleId, chunks); processed++; } catch (error) { console.error(`Error processing ${articleMeta.slug}:`, error); failed++; } } console.log(`Rebuilt index: ${processed} articles processed, ${failed} failed`); } /** * Get embedding index statistics */ async getIndexStats(): Promise<IndexStats> { // Get total chunks const chunksResult = await database.query('SELECT COUNT(*) as count FROM embeddings'); const totalChunks = parseInt(chunksResult.rows[0].count, 10); // Get total articles const articlesResult = await database.query('SELECT COUNT(*) as count FROM articles'); const totalArticles = parseInt(articlesResult.rows[0].count, 10); // Get indexed articles (articles that have embeddings) const indexedResult = await database.query(` SELECT COUNT(DISTINCT article_id) as count FROM embeddings `); const indexedArticles = parseInt(indexedResult.rows[0].count, 10); const unindexedArticles = totalArticles - indexedArticles; return { totalChunks, totalArticles, indexedArticles, unindexedArticles }; } /** * Get detailed statistics including lists of files */ async getDetailedStats(): Promise<{ unindexedSlugs: string[]; }> { // Get unindexed slugs const unindexedResult = await database.query(` SELECT a.slug FROM articles a LEFT JOIN embeddings e ON a.id = e.article_id WHERE e.id IS NULL `); const unindexedSlugs = unindexedResult.rows.map(row => row.slug); return { unindexedSlugs }; } /** * Index unindexed articles only */ async indexUnindexedArticles(): Promise<{ indexed: number; failed: string[] }> { console.log('Indexing unindexed articles...'); // Get articles that don't have embeddings const result = await database.query(` SELECT a.id, a.slug, a.title, a.content, a.folder, a.created_at, a.updated_at FROM articles a LEFT JOIN embeddings e ON a.id = e.article_id WHERE e.article_id IS NULL `); const unindexedArticles = result.rows; const failed: string[] = []; let indexed = 0; for (const row of unindexedArticles) { try { console.log(`Indexing ${row.slug}...`); // Generate chunks const chunks = chunkMarkdown( `${row.slug}.md`, row.title, row.content, row.created_at.toISOString(), row.updated_at.toISOString() ); // Store embeddings await this.upsertArticleEmbeddings(row.id, chunks); indexed++; } catch (error) { console.error(`Error indexing ${row.slug}:`, error); failed.push(row.slug); } } console.log(`Indexed ${indexed} articles, ${failed.length} failed`); return { indexed, failed }; } /** * Update embeddings for a specific article (called after article updates) */ async updateArticleEmbeddings(slug: string): Promise<void> { const article = await databaseArticleService.readArticle(slug); if (!article) { throw new Error(`Article with slug '${slug}' not found`); } const articleId = await databaseArticleService.getArticleId(slug); if (!articleId) { throw new Error(`Article with slug '${slug}' not found`); } // Get article metadata for modified date const articles = await databaseArticleService.listArticles(); const articleMeta = articles.find(a => a.slug === slug); if (!articleMeta) { throw new Error(`Article metadata for slug '${slug}' not found`); } // Generate new chunks const chunks = chunkMarkdown( `${article.slug}.md`, article.title, article.content, article.created, articleMeta.modified ); // Update embeddings await this.upsertArticleEmbeddings(articleId, chunks); } } // Export singleton instance export const databaseEmbeddingService = new DatabaseEmbeddingService();

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/joelmnz/mcp-markdown-manager'

If you have feedback or need assistance with the MCP directory API, please join our Discord server