Skip to main content
Glama

Obsidian Semantic MCP Server

adaptive-index.ts•9.78 kB
import { Document, Fragment, LocalFragment, ScoredFragment, TermStatistics } from '../types/fragment.js'; /** * Adaptive text index with dynamic term frequency scoring * Self-contained, no external dependencies required */ export class AdaptiveTextIndex { private documents = new Map<string, Document>(); private invertedIndex = new Map<string, Set<string>>(); private termStats = new Map<string, TermStatistics>(); private filePathMap = new Map<string, string>(); // docId -> filePath mapping indexDocument(docId: string, filePath: string, content: string, metadata?: any) { const lines = content.split('\n'); const tokens = this.tokenize(content); const uniqueTerms = new Set(tokens); // Store document this.documents.set(docId, { id: docId, content, metadata: { ...metadata, lineCount: lines.length }, length: tokens.length, uniqueTermCount: uniqueTerms.size }); // Store file path mapping this.filePathMap.set(docId, filePath); // Build inverted index uniqueTerms.forEach(term => { if (!this.invertedIndex.has(term)) { this.invertedIndex.set(term, new Set()); } this.invertedIndex.get(term)!.add(docId); // Track term statistics this.updateTermStats(term, tokens); }); } search(query: string, maxFragments: number = 5): Fragment[] { // Handle undefined or empty query if (!query || query.trim().length === 0) { return []; } const queryTokens = this.tokenize(query); const candidateDocs = this.getCandidateDocuments(queryTokens); // If no candidate documents found, return empty if (candidateDocs.size === 0) { return []; } // Score documents const docScores = new Map<string, number>(); candidateDocs.forEach(docId => { const score = this.scoreDocument(docId, queryTokens); if (score > 0) docScores.set(docId, score); }); // Extract best fragments from top documents return this.extractFragments(docScores, queryTokens, maxFragments); } private extractFragments( docScores: Map<string, number>, queryTokens: string[], maxFragments: number ): Fragment[] { const fragments: ScoredFragment[] = []; 
// Get top scoring documents (at least 1) const topDocs = Array.from(docScores.entries()) .sort((a, b) => b[1] - a[1]) .slice(0, Math.max(1, Math.ceil(maxFragments / 2))); for (const [docId, docScore] of topDocs) { const doc = this.documents.get(docId)!; const filePath = this.filePathMap.get(docId)!; const docFragments = this.findBestPassages( doc.content, queryTokens, 400 // fragment size ); fragments.push(...docFragments.map((f, idx) => ({ id: `${docId}:frag${idx}`, docId, docPath: filePath, content: f.text, score: docScore > 0 ? docScore * f.localScore : f.localScore, localScore: f.localScore, lineStart: this.getLineNumber(doc.content, f.start), lineEnd: this.getLineNumber(doc.content, f.end), metadata: doc.metadata }))); } return fragments .sort((a, b) => b.score - a.score) .slice(0, maxFragments); } private findBestPassages( content: string, queryTokens: string[], fragmentSize: number ): LocalFragment[] { const sentences = this.splitIntoSentences(content); const windows: LocalFragment[] = []; // If no sentences found, create one from the entire content if (sentences.length === 0) { const score = this.scorePassage(content, queryTokens); if (score > 0) { return [{ text: content.trim(), start: 0, end: content.length, localScore: score }]; } return []; } // Create overlapping windows of sentences for (let i = 0; i < sentences.length; i++) { let window = sentences[i].text; let windowStart = sentences[i].start; let j = i + 1; while (j < sentences.length && window.length < fragmentSize) { window += ' ' + sentences[j].text; j++; } const score = this.scorePassage(window, queryTokens); if (score > 0) { windows.push({ text: window, start: windowStart, end: sentences[Math.min(j - 1, sentences.length - 1)].end, localScore: score }); } } // Remove overlapping windows, keeping highest scoring return this.deduplicateWindows(windows); } private tokenize(text: string): string[] { // Simple but effective tokenization return text.toLowerCase() .replace(/[^\w\s-]/g, ' ') 
.split(/\s+/) .filter(t => t.length > 2); } private getCandidateDocuments(queryTokens: string[]): Set<string> { const candidates = new Set<string>(); // Get documents containing any query term queryTokens.forEach(term => { const docs = this.invertedIndex.get(term); if (docs) { docs.forEach(doc => candidates.add(doc)); } // Also check for partial matches in the index this.invertedIndex.forEach((docSet, indexTerm) => { if (indexTerm.includes(term) || term.includes(indexTerm)) { docSet.forEach(doc => candidates.add(doc)); } }); }); return candidates; } private scoreDocument(docId: string, queryTokens: string[]): number { const doc = this.documents.get(docId)!; const docTokens = this.tokenize(doc.content); let score = 0; // Calculate term frequency scores with flexible matching queryTokens.forEach(term => { const tf = docTokens.filter(t => t === term || t.includes(term) || term.includes(t) ).length; if (tf > 0) { const df = this.invertedIndex.get(term)?.size || 0; const idf = Math.log((this.documents.size + 1) / (df + 1)); score += (tf / docTokens.length) * idf; } }); // Return raw score - normalization happens at fragment level return score > 0 ? 
score : 0.1; // Ensure non-zero score for documents with matches } private scorePassage(passage: string, queryTokens: string[]): number { const passageTokens = this.tokenize(passage); let score = 0; // Query term matching - make it more flexible queryTokens.forEach(term => { const matches = passageTokens.filter(t => t === term || t.includes(term) || term.includes(t) ).length; score += matches * 2; }); // Proximity bonus - reward passages where query terms appear close together const positions = new Map<string, number[]>(); passageTokens.forEach((token, idx) => { if (queryTokens.includes(token)) { if (!positions.has(token)) positions.set(token, []); positions.get(token)!.push(idx); } }); if (positions.size > 1) { const allPositions = Array.from(positions.values()).flat().sort((a, b) => a - b); for (let i = 1; i < allPositions.length; i++) { const distance = allPositions[i] - allPositions[i - 1]; if (distance < 10) score += 5 / distance; } } // Boost for complete sentences const sentences = passage.match(/[.!?]\s/g)?.length || 0; score *= (1 + sentences * 0.1); return score; } private splitIntoSentences(content: string): Array<{ text: string; start: number; end: number }> { const sentences: Array<{ text: string; start: number; end: number }> = []; const regex = /[^.!?]+[.!?]+/g; let match; while ((match = regex.exec(content)) !== null) { sentences.push({ text: match[0].trim(), start: match.index, end: match.index + match[0].length }); } // Handle last sentence without punctuation if (sentences.length === 0 || sentences[sentences.length - 1].end < content.length) { const lastStart = sentences.length > 0 ? 
sentences[sentences.length - 1].end : 0; const lastText = content.substring(lastStart).trim(); if (lastText) { sentences.push({ text: lastText, start: lastStart, end: content.length }); } } return sentences; } private deduplicateWindows(windows: LocalFragment[]): LocalFragment[] { windows.sort((a, b) => b.localScore - a.localScore); const selected: LocalFragment[] = []; for (const window of windows) { // Check if this window overlaps significantly with already selected ones const overlaps = selected.some(s => (window.start >= s.start && window.start <= s.end) || (window.end >= s.start && window.end <= s.end) || (s.start >= window.start && s.start <= window.end) ); if (!overlaps) { selected.push(window); } } return selected; } private updateTermStats(term: string, tokens: string[]) { const positions = tokens.map((t, i) => t === term ? i : -1).filter(p => p >= 0); const avgPosition = positions.reduce((a, b) => a + b, 0) / positions.length; if (!this.termStats.has(term)) { this.termStats.set(term, { documentFrequency: 0, totalFrequency: 0, averagePosition: 0 }); } const stats = this.termStats.get(term)!; stats.documentFrequency++; stats.totalFrequency += positions.length; stats.averagePosition = (stats.averagePosition + avgPosition) / 2; } private getLineNumber(content: string, position: number): number { const lines = content.substring(0, position).split('\n'); return lines.length; } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/aaronsb/obsidian-semantic-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.