Skip to main content
Glama
DomainExtractor.ts14.7 kB
import path from 'path'; import * as Database from 'better-sqlite3'; import { DomainVocabulary, KeywordWeight, DomainMatch } from '../types/index.js'; import { log } from './Logger.js'; /** * Domain extractor for detecting technology keywords and domain classification */ export class DomainExtractor { private db: Database.Database; private vocabularies: Map<string, DomainVocabulary> = new Map(); private lastVocabularyUpdate = 0; private readonly CACHE_TTL = 5 * 60 * 1000; // 5 minutes constructor(database: Database.Database) { this.db = database; this.loadVocabularies(); log.info('DomainExtractor initialized', { vocabularyCount: this.vocabularies.size, domains: Array.from(this.vocabularies.keys()) }); } /** * Extract domain tags from content */ async extractDomainTags( content: string, filePath: string, language: string ): Promise<string[]> { const timer = log.time(`domain-extract-${path.basename(filePath)}`); try { log.debug('Starting domain extraction', { filePath: path.basename(filePath), language, contentLength: content.length }); // Refresh vocabularies if needed this.refreshVocabulariesIfNeeded(); // Extract domains using multiple strategies const pathDomains = this.extractFromPath(filePath); const languageDomains = this.extractFromLanguage(language); const contentDomains = this.extractFromContent(content); const contextDomains = this.extractFromContext(content, filePath); // Combine and score all detected domains const domainScores = new Map<string, number>(); // Add path-based domains with high confidence for (const domain of pathDomains) { domainScores.set(domain, (domainScores.get(domain) || 0) + 0.8); } // Add language-based domains with medium confidence for (const domain of languageDomains) { domainScores.set(domain, (domainScores.get(domain) || 0) + 0.6); } // Add content-based domains with variable confidence for (const [domain, score] of contentDomains) { domainScores.set(domain, (domainScores.get(domain) || 0) + score); } // Add context-based domains for (const [domain, score] of contextDomains) { domainScores.set(domain, (domainScores.get(domain) || 0) + score); } // Filter and sort by confidence const finalDomains = Array.from(domainScores.entries()) .filter(([, score]) => score >= 0.3) // Minimum confidence threshold .sort(([, a], [, b]) => b - a) // Sort by confidence descending .slice(0, 5) // Limit to top 5 domains .map(([domain]) => domain); timer(); log.debug('Domain extraction completed', { filePath: path.basename(filePath), detectedDomains: finalDomains, totalCandidates: domainScores.size }); return finalDomains; } catch (error: any) { log.error('Domain extraction failed', error, { filePath: path.basename(filePath), contentLength: content.length }); return []; } } /** * Classify query intent by detecting domains */ async classifyQueryIntent(query: string): Promise<DomainMatch[]> { const timer = log.time(`query-intent-${query.substring(0, 20)}`); try { log.debug('Classifying query intent', { query: query.substring(0, 50), queryLength: query.length }); // Refresh vocabularies if needed this.refreshVocabulariesIfNeeded(); const queryLower = query.toLowerCase(); const queryWords = this.tokenizeQuery(queryLower); const domainMatches: DomainMatch[] = []; // Check each domain vocabulary for (const [domainName, vocabulary] of this.vocabularies) { const matchedKeywords: string[] = []; let totalWeight = 0; let maxWeight = 0; // Check for keyword matches for (const keywordData of vocabulary.keywords) { const keyword = keywordData.keyword.toLowerCase(); if (queryWords.includes(keyword) || queryLower.includes(keyword)) { matchedKeywords.push(keyword); totalWeight += keywordData.weight; maxWeight = Math.max(maxWeight, keywordData.weight); } } // Calculate confidence based on matches if (matchedKeywords.length > 0) { // Confidence factors: // - Number of matched keywords // - Weight of matched keywords // - Exact vs partial matches // - Query coverage const keywordFactor = Math.min(matchedKeywords.length / 3, 1); // Up to 3 keywords = max const weightFactor = Math.min(totalWeight / 2, 1); // Normalize weights const coverageFactor = Math.min( matchedKeywords.join(' ').length / query.length, 0.8 ); // Max 80% coverage const confidence = (keywordFactor * 0.4 + weightFactor * 0.4 + coverageFactor * 0.2); if (confidence >= 0.2) { // Minimum confidence threshold domainMatches.push({ domain: domainName, confidence, matchedKeywords, boostFactor: vocabulary.boostFactor }); } } } // Sort by confidence and limit results const sortedMatches = domainMatches .sort((a, b) => b.confidence - a.confidence) .slice(0, 3); // Top 3 domain matches timer(); log.debug('Query intent classification completed', { query: query.substring(0, 50), detectedDomains: sortedMatches.map(m => m.domain), confidences: sortedMatches.map(m => m.confidence.toFixed(3)) }); return sortedMatches; } catch (error: any) { log.error('Query intent classification failed', error, { query: query.substring(0, 50) }); return []; } } /** * Extract domains from file path */ private extractFromPath(filePath: string): string[] { const pathLower = filePath.toLowerCase(); const domains: string[] = []; // Check for framework/library names in path const pathIndicators = { 'javascript': ['js', 'node', 'npm'], 'web-frameworks': ['express', 'react', 'vue', 'angular', 'next', 'nuxt'], 'python': ['python', 'py', 'pip', 'django', 'flask'], 'databases': ['db', 'database', 'sql', 'mongo', 'postgres', 'mysql'], 'devops': ['docker', 'k8s', 'kubernetes', 'ci', 'cd', 'deploy'], 'testing': ['test', 'spec', '__tests__', 'cypress', 'jest'] }; for (const [domain, indicators] of Object.entries(pathIndicators)) { if (indicators.some(indicator => pathLower.includes(indicator))) { domains.push(domain); } } return domains; } /** * Extract domains from programming language */ private extractFromLanguage(language: string): string[] { const languageMap: { [key: string]: string[] } = { 'javascript': ['javascript', 'web-frameworks'], 'typescript': ['javascript', 'web-frameworks'], 'python': ['python'], 'java': ['java', 'enterprise'], 'csharp': ['dotnet', 'enterprise'], 'go': ['go', 'backend'], 'rust': ['rust', 'systems'], 'php': ['php', 'web-frameworks'], 'ruby': ['ruby', 'web-frameworks'], 'shell': ['devops', 'automation'], 'sql': ['databases'], 'yaml': ['devops', 'configuration'], 'json': ['configuration', 'api'], 'html': ['web-frontend'], 'css': ['web-frontend'] }; return languageMap[language.toLowerCase()] || []; } /** * Extract domains from content analysis */ private extractFromContent(content: string): Map<string, number> { const contentLower = content.toLowerCase(); const domainScores = new Map<string, number>(); // Analyze content for each vocabulary for (const [domainName, vocabulary] of this.vocabularies) { let domainScore = 0; let matchCount = 0; for (const keywordData of vocabulary.keywords) { const keyword = keywordData.keyword.toLowerCase(); const regex = new RegExp(`\\b${this.escapeRegExp(keyword)}\\b`, 'gi'); const matches = contentLower.match(regex); if (matches) { const keywordScore = matches.length * keywordData.weight * 0.1; domainScore += keywordScore; matchCount++; } } // Normalize score by vocabulary size and content length if (matchCount > 0) { const normalizedScore = Math.min( domainScore / Math.log(content.length + 1000) * 100, 1.0 ); if (normalizedScore >= 0.1) { domainScores.set(domainName, normalizedScore); } } } return domainScores; } /** * Extract domains from contextual analysis */ private extractFromContext(content: string, filePath: string): Map<string, number> { const contextScores = new Map<string, number>(); // Analyze import statements and dependencies const importPatterns = [ /import\s+.*?['"`]([^'"`]+)['"`]/g, /require\s*\(\s*['"`]([^'"`]+)['"`]\s*\)/g, /from\s+['"`]([^'"`]+)['"`]/g ]; for (const pattern of importPatterns) { let match; while ((match = pattern.exec(content)) !== null) { const importPath = match[1].toLowerCase(); // Map common imports to domains if (importPath.includes('express')) { contextScores.set('web-frameworks', (contextScores.get('web-frameworks') || 0) + 0.3); } else if (importPath.includes('react')) { contextScores.set('web-frameworks', (contextScores.get('web-frameworks') || 0) + 0.3); } else if (importPath.includes('vue')) { contextScores.set('web-frameworks', (contextScores.get('web-frameworks') || 0) + 0.3); } else if (importPath.includes('angular')) { contextScores.set('web-frameworks', (contextScores.get('web-frameworks') || 0) + 0.3); } else if (importPath.includes('django') || importPath.includes('flask')) { contextScores.set('python', (contextScores.get('python') || 0) + 0.3); contextScores.set('web-frameworks', (contextScores.get('web-frameworks') || 0) + 0.2); } } } // Analyze package.json or requirements.txt content if (filePath.includes('package.json') || content.includes('"dependencies"')) { contextScores.set('javascript', (contextScores.get('javascript') || 0) + 0.4); if (content.includes('express')) { contextScores.set('web-frameworks', (contextScores.get('web-frameworks') || 0) + 0.4); } } else if (filePath.includes('requirements.txt') || filePath.includes('Pipfile')) { contextScores.set('python', (contextScores.get('python') || 0) + 0.4); } // Analyze Docker and infrastructure context if (content.includes('FROM ') || content.includes('COPY ') || filePath.includes('dockerfile')) { contextScores.set('devops', (contextScores.get('devops') || 0) + 0.3); } return contextScores; } /** * Load vocabularies from database */ private loadVocabularies(): void { try { const stmt = this.db.prepare(` SELECT domain, keywords, authority_patterns, boost_factor FROM domain_vocabulary ORDER BY domain `); const rows = stmt.all() as Array<{ domain: string; keywords: string; authority_patterns: string; boost_factor: number; }>; this.vocabularies.clear(); for (const row of rows) { try { const vocabulary: DomainVocabulary = { domain: row.domain, keywords: JSON.parse(row.keywords) as KeywordWeight[], authorityPatterns: JSON.parse(row.authority_patterns) as string[], boostFactor: row.boost_factor }; this.vocabularies.set(row.domain, vocabulary); } catch (parseError: any) { log.warn('Failed to parse vocabulary data for domain: ' + row.domain, parseError); } } this.lastVocabularyUpdate = Date.now(); log.info('Domain vocabularies loaded', { count: this.vocabularies.size, domains: Array.from(this.vocabularies.keys()) }); } catch (error: any) { log.error('Failed to load domain vocabularies', error); } } /** * Refresh vocabularies if cache is stale */ private refreshVocabulariesIfNeeded(): void { if (Date.now() - this.lastVocabularyUpdate > this.CACHE_TTL) { this.loadVocabularies(); } } /** * Tokenize query into words */ private tokenizeQuery(query: string): string[] { return query .toLowerCase() .replace(/[^\w\s]/g, ' ') // Replace punctuation with spaces .split(/\s+/) // Split on whitespace .filter(word => word.length > 1); // Filter out single characters } /** * Escape regex special characters */ private escapeRegExp(string: string): string { return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } /** * Add or update domain vocabulary */ async addVocabulary(vocabulary: DomainVocabulary): Promise<void> { try { const stmt = this.db.prepare(` INSERT OR REPLACE INTO domain_vocabulary (domain, keywords, authority_patterns, boost_factor) VALUES (?, ?, ?, ?) `); stmt.run( vocabulary.domain, JSON.stringify(vocabulary.keywords), JSON.stringify(vocabulary.authorityPatterns), vocabulary.boostFactor ); // Update in-memory cache this.vocabularies.set(vocabulary.domain, vocabulary); log.info('Domain vocabulary updated', { domain: vocabulary.domain, keywordCount: vocabulary.keywords.length }); } catch (error: any) { log.error('Failed to add domain vocabulary', error, { domainName: vocabulary.domain }); throw error; } } /** * Get all available domains */ getDomains(): string[] { this.refreshVocabulariesIfNeeded(); return Array.from(this.vocabularies.keys()); } /** * Get vocabulary for a specific domain */ getVocabulary(domain: string): DomainVocabulary | undefined { this.refreshVocabulariesIfNeeded(); return this.vocabularies.get(domain); } /** * Get boost factor for a domain */ getBoostFactor(domain: string): number { const vocabulary = this.getVocabulary(domain); return vocabulary?.boostFactor || 1.0; } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PatrickRuddiman/local-search-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server