Universal Documentation MCP Server

mcpbook
src

domainDetector.ts•11.3 KiB

import { GitBookContent, GitBookPage } from './scraper.js'; import { gitBookConfig } from './config.js'; export interface DomainInfo { name: string; description: string; keywords: string[]; toolPrefix: string; } export class DomainDetector { private static extractDomainInfo(url: string): { domain: string; keywords: string[] } { try { const urlObj = new URL(url); const domain = urlObj.hostname; // Extract meaningful parts from domain const parts = domain .replace(/^www\.|\.com$|\.org$|\.net$|\.io$|\.dev$/g, '') .split('.') .filter(part => part.length > 2) // Filter out TLDs and very short parts .map(part => part.toLowerCase()); // Convert domain parts to keywords const keywords = parts .flatMap(part => { // Split camelCase/PascalCase const words = part.replace(/([a-z])([A-Z])/g, '$1 $2').toLowerCase().split(' '); // Split by special characters return words.flatMap(word => word.split(/[-_]/)); }) .filter(word => word.length > 2); // Filter out very short words return { domain, keywords: [...new Set(keywords)] // Remove duplicates }; } catch { return { domain: '', keywords: [] }; } } private static readonly DOMAIN_PATTERNS = new Map([ // Generic documentation patterns with improved matching [/(?:official\s+)?(?:documentation|docs|guide|manual|reference)(?:\s+portal|\s+hub)?/i, { name: 'generic-docs', description: 'Documentation Portal', keywords: ['documentation', 'docs', 'guide', 'manual', 'reference'], toolPrefix: 'generic_' }], // Programming language patterns [/python|javascript|java|csharp|ruby|php/i, { name: 'programming-docs', description: 'Programming language documentation', keywords: ['python', 'javascript', 'java', 'csharp', 'ruby', 'php'], toolPrefix: 'programming_' }], // Framework patterns [/react|angular|vue|django|flask/i, { name: 'framework-docs', description: 'Framework documentation', keywords: ['react', 'angular', 'vue', 'django', 'flask'], toolPrefix: 'framework_' }], // Generic blockchain patterns [/blockchain|crypto|defi|web3|solana|ethereum|polygon|avalanche|cardano|binance|validators?|nodes?|consensus|proof.of.stake|smart.contracts?|dapps?/i, { name: 'blockchain-docs', description: 'Blockchain and cryptocurrency documentation', keywords: ['blockchain', 'crypto', 'defi', 'web3', 'smart-contracts', 'validators', 'nodes'], toolPrefix: 'blockchain_' }], // API documentation patterns [/api.*reference|rest.*api|graphql/i, { name: 'api-docs', description: 'API reference documentation', keywords: ['api', 'rest', 'graphql', 'endpoints', 'reference'], toolPrefix: 'api_' }], // SDK patterns [/sdk|library|framework/i, { name: 'sdk-docs', description: 'SDK and development library documentation', keywords: ['sdk', 'library', 'framework', 'development'], toolPrefix: 'sdk_' }] ]); static detectDomain(content: GitBookContent, baseUrl?: string): DomainInfo { // If auto-detection is disabled, use config values if (!gitBookConfig.autoDetectDomain) { return { name: gitBookConfig.serverName, description: gitBookConfig.serverDescription, keywords: gitBookConfig.domainKeywords, toolPrefix: gitBookConfig.toolPrefix }; } // Extract domain from URL if available const domainInfo = baseUrl ? this.extractDomainInfo(baseUrl) : null; // Collect text for analysis with improved sampling const pages = Object.values(content); const sampleSize = Math.min(15, pages.length); // Analyze more pages for better context // Sort pages by content length and take a mix of long and short pages for better representation const sortedPages = [...pages].sort((a, b) => b.content.length - a.content.length); const sampledPages = [ ...sortedPages.slice(0, Math.floor(sampleSize / 2)), // Top half by length ...sortedPages.slice(-Math.floor(sampleSize / 2)) // Bottom half by length ]; const textSample = sampledPages .map(page => `${page.title} ${page.content} ${page.section}`) .join(' ') .toLowerCase(); // Try to match domain patterns for (const [pattern, domainInfo] of this.DOMAIN_PATTERNS) { if (pattern.test(textSample)) { return { ...domainInfo, // Merge detected keywords with config if auto-detect keywords is enabled keywords: gitBookConfig.autoDetectKeywords ? [...new Set([...domainInfo.keywords, ...gitBookConfig.domainKeywords])] : gitBookConfig.domainKeywords }; } } // Smart keyword extraction const detectedKeywords = this.extractKeywords(textSample); const siteName = this.extractSiteName(pages); // Incorporate domain information if available let combinedKeywords = detectedKeywords; let finalSiteName = siteName; if (domainInfo) { combinedKeywords = [ ...domainInfo.keywords, ...detectedKeywords ]; finalSiteName = finalSiteName || domainInfo.domain; } // Get the most relevant keywords by frequency and position const topKeywords = this.rankKeywords(combinedKeywords, textSample); // Generate tool prefix from domain info and keywords const toolPrefix = this.generateToolPrefix(domainInfo, topKeywords, finalSiteName); // Generate a smarter description using top keywords const description = this.generateDescription(domainInfo, topKeywords, finalSiteName); return { name: finalSiteName || gitBookConfig.serverName, description, keywords: gitBookConfig.autoDetectKeywords ? [...new Set([...topKeywords, ...gitBookConfig.domainKeywords])] : gitBookConfig.domainKeywords, toolPrefix }; } private static extractKeywords(text: string): string[] { const commonTechWords = [ 'api', 'sdk', 'authentication', 'auth', 'config', 'setup', 'installation', 'guide', 'tutorial', 'reference', 'documentation', 'docs', 'getting-started', 'blockchain', 'crypto', 'transaction', 'smart-contract', 'defi', 'web3', 'react', 'javascript', 'typescript', 'node', 'npm', 'yarn', 'webpack', 'database', 'sql', 'nosql', 'mongodb', 'postgres', 'redis', 'security', 'encryption', 'oauth', 'jwt', 'ssl', 'https' ]; const wordCounts = new Map<string, number>(); const words = text.toLowerCase().match(/\b\w{3,}\b/g) || []; // Count occurrences of tech words words.forEach(word => { if (commonTechWords.includes(word)) { wordCounts.set(word, (wordCounts.get(word) || 0) + 1); } }); // Return top keywords by frequency return Array.from(wordCounts.entries()) .sort(([,a], [,b]) => b - a) .slice(0, 8) .map(([word]) => word); } private static extractSiteName(pages: GitBookPage[]): string | null { if (pages.length === 0) return null; // Try to extract from common title patterns const titles = pages.map(page => page.title); const commonPrefixes = this.findCommonPrefixes(titles); if (commonPrefixes.length > 0) { const siteName = commonPrefixes[0] .toLowerCase() .replace(/\s+/g, '-') .replace(/[^\w-]/g, '') .replace(/-+/g, '-') .replace(/^-|-$/g, ''); return siteName ? `${siteName}-docs` : null; } return null; } private static rankKeywords(keywords: string[], textSample: string): string[] { const wordScores = new Map<string, number>(); keywords.forEach(word => { let score = 0; // Base frequency score const regex = new RegExp(`\\b${word}\\b`, 'gi'); const matches = textSample.match(regex) || []; score += matches.length; // Boost score if word appears in important positions (start of sentences, after colons) const importantPositionRegex = new RegExp(`(?:\\.|:|^)\\s*\\b${word}\\b`, 'gi'); const importantMatches = textSample.match(importantPositionRegex) || []; score += importantMatches.length * 2; // Boost technical terms if (this.isTechnicalTerm(word)) { score *= 1.5; } wordScores.set(word, score); }); // Return top 10 keywords by score return Array.from(wordScores.entries()) .sort(([,a], [,b]) => b - a) .slice(0, 10) .map(([word]) => word); } private static isTechnicalTerm(word: string): boolean { const technicalPatterns = [ /^(?:api|sdk|cli)$/i, /(?:sync|async)$/i, /^(?:micro|macro|meta|auto|multi)/i, /(?:service|platform|framework|library|module)$/i, /(?:config|setup|deploy|build|test|dev)/i ]; return technicalPatterns.some(pattern => pattern.test(word)); } private static generateToolPrefix(domainInfo: { domain: string; keywords: string[] } | null, keywords: string[], siteName: string | null): string { // Use domain keywords first, then detected keywords, prioritizing non-generic terms const sourceKeywords = domainInfo?.keywords || keywords; // Filter out generic terms and pick the most specific keyword const genericTerms = ['docs', 'documentation', 'guide', 'manual', 'reference', 'api', 'www']; const specificKeywords = sourceKeywords.filter(kw => !genericTerms.includes(kw.toLowerCase())); const primaryKeyword = specificKeywords[0] || sourceKeywords[0] || 'docs'; // Clean and format the primary keyword for tool prefix const cleanKeyword = primaryKeyword.toLowerCase().replace(/[^a-z0-9]/g, ''); return `${cleanKeyword}_docs_`; } private static generateDescription(domainInfo: { domain: string; keywords: string[] } | null, keywords: string[], siteName: string | null): string { // Use domain keywords first, then detected keywords, prioritizing non-generic terms const sourceKeywords = domainInfo?.keywords || keywords; // Filter out generic terms and pick the most specific keyword const genericTerms = ['docs', 'documentation', 'guide', 'manual', 'reference', 'api', 'www']; const specificKeywords = sourceKeywords.filter(kw => !genericTerms.includes(kw.toLowerCase())); const primaryKeyword = specificKeywords[0] || 'Documentation'; // Capitalize first letter const capitalizedKeyword = primaryKeyword.charAt(0).toUpperCase() + primaryKeyword.slice(1); return `${capitalizedKeyword} documentation and development resources`; } private static findCommonPrefixes(titles: string[]): string[] { if (titles.length < 2) return []; const prefixes = new Map<string, number>(); titles.forEach(title => { const words = title.split(/\s+/); for (let i = 1; i <= Math.min(3, words.length); i++) { const prefix = words.slice(0, i).join(' '); if (prefix.length > 2) { prefixes.set(prefix, (prefixes.get(prefix) || 0) + 1); } } }); // Return prefixes that appear in at least 30% of titles const threshold = Math.max(2, Math.floor(titles.length * 0.3)); return Array.from(prefixes.entries()) .filter(([, count]) => count >= threshold) .sort(([,a], [,b]) => b - a) .map(([prefix]) => prefix); } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/tcsenpai/mcpbook'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

domainDetector.ts•11.3 KiB