MCP LLMS.txt Explorer

by thedaviddias
Verified
  • src
#!/usr/bin/env node
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
  CallToolRequestSchema,
  ListResourcesRequestSchema,
  ListToolsRequestSchema,
  ReadResourceRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import fetch from "node-fetch";
import { createRequire } from 'node:module';

const require = createRequire(import.meta.url);
const { version } = require('../package.json');

/** URL of the JSON list of known websites that is loaded at startup. */
const websites = 'https://raw.githubusercontent.com/thedaviddias/llms-txt-hub/main/data/websites.json'

/** Timeout applied to each individual HTTP request. */
const FETCH_TIMEOUT_MS = 5000;
/** Timeout applied to the whole batch of linked-content requests. */
const LINKED_CONTENT_TIMEOUT_MS = 10000;
/** Timeout applied to one complete website check. */
const GLOBAL_TIMEOUT_MS = 15000;
/** Maximum number of URLs referenced by llms.txt that are fetched. */
const MAX_LINKED_URLS = 3;
/** Scheme prefix used for the resource URIs exposed by this server. */
const RESOURCE_URI_PREFIX = 'website://';

/**
 * Type for a website with llms.txt information
 */
interface Website {
  name: string;
  domain: string;
  description: string;
  llmsTxtUrl?: string;
  llmsFullTxtUrl?: string;
  category?: string;
  favicon?: string;
}

/**
 * Type for a linked content from llms.txt
 */
interface LinkedContent {
  url: string;
  content?: string;
  error?: string;
}

/**
 * Type for the check website result
 */
interface WebsiteCheckResult {
  hasLlmsTxt: boolean;
  hasLlmsFullTxt: boolean;
  llmsTxtUrl?: string;
  llmsFullTxtUrl?: string;
  llmsTxtContent?: string;
  llmsFullTxtContent?: string;
  linkedContents?: LinkedContent[];
  error?: string;
}

/**
 * Known websites with llms.txt files.
 * Populated by fetchWebsitesList() before the server starts.
 */
let knownWebsites: Website[] = [];

/**
 * Cache for website check results, keyed by the domain string passed to
 * checkWebsite(). A Map is used because the keys are caller-supplied: with a
 * plain object, keys such as "constructor" would resolve to inherited
 * Object.prototype members and be mistaken for cached results.
 */
const websiteCheckCache = new Map<string, WebsiteCheckResult>();

/**
 * Create an MCP server for exploring llms.txt files
 */
const server = new Server(
  {
    name: "LLMS.txt Explorer",
    version,
  },
  {
    capabilities: {
      resources: {},
      tools: {},
    },
  }
);

/**
 * Return true when the value is either absent or a string.
 */
function isOptionalString(value: unknown): boolean {
  return value === undefined || typeof value === 'string';
}

/**
 * Validate website data.
 *
 * @param website - Arbitrary value taken from the downloaded list.
 * @returns true when the value has string `name`, `domain` and `description`
 *   fields and every optional field is either missing or a string.
 */
function isValidWebsite(website: unknown): website is Website {
  if (!website || typeof website !== 'object') return false;

  const w = website as Record<string, unknown>;
  return (
    typeof w.name === 'string' &&
    typeof w.domain === 'string' &&
    typeof w.description === 'string' &&
    isOptionalString(w.llmsTxtUrl) &&
    isOptionalString(w.llmsFullTxtUrl) &&
    isOptionalString(w.category) &&
    isOptionalString(w.favicon)
  );
}

/**
 * Prefix the value with "https://" unless it already starts with an
 * http:// or https:// scheme.
 */
function normalizeDomain(domain: string): string {
  if (domain.startsWith('http://') || domain.startsWith('https://')) {
    return domain;
  }
  return `https://${domain}`;
}

/**
 * Extract the hostname from a domain or URL string.
 *
 * @param domain - Value with or without an http(s) scheme.
 * @returns The parsed hostname, or undefined when the value is not a valid URL.
 */
function getHostname(domain: string): string | undefined {
  try {
    return new URL(normalizeDomain(domain)).hostname;
  } catch {
    return undefined;
  }
}

/**
 * Fetch websites list from GitHub and store the valid entries in
 * `knownWebsites`. On any failure a single built-in entry is used instead,
 * so this function never rejects.
 */
async function fetchWebsitesList(): Promise<void> {
  try {
    console.error('Fetching websites list from GitHub...');
    const response = await fetch(websites);
    if (!response.ok) {
      throw new Error(`Failed to fetch websites list: ${response.status}`);
    }

    const data = await response.json();
    if (!Array.isArray(data)) {
      throw new Error('Invalid data format: expected an array');
    }

    const validWebsites = data.filter(isValidWebsite);
    console.error(`Fetched ${validWebsites.length} valid websites`);
    knownWebsites = validWebsites;
  } catch (error) {
    console.error('Error fetching websites list:', error);
    // Fallback to default website if fetch fails
    knownWebsites = [{
      name: "Supabase",
      domain: "https://supabase.com",
      description: "Build production-grade applications with Postgres",
      llmsTxtUrl: "https://supabase.com/llms.txt",
      category: "developer-tools"
    }];
  }
}

/**
 * Extract linked URLs from llms.txt content.
 *
 * A line counts as a link when, after trimming, it starts with "@"; the URL
 * is the trimmed remainder of that line. Empty remainders are skipped.
 */
function extractLinkedUrls(content: string): string[] {
  const urls: string[] = [];
  for (const line of content.split('\n')) {
    const trimmedLine = line.trim();
    if (!trimmedLine.startsWith('@')) continue;

    const url = trimmedLine.slice(1).trim();
    if (url) {
      urls.push(url);
    }
  }
  return urls;
}

/**
 * Resolve with the value of `promise`, or reject with `new Error(message)`
 * when it has not settled within `ms` milliseconds.
 *
 * The timer is always cleared once the race settles, so no timer is left
 * pending after the call returns. The timeout promise is created and raced in
 * the same step, so its rejection always has a handler attached.
 */
async function withTimeout<T>(promise: Promise<T>, ms: number, message: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      reject(new Error(message));
    }, ms);
  });

  try {
    return await Promise.race([promise, timeout]);
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Fetch a URL, aborting the request when no response arrives within `timeout`
 * milliseconds.
 *
 * @param url - Absolute URL to request.
 * @param timeout - Abort delay in milliseconds.
 * @returns The response object (the body has not been read yet).
 * @throws Whatever `fetch` throws, including the abort error on timeout.
 */
async function fetchWithTimeout(url: string, timeout = FETCH_TIMEOUT_MS) {
  console.error(`Fetching ${url} with ${timeout}ms timeout`);

  const controller = new AbortController();
  const timeoutId = setTimeout(() => {
    controller.abort();
    console.error(`Timeout after ${timeout}ms for ${url}`);
  }, timeout);

  try {
    const startTime = Date.now();
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'llms-txt-explorer/0.1.0'
      }
    });
    console.error(`Fetch completed in ${Date.now() - startTime}ms for ${url}`);
    return response;
  } catch (error) {
    console.error(`Fetch error for ${url}:`, error);
    throw error;
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Fetch the given linked URLs in parallel.
 *
 * Each URL yields one entry holding either its content or an error message.
 * When the batch as a whole exceeds LINKED_CONTENT_TIMEOUT_MS, every entry is
 * reported with a timeout error instead. Never rejects.
 */
async function fetchLinkedContents(linkedUrls: string[]): Promise<LinkedContent[]> {
  const fetchPromises = linkedUrls.map(async (url): Promise<LinkedContent> => {
    console.error(`Fetching linked content from: ${url}`);
    try {
      const linkedRes = await fetchWithTimeout(url);
      if (!linkedRes.ok) {
        throw new Error(`Failed to fetch content: ${linkedRes.status}`);
      }
      const linkedContent = await linkedRes.text();
      console.error(`Linked content length: ${linkedContent.length} bytes`);
      return { url, content: linkedContent };
    } catch (error) {
      console.error(`Error fetching linked content from ${url}:`, error);
      return {
        url,
        error: error instanceof Error ? error.message : 'Unknown error'
      };
    }
  });

  try {
    return await withTimeout(
      Promise.all(fetchPromises),
      LINKED_CONTENT_TIMEOUT_MS,
      'Linked content fetch timeout'
    );
  } catch (error) {
    console.error('Error fetching linked contents:', error);
    return linkedUrls.map(url => ({
      url,
      error: 'Timeout fetching linked contents'
    }));
  }
}

/**
 * Probe `${baseUrl}/llms.txt` and, when that succeeds, `${baseUrl}/llms-full.txt`.
 *
 * Failures while fetching llms.txt are recorded in `result.error`; failures
 * while fetching llms-full.txt are only logged. Never rejects.
 */
async function runChecks(baseUrl: string): Promise<WebsiteCheckResult> {
  const result: WebsiteCheckResult = {
    hasLlmsTxt: false,
    hasLlmsFullTxt: false
  };

  // Check for llms.txt
  try {
    const llmsTxtUrl = `${baseUrl}/llms.txt`;
    console.error('Fetching llms.txt from:', llmsTxtUrl);
    const llmsTxtRes = await fetchWithTimeout(llmsTxtUrl);
    console.error('llms.txt response status:', llmsTxtRes.status);

    if (llmsTxtRes.ok) {
      result.hasLlmsTxt = true;
      result.llmsTxtUrl = llmsTxtUrl;
      const content = await llmsTxtRes.text();
      console.error(`llms.txt content length: ${content.length} bytes`);
      result.llmsTxtContent = content;
      console.error('Successfully fetched llms.txt');

      // Only the first few linked URLs are followed to bound the work done.
      const linkedUrls = extractLinkedUrls(content).slice(0, MAX_LINKED_URLS);
      if (linkedUrls.length > 0) {
        console.error(`Found ${linkedUrls.length} linked URLs in llms.txt (limited to ${MAX_LINKED_URLS})`);
        result.linkedContents = await fetchLinkedContents(linkedUrls);
      }
    }
  } catch (error: unknown) {
    console.error('Error in main llms.txt fetch:', error);
    result.error = error instanceof Error ? error.message : 'Unknown error fetching llms.txt';
  }

  // Only check llms-full.txt if llms.txt was successful
  if (result.hasLlmsTxt && !result.error) {
    try {
      const llmsFullTxtUrl = `${baseUrl}/llms-full.txt`;
      console.error('Fetching llms-full.txt from:', llmsFullTxtUrl);
      const llmsFullTxtRes = await fetchWithTimeout(llmsFullTxtUrl);
      console.error('llms-full.txt response status:', llmsFullTxtRes.status);

      if (llmsFullTxtRes.ok) {
        result.hasLlmsFullTxt = true;
        result.llmsFullTxtUrl = llmsFullTxtUrl;
        const content = await llmsFullTxtRes.text();
        console.error(`llms-full.txt content length: ${content.length} bytes`);
        result.llmsFullTxtContent = content;
        console.error('Successfully fetched llms-full.txt');
      }
    } catch (error) {
      console.error('Error fetching llms-full.txt:', error);
      // Don't fail the whole operation for llms-full.txt errors
    }
  }

  return result;
}

/**
 * Check if a website has llms.txt files.
 *
 * @param domain - Domain or URL to check; "https://" is assumed when no
 *   http(s) scheme is present. Only the origin of the URL is used.
 * @returns The check result. Problems (invalid URL, network failure, global
 *   timeout) are reported through the `error` field; this function does not
 *   reject. Results without an `error` are cached per `domain` string.
 */
async function checkWebsite(domain: string): Promise<WebsiteCheckResult> {
  console.error('Starting website check for:', domain);

  // Return cached result if available
  const cached = websiteCheckCache.get(domain);
  if (cached) {
    console.error('Returning cached result for:', domain);
    return cached;
  }

  try {
    const normalizedDomain = normalizeDomain(domain);
    console.error('Normalized domain:', normalizedDomain);

    // Validate URL format
    let url: URL;
    try {
      url = new URL(normalizedDomain);
    } catch {
      console.error('Invalid URL:', domain);
      throw new Error(`Invalid URL format: ${domain}`);
    }

    const baseUrl = url.origin;
    console.error('Base URL:', baseUrl);

    // The global timeout is only armed here, after validation, so an early
    // validation failure cannot leave a rejecting promise without a handler.
    const finalResult = await withTimeout(
      runChecks(baseUrl),
      GLOBAL_TIMEOUT_MS,
      'Global timeout exceeded'
    );

    // Cache successful results only
    if (!finalResult.error) {
      websiteCheckCache.set(domain, finalResult);
    }

    console.error('Final result:', JSON.stringify(finalResult, null, 2));
    return finalResult;
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error('Error checking website:', errorMessage);
    return {
      hasLlmsTxt: false,
      hasLlmsFullTxt: false,
      error: errorMessage
    };
  }
}

/**
 * Handler for listing available websites as resources.
 *
 * The URI carries only the hostname: `site.domain` may already include a
 * scheme, and embedding it verbatim would produce URIs such as
 * "website://https://example.com" whose hostname parses as "https".
 */
server.setRequestHandler(ListResourcesRequestSchema, async () => {
  return {
    resources: knownWebsites.map(site => ({
      uri: `${RESOURCE_URI_PREFIX}${getHostname(site.domain) ?? site.domain}`,
      mimeType: "application/json",
      name: site.name,
      description: site.description
    }))
  };
});

/**
 * Handler for reading website information.
 *
 * Accepts both "website://example.com" and the scheme-embedding form
 * "website://https://example.com"; the site is matched by hostname.
 *
 * @throws Error when no known website matches the requested URI.
 */
server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
  const uri = request.params.uri;
  const target = uri.startsWith(RESOURCE_URI_PREFIX)
    ? uri.slice(RESOURCE_URI_PREFIX.length)
    : uri;
  const domain = getHostname(target);

  const website = domain === undefined
    ? undefined
    : knownWebsites.find(site => getHostname(site.domain) === domain);

  if (!website) {
    throw new Error(`Website ${domain ?? uri} not found in known websites`);
  }

  const checkResult = await checkWebsite(website.domain);

  return {
    contents: [{
      uri: request.params.uri,
      mimeType: "application/json",
      text: JSON.stringify({ ...website, ...checkResult }, null, 2)
    }]
  };
});

/**
 * Handler that lists available tools
 */
server.setRequestHandler(ListToolsRequestSchema, async () => {
  return {
    tools: [
      {
        name: "check_website",
        description: "Check if a website has llms.txt files",
        inputSchema: {
          type: "object",
          properties: {
            url: {
              type: "string",
              description: "URL of the website to check"
            }
          },
          required: ["url"]
        }
      },
      {
        name: "list_websites",
        description: "List known websites with llms.txt files",
        inputSchema: {
          type: "object",
          properties: {
            filter_llms_txt: {
              type: "boolean",
              description: "Only show websites with llms.txt"
            },
            filter_llms_full_txt: {
              type: "boolean",
              description: "Only show websites with llms-full.txt"
            }
          }
        }
      }
    ]
  };
});

/**
 * Handler for tool calls.
 *
 * @throws Error("Unknown tool") when the requested tool name is not recognised.
 */
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  console.error('Received tool request:', request.params.name);

  switch (request.params.name) {
    case "check_website": {
      // Do not coerce with String(): a missing argument would become the
      // truthy text "undefined" and slip past the required-argument check.
      const rawUrl = request.params.arguments?.url;
      const url = typeof rawUrl === 'string' ? rawUrl.trim() : '';
      console.error('Checking website:', url);

      if (!url) {
        console.error('URL is required');
        return {
          content: [{
            type: "text",
            text: JSON.stringify({ error: "URL is required" }, null, 2)
          }]
        };
      }

      try {
        const result = await checkWebsite(url);
        console.error('Tool returning result:', JSON.stringify(result, null, 2));
        return {
          content: [{
            type: "text",
            text: JSON.stringify(result, null, 2)
          }]
        };
      } catch (error) {
        const errorMessage = error instanceof Error ? error.message : 'Unknown error';
        console.error('Tool returning error:', errorMessage);
        return {
          content: [{
            type: "text",
            text: JSON.stringify({ error: errorMessage }, null, 2)
          }]
        };
      }
    }

    case "list_websites": {
      const filterLlmsTxt = Boolean(request.params.arguments?.filter_llms_txt);
      const filterLlmsFullTxt = Boolean(request.params.arguments?.filter_llms_full_txt);

      let websites = knownWebsites;

      if (filterLlmsTxt) {
        websites = websites.filter(site => site.llmsTxtUrl);
      }
      if (filterLlmsFullTxt) {
        websites = websites.filter(site => site.llmsFullTxtUrl);
      }

      return {
        content: [{
          type: "text",
          text: JSON.stringify(websites, null, 2)
        }]
      };
    }

    default:
      throw new Error("Unknown tool");
  }
});

/**
 * Start the server using stdio transport
 */
async function main(): Promise<void> {
  // Fetch websites list before starting the server
  await fetchWebsitesList();

  const transport = new StdioServerTransport();
  await server.connect(transport);
}

main().catch((error) => {
  console.error("Server error:", error);
  process.exit(1);
});