Superglue MCP

Official

Overview Schema Related Servers Score Discussions

fetching-playwright.ts•30.7 KiB

/**
 * Playwright Web Crawling Strategy
 *
 * Uses headless browser to crawl documentation sites.
 * - First tries sitemap-based discovery for comprehensive coverage
 * - Falls back to iterative link crawling if sitemap fails
 * - Filters out non-documentation pages (login, pricing, etc.)
 */
import playwright from '@playwright/test';
import { Metadata } from "@superglue/shared";
import axios from "axios";
import { server_defaults } from '../../default.js';
import { logMessage } from "../../utils/logs.js";
import { filterDocumentationUrls } from '../documentation-utils.js';
import { DocumentationConfig, DocumentationFetchingStrategy } from '../types.js';

/** Escapes regex metacharacters so `text` is matched literally when embedded in a RegExp. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/** Extracts a printable message from a caught value without assuming it is an `Error`. */
function errorMessage(error: unknown): string {
  if (typeof error === 'object' && error !== null && 'message' in error) {
    return String((error as { message: unknown }).message);
  }
  return String(error);
}

// Similarity functions for deduplication

/**
 * Sørensen–Dice coefficient over the sets of lowercased, whitespace-separated words
 * of both strings: `2 * |A ∩ B| / (|A| + |B|)`.
 */
function diceCoefficient(str1: string, str2: string): number {
  const words1 = new Set(str1.toLowerCase().split(/\s+/));
  const words2 = new Set(str2.toLowerCase().split(/\s+/));
  const intersection = new Set([...words1].filter(x => words2.has(x)));
  return (2 * intersection.size) / (words1.size + words2.size);
}

/**
 * Jaccard index over the sets of lowercased, whitespace-separated words
 * of both strings: `|A ∩ B| / |A ∪ B|`.
 */
function jaccardSimilarity(str1: string, str2: string): number {
  const words1 = new Set(str1.toLowerCase().split(/\s+/));
  const words2 = new Set(str2.toLowerCase().split(/\s+/));
  const intersection = new Set([...words1].filter(x => words2.has(x)));
  const union = new Set([...words1, ...words2]);
  return intersection.size / union.size;
}

/**
 * Documentation fetching strategy that renders pages in headless Chromium.
 *
 * One browser is shared by all instances (static); each instance lazily creates
 * its own browser context, which `tryFetch` closes when it finishes.
 */
export class PlaywrightFetchingStrategy implements DocumentationFetchingStrategy {
  private static readonly MAX_FETCHED_LINKS = server_defaults.DOCUMENTATION.MAX_FETCHED_LINKS;
  private static readonly PARALLEL_FETCH_LIMIT = server_defaults.DOCUMENTATION.MAX_PAGES_TO_FETCH_IN_PARALLEL;
  private static browserInstance: playwright.Browser | null = null;
  /** In-flight browser launch, shared so that concurrent callers do not each start a browser. */
  private static browserLaunch: Promise<playwright.Browser> | null = null;
  private browserContext: playwright.BrowserContext | null = null;
  /** In-flight context creation, shared so that concurrent callers do not each create a context. */
  private contextCreation: Promise<playwright.BrowserContext> | null = null;

  /** Substrings that mark a URL path as non-documentation (marketing, auth, localized copies, ...). */
  public static readonly EXCLUDED_LINK_KEYWORDS = [
    'signup', 'login', 'pricing', 'contact', 'support', 'cookie',
    'privacy', 'terms', 'legal', 'policy', 'status', 'help',
    'blog', 'careers', 'about', 'press', 'news', 'events',
    'partners', 'changelog', 'release-notes', 'updates', 'upgrade',
    'register', 'cli', 'signin', 'sign-in', 'sign-up', 'trial',
    'demo', 'sales', 'widget', 'webhooks',
    '/de/', '/it/', '/fr/', '/nl/', '/es/', '/pt/', '/pl/', '/ru/',
    '/ja/', '/zh/', '/ko/', '/zh-CN/', '/zh-TW/', '/id/'
  ];

  /**
   * Case-insensitive check whether `path` contains any excluded keyword.
   * Both sides are lowercased so mixed-case entries such as '/zh-CN/' can match.
   */
  private static containsExcludedKeyword(path: string): boolean {
    const pathLower = path.toLowerCase();
    return PlaywrightFetchingStrategy.EXCLUDED_LINK_KEYWORDS.some(kw => pathLower.includes(kw.toLowerCase()));
  }

  /** Returns the pathname of `href`, or `href` itself when it cannot be parsed as a URL. */
  private static pathnameOf(href: string): string {
    try {
      return new URL(href).pathname;
    } catch {
      return href;
    }
  }

  /**
   * Returns the shared browser, launching it on first use.
   * Callers that arrive while a launch is pending await that same launch.
   */
  private static async getBrowser(): Promise<playwright.Browser> {
    if (PlaywrightFetchingStrategy.browserInstance) {
      return PlaywrightFetchingStrategy.browserInstance;
    }

    const pending = PlaywrightFetchingStrategy.browserLaunch ?? playwright.chromium.launch({
      headless: true,
      args: [
        '--disable-blink-features=AutomationControlled',
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
      ]
    });
    PlaywrightFetchingStrategy.browserLaunch = pending;

    try {
      const browser = await pending;
      PlaywrightFetchingStrategy.browserInstance = browser;
      return browser;
    } finally {
      // Clear the marker on success and on failure so a failed launch can be retried.
      if (PlaywrightFetchingStrategy.browserLaunch === pending) {
        PlaywrightFetchingStrategy.browserLaunch = null;
      }
    }
  }

  /**
   * Closes the shared browser if one is cached. When a graceful close fails,
   * tries to SIGKILL the underlying process. Never throws.
   */
  static async closeBrowser(): Promise<void> {
    const closedInstance = PlaywrightFetchingStrategy.browserInstance;
    if (!closedInstance) {
      return;
    }
    // Drop the reference first so later callers launch a fresh browser.
    PlaywrightFetchingStrategy.browserInstance = null;

    try {
      await closedInstance.close();
    } catch (error) {
      console.warn('Failed to close browser gracefully:', errorMessage(error));
      try {
        // NOTE(review): `_process` is not part of the typed Browser API; confirm it still exists when upgrading Playwright.
        const browserProcess = (closedInstance as any)._process;
        if (browserProcess && !browserProcess.killed) {
          browserProcess.kill('SIGKILL');
        }
      } catch (killError) {
        console.warn('Failed to force kill browser:', errorMessage(killError));
      }
    }
  }

  /** Creates a new browser context carrying a desktop Chrome fingerprint and `config.headers`. */
  private async createContext(config: DocumentationConfig): Promise<playwright.BrowserContext> {
    const browser = await PlaywrightFetchingStrategy.getBrowser();
    return browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 },
      locale: 'en-US',
      timezoneId: 'America/New_York',
      extraHTTPHeaders: config.headers || {}
    });
  }

  /**
   * Returns this instance's browser context, creating it on first use.
   * Pages fetched in parallel share one pending creation instead of each
   * creating (and leaking) their own context.
   */
  private async getOrCreateContext(config: DocumentationConfig): Promise<playwright.BrowserContext> {
    if (this.browserContext) {
      return this.browserContext;
    }

    const pending = this.contextCreation ?? this.createContext(config);
    this.contextCreation = pending;

    try {
      const context = await pending;
      this.browserContext = context;
      return context;
    } finally {
      if (this.contextCreation === pending) {
        this.contextCreation = null;
      }
    }
  }

  /** Closes this instance's browser context, ignoring close errors. */
  private async cleanupContext(): Promise<void> {
    if (this.browserContext) {
      try {
        await this.browserContext.close();
      } catch (e) {
        // Best effort: the reference is dropped below regardless.
      }
      this.browserContext = null;
    }
  }

  /**
   * Loads one page in the browser, strips non-content elements and returns what is left.
   *
   * @param urlString Absolute URL; anything not starting with "http" yields `null`.
   * @param config Supplies optional `queryParams` (appended to the URL) and `headers` (via the context).
   * @param metadata Passed through to `logMessage`.
   * @returns `content` (HTML of the main content element, or of the whole body when none matches),
   *          `textContent` (body inner text) and `links` (label -> href without fragment),
   *          or `null` when the page could not be fetched.
   */
  private async fetchPageContentWithPlaywright(
    urlString: string,
    config: DocumentationConfig,
    metadata: Metadata
  ): Promise<{ content: string; textContent: string; links: Record<string, string>; } | null> {
    if (!urlString?.startsWith("http")) {
      return null;
    }

    let page: playwright.Page | null = null;
    try {
      const browserContext = await this.getOrCreateContext(config);

      const url = new URL(urlString);
      if (config.queryParams) {
        Object.entries(config.queryParams).forEach(([key, value]) => {
          url.searchParams.append(key, value);
        });
      }

      page = await browserContext.newPage();

      // Override navigator properties that commonly reveal an automated browser.
      await page.addInitScript(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        Object.defineProperty(navigator, 'platform', { get: () => 'MacIntel' });
        Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
        Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
        Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
      });

      await page.goto(url.toString(), { timeout: server_defaults.DOCUMENTATION.TIMEOUTS.PLAYWRIGHT });
      await page.waitForLoadState('domcontentloaded', { timeout: server_defaults.DOCUMENTATION.TIMEOUTS.PLAYWRIGHT });
      // Fixed grace period after DOMContentLoaded before the DOM is read.
      await page.waitForTimeout(1000);

      // Runs inside the page: it must not reference anything from this module's scope.
      const result = await page.evaluate(() => {
        const selectorsToRemove = [
          'img, video, svg, canvas, iframe, picture, source, audio, embed, object',
          '[role="banner"], [role="dialog"], [role="contentinfo"], [role="complementary"]',
          '.cookie-banner, .cookie-consent, .cookies, .gdpr, .privacy-notice',
          'nav, header, footer, aside, .sidebar, .menu, .navbar, .toolbar',
          '.social, .share, .chat, .feedback, .comments, .disqus',
          '.intercom, .drift, .zendesk, .freshchat, .tawk',
          '.ads, .advertisement, .banner, .promo, .sponsored',
          'script, style, noscript, link[rel="stylesheet"]',
          '[data-ga], [data-gtm], [data-analytics], [data-track]',
          '.breadcrumb, .pagination, .pager',
          '.related, .recommended, .also-see',
          'form, input, button, select, textarea'
        ];

        selectorsToRemove.forEach(selector =>
          document.querySelectorAll(selector).forEach(el => el.remove())
        );

        // Links are collected after the cleanup above, so only anchors that survived it are returned.
        const links: Record<string, string> = {};
        document.querySelectorAll('a').forEach(link => {
          try {
            const anchor = link as HTMLAnchorElement;
            const url = new URL(anchor.href);
            const key = `${anchor.textContent} ${url.pathname}`.toLowerCase().replace(/[^a-z0-9]/g, ' ').trim();
            links[key] = anchor.href.split('#')[0].trim();
          } catch (e) {
            // Anchors without a parseable href are skipped.
          }
        });

        const mainContent = document.querySelector('article, main, .docs-content, .markdown, .md-content, .api-content, .docContent, .content, .doc-body');
        const html = mainContent
          ? `<html><body>${mainContent.outerHTML}</body></html>`
          : `<html><body>${document.body?.innerHTML || ''}</body></html>`;
        const textContent = document.body?.innerText || '';

        return { html, textContent, links };
      });

      if (!result || !result.html) {
        logMessage('warn', `Failed to extract content from ${urlString}`, metadata);
        return null;
      }

      let html = result.html;
      const { textContent, links } = result;

      // The limit is compared against the string length (UTF-16 code units), not encoded bytes.
      if (html.length > server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES) {
        logMessage('warn', `Page ${urlString} exceeds size limit after cleanup (${Math.round(html.length / 1024 / 1024)}MB > ${Math.round(server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES / 1024 / 1024)}MB), truncating`, metadata);
        html = html.substring(0, server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES) + '\n';
      }

      logMessage('debug', `Successfully fetched content for ${urlString}`, metadata);
      return { content: html, textContent, links };
    } catch (error) {
      logMessage('warn', `Playwright fetch failed for ${urlString}: ${errorMessage(error)}`, metadata);
      return null;
    } finally {
      if (page) {
        try {
          await page.close();
        } catch (e) {
          // Best effort: the page may already be closed along with its context.
        }
      }
    }
  }

  /**
   * Builds the ordered, de-duplicated list of sitemap URLs to probe for `baseUrl`:
   * `<path>/sitemap.xml` for the documentation path and each of its parents (deepest first),
   * then `/sitemap.xml` and `/sitemap_index.xml` at the origin.
   * Returns an empty list when `baseUrl` is not a valid URL.
   */
  private async discoverSitemapUrls(baseUrl: string): Promise<string[]> {
    const candidates: string[] = [];

    try {
      const { origin, pathname } = new URL(baseUrl);

      // Built from the parsed path rather than the raw string, so a trailing slash
      // or query string on baseUrl cannot leak into the candidates.
      const pathParts = pathname.split('/').filter(p => p);
      for (let i = pathParts.length; i > 0; i--) {
        candidates.push(`${origin}/${pathParts.slice(0, i).join('/')}/sitemap.xml`);
      }

      candidates.push(
        `${origin}/sitemap.xml`,
        `${origin}/sitemap_index.xml`
      );
    } catch {
      // Invalid base URL: no candidates.
    }

    return [...new Set(candidates)];
  }

  /**
   * Downloads `sitemapUrl` and returns the body only if it looks like a sitemap:
   * HTTP 200, a string body that starts with an XML/sitemap root, and containing
   * `<loc>` plus `<url>` or `<sitemap>`. Any failure yields `null`.
   */
  private async fetchSitemapContent(sitemapUrl: string, config: DocumentationConfig): Promise<string | null> {
    try {
      const response = await axios.get(sitemapUrl, {
        headers: config.headers,
        timeout: server_defaults.DOCUMENTATION.TIMEOUTS.SITEMAP_FETCH,
        validateStatus: (status) => status === 200
      });

      const content = response.data;
      if (typeof content !== 'string') return null;

      const trimmed = content.trim();
      if (!trimmed.startsWith('<?xml') && !trimmed.startsWith('<urlset') && !trimmed.startsWith('<sitemapindex')) {
        return null;
      }

      if (!content.includes('<loc>') || (!content.includes('<url>') && !content.includes('<sitemap>'))) {
        return null;
      }

      return content;
    } catch {
      return null;
    }
  }

  /**
   * Splits the `<loc>` entries of a sitemap document into page URLs and nested sitemap URLs.
   * Content without `<loc>` tags is treated as whitespace-separated plain URLs.
   *
   * @param content Raw sitemap body.
   * @param baseUrl Unused; kept for signature compatibility.
   */
  private parseSitemapContent(content: string, baseUrl: string): { urls: string[], sitemaps: string[] } {
    const urls: string[] = [];
    const sitemaps: string[] = [];

    try {
      const hasXmlTags = content.includes('<loc>') && content.includes('</loc>');

      if (hasXmlTags) {
        for (const match of content.matchAll(/<loc>([^<]+)<\/loc>/gi)) {
          const loc = match[1].trim();
          if (!loc.startsWith('http')) continue;

          // Classify by the text just before this exact occurrence. Using the match index
          // (instead of searching for the trimmed value) keeps entries that have
          // whitespace or newlines inside <loc>.
          const locIndex = match.index ?? 0;
          const precedingContent = content.substring(Math.max(0, locIndex - 200), locIndex);
          if (/<sitemap[^>]*>/i.test(precedingContent)) {
            sitemaps.push(loc);
          } else {
            urls.push(loc);
          }
        }
      } else {
        for (const potentialUrl of content.split(/\s+/)) {
          const trimmed = potentialUrl.trim();
          if (trimmed.startsWith('http://') || trimmed.startsWith('https://')) {
            try {
              new URL(trimmed);
              urls.push(trimmed);
            } catch {
              // Not a valid URL: skip.
            }
          }
        }
      }
    } catch {
      // Return whatever was collected before the failure.
    }

    return { urls, sitemaps };
  }

  /**
   * Decides whether a nested sitemap should be followed: it must be on the documentation host and
   * either be a root sitemap, share a path segment (at the same position) with the documentation
   * path, or mention docs/api/documentation in its path.
   */
  private isRelevantSitemap(candidate: string, docUrl: URL): boolean {
    try {
      const sitemapUrl = new URL(candidate);
      if (sitemapUrl.hostname !== docUrl.hostname) {
        return false;
      }

      const docPath = docUrl.pathname.replace(/\/$/, '');
      const sitemapPath = sitemapUrl.pathname.replace(/\/$/, '');

      if (sitemapPath === '/sitemap.xml' || sitemapPath === '/sitemap_index.xml') {
        return true;
      }

      if (docPath && docPath !== '/') {
        const docParts = docPath.split('/').filter(p => p);
        const sitemapParts = sitemapPath.split('/').filter(p => p);
        // The last sitemap segment is the file name, so it is excluded from the comparison.
        for (let i = 0; i < Math.min(docParts.length, sitemapParts.length - 1); i++) {
          if (docParts[i] === sitemapParts[i]) {
            return true;
          }
        }
      }

      const relevantKeywords = ['docs', 'api', 'documentation'];
      const sitemapLower = sitemapPath.toLowerCase();
      return relevantKeywords.some(keyword => sitemapLower.includes(keyword));
    } catch {
      return false;
    }
  }

  /**
   * Narrows sitemap URLs to those closest to the documentation path. Tries the documentation path,
   * then each parent path, then '/', and returns the first non-empty same-host match set that
   * reaches MAX_FETCHED_LINKS (or the '/' match set). Falls back to the unfiltered list.
   */
  private selectUrlsNearDocPath(urls: string[], docUrl: URL): string[] {
    const pathParts = docUrl.pathname.split('/').filter(p => p);
    const filterPaths: string[] = [docUrl.pathname];
    for (let i = pathParts.length - 1; i >= 0; i--) {
      const parentPath = '/' + pathParts.slice(0, i).join('/');
      if (parentPath !== '/' && !filterPaths.includes(parentPath)) {
        filterPaths.push(parentPath);
      }
    }
    filterPaths.push('/');

    for (const filterPath of filterPaths) {
      const normalizedFilterPath = filterPath.replace(/\/$/, '');
      const filteredUrls = urls.filter(url => {
        try {
          const urlObj = new URL(url);
          if (urlObj.hostname !== docUrl.hostname) {
            return false;
          }
          if (normalizedFilterPath === '') {
            return true;
          }
          return urlObj.pathname.replace(/\/$/, '').startsWith(normalizedFilterPath);
        } catch {
          return false;
        }
      });

      if (filteredUrls.length > 0 &&
        (filteredUrls.length >= PlaywrightFetchingStrategy.MAX_FETCHED_LINKS || filterPath === '/')) {
        return filteredUrls;
      }
    }

    return urls;
  }

  /**
   * Finds a sitemap for `config.documentationUrl`, walks nested sitemaps breadth-first within the
   * configured depth / per-depth / total limits, and returns the filtered page URLs found.
   * Returns an empty list when the URL is missing or invalid, or when no sitemap is found.
   */
  private async collectSitemapUrls(config: DocumentationConfig, metadata: Metadata): Promise<string[]> {
    if (!config.documentationUrl) return [];

    let docUrl: URL;
    try {
      docUrl = new URL(config.documentationUrl);
    } catch {
      logMessage('warn', `Invalid documentation URL: ${config.documentationUrl}`, metadata);
      return [];
    }

    const sitemapCandidates = await this.discoverSitemapUrls(config.documentationUrl);
    const processedSitemaps = new Set<string>();
    const sitemapQueue: string[] = [];
    // Content downloaded while probing, reused below so the first sitemap is not fetched twice.
    const prefetchedContent = new Map<string, string>();

    for (const candidate of sitemapCandidates) {
      const content = await this.fetchSitemapContent(candidate, config);
      if (content) {
        logMessage('debug', `Found sitemap at: ${candidate}`, metadata);
        prefetchedContent.set(candidate, content);
        sitemapQueue.push(candidate);
        break;
      }
    }

    if (sitemapQueue.length === 0) {
      logMessage('debug', `No sitemap found. Tried: ${sitemapCandidates.slice(0, 5).join(', ')}...`, metadata);
    }

    const MAX_SITEMAP_DEPTH = server_defaults.DOCUMENTATION.MAX_SITEMAP_DEPTH;
    const MAX_SITEMAPS_PER_DEPTH = server_defaults.DOCUMENTATION.MAX_SITEMAPS_PER_DEPTH;
    const MAX_TOTAL_SITEMAPS = server_defaults.DOCUMENTATION.MAX_TOTAL_SITEMAPS;

    let depth = 0;
    const allSitemapUrls: string[] = [];

    while (sitemapQueue.length > 0 && depth < MAX_SITEMAP_DEPTH) {
      if (processedSitemaps.size >= MAX_TOTAL_SITEMAPS) {
        logMessage('debug', `Reached global sitemap limit (${MAX_TOTAL_SITEMAPS}), stopping sitemap discovery`, metadata);
        break;
      }

      // Take the whole queue as this depth level; nested sitemaps found below form the next level.
      const currentBatch = [...sitemapQueue];
      sitemapQueue.length = 0;
      depth++;

      let sitemapsToProcess = currentBatch;
      if (currentBatch.length > MAX_SITEMAPS_PER_DEPTH) {
        const keywords = this.getMergedKeywords(config.keywords);
        sitemapsToProcess = this.rankItems(currentBatch, keywords) as string[];
        sitemapsToProcess = sitemapsToProcess.slice(0, MAX_SITEMAPS_PER_DEPTH);
        logMessage('debug', `Ranked and limited sitemaps at depth ${depth} from ${currentBatch.length} to ${sitemapsToProcess.length}`, metadata);
      }

      const remainingBudget = MAX_TOTAL_SITEMAPS - processedSitemaps.size;
      if (sitemapsToProcess.length > remainingBudget) {
        sitemapsToProcess = sitemapsToProcess.slice(0, remainingBudget);
        logMessage('debug', `Further limited sitemaps to ${sitemapsToProcess.length} based on global budget`, metadata);
      }

      for (const sitemapUrl of sitemapsToProcess) {
        if (processedSitemaps.has(sitemapUrl)) continue;
        processedSitemaps.add(sitemapUrl);

        const content = prefetchedContent.get(sitemapUrl) ?? (await this.fetchSitemapContent(sitemapUrl, config));
        if (!content) continue;

        const { urls, sitemaps } = this.parseSitemapContent(content, sitemapUrl);

        const filteredUrls = filterDocumentationUrls(urls, PlaywrightFetchingStrategy.EXCLUDED_LINK_KEYWORDS);
        if (filteredUrls.length > 0) {
          logMessage('debug', `Found ${urls.length} total URLs in sitemap, ${filteredUrls.length} after filtering. First few: ${filteredUrls.slice(0, 3).join(', ')}`, metadata);
        }
        allSitemapUrls.push(...filteredUrls);

        if (processedSitemaps.size >= MAX_TOTAL_SITEMAPS) {
          continue;
        }

        const relevantSitemaps = sitemaps.filter(s => !processedSitemaps.has(s) && this.isRelevantSitemap(s, docUrl));

        if (relevantSitemaps.length > 0) {
          const roomLeft = MAX_TOTAL_SITEMAPS - processedSitemaps.size;
          if (roomLeft > 0) {
            const sitemapsToAdd = relevantSitemaps.slice(0, roomLeft);
            logMessage('debug', `Adding ${sitemapsToAdd.length} relevant sitemaps to queue (filtered from ${sitemaps.length} total, limited by budget)`, metadata);
            sitemapQueue.push(...sitemapsToAdd);
          }
        }
      }
    }

    const uniqueSitemapUrls = [...new Set(allSitemapUrls)];
    return this.selectUrlsNearDocPath(uniqueSitemapUrls, docUrl);
  }

  /**
   * Fetches `urls` in parallel batches, drops near-duplicate pages and concatenates the remaining
   * HTML (separated by blank lines) up to the total content size budget.
   *
   * @returns The combined HTML, or an empty string when nothing could be fetched.
   */
  private async fetchPagesInBatches(
    urls: string[],
    config: DocumentationConfig,
    metadata: Metadata
  ): Promise<string> {
    // At least one page per batch, otherwise the loop below could not make progress.
    const BATCH_SIZE = Math.max(1, PlaywrightFetchingStrategy.PARALLEL_FETCH_LIMIT);
    const MAX_TOTAL_SIZE = server_defaults.DOCUMENTATION.MAX_TOTAL_CONTENT_SIZE;

    const pageResults: Array<{ url: string; html: string; textContent: string }> = [];
    let fetchedCount = 0;

    // Fetch all pages in batches
    let cursor = 0;
    while (cursor < urls.length && fetchedCount < PlaywrightFetchingStrategy.MAX_FETCHED_LINKS) {
      const remainingSlots = PlaywrightFetchingStrategy.MAX_FETCHED_LINKS - fetchedCount;
      const batch = urls.slice(cursor, cursor + Math.min(BATCH_SIZE, remainingSlots));
      if (batch.length === 0) break;
      // Advance by what was actually taken, so a batch shortened by remainingSlots does not skip URLs.
      cursor += batch.length;

      const batchPromises = batch.map(async (url) => {
        const result = await this.fetchPageContentWithPlaywright(url, config, metadata);
        if (!result?.content) return null;
        return { url, html: result.content, textContent: result.textContent };
      });

      const results = await Promise.all(batchPromises);

      for (const result of results) {
        if (!result || !result.html) continue;
        pageResults.push(result);
        fetchedCount++;
      }
    }

    // Deduplicate based on similarity
    const deduplicatedPages: Array<{ url: string; html: string; textContent: string }> = [];
    if (pageResults.length > 0) {
      deduplicatedPages.push(pageResults[0]);
    }

    for (let i = 1; i < pageResults.length; i++) {
      const currentPage = pageResults[i];

      // Always include pages with short text content
      if (currentPage.textContent.length <= 500) {
        deduplicatedPages.push(currentPage);
        continue;
      }

      // Check similarity against all pages already in the deduplicated list
      let isSimilar = false;
      for (const existingPage of deduplicatedPages) {
        const dice = diceCoefficient(currentPage.textContent, existingPage.textContent);
        const jaccard = jaccardSimilarity(currentPage.textContent, existingPage.textContent);
        const avgSimilarity = (dice + jaccard) / 2;

        if (avgSimilarity > server_defaults.DOCUMENTATION.SIMILARITY_THRESHOLD_PERCENTAGE / 100.0) {
          isSimilar = true;
          logMessage('debug', `Skipping similar page '${currentPage.url}' because it is ${(avgSimilarity * 100).toFixed(1)}% similar to '${existingPage.url}'`, metadata);
          break;
        }
      }

      if (!isSimilar) {
        deduplicatedPages.push(currentPage);
      }
    }

    // Build final combined content from deduplicated pages
    let combinedContent = "";
    let totalSize = 0;

    for (const page of deduplicatedPages) {
      const contentSize = Buffer.byteLength(page.html, 'utf8');

      if (totalSize + contentSize > MAX_TOTAL_SIZE) {
        logMessage('debug', `Reached size budget (${Math.round(totalSize / 1024 / 1024)}MB), skipping remaining pages`, metadata);
        break;
      }

      combinedContent += combinedContent ? `\n\n${page.html}` : page.html;
      totalSize += contentSize;
    }

    return combinedContent;
  }

  /**
   * Entry point: tries sitemap-based fetching first (bounded by a total timeout) and falls back to
   * link crawling when that yields nothing or fails. The browser context is always closed afterwards.
   *
   * @returns Combined HTML of the fetched pages, or `null` when no documentation URL is configured.
   */
  async tryFetch(config: DocumentationConfig, metadata: Metadata): Promise<string | null> {
    if (!config?.documentationUrl) return null;

    try {
      try {
        const sitemapUrlsPromise = this.collectSitemapUrls(config, metadata);

        let timeoutHandle: NodeJS.Timeout | undefined;
        const timeoutPromise = new Promise<string[]>((resolve) => {
          timeoutHandle = setTimeout(() => {
            logMessage('warn', 'Sitemap URL collection timed out, falling back to iterative crawling', metadata);
            resolve([]);
          }, server_defaults.DOCUMENTATION.TIMEOUTS.SITEMAP_PROCESSING_TOTAL);
        });

        let sitemapUrls: string[];
        try {
          sitemapUrls = await Promise.race([sitemapUrlsPromise, timeoutPromise]);
        } finally {
          // Also runs when collection rejects, so the timer cannot fire a stale warning later.
          clearTimeout(timeoutHandle);
        }

        if (sitemapUrls.length > 0) {
          const keywords = this.getMergedKeywords(config.keywords);
          const rankedUrls = this.rankItems(sitemapUrls, keywords) as string[];
          const topUrls = rankedUrls.slice(0, PlaywrightFetchingStrategy.MAX_FETCHED_LINKS);

          const content = await this.fetchPagesInBatches(topUrls, config, metadata);
          if (content) {
            return content;
          }
        }
      } catch (error) {
        logMessage('warn', `Sitemap processing failed: ${errorMessage(error)}, falling back to legacy crawling`, metadata);
      }

      return await this.legacyTryFetch(config, metadata);
    } finally {
      await this.cleanupContext();
    }
  }

  /** Built-in ranking keywords with their weights (5 = most important, 1 = least). */
  public getDefaultKeywords(): Array<{ keyword: string; weight: number }> {
    return [
      // High priority: Getting started & overview content (weight: 5)
      { keyword: "getting started", weight: 5 },
      { keyword: "quickstart", weight: 5 },
      { keyword: "overview", weight: 5 },
      { keyword: "introduction", weight: 5 },
      { keyword: "api", weight: 5 },

      // Core concepts: Authentication & API fundamentals (weight: 4)
      { keyword: "authentication", weight: 4 },
      { keyword: "authorization", weight: 4 },
      { keyword: "rest", weight: 4 },
      { keyword: "endpoints", weight: 4 },

      // Important concepts (weight: 3)
      { keyword: "guides", weight: 3 },
      { keyword: "tutorial", weight: 3 },
      { keyword: "reference", weight: 3 },
      { keyword: "api-reference", weight: 3 },
      { keyword: "open api", weight: 3 },
      { keyword: "swagger", weight: 3 },
      { keyword: "bearer", weight: 3 },
      { keyword: "token", weight: 3 },
      { keyword: "pagination", weight: 3 },
      { keyword: "schema", weight: 3 },

      // Moderate importance: Data concepts (weight: 2)
      { keyword: "objects", weight: 2 },
      { keyword: "data-objects", weight: 2 },
      { keyword: "properties", weight: 2 },
      { keyword: "values", weight: 2 },
      { keyword: "fields", weight: 2 },
      { keyword: "attributes", weight: 2 },
      { keyword: "parameters", weight: 2 },
      { keyword: "slugs", weight: 2 },
      { keyword: "lists", weight: 2 },
      { keyword: "query", weight: 2 },
      { keyword: "methods", weight: 2 },
      { keyword: "response", weight: 2 },
      { keyword: "filtering", weight: 2 },
      { keyword: "sorting", weight: 2 },
      { keyword: "searching", weight: 2 },
      { keyword: "filter", weight: 2 },
      { keyword: "sort", weight: 2 },
      { keyword: "search", weight: 2 },

      // Lower priority: Specific HTTP methods (weight: 1)
      { keyword: "get", weight: 1 },
      { keyword: "post", weight: 1 },
      { keyword: "put", weight: 1 },
      { keyword: "delete", weight: 1 },
      { keyword: "patch", weight: 1 },
    ];
  }

  /**
   * Combines caller-supplied keywords (each given weight 4) with the defaults.
   * Keywords are compared case-insensitively; for duplicates the caller's weight wins.
   * Returned keywords are lowercased when `inputKeywords` is non-empty.
   */
  public getMergedKeywords(inputKeywords?: string[] | null): Array<{ keyword: string; weight: number }> {
    const defaultKeywords = this.getDefaultKeywords();

    if (!inputKeywords || inputKeywords.length === 0) {
      return defaultKeywords;
    }

    // User-provided keywords get high weight (4)
    const userKeywords = inputKeywords.map(keyword => ({ keyword, weight: 4 }));

    // Merge, preferring user-provided weights for duplicates
    const keywordMap = new Map<string, number>();
    for (const { keyword, weight } of [...userKeywords, ...defaultKeywords]) {
      const key = keyword.toLowerCase();
      if (!keywordMap.has(key)) {
        keywordMap.set(key, weight);
      }
    }

    return Array.from(keywordMap.entries()).map(([keyword, weight]) => ({ keyword, weight }));
  }

  /**
   * Orders URLs or links by relevance to `keywords`.
   *
   * Items already in `fetchedLinks` are removed. Items whose path contains an excluded keyword are
   * removed too, unless that would leave nothing. Each keyword contributes `3 * weight` per
   * whole-word match in "<path> <link text>", or `1 * weight` for a substring-only match; the sum
   * is divided by log10 of the path length. Items with any match sort before items with none.
   *
   * @param items Absolute URLs, or `{ linkText, href }` pairs.
   * @param keywords Plain keywords (weight 1 each) or weighted keywords; matched literally.
   * @param fetchedLinks Optional set of hrefs to leave out.
   * @returns The surviving original items, most relevant first.
   */
  public rankItems(items: string[] | { linkText: string, href: string }[], keywords: string[] | Array<{ keyword: string; weight: number }>, fetchedLinks?: Set<string>): any[] {
    const normalizedItems = items.map((item, index) => {
      const href = typeof item === 'string' ? item : item.href;
      const text = typeof item === 'string' ? '' : item.linkText;
      // An unparseable href is ranked by its raw text instead of aborting the whole ranking.
      const url = PlaywrightFetchingStrategy.pathnameOf(href);
      const searchableContent = `${url} ${text}`.toLowerCase();
      return { url, href, original: item, searchableContent, index };
    });

    let itemsToRank = fetchedLinks
      ? normalizedItems.filter(item => !fetchedLinks.has(item.href))
      : normalizedItems;

    const itemsToRankFiltered = itemsToRank.filter(item => !PlaywrightFetchingStrategy.containsExcludedKeyword(item.url));
    if (itemsToRankFiltered.length > 0) {
      itemsToRank = itemsToRankFiltered;
    }

    if (!keywords || keywords.length === 0) {
      return itemsToRank.map(item => item.original);
    }

    // Normalize keywords to weighted format
    const weightedKeywords: Array<{ keyword: string; weight: number }> =
      typeof keywords[0] === 'string'
        ? (keywords as string[]).map(k => ({ keyword: k, weight: 1 }))
        : keywords as Array<{ keyword: string; weight: number }>;

    // Compile one literal, word-bounded pattern per keyword up front. Escaping keeps
    // keywords such as "c++" or "(beta)" from throwing or matching as regex syntax.
    const matchers = weightedKeywords
      .map(({ keyword, weight }) => ({ needle: keyword.toLowerCase(), weight }))
      .filter(({ needle }) => needle.length > 0)
      .map(({ needle, weight }) => ({
        needle,
        weight,
        pattern: new RegExp(`\\b${escapeRegExp(needle)}\\b`, 'g')
      }));

    const scored = itemsToRank.map(item => {
      let matchScore = 0;
      const content = item.searchableContent;

      for (const { needle, weight, pattern } of matchers) {
        const exactMatches = (content.match(pattern) || []).length;

        // Weighted scoring: exact matches get 3x base, partial matches get 1x base
        matchScore += exactMatches * 3 * weight;
        if (exactMatches === 0 && content.includes(needle)) {
          matchScore += 1 * weight;
        }
      }

      // Length penalty on a log scale; the floor of 10 keeps the divisor at 1 or above.
      const MIN_LENGTH = 10;
      const urlLength = Math.max(item.url.length, MIN_LENGTH);
      const lengthPenalty = Math.log10(urlLength);
      const score = matchScore / lengthPenalty;

      return { item: item.original, score, hasMatch: matchScore > 0 };
    });

    return scored
      .sort((a, b) => {
        if (a.hasMatch !== b.hasMatch) {
          return a.hasMatch ? -1 : 1;
        }
        return b.score - a.score;
      })
      .map(s => s.item);
  }

  /**
   * Crawls from `startUrl`, repeatedly fetching the best-ranked queued link and queueing the valid
   * documentation links found on it, until MAX_FETCHED_LINKS pages were visited or the queue is empty.
   *
   * @returns The fetched pages' HTML joined by blank lines (possibly empty).
   */
  private async crawlWithLinks(startUrl: string, config: DocumentationConfig, metadata: Metadata): Promise<string> {
    const visitedUrls = new Set<string>();
    const linkQueue: { linkText: string, href: string }[] = [{ linkText: "documentation", href: startUrl }];
    const searchKeywords = this.getMergedKeywords(config.keywords);
    let aggregatedContent = "";

    while (visitedUrls.size < PlaywrightFetchingStrategy.MAX_FETCHED_LINKS && linkQueue.length > 0) {
      const prioritizedLinks = this.rankItems(linkQueue, searchKeywords, visitedUrls) as { linkText: string, href: string }[];
      if (prioritizedLinks.length === 0) break;

      const nextLink = prioritizedLinks[0];
      const queueIndex = linkQueue.findIndex(l => l.href === nextLink.href);
      // splice(-1, 1) would remove the last entry, so only splice when the link was found.
      if (queueIndex !== -1) {
        linkQueue.splice(queueIndex, 1);
      }

      try {
        const pageData = await this.fetchPageContentWithPlaywright(nextLink.href, config, metadata);
        visitedUrls.add(nextLink.href);

        if (pageData?.content) {
          aggregatedContent += aggregatedContent ? `\n\n${pageData.content}` : pageData.content;

          if (pageData.links) {
            for (const [linkText, href] of Object.entries(pageData.links)) {
              if (this.isValidDocLink(href, linkText, startUrl) &&
                !visitedUrls.has(href) &&
                !linkQueue.some(l => l.href === href)) {
                linkQueue.push({ linkText, href });
              }
            }
          }
        }
      } catch (error) {
        // Mark as visited even on failure so the same link is not retried forever.
        visitedUrls.add(nextLink.href);
        logMessage('warn', `Failed to fetch ${nextLink.href}: ${errorMessage(error)}`, metadata);
      }
    }

    return aggregatedContent;
  }

  /** Link-crawling fallback: crawls from `config.documentationUrl`; `null` when it is not set. */
  async legacyTryFetch(config: DocumentationConfig, metadata: Metadata): Promise<string | null> {
    if (!config?.documentationUrl) return null;
    return this.crawlWithLinks(config.documentationUrl, config, metadata);
  }

  /**
   * Whether a discovered link should be queued for crawling: it needs link text, a parseable href
   * on the same host as `baseUrl`, no excluded keyword in its path, and - when the base path has
   * two or more segments - a path under the base path's parent.
   */
  private isValidDocLink(href: string, linkText: string, baseUrl: string): boolean {
    if (!linkText || !href) return false;

    try {
      const base = new URL(baseUrl);
      const link = new URL(href);

      if (PlaywrightFetchingStrategy.containsExcludedKeyword(link.pathname)) {
        return false;
      }

      if (link.hostname !== base.hostname) return false;

      const baseParts = base.pathname.split('/').filter(p => p);
      if (baseParts.length >= 2) {
        const requiredPath = '/' + baseParts.slice(0, -1).join('/');
        if (!link.pathname.startsWith(requiredPath)) return false;
      }

      return true;
    } catch {
      return false;
    }
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/superglue-ai/superglue'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

fetching-playwright.ts•30.7 KiB