Skip to main content
Glama
fetching-playwright.ts (31.5 kB)
/**
 * Playwright Web Crawling Strategy
 *
 * Uses headless browser to crawl documentation sites.
 * - First tries sitemap-based discovery for comprehensive coverage
 * - Falls back to iterative link crawling if sitemap fails
 * - Filters out non-documentation pages (login, pricing, etc.)
 */
import playwright from '@playwright/test';
import { Metadata } from "@superglue/shared";
import axios from "axios";
import { server_defaults } from '../../default.js';
import { logMessage } from "../../utils/logs.js";
import { filterDocumentationUrls } from '../documentation-utils.js';
import { DocumentationConfig, DocumentationFetchingStrategy } from '../types.js';

/** A crawl candidate: either a bare URL or an anchor (text + target). */
type RankableItem = string | { linkText: string; href: string };

/** Keyword with the weight it contributes to an item's ranking score. */
type WeightedKeyword = { keyword: string; weight: number };

/** Splits text into its set of lowercase, whitespace-delimited tokens. */
function toWordSet(text: string): Set<string> {
  return new Set(text.toLowerCase().split(/\s+/));
}

/** Counts the tokens that occur in both sets. */
function countSharedWords(a: Set<string>, b: Set<string>): number {
  let shared = 0;
  for (const word of a) {
    if (b.has(word)) shared++;
  }
  return shared;
}

// Similarity functions for deduplication

/**
 * Dice coefficient over the word sets of two strings:
 * `2 * |A ∩ B| / (|A| + |B|)`.
 *
 * `split` always yields at least one token (possibly the empty string), so the
 * denominator is never zero.
 */
function diceCoefficient(str1: string, str2: string): number {
  const words1 = toWordSet(str1);
  const words2 = toWordSet(str2);
  return (2 * countSharedWords(words1, words2)) / (words1.size + words2.size);
}

/**
 * Jaccard similarity over the word sets of two strings: `|A ∩ B| / |A ∪ B|`.
 */
function jaccardSimilarity(str1: string, str2: string): number {
  const words1 = toWordSet(str1);
  const words2 = toWordSet(str2);
  const shared = countSharedWords(words1, words2);
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  return shared / (words1.size + words2.size - shared);
}

/** Escapes every regex metacharacter so `text` can be embedded in a RegExp literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/** Extracts a loggable message from a value caught in a `catch` clause. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Decodes the five predefined XML entities. `&amp;` is decoded last so that
 * `&amp;lt;` becomes the literal text `&lt;` rather than `<`.
 */
function decodeXmlEntities(value: string): string {
  return value
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&amp;/g, '&');
}

/**
 * Documentation fetching strategy backed by a headless Chromium browser.
 *
 * One browser process is shared by all instances (static); each instance owns
 * at most one browser context, which `tryFetch` closes when it finishes.
 */
export class PlaywrightFetchingStrategy implements DocumentationFetchingStrategy {
  private static readonly MAX_FETCHED_LINKS = server_defaults.DOCUMENTATION.MAX_FETCHED_LINKS;
  private static readonly PARALLEL_FETCH_LIMIT = server_defaults.DOCUMENTATION.MAX_PAGES_TO_FETCH_IN_PARALLEL;
  private static browserInstance: playwright.Browser | null = null;
  // In-flight browser launch, shared so concurrent callers do not each start a browser.
  private static browserLaunch: Promise<playwright.Browser> | null = null;
  private browserContext: playwright.BrowserContext | null = null;
  // In-flight context creation, shared so parallel page fetches reuse one context.
  private contextCreation: Promise<playwright.BrowserContext> | null = null;

  /**
   * Substrings that mark a URL path as "not documentation".
   *
   * NOTE(review): matching is by substring, so e.g. 'cli' also excludes paths
   * containing "client" — confirm this is intended.
   */
  public static readonly EXCLUDED_LINK_KEYWORDS = [
    'signup', 'login', 'pricing', 'contact', 'support', 'cookie',
    'privacy', 'terms', 'legal', 'policy', 'status', 'help',
    'blog', 'careers', 'about', 'press', 'news', 'events',
    'partners', 'changelog', 'release-notes', 'updates', 'upgrade',
    'register', 'cli', 'signin', 'sign-in', 'sign-up', 'trial',
    'demo', 'sales', 'widget', 'webhooks',
    '/de/', '/it/', '/fr/', '/nl/', '/es/', '/pt/', '/pl/',
    '/ru/', '/ja/', '/zh/', '/ko/', '/zh-CN/', '/zh-TW/', '/id/'
  ];

  /**
   * Returns true when `pathname` contains any excluded keyword.
   * Both sides are lowercased so mixed-case keywords such as '/zh-CN/' match.
   */
  private static isExcludedPath(pathname: string): boolean {
    const lowered = pathname.toLowerCase();
    return PlaywrightFetchingStrategy.EXCLUDED_LINK_KEYWORDS.some(
      keyword => lowered.includes(keyword.toLowerCase())
    );
  }

  /**
   * Returns the shared browser, launching it on first use or when the cached
   * instance is no longer connected. Concurrent callers share a single launch.
   */
  private static async getBrowser(): Promise<playwright.Browser> {
    const existing = PlaywrightFetchingStrategy.browserInstance;
    if (existing && existing.isConnected()) {
      return existing;
    }

    if (!PlaywrightFetchingStrategy.browserLaunch) {
      const launch = playwright.chromium.launch({
        headless: true,
        args: [
          '--disable-blink-features=AutomationControlled',
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-web-security',
          '--disable-features=IsolateOrigins,site-per-process',
        ]
      }).then((browser) => {
        PlaywrightFetchingStrategy.browserInstance = browser;
        return browser;
      });
      PlaywrightFetchingStrategy.browserLaunch = launch;

      // Forget the in-flight launch once it settles (either way) so a failed
      // launch can be retried by the next caller.
      const forget = () => {
        if (PlaywrightFetchingStrategy.browserLaunch === launch) {
          PlaywrightFetchingStrategy.browserLaunch = null;
        }
      };
      launch.then(forget, forget);
    }
    return PlaywrightFetchingStrategy.browserLaunch;
  }

  /**
   * Closes the shared browser if one is cached. When a graceful close fails,
   * falls back to killing the underlying process.
   */
  static async closeBrowser(): Promise<void> {
    if (PlaywrightFetchingStrategy.browserInstance) {
      const closedInstance = PlaywrightFetchingStrategy.browserInstance;
      // Clear the reference first so later callers launch a fresh browser.
      PlaywrightFetchingStrategy.browserInstance = null;
      try {
        await closedInstance.close();
      } catch (error) {
        console.warn('Failed to close browser gracefully:', describeError(error));
        try {
          // `_process` is not part of Playwright's public typings, hence the cast.
          const browserProcess = (closedInstance as any)._process;
          if (browserProcess && !browserProcess.killed) {
            browserProcess.kill('SIGKILL');
          }
        } catch (killError) {
          console.warn('Failed to force kill browser:', describeError(killError));
        }
      }
    }
  }

  /**
   * Returns this instance's browser context, creating it on first use.
   * The context is created with `config.headers` of the first caller;
   * parallel callers share one creation instead of each opening a context.
   */
  private async getOrCreateContext(config: DocumentationConfig): Promise<playwright.BrowserContext> {
    if (this.browserContext) {
      return this.browserContext;
    }

    if (!this.contextCreation) {
      const creation = PlaywrightFetchingStrategy.getBrowser()
        .then(browser => browser.newContext({
          userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          viewport: { width: 1920, height: 1080 },
          locale: 'en-US',
          timezoneId: 'America/New_York',
          extraHTTPHeaders: config.headers || {}
        }))
        .then((context) => {
          this.browserContext = context;
          return context;
        });
      this.contextCreation = creation;

      const forget = () => {
        if (this.contextCreation === creation) {
          this.contextCreation = null;
        }
      };
      creation.then(forget, forget);
    }
    return this.contextCreation;
  }

  /** Closes this instance's browser context, if any. Close errors are ignored. */
  private async cleanupContext(): Promise<void> {
    if (this.browserContext) {
      try {
        await this.browserContext.close();
      } catch (e) {
        // Best effort: the context is discarded below either way.
      }
      this.browserContext = null;
    }
  }

  /**
   * Loads one page in the headless browser and extracts its content.
   *
   * @param urlString Page to load; anything not starting with "http" yields null.
   * @param config    Supplies optional `queryParams` appended to the URL and the
   *                  headers used when the browser context is created.
   * @param metadata  Passed through to `logMessage`.
   * @returns `content` (HTML of the main content element, or of the whole body
   *          when none is found, after boilerplate removal), `textContent`
   *          (`innerText` of the cleaned body) and `links` (anchor key → URL
   *          without fragment); `null` on any failure.
   */
  private async fetchPageContentWithPlaywright(urlString: string, config: DocumentationConfig, metadata: Metadata): Promise<{ content: string; textContent: string; links: Record<string, string>; } | null> {
    if (!urlString?.startsWith("http")) {
      return null;
    }

    let page: playwright.Page | null = null;
    try {
      const browserContext = await this.getOrCreateContext(config);

      const url = new URL(urlString);
      if (config.queryParams) {
        for (const [key, value] of Object.entries(config.queryParams)) {
          url.searchParams.append(key, value);
        }
      }

      page = await browserContext.newPage();

      // Override navigator properties before any page script runs.
      await page.addInitScript(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        Object.defineProperty(navigator, 'platform', { get: () => 'MacIntel' });
        Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
        Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
        Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
      });

      await page.goto(url.toString(), { timeout: server_defaults.DOCUMENTATION.TIMEOUTS.PLAYWRIGHT });
      await page.waitForLoadState('domcontentloaded', { timeout: server_defaults.DOCUMENTATION.TIMEOUTS.PLAYWRIGHT });
      // Fixed one-second pause after the load-state wait, before reading the DOM.
      await page.waitForTimeout(1000);

      // Runs inside the page: it must be self-contained and cannot reference
      // anything from this module.
      const result = await page.evaluate(() => {
        const selectorsToRemove = [
          'img, video, svg, canvas, iframe, picture, source, audio, embed, object',
          '[role="banner"], [role="dialog"], [role="contentinfo"], [role="complementary"]',
          '.cookie-banner, .cookie-consent, .cookies, .gdpr, .privacy-notice',
          'nav, header, footer, aside, .sidebar, .menu, .navbar, .toolbar',
          '.social, .share, .chat, .feedback, .comments, .disqus',
          '.intercom, .drift, .zendesk, .freshchat, .tawk',
          '.ads, .advertisement, .banner, .promo, .sponsored',
          'script, style, noscript, link[rel="stylesheet"]',
          '[data-ga], [data-gtm], [data-analytics], [data-track]',
          '.breadcrumb, .pagination, .pager',
          '.related, .recommended, .also-see',
          'form, input, button, select, textarea'
        ];

        selectorsToRemove.forEach(selector =>
          document.querySelectorAll(selector).forEach(el => el.remove())
        );

        // Links are collected after cleanup, so anchors inside removed
        // elements (nav, header, footer, ...) are not included.
        const links: Record<string, string> = {};
        document.querySelectorAll('a').forEach(link => {
          try {
            const anchor = link as HTMLAnchorElement;
            const url = new URL(anchor.href);
            const key = `${anchor.textContent} ${url.pathname}`.toLowerCase().replace(/[^a-z0-9]/g, ' ').trim();
            links[key] = anchor.href.split('#')[0].trim();
          } catch (e) {
            // Anchor without a parsable href: skip it.
          }
        });

        const mainContent = document.querySelector('article, main, .docs-content, .markdown, .md-content, .api-content, .docContent, .content, .doc-body');
        const html = mainContent
          ? `<html><body>${mainContent.outerHTML}</body></html>`
          : `<html><body>${document.body?.innerHTML || ''}</body></html>`;
        const textContent = document.body?.innerText || '';

        return { html, textContent, links };
      });

      if (!result || !result.html) {
        logMessage('warn', `Failed to extract content from ${urlString}`, metadata);
        return null;
      }

      let { html, textContent, links } = result;

      // NOTE(review): compares the string length (UTF-16 code units) against a
      // limit named in bytes — confirm which unit is intended.
      if (html.length > server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES) {
        logMessage('warn', `Page ${urlString} exceeds size limit after cleanup (${Math.round(html.length / 1024 / 1024)}MB > ${Math.round(server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES / 1024 / 1024)}MB), truncating`, metadata);
        html = html.substring(0, server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES) + '\n<!-- Content truncated due to size limit -->';
      }

      logMessage('debug', `Successfully fetched content for ${urlString}`, metadata);
      return { content: html, textContent, links };
    } catch (error) {
      logMessage('warn', `Playwright fetch failed for ${urlString}: ${describeError(error)}`, metadata);
      return null;
    } finally {
      if (page) {
        try {
          await page.close();
        } catch (e) {
          // Best effort: nothing useful to do if the page is already gone.
        }
      }
    }
  }

  /**
   * Builds the ordered, de-duplicated list of sitemap locations to probe for
   * `baseUrl`: `<path>/sitemap.xml` for the URL's path and each parent path
   * (deepest first), then `/sitemap.xml` and `/sitemap_index.xml` at the origin.
   * Candidates are derived from the parsed origin and path segments, so a
   * trailing slash or query string on `baseUrl` cannot produce malformed URLs.
   * Returns an empty list when `baseUrl` cannot be parsed.
   */
  private async discoverSitemapUrls(baseUrl: string): Promise<string[]> {
    const candidates: string[] = [];

    try {
      const { origin, pathname } = new URL(baseUrl);
      const pathParts = pathname.split('/').filter(p => p);

      for (let i = pathParts.length; i > 0; i--) {
        const parentPath = '/' + pathParts.slice(0, i).join('/');
        candidates.push(`${origin}${parentPath}/sitemap.xml`);
      }

      candidates.push(
        `${origin}/sitemap.xml`,
        `${origin}/sitemap_index.xml`
      );
    } catch {
      // Unparsable base URL: there is nothing to probe.
    }

    return [...new Set(candidates)];
  }

  /**
   * Downloads `sitemapUrl` and returns its body when it looks like an XML
   * sitemap: HTTP 200, a string body starting with an XML/sitemap root, and
   * containing `<loc>` plus `<url>` or `<sitemap>`. Returns null otherwise,
   * including on request errors.
   */
  private async fetchSitemapContent(sitemapUrl: string, config: DocumentationConfig): Promise<string | null> {
    try {
      const response = await axios.get(sitemapUrl, {
        headers: config.headers,
        timeout: server_defaults.DOCUMENTATION.TIMEOUTS.SITEMAP_FETCH,
        validateStatus: (status) => status === 200
      });

      const content = response.data;
      if (typeof content !== 'string') return null;

      const trimmed = content.trim();
      const looksLikeXml =
        trimmed.startsWith('<?xml') ||
        trimmed.startsWith('<urlset') ||
        trimmed.startsWith('<sitemapindex');
      if (!looksLikeXml) {
        return null;
      }

      const hasEntries = content.includes('<url>') || content.includes('<sitemap>');
      if (!content.includes('<loc>') || !hasEntries) {
        return null;
      }

      return content;
    } catch {
      return null;
    }
  }

  /**
   * Extracts page URLs and nested sitemap URLs from sitemap content.
   *
   * For XML content, every `<loc>` value starting with "http" is classified by
   * its own position: it is a nested sitemap when a `<sitemap...>` tag occurs
   * within the 200 characters preceding it, otherwise a page URL. Values are
   * trimmed and XML entities (e.g. `&amp;`) are decoded. Content without
   * `<loc>` tags is treated as a whitespace-separated list of URLs.
   *
   * @param content Raw sitemap body.
   * @param baseUrl Unused; kept for signature compatibility.
   */
  private parseSitemapContent(content: string, baseUrl: string): { urls: string[], sitemaps: string[] } {
    const urls: string[] = [];
    const sitemaps: string[] = [];

    try {
      const hasXmlTags = content.includes('<loc>') && content.includes('</loc>');

      if (hasXmlTags) {
        for (const match of content.matchAll(/<loc>([^<]+)<\/loc>/gi)) {
          const loc = decodeXmlEntities(match[1].trim());
          if (!loc.startsWith('http')) continue;

          // Use the match's own offset rather than searching for the value
          // again, so padded and repeated entries are classified correctly.
          const locIndex = match.index ?? 0;
          const precedingContent = content.substring(Math.max(0, locIndex - 200), locIndex);

          if (/<sitemap[^>]*>/i.test(precedingContent)) {
            sitemaps.push(loc);
          } else {
            urls.push(loc);
          }
        }
      } else {
        for (const potentialUrl of content.split(/\s+/)) {
          const trimmed = potentialUrl.trim();
          if (trimmed.startsWith('http://') || trimmed.startsWith('https://')) {
            try {
              new URL(trimmed);
              urls.push(trimmed);
            } catch {
              // Not a valid URL: ignore this token.
            }
          }
        }
      }
    } catch {
      // Return whatever was collected before the failure.
    }

    return { urls, sitemaps };
  }

  /**
   * Decides whether a nested sitemap is worth following for `docUrl`: it must
   * be on the same hostname and either be a root sitemap, share a path segment
   * (at the same position) with the documentation path, or contain one of
   * 'docs', 'api', 'documentation' in its path.
   */
  private isRelevantSitemap(candidate: string, docUrl: URL): boolean {
    try {
      const sitemapUrl = new URL(candidate);
      if (sitemapUrl.hostname !== docUrl.hostname) {
        return false;
      }

      const docPath = docUrl.pathname.replace(/\/$/, '');
      const sitemapPath = sitemapUrl.pathname.replace(/\/$/, '');

      if (sitemapPath === '/sitemap.xml' || sitemapPath === '/sitemap_index.xml') {
        return true;
      }

      if (docPath && docPath !== '/') {
        const docParts = docPath.split('/').filter(p => p);
        const sitemapParts = sitemapPath.split('/').filter(p => p);

        // The last sitemap segment is the file name, so it is not compared.
        for (let i = 0; i < Math.min(docParts.length, sitemapParts.length - 1); i++) {
          if (docParts[i] === sitemapParts[i]) {
            return true;
          }
        }
      }

      const relevantKeywords = ['docs', 'api', 'documentation'];
      const sitemapLower = sitemapPath.toLowerCase();
      return relevantKeywords.some(keyword => sitemapLower.includes(keyword));
    } catch {
      return false;
    }
  }

  /**
   * Discovers sitemaps for `config.documentationUrl`, walks nested sitemaps
   * breadth-first within the configured depth / per-depth / total limits, and
   * returns the collected page URLs narrowed to the most specific path prefix
   * of the documentation URL that yields enough results.
   *
   * @returns De-duplicated page URLs; empty when the documentation URL is
   *          missing or invalid, or when no sitemap yields URLs.
   */
  private async collectSitemapUrls(config: DocumentationConfig, metadata: Metadata): Promise<string[]> {
    if (!config.documentationUrl) return [];

    const sitemapCandidates = await this.discoverSitemapUrls(config.documentationUrl);
    const processedSitemaps = new Set<string>();
    const sitemapQueue: string[] = [];
    // Body of the sitemap found during discovery, reused so it is not downloaded twice.
    const prefetchedContent = new Map<string, string>();

    let docUrl: URL;
    try {
      docUrl = new URL(config.documentationUrl);
    } catch {
      logMessage('warn', `Invalid documentation URL: ${config.documentationUrl}`, metadata);
      return [];
    }

    // Only the first candidate that serves a valid sitemap seeds the queue.
    for (const candidate of sitemapCandidates) {
      const content = await this.fetchSitemapContent(candidate, config);
      if (content) {
        logMessage('debug', `Found sitemap at: ${candidate}`, metadata);
        prefetchedContent.set(candidate, content);
        sitemapQueue.push(candidate);
        break;
      }
    }

    if (sitemapQueue.length === 0) {
      logMessage('debug', `No sitemap found. Tried: ${sitemapCandidates.slice(0, 5).join(', ')}...`, metadata);
    }

    const MAX_SITEMAP_DEPTH = server_defaults.DOCUMENTATION.MAX_SITEMAP_DEPTH;
    const MAX_SITEMAPS_PER_DEPTH = server_defaults.DOCUMENTATION.MAX_SITEMAPS_PER_DEPTH;
    const MAX_TOTAL_SITEMAPS = server_defaults.DOCUMENTATION.MAX_TOTAL_SITEMAPS;

    let depth = 0;
    const allSitemapUrls: string[] = [];

    while (sitemapQueue.length > 0 && depth < MAX_SITEMAP_DEPTH) {
      if (processedSitemaps.size >= MAX_TOTAL_SITEMAPS) {
        logMessage('debug', `Reached global sitemap limit (${MAX_TOTAL_SITEMAPS}), stopping sitemap discovery`, metadata);
        break;
      }

      // Take the whole queue as this depth's batch; sitemaps discovered while
      // processing it are queued for the next depth.
      const currentBatch = [...sitemapQueue];
      sitemapQueue.length = 0;
      depth++;

      let sitemapsToProcess = currentBatch;
      if (currentBatch.length > MAX_SITEMAPS_PER_DEPTH) {
        const keywords = this.getMergedKeywords(config.keywords);
        sitemapsToProcess = this.rankItems(currentBatch, keywords) as string[];
        sitemapsToProcess = sitemapsToProcess.slice(0, MAX_SITEMAPS_PER_DEPTH);
        logMessage('debug', `Ranked and limited sitemaps at depth ${depth} from ${currentBatch.length} to ${sitemapsToProcess.length}`, metadata);
      }

      const remainingBudget = MAX_TOTAL_SITEMAPS - processedSitemaps.size;
      if (sitemapsToProcess.length > remainingBudget) {
        sitemapsToProcess = sitemapsToProcess.slice(0, remainingBudget);
        logMessage('debug', `Further limited sitemaps to ${sitemapsToProcess.length} based on global budget`, metadata);
      }

      for (const sitemapUrl of sitemapsToProcess) {
        if (processedSitemaps.has(sitemapUrl)) continue;
        processedSitemaps.add(sitemapUrl);

        const prefetched = prefetchedContent.get(sitemapUrl);
        prefetchedContent.delete(sitemapUrl);
        const content = prefetched ?? await this.fetchSitemapContent(sitemapUrl, config);
        if (!content) continue;

        const { urls, sitemaps } = this.parseSitemapContent(content, sitemapUrl);

        const filteredUrls = filterDocumentationUrls(urls, PlaywrightFetchingStrategy.EXCLUDED_LINK_KEYWORDS);
        if (filteredUrls.length > 0) {
          logMessage('debug', `Found ${urls.length} total URLs in sitemap, ${filteredUrls.length} after filtering. First few: ${filteredUrls.slice(0, 3).join(', ')}`, metadata);
        }
        // Appended one by one: spreading a very large array into push() can
        // exceed the engine's argument limit.
        for (const url of filteredUrls) {
          allSitemapUrls.push(url);
        }

        if (processedSitemaps.size >= MAX_TOTAL_SITEMAPS) {
          continue;
        }

        const relevantSitemaps = sitemaps.filter(
          s => !processedSitemaps.has(s) && this.isRelevantSitemap(s, docUrl)
        );

        if (relevantSitemaps.length > 0) {
          const roomLeft = MAX_TOTAL_SITEMAPS - processedSitemaps.size;
          if (roomLeft > 0) {
            const sitemapsToAdd = relevantSitemaps.slice(0, roomLeft);
            logMessage('debug', `Adding ${sitemapsToAdd.length} relevant sitemaps to queue (filtered from ${sitemaps.length} total, limited by budget)`, metadata);
            sitemapQueue.push(...sitemapsToAdd);
          }
        }
      }
    }

    const uniqueSitemapUrls = [...new Set(allSitemapUrls)];

    // Path prefixes to try, most specific first: the documentation path, each
    // of its parent paths, and finally the site root.
    const filterPaths: string[] = [];
    const pathParts = docUrl.pathname.split('/').filter(p => p);

    filterPaths.push(docUrl.pathname);
    for (let i = pathParts.length - 1; i >= 0; i--) {
      const parentPath = '/' + pathParts.slice(0, i).join('/');
      if (parentPath !== '/' && !filterPaths.includes(parentPath)) {
        filterPaths.push(parentPath);
      }
    }
    filterPaths.push('/');

    for (const filterPath of filterPaths) {
      const filteredUrls = uniqueSitemapUrls.filter(url => {
        try {
          const urlObj = new URL(url);
          if (urlObj.hostname !== docUrl.hostname) {
            return false;
          }

          const normalizedFilterPath = filterPath.replace(/\/$/, '');
          const normalizedUrlPath = urlObj.pathname.replace(/\/$/, '');

          // The root prefix matches every same-host URL.
          if (normalizedFilterPath === '') {
            return true;
          }
          return normalizedUrlPath.startsWith(normalizedFilterPath);
        } catch {
          return false;
        }
      });

      // Accept a prefix once it yields enough URLs; the root prefix is
      // accepted with any non-empty result.
      if (filteredUrls.length > 0) {
        if (filteredUrls.length >= PlaywrightFetchingStrategy.MAX_FETCHED_LINKS || filterPath === '/') {
          return filteredUrls;
        }
      }
    }

    return uniqueSitemapUrls;
  }

  /**
   * Fetches `urls` in parallel batches, drops near-duplicate pages and joins
   * the remaining HTML (separated by blank lines) within the total size budget.
   *
   * @returns The combined HTML; an empty string when nothing could be fetched.
   */
  private async fetchPagesInBatches(
    urls: string[],
    config: DocumentationConfig,
    metadata: Metadata
  ): Promise<string> {
    const BATCH_SIZE = PlaywrightFetchingStrategy.PARALLEL_FETCH_LIMIT;
    const MAX_TOTAL_SIZE = server_defaults.DOCUMENTATION.MAX_TOTAL_CONTENT_SIZE;

    const pageResults: Array<{ url: string; html: string; textContent: string }> = [];
    let fetchedCount = 0;

    // Fetch all pages in batches
    for (let i = 0; i < urls.length && fetchedCount < PlaywrightFetchingStrategy.MAX_FETCHED_LINKS; i += BATCH_SIZE) {
      const remainingSlots = PlaywrightFetchingStrategy.MAX_FETCHED_LINKS - fetchedCount;
      const batch = urls.slice(i, Math.min(i + BATCH_SIZE, i + remainingSlots));

      const batchPromises = batch.map(async (url) => {
        const result = await this.fetchPageContentWithPlaywright(url, config, metadata);
        if (!result?.content) return null;
        return { url, html: result.content, textContent: result.textContent };
      });

      // fetchPageContentWithPlaywright resolves to null on failure instead of
      // rejecting, so one bad page cannot fail the whole batch.
      const results = await Promise.all(batchPromises);

      for (const result of results) {
        if (!result || !result.html) continue;
        pageResults.push(result);
        fetchedCount++;
      }
    }

    // Deduplicate based on similarity
    const deduplicatedPages: Array<{ url: string; html: string; textContent: string }> = [];

    if (pageResults.length > 0) {
      deduplicatedPages.push(pageResults[0]);
    }

    const similarityThreshold = server_defaults.DOCUMENTATION.SIMILARITY_THRESHOLD_PERCENTAGE / 100.0;

    for (let i = 1; i < pageResults.length; i++) {
      const currentPage = pageResults[i];

      // Always include pages with short text content
      if (currentPage.textContent.length <= 500) {
        deduplicatedPages.push(currentPage);
        continue;
      }

      // Check similarity against all pages already in the deduplicated list
      let isSimilar = false;
      for (const existingPage of deduplicatedPages) {
        const dice = diceCoefficient(currentPage.textContent, existingPage.textContent);
        const jaccard = jaccardSimilarity(currentPage.textContent, existingPage.textContent);
        const avgSimilarity = (dice + jaccard) / 2;

        if (avgSimilarity > similarityThreshold) {
          isSimilar = true;
          logMessage('debug', `Skipping similar page '${currentPage.url}' because it is ${(avgSimilarity * 100).toFixed(1)}% similar to '${existingPage.url}'`, metadata);
          break;
        }
      }

      if (!isSimilar) {
        deduplicatedPages.push(currentPage);
      }
    }

    // Build final combined content from deduplicated pages
    let combinedContent = "";
    let totalSize = 0;

    for (const page of deduplicatedPages) {
      const contentSize = Buffer.byteLength(page.html, 'utf8');

      // Stops at the first page that does not fit; later pages are not tried.
      if (totalSize + contentSize > MAX_TOTAL_SIZE) {
        logMessage('debug', `Reached size budget (${Math.round(totalSize / 1024 / 1024)}MB), skipping remaining pages`, metadata);
        break;
      }

      combinedContent += combinedContent ? `\n\n${page.html}` : page.html;
      totalSize += contentSize;
    }

    return combinedContent;
  }

  /**
   * Fetches documentation for `config.documentationUrl`.
   *
   * Tries sitemap-based collection first (bounded by a total timeout); if that
   * yields no content or fails, falls back to link crawling. The browser
   * context is always closed before returning.
   *
   * @returns Combined page HTML, or null when no documentation URL is configured.
   */
  async tryFetch(config: DocumentationConfig, metadata: Metadata): Promise<string | null> {
    if (!config?.documentationUrl) return null;

    try {
      try {
        const sitemapUrlsPromise = this.collectSitemapUrls(config, metadata);

        let timeoutHandle: NodeJS.Timeout | undefined;
        const timeoutPromise = new Promise<string[]>((resolve) => {
          timeoutHandle = setTimeout(() => {
            logMessage('warn', 'Sitemap URL collection timed out, falling back to iterative crawling', metadata);
            resolve([]);
          }, server_defaults.DOCUMENTATION.TIMEOUTS.SITEMAP_PROCESSING_TOTAL);
        });

        // The race only stops waiting: collectSitemapUrls itself is not
        // cancelled when the timeout wins.
        let sitemapUrls: string[];
        try {
          sitemapUrls = await Promise.race([sitemapUrlsPromise, timeoutPromise]);
        } finally {
          // Clear the timer on rejection too, so it never outlives this call.
          if (timeoutHandle) {
            clearTimeout(timeoutHandle);
          }
        }

        if (sitemapUrls.length > 0) {
          const keywords = this.getMergedKeywords(config.keywords);
          const rankedUrls = this.rankItems(sitemapUrls, keywords) as string[];
          const topUrls = rankedUrls.slice(0, PlaywrightFetchingStrategy.MAX_FETCHED_LINKS);

          const content = await this.fetchPagesInBatches(topUrls, config, metadata);
          if (content) {
            return content;
          }
        }
      } catch (error) {
        logMessage('warn', `Sitemap processing failed: ${describeError(error)}, falling back to legacy crawling`, metadata);
      }

      return await this.legacyTryFetch(config, metadata);
    } finally {
      await this.cleanupContext();
    }
  }

  /** Built-in ranking keywords with their weights (5 = highest priority). */
  public getDefaultKeywords(): Array<{ keyword: string; weight: number }> {
    return [
      // High priority: Getting started & overview content (weight: 5)
      { keyword: "getting started", weight: 5 },
      { keyword: "quickstart", weight: 5 },
      { keyword: "overview", weight: 5 },
      { keyword: "introduction", weight: 5 },
      { keyword: "api", weight: 5 },

      // Core concepts: Authentication & API fundamentals (weight: 4)
      { keyword: "authentication", weight: 4 },
      { keyword: "authorization", weight: 4 },
      { keyword: "rest", weight: 4 },
      { keyword: "endpoints", weight: 4 },

      // Important concepts (weight: 3)
      { keyword: "guides", weight: 3 },
      { keyword: "tutorial", weight: 3 },
      { keyword: "reference", weight: 3 },
      { keyword: "api-reference", weight: 3 },
      { keyword: "open api", weight: 3 },
      { keyword: "swagger", weight: 3 },
      { keyword: "bearer", weight: 3 },
      { keyword: "token", weight: 3 },
      { keyword: "pagination", weight: 3 },
      { keyword: "schema", weight: 3 },

      // Moderate importance: Data concepts (weight: 2)
      { keyword: "objects", weight: 2 },
      { keyword: "data-objects", weight: 2 },
      { keyword: "properties", weight: 2 },
      { keyword: "values", weight: 2 },
      { keyword: "fields", weight: 2 },
      { keyword: "attributes", weight: 2 },
      { keyword: "parameters", weight: 2 },
      { keyword: "slugs", weight: 2 },
      { keyword: "lists", weight: 2 },
      { keyword: "query", weight: 2 },
      { keyword: "methods", weight: 2 },
      { keyword: "response", weight: 2 },
      { keyword: "filtering", weight: 2 },
      { keyword: "sorting", weight: 2 },
      { keyword: "searching", weight: 2 },
      { keyword: "filter", weight: 2 },
      { keyword: "sort", weight: 2 },
      { keyword: "search", weight: 2 },

      // Lower priority: Specific HTTP methods (weight: 1)
      { keyword: "get", weight: 1 },
      { keyword: "post", weight: 1 },
      { keyword: "put", weight: 1 },
      { keyword: "delete", weight: 1 },
      { keyword: "patch", weight: 1 },
    ];
  }

  /**
   * Merges caller-supplied keywords (weight 4) with the defaults. Keywords are
   * compared and returned in lowercase; for a keyword present in both lists
   * the caller's weight wins.
   */
  public getMergedKeywords(inputKeywords?: string[] | null): Array<{ keyword: string; weight: number }> {
    const defaultKeywords = this.getDefaultKeywords();

    if (!inputKeywords || inputKeywords.length === 0) {
      return defaultKeywords;
    }

    // User-provided keywords get high weight (4)
    const userKeywords = inputKeywords.map(keyword => ({ keyword, weight: 4 }));

    // Merge, preferring user-provided weights for duplicates: user keywords
    // come first and the first weight seen for a key is kept.
    const keywordMap = new Map<string, number>();
    for (const { keyword, weight } of [...userKeywords, ...defaultKeywords]) {
      const key = keyword.toLowerCase();
      if (!keywordMap.has(key)) {
        keywordMap.set(key, weight);
      }
    }

    return Array.from(keywordMap.entries()).map(([keyword, weight]) => ({ keyword, weight }));
  }

  /**
   * Orders URLs or links by keyword relevance, most relevant first.
   *
   * - Items whose URL cannot be parsed are dropped (they could not be fetched).
   * - Items in `fetchedLinks` are dropped.
   * - Items on excluded paths are dropped unless that would leave nothing.
   * - Score per keyword: whole-word matches count 3x the weight each; a mere
   *   substring match counts 1x. The total is divided by log10 of the path
   *   length (floored at 10 characters). Items with any match sort before
   *   items without.
   *
   * @param items        URLs, or links with text.
   * @param keywords     Plain keywords (weight 1) or weighted keywords.
   * @param fetchedLinks URLs to leave out of the result.
   * @returns The surviving original items, ranked.
   */
  public rankItems(items: string[] | { linkText: string, href: string }[], keywords: string[] | Array<{ keyword: string; weight: number }>, fetchedLinks?: Set<string>): any[] {
    const candidates: RankableItem[] = items;
    const normalizedItems: Array<{ url: string; href: string; original: RankableItem; searchableContent: string }> = [];

    for (const item of candidates) {
      const href = typeof item === 'string' ? item : item.href;
      const text = typeof item === 'string' ? '' : item.linkText;

      let pathname: string;
      try {
        pathname = new URL(href).pathname;
      } catch {
        // One malformed entry must not abort ranking of the rest.
        continue;
      }

      normalizedItems.push({
        url: pathname,
        href,
        original: item,
        searchableContent: `${pathname} ${text}`.toLowerCase()
      });
    }

    let itemsToRank = fetchedLinks
      ? normalizedItems.filter(item => !fetchedLinks.has(item.href))
      : normalizedItems;

    const itemsToRankFiltered = itemsToRank.filter(
      item => !PlaywrightFetchingStrategy.isExcludedPath(item.url)
    );
    if (itemsToRankFiltered.length > 0) {
      itemsToRank = itemsToRankFiltered;
    }

    if (!keywords || keywords.length === 0) {
      return itemsToRank.map(item => item.original);
    }

    // Normalize keywords to weighted format
    const weightedKeywords: WeightedKeyword[] = typeof keywords[0] === 'string'
      ? (keywords as string[]).map(k => ({ keyword: k, weight: 1 }))
      : keywords as WeightedKeyword[];

    // Compile each keyword once per call. Keywords are escaped so characters
    // such as '+' or '(' are matched literally instead of breaking the regex.
    const matchers = weightedKeywords.map(({ keyword, weight }) => {
      const keywordLower = keyword.toLowerCase();
      return {
        keywordLower,
        weight,
        wordBoundaryRegex: new RegExp(`\\b${escapeRegExp(keywordLower)}\\b`, 'g')
      };
    });

    const scored = itemsToRank.map(item => {
      let matchScore = 0;
      const content = item.searchableContent;

      for (const { keywordLower, weight, wordBoundaryRegex } of matchers) {
        const exactMatches = (content.match(wordBoundaryRegex) || []).length;

        // Weighted scoring: exact matches get 3x base, partial matches get 1x base
        matchScore += exactMatches * 3 * weight;
        if (exactMatches === 0 && content.includes(keywordLower)) {
          matchScore += 1 * weight;
        }
      }

      // Length penalty on a log scale; the floor of 10 keeps the divisor >= 1.
      const MIN_LENGTH = 10;
      const urlLength = Math.max(item.url.length, MIN_LENGTH);
      const lengthPenalty = Math.log10(urlLength);
      const score = matchScore / lengthPenalty;

      return { item: item.original, score, hasMatch: matchScore > 0 };
    });

    return scored
      .sort((a, b) => {
        if (a.hasMatch !== b.hasMatch) {
          return a.hasMatch ? -1 : 1;
        }
        return b.score - a.score;
      })
      .map(s => s.item);
  }

  /**
   * Crawls from `startUrl` by repeatedly fetching the best-ranked queued link
   * and queueing the valid documentation links found on each fetched page,
   * until the link limit is reached or the queue runs dry.
   *
   * @returns The HTML of all fetched pages joined by blank lines.
   */
  private async crawlWithLinks(startUrl: string, config: DocumentationConfig, metadata: Metadata): Promise<string> {
    const visitedUrls = new Set<string>();
    const linkQueue: { linkText: string, href: string }[] = [{ linkText: "documentation", href: startUrl }];
    const searchKeywords = this.getMergedKeywords(config.keywords);
    let aggregatedContent = "";

    while (visitedUrls.size < PlaywrightFetchingStrategy.MAX_FETCHED_LINKS && linkQueue.length > 0) {
      const prioritizedLinks = this.rankItems(linkQueue, searchKeywords, visitedUrls) as { linkText: string, href: string }[];
      if (prioritizedLinks.length === 0) break;

      const nextLink = prioritizedLinks[0];
      const queueIndex = linkQueue.findIndex(l => l.href === nextLink.href);
      // Guarded: splice(-1, 1) would remove the last queue entry instead.
      if (queueIndex !== -1) {
        linkQueue.splice(queueIndex, 1);
      }

      try {
        const pageData = await this.fetchPageContentWithPlaywright(nextLink.href, config, metadata);
        visitedUrls.add(nextLink.href);

        if (pageData?.content) {
          aggregatedContent += aggregatedContent ? `\n\n${pageData.content}` : pageData.content;

          if (pageData.links) {
            for (const [linkText, href] of Object.entries(pageData.links)) {
              const alreadyKnown = visitedUrls.has(href) || linkQueue.some(l => l.href === href);
              if (!alreadyKnown && this.isValidDocLink(href, linkText, startUrl)) {
                linkQueue.push({ linkText, href });
              }
            }
          }
        }
      } catch (error) {
        logMessage('warn', `Failed to fetch ${nextLink.href}: ${describeError(error)}`, metadata);
      }
    }

    return aggregatedContent;
  }

  /** Link-crawling fallback: crawls outward from `config.documentationUrl`. */
  async legacyTryFetch(config: DocumentationConfig, metadata: Metadata): Promise<string | null> {
    if (!config?.documentationUrl) return null;
    return this.crawlWithLinks(config.documentationUrl, config, metadata);
  }

  /**
   * Returns true when `href` is worth crawling from `baseUrl`: it has link
   * text, parses as a URL, is not on an excluded path, is on the same
   * hostname, and — when the base path has two or more segments — lies under
   * the base path's parent directory.
   */
  private isValidDocLink(href: string, linkText: string, baseUrl: string): boolean {
    if (!linkText || !href) return false;

    try {
      const base = new URL(baseUrl);
      const link = new URL(href);

      if (PlaywrightFetchingStrategy.isExcludedPath(link.pathname)) {
        return false;
      }

      if (link.hostname !== base.hostname) return false;

      const baseParts = base.pathname.split('/').filter(p => p);
      if (baseParts.length >= 2) {
        const requiredPath = '/' + baseParts.slice(0, -1).join('/');
        if (!link.pathname.startsWith(requiredPath)) return false;
      }

      return true;
    } catch {
      return false;
    }
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/superglue-ai/superglue'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.