// fetching-playwright.ts (31.5 kB)
/**
* Playwright Web Crawling Strategy
*
* Uses headless browser to crawl documentation sites.
* - First tries sitemap-based discovery for comprehensive coverage
* - Falls back to iterative link crawling if sitemap fails
* - Filters out non-documentation pages (login, pricing, etc.)
*/
import playwright from '@playwright/test';
import { Metadata } from "@superglue/shared";
import axios from "axios";
import { server_defaults } from '../../default.js';
import { logMessage } from "../../utils/logs.js";
import { filterDocumentationUrls } from '../documentation-utils.js';
import { DocumentationConfig, DocumentationFetchingStrategy } from '../types.js';
// Similarity functions for deduplication
function diceCoefficient(str1: string, str2: string): number {
const words1 = new Set(str1.toLowerCase().split(/\s+/));
const words2 = new Set(str2.toLowerCase().split(/\s+/));
const intersection = new Set([...words1].filter(x => words2.has(x)));
return (2 * intersection.size) / (words1.size + words2.size);
}
function jaccardSimilarity(str1: string, str2: string): number {
const words1 = new Set(str1.toLowerCase().split(/\s+/));
const words2 = new Set(str2.toLowerCase().split(/\s+/));
const intersection = new Set([...words1].filter(x => words2.has(x)));
const union = new Set([...words1, ...words2]);
return intersection.size / union.size;
}
export class PlaywrightFetchingStrategy implements DocumentationFetchingStrategy {
private static readonly MAX_FETCHED_LINKS = server_defaults.DOCUMENTATION.MAX_FETCHED_LINKS;
private static readonly PARALLEL_FETCH_LIMIT = server_defaults.DOCUMENTATION.MAX_PAGES_TO_FETCH_IN_PARALLEL;
private static browserInstance: playwright.Browser | null = null;
private browserContext: playwright.BrowserContext | null = null;
public static readonly EXCLUDED_LINK_KEYWORDS = [
'signup', 'login', 'pricing', 'contact', 'support', 'cookie',
'privacy', 'terms', 'legal', 'policy', 'status', 'help', 'blog',
'careers', 'about', 'press', 'news', 'events', 'partners',
'changelog', 'release-notes', 'updates', 'upgrade', 'register', 'cli',
'signin', 'sign-in', 'sign-up', 'trial', 'demo', 'sales', 'widget', 'webhooks',
'/de/', '/it/', '/fr/', '/nl/', '/es/', '/pt/', '/pl/', '/ru/', '/ja/', '/zh/',
'/ko/', '/zh-CN/', '/zh-TW/', '/id/'
];
private static async getBrowser(): Promise<playwright.Browser> {
if (!PlaywrightFetchingStrategy.browserInstance) {
PlaywrightFetchingStrategy.browserInstance = await playwright.chromium.launch({
headless: true,
args: [
'--disable-blink-features=AutomationControlled',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process',
]
});
}
return PlaywrightFetchingStrategy.browserInstance;
}
static async closeBrowser(): Promise<void> {
if (PlaywrightFetchingStrategy.browserInstance) {
const closedInstance = PlaywrightFetchingStrategy.browserInstance;
PlaywrightFetchingStrategy.browserInstance = null;
try {
await closedInstance.close();
} catch (error) {
console.warn('Failed to close browser gracefully:', error?.message);
try {
const browserProcess = (closedInstance as any)._process;
if (browserProcess && !browserProcess.killed) {
browserProcess.kill('SIGKILL');
}
} catch (killError) {
console.warn('Failed to force kill browser:', killError?.message);
}
}
}
}
private async getOrCreateContext(config: DocumentationConfig): Promise<playwright.BrowserContext> {
if (!this.browserContext) {
const browser = await PlaywrightFetchingStrategy.getBrowser();
this.browserContext = await browser.newContext({
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
viewport: { width: 1920, height: 1080 },
locale: 'en-US',
timezoneId: 'America/New_York',
extraHTTPHeaders: config.headers || {}
});
}
return this.browserContext;
}
private async cleanupContext(): Promise<void> {
if (this.browserContext) {
try {
await this.browserContext.close();
} catch (e) {
}
this.browserContext = null;
}
}
private async fetchPageContentWithPlaywright(urlString: string, config: DocumentationConfig, metadata: Metadata): Promise<{ content: string; textContent: string; links: Record<string, string>; } | null> {
if (!urlString?.startsWith("http")) {
return null;
}
let page: playwright.Page | null = null;
try {
const browserContext = await this.getOrCreateContext(config);
const url = new URL(urlString);
if (config.queryParams) {
Object.entries(config.queryParams).forEach(([key, value]) => {
url?.searchParams?.append(key, value);
});
}
page = await browserContext.newPage();
await page.addInitScript(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
Object.defineProperty(navigator, 'platform', { get: () => 'MacIntel' });
Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
});
await page.goto(url.toString(), { timeout: server_defaults.DOCUMENTATION.TIMEOUTS.PLAYWRIGHT });
await page.waitForLoadState('domcontentloaded', { timeout: server_defaults.DOCUMENTATION.TIMEOUTS.PLAYWRIGHT });
await page.waitForTimeout(1000);
const result = await page.evaluate(() => {
const selectorsToRemove = [
'img, video, svg, canvas, iframe, picture, source, audio, embed, object',
'[role="banner"], [role="dialog"], [role="contentinfo"], [role="complementary"]',
'.cookie-banner, .cookie-consent, .cookies, .gdpr, .privacy-notice',
'nav, header, footer, aside, .sidebar, .menu, .navbar, .toolbar',
'.social, .share, .chat, .feedback, .comments, .disqus',
'.intercom, .drift, .zendesk, .freshchat, .tawk',
'.ads, .advertisement, .banner, .promo, .sponsored',
'script, style, noscript, link[rel="stylesheet"]',
'[data-ga], [data-gtm], [data-analytics], [data-track]',
'.breadcrumb, .pagination, .pager',
'.related, .recommended, .also-see',
'form, input, button, select, textarea'
];
selectorsToRemove.forEach(selector =>
document.querySelectorAll(selector).forEach(el => el.remove())
);
const links: Record<string, string> = {};
document.querySelectorAll('a').forEach(link => {
try {
const anchor = link as HTMLAnchorElement;
const url = new URL(anchor.href);
const key = `${anchor.textContent} ${url.pathname}`.toLowerCase().replace(/[^a-z0-9]/g, ' ').trim();
links[key] = anchor.href.split('#')[0].trim();
} catch (e) { }
});
const mainContent = document.querySelector('article, main, .docs-content, .markdown, .md-content, .api-content, .docContent, .content, .doc-body');
const html = mainContent
? `<html><body>${mainContent.outerHTML}</body></html>`
: `<html><body>${document.body?.innerHTML || ''}</body></html>`;
const textContent = document.body?.innerText || '';
return { html, textContent, links };
});
if (!result || !result.html) {
logMessage('warn', `Failed to extract content from ${urlString}`, metadata);
return null;
}
let { html, textContent, links } = result;
if (html.length > server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES) {
logMessage('warn', `Page ${urlString} exceeds size limit after cleanup (${Math.round(html.length / 1024 / 1024)}MB > ${Math.round(server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES / 1024 / 1024)}MB), truncating`, metadata);
html = html.substring(0, server_defaults.DOCUMENTATION.MAX_PAGE_SIZE_BYTES) + '\n<!-- Content truncated due to size limit -->';
}
logMessage('debug', `Successfully fetched content for ${urlString}`, metadata);
return {
content: html,
textContent,
links
};
} catch (error) {
logMessage('warn', `Playwright fetch failed for ${urlString}: ${error?.message}`, metadata);
return null;
} finally {
if (page) {
try {
await page.close();
} catch (e) {
}
}
}
}
private async discoverSitemapUrls(baseUrl: string): Promise<string[]> {
const candidates: string[] = [];
try {
const url = new URL(baseUrl);
const origin = url.origin;
const pathname = url.pathname;
if (pathname && pathname !== '/') {
candidates.push(`${baseUrl}/sitemap.xml`);
const pathParts = pathname.split('/').filter(p => p);
for (let i = pathParts.length; i > 0; i--) {
const parentPath = '/' + pathParts.slice(0, i).join('/');
candidates.push(`${origin}${parentPath}/sitemap.xml`);
}
}
candidates.push(
`${origin}/sitemap.xml`,
`${origin}/sitemap_index.xml`
);
} catch {
}
return [...new Set(candidates)];
}
private async fetchSitemapContent(sitemapUrl: string, config: DocumentationConfig): Promise<string | null> {
try {
const response = await axios.get(sitemapUrl, {
headers: config.headers,
timeout: server_defaults.DOCUMENTATION.TIMEOUTS.SITEMAP_FETCH,
validateStatus: (status) => status === 200
});
const content = response.data;
if (typeof content !== 'string') return null;
const trimmed = content.trim();
if (!trimmed.startsWith('<?xml') && !trimmed.startsWith('<urlset') && !trimmed.startsWith('<sitemapindex')) {
return null;
}
if (!content.includes('<loc>') || (!content.includes('<url>') && !content.includes('<sitemap>'))) {
return null;
}
return content;
} catch {
return null;
}
}
private parseSitemapContent(content: string, baseUrl: string): { urls: string[], sitemaps: string[] } {
const urls: string[] = [];
const sitemaps: string[] = [];
try {
const hasXmlTags = content.includes('<loc>') && content.includes('</loc>');
if (hasXmlTags) {
const locMatches = content.matchAll(/<loc>([^<]+)<\/loc>/gi);
const allLocs: string[] = [];
for (const match of locMatches) {
const url = match[1].trim();
if (url.startsWith('http')) {
allLocs.push(url);
}
}
for (const loc of allLocs) {
const locIndex = content.indexOf(`<loc>${loc}</loc>`);
if (locIndex === -1) continue;
const precedingContent = content.substring(Math.max(0, locIndex - 200), locIndex);
if (precedingContent.match(/<sitemap[^>]*>/i)) {
sitemaps.push(loc);
} else {
urls.push(loc);
}
}
if (urls.length === 0 && sitemaps.length === 0 && allLocs.length > 0) {
urls.push(...allLocs);
}
} else {
const potentialUrls = content.split(/\s+/);
for (const potentialUrl of potentialUrls) {
const trimmed = potentialUrl.trim();
if (trimmed.startsWith('http://') || trimmed.startsWith('https://')) {
try {
new URL(trimmed);
urls.push(trimmed);
} catch {
}
}
}
}
} catch {
}
return { urls, sitemaps };
}
private async collectSitemapUrls(config: DocumentationConfig, metadata: Metadata): Promise<string[]> {
if (!config.documentationUrl) return [];
const sitemapCandidates = await this.discoverSitemapUrls(config.documentationUrl);
const processedSitemaps = new Set<string>();
const sitemapQueue: string[] = [];
let docUrl: URL;
try {
docUrl = new URL(config.documentationUrl);
} catch {
logMessage('warn', `Invalid documentation URL: ${config.documentationUrl}`, metadata);
return [];
}
for (const candidate of sitemapCandidates) {
const content = await this.fetchSitemapContent(candidate, config);
if (content) {
logMessage('debug', `Found sitemap at: ${candidate}`, metadata);
sitemapQueue.push(candidate);
break;
}
}
if (sitemapQueue.length === 0) {
logMessage('debug', `No sitemap found. Tried: ${sitemapCandidates.slice(0, 5).join(', ')}...`, metadata);
}
const MAX_SITEMAP_DEPTH = server_defaults.DOCUMENTATION.MAX_SITEMAP_DEPTH;
const MAX_SITEMAPS_PER_DEPTH = server_defaults.DOCUMENTATION.MAX_SITEMAPS_PER_DEPTH;
const MAX_TOTAL_SITEMAPS = server_defaults.DOCUMENTATION.MAX_TOTAL_SITEMAPS;
let depth = 0;
const allSitemapUrls: string[] = [];
while (sitemapQueue.length > 0 && depth < MAX_SITEMAP_DEPTH) {
if (processedSitemaps.size >= MAX_TOTAL_SITEMAPS) {
logMessage('debug', `Reached global sitemap limit (${MAX_TOTAL_SITEMAPS}), stopping sitemap discovery`, metadata);
break;
}
const currentBatch = [...sitemapQueue];
sitemapQueue.length = 0;
depth++;
let sitemapsToProcess = currentBatch;
if (currentBatch.length > MAX_SITEMAPS_PER_DEPTH) {
const keywords = this.getMergedKeywords(config.keywords);
sitemapsToProcess = this.rankItems(currentBatch, keywords) as string[];
sitemapsToProcess = sitemapsToProcess.slice(0, MAX_SITEMAPS_PER_DEPTH);
logMessage('debug', `Ranked and limited sitemaps at depth ${depth} from ${currentBatch.length} to ${sitemapsToProcess.length}`, metadata);
}
const remainingBudget = MAX_TOTAL_SITEMAPS - processedSitemaps.size;
if (sitemapsToProcess.length > remainingBudget) {
sitemapsToProcess = sitemapsToProcess.slice(0, remainingBudget);
logMessage('debug', `Further limited sitemaps to ${sitemapsToProcess.length} based on global budget`, metadata);
}
for (const sitemapUrl of sitemapsToProcess) {
if (processedSitemaps.has(sitemapUrl)) continue;
processedSitemaps.add(sitemapUrl);
const content = await this.fetchSitemapContent(sitemapUrl, config);
if (!content) continue;
const { urls, sitemaps } = this.parseSitemapContent(content, sitemapUrl);
const filteredUrls = filterDocumentationUrls(urls, PlaywrightFetchingStrategy.EXCLUDED_LINK_KEYWORDS);
if (filteredUrls.length > 0) {
logMessage('debug', `Found ${urls.length} total URLs in sitemap, ${filteredUrls.length} after filtering. First few: ${filteredUrls.slice(0, 3).join(', ')}`, metadata);
}
allSitemapUrls.push(...filteredUrls);
if (processedSitemaps.size >= MAX_TOTAL_SITEMAPS) {
continue;
}
const relevantSitemaps = sitemaps.filter(s => {
if (processedSitemaps.has(s)) return false;
try {
const sitemapUrl = new URL(s);
if (sitemapUrl.hostname !== docUrl.hostname) {
return false;
}
const docPath = docUrl.pathname.replace(/\/$/, '');
const sitemapPath = sitemapUrl.pathname.replace(/\/$/, '');
if (sitemapPath === '/sitemap.xml' || sitemapPath === '/sitemap_index.xml') {
return true;
}
if (docPath && docPath !== '/') {
const docParts = docPath.split('/').filter(p => p);
const sitemapParts = sitemapPath.split('/').filter(p => p);
for (let i = 0; i < Math.min(docParts.length, sitemapParts.length - 1); i++) {
if (docParts[i] === sitemapParts[i]) {
return true;
}
}
}
const relevantKeywords = ['docs', 'api', 'documentation'];
const sitemapLower = sitemapPath.toLowerCase();
if (relevantKeywords.some(keyword => sitemapLower.includes(keyword))) {
return true;
}
return false;
} catch {
return false;
}
});
if (relevantSitemaps.length > 0) {
const roomLeft = MAX_TOTAL_SITEMAPS - processedSitemaps.size;
if (roomLeft > 0) {
const sitemapsToAdd = relevantSitemaps.slice(0, roomLeft);
logMessage('debug', `Adding ${sitemapsToAdd.length} relevant sitemaps to queue (filtered from ${sitemaps.length} total, limited by budget)`, metadata);
sitemapQueue.push(...sitemapsToAdd);
}
}
}
}
const uniqueSitemapUrls = [...new Set(allSitemapUrls)];
const filterPaths: string[] = [];
const pathParts = docUrl.pathname.split('/').filter(p => p);
filterPaths.push(docUrl.pathname);
for (let i = pathParts.length - 1; i >= 0; i--) {
const parentPath = '/' + pathParts.slice(0, i).join('/');
if (parentPath !== '/' && !filterPaths.includes(parentPath)) {
filterPaths.push(parentPath);
}
}
filterPaths.push('/');
for (const filterPath of filterPaths) {
const filteredUrls = uniqueSitemapUrls.filter(url => {
try {
const urlObj = new URL(url);
if (urlObj.hostname !== docUrl.hostname) {
return false;
}
const normalizedFilterPath = filterPath.replace(/\/$/, '');
const normalizedUrlPath = urlObj.pathname.replace(/\/$/, '');
if (normalizedFilterPath === '') {
return true;
}
return normalizedUrlPath.startsWith(normalizedFilterPath);
} catch {
return false;
}
});
if (filteredUrls.length > 0) {
if (filteredUrls.length >= PlaywrightFetchingStrategy.MAX_FETCHED_LINKS || filterPath === '/') {
return filteredUrls;
}
}
}
return uniqueSitemapUrls;
}
private async fetchPagesInBatches(
urls: string[],
config: DocumentationConfig,
metadata: Metadata
): Promise<string> {
const BATCH_SIZE = PlaywrightFetchingStrategy.PARALLEL_FETCH_LIMIT;
const MAX_TOTAL_SIZE = server_defaults.DOCUMENTATION.MAX_TOTAL_CONTENT_SIZE;
const pageResults: Array<{ url: string; html: string; textContent: string }> = [];
let fetchedCount = 0;
// Fetch all pages in batches
for (let i = 0; i < urls.length && fetchedCount < PlaywrightFetchingStrategy.MAX_FETCHED_LINKS; i += BATCH_SIZE) {
const remainingSlots = PlaywrightFetchingStrategy.MAX_FETCHED_LINKS - fetchedCount;
const batch = urls.slice(i, Math.min(i + BATCH_SIZE, i + remainingSlots));
const batchPromises = batch.map(async (url) => {
const result = await this.fetchPageContentWithPlaywright(url, config, metadata);
if (!result?.content) return null;
return { url, html: result.content, textContent: result.textContent };
});
const results = await Promise.all(batchPromises);
for (const result of results) {
if (!result || !result.html) continue;
pageResults.push(result);
fetchedCount++;
}
}
// Deduplicate based on similarity
const deduplicatedPages: Array<{ url: string; html: string; textContent: string }> = [];
if (pageResults.length > 0) {
deduplicatedPages.push(pageResults[0]);
}
for (let i = 1; i < pageResults.length; i++) {
const currentPage = pageResults[i];
// Always include pages with short text content
if (currentPage.textContent.length <= 500) {
deduplicatedPages.push(currentPage);
continue;
}
// Check similarity against all pages already in the deduplicated list
let isSimilar = false;
for (const existingPage of deduplicatedPages) {
const dice = diceCoefficient(currentPage.textContent, existingPage.textContent);
const jaccard = jaccardSimilarity(currentPage.textContent, existingPage.textContent);
const avgSimilarity = (dice + jaccard) / 2;
if (avgSimilarity > server_defaults.DOCUMENTATION.SIMILARITY_THRESHOLD_PERCENTAGE / 100.0) {
isSimilar = true;
logMessage('debug', `Skipping similar page '${currentPage.url}' because it is ${(avgSimilarity * 100).toFixed(1)}% similar to '${existingPage.url}'`, metadata);
break;
}
}
if (!isSimilar) {
deduplicatedPages.push(currentPage);
}
}
// Build final combined content from deduplicated pages
let combinedContent = "";
let totalSize = 0;
for (const page of deduplicatedPages) {
const contentSize = Buffer.byteLength(page.html, 'utf8');
if (totalSize + contentSize > MAX_TOTAL_SIZE) {
logMessage('debug', `Reached size budget (${Math.round(totalSize / 1024 / 1024)}MB), skipping remaining pages`, metadata);
break;
}
combinedContent += combinedContent ? `\n\n${page.html}` : page.html;
totalSize += contentSize;
}
return combinedContent;
}
async tryFetch(config: DocumentationConfig, metadata: Metadata): Promise<string | null> {
if (!config?.documentationUrl) return null;
try {
try {
const sitemapUrlsPromise = this.collectSitemapUrls(config, metadata);
let timeoutHandle: NodeJS.Timeout;
const timeoutPromise = new Promise<string[]>((resolve) => {
timeoutHandle = setTimeout(() => {
logMessage('warn', 'Sitemap URL collection timed out, falling back to iterative crawling', metadata);
resolve([]);
}, server_defaults.DOCUMENTATION.TIMEOUTS.SITEMAP_PROCESSING_TOTAL);
});
const sitemapUrls = await Promise.race([sitemapUrlsPromise, timeoutPromise]);
if (timeoutHandle!) {
clearTimeout(timeoutHandle);
}
if (sitemapUrls.length > 0) {
const keywords = this.getMergedKeywords(config.keywords);
const rankedUrls = this.rankItems(sitemapUrls, keywords) as string[];
const topUrls = rankedUrls.slice(0, PlaywrightFetchingStrategy.MAX_FETCHED_LINKS);
const content = await this.fetchPagesInBatches(topUrls, config, metadata);
if (content) {
return content;
}
}
} catch (error) {
logMessage('warn', `Sitemap processing failed: ${error?.message}, falling back to legacy crawling`, metadata);
}
return await this.legacyTryFetch(config, metadata);
} finally {
await this.cleanupContext();
}
}
public getDefaultKeywords(): Array<{ keyword: string; weight: number }> {
return [
// High priority: Getting started & overview content (weight: 5)
{ keyword: "getting started", weight: 5 },
{ keyword: "quickstart", weight: 5 },
{ keyword: "overview", weight: 5 },
{ keyword: "introduction", weight: 5 },
{ keyword: "api", weight: 5 },
// Core concepts: Authentication & API fundamentals (weight: 4)
{ keyword: "authentication", weight: 4 },
{ keyword: "authorization", weight: 4 },
{ keyword: "rest", weight: 4 },
{ keyword: "endpoints", weight: 4 },
// Important concepts (weight: 3)
{ keyword: "guides", weight: 3 },
{ keyword: "tutorial", weight: 3 },
{ keyword: "reference", weight: 3 },
{ keyword: "api-reference", weight: 3 },
{ keyword: "open api", weight: 3 },
{ keyword: "swagger", weight: 3 },
{ keyword: "bearer", weight: 3 },
{ keyword: "token", weight: 3 },
{ keyword: "pagination", weight: 3 },
{ keyword: "schema", weight: 3 },
// Moderate importance: Data concepts (weight: 2)
{ keyword: "objects", weight: 2 },
{ keyword: "data-objects", weight: 2 },
{ keyword: "properties", weight: 2 },
{ keyword: "values", weight: 2 },
{ keyword: "fields", weight: 2 },
{ keyword: "attributes", weight: 2 },
{ keyword: "parameters", weight: 2 },
{ keyword: "slugs", weight: 2 },
{ keyword: "lists", weight: 2 },
{ keyword: "query", weight: 2 },
{ keyword: "methods", weight: 2 },
{ keyword: "response", weight: 2 },
{ keyword: "filtering", weight: 2 },
{ keyword: "sorting", weight: 2 },
{ keyword: "searching", weight: 2 },
{ keyword: "filter", weight: 2 },
{ keyword: "sort", weight: 2 },
{ keyword: "search", weight: 2 },
// Lower priority: Specific HTTP methods (weight: 1)
{ keyword: "get", weight: 1 },
{ keyword: "post", weight: 1 },
{ keyword: "put", weight: 1 },
{ keyword: "delete", weight: 1 },
{ keyword: "patch", weight: 1 },
];
}
public getMergedKeywords(inputKeywords?: string[] | null): Array<{ keyword: string; weight: number }> {
const defaultKeywords = this.getDefaultKeywords();
if (!inputKeywords || inputKeywords.length === 0) {
return defaultKeywords;
}
// User-provided keywords get high weight (4)
const userKeywords = inputKeywords.map(keyword => ({ keyword, weight: 4 }));
// Merge, preferring user-provided weights for duplicates
const keywordMap = new Map<string, number>();
for (const { keyword, weight } of [...userKeywords, ...defaultKeywords]) {
const key = keyword.toLowerCase();
if (!keywordMap.has(key)) {
keywordMap.set(key, weight);
}
}
return Array.from(keywordMap.entries()).map(([keyword, weight]) => ({ keyword, weight }));
}
public rankItems(items: string[] | { linkText: string, href: string }[], keywords: string[] | Array<{ keyword: string; weight: number }>, fetchedLinks?: Set<string>): any[] {
const normalizedItems = items.map((item, index) => {
const isString = typeof item === 'string';
const url = isString ? new URL(item).pathname : new URL(item.href).pathname;
const text = isString ? '' : item.linkText;
const searchableContent = `${url} ${text}`.toLowerCase();
return {
url,
original: item,
searchableContent,
index
};
});
let itemsToRank = fetchedLinks
? normalizedItems.filter(item => {
const href = typeof item.original === 'string' ? item.original : item.original.href;
return !fetchedLinks.has(href);
})
: normalizedItems;
const itemsToRankFiltered = itemsToRank.filter(item => {
try {
for (const excludedKeyword of PlaywrightFetchingStrategy.EXCLUDED_LINK_KEYWORDS) {
if (item.url.includes(excludedKeyword)) {
return false;
}
}
return true;
} catch {
return false;
}
});
if (itemsToRankFiltered.length > 0) {
itemsToRank = itemsToRankFiltered;
}
if (!keywords || keywords.length === 0) {
return itemsToRank.map(item => item.original);
}
// Normalize keywords to weighted format
const weightedKeywords: Array<{ keyword: string; weight: number }> =
typeof keywords[0] === 'string'
? (keywords as string[]).map(k => ({ keyword: k, weight: 1 }))
: keywords as Array<{ keyword: string; weight: number }>;
const scored = itemsToRank.map(item => {
let matchScore = 0;
const content = item.searchableContent;
for (const { keyword, weight } of weightedKeywords) {
const keywordLower = keyword.toLowerCase();
const wordBoundaryRegex = new RegExp(`\\b${keywordLower}\\b`, 'g');
const exactMatches = (content.match(wordBoundaryRegex) || []).length;
// Weighted scoring: exact matches get 3x base, partial matches get 1x base
matchScore += exactMatches * 3 * weight;
if (exactMatches === 0 && content.includes(keywordLower)) {
matchScore += 1 * weight;
}
}
// Smarter length penalty: use log scale with a cap to prevent over-penalizing long URLs
const MIN_LENGTH = 10;
const urlLength = Math.max(item.url.length, MIN_LENGTH);
const lengthPenalty = Math.log10(urlLength);
const score = matchScore / lengthPenalty;
return {
item: item.original,
score,
hasMatch: matchScore > 0
};
});
return scored
.sort((a, b) => {
if (a.hasMatch !== b.hasMatch) {
return a.hasMatch ? -1 : 1;
}
return b.score - a.score;
})
.map(s => s.item);
}
private async crawlWithLinks(startUrl: string, config: DocumentationConfig, metadata: Metadata): Promise<string> {
const visitedUrls = new Set<string>();
const linkQueue: { linkText: string, href: string }[] = [{ linkText: "documentation", href: startUrl }];
const searchKeywords = this.getMergedKeywords(config.keywords);
let aggregatedContent = "";
while (visitedUrls.size < PlaywrightFetchingStrategy.MAX_FETCHED_LINKS && linkQueue.length > 0) {
const prioritizedLinks = this.rankItems(linkQueue, searchKeywords, visitedUrls) as { linkText: string, href: string }[];
if (prioritizedLinks.length === 0) break;
const nextLink = prioritizedLinks[0];
linkQueue.splice(linkQueue.findIndex(l => l.href === nextLink.href), 1);
try {
const pageData = await this.fetchPageContentWithPlaywright(nextLink.href, config, metadata);
visitedUrls.add(nextLink.href);
if (pageData?.content) {
aggregatedContent += aggregatedContent ? `\n\n${pageData.content}` : pageData.content;
if (pageData.links) {
for (const [linkText, href] of Object.entries(pageData.links)) {
if (this.isValidDocLink(href, linkText, startUrl) &&
!visitedUrls.has(href) &&
!linkQueue.some(l => l.href === href)) {
linkQueue.push({ linkText, href });
}
}
}
}
} catch (error) {
logMessage('warn', `Failed to fetch ${nextLink.href}: ${error?.message}`, metadata);
}
}
return aggregatedContent;
}
async legacyTryFetch(config: DocumentationConfig, metadata: Metadata): Promise<string | null> {
if (!config?.documentationUrl) return null;
return this.crawlWithLinks(config.documentationUrl, config, metadata);
}
private isValidDocLink(href: string, linkText: string, baseUrl: string): boolean {
if (!linkText || !href) return false;
const hrefLower = new URL(href).pathname.toLowerCase();
if (PlaywrightFetchingStrategy.EXCLUDED_LINK_KEYWORDS.some(kw => hrefLower.includes(kw))) {
return false;
}
try {
const base = new URL(baseUrl);
const link = new URL(href);
if (link.hostname !== base.hostname) return false;
const baseParts = base.pathname.split('/').filter(p => p);
if (baseParts.length >= 2) {
const requiredPath = '/' + baseParts.slice(0, -1).join('/');
if (!link.pathname.startsWith(requiredPath)) return false;
}
return true;
} catch {
return false;
}
}
}