import type { ProgressCallback } from "../../types";
import type { AppConfig } from "../../utils/config";
import { ScraperError } from "../../utils/errors";
import { logger } from "../../utils/logger";
import { MimeTypeUtils } from "../../utils/mimeTypeUtils";
import { HttpFetcher } from "../fetcher";
import { FetchStatus } from "../fetcher/types";
import type { QueueItem, ScraperOptions, ScraperProgressEvent } from "../types";
import { shouldIncludeUrl } from "../utils/patternMatcher";
import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy";
import type {
GitHubRepoInfo,
GitHubTreeItem,
GitHubTreeResponse,
} from "./GitHubRepoProcessor";
import { GitHubRepoProcessor } from "./GitHubRepoProcessor";
import { GitHubWikiProcessor } from "./GitHubWikiProcessor";
import { resolveGitHubAuth } from "./github-auth";
/**
* GitHubScraperStrategy is a discovery strategy that orchestrates the scraping of both
* GitHub repository code and wiki pages. When given a GitHub repository URL, it will:
*
* 1. Attempt to scrape the repository's wiki pages using GitHubWikiProcessor (prioritized)
* 2. Discover all repository files using the GitHub Tree API
* 3. Create HTTPS blob URLs for each file, which are stored in the database
* 4. Process blob URLs directly with GitHubRepoProcessor
*
* This provides comprehensive documentation coverage by including both wiki documentation
* and source code in a single scraping job, with wikis prioritized as they typically
* contain higher-quality curated documentation.
*
* Features:
* - Handles base GitHub repository URLs (e.g., https://github.com/owner/repo)
* - Handles branch-specific URLs (e.g., https://github.com/owner/repo/tree/branch)
* - Handles single file URLs (e.g., https://github.com/owner/repo/blob/branch/path)
* - Discovers all files efficiently using GitHub's Tree API
* - Generates and processes user-friendly HTTPS blob URLs throughout
* - Prioritizes wiki content over repository files for better documentation quality
* - Respects maxPages limit across both scraping phases to prevent exceeding quotas
* - Automatically discovers and scrapes both wiki and code content
* - Graceful handling when wikis don't exist or are inaccessible
*/
export class GitHubScraperStrategy extends BaseScraperStrategy {
  /** Whitelist of text-based file extensions eligible for indexing. */
  private static readonly TEXT_EXTENSIONS = [
    ".md",
    ".mdx",
    ".txt",
    ".rst",
    ".adoc",
    ".asciidoc",
    ".html",
    ".htm",
    ".xml",
    ".css",
    ".scss",
    ".sass",
    ".less",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".py",
    ".java",
    ".c",
    ".cpp",
    ".cc",
    ".cxx",
    ".h",
    ".hpp",
    ".cs",
    ".go",
    ".rs",
    ".rb",
    ".php",
    ".swift",
    ".kt",
    ".scala",
    ".clj",
    ".cljs",
    ".hs",
    ".elm",
    ".dart",
    ".r",
    ".m",
    ".mm",
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    ".cfg",
    ".conf",
    ".properties",
    ".env",
    ".gitignore",
    ".dockerignore",
    ".gitattributes",
    ".editorconfig",
    ".gradle",
    ".pom",
    ".sbt",
    ".maven",
    ".cmake",
    ".make",
    ".dockerfile",
    ".mod",
    ".sum",
    ".sql",
    ".graphql",
    ".gql",
    ".proto",
    ".thrift",
    ".avro",
    ".csv",
    ".tsv",
    ".log",
  ];

  /** Document extensions supported by DocumentPipeline. */
  private static readonly DOCUMENT_EXTENSIONS = [
    ".pdf",
    ".docx",
    ".xlsx",
    ".pptx",
    ".ipynb",
  ];

  /** Well-known extensionless or dot-prefixed filenames that are conventionally text. */
  private static readonly COMMON_TEXT_FILES = [
    "readme",
    "license",
    "changelog",
    "contributing",
    "authors",
    "maintainers",
    "dockerfile",
    "makefile",
    "rakefile",
    "gemfile",
    "podfile",
    "cartfile",
    "brewfile",
    "procfile",
    "vagrantfile",
    "gulpfile",
    "gruntfile",
    ".prettierrc",
    ".eslintrc",
    ".babelrc",
    ".nvmrc",
    ".npmrc",
  ];

  private readonly httpFetcher: HttpFetcher;
  private readonly wikiProcessor: GitHubWikiProcessor;
  private readonly repoProcessor: GitHubRepoProcessor;
  // Resolved auth headers are cached for the duration of a scrape() run and
  // keyed on explicit headers + token env vars so a changed token invalidates them.
  private resolvedAuthHeaders?: Record<string, string>;
  private resolvedAuthKey?: string;

  constructor(config: AppConfig) {
    super(config);
    this.httpFetcher = new HttpFetcher(config.scraper);
    this.wikiProcessor = new GitHubWikiProcessor(config);
    this.repoProcessor = new GitHubRepoProcessor(config);
  }

  /**
   * Determines whether this strategy can process the given URL.
   *
   * Accepts legacy `github-file://` URLs (handled only to mark them as
   * deleted), plus github.com base repository URLs, `/tree/<branch>` URLs,
   * and `/blob/<branch>/<path>` URLs.
   */
  canHandle(url: string): boolean {
    // Handle legacy github-file:// protocol URLs (no longer supported)
    // These will be processed and marked as NOT_FOUND to trigger cleanup
    if (url.startsWith("github-file://")) {
      return true;
    }
    try {
      const { hostname, pathname } = new URL(url);
      // Only github.com repository URLs (and the www alias) are supported
      if (!["github.com", "www.github.com"].includes(hostname)) {
        return false;
      }
      return (
        // Base repository URLs (owner/repo)
        /^\/[^/]+\/[^/]+\/?$/.test(pathname) ||
        // Tree URLs (owner/repo/tree/branch/...)
        /^\/[^/]+\/[^/]+\/tree\//.test(pathname) ||
        // Blob URLs (owner/repo/blob/branch/...)
        /^\/[^/]+\/[^/]+\/blob\//.test(pathname)
      );
    } catch {
      // Not a parseable URL at all
      return false;
    }
  }

  /**
   * Parses a GitHub URL to extract repository information.
   *
   * @returns owner/repo plus, when present, the branch and either a `subPath`
   *   (for /tree/ URLs) or a `filePath` with `isBlob` set (for /blob/ URLs).
   * @throws Error when the URL does not contain an owner/repo path.
   */
  private parseGitHubUrl(
    url: string,
  ): GitHubRepoInfo & { isBlob?: boolean; filePath?: string } {
    const parsedUrl = new URL(url);
    // Extract /<org>/<repo> from github.com/<org>/<repo>/...
    const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)/);
    if (!match) {
      throw new Error(`Invalid GitHub repository URL: ${url}`);
    }
    const [, owner, repo] = match;
    const segments = parsedUrl.pathname.split("/").filter(Boolean);
    // Handle /blob/<branch>/<filePath> URLs for single file indexing
    if (segments.length >= 4 && segments[2] === "blob") {
      const branch = segments[3];
      const filePath = segments.length > 4 ? segments.slice(4).join("/") : undefined;
      return { owner, repo, branch, filePath, isBlob: true };
    }
    // Handle /tree/<branch>[/<subPath>] URLs for scoped repository scrapes
    if (segments.length >= 4 && segments[2] === "tree") {
      const branch = segments[3];
      const subPath = segments.length > 4 ? segments.slice(4).join("/") : undefined;
      return { owner, repo, branch, subPath };
    }
    // Base repository URL
    return { owner, repo };
  }

  /**
   * Builds a cache key for resolved auth headers from the explicit headers
   * and the GitHub token environment variables, so a token change after a
   * previous resolution is detected.
   */
  private buildAuthCacheKey(explicitHeaders?: Record<string, string>): string {
    const normalizedHeaders = explicitHeaders
      ? Object.keys(explicitHeaders)
          .sort()
          .map((key) => [key, explicitHeaders[key]])
      : [];
    const envKey = `${process.env.GITHUB_TOKEN ?? ""}|${process.env.GH_TOKEN ?? ""}`;
    return JSON.stringify({ headers: normalizedHeaders, env: envKey });
  }

  /**
   * Returns auth headers for GitHub API requests, resolving them at most once
   * per distinct (explicit headers, token env) combination.
   */
  private async getResolvedAuthHeaders(
    explicitHeaders?: Record<string, string>,
  ): Promise<Record<string, string>> {
    const cacheKey = this.buildAuthCacheKey(explicitHeaders);
    if (this.resolvedAuthHeaders && this.resolvedAuthKey === cacheKey) {
      return this.resolvedAuthHeaders;
    }
    const resolved = await resolveGitHubAuth(explicitHeaders);
    this.resolvedAuthHeaders = resolved;
    this.resolvedAuthKey = cacheKey;
    return resolved;
  }

  /**
   * Fetches a GitHub API URL, translating 401/403 failures into
   * user-friendly ScraperErrors that point at token configuration.
   */
  private async fetchGitHubApi(
    url: string,
    owner: string,
    repo: string,
    headers?: Record<string, string>,
    signal?: AbortSignal,
  ): Promise<Awaited<ReturnType<HttpFetcher["fetch"]>>> {
    try {
      return await this.httpFetcher.fetch(url, { signal, headers });
    } catch (error) {
      // Convert HTTP auth errors to user-friendly messages
      if (error instanceof ScraperError) {
        if (error.message.includes("401")) {
          throw new ScraperError(
            `GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
            false,
            error,
          );
        }
        if (error.message.includes("403")) {
          throw new ScraperError(
            `GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
            false,
            error,
          );
        }
      }
      throw error;
    }
  }

  /**
   * Throws a user-friendly error when a fetch reported NOT_FOUND, which for
   * the GitHub API means the repository is private or does not exist.
   */
  private assertRepoAccessible(
    result: { status: FetchStatus },
    owner: string,
    repo: string,
  ): void {
    if (result.status === FetchStatus.NOT_FOUND) {
      throw new ScraperError(
        `Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
        false,
      );
    }
  }

  /**
   * Determines the repository's default branch via the GitHub repos API.
   * Falls back to "main" only when the API response cannot be parsed or
   * lacks a default_branch field.
   */
  private async resolveDefaultBranch(
    owner: string,
    repo: string,
    headers?: Record<string, string>,
    signal?: AbortSignal,
  ): Promise<string> {
    const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
    logger.debug(`Fetching repository info: ${repoUrl}`);
    const repoContent = await this.fetchGitHubApi(repoUrl, owner, repo, headers, signal);
    // Check for NOT_FOUND status before parsing - repo is inaccessible (private or doesn't exist)
    this.assertRepoAccessible(repoContent, owner, repo);
    try {
      const content =
        typeof repoContent.content === "string"
          ? repoContent.content
          : repoContent.content.toString("utf-8");
      const repoData = JSON.parse(content) as { default_branch?: string };
      const defaultBranch =
        typeof repoData.default_branch === "string"
          ? repoData.default_branch.trim()
          : "";
      if (!defaultBranch) {
        logger.warn(
          `⚠️ Repository info missing default_branch for ${owner}/${repo}, using 'main'`,
        );
        return "main";
      }
      logger.debug(`Using default branch: ${defaultBranch}`);
      return defaultBranch;
    } catch (parseError) {
      // Only fall back to "main" for JSON parse errors (e.g. unexpected API response format)
      logger.warn(`⚠️ Could not parse repository info, using 'main': ${parseError}`);
      return "main";
    }
  }

  /**
   * Fetches the repository tree structure from GitHub API.
   *
   * @returns the recursive tree listing plus the branch it was resolved
   *   against (the requested branch, or the repository's default branch).
   */
  private async fetchRepositoryTree(
    repoInfo: GitHubRepoInfo,
    headers?: Record<string, string>,
    signal?: AbortSignal,
  ): Promise<{ tree: GitHubTreeResponse; resolvedBranch: string }> {
    const { owner, repo, branch } = repoInfo;
    // If no branch specified, resolve the default branch first
    const targetBranch =
      branch ?? (await this.resolveDefaultBranch(owner, repo, headers, signal));
    const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
    logger.debug(`Fetching repository tree: ${treeUrl}`);
    const rawContent = await this.fetchGitHubApi(treeUrl, owner, repo, headers, signal);
    // Check for NOT_FOUND status before parsing - this indicates repo is inaccessible
    this.assertRepoAccessible(rawContent, owner, repo);
    const content =
      typeof rawContent.content === "string"
        ? rawContent.content
        : rawContent.content.toString("utf-8");
    // Parse JSON with proper error handling
    let treeData: GitHubTreeResponse;
    try {
      treeData = JSON.parse(content) as GitHubTreeResponse;
    } catch (parseError) {
      throw new ScraperError(
        `Failed to parse GitHub API response for "${owner}/${repo}". The repository may be inaccessible or the API returned an unexpected response.`,
        false,
        parseError instanceof Error ? parseError : undefined,
      );
    }
    if (treeData.truncated) {
      // The trees API caps recursive listings; warn rather than fail
      logger.warn(
        `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`,
      );
    }
    return { tree: treeData, resolvedBranch: targetBranch };
  }

  /**
   * Determines if a file should be processed based on its path and type.
   * Accepts blobs that look like text (by extension, well-known filename,
   * compound name like *.config.*, or detected text MIME type) and that
   * match the scrape's include/exclude patterns.
   */
  private shouldProcessFile(item: GitHubTreeItem, options: ScraperOptions): boolean {
    // Only blobs are files; trees (directories) and commits are skipped
    if (item.type !== "blob") {
      return false;
    }
    const path = item.path;
    const pathLower = path.toLowerCase();
    const hasTextExtension = GitHubScraperStrategy.TEXT_EXTENSIONS.some((ext) =>
      pathLower.endsWith(ext),
    );
    const hasDocumentExtension = GitHubScraperStrategy.DOCUMENT_EXTENSIONS.some(
      (ext) => pathLower.endsWith(ext),
    );
    // Compound names like ".env.local", "vite.config.ts", or "yarn.lock"
    const hasCompoundExtension =
      pathLower.includes(".env.") ||
      pathLower.endsWith(".env") ||
      pathLower.includes(".config.") ||
      pathLower.includes(".lock");
    const fileName = path.split("/").pop() || "";
    const fileNameLower = fileName.toLowerCase();
    // Match the bare name or the name followed by any extension
    // (e.g. "readme", "readme.md", ".eslintrc", ".eslintrc.json")
    const isCommonTextFile = GitHubScraperStrategy.COMMON_TEXT_FILES.some(
      (name) => fileNameLower === name || fileNameLower.startsWith(`${name}.`),
    );
    // If file passes known checks, include it (subject to user patterns)
    if (
      hasTextExtension ||
      hasDocumentExtension ||
      hasCompoundExtension ||
      isCommonTextFile
    ) {
      return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns);
    }
    // Fallback: check if unknown extension has text/* MIME type using MimeTypeUtils
    const mimeType = MimeTypeUtils.detectMimeTypeFromPath(path);
    if (mimeType?.startsWith("text/")) {
      logger.debug(`Including file with text MIME type: ${path} (${mimeType})`);
      return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns);
    }
    // Not a text file
    return false;
  }

  /**
   * Checks if a path is within the specified subpath. An empty or missing
   * subpath matches everything; otherwise the path must equal the subpath
   * or live underneath it.
   */
  private isWithinSubPath(path: string, subPath?: string): boolean {
    if (!subPath) {
      return true;
    }
    const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, "");
    if (trimmedSubPath.length === 0) {
      return true;
    }
    const normalizedPath = path.replace(/^\/+/, "").replace(/\/+$/, "");
    if (normalizedPath === trimmedSubPath) {
      return true;
    }
    return normalizedPath.startsWith(`${trimmedSubPath}/`);
  }

  /**
   * Processes a single queue item:
   * - legacy github-file:// URLs are reported NOT_FOUND to trigger cleanup;
   * - wiki URLs are delegated to the wiki processor;
   * - the root URL (depth 0) performs discovery via the Tree API;
   * - blob URLs at depth > 0 are fetched by the repo processor.
   */
  async processItem(
    item: QueueItem,
    options: ScraperOptions,
    signal?: AbortSignal,
  ): Promise<ProcessItemResult> {
    // Handle legacy github-file:// URLs - treat as deleted/not found
    if (item.url.startsWith("github-file://")) {
      logger.info(
        `🗑️ Legacy github-file:// URL detected, marking as deleted: ${item.url}`,
      );
      return {
        url: item.url,
        links: [],
        status: FetchStatus.NOT_FOUND,
      };
    }
    const headers = await this.getResolvedAuthHeaders(options.headers);
    // Classify the URL up front. Only URL *parsing* is guarded here so that
    // genuine processor failures propagate to the base scraper instead of
    // being silently swallowed (previously the processor calls sat inside
    // the same try/catch as the parse).
    let pathname: string | undefined;
    try {
      pathname = new URL(item.url).pathname;
    } catch (error) {
      // Unparseable URL: depth-0 discovery below works off options.url and
      // can still proceed; at deeper levels there is nothing to fetch.
      if (item.depth > 0) {
        logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
        return { url: item.url, links: [], status: FetchStatus.SUCCESS };
      }
    }
    // Delegate to wiki processor for wiki URLs
    // Use precise pattern matching: /owner/repo/wiki or /owner/repo/wiki/
    if (pathname !== undefined && /^\/[^/]+\/[^/]+\/wiki($|\/)/.test(pathname)) {
      return await this.wikiProcessor.process(item, options, headers, signal);
    }
    // For the main repository URL (depth 0), perform discovery
    // This includes blob URLs at depth 0, which should return themselves as discovered links
    if (item.depth === 0) {
      const repoInfo = this.parseGitHubUrl(options.url);
      const { owner, repo } = repoInfo;
      logger.debug(`Discovering GitHub repository ${owner}/${repo}`);
      const discoveredLinks: string[] = [];
      // Handle single file (blob) URLs - strict scoping: index ONLY the file
      if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) {
        const { branch = "main", filePath } = repoInfo;
        logger.debug(
          `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`,
        );
        // Generate HTTPS blob URL for storage
        discoveredLinks.push(
          `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`,
        );
        return {
          url: item.url,
          links: discoveredLinks,
          status: FetchStatus.SUCCESS,
        };
      }
      // Discover wiki URL for full repo scrapes (processed later via the wiki branch above)
      const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
      discoveredLinks.push(wikiUrl);
      logger.debug(`Discovered wiki URL: ${wikiUrl}`);
      // Discover all files in the repository via the Tree API
      const { tree, resolvedBranch } = await this.fetchRepositoryTree(
        repoInfo,
        headers,
        signal,
      );
      const fileItems = tree.tree
        .filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath))
        .filter((treeItem) => this.shouldProcessFile(treeItem, options));
      logger.debug(
        `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`,
      );
      // Create HTTPS blob URLs for storage in database
      // These are user-friendly, clickable URLs that work outside the system
      const fileUrls = fileItems.map(
        (treeItem) =>
          `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`,
      );
      discoveredLinks.push(...fileUrls);
      logger.debug(
        `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`,
      );
      return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS };
    }
    // Handle HTTPS blob URLs at depth > 0 (from database during refresh or discovered files)
    // Process blob URLs directly - fetch content and return empty links
    if (pathname !== undefined && /^\/[^/]+\/[^/]+\/blob\//.test(pathname)) {
      logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
      return await this.repoProcessor.process(item, options, headers, signal);
    }
    // For any other URLs at non-zero depth, return empty (shouldn't happen in practice)
    logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`);
    return { url: item.url, links: [], status: FetchStatus.SUCCESS };
  }

  /**
   * Runs a full scrape of a GitHub repository (wiki + files), clearing the
   * cached auth headers when the run finishes.
   *
   * @throws Error when the URL is not a github.com URL.
   */
  async scrape(
    options: ScraperOptions,
    progressCallback: ProgressCallback<ScraperProgressEvent>,
    signal?: AbortSignal,
  ): Promise<void> {
    const url = new URL(options.url);
    // Exact hostname check, consistent with canHandle(); a substring test
    // would also accept hosts like "github.com.evil.example".
    if (!["github.com", "www.github.com"].includes(url.hostname)) {
      throw new Error("URL must be a GitHub URL");
    }
    // Warm the auth-header cache before any items are processed
    await this.getResolvedAuthHeaders(options.headers);
    // Use the base class implementation which handles initialQueue properly
    // The processItem method will discover all wiki and repo file URLs
    // The base scraper will automatically deduplicate URLs from initialQueue
    try {
      await super.scrape(options, progressCallback, signal);
    } finally {
      // Drop cached credentials so a later scrape re-resolves fresh tokens
      this.resolvedAuthHeaders = undefined;
      this.resolvedAuthKey = undefined;
    }
  }

  /** Releases resources held by the wiki and repo processors. */
  async cleanup(): Promise<void> {
    await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
  }
}