docs-mcp-server

GitHubRepoScraperStrategy.ts
import type { Document, ProgressCallback } from "../../types";
import { logger } from "../../utils/logger";
import { MimeTypeUtils } from "../../utils/mimeTypeUtils";
import { HttpFetcher } from "../fetcher";
import type { RawContent } from "../fetcher/types";
import { PipelineFactory } from "../pipelines/PipelineFactory";
import type { ContentPipeline } from "../pipelines/types";
import { ScrapeMode, type ScraperOptions, type ScraperProgress } from "../types";
import { shouldIncludeUrl } from "../utils/patternMatcher";
import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy";

interface GitHubRepoInfo {
  owner: string;
  repo: string;
  branch?: string;
  subPath?: string;
}

interface GitHubTreeItem {
  path: string;
  type: "blob" | "tree";
  sha: string;
  size?: number;
  url: string;
}

interface GitHubTreeResponse {
  sha: string;
  url: string;
  tree: GitHubTreeItem[];
  truncated: boolean;
}

/**
 * GitHubRepoScraperStrategy crawls repositories natively by using GitHub's tree API
 * to discover the repository structure and fetching raw file contents. It treats a
 * repository as a file system rather than a set of web pages.
 *
 * Features:
 * - Uses the GitHub tree API for efficient repository structure discovery
 * - Fetches raw file contents from raw.githubusercontent.com
 * - Processes all text files (source code, markdown, documentation, etc.)
 * - Supports branch-specific crawling (defaults to the repository's default branch)
 * - Automatically detects the default branch when no branch is specified
 * - Respects repository subpath URLs (e.g., /tree/<branch>/docs) by limiting indexed files
 * - Filters out binary files and processes only text-based content
 *
 * Note: Wiki pages are not currently supported in this native mode. For wiki access,
 * consider using the web scraping approach or a separate scraping job.
 */
export class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  private readonly httpFetcher = new HttpFetcher();
  private readonly pipelines: ContentPipeline[];
  private resolvedBranch?: string; // Cache the resolved default branch

  constructor() {
    super();
    this.pipelines = PipelineFactory.createStandardPipelines();
  }

  canHandle(url: string): boolean {
    const { hostname } = new URL(url);
    return ["github.com", "www.github.com"].includes(hostname);
  }

  /**
   * Overrides shouldProcessUrl to handle github-file:// URLs specially.
   * These URLs bypass scope checking because they are internal file references.
   */
  protected shouldProcessUrl(url: string, options: ScraperOptions): boolean {
    // For github-file:// URLs, only apply include/exclude patterns and skip scope checking
    if (url.startsWith("github-file://")) {
      const filePath = url.replace("github-file://", "");
      return shouldIncludeUrl(filePath, options.includePatterns, options.excludePatterns);
    }
    // For regular URLs, use the base implementation
    return super.shouldProcessUrl(url, options);
  }

  /**
   * Parses a GitHub URL to extract repository information.
   */
  parseGitHubUrl(url: string): GitHubRepoInfo & { isBlob?: boolean; filePath?: string } {
    const parsedUrl = new URL(url);

    // Extract /<org>/<repo> from github.com/<org>/<repo>/...
    const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)/);
    if (!match) {
      throw new Error(`Invalid GitHub repository URL: ${url}`);
    }
    const [, owner, repo] = match;

    // Extract branch and optional subpath from URLs like /tree/<branch>/<subPath>
    const segments = parsedUrl.pathname.split("/").filter(Boolean);

    // Handle /blob/ URLs for single file indexing
    if (segments.length >= 4 && segments[2] === "blob") {
      const branch = segments[3];
      const filePath = segments.length > 4 ? segments.slice(4).join("/") : undefined;
      return { owner, repo, branch, filePath, isBlob: true };
    }

    // Only handle URLs of the form /owner/repo/tree/branch/subPath
    if (segments.length < 4 || segments[2] !== "tree") {
      // Unsupported format (missing branch, or not a tree/blob URL)
      return { owner, repo };
    }

    const branch = segments[3];
    const subPath = segments.length > 4 ? segments.slice(4).join("/") : undefined;
    return { owner, repo, branch, subPath };
  }
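  // Illustrative mapping of URL shapes to parse results (hypothetical repos),
  // mirroring the branches above:
  //   https://github.com/acme/widgets                      -> { owner: "acme", repo: "widgets" }
  //   https://github.com/acme/widgets/tree/main/docs       -> { owner: "acme", repo: "widgets", branch: "main", subPath: "docs" }
  //   https://github.com/acme/widgets/blob/main/README.md  -> { owner: "acme", repo: "widgets", branch: "main", filePath: "README.md", isBlob: true }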
  /**
   * Fetches the repository tree structure from the GitHub API.
   * Resolves the repository's default branch first if no branch is specified.
   */
  async fetchRepositoryTree(
    repoInfo: GitHubRepoInfo,
    signal?: AbortSignal,
  ): Promise<{ tree: GitHubTreeResponse; resolvedBranch: string }> {
    const { owner, repo, branch } = repoInfo;

    // If no branch is specified, fetch the default branch first
    let targetBranch = branch;
    if (!targetBranch) {
      try {
        // Get repository information to find the default branch
        const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
        logger.debug(`Fetching repository info: ${repoUrl}`);
        const repoContent = await this.httpFetcher.fetch(repoUrl, { signal });
        const content =
          typeof repoContent.content === "string"
            ? repoContent.content
            : repoContent.content.toString("utf-8");
        const repoData = JSON.parse(content) as { default_branch: string };
        targetBranch = repoData.default_branch;
        logger.debug(`Using default branch: ${targetBranch}`);
      } catch (error) {
        logger.warn(`⚠️ Could not fetch default branch, using 'main': ${error}`);
        targetBranch = "main";
      }
    }

    // Cache the resolved branch for file fetching
    this.resolvedBranch = targetBranch;

    const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
    logger.debug(`Fetching repository tree: ${treeUrl}`);
    const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
    const content =
      typeof rawContent.content === "string"
        ? rawContent.content
        : rawContent.content.toString("utf-8");
    const treeData = JSON.parse(content) as GitHubTreeResponse;

    if (treeData.truncated) {
      logger.warn(
        `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`,
      );
    }

    return { tree: treeData, resolvedBranch: targetBranch };
  }
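  // Example request flow (hypothetical repo "acme/widgets", no branch given):
  //   GET https://api.github.com/repos/acme/widgets                      -> { default_branch: "main", ... }
  //   GET https://api.github.com/repos/acme/widgets/git/trees/main?recursive=1
  // The second response lists every blob and tree entry reachable from that
  // branch, subject to the truncation limit handled above.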
  /**
   * Determines if a file should be processed based on its path and type.
   */
  private shouldProcessFile(item: GitHubTreeItem, options: ScraperOptions): boolean {
    // Only process blob (file) items, not trees (directories)
    if (item.type !== "blob") {
      return false;
    }

    const path = item.path;

    // Whitelist of text-based file extensions that we can process
    const textExtensions = [
      // Documentation
      ".md", ".mdx", ".txt", ".rst", ".adoc", ".asciidoc",
      // Web technologies
      ".html", ".htm", ".xml", ".css", ".scss", ".sass", ".less",
      // Programming languages
      ".js", ".jsx", ".ts", ".tsx", ".py", ".java", ".c", ".cpp", ".cc", ".cxx",
      ".h", ".hpp", ".cs", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala",
      ".clj", ".cljs", ".hs", ".elm", ".dart", ".r", ".m", ".mm",
      ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd",
      // Configuration and data
      ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", ".properties",
      ".env", ".gitignore", ".dockerignore", ".gitattributes", ".editorconfig",
      // Build and package management
      ".gradle", ".pom", ".sbt", ".maven", ".cmake", ".make", ".dockerfile",
      ".mod", // Go modules (go.mod)
      ".sum", // Go checksums (go.sum)
      // Other text formats
      ".sql", ".graphql", ".gql", ".proto", ".thrift", ".avro", ".csv", ".tsv", ".log",
    ];

    const pathLower = path.toLowerCase();

    // Check for known text extensions
    const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));

    // Check for compound extensions and special cases
    const hasCompoundExtension =
      pathLower.includes(".env.") || // .env.example, .env.local, etc.
      pathLower.endsWith(".env") ||
      pathLower.includes(".config.") || // webpack.config.js, etc.
      pathLower.includes(".lock"); // package-lock.json, etc.

    // Also include files without extensions that are commonly text files
    const fileName = path.split("/").pop() || "";
    const fileNameLower = fileName.toLowerCase();
    const commonTextFiles = [
      // Documentation files without extensions
      "readme", "license", "changelog", "contributing", "authors", "maintainers",
      // Build files without extensions
      "dockerfile", "makefile", "rakefile", "gemfile", "podfile", "cartfile",
      "brewfile", "procfile", "vagrantfile", "gulpfile", "gruntfile",
      // Configuration files (dotfiles)
      ".prettierrc", ".eslintrc", ".babelrc", ".nvmrc", ".npmrc",
    ];

    const isCommonTextFile = commonTextFiles.some((name) => {
      if (name.startsWith(".")) {
        // For dotfiles, match exactly or with an additional extension (e.g., .prettierrc.js)
        return fileNameLower === name || fileNameLower.startsWith(`${name}.`);
      }
      // For regular files, match exactly or with an extension
      return fileNameLower === name || fileNameLower.startsWith(`${name}.`);
    });

    // Process the file if it has a text extension, compound extension, or is a common text file
    if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) {
      return false;
    }

    // Apply user-defined include/exclude patterns (use the file path directly)
    return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns);
  }
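  // Examples of how the checks above combine (hypothetical paths):
  //   "docs/guide.md"     -> text extension match      -> processed
  //   "src/.env.local"    -> ".env." compound match    -> processed
  //   "Dockerfile"        -> common text file match    -> processed
  //   "assets/logo.png"   -> no match                  -> skipped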
  /**
   * Fetches the raw content of a file from GitHub.
   */
  async fetchFileContent(
    repoInfo: GitHubRepoInfo,
    filePath: string,
    signal?: AbortSignal,
  ): Promise<RawContent> {
    const { owner, repo } = repoInfo;
    // Use the resolved branch if available, otherwise the provided branch, defaulting to main
    const branch = this.resolvedBranch || repoInfo.branch || "main";
    const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;

    const rawContent = await this.httpFetcher.fetch(rawUrl, { signal });

    // Override GitHub's generic 'text/plain' MIME type with file extension-based detection
    const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
    if (detectedMimeType && rawContent.mimeType === "text/plain") {
      return {
        ...rawContent,
        mimeType: detectedMimeType,
      };
    }

    return rawContent;
  }
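  // Example (hypothetical): owner "acme", repo "widgets", resolved branch "main",
  // and filePath "docs/guide.md" fetch
  //   https://raw.githubusercontent.com/acme/widgets/main/docs/guide.md
  // Since raw.githubusercontent.com typically serves markdown as text/plain, the
  // MIME type would be replaced with whatever extension-based type
  // MimeTypeUtils.detectMimeTypeFromPath reports for "docs/guide.md".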
  protected async processItem(
    item: QueueItem,
    options: ScraperOptions,
    _progressCallback?: ProgressCallback<ScraperProgress>,
    signal?: AbortSignal,
  ): Promise<{ document?: Document; links?: string[] }> {
    // Parse the URL to get repository information
    const repoInfo = this.parseGitHubUrl(options.url);

    // For the initial item, handle blob URLs differently than tree URLs
    if (item.depth === 0) {
      // Handle single file (blob) URLs
      if ("isBlob" in repoInfo && repoInfo.isBlob) {
        if (repoInfo.filePath) {
          logger.info(
            `📄 Processing single file: ${repoInfo.owner}/${repoInfo.repo}/${repoInfo.filePath}`,
          );
          // Queue the single file for processing
          return { links: [`github-file://${repoInfo.filePath}`] };
        }
        // Blob URL without a file path - nothing to process
        logger.warn(`⚠️ Blob URL without file path: ${options.url}. No files to process.`);
        return { links: [] };
      }

      // Handle repository tree crawling
      logger.info(
        `🗂️ Discovering repository structure for ${repoInfo.owner}/${repoInfo.repo}`,
      );
      const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
      const fileItems = tree.tree
        .filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath))
        .filter((treeItem) => this.shouldProcessFile(treeItem, options));

      logger.info(
        `📁 Found ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`,
      );

      // Convert tree items to URLs for the queue
      const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`);
      return { links };
    }

    // Process individual files
    if (item.url.startsWith("github-file://")) {
      const filePath = item.url.replace("github-file://", "");
      logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);

      const rawContent = await this.fetchFileContent(repoInfo, filePath, signal);

      // Process content through the appropriate pipeline
      let processed: Awaited<ReturnType<ContentPipeline["process"]>> | undefined;
      for (const pipeline of this.pipelines) {
        if (pipeline.canProcess(rawContent)) {
          logger.debug(
            `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`,
          );
          // Force 'fetch' mode for GitHub to avoid unnecessary Playwright usage on raw content.
          // GitHub raw files (e.g., HTML files) don't have their dependencies available at the
          // raw.githubusercontent.com domain, so rendering them in a browser would be broken
          // and provide no additional value over direct HTML parsing with Cheerio.
          const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
          processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
          break;
        }
      }

      if (!processed) {
        logger.warn(
          `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`,
        );
        return { document: undefined, links: [] };
      }

      for (const err of processed.errors) {
        logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
      }

      // Create a document with GitHub-specific metadata
      const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`;

      // Use the filename as a fallback if the title is empty or not a string
      const processedTitle = processed.metadata.title;
      const hasValidTitle = typeof processedTitle === "string" && processedTitle.trim() !== "";
      const fallbackTitle = filePath.split("/").pop() || "Untitled";

      return {
        document: {
          content: typeof processed.textContent === "string" ? processed.textContent : "",
          metadata: {
            url: githubUrl,
            title: hasValidTitle ? processedTitle : fallbackTitle,
            library: options.library,
            version: options.version,
          },
          contentType: rawContent.mimeType, // Preserve the detected MIME type
        } satisfies Document,
        links: [], // Always return empty links array for individual files
      };
    }

    return { document: undefined, links: [] };
  }

  /**
   * Normalizes a path by removing leading and trailing slashes.
   */
  private normalizePath(path: string): string {
    return path.replace(/^\/+/, "").replace(/\/+$/, "");
  }

  private isWithinSubPath(path: string, subPath?: string): boolean {
    if (!subPath) {
      return true;
    }

    const trimmedSubPath = this.normalizePath(subPath);
    if (trimmedSubPath.length === 0) {
      return true;
    }

    const normalizedPath = this.normalizePath(path);
    if (normalizedPath === trimmedSubPath) {
      return true;
    }
    return normalizedPath.startsWith(`${trimmedSubPath}/`);
  }

  async scrape(
    options: ScraperOptions,
    progressCallback: ProgressCallback<ScraperProgress>,
    signal?: AbortSignal,
  ): Promise<void> {
    // Validate that it's a GitHub URL
    const url = new URL(options.url);
    if (!url.hostname.includes("github.com")) {
      throw new Error("URL must be a GitHub URL");
    }

    return super.scrape(options, progressCallback, signal);
  }

  /**
   * Cleans up resources used by this strategy, specifically the pipeline browser instances.
   */
  async cleanup(): Promise<void> {
    await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
  }
}
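// Usage sketch (hypothetical values; assumes only the ScraperOptions fields
// referenced above and the scrape() contract inherited from BaseScraperStrategy;
// the exact shapes of ScraperOptions and ScraperProgress live in ../types):
//
//   const strategy = new GitHubRepoScraperStrategy();
//   const options = {
//     url: "https://github.com/acme/widgets/tree/main/docs",
//     library: "widgets",
//     version: "1.0.0",
//     maxPages: 500,
//   } as ScraperOptions;
//   if (strategy.canHandle(options.url)) {
//     await strategy.scrape(options, (progress) => {
//       // Inspect progress (a ScraperProgress) here, e.g. for logging.
//     });
//     await strategy.cleanup(); // Release pipeline resources when done.
//   }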
