GitHubScraperStrategy.ts (15.1 kB)
import mime from "mime";
import type { ProgressCallback } from "../../types";
import { logger } from "../../utils/logger";
import { HttpFetcher } from "../fetcher";
import { FetchStatus } from "../fetcher/types";
import type { QueueItem, ScraperOptions, ScraperProgressEvent } from "../types";
import { shouldIncludeUrl } from "../utils/patternMatcher";
import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy";
import type {
  GitHubRepoInfo,
  GitHubTreeItem,
  GitHubTreeResponse,
} from "./GitHubRepoProcessor";
import { GitHubRepoProcessor } from "./GitHubRepoProcessor";
import { GitHubWikiProcessor } from "./GitHubWikiProcessor";

/**
 * GitHubScraperStrategy is a discovery strategy that orchestrates the scraping of both
 * GitHub repository code and wiki pages. When given a GitHub repository URL, it will:
 *
 * 1. Attempt to scrape the repository's wiki pages using GitHubWikiProcessor (prioritized)
 * 2. Discover all repository files using the GitHub Tree API
 * 3. Create HTTPS blob URLs for each file, which are stored in the database
 * 4. Process blob URLs directly with GitHubRepoProcessor
 *
 * This provides comprehensive documentation coverage by including both wiki documentation
 * and source code in a single scraping job, with wikis prioritized as they typically
 * contain higher-quality curated documentation.
 *
 * Features:
 * - Handles base GitHub repository URLs (e.g., https://github.com/owner/repo)
 * - Handles branch-specific URLs (e.g., https://github.com/owner/repo/tree/branch)
 * - Handles single file URLs (e.g., https://github.com/owner/repo/blob/branch/path)
 * - Discovers all files efficiently using GitHub's Tree API
 * - Generates and processes user-friendly HTTPS blob URLs throughout
 * - Prioritizes wiki content over repository files for better documentation quality
 * - Respects maxPages limit across both scraping phases to prevent exceeding quotas
 * - Automatically discovers and scrapes both wiki and code content
 * - Graceful handling when wikis don't exist or are inaccessible
 */
export class GitHubScraperStrategy extends BaseScraperStrategy {
  private readonly httpFetcher = new HttpFetcher();
  private readonly wikiProcessor = new GitHubWikiProcessor();
  private readonly repoProcessor = new GitHubRepoProcessor();

  canHandle(url: string): boolean {
    // Handle legacy github-file:// protocol URLs (no longer supported)
    // These will be processed and marked as NOT_FOUND to trigger cleanup
    if (url.startsWith("github-file://")) {
      return true;
    }

    try {
      const parsedUrl = new URL(url);
      const { hostname, pathname } = parsedUrl;

      // Handle GitHub repository URLs
      if (!["github.com", "www.github.com"].includes(hostname)) {
        return false;
      }

      // Handle base repository URLs (owner/repo)
      const baseMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
      if (baseMatch) {
        return true;
      }

      // Handle tree URLs (owner/repo/tree/branch/...)
      const treeMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/tree\//);
      if (treeMatch) {
        return true;
      }

      // Handle blob URLs (owner/repo/blob/branch/...)
      const blobMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/blob\//);
      if (blobMatch) {
        return true;
      }

      return false;
    } catch {
      return false;
    }
  }

  /**
   * Parses a GitHub URL to extract repository information.
   */
  private parseGitHubUrl(
    url: string,
  ): GitHubRepoInfo & { isBlob?: boolean; filePath?: string } {
    const parsedUrl = new URL(url);

    // Extract /<org>/<repo> from github.com/<org>/<repo>/...
    const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)/);
    if (!match) {
      throw new Error(`Invalid GitHub repository URL: ${url}`);
    }
    const [, owner, repo] = match;

    // Extract branch and optional subpath from URLs like /tree/<branch>/<subPath>
    const segments = parsedUrl.pathname.split("/").filter(Boolean);

    // Handle /blob/ URLs for single file indexing
    if (segments.length >= 4 && segments[2] === "blob") {
      const branch = segments[3];
      const filePath = segments.length > 4 ? segments.slice(4).join("/") : undefined;
      return { owner, repo, branch, filePath, isBlob: true };
    }

    // Handle /tree/ URLs with branch and optional subpath
    if (segments.length >= 4 && segments[2] === "tree") {
      const branch = segments[3];
      const subPath = segments.length > 4 ? segments.slice(4).join("/") : undefined;
      return { owner, repo, branch, subPath };
    }

    // Base repository URL
    return { owner, repo };
  }

  /**
   * Fetches the repository tree structure from GitHub API.
   */
  private async fetchRepositoryTree(
    repoInfo: GitHubRepoInfo,
    signal?: AbortSignal,
  ): Promise<{ tree: GitHubTreeResponse; resolvedBranch: string }> {
    const { owner, repo, branch } = repoInfo;

    // If no branch specified, fetch the default branch first
    let targetBranch = branch;
    if (!targetBranch) {
      try {
        const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
        logger.debug(`Fetching repository info: ${repoUrl}`);
        const repoContent = await this.httpFetcher.fetch(repoUrl, { signal });
        const content =
          typeof repoContent.content === "string"
            ? repoContent.content
            : repoContent.content.toString("utf-8");
        const repoData = JSON.parse(content) as { default_branch: string };
        targetBranch = repoData.default_branch;
        logger.debug(`Using default branch: ${targetBranch}`);
      } catch (error) {
        logger.warn(`⚠️ Could not fetch default branch, using 'main': ${error}`);
        targetBranch = "main";
      }
    }

    const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
    logger.debug(`Fetching repository tree: ${treeUrl}`);

    const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
    const content =
      typeof rawContent.content === "string"
        ? rawContent.content
        : rawContent.content.toString("utf-8");
    const treeData = JSON.parse(content) as GitHubTreeResponse;

    if (treeData.truncated) {
      logger.warn(
        `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`,
      );
    }

    return { tree: treeData, resolvedBranch: targetBranch };
  }

  /**
   * Determines if a file should be processed based on its path and type.
   */
  private shouldProcessFile(item: GitHubTreeItem, options: ScraperOptions): boolean {
    if (item.type !== "blob") {
      return false;
    }

    const path = item.path;

    // Whitelist of text-based file extensions
    const textExtensions = [
      ".md", ".mdx", ".txt", ".rst", ".adoc", ".asciidoc",
      ".html", ".htm", ".xml", ".css", ".scss", ".sass", ".less",
      ".js", ".jsx", ".ts", ".tsx", ".py", ".java",
      ".c", ".cpp", ".cc", ".cxx", ".h", ".hpp", ".cs",
      ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala",
      ".clj", ".cljs", ".hs", ".elm", ".dart", ".r", ".m", ".mm",
      ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd",
      ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf",
      ".properties", ".env", ".gitignore", ".dockerignore",
      ".gitattributes", ".editorconfig", ".gradle", ".pom", ".sbt",
      ".maven", ".cmake", ".make", ".dockerfile", ".mod", ".sum",
      ".sql", ".graphql", ".gql", ".proto", ".thrift", ".avro",
      ".csv", ".tsv", ".log",
    ];

    const pathLower = path.toLowerCase();
    const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));
    const hasCompoundExtension =
      pathLower.includes(".env.") ||
      pathLower.endsWith(".env") ||
      pathLower.includes(".config.") ||
      pathLower.includes(".lock");

    const fileName = path.split("/").pop() || "";
    const fileNameLower = fileName.toLowerCase();
    const commonTextFiles = [
      "readme", "license", "changelog", "contributing", "authors",
      "maintainers", "dockerfile", "makefile", "rakefile", "gemfile",
      "podfile", "cartfile", "brewfile", "procfile", "vagrantfile",
      "gulpfile", "gruntfile", ".prettierrc", ".eslintrc", ".babelrc",
      ".nvmrc", ".npmrc",
    ];
    const isCommonTextFile = commonTextFiles.some((name) => {
      if (name.startsWith(".")) {
        return fileNameLower === name || fileNameLower.startsWith(`${name}.`);
      }
      return fileNameLower === name || fileNameLower.startsWith(`${name}.`);
    });

    // If file passes known checks, include it
    if (hasTextExtension || hasCompoundExtension || isCommonTextFile) {
      return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns);
    }

    // Fallback: check if unknown extension has text/* MIME type
    const mimeType = mime.getType(path);
    if (mimeType?.startsWith("text/")) {
      logger.debug(`Including file with text MIME type: ${path} (${mimeType})`);
      return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns);
    }

    // Not a text file
    return false;
  }

  /**
   * Checks if a path is within the specified subpath.
   */
  private isWithinSubPath(path: string, subPath?: string): boolean {
    if (!subPath) {
      return true;
    }
    const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, "");
    if (trimmedSubPath.length === 0) {
      return true;
    }
    const normalizedPath = path.replace(/^\/+/, "").replace(/\/+$/, "");
    if (normalizedPath === trimmedSubPath) {
      return true;
    }
    return normalizedPath.startsWith(`${trimmedSubPath}/`);
  }

  async processItem(
    item: QueueItem,
    options: ScraperOptions,
    signal?: AbortSignal,
  ): Promise<ProcessItemResult> {
    // Handle legacy github-file:// URLs - treat as deleted/not found
    if (item.url.startsWith("github-file://")) {
      logger.info(
        `🗑️ Legacy github-file:// URL detected, marking as deleted: ${item.url}`,
      );
      return {
        url: item.url,
        links: [],
        status: FetchStatus.NOT_FOUND,
      };
    }

    // Delegate to wiki processor for wiki URLs
    // Use precise pattern matching: /owner/repo/wiki or /owner/repo/wiki/
    try {
      const parsedUrl = new URL(item.url);
      if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
        return await this.wikiProcessor.process(item, options, signal);
      }
    } catch {
      // If URL parsing fails, fall through to other handlers
    }

    // For the main repository URL (depth 0), perform discovery
    // This includes blob URLs at depth 0, which should return themselves as discovered links
    if (item.depth === 0) {
      const repoInfo = this.parseGitHubUrl(options.url);
      const { owner, repo } = repoInfo;
      logger.debug(`Discovering GitHub repository ${owner}/${repo}`);

      const discoveredLinks: string[] = [];

      // Handle single file (blob) URLs - strict scoping: index ONLY the file
      if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) {
        const { branch = "main", filePath } = repoInfo;
        logger.debug(
          `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`,
        );
        // Generate HTTPS blob URL for storage
        discoveredLinks.push(
          `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`,
        );
        return {
          url: item.url,
          links: discoveredLinks,
          status: FetchStatus.SUCCESS,
        };
      }

      // Discover wiki URL for full repo scrapes (handled by the wiki processor when dequeued)
      const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
      discoveredLinks.push(wikiUrl);
      logger.debug(`Discovered wiki URL: ${wikiUrl}`);
      // Discover all files in the repository
      const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
      const fileItems = tree.tree
        .filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath))
        .filter((treeItem) => this.shouldProcessFile(treeItem, options));

      logger.debug(
        `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`,
      );

      // Create HTTPS blob URLs for storage in database
      // These are user-friendly, clickable URLs that work outside the system
      const fileUrls = fileItems.map(
        (treeItem) =>
          `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`,
      );
      discoveredLinks.push(...fileUrls);

      logger.debug(
        `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`,
      );

      return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS };
    }

    // Handle HTTPS blob URLs at depth > 0 (from database during refresh or discovered files)
    // Process blob URLs directly - fetch content and return empty links
    // Use precise pattern matching: /owner/repo/blob/branch/path
    try {
      const parsedUrl = new URL(item.url);
      if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
        logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
        return await this.repoProcessor.process(item, options, signal);
      }
    } catch (error) {
      logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
      return { url: item.url, links: [], status: FetchStatus.SUCCESS };
    }

    // For any other URLs at non-zero depth, return empty (shouldn't happen in practice)
    logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`);
    return { url: item.url, links: [], status: FetchStatus.SUCCESS };
  }

  async scrape(
    options: ScraperOptions,
    progressCallback: ProgressCallback<ScraperProgressEvent>,
    signal?: AbortSignal,
  ): Promise<void> {
    const url = new URL(options.url);
    if (!url.hostname.includes("github.com")) {
      throw new Error("URL must be a GitHub URL");
    }

    // Use the base class implementation which handles initialQueue properly
    // The processItem method will discover all wiki and repo file URLs
    // The base scraper will automatically deduplicate URLs from initialQueue
    await super.scrape(options, progressCallback, signal);
  }

  async cleanup(): Promise<void> {
    await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
  }
}
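
For orientation, the sketch below shows how a caller might drive this strategy. It is not part of GitHubScraperStrategy.ts: the import paths, the ScraperOptions literal (only url, maxPages, includePatterns, and excludePatterns are referenced in this file), and the exact ProgressCallback signature are assumptions, while the canHandle() results follow directly from the URL patterns implemented above.

// Usage sketch (hypothetical consumer module assumed to live next to GitHubScraperStrategy.ts).
import type { ScraperOptions, ScraperProgressEvent } from "../types";
import { GitHubScraperStrategy } from "./GitHubScraperStrategy";

const strategy = new GitHubScraperStrategy();

// URL routing mirrors the patterns in canHandle():
strategy.canHandle("https://github.com/arabold/docs-mcp-server");                     // true  (base repo)
strategy.canHandle("https://github.com/arabold/docs-mcp-server/tree/main/src");       // true  (branch + subpath)
strategy.canHandle("https://github.com/arabold/docs-mcp-server/blob/main/README.md"); // true  (single file)
strategy.canHandle("https://gitlab.com/owner/repo");                                  // false (not github.com)

async function run(): Promise<void> {
  const controller = new AbortController();

  // Assumed options: only `url`, `maxPages`, `includePatterns`, and `excludePatterns`
  // appear in this file, so the full required shape must be taken from ../types.
  const options = {
    url: "https://github.com/arabold/docs-mcp-server",
    maxPages: 500,
  } as unknown as ScraperOptions;

  await strategy.scrape(
    options,
    async (event: ScraperProgressEvent) => {
      // Progress events are emitted by the base scraper as queued items are processed.
      console.log("progress:", event);
    },
    controller.signal,
  );

  // cleanup() delegates to both the wiki and repo processors.
  await strategy.cleanup();
}

run().catch((error) => {
  console.error("Scrape failed:", error);
});

Since cleanup() releases resources held by both processors, a consumer would typically call it in a finally block so it also runs when scrape() throws or is aborted.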
