GitHubRepoProcessor.ts
import { logger } from "../../utils/logger";
import { MimeTypeUtils } from "../../utils/mimeTypeUtils";
import { HttpFetcher } from "../fetcher";
import { FetchStatus, type RawContent } from "../fetcher/types";
import { PipelineFactory } from "../pipelines/PipelineFactory";
import type { ContentPipeline, PipelineResult } from "../pipelines/types";
import type { QueueItem } from "../types";
import { ScrapeMode, type ScraperOptions } from "../types";
import type { ProcessItemResult } from "./BaseScraperStrategy";

export interface GitHubRepoInfo {
  owner: string;
  repo: string;
  branch?: string;
  subPath?: string;
}

export interface GitHubTreeItem {
  path: string;
  type: "blob" | "tree";
  sha: string;
  size?: number;
  url: string;
}

export interface GitHubTreeResponse {
  sha: string;
  url: string;
  tree: GitHubTreeItem[];
  truncated: boolean;
}

/**
 * GitHubRepoProcessor handles processing individual files from GitHub repositories.
 * It processes HTTPS blob URLs (https://github.com/owner/repo/blob/branch/filepath).
 *
 * This processor is stateless and contains the core logic from GitHubRepoScraperStrategy.
 */
export class GitHubRepoProcessor {
  private readonly httpFetcher = new HttpFetcher();
  private readonly pipelines: ContentPipeline[];

  constructor() {
    this.pipelines = PipelineFactory.createStandardPipelines();
  }

  /**
   * Parses an HTTPS blob URL to extract repository information.
   * Format: https://github.com/owner/repo/blob/branch/filepath
   */
  parseHttpsBlobUrl(url: string): GitHubRepoInfo & { filePath: string } {
    const parsedUrl = new URL(url);
    const segments = parsedUrl.pathname.split("/").filter(Boolean);

    // Expected format: /owner/repo/blob/branch/filepath
    if (segments.length < 5 || segments[2] !== "blob") {
      throw new Error(
        `Invalid GitHub blob URL format. Expected: https://github.com/owner/repo/blob/branch/filepath. Got: ${url}`,
      );
    }

    const owner = segments[0];
    const repo = segments[1];
    const branch = segments[3];
    const filePath = segments.slice(4).join("/");

    return { owner, repo, branch, filePath };
  }

  /**
   * Fetches the raw content of a file from GitHub.
   */
  private async fetchFileContent(
    repoInfo: GitHubRepoInfo,
    filePath: string,
    etag?: string | null,
    signal?: AbortSignal,
  ): Promise<RawContent> {
    const { owner, repo, branch } = repoInfo;
    const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
    const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });

    // Override GitHub's generic 'text/plain' MIME type with file extension-based detection
    const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
    if (detectedMimeType && rawContent.mimeType === "text/plain") {
      return {
        ...rawContent,
        mimeType: detectedMimeType,
      };
    }

    return rawContent;
  }

  /**
   * Processes a single GitHub repository file from an HTTPS blob URL.
   */
  async process(
    item: QueueItem,
    options: ScraperOptions,
    signal?: AbortSignal,
  ): Promise<ProcessItemResult> {
    // Parse the HTTPS blob URL to extract repository info and file path
    const repoInfo = this.parseHttpsBlobUrl(item.url);
    const { owner, repo, branch, filePath } = repoInfo;

    // Fetch the file content from raw.githubusercontent.com
    const rawContent = await this.fetchFileContent(
      { owner, repo, branch },
      filePath,
      item.etag,
      signal,
    );

    // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND
    if (rawContent.status !== FetchStatus.SUCCESS) {
      return { url: item.url, links: [], status: rawContent.status };
    }

    // Process content through appropriate pipeline
    let processed: PipelineResult | undefined;
    for (const pipeline of this.pipelines) {
      const contentBuffer = Buffer.isBuffer(rawContent.content)
        ? rawContent.content
        : Buffer.from(rawContent.content);
      if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
        logger.debug(
          `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`,
        );
        // Force 'fetch' mode for GitHub to avoid unnecessary Playwright usage on raw content.
        // GitHub raw files (e.g., HTML files) don't have their dependencies available at the
        // raw.githubusercontent.com domain, so rendering them in a browser would be broken
        // and provide no additional value over direct HTML parsing with Cheerio.
        const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
        processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
        break;
      }
    }

    if (!processed) {
      logger.warn(
        `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`,
      );
      return { url: item.url, links: [], status: FetchStatus.SUCCESS };
    }

    for (const err of processed.errors ?? []) {
      logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
    }

    // Create document with GitHub-specific metadata
    const githubUrl = `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`;

    // Use filename as fallback if title is empty or not a string
    const filename = filePath.split("/").pop() || "Untitled";

    return {
      url: githubUrl,
      title: processed.title?.trim() || filename || "Untitled",
      etag: rawContent.etag,
      lastModified: rawContent.lastModified,
      contentType: rawContent.mimeType,
      content: processed,
      links: [], // Always return empty links array for individual files
      status: FetchStatus.SUCCESS,
    };
  }

  /**
   * Cleanup resources used by this processor.
   */
  async cleanup(): Promise<void> {
    await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
  }
}
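
For context, here is a minimal sketch of how this processor might be driven. The QueueItem and ScraperOptions literals below are assumptions (their real definitions live in ../types, which is not shown here), so they are cast loosely; the blob URL is only an illustrative example.

// Hypothetical usage sketch - the item/options shapes are assumed, not confirmed by this file.
import { GitHubRepoProcessor } from "./GitHubRepoProcessor";

async function main() {
  const processor = new GitHubRepoProcessor();
  try {
    // Parsing alone, without any network access:
    const info = processor.parseHttpsBlobUrl(
      "https://github.com/arabold/docs-mcp-server/blob/main/README.md",
    );
    // -> { owner: "arabold", repo: "docs-mcp-server", branch: "main", filePath: "README.md" }
    console.log(info);

    // Full fetch-and-process of a single file; casts hedge the unknown type shapes.
    const result = await processor.process(
      { url: "https://github.com/arabold/docs-mcp-server/blob/main/README.md" } as any,
      {} as any,
    );
    console.log(result.title, result.status);
  } finally {
    // Release pipeline resources (e.g., any browser instances held by pipelines).
    await processor.cleanup();
  }
}

Note that process forces ScrapeMode.Fetch regardless of the options passed in, per the inline comment: raw.githubusercontent.com serves files without their assets, so browser rendering would add cost without value.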
