import type { ProgressCallback } from "../../types";
import type { AppConfig } from "../../utils/config";
import { ScraperError } from "../../utils/errors";
import { logger } from "../../utils/logger";
import { MimeTypeUtils } from "../../utils/mimeTypeUtils";
import { HttpFetcher } from "../fetcher";
import { FetchStatus } from "../fetcher/types";
import type { QueueItem, ScraperOptions, ScraperProgressEvent } from "../types";
import { shouldIncludeUrl } from "../utils/patternMatcher";
import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy";
import type {
GitHubRepoInfo,
GitHubTreeItem,
GitHubTreeResponse,
} from "./GitHubRepoProcessor";
import { GitHubRepoProcessor } from "./GitHubRepoProcessor";
import { GitHubWikiProcessor } from "./GitHubWikiProcessor";
import { resolveGitHubAuth } from "./github-auth";
/**
* GitHubScraperStrategy is a discovery strategy that orchestrates the scraping of both
* GitHub repository code and wiki pages. When given a GitHub repository URL, it will:
*
* 1. Attempt to scrape the repository's wiki pages using GitHubWikiProcessor (prioritized)
* 2. Discover all repository files using the GitHub Tree API
* 3. Create HTTPS blob URLs for each file, which are stored in the database
* 4. Process blob URLs directly with GitHubRepoProcessor
*
* This provides comprehensive documentation coverage by including both wiki documentation
* and source code in a single scraping job, with wikis prioritized as they typically
* contain higher-quality curated documentation.
*
* Features:
* - Handles base GitHub repository URLs (e.g., https://github.com/owner/repo)
* - Handles branch-specific URLs (e.g., https://github.com/owner/repo/tree/branch)
* - Handles single file URLs (e.g., https://github.com/owner/repo/blob/branch/path)
* - Discovers all files efficiently using GitHub's Tree API
* - Generates and processes user-friendly HTTPS blob URLs throughout
* - Prioritizes wiki content over repository files for better documentation quality
* - Respects maxPages limit across both scraping phases to prevent exceeding quotas
* - Automatically discovers and scrapes both wiki and code content
* - Graceful handling when wikis don't exist or are inaccessible
*/
export class GitHubScraperStrategy extends BaseScraperStrategy {
  /** Whitelist of text-based file extensions eligible for indexing. */
  private static readonly TEXT_EXTENSIONS = [
    ".md",
    ".mdx",
    ".txt",
    ".rst",
    ".adoc",
    ".asciidoc",
    ".html",
    ".htm",
    ".xml",
    ".css",
    ".scss",
    ".sass",
    ".less",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".py",
    ".java",
    ".c",
    ".cpp",
    ".cc",
    ".cxx",
    ".h",
    ".hpp",
    ".cs",
    ".go",
    ".rs",
    ".rb",
    ".php",
    ".swift",
    ".kt",
    ".scala",
    ".clj",
    ".cljs",
    ".hs",
    ".elm",
    ".dart",
    ".r",
    ".m",
    ".mm",
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    ".cfg",
    ".conf",
    ".properties",
    ".env",
    ".gitignore",
    ".dockerignore",
    ".gitattributes",
    ".editorconfig",
    ".gradle",
    ".pom",
    ".sbt",
    ".maven",
    ".cmake",
    ".make",
    ".dockerfile",
    ".mod",
    ".sum",
    ".sql",
    ".graphql",
    ".gql",
    ".proto",
    ".thrift",
    ".avro",
    ".csv",
    ".tsv",
    ".log",
  ];

  /** Document extensions supported by DocumentPipeline. */
  private static readonly DOCUMENT_EXTENSIONS = [
    ".pdf",
    ".docx",
    ".xlsx",
    ".pptx",
    ".ipynb",
  ];

  /** Well-known extensionless or dot-prefixed filenames that are conventionally text. */
  private static readonly COMMON_TEXT_FILES = [
    "readme",
    "license",
    "changelog",
    "contributing",
    "authors",
    "maintainers",
    "dockerfile",
    "makefile",
    "rakefile",
    "gemfile",
    "podfile",
    "cartfile",
    "brewfile",
    "procfile",
    "vagrantfile",
    "gulpfile",
    "gruntfile",
    ".prettierrc",
    ".eslintrc",
    ".babelrc",
    ".nvmrc",
    ".npmrc",
  ];

  private readonly httpFetcher: HttpFetcher;
  private readonly wikiProcessor: GitHubWikiProcessor;
  private readonly repoProcessor: GitHubRepoProcessor;
  // Resolved auth headers are cached for the duration of a scrape() run and
  // keyed on explicit headers + token env vars so a changed token invalidates them.
  private resolvedAuthHeaders?: Record<string, string>;
  private resolvedAuthKey?: string;

  constructor(config: AppConfig) {
    super(config);
    this.httpFetcher = new HttpFetcher(config.scraper);
    this.wikiProcessor = new GitHubWikiProcessor(config);
    this.repoProcessor = new GitHubRepoProcessor(config);
  }

  /**
   * Determines whether this strategy can process the given URL.
   *
   * Accepts legacy `github-file://` URLs (handled only to mark them as
   * deleted), plus github.com base repository URLs, `/tree/<branch>` URLs,
   * and `/blob/<branch>/<path>` URLs.
   */
  canHandle(url: string): boolean {
    // Handle legacy github-file:// protocol URLs (no longer supported)
    // These will be processed and marked as NOT_FOUND to trigger cleanup
    if (url.startsWith("github-file://")) {
      return true;
    }
    try {
      const { hostname, pathname } = new URL(url);
      // Only github.com repository URLs (and the www alias) are supported
      if (!["github.com", "www.github.com"].includes(hostname)) {
        return false;
      }
      return (
        // Base repository URLs (owner/repo)
        /^\/[^/]+\/[^/]+\/?$/.test(pathname) ||
        // Tree URLs (owner/repo/tree/branch/...)
        /^\/[^/]+\/[^/]+\/tree\//.test(pathname) ||
        // Blob URLs (owner/repo/blob/branch/...)
        /^\/[^/]+\/[^/]+\/blob\//.test(pathname)
      );
    } catch {
      // Not a parseable URL at all
      return false;
    }
  }

  /**
   * Parses a GitHub URL to extract repository information.
   *
   * @returns owner/repo plus, when present, the branch and either a `subPath`
   *   (for /tree/ URLs) or a `filePath` with `isBlob` set (for /blob/ URLs).
   * @throws Error when the URL does not contain an owner/repo path.
   */
  private parseGitHubUrl(
    url: string,
  ): GitHubRepoInfo & { isBlob?: boolean; filePath?: string } {
    const parsedUrl = new URL(url);
    // Extract /<org>/<repo> from github.com/<org>/<repo>/...
    const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)/);
    if (!match) {
      throw new Error(`Invalid GitHub repository URL: ${url}`);
    }
    const [, owner, repo] = match;
    const segments = parsedUrl.pathname.split("/").filter(Boolean);
    // Handle /blob/<branch>/<filePath> URLs for single file indexing
    if (segments.length >= 4 && segments[2] === "blob") {
      const branch = segments[3];
      const filePath = segments.length > 4 ? segments.slice(4).join("/") : undefined;
      return { owner, repo, branch, filePath, isBlob: true };
    }
    // Handle /tree/<branch>[/<subPath>] URLs for scoped repository scrapes
    if (segments.length >= 4 && segments[2] === "tree") {
      const branch = segments[3];
      const subPath = segments.length > 4 ? segments.slice(4).join("/") : undefined;
      return { owner, repo, branch, subPath };
    }
    // Base repository URL
    return { owner, repo };
  }

  /**
   * Builds a cache key for resolved auth headers from the explicit headers
   * and the GitHub token environment variables, so a token change after a
   * previous resolution is detected.
   */
  private buildAuthCacheKey(explicitHeaders?: Record<string, string>): string {
    const normalizedHeaders = explicitHeaders
      ? Object.keys(explicitHeaders)
          .sort()
          .map((key) => [key, explicitHeaders[key]])
      : [];
    const envKey = `${process.env.GITHUB_TOKEN ?? ""}|${process.env.GH_TOKEN ?? ""}`;
    return JSON.stringify({ headers: normalizedHeaders, env: envKey });
  }

  /**
   * Returns auth headers for GitHub API requests, resolving them at most once
   * per distinct (explicit headers, token env) combination.
   */
  private async getResolvedAuthHeaders(
    explicitHeaders?: Record<string, string>,
  ): Promise<Record<string, string>> {
    const cacheKey = this.buildAuthCacheKey(explicitHeaders);
    if (this.resolvedAuthHeaders && this.resolvedAuthKey === cacheKey) {
      return this.resolvedAuthHeaders;
    }
    const resolved = await resolveGitHubAuth(explicitHeaders);
    this.resolvedAuthHeaders = resolved;
    this.resolvedAuthKey = cacheKey;
    return resolved;
  }

  /**
   * Fetches a GitHub API URL, translating 401/403 failures into
   * user-friendly ScraperErrors that point at token configuration.
   */
  private async fetchGitHubApi(
    url: string,
    owner: string,
    repo: string,
    headers?: Record<string, string>,
    signal?: AbortSignal,
  ): Promise<Awaited<ReturnType<HttpFetcher["fetch"]>>> {
    try {
      return await this.httpFetcher.fetch(url, { signal, headers });
    } catch (error) {
      // Convert HTTP auth errors to user-friendly messages
      if (error instanceof ScraperError) {
        if (error.message.includes("401")) {
          throw new ScraperError(
            `GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
            false,
            error,
          );
        }
        if (error.message.includes("403")) {
          throw new ScraperError(
            `GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
            false,
            error,
          );
        }
      }
      throw error;
    }
  }

  /**
   * Throws a user-friendly error when a fetch reported NOT_FOUND, which for
   * the GitHub API means the repository is private or does not exist.
   */
  private assertRepoAccessible(
    result: { status: FetchStatus },
    owner: string,
    repo: string,
  ): void {
    if (result.status === FetchStatus.NOT_FOUND) {
      throw new ScraperError(
        `Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
        false,
      );
    }
  }

  /**
   * Determines the repository's default branch via the GitHub repos API.
   * Falls back to "main" only when the API response cannot be parsed or
   * lacks a default_branch field.
   */
  private async resolveDefaultBranch(
    owner: string,
    repo: string,
    headers?: Record<string, string>,
    signal?: AbortSignal,
  ): Promise<string> {
    const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
    logger.debug(`Fetching repository info: ${repoUrl}`);
    const repoContent = await this.fetchGitHubApi(repoUrl, owner, repo, headers, signal);
    // Check for NOT_FOUND status before parsing - repo is inaccessible (private or doesn't exist)
    this.assertRepoAccessible(repoContent, owner, repo);
    try {
      const content =
        typeof repoContent.content === "string"
          ? repoContent.content
          : repoContent.content.toString("utf-8");
      const repoData = JSON.parse(content) as { default_branch?: string };
      const defaultBranch =
        typeof repoData.default_branch === "string"
          ? repoData.default_branch.trim()
          : "";
      if (!defaultBranch) {
        logger.warn(
          `⚠️ Repository info missing default_branch for ${owner}/${repo}, using 'main'`,
        );
        return "main";
      }
      logger.debug(`Using default branch: ${defaultBranch}`);
      return defaultBranch;
    } catch (parseError) {
      // Only fall back to "main" for JSON parse errors (e.g. unexpected API response format)
      logger.warn(`⚠️ Could not parse repository info, using 'main': ${parseError}`);
      return "main";
    }
  }

  /**
   * Fetches the repository tree structure from GitHub API.
   *
   * @returns the recursive tree listing plus the branch it was resolved
   *   against (the requested branch, or the repository's default branch).
   */
  private async fetchRepositoryTree(
    repoInfo: GitHubRepoInfo,
    headers?: Record<string, string>,
    signal?: AbortSignal,
  ): Promise<{ tree: GitHubTreeResponse; resolvedBranch: string }> {
    const { owner, repo, branch } = repoInfo;
    // If no branch specified, resolve the default branch first
    const targetBranch =
      branch ?? (await this.resolveDefaultBranch(owner, repo, headers, signal));
    const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
    logger.debug(`Fetching repository tree: ${treeUrl}`);
    const rawContent = await this.fetchGitHubApi(treeUrl, owner, repo, headers, signal);
    // Check for NOT_FOUND status before parsing - this indicates repo is inaccessible
    this.assertRepoAccessible(rawContent, owner, repo);
    const content =
      typeof rawContent.content === "string"
        ? rawContent.content
        : rawContent.content.toString("utf-8");
    // Parse JSON with proper error handling
    let treeData: GitHubTreeResponse;
    try {
      treeData = JSON.parse(content) as GitHubTreeResponse;
    } catch (parseError) {
      throw new ScraperError(
        `Failed to parse GitHub API response for "${owner}/${repo}". The repository may be inaccessible or the API returned an unexpected response.`,
        false,
        parseError instanceof Error ? parseError : undefined,
      );
    }
    if (treeData.truncated) {
      // The trees API caps recursive listings; warn rather than fail
      logger.warn(
        `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`,
      );
    }
    return { tree: treeData, resolvedBranch: targetBranch };
  }

  /**
   * Determines if a file should be processed based on its path and type.
   * Accepts blobs that look like text (by extension, well-known filename,
   * compound name like *.config.*, or detected text MIME type) and that
   * match the scrape's include/exclude patterns.
   */
  private shouldProcessFile(item: GitHubTreeItem, options: ScraperOptions): boolean {
    // Only blobs are files; trees (directories) and commits are skipped
    if (item.type !== "blob") {
      return false;
    }
    const path = item.path;
    const pathLower = path.toLowerCase();
    const hasTextExtension = GitHubScraperStrategy.TEXT_EXTENSIONS.some((ext) =>
      pathLower.endsWith(ext),
    );
    const hasDocumentExtension = GitHubScraperStrategy.DOCUMENT_EXTENSIONS.some(
      (ext) => pathLower.endsWith(ext),
    );
    // Compound names like ".env.local", "vite.config.ts", or "yarn.lock"
    const hasCompoundExtension =
      pathLower.includes(".env.") ||
      pathLower.endsWith(".env") ||
      pathLower.includes(".config.") ||
      pathLower.includes(".lock");
    const fileName = path.split("/").pop() || "";
    const fileNameLower = fileName.toLowerCase();
    // Match the bare name or the name followed by any extension
    // (e.g. "readme", "readme.md", ".eslintrc", ".eslintrc.json")
    const isCommonTextFile = GitHubScraperStrategy.COMMON_TEXT_FILES.some(
      (name) => fileNameLower === name || fileNameLower.startsWith(`${name}.`),
    );
    // If file passes known checks, include it (subject to user patterns)
    if (
      hasTextExtension ||
      hasDocumentExtension ||
      hasCompoundExtension ||
      isCommonTextFile
    ) {
      return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns);
    }
    // Fallback: check if unknown extension has text/* MIME type using MimeTypeUtils
    const mimeType = MimeTypeUtils.detectMimeTypeFromPath(path);
    if (mimeType?.startsWith("text/")) {
      logger.debug(`Including file with text MIME type: ${path} (${mimeType})`);
      return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns);
    }
    // Not a text file
    return false;
  }

  /**
   * Checks if a path is within the specified subpath. An empty or missing
   * subpath matches everything; otherwise the path must equal the subpath
   * or live underneath it.
   */
  private isWithinSubPath(path: string, subPath?: string): boolean {
    if (!subPath) {
      return true;
    }
    const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, "");
    if (trimmedSubPath.length === 0) {
      return true;
    }
    const normalizedPath = path.replace(/^\/+/, "").replace(/\/+$/, "");
    if (normalizedPath === trimmedSubPath) {
      return true;
    }
    return normalizedPath.startsWith(`${trimmedSubPath}/`);
  }

  /**
   * Processes a single queue item:
   * - legacy github-file:// URLs are reported NOT_FOUND to trigger cleanup;
   * - wiki URLs are delegated to the wiki processor;
   * - the root URL (depth 0) performs discovery via the Tree API;
   * - blob URLs at depth > 0 are fetched by the repo processor.
   */
  async processItem(
    item: QueueItem,
    options: ScraperOptions,
    signal?: AbortSignal,
  ): Promise<ProcessItemResult> {
    // Handle legacy github-file:// URLs - treat as deleted/not found
    if (item.url.startsWith("github-file://")) {
      logger.info(
        `🗑️ Legacy github-file:// URL detected, marking as deleted: ${item.url}`,
      );
      return {
        url: item.url,
        links: [],
        status: FetchStatus.NOT_FOUND,
      };
    }
    const headers = await this.getResolvedAuthHeaders(options.headers);
    // Classify the URL up front. Only URL *parsing* is guarded here so that
    // genuine processor failures propagate to the base scraper instead of
    // being silently swallowed (previously the processor calls sat inside
    // the same try/catch as the parse).
    let pathname: string | undefined;
    try {
      pathname = new URL(item.url).pathname;
    } catch (error) {
      // Unparseable URL: depth-0 discovery below works off options.url and
      // can still proceed; at deeper levels there is nothing to fetch.
      if (item.depth > 0) {
        logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
        return { url: item.url, links: [], status: FetchStatus.SUCCESS };
      }
    }
    // Delegate to wiki processor for wiki URLs
    // Use precise pattern matching: /owner/repo/wiki or /owner/repo/wiki/
    if (pathname !== undefined && /^\/[^/]+\/[^/]+\/wiki($|\/)/.test(pathname)) {
      return await this.wikiProcessor.process(item, options, headers, signal);
    }
    // For the main repository URL (depth 0), perform discovery
    // This includes blob URLs at depth 0, which should return themselves as discovered links
    if (item.depth === 0) {
      const repoInfo = this.parseGitHubUrl(options.url);
      const { owner, repo } = repoInfo;
      logger.debug(`Discovering GitHub repository ${owner}/${repo}`);
      const discoveredLinks: string[] = [];
      // Handle single file (blob) URLs - strict scoping: index ONLY the file
      if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) {
        const { branch = "main", filePath } = repoInfo;
        logger.debug(
          `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`,
        );
        // Generate HTTPS blob URL for storage
        discoveredLinks.push(
          `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`,
        );
        return {
          url: item.url,
          links: discoveredLinks,
          status: FetchStatus.SUCCESS,
        };
      }
      // Discover wiki URL for full repo scrapes (processed later via the wiki branch above)
      const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
      discoveredLinks.push(wikiUrl);
      logger.debug(`Discovered wiki URL: ${wikiUrl}`);
      // Discover all files in the repository via the Tree API
      const { tree, resolvedBranch } = await this.fetchRepositoryTree(
        repoInfo,
        headers,
        signal,
      );
      const fileItems = tree.tree
        .filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath))
        .filter((treeItem) => this.shouldProcessFile(treeItem, options));
      logger.debug(
        `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`,
      );
      // Create HTTPS blob URLs for storage in database
      // These are user-friendly, clickable URLs that work outside the system
      const fileUrls = fileItems.map(
        (treeItem) =>
          `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`,
      );
      discoveredLinks.push(...fileUrls);
      logger.debug(
        `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`,
      );
      return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS };
    }
    // Handle HTTPS blob URLs at depth > 0 (from database during refresh or discovered files)
    // Process blob URLs directly - fetch content and return empty links
    if (pathname !== undefined && /^\/[^/]+\/[^/]+\/blob\//.test(pathname)) {
      logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
      return await this.repoProcessor.process(item, options, headers, signal);
    }
    // For any other URLs at non-zero depth, return empty (shouldn't happen in practice)
    logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`);
    return { url: item.url, links: [], status: FetchStatus.SUCCESS };
  }

  /**
   * Runs a full scrape of a GitHub repository (wiki + files), clearing the
   * cached auth headers when the run finishes.
   *
   * @throws Error when the URL is not a github.com URL.
   */
  async scrape(
    options: ScraperOptions,
    progressCallback: ProgressCallback<ScraperProgressEvent>,
    signal?: AbortSignal,
  ): Promise<void> {
    const url = new URL(options.url);
    // Exact hostname check, consistent with canHandle(); a substring test
    // would also accept hosts like "github.com.evil.example".
    if (!["github.com", "www.github.com"].includes(url.hostname)) {
      throw new Error("URL must be a GitHub URL");
    }
    // Warm the auth-header cache before any items are processed
    await this.getResolvedAuthHeaders(options.headers);
    // Use the base class implementation which handles initialQueue properly
    // The processItem method will discover all wiki and repo file URLs
    // The base scraper will automatically deduplicate URLs from initialQueue
    try {
      await super.scrape(options, progressCallback, signal);
    } finally {
      // Drop cached credentials so a later scrape re-resolves fresh tokens
      this.resolvedAuthHeaders = undefined;
      this.resolvedAuthKey = undefined;
    }
  }

  /** Releases resources held by the wiki and repo processors. */
  async cleanup(): Promise<void> {
    await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
  }
}