gitHubArchive.ts
import { createWriteStream } from 'node:fs';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { Readable, Transform } from 'node:stream';
import { pipeline } from 'node:stream/promises';
import { unzip } from 'fflate';
import { RepomixError } from '../../shared/errorHandle.js';
import { logger } from '../../shared/logger.js';
import {
  buildGitHubArchiveUrl,
  buildGitHubMasterArchiveUrl,
  buildGitHubTagArchiveUrl,
  checkGitHubResponse,
  getArchiveFilename,
} from './gitHubArchiveApi.js';
import type { GitHubRepoInfo } from './gitRemoteParse.js';

export interface ArchiveDownloadOptions {
  timeout?: number; // Download timeout in milliseconds (default: 30000)
  retries?: number; // Number of retry attempts (default: 3)
}

export interface ArchiveDownloadProgress {
  downloaded: number;
  total: number | null;
  percentage: number | null;
}

export type ProgressCallback = (progress: ArchiveDownloadProgress) => void;

/**
 * Downloads and extracts a GitHub repository archive
 */
export const downloadGitHubArchive = async (
  repoInfo: GitHubRepoInfo,
  targetDirectory: string,
  options: ArchiveDownloadOptions = {},
  onProgress?: ProgressCallback,
  deps = {
    fetch: globalThis.fetch,
    fs,
    pipeline,
    Transform,
    createWriteStream,
  },
): Promise<void> => {
  const { timeout = 30000, retries = 3 } = options;

  // Ensure target directory exists
  await deps.fs.mkdir(targetDirectory, { recursive: true });

  let lastError: Error | null = null;

  // Try downloading with multiple URL formats: main branch, master branch (fallback), then tag format
  const archiveUrls = [
    buildGitHubArchiveUrl(repoInfo),
    buildGitHubMasterArchiveUrl(repoInfo),
    buildGitHubTagArchiveUrl(repoInfo),
  ].filter(Boolean) as string[];

  for (const archiveUrl of archiveUrls) {
    for (let attempt = 1; attempt <= retries; attempt++) {
      try {
        logger.trace(`Downloading GitHub archive from: ${archiveUrl} (attempt ${attempt}/${retries})`);
        await downloadAndExtractArchive(archiveUrl, targetDirectory, repoInfo, timeout, onProgress, deps);
        logger.trace('Successfully downloaded and extracted GitHub archive');
        return; // Success - exit early
      } catch (error) {
        lastError = error as Error;
        logger.trace(`Archive download attempt ${attempt} failed:`, lastError.message);

        // If it's a 404-like error and we have more URLs to try, don't retry this URL
        const isNotFoundError =
          lastError instanceof RepomixError &&
          (lastError.message.includes('not found') || lastError.message.includes('404'));
        if (isNotFoundError && archiveUrls.length > 1) {
          break;
        }

        // If it's the last attempt, don't wait
        if (attempt < retries) {
          const delay = Math.min(1000 * 2 ** (attempt - 1), 5000); // Exponential backoff, max 5s
          logger.trace(`Retrying in ${delay}ms...`);
          await new Promise((resolve) => setTimeout(resolve, delay));
        }
      }
    }
  }

  // If we get here, all attempts failed
  throw new RepomixError(
    `Failed to download GitHub archive after ${retries} attempts.
${lastError?.message || 'Unknown error'}`,
  );
};

/**
 * Downloads and extracts archive from a single URL
 */
const downloadAndExtractArchive = async (
  archiveUrl: string,
  targetDirectory: string,
  repoInfo: GitHubRepoInfo,
  timeout: number,
  onProgress?: ProgressCallback,
  deps = {
    fetch: globalThis.fetch,
    fs,
    pipeline,
    Transform,
    createWriteStream,
  },
): Promise<void> => {
  // Download the archive
  const tempArchivePath = path.join(targetDirectory, getArchiveFilename(repoInfo));
  await downloadFile(archiveUrl, tempArchivePath, timeout, onProgress, deps);

  try {
    // Extract the archive
    await extractZipArchive(tempArchivePath, targetDirectory, repoInfo, { fs: deps.fs });
  } finally {
    // Clean up the downloaded archive file
    try {
      await deps.fs.unlink(tempArchivePath);
    } catch (error) {
      logger.trace('Failed to cleanup archive file:', (error as Error).message);
    }
  }
};

/**
 * Downloads a file from URL with progress tracking
 */
const downloadFile = async (
  url: string,
  filePath: string,
  timeout: number,
  onProgress?: ProgressCallback,
  deps = {
    fetch: globalThis.fetch,
    fs,
    pipeline,
    Transform,
    createWriteStream,
  },
): Promise<void> => {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await deps.fetch(url, {
      signal: controller.signal,
    });

    checkGitHubResponse(response);

    if (!response.body) {
      throw new RepomixError('No response body received');
    }

    const totalSize = response.headers.get('content-length');
    const total = totalSize ? Number.parseInt(totalSize, 10) : null;
    let downloaded = 0;
    let lastProgressUpdate = 0;

    // Use Readable.fromWeb for better stream handling
    const nodeStream = Readable.fromWeb(response.body);

    // Transform stream for progress tracking
    const progressStream = new deps.Transform({
      transform(chunk, _encoding, callback) {
        downloaded += chunk.length;

        // Update progress at most every 100ms to avoid too frequent updates
        const now = Date.now();
        if (onProgress && now - lastProgressUpdate > 100) {
          lastProgressUpdate = now;
          onProgress({
            downloaded,
            total,
            percentage: total ? Math.round((downloaded / total) * 100) : null,
          });
        }

        callback(null, chunk);
      },
      flush(callback) {
        // Send final progress update
        if (onProgress) {
          onProgress({
            downloaded,
            total,
            percentage: total ? 100 : null,
          });
        }
        callback();
      },
    });

    // Write to file
    const writeStream = deps.createWriteStream(filePath);
    await deps.pipeline(nodeStream, progressStream, writeStream);
  } finally {
    clearTimeout(timeoutId);
  }
};

/**
 * Extracts a ZIP archive using fflate library
 */
const extractZipArchive = async (
  archivePath: string,
  targetDirectory: string,
  repoInfo: GitHubRepoInfo,
  deps = {
    fs,
  },
): Promise<void> => {
  try {
    // Always use in-memory extraction for simplicity and reliability
    await extractZipArchiveInMemory(archivePath, targetDirectory, repoInfo, deps);
  } catch (error) {
    throw new RepomixError(`Failed to extract archive: ${(error as Error).message}`);
  }
};

/**
 * Extracts ZIP archive by loading it entirely into memory (faster for small files)
 */
const extractZipArchiveInMemory = async (
  archivePath: string,
  targetDirectory: string,
  repoInfo: GitHubRepoInfo,
  deps = {
    fs,
  },
): Promise<void> => {
  // Read the ZIP file as a buffer
  const zipBuffer = await deps.fs.readFile(archivePath);
  const zipUint8Array = new Uint8Array(zipBuffer);

  // Extract ZIP using fflate
  await new Promise<void>((resolve, reject) => {
    unzip(zipUint8Array, (err, extracted) => {
      if (err) {
        reject(new RepomixError(`Failed to extract ZIP archive: ${err.message}`));
        return;
      }

      // Process extracted files sequentially in the callback
      processExtractedFiles(extracted, targetDirectory, repoInfo, deps).then(resolve).catch(reject);
    });
  });
};

/**
 * Process extracted files sequentially to avoid EMFILE errors
 */
const processExtractedFiles = async (
  extracted: Record<string, Uint8Array>,
  targetDirectory: string,
  repoInfo: GitHubRepoInfo,
  deps = {
    fs,
  },
): Promise<void> => {
  const repoPrefix = `${repoInfo.repo}-`;
  const createdDirs = new Set<string>();

  // Process files sequentially to avoid EMFILE errors completely
  for (const [filePath, fileData] of Object.entries(extracted)) {
    // GitHub archives have a top-level directory like "repo-branch/"
    // We need to remove this prefix from the file paths
    let relativePath = filePath;

    // Find and remove the repo prefix
    const pathParts = filePath.split('/');
    if (pathParts.length > 0 && pathParts[0].startsWith(repoPrefix)) {
      // Remove the first directory (repo-branch/)
      relativePath = pathParts.slice(1).join('/');
    }

    // Skip empty paths (root directory)
    if (!relativePath) {
      continue;
    }

    // Sanitize relativePath to prevent path traversal attacks
    const sanitized = path.normalize(relativePath).replace(/^(\.\.([/\\]|$))+/, '');

    // Reject absolute paths outright
    if (path.isAbsolute(sanitized)) {
      logger.trace(`Absolute path detected in archive, skipping: ${relativePath}`);
      continue;
    }

    const targetPath = path.resolve(targetDirectory, sanitized);
    if (!targetPath.startsWith(path.resolve(targetDirectory))) {
      logger.trace(`Unsafe path detected in archive, skipping: ${relativePath}`);
      continue;
    }

    // Check if this entry is a directory (ends with /) or empty file data indicates directory
    const isDirectory = filePath.endsWith('/') || (fileData.length === 0 && relativePath.endsWith('/'));

    if (isDirectory) {
      // Create directory immediately
      if (!createdDirs.has(targetPath)) {
        logger.trace(`Creating directory: ${targetPath}`);
        await deps.fs.mkdir(targetPath, { recursive: true });
        createdDirs.add(targetPath);
      }
    } else {
      // Create parent directory if needed and write file
      const parentDir = path.dirname(targetPath);
      if (!createdDirs.has(parentDir)) {
        logger.trace(`Creating parent directory for file: ${parentDir}`);
        await deps.fs.mkdir(parentDir, { recursive: true });
        createdDirs.add(parentDir);
      }

      // Write file sequentially
      logger.trace(`Writing file: ${targetPath}`);
      try {
        await deps.fs.writeFile(targetPath, fileData);
      } catch (fileError) {
        logger.trace(`Failed to write file ${targetPath}: ${(fileError as Error).message}`);
        throw fileError;
      }
    }
  }
};

/**
 * Checks if archive download is supported for the given repository info
 */
export const isArchiveDownloadSupported = (_repoInfo: GitHubRepoInfo): boolean => {
  // Archive download is supported for all GitHub repositories
  // In the future, we might add conditions here (e.g., size limits, private repos)
  return true;
};
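Usage note (not part of the source file above): a minimal sketch of how the exported downloadGitHubArchive might be called with a progress callback. The exact shape of GitHubRepoInfo lives in gitRemoteParse.js and is not shown here; the `owner` field below is an assumption, while `repo` is referenced in the code above.

// Hypothetical usage sketch. GitHubRepoInfo's shape is assumed: `repo` is
// used in processExtractedFiles above, `owner` is a guess.
import { downloadGitHubArchive } from './gitHubArchive.js';
import type { GitHubRepoInfo } from './gitRemoteParse.js';

const repoInfo = { owner: 'yamadashy', repo: 'repomix' } as GitHubRepoInfo;

await downloadGitHubArchive(
  repoInfo,
  '/tmp/repomix-checkout', // hypothetical extraction directory
  { timeout: 30_000, retries: 3 },
  ({ downloaded, total, percentage }) => {
    // total and percentage are null when the server omits content-length
    console.log(percentage !== null ? `${percentage}%` : `${downloaded} bytes`);
  },
);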

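The zip-slip guard in processExtractedFiles is worth seeing in isolation. The sketch below is my own restatement of that logic using the same node:path calls; isSafeEntry is a hypothetical helper, not a function in the module.

import * as path from 'node:path';

// Mirrors the traversal guard in processExtractedFiles: strip leading "../"
// segments after normalization, reject absolute paths, and require the
// resolved target to stay inside the extraction root.
const isSafeEntry = (targetDirectory: string, relativePath: string): string | null => {
  const sanitized = path.normalize(relativePath).replace(/^(\.\.([/\\]|$))+/, '');
  if (path.isAbsolute(sanitized)) {
    return null;
  }
  const targetPath = path.resolve(targetDirectory, sanitized);
  return targetPath.startsWith(path.resolve(targetDirectory)) ? targetPath : null;
};

console.log(isSafeEntry('/tmp/out', 'src/index.ts')); // "/tmp/out/src/index.ts"
console.log(isSafeEntry('/tmp/out', '../../etc/passwd')); // "/tmp/out/etc/passwd" -- leading "../" stripped, entry stays inside the root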