Libragen

Overview Schema Related Servers Score Discussions

builder.ts•23.7 KiB

/**
 * Builder - Creates .libragen libraries from source files
 *
 * This class encapsulates the complete build workflow including source detection,
 * chunking, embedding generation, and database creation.
 */

import * as path from 'path';
import * as fs from 'fs/promises';
import { createHash } from 'crypto';
import { Embedder } from './embedder.ts';
import type { IEmbedder } from './embedder.ts';
import { Chunker } from './chunker.ts';
import type { Chunk } from './chunker.ts';
import { CodeChunker } from './code-chunker.ts';
import type { ContextMode } from './code-chunker.ts';
import { VectorStore } from './store.ts';
import { GitSource, isGitUrl, parseGitUrl, getAuthToken } from './sources/index.ts';
import type { GitSourceResult } from './sources/index.ts';
import { CURRENT_SCHEMA_VERSION } from './migrations/index.ts';
import { deriveGitLibraryName } from './utils.ts';
import type { LibraryMetadata, SourceProvenance } from './types.ts';

/**
 * Options for building a library.
 */
export interface BuildOptions {

   /** Output path for the .libragen file */
   output?: string;

   /** Library name (defaults to directory name or derived from git URL) */
   name?: string;

   /** Library version (default: 0.1.0) */
   version?: string;

   /** Version of the source content */
   contentVersion?: string;

   /** Short description of the library */
   description?: string;

   /** Guidance for AI agents on when to use this library */
   agentDescription?: string;

   /** Example queries this library can answer */
   exampleQueries?: string[];

   /** Searchable keywords/tags */
   keywords?: string[];

   /** Programming languages covered (e.g., "typescript", "python") */
   programmingLanguages?: string[];

   /** Human/natural languages of the content as ISO 639-1 codes (e.g., "en", "es") */
   textLanguages?: string[];

   /** Frameworks covered (e.g., "react", "express") */
   frameworks?: string[];

   /** Target chunk size in characters (default: 1000) */
   chunkSize?: number;

   /** Chunk overlap in characters (default: 100) */
   chunkOverlap?: number;

   /** Glob patterns to include */
   include?: string[];

   /** Glob patterns to exclude (added to defaults) */
   exclude?: string[];

   /** Disable default exclusions (node_modules, .git, dist, etc.) */
   noDefaultExcludes?: boolean;

   /** Git branch, tag, or commit to checkout (remote git sources only) */
   gitRef?: string;

   /** Auth token for private git repositories */
   gitRepoAuthToken?: string;

   /** SPDX license identifier(s) for the source content */
   license?: string[];

   /** Disable AST-aware chunking for code files (default: false, AST chunking enabled) */
   noAstChunking?: boolean;

   /** Context mode for AST chunking: 'none', 'minimal', or 'full' (default: 'full') */
   contextMode?: ContextMode;
}

/**
 * Result of a successful build operation.
 */
export interface BuildResult {

   /** Absolute path to the created .libragen file */
   outputPath: string;

   /** Library metadata */
   metadata: LibraryMetadata;

   /** Build statistics */
   stats: {

      /** Number of chunks created */
      chunkCount: number;

      /** Number of source files processed */
      sourceCount: number;

      /** Final file size in bytes */
      fileSize: number;

      /** Time spent embedding in seconds */
      embedDuration: number;

      /** Chunks processed per second */
      chunksPerSecond: number;
   };

   /** Git-specific information (if source was a git URL) */
   git?: {

      /** Commit hash */
      commitHash: string;

      /** Branch/tag name */
      ref: string;

      /** Detected license (if any) */
      detectedLicense?: {
         identifier: string;
         confidence: string;
      };
   };
}

/**
 * Build phase identifiers.
 */
export type BuildPhase =
   | 'initializing'
   | 'cloning'
   | 'loading-model'
   | 'chunking'
   | 'embedding'
   | 'creating-database'
   | 'complete';

/**
 * Progress information during build.
 */
export interface BuildProgress {

   /** Current build phase */
   phase: BuildPhase;

   /** Progress percentage (0-100) */
   progress: number;

   /** Human-readable message */
   message: string;

   /** Current item being processed (for embedding phase) */
   current?: number;

   /** Total items to process (for embedding phase) */
   total?: number;
}

/**
 * Callback for receiving build progress updates.
 */
export type BuildProgressCallback = (progress: BuildProgress) => void;

/**
 * Configuration for the Builder class.
 */
export interface BuilderConfig {

   /**
    * Custom embedder instance.
    * Provide your own implementation of IEmbedder to use a different embedding model
    * or service (e.g., OpenAI, Cohere, custom local models).
    *
    * @example
    * ```typescript
    * const builder = new Builder({
    *    embedder: new OpenAIEmbedder({ model: 'text-embedding-3-small' })
    * });
    * ```
    */
   embedder?: IEmbedder;
}

/**
 * Resolved source information after detecting and preparing the source.
 */
export interface ResolvedSource {

   /** Path to the source files (local path or temp directory for git) */
   sourcePath: string;

   /** Whether the source is from a git repository */
   isGit: boolean;

   /** Source provenance information */
   provenance: SourceProvenance;

   /** Git result (if source was a git URL) */
   gitResult?: GitSourceResult;

   /** Include patterns (may be modified for git path filtering) */
   includePatterns?: string[];
}

/**
 * Result of embedding generation.
 */
export interface EmbeddingResult {

   /** Generated embeddings */
   embeddings: Float32Array[];

   /** Time spent embedding in seconds */
   duration: number;

   /** Chunks processed per second */
   chunksPerSecond: number;
}

/**
 * Options for creating the library database.
 */
export interface CreateLibraryOptions {

   /** Path for the output file */
   outputPath: string;

   /** Chunks to store */
   chunks: Chunk[];

   /** Embeddings for chunks */
   embeddings: Float32Array[];

   /** Library name */
   libraryName: string;

   /** Library version */
   libraryVersion: string;

   /** Chunk size used */
   chunkSize: number;

   /** Chunk overlap used */
   chunkOverlap: number;

   /** Source provenance */
   provenance: SourceProvenance;

   /** Build options */
   buildOptions: BuildOptions;
}

/**
 * Builder class for creating .libragen libraries from source files.
 *
 * @example
 * ```typescript
 * const builder = new Builder();
 * const result = await builder.build('./my-docs', {
 *    name: 'my-library',
 *    description: 'My documentation library',
 * });
 * console.log(`Built library at ${result.outputPath}`);
 * ```
 */
export class Builder {

   private readonly _config: BuilderConfig;

   /**
    * Create a new Builder instance.
    *
    * @param config - Optional configuration
    */
   public constructor(config?: BuilderConfig) {
      this._config = config ?? {};
   }

   /**
    * Build a .libragen library from source files.
    *
    * Phases run in this order: resolve source, chunk, load the embedding model,
    * generate embeddings, write the database. Any git temp directory is cleaned up
    * and an internally created embedder is disposed even when the build fails.
    *
    * @param source - Source directory, file path, or git URL
    * @param options - Build options
    * @param onProgress - Optional progress callback
    * @returns Build result with output path and metadata
    * @throws Error if build fails
    */
   public async build(
      source: string,
      options?: BuildOptions,
      onProgress?: BuildProgressCallback
   ): Promise<BuildResult> {
      const opts = options ?? {};
      const progress = onProgress ?? ((): void => {});
      const gitSource = new GitSource();

      let resolved: ResolvedSource | undefined;

      try {
         // Phase 1: Resolve source
         progress({ phase: 'initializing', progress: 5, message: 'Initializing...' });
         resolved = await this._resolveSource(source, opts, gitSource, progress);

         // Derive library name and version. `||` (not `??`) is used so that an empty
         // string also falls back to the derived default.
         const libraryName = opts.name || (resolved.isGit
            ? deriveGitLibraryName(parseGitUrl(source).repoUrl)
            : path.basename(resolved.sourcePath));
         const libraryVersion = opts.version || '0.1.0';

         // Resolve output path
         const outputPath = await this._resolveOutputPath(opts.output, libraryName, libraryVersion);

         // Phase 2: Chunk source files BEFORE loading model
         // This ensures we fail fast on empty directories without loading the model
         const chunkSize = opts.chunkSize ?? 1000,
               chunkOverlap = opts.chunkOverlap ?? 100;

         const chunks = await this._chunkSource(resolved, opts, chunkSize, chunkOverlap, progress);

         // Phase 3: Initialize embedder (only after we know we have content)
         progress({ phase: 'loading-model', progress: 20, message: 'Loading embedding model...' });

         const embedder = this._config.embedder ?? new Embedder();
         // Only dispose an embedder this method created; a caller-supplied one is
         // owned by the caller.
         const disposeEmbedder = !this._config.embedder;

         try {
            await embedder.initialize();

            // Phase 4: Generate embeddings
            const embeddingResult = await this._generateEmbeddings(chunks, embedder, progress);

            // Phase 5: Create library database
            progress({ phase: 'creating-database', progress: 90, message: 'Creating library database...' });

            const metadata = await this._createLibrary({
               outputPath,
               chunks,
               embeddings: embeddingResult.embeddings,
               libraryName,
               libraryVersion,
               chunkSize,
               chunkOverlap,
               provenance: resolved.provenance,
               buildOptions: opts,
            });

            // Get final file size
            const fileStats = await fs.stat(outputPath);

            progress({ phase: 'complete', progress: 100, message: 'Complete' });

            // Build result
            const result: BuildResult = {
               outputPath: path.resolve(outputPath),
               metadata,
               stats: {
                  chunkCount: chunks.length,
                  sourceCount: metadata.stats.sourceCount,
                  fileSize: fileStats.size,
                  embedDuration: embeddingResult.duration,
                  chunksPerSecond: embeddingResult.chunksPerSecond,
               },
            };

            if (resolved.gitResult) {
               result.git = {
                  commitHash: resolved.gitResult.commitHash,
                  ref: resolved.gitResult.ref,
                  detectedLicense: resolved.gitResult.detectedLicense,
               };
            }

            return result;
         } finally {
            if (disposeEmbedder) {
               await embedder.dispose();
            }
         }
      } finally {
         // Clean up git temp directory
         if (resolved?.gitResult?.tempDir) {
            await gitSource.cleanup(resolved.gitResult.tempDir);
         }
      }
   }

   /**
    * Resolve and prepare the source for building.
    * Handles both local paths and git URLs.
    *
    * @param source - Source directory, file path, or git URL
    * @param opts - Build options
    * @param gitSource - GitSource instance for cloning
    * @param progress - Progress callback
    * @returns Resolved source information
    */
   protected async _resolveSource(
      source: string,
      opts: BuildOptions,
      gitSource: GitSource,
      progress: BuildProgressCallback
   ): Promise<ResolvedSource> {
      if (isGitUrl(source)) {
         return this._resolveGitSource(source, opts, gitSource, progress);
      }

      return this._resolveLocalSource(source, opts, progress);
   }

   /**
    * Resolve a git URL source.
    *
    * Fetches the repository files through `gitSource` and records provenance
    * (URL, ref, commit hash, licenses). An explicit `opts.gitRef` takes precedence
    * over a ref embedded in the URL.
    *
    * @param source - Git URL, optionally carrying a ref and sub-path
    * @param opts - Build options
    * @param gitSource - GitSource instance used to fetch files
    * @param progress - Progress callback
    * @returns Resolved source information including the git result
    */
   protected async _resolveGitSource(
      source: string,
      opts: BuildOptions,
      gitSource: GitSource,
      progress: BuildProgressCallback
   ): Promise<ResolvedSource> {
      progress({ phase: 'cloning', progress: 10, message: 'Cloning repository...' });

      const parsed = parseGitUrl(source),
            resolvedRef = opts.gitRef || parsed.ref,
            resolvedToken = getAuthToken(parsed.repoUrl, opts.gitRepoAuthToken);

      // If URL contains a path, prepend it to include patterns
      let includePatterns = opts.include;

      if (parsed.path) {
         // Heuristic: a trailing slash or no dot in the path means "directory"
         const looksLikeDirectory = parsed.path.endsWith('/') || !parsed.path.includes('.');

         // Strip trailing slashes so "docs/" becomes "docs/**" rather than "docs//**"
         const pathPattern = looksLikeDirectory
            ? `${parsed.path.replace(/\/+$/, '')}/**`
            : parsed.path;

         includePatterns = includePatterns ? [ pathPattern, ...includePatterns ] : [ pathPattern ];
      }

      // Clone the repository
      const gitResult = await gitSource.getFiles({
         url: parsed.repoUrl,
         ref: resolvedRef,
         token: resolvedToken,
         depth: 1,
         patterns: includePatterns,
         ignore: opts.exclude,
         useDefaultIgnore: !opts.noDefaultExcludes,
      });

      // Determine licenses: explicit > auto-detected
      const detectedIdentifier = gitResult.detectedLicense?.identifier;

      const licenses = opts.license ?? (
         detectedIdentifier && detectedIdentifier !== 'Unknown'
            ? [ detectedIdentifier ]
            : undefined
      );

      return {
         sourcePath: gitResult.tempDir || parsed.repoUrl,
         isGit: true,
         provenance: {
            type: 'git',
            url: source,
            ref: gitResult.ref,
            commitHash: gitResult.commitHash,
            licenses,
         },
         gitResult,
         includePatterns,
      };
   }

   /**
    * Resolve a local file or directory source.
    *
    * @param source - Local file or directory path (resolved to an absolute path)
    * @param opts - Build options
    * @param progress - Progress callback
    * @returns Resolved source information
    * @throws Error if the path is neither a regular file nor a directory; `fs.stat`
    * rejects if the path does not exist
    */
   protected async _resolveLocalSource(
      source: string,
      opts: BuildOptions,
      progress: BuildProgressCallback
   ): Promise<ResolvedSource> {
      progress({ phase: 'initializing', progress: 10, message: 'Reading source...' });

      const sourcePath = path.resolve(source);
      const stats = await fs.stat(sourcePath);

      if (!stats.isDirectory() && !stats.isFile()) {
         throw new Error(`${source} is not a valid file or directory`);
      }

      return {
         sourcePath,
         isGit: false,
         provenance: {
            type: 'local',
            path: sourcePath,
            licenses: opts.license,
         },
         includePatterns: opts.include,
      };
   }

   /**
    * Chunk source files into smaller pieces for embedding.
    * Uses AST-aware chunking for supported code files (unless disabled), falling back to
    * text-based chunking for unsupported files.
    *
    * @param resolved - Resolved source information
    * @param opts - Build options
    * @param chunkSize - Target chunk size
    * @param chunkOverlap - Chunk overlap
    * @param progress - Progress callback
    * @returns Array of chunks
    * @throws Error if no chunks were produced
    */
   protected async _chunkSource(
      resolved: ResolvedSource,
      opts: BuildOptions,
      chunkSize: number,
      chunkOverlap: number,
      progress: BuildProgressCallback
   ): Promise<Chunk[]> {
      // build() runs chunking before the 'loading-model' phase (reported at 20%), so
      // this is reported below that to keep the percentage monotonic.
      progress({ phase: 'chunking', progress: 15, message: 'Chunking source files...' });

      const useAstChunking = !opts.noAstChunking,
            contextMode = opts.contextMode ?? 'full';

      const textChunker = new Chunker({ chunkSize, chunkOverlap });
      const codeChunker = useAstChunking
         ? new CodeChunker({ maxChunkSize: chunkSize, contextMode })
         : null;

      let chunks: Chunk[];

      if (resolved.gitResult) {
         // Git sources already carry file contents in memory
         chunks = await this._chunkSourceFiles(resolved.gitResult.files, textChunker, codeChunker);
      } else {
         const stats = await fs.stat(resolved.sourcePath);

         if (stats.isDirectory()) {
            chunks = await textChunker.chunkDirectory(resolved.sourcePath, {
               patterns: resolved.includePatterns,
               ignore: opts.exclude,
               useDefaultIgnore: !opts.noDefaultExcludes,
            });

            // If AST chunking is enabled, re-chunk code files with CodeChunker
            if (codeChunker) {
               chunks = await this._enhanceChunksWithAst(chunks, codeChunker);
            }
         } else if (codeChunker && CodeChunker.isSupported(resolved.sourcePath)) {
            // Single file with AST chunking
            const astChunks = await codeChunker.tryChunkText(
               await fs.readFile(resolved.sourcePath, 'utf-8'),
               resolved.sourcePath
            );

            chunks = astChunks ?? await textChunker.chunkFile(resolved.sourcePath);
         } else {
            // Single file without AST chunking
            chunks = await textChunker.chunkFile(resolved.sourcePath);
         }
      }

      if (chunks.length === 0) {
         throw new Error('No content found to index. Check your source path and include/exclude patterns.');
      }

      return chunks;
   }

   /**
    * Generate embeddings for chunks.
    *
    * Chunks are embedded in batches of 50; a progress update is emitted after each
    * batch, moving from 40% up to 85% once the final batch completes.
    *
    * @param chunks - Chunks to embed
    * @param embedder - Embedder instance (implements IEmbedder)
    * @param progress - Progress callback
    * @returns Embedding result with embeddings and timing info
    */
   protected async _generateEmbeddings(
      chunks: Chunk[],
      embedder: IEmbedder,
      progress: BuildProgressCallback
   ): Promise<EmbeddingResult> {
      progress({
         phase: 'embedding',
         progress: 40,
         message: `Generating embeddings (${chunks.length} chunks)...`,
         current: 0,
         total: chunks.length,
      });

      const startTime = Date.now();

      // Use embeddingContent when available, fall back to raw content
      const contents = chunks.map((c) => {
         return c.embeddingContent ?? c.content;
      });

      const batchSize = 50;
      const embeddings: Float32Array[] = [];

      for (let i = 0; i < contents.length; i += batchSize) {
         const batch = contents.slice(i, i + batchSize);
         const batchEmbeddings = await embedder.embedBatch(batch);

         for (const emb of batchEmbeddings) {
            embeddings.push(emb);
         }

         // Progress from 40% to 85%, based on how many items are done so that the
         // percentage agrees with `current` and reaches 85% after the last batch.
         const processed = Math.min(i + batchSize, contents.length);
         const progressPct = 40 + Math.round((processed / contents.length) * 45);

         progress({
            phase: 'embedding',
            progress: progressPct,
            message: `Generating embeddings (${processed}/${contents.length})...`,
            current: processed,
            total: contents.length,
         });
      }

      const duration = (Date.now() - startTime) / 1000;

      // Date.now() has millisecond resolution, so a very fast run can measure 0s.
      // Clamp the divisor to 1ms so the rate is always a finite number instead of
      // Infinity (or NaN when there are no chunks).
      const chunksPerSecond = Math.round(chunks.length / Math.max(duration, 0.001));

      return {
         embeddings,
         duration,
         chunksPerSecond,
      };
   }

   /**
    * Create the library database file.
    *
    * Writes chunks, embeddings, the schema version and library metadata to a
    * VectorStore at `outputPath`. The store is closed whether or not writing
    * succeeds.
    *
    * @param options - Options for creating the library
    * @returns Library metadata
    */
   protected async _createLibrary(options: CreateLibraryOptions): Promise<LibraryMetadata> {
      const {
         outputPath,
         chunks,
         embeddings,
         libraryName,
         libraryVersion,
         chunkSize,
         chunkOverlap,
         provenance,
         buildOptions: opts,
      } = options;

      const store = new VectorStore(outputPath);

      try {
         store.initialize();
         store.addChunks(chunks, embeddings);

         // Calculate content hash
         const allContent = chunks.map((c) => {
            return c.content;
         }).join('');

         const contentHash = createHash('sha256').update(allContent).digest('hex');

         // Set schema version
         store.setMeta('schema_version', String(CURRENT_SCHEMA_VERSION));

         // Build metadata
         const metadata: LibraryMetadata = {
            name: libraryName,
            version: libraryVersion,
            schemaVersion: CURRENT_SCHEMA_VERSION,
            contentVersion: opts.contentVersion,
            description: opts.description,
            agentDescription: opts.agentDescription,
            exampleQueries: opts.exampleQueries,
            keywords: opts.keywords,
            programmingLanguages: opts.programmingLanguages,
            textLanguages: opts.textLanguages,
            frameworks: opts.frameworks,
            createdAt: new Date().toISOString(),
            // NOTE(review): model name and dimensions are hard-coded here. When a
            // custom embedder is supplied via BuilderConfig these values may not
            // describe the embeddings actually stored - confirm against IEmbedder.
            embedding: {
               model: 'Xenova/bge-small-en-v1.5',
               dimensions: 384,
            },
            chunking: {
               strategy: opts.noAstChunking ? 'recursive' : 'ast+recursive',
               chunkSize,
               chunkOverlap,
            },
            stats: {
               chunkCount: chunks.length,
               sourceCount: new Set(chunks.map((c) => {
                  return c.metadata.sourceFile;
               })).size,
               // Written as 0 here; build() measures the finished file with fs.stat
               // and reports the real size in BuildResult.stats.fileSize.
               fileSize: 0,
            },
            contentHash: `sha256:${contentHash}`,
            source: provenance,
         };

         store.setMetadata(metadata);

         return metadata;
      } finally {
         // Always release the store, including when a write above throws
         store.close();
      }
   }

   /**
    * Resolve the output path for the library file.
    *
    * - No `output`: returns `<name>-<version>.libragen` (relative path).
    * - `output` ending in `.libragen`: treated as a file path; its parent directory
    *   is created if needed.
    * - Anything else: treated as a directory, which is created if needed, and the
    *   default filename is placed inside it.
    *
    * @param output - Explicit output path (optional)
    * @param libraryName - Library name
    * @param libraryVersion - Library version
    * @returns Resolved output path
    */
   protected async _resolveOutputPath(
      output: string | undefined,
      libraryName: string,
      libraryVersion: string
   ): Promise<string> {
      const defaultFilename = `${libraryName}-${libraryVersion}.libragen`;

      if (!output) {
         return defaultFilename;
      }

      if (output.endsWith('.libragen')) {
         await fs.mkdir(path.dirname(path.resolve(output)), { recursive: true });
         return output;
      }

      await fs.mkdir(output, { recursive: true });

      return path.join(output, defaultFilename);
   }

   /**
    * Chunk source files using AST chunking for supported files, falling back to text
    * chunking.
    *
    * @param files - In-memory files (relative path plus content)
    * @param textChunker - Text chunker used as the fallback
    * @param codeChunker - AST chunker, or null when AST chunking is disabled
    * @returns Chunks from every file that could be chunked
    */
   private async _chunkSourceFiles(
      files: { relativePath: string; content: string }[],
      textChunker: Chunker,
      codeChunker: CodeChunker | null
   ): Promise<Chunk[]> {
      const allChunks: Chunk[] = [];

      for (const file of files) {
         try {
            let chunks: Chunk[] | null = null;

            // Try AST chunking first for supported files
            if (codeChunker && CodeChunker.isSupported(file.relativePath)) {
               chunks = await codeChunker.tryChunkText(file.content, file.relativePath);
            }

            // Fall back to text chunking
            if (!chunks) {
               chunks = await textChunker.chunkText(file.content, file.relativePath);
            }

            allChunks.push(...chunks);
         } catch(_e) {
            // Deliberate best-effort: skip files that can't be chunked
            continue;
         }
      }

      return allChunks;
   }

   /**
    * Re-chunk code files in an existing chunk array using AST chunking.
    * Groups chunks by source file, re-chunks supported files, and merges results.
    *
    * NOTE(review): file content is rebuilt by joining the text chunks with newlines.
    * If the text chunker emits overlapping chunks, the rebuilt content presumably
    * repeats the overlap regions - confirm against Chunker before relying on exact
    * line positions of the AST chunks.
    *
    * @param chunks - Chunks produced by the text chunker
    * @param codeChunker - AST chunker to apply to supported files
    * @returns Chunks with supported files replaced by their AST chunks
    */
   private async _enhanceChunksWithAst(
      chunks: Chunk[],
      codeChunker: CodeChunker
   ): Promise<Chunk[]> {
      // Group chunks by source file
      const chunksByFile = new Map<string, Chunk[]>();

      for (const chunk of chunks) {
         const file = chunk.metadata.sourceFile,
               existing = chunksByFile.get(file) ?? [];

         existing.push(chunk);
         chunksByFile.set(file, existing);
      }

      const result: Chunk[] = [];

      for (const [ file, fileChunks ] of chunksByFile) {
         // If file supports AST chunking, reconstruct content and re-chunk
         if (CodeChunker.isSupported(file)) {
            // Reconstruct file content from chunks (sorted by line number)
            const sortedChunks = [ ...fileChunks ].sort((a, b) => {
               return (a.metadata.startLine ?? 0) - (b.metadata.startLine ?? 0);
            });

            const content = sortedChunks.map((c) => {
               return c.content;
            }).join('\n');

            const astChunks = await codeChunker.tryChunkText(content, file);

            if (astChunks) {
               result.push(...astChunks);
            } else {
               // AST chunking failed, keep original chunks
               result.push(...fileChunks);
            }
         } else {
            // Keep original chunks for non-code files
            result.push(...fileChunks);
         }
      }

      return result;
   }

}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/libragen/libragen'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

builder.ts•23.7 KiB