Local RAG

Overview Schema Related Servers Score Discussions

index.ts•12 KiB

// DocumentParser implementation with PDF/DOCX/TXT/MD support

import { statSync } from 'node:fs'
import { lstat, readFile, realpath } from 'node:fs/promises'
import { basename, extname, isAbsolute, resolve, sep } from 'node:path'
import mammoth from 'mammoth'
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'
import type { TextItem } from 'pdfjs-dist/types/src/display/api'
import { SemanticChunker } from '../chunker/index.js'
import { type EmbedderInterface, type PageData, filterPageBoundarySentences } from './pdf-filter.js'
import {
  extractDocxTitle,
  extractMarkdownTitle,
  extractPdfTitle,
  extractTxtTitle,
} from './title-extractor.js'

// ============================================
// Supported Extensions
// ============================================

/**
 * File extensions supported by the parser module (parseFile + parsePdf).
 * Exported so other modules (e.g. list_files) stay in sync automatically
 * when new formats are added here.
 */
export const SUPPORTED_EXTENSIONS = new Set(['.pdf', '.docx', '.txt', '.md'])

// ============================================
// Type Definitions
// ============================================

/**
 * Result from parsing a document, containing both content and extracted title.
 * Title is display-only metadata (NOT used for search scoring).
 */
export interface ParseResult {
  content: string
  title: string
}

/**
 * DocumentParser configuration
 */
interface ParserConfig {
  /** Security: allowed base directory */
  baseDir: string
  /** Maximum file size (100MB) */
  maxFileSize: number
}

/**
 * Validation error (equivalent to 400)
 */
export class ValidationError extends Error {
  constructor(
    message: string,
    public override readonly cause?: Error
  ) {
    super(message)
    this.name = 'ValidationError'
  }
}

/**
 * File operation error (equivalent to 500)
 */
export class FileOperationError extends Error {
  constructor(
    message: string,
    public override readonly cause?: Error
  ) {
    super(message)
    this.name = 'FileOperationError'
  }
}

/**
 * Normalize a caught value into an Error.
 *
 * Catch variables are `unknown`; anything can be thrown. Values that are not
 * Error instances are wrapped so the `cause` attached to our error classes is
 * always a real Error rather than an unchecked cast.
 */
function toError(value: unknown): Error {
  return value instanceof Error ? value : new Error(String(value))
}

// ============================================
// DocumentParser Class
// ============================================

/**
 * Document parser class (PDF/DOCX/TXT/MD support)
 *
 * Responsibilities:
 * - File path validation (path traversal prevention)
 * - File size validation (100MB limit)
 * - Parse 4 formats (PDF/DOCX/TXT/MD)
 */
export class DocumentParser {
  private readonly config: ParserConfig

  /** Lazily cached realpath of baseDir. Assumes baseDir is stable for the process lifetime. */
  private resolvedBaseDir: string | null = null

  constructor(config: ParserConfig) {
    this.config = config
  }

  /**
   * File path validation (Absolute path requirement + Path traversal prevention)
   *
   * @param filePath - File path to validate (must be absolute)
   * @throws ValidationError - When path is not absolute, is a broken symlink, or is outside BASE_DIR
   */
  async validateFilePath(filePath: string): Promise<void> {
    // Check if path is absolute (fast-fail without syscall)
    if (!isAbsolute(filePath)) {
      throw new ValidationError(
        `File path must be absolute path (received: ${filePath}). Please provide an absolute path within BASE_DIR.`
      )
    }

    // Lazily resolve and cache the real baseDir path (follows symlinks)
    if (!this.resolvedBaseDir) {
      const resolved = await realpath(resolve(this.config.baseDir))
      // Ensure trailing separator for safe prefix comparison
      // (otherwise "/base" would also match "/base-other/file")
      this.resolvedBaseDir = resolved.endsWith(sep) ? resolved : resolved + sep
    }

    // Resolve the real path of the file (follows symlinks)
    let resolvedPath: string
    try {
      resolvedPath = await realpath(filePath)
    } catch (error) {
      // realpath fails if path doesn't exist on filesystem.
      // Distinguish broken symlinks from genuinely non-existent paths:
      // - Broken symlink: lstat succeeds (symlink entry exists) -> reject
      // - Non-existent path: lstat fails -> fall back to resolve() for validation
      const isSymlink = await lstat(filePath)
        .then((stats) => stats.isSymbolicLink())
        .catch(() => false)

      if (isSymlink) {
        throw new ValidationError(
          `Cannot resolve file path: ${filePath}. The file may not exist or is a broken symlink.`,
          toError(error)
        )
      }

      // File doesn't exist at all - fall back to resolve() for path validation.
      // Note: resolve() is string-based and cannot detect symlinked parent directories.
      // This is acceptable because non-existent files will fail at subsequent readFile/statSync.
      resolvedPath = resolve(filePath)
    }

    // Check if resolved path is within BASE_DIR
    if (!resolvedPath.startsWith(this.resolvedBaseDir)) {
      throw new ValidationError(
        `File path must be within BASE_DIR (${this.resolvedBaseDir}). Received path outside BASE_DIR: ${filePath}`
      )
    }
  }

  /**
   * File size validation (100MB limit)
   *
   * @param filePath - File path to validate
   * @throws ValidationError - When file size exceeds limit
   * @throws FileOperationError - When file read fails
   */
  validateFileSize(filePath: string): void {
    try {
      const stats = statSync(filePath)
      if (stats.size > this.config.maxFileSize) {
        throw new ValidationError(
          `File size exceeds limit: ${stats.size} > ${this.config.maxFileSize}`
        )
      }
    } catch (error) {
      // The size-limit error is raised inside the try; let it through untouched
      if (error instanceof ValidationError) {
        throw error
      }
      throw new FileOperationError(`Failed to check file size: ${filePath}`, toError(error))
    }
  }

  /**
   * File parsing (auto format detection)
   *
   * Handles DOCX/TXT/MD only. PDFs need an embedder and go through parsePdf;
   * a '.pdf' path passed here is rejected as an unsupported format.
   *
   * @param filePath - File path to parse
   * @returns ParseResult with content and extracted title
   * @throws ValidationError - Path traversal, size exceeded, unsupported format
   * @throws FileOperationError - File read failed, parse failed
   */
  async parseFile(filePath: string): Promise<ParseResult> {
    // Validation
    await this.validateFilePath(filePath)
    this.validateFileSize(filePath)

    // Format detection (PDF uses parsePdf directly)
    const ext = extname(filePath).toLowerCase()
    switch (ext) {
      case '.docx':
        return await this.parseDocx(filePath)
      case '.txt':
        return await this.parseTxt(filePath)
      case '.md':
        return await this.parseMd(filePath)
      default:
        throw new ValidationError(`Unsupported file format: ${ext}`)
    }
  }

  /**
   * PDF parsing with header/footer filtering
   *
   * Features:
   * - Extracts text with position information (x, y, fontSize)
   * - Semantic header/footer detection using embedding similarity
   * - Uses hasEOL for proper line break handling
   * - Extracts document title from PDF metadata and first page font heuristic
   * - Releases the pdf.js document/worker resources on both success and failure
   *
   * @param filePath - PDF file path
   * @param embedder - Embedder for semantic header/footer detection
   * @returns ParseResult with content and extracted title
   * @throws ValidationError - Path traversal, size exceeded
   * @throws FileOperationError - File read failed, parse failed
   */
  async parsePdf(filePath: string, embedder: EmbedderInterface): Promise<ParseResult> {
    // Validation
    await this.validateFilePath(filePath)
    this.validateFileSize(filePath)

    // Kept outside the try so the finally block can release it on any exit path
    let loadingTask: ReturnType<typeof getDocument> | undefined

    try {
      const buffer = await readFile(filePath)
      loadingTask = getDocument({
        data: new Uint8Array(buffer),
        useSystemFonts: true,
        isEvalSupported: false,
      })
      const pdf = await loadingTask.promise

      // Extract metadata for title extraction.
      // Narrow at runtime: the info dictionary is untyped and Title may be absent or non-string.
      const metadata = await pdf.getMetadata()
      const rawTitle = (metadata?.info as Record<string, unknown> | undefined)?.['Title']
      const metadataTitle = typeof rawTitle === 'string' ? rawTitle : undefined

      // Extract text with position information from each page
      const pages: PageData[] = []
      for (let i = 1; i <= pdf.numPages; i++) {
        const page = await pdf.getPage(i)
        const textContent = await page.getTextContent()
        const items = textContent.items
          .filter((item): item is TextItem => 'str' in item)
          .map((item) => ({
            text: item.str,
            x: item.transform[4],
            y: item.transform[5],
            fontSize: Math.abs(item.transform[0]),
            hasEOL: item.hasEOL ?? false,
          }))
        pages.push({ pageNum: i, items })
      }

      // Apply sentence-level header/footer filtering (returns per-page filtered text)
      // This handles variable content like page numbers ("7 of 75") using semantic similarity
      const filteredPages = await filterPageBoundarySentences(pages, embedder)
      const text = filteredPages.filter((t) => t.length > 0).join('\n\n')

      // Extract title from filtered page 1 via semantic chunking
      // Isolated try-catch: title extraction failure should not abort PDF ingestion
      const fileName = basename(filePath)
      let firstPageChunkText: string | undefined
      try {
        const filteredPage1 = filteredPages[0]
        if (filteredPage1 && filteredPage1.trim().length > 0) {
          const chunker = new SemanticChunker()
          const page1Chunks = await chunker.chunkText(filteredPage1, embedder)
          if (page1Chunks.length > 0) {
            firstPageChunkText = (page1Chunks[0] as { text: string }).text
          }
        }
      } catch (titleError) {
        console.error(`Title extraction failed, falling back to filename: ${titleError}`)
      }

      const titleResult = extractPdfTitle(metadataTitle, firstPageChunkText, fileName)

      console.error(`Parsed PDF: ${filePath} (${text.length} characters, ${pdf.numPages} pages)`)
      return { content: text, title: titleResult.title }
    } catch (error) {
      throw new FileOperationError(`Failed to parse PDF: ${filePath}`, toError(error))
    } finally {
      // Destroying the loading task also tears down the loaded document and its worker.
      // Best-effort: a cleanup failure is logged and must not mask the parse result or error.
      if (loadingTask) {
        try {
          await loadingTask.destroy()
        } catch (cleanupError) {
          console.error(`Failed to release PDF resources for ${filePath}: ${cleanupError}`)
        }
      }
    }
  }

  /**
   * DOCX parsing (using mammoth)
   *
   * Uses extractRawText for content and convertToHtml additionally for title detection.
   *
   * @param filePath - DOCX file path
   * @returns ParseResult with content and extracted title
   * @throws FileOperationError - File read failed, parse failed
   */
  private async parseDocx(filePath: string): Promise<ParseResult> {
    try {
      // Read file once and pass buffer to both mammoth calls
      const buffer = await readFile(filePath)

      // Use extractRawText for content (unchanged behavior)
      const result = await mammoth.extractRawText({ buffer })
      const rawText = result.value

      // Use convertToHtml additionally for title extraction (first <h1>)
      const htmlResult = await mammoth.convertToHtml({ buffer })
      const fileName = basename(filePath)
      const titleResult = extractDocxTitle(htmlResult.value, fileName)

      console.error(`Parsed DOCX: ${filePath} (${rawText.length} characters)`)
      return { content: rawText, title: titleResult.title }
    } catch (error) {
      throw new FileOperationError(`Failed to parse DOCX: ${filePath}`, toError(error))
    }
  }

  /**
   * TXT parsing (using fs.readFile)
   *
   * @param filePath - TXT file path
   * @returns ParseResult with content and extracted title
   * @throws FileOperationError - File read failed
   */
  private async parseTxt(filePath: string): Promise<ParseResult> {
    try {
      const text = await readFile(filePath, 'utf-8')
      const fileName = basename(filePath)
      const titleResult = extractTxtTitle(text, fileName)

      console.error(`Parsed TXT: ${filePath} (${text.length} characters)`)
      return { content: text, title: titleResult.title }
    } catch (error) {
      throw new FileOperationError(`Failed to parse TXT: ${filePath}`, toError(error))
    }
  }

  /**
   * MD parsing (using fs.readFile)
   *
   * @param filePath - MD file path
   * @returns ParseResult with content and extracted title
   * @throws FileOperationError - File read failed
   */
  private async parseMd(filePath: string): Promise<ParseResult> {
    try {
      const text = await readFile(filePath, 'utf-8')
      const fileName = basename(filePath)
      const titleResult = extractMarkdownTitle(text, fileName)

      console.error(`Parsed MD: ${filePath} (${text.length} characters)`)
      return { content: text, title: titleResult.title }
    } catch (error) {
      throw new FileOperationError(`Failed to parse MD: ${filePath}`, toError(error))
    }
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shinpr/mcp-local-rag'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

index.ts•12 KiB