Skip to main content
Glama
orneryd

M.I.M.I.R - Multi-agent Intelligent Memory & Insight Repository

by orneryd
DocumentParser.ts (5.68 kB)
/**
 * DocumentParser - Extracts text content from binary document formats
 * Supports PDF and DOCX files for indexing and embedding generation
 */

import * as mammoth from 'mammoth';

export class DocumentParser {
  /**
   * Extensions this parser can handle, in lowercase with the leading dot.
   * Shared by extractText() and isSupportedFormat() so the two cannot drift.
   */
  private static readonly SUPPORTED_EXTENSIONS: readonly string[] = ['.pdf', '.docx'];

  /**
   * Extract plain text from PDF or DOCX files for indexing
   *
   * Parses binary document formats and extracts readable text content.
   * The format is chosen from the extension, compared case-insensitively
   * (so '.PDF' and '.pdf' are treated the same, matching isSupportedFormat()).
   *
   * Supported Formats:
   * - **PDF**: Uses pdf-parse library for text extraction
   * - **DOCX**: Uses mammoth library for text extraction
   *
   * Every failure (unsupported extension, parser error, or a document with
   * no extractable text) is rethrown as an Error whose message is prefixed
   * with `Failed to extract text from <extension>: `. A successful call
   * therefore never returns an empty or whitespace-only string.
   *
   * @param buffer - File content as Buffer
   * @param extension - File extension including the dot (.pdf, .docx), any letter case
   * @returns Extracted plain text content (non-empty)
   * @throws {Error} If format is unsupported, extraction fails, or no text is found
   *
   * @example
   * // Extract text from PDF file
   * const parser = new DocumentParser();
   * const pdfBuffer = await fs.readFile('/path/to/document.pdf');
   * const text = await parser.extractText(pdfBuffer, '.pdf');
   * console.log('Extracted', text.length, 'characters');
   * console.log('First 100 chars:', text.substring(0, 100));
   *
   * @example
   * // Extract text from DOCX file
   * const docxBuffer = await fs.readFile('/path/to/document.docx');
   * const text = await parser.extractText(docxBuffer, '.docx');
   * console.log('Document text:', text);
   *
   * @example
   * // Handle extraction errors (empty documents throw; they do not return '')
   * try {
   *   const buffer = await fs.readFile('/path/to/doc.pdf');
   *   const text = await parser.extractText(buffer, '.pdf');
   *   console.log('Extracted', text.length, 'characters');
   * } catch (error) {
   *   const message = error instanceof Error ? error.message : String(error);
   *   if (message.includes('no extractable text')) {
   *     console.log('PDF is image-based or encrypted');
   *   } else {
   *     console.error('Extraction failed:', message);
   *   }
   * }
   *
   * @example
   * // Use in file indexing pipeline
   * const files = await glob('docs/*.{pdf,docx}');
   * for (const file of files) {
   *   const buffer = await fs.readFile(file);
   *   const ext = path.extname(file);
   *   const text = await parser.extractText(buffer, ext);
   *   await indexDocument(file, text);
   * }
   */
  async extractText(buffer: Buffer, extension: string): Promise<string> {
    // Dispatch on the lowercased extension so that anything isSupportedFormat()
    // accepts (e.g. '.PDF') is also accepted here. Error messages still echo
    // the caller's original string.
    const normalized = extension.toLowerCase();

    try {
      switch (normalized) {
        case '.pdf':
          return await this.extractPdfText(buffer);
        case '.docx':
          return await this.extractDocxText(buffer);
        default:
          throw new Error(`Unsupported document format: ${extension}`);
      }
    } catch (error) {
      throw new Error(
        `Failed to extract text from ${extension}: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Extract text from PDF using pdf-parse
   *
   * The library is loaded lazily via dynamic import, so it is only required
   * when a PDF is actually parsed.
   *
   * @param buffer PDF file buffer
   * @returns Extracted text content
   * @throws {Error} If the PDF yields no (or only whitespace) text
   */
  private async extractPdfText(buffer: Buffer): Promise<string> {
    // pdf-parse exports PDFParse as a named export
    const { PDFParse } = await import('pdf-parse');
    const parser = new PDFParse({ data: buffer });

    try {
      // getText() returns TextResult object with text property
      const textResult = await parser.getText();

      if (!textResult.text || textResult.text.trim().length === 0) {
        throw new Error('PDF contains no extractable text content');
      }

      return textResult.text;
    } finally {
      // Release the parser's underlying document resources on success and on
      // failure alike; otherwise each parsed PDF stays in memory.
      await parser.destroy();
    }
  }

  /**
   * Extract text from DOCX using mammoth
   *
   * Any messages mammoth reports alongside the text are logged with
   * console.warn; they do not cause the extraction to fail.
   *
   * @param buffer DOCX file buffer
   * @returns Extracted text content
   * @throws {Error} If the DOCX yields no (or only whitespace) text
   */
  private async extractDocxText(buffer: Buffer): Promise<string> {
    // mammoth returns { value: string, messages: [] }
    const result = await mammoth.extractRawText({ buffer });

    if (!result.value || result.value.trim().length === 0) {
      throw new Error('DOCX contains no extractable text content');
    }

    // Log warnings if any (e.g., unsupported features)
    if (result.messages && result.messages.length > 0) {
      console.warn('DOCX extraction warnings:', result.messages);
    }

    return result.value;
  }

  /**
   * Check if a file extension is supported for document parsing
   *
   * Tests whether the parser can extract text from files with the given
   * extension. The comparison is case-insensitive and expects the leading
   * dot. Use this before attempting extraction to avoid errors.
   *
   * @param extension - File extension (e.g., '.pdf', '.docx')
   * @returns true if format is supported, false otherwise
   *
   * @example
   * // Check before parsing
   * const parser = new DocumentParser();
   * const file = '/path/to/document.pdf';
   * const ext = path.extname(file);
   *
   * if (parser.isSupportedFormat(ext)) {
   *   const buffer = await fs.readFile(file);
   *   const text = await parser.extractText(buffer, ext);
   *   console.log('Extracted:', text.length, 'chars');
   * } else {
   *   console.log('Unsupported format:', ext);
   * }
   *
   * @example
   * // Filter files by supported formats
   * const allFiles = await glob('documents/*.*');
   * const supportedFiles = allFiles.filter(file => {
   *   const ext = path.extname(file);
   *   return parser.isSupportedFormat(ext);
   * });
   * console.log('Can parse', supportedFiles.length, 'files');
   *
   * @example
   * // Build supported extensions list
   * const extensions = ['.pdf', '.docx', '.txt', '.md', '.doc'];
   * const supported = extensions.filter(ext => parser.isSupportedFormat(ext));
   * console.log('Supported:', supported.join(', '));
   * // Output: Supported: .pdf, .docx
   */
  isSupportedFormat(extension: string): boolean {
    return DocumentParser.SUPPORTED_EXTENSIONS.includes(extension.toLowerCase());
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/orneryd/Mimir'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.