M.I.M.I.R - Multi-agent Intelligent Memory & Insight Repository

Overview Schema Related Servers Score Discussions

Mimir
src
indexing

DocumentParser.ts•5.55 KiB

/**
 * DocumentParser - Extracts text content from binary document formats
 * Supports PDF and DOCX files for indexing and embedding generation
 */

import * as mammoth from 'mammoth';

export class DocumentParser {
  /**
   * Extract plain text from PDF or DOCX files for indexing
   *
   * Dispatches on the file extension to the matching private extractor.
   * The extension is compared case-insensitively, so every extension that
   * `isSupportedFormat()` accepts (e.g. `.PDF`, `.Docx`) is also parsed here.
   *
   * Supported Formats:
   * - **PDF**: Uses pdf-parse library for text extraction
   * - **DOCX**: Uses mammoth library for text extraction
   *
   * Any failure (unsupported extension, parser error, or a document that
   * yields only whitespace) is rethrown as a new `Error` whose message is
   * prefixed with `Failed to extract text from <extension>: `.
   *
   * @param buffer - File content as Buffer
   * @param extension - File extension including the leading dot (.pdf, .docx); case-insensitive
   * @returns Extracted plain text content (never empty or whitespace-only)
   * @throws {Error} If format is unsupported, extraction fails, or no text is found
   *
   * @example
   * // Extract text from PDF file
   * const parser = new DocumentParser();
   * const pdfBuffer = await fs.readFile('/path/to/document.pdf');
   * const text = await parser.extractText(pdfBuffer, '.pdf');
   * console.log('Extracted', text.length, 'characters');
   * console.log('First 100 chars:', text.substring(0, 100));
   *
   * @example
   * // Extract text from DOCX file
   * const docxBuffer = await fs.readFile('/path/to/document.docx');
   * const text = await parser.extractText(docxBuffer, '.docx');
   * console.log('Document text:', text);
   *
   * @example
   * // Handle extraction errors (an empty document is reported as an error,
   * // not as an empty string)
   * try {
   *   const buffer = await fs.readFile('/path/to/doc.pdf');
   *   const text = await parser.extractText(buffer, '.pdf');
   *   console.log('Extracted', text.length, 'characters');
   * } catch (error) {
   *   const message = error instanceof Error ? error.message : String(error);
   *   if (message.includes('no extractable text')) {
   *     console.log('PDF has no text layer to extract');
   *   } else {
   *     console.error('Extraction failed:', message);
   *   }
   * }
   *
   * @example
   * // Use in file indexing pipeline
   * const files = await glob('docs/*.{pdf,docx}');
   * for (const file of files) {
   *   const buffer = await fs.readFile(file);
   *   const ext = path.extname(file);
   *   const text = await parser.extractText(buffer, ext);
   *   await indexDocument(file, text);
   * }
   */
  async extractText(buffer: Buffer, extension: string): Promise<string> {
    // Normalize case so this method agrees with isSupportedFormat(), which
    // lowercases before checking. Error messages still report the extension
    // exactly as the caller supplied it.
    const normalizedExtension = extension.toLowerCase();

    try {
      if (normalizedExtension === '.pdf') {
        return await this.extractPdfText(buffer);
      } else if (normalizedExtension === '.docx') {
        return await this.extractDocxText(buffer);
      } else {
        throw new Error(`Unsupported document format: ${extension}`);
      }
    } catch (error) {
      throw new Error(
        `Failed to extract text from ${extension}: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Extract text from PDF using pdf-parse
   *
   * The pdf-parse module is loaded lazily on first use. The parser instance
   * is always destroyed afterwards, whether extraction succeeds or fails,
   * so the resources it holds for the loaded document are released.
   *
   * @param buffer PDF file buffer
   * @returns Extracted text content
   * @throws {Error} If the PDF yields no text (empty or whitespace-only)
   */
  private async extractPdfText(buffer: Buffer): Promise<string> {
    // pdf-parse exports PDFParse as a named export
    const { PDFParse } = await import('pdf-parse');
    const parser = new PDFParse({ data: buffer });

    try {
      // getText() returns TextResult object with text property
      const textResult = await parser.getText();

      if (!textResult.text || textResult.text.trim().length === 0) {
        throw new Error('PDF contains no extractable text content');
      }

      return textResult.text;
    } finally {
      // Release the parser's document resources on every exit path.
      await parser.destroy();
    }
  }

  /**
   * Extract text from DOCX using mammoth
   *
   * Any messages mammoth reports alongside the text are logged with
   * `console.warn`; they do not cause the extraction to fail.
   *
   * @param buffer DOCX file buffer
   * @returns Extracted text content
   * @throws {Error} If the DOCX yields no text (empty or whitespace-only)
   */
  private async extractDocxText(buffer: Buffer): Promise<string> {
    // mammoth returns { value: string, messages: [] }
    const result = await mammoth.extractRawText({ buffer });

    if (!result.value || result.value.trim().length === 0) {
      throw new Error('DOCX contains no extractable text content');
    }

    // Log warnings if any (e.g., unsupported features)
    if (result.messages && result.messages.length > 0) {
      console.warn('DOCX extraction warnings:', result.messages);
    }

    return result.value;
  }

  /**
   * Check if a file extension is supported for document parsing
   *
   * Tests whether the parser can extract text from files with the given
   * extension. The comparison is case-insensitive. Use this before
   * attempting extraction to avoid errors.
   *
   * @param extension - File extension including the leading dot (e.g., '.pdf', '.docx')
   * @returns true if format is supported, false otherwise
   *
   * @example
   * // Check before parsing
   * const parser = new DocumentParser();
   * const file = '/path/to/document.pdf';
   * const ext = path.extname(file);
   *
   * if (parser.isSupportedFormat(ext)) {
   *   const buffer = await fs.readFile(file);
   *   const text = await parser.extractText(buffer, ext);
   *   console.log('Extracted:', text.length, 'chars');
   * } else {
   *   console.log('Unsupported format:', ext);
   * }
   *
   * @example
   * // Filter files by supported formats
   * const allFiles = await glob('documents/*.*');
   * const supportedFiles = allFiles.filter(file => {
   *   const ext = path.extname(file);
   *   return parser.isSupportedFormat(ext);
   * });
   * console.log('Can parse', supportedFiles.length, 'files');
   *
   * @example
   * // Build supported extensions list
   * const extensions = ['.pdf', '.docx', '.txt', '.md', '.doc'];
   * const supported = extensions.filter(ext => parser.isSupportedFormat(ext));
   * console.log('Supported:', supported.join(', '));
   * // Output: Supported: .pdf, .docx
   */
  isSupportedFormat(extension: string): boolean {
    return ['.pdf', '.docx'].includes(extension.toLowerCase());
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/orneryd/Mimir'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

DocumentParser.ts•5.55 KiB