Skip to main content
Glama
index.ts (6.11 kB)
// DocumentParser implementation with PDF/DOCX/TXT/MD support

import { statSync } from 'node:fs'
import { readFile } from 'node:fs/promises'
import { extname, isAbsolute, resolve, sep } from 'node:path'
import mammoth from 'mammoth'
import { PDFParse } from 'pdf-parse'

// ============================================
// Type Definitions
// ============================================

/**
 * DocumentParser configuration
 */
export interface ParserConfig {
  /** Security: the only directory tree files may be read from */
  baseDir: string
  /** Maximum allowed file size in bytes (compared against `fs.Stats.size`) */
  maxFileSize: number
}

/**
 * Validation error (equivalent to 400): the request itself is unacceptable
 * (bad path, file too large, unsupported extension).
 */
export class ValidationError extends Error {
  constructor(
    message: string,
    public readonly cause?: Error
  ) {
    super(message)
    this.name = 'ValidationError'
  }
}

/**
 * File operation error (equivalent to 500): the request was acceptable but
 * reading or parsing the file failed. The underlying error is kept in `cause`.
 */
export class FileOperationError extends Error {
  constructor(
    message: string,
    public readonly cause?: Error
  ) {
    super(message)
    this.name = 'FileOperationError'
  }
}

/**
 * Narrow an unknown thrown value to an `Error` so it can be stored as a `cause`
 * without an unchecked type assertion. Non-Error values are wrapped.
 */
function toError(value: unknown): Error {
  return value instanceof Error ? value : new Error(String(value))
}

// ============================================
// DocumentParser Class
// ============================================

/**
 * Document parser class (PDF/DOCX/TXT/MD support)
 *
 * Responsibilities:
 * - File path validation (absolute path + containment in `config.baseDir`)
 * - File size validation (against `config.maxFileSize`)
 * - Parse 4 formats (PDF/DOCX/TXT/MD), selected by file extension
 */
export class DocumentParser {
  private readonly config: ParserConfig

  constructor(config: ParserConfig) {
    this.config = config
  }

  /**
   * File path validation (Absolute path requirement + Path traversal prevention)
   *
   * Containment is checked on path-segment boundaries: the resolved path must
   * either equal the resolved base directory or start with it followed by a
   * path separator. A plain string-prefix test is not enough, because
   * `/data/docs-private/x` starts with `/data/docs` yet lies outside it.
   *
   * NOTE: the check is lexical (`path.resolve` only); symbolic links inside
   * the base directory are not resolved here.
   *
   * @param filePath - File path to validate (must be absolute)
   * @throws ValidationError - When path is not absolute or outside BASE_DIR
   */
  validateFilePath(filePath: string): void {
    // Check if path is absolute
    if (!isAbsolute(filePath)) {
      throw new ValidationError(
        `File path must be absolute path (received: ${filePath}). Please provide an absolute path within BASE_DIR.`
      )
    }

    // Check if path is within BASE_DIR
    const baseDir = resolve(this.config.baseDir)
    const normalizedPath = resolve(filePath)

    // A filesystem root (e.g. "/") already ends with the separator; every
    // other resolved directory does not, so append one before prefix-matching.
    const basePrefix = baseDir.endsWith(sep) ? baseDir : baseDir + sep
    const isInsideBase = normalizedPath === baseDir || normalizedPath.startsWith(basePrefix)

    if (!isInsideBase) {
      throw new ValidationError(
        `File path must be within BASE_DIR (${baseDir}). Received path outside BASE_DIR: ${filePath}`
      )
    }
  }

  /**
   * File size validation (limit taken from `config.maxFileSize`)
   *
   * @param filePath - File path to validate
   * @throws ValidationError - When file size exceeds limit
   * @throws FileOperationError - When the file cannot be stat'ed
   */
  validateFileSize(filePath: string): void {
    let size: number
    try {
      size = statSync(filePath).size
    } catch (error: unknown) {
      throw new FileOperationError(`Failed to check file size: ${filePath}`, toError(error))
    }

    // Compared outside the try block so the ValidationError is never caught
    // and re-wrapped by the stat failure handler above.
    if (size > this.config.maxFileSize) {
      throw new ValidationError(`File size exceeds limit: ${size} > ${this.config.maxFileSize}`)
    }
  }

  /**
   * File parsing (auto format detection)
   *
   * Validates the path and size first, then dispatches on the lower-cased
   * file extension.
   *
   * @param filePath - File path to parse
   * @returns Parsed text
   * @throws ValidationError - Path traversal, size exceeded, unsupported format
   * @throws FileOperationError - File read failed, parse failed
   */
  async parseFile(filePath: string): Promise<string> {
    // Validation
    this.validateFilePath(filePath)
    this.validateFileSize(filePath)

    // Format detection
    const ext = extname(filePath).toLowerCase()
    switch (ext) {
      case '.pdf':
        return await this.parsePdf(filePath)
      case '.docx':
        return await this.parseDocx(filePath)
      case '.txt':
        return await this.parseTxt(filePath)
      case '.md':
        return await this.parseMd(filePath)
      default:
        throw new ValidationError(`Unsupported file format: ${ext}`)
    }
  }

  /**
   * PDF parsing (using pdf-parse v2)
   *
   * @param filePath - PDF file path
   * @returns Parsed text
   * @throws FileOperationError - File read failed, parse failed
   */
  private async parsePdf(filePath: string): Promise<string> {
    try {
      const buffer = await readFile(filePath)
      const parser = new PDFParse({ data: buffer })
      const result = await parser.getText()
      console.error(`Parsed PDF: ${filePath} (${result.text.length} characters)`)
      return result.text
    } catch (error: unknown) {
      throw new FileOperationError(`Failed to parse PDF: ${filePath}`, toError(error))
    }
  }

  /**
   * DOCX parsing (using mammoth)
   *
   * @param filePath - DOCX file path
   * @returns Parsed text
   * @throws FileOperationError - File read failed, parse failed
   */
  private async parseDocx(filePath: string): Promise<string> {
    try {
      const result = await mammoth.extractRawText({ path: filePath })
      console.error(`Parsed DOCX: ${filePath} (${result.value.length} characters)`)
      return result.value
    } catch (error: unknown) {
      throw new FileOperationError(`Failed to parse DOCX: ${filePath}`, toError(error))
    }
  }

  /**
   * TXT parsing (using fs.readFile)
   *
   * @param filePath - TXT file path
   * @returns Parsed text
   * @throws FileOperationError - File read failed
   */
  private async parseTxt(filePath: string): Promise<string> {
    return await this.readUtf8Text(filePath, 'TXT')
  }

  /**
   * MD parsing (using fs.readFile)
   *
   * @param filePath - MD file path
   * @returns Parsed text
   * @throws FileOperationError - File read failed
   */
  private async parseMd(filePath: string): Promise<string> {
    return await this.readUtf8Text(filePath, 'MD')
  }

  /**
   * Shared reader for plain-text formats: reads the file as UTF-8, logs the
   * character count to stderr, and returns the content unchanged.
   *
   * @param filePath - File path to read
   * @param label - Format label used in the log and error messages
   * @returns File content decoded as UTF-8
   * @throws FileOperationError - File read failed
   */
  private async readUtf8Text(filePath: string, label: 'TXT' | 'MD'): Promise<string> {
    try {
      const text = await readFile(filePath, 'utf-8')
      console.error(`Parsed ${label}: ${filePath} (${text.length} characters)`)
      return text
    } catch (error: unknown) {
      throw new FileOperationError(`Failed to parse ${label}: ${filePath}`, toError(error))
    }
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shinpr/mcp-local-rag'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.