Skip to main content
Glama

ClaudeHopper

by Arborist-ai
pdf_processor.ts12.4 kB
/** * PDF Processing Utilities * * This module provides utilities for processing PDF files, including: * - Splitting large PDFs into smaller chunks * - Extracting key sections like title blocks and headers * - Extracting images (for Phase 2) */ import * as fs from 'fs'; import * as path from 'path'; import * as crypto from 'crypto'; // Import pdf-parse using dynamic import for ES Module compatibility import { createRequire } from 'module'; const require = createRequire(import.meta.url); const pdfParseLib = require('pdf-parse/lib/pdf-parse.js'); import { Ollama } from '@langchain/ollama'; import config from '../config'; // Create a wrapper for pdf-parse to handle common issues const pdfParse = async (dataBuffer: Buffer, options?: any) => { try { return await pdfParseLib(dataBuffer, options); } catch (error) { // Log the error for debugging console.error('Error parsing PDF:', error); // Return a simplified result to allow processing to continue return { numpages: 1, text: "Error extracting PDF text. The PDF may be corrupt, password-protected, or in an unsupported format.", info: {} }; } }; // Interface for processed PDF section export interface PdfSection { content: string; pageNumber: number; sectionType: 'title_block' | 'header' | 'footer' | 'content' | 'full_page'; metadata: { source: string; hash: string; pageCount: number; [key: string]: any; }; } // Interface for PDF splitting result export interface SplitPdfResult { originalPath: string; chunks: { path: string; startPage: number; endPage: number; title?: string; }[]; } /** * Checks if a PDF needs to be split based on size and page count * * @param pdfPath Path to the PDF file * @returns Boolean indicating if the PDF should be split */ export async function shouldSplitPdf(pdfPath: string): Promise<boolean> { // Check file size const stats = await fs.promises.stat(pdfPath); const fileSizeMb = stats.size / (1024 * 1024); // If file size exceeds the threshold, suggest splitting if (fileSizeMb > config.pdf.maxSizeMb) { return true; } // Check page count if size is under threshold try { const dataBuffer = await fs.promises.readFile(pdfPath); const pdfData = await pdfParse(dataBuffer, { max: 0 }); // Just get metadata // If page count exceeds threshold, suggest splitting if (pdfData.numpages > config.pdf.maxPagesPerChunk) { return true; } } catch (error) { console.error(`Error analyzing PDF ${pdfPath}:`, error); // If we can't analyze the PDF, err on the side of caution return true; } // PDF is within acceptable size limits return false; } /** * Extract key sections from a PDF for metadata analysis * * @param pdfPath Path to the PDF file * @returns Array of PDF sections containing key information */ export async function extractKeyPdfSections(pdfPath: string): Promise<PdfSection[]> { try { const dataBuffer = await fs.promises.readFile(pdfPath); const hash = crypto.createHash('sha256').update(dataBuffer).digest('hex'); // Parse the PDF - limit to first few pages for speed if configured const options = config.pdf.extractFirstPageOnly ? { max: 1 } : {}; const pdfData = await pdfParse(dataBuffer, options); // Create a results array to hold extracted sections const sections: PdfSection[] = []; // Always extract the first page - it typically contains the most important metadata sections.push({ content: pdfData.text, pageNumber: 1, sectionType: 'full_page', metadata: { source: pdfPath, hash, pageCount: pdfData.numpages } }); // For future enhancement: Add specific section detection // - Title block extraction // - Header/footer extraction // - Content area extraction return sections; } catch (error) { console.error(`Error extracting key sections from PDF ${pdfPath}:`, error); throw error; } } /** * Split a large PDF into smaller chunks based on bookmarks or page ranges * * @param pdfPath Path to the PDF file * @param outputDir Directory to store split files * @returns Information about the created chunks */ export async function splitPdfIntoChunks(pdfPath: string, outputDir: string): Promise<SplitPdfResult> { try { const result: SplitPdfResult = { originalPath: pdfPath, chunks: [] }; // Create output directory if it doesn't exist const fileName = path.basename(pdfPath, path.extname(pdfPath)); const splitDir = path.join(outputDir, `${fileName}_split`); await fs.promises.mkdir(splitDir, { recursive: true }); // For now, use simple page range splitting // In a future implementation, add bookmark-based and AI-based splitting await splitByPageRanges(pdfPath, splitDir, result); return result; } catch (error) { console.error(`Error splitting PDF ${pdfPath}:`, error); // Return original file path if we can't split return { originalPath: pdfPath, chunks: [] }; } } /** * Split PDF into chunks based on fixed page ranges * * @param pdfPath Path to the PDF file * @param outputDir Directory to store split files * @param result SplitPdfResult object to update */ async function splitByPageRanges( pdfPath: string, outputDir: string, result: SplitPdfResult ): Promise<void> { try { // Get page count of the PDF const dataBuffer = await fs.promises.readFile(pdfPath); const pdfData = await pdfParse(dataBuffer, { max: 0 }); // Just get metadata const pageCount = pdfData.numpages; // Use pdftk or similar tool to split the PDF // This is a placeholder - we need to implement actual PDF splitting // For now, just log the intented splits const pagesPerChunk = config.pdf.maxPagesPerChunk; const fileName = path.basename(pdfPath, path.extname(pdfPath)); for (let startPage = 1; startPage <= pageCount; startPage += pagesPerChunk) { const endPage = Math.min(startPage + pagesPerChunk - 1, pageCount); const chunkPath = path.join(outputDir, `${fileName}_${startPage}-${endPage}.pdf`); // In a real implementation, we would create the actual split file here // For now, just log what we would do console.log(`Would split ${pdfPath} pages ${startPage}-${endPage} to ${chunkPath}`); // Add to result result.chunks.push({ path: chunkPath, startPage, endPage, title: `Pages ${startPage}-${endPage}` }); } } catch (error) { console.error(`Error in page range splitting for ${pdfPath}:`, error); throw error; } } /** * Detect logical sections in a PDF using AI * * @param pdfPath Path to the PDF file * @returns Array of detected section boundaries (page numbers) */ export async function detectLogicalSections(pdfPath: string): Promise<number[]> { try { // Get the text content of the PDF const dataBuffer = await fs.promises.readFile(pdfPath); const pdfData = await pdfParse(dataBuffer); const pdfText = pdfData.text; // Create a sample of the text to analyze // (We don't need the full text for section detection) const textSample = pdfText.slice(0, config.text.maxMetadataTokens); // Use LLM to detect logical section breaks const model = new Ollama({ model: config.models.sectionDetection, temperature: 0.0 }); const prompt = ` You are analyzing a construction document that appears to be divided into sections. Please identify the likely page numbers where major new sections begin. Document info: - Total pages: ${pdfData.numpages} - Sample content: ${textSample} Return ONLY an array of page numbers (integers) representing the starting page of each major section. For example: [1, 5, 12, 18] `; const result = await model.call(prompt); try { // Try to parse the response as a JSON array const pageNumbers = JSON.parse(result); if (Array.isArray(pageNumbers) && pageNumbers.every(num => typeof num === 'number')) { return pageNumbers; } else { console.warn("AI returned invalid section boundaries, using default splitting"); return [1]; } } catch (e) { console.warn("Failed to parse AI section detection result:", e); return [1]; } } catch (error) { console.error(`Error detecting logical sections in ${pdfPath}:`, error); return [1]; } } /** * Extract metadata from key PDF sections using AI * * @param sections Array of PDF sections to analyze * @returns Extracted metadata object */ export async function extractMetadataWithAI(sections: PdfSection[]): Promise<Record<string, string>> { try { // Use the first section (typically first page or title block) const mainSection = sections[0]; // Create a prompt for the AI to extract metadata const model = new Ollama({ model: config.models.metadataExtraction, temperature: 0.0 }); const prompt = ` Extract metadata from this construction document. Document content: ${mainSection.content.slice(0, config.text.maxMetadataTokens)} Please identify the following information: - Project Name - Discipline (Structural, Architectural, Civil, Electrical, Mechanical, etc.) - Drawing Type (Plan, Elevation, Section, Detail, Schedule, etc.) - Phase (Schematic Design, Design Development, Construction Documents, Bid, etc.) - Drawing/Document Number - Revision information Return the information as a JSON object with these keys: project, discipline, drawingType, phase, drawingNumber, revision Only include fields where you have high confidence. If you cannot determine a field, omit it. `; const result = await model.call(prompt); try { // Try to parse the response as a JSON object const metadata = JSON.parse(result); return metadata; } catch (e) { console.warn("Failed to parse AI metadata extraction result:", e); return {}; } } catch (error) { console.error("Error extracting metadata with AI:", error); return {}; } } /** * Preprocess a PDF file before the main processing pipeline * - Checks if the file needs splitting * - Performs splitting if necessary * - Returns the files to be processed * * @param pdfPath Path to the PDF file * @param outputDir Directory to store split files * @returns Array of PDF paths to process */ export async function preprocessPdf(pdfPath: string, outputDir: string): Promise<string[]> { try { // Check if the PDF needs to be split const needsSplitting = await shouldSplitPdf(pdfPath); if (needsSplitting) { console.log(`Large PDF detected: ${pdfPath}. Splitting into smaller chunks.`); // Split the PDF const splitResult = await splitPdfIntoChunks(pdfPath, outputDir); if (splitResult.chunks.length > 0) { // Return the paths of the split chunks console.log(`Split ${pdfPath} into ${splitResult.chunks.length} chunks.`); return splitResult.chunks.map(chunk => chunk.path); } } // If we didn't split or splitting failed, return the original path return [pdfPath]; } catch (error) { console.error(`Error preprocessing PDF ${pdfPath}:`, error); // Return the original path in case of error return [pdfPath]; } } /** * Extract key information from a PDF document * - Extracts text from key sections * - Uses AI to extract metadata * - Returns combined results * * @param pdfPath Path to the PDF file * @returns Key sections and extracted metadata */ export async function extractPdfInformation(pdfPath: string): Promise<{ sections: PdfSection[], metadata: Record<string, string> }> { try { // Extract key sections (title block, first page, etc.) const sections = await extractKeyPdfSections(pdfPath); // Extract metadata using AI const aiMetadata = await extractMetadataWithAI(sections); return { sections, metadata: aiMetadata }; } catch (error) { console.error(`Error extracting information from PDF ${pdfPath}:`, error); throw error; } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Arborist-ai/ClaudeHopper'

If you have feedback or need assistance with the MCP directory API, please join our Discord server