Skip to main content
Glama

Google Drive MCP Server

by ducla5
pdf-processor.ts (12.7 kB)
/** * PDF content processor * Handles PDF text extraction and conversion to markdown */ import pdf from 'pdf-parse'; import { ProcessedContent, ContentMetadata, DocumentStructure, ContentChunk, Heading, TableInfo } from '../types/content.js'; export interface PDFProcessorOptions { chunkSize?: number; preservePageBreaks?: boolean; extractHeadings?: boolean; pageRange?: { start: number; end: number }; } export interface PDFPageInfo { pageNumber: number; text: string; startPosition: number; endPosition: number; } export class PDFProcessor { /** * Process PDF file and convert to markdown */ async processPDF( buffer: Buffer, options: PDFProcessorOptions = {} ): Promise<ProcessedContent> { try { // Parse PDF using pdf-parse const pdfData = await pdf(buffer); // Extract pages information const pages = await this.extractPages(buffer, options.pageRange); // Process text content let fullText = pdfData.text; // Apply page range filter if specified if (options.pageRange) { fullText = this.filterByPageRange(pages, options.pageRange); } // Clean and normalize text const cleanText = this.cleanText(fullText); // Extract headings if requested const headings = options.extractHeadings ? 
this.extractHeadings(cleanText, pages) : []; // Create document structure const structure = this.analyzeStructure(cleanText, headings, pages, options.preservePageBreaks); // Generate chunks const chunks = this.createChunks(cleanText, pages, options.chunkSize || 4000); // Create metadata const metadata: ContentMetadata = { wordCount: this.countWords(cleanText), pageCount: pdfData.numpages, language: this.detectLanguage(cleanText), headings, images: [], // PDF image extraction would require additional processing tables: this.detectTables(cleanText), lastProcessed: new Date() }; // Convert to markdown format const markdown = this.convertToMarkdown(cleanText, pages, headings, options); return { markdown, metadata, structure, chunks }; } catch (error) { throw new Error(`Failed to process PDF: ${error instanceof Error ? error.message : 'Unknown error'}`); } } /** * Extract individual pages with position information */ private async extractPages(buffer: Buffer, pageRange?: { start: number; end: number }): Promise<PDFPageInfo[]> { const pages: PDFPageInfo[] = []; try { // Parse PDF to get page-by-page content const pdfData = await pdf(buffer); // Split full text by pages (approximation) const fullText = pdfData.text; const estimatedPageLength = Math.ceil(fullText.length / pdfData.numpages); for (let i = 0; i < pdfData.numpages; i++) { const pageNumber = i + 1; // Skip pages outside range if specified if (pageRange && (pageNumber < pageRange.start || pageNumber > pageRange.end)) { continue; } const startPosition = i * estimatedPageLength; const endPosition = Math.min((i + 1) * estimatedPageLength, fullText.length); const pageText = fullText.slice(startPosition, endPosition); pages.push({ pageNumber, text: pageText, startPosition, endPosition }); } } catch (error) { console.warn('Could not extract individual pages, using full text'); } return pages; } /** * Filter text by page range */ private filterByPageRange(pages: PDFPageInfo[], pageRange: { start: number; end: number 
}): string { return pages .filter(page => page.pageNumber >= pageRange.start && page.pageNumber <= pageRange.end) .map(page => page.text) .join('\n\n'); } /** * Clean and normalize PDF text */ private cleanText(text: string): string { return text // Remove excessive whitespace .replace(/\s+/g, ' ') // Fix common PDF extraction issues .replace(/([a-z])([A-Z])/g, '$1 $2') // Add space between camelCase .replace(/(\w)(\d)/g, '$1 $2') // Add space between word and number .replace(/(\d)(\w)/g, '$1 $2') // Add space between number and word // Normalize line endings .replace(/\r\n/g, '\n') .replace(/\r/g, '\n') // Remove excessive line breaks .replace(/\n{3,}/g, '\n\n') // Trim .trim(); } /** * Extract headings from PDF text */ private extractHeadings(text: string, _pages: PDFPageInfo[]): Heading[] { const headings: Heading[] = []; const lines = text.split('\n'); lines.forEach((line, _index) => { const trimmed = line.trim(); // Skip empty lines or very short lines if (trimmed.length < 3 || trimmed.length > 100) return; // Check for common heading patterns in PDFs const patterns = [ // All caps (likely heading) /^[A-Z\s\d\.\-]{3,50}$/, // Numbered sections /^\d+\.?\s+[A-Z][a-zA-Z\s]+$/, // Chapter/Section indicators /^(Chapter|Section|Part)\s+\d+/i, // Lines that are followed by content (heuristic) /^[A-Z][a-zA-Z\s]{5,50}$/ ]; let isHeading = false; let level = 3; // Default level for (const pattern of patterns) { if (pattern.test(trimmed)) { isHeading = true; // Determine level based on pattern if (trimmed.match(/^(Chapter|Part)/i)) level = 1; else if (trimmed.match(/^Section/i)) level = 2; else if (trimmed.match(/^\d+\./)) level = 2; else if (trimmed === trimmed.toUpperCase()) level = 2; break; } } // Additional heuristic: check if next line has content if (isHeading && _index < lines.length - 1) { const nextLine = lines[_index + 1].trim(); if (nextLine.length > 20) { headings.push({ level, text: trimmed, position: text.indexOf(line) }); } } }); return headings; } /** * 
Detect table-like structures in text */ private detectTables(text: string): TableInfo[] { const tables: TableInfo[] = []; const lines = text.split('\n'); let inTable = false; let tableStart = -1; let currentTable: string[] = []; lines.forEach((line, _index) => { const trimmed = line.trim(); // Simple heuristic: lines with multiple spaces or tabs might be table rows const hasMultipleSpaces = /\s{3,}/.test(trimmed); const hasTabularData = /\d+\s+\w+\s+\d+/.test(trimmed); if ((hasMultipleSpaces || hasTabularData) && trimmed.length > 20) { if (!inTable) { inTable = true; tableStart = text.indexOf(line); currentTable = []; } currentTable.push(trimmed); } else if (inTable && currentTable.length >= 2) { // End of table const columns = Math.max(...currentTable.map(row => row.split(/\s{2,}/).length)); tables.push({ rows: currentTable.length, columns, position: tableStart }); inTable = false; currentTable = []; } else if (inTable) { inTable = false; currentTable = []; } }); return tables; } /** * Analyze document structure */ private analyzeStructure( text: string, headings: Heading[], pages: PDFPageInfo[], preservePageBreaks?: boolean ): DocumentStructure { const sections = headings.map((heading, index) => { const nextHeading = headings[index + 1]; const endPosition = nextHeading ? nextHeading.position : text.length; return { title: heading.text, level: heading.level, startPosition: heading.position, endPosition, content: text.slice(heading.position, endPosition).trim() }; }); const toc = headings.map(heading => { // Try to determine page number for heading const page = pages.find(p => heading.position >= p.startPosition && heading.position <= p.endPosition ); return { title: heading.text, level: heading.level, page: page?.pageNumber || 1, position: heading.position }; }); // Page breaks are the boundaries between pages const pageBreaks = preservePageBreaks ? 
pages.map(page => page.endPosition).slice(0, -1) : []; return { sections, toc, pageBreaks }; } /** * Create content chunks with page tracking */ private createChunks(text: string, pages: PDFPageInfo[], chunkSize: number): ContentChunk[] { const chunks: ContentChunk[] = []; const words = text.split(/\s+/); let currentChunk = ''; let chunkIndex = 0; let startPosition = 0; for (let i = 0; i < words.length; i++) { const word = words[i]; const testChunk = currentChunk + (currentChunk ? ' ' : '') + word; if (testChunk.length > chunkSize && currentChunk.length > 0) { // Determine page range for chunk const chunkStart = text.indexOf(currentChunk, startPosition); const chunkEnd = chunkStart + currentChunk.length; const startPage = pages.find(p => chunkStart >= p.startPosition && chunkStart <= p.endPosition); const endPage = pages.find(p => chunkEnd >= p.startPosition && chunkEnd <= p.endPosition); chunks.push({ id: `chunk_${chunkIndex}`, content: currentChunk.trim(), startPage: startPage?.pageNumber || 1, endPage: endPage?.pageNumber || startPage?.pageNumber || 1, metadata: { wordCount: currentChunk.split(/\s+/).length, position: { start: chunkStart, end: chunkEnd } } }); // Start new chunk currentChunk = word; startPosition = chunkEnd; chunkIndex++; } else { currentChunk = testChunk; } } // Add final chunk if (currentChunk.trim()) { const chunkStart = text.indexOf(currentChunk, startPosition); const chunkEnd = chunkStart + currentChunk.length; const startPage = pages.find(p => chunkStart >= p.startPosition && chunkStart <= p.endPosition); const endPage = pages.find(p => chunkEnd >= p.startPosition && chunkEnd <= p.endPosition); chunks.push({ id: `chunk_${chunkIndex}`, content: currentChunk.trim(), startPage: startPage?.pageNumber || 1, endPage: endPage?.pageNumber || startPage?.pageNumber || 1, metadata: { wordCount: currentChunk.split(/\s+/).length, position: { start: chunkStart, end: chunkEnd } } }); } return chunks; } /** * Convert PDF text to markdown format */ private 
convertToMarkdown( text: string, pages: PDFPageInfo[], headings: Heading[], options: PDFProcessorOptions ): string { let markdown = text; // Convert detected headings to markdown format headings.forEach(heading => { const headingText = heading.text; const markdownHeading = '#'.repeat(Math.min(heading.level, 6)) + ' ' + headingText; markdown = markdown.replace(headingText, markdownHeading); }); // Add page breaks if requested if (options.preservePageBreaks && pages.length > 1) { pages.forEach((page, index) => { if (index > 0) { const pageBreakMarker = `\n\n---\n*Page ${page.pageNumber}*\n\n`; const insertPosition = page.startPosition; markdown = markdown.slice(0, insertPosition) + pageBreakMarker + markdown.slice(insertPosition); } }); } return markdown; } /** * Count words in text */ private countWords(text: string): number { return text.trim().split(/\s+/).filter(word => word.length > 0).length; } /** * Simple language detection */ private detectLanguage(text: string): string { const sample = text.slice(0, 1000).toLowerCase(); const englishWords = ['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']; const englishCount = englishWords.reduce((count, word) => { return count + (sample.match(new RegExp(`\\b${word}\\b`, 'g')) || []).length; }, 0); return englishCount > 5 ? 'en' : 'unknown'; } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ducla5/gdriver-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.