Google Drive MCP Server

docx-processor.ts•11.6 kB

/**
 * Microsoft Word document processor
 * Handles DOCX file processing and conversion to markdown
 */

import mammoth from 'mammoth';
import {
  ProcessedContent,
  ContentMetadata,
  DocumentStructure,
  ContentChunk,
  Heading,
  ImageInfo,
  TableInfo
} from '../types/content.js';

/** Chunk size (in characters) used when the caller supplies none or a non-positive value. */
const DEFAULT_CHUNK_SIZE = 4000;

/** Named HTML entities decoded by the processor; numeric entities are handled separately. */
const NAMED_ENTITIES = new Map<string, string>([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"]
]);

/** Matches a `src` attribute (double- or single-quoted) inside a single tag. */
const SRC_ATTRIBUTE = /\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)')/i;

/** Matches an `alt` attribute (double- or single-quoted) inside a single tag. */
const ALT_ATTRIBUTE = /\salt\s*=\s*(?:"([^"]*)"|'([^']*)')/i;

/** Common English function words used by the language heuristic. */
const ENGLISH_STOP_WORDS = /\b(?:the|and|or|but|in|on|at|to|for|of|with|by)\b/g;

/**
 * Options accepted by {@link DOCXProcessor.processDOCX}.
 */
export interface DOCXProcessorOptions {
  /** Target chunk length in characters. Defaults to 4000; non-positive values fall back to the default. */
  chunkSize?: number;
  /** NOTE(review): not consulted by this processor at present. */
  preserveFormatting?: boolean;
  /** When truthy, headings are extracted into metadata and the document structure. Off unless set. */
  extractHeadings?: boolean;
  /** When not `false`, images are inlined as base64 data URIs via a custom mammoth image converter. */
  includeImages?: boolean;
  /** NOTE(review): not consulted by this processor at present. */
  includeTables?: boolean;
}

/**
 * Converts DOCX files to markdown (through mammoth's HTML output) and derives
 * metadata, a heading-based structure and size-bounded chunks from the result.
 */
export class DOCXProcessor {
  /**
   * Process DOCX file and convert to markdown.
   *
   * @param buffer - Raw bytes of the `.docx` file.
   * @param options - Processing options; see {@link DOCXProcessorOptions}.
   * @returns The markdown text together with metadata, structure and chunks.
   * @throws Error prefixed with "Failed to process DOCX file:" when any step fails.
   */
  async processDOCX(
    buffer: Buffer,
    options: DOCXProcessorOptions = {}
  ): Promise<ProcessedContent> {
    try {
      // Configure mammoth options for better markdown conversion.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- mammoth option typings differ between versions
      const mammothOptions: any = {
        styleMap: [
          // Map Word styles to HTML elements that htmlToMarkdown understands
          "p[style-name='Heading 1'] => h1:fresh",
          "p[style-name='Heading 2'] => h2:fresh",
          "p[style-name='Heading 3'] => h3:fresh",
          "p[style-name='Heading 4'] => h4:fresh",
          "p[style-name='Heading 5'] => h5:fresh",
          "p[style-name='Heading 6'] => h6:fresh",
          "p[style-name='Title'] => h1:fresh",
          "p[style-name='Subtitle'] => h2:fresh",
          "r[style-name='Strong'] => strong",
          "r[style-name='Emphasis'] => em"
        ]
      };

      // Add image converter if images should be included
      if (options.includeImages !== false) {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- image element type is owned by mammoth
        mammothOptions.convertImage = mammoth.images.imgElement((image: any) => {
          return image.read('base64').then((imageBuffer: string) => ({
            src: `data:${image.contentType};base64,${imageBuffer}`
          }));
        });
      }

      // mammoth expects an input descriptor, not the Buffer itself. A Node Buffer
      // exposes its backing ArrayBuffer as `.buffer`, so passing it directly can
      // make mammoth read the (possibly shared, offset) pool instead of the file.
      const htmlResult = await mammoth.convertToHtml({ buffer }, mammothOptions);
      const html: string = htmlResult.value;

      const markdown = this.htmlToMarkdown(html);

      const headings = options.extractHeadings ? this.extractHeadings(markdown) : [];

      // Image and table positions refer to the intermediate HTML, not the markdown.
      const images = this.extractImages(html);
      const tables = this.extractTables(html);

      const structure = this.analyzeStructure(markdown, headings);

      const requestedSize = options.chunkSize;
      const chunkSize =
        typeof requestedSize === 'number' && requestedSize > 0 ? requestedSize : DEFAULT_CHUNK_SIZE;
      const chunks = this.createChunks(markdown, chunkSize);

      const metadata: ContentMetadata = {
        wordCount: this.countWords(markdown),
        language: this.detectLanguage(markdown),
        headings,
        images,
        tables,
        lastProcessed: new Date()
      };

      return { markdown, metadata, structure, chunks };
    } catch (error) {
      throw new Error(
        `Failed to process DOCX file: ${error instanceof Error ? error.message : 'Unknown error'}`
      );
    }
  }

  /**
   * Convert HTML to Markdown.
   *
   * Inline elements are converted first so that their markdown survives when
   * the enclosing heading/paragraph/list item/table cell has its remaining
   * tags stripped. Entities are decoded only after all tags are gone, so
   * escaped text such as `a &lt; b` is never mistaken for markup.
   *
   * Underlined text is emitted as plain text (markdown has no underline).
   * NOTE(review): nested lists are not handled specially; the non-greedy list
   * patterns stop at the first closing tag.
   *
   * @param html - HTML fragment to convert.
   * @returns Trimmed markdown text with at most one blank line between blocks.
   */
  private htmlToMarkdown(html: string): string {
    let out = html
      // Images first: `<img` must not be swallowed by the `<i>` handling below
      .replace(/<img\b[^>]*>/gi, (tag: string) => {
        const src = this.readAttribute(tag, SRC_ATTRIBUTE);
        if (src === undefined) return '';
        return `![${this.readAttribute(tag, ALT_ATTRIBUTE) || 'Image'}](${src})`;
      })
      // Line breaks (collapsed to a space inside paragraphs, kept elsewhere)
      .replace(/<br\b[^>]*>/gi, '\n');

    // Strong/Bold, Emphasis/Italic, Strikethrough
    out = this.wrapInline(out, 'strong|b', '**');
    out = this.wrapInline(out, 'em|i', '*');
    out = this.wrapInline(out, 's|strike|del', '~~');

    return this.decodeEntities(
      out
        // Links
        .replace(/<a\b[^>]*?\shref="([^"]*)"[^>]*>(.*?)<\/a>/gis, '[$2]($1)')
        // Headers
        .replace(/<h([1-6])\b[^>]*>(.*?)<\/h\1>/gis, (_whole: string, level: string, inner: string) => {
          const text = this.cleanText(inner);
          return text ? `${'#'.repeat(Number(level))} ${text}\n\n` : '';
        })
        // Paragraphs
        .replace(/<p\b[^>]*>(.*?)<\/p>/gis, (_whole: string, inner: string) => {
          const text = this.cleanText(inner);
          return text ? `${text}\n\n` : '';
        })
        // Lists
        .replace(/<ul\b[^>]*>(.*?)<\/ul>/gis, (_whole: string, inner: string) => {
          return this.processUnorderedList(inner) + '\n';
        })
        .replace(/<ol\b[^>]*>(.*?)<\/ol>/gis, (_whole: string, inner: string) => {
          return this.processOrderedList(inner) + '\n';
        })
        // Tables (blank line afterwards so following text is not read as a row)
        .replace(/<table\b[^>]*>(.*?)<\/table>/gis, (_whole: string, inner: string) => {
          return this.processTable(inner) + '\n\n';
        })
        // Remove remaining HTML tags
        .replace(/<[^>]*>/g, '')
    )
      // Clean up excessive whitespace
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }

  /**
   * Wrap the content of matching inline elements in a markdown marker.
   *
   * Surrounding whitespace is moved outside the markers (`<b>x </b>` becomes
   * `**x** `) and empty elements produce no markers at all.
   *
   * @param html - HTML to transform.
   * @param tagPattern - Alternation of tag names, e.g. `strong|b`.
   * @param marker - Markdown delimiter placed on both sides of the content.
   * @returns The HTML with the matching elements replaced.
   */
  private wrapInline(html: string, tagPattern: string, marker: string): string {
    // `\b` keeps `b` from matching `<br>`, `i` from matching `<img>`, and so on;
    // the backreference ties the closing tag to the opening one.
    const pattern = new RegExp(`<(${tagPattern})\\b[^>]*>(.*?)<\\/\\1>`, 'gis');
    return html.replace(pattern, (_whole: string, _tag: string, inner: string) => {
      const core = inner.trim();
      if (!core) return inner;
      const start = inner.indexOf(core);
      return inner.slice(0, start) + marker + core + marker + inner.slice(start + core.length);
    });
  }

  /**
   * Process unordered list.
   *
   * @param content - Inner HTML of a `<ul>` element.
   * @returns The content with each `<li>` replaced by a `- item` line.
   */
  private processUnorderedList(content: string): string {
    return content.replace(/<li\b[^>]*>(.*?)<\/li>/gis, (_whole: string, inner: string) => {
      return `- ${this.cleanText(inner)}\n`;
    });
  }

  /**
   * Process ordered list.
   *
   * @param content - Inner HTML of an `<ol>` element.
   * @returns The content with each `<li>` replaced by a numbered line starting at 1.
   */
  private processOrderedList(content: string): string {
    let counter = 1;
    return content.replace(/<li\b[^>]*>(.*?)<\/li>/gis, (_whole: string, inner: string) => {
      return `${counter++}. ${this.cleanText(inner)}\n`;
    });
  }

  /**
   * Process HTML table to markdown.
   *
   * The first row that contains cells is treated as the header. Cell text is
   * flattened to a single line and `|` is escaped so it cannot split a cell.
   *
   * @param content - Inner HTML of a `<table>` element.
   * @returns Markdown table rows joined by newlines, or `''` when no rows are found.
   */
  private processTable(content: string): string {
    const rowMatches = content.match(/<tr\b[^>]*>(.*?)<\/tr>/gis);
    if (!rowMatches) return '';

    const lines: string[] = [];
    for (const rowHtml of rowMatches) {
      // Extract cells (both td and th)
      const cellMatches = rowHtml.match(/<(td|th)\b[^>]*>(.*?)<\/\1>/gis) || [];
      const cells = cellMatches.map(cellHtml => {
        return this.cleanText(cellHtml).replace(/\|/g, '\\|') || ' ';
      });
      if (cells.length === 0) continue;

      lines.push(`| ${cells.join(' | ')} |`);

      // Add header separator after the first emitted row
      if (lines.length === 1) {
        lines.push(`| ${cells.map(() => '---').join(' | ')} |`);
      }
    }

    return lines.join('\n');
  }

  /**
   * Clean text content: remove tags, treat `&nbsp;` as a space, collapse
   * whitespace runs to a single space and trim.
   *
   * Other entities are left encoded on purpose; they are decoded once, after
   * the final tag strip in {@link DOCXProcessor.htmlToMarkdown}.
   *
   * @param text - HTML fragment.
   * @returns Single-line text without tags.
   */
  private cleanText(text: string): string {
    return text
      .replace(/<[^>]*>/g, '') // Remove HTML tags
      .replace(/&nbsp;/gi, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Decode named (`&amp;`, `&lt;`, ...) and numeric (`&#39;`, `&#x27;`) HTML
   * entities in a single pass, so `&amp;lt;` yields `&lt;` rather than `<`.
   * Unknown entities are left untouched.
   *
   * @param text - Text that may contain HTML entities.
   * @returns The text with recognised entities replaced by their characters.
   */
  private decodeEntities(text: string): string {
    return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
      if (body.startsWith('#')) {
        const isHex = body.charAt(1).toLowerCase() === 'x';
        const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws for values above the Unicode range
        return code <= 0x10ffff ? String.fromCodePoint(code) : entity;
      }
      const decoded = NAMED_ENTITIES.get(body.toLowerCase());
      return decoded !== undefined ? decoded : entity;
    });
  }

  /**
   * Read one attribute value from a single tag.
   *
   * @param tag - The full tag text, e.g. `<img alt="x" src="y" />`.
   * @param pattern - Attribute pattern whose groups 1 and 2 capture the
   *   double- and single-quoted value respectively.
   * @returns The raw (still entity-encoded) value, or `undefined` when absent.
   */
  private readAttribute(tag: string, pattern: RegExp): string | undefined {
    const match = pattern.exec(tag);
    if (!match) return undefined;
    return match[1] !== undefined ? match[1] : match[2];
  }

  /**
   * Extract headings from markdown.
   *
   * @param markdown - Markdown text to scan line by line.
   * @returns One entry per ATX heading (`#` to `######`); `position` is the
   *   character offset of the heading's line within `markdown`.
   */
  private extractHeadings(markdown: string): Heading[] {
    const headings: Heading[] = [];
    // Track the offset ourselves: searching for the line text would return the
    // first textual occurrence, which is wrong for repeated headings.
    let offset = 0;

    for (const line of markdown.split('\n')) {
      const match = /^(#{1,6})\s+(.+)$/.exec(line.trim());
      if (match) {
        headings.push({
          level: match[1].length,
          text: match[2],
          position: offset
        });
      }
      offset += line.length + 1; // +1 for the '\n' removed by split
    }

    return headings;
  }

  /**
   * Extract image information from HTML.
   *
   * @param html - HTML to scan for `<img>` tags.
   * @returns One entry per `<img>` that has a `src`; `alt` falls back to
   *   `'Image'` and `position` is the tag's offset within `html`.
   */
  private extractImages(html: string): ImageInfo[] {
    const images: ImageInfo[] = [];
    const imageRegex = /<img\b[^>]*>/gi;

    let match;
    while ((match = imageRegex.exec(html)) !== null) {
      const tag = match[0];
      const src = this.readAttribute(tag, SRC_ATTRIBUTE);
      if (src === undefined) continue;

      const alt = this.readAttribute(tag, ALT_ATTRIBUTE);
      images.push({
        src: this.decodeEntities(src),
        alt: alt ? this.decodeEntities(alt) : 'Image',
        position: match.index
      });
    }

    return images;
  }

  /**
   * Extract table information from HTML.
   *
   * @param html - HTML to scan for `<table>` elements.
   * @returns Row count, column count (cells in the first row) and offset within
   *   `html` for every table that has at least one row and one column.
   */
  private extractTables(html: string): TableInfo[] {
    const tables: TableInfo[] = [];
    const tableRegex = /<table\b[^>]*>(.*?)<\/table>/gis;

    let match;
    while ((match = tableRegex.exec(html)) !== null) {
      const tableContent = match[1];

      // Count rows
      const rows = (tableContent.match(/<tr\b[^>]*>/gi) || []).length;

      // Count columns (from first row); `s` flag so a row spanning lines still matches
      const firstRow = /<tr\b[^>]*>(.*?)<\/tr>/is.exec(tableContent);
      const columns = firstRow ? (firstRow[1].match(/<(td|th)\b[^>]*>/gi) || []).length : 0;

      if (rows > 0 && columns > 0) {
        tables.push({
          rows,
          columns,
          position: match.index
        });
      }
    }

    return tables;
  }

  /**
   * Analyze document structure.
   *
   * @param markdown - The full markdown text.
   * @param headings - Headings with offsets into `markdown`, in document order.
   * @returns Sections (each running from its heading to the next heading or the
   *   end of the text), a table of contents, and an empty `pageBreaks` list.
   */
  private analyzeStructure(markdown: string, headings: Heading[]): DocumentStructure {
    const sections = headings.map((heading, index) => {
      const nextHeading = headings[index + 1];
      const endPosition = nextHeading ? nextHeading.position : markdown.length;

      return {
        title: heading.text,
        level: heading.level,
        startPosition: heading.position,
        endPosition,
        content: markdown.slice(heading.position, endPosition).trim()
      };
    });

    const toc = headings.map(heading => ({
      title: heading.text,
      level: heading.level,
      position: heading.position
    }));

    return {
      sections,
      toc,
      pageBreaks: [] // DOCX doesn't have traditional page breaks in this context
    };
  }

  /**
   * Create content chunks.
   *
   * Paragraphs (separated by blank lines) are grouped greedily: a paragraph is
   * appended to the current chunk unless that would push the chunk past
   * `chunkSize` characters. A single paragraph longer than `chunkSize` is kept
   * whole as its own chunk.
   *
   * @param content - Text to split.
   * @param chunkSize - Maximum chunk length in characters (soft limit, see above).
   * @returns Chunks in document order; `position` holds the offsets of the
   *   chunk's first and last paragraph within `content`.
   */
  private createChunks(content: string, chunkSize: number): ContentChunk[] {
    // Record the real offsets of every non-empty paragraph up front, so chunk
    // positions never depend on searching for (possibly repeated) text.
    const paragraphs: Array<{ start: number; end: number }> = [];
    let cursor = 0;
    for (const piece of content.split('\n\n')) {
      if (piece.trim()) {
        paragraphs.push({ start: cursor, end: cursor + piece.length });
      }
      cursor += piece.length + 2; // +2 for the separator removed by split
    }

    const chunks: ContentChunk[] = [];
    let chunkStart = -1; // -1 means "no chunk in progress"
    let chunkEnd = -1;

    const flush = (): void => {
      if (chunkStart < 0) return;
      const text = content.slice(chunkStart, chunkEnd);
      chunks.push({
        id: `chunk_${chunks.length}`,
        content: text.trim(),
        metadata: {
          wordCount: this.countWords(text),
          position: {
            start: chunkStart,
            end: chunkEnd
          }
        }
      });
      chunkStart = -1;
    };

    for (const paragraph of paragraphs) {
      if (chunkStart >= 0 && paragraph.end - chunkStart > chunkSize) {
        flush();
      }
      if (chunkStart < 0) {
        chunkStart = paragraph.start;
      }
      chunkEnd = paragraph.end;
    }
    flush();

    return chunks;
  }

  /**
   * Count words in text.
   *
   * @param text - Text to count.
   * @returns Number of whitespace-separated tokens; `0` for blank input.
   */
  private countWords(text: string): number {
    return text
      .trim()
      .split(/\s+/)
      .filter(word => word.length > 0).length;
  }

  /**
   * Simple language detection.
   *
   * Counts common English function words in the first 1000 characters.
   *
   * @param text - Text to inspect.
   * @returns `'en'` when more than five such words are found, otherwise `'unknown'`.
   */
  private detectLanguage(text: string): string {
    const sample = text.slice(0, 1000).toLowerCase();
    const englishCount = (sample.match(ENGLISH_STOP_WORDS) || []).length;
    return englishCount > 5 ? 'en' : 'unknown';
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ducla5/gdriver-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.