// text-extractor.ts
/**
* Basic text extraction utilities
*/
import { ProcessedContent, ContentMetadata, DocumentStructure, ContentChunk, Heading } from '../types/content.js';
export interface TextExtractionOptions {
  /** NOTE(review): not consulted anywhere in this file — presumably reserved for future use; confirm before relying on it. */
  preserveFormatting?: boolean;
  /** When true, heading detection runs and the results feed metadata, structure, and markdown conversion. */
  extractHeadings?: boolean;
  /** Maximum chunk length in characters; falsy or omitted falls back to 4000. */
  chunkSize?: number;
}
export class TextExtractor {
  /**
   * Extract structured content from a text buffer.
   *
   * @param buffer   Raw file contents, decoded as UTF-8.
   * @param mimeType Source MIME type; 'text/markdown' and 'text/html' get
   *                 special handling during markdown conversion.
   * @param options  Heading-extraction toggle and chunk size.
   * @returns Markdown text plus metadata, document structure, and content chunks.
   */
  static async extractText(
    buffer: Buffer,
    mimeType: string,
    options: TextExtractionOptions = {}
  ): Promise<ProcessedContent> {
    const text = buffer.toString('utf-8');

    // Normalize line endings / whitespace first so every downstream
    // position refers to the same normalized text.
    const cleanText = this.cleanText(text);

    // Extract headings only if requested
    const headings = options.extractHeadings ? this.extractHeadings(cleanText) : [];

    // Create document structure (sections + table of contents)
    const structure = this.analyzeStructure(cleanText, headings);

    // Generate chunks; '||' (not '??') deliberately maps an explicit 0 to the default
    const chunks = this.createChunks(cleanText, options.chunkSize || 4000);

    const metadata: ContentMetadata = {
      wordCount: this.countWords(cleanText),
      language: this.detectLanguage(cleanText),
      headings,
      images: [], // No images in plain text
      tables: [], // No tables in plain text
      lastProcessed: new Date()
    };

    const markdown = this.convertToMarkdown(cleanText, mimeType, headings);

    return { markdown, metadata, structure, chunks };
  }

  /**
   * Clean and normalize text content: CRLF/CR → LF, runs of spaces/tabs
   * collapsed to one space, 3+ blank lines collapsed to one blank line.
   */
  private static cleanText(text: string): string {
    return text
      // Normalize line endings
      .replace(/\r\n/g, '\n')
      .replace(/\r/g, '\n')
      // Remove excessive whitespace
      .replace(/[ \t]+/g, ' ')
      // Remove excessive line breaks (more than 2 consecutive)
      .replace(/\n{3,}/g, '\n\n')
      // Trim
      .trim();
  }

  /**
   * Extract headings using simple heuristics, in priority order:
   *  1. markdown-style '#'…'######' prefixes (level = number of '#'),
   *  2. setext-style underlines: next line all '=' (level 1) or '-' (level 2),
   *  3. short capitalized lines without terminal punctuation, followed by content (level 3).
   *
   * Positions are offsets into `text`, tracked with a running cursor so a
   * repeated heading line resolves to its own occurrence (a bare
   * `text.indexOf(line)` would always return the first one). Each line can
   * contribute at most ONE heading: the setext branches stop further matching
   * so an underlined title is no longer ALSO reported by heuristic 3.
   */
  private static extractHeadings(text: string): Heading[] {
    const headings: Heading[] = [];
    const lines = text.split('\n');
    let offset = 0; // start offset of the current line within `text`

    for (let index = 0; index < lines.length; index++) {
      const line = lines[index];
      const position = offset;
      offset += line.length + 1; // +1 for the '\n' consumed by split

      const trimmed = line.trim();

      // 1. Markdown-style heading
      const markdownMatch = trimmed.match(/^(#{1,6})\s+(.+)$/);
      if (markdownMatch) {
        headings.push({
          level: markdownMatch[1].length,
          text: markdownMatch[2],
          position
        });
        continue;
      }

      // 2. Setext-style heading (non-empty text required, so a blank line
      //    above '===' no longer yields an empty heading)
      if (trimmed.length > 0 && index < lines.length - 1) {
        const nextLine = lines[index + 1].trim();
        if (/^={3,}$/.test(nextLine)) {
          headings.push({ level: 1, text: trimmed, position });
          continue;
        }
        if (/^-{3,}$/.test(nextLine)) {
          headings.push({ level: 2, text: trimmed, position });
          continue;
        }
      }

      // 3. Heuristic heading: short, capitalized, no closing punctuation,
      //    and followed by a non-empty line
      if (
        trimmed.length > 0 && trimmed.length < 100 &&
        !trimmed.endsWith('.') && !trimmed.endsWith('!') && !trimmed.endsWith('?') &&
        /^[A-Z]/.test(trimmed) &&
        index < lines.length - 1 && lines[index + 1].trim().length > 0
      ) {
        headings.push({ level: 3, text: trimmed, position });
      }
    }

    return headings;
  }

  /**
   * Build section list and table of contents from the detected headings.
   * Each section spans from its heading to the next heading (or end of text).
   */
  private static analyzeStructure(text: string, headings: Heading[]): DocumentStructure {
    const sections = headings.map((heading, index) => {
      const nextHeading = headings[index + 1];
      const endPosition = nextHeading ? nextHeading.position : text.length;
      return {
        title: heading.text,
        level: heading.level,
        startPosition: heading.position,
        endPosition,
        content: text.slice(heading.position, endPosition).trim()
      };
    });

    const toc = headings.map(heading => ({
      title: heading.text,
      level: heading.level,
      position: heading.position
    }));

    return {
      sections,
      toc,
      pageBreaks: [] // No page breaks in plain text
    };
  }

  /**
   * Create content chunks: by section when the document has multiple
   * headings, otherwise by paragraph/sentence boundaries.
   */
  private static createChunks(text: string, chunkSize: number): ContentChunk[] {
    const headings = this.extractHeadings(text);
    if (headings.length > 1) {
      return this.createStructuralChunks(text, headings, chunkSize);
    }
    return this.createSemanticChunks(text, chunkSize);
  }

  /**
   * Create chunks aligned to document sections (one per heading); sections
   * larger than `chunkSize` are split further by splitLargeSection.
   */
  private static createStructuralChunks(text: string, headings: Heading[], chunkSize: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];

    for (let i = 0; i < headings.length; i++) {
      const heading = headings[i];
      const nextHeading = headings[i + 1];
      const startPos = heading.position;
      const endPos = nextHeading ? nextHeading.position : text.length;
      const sectionContent = text.slice(startPos, endPos).trim();

      if (sectionContent.length <= chunkSize) {
        // Section fits in one chunk
        chunks.push({
          id: `section_${i}`,
          content: sectionContent,
          section: heading.text,
          metadata: {
            wordCount: this.countWords(sectionContent),
            position: { start: startPos, end: endPos }
          }
        });
      } else {
        // Section needs to be split into multiple chunks
        const subChunks = this.splitLargeSection(sectionContent, chunkSize, heading.text, startPos);
        chunks.push(...subChunks);
      }
    }

    return chunks;
  }

  /**
   * Create chunks by accumulating paragraphs up to `chunkSize` characters.
   * An oversized paragraph — including one at the very start — is split at
   * sentence boundaries (the previous version never split an oversized FIRST
   * paragraph, and emitted an oversized later paragraph twice).
   * Positions are resolved with a forward-moving indexOf cursor so repeated
   * paragraphs map to their own occurrence.
   */
  private static createSemanticChunks(text: string, chunkSize: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    const paragraphs = text.split(/\n\s*\n/);

    let currentChunk = '';
    let chunkIndex = 0;
    let startPosition = 0;
    let searchFrom = 0; // cursor: paragraphs occur in order, so search from here

    const flush = () => {
      chunks.push({
        id: `chunk_${chunkIndex}`,
        content: currentChunk.trim(),
        metadata: {
          wordCount: this.countWords(currentChunk),
          position: { start: startPosition, end: startPosition + currentChunk.length }
        }
      });
      chunkIndex++;
      currentChunk = '';
    };

    for (const paragraph of paragraphs) {
      const trimmedParagraph = paragraph.trim();
      if (!trimmedParagraph) continue;

      const paragraphPos = text.indexOf(trimmedParagraph, searchFrom);
      searchFrom = paragraphPos + trimmedParagraph.length;

      // Oversized paragraph: flush pending content, then split at sentences.
      if (trimmedParagraph.length > chunkSize) {
        if (currentChunk.trim()) {
          flush();
        }
        const sentenceChunks = this.splitLargeParagraph(trimmedParagraph, chunkSize, paragraphPos);
        chunks.push(...sentenceChunks);
        chunkIndex += sentenceChunks.length;
        continue;
      }

      const testChunk = currentChunk ? `${currentChunk}\n\n${trimmedParagraph}` : trimmedParagraph;
      if (testChunk.length > chunkSize && currentChunk.length > 0) {
        // Current chunk is full — emit it and start over at this paragraph.
        flush();
        currentChunk = trimmedParagraph;
        startPosition = paragraphPos;
      } else {
        if (!currentChunk) {
          startPosition = paragraphPos;
        }
        currentChunk = testChunk;
      }
    }

    // Add final chunk
    if (currentChunk.trim()) {
      flush();
    }

    return chunks;
  }

  /**
   * Split a large section into chunks, repeating the section title (with a
   * "(continued)" marker) at the top of each follow-up chunk for context.
   *
   * NOTE(review): `paragraphs[0]` is assumed to start with the heading line;
   * positions of continued chunks are approximate because the synthetic
   * "(continued)" text does not exist in the source.
   */
  private static splitLargeSection(sectionContent: string, chunkSize: number, sectionTitle: string, basePosition: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    const paragraphs = sectionContent.split(/\n\s*\n/);
    const idBase = `section_${sectionTitle.replace(/\s+/g, '_').toLowerCase()}`;

    let chunkIndex = 0;
    let position = basePosition;

    // Always include section title in first chunk
    const titleLine = paragraphs[0];
    let currentChunk = titleLine;

    for (let i = 1; i < paragraphs.length; i++) {
      const paragraph = paragraphs[i].trim();
      if (!paragraph) continue;

      const testChunk = `${currentChunk}\n\n${paragraph}`;
      if (testChunk.length > chunkSize && currentChunk.length > titleLine.length) {
        const flushed = currentChunk.trim();
        chunks.push({
          id: `${idBase}_${chunkIndex}`,
          content: flushed,
          section: sectionTitle,
          metadata: {
            wordCount: this.countWords(flushed),
            position: { start: position, end: position + flushed.length }
          }
        });
        // Advance by what was emitted (the old code advanced by the NEW
        // chunk's length, skewing every subsequent position).
        position += flushed.length;
        chunkIndex++;
        // Start new chunk with section context
        currentChunk = `${titleLine} (continued)\n\n${paragraph}`;
      } else {
        currentChunk = testChunk;
      }
    }

    // Add final chunk
    if (currentChunk.trim()) {
      chunks.push({
        id: `${idBase}_${chunkIndex}`,
        content: currentChunk.trim(),
        section: sectionTitle,
        metadata: {
          wordCount: this.countWords(currentChunk),
          position: { start: position, end: position + currentChunk.length }
        }
      });
    }

    return chunks;
  }

  /**
   * Split a large paragraph at sentence boundaries ('.', '!', '?' followed
   * by whitespace), accumulating sentences up to `chunkSize` characters.
   */
  private static splitLargeParagraph(paragraph: string, chunkSize: number, basePosition: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    const sentences = paragraph.split(/(?<=[.!?])\s+/);

    let currentChunk = '';
    let chunkIndex = 0;
    let position = basePosition;

    for (const sentence of sentences) {
      const testChunk = currentChunk ? `${currentChunk} ${sentence}` : sentence;
      if (testChunk.length > chunkSize && currentChunk.length > 0) {
        const flushed = currentChunk.trim();
        chunks.push({
          id: `para_chunk_${chunkIndex}`,
          content: flushed,
          metadata: {
            wordCount: this.countWords(flushed),
            position: { start: position, end: position + flushed.length }
          }
        });
        // Advance by the emitted chunk, not by the next sentence's length
        position += flushed.length;
        chunkIndex++;
        currentChunk = sentence;
      } else {
        currentChunk = testChunk;
      }
    }

    // Add final chunk
    if (currentChunk.trim()) {
      chunks.push({
        id: `para_chunk_${chunkIndex}`,
        content: currentChunk.trim(),
        metadata: {
          wordCount: this.countWords(currentChunk),
          position: { start: position, end: position + currentChunk.length }
        }
      });
    }

    return chunks;
  }

  /** Count whitespace-separated words; empty/blank text counts as 0. */
  private static countWords(text: string): number {
    return text.trim().split(/\s+/).filter(word => word.length > 0).length;
  }

  /**
   * Very basic language detection over the first 1000 characters: counts
   * common English stop words and returns 'en' above a small threshold,
   * otherwise 'unknown'. Could be replaced by a proper library.
   */
  private static detectLanguage(text: string): string {
    const sample = text.slice(0, 1000).toLowerCase();

    const englishWords = ['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'];
    const englishCount = englishWords.reduce((count, word) => {
      return count + (sample.match(new RegExp(`\\b${word}\\b`, 'g')) || []).length;
    }, 0);

    return englishCount > 5 ? 'en' : 'unknown';
  }

  /**
   * Convert text to markdown. Markdown passes through unchanged; HTML goes
   * through the regex converter; plain text gets '#' markers on detected
   * heading LINES. Working line-by-line fixes two defects of the old global
   * String.replace: a heading phrase that also appeared earlier in body text
   * was rewritten there instead of at the heading, and setext underlines
   * ('===' / '---') were left dangling under converted headings.
   */
  private static convertToMarkdown(text: string, mimeType: string, headings: Heading[]): string {
    // If already markdown, return as-is
    if (mimeType === 'text/markdown') {
      return text;
    }

    // If HTML, do basic conversion
    if (mimeType === 'text/html') {
      return this.htmlToMarkdown(text);
    }

    // Map heading text → level (first detection wins)
    const levelByText = new Map<string, number>();
    for (const heading of headings) {
      if (!levelByText.has(heading.text)) {
        levelByText.set(heading.text, heading.level);
      }
    }

    const lines = text.split('\n');
    const out: string[] = [];
    for (let i = 0; i < lines.length; i++) {
      const trimmed = lines[i].trim();
      const level = levelByText.get(trimmed);
      if (level !== undefined && !trimmed.startsWith('#')) {
        out.push('#'.repeat(Math.min(level, 6)) + ' ' + trimmed);
        // Drop a setext underline that immediately follows the heading
        const next = i + 1 < lines.length ? lines[i + 1].trim() : '';
        if (/^={3,}$/.test(next) || /^-{3,}$/.test(next)) {
          i++;
        }
      } else {
        out.push(lines[i]);
      }
    }
    return out.join('\n');
  }

  /**
   * Basic regex-driven HTML → Markdown conversion. Best-effort only: tags
   * spanning multiple lines and nested markup are not fully handled.
   */
  private static htmlToMarkdown(html: string): string {
    return html
      // Headers
      .replace(/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/gi, (_, level, text) => {
        return '#'.repeat(parseInt(level)) + ' ' + text.trim() + '\n\n';
      })
      // Paragraphs
      .replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
      // Bold
      .replace(/<(strong|b)[^>]*>(.*?)<\/(strong|b)>/gi, '**$2**')
      // Italic
      .replace(/<(em|i)[^>]*>(.*?)<\/(em|i)>/gi, '*$2*')
      // Links (accepts single- or double-quoted href)
      .replace(/<a[^>]*href=["']([^"']*)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
      // Line breaks
      .replace(/<br[^>]*>/gi, '\n')
      // Remove remaining HTML tags
      .replace(/<[^>]*>/g, '')
      // Clean up excessive whitespace
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }
}