Content Plan Builder

document-parser.ts•5.84 KiB

/**
 * Content Plan Builder - Document Parser
 *
 * Parses various document formats (PDF, DOCX, TXT, MD, HTML, CSV, JSON) into plain text.
 */

import { readFile } from 'fs/promises';
import { basename, extname } from 'path';
import mammoth from 'mammoth';
import pdfParse from 'pdf-parse';
import type { ParsedDocument, ContentPlanInput } from './types.js';

/** Maximum number of CSV data rows rendered into the text output. */
const CSV_MAX_ROWS = 100;

/** Named HTML entities decoded by {@link decodeHtmlEntities}. */
const HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

/**
 * Parse a document file into plain text.
 *
 * The parser is chosen from the (case-insensitive) file extension; unknown
 * extensions are read as UTF-8 plain text.
 *
 * @param filePath - Path of the file to parse.
 * @returns The extracted text together with the file name, detected type and any warnings.
 * @throws Error with the message `Failed to parse <fileName>: <reason>` when reading or parsing fails.
 */
export async function parseDocument(filePath: string): Promise<ParsedDocument> {
  const ext = extname(filePath).toLowerCase();
  // basename handles the platform's separators; fall back to the raw path if it yields nothing.
  const fileName = basename(filePath) || filePath;

  try {
    switch (ext) {
      case '.pdf':
        return await parsePdf(filePath, fileName);
      case '.docx':
        return await parseDocx(filePath, fileName);
      case '.txt':
        return await parseTextFile(filePath, fileName, 'txt');
      case '.md':
      case '.markdown':
        return await parseTextFile(filePath, fileName, 'md');
      case '.html':
      case '.htm':
        return await parseHtml(filePath, fileName);
      case '.csv':
        return await parseCsv(filePath, fileName);
      case '.json':
        return await parseJson(filePath, fileName);
      default:
        // Try to read as plain text
        return await parseTextFile(filePath, fileName, 'txt');
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Unknown error';
    throw new Error(`Failed to parse ${fileName}: ${message}`);
  }
}

/**
 * Read a file as UTF-8 text, dropping a leading byte-order mark if present.
 *
 * A BOM left in place would make `JSON.parse` throw and would end up inside
 * the first CSV header or the first word of a text document.
 */
async function readTextFile(filePath: string): Promise<string> {
  const raw = await readFile(filePath, 'utf-8');
  return raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
}

/**
 * Parse a PDF file.
 *
 * Adds a warning when no non-whitespace text could be extracted.
 */
async function parsePdf(filePath: string, fileName: string): Promise<ParsedDocument> {
  const buffer = await readFile(filePath);
  const data = await pdfParse(buffer);

  // Whitespace-only output carries no usable text, so treat it the same as empty output.
  const hasText = data.text.trim().length > 0;

  return {
    text: data.text,
    fileName,
    type: 'pdf',
    pageCount: data.numpages,
    warnings: hasText ? undefined : ['PDF appears to be empty or image-only'],
  };
}

/**
 * Parse a DOCX file.
 *
 * Messages reported by the extractor are surfaced as warnings.
 */
async function parseDocx(filePath: string, fileName: string): Promise<ParsedDocument> {
  const buffer = await readFile(filePath);
  const result = await mammoth.extractRawText({ buffer });

  return {
    text: result.value,
    fileName,
    type: 'docx',
    warnings: result.messages.length > 0 ? result.messages.map(m => m.message) : undefined,
  };
}

/**
 * Parse a plain text or markdown file.
 */
async function parseTextFile(
  filePath: string,
  fileName: string,
  type: 'txt' | 'md'
): Promise<ParsedDocument> {
  const text = await readTextFile(filePath);
  return { text, fileName, type };
}

/**
 * Decode the common named entities and numeric character references in one pass.
 *
 * A single pass matters: decoding `&amp;` and then `&lt;` in separate passes
 * would turn the literal text `&amp;lt;` into `<` instead of `&lt;`.
 * Unknown or out-of-range references are left untouched.
 */
function decodeHtmlEntities(input: string): string {
  return input.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws outside the Unicode range, so guard first.
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    }
    return HTML_ENTITIES.get(body.toLowerCase()) ?? match;
  });
}

/**
 * Parse an HTML file (basic extraction).
 *
 * Removes comments, script and style blocks and all tags, decodes entities and
 * collapses whitespace. This is a regex-based approximation, not an HTML parser.
 */
async function parseHtml(filePath: string, fileName: string): Promise<ParsedDocument> {
  const html = await readTextFile(filePath);

  // Tags are stripped before entities are decoded so that escaped markup
  // such as `&lt;b&gt;` survives as literal text.
  const withoutTags = html
    .replace(/<!--[\s\S]*?-->/g, '')
    .replace(/<script[^>]*>[\s\S]*?<\/script\s*>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style\s*>/gi, '')
    .replace(/<[^>]+>/g, ' ');

  const text = decodeHtmlEntities(withoutTags).replace(/\s+/g, ' ').trim();

  return { text, fileName, type: 'html' };
}

/**
 * Split one CSV record into trimmed fields.
 *
 * Supports double-quoted fields containing commas and `""` as an escaped
 * quote. Records that span several physical lines are not supported.
 */
function splitCsvLine(line: string): string[] {
  const fields: string[] = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const ch = line.charAt(i);
    if (inQuotes) {
      if (ch !== '"') {
        current += ch;
      } else if (line.charAt(i + 1) === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        current += '"';
        i++;
      } else {
        inQuotes = false;
      }
    } else if (ch === '"') {
      inQuotes = true;
    } else if (ch === ',') {
      fields.push(current.trim());
      current = '';
    } else {
      current += ch;
    }
  }

  fields.push(current.trim());
  return fields;
}

/**
 * Parse a CSV file into a readable "column: value" listing.
 *
 * The first non-blank line is treated as the header row. At most
 * {@link CSV_MAX_ROWS} data rows are rendered; the remainder is summarised.
 */
async function parseCsv(filePath: string, fileName: string): Promise<ParsedDocument> {
  const csv = await readTextFile(filePath);

  // Accept both LF and CRLF, and ignore blank lines (notably the trailing one)
  // so they neither render as empty rows nor inflate the "more rows" count.
  const lines = csv.split(/\r?\n/).filter(line => line.trim().length > 0);
  const [headerLine, ...dataLines] = lines;
  const headers = headerLine === undefined ? [] : splitCsvLine(headerLine);

  let text = `CSV Data from ${fileName}:\n\n`;
  text += `Columns: ${headers.join(', ')}\n\n`;

  dataLines.slice(0, CSV_MAX_ROWS).forEach((line, index) => {
    const values = splitCsvLine(line);
    const row = headers.map((h, idx) => `${h}: ${values[idx] ?? ''}`).join(', ');
    text += `Row ${index + 1}: ${row}\n`;
  });

  if (dataLines.length > CSV_MAX_ROWS) {
    text += `\n... and ${dataLines.length - CSV_MAX_ROWS} more rows`;
  }

  return { text, fileName, type: 'csv' };
}

/**
 * Parse a JSON file and re-serialise it with two-space indentation.
 *
 * @throws SyntaxError when the file does not contain valid JSON.
 */
async function parseJson(filePath: string, fileName: string): Promise<ParsedDocument> {
  const json = await readTextFile(filePath);
  // The value is only re-serialised, so `unknown` is sufficient.
  const data: unknown = JSON.parse(json);

  const text = `JSON Data from ${fileName}:\n\n${JSON.stringify(data, null, 2)}`;

  return { text, fileName, type: 'json' };
}

/**
 * Process a ContentPlanInput and extract text content.
 *
 * For `file` inputs, `content` is treated as a file path and parsed; for every
 * other input type, `content` is returned unchanged.
 *
 * @throws Error when a `file` input cannot be parsed.
 */
export async function processInput(input: ContentPlanInput): Promise<string> {
  switch (input.type) {
    case 'file': {
      // Content is a file path
      const doc = await parseDocument(input.content);
      return `[Document: ${doc.fileName}]\n\n${doc.text}`;
    }
    case 'text':
    case 'transcript':
    case 'meeting_notes':
      // Content is already text
      return input.content;
    default:
      return input.content;
  }
}

/**
 * Process multiple inputs and combine them into a single context string.
 *
 * Inputs are processed one at a time, in order. An input that fails is logged
 * to stderr and skipped, so one bad file does not discard the others.
 *
 * @returns The processed texts joined by a `---` separator line.
 */
export async function processAllInputs(inputs: ContentPlanInput[]): Promise<string> {
  const processedTexts: string[] = [];

  for (const input of inputs) {
    try {
      processedTexts.push(await processInput(input));
    } catch (error) {
      const message = error instanceof Error ? error.message : 'Unknown error';
      console.error(`Warning: Failed to process input: ${message}`);
      // Continue with other inputs
    }
  }

  return processedTexts.join('\n\n---\n\n');
}

/**
 * Determine the primary (most frequent) input type from a list of inputs.
 *
 * @returns The most common `type`; ties go to the type that appears first.
 *          Returns `'content'` when `inputs` is empty.
 */
export function getPrimaryInputType(inputs: ContentPlanInput[]): string {
  const typeCounts = new Map<string, number>();
  for (const input of inputs) {
    typeCounts.set(input.type, (typeCounts.get(input.type) ?? 0) + 1);
  }

  let primaryType = 'content';
  let highestCount = 0;
  // Map iterates in insertion order, and the strict `>` keeps the earliest type on ties.
  for (const [type, count] of typeCounts) {
    if (count > highestCount) {
      primaryType = type;
      highestCount = count;
    }
  }

  return primaryType;
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/smcdonnell7/content-plan-builder'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

document-parser.ts•5.84 KiB