/**
* Content Plan Builder - Document Parser
*
* Parses various document formats (PDF, DOCX, TXT, MD, etc.) into plain text.
*/
import { readFile } from 'fs/promises';
import { basename, extname } from 'path';
import mammoth from 'mammoth';
import pdfParse from 'pdf-parse';
import type { ParsedDocument, ContentPlanInput } from './types.js';
/**
 * Parse a document file into plain text.
 *
 * The parser is selected from the file extension (compared case-insensitively):
 * `.pdf`, `.docx`, `.txt`, `.md` / `.markdown`, `.html` / `.htm`, `.csv`, `.json`.
 * Any other extension (or none at all) is read as plain text.
 *
 * @param filePath - Path of the file to read.
 * @returns The parsed document produced by the format-specific parser.
 * @throws Error with the message `Failed to parse <fileName>: <reason>` when
 *   reading or parsing the file fails.
 */
export async function parseDocument(filePath: string): Promise<ParsedDocument> {
  const ext = extname(filePath).toLowerCase();
  // basename() honours the platform's path separators (including `\` on
  // Windows), which a manual split on '/' does not. Fall back to the raw path
  // when it yields an empty string.
  const fileName = basename(filePath) || filePath;
  try {
    switch (ext) {
      case '.pdf':
        return await parsePdf(filePath, fileName);
      case '.docx':
        return await parseDocx(filePath, fileName);
      case '.txt':
        return await parseTextFile(filePath, fileName, 'txt');
      case '.md':
      case '.markdown':
        return await parseTextFile(filePath, fileName, 'md');
      case '.html':
      case '.htm':
        return await parseHtml(filePath, fileName);
      case '.csv':
        return await parseCsv(filePath, fileName);
      case '.json':
        return await parseJson(filePath, fileName);
      default:
        // Unknown extension: try to read as plain text
        return await parseTextFile(filePath, fileName, 'txt');
    }
  } catch (error) {
    // Re-throw with the file name so callers can tell which input failed.
    const message = error instanceof Error ? error.message : 'Unknown error';
    throw new Error(`Failed to parse ${fileName}: ${message}`);
  }
}
/**
 * Parse a PDF file.
 *
 * @param filePath - Path of the PDF to read.
 * @param fileName - Name reported in the returned document.
 * @returns The extracted text and page count. A warning is attached when no
 *   visible text could be extracted.
 */
async function parsePdf(filePath: string, fileName: string): Promise<ParsedDocument> {
  const buffer = await readFile(filePath);
  const data = await pdfParse(buffer);
  // Extracted text can consist solely of whitespace (for example page-break
  // newlines) when no page has a text layer, so check the trimmed text rather
  // than the raw length.
  const hasText = data.text.trim().length > 0;
  return {
    text: data.text,
    fileName,
    type: 'pdf',
    pageCount: data.numpages,
    warnings: hasText ? undefined : ['PDF appears to be empty or image-only']
  };
}
/**
 * Extract the raw text of a DOCX file with mammoth.
 *
 * @param filePath - Path of the DOCX file to read.
 * @param fileName - Name reported in the returned document.
 * @returns The raw text; any messages mammoth reported are surfaced as
 *   `warnings` (left `undefined` when there are none).
 */
async function parseDocx(filePath: string, fileName: string): Promise<ParsedDocument> {
  const docxBytes = await readFile(filePath);
  const { value, messages } = await mammoth.extractRawText({ buffer: docxBytes });

  let warnings: string[] | undefined;
  if (messages.length > 0) {
    warnings = messages.map(({ message }) => message);
  }

  return { text: value, fileName, type: 'docx', warnings };
}
/**
 * Read a plain-text or markdown file as UTF-8.
 *
 * @param filePath - Path of the file to read.
 * @param fileName - Name reported in the returned document.
 * @param type - Document type to report: `'txt'` or `'md'`.
 * @returns The file contents, unmodified, together with the name and type.
 */
async function parseTextFile(
  filePath: string,
  fileName: string,
  type: 'txt' | 'md'
): Promise<ParsedDocument> {
  const contents = await readFile(filePath, { encoding: 'utf-8' });
  return { text: contents, fileName, type };
}
/**
 * Parse an HTML file (basic extraction).
 *
 * Removes `<script>` and `<style>` elements together with their contents,
 * replaces every remaining tag with a space, decodes a small set of common
 * character entities, and collapses runs of whitespace.
 *
 * This is a regex-based extraction, not a real HTML parser.
 *
 * @param filePath - Path of the HTML file to read.
 * @param fileName - Name reported in the returned document.
 * @returns The extracted text with type `'html'`.
 */
async function parseHtml(filePath: string, fileName: string): Promise<ParsedDocument> {
  const html = await readFile(filePath, 'utf-8');
  // Entities decoded by this parser; anything else is left as written.
  const entityText: Record<string, string> = {
    nbsp: ' ',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    '#39': "'",
    amp: '&'
  };
  const text = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    // Decode after tag removal so a decoded '<' is never mistaken for markup.
    // A single pass keeps "&amp;lt;" as the literal text "&lt;" instead of
    // decoding it twice.
    .replace(
      /&(nbsp|lt|gt|quot|apos|#39|amp);/g,
      (entity: string, name: string) => entityText[name] ?? entity
    )
    .replace(/\s+/g, ' ')
    .trim();
  return {
    text,
    fileName,
    type: 'html'
  };
}
/**
 * Parse a CSV file into a readable text summary.
 *
 * The first non-blank line is treated as the header row. Up to 100 data rows
 * are rendered as `Row N: header: value, ...`; if there are more, a trailing
 * note states how many rows were omitted.
 *
 * Fields are split on every comma, so quoted fields that contain commas are
 * not supported.
 *
 * @param filePath - Path of the CSV file to read.
 * @param fileName - Name reported in the returned document and in the text.
 * @returns The rendered text with type `'csv'`.
 */
async function parseCsv(filePath: string, fileName: string): Promise<ParsedDocument> {
  const maxRows = 100;
  const csv = await readFile(filePath, 'utf-8');
  // Accept LF and CRLF line endings, and drop blank lines so that a trailing
  // newline is neither rendered as an empty row nor counted as one.
  const lines = csv.split(/\r?\n/).filter(line => line.trim() !== '');
  const [headerLine = '', ...dataLines] = lines;
  const headers = headerLine.split(',').map(h => h.trim());

  const parts: string[] = [
    `CSV Data from ${fileName}:\n\n`,
    `Columns: ${headers.join(', ')}\n\n`
  ];
  dataLines.slice(0, maxRows).forEach((line, index) => {
    const values = line.split(',');
    // Rows shorter than the header get empty strings for the missing cells.
    const row = headers
      .map((h, idx) => `${h}: ${values[idx]?.trim() ?? ''}`)
      .join(', ');
    parts.push(`Row ${index + 1}: ${row}\n`);
  });
  if (dataLines.length > maxRows) {
    parts.push(`\n... and ${dataLines.length - maxRows} more rows`);
  }

  return {
    text: parts.join(''),
    fileName,
    type: 'csv'
  };
}
/**
 * Parse a JSON file into readable, pretty-printed text.
 *
 * @param filePath - Path of the JSON file to read.
 * @param fileName - Name reported in the returned document and in the text.
 * @returns A document of type `'json'` whose text is a heading followed by the
 *   value re-serialized with two-space indentation.
 * @throws SyntaxError when the file content is not valid JSON.
 */
async function parseJson(filePath: string, fileName: string): Promise<ParsedDocument> {
  const raw = await readFile(filePath, 'utf-8');
  // JSON.parse rejects a leading byte-order mark, and decoding the file as
  // UTF-8 keeps it, so strip it before parsing.
  const json = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  const data: unknown = JSON.parse(json);
  // Convert JSON to readable text
  const text = `JSON Data from ${fileName}:\n\n${JSON.stringify(data, null, 2)}`;
  return {
    text,
    fileName,
    type: 'json'
  };
}
/**
 * Turn a single ContentPlanInput into text.
 *
 * For `file` inputs, `content` is treated as a file path: the file is parsed
 * and its text is returned under a `[Document: <name>]` heading. For every
 * other input type, `content` is already text and is returned unchanged.
 *
 * @param input - The input to convert.
 * @returns The text content for this input.
 */
export async function processInput(input: ContentPlanInput): Promise<string> {
  // Everything except a file reference is passed through as-is.
  if (input.type !== 'file') {
    return input.content;
  }

  const { fileName, text } = await parseDocument(input.content);
  return `[Document: ${fileName}]\n\n${text}`;
}
/**
 * Convert every input to text and join the results into one context string.
 *
 * Inputs are processed one after another, in order. An input that fails is
 * reported on stderr and skipped; the remaining inputs are still processed.
 *
 * @param inputs - The inputs to process.
 * @returns The successfully processed texts separated by `\n\n---\n\n`
 *   (an empty string when nothing was processed).
 */
export async function processAllInputs(inputs: ContentPlanInput[]): Promise<string> {
  const sections: string[] = [];

  for (const entry of inputs) {
    let section: string;
    try {
      section = await processInput(entry);
    } catch (err) {
      const reason = err instanceof Error ? err.message : 'Unknown error';
      console.error(`Warning: Failed to process input: ${reason}`);
      // Skip this input and carry on with the rest.
      continue;
    }
    sections.push(section);
  }

  return sections.join('\n\n---\n\n');
}
/**
 * Find the input type that occurs most often in a list of inputs.
 *
 * @param inputs - The inputs to inspect.
 * @returns The most frequent `type`; on a tie, the type that was seen first.
 *   Returns `'content'` when the list is empty.
 */
export function getPrimaryInputType(inputs: ContentPlanInput[]): string {
  if (inputs.length === 0) {
    return 'content';
  }

  // Tally how many times each type appears.
  const tally: Record<string, number> = {};
  inputs.forEach(({ type }) => {
    tally[type] = (tally[type] || 0) + 1;
  });

  // Keep the first entry with the highest count; the strict comparison
  // means earlier entries win ties.
  let winner = '';
  let highest = 0;
  for (const [kind, count] of Object.entries(tally)) {
    if (count > highest) {
      winner = kind;
      highest = count;
    }
  }
  return winner;
}