// docx-processor.ts (11.6 kB)
/**
* Microsoft Word document processor
* Handles DOCX file processing and conversion to markdown
*/
import mammoth from 'mammoth';
import { ProcessedContent, ContentMetadata, DocumentStructure, ContentChunk, Heading, ImageInfo, TableInfo } from '../types/content.js';
/**
 * Options accepted by `DOCXProcessor.processDOCX`.
 * Every field is optional; omitting the whole object is allowed.
 */
export interface DOCXProcessorOptions {
/**
 * Size limit, measured in characters of markdown, used when grouping
 * paragraphs into chunks. Falls back to 4000 when omitted or 0.
 */
chunkSize?: number;
/** NOTE(review): not read by DOCXProcessor in this file — confirm whether it is still needed. */
preserveFormatting?: boolean;
/**
 * When truthy, headings are collected from the generated markdown.
 * Otherwise the heading list is empty, and so are the sections and table of
 * contents that are derived from it.
 */
extractHeadings?: boolean;
/**
 * Unless explicitly `false`, an image converter is registered with mammoth
 * that emits each image as a base64 `data:` URI in the intermediate HTML.
 */
includeImages?: boolean;
/** NOTE(review): not read by DOCXProcessor in this file — table metadata is always collected. */
includeTables?: boolean;
}
/**
 * Converts Microsoft Word (DOCX) documents to markdown.
 *
 * The document is first converted to HTML by mammoth; the HTML is then
 * reduced to markdown with a small regex-based converter, and headings,
 * images, tables, sections and size-bounded chunks are derived from the
 * result.
 *
 * Known limitations of the regex-based converter: nested lists and nested
 * tables are flattened, images are dropped from the markdown (they are only
 * reported in the metadata), and markdown-significant characters in the text
 * are not escaped (except `|` inside table cells).
 */
export class DOCXProcessor {
  /** Chunk size (in characters) used when the caller does not supply one. */
  private static readonly DEFAULT_CHUNK_SIZE = 4000;

  /**
   * Named character references understood by `decodeEntities`.
   * A Map is used (rather than an object literal) so that names such as
   * `constructor` can never resolve to an inherited property.
   */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"]
  ]);

  /** Common English function words used by the language heuristic. */
  private static readonly ENGLISH_STOP_WORDS =
    /\b(?:the|and|or|but|in|on|at|to|for|of|with|by)\b/g;

  /**
   * Process a DOCX file and convert it to markdown.
   *
   * @param buffer - Raw bytes of the `.docx` file.
   * @param options - Processing options. `includeTables` and
   *   `preserveFormatting` are currently not consulted.
   * @returns The markdown together with metadata, structure and chunks.
   * @throws Error with the prefix `Failed to process DOCX file:` when the
   *   conversion fails for any reason.
   */
  async processDOCX(
    buffer: Buffer,
    options: DOCXProcessorOptions = {}
  ): Promise<ProcessedContent> {
    try {
      // Map Word paragraph/run styles onto the HTML elements that
      // htmlToMarkdown knows how to translate.
      const mammothOptions: any = {
        styleMap: [
          "p[style-name='Heading 1'] => h1:fresh",
          "p[style-name='Heading 2'] => h2:fresh",
          "p[style-name='Heading 3'] => h3:fresh",
          "p[style-name='Heading 4'] => h4:fresh",
          "p[style-name='Heading 5'] => h5:fresh",
          "p[style-name='Heading 6'] => h6:fresh",
          "p[style-name='Title'] => h1:fresh",
          "p[style-name='Subtitle'] => h2:fresh",
          "r[style-name='Strong'] => strong",
          "r[style-name='Emphasis'] => em"
        ]
      };

      // Emit images as base64 data URIs so extractImages can report them.
      // NOTE(review): mammoth's documented default also inlines images, so
      // `includeImages: false` may not actually suppress them — confirm.
      if (options.includeImages !== false) {
        mammothOptions.convertImage = mammoth.images.imgElement((image: any) =>
          image.read('base64').then((encoded: string) => ({
            src: `data:${image.contentType};base64,${encoded}`
          }))
        );
      }

      // mammoth expects an input descriptor ({ buffer } / { path }), not the
      // raw Buffer itself.
      const htmlResult = await mammoth.convertToHtml({ buffer }, mammothOptions);
      const html: string = htmlResult.value;

      const markdown = this.htmlToMarkdown(html);
      const headings = options.extractHeadings ? this.extractHeadings(markdown) : [];
      const images = this.extractImages(html);
      const tables = this.extractTables(html);
      const structure = this.analyzeStructure(markdown, headings);

      // `||` (not `??`) on purpose: a chunk size of 0 is meaningless and
      // falls back to the default as well.
      const chunks = this.createChunks(
        markdown,
        options.chunkSize || DOCXProcessor.DEFAULT_CHUNK_SIZE
      );

      const metadata: ContentMetadata = {
        wordCount: this.countWords(markdown),
        language: this.detectLanguage(markdown),
        headings,
        images,
        tables,
        lastProcessed: new Date()
      };

      return { markdown, metadata, structure, chunks };
    } catch (error) {
      throw new Error(`Failed to process DOCX file: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Convert mammoth's HTML output to markdown.
   *
   * Order matters:
   *  1. headings are reduced to plain text (inline markup is dropped there),
   *  2. inline formatting is translated while it is still inside its block,
   *  3. tables, lists and paragraphs are translated,
   *  4. leftover tags are stripped,
   *  5. entities are decoded last, so that decoded `<`/`>` characters are
   *     never mistaken for tags.
   *
   * @param html - HTML fragment produced by mammoth.
   * @returns Trimmed markdown text.
   */
  private htmlToMarkdown(html: string): string {
    const markdown = html
      // Headings
      .replace(/<h([1-6])\b[^>]*>(.*?)<\/h\1>/gis, (_match, level: string, inner: string) => {
        const text = this.cleanText(inner);
        return text ? `${'#'.repeat(Number(level))} ${text}\n\n` : '';
      })
      // Strong/Bold (`\b` keeps `<b` from matching `<br>` or `<blockquote>`)
      .replace(/<(strong|b)\b[^>]*>(.*?)<\/\1>/gis, '**$2**')
      // Emphasis/Italic (`\b` keeps `<i` from matching `<img>`)
      .replace(/<(em|i)\b[^>]*>(.*?)<\/\1>/gis, '*$2*')
      // Strikethrough (`\b` keeps `<s` from matching `<span>` or `<sup>`)
      .replace(/<(s|strike|del)\b[^>]*>(.*?)<\/\1>/gis, '~~$2~~')
      // Links
      .replace(/<a\b[^>]*?\shref="([^"]*)"[^>]*>(.*?)<\/a>/gis, '[$2]($1)')
      // Tables: surrounded by blank lines so they form their own block
      .replace(/<table\b[^>]*>(.*?)<\/table>/gis, (_match, inner: string) => {
        const table = this.processTable(inner);
        return table ? `\n\n${table}\n\n` : '';
      })
      // Lists
      .replace(/<ul\b[^>]*>(.*?)<\/ul>/gis, (_match, inner: string) => {
        return `\n\n${this.processUnorderedList(inner)}\n`;
      })
      .replace(/<ol\b[^>]*>(.*?)<\/ol>/gis, (_match, inner: string) => {
        return `\n\n${this.processOrderedList(inner)}\n`;
      })
      // Paragraphs
      .replace(/<p\b[^>]*>(.*?)<\/p>/gis, (_match, inner: string) => {
        const text = this.cleanText(inner);
        return text ? `${text}\n\n` : '';
      })
      // Line breaks outside of any block handled above
      .replace(/<br\b[^>]*>/gi, '\n')
      // Remove remaining HTML tags (this also drops <img> and <u>)
      .replace(/<[^>]*>/g, '')
      // Clean up excessive blank lines
      .replace(/\n{3,}/g, '\n\n');

    return this.decodeEntities(markdown).trim();
  }

  /**
   * Render the `<li>` items of an unordered list as `- item` lines.
   *
   * @param content - Inner HTML of a `<ul>` element.
   * @returns One line per item, each terminated by a newline.
   */
  private processUnorderedList(content: string): string {
    return this.renderListItems(content, () => '- ');
  }

  /**
   * Render the `<li>` items of an ordered list as `1. item` lines.
   *
   * @param content - Inner HTML of an `<ol>` element.
   * @returns One line per item, numbered from 1, each terminated by a newline.
   */
  private processOrderedList(content: string): string {
    return this.renderListItems(content, index => `${index + 1}. `);
  }

  /**
   * Shared list renderer: every `<li>` becomes a single line of plain text
   * prefixed by the marker for its position.
   */
  private renderListItems(content: string, marker: (index: number) => string): string {
    const items = content.match(/<li\b[^>]*>(.*?)<\/li>/gis) ?? [];
    return items
      .map((item, index) => `${marker(index)}${this.cleanText(item)}\n`)
      .join('');
  }

  /**
   * Convert the inner HTML of a `<table>` to a pipe table.
   *
   * The first row that contains cells is treated as the header row and is
   * followed by the `---` separator line. Cell text is flattened to a single
   * line and literal `|` characters are escaped so they cannot split a cell.
   *
   * @param content - Inner HTML of a `<table>` element.
   * @returns The markdown table, or an empty string when no cells are found.
   */
  private processTable(content: string): string {
    const lines: string[] = [];
    const rowMatches = content.match(/<tr\b[^>]*>(.*?)<\/tr>/gis) ?? [];

    for (const rowHtml of rowMatches) {
      const cellMatches = rowHtml.match(/<(td|th)\b[^>]*>(.*?)<\/\1>/gis) ?? [];
      const cells = cellMatches.map(cellHtml => {
        const text = this.cleanText(cellHtml).replace(/\|/g, '\\|');
        return text || ' ';
      });
      if (cells.length === 0) continue;

      lines.push(`| ${cells.join(' | ')} |`);
      // Header separator goes after the first row that was actually emitted.
      if (lines.length === 1) {
        lines.push(`| ${cells.map(() => '---').join(' | ')} |`);
      }
    }

    return lines.join('\n');
  }

  /**
   * Reduce an HTML fragment to a single line of text.
   *
   * Line breaks and block-level tags become spaces (so adjacent words are not
   * fused), every other tag is removed, and whitespace runs are collapsed.
   * Entities are deliberately left encoded: htmlToMarkdown decodes them once
   * at the very end.
   */
  private cleanText(text: string): string {
    return text
      .replace(/<\/?(?:br|p|div|li|ul|ol|tr|td|th|h[1-6])\b[^>]*>/gi, ' ')
      .replace(/<[^>]*>/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Decode HTML character references in a single pass.
   *
   * Handles the named references in NAMED_ENTITIES plus decimal (`&#39;`) and
   * hexadecimal (`&#x27;`) numeric references. Unknown or out-of-range
   * references are left untouched. Because the output is never rescanned,
   * `&amp;lt;` correctly becomes `&lt;` rather than `<`.
   */
  private decodeEntities(text: string): string {
    return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
      if (body.startsWith('#')) {
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        return codePoint > 0 && codePoint <= 0x10ffff
          ? String.fromCodePoint(codePoint)
          : entity;
      }
      return DOCXProcessor.NAMED_ENTITIES.get(body) ?? entity;
    });
  }

  /**
   * Collect ATX headings (`#` to `######`) from markdown.
   *
   * @param markdown - Markdown text to scan line by line.
   * @returns Headings in document order; `position` is the character offset
   *   of the start of the heading's line within `markdown`.
   */
  private extractHeadings(markdown: string): Heading[] {
    const headings: Heading[] = [];
    // Running offset instead of indexOf(line): repeated heading text must not
    // resolve to the first occurrence.
    let offset = 0;

    for (const line of markdown.split('\n')) {
      const match = /^(#{1,6})\s+(.+)$/.exec(line.trim());
      if (match) {
        headings.push({
          level: match[1].length,
          text: match[2],
          position: offset
        });
      }
      offset += line.length + 1; // +1 for the '\n' removed by split
    }

    return headings;
  }

  /**
   * Collect image information from HTML.
   *
   * Attributes are read independently of their order inside the tag. Images
   * without a `src` attribute are skipped; a missing or empty `alt` is
   * reported as `'Image'`.
   *
   * @param html - HTML produced by mammoth.
   * @returns One entry per `<img>`; `position` is the offset of the tag.
   */
  private extractImages(html: string): ImageInfo[] {
    const images: ImageInfo[] = [];
    const imageRegex = /<img\b[^>]*>/gi;
    let match: RegExpExecArray | null;

    while ((match = imageRegex.exec(html)) !== null) {
      const tag = match[0];
      const src = /\ssrc="([^"]*)"/i.exec(tag);
      if (!src) continue;
      const alt = /\salt="([^"]*)"/i.exec(tag);

      images.push({
        src: this.decodeEntities(src[1]),
        alt: this.decodeEntities(alt ? alt[1] : '') || 'Image',
        position: match.index
      });
    }

    return images;
  }

  /**
   * Collect table dimensions from HTML.
   *
   * @param html - HTML produced by mammoth.
   * @returns One entry per table that has at least one row and one column;
   *   `columns` is the number of cells in the first row and `position` is the
   *   offset of the `<table>` tag.
   */
  private extractTables(html: string): TableInfo[] {
    const tables: TableInfo[] = [];
    const tableRegex = /<table\b[^>]*>(.*?)<\/table>/gis;
    let match: RegExpExecArray | null;

    while ((match = tableRegex.exec(html)) !== null) {
      const inner = match[1];
      const rows = (inner.match(/<tr\b[^>]*>/gi) ?? []).length;

      // The `s` flag lets the first row span several lines, matching what
      // processTable accepts.
      const firstRow = /<tr\b[^>]*>(.*?)<\/tr>/is.exec(inner);
      const columns = firstRow
        ? (firstRow[1].match(/<(?:td|th)\b[^>]*>/gi) ?? []).length
        : 0;

      if (rows > 0 && columns > 0) {
        tables.push({ rows, columns, position: match.index });
      }
    }

    return tables;
  }

  /**
   * Derive sections and a table of contents from the headings.
   *
   * Each section runs from its heading to the next heading of any level (or
   * to the end of the document). Text before the first heading does not
   * belong to any section.
   */
  private analyzeStructure(markdown: string, headings: Heading[]): DocumentStructure {
    const sections = headings.map((heading, index) => {
      const next = headings[index + 1];
      const endPosition = next ? next.position : markdown.length;
      return {
        title: heading.text,
        level: heading.level,
        startPosition: heading.position,
        endPosition,
        content: markdown.slice(heading.position, endPosition).trim()
      };
    });

    const toc = headings.map(({ text, level, position }) => ({
      title: text,
      level,
      position
    }));

    return {
      sections,
      toc,
      pageBreaks: [] // DOCX doesn't have traditional page breaks in this context
    };
  }

  /**
   * Group paragraphs (separated by a blank line) into chunks.
   *
   * Paragraphs are appended to the current chunk until adding the next one
   * would exceed `chunkSize` characters. A single paragraph longer than
   * `chunkSize` is kept whole in a chunk of its own.
   *
   * @param content - Markdown text to split.
   * @param chunkSize - Target maximum chunk length in characters.
   * @returns Chunks in document order; `position.start`/`position.end` are
   *   the offsets in `content` of the first character of the chunk's first
   *   paragraph and the end of its last paragraph.
   */
  private createChunks(content: string, chunkSize: number): ContentChunk[] {
    const separator = '\n\n';
    const chunks: ContentChunk[] = [];
    let current = '';
    let chunkStart = 0;
    let chunkEnd = 0;
    let offset = 0;

    const flush = (): void => {
      if (!current.trim()) return;
      chunks.push({
        id: `chunk_${chunks.length}`,
        content: current.trim(),
        metadata: {
          wordCount: this.countWords(current),
          position: { start: chunkStart, end: chunkEnd }
        }
      });
    };

    for (const paragraph of content.split(separator)) {
      // Track the real offset of every piece, including skipped blank ones.
      const paragraphStart = offset;
      offset += paragraph.length + separator.length;
      if (!paragraph.trim()) continue;

      const candidate = current ? current + separator + paragraph : paragraph;
      if (current && candidate.length > chunkSize) {
        flush();
        current = paragraph;
        chunkStart = paragraphStart;
      } else {
        if (!current) chunkStart = paragraphStart;
        current = candidate;
      }
      chunkEnd = paragraphStart + paragraph.length;
    }

    flush();
    return chunks;
  }

  /**
   * Count whitespace-separated words in text.
   */
  private countWords(text: string): number {
    return text.trim().split(/\s+/).filter(word => word.length > 0).length;
  }

  /**
   * Very rough language detection.
   *
   * Counts common English function words in the first 1000 characters.
   *
   * @returns `'en'` when more than five are found, otherwise `'unknown'`.
   */
  private detectLanguage(text: string): string {
    const sample = text.slice(0, 1000).toLowerCase();
    const englishCount = (sample.match(DOCXProcessor.ENGLISH_STOP_WORDS) ?? []).length;
    return englishCount > 5 ? 'en' : 'unknown';
  }
}