// text-extractor.ts
/**
* Basic text extraction utilities
*/
import { ProcessedContent, ContentMetadata, DocumentStructure, ContentChunk, Heading } from '../types/content.js';
export interface TextExtractionOptions {
  /** NOTE(review): not consulted anywhere in this file — presumably reserved for future use; confirm before relying on it. */
  preserveFormatting?: boolean;
  /** When true, heading detection runs and the results feed metadata, structure, and markdown conversion. */
  extractHeadings?: boolean;
  /** Maximum chunk length in characters; falsy or omitted falls back to 4000. */
  chunkSize?: number;
}
export class TextExtractor {
  /**
   * Extract structured content from a text buffer.
   *
   * @param buffer   Raw file contents, decoded as UTF-8.
   * @param mimeType Source MIME type; 'text/markdown' and 'text/html' get
   *                 special handling during markdown conversion.
   * @param options  Heading-extraction toggle and chunk size.
   * @returns Markdown text plus metadata, document structure, and content chunks.
   */
  static async extractText(
    buffer: Buffer,
    mimeType: string,
    options: TextExtractionOptions = {}
  ): Promise<ProcessedContent> {
    const text = buffer.toString('utf-8');

    // Normalize line endings / whitespace first so every downstream
    // position refers to the same normalized text.
    const cleanText = this.cleanText(text);

    // Extract headings only if requested
    const headings = options.extractHeadings ? this.extractHeadings(cleanText) : [];

    // Create document structure (sections + table of contents)
    const structure = this.analyzeStructure(cleanText, headings);

    // Generate chunks; '||' (not '??') deliberately maps an explicit 0 to the default
    const chunks = this.createChunks(cleanText, options.chunkSize || 4000);

    const metadata: ContentMetadata = {
      wordCount: this.countWords(cleanText),
      language: this.detectLanguage(cleanText),
      headings,
      images: [], // No images in plain text
      tables: [], // No tables in plain text
      lastProcessed: new Date()
    };

    const markdown = this.convertToMarkdown(cleanText, mimeType, headings);

    return { markdown, metadata, structure, chunks };
  }

  /**
   * Clean and normalize text content: CRLF/CR → LF, runs of spaces/tabs
   * collapsed to one space, 3+ blank lines collapsed to one blank line.
   */
  private static cleanText(text: string): string {
    return text
      // Normalize line endings
      .replace(/\r\n/g, '\n')
      .replace(/\r/g, '\n')
      // Remove excessive whitespace
      .replace(/[ \t]+/g, ' ')
      // Remove excessive line breaks (more than 2 consecutive)
      .replace(/\n{3,}/g, '\n\n')
      // Trim
      .trim();
  }

  /**
   * Extract headings using simple heuristics, in priority order:
   *  1. markdown-style '#'…'######' prefixes (level = number of '#'),
   *  2. setext-style underlines: next line all '=' (level 1) or '-' (level 2),
   *  3. short capitalized lines without terminal punctuation, followed by content (level 3).
   *
   * Positions are offsets into `text`, tracked with a running cursor so a
   * repeated heading line resolves to its own occurrence (a bare
   * `text.indexOf(line)` would always return the first one). Each line can
   * contribute at most ONE heading: the setext branches stop further matching
   * so an underlined title is no longer ALSO reported by heuristic 3.
   */
  private static extractHeadings(text: string): Heading[] {
    const headings: Heading[] = [];
    const lines = text.split('\n');
    let offset = 0; // start offset of the current line within `text`

    for (let index = 0; index < lines.length; index++) {
      const line = lines[index];
      const position = offset;
      offset += line.length + 1; // +1 for the '\n' consumed by split

      const trimmed = line.trim();

      // 1. Markdown-style heading
      const markdownMatch = trimmed.match(/^(#{1,6})\s+(.+)$/);
      if (markdownMatch) {
        headings.push({
          level: markdownMatch[1].length,
          text: markdownMatch[2],
          position
        });
        continue;
      }

      // 2. Setext-style heading (non-empty text required, so a blank line
      //    above '===' no longer yields an empty heading)
      if (trimmed.length > 0 && index < lines.length - 1) {
        const nextLine = lines[index + 1].trim();
        if (/^={3,}$/.test(nextLine)) {
          headings.push({ level: 1, text: trimmed, position });
          continue;
        }
        if (/^-{3,}$/.test(nextLine)) {
          headings.push({ level: 2, text: trimmed, position });
          continue;
        }
      }

      // 3. Heuristic heading: short, capitalized, no closing punctuation,
      //    and followed by a non-empty line
      if (
        trimmed.length > 0 && trimmed.length < 100 &&
        !trimmed.endsWith('.') && !trimmed.endsWith('!') && !trimmed.endsWith('?') &&
        /^[A-Z]/.test(trimmed) &&
        index < lines.length - 1 && lines[index + 1].trim().length > 0
      ) {
        headings.push({ level: 3, text: trimmed, position });
      }
    }

    return headings;
  }

  /**
   * Build section list and table of contents from the detected headings.
   * Each section spans from its heading to the next heading (or end of text).
   */
  private static analyzeStructure(text: string, headings: Heading[]): DocumentStructure {
    const sections = headings.map((heading, index) => {
      const nextHeading = headings[index + 1];
      const endPosition = nextHeading ? nextHeading.position : text.length;
      return {
        title: heading.text,
        level: heading.level,
        startPosition: heading.position,
        endPosition,
        content: text.slice(heading.position, endPosition).trim()
      };
    });

    const toc = headings.map(heading => ({
      title: heading.text,
      level: heading.level,
      position: heading.position
    }));

    return {
      sections,
      toc,
      pageBreaks: [] // No page breaks in plain text
    };
  }

  /**
   * Create content chunks: by section when the document has multiple
   * headings, otherwise by paragraph/sentence boundaries.
   */
  private static createChunks(text: string, chunkSize: number): ContentChunk[] {
    const headings = this.extractHeadings(text);
    if (headings.length > 1) {
      return this.createStructuralChunks(text, headings, chunkSize);
    }
    return this.createSemanticChunks(text, chunkSize);
  }

  /**
   * Create chunks aligned to document sections (one per heading); sections
   * larger than `chunkSize` are split further by splitLargeSection.
   */
  private static createStructuralChunks(text: string, headings: Heading[], chunkSize: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];

    for (let i = 0; i < headings.length; i++) {
      const heading = headings[i];
      const nextHeading = headings[i + 1];
      const startPos = heading.position;
      const endPos = nextHeading ? nextHeading.position : text.length;
      const sectionContent = text.slice(startPos, endPos).trim();

      if (sectionContent.length <= chunkSize) {
        // Section fits in one chunk
        chunks.push({
          id: `section_${i}`,
          content: sectionContent,
          section: heading.text,
          metadata: {
            wordCount: this.countWords(sectionContent),
            position: { start: startPos, end: endPos }
          }
        });
      } else {
        // Section needs to be split into multiple chunks
        const subChunks = this.splitLargeSection(sectionContent, chunkSize, heading.text, startPos);
        chunks.push(...subChunks);
      }
    }

    return chunks;
  }

  /**
   * Create chunks by accumulating paragraphs up to `chunkSize` characters.
   * An oversized paragraph — including one at the very start — is split at
   * sentence boundaries (the previous version never split an oversized FIRST
   * paragraph, and emitted an oversized later paragraph twice).
   * Positions are resolved with a forward-moving indexOf cursor so repeated
   * paragraphs map to their own occurrence.
   */
  private static createSemanticChunks(text: string, chunkSize: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    const paragraphs = text.split(/\n\s*\n/);

    let currentChunk = '';
    let chunkIndex = 0;
    let startPosition = 0;
    let searchFrom = 0; // cursor: paragraphs occur in order, so search from here

    const flush = () => {
      chunks.push({
        id: `chunk_${chunkIndex}`,
        content: currentChunk.trim(),
        metadata: {
          wordCount: this.countWords(currentChunk),
          position: { start: startPosition, end: startPosition + currentChunk.length }
        }
      });
      chunkIndex++;
      currentChunk = '';
    };

    for (const paragraph of paragraphs) {
      const trimmedParagraph = paragraph.trim();
      if (!trimmedParagraph) continue;

      const paragraphPos = text.indexOf(trimmedParagraph, searchFrom);
      searchFrom = paragraphPos + trimmedParagraph.length;

      // Oversized paragraph: flush pending content, then split at sentences.
      if (trimmedParagraph.length > chunkSize) {
        if (currentChunk.trim()) {
          flush();
        }
        const sentenceChunks = this.splitLargeParagraph(trimmedParagraph, chunkSize, paragraphPos);
        chunks.push(...sentenceChunks);
        chunkIndex += sentenceChunks.length;
        continue;
      }

      const testChunk = currentChunk ? `${currentChunk}\n\n${trimmedParagraph}` : trimmedParagraph;
      if (testChunk.length > chunkSize && currentChunk.length > 0) {
        // Current chunk is full — emit it and start over at this paragraph.
        flush();
        currentChunk = trimmedParagraph;
        startPosition = paragraphPos;
      } else {
        if (!currentChunk) {
          startPosition = paragraphPos;
        }
        currentChunk = testChunk;
      }
    }

    // Add final chunk
    if (currentChunk.trim()) {
      flush();
    }

    return chunks;
  }

  /**
   * Split a large section into chunks, repeating the section title (with a
   * "(continued)" marker) at the top of each follow-up chunk for context.
   *
   * NOTE(review): `paragraphs[0]` is assumed to start with the heading line;
   * positions of continued chunks are approximate because the synthetic
   * "(continued)" text does not exist in the source.
   */
  private static splitLargeSection(sectionContent: string, chunkSize: number, sectionTitle: string, basePosition: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    const paragraphs = sectionContent.split(/\n\s*\n/);
    const idBase = `section_${sectionTitle.replace(/\s+/g, '_').toLowerCase()}`;

    let chunkIndex = 0;
    let position = basePosition;

    // Always include section title in first chunk
    const titleLine = paragraphs[0];
    let currentChunk = titleLine;

    for (let i = 1; i < paragraphs.length; i++) {
      const paragraph = paragraphs[i].trim();
      if (!paragraph) continue;

      const testChunk = `${currentChunk}\n\n${paragraph}`;
      if (testChunk.length > chunkSize && currentChunk.length > titleLine.length) {
        const flushed = currentChunk.trim();
        chunks.push({
          id: `${idBase}_${chunkIndex}`,
          content: flushed,
          section: sectionTitle,
          metadata: {
            wordCount: this.countWords(flushed),
            position: { start: position, end: position + flushed.length }
          }
        });
        // Advance by what was emitted (the old code advanced by the NEW
        // chunk's length, skewing every subsequent position).
        position += flushed.length;
        chunkIndex++;
        // Start new chunk with section context
        currentChunk = `${titleLine} (continued)\n\n${paragraph}`;
      } else {
        currentChunk = testChunk;
      }
    }

    // Add final chunk
    if (currentChunk.trim()) {
      chunks.push({
        id: `${idBase}_${chunkIndex}`,
        content: currentChunk.trim(),
        section: sectionTitle,
        metadata: {
          wordCount: this.countWords(currentChunk),
          position: { start: position, end: position + currentChunk.length }
        }
      });
    }

    return chunks;
  }

  /**
   * Split a large paragraph at sentence boundaries ('.', '!', '?' followed
   * by whitespace), accumulating sentences up to `chunkSize` characters.
   */
  private static splitLargeParagraph(paragraph: string, chunkSize: number, basePosition: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    const sentences = paragraph.split(/(?<=[.!?])\s+/);

    let currentChunk = '';
    let chunkIndex = 0;
    let position = basePosition;

    for (const sentence of sentences) {
      const testChunk = currentChunk ? `${currentChunk} ${sentence}` : sentence;
      if (testChunk.length > chunkSize && currentChunk.length > 0) {
        const flushed = currentChunk.trim();
        chunks.push({
          id: `para_chunk_${chunkIndex}`,
          content: flushed,
          metadata: {
            wordCount: this.countWords(flushed),
            position: { start: position, end: position + flushed.length }
          }
        });
        // Advance by the emitted chunk, not by the next sentence's length
        position += flushed.length;
        chunkIndex++;
        currentChunk = sentence;
      } else {
        currentChunk = testChunk;
      }
    }

    // Add final chunk
    if (currentChunk.trim()) {
      chunks.push({
        id: `para_chunk_${chunkIndex}`,
        content: currentChunk.trim(),
        metadata: {
          wordCount: this.countWords(currentChunk),
          position: { start: position, end: position + currentChunk.length }
        }
      });
    }

    return chunks;
  }

  /** Count whitespace-separated words; empty/blank text counts as 0. */
  private static countWords(text: string): number {
    return text.trim().split(/\s+/).filter(word => word.length > 0).length;
  }

  /**
   * Very basic language detection over the first 1000 characters: counts
   * common English stop words and returns 'en' above a small threshold,
   * otherwise 'unknown'. Could be replaced by a proper library.
   */
  private static detectLanguage(text: string): string {
    const sample = text.slice(0, 1000).toLowerCase();

    const englishWords = ['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'];
    const englishCount = englishWords.reduce((count, word) => {
      return count + (sample.match(new RegExp(`\\b${word}\\b`, 'g')) || []).length;
    }, 0);

    return englishCount > 5 ? 'en' : 'unknown';
  }

  /**
   * Convert text to markdown. Markdown passes through unchanged; HTML goes
   * through the regex converter; plain text gets '#' markers on detected
   * heading LINES. Working line-by-line fixes two defects of the old global
   * String.replace: a heading phrase that also appeared earlier in body text
   * was rewritten there instead of at the heading, and setext underlines
   * ('===' / '---') were left dangling under converted headings.
   */
  private static convertToMarkdown(text: string, mimeType: string, headings: Heading[]): string {
    // If already markdown, return as-is
    if (mimeType === 'text/markdown') {
      return text;
    }

    // If HTML, do basic conversion
    if (mimeType === 'text/html') {
      return this.htmlToMarkdown(text);
    }

    // Map heading text → level (first detection wins)
    const levelByText = new Map<string, number>();
    for (const heading of headings) {
      if (!levelByText.has(heading.text)) {
        levelByText.set(heading.text, heading.level);
      }
    }

    const lines = text.split('\n');
    const out: string[] = [];
    for (let i = 0; i < lines.length; i++) {
      const trimmed = lines[i].trim();
      const level = levelByText.get(trimmed);
      if (level !== undefined && !trimmed.startsWith('#')) {
        out.push('#'.repeat(Math.min(level, 6)) + ' ' + trimmed);
        // Drop a setext underline that immediately follows the heading
        const next = i + 1 < lines.length ? lines[i + 1].trim() : '';
        if (/^={3,}$/.test(next) || /^-{3,}$/.test(next)) {
          i++;
        }
      } else {
        out.push(lines[i]);
      }
    }
    return out.join('\n');
  }

  /**
   * Basic regex-driven HTML → Markdown conversion. Best-effort only: tags
   * spanning multiple lines and nested markup are not fully handled.
   */
  private static htmlToMarkdown(html: string): string {
    return html
      // Headers
      .replace(/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/gi, (_, level, text) => {
        return '#'.repeat(parseInt(level)) + ' ' + text.trim() + '\n\n';
      })
      // Paragraphs
      .replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
      // Bold
      .replace(/<(strong|b)[^>]*>(.*?)<\/(strong|b)>/gi, '**$2**')
      // Italic
      .replace(/<(em|i)[^>]*>(.*?)<\/(em|i)>/gi, '*$2*')
      // Links (accepts single- or double-quoted href)
      .replace(/<a[^>]*href=["']([^"']*)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
      // Line breaks
      .replace(/<br[^>]*>/gi, '\n')
      // Remove remaining HTML tags
      .replace(/<[^>]*>/g, '')
      // Clean up excessive whitespace
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }
}