// content-processor.ts
/**
* Main content processor that handles format detection and processing
*/
import { ContentProcessor, ProcessedContent, ChunkOptions, SearchResult, ContentMetadata, ExtractionSuggestion, Heading } from '../types/content.js';
import { FormatDetector } from './format-detector.js';
import { TextExtractor } from './text-extractor.js';
import { GoogleDocsProcessor } from './google-docs-processor.js';
import { DOCXProcessor } from './docx-processor.js';
import { OAuth2Client } from 'google-auth-library';
import { ResilientExecutor } from '../utils/retry-handler.js';
import { ErrorHandler, handleAsyncError } from '../utils/error-handler.js';
import {
FileProcessingError,
UnsupportedFileTypeError,
CorruptedFileError,
ValidationError
} from '../types/errors.js';
/**
 * Main content processor.
 *
 * Routes raw files to a format-specific processor (text, Google Docs, PDF,
 * DOCX) and offers text utilities over already-extracted content: chunk
 * extraction, search, extractive summarisation and extraction suggestions
 * for large documents.
 */
export class ContentProcessorImpl implements ContentProcessor {
  /** Chunk size passed to every format-specific processor. */
  private static readonly DEFAULT_CHUNK_SIZE = 4000;
  /** Content longer than this many characters triggers extraction suggestions. */
  private static readonly LARGE_CONTENT_THRESHOLD = 50000;
  /** Page-size estimate used when a document has no explicit page breaks. */
  private static readonly WORDS_PER_PAGE = 500;
  /** Summary length used when the caller does not supply a positive one. */
  private static readonly DEFAULT_SUMMARY_LENGTH = 500;
  /** Marker appended to truncated text. */
  private static readonly ELLIPSIS = '...';
  /** Message returned when keyword extraction finds nothing. */
  private static readonly NO_KEYWORD_MATCH = 'No relevant content found for the specified keywords.';
  /** Error-message fragments treated as a sign of a damaged file (case-insensitive). */
  private static readonly CORRUPTION_HINT = /corrupt|invalid|malformed/i;

  private googleDocsProcessor?: GoogleDocsProcessor;
  private pdfProcessor?: any; // Lazy loaded
  private readonly docxProcessor: DOCXProcessor;
  private readonly resilientExecutor: ResilientExecutor;
  private readonly errorHandler: ErrorHandler;

  /**
   * @param auth Optional OAuth2 client; the Google Docs processor is only created when it is given.
   * @param resilientExecutor Executor wrapping `processFile`; a default instance is created when omitted.
   * @param errorHandler Handler passed to `handleAsyncError`; a default instance is created when omitted.
   */
  constructor(
    auth?: OAuth2Client,
    resilientExecutor?: ResilientExecutor,
    errorHandler?: ErrorHandler
  ) {
    if (auth) {
      this.googleDocsProcessor = new GoogleDocsProcessor(auth);
    }
    this.docxProcessor = new DOCXProcessor();
    this.resilientExecutor = resilientExecutor ?? new ResilientExecutor();
    this.errorHandler = errorHandler ?? new ErrorHandler();
  }

  /**
   * Process file content and convert to markdown.
   * For Google Docs, `fileId` should be passed instead of buffer content.
   *
   * @param file Raw file bytes (required for every processor except Google Docs).
   * @param mimeType MIME type used to pick the processor.
   * @param fileId Google Docs file id.
   * @returns The processed content produced by the selected processor.
   * @throws ValidationError when required inputs are missing.
   * @throws UnsupportedFileTypeError when no processor handles the MIME type.
   * @throws CorruptedFileError when the processor error message hints at a damaged file.
   * @throws FileProcessingError for any other processor failure.
   */
  async processFile(file: Buffer, mimeType: string, fileId?: string): Promise<ProcessedContent> {
    return this.resilientExecutor.execute(async () => {
      // Validate inputs
      if (!file && !fileId) {
        throw new ValidationError('Either file buffer or fileId must be provided');
      }
      if (!mimeType) {
        throw new ValidationError('MIME type is required for file processing');
      }

      // Detect format
      const format = FormatDetector.detectFormat(mimeType);
      if (!format.supported) {
        const supportedFormats = FormatDetector.getSupportedMimeTypes();
        throw new UnsupportedFileTypeError(
          `Unsupported file format: ${mimeType}. Supported formats: ${supportedFormats.join(', ')}`,
          { mimeType, supportedFormats }
        );
      }

      // Buffer-based processors cannot work from a fileId alone; fail with a
      // clear validation error instead of an opaque crash inside the processor.
      const requireBuffer = (): Buffer => {
        if (!file) {
          throw new ValidationError(`A file buffer is required to process ${mimeType} content`);
        }
        return file;
      };

      // Route to appropriate processor based on format
      try {
        switch (format.processor) {
          case 'text':
            return await this.processTextFile(requireBuffer(), mimeType);
          case 'google-docs':
            return await this.processGoogleDocsFile(fileId);
          case 'pdf':
            return await this.processPDFFile(requireBuffer());
          case 'docx':
            return await this.processDOCXFile(requireBuffer());
          default:
            throw new UnsupportedFileTypeError(
              `No processor available for format: ${mimeType}`,
              { mimeType, processor: format.processor }
            );
        }
      } catch (error) {
        // Errors that already describe the problem precisely pass through untouched.
        if (error instanceof UnsupportedFileTypeError || error instanceof ValidationError) {
          throw error;
        }
        const message = error instanceof Error ? error.message : String(error);
        // Case-insensitive so messages such as "Invalid PDF structure" are recognised.
        if (error instanceof Error && ContentProcessorImpl.CORRUPTION_HINT.test(message)) {
          throw new CorruptedFileError(
            `File appears to be corrupted: ${message}`,
            { mimeType, fileId, originalError: message }
          );
        }
        throw new FileProcessingError(
          `Failed to process file: ${message}`,
          { mimeType, fileId, originalError: message }
        );
      }
    }, { operation: 'processFile', mimeType, fileId });
  }

  /**
   * Process text files (plain text, markdown, HTML) through `TextExtractor`.
   * Rethrows the error reported by `handleAsyncError` on failure.
   */
  private async processTextFile(file: Buffer, mimeType: string): Promise<ProcessedContent> {
    const result = await handleAsyncError(
      async () =>
        await TextExtractor.extractText(file, mimeType, {
          preserveFormatting: true,
          extractHeadings: true,
          chunkSize: ContentProcessorImpl.DEFAULT_CHUNK_SIZE
        }),
      { operation: 'processTextFile', mimeType },
      this.errorHandler
    );
    if (!result.success) {
      throw result.error;
    }
    return result.data;
  }

  /**
   * Process Google Docs files.
   *
   * @throws ValidationError when `fileId` is missing or no OAuth2 client was supplied at construction.
   */
  private async processGoogleDocsFile(fileId?: string): Promise<ProcessedContent> {
    if (!fileId) {
      throw new ValidationError('File ID is required for Google Docs processing');
    }
    const processor = this.googleDocsProcessor;
    if (!processor) {
      throw new ValidationError('Google Docs processor not initialized. OAuth2Client required.');
    }
    const result = await handleAsyncError(
      async () =>
        await processor.processGoogleDoc(fileId, {
          chunkSize: ContentProcessorImpl.DEFAULT_CHUNK_SIZE,
          preserveFormatting: true,
          includeImages: true,
          includeTables: true
        }),
      { operation: 'processGoogleDocsFile', fileId },
      this.errorHandler
    );
    if (!result.success) {
      throw result.error;
    }
    return result.data;
  }

  /**
   * Process PDF files. The PDF processor module is imported on first use.
   *
   * @throws Error prefixed with "Failed to process PDF file:" on any failure.
   */
  private async processPDFFile(file: Buffer): Promise<ProcessedContent> {
    try {
      // Lazy load PDF processor to avoid import issues
      if (!this.pdfProcessor) {
        const { PDFProcessor } = await import('./pdf-processor.js');
        this.pdfProcessor = new PDFProcessor();
      }
      return await this.pdfProcessor.processPDF(file, {
        chunkSize: ContentProcessorImpl.DEFAULT_CHUNK_SIZE,
        preservePageBreaks: true,
        extractHeadings: true
      });
    } catch (error) {
      throw new Error(`Failed to process PDF file: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Process DOCX files.
   *
   * @throws Error prefixed with "Failed to process DOCX file:" on any failure.
   */
  private async processDOCXFile(file: Buffer): Promise<ProcessedContent> {
    try {
      return await this.docxProcessor.processDOCX(file, {
        chunkSize: ContentProcessorImpl.DEFAULT_CHUNK_SIZE,
        preserveFormatting: true,
        extractHeadings: true,
        includeImages: true,
        includeTables: true
      });
    } catch (error) {
      throw new Error(`Failed to process DOCX file: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Extract a chunk of `content`.
   *
   * Exactly one strategy is applied, chosen in this order: page range,
   * section name, keywords, plain truncation to `maxLength`. With no
   * applicable option the content is returned unchanged.
   */
  extractChunk(content: string, options: ChunkOptions): string {
    if (options.pageRange) {
      return this.extractPageRange(content, options.pageRange);
    }
    if (options.sectionName) {
      return this.extractSection(content, options.sectionName);
    }
    if (options.keywords && options.keywords.length > 0) {
      return this.extractByKeywords(content, options.keywords, options.maxLength);
    }
    if (options.maxLength && content.length > options.maxLength) {
      return this.intelligentTruncate(content, options.maxLength);
    }
    return content;
  }

  /**
   * Extract the 1-based, inclusive page range from `content`.
   *
   * Explicit page breaks (`---`, form feed, or a `Page N` line) are used when
   * present; otherwise pages are estimated at a fixed number of words. A
   * range that lies entirely outside the document yields an empty string in
   * both modes.
   */
  private extractPageRange(content: string, pageRange: { start: number; end: number }): string {
    // Look for explicit page breaks first
    const pageBreakPattern = /(?:\n\s*---\s*\n|\n\s*\f\s*\n|\n\s*Page \d+\s*\n)/gi;
    const pageBreaks: number[] = [];
    let match: RegExpExecArray | null;
    while ((match = pageBreakPattern.exec(content)) !== null) {
      pageBreaks.push(match.index);
    }

    if (pageBreaks.length > 0) {
      // N breaks delimit N + 1 pages.
      const pageCount = pageBreaks.length + 1;
      if (pageRange.end < 1 || pageRange.start > pageCount) {
        // Previously a start past the last page silently restarted at offset 0.
        return '';
      }
      const startIndex = pageRange.start <= 1 ? 0 : (pageBreaks[pageRange.start - 2] ?? 0);
      const endIndex = pageRange.end >= pageCount
        ? content.length
        : (pageBreaks[pageRange.end - 1] ?? content.length);
      return content.slice(startIndex, endIndex).trim();
    }

    // Estimate pages based on content length
    const wordsPerPage = ContentProcessorImpl.WORDS_PER_PAGE;
    const words = content.split(/\s+/);
    const startWord = Math.max(0, (pageRange.start - 1) * wordsPerPage);
    const endWord = Math.min(words.length, pageRange.end * wordsPerPage);
    return words.slice(startWord, endWord).join(' ');
  }

  /**
   * Extract a named section, returned as `<hashes> <sectionName>\n\n<body>`.
   *
   * Three header styles are tried in order: markdown (`## Name`), underlined
   * (`Name` followed by `===`/`---`) and numbered (`1. Name`). The body runs
   * to the next header of the same style or to the end of the document.
   * Non-markdown headers are reported with a `##` level. Falls back to
   * fuzzy matching when no header matches exactly.
   */
  private extractSection(content: string, sectionName: string): string {
    const name = this.escapeRegex(sectionName);
    // With the `m` flag `$` matches at every line end, which would stop the
    // lazy body immediately; use an explicit end-of-input assertion instead.
    const endOfInput = '(?![\\s\\S])';

    const styles: Array<{ pattern: RegExp; level: (m: RegExpMatchArray) => string; body: (m: RegExpMatchArray) => string }> = [
      {
        // Markdown headers with content
        pattern: new RegExp(`^(#{1,6})[ \\t]+${name}[ \\t]*\\r?$([\\s\\S]*?)(?=^#{1,6}\\s+|${endOfInput})`, 'mi'),
        level: m => m[1] ?? '##',
        body: m => m[2] ?? ''
      },
      {
        // Underlined headers
        pattern: new RegExp(`^${name}[ \\t]*\\r?\\n[=-]{3,}[ \\t]*\\r?$([\\s\\S]*?)(?=^.+\\n[=-]{3,}|${endOfInput})`, 'mi'),
        level: () => '##',
        body: m => m[1] ?? ''
      },
      {
        // Numbered sections
        pattern: new RegExp(`^\\d+\\.?[ \\t]+${name}[ \\t]*\\r?$([\\s\\S]*?)(?=^\\d+\\.|${endOfInput})`, 'mi'),
        level: () => '##',
        body: m => m[1] ?? ''
      }
    ];

    for (const style of styles) {
      const found = content.match(style.pattern);
      if (found) {
        return `${style.level(found)} ${sectionName}\n\n${style.body(found).trim()}`;
      }
    }

    // If no exact match, try fuzzy matching
    return this.fuzzyExtractSection(content, sectionName);
  }

  /**
   * Fuzzy section extraction for partial matches.
   *
   * The first line that contains the section name (case-insensitively) or is
   * more than 70% similar to it starts the section; it ends before the next
   * line that looks like a markdown, numbered or underlined header.
   */
  private fuzzyExtractSection(content: string, sectionName: string): string {
    const lines = content.split('\n');
    const wanted = sectionName.toLowerCase();

    const looksLikeHeader = (idx: number): boolean => {
      const candidate = lines[idx].trim();
      if (candidate.length === 0) {
        return false;
      }
      return /^#{1,6}\s/.test(candidate) ||
        /^\d+\./.test(candidate) ||
        (idx < lines.length - 1 && /^[=-]{3,}$/.test(lines[idx + 1]));
    };

    for (let start = 0; start < lines.length; start++) {
      const line = lines[start].toLowerCase();
      if (!line.includes(wanted) && this.calculateSimilarity(line, wanted) <= 0.7) {
        continue;
      }
      let end = lines.length;
      for (let next = start + 1; next < lines.length; next++) {
        if (looksLikeHeader(next)) {
          end = next;
          break;
        }
      }
      return lines.slice(start, end).join('\n').trim();
    }
    return `Section "${sectionName}" not found in document.`;
  }

  /**
   * Truncate `content` to at most `maxLength` characters, ellipsis included.
   *
   * Whole sentences are kept when at least one fits, otherwise whole words,
   * otherwise the text is cut mid-word. Content that already fits is
   * returned unchanged.
   */
  private intelligentTruncate(content: string, maxLength: number): string {
    if (content.length <= maxLength) {
      return content;
    }
    const ellipsis = ContentProcessorImpl.ELLIPSIS;
    if (maxLength <= ellipsis.length) {
      // No room for any text plus an ellipsis.
      return content.slice(0, Math.max(0, maxLength));
    }
    const budget = maxLength - ellipsis.length;

    // Greedily join units with single spaces; the separator counts toward the budget.
    const pack = (units: string[]): string => {
      let packed = '';
      for (const unit of units) {
        const candidate = packed ? `${packed} ${unit}` : unit;
        if (candidate.length > budget) {
          break;
        }
        packed = candidate;
      }
      return packed;
    };

    let result = pack(content.split(/(?<=[.!?])\s+/));
    if (!result) {
      result = pack(content.split(/\s+/));
    }
    if (!result) {
      // Not even one word fits: hard cut rather than returning a bare ellipsis.
      result = content.slice(0, budget);
    }
    return result + ellipsis;
  }

  /**
   * Extract the paragraphs most relevant to `keywords`, best first, until
   * `maxLength` (when given) would be exceeded.
   *
   * Falls back to sentence-level extraction when no paragraph scores. When
   * the best paragraph alone is longer than `maxLength` it is truncated
   * rather than dropped.
   */
  private extractByKeywords(content: string, keywords: string[], maxLength?: number): string {
    const ranked = content
      .split(/\n\s*\n/)
      .map(raw => ({ paragraph: raw.trim(), score: this.calculateKeywordScore(raw, keywords) }))
      .filter(entry => entry.score > 0)
      .sort((a, b) => b.score - a.score);

    // If no paragraphs found, try sentence-level extraction
    if (ranked.length === 0) {
      return this.extractSentencesByKeywords(content, keywords, maxLength);
    }

    // Combine paragraphs up to max length, avoiding duplicates
    let result = '';
    const seen = new Set<string>();
    for (const { paragraph } of ranked) {
      if (seen.has(paragraph)) {
        continue;
      }
      const candidate = result ? `${result}\n\n${paragraph}` : paragraph;
      if (maxLength && candidate.length > maxLength) {
        break;
      }
      result = candidate;
      seen.add(paragraph);
    }

    if (!result && maxLength) {
      return this.intelligentTruncate(ranked[0].paragraph, maxLength);
    }
    return result || ContentProcessorImpl.NO_KEYWORD_MATCH;
  }

  /**
   * Score how relevant `text` is to `keywords` (higher is better, 0 = no match).
   *
   * Per distinct non-blank keyword: 3 points per substring occurrence plus 2
   * per whole-word occurrence, plus 0.5 for every nearby word (10 before to
   * 9 after) that contains a different keyword. Segments under 10 words are
   * halved and segments over 200 words are scaled by 0.8.
   */
  private calculateKeywordScore(text: string, keywords: string[]): number {
    const lowerText = text.toLowerCase();
    const words = lowerText.split(/\s+/);
    // Blank keywords would match at every position; duplicates would double-count.
    const terms = [...new Set(keywords.map(k => k.toLowerCase()).filter(k => k.trim() !== ''))];

    let score = 0;
    for (const term of terms) {
      const escaped = this.escapeRegex(term);
      const substringHits = (lowerText.match(new RegExp(escaped, 'g')) ?? []).length;
      const wholeWordHits = (lowerText.match(new RegExp(`\\b${escaped}\\b`, 'g')) ?? []).length;
      score += substringHits * 3 + wholeWordHits * 2;

      const otherTerms = terms.filter(other => other !== term);
      if (substringHits === 0 || otherTerms.length === 0) {
        continue;
      }
      // Proximity bonus: a different keyword close to this one.
      words.forEach((word, i) => {
        if (!word.includes(term)) {
          return;
        }
        const from = Math.max(0, i - 10);
        const to = Math.min(words.length, i + 10);
        for (let j = from; j < to; j++) {
          if (j !== i && otherTerms.some(other => words[j].includes(other))) {
            score += 0.5;
          }
        }
      });
    }

    // Length penalty for very short or very long segments
    const wordCount = text.split(/\s+/).length;
    if (wordCount < 10) score *= 0.5;
    if (wordCount > 200) score *= 0.8;
    return score;
  }

  /**
   * Fallback sentence-level keyword extraction.
   *
   * Sentences are split on `.`, `!` and `?`, ranked by keyword score and
   * joined with `". "` until `maxLength` (when given) would be exceeded.
   * When the best sentence alone is too long it is truncated.
   */
  private extractSentencesByKeywords(content: string, keywords: string[], maxLength?: number): string {
    const ranked = content
      .split(/[.!?]+/)
      .map(raw => ({ sentence: raw.trim(), score: this.calculateKeywordScore(raw, keywords) }))
      .filter(entry => entry.score > 0)
      .sort((a, b) => b.score - a.score);

    let result = '';
    for (const { sentence } of ranked) {
      const candidate = result ? `${result}. ${sentence}` : sentence;
      if (maxLength && candidate.length > maxLength) {
        break;
      }
      result = candidate;
    }

    if (!result && maxLength && ranked.length > 0) {
      return this.intelligentTruncate(ranked[0].sentence, maxLength);
    }
    return result || ContentProcessorImpl.NO_KEYWORD_MATCH;
  }

  /**
   * Generate a summary of `content`.
   *
   * Content no longer than `maxLength` is returned as-is. Both styles
   * currently produce an extractive summary (abstractive would require ML
   * models).
   *
   * @param options.maxLength Target length in characters; a default is used when missing or not positive.
   * @param options.style Accepted for forward compatibility; currently has no effect.
   */
  async generateSummary(content: string, options?: { maxLength?: number; style?: 'extractive' | 'abstractive' }): Promise<string> {
    const requested = options?.maxLength;
    const maxLength = requested && requested > 0 ? requested : ContentProcessorImpl.DEFAULT_SUMMARY_LENGTH;
    if (content.length <= maxLength) {
      return content;
    }
    return this.generateExtractiveSummary(content, maxLength);
  }

  /**
   * Build an extractive summary no longer than `maxLength` characters.
   *
   * Sentences (split after `.`, `!` or `?`, punctuation kept) are scored on
   * position, word frequency, title words, key phrases, length, digits,
   * questions and superlatives. The best ones that fit are returned in
   * document order, joined by single spaces. Content with two or fewer
   * usable sentences is truncated instead.
   */
  private generateExtractiveSummary(content: string, maxLength: number): string {
    const sentences = content
      .split(/(?<=[.!?])\s+/)
      .map(s => s.trim())
      .filter(s => s.length > 20);
    if (sentences.length <= 2) {
      return this.intelligentTruncate(content, maxLength);
    }

    // Calculate various scoring factors
    const wordFreq = this.calculateWordFrequency(content);
    const titleWords = this.extractTitleWords(content).map(w => w.toLowerCase());
    const keyPhrases = this.extractKeyPhrases(content).map(p => p.toLowerCase());

    const scored = sentences.map((sentence, index) => {
      const lower = sentence.toLowerCase();
      let score = 0;

      // Position score (first and last sentences often important)
      if (index === 0) score += 0.4;
      if (index === sentences.length - 1) score += 0.2;
      if (index < sentences.length * 0.3) score += 0.3; // Early sentences

      // Word frequency score; tokenised like calculateWordFrequency so that
      // trailing punctuation does not hide a word from the lookup.
      for (const token of lower.match(/\b\w{3,}\b/g) ?? []) {
        if (token.length > 3) {
          score += (wordFreq[token] ?? 0) * 0.5;
        }
      }

      // Title word bonus
      for (const titleWord of titleWords) {
        if (lower.includes(titleWord)) score += 0.6;
      }

      // Key phrase bonus
      for (const phrase of keyPhrases) {
        if (lower.includes(phrase)) score += 0.4;
      }

      // Length penalty for very short or very long sentences
      const wordCount = lower.split(/\s+/).length;
      if (wordCount < 8) score *= 0.6;
      if (wordCount > 40) score *= 0.8;

      // Numeric data bonus (often contains important facts)
      if (/\d+/.test(sentence)) score += 0.2;
      // Question sentences often important
      if (sentence.includes('?')) score += 0.3;
      // Sentences with superlatives often important
      if (/\b(most|best|worst|largest|smallest|first|last|only)\b/i.test(sentence)) {
        score += 0.2;
      }
      return { sentence, score, index };
    });

    // Best first; pick greedily while the joined text still fits.
    const ranked = [...scored].sort((a, b) => b.score - a.score);
    const selected: typeof ranked = [];
    let usedLength = 0;
    for (const item of ranked) {
      const cost = item.sentence.length + 1; // +1 for the joining space
      if (usedLength + cost <= maxLength) {
        selected.push(item);
        usedLength += cost;
      }
    }

    if (selected.length === 0) {
      // Nothing fits whole: return the best sentence, shortened to the limit.
      return this.intelligentTruncate(ranked[0].sentence, maxLength);
    }

    // Restore original document order
    selected.sort((a, b) => a.index - b.index);
    return selected.map(item => item.sentence).join(' ');
  }

  /**
   * Collect words longer than three characters from markdown headers
   * (`# Title`) and from lines underlined with `===` / `---`.
   */
  private extractTitleWords(content: string): string[] {
    const isTitleWord = (w: string): boolean => w.length > 3;
    const titleWords: string[] = [];

    // Look for markdown headers
    for (const header of content.match(/^#{1,6}\s+(.+)$/gm) ?? []) {
      titleWords.push(...header.replace(/^#+\s+/, '').split(/\s+/).filter(isTitleWord));
    }

    // Look for underlined headers
    const lines = content.split('\n');
    for (let i = 0; i + 1 < lines.length; i++) {
      if (/^[=-]{3,}$/.test(lines[i + 1].trim())) {
        titleWords.push(...lines[i].trim().split(/\s+/).filter(isTitleWord));
      }
    }
    return titleWords;
  }

  /**
   * Collect key phrases: double-quoted phrases of 10-50 characters (quotes
   * removed) and runs of two or more capitalised words longer than five
   * characters in total.
   */
  private extractKeyPhrases(content: string): string[] {
    // Look for quoted phrases
    const quoted = (content.match(/"([^"]{10,50})"/g) ?? []).map(p => p.slice(1, -1));
    // Look for capitalized phrases (potential proper nouns)
    const capitalized = (content.match(/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b/g) ?? []).filter(p => p.length > 5);
    return [...quoted, ...capitalized];
  }

  /**
   * Count lowercase words of three or more characters, excluding stop words,
   * and normalise the counts so the most frequent word scores 1.
   *
   * The returned object has no prototype, so words such as `constructor`
   * are counted like any other word instead of colliding with inherited
   * `Object` members.
   */
  private calculateWordFrequency(content: string): Record<string, number> {
    const words = content.toLowerCase().match(/\b\w{3,}\b/g) ?? [];
    const freq: Record<string, number> = Object.create(null);

    // Common stop words to exclude
    const stopWords = new Set([
      'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
      'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above',
      'below', 'between', 'among', 'this', 'that', 'these', 'those', 'was', 'were',
      'been', 'have', 'has', 'had', 'will', 'would', 'could', 'should', 'may', 'might'
    ]);

    let maxFreq = 0;
    for (const word of words) {
      if (stopWords.has(word)) {
        continue;
      }
      const count = (freq[word] ?? 0) + 1;
      freq[word] = count;
      // Track the maximum here instead of spreading a possibly huge array into Math.max.
      if (count > maxFreq) {
        maxFreq = count;
      }
    }

    // Normalize frequencies
    if (maxFreq > 0) {
      for (const word of Object.keys(freq)) {
        freq[word] = freq[word] / maxFreq;
      }
    }
    return freq;
  }

  /**
   * Escape special regex characters
   */
  private escapeRegex(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Similarity of two strings in [0, 1], computed as
   * `1 - levenshtein(str1, str2) / max(length)`. Two empty strings are
   * identical (1); an empty string against a non-empty one scores 0.
   */
  private calculateSimilarity(str1: string, str2: string): number {
    const len1 = str1.length;
    const len2 = str2.length;
    if (len1 === 0) return len2 === 0 ? 1 : 0;
    if (len2 === 0) return 0;

    // Only the previous row of the edit-distance table is needed at any time.
    let previous: number[] = Array.from({ length: len2 + 1 }, (_, j) => j);
    for (let i = 1; i <= len1; i++) {
      const current: number[] = [i];
      for (let j = 1; j <= len2; j++) {
        const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
        current[j] = Math.min(
          previous[j] + 1,
          current[j - 1] + 1,
          previous[j - 1] + cost
        );
      }
      previous = current;
    }

    const maxLen = Math.max(len1, len2);
    return (maxLen - previous[len2]) / maxLen;
  }

  /**
   * Search within content.
   *
   * Content is split into sentences on `.`, `!` and `?`; every sentence
   * containing `query` (case-insensitive) yields a result with a snippet of
   * up to 50 characters either side of the match, the sentence's offset in
   * `content`, and the neighbouring sentences as context. Whole-word matches
   * rank higher. A blank query returns no results.
   */
  searchInContent(content: string, query: string): SearchResult[] {
    const queryLower = query.toLowerCase();
    if (queryLower.trim() === '') {
      // An empty needle is "found" in every sentence; treat it as no query.
      return [];
    }
    const wholeWord = new RegExp(`(?:^|\\W)${this.escapeRegex(queryLower)}(?:\\W|$)`);
    const sentences = content.split(/[.!?]+/);
    const results: SearchResult[] = [];

    // Sentences come back in document order, so searching from the end of the
    // previous one yields the true offset even for repeated sentences.
    let cursor = 0;
    sentences.forEach((sentence, index) => {
      const position = content.indexOf(sentence, cursor);
      cursor = position + sentence.length;

      const sentenceLower = sentence.toLowerCase();
      const matchIndex = sentenceLower.indexOf(queryLower);
      if (matchIndex === -1) {
        return;
      }

      // Boost score for exact word matches
      const relevanceScore = wholeWord.test(sentenceLower) ? 1.5 : 1.0;

      // Create snippet with context
      const snippetStart = Math.max(0, matchIndex - 50);
      const snippetEnd = Math.min(sentence.length, matchIndex + query.length + 50);
      let snippet = sentence.slice(snippetStart, snippetEnd);
      if (snippetStart > 0) snippet = '...' + snippet;
      if (snippetEnd < sentence.length) snippet = snippet + '...';

      // Get surrounding context
      const context = sentences
        .slice(Math.max(0, index - 1), Math.min(sentences.length, index + 2))
        .join('. ');

      results.push({
        snippet,
        position,
        relevanceScore,
        context: context.trim()
      });
    });

    // Sort by relevance score
    return results.sort((a, b) => b.relevanceScore - a.relevanceScore);
  }

  /**
   * Suggest extraction options for large content (more than 50,000 characters).
   *
   * Returns an empty list for smaller content. Otherwise, in priority order:
   * sections (when more than one heading is known), page ranges (when more
   * than one page is known), keywords, summaries and fixed-size chunks.
   */
  suggestExtractionOptions(content: string, metadata: ContentMetadata): ExtractionSuggestion[] {
    const threshold = ContentProcessorImpl.LARGE_CONTENT_THRESHOLD;
    const contentSize = content.length;
    const suggestions: ExtractionSuggestion[] = [];

    // If file is not large, no suggestions needed
    if (contentSize <= threshold) {
      return suggestions;
    }
    const chunkCount = Math.ceil(contentSize / threshold);

    // Suggest section-based extraction if headings are available
    const headings = metadata.headings;
    if (headings && headings.length > 1) {
      suggestions.push({
        type: 'section',
        description: `Extract by section (${headings.length} sections available)`,
        options: headings.map((heading, index) => ({
          label: heading.text,
          value: heading.text,
          estimatedSize: this.estimateSectionSize(content, heading, headings, index)
        })),
        priority: 1
      });
    }

    // Suggest page range extraction if document has page structure
    if (metadata.pageCount && metadata.pageCount > 1) {
      const pagesPerChunk = Math.ceil(metadata.pageCount / chunkCount);
      suggestions.push({
        type: 'pageRange',
        description: `Extract by page range (${metadata.pageCount} pages total)`,
        options: this.generatePageRangeOptions(metadata.pageCount, pagesPerChunk),
        priority: 2
      });
    }

    // Suggest keyword-based extraction
    const keyTerms = this.extractKeyTerms(content);
    if (keyTerms.length > 0) {
      suggestions.push({
        type: 'keywords',
        description: 'Extract content containing specific keywords',
        options: keyTerms.slice(0, 10).map(term => ({
          label: `Content about "${term}"`,
          value: term,
          estimatedSize: this.estimateKeywordContentSize(content, term)
        })),
        priority: 3
      });
    }

    // Suggest summary extraction
    suggestions.push({
      type: 'summary',
      description: 'Generate a summary of the main content',
      options: [
        { label: 'Brief summary (200 words)', value: '200', estimatedSize: 1200 },
        { label: 'Detailed summary (500 words)', value: '500', estimatedSize: 3000 },
        { label: 'Extended summary (1000 words)', value: '1000', estimatedSize: 6000 }
      ],
      priority: 4
    });

    // Suggest chunked extraction (at most 10 chunk options are listed)
    suggestions.push({
      type: 'chunks',
      description: `Extract in manageable chunks (${chunkCount} chunks recommended)`,
      options: Array.from({ length: Math.min(chunkCount, 10) }, (_, i) => ({
        label: `Chunk ${i + 1}`,
        value: `${i + 1}`,
        estimatedSize: Math.min(threshold, contentSize - i * threshold)
      })),
      priority: 5
    });

    return suggestions.sort((a, b) => a.priority - b.priority);
  }

  /**
   * Estimate a section's size as the distance from its heading position to
   * the next heading's position (or the end of the content). Never negative.
   *
   * @param index Position of `heading` in `allHeadings`; looked up when omitted.
   */
  private estimateSectionSize(
    content: string,
    heading: Heading,
    allHeadings: Heading[],
    index: number = allHeadings.indexOf(heading)
  ): number {
    const nextHeading = allHeadings[index + 1];
    const endPos = nextHeading ? nextHeading.position : content.length;
    return Math.max(0, endPos - heading.position);
  }

  /**
   * Generate consecutive page-range options of `pagesPerChunk` pages each.
   * Each page is estimated at roughly 2KB. A step below 1 is treated as 1 so
   * the loop always advances.
   */
  private generatePageRangeOptions(totalPages: number, pagesPerChunk: number): Array<{ label: string; value: string; estimatedSize: number }> {
    const step = Math.max(1, Math.floor(pagesPerChunk) || 1);
    const options: Array<{ label: string; value: string; estimatedSize: number }> = [];
    for (let start = 1; start <= totalPages; start += step) {
      const end = Math.min(start + step - 1, totalPages);
      options.push({
        label: `Pages ${start}-${end}`,
        value: `${start}-${end}`,
        estimatedSize: (end - start + 1) * 2000 // Rough estimate of 2KB per page
      });
    }
    return options;
  }

  /**
   * Pick up to 15 key terms for keyword suggestions.
   *
   * Frequent words longer than four characters are scored by normalised
   * frequency, title words (lowercased so they boost the matching frequency
   * term) add 0.5 and key phrases add 0.3.
   */
  private extractKeyTerms(content: string): string[] {
    // A Map avoids collisions with inherited object members such as "constructor".
    const scores = new Map<string, number>();
    const bump = (term: string, amount: number): void => {
      scores.set(term, (scores.get(term) ?? 0) + amount);
    };

    // Add frequent words
    for (const [word, freq] of Object.entries(this.calculateWordFrequency(content))) {
      if (word.length > 4) bump(word, freq);
    }
    // Boost title words
    for (const word of this.extractTitleWords(content)) {
      if (word.length > 3) bump(word.toLowerCase(), 0.5);
    }
    // Add key phrases
    for (const phrase of this.extractKeyPhrases(content)) {
      bump(phrase, 0.3);
    }

    // Sort by score and return top terms
    return [...scores.entries()]
      .sort(([, a], [, b]) => b - a)
      .slice(0, 15)
      .map(([term]) => term);
  }

  /**
   * Estimate the size of keyword-based extraction: total length of every
   * sentence containing `keyword` (case-insensitive), each counted with a
   * trailing `". "`.
   */
  private estimateKeywordContentSize(content: string, keyword: string): number {
    const needle = keyword.toLowerCase();
    return content
      .split(/[.!?]+/)
      .filter(sentence => sentence.toLowerCase().includes(needle))
      .reduce((total, sentence) => total + sentence.length + 2, 0);
  }
}