// google-docs-processor.ts (12.9 kB)
/**
* Google Docs content processor
* Handles Google Docs API integration and content conversion to markdown
*/
import { google, docs_v1 } from 'googleapis';
import { OAuth2Client } from 'google-auth-library';
import { ProcessedContent, ContentMetadata, DocumentStructure, ContentChunk, Heading, ImageInfo, TableInfo } from '../types/content.js';
/**
 * Optional settings accepted by `GoogleDocsProcessor.processGoogleDoc`.
 * Every field may be omitted.
 */
export interface GoogleDocsProcessorOptions {
// Chunk length limit passed to the chunker, measured in characters of the
// generated markdown. When omitted, processGoogleDoc uses 4000.
chunkSize?: number;
// NOTE(review): declared here but not read by any code visible in this file.
preserveFormatting?: boolean;
// Inline objects in paragraphs are processed unless this is explicitly `false`.
includeImages?: boolean;
// Tables are converted unless this is explicitly `false`.
includeTables?: boolean;
}
/**
 * Fetches a Google Docs document through the Docs API and converts it to
 * markdown, together with headings, image/table records, a section outline
 * and size-bounded text chunks.
 */
export class GoogleDocsProcessor {
  /** Chunk length (characters) used when the caller supplies no usable `chunkSize`. */
  private static readonly DEFAULT_CHUNK_SIZE = 4000;

  /** Separator between markdown paragraphs, also used to split/join chunks. */
  private static readonly PARAGRAPH_SEPARATOR = '\n\n';

  /** Common English function words counted by `detectLanguage`. */
  private static readonly ENGLISH_WORDS = /\b(?:the|and|or|but|in|on|at|to|for|of|with|by)\b/g;

  private readonly docsApi: docs_v1.Docs;

  /**
   * @param auth OAuth2 client used to authorise Docs API requests.
   */
  constructor(auth: OAuth2Client) {
    this.docsApi = google.docs({ version: 'v1', auth });
  }

  /**
   * Process Google Docs file and convert to markdown.
   *
   * @param fileId  Document id passed to `documents.get`.
   * @param options Conversion options; see `GoogleDocsProcessorOptions`.
   * @returns The markdown, its metadata, the section structure and the chunks.
   * @throws Error whose message starts with `Failed to process Google Doc: `
   *         for any failure (API error, missing body, conversion error).
   */
  async processGoogleDoc(
    fileId: string,
    options: GoogleDocsProcessorOptions = {}
  ): Promise<ProcessedContent> {
    try {
      // Get document content from Google Docs API
      const response = await this.docsApi.documents.get({
        documentId: fileId
      });
      const document = response.data;
      if (!document || !document.body) {
        throw new Error('Failed to retrieve document content');
      }

      // Convert document to markdown
      const conversion = this.convertToMarkdown(document, options);

      // A missing, zero, negative or NaN chunk size falls back to the default.
      const requestedSize = options.chunkSize;
      const chunkSize =
        typeof requestedSize === 'number' && requestedSize > 0
          ? requestedSize
          : GoogleDocsProcessor.DEFAULT_CHUNK_SIZE;
      const chunks = this.createChunks(conversion.markdown, chunkSize);

      const metadata: ContentMetadata = {
        wordCount: this.countWords(conversion.markdown),
        language: this.detectLanguage(conversion.markdown),
        headings: conversion.headings,
        images: conversion.images,
        tables: conversion.tables,
        lastProcessed: new Date()
      };

      const structure = this.analyzeStructure(conversion.markdown, conversion.headings);

      return {
        markdown: conversion.markdown,
        metadata,
        structure,
        chunks
      };
    } catch (error) {
      throw new Error(`Failed to process Google Doc: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Convert Google Docs document to markdown.
   *
   * The document title (when present) becomes a level-1 heading at position 0.
   * Every recorded `position` is a character offset into the returned markdown.
   */
  private convertToMarkdown(
    document: docs_v1.Schema$Document,
    options: GoogleDocsProcessorOptions
  ): {
    markdown: string;
    headings: Heading[];
    images: ImageInfo[];
    tables: TableInfo[];
  } {
    const headings: Heading[] = [];
    const images: ImageInfo[] = [];
    const tables: TableInfo[] = [];
    let markdown = '';

    const bodyContent = document.body?.content;
    if (!bodyContent) {
      return { markdown: '', headings, images, tables };
    }

    // Process document title
    if (document.title) {
      markdown += `# ${document.title}\n\n`;
      headings.push({
        level: 1,
        text: document.title,
        position: 0
      });
    }

    let previousWasListItem = false;
    for (const [index, element] of bodyContent.entries()) {
      // The Docs API starts every document body with a section break; turning
      // that one into a horizontal rule would put a stray `---` at the top of
      // every converted document, so only later section breaks are rendered.
      if (index === 0 && element.sectionBreak && !element.paragraph && !element.table) {
        continue;
      }

      // List items end with a single newline. Without a blank line, whatever
      // follows the list would be parsed as a continuation of its last item.
      const isListItem = Boolean(element.paragraph?.bullet);
      if (previousWasListItem && !isListItem && !markdown.endsWith('\n\n')) {
        markdown += '\n';
      }

      const result = this.processElement(element, options, markdown.length);
      markdown += result.content;
      headings.push(...result.headings);
      images.push(...result.images);
      tables.push(...result.tables);
      previousWasListItem = isListItem;
    }

    return { markdown, headings, images, tables };
  }

  /**
   * Process individual document element: a paragraph, a table (unless
   * `includeTables` is `false`) and/or a section break (rendered as `---`).
   *
   * @param position Offset in the markdown at which this element's output starts.
   */
  private processElement(
    element: docs_v1.Schema$StructuralElement,
    options: GoogleDocsProcessorOptions,
    position: number
  ): {
    content: string;
    headings: Heading[];
    images: ImageInfo[];
    tables: TableInfo[];
  } {
    const headings: Heading[] = [];
    const images: ImageInfo[] = [];
    const tables: TableInfo[] = [];
    let content = '';

    if (element.paragraph) {
      const paragraphResult = this.processParagraph(element.paragraph, options, position);
      content += paragraphResult.content;
      headings.push(...paragraphResult.headings);
      images.push(...paragraphResult.images);
    }

    if (element.table && options.includeTables !== false) {
      const tableResult = this.processTable(element.table, position);
      content += tableResult.content;
      tables.push(...tableResult.tables);
    }

    if (element.sectionBreak) {
      content += '\n---\n\n';
    }

    return { content, headings, images, tables };
  }

  /**
   * Process paragraph element.
   *
   * Output is one of: a `#`-prefixed heading, a `-` list item (indented two
   * spaces per nesting level), a plain paragraph, or a bare newline when the
   * paragraph has no visible text.
   */
  private processParagraph(
    paragraph: docs_v1.Schema$Paragraph,
    options: GoogleDocsProcessorOptions,
    position: number
  ): {
    content: string;
    headings: Heading[];
    images: ImageInfo[];
  } {
    const headings: Heading[] = [];
    const images: ImageInfo[] = [];

    if (!paragraph.elements) {
      return { content: '\n', headings, images };
    }

    const headingLevel = this.getHeadingLevel(paragraph.paragraphStyle);

    // Concatenate the rendered text runs and inline objects.
    let paragraphText = '';
    for (const element of paragraph.elements) {
      if (element.textRun) {
        paragraphText += this.processTextRun(element.textRun);
      } else if (element.inlineObjectElement && options.includeImages !== false) {
        const imageResult = this.processInlineObject(element.inlineObjectElement, position);
        paragraphText += imageResult.content;
        images.push(...imageResult.images);
      }
    }

    const text = paragraphText.trim();
    if (!text) {
      return { content: '\n', headings, images };
    }

    if (headingLevel > 0) {
      headings.push({ level: headingLevel, text, position });
      return { content: '#'.repeat(headingLevel) + ' ' + text + '\n\n', headings, images };
    }

    const bullet = paragraph.bullet;
    if (bullet) {
      const listLevel = (bullet.nestingLevel ?? 0) + 1;
      // Two spaces per level: a nested item must be indented at least to the
      // content column of its parent ("- " is two characters wide), otherwise
      // markdown treats it as a sibling item.
      const indent = '  '.repeat(Math.max(0, listLevel - 1));
      const marker = this.getListMarker(bullet, listLevel);
      return { content: `${indent}${marker} ${text}\n`, headings, images };
    }

    return { content: text + '\n\n', headings, images };
  }

  /**
   * Process text run with formatting.
   *
   * Bold, italic, underline, strikethrough and link markup are applied (in
   * that order, innermost first) to the run's text with its surrounding
   * whitespace set aside and re-attached afterwards. Keeping the markers
   * tight against the text matters because runs commonly end with the
   * paragraph's newline, and `**text\n**` is not valid emphasis.
   * Whitespace-only runs are returned unchanged.
   */
  private processTextRun(textRun: docs_v1.Schema$TextRun): string {
    const raw = textRun.content || '';
    const style = textRun.textStyle;
    if (!style) {
      return raw;
    }

    let text = raw.trim();
    if (!text) {
      return raw;
    }
    const leading = raw.slice(0, raw.length - raw.trimStart().length);
    const trailing = raw.slice(raw.trimEnd().length);

    if (style.bold) {
      text = `**${text}**`;
    }
    if (style.italic) {
      text = `*${text}*`;
    }
    if (style.underline) {
      text = `<u>${text}</u>`;
    }
    if (style.strikethrough) {
      text = `~~${text}~~`;
    }
    if (style.link?.url) {
      text = `[${text}](${style.link.url})`;
    }

    return leading + text + trailing;
  }

  /**
   * Process inline object (images, etc.).
   *
   * Records an `ImageInfo` with a fixed alt text at `position`. The markdown
   * contribution is only two newlines; no image markup is emitted and the
   * inline object itself is not inspected.
   */
  private processInlineObject(
    _inlineObject: docs_v1.Schema$InlineObjectElement,
    position: number
  ): {
    content: string;
    images: ImageInfo[];
  } {
    const imageInfo: ImageInfo = {
      alt: 'Image',
      position: position
    };
    return {
      content: '\n\n',
      images: [imageInfo]
    };
  }

  /**
   * Process table element into a pipe-delimited markdown table.
   *
   * The first row is treated as the header; the separator row has as many
   * columns as the first row has cells. Rows without cells are skipped.
   */
  private processTable(
    table: docs_v1.Schema$Table,
    position: number
  ): {
    content: string;
    tables: TableInfo[];
  } {
    const tables: TableInfo[] = [];
    if (!table.tableRows) {
      return { content: '', tables };
    }

    const rows = table.tableRows.length;
    const columns = table.tableRows[0]?.tableCells?.length ?? 0;

    let content = '\n';
    for (const [rowIndex, row] of table.tableRows.entries()) {
      if (!row.tableCells) continue;

      const cells = row.tableCells.map(cell => ` ${this.extractCellContent(cell)} |`).join('');
      content += `|${cells}\n`;

      // Add header separator after first row
      if (rowIndex === 0) {
        content += '|' + ' --- |'.repeat(columns) + '\n';
      }
    }
    content += '\n';

    tables.push({ rows, columns, position });
    return { content, tables };
  }

  /**
   * Extract content from table cell as a single line of plain text.
   *
   * Only text runs are read (no formatting is applied). Newlines become
   * spaces and literal `|` characters are escaped so that cell text cannot
   * be mistaken for a column delimiter.
   */
  private extractCellContent(cell: docs_v1.Schema$TableCell): string {
    if (!cell.content) return '';

    let content = '';
    for (const element of cell.content) {
      for (const paragraphElement of element.paragraph?.elements ?? []) {
        if (paragraphElement.textRun) {
          content += paragraphElement.textRun.content || '';
        }
      }
    }

    return content.trim().replace(/\n/g, ' ').replace(/\|/g, '\\|');
  }

  /**
   * Get heading level from paragraph style.
   *
   * @returns 1-6 for `HEADING_1` … `HEADING_6`, otherwise 0.
   */
  private getHeadingLevel(style?: docs_v1.Schema$ParagraphStyle): number {
    const match = /^HEADING_([1-6])$/.exec(style?.namedStyleType ?? '');
    return match ? Number(match[1]) : 0;
  }

  /**
   * Get list marker for bullet points.
   *
   * Always `-`; the bullet and level are currently ignored, so ordered
   * lists are rendered as unordered ones.
   */
  private getListMarker(_bullet: docs_v1.Schema$Bullet, _level: number): string {
    return '-';
  }

  /**
   * Create content chunks.
   *
   * The content is split on blank lines and the non-blank paragraphs are
   * packed greedily: a paragraph starts a new chunk when appending it would
   * make the current chunk longer than `chunkSize`. A single paragraph longer
   * than `chunkSize` is kept whole. `position.start` / `position.end` are the
   * offsets in `content` of the chunk's first paragraph start and last
   * paragraph end.
   */
  private createChunks(content: string, chunkSize: number): ContentChunk[] {
    const separator = GoogleDocsProcessor.PARAGRAPH_SEPARATOR;

    // Record each paragraph's offsets while splitting instead of searching
    // for them afterwards: a search can miss or land on a duplicate paragraph.
    const paragraphs: Array<{ text: string; start: number; end: number }> = [];
    let offset = 0;
    for (const piece of content.split(separator)) {
      if (piece.trim()) {
        paragraphs.push({ text: piece, start: offset, end: offset + piece.length });
      }
      offset += piece.length + separator.length;
    }

    const chunks: ContentChunk[] = [];
    let currentText = '';
    let currentStart = 0;
    let currentEnd = 0;

    const flush = (): void => {
      chunks.push({
        id: `chunk_${chunks.length}`,
        content: currentText.trim(),
        metadata: {
          wordCount: this.countWords(currentText),
          position: { start: currentStart, end: currentEnd }
        }
      });
    };

    for (const paragraph of paragraphs) {
      if (currentText.length === 0) {
        // First paragraph always opens the first chunk.
        currentText = paragraph.text;
        currentStart = paragraph.start;
      } else {
        const candidate = currentText + separator + paragraph.text;
        if (candidate.length > chunkSize) {
          flush();
          currentText = paragraph.text;
          currentStart = paragraph.start;
        } else {
          currentText = candidate;
        }
      }
      currentEnd = paragraph.end;
    }

    // Add final chunk
    if (currentText.trim()) {
      flush();
    }

    return chunks;
  }

  /**
   * Analyze document structure.
   *
   * Each heading opens a section that runs up to the next heading's position
   * (or the end of the content). The table of contents mirrors the headings.
   */
  private analyzeStructure(content: string, headings: Heading[]): DocumentStructure {
    const sections = headings.map((heading, index) => {
      const nextHeading = headings[index + 1];
      const endPosition = nextHeading ? nextHeading.position : content.length;
      return {
        title: heading.text,
        level: heading.level,
        startPosition: heading.position,
        endPosition,
        content: content.slice(heading.position, endPosition).trim()
      };
    });

    const toc = headings.map(heading => ({
      title: heading.text,
      level: heading.level,
      position: heading.position
    }));

    return {
      sections,
      toc,
      pageBreaks: [] // page breaks are not extracted by this processor
    };
  }

  /**
   * Count words in text (maximal runs of non-whitespace characters).
   */
  private countWords(text: string): number {
    return (text.match(/\S+/g) ?? []).length;
  }

  /**
   * Simple language detection.
   *
   * Counts common English function words in the first 1000 characters
   * (lower-cased) and reports `en` when more than five are found.
   *
   * @returns `'en'` or `'unknown'`.
   */
  private detectLanguage(text: string): string {
    const sample = text.slice(0, 1000).toLowerCase();
    const englishCount = (sample.match(GoogleDocsProcessor.ENGLISH_WORDS) ?? []).length;
    return englishCount > 5 ? 'en' : 'unknown';
  }
}