extract_revisions
Extract tracked changes and comments from DOCX files as structured JSON with before/after text per paragraph and revision details. Supports pagination for large documents without modifying the original file.
Instructions
Extract tracked changes as structured JSON with before/after text per paragraph, revision details, and comments. Supports pagination via offset and limit. Read-only - does not modify the document.
Input Schema
JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| file_path | Yes | Path to the DOCX file. | — |
| offset | No | 0-based offset for pagination. | 0 |
| limit | No | Max entries per page (1-500). | 50 |
Implementation Reference
- The tool handler for "extract_revisions", which resolves the session, performs input validation, utilizes the core library to extract revisions, and handles pagination.
export async function extractRevisions_tool( manager: SessionManager, params: { file_path?: string; offset?: number; limit?: number; }, ): Promise<ToolResponse> { const resolved = await resolveSessionForTool(manager, params, { toolName: 'extract_revisions' }); if (!resolved.ok) return resolved.response; const { session, metadata } = resolved; // Validate limit const limit = params.limit ?? 50; if (typeof limit !== 'number' || limit < 1 || limit > 500) { return err('INVALID_LIMIT', `limit must be between 1 and 500, got ${limit}`, 'Provide a limit in the range 1–500.'); } // Validate offset const offset = params.offset ?? 0; if (typeof offset !== 'number' || offset < 0) { return err('INVALID_OFFSET', `offset must be >= 0, got ${offset}`, 'Provide a non-negative offset.'); } try { // Check extraction cache const cached = manager.getExtractionCache(session); let allChanges; if (cached) { allChanges = cached.changes; } else { // Compute extraction from DOM clones const docClone = session.doc.getDocumentXmlClone(); const comments = await session.doc.getComments(); const result = extractRevisions(docClone, comments); allChanges = result.changes; // Cache the full result for pagination manager.setExtractionCache(session, allChanges); } // Apply pagination const totalChanges = allChanges.length; const page = allChanges.slice(offset, offset + limit); const hasMore = offset + limit < totalChanges; return ok(mergeSessionResolutionMetadata({ changes: page, total_changes: totalChanges, has_more: hasMore, edit_revision: session.editRevision, file_path: manager.normalizePath(session.originalPath), }, metadata)); } catch (e: unknown) { return err('EXTRACTION_ERROR', errorMessage(e)); } } - The core logic for extracting revisions by comparing cloned documents with changes accepted vs rejected.
/**
 * Extracts per-paragraph tracked-change data from a DOCX document DOM.
 *
 * Strategy: clone the document twice, accept all changes in one clone and
 * reject all changes in the other, then for every revision-bearing paragraph
 * in the original DOM look up its counterpart in each clone by bookmark id
 * to produce before/after text. Comments are attached by anchored paragraph id.
 *
 * @param doc      The tracked-changes document DOM (not modified; cloned internally).
 * @param comments Comments to associate with paragraphs via anchoredParagraphId.
 * @param opts     Optional pagination; limit defaults to all changes.
 * @returns One page of paragraph revisions plus total count and has_more flag.
 */
export function extractRevisions(
  doc: Document,
  comments: Comment[],
  opts?: { offset?: number; limit?: number },
): ExtractRevisionsResult {
  const body = doc.getElementsByTagNameNS(W_NS, 'body').item(0);
  if (!body) {
    // No body element: nothing to extract.
    return { changes: [], total_changes: 0, has_more: false };
  }

  // Clone the DOM twice and apply accept/reject so both "after" and "before"
  // states exist simultaneously; the original doc stays untouched.
  const acceptedDoc = doc.cloneNode(true) as Document;
  const rejectedDoc = doc.cloneNode(true) as Document;
  acceptChanges(acceptedDoc);
  rejectChanges(rejectedDoc);

  // Build comment lookup by anchoredParagraphId (comments without an anchor are skipped).
  const commentsByParaId = new Map<string, Comment[]>();
  for (const c of comments) {
    if (c.anchoredParagraphId) {
      const existing = commentsByParaId.get(c.anchoredParagraphId);
      if (existing) {
        existing.push(c);
      } else {
        commentsByParaId.set(c.anchoredParagraphId, [c]);
      }
    }
  }

  // Walk all paragraphs in the original tracked DOM.
  const allParagraphs = Array.from(body.getElementsByTagNameNS(W_NS, 'p'));
  const changedParagraphs: ParagraphRevision[] = [];

  for (const p of allParagraphs) {
    if (!paragraphHasRevisions(p)) continue;

    const paraId = getParagraphBookmarkId(p);
    if (!paraId) continue; // All paragraphs should have bookmarks from session resolution

    // Detect entirely-inserted/deleted paragraphs to avoid stale bookmark lookups.
    // When rejectChanges() removes an inserted paragraph, it relocates bookmarks
    // to adjacent paragraphs, which would give the wrong before_text.
    const isFullyInserted = paragraphIsEntirelyInserted(p);
    const isFullyDeleted = paragraphIsEntirelyDeleted(p);

    // Look up before_text in rejected clone by bookmark
    let beforeText: string;
    if (isFullyInserted) {
      beforeText = ''; // Didn't exist before
    } else {
      const rejectedP = findParagraphByBookmarkId(rejectedDoc, paraId);
      beforeText = rejectedP ? getParagraphText(rejectedP) : '';
    }

    // Look up after_text in accepted clone by bookmark
    let afterText: string;
    if (isFullyDeleted) {
      afterText = ''; // Doesn't exist after
    } else {
      const acceptedP = findParagraphByBookmarkId(acceptedDoc, paraId);
      afterText = acceptedP ? getParagraphText(acceptedP) : '';
    }

    // Collect revision entries
    const revisions = collectRevisionEntries(p);

    // Skip structurally-empty paragraphs with only paragraph-level markers
    // (e.g. empty inserted paragraphs from comparison engines with pPr/rPr/ins only)
    if (revisions.length === 0 && beforeText === '' && afterText === '') continue;

    // Associate comments anchored to this paragraph.
    const paraComments = commentsByParaId.get(paraId) ?? [];
    const revisionComments = paraComments.map(commentToRevisionComment);

    changedParagraphs.push({
      para_id: paraId,
      before_text: beforeText,
      after_text: afterText,
      revisions,
      comments: revisionComments,
    });
  }

  // Apply pagination (limit defaults to "everything" when opts is omitted).
  const totalChanges = changedParagraphs.length;
  const offset = opts?.offset ?? 0;
  const limit = opts?.limit ?? totalChanges;
  const page = changedParagraphs.slice(offset, offset + limit);
  const hasMore = offset + limit < totalChanges;

  return {
    changes: page,
    total_changes: totalChanges,
    has_more: hasMore,
  };
}
- Type definition for the result returned by extractRevisions.
/** Result of extractRevisions: one page of paragraph-level revision data. */
export type ExtractRevisionsResult = {
  // Changed paragraphs for the requested page (post-pagination slice).
  changes: ParagraphRevision[];
  // Total changed paragraphs in the whole document, not just this page.
  total_changes: number;
  // True when entries exist beyond offset + limit.
  has_more: boolean;
};