analyze_structure
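Extract and analyze document structure, headings, and formatting elements from .docx files. Returns a JSON report with document statistics (character, word, paragraph, and heading counts), the heading outline with its levels, counts of bold, italic, list, and list-item elements, and any conversion messages.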
Instructions
Analyze document structure, headings, and formatting elements
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| file_path | Yes | Path to the .docx file | |
Input Schema (JSON Schema)
{
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
"properties": {
"file_path": {
"description": "Path to the .docx file",
"type": "string"
}
},
"required": [
"file_path"
],
"type": "object"
}
Implementation Reference
// src/index.ts:143-217 (handler) — Handler function that performs structure
// analysis on a DOCX file by converting it to HTML, parsing headings,
// paragraphs and formatting elements, and computing statistics.

/** A heading found in the converted HTML. */
interface DocxHeading {
  /** Heading depth, 1-6, taken from the tag name (`<h1>` .. `<h6>`). */
  level: number
  /** Heading text with markup removed and basic entities decoded. */
  text: string
}

/**
 * Decodes the handful of character references an HTML serializer emits when
 * escaping text (`&lt;`, `&gt;`, `&quot;`, `&#39;`, `&amp;`).
 * `&amp;` is decoded last so that `&amp;lt;` becomes `&lt;`, not `<`.
 */
function decodeHtmlEntities(text: string): string {
  return text
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, '&')
}

/**
 * Counts opening tags for the given tag name(s).
 *
 * Opening tags are counted (rather than lazy open...close pairs) so nested
 * elements such as a list inside a list item are each counted once. The `\b`
 * keeps `p` from matching `<pre>`, `em` from matching `<embed>`, and so on.
 *
 * @param html - HTML to scan.
 * @param tagNames - A tag name or `|`-separated alternatives, e.g. `'ul|ol'`.
 */
function countOpeningTags(html: string, tagNames: string): number {
  const opener = new RegExp(`<(?:${tagNames})\\b[^>]*>`, 'gi')
  return (html.match(opener) || []).length
}

/**
 * Extracts every `<h1>`..`<h6>` element in document order.
 * The closing tag must carry the same level as the opening tag, and the
 * content may span several lines.
 */
function extractHeadings(html: string): DocxHeading[] {
  const headingPattern = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1\s*>/gi
  const headings: DocxHeading[] = []
  let match: RegExpExecArray | null
  while ((match = headingPattern.exec(html)) !== null) {
    headings.push({
      level: Number(match[1]),
      // Strip inline markup first, then decode, so escaped text such as
      // "&lt;b&gt;" is kept as literal "<b>" instead of being removed.
      text: decodeHtmlEntities(match[2].replace(/<[^>]*>/g, '')).trim(),
    })
  }
  return headings
}

/**
 * Builds the structure report from the converted HTML and the raw text.
 *
 * @param html - HTML produced from the document.
 * @param text - Raw text of the same document; used for character and word counts.
 * @returns Statistics, heading outline, and formatting counts.
 */
function analyzeHtmlStructure(html: string, text: string) {
  const headings = extractHeadings(html)
  const distinctLevels = [...new Set(headings.map((heading) => heading.level))]

  return {
    document_stats: {
      total_characters: text.length,
      total_words: text.split(/\s+/).filter((word) => word.length > 0).length,
      total_paragraphs: countOpeningTags(html, 'p'),
      total_headings: headings.length,
    },
    structure: {
      headings,
      // Explicit numeric comparator: the default sort compares as strings.
      heading_levels: distinctLevels.sort((a, b) => a - b),
    },
    formatting: {
      bold_elements: countOpeningTags(html, 'strong'),
      italic_elements: countOpeningTags(html, 'em'),
      lists: countOpeningTags(html, 'ul|ol'),
      list_items: countOpeningTags(html, 'li'),
    },
  }
}

/**
 * Tool handler: resolves `file_path`, converts the document with mammoth and
 * returns the structure report as pretty-printed JSON text.
 * Failures (missing file, conversion errors) are reported as a tool result
 * with `isError: true` instead of being thrown.
 */
async ({ file_path }) => {
  try {
    const absolutePath = path.resolve(file_path)

    if (!fs.existsSync(absolutePath)) {
      throw new Error(`File not found: ${absolutePath}`)
    }

    // The HTML conversion and the raw-text extraction are independent,
    // so run them concurrently.
    const [htmlResult, textResult] = await Promise.all([
      mammoth.convertToHtml({ path: absolutePath }),
      mammoth.extractRawText({ path: absolutePath }),
    ])

    const analysis = {
      ...analyzeHtmlStructure(htmlResult.value, textResult.value),
      messages: htmlResult.messages,
    }

    return {
      content: [
        {
          type: 'text',
          text: JSON.stringify(analysis, null, 2),
        },
      ],
    }
  } catch (error) {
    // Anything can be thrown; only Error instances have a usable message.
    const reason = error instanceof Error ? error.message : String(error)
    return {
      content: [
        {
          type: 'text',
          text: `Error analyzing structure: ${reason}`,
        },
      ],
      isError: true,
    }
  }
}
// src/index.ts:140-142 (schema) — Input schema defining the 'file_path' parameter using Zod.
// Declares a single string field, `file_path`, whose description text is
// 'Path to the .docx file'. This fragment is the schema argument of the
// server.tool() registration call, hence the trailing comma.
{ file_path: z.string().describe('Path to the .docx file'), },
// src/index.ts:137-218 (registration) — Registration of the
// 'analyze_structure' tool using server.tool(), including name, description,
// schema, and handler.

/** A heading found in the converted HTML. */
interface DocxHeading {
  /** Heading depth, 1-6, taken from the tag name (`<h1>` .. `<h6>`). */
  level: number
  /** Heading text with markup removed and basic entities decoded. */
  text: string
}

/**
 * Decodes the handful of character references an HTML serializer emits when
 * escaping text (`&lt;`, `&gt;`, `&quot;`, `&#39;`, `&amp;`).
 * `&amp;` is decoded last so that `&amp;lt;` becomes `&lt;`, not `<`.
 */
function decodeHtmlEntities(text: string): string {
  return text
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, '&')
}

/**
 * Counts opening tags for the given tag name(s).
 *
 * Opening tags are counted (rather than lazy open...close pairs) so nested
 * elements such as a list inside a list item are each counted once. The `\b`
 * keeps `p` from matching `<pre>`, `em` from matching `<embed>`, and so on.
 *
 * @param html - HTML to scan.
 * @param tagNames - A tag name or `|`-separated alternatives, e.g. `'ul|ol'`.
 */
function countOpeningTags(html: string, tagNames: string): number {
  const opener = new RegExp(`<(?:${tagNames})\\b[^>]*>`, 'gi')
  return (html.match(opener) || []).length
}

/**
 * Extracts every `<h1>`..`<h6>` element in document order.
 * The closing tag must carry the same level as the opening tag, and the
 * content may span several lines.
 */
function extractHeadings(html: string): DocxHeading[] {
  const headingPattern = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1\s*>/gi
  const headings: DocxHeading[] = []
  let match: RegExpExecArray | null
  while ((match = headingPattern.exec(html)) !== null) {
    headings.push({
      level: Number(match[1]),
      // Strip inline markup first, then decode, so escaped text such as
      // "&lt;b&gt;" is kept as literal "<b>" instead of being removed.
      text: decodeHtmlEntities(match[2].replace(/<[^>]*>/g, '')).trim(),
    })
  }
  return headings
}

/**
 * Builds the structure report from the converted HTML and the raw text.
 *
 * @param html - HTML produced from the document.
 * @param text - Raw text of the same document; used for character and word counts.
 * @returns Statistics, heading outline, and formatting counts.
 */
function analyzeHtmlStructure(html: string, text: string) {
  const headings = extractHeadings(html)
  const distinctLevels = [...new Set(headings.map((heading) => heading.level))]

  return {
    document_stats: {
      total_characters: text.length,
      total_words: text.split(/\s+/).filter((word) => word.length > 0).length,
      total_paragraphs: countOpeningTags(html, 'p'),
      total_headings: headings.length,
    },
    structure: {
      headings,
      // Explicit numeric comparator: the default sort compares as strings.
      heading_levels: distinctLevels.sort((a, b) => a - b),
    },
    formatting: {
      bold_elements: countOpeningTags(html, 'strong'),
      italic_elements: countOpeningTags(html, 'em'),
      lists: countOpeningTags(html, 'ul|ol'),
      list_items: countOpeningTags(html, 'li'),
    },
  }
}

// Registers the tool under the name 'analyze_structure'. The handler resolves
// `file_path`, converts the document with mammoth and returns the structure
// report as pretty-printed JSON text; failures are reported as a tool result
// with `isError: true` instead of being thrown.
server.tool(
  'analyze_structure',
  'Analyze document structure, headings, and formatting elements',
  {
    file_path: z.string().describe('Path to the .docx file'),
  },
  async ({ file_path }) => {
    try {
      const absolutePath = path.resolve(file_path)

      if (!fs.existsSync(absolutePath)) {
        throw new Error(`File not found: ${absolutePath}`)
      }

      // The HTML conversion and the raw-text extraction are independent,
      // so run them concurrently.
      const [htmlResult, textResult] = await Promise.all([
        mammoth.convertToHtml({ path: absolutePath }),
        mammoth.extractRawText({ path: absolutePath }),
      ])

      const analysis = {
        ...analyzeHtmlStructure(htmlResult.value, textResult.value),
        messages: htmlResult.messages,
      }

      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(analysis, null, 2),
          },
        ],
      }
    } catch (error) {
      // Anything can be thrown; only Error instances have a usable message.
      const reason = error instanceof Error ? error.message : String(error)
      return {
        content: [
          {
            type: 'text',
            text: `Error analyzing structure: ${reason}`,
          },
        ],
        isError: true,
      }
    }
  }
)