M.I.M.I.R - Multi-agent Intelligent Memory & Insight Repository

Overview Schema Related Servers Score Discussions

extract_docx.mjs•2.04 KiB

#!/usr/bin/env node
/**
 * Extract translatable text units from a DOCX file to JSON.
 * Uses mammoth for DOCX parsing.
 *
 * Usage: node extract_docx.mjs <input.docx> <output.json>
 */
import mammoth from 'mammoth';
import fs from 'fs/promises';
import path from 'path';

// Paragraphs that consist solely of a code such as "123-45678A" or "123456"
// are not translatable and are skipped.
const CODE_PATTERN = /^\s*\d{3}-\d{5}[A-Z]?\s*$|^\s*\d{6}\s*$/;

// Number of leading characters of a paragraph used to locate it in the HTML.
const STYLE_PROBE_LENGTH = 20;

// Opening tags probed in priority order; the first one found wins.
const STYLE_TAGS = [
  ['<h1>', 'heading1'],
  ['<h2>', 'heading2'],
  ['<strong>', 'bold'],
];

/**
 * Escape the characters that appear entity-encoded in HTML text content,
 * so that raw paragraph text can be compared against the generated HTML.
 *
 * @param {string} text - Raw (unescaped) text.
 * @returns {string} Text with `&`, `<` and `>` replaced by their entities.
 */
function escapeHtmlText(text) {
  // `&` must be handled first, otherwise the entities produced below would
  // themselves be re-escaped.
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/**
 * Guess the style of a paragraph by looking for its opening characters
 * directly after a heading / strong tag in the HTML rendering.
 *
 * This is a basic heuristic: it matches on a short prefix only, so two
 * paragraphs sharing the same prefix receive the same style.
 *
 * @param {string} html - HTML rendering of the whole document.
 * @param {string} text - Trimmed raw text of one paragraph.
 * @returns {string} One of 'heading1', 'heading2', 'bold' or 'body'.
 */
function detectStyle(html, text) {
  // The HTML is entity-escaped while the raw text is not; escape the probe so
  // paragraphs starting with e.g. "R&D" can still be matched.
  const probe = escapeHtmlText(text.substring(0, STYLE_PROBE_LENGTH));
  const match = STYLE_TAGS.find(([tag]) => html.includes(`${tag}${probe}`));
  return match ? match[1] : 'body';
}

/**
 * Extract translation units from a DOCX file.
 *
 * Paragraphs are taken from mammoth's raw-text extraction; the HTML
 * conversion is used only to guess each paragraph's style.
 *
 * @param {string} docxPath - Path of the DOCX file to read.
 * @returns {Promise<Array<{id: string, source: string, style: string, where: string}>>}
 *   One unit per non-empty, non-code paragraph. `id` is `body:p<index>` where
 *   the index counts all non-empty paragraphs, including skipped code-only
 *   ones, so ids may have gaps.
 * @throws Rejects if the file cannot be read or mammoth fails to parse it.
 */
async function extractUnits(docxPath) {
  const buffer = await fs.readFile(docxPath);

  // The two conversions are independent of each other, so run them together.
  const [textResult, htmlResult] = await Promise.all([
    mammoth.extractRawText({ buffer }),
    mammoth.convertToHtml({ buffer }),
  ]);
  const html = htmlResult.value;

  // Split on runs of newlines and drop whitespace-only paragraphs.
  const paragraphs = textResult.value.split(/\n+/).filter((p) => p.trim());

  const units = [];
  paragraphs.forEach((para, pi) => {
    const trimmed = para.trim();

    // Skip code-only paragraphs (empty ones were already filtered out above).
    if (CODE_PATTERN.test(trimmed)) {
      return;
    }

    units.push({
      id: `body:p${pi}`,
      source: trimmed,
      style: detectStyle(html, trimmed),
      where: 'body',
    });
  });

  return units;
}

/**
 * CLI entry point: reads the input/output paths from the command line,
 * extracts the units and writes them to the output file as indented JSON.
 * Exits with status 1 on missing arguments or on any extraction/write error.
 */
async function main() {
  const args = process.argv.slice(2);

  if (args.length < 2) {
    console.log('Usage: node extract_docx.mjs <input.docx> <output.json>');
    process.exit(1);
  }

  const [inputPath, outputPath] = args;

  try {
    const units = await extractUnits(inputPath);
    await fs.writeFile(outputPath, JSON.stringify(units, null, 2), 'utf-8');
    console.log(`Extracted ${units.length} units -> ${outputPath}`);
  } catch (error) {
    console.error(`Error: ${error.message}`);
    process.exit(1);
  }
}

main();

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/orneryd/Mimir'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

extract_docx.mjs•2.04 KiB