Skip to main content
Glama
rtf-parser.js (12.6 kB)
/**
 * RTF to Plain Text Parser
 * Ported from Python striprtf library
 * Based on n8n workflow RTF extractor
 */

// RTF destinations to ignore
const DESTINATIONS = new Set([
  'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', 'atndate',
  'atnicn', 'atnid', 'atnparent', 'atnref', 'atntime', 'atrfend',
  'atrfstart', 'author', 'background', 'bkmkend', 'bkmkstart', 'blipuid',
  'buptim', 'category', 'colorschememapping', 'colortbl', 'comment',
  'company', 'creatim', 'datafield', 'datastore', 'defchp', 'defpap', 'do',
  'doccomm', 'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
  'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'ffformat',
  'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'file', 'filetbl', 'fldinst',
  'fldtype', 'fonttbl', 'fname', 'fontemb', 'fontfile', 'footer', 'footerf',
  'footerl', 'footerr', 'footnote', 'formfield', 'ftncn', 'ftnsep',
  'ftnsepc', 'g', 'generator', 'gridtbl', 'header', 'headerf', 'headerl',
  'headerr', 'hl', 'hlfr', 'hlinkbase', 'hlloc', 'hlsrc', 'hsv', 'htmltag',
  'info', 'keycode', 'keywords', 'latentstyles', 'lchars', 'levelnumbers',
  'leveltext', 'lfolevel', 'linkval', 'list', 'listlevel', 'listname',
  'listoverride', 'listoverridetable', 'listpicture', 'liststylename',
  'listtable', 'listtext', 'lsdlockedexcept', 'macc', 'maccPr', 'mailmerge',
  'maln', 'malnScr', 'manager', 'margPr', 'mbar', 'mbarPr', 'mbaseJc',
  'mbegChr', 'mborderBox', 'mborderBoxPr', 'mbox', 'mboxPr', 'mchr',
  'mcount', 'mctrlPr', 'md', 'mdeg', 'mdegHide', 'mden', 'mdiff', 'mdPr',
  'me', 'mendChr', 'meqArr', 'meqArrPr', 'mf', 'mfName', 'mfPr', 'mfunc',
  'mfuncPr', 'mgroupChr', 'mgroupChrPr', 'mgrow', 'mhideBot', 'mhideLeft',
  'mhideRight', 'mhideTop', 'mhtmltag', 'mlim', 'mlimloc', 'mlimlow',
  'mlimlowPr', 'mlimupp', 'mlimuppPr', 'mm', 'mmaddfieldname', 'mmath',
  'mmathPict', 'mmathPr', 'mmaxdist', 'mmc', 'mmcJc', 'mmconnectstr',
  'mmconnectstrdata', 'mmcPr', 'mmcs', 'mmdatasource', 'mmheadersource',
  'mmmailsubject', 'mmodso', 'mmodsofilter', 'mmodsofldmpdata',
  'mmodsomappedname', 'mmodsoname', 'mmodsorecipdata', 'mmodsosort',
  'mmodsosrc', 'mmodsotable', 'mmodsoudl', 'mmodsoudldata',
  'mmodsouniquetag', 'mmPr', 'mmquery', 'mmr', 'mnary', 'mnaryPr',
  'mnoBreak', 'mnum', 'mobjDist', 'moMath', 'moMathPara', 'moMathParaPr',
  'mopEmu', 'mphant', 'mphantPr', 'mplcHide', 'mpos', 'mr', 'mrad',
  'mradPr', 'mrPr', 'msepChr', 'mshow', 'mshp', 'msPre', 'msPrePr', 'msSub',
  'msSubPr', 'msSubSup', 'msSubSupPr', 'msSup', 'msSupPr', 'mstrikeBLTR',
  'mstrikeH', 'mstrikeTLBR', 'mstrikeV', 'msub', 'msubHide', 'msup',
  'msupHide', 'mtransp', 'mtype', 'mvertJc', 'mvfmf', 'mvfml', 'mvtof',
  'mvtol', 'mzeroAsc', 'mzeroDesc', 'mzeroWid', 'nesttableprops',
  'nextfile', 'nonesttables', 'objalias', 'objclass', 'objdata', 'object',
  'objname', 'objsect', 'objtime', 'oldcprops', 'oldpprops', 'oldsprops',
  'oldtprops', 'oleclsid', 'operator', 'panose', 'password', 'passwordhash',
  'pgp', 'pgptbl', 'picprop', 'pict', 'pn', 'pnseclvl', 'pntext', 'pntxta',
  'pntxtb', 'printim', 'private', 'propname', 'protend', 'protstart',
  'protusertbl', 'pxe', 'result', 'revtbl', 'revtim', 'rsidtbl', 'rxe',
  'shp', 'shpgrp', 'shpinst', 'shppict', 'shprslt', 'shptxt', 'sn', 'sp',
  'staticval', 'stylesheet', 'subject', 'sv', 'svb', 'tc', 'template',
  'themedata', 'title', 'txe', 'ud', 'upr', 'userprops', 'wgrffmtfilter',
  'windowcaption', 'writereservation', 'writereservhash', 'xe', 'xform',
  'xmlattrname', 'xmlattrvalue', 'xmlclose', 'xmlname', 'xmlnstbl',
  'xmlopen',
]);

// Section characters
const SECTION_CHARS = {
  par: '\n',
  sect: '\n\n',
  page: '\n\n',
};

// Special characters.
// Held in a Map (not a plain object) because the keys are looked up with
// control words taken from the document: `\constructor` or `\toString` must
// not resolve to something inherited from Object.prototype.
const SPECIAL_CHARS = new Map(
  Object.entries({
    line: '\n',
    tab: '\t',
    emdash: '\u2014',
    endash: '\u2013',
    emspace: '\u2003',
    enspace: '\u2002',
    qmspace: '\u2005',
    bullet: '\u2022',
    lquote: '\u2018',
    rquote: '\u2019',
    ldblquote: '\u201C',
    rdblquote: '\u201D',
    row: '\n',
    cell: '|',
    nestcell: '|',
    '~': '\u00A0',
    '\n': '\n',
    '\r': '\r',
    '{': '{',
    '}': '}',
    '\\': '\\',
    '-': '\u00AD',
    _: '\u2011',
    ...SECTION_CHARS,
  }),
);

// Windows-1252 code points for bytes 0x80-0x9F, the only range where it
// differs from Latin-1 (unassigned bytes keep their C1 control code point).
const CP1252_HIGH =
  '\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021' +
  '\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F' +
  '\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014' +
  '\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178';

// Main RTF pattern
const PATTERN = /\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)/gi;

// Hyperlink pattern
const HYPERLINKS = /(\{\\field\{\s*\\\*\\fldinst\{.*HYPERLINK\s(\".*\")\}{2}\s*\{.*?\s+(.*?)\}{2,3})/gi;

// Successfully constructed decoders, keyed by lower-cased label. Failures are
// not cached so document-supplied labels cannot grow the cache without bound.
const DECODERS = new Map();

/**
 * Look up (or create) a strict TextDecoder for an encoding label.
 * @param {string} label - Lower-cased encoding label
 * @returns {TextDecoder|null} The decoder, or null when the runtime has no
 *   TextDecoder or does not know the label
 */
function getDecoder(label) {
  const cached = DECODERS.get(label);
  if (cached !== undefined) {
    return cached;
  }
  if (typeof TextDecoder !== 'function') {
    return null;
  }
  try {
    const decoder = new TextDecoder(label, { fatal: true });
    DECODERS.set(label, decoder);
    return decoder;
  } catch (err) {
    // Label not supported by this runtime; the caller falls back to Latin-1.
    return null;
  }
}

/**
 * Decode a run of bytes collected from consecutive \'hh escapes.
 * @param {number[]} bytes - Byte values (0-255)
 * @param {string} encoding - Code page label, e.g. 'windows-1252' or 'cp1251'
 * @returns {string} Decoded text; unsupported code pages and undecodable runs
 *   fall back to a byte-to-code-point (Latin-1) mapping
 */
function decodeBytes(bytes, encoding) {
  const label = String(encoding).trim().toLowerCase();

  // Windows-1252 is handled with a built-in table so the default code page
  // decodes the same way in every runtime.
  if (label === 'windows-1252' || label === 'cp1252') {
    return bytes
      .map((byte) =>
        byte >= 0x80 && byte <= 0x9f
          ? CP1252_HIGH[byte - 0x80]
          : String.fromCharCode(byte),
      )
      .join('');
  }

  const decoder = getDecoder(label);
  if (decoder !== null) {
    try {
      return decoder.decode(Uint8Array.from(bytes));
    } catch (err) {
      // Byte run is not valid in this code page; use the Latin-1 fallback.
    }
  }
  return bytes.map((byte) => String.fromCharCode(byte)).join('');
}

/**
 * Find the end of a \pict group body.
 * @param {string} rtfText - Full RTF source
 * @param {number} from - Index just past the `\pict` control word
 * @returns {number} Index of the `}` that closes the group containing `\pict`,
 *   or `rtfText.length` when the group is never closed
 */
function skipPictBody(rtfText, from) {
  const length = rtfText.length;
  let depth = 0;
  let i = from;

  while (i < length) {
    const ch = rtfText[i];
    if (ch === '\\') {
      if (rtfText.startsWith('\\bin', i)) {
        i += 4;
        let digits = '';
        while (i < length && rtfText[i] >= '0' && rtfText[i] <= '9') {
          digits += rtfText[i];
          i++;
        }
        if (digits) {
          // The single space after \binN delimits the control word and is
          // not part of the N-byte payload.
          if (rtfText[i] === ' ') {
            i++;
          }
          i += Number.parseInt(digits, 10);
        }
      } else {
        // Skip the backslash together with the next character so escaped
        // braces (\{ and \}) do not affect the depth count.
        i += 2;
      }
    } else if (ch === '{') {
      depth++;
      i++;
    } else if (ch === '}') {
      if (depth === 0) {
        return i;
      }
      depth--;
      i++;
    } else {
      i++;
    }
  }
  return length;
}

/**
 * Remove picture groups from RTF to reduce size.
 *
 * Only runs when the text contains both `\pict` and `\bin`. Everything from
 * `\pict` up to (not including) the brace that closes its group is dropped,
 * so the surrounding `{`/`}` pair stays balanced.
 * @param {string} rtfText - RTF source
 * @returns {string} RTF source without picture payloads
 */
function removePictGroups(rtfText) {
  if (!rtfText.includes('\\pict') || !rtfText.includes('\\bin')) {
    return rtfText;
  }

  const kept = [];
  const length = rtfText.length;
  let pos = 0;

  while (pos < length) {
    const pictStart = rtfText.indexOf('\\pict', pos);
    if (pictStart === -1) {
      kept.push(rtfText.slice(pos));
      break;
    }
    kept.push(rtfText.slice(pos, pictStart));
    pos = skipPictBody(rtfText, pictStart + 5);
  }

  return kept.join('');
}

/**
 * Convert RTF to plain text.
 *
 * Bytes written as \'hh are decoded with the document code page: the
 * `encoding` argument, replaced by `\ansicpgN` when the document declares
 * one. Per-font charsets from the font table are not consulted.
 * @param {string} text - RTF source
 * @param {string} [encoding='windows-1252'] - Code page assumed until the
 *   document declares one
 * @returns {string} Extracted text
 */
export function rtfToText(text, encoding = 'windows-1252') {
  // Remove picture groups, then convert hyperlinks
  const source = removePictGroups(text).replace(HYPERLINKS, '$1($2)');

  const stack = []; // saved [ucskip, ignorable] for each open group
  let documentEncoding = encoding;
  let ignorable = false;
  let ucskip = 1;
  let curskip = 0;
  let pendingBytes = []; // consecutive \'hh bytes awaiting decoding
  let out = '';

  const flushPendingBytes = () => {
    if (pendingBytes.length > 0) {
      out += decodeBytes(pendingBytes, documentEncoding);
      pendingBytes = [];
    }
  };

  // PATTERN is a shared /g regex; always start scanning from the beginning.
  PATTERN.lastIndex = 0;

  let match;
  while ((match = PATTERN.exec(source)) !== null) {
    const [, word, arg, hex, char, brace, tchar] = match;

    if (hex) {
      if (curskip > 0) {
        curskip--;
      } else if (!ignorable) {
        const byte = Number.parseInt(hex, 16);
        // Control bytes below 0x20 are dropped.
        if (byte >= 32) {
          pendingBytes.push(byte);
        }
      }
      continue;
    }

    if (!brace && !char && !word && !tchar) {
      // Bare CR/LF run: carries no meaning and must not split a byte run.
      continue;
    }

    // Any other token ends the current run of hex bytes.
    flushPendingBytes();

    if (brace) {
      curskip = 0;
      if (brace === '{') {
        stack.push([ucskip, ignorable]);
      } else if (stack.length > 0) {
        [ucskip, ignorable] = stack.pop();
      } else {
        // Unbalanced closing brace: ignore whatever follows.
        ucskip = 0;
        ignorable = true;
      }
    } else if (char) {
      curskip = 0;
      if (SPECIAL_CHARS.has(char)) {
        if (!ignorable) {
          out += SPECIAL_CHARS.get(char);
        }
      } else if (char === '*') {
        ignorable = true;
      }
    } else if (word) {
      curskip = 0;
      if (DESTINATIONS.has(word)) {
        ignorable = true;
      } else if (word === 'ansicpg' && arg !== undefined) {
        documentEncoding = `cp${arg}`;
      }

      if (ignorable) {
        // Skip
      } else if (SPECIAL_CHARS.has(word)) {
        out += SPECIAL_CHARS.get(word);
      } else if (word === 'uc') {
        ucskip = arg ? Number.parseInt(arg, 10) : 1;
      } else if (word === 'u') {
        if (arg !== undefined) {
          let codePoint = Number.parseInt(arg, 10);
          if (codePoint < 0) {
            // Negative arguments are folded into the 16-bit range.
            codePoint += 0x10000;
          }
          // Only add valid printable Unicode characters
          if (
            (codePoint >= 32 && codePoint < 0x110000) ||
            codePoint === 9 ||
            codePoint === 10 ||
            codePoint === 13
          ) {
            out += String.fromCodePoint(codePoint);
          }
        }
        // The next `ucskip` characters are the non-Unicode fallback.
        curskip = ucskip;
      }
    } else if (curskip > 0) {
      // tchar consumed as part of a \u fallback
      curskip--;
    } else if (!ignorable) {
      out += tchar;
    }
  }

  flushPendingBytes();
  return out;
}

// Ordered [pattern, replacement] rules applied by cleanAndFormatText.
// NOTE(review): the single-number rule (/(\d+\.)/) also matches the tail of a
// range, turning "1-2." into "1-\n2."; after that the range-based
// "Your answers" rule near the end appears unable to match. Left unchanged
// pending confirmation of the intended layout.
const FORMAT_RULES = [
  // Remove control characters but preserve line breaks
  [/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, ''],
  // Convert non-breaking spaces to regular spaces
  [/\xA0/g, ' '],
  // Normalize whitespace (but preserve line breaks)
  [/[^\S\n]+/g, ' '],
  // Add proper line breaks before questions
  [/(\d+-\d+\.)/g, '\n$1'],
  [/(\d+\.)/g, '\n$1'],
  // Add line breaks before numbered paragraphs
  [/(\d+\s+[A-Z])/g, '\n$1'],
  // Ensure numbered paragraphs start with proper line breaks
  [/([.!?])\s*(\d+\s+[A-Z])/g, '$1\n\n$2'],
  // Add line breaks before section headings
  [/([a-z]\.?)([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)/g, '$1\n\n$2'],
  // Fix specific transitions
  [/(truth\?)([A-Z])/g, '$1\n\n$2'],
  [/(Progress)([A-Z])/g, '$1\n\n$2'],
  [/(Jehovah)([A-Z])/g, '$1\n\n$2'],
  // Clean up study article formatting
  [/Study Article\s*\d+/g, 'STUDY ARTICLE'],
  [/Song\s+\d+/g, 'SONG'],
  // Fix scripture references
  [/1\s+Corinthians/g, '1 COR.'],
  [/Matthew/g, 'Matt.'],
  [/,footnote/g, ', ftn'],
  // Add answer placeholders
  [/(\d+-\d+\.[^?]*\?)/g, '$1\n\nYour answers'],
  [/((?<!Your\s)\d+\.[^?]*\?)/g, '$1\n\nYour answer'],
];

// Line prefixes that are set off by a blank line above and below.
const HEADING_PREFIXES = [
  'STUDY ARTICLE',
  'SONG',
  'HELP YOUR STUDENT',
  'SHOW CONFIDENCE',
];

/**
 * Tell whether a cleaned line is treated as a heading.
 * @param {string} line - Trimmed, non-empty line
 * @returns {boolean} True for the fixed heading prefixes or any line starting
 *   with "focus" in any letter case
 */
function isHeadingLine(line) {
  return (
    HEADING_PREFIXES.some((prefix) => line.startsWith(prefix)) ||
    line.toUpperCase().startsWith('FOCUS')
  );
}

/**
 * Clean and format text.
 *
 * Applies FORMAT_RULES in order, drops empty lines, collapses whitespace
 * inside each line, then puts blank lines around headings and before lines
 * that start with a number.
 * @param {string} text - Plain text extracted from RTF
 * @returns {string} Formatted text ('' for empty or missing input)
 */
export function cleanAndFormatText(text) {
  if (!text) {
    return '';
  }

  const rewritten = FORMAT_RULES.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    text,
  );

  // Split into lines and clean
  const cleanedLines = rewritten
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line !== '')
    .map((line) => line.replace(/\s+/g, ' '));

  // Join with appropriate spacing
  const result = [];
  let hasPrevious = false;
  for (const line of cleanedLines) {
    if (isHeadingLine(line)) {
      if (hasPrevious) {
        result.push('');
      }
      result.push(line, '');
    } else if (/^\d+\s+/.test(line)) {
      if (hasPrevious) {
        result.push('');
      }
      result.push(line);
    } else {
      result.push(line);
    }
    hasPrevious = true;
  }

  // Clean up excessive line breaks
  return result.join('\n').replace(/\n{3,}/g, '\n\n').trim();
}

/**
 * Parse RTF and return formatted plain text
 * @param {string} rtfContent - RTF content string
 * @returns {string} Plain text formatted
 * @throws {Error} If rtfContent is not a non-empty string or does not start
 *   with `{\rtf` (leading whitespace allowed)
 */
export function parseRTF(rtfContent) {
  if (!rtfContent || typeof rtfContent !== 'string') {
    throw new Error('RTF content must be a non-empty string');
  }

  if (!rtfContent.trim().startsWith('{\\rtf')) {
    throw new Error('Invalid RTF format - must start with {\\rtf');
  }

  // Extract plain text from RTF, then clean and format it
  return cleanAndFormatText(rtfToText(rtfContent));
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/advenimus/jw-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.