import { JSDOM } from "jsdom";
/**
 * Reduces an HTML string to the text content of its `<body>`.
 *
 * The text is taken verbatim from `body.textContent`; this function applies
 * no whitespace normalisation or line-break handling of its own. The only
 * transformation is that every `$` immediately followed by digits is prefixed
 * with a backslash ("$100" becomes "\$100").
 * NOTE(review): presumably the escape protects a downstream consumer that
 * gives `$<digits>` a special meaning — confirm with callers.
 *
 * @param html - Markup to flatten; `null` or an empty string yields `''`.
 * @returns The escaped body text, or `''` when there is no text.
 */
export function htmlToPlainText(html: string | null): string {
  if (!html) {
    return '';
  }

  const { body } = new JSDOM(html).window.document;
  const rawText = body.textContent;
  if (!rawText) {
    return '';
  }

  // In the replacement string `$$` is a literal dollar sign and `$1` is the
  // captured digit run, so the result is backslash + "$" + digits.
  return rawText.replace(/\$(\d+)/g, '\\$$$1');
}
/**
 * Converts an HTML string into Markdown by walking the parsed `<body>`.
 *
 * Supported elements:
 * - `h1`–`h6` become ATX headings (`#` … `######`) followed by a blank line.
 * - `strong`/`b` become `**text**`; `em`/`i` become `*text*`.
 * - `ul`/`ol` become `- item` / `1. item` lists. A nested list is placed on
 *   its own lines and indented under the parent item.
 * - `p` becomes its rendered children followed by a blank line.
 * - `br` becomes a newline.
 * - `a` becomes `[text](href)`, or just the text when `href` is missing/empty.
 * - Any other element is replaced by its rendered children.
 *
 * Headings, bold, italic and links use the element's flattened text content,
 * so markup nested inside them is not preserved. In all text, a `$` directly
 * followed by digits is backslash-escaped ("$100" becomes "\$100").
 *
 * @param html - Markup to convert. An empty value yields `''`.
 * @returns Trimmed Markdown with runs of 3+ newlines collapsed to a blank line.
 */
export function convertHtmlToMarkdown(html: string): string {
  // Same guard as the sibling helpers: skip building a DOM for empty input.
  if (!html) return '';

  const { body } = new JSDOM(html).window.document;

  // `$$` in the replacement is a literal dollar sign, `$1` the captured digits.
  const escapeDollarAmounts = (text: string | null): string =>
    text?.replace(/\$(\d+)/g, '\\$$$1') || '';

  // Flattened, escaped text of an element (nested markup is dropped).
  const getTextContent = (element: Element): string =>
    escapeDollarAmounts(element.textContent);

  const isListElement = (node: Node): boolean => {
    if (node.nodeType !== node.ELEMENT_NODE) return false;
    const tag = (node as Element).tagName.toLowerCase();
    return tag === 'ul' || tag === 'ol';
  };

  // Indents every non-empty line after the first so that multi-line item
  // content (nested lists, text after <br>) stays inside its list item.
  const indentContinuationLines = (text: string, indent: string): string =>
    text
      .split('\n')
      .map((line, index) => (index === 0 || line === '' ? line : indent + line))
      .join('\n');

  function renderChildren(parent: Node): string {
    return Array.from(parent.childNodes)
      .map(child => processNode(child))
      .join('');
  }

  // Renders each child element of a list as one item with the given marker.
  function renderList(list: Element, markerFor: (index: number) => string): string {
    const items = Array.from(list.children).map((item, index) => {
      const marker = `${markerFor(index)} `;
      return marker + indentContinuationLines(processNode(item), ' '.repeat(marker.length));
    });
    return items.join('\n') + '\n\n';
  }

  // Renders a list item's children, moving any nested list onto its own line.
  function renderListItem(item: Element): string {
    let content = '';
    for (const child of Array.from(item.childNodes)) {
      const rendered = processNode(child);
      if (isListElement(child)) {
        // Without the line break the nested markers would be glued to the
        // parent item's text (e.g. "- a- b").
        content = `${content.trimEnd()}\n${rendered.trimEnd()}\n`;
      } else {
        content += rendered;
      }
    }
    return content.trim();
  }

  function processNode(node: Node): string {
    if (node.nodeType === node.TEXT_NODE) {
      return escapeDollarAmounts(node.textContent);
    }
    if (node.nodeType !== node.ELEMENT_NODE) {
      // Comments and other non-element nodes contribute nothing.
      return '';
    }

    const element = node as Element;
    const tag = element.tagName.toLowerCase();

    switch (tag) {
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6': {
        // The digit in the tag name is the heading level.
        const level = Number(tag.charAt(1));
        return `${'#'.repeat(level)} ${getTextContent(element)}\n\n`;
      }
      case 'strong':
      case 'b':
        return `**${getTextContent(element)}**`;
      case 'em':
      case 'i':
        return `*${getTextContent(element)}*`;
      case 'ul':
        return renderList(element, () => '-');
      case 'ol':
        return renderList(element, index => `${index + 1}.`);
      case 'li':
        return renderListItem(element);
      case 'p':
        return renderChildren(element) + '\n\n';
      case 'br':
        return '\n';
      case 'a': {
        const href = element.getAttribute('href');
        const label = getTextContent(element);
        return href ? `[${label}](${href})` : label;
      }
      default:
        return renderChildren(element);
    }
  }

  const markdown = renderChildren(body).trim();

  // Block elements each append a blank line; collapse any pile-up.
  return markdown.replace(/\n\n\n+/g, '\n\n');
}
/**
 * Collects every `<a>` element found in an HTML string.
 *
 * @param html - Markup to scan; `null` or an empty string yields `[]`.
 * @returns One entry per anchor, in the order `querySelectorAll('a')` returns
 *   them. `text` is the anchor's text content and `href` its raw `href`
 *   attribute; either is `''` when absent, so anchors without an `href` are
 *   still included.
 */
export function extractLinks(html: string | null): { text: string; href: string }[] {
  if (!html) {
    return [];
  }

  const doc = new JSDOM(html).window.document;
  const collected: { text: string; href: string }[] = [];

  for (const anchor of Array.from(doc.querySelectorAll('a'))) {
    const text = anchor.textContent || '';
    const href = anchor.getAttribute('href') || '';
    collected.push({ text, href });
  }

  return collected;
}