Synergy/DE MCP Server

html-parser.ts•12 KiB

/**
 * HTML parser for extracting structured content from documentation pages
 */

import * as cheerio from "cheerio";

import type { Topic, TopicLink } from "../../types.js";
import { parseError } from "../utils/errors.js";
import { logger } from "../utils/logger.js";

// Type for cheerio Element
type CheerioElement = cheerio.Element;

interface ParseOptions {
  url: string;
  version?: string;
  source: "online" | "local" | "hybrid";
}

/** Maximum number of body characters used for a topic summary. */
const SUMMARY_MAX_LENGTH = 200;

/** A sentence boundary is only honoured if it leaves more than this many characters. */
const SUMMARY_MIN_SENTENCE_LENGTH = 100;

/** Elements inside a breadcrumb container that may carry a crumb label. */
const CRUMB_SELECTOR = "a, span, li";

/** Page chrome stripped when no dedicated main-content container is found. */
const NON_CONTENT_SELECTOR = "header, footer, nav, aside, .sidebar, .navigation, .menu";

/**
 * Extract title from HTML
 *
 * Tries, in order: the first `<title>` element, the first `<h1>`, and the
 * first element with a title-like class. Returns "Untitled" if all are empty.
 */
function extractTitle($: cheerio.CheerioAPI): string {
  // Only the first <title>: .text() on the whole match set would concatenate
  // every <title> in the document (inline SVGs may carry their own).
  const titleTag = $("title").first().text().trim();
  if (titleTag) {
    return titleTag;
  }

  // Try main heading (h1)
  const h1 = $("h1").first().text().trim();
  if (h1) {
    return h1;
  }

  // Try page title in common locations
  const pageTitle = $(".page-title, .title, [class*='title']").first().text().trim();
  if (pageTitle) {
    return pageTitle;
  }

  return "Untitled";
}

/**
 * Extract breadcrumb navigation
 *
 * Returns the crumb labels of the first breadcrumb container that yields any,
 * excluding "Home"; returns an empty array when none is found.
 */
function extractBreadcrumbs($: cheerio.CheerioAPI): string[] {
  const breadcrumbSelectors = [
    ".breadcrumb",
    ".breadcrumbs",
    "[class*='breadcrumb']",
    "nav[aria-label='breadcrumb']",
    "ol.breadcrumb",
    "nav ol",
  ];

  for (const selector of breadcrumbSelectors) {
    const $breadcrumb = $(selector);
    if ($breadcrumb.length === 0) {
      continue;
    }

    const crumbs = $breadcrumb
      .find(CRUMB_SELECTOR)
      .toArray()
      // Keep only leaf matches: in <li><a>Guide</a></li> both the <li> and the
      // <a> match, which would otherwise report "Guide" twice.
      .filter((el: CheerioElement) => $(el).find(CRUMB_SELECTOR).length === 0)
      .map((el: CheerioElement) => $(el).text().trim())
      .filter((text: string) => text.length > 0 && text !== "Home");

    if (crumbs.length > 0) {
      return crumbs;
    }
  }

  return [];
}

/**
 * Find and extract main content area
 *
 * Returns the first element matching a known main-content selector. If none
 * matches, returns a copy of `<body>` with common page chrome removed.
 */
function extractMainContent($: cheerio.CheerioAPI): cheerio.Cheerio {
  // Try common main content selectors
  const mainContentSelectors = [
    "main",
    ".main-content",
    ".content",
    "#content",
    "[role='main']",
    ".article",
    "article",
    ".documentation-content",
    ".doc-content",
  ];

  for (const selector of mainContentSelectors) {
    const $content = $(selector);
    if ($content.length > 0) {
      return $content.first();
    }
  }

  // Fallback: use the body without non-content elements. The stripping is done
  // on a clone so the caller's document is left intact; parseHtml still needs
  // the <nav> elements afterwards to find prev/next links.
  const $body = $("body").clone();
  $body.find(NON_CONTENT_SELECTOR).remove();
  return $body;
}

/**
 * Extract body text from HTML (exported for use in providers)
 */
export function extractBodyText($: cheerio.CheerioAPI | cheerio.Root): string {
  // cheerio.Root is compatible with CheerioAPI for our use case
  const $api = $ as cheerio.CheerioAPI;
  const $mainContent = extractMainContent($api);
  return htmlToText($api, $mainContent);
}

/**
 * Replace an element with literal text.
 *
 * Passing a string straight to `replaceWith` makes cheerio parse it as HTML,
 * so text such as `List<T>` or `<div>` taken from a code sample would be
 * turned back into elements. Setting it through `.text()` keeps it literal.
 */
function replaceWithText($: cheerio.CheerioAPI, $target: cheerio.Cheerio, text: string): void {
  $target.replaceWith($("<span></span>").text(text));
}

/**
 * Convert HTML element to plain text while preserving structure
 *
 * Works on a clone of `$element`. Code, headings and lists are rewritten in a
 * markdown-like form, links are reduced to their text, and paragraphs/divs are
 * separated by line breaks. Runs of spaces/tabs are collapsed to one space
 * (this also applies inside code blocks) and blank lines are limited to one.
 */
function htmlToText($: cheerio.CheerioAPI, $element: cheerio.Cheerio): string {
  // Clone to avoid modifying original
  const $clone = $element.clone();

  // Remove scripts, styles, and other non-content elements
  $clone.find("script, style, noscript, iframe, embed, object").remove();

  // Code goes first so that backticks survive when a heading, list item or
  // paragraph containing the code is flattened to text further down.
  // <pre> blocks (including any <code> inside them) become fenced blocks.
  for (const el of $clone.find("pre").toArray()) {
    const $pre = $(el);
    replaceWithText($, $pre, `\n\`\`\`\n${$pre.text()}\n\`\`\`\n`);
  }

  // Any <code> still attached to the tree is inline code.
  for (const el of $clone.find("code").toArray()) {
    const $code = $(el);
    replaceWithText($, $code, `\`${$code.text()}\``);
  }

  // Convert links to markdown-style (text only for now)
  for (const el of $clone.find("a").toArray()) {
    const $link = $(el);
    replaceWithText($, $link, $link.text().trim());
  }

  // Convert headings to markdown-style
  for (const el of $clone.find("h1, h2, h3, h4, h5, h6").toArray()) {
    const $heading = $(el);
    const tagName = $heading.prop("tagName") || "";
    const level = parseInt(tagName.charAt(1), 10) || 1;
    const headingText = $heading.text().trim();
    replaceWithText($, $heading, `\n\n${"#".repeat(level)} ${headingText}\n\n`);
  }

  // Convert lists to markdown-style. Reverse document order visits nested
  // lists before the list that contains them, so inner items keep their
  // markers instead of being flattened into the parent item's text.
  for (const el of $clone.find("ul, ol").toArray().reverse()) {
    const $list = $(el);
    const isOrdered = $list.is("ol");
    const items = $list
      .children("li")
      .toArray()
      .map((li: CheerioElement, index: number) => {
        const itemText = $(li).text().trim();
        return isOrdered ? `${index + 1}. ${itemText}` : `- ${itemText}`;
      })
      .join("\n");
    replaceWithText($, $list, `\n${items}\n`);
  }

  // Convert paragraphs to have line breaks. Innermost first, otherwise an
  // outer <div> would be flattened before the paragraphs inside it receive
  // their surrounding newlines.
  for (const el of $clone.find("p, div").toArray().reverse()) {
    const $block = $(el);
    const blockText = $block.text().trim();
    if (blockText) {
      replaceWithText($, $block, `\n${blockText}\n`);
    } else {
      $block.remove();
    }
  }

  // Get text and normalize whitespace. Spaces are collapsed and trailing
  // spaces dropped before newlines are collapsed, so lines containing only
  // whitespace do not keep extra blank lines alive.
  return $clone
    .text()
    .replace(/[ \t]+/g, " ") // Normalize spaces
    .replace(/ \n/g, "\n") // Drop trailing space at end of line
    .replace(/\n{3,}/g, "\n\n") // Max 2 consecutive newlines
    .trim();
}

/**
 * Resolve an href against the page URL.
 *
 * Hrefs starting with "http" are returned unchanged. Returns `undefined` when
 * the href cannot be resolved, so a single malformed link does not abort
 * parsing of the whole page.
 */
function resolveLinkUrl(href: string, baseUrl: string): string | undefined {
  if (href.startsWith("http")) {
    return href;
  }
  try {
    return new URL(href, baseUrl).toString();
  } catch {
    return undefined;
  }
}

/**
 * Build a TopicLink of the given type from a link element.
 *
 * Returns `undefined` when the element has no usable href: missing,
 * in-page anchor, `mailto:`, `javascript:`, or unresolvable.
 */
function buildTopicLink(
  $link: cheerio.Cheerio,
  type: TopicLink["type"],
  baseUrl: string
): TopicLink | undefined {
  const href = $link.attr("href");
  if (
    !href ||
    href.startsWith("#") ||
    href.startsWith("mailto:") ||
    href.startsWith("javascript:")
  ) {
    return undefined;
  }

  const url = resolveLinkUrl(href, baseUrl);
  if (url === undefined) {
    return undefined;
  }

  return {
    type,
    target_topic_id: normalizeUrlToTopicId(href, baseUrl),
    title: $link.text().trim() || $link.attr("title") || undefined,
    url,
  };
}

/**
 * Return the link for the first selector whose first match has a usable href.
 */
function findFirstLink(
  $: cheerio.CheerioAPI,
  selectors: readonly string[],
  type: TopicLink["type"],
  baseUrl: string
): TopicLink | undefined {
  for (const selector of selectors) {
    const link = buildTopicLink($(selector).first(), type, baseUrl);
    if (link) {
      return link;
    }
  }
  return undefined;
}

/**
 * Extract navigation links (prev, next, parent, related)
 *
 * At most one link each is returned for prev, next and parent, followed by
 * all related links with distinct URLs.
 */
function extractNavigationLinks(
  $: cheerio.CheerioAPI,
  baseUrl: string
): TopicLink[] {
  const links: TopicLink[] = [];

  const singleLinkSources: Array<{ type: TopicLink["type"]; selectors: string[] }> = [
    // Previous link
    { type: "prev", selectors: ["a[rel='prev']", ".prev", ".previous", "[class*='prev']"] },
    // Next link
    { type: "next", selectors: ["a[rel='next']", ".next", "[class*='next']"] },
    // Parent link
    { type: "parent", selectors: ["a[rel='up']", ".parent", "[class*='parent']"] },
  ];

  for (const { type, selectors } of singleLinkSources) {
    const link = findFirstLink($, selectors, type, baseUrl);
    if (link) {
      links.push(link);
    }
  }

  // Related links
  const relatedSelectors = [
    "a[rel='related']",
    ".related a",
    "[class*='related'] a",
    ".see-also a",
  ];

  // The selectors overlap (".related a" is also matched by
  // "[class*='related'] a"), so remember which URLs were already emitted.
  const seenRelatedUrls = new Set<TopicLink["url"]>();
  for (const selector of relatedSelectors) {
    for (const el of $(selector).toArray()) {
      const link = buildTopicLink($(el), "related", baseUrl);
      if (link && !seenRelatedUrls.has(link.url)) {
        seenRelatedUrls.add(link.url);
        links.push(link);
      }
    }
  }

  return links;
}

/**
 * Normalize a URL to a topic ID
 *
 * For a URL on the same origin as `baseUrl`, the ID is its pathname without
 * leading/trailing slashes and without the base path prefix, if it has one.
 * For a URL on another origin, the ID is the full URL string.
 *
 * @param href - The URL or href to normalize
 * @param baseUrl - The base URL for resolving relative URLs
 * @returns The normalized topic ID
 */
export function normalizeUrlToTopicId(href: string, baseUrl: string): string {
  const stripSlashes = (path: string): string => path.replace(/^\//, "").replace(/\/$/, "");

  try {
    const url = new URL(href, baseUrl);
    const base = new URL(baseUrl);

    if (url.origin !== base.origin) {
      return url.toString();
    }

    let pathname = stripSlashes(url.pathname);

    // Remove base path if present (e.g., "docs/" from "docs/topic1")
    const basePath = stripSlashes(base.pathname);
    if (basePath && pathname.startsWith(basePath + "/")) {
      pathname = pathname.substring(basePath.length + 1);
    }

    // An empty result means the site root; fall back to the raw pathname.
    return pathname || url.pathname;
  } catch {
    // If URL parsing fails, return the href as-is
    return href.replace(/^\//, "");
  }
}

/**
 * Extract section from breadcrumbs or URL
 *
 * Returns the first breadcrumb that is not a generic root label; otherwise
 * the first URL path segment that matches a known section name; otherwise
 * "Unknown".
 */
function extractSection(breadcrumbs: string[], url: string): string {
  // Skip "Home" or "Documentation" if present
  const section = breadcrumbs.find(
    (crumb) => !["Home", "Documentation", "Docs"].includes(crumb)
  );
  if (section) {
    return section;
  }

  // Try to extract from URL path
  try {
    const pathParts = new URL(url).pathname.split("/").filter((p) => p);

    // Common section names in path
    const commonSections = [
      "Language",
      "General Guides",
      "Data Access",
      "Development Tools",
      "Updating",
    ];

    for (const part of pathParts) {
      // "data-access" -> "Data Access"
      const normalized = part.replace(/-/g, " ").replace(/\b\w/g, (l) => l.toUpperCase());
      // NOTE(review): this is a substring test against the section names, so a
      // short segment such as "en" matches "General Guides" and is returned as
      // "En". Confirm whether an exact match was intended.
      if (commonSections.some((s) => s.toLowerCase().includes(normalized.toLowerCase()))) {
        return normalized;
      }
    }
  } catch {
    // Ignore URL parsing errors
  }

  return "Unknown";
}

/**
 * Build a short summary from the start of the body text.
 *
 * Text longer than the limit is cut at the last period if that keeps more
 * than SUMMARY_MIN_SENTENCE_LENGTH characters; otherwise "..." is appended.
 */
function buildSummary(bodyText: string): string {
  const head = bodyText.substring(0, SUMMARY_MAX_LENGTH).trim();
  if (bodyText.length <= SUMMARY_MAX_LENGTH) {
    return head;
  }

  // Try to end at a sentence boundary
  const lastPeriod = head.lastIndexOf(".");
  if (lastPeriod > SUMMARY_MIN_SENTENCE_LENGTH) {
    return head.substring(0, lastPeriod + 1);
  }

  // Fallback: just truncate and add ellipsis
  return head + "...";
}

/**
 * Parse HTML and extract structured Topic information
 *
 * @param html - Raw HTML of the documentation page
 * @param options - Page URL, optional version (defaults to "latest") and source
 * @returns The topic with an empty `body_chunks` array (filled by the chunker)
 * @throws Error carrying the `parseError` payload in a `payload` property when
 *   any extraction step fails
 */
export function parseHtml(html: string, options: ParseOptions): Topic {
  try {
    logger.logParsing("Parsing HTML", { url: options.url });

    const $: cheerio.CheerioAPI = cheerio.load(html) as cheerio.CheerioAPI;

    // Extract title
    const title = extractTitle($);
    logger.debug("Extracted title", { title, url: options.url });

    // Extract breadcrumbs
    const breadcrumbs = extractBreadcrumbs($);
    logger.debug("Extracted breadcrumbs", { breadcrumbs, url: options.url });

    // Extract section
    const section = extractSection(breadcrumbs, options.url);

    // Extract main content
    const $mainContent = extractMainContent($);
    const bodyText = htmlToText($, $mainContent);

    // Generate summary (first 200 characters of body)
    const summary = buildSummary(bodyText);

    // Extract navigation links
    const links = extractNavigationLinks($, options.url);
    logger.debug("Extracted navigation links", {
      count: links.length,
      types: links.map((l) => l.type),
      url: options.url,
    });

    // Normalize topic ID from URL
    const topicId = normalizeUrlToTopicId(options.url, options.url);

    // Create topic (chunking will be done separately)
    const topic: Topic = {
      id: topicId,
      version: options.version ?? "latest",
      title,
      section,
      path: breadcrumbs,
      summary,
      body_chunks: [], // Will be populated by chunker
      links,
      url: options.url,
      source: options.source,
    };

    logger.debug("Parsed HTML successfully", {
      topic_id: topicId,
      title,
      section,
      breadcrumb_count: breadcrumbs.length,
      link_count: links.length,
    });

    return topic;
  } catch (error) {
    const errorPayload = parseError(
      "HTML parsing",
      error instanceof Error ? error.message : String(error),
      {
        url: options.url,
        version: options.version,
      }
    );
    // Attach the structured payload so callers can inspect it on the Error.
    const err = new Error(errorPayload.message) as Error & { payload: typeof errorPayload };
    err.payload = errorPayload;
    throw err;
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/h0ck3ystyx/synergyde-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

html-parser.ts•12 KiB