superFetch MCP Server

markdown-cleanup.ts•17.8 KiB

import type { MetadataBlock } from '../transform/types.js';
import { config } from './config.js';
import { FetchError } from './errors.js';

// --- Constants & Regex ---

/** Lines longer than this are never considered heading candidates. */
const MAX_LINE_LENGTH = 80;

/**
 * Shared patterns used by the cleanup passes.
 *
 * Patterns carrying the `g` flag are only ever passed to `String#replace`;
 * never call `.test()` / `.exec()` on them, because `lastIndex` state would
 * leak between calls.
 */
const REGEX = {
  HEADING_MARKER: /^#{1,6}\s/m,
  HEADING_STRICT: /^#{1,6}\s+/m,
  EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
  FENCE_START: /^\s*(`{3,}|~{3,})/,
  LIST_MARKER: /^(?:[-*+])\s/m,
  // A list item that is only an in-page link: `- [text](#anchor)`.
  TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
  TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents)\s*$/i,
  HTML_DOC_START: /^(<!doctype|<html)/i,
  // Whole lines that are skip links or "Was this page helpful?" prompts.
  COMBINED_LINE_REMOVALS:
    /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??)\s*$/gim,
  // Links with empty / zero-width text pointing at an in-page anchor.
  ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g,
  CONCATENATED_PROPS:
    /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g,
  DOUBLE_NEWLINE_REDUCER: /\n{3,}/g,
  SOURCE_KEY: /^source:\s/im,
  HEADING_SPACING: /(^#{1,6}\s[^\n]*)\n([^\n])/gm,
  HEADING_CODE_BLOCK: /(^#{1,6}\s+\w+)```/gm,
  // Two links glued together: `](url)[`.
  SPACING_LINK_FIX: /\]\(([^)]+)\)\[/g,
  // A link or inline code span immediately followed by an alphanumeric.
  SPACING_ADJ_COMBINED: /(?:\]\([^)]+\)|`[^`]+`)(?=[A-Za-z0-9])/g,
  SPACING_CODE_DASH: /(`[^`]+`)\s*\\-\s*/g,
  SPACING_ESCAPES: /\\([[\].])/g,
  // A non-list, non-indented line directly followed by a list item.
  SPACING_LIST_NUM_COMBINED:
    /^((?![-*+] |\d+\. |[ \t]).+)\n((?:[-*+]|\d+\.) )/gm,
  NESTED_LIST_INDENT: /^( +)((?:[-*+])|\d+\.)\s/gm,
  // First alternative matches inline code spans (kept by the caller), second
  // matches block comments, optionally with backslash-escaped asterisks.
  TYPEDOC_COMMENT: /(`+)(?:(?!\1)[\s\S])*?\1|\s?\/\\?\*[\s\S]*?\\?\*\//g,
} as const;

const HEADING_KEYWORDS = new Set(
  config.markdownCleanup.headingKeywords.map((value) =>
    value.toLocaleLowerCase(config.i18n.locale)
  )
);

const SPECIAL_PREFIXES =
  /^(?:example|note|tip|warning|important|caution):\s+\S/i;

// Hoisted so the heading heuristics do not rebuild regexes per line/word.
const EXAMPLE_PREFIX = /^example:\s/i;
const ORDERED_LIST_MARKER = /^\d+\.\s/;
const LINK_ONLY_LINE = /^\[.*\]\(.*\)$/;
const STARTS_UPPERCASE = /^[A-Z]/;
const CAPITALIZED_WORD = /^[A-Z][a-z]*$/;
const MINOR_WORD = /^(?:and|or|the|of|in|for|to|a)$/i;

const TOC_SCAN_LIMIT = 20;
const TOC_MAX_NON_EMPTY = 12;
const TOC_LINK_RATIO_THRESHOLD = 0.8;

const TYPEDOC_PREFIXES = [
  'Defined in:',
  'Returns:',
  'Since:',
  'See also:',
] as const;

interface CleanupOptions {
  signal?: AbortSignal;
  url?: string;
}

/**
 * Throws a `FetchError` when `signal` has been aborted; otherwise a no-op.
 *
 * @param signal - Optional abort signal to inspect.
 * @param url - URL forwarded to the error.
 * @param stage - Name of the processing stage, attached to the error details.
 * @throws FetchError with code 499 and `reason: 'aborted'` when aborted.
 */
function throwIfAborted(
  signal: AbortSignal | undefined,
  url: string,
  stage: string
): void {
  if (!signal?.aborted) return;
  throw new FetchError('Request was canceled', url, 499, {
    reason: 'aborted',
    stage,
  });
}

/**
 * Binds the signal and URL from `options` into a `(stage) => void` checker so
 * call sites only have to name the stage.
 */
function createAbortChecker(options?: CleanupOptions): (stage: string) => void {
  const signal = options?.signal;
  const url = options?.url ?? '';
  return (stage: string): void => {
    throwIfAborted(signal, url, stage);
  };
}

// --- Helper Functions ---

/** Returns `'\r\n'` if the content contains any CRLF, otherwise `'\n'`. */
function getLineEnding(content: string): '\n' | '\r\n' {
  return content.includes('\r\n') ? '\r\n' : '\n';
}

/** True for `undefined` and for lines that are empty after trimming. */
function isBlank(line: string | undefined): boolean {
  return line === undefined || line.trim().length === 0;
}

/**
 * Reports whether a non-blank line exists after `startIndex`.
 * The lookahead is bounded (lines up to `startIndex + 49`) so huge inputs
 * are not scanned to the end.
 */
function hasFollowingContent(lines: string[], startIndex: number): boolean {
  const limit = Math.min(lines.length, startIndex + 50);
  for (let i = startIndex + 1; i < limit; i++) {
    if (!isBlank(lines[i])) return true;
  }
  return false;
}

/**
 * Heuristic: does `trimmed` look like a heading?
 *
 * - A single word must start with an ASCII capital and be one of the
 *   configured heading keywords.
 * - Otherwise it must be 2-6 words, each either capitalized (`Word`) or a
 *   minor word (and/or/the/of/in/for/to/a), with at least two capitalized.
 */
function isTitleCaseOrKeyword(trimmed: string): boolean {
  if (trimmed.length > MAX_LINE_LENGTH) return false;

  if (!trimmed.includes(' ')) {
    if (!STARTS_UPPERCASE.test(trimmed)) return false;
    return HEADING_KEYWORDS.has(trimmed.toLocaleLowerCase(config.i18n.locale));
  }

  const words = trimmed.split(/\s+/);
  if (words.length < 2 || words.length > 6) return false;

  let capitalizedCount = 0;
  for (const word of words) {
    if (!word) continue;
    if (CAPITALIZED_WORD.test(word)) {
      capitalizedCount++;
    } else if (!MINOR_WORD.test(word)) {
      return false;
    }
  }
  return capitalizedCount >= 2;
}

/**
 * Decides which heading prefix (if any) an orphan line should receive.
 *
 * @returns `'### '` for `Example:` lines, `'## '` for other special prefixes
 * and title-case/keyword lines, or `null` when the line should stay as is
 * (already markdown structure, too long, or ends with `.`, `!` or `?`).
 */
function getHeadingPrefix(trimmed: string): string | null {
  if (trimmed.length > MAX_LINE_LENGTH) return null;

  // Fast path: only run the structural regexes when the first character can
  // start a heading (#), a list (-, *, +, digit) or a link ([).
  const firstChar = trimmed.charCodeAt(0);
  const mayBeStructural =
    firstChar === 35 || // #
    firstChar === 45 || // -
    firstChar === 42 || // *
    firstChar === 43 || // +
    firstChar === 91 || // [
    (firstChar >= 48 && firstChar <= 57); // 0-9

  if (
    mayBeStructural &&
    (REGEX.HEADING_MARKER.test(trimmed) ||
      REGEX.LIST_MARKER.test(trimmed) ||
      ORDERED_LIST_MARKER.test(trimmed) ||
      LINK_ONLY_LINE.test(trimmed))
  ) {
    return null;
  }

  if (SPECIAL_PREFIXES.test(trimmed)) {
    return EXAMPLE_PREFIX.test(trimmed) ? '### ' : '## ';
  }

  // Sentences (ending in '.', '!' or '?') are not headings.
  const lastChar = trimmed.charCodeAt(trimmed.length - 1);
  if (lastChar === 46 || lastChar === 33 || lastChar === 63) return null;

  return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
}

/**
 * Counts link and non-link lines following a candidate TOC heading.
 * Scanning stops at the next heading, after `TOC_MAX_NON_EMPTY` non-empty
 * lines, or once the `TOC_SCAN_LIMIT` window is exhausted.
 */
function getTocBlockStats(
  lines: string[],
  headingIndex: number
): { total: number; linkCount: number; nonLinkCount: number } {
  let total = 0;
  let linkCount = 0;
  let nonLinkCount = 0;
  const lookaheadMax = Math.min(lines.length, headingIndex + TOC_SCAN_LIMIT);

  for (let i = headingIndex + 1; i < lookaheadMax; i++) {
    const trimmed = lines[i]?.trim();
    if (!trimmed) continue;
    if (REGEX.HEADING_MARKER.test(trimmed)) break;

    total += 1;
    if (REGEX.TOC_LINK.test(trimmed)) linkCount += 1;
    else nonLinkCount += 1;

    if (total >= TOC_MAX_NON_EMPTY) break;
  }

  return { total, linkCount, nonLinkCount };
}

/**
 * Returns the index of the first non-blank line at or after `startIndex`
 * that is not a TOC link, or `lines.length` if there is none.
 */
function skipTocLines(lines: string[], startIndex: number): number {
  for (let i = startIndex; i < lines.length; i++) {
    const trimmed = lines[i]?.trim();
    if (!trimmed) continue;
    if (!REGEX.TOC_LINK.test(trimmed)) return i;
  }
  return lines.length;
}

/**
 * True when the line starts with one of `TYPEDOC_PREFIXES` and the remainder
 * begins with a bold inline-code opener and also contains the matching
 * bold inline-code closer.
 */
function isTypeDocArtifactLine(line: string): boolean {
  const trimmed = line.trim();
  const prefix = TYPEDOC_PREFIXES.find((candidate) =>
    trimmed.startsWith(candidate)
  );
  if (prefix === undefined) return false;

  const rest = trimmed.slice(prefix.length).trimStart();
  return rest.startsWith('**`') && rest.includes('`**');
}

// --- Main Processing Logic ---

/**
 * Promotes a heading-like line to a markdown heading when it is an "orphan"
 * (first line, or preceded by a blank line).
 *
 * Lines without a special prefix are only promoted when non-blank content
 * follows them.
 *
 * @returns The rewritten line, or `null` when the line is left untouched.
 */
function tryPromoteOrphan(
  lines: string[],
  i: number,
  trimmed: string
): string | null {
  const isOrphan = i === 0 || isBlank(lines[i - 1]);
  if (!isOrphan) return null;

  const prefix = getHeadingPrefix(trimmed);
  if (!prefix) return null;

  if (!SPECIAL_PREFIXES.test(trimmed) && !hasFollowingContent(lines, i)) {
    return null;
  }

  return `${prefix}${trimmed}`;
}

/**
 * Checks whether line `i` is a "Table of contents" heading followed by a
 * block consisting only of in-page link items.
 *
 * @returns The index at which normal processing should resume, or `null`
 * when the line is not a removable TOC heading.
 * @throws FetchError when the request has been aborted.
 */
function shouldSkipAsToc(
  lines: string[],
  i: number,
  trimmed: string,
  removeToc: boolean,
  options?: CleanupOptions
): number | null {
  if (!removeToc || !REGEX.TOC_HEADING.test(trimmed)) return null;

  const { total, linkCount, nonLinkCount } = getTocBlockStats(lines, i);
  if (total === 0 || nonLinkCount > 0) return null;

  const ratio = linkCount / total;
  if (ratio <= TOC_LINK_RATIO_THRESHOLD) return null;

  throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:toc');
  return skipTocLines(lines, i + 1);
}

/**
 * Line-level pass over a non-fenced text block: drops empty heading lines,
 * removes TOC blocks and promotes orphan headings, as enabled in `config`.
 */
function preprocessLines(lines: string[], options?: CleanupOptions): string {
  const processedLines: string[] = [];
  const promote = config.markdownCleanup.promoteOrphanHeadings;
  const removeToc = config.markdownCleanup.removeTocBlocks;
  const checkAbort = createAbortChecker(options);

  let skipUntil = -1;

  for (let i = 0; i < lines.length; i++) {
    if (i < skipUntil) continue;

    let line = lines[i];
    if (line === undefined) continue;

    const trimmed = line.trim();
    if (REGEX.EMPTY_HEADING_LINE.test(trimmed)) continue;

    const tocSkip = shouldSkipAsToc(lines, i, trimmed, removeToc, options);
    if (tocSkip !== null) {
      skipUntil = tocSkip;
      continue;
    }

    if (promote && trimmed.length > 0) {
      checkAbort('markdown:cleanup:promote');
      const promoted = tryPromoteOrphan(lines, i, trimmed);
      if (promoted) line = promoted;
    }

    processedLines.push(line);
  }

  return processedLines.join('\n');
}

/** Cleans one block of non-fence lines (line pass, then regex passes). */
function processTextBuffer(lines: string[], options?: CleanupOptions): string {
  if (lines.length === 0) return '';
  const text = preprocessLines(lines, options);
  return applyGlobalRegexes(text, options);
}

/**
 * Applies the text-level regex passes in a fixed order: heading spacing,
 * TypeDoc artifacts, skip links, spacing normalization, nested list
 * indentation and concatenated property splitting.
 *
 * @throws FetchError when the request is aborted between passes.
 */
function applyGlobalRegexes(text: string, options?: CleanupOptions): string {
  let result = text;
  const checkAbort = createAbortChecker(options);

  checkAbort('markdown:cleanup:headings');
  result = result
    .replace(REGEX.HEADING_SPACING, '$1\n\n$2')
    .replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');

  if (config.markdownCleanup.removeTypeDocComments) {
    checkAbort('markdown:cleanup:typedoc');
    result = result
      .split('\n')
      .filter((line) => !isTypeDocArtifactLine(line))
      .join('\n');
    // Inline code spans are matched only so they can be preserved verbatim;
    // everything else the pattern matches is a comment and is dropped.
    result = result.replace(REGEX.TYPEDOC_COMMENT, (match) =>
      match.startsWith('`') ? match : ''
    );
  }

  if (config.markdownCleanup.removeSkipLinks) {
    checkAbort('markdown:cleanup:skip-links');
    result = result
      .replace(REGEX.ZERO_WIDTH_ANCHOR, '')
      .replace(REGEX.COMBINED_LINE_REMOVALS, '');
  }

  checkAbort('markdown:cleanup:spacing');
  result = result
    .replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
    .replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
    .replace(REGEX.SPACING_CODE_DASH, '$1 - ')
    .replace(REGEX.SPACING_ESCAPES, '$1')
    .replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
    .replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');

  result = normalizeNestedListIndentation(result);

  checkAbort('markdown:cleanup:properties');
  // A single pass cannot split overlapping matches, so repeat up to three
  // times or until the text stops changing.
  for (let k = 0; k < 3; k++) {
    const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
    if (next === result) break;
    result = next;
  }

  return result;
}

/**
 * Rewrites list items indented by an even number (>= 2) of spaces so that
 * every two spaces of indentation become four. Other indents are untouched.
 */
function normalizeNestedListIndentation(text: string): string {
  return text.replace(
    REGEX.NESTED_LIST_INDENT,
    (match: string, spaces: string, marker: string): string => {
      const count = spaces.length;
      if (count < 2 || count % 2 !== 0) return match;
      const normalized = ' '.repeat((count / 2) * 4);
      return `${normalized}${marker} `;
    }
  );
}

/**
 * True when `line` closes a fence opened with `marker`: after optional
 * leading whitespace it consists of at least `marker.length` fence characters
 * and then nothing but whitespace. Closing fences longer than the opening
 * one are accepted.
 */
function isClosingFence(line: string, marker: string): boolean {
  const trimmed = line.trimStart();
  if (!trimmed.startsWith(marker)) return false;

  const fenceChar = marker.charAt(0);
  let end = marker.length;
  while (trimmed.charAt(end) === fenceChar) end++;

  return trimmed.slice(end).trim() === '';
}

/**
 * Cleans common conversion artifacts from markdown.
 *
 * The content is split into lines (both LF and CRLF are accepted). Lines
 * inside fenced code blocks are passed through verbatim; every run of lines
 * outside fences is cleaned as one block. Segments are re-joined with `\n`
 * and the result is trimmed.
 *
 * @param content - Raw markdown.
 * @param options - Optional abort signal and URL used for abort errors.
 * @returns The cleaned markdown, or `''` for empty input.
 * @throws FetchError when `options.signal` is aborted during processing.
 */
export function cleanupMarkdownArtifacts(
  content: string,
  options?: CleanupOptions
): string {
  if (!content) return '';

  const checkAbort = createAbortChecker(options);
  checkAbort('markdown:cleanup:begin');

  const len = content.length;
  const segments: string[] = [];
  let buffer: string[] = [];
  let fenceMarker: string | null = null;
  let lastIndex = 0;

  const flushBuffer = (): void => {
    if (buffer.length === 0) return;
    segments.push(processTextBuffer(buffer, options));
    buffer = [];
  };

  while (lastIndex < len) {
    let nextIndex = content.indexOf('\n', lastIndex);
    let line: string;

    if (nextIndex === -1) {
      line = content.slice(lastIndex);
      nextIndex = len;
    } else {
      // Drop the '\r' of a CRLF pair so lines never carry a line terminator.
      const hasCarriageReturn =
        nextIndex > lastIndex && content.charCodeAt(nextIndex - 1) === 13;
      line = content.slice(
        lastIndex,
        hasCarriageReturn ? nextIndex - 1 : nextIndex
      );
      nextIndex++; // Skip \n
    }

    if (fenceMarker) {
      segments.push(line);
      if (isClosingFence(line, fenceMarker)) fenceMarker = null;
    } else {
      const match = REGEX.FENCE_START.exec(line);
      const newMarker = match ? (match[1] ?? '```') : null;
      if (newMarker) {
        flushBuffer();
        segments.push(line);
        fenceMarker = newMarker;
      } else {
        buffer.push(line);
      }
    }

    lastIndex = nextIndex;
  }

  flushBuffer();

  return segments.join('\n').trim();
}

// --- Frontmatter & Metadata Utilities ---

/** Offsets describing a leading `---` frontmatter block inside a string. */
interface FrontmatterRange {
  /** Always 0: frontmatter is only detected at the very start. */
  start: number;
  /** Offset just past the closing fence (and its line ending, if present). */
  end: number;
  /** Offset of the first character of the frontmatter body. */
  linesStart: number;
  /** Offset of the closing fence (exclusive end of the body). */
  linesEnd: number;
  lineEnding: '\n' | '\r\n';
}

/**
 * Finds the closing `---` fence: it must start a line and be followed by
 * `lineEnding` or by the end of the content.
 *
 * @param from - Offset to start searching at; must be greater than 0.
 * @returns Offset of the fence, or -1 when none exists.
 */
function findClosingFrontmatterFence(
  content: string,
  from: number,
  lineEnding: '\n' | '\r\n'
): number {
  const marker = '---';
  let idx = content.indexOf(marker, from);

  while (idx !== -1) {
    const afterMarker = idx + marker.length;
    const startsLine = content.charCodeAt(idx - 1) === 10;
    const endsLine =
      afterMarker === content.length ||
      content.startsWith(lineEnding, afterMarker);
    if (startsLine && endsLine) return idx;
    idx = content.indexOf(marker, idx + 1);
  }

  return -1;
}

/**
 * Detects a frontmatter block that opens on the first line with `---`.
 *
 * The line ending of the opening fence (LF or CRLF) determines the line
 * ending expected after the closing fence. A closing fence at the very end of
 * the content (no trailing line ending) is accepted as well.
 *
 * @returns The block's offsets, or `null` when there is no frontmatter.
 */
function detectFrontmatter(content: string): FrontmatterRange | null {
  let lineEnding: '\n' | '\r\n';
  if (content.startsWith('---\n')) {
    lineEnding = '\n';
  } else if (content.startsWith('---\r\n')) {
    lineEnding = '\r\n';
  } else {
    return null;
  }

  const fenceLen = 3 + lineEnding.length;
  const closeIndex = findClosingFrontmatterFence(content, fenceLen, lineEnding);
  if (closeIndex === -1) return null;

  return {
    start: 0,
    // Clamp for a closing fence that ends the content without a line ending.
    end: Math.min(closeIndex + fenceLen, content.length),
    linesStart: fenceLen,
    linesEnd: closeIndex,
    lineEnding,
  };
}

/**
 * Splits a `key: value` frontmatter line at the first colon.
 *
 * @returns Lower-cased key and trimmed value, or `null` for blank lines and
 * lines without a key before the colon.
 */
function parseFrontmatterEntry(
  line: string
): { key: string; value: string } | null {
  const trimmed = line.trim();
  const idx = trimmed.indexOf(':');
  if (!trimmed || idx <= 0) return null;
  return {
    key: trimmed.slice(0, idx).trim().toLowerCase(),
    value: trimmed.slice(idx + 1).trim(),
  };
}

/** Removes one pair of matching surrounding single or double quotes. */
function stripFrontmatterQuotes(val: string): string {
  const first = val.charAt(0);
  const last = val.charAt(val.length - 1);
  if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
    return val.slice(1, -1).trim();
  }
  return val;
}

/**
 * Returns the first non-empty `title` or `name` value in the frontmatter
 * body, with surrounding quotes removed.
 */
function scanFrontmatterForTitle(
  content: string,
  fm: FrontmatterRange
): string | undefined {
  const fmBody = content.slice(fm.linesStart, fm.linesEnd);
  let lastIdx = 0;

  while (lastIdx < fmBody.length) {
    let nextIdx = fmBody.indexOf(fm.lineEnding, lastIdx);
    if (nextIdx === -1) nextIdx = fmBody.length;

    const entry = parseFrontmatterEntry(fmBody.slice(lastIdx, nextIdx));
    if (entry && (entry.key === 'title' || entry.key === 'name')) {
      const cleaned = stripFrontmatterQuotes(entry.value);
      if (cleaned) return cleaned;
    }

    lastIdx = nextIdx + fm.lineEnding.length;
  }

  return undefined;
}

/**
 * Returns the text of the first non-blank line if that line is a markdown
 * heading. Only lines starting within the first 5000 characters are examined.
 */
function scanBodyForTitle(content: string): string | undefined {
  const len = content.length;
  const LIMIT = 5000;
  const maxScan = Math.min(len, LIMIT);
  let scanIndex = 0;

  while (scanIndex < maxScan) {
    let nextIndex = content.indexOf('\n', scanIndex);
    if (nextIndex === -1) nextIndex = len;

    let line = content.slice(scanIndex, nextIndex);
    if (line.endsWith('\r')) line = line.slice(0, -1);

    const trimmed = line.trim();
    if (trimmed) {
      // Only the first non-blank line counts; anything else means no title.
      if (!REGEX.HEADING_STRICT.test(trimmed)) return undefined;
      return trimmed.replace(REGEX.HEADING_MARKER, '').trim() || undefined;
    }

    scanIndex = nextIndex + 1;
  }

  return undefined;
}

/**
 * Extracts a document title from raw markdown: frontmatter `title` / `name`
 * first, then a heading on the first non-blank line.
 */
export function extractTitleFromRawMarkdown(
  content: string
): string | undefined {
  const fm = detectFrontmatter(content);
  if (fm) {
    const title = scanFrontmatterForTitle(content, fm);
    if (title) return title;
  }
  return scanBodyForTitle(content);
}

/**
 * Escapes a value for use inside a YAML double-quoted scalar. Backslashes are
 * escaped first so a trailing backslash cannot escape the closing quote.
 */
function escapeYamlDoubleQuoted(value: string): string {
  return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
}

/**
 * Records the source URL in a markdown document.
 *
 * - Existing frontmatter: a `source: "..."` entry is added at the end of the
 *   frontmatter body unless a `source:` key is already present.
 * - No frontmatter, `config.transform.metadataFormat === 'markdown'`: a
 *   `Source: <url>` line is inserted after the first heading (or prepended
 *   when there is no heading), unless a `source:` line already exists.
 * - No frontmatter otherwise: a new frontmatter block is prepended.
 *
 * @param content - Markdown document.
 * @param url - Source URL to record.
 * @returns The document with the source recorded.
 */
export function addSourceToMarkdown(content: string, url: string): string {
  const fm = detectFrontmatter(content);

  if (fm) {
    const fmBody = content.slice(fm.linesStart, fm.linesEnd);
    if (REGEX.SOURCE_KEY.test(fmBody)) return content;

    const injection = `source: "${escapeYamlDoubleQuoted(url)}"${fm.lineEnding}`;
    return (
      content.slice(0, fm.linesEnd) + injection + content.slice(fm.linesEnd)
    );
  }

  const lineEnding = getLineEnding(content);

  if (config.transform.metadataFormat !== 'markdown') {
    return `---${lineEnding}source: "${escapeYamlDoubleQuoted(url)}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
  }

  if (REGEX.SOURCE_KEY.test(content)) return content;

  const headingMatch = REGEX.HEADING_MARKER.exec(content);
  if (!headingMatch) {
    return `Source: ${url}${lineEnding}${lineEnding}${content}`;
  }

  const lineEndIndex = content.indexOf(lineEnding, headingMatch.index);
  if (lineEndIndex === -1) {
    // The heading is the last line and has no terminator: add one so the
    // Source line is still separated from the heading by a blank line.
    return `${content}${lineEnding}${lineEnding}Source: ${url}${lineEnding}`;
  }

  const insertPos = lineEndIndex + lineEnding.length;
  const injection = `${lineEnding}Source: ${url}${lineEnding}`;
  return content.slice(0, insertPos) + injection + content.slice(insertPos);
}

/**
 * Counts occurrences of common HTML tags, stopping as soon as the count
 * exceeds `limit` (so the result is at most `limit + 1`).
 */
function countCommonTags(content: string, limit: number): number {
  if (limit <= 0) return 0;

  // Local instance: the `g` flag makes exec() stateful via lastIndex.
  const regex = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
  let count = 0;
  while (regex.exec(content)) {
    count += 1;
    if (count > limit) break;
  }
  return count;
}

/**
 * Heuristic: is `content` already markdown/plain text rather than HTML?
 *
 * HTML documents and content with more than five common HTML tags are
 * rejected; content with frontmatter, a heading, a list item or a code fence
 * is accepted.
 */
export function isRawTextContent(content: string): boolean {
  const trimmed = content.trim();
  if (REGEX.HTML_DOC_START.test(trimmed)) return false;
  if (detectFrontmatter(trimmed) !== null) return true;

  const tagCount = countCommonTags(content, 5);
  if (tagCount > 5) return false;

  return (
    REGEX.HEADING_MARKER.test(content) ||
    REGEX.LIST_MARKER.test(content) ||
    content.includes('```')
  );
}

/**
 * Formats a date string as a numeric day/month/year date in the configured
 * locale. Values that cannot be parsed as a date are returned unchanged.
 */
function formatFetchedAt(value: string): string {
  const date = new Date(value);
  if (Number.isNaN(date.getTime())) return value;

  const formatter = new Intl.DateTimeFormat(config.i18n.locale, {
    day: '2-digit',
    month: '2-digit',
    year: 'numeric',
  });
  return formatter.format(date);
}

/**
 * Builds a markdown footer from page metadata: a `---` rule followed by a
 * line of italic title / author / source link / fetch date joined with
 * ` | `, and an optional `<sub>` description line.
 *
 * @param metadata - Metadata to render; when omitted, `''` is returned.
 * @param fallbackUrl - Used for the source link when `metadata.url` is empty.
 */
export function buildMetadataFooter(
  metadata?: MetadataBlock,
  fallbackUrl?: string
): string {
  if (!metadata) return '';

  const lines: string[] = ['---', ''];
  const url = metadata.url || fallbackUrl;
  const parts: string[] = [];

  if (metadata.title) parts.push(`_${metadata.title}_`);
  if (metadata.author) parts.push(`_${metadata.author}_`);
  if (url) parts.push(`[_Original Source_](${url})`);
  if (metadata.fetchedAt) {
    parts.push(`_${formatFetchedAt(metadata.fetchedAt)}_`);
  }

  if (parts.length > 0) lines.push(` ${parts.join(' | ')}`);
  if (metadata.description) lines.push(` <sub>${metadata.description}</sub>`);

  return lines.join('\n');
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/j0hanz/super-fetch-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

markdown-cleanup.ts•17.8 KiB