BerryRAG

readability_extractor.cjs•5.87 KiB

#!/usr/bin/env node

/**
 * Readability Content Extractor
 * Uses Mozilla's Readability library to extract clean content from HTML
 */

const { Readability, isProbablyReaderable } = require("@mozilla/readability")
const { JSDOM } = require("jsdom")

// DOM nodeType values used while walking the extracted article.
const ELEMENT_NODE = 1
const TEXT_NODE = 3

/**
 * Extract the main article from an HTML string.
 *
 * Never throws: any exception raised while building the DOM or parsing is
 * caught and reported through the returned object.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} [url="https://example.com"] - URL handed to JSDOM as the
 *   document URL and echoed back in the metadata.
 * @returns {{success: boolean, readable: boolean, error?: string,
 *   article?: object, metadata?: object}} On success, `article` holds the
 *   Readability result (plus `textContent` in a markdown-like format) and
 *   `metadata` holds the values scraped from the page. On failure, `error`
 *   describes what went wrong.
 */
function extractContent(html, url = "https://example.com") {
  try {
    const dom = new JSDOM(html, { url })
    const document = dom.window.document

    // Cheap heuristic check before running the full extraction.
    if (!isProbablyReaderable(document)) {
      return {
        success: false,
        error: "Content is not readable",
        readable: false,
      }
    }

    // Collect metadata BEFORE parsing: Readability's parse() modifies the
    // document it is given (see the library README), so selectors such as
    // ".byline", ".date" or "h1" may no longer match afterwards.
    const metadata = extractMetadata(document, url)

    const reader = new Readability(document, {
      debug: false,
      charThreshold: 100,
      classesToPreserve: ["highlight", "code", "pre"],
      keepClasses: false,
      serializer: (el) => el.innerHTML,
    })

    const article = reader.parse()
    if (!article) {
      return {
        success: false,
        error: "Failed to parse article",
        readable: false,
      }
    }

    // Convert HTML content to markdown-like format
    const textContent = convertToMarkdown(article.content)

    return {
      success: true,
      readable: true,
      article: {
        title: article.title || metadata.title || "Untitled",
        content: article.content,
        textContent,
        length: article.length,
        excerpt: article.excerpt || metadata.description || "",
        byline: article.byline || metadata.author || "",
        dir: article.dir || "ltr",
        siteName: article.siteName || metadata.siteName || "",
        lang: article.lang || metadata.lang || "en",
        publishedTime: article.publishedTime || metadata.publishedTime || null,
      },
      metadata,
    }
  } catch (error) {
    return {
      success: false,
      error: error.message,
      readable: false,
    }
  }
}

/**
 * Scrape common metadata fields from a document.
 *
 * For every field the selectors are tried in order and the first element
 * yielding a non-empty value wins. The value is read from the `content`
 * attribute, then `datetime`, then (for the `lang` field only) `lang`, and
 * finally the element's trimmed text.
 *
 * @param {Document} document - DOM document to inspect.
 * @param {string} url - Stored verbatim as `metadata.url`.
 * @returns {object} Object with `url` plus whichever of `title`,
 *   `description`, `author`, `siteName`, `publishedTime`, `lang` were found.
 */
function extractMetadata(document, url) {
  const metadata = { url }

  const metaSelectors = {
    title: ["title", "h1", '[property="og:title"]', '[name="twitter:title"]'],
    description: [
      '[name="description"]',
      '[property="og:description"]',
      '[name="twitter:description"]',
    ],
    author: [
      '[name="author"]',
      '[rel="author"]',
      ".author",
      ".byline",
      '[property="article:author"]',
    ],
    siteName: ['[property="og:site_name"]', '[name="application-name"]'],
    publishedTime: [
      '[property="article:published_time"]',
      '[name="date"]',
      "time[datetime]",
      ".date",
    ],
    lang: ["html[lang]", "[lang]"],
  }

  for (const [key, selectors] of Object.entries(metaSelectors)) {
    for (const selector of selectors) {
      const element = document.querySelector(selector)
      if (!element) continue

      let value = null
      if (element.hasAttribute("content")) {
        value = element.getAttribute("content")
      } else if (element.hasAttribute("datetime")) {
        value = element.getAttribute("datetime")
      } else if (key === "lang" && element.hasAttribute("lang")) {
        value = element.getAttribute("lang")
      } else {
        value = element.textContent?.trim()
      }

      // An empty value falls through to the next selector for this field.
      if (value) {
        metadata[key] = value
        break
      }
    }
  }

  return metadata
}

/**
 * Convert an HTML fragment into markdown-like plain text.
 *
 * @param {string} html - HTML fragment (e.g. Readability's `article.content`).
 * @returns {string} Markdown-like text; "" when `html` is empty or missing.
 */
function convertToMarkdown(html) {
  if (!html) return ""

  // Create a temporary DOM to process the HTML
  const dom = new JSDOM(html)
  const document = dom.window.document

  /**
   * Recursively render one node.
   * `listMarker` is the prefix used if this node is an <li>; the parent <ol>
   * passes "1.", "2.", ... and everything else gets the bullet default.
   */
  function processNode(node, listMarker = "•") {
    if (node.nodeType === TEXT_NODE) {
      return node.textContent
    }
    if (node.nodeType !== ELEMENT_NODE) return ""

    const tagName = node.tagName.toLowerCase()
    const isOrderedList = tagName === "ol"

    // Render children first; an ordered list numbers its direct <li> children.
    let content = ""
    let itemNumber = 0
    for (const child of node.childNodes) {
      const isListItem =
        child.nodeType === ELEMENT_NODE &&
        child.tagName.toLowerCase() === "li"
      if (isOrderedList && isListItem) {
        itemNumber += 1
        content += processNode(child, `${itemNumber}.`)
      } else {
        content += processNode(child)
      }
    }

    // h1..h6 -> the matching number of leading "#".
    const heading = /^h([1-6])$/.exec(tagName)
    if (heading) {
      return `\n${"#".repeat(Number(heading[1]))} ${content.trim()}\n\n`
    }

    switch (tagName) {
      case "p":
        return `${content.trim()}\n\n`
      case "br":
        return "\n"
      case "strong":
      case "b":
        return `**${content}**`
      case "em":
      case "i":
        return `*${content}*`
      case "code":
        // Inside <pre> the surrounding fence already marks the text as code;
        // adding inline backticks would corrupt the fenced block.
        return node.closest("pre") ? content : `\`${content}\``
      case "pre":
        return `\n\`\`\`\n${content}\n\`\`\`\n\n`
      case "blockquote":
        return `\n> ${content.trim()}\n\n`
      case "ul":
      case "ol":
        return `\n${content}\n`
      case "li":
        return `${listMarker} ${content.trim()}\n`
      case "a": {
        const href = node.getAttribute("href")
        return href ? `[${content}](${href})` : content
      }
      case "img": {
        const src = node.getAttribute("src")
        const alt = node.getAttribute("alt") || "Image"
        return src ? `![${alt}](${src})` : ""
      }
      default:
        return content
    }
  }

  const rendered = processNode(document.body || document)

  // Clean up excessive whitespace
  return rendered.replace(/\n{3,}/g, "\n\n").trim()
}

// CLI interface: node readability_extractor.cjs <html_content> [url]
// Prints the extraction result as pretty-printed JSON on stdout.
if (require.main === module) {
  const args = process.argv.slice(2)

  if (args.length < 1) {
    console.error("Usage: node readability_extractor.cjs <html_content> [url]")
    process.exit(1)
  }

  const html = args[0]
  const url = args[1] || "https://example.com"

  const result = extractContent(html, url)
  console.log(JSON.stringify(result, null, 2))
}

module.exports = { extractContent }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/berrydev-ai/berry-rag'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

readability_extractor.cjs•5.87 KiB