Skip to main content
Glama
scraper.ts (7.42 kB)
#!/usr/bin/env node
/**
 * Vega-Lite and Deneb Documentation Scraper
 *
 * This utility scrapes documentation from the official Vega-Lite website
 * and from the Deneb documentation site, and saves it as JSON files in a
 * `data` directory located one level above this module.
 */

import { promises as fs } from "fs";
import path from "path";
import { fileURLToPath, pathToFileURL } from "url";
import fetch from "node-fetch";
import * as cheerio from "cheerio";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** One scraped documentation page. */
interface DocSection {
  title: string;
  url: string;
  /** Whitespace-normalized page text, truncated to MAX_CONTENT_LENGTH. */
  content: string;
  category: string;
  source?: string;
}

/** One example specification (currently never produced; see scrapeExamples). */
interface VegaLiteExample {
  name: string;
  description: string;
  category: string;
  spec: Record<string, unknown>;
  url: string;
}

/** A page to scrape: path relative to a docs base URL, plus its category. */
interface DocPage {
  path: string;
  category: string;
}

const VEGA_LITE_BASE_URL = "https://vega.github.io/vega-lite";
const VEGA_LITE_DOCS_URL = `${VEGA_LITE_BASE_URL}/docs/`;
const DENEB_BASE_URL = "https://deneb.guide";
const DENEB_DOCS_URL = `${DENEB_BASE_URL}/docs/`;

/** Maximum number of characters of page text kept per section. */
const MAX_CONTENT_LENGTH = 5000;

/** Pause between consecutive requests, in milliseconds. */
const REQUEST_DELAY_MS = 1000;

// Key Vega-Lite documentation pages to scrape
const VEGA_LITE_DOC_PAGES: readonly DocPage[] = [
  { path: "", category: "introduction" },
  { path: "mark.html", category: "marks" },
  { path: "encoding.html", category: "encoding" },
  { path: "type.html", category: "data-types" },
  { path: "scale.html", category: "scales" },
  { path: "axis.html", category: "axes" },
  { path: "legend.html", category: "legends" },
  { path: "transform.html", category: "transforms" },
  { path: "aggregate.html", category: "transforms" },
  { path: "bin.html", category: "transforms" },
  { path: "timeunit.html", category: "transforms" },
  { path: "filter.html", category: "transforms" },
  { path: "selection.html", category: "interaction" },
  { path: "condition.html", category: "interaction" },
  { path: "layer.html", category: "composition" },
  { path: "concat.html", category: "composition" },
  { path: "facet.html", category: "composition" },
  { path: "repeat.html", category: "composition" },
];

// Key Deneb documentation pages to scrape
const DENEB_DOC_PAGES: readonly DocPage[] = [
  // Introduction and Getting Started
  { path: "", category: "introduction" },
  { path: "getting-started", category: "getting-started" },
  { path: "simple-example", category: "getting-started" },
  { path: "visual-editor", category: "getting-started" },
  { path: "dataset", category: "getting-started" },
  { path: "keyboard", category: "getting-started" },

  // Deeper Concepts
  { path: "formatting", category: "concepts" },
  { path: "scrolling-overflow", category: "concepts" },
  { path: "schemes", category: "concepts" },
  { path: "pattern-fills", category: "concepts" },
  { path: "templates", category: "concepts" },
  { path: "performance", category: "concepts" },

  // Interactivity Features
  { path: "interactivity-overview", category: "interactivity" },
  { path: "interactivity-tooltips", category: "interactivity" },
  { path: "interactivity-context-menu", category: "interactivity" },
  { path: "interactivity-selection", category: "interactivity" },
  { path: "interactivity-selection-advanced", category: "interactivity" },
  { path: "interactivity-highlight", category: "interactivity" },

  // Other
  { path: "changelog", category: "other" },
];

// Example categories to scrape
// NOTE(review): not referenced anywhere in this module yet; presumably
// reserved for the real scrapeExamples implementation.
const EXAMPLE_CATEGORIES = [
  "bar",
  "line",
  "area",
  "scatter",
  "histogram",
  "circle",
  "tick",
  "rect",
  "interactive",
  "composite",
];

/** Resolve after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Scrape a documentation page (generic)
 *
 * @param baseUrl  Docs base URL; `pagePath` is appended to it verbatim.
 * @param pagePath Page path relative to `baseUrl` (may be empty).
 * @param category Category label copied into the result.
 * @param source   Source label copied into the result and used in logs.
 * @returns The scraped section, or `null` when the request fails, the
 *          server answers with a non-2xx status, or parsing throws.
 */
async function scrapeDocPage(
  baseUrl: string,
  pagePath: string,
  category: string,
  source: string
): Promise<DocSection | null> {
  const url = `${baseUrl}${pagePath}`;

  try {
    console.log(`Scraping ${source}: ${url}...`);
    const response = await fetch(url);

    // fetch() only rejects on network failure; without this check an HTTP
    // error page (404, 500, ...) would be stored as if it were documentation.
    if (!response.ok) {
      console.error(
        `Error scraping ${url}: HTTP ${response.status} ${response.statusText}`
      );
      return null;
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Prefer the first <h1>; an empty result falls back to <title>
    // (hence `||` rather than `??`).
    const title = $("h1").first().text().trim() || $("title").text().trim();

    // Try to get the main content area, falling back to the whole body
    const mainContent = $("main, article, .content, #content").first();
    const root = mainContent.length ? mainContent : $("body");

    // Remove script and style tags on both paths so their source text does
    // not end up in the extracted content.
    root.find("script, style").remove();

    // Clean up the content (collapse whitespace runs) and cap its length
    const content = root
      .text()
      .trim()
      .replace(/\s+/g, " ")
      .substring(0, MAX_CONTENT_LENGTH);

    return {
      title,
      url,
      content,
      category,
      source,
    };
  } catch (error) {
    console.error(`Error scraping ${url}:`, error);
    return null;
  }
}

/**
 * Scrape every page in `pages` against `docsUrl`, one at a time.
 *
 * Requests are deliberately sequential with a pause after each one, to be
 * nice to the server. Pages for which scrapeDocPage returns `null` are
 * skipped.
 */
async function scrapePages(
  docsUrl: string,
  pages: readonly DocPage[],
  source: string
): Promise<DocSection[]> {
  const docs: DocSection[] = [];

  for (const page of pages) {
    const doc = await scrapeDocPage(docsUrl, page.path, page.category, source);
    if (doc) {
      docs.push(doc);
    }

    // Be nice to the server
    await delay(REQUEST_DELAY_MS);
  }

  return docs;
}

/**
 * Scrape examples (placeholder - would need actual implementation)
 *
 * @returns Always an empty array for now.
 */
async function scrapeExamples(): Promise<VegaLiteExample[]> {
  // This is a placeholder. In a real implementation, you would:
  // 1. Scrape the examples gallery page
  // 2. Extract example specifications
  // 3. Categorize them
  console.log("Example scraping not fully implemented - using fallback data");
  return [];
}

/**
 * Main scraper function
 *
 * Scrapes the Vega-Lite pages, then the Deneb pages, then the examples, and
 * writes `documentation.json`, `deneb-documentation.json` and
 * `examples.json` into the `data` directory (created if missing).
 */
async function scrapeAll(): Promise<void> {
  console.log("Starting Vega-Lite and Deneb documentation scraper...\n");

  // Create data directory if it doesn't exist
  const dataDir = path.join(__dirname, "..", "data");
  await fs.mkdir(dataDir, { recursive: true });

  // Scrape Vega-Lite documentation pages
  console.log("=== Scraping Vega-Lite Documentation ===\n");
  const vegaLiteDocs = await scrapePages(
    VEGA_LITE_DOCS_URL,
    VEGA_LITE_DOC_PAGES,
    "vega-lite"
  );

  // Save Vega-Lite documentation
  const vegaLiteDocsPath = path.join(dataDir, "documentation.json");
  await fs.writeFile(vegaLiteDocsPath, JSON.stringify(vegaLiteDocs, null, 2));
  console.log(
    `\n✅ Saved ${vegaLiteDocs.length} Vega-Lite documentation sections to ${vegaLiteDocsPath}\n`
  );

  // Scrape Deneb documentation pages
  console.log("=== Scraping Deneb Documentation ===\n");
  const denebDocs = await scrapePages(DENEB_DOCS_URL, DENEB_DOC_PAGES, "deneb");

  // Save Deneb documentation
  const denebDocsPath = path.join(dataDir, "deneb-documentation.json");
  await fs.writeFile(denebDocsPath, JSON.stringify(denebDocs, null, 2));
  console.log(
    `\n✅ Saved ${denebDocs.length} Deneb documentation sections to ${denebDocsPath}\n`
  );

  // Scrape examples
  const examples = await scrapeExamples();
  const examplesPath = path.join(dataDir, "examples.json");
  await fs.writeFile(examplesPath, JSON.stringify(examples, null, 2));
  console.log(`✅ Saved ${examples.length} examples to ${examplesPath}\n`);

  console.log("=== Scraping Complete! ===");
  console.log(`Total Vega-Lite docs: ${vegaLiteDocs.length}`);
  console.log(`Total Deneb docs: ${denebDocs.length}`);
  console.log(`Total examples: ${examples.length}`);
}

// Run scraper if called directly.
// pathToFileURL yields a properly encoded file URL on every platform,
// whereas a hand-built `file://${path}` string does not match
// import.meta.url for Windows paths or paths needing percent-encoding.
const entryPoint = process.argv[1];
if (
  entryPoint !== undefined &&
  import.meta.url === pathToFileURL(entryPoint).href
) {
  scrapeAll().catch((error) => {
    console.error("Fatal error:", error);
    process.exit(1);
  });
}

export { scrapeAll, scrapeDocPage, scrapeExamples };

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/inteligencianegociosmmx/vegaLite_mcp_server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.