#!/usr/bin/env node
/**
* Vega-Lite and Deneb Documentation Scraper
*
* This utility scrapes documentation from the official Vega-Lite and Deneb
* websites and saves it in a structured format for the MCP server to use.
*/
import { promises as fs } from "fs";
import path from "path";
import { fileURLToPath } from "url";
import fetch from "node-fetch";
import * as cheerio from "cheerio";
// ES modules do not provide __filename/__dirname, so derive them from this
// module's own URL. __dirname anchors the output data directory in scrapeAll.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
* One scraped documentation page, as produced by scrapeDocPage and written
* to the JSON output files by scrapeAll.
*/
interface DocSection {
// Page heading: scrapeDocPage uses the first <h1>, falling back to <title>.
title: string;
// Absolute URL the page was fetched from.
url: string;
// Plain text of the page; scrapeDocPage collapses whitespace and caps it at 5000 characters.
content: string;
// Topic label taken from the page list entry (e.g. "marks", "transforms").
category: string;
// Documentation set the page belongs to; scrapeAll passes "vega-lite" or "deneb".
source?: string;
}
/**
* Shape of one entry in the examples output file.
*
* NOTE(review): nothing in this file constructs a value of this type yet
* (scrapeExamples returns an empty list), so the field meanings below are
* inferred from the names - confirm against the consumer.
*/
interface VegaLiteExample {
name: string;
description: string;
category: string;
// Presumably the example's Vega-Lite JSON specification - TODO confirm.
spec: Record<string, unknown>;
url: string;
}
// Site roots and documentation roots for the two scraped sources.
// The *_DOCS_URL values end with "/" on purpose: scrapeDocPage concatenates
// the page path onto the base URL verbatim, without inserting a separator.
const VEGA_LITE_BASE_URL = "https://vega.github.io/vega-lite";
const VEGA_LITE_DOCS_URL = `${VEGA_LITE_BASE_URL}/docs/`;
const DENEB_BASE_URL = "https://deneb.guide";
const DENEB_DOCS_URL = `${DENEB_BASE_URL}/docs/`;
// Key Vega-Lite documentation pages to scrape.
// Each `path` is appended to VEGA_LITE_DOCS_URL (the empty path is the docs
// index itself); `category` is copied into the resulting DocSection.
const VEGA_LITE_DOC_PAGES = [
{ path: "", category: "introduction" },
{ path: "mark.html", category: "marks" },
{ path: "encoding.html", category: "encoding" },
{ path: "type.html", category: "data-types" },
{ path: "scale.html", category: "scales" },
{ path: "axis.html", category: "axes" },
{ path: "legend.html", category: "legends" },
{ path: "transform.html", category: "transforms" },
{ path: "aggregate.html", category: "transforms" },
{ path: "bin.html", category: "transforms" },
{ path: "timeunit.html", category: "transforms" },
{ path: "filter.html", category: "transforms" },
{ path: "selection.html", category: "interaction" },
{ path: "condition.html", category: "interaction" },
{ path: "layer.html", category: "composition" },
{ path: "concat.html", category: "composition" },
{ path: "facet.html", category: "composition" },
{ path: "repeat.html", category: "composition" },
];
// Key Deneb documentation pages to scrape.
// Each `path` is appended to DENEB_DOCS_URL (the empty path is the docs
// index itself); `category` is copied into the resulting DocSection.
const DENEB_DOC_PAGES = [
// Introduction and Getting Started
{ path: "", category: "introduction" },
{ path: "getting-started", category: "getting-started" },
{ path: "simple-example", category: "getting-started" },
{ path: "visual-editor", category: "getting-started" },
{ path: "dataset", category: "getting-started" },
{ path: "keyboard", category: "getting-started" },
// Deeper Concepts
{ path: "formatting", category: "concepts" },
{ path: "scrolling-overflow", category: "concepts" },
{ path: "schemes", category: "concepts" },
{ path: "pattern-fills", category: "concepts" },
{ path: "templates", category: "concepts" },
{ path: "performance", category: "concepts" },
// Interactivity Features
{ path: "interactivity-overview", category: "interactivity" },
{ path: "interactivity-tooltips", category: "interactivity" },
{ path: "interactivity-context-menu", category: "interactivity" },
{ path: "interactivity-selection", category: "interactivity" },
{ path: "interactivity-selection-advanced", category: "interactivity" },
{ path: "interactivity-highlight", category: "interactivity" },
// Other
{ path: "changelog", category: "other" },
];
// Example categories to scrape.
// NOTE(review): nothing in this file reads this list yet; scrapeExamples is
// still a placeholder that returns no examples.
const EXAMPLE_CATEGORIES = [
"bar",
"line",
"area",
"scatter",
"histogram",
"circle",
"tick",
"rect",
"interactive",
"composite",
];
/**
 * Scrape a documentation page (generic).
 *
 * Fetches `${baseUrl}${pagePath}`, extracts a title plus the visible text of
 * the page's main content area, and packages them as a DocSection.
 *
 * @param baseUrl - URL prefix; pagePath is appended verbatim, so include any trailing "/".
 * @param pagePath - Page path appended to baseUrl; "" selects baseUrl itself.
 * @param category - Category label copied into the result.
 * @param source - Name of the documentation set; used in the log line and copied into the result.
 * @returns The scraped section, or null when the request fails, the server
 *   answers with a non-2xx status, or parsing throws. Failures are logged to
 *   stderr and never propagate to the caller.
 */
async function scrapeDocPage(
  baseUrl: string,
  pagePath: string,
  category: string,
  source: string
): Promise<DocSection | null> {
  const maxContentLength = 5000;
  const url = `${baseUrl}${pagePath}`;
  try {
    console.log(`Scraping ${source}: ${url}...`);
    const response = await fetch(url);
    if (!response.ok) {
      // fetch only rejects on network failure; without this check a 404/500
      // error page would be stored as if it were documentation.
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Drop non-prose elements document-wide so the <body> fallback below is
    // free of inline JavaScript/CSS as well, not just the main content area.
    $("script, style").remove();

    // Prefer the page heading; fall back to the document <title>. Only the
    // first <title> is used so inline SVG <title> elements are not appended.
    const title = $("h1").first().text().trim() || $("title").first().text().trim();

    // Try the usual main-content containers first, then the whole body.
    const mainContent = $("main, article, .content, #content").first();
    const container = mainContent.length ? mainContent : $("body");

    // Collapse runs of whitespace and cap the size of the stored text.
    const content = container
      .text()
      .trim()
      .replace(/\s+/g, " ")
      .substring(0, maxContentLength);

    return {
      title,
      url,
      content,
      category,
      source,
    };
  } catch (error) {
    console.error(`Error scraping ${url}:`, error);
    return null;
  }
}
/**
 * Scrape gallery examples.
 *
 * Not implemented yet: this stub only logs a notice and resolves to an empty
 * list. A real implementation would fetch the examples gallery page, pull out
 * each example's specification, and assign it a category.
 *
 * @returns An empty array of examples.
 */
async function scrapeExamples(): Promise<VegaLiteExample[]> {
  const noExamples: VegaLiteExample[] = [];
  console.log("Example scraping not fully implemented - using fallback data");
  return noExamples;
}
/**
 * Scrape a list of pages from one site, one request at a time.
 *
 * Pages that fail to scrape (scrapeDocPage returned null) are skipped. A
 * one-second pause is inserted between consecutive requests to be nice to
 * the server; no pause is added before the first or after the last page.
 */
async function scrapePageList(
  docsUrl: string,
  pages: readonly { path: string; category: string }[],
  source: string
): Promise<DocSection[]> {
  const requestDelayMs = 1000;
  const docs: DocSection[] = [];
  for (const [index, page] of pages.entries()) {
    if (index > 0) {
      await new Promise((resolve) => setTimeout(resolve, requestDelayMs));
    }
    const doc = await scrapeDocPage(docsUrl, page.path, page.category, source);
    if (doc) {
      docs.push(doc);
    }
  }
  return docs;
}

/** Write `data` to `filePath` as JSON indented with two spaces. */
async function writeJsonFile(filePath: string, data: unknown): Promise<void> {
  await fs.writeFile(filePath, JSON.stringify(data, null, 2));
}

/**
 * Main scraper function.
 *
 * Scrapes the Vega-Lite pages, then the Deneb pages, then the examples, and
 * writes each result set to its own JSON file in the `data` directory that
 * sits next to this module's parent directory:
 *   - documentation.json        (Vega-Lite pages)
 *   - deneb-documentation.json  (Deneb pages)
 *   - examples.json             (examples)
 *
 * Individual page failures are tolerated and simply reduce the saved count.
 * File-system errors (mkdir/writeFile) reject the returned promise.
 */
async function scrapeAll(): Promise<void> {
  console.log("Starting Vega-Lite and Deneb documentation scraper...\n");

  // Create data directory if it doesn't exist
  const dataDir = path.join(__dirname, "..", "data");
  await fs.mkdir(dataDir, { recursive: true });

  // Scrape and save Vega-Lite documentation pages
  console.log("=== Scraping Vega-Lite Documentation ===\n");
  const vegaLiteDocs = await scrapePageList(VEGA_LITE_DOCS_URL, VEGA_LITE_DOC_PAGES, "vega-lite");
  const vegaLiteDocsPath = path.join(dataDir, "documentation.json");
  await writeJsonFile(vegaLiteDocsPath, vegaLiteDocs);
  console.log(`\n✅ Saved ${vegaLiteDocs.length} Vega-Lite documentation sections to ${vegaLiteDocsPath}\n`);

  // Scrape and save Deneb documentation pages
  console.log("=== Scraping Deneb Documentation ===\n");
  const denebDocs = await scrapePageList(DENEB_DOCS_URL, DENEB_DOC_PAGES, "deneb");
  const denebDocsPath = path.join(dataDir, "deneb-documentation.json");
  await writeJsonFile(denebDocsPath, denebDocs);
  console.log(`\n✅ Saved ${denebDocs.length} Deneb documentation sections to ${denebDocsPath}\n`);

  // Scrape and save examples
  const examples = await scrapeExamples();
  const examplesPath = path.join(dataDir, "examples.json");
  await writeJsonFile(examplesPath, examples);
  console.log(`✅ Saved ${examples.length} examples to ${examplesPath}\n`);

  console.log("=== Scraping Complete! ===");
  console.log(`Total Vega-Lite docs: ${vegaLiteDocs.length}`);
  console.log(`Total Deneb docs: ${denebDocs.length}`);
  console.log(`Total examples: ${examples.length}`);
}
// Run scraper if called directly (and not when this module is imported).
// Compare filesystem paths instead of hand-building a "file://" string:
// import.meta.url is percent-encoded (spaces, non-ASCII characters) and has the
// form "file:///C:/..." on Windows, so `file://${process.argv[1]}` never
// matched in those cases and the script silently did nothing.
// NOTE(review): when launched through a symlink, argv[1] may still differ
// from the resolved module path - confirm if this script is exposed as a bin.
const entryScript = process.argv[1];
if (entryScript !== undefined && path.resolve(entryScript) === __filename) {
  scrapeAll().catch((error) => {
    console.error("Fatal error:", error);
    process.exit(1);
  });
}
export { scrapeAll, scrapeDocPage, scrapeExamples };