Skip to main content
Glama
scrape-docs.mjs (2.95 kB)
import axios from 'axios';
import * as cheerio from 'cheerio';
import TurndownService from 'turndown';
import { promises as fs } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

// Initialize Turndown service
const turndownService = new TurndownService({
  headingStyle: 'atx', // Use '#' for headings
  codeBlockStyle: 'fenced', // Use '```' for code blocks
});

// Define __dirname for ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Base path for all documentation output
const DOCS_BASE_OUTPUT_PATH = path.join(__dirname, '..', 'content', 'docs');

/**
 * Scrapes a webpage, extracts main content, converts to Markdown, and saves it.
 *
 * Failures (request errors, missing content, write errors) are logged to
 * stderr and not rethrown, so one bad page does not abort the whole run.
 *
 * @param {string} url - The URL to scrape.
 * @param {string} outputPath - The directory to save the Markdown file.
 * @param {string} contentSelector - Cheerio selector for the main content area.
 * @param {string} filename - The name of the output Markdown file (without .md extension).
 *   May contain path separators (e.g. "guides/intro"); the intermediate
 *   directories are created under `outputPath`.
 * @param {string} siteName - A friendly name for the site being scraped (for logging).
 * @returns {Promise<void>} Resolves once the page has been saved or the failure logged.
 */
async function scrapeAndSave(url, outputPath, contentSelector, filename, siteName) {
  console.log(`Scraping ${siteName} page: ${filename} from ${url}`);
  try {
    const { data: html } = await axios.get(url);
    const $ = cheerio.load(html);
    const mainContentHtml = $(contentSelector).html();

    if (!mainContentHtml) {
      console.error(`Could not find content with selector "${contentSelector}" on ${url}`);
      return;
    }

    const markdown = turndownService.turndown(mainContentHtml);
    const filePath = path.join(outputPath, `${filename}.md`);

    // Create the file's own parent directory rather than just outputPath:
    // a filename containing a path separator lands in a subdirectory that
    // would otherwise not exist, making writeFile fail with ENOENT.
    await fs.mkdir(path.dirname(filePath), { recursive: true });
    await fs.writeFile(filePath, markdown, 'utf-8');
    console.log(`Successfully saved ${siteName} doc: ${filename}.md to ${filePath}`);
  } catch (error) {
    console.error(`Error scraping ${siteName} page ${filename} from ${url}:`, error.message || error);
    if (error.response) {
      console.error(`Status: ${error.response.status}`);
    }
  }
}

/**
 * Entry point: reads `scraping-config.json` from this script's directory and
 * scrapes every page of every configured site, one page at a time.
 *
 * Each site entry is read for `siteName`, `outputDir`, `baseURL`,
 * `contentSelector` and `pages`; each page slug is appended to `baseURL` to
 * form the URL and is also used as the output filename.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the config file cannot be read or parsed, or has no `sites` array.
 */
async function main() {
  console.log("Starting documentation scraping process...");

  const configFile = await fs.readFile(path.join(__dirname, 'scraping-config.json'), 'utf-8');
  const config = JSON.parse(configFile);

  // Fail with a descriptive message instead of an opaque "not iterable" TypeError.
  if (!Array.isArray(config?.sites)) {
    throw new Error('scraping-config.json must contain a "sites" array');
  }

  for (const site of config.sites) {
    console.log(`\nProcessing site: ${site.siteName}`);
    const siteOutputPath = path.join(DOCS_BASE_OUTPUT_PATH, site.outputDir);

    // Pages are awaited one by one, so requests to a site are issued sequentially.
    for (const pageSlug of site.pages) {
      const pageUrl = site.baseURL + pageSlug;
      await scrapeAndSave(
        pageUrl,
        siteOutputPath,
        site.contentSelector,
        pageSlug,
        site.siteName
      );
    }
  }

  console.log("\nDocumentation scraping process finished.");
}

main().catch(error => {
  console.error("An error occurred during the scraping script execution:", error);
  process.exit(1);
});

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/CaullenOmdahl/Tailwind-Svelte-Assistant'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.