/**
 * Scrapes the Bohemia Interactive (BI) Community Wiki for Arma Reforger
 * modding pages using Playwright.
* community.bistudio.com blocks bot user-agents, so we use a real browser.
*
* Usage: npx tsx scripts/scrape-wiki.ts
*
* Collects pages from Category:Arma_Reforger/Modding recursively,
* fetches raw wikitext via action=raw, converts to plain text, and merges
* with existing engine wiki docs in data/wiki/pages.json.
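 *
 * Requires Node 20.11+ (for import.meta.dirname) and Playwright's Chromium
 * build (npx playwright install chromium).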
*/
import { chromium, type Page } from "playwright";
import { writeFileSync, readFileSync, existsSync } from "node:fs";
import { join } from "node:path";
const BASE_URL = "https://community.bistudio.com";
const CATEGORY_URL = `${BASE_URL}/wiki/Category:Arma_Reforger/Modding`;
const OUTPUT_PATH = join(import.meta.dirname, "..", "data", "wiki", "pages.json");
// Rate limit: be respectful
const DELAY_MS = 1_200;
interface WikiPage {
title: string;
source: string;
content: string;
url: string;
}
async function delay(ms: number): Promise<void> {
return new Promise((r) => setTimeout(r, ms));
}
/**
* Recursively collect all page URLs from a category and its subcategories.
*/
async function collectPageUrls(
page: Page,
categoryUrl: string,
visited: Set<string> = new Set()
): Promise<string[]> {
if (visited.has(categoryUrl)) return [];
visited.add(categoryUrl);
console.log(` Scanning: ${categoryUrl.replace(BASE_URL, "")}`);
await page.goto(categoryUrl, { waitUntil: "domcontentloaded" });
await delay(DELAY_MS);
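  // Pull page links and subcategory links out of the rendered category listing.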
const { pageLinks, subcategoryLinks } = await page.evaluate(() => {
const pages: string[] = [];
const subcats: string[] = [];
// Subcategories section
const subcatSection = document.querySelector("#mw-subcategories");
if (subcatSection) {
for (const a of subcatSection.querySelectorAll("a")) {
const href = (a as HTMLAnchorElement).href;
if (href.includes("/wiki/Category:Arma_Reforger")) {
subcats.push(href);
}
}
}
// Pages section
const pagesSection = document.querySelector("#mw-pages");
if (pagesSection) {
for (const a of pagesSection.querySelectorAll("a")) {
const href = (a as HTMLAnchorElement).href;
if (href.includes("/wiki/") && !href.includes("Category:")) {
pages.push(href);
}
}
}
return { pageLinks: pages, subcategoryLinks: subcats };
});
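  // Recurse into each subcategory, sharing the visited set so cycles
  // between categories are traversed only once.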
const allPages: string[] = [...pageLinks];
for (const subcat of subcategoryLinks) {
const sub = await collectPageUrls(page, subcat, visited);
allPages.push(...sub);
}
return allPages;
}
/**
* Convert MediaWiki markup to clean readable text.
*/
function wikitextToPlainText(wikitext: string): string {
let text = wikitext;
// Remove TOC directive
text = text.replace(/\{\{TOC\|[^}]*\}\}/g, "");
// Remove image/file links
text = text.replace(/\[\[File:[^\]]*\]\]/g, "");
text = text.replace(/\[\[Image:[^\]]*\]\]/g, "");
// Remove category tags
text = text.replace(/\{\{GameCategory[^}]*\}\}/g, "");
// Convert {{Feature|...|text}} to just the text
text = text.replace(/\{\{Feature\|[^|]*\|([^}]*)\}\}/g, "Note: $1");
// Convert {{Link|Page Name}} and {{Link|Page Name|Display}}
text = text.replace(/\{\{Link\|([^|}]+)\|([^}]+)\}\}/g, "$2");
text = text.replace(/\{\{Link\|([^}]+)\}\}/g, "$1");
// Convert {{hl|text}} (highlight) to just text
text = text.replace(/\{\{hl\|([^}]*)\}\}/g, "$1");
  // Convert {{Wiki|...}} maintenance templates (e.g. {{Wiki|TODO}}) to bracketed notes
  text = text.replace(/\{\{Wiki\|([^}]*)\}\}/g, "[$1]");
// Convert <enforce inline>code</enforce> to `code`
text = text.replace(/<enforce\s+inline>(.*?)<\/enforce>/gs, "`$1`");
// Convert <enforce>code</enforce> to code blocks
text = text.replace(/<enforce>(.*?)<\/enforce>/gs, (_, code) => {
return "\n```\n" + code.trim() + "\n```\n";
});
// Convert <syntaxhighlight> and <source> to code blocks
text = text.replace(/<syntaxhighlight[^>]*>(.*?)<\/syntaxhighlight>/gs, (_, code) => {
return "\n```\n" + code.trim() + "\n```\n";
});
text = text.replace(/<source[^>]*>(.*?)<\/source>/gs, (_, code) => {
return "\n```\n" + code.trim() + "\n```\n";
});
// Convert <pre> to code blocks
text = text.replace(/<pre>(.*?)<\/pre>/gs, (_, code) => {
return "\n```\n" + code.trim() + "\n```\n";
});
// Convert <code> to inline code
text = text.replace(/<code>(.*?)<\/code>/gs, "`$1`");
  // Convert wiki headings, deepest level first so the shorter patterns
  // don't partially match longer runs of "=": ====== H6 ====== down to == H2 ==
  text = text.replace(/^={6}\s*(.*?)\s*={6}/gm, "###### $1");
  text = text.replace(/^={5}\s*(.*?)\s*={5}/gm, "##### $1");
  text = text.replace(/^={4}\s*(.*?)\s*={4}/gm, "#### $1");
  text = text.replace(/^={3}\s*(.*?)\s*={3}/gm, "### $1");
  text = text.replace(/^={2}\s*(.*?)\s*={2}/gm, "## $1");
// Convert [[Page|display]] links to just display text
text = text.replace(/\[\[[^\]|]*\|([^\]]*)\]\]/g, "$1");
// Convert [[Page]] links to just page name
text = text.replace(/\[\[([^\]]*)\]\]/g, "$1");
// Convert external links [url text]
text = text.replace(/\[https?:\/\/\S+\s+([^\]]*)\]/g, "$1");
// Convert bold '''text''' and italic ''text''
text = text.replace(/'''(.*?)'''/g, "$1");
text = text.replace(/''(.*?)''/g, "$1");
// Convert wiki tables to readable format
// Simple approach: extract cell contents
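  // Rows are delimited by |- markers; header (!) and data (|) cells found
  // on their own lines are joined with " | ".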
text = text.replace(/\{\|[^\n]*\n([\s\S]*?)\|\}/g, (_, tableContent) => {
const lines = tableContent.split("\n");
const rows: string[] = [];
let currentRow: string[] = [];
for (const line of lines) {
const trimmed = line.trim();
if (trimmed.startsWith("|-")) {
if (currentRow.length > 0) {
rows.push(currentRow.join(" | "));
currentRow = [];
}
} else if (trimmed.startsWith("|") || trimmed.startsWith("!")) {
const cellContent = trimmed.replace(/^[|!]\s*/, "").trim();
if (cellContent && !cellContent.startsWith("class=") && !cellContent.startsWith("style=")) {
currentRow.push(cellContent);
}
}
}
if (currentRow.length > 0) {
rows.push(currentRow.join(" | "));
}
return rows.join("\n");
});
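  // Note: the tag-stripping pass below also removes angle-bracket text
  // inside the code fences created above (e.g. array<int>), a known
  // trade-off of this simple converter.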
// Remove remaining HTML tags
text = text.replace(/<\/?[^>]+>/g, "");
// Remove remaining template calls we don't handle
text = text.replace(/\{\{[^}]*\}\}/g, "");
// Clean up whitespace
text = text.replace(/\n{3,}/g, "\n\n");
text = text.trim();
return text;
}
/**
* Fetch raw wikitext for a page using action=raw.
*/
async function fetchRawPage(page: Page, url: string): Promise<WikiPage | null> {
try {
// Extract page title from URL for the raw action
const pageName = url.replace(`${BASE_URL}/wiki/`, "");
const rawUrl = `${BASE_URL}/wiki/${pageName}?action=raw`;
await page.goto(rawUrl, { waitUntil: "domcontentloaded" });
await delay(DELAY_MS);
const rawText = await page.evaluate(() => {
// action=raw returns plain text in a <pre> or just as the body text
const pre = document.querySelector("pre");
if (pre) return pre.textContent || "";
return document.body.innerText || "";
});
if (!rawText || rawText.length < 30) {
// Fallback: try scraping the rendered HTML
return await fetchRenderedPage(page, url);
}
const content = wikitextToPlainText(rawText);
if (content.length < 50) {
console.log(` Skipped (too short): ${pageName}`);
return null;
}
// Extract title from page name
const title = decodeURIComponent(pageName)
.replace(/Arma_Reforger:/, "")
.replace(/_/g, " ");
return {
title,
source: "bistudio-wiki",
content,
url,
};
} catch (err) {
console.error(` Error fetching raw: ${url} — ${err}`);
return await fetchRenderedPage(page, url);
}
}
/**
* Fallback: scrape rendered HTML if raw wikitext isn't available.
*/
async function fetchRenderedPage(page: Page, url: string): Promise<WikiPage | null> {
try {
await page.goto(url, { waitUntil: "domcontentloaded" });
await delay(DELAY_MS);
const result = await page.evaluate(() => {
const titleEl = document.querySelector("#firstHeading");
const contentEl = document.querySelector("#mw-content-text .mw-parser-output");
if (!titleEl || !contentEl) return null;
const title = titleEl.textContent?.trim() || "";
// Clone and clean
const clone = contentEl.cloneNode(true) as HTMLElement;
clone
.querySelectorAll(".mw-editsection, #toc, .toc, .navbox, .mbox-small, .noprint, script, style")
.forEach((el) => el.remove());
// Get text with basic structure preservation
const text = clone.innerText || "";
return { title, content: text };
});
if (!result || result.content.length < 50) return null;
return {
title: result.title.replace(/ – Arma Reforger$/, ""),
source: "bistudio-wiki",
content: result.content,
url,
};
} catch {
return null;
}
}
async function main(): Promise<void> {
console.log("Launching Chromium...\n");
const browser = await chromium.launch({ headless: true });
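  // community.bistudio.com blocks bot user-agents (see header), so present
  // a desktop Chrome UA instead of Playwright's default headless UA.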
const context = await browser.newContext({
userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
});
const page = await context.newPage();
// Step 1: Collect all page URLs
console.log("=== Collecting page URLs ===\n");
const pageUrls = await collectPageUrls(page, CATEGORY_URL);
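  // A page can appear in several subcategories, so deduplicate before scraping.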
const uniqueUrls = [...new Set(pageUrls)];
console.log(`\nFound ${uniqueUrls.length} unique pages.\n`);
// Step 2: Scrape each page
console.log("=== Scraping pages ===\n");
const pages: WikiPage[] = [];
for (let i = 0; i < uniqueUrls.length; i++) {
const url = uniqueUrls[i];
    const shortName = url.replace(`${BASE_URL}/wiki/`, "");
console.log(` [${i + 1}/${uniqueUrls.length}] ${shortName}`);
const wikiPage = await fetchRawPage(page, url);
if (wikiPage) {
pages.push(wikiPage);
console.log(` ✓ ${wikiPage.title} (${wikiPage.content.length} chars)`);
}
}
await browser.close();
// Step 3: Merge with existing wiki pages
let existing: WikiPage[] = [];
if (existsSync(OUTPUT_PATH)) {
existing = JSON.parse(readFileSync(OUTPUT_PATH, "utf-8"));
// Keep non-bistudio-wiki pages (engine docs)
    existing = existing.filter((p) => p.source !== "bistudio-wiki");
}
const merged = [...existing, ...pages];
writeFileSync(OUTPUT_PATH, JSON.stringify(merged, null, 2), "utf-8");
console.log(`\n=== Done ===`);
console.log(` BI wiki pages scraped: ${pages.length}`);
console.log(` Engine docs kept: ${existing.length}`);
console.log(` Total wiki pages: ${merged.length}`);
console.log(` Written to: ${OUTPUT_PATH}`);
}
main().catch((err) => {
console.error("Fatal error:", err);
process.exit(1);
});