Skip to main content
Glama
sitemap.ts11.2 kB
/**
 * Sitemap parser for automatic URL discovery
 * Parses XML sitemaps to discover all documentation URLs
 * Supports both plain XML and gzipped sitemaps (.xml.gz)
 */

/* eslint-disable @typescript-eslint/no-explicit-any */

import fetch from 'node-fetch';
import { load } from 'cheerio';
import { gunzipSync } from 'zlib';

/** One `<url>` record extracted from a sitemap. */
export interface SitemapEntry {
  url: string;
  lastModified?: Date;
  changeFreq?: string;
  priority?: number;
}

/** Optional constraints applied to the entries returned by {@link SitemapParser.parse}. */
export interface SitemapFilter {
  includeLocales?: string[]; // e.g., ['en'] or empty for English only
  includePatterns?: RegExp[]; // URL patterns to include
  excludePatterns?: RegExp[]; // URL patterns to exclude
}

/** Matches a locale path segment such as `/de_de/` or `/es_es/`. */
const LOCALE_SEGMENT = /\/[a-z]{2}_[a-z]{2}\//;

/** Matches a version path segment such as `/1.20.1/` or `/1.21/`; group 1 is the version. */
const VERSION_SEGMENT = /\/(\d+\.\d+(?:\.\d+)?)\//;

/** Version reported by `detectMinecraftVersion` when neither the URL nor the content names one. */
const DEFAULT_MINECRAFT_VERSION = '1.21.10';

/**
 * Report whether `url` matches at least one of `patterns`.
 *
 * A RegExp carrying the `g` or `y` flag remembers `lastIndex` between `test()`
 * calls, which would make the outcome depend on the previously tested URL.
 * Resetting `lastIndex` first makes every check start from the beginning of `url`.
 */
function matchesAnyPattern(patterns: readonly RegExp[], url: string): boolean {
  return patterns.some((pattern) => {
    pattern.lastIndex = 0;
    return pattern.test(url);
  });
}

/**
 * Downloads a sitemap, extracts its `<url>` entries and filters them by
 * locale and by include/exclude URL patterns.
 */
export class SitemapParser {
  private readonly sitemapUrl: string;
  private readonly filter: Required<SitemapFilter>;

  /**
   * @param sitemapUrl - Address the sitemap is fetched from.
   * @param filter - Optional filter; every omitted list defaults to empty.
   */
  constructor(sitemapUrl: string, filter: SitemapFilter = {}) {
    this.sitemapUrl = sitemapUrl;
    this.filter = {
      includeLocales: filter.includeLocales ?? [], // Empty = English only
      includePatterns: filter.includePatterns ?? [],
      excludePatterns: filter.excludePatterns ?? [],
    };
  }

  /**
   * Fetch and parse the sitemap
   * Automatically handles gzipped sitemaps based on content-type or URL
   *
   * @returns The sitemap entries that pass the configured filter.
   * @throws Error when the HTTP response status is not OK.
   */
  async parse(): Promise<SitemapEntry[]> {
    console.error(`Fetching sitemap from: ${this.sitemapUrl}`);

    const response = await fetch(this.sitemapUrl);
    if (!response.ok) {
      throw new Error(`Failed to fetch sitemap: ${response.statusText}`);
    }

    // Check if response is gzipped ('application/x-gzip' is covered by the 'gzip' substring test)
    const contentType = response.headers.get('content-type') || '';
    const contentEncoding = response.headers.get('content-encoding') || '';
    const isGzipped =
      contentType.includes('gzip') ||
      contentEncoding.includes('gzip') ||
      this.sitemapUrl.endsWith('.gz') ||
      this.sitemapUrl.includes('do=sitemap'); // Fabric Wiki returns gzip for this

    let xml: string;
    if (isGzipped) {
      console.error('Detected gzipped sitemap, decompressing...');
      const raw = Buffer.from(await response.arrayBuffer());
      try {
        xml = gunzipSync(raw).toString('utf-8');
      } catch {
        // Maybe it's not actually gzipped, try as plain text
        console.error('Gzip decompression failed, trying as plain text...');
        xml = raw.toString('utf-8');
      }
    } else {
      xml = await response.text();
    }

    const entries = this.parseSitemapXML(xml);
    console.error(`Found ${entries.length} entries in sitemap`);

    // Apply filters
    const filtered = this.applyFilters(entries);
    console.error(`Filtered to ${filtered.length} entries`);

    return filtered;
  }

  /**
   * Parse sitemap XML content
   *
   * `<sitemap>` elements (sitemap index entries) are only logged; the
   * sub-sitemaps they point to are not fetched here. An unparseable
   * `<lastmod>` or `<priority>` is reported as `undefined` rather than as an
   * Invalid Date / `NaN`.
   */
  private parseSitemapXML(xml: string): SitemapEntry[] {
    const $ = load(xml, { xmlMode: true });
    const entries: SitemapEntry[] = [];

    // Handle sitemap index (contains links to other sitemaps)
    $('sitemap').each((_: number, elem: unknown) => {
      const loc = $(elem as any)
        .find('loc')
        .text()
        .trim();
      if (loc) {
        // This is a sitemap index entry; it is reported but not followed
        console.error(`Found sitemap index entry: ${loc}`);
      }
    });

    // Handle URL entries
    $('url').each((_: number, elem: unknown) => {
      const $url = $(elem as any);
      const loc = $url.find('loc').text().trim();
      if (!loc) return;

      const lastmod = $url.find('lastmod').text().trim();
      const changefreq = $url.find('changefreq').text().trim();
      const priority = $url.find('priority').text().trim();

      const parsedDate = lastmod ? new Date(lastmod) : undefined;
      const parsedPriority = priority ? parseFloat(priority) : undefined;

      entries.push({
        url: loc,
        lastModified:
          parsedDate !== undefined && !Number.isNaN(parsedDate.getTime()) ? parsedDate : undefined,
        changeFreq: changefreq || undefined,
        priority:
          parsedPriority !== undefined && !Number.isNaN(parsedPriority)
            ? parsedPriority
            : undefined,
      });
    });

    return entries;
  }

  /**
   * Apply filters to sitemap entries
   *
   * An entry is kept when it passes the locale check, matches at least one
   * include pattern (if any are configured) and matches no exclude pattern.
   */
  private applyFilters(entries: SitemapEntry[]): SitemapEntry[] {
    const { includeLocales, includePatterns, excludePatterns } = this.filter;

    return entries.filter(({ url }) => {
      // Filter by locale
      if (includeLocales.length > 0) {
        // Specific locales requested: keep those plus English URLs
        const hasLocale = includeLocales.some((locale) => url.includes(`/${locale}_`));
        if (!hasLocale && !this.isEnglishUrl(url)) {
          return false;
        }
      } else if (!this.isEnglishUrl(url)) {
        // Default: English only (no locale prefix)
        return false;
      }

      // Apply include patterns
      if (includePatterns.length > 0 && !matchesAnyPattern(includePatterns, url)) {
        return false;
      }

      // Apply exclude patterns
      if (excludePatterns.length > 0 && matchesAnyPattern(excludePatterns, url)) {
        return false;
      }

      return true;
    });
  }

  /**
   * Check if URL is English (no locale prefix)
   */
  private isEnglishUrl(url: string): boolean {
    // English URLs don't have locale prefixes like /de_de/, /es_es/, etc.
    return !LOCALE_SEGMENT.test(url);
  }

  /**
   * Parse raw sitemap text (for SITEMAP.txt format)
   *
   * Scans `content` for an http(s) URL immediately followed by a timestamp of
   * the form `YYYY-MM-DDT<digits and colons>.mmmZ` and returns one entry per match.
   */
  static parseRawSitemap(content: string): SitemapEntry[] {
    const entries: SitemapEntry[] = [];
    const urlPattern = /(https?:\/\/[^\s]+?)(\d{4}-\d{2}-\d{2}T[\d:]+\.\d{3}Z)/g;

    let match: RegExpExecArray | null;
    while ((match = urlPattern.exec(content)) !== null) {
      const url = match[1];
      const date = match[2];
      if (url && date) {
        entries.push({
          url: url,
          lastModified: new Date(date),
        });
      }
    }

    return entries;
  }
}

/**
 * Get Fabric documentation URLs from sitemap
 *
 * @returns Sorted English `/develop/` URLs, or an empty array if the sitemap
 * cannot be fetched or parsed.
 */
export async function getFabricUrlsFromSitemap(): Promise<string[]> {
  const parser = new SitemapParser('https://docs.fabricmc.net/sitemap.xml', {
    includeLocales: [], // English only
    includePatterns: [/\/develop\//], // Only /develop/ pages
    excludePatterns: [
      /\/contributing/,
      /\/#/, // Skip anchor links
    ],
  });

  try {
    const entries = await parser.parse();
    return entries.map((e) => e.url).sort();
  } catch (error) {
    console.error('Failed to fetch sitemap, falling back to static list:', error);
    // Best-effort: callers receive an empty list instead of an exception
    return [];
  }
}

/**
 * Get NeoForge documentation URLs from sitemap
 *
 * @returns Sorted English `/docs/` URLs, or an empty array if the sitemap
 * cannot be fetched or parsed.
 */
export async function getNeoforgeUrlsFromSitemap(): Promise<string[]> {
  const parser = new SitemapParser('https://docs.neoforged.net/sitemap.xml', {
    includeLocales: [], // English only
    includePatterns: [/\/docs\//], // Only /docs/ pages
    excludePatterns: [
      /\/contributing/,
      /\/#/, // Skip anchor links
    ],
  });

  try {
    const entries = await parser.parse();
    return entries.map((e) => e.url).sort();
  } catch (error) {
    console.error('Failed to fetch sitemap, falling back to static list:', error);
    // Best-effort: callers receive an empty list instead of an exception
    return [];
  }
}

/**
 * Get Fabric wiki documentation URLs from sitemap
 * The wiki sitemap is served as a gzipped XML file
 *
 * @returns Sorted wiki URLs that survive the exclude list, or the static
 * fallback list if the sitemap cannot be fetched or parsed.
 */
export async function getFabricWikiUrlsFromSitemap(): Promise<string[]> {
  const sitemapUrl = 'https://wiki.fabricmc.net/start?do=sitemap'; // Returns sitemap.xml.gz

  const parser = new SitemapParser(sitemapUrl, {
    includeLocales: [], // English only
    includePatterns: [], // Include all wiki pages
    excludePatterns: [
      /\?do=/, // Skip action URLs (edit, revisions, etc.)
      /\?rev=/, // Skip revision URLs
      /\?idx=/, // Skip index URLs
      /\/playground/, // Skip playground
      /\/wiki\/wiki/, // Skip wiki meta pages
      /\/start$/, // Skip start page redirects
      /\/de:/, // Skip non-English locales
      /\/es:/,
      /\/fr:/,
      /\/ja:/,
      /\/ru:/,
      /\/zh_cn:/,
      /\/it:/,
      /\/ko_kr:/,
      /\/pt_br:/,
      /\/archive\//, // Skip archive pages
      /\/drafts:/, // Skip draft pages
    ],
  });

  try {
    const entries = await parser.parse();
    const urls = entries.map((e) => e.url).sort();
    console.error(`Found ${urls.length} Fabric Wiki URLs`);
    return urls;
  } catch (error) {
    console.error('Failed to fetch Fabric wiki sitemap:', error);
    // Fallback to static list if sitemap fails
    return getFabricWikiStaticUrls();
  }
}

/**
 * Static fallback URLs for Fabric Wiki if sitemap fetch fails
 */
function getFabricWikiStaticUrls(): string[] {
  const baseUrl = 'https://wiki.fabricmc.net';
  return [
    `${baseUrl}/tutorial/setup`,
    `${baseUrl}/tutorial/items`,
    `${baseUrl}/tutorial/blocks`,
    `${baseUrl}/tutorial/blockentity`,
    `${baseUrl}/tutorial/entity`,
    `${baseUrl}/tutorial/mixin`,
    `${baseUrl}/tutorial/networking`,
    `${baseUrl}/tutorial/recipes`,
    `${baseUrl}/tutorial/commands`,
    `${baseUrl}/tutorial/events`,
    `${baseUrl}/tutorial/keybinds`,
    `${baseUrl}/tutorial/sounds`,
    `${baseUrl}/tutorial/screen`,
    `${baseUrl}/tutorial/enchantments`,
    `${baseUrl}/documentation/fabric_mod_json`,
    `${baseUrl}/documentation/entrypoint`,
  ];
}

/**
 * Detect Minecraft version from URL or content
 *
 * The URL is checked first for a `/x.y/` or `/x.y.z/` path segment; otherwise
 * `content` is searched for a version following "Minecraft", "MC", "version"
 * or "for" (case-insensitive, tried in that order).
 *
 * @returns The detected version, or the default `'1.21.10'` when none is found.
 */
export function detectMinecraftVersion(url: string, content: string): string | undefined {
  // Check URL for version patterns
  const urlVersion = parseVersionFromUrl(url);
  if (urlVersion) {
    return urlVersion;
  }

  // Check content for version mentions
  const contentPatterns = [
    /Minecraft\s+(\d+\.\d+(?:\.\d+)?)/i,
    /MC\s+(\d+\.\d+(?:\.\d+)?)/i,
    /version\s+(\d+\.\d+(?:\.\d+)?)/i,
    /for\s+(\d+\.\d+(?:\.\d+)?)/i,
  ];

  for (const pattern of contentPatterns) {
    const match = content.match(pattern);
    if (match) {
      return match[1];
    }
  }

  // Default to latest stable version if not detected
  return DEFAULT_MINECRAFT_VERSION;
}

/**
 * Get version-specific documentation
 */
export interface VersionInfo {
  version: string;
  releaseDate?: Date;
  isLatest: boolean;
  isLTS?: boolean;
}

/**
 * Extract version from URLs like /1.20.1/ or /1.21/
 *
 * @returns The first version-shaped path segment, or `undefined` if there is none.
 */
export function parseVersionFromUrl(url: string): string | undefined {
  const match = url.match(VERSION_SEGMENT);
  return match ? match[1] : undefined;
}

/**
 * Compare semantic versions
 *
 * Versions are split on `.` and compared numerically part by part; a missing
 * or non-numeric part counts as 0.
 *
 * @returns `1` if `v1` is greater, `-1` if `v2` is greater, `0` if equal.
 */
export function compareVersions(v1: string, v2: string): number {
  const parts1 = v1.split('.').map(Number);
  const parts2 = v2.split('.').map(Number);
  const length = Math.max(parts1.length, parts2.length);

  for (let i = 0; i < length; i++) {
    const part1 = parts1[i] || 0;
    const part2 = parts2[i] || 0;

    if (part1 > part2) return 1;
    if (part1 < part2) return -1;
  }

  return 0;
}

/**
 * Get latest version from a list
 *
 * The input array is left untouched; sorting happens on a copy.
 *
 * @returns The highest version per `compareVersions`, or `undefined` for an empty list.
 */
export function getLatestVersion(versions: string[]): string | undefined {
  if (versions.length === 0) return undefined;
  return [...versions].sort((a, b) => compareVersions(b, a))[0];
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/OGMatrix/mcmodding-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server