Skip to main content
Glama
utils.ts13.2 kB
import * as cheerio from "cheerio"; import wolLanguages from "./wol_languages.json"; import type { Document, SearchOptions, SearchResponse, SearchResult, WOLEndpoints, WOLError, } from "./types"; export const WOL_ENDPOINTS: WOLEndpoints = { SEARCH: "https://wol.jw.org/{lang}/wol/s/{rev}/lp-{lp}", DOCUMENT: "https://wol.jw.org/{lang}/wol/d/{rev}/lp-{lp}/{docId}", LIBRARY: "https://wol.jw.org/{lang}/wol/li/{rev}/lp-{lp}", HOME: "https://wol.jw.org/{lang}/wol/h/{rev}/lp-{lp}", }; function normalizeIsoLanguageCode(language: string | undefined): string { const input = (language || "en").toLowerCase(); const primary = input.split(/[-_]/)[0]; return primary || "en"; } function resolveWolConfig(language: string | undefined): { lp: string; rev: string; } { const norm = normalizeIsoLanguageCode(language); const mapping = (wolLanguages as any).mapping as Record<string, any>; const entry = mapping ? mapping[norm] : undefined; const lp: string = (entry && entry.lp) || (norm === "en" ? "e" : norm); const rev: string = (entry && entry.rsconf) || "r1"; return { lp, rev }; } export class URLBuilder { static buildSearchURL(query: string, options: SearchOptions = {}): string { const { scope = "par", publications = [], language = "en", sort = "occ", page = 1, } = options; const { lp, rev } = resolveWolConfig(language); const baseUrl = WOL_ENDPOINTS.SEARCH.replace("{lang}", language) .replace("{rev}", rev) .replace("{lp}", lp); const params = new URLSearchParams(); // Add search query params.append("q", query); params.append("p", scope); // Map sort to WOL values const sortParam = sort === "newest" ? "newest" : sort === "oldest" ? "oldest" : "occ"; params.append("r", sortParam); params.append("st", "a"); // Add publication filters publications.forEach((pub) => { params.append("fc[]", pub); }); // Add pagination if (page > 1) { params.append("pg", page.toString()); } return `${baseUrl}?${params.toString()}`; } static buildDocumentURL(documentId: string, language: string = "en"): string { const { lp, rev } = resolveWolConfig(language); return WOL_ENDPOINTS.DOCUMENT.replace("{lang}", language) .replace("{rev}", rev) .replace("{lp}", lp) .replace("{docId}", documentId); } static validateAndNormalizeDocumentURL(urlString: string): string { let parsed: URL; try { parsed = new URL(urlString); } catch { throw new Error("Invalid URL"); } const host = parsed.hostname.toLowerCase(); if (!(host.endsWith("jw.org") && host.includes("wol"))) { throw new Error("URL must be a jw.org WOL document"); } if (!/\/wol\/d\//i.test(parsed.pathname)) { throw new Error("URL must point to a WOL document (contains /wol/d/)"); } const allowed = new Set(["q", "p"]); const clean = new URLSearchParams(); parsed.searchParams.forEach((v, k) => { if (allowed.has(k)) clean.append(k, v); }); parsed.search = clean.toString() ? `?${clean.toString()}` : ""; return parsed.toString(); } } export class SearchOperatorParser { // Convert WOL search operators to URL-safe format static encodeOperators(query: string): string { return query .replace(/&/g, "%26") .replace(/\+/g, "%2B") .replace(/\|/g, "%7C") .replace(/\//g, "%2F") .replace(/\^/g, "%5E") .replace(/%/g, "%25") .replace(/!/g, "%21") .replace(/"/g, "%22") .replace(/\*/g, "%2A") .replace(/\?/g, "%3F") .replace(/#/g, "%23") .replace(/\\/g, "%5C") .replace(/\(/g, "%28") .replace(/\)/g, "%29"); } // Validate search operators syntax static validateOperators(query: string): boolean { // Allow unicode letters/numbers so non-English queries pass validation const validOperators = /^[\p{L}\p{N}\s&+|\/\^%!"\*\?#\\()_\-\.:]+$/u; return validOperators.test(query); } // Parse complex search queries } export class ContentParser { static parseSearchResults(html: string): SearchResponse { const $ = cheerio.load(html); const results: SearchResult[] = []; // Key publications $("ul.resultContentTopic").each((_, ul) => { const $ul = $(ul); const $captionLink = $ul.find("li.caption a").first(); if ($captionLink.length === 0) return; const href = $captionLink.attr("href") || ""; const url = href.startsWith("http") ? href : `https://wol.jw.org${href}`; const title = ($captionLink.text() || "").trim(); const publication = ( $ul.find("li.ref").first().text() || "Key Publication" ).trim(); const subheadings: string[] = []; $ul.find("li.resultSubheadings ul a").each((__, a) => { const t = ($(a).text() || "").trim(); if (t) subheadings.push(t); }); const docIdMatch = url.match(/\/d\/r\d\/lp-[a-z0-9-]+\/(\d+)/i); const documentId = docIdMatch ? docIdMatch[1] : undefined; results.push({ title, url, snippet: `Key publication about ${title}`, publication, documentId, resultType: "key_publication", subheadings: subheadings.length ? subheadings : undefined, }); }); // Document results $("ul.resultContentDocument").each((_, ul) => { const $ul = $(ul); const $caption = $ul.find("li.caption").first(); const $link = $caption.find("a").first(); if ($link.length === 0) return; const href = $link.attr("href") || ""; const url = href.startsWith("http") ? href : `https://wol.jw.org${href}`; const title = ($link.text() || "").trim(); const countText = ($caption.find("span.count").text() || "").trim(); const occMatch = countText.match(/([\d][\d.,]*)/); const occurrences = occMatch ? parseInt(occMatch[1].replace(/[.,]/g, ""), 10) : undefined; const publication = ( $ul.find("li.ref").first().text() || "JW Publication" ).trim(); const contextSnippets: string[] = []; $ul.find("div.document p").each((__, p) => { const txt = ($(p).text() || "").trim(); if (txt && txt.length > 10) contextSnippets.push( txt.substring(0, 150) + (txt.length > 150 ? "..." : ""), ); }); results.push({ title, url, snippet: contextSnippets[0] || `Document in ${publication}`, publication, occurrences, resultType: "document_result", contextSnippets: contextSnippets.length ? contextSnippets : undefined, }); }); // Pagination and total const countContainer = $("#resultsCount").first().text(); const totalMatch = countContainer.match(/([\d][\d.,]*)/); const totalResults = totalMatch ? parseInt(totalMatch[1].replace(/[.,]/g, ""), 10) : 0; const pageSize = 40; const totalPages = totalResults > 0 ? Math.ceil(totalResults / pageSize) : 1; // Current page: selected navigation or default 1 let currentPage = 1; const selected = $(".resultsNavigationSelected .navContent").first().text(); if (selected && /\d+/.test(selected)) currentPage = parseInt(selected, 10); return { results, pagination: { totalResults, pageSize, totalPages, currentPage }, }; } static parseDocument(html: string, documentUrl: string): Document { try { const $ = cheerio.load(html); // Title const rawTitle = $("title").first().text() || ""; const title = rawTitle.trim() || "Untitled Document"; // Select a sensible content root const selectRoot = (): cheerio.Cheerio<any> => { const candidates = [ "article#article", "article.article", "div.article", "#content", "main", "body", ]; for (const sel of candidates) { const node = $(sel).first(); if (node && node.length > 0 && node.children().length > 0) return node; } return $("body"); }; const $root = selectRoot(); // Normalize href/src to absolute const origin = new URL(documentUrl).origin; $root.find("a[href]").each((_: number, el: any) => { const $el = $(el); const href = $el.attr("href"); if (!href) return; if (href.startsWith("/")) $el.attr("href", `${origin}${href}`); }); // Unwrap Bible citation links (/wol/bc...) to plain text (same across languages) $root.find('a[href*="/wol/bc"]').each((_: number, el: any) => { const $el = $(el); const text = ($el.text() || "").trim(); $el.replaceWith(text); }); $root.find("img").each((_: number, el: any) => { const $img = $(el); const src = $img.attr("src"); const dataSrc = $img.attr("data-src"); const small = $img.attr("data-img-small-src"); const chosen = src || dataSrc || small; if (!chosen) return; const absolute = chosen.startsWith("/") ? `${origin}${chosen}` : chosen; $img.attr("src", absolute); // Replace the image element with a clear textual label so LLMs know it's an image link const altText = ($img.attr("alt") || "").trim(); const label = altText ? `Image: ${absolute} — ${altText}` : `Image: ${absolute}`; $img.replaceWith(`<p>${label}</p>`); }); const contentHTML = ($root.html() || "").trim(); // Publication and article names via navigation bar if available let publication = "Unknown Publication"; const $navTitle = $("#resultNavigation .resultNavTitle ul").first(); if ($navTitle && $navTitle.length > 0) { const articleName = $navTitle .find("li.resultsNavigationSelected .navContent") .first() .text() .trim(); const pubName = $navTitle .find("li.resultDocumentPubTitle .navContent") .first() .text() .trim(); if (pubName) publication = pubName; if (!publication) { // fallback breadcrumbs const $crumb = $( "ul.breadcrumbs, nav.breadcrumbs, .breadcrumbs", ).first(); if ($crumb && $crumb.length > 0) { const txt = $crumb.text().trim(); if (txt) publication = txt .split("\n") .map((s) => s.trim()) .filter(Boolean) .join(" / "); } } // Note: we show the full article content below; the navigation bar's article // title is informative but we keep the <title> text as the document title to avoid surprises. } else { const $crumb = $( "ul.breadcrumbs, nav.breadcrumbs, .breadcrumbs", ).first(); if ($crumb && $crumb.length > 0) { const txt = $crumb.text().trim(); if (txt) publication = txt .split("\n") .map((s) => s.trim()) .filter(Boolean) .join(" / "); } } // Subheadings const subheadings: Array<{ title: string; url: string }> = []; $("#subheadings ul.documentNavigation li.subheading a").each( (_: number, el: any) => { const $a = $(el); const t = ($a.text() || "").trim(); const href = $a.attr("href") || ""; if (t && href) subheadings.push({ title: t, url: href.startsWith("/") ? `${origin}${href}` : href, }); }, ); // Similar material const similarMaterials: Array<{ title: string; subtitle?: string; url: string; }> = []; $("#similarMaterial ul.results li.simDoc a").each( (_: number, el: any) => { const $a = $(el); const href = $a.attr("href") || ""; const url = href ? href.startsWith("/") ? `${origin}${href}` : href : ""; const title = ( $a.find(".cardTitleBlock .cardLine1").first().text() || "" ).trim(); const subtitle = ( $a.find(".cardTitleBlock .cardLine2").first().text() || "" ).trim() || undefined; if (url && title) similarMaterials.push({ title, subtitle, url }); }, ); // Metadata best effort const metadata: any = {}; const metaDate = $( 'meta[name="date"], meta[property="article:published_time"], meta[name="DC.Date"]', ) .first() .attr("content"); const timeEl = $root.find("time").first(); const timeVal = timeEl.attr("datetime") || timeEl.text(); if (metaDate && metaDate.trim()) metadata.date = metaDate.trim(); else if (timeVal && timeVal.trim()) metadata.date = timeVal.trim(); // Language let language = ($("html").attr("lang") || "").toLowerCase(); if (!language) { const u = new URL(documentUrl); const seg = (u.pathname.split("/")[1] || "").toLowerCase(); if (seg) language = seg; } if (!language) language = "en"; // Document id from URL path (last numeric segment) const u = new URL(documentUrl); const parts = u.pathname.split("/").filter(Boolean); const isDigits = (s: string) => s.length > 0 && Array.from(s).every((c) => c >= "0" && c <= "9"); let documentId = ""; for (let i = parts.length - 1; i >= 0; i--) { if (isDigits(parts[i])) { documentId = parts[i]; break; } } return { id: documentId, title, content: contentHTML, publication, url: "", language, metadata, subheadings: subheadings.length ? subheadings : undefined, similarMaterials: similarMaterials.length ? similarMaterials : undefined, }; } catch (error) { throw new Error(`Failed to parse document: ${error}`); } } } export function createWOLError( code: WOLError["code"], message: string, details?: any, ): WOLError { const error = new Error(message) as WOLError; error.code = code; error.details = details; return error; }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/LeomaiaJr/wol-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server