Skip to main content
Glama
wol-scraper.js (14.6 kB)
/**
 * WOL (Watchtower Online Library) Web Scraper
 * Fetches and parses Bible study content from wol.jw.org
 * Ported from Python scraper in wol-api
 */

import fetch from 'node-fetch';
import * as cheerio from 'cheerio';
import { getBookName } from './bible-books.js';

/** Origin prepended to root-relative links found in scraped pages. */
const WOL_ORIGIN = 'https://wol.jw.org';

/** Maximum time (ms) allowed for a chapter request, including reading the body. */
const REQUEST_TIMEOUT_MS = 30000;

/**
 * Parse a base-10 integer.
 * @param {string|number} value - Value to parse
 * @returns {number} Parsed integer, or NaN when the value has no leading digits
 */
const toInt = (value) => Number.parseInt(value, 10);

/**
 * Turn a root-relative href into an absolute wol.jw.org URL.
 * Any href that does not start with "/" is returned unchanged.
 * @param {string} href - Raw href attribute value
 * @returns {string} URL to store on the article
 */
const toAbsoluteUrl = (href) => (href.startsWith('/') ? `${WOL_ORIGIN}${href}` : href);

/**
 * Classify an article by keywords in its URL or title.
 * @param {string} url - Article URL
 * @param {string} title - Article title
 * @returns {string} One of 'watchtower', 'awake', 'research', 'other'
 */
const classifyArticle = (url, title) => {
  const lowerUrl = url.toLowerCase();
  const lowerTitle = title.toLowerCase();

  if (lowerUrl.includes('watchtower') || lowerTitle.includes('watchtower')) {
    return 'watchtower';
  }
  if (lowerUrl.includes('awake') || lowerTitle.includes('awake')) {
    return 'awake';
  }
  if (lowerUrl.includes('/pc/') || lowerUrl.includes('/d/')) {
    return 'research';
  }
  return 'other';
};

/**
 * Convert anchor elements into article objects, skipping anchors that have
 * an empty href or empty text.
 * @param {CheerioAPI} $ - Cheerio instance
 * @param {Array} anchors - Anchor elements to convert
 * @param {Function} makeTitle - Maps the trimmed anchor text to the article title
 * @returns {Array} Article objects ({ title, url, type })
 */
const anchorsToArticles = ($, anchors, makeTitle) => {
  const articles = [];

  for (const anchor of anchors) {
    const href = $(anchor).attr('href') || '';
    const text = $(anchor).text().trim();
    if (!href || !text) continue;

    const url = toAbsoluteUrl(href);
    const title = makeTitle(text);
    articles.push({ title, url, type: classifyArticle(url, title) });
  }

  return articles;
};

/**
 * WOL Scraper class for extracting Bible study content
 */
export class WOLScraper {
  constructor() {
    // User-Agent to avoid blocking
    this.headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    };
  }

  /**
   * Extract all content for a chapter
   * @param {number} bookNum - Bible book number (1-66)
   * @param {number} chapterNum - Chapter number
   * @returns {Promise<Object>} Chapter content with verses, notes, and study data
   * @throws {Error} When the request fails, times out, or returns a non-2xx status;
   *   the underlying error is attached as `cause`
   */
  async extractChapterContent(bookNum, chapterNum) {
    const url = `https://wol.jw.org/en/wol/b/r1/lp-e/nwtsty/${bookNum}/${chapterNum}#study=discover`;

    // Enforce the timeout through an abort signal rather than a `timeout`
    // fetch option, so the limit applies regardless of node-fetch version.
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);

    try {
      const response = await fetch(url, {
        headers: this.headers,
        signal: controller.signal
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const html = await response.text();
      const $ = cheerio.load(html);

      // Extract chapter-level study data (only present when the page has #studyDiscover)
      const studyDiscover = $('#studyDiscover');
      let chapterStudyData = null;

      if (studyDiscover.length > 0) {
        chapterStudyData = {
          book_num: bookNum,
          chapter_num: chapterNum,
          outline: this.extractOutline($, studyDiscover),
          study_articles: this.extractResearchGuideArticles($, studyDiscover),
          cross_references: this.extractCrossReferences($, studyDiscover)
        };
      }

      // Extract verse-specific study notes
      const verseStudyNotes = this.extractVerseStudyNotes($, bookNum, chapterNum);

      // Extract all verses from the chapter
      const verses = this.extractVerses($, bookNum, chapterNum);

      return {
        chapter_study_data: chapterStudyData,
        verse_study_notes: verseStudyNotes,
        verses: verses
      };
    } catch (error) {
      const reason = error.name === 'AbortError'
        ? `Request timed out after ${REQUEST_TIMEOUT_MS}ms`
        : error.message;
      throw new Error(
        `Failed to extract content for ${bookNum}:${chapterNum} - ${reason}`,
        { cause: error }
      );
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Extract all verses from the chapter
   * @param {CheerioAPI} $ - Cheerio instance
   * @param {number} bookNum - Book number
   * @param {number} chapterNum - Chapter number
   * @returns {Array} Array of verse objects
   */
  extractVerses($, bookNum, chapterNum) {
    const verses = [];

    // The book name is the same for every verse on the page, so resolve it once.
    const bookName = this.extractBookName($, bookNum);

    // Find all verse elements
    $('span.v').each((i, elem) => {
      try {
        const verseId = $(elem).attr('id');
        if (!verseId) return;

        // Parse verse ID (format: v{book}-{chapter}-{verse})
        const parts = verseId.replace('v', '').split('-');
        if (parts.length < 3) return;

        const vBook = toInt(parts[0]);
        const vChapter = toInt(parts[1]);
        const vVerse = toInt(parts[2]);

        if (vBook !== bookNum || vChapter !== chapterNum) return;

        verses.push({
          book_num: bookNum,
          book_name: bookName,
          chapter: chapterNum,
          verse_num: vVerse,
          verse_text: $(elem).text().trim()
        });
      } catch (error) {
        console.error(`Error parsing verse: ${error.message}`);
      }
    });

    return verses;
  }

  /**
   * Extract book name from the page
   * @param {CheerioAPI} $ - Cheerio instance
   * @param {number} bookNum - Book number (fallback)
   * @returns {string} Book name
   */
  extractBookName($, bookNum) {
    // Trim first: surrounding whitespace in <title> would otherwise yield an
    // empty "first word".
    const title = $('title').text().trim();

    if (title.includes('—')) {
      // Book name is taken as the part before the em dash
      const head = title.split('—')[0].trim();
      if (head) return head;
    } else if (title.includes(' ')) {
      // Fallback: take the first word, keeping a leading ordinal so that
      // "1 Corinthians 13" gives "1 Corinthians" rather than "1".
      const match = title.match(/^(?:[1-3]\s+)?\S+/);
      if (match) return match[0];
    }

    // Fallback: use our bible-books mapping
    return getBookName(bookNum) || 'Unknown';
  }

  /**
   * Extract study notes for individual verses
   * @param {CheerioAPI} $ - Cheerio instance
   * @param {number} bookNum - Book number
   * @param {number} chapterNum - Chapter number
   * @returns {Object} Map of verse numbers to study notes
   */
  extractVerseStudyNotes($, bookNum, chapterNum) {
    const verseNotes = {};

    // Find all sections with verse-specific study notes
    $('div.section').each((i, section) => {
      const dataKey = $(section).attr('data-key');
      if (!dataKey) return;

      try {
        // Parse data-key to extract verse number (format: book-chapter-verse)
        const parts = dataKey.split('-');
        if (parts.length < 3) return;

        const sectionBook = toInt(parts[0]);
        const sectionChapter = toInt(parts[1]);
        const sectionVerse = toInt(parts[2]);

        // Only process if it matches our target book/chapter
        if (sectionBook !== bookNum || sectionChapter !== chapterNum) return;

        const studyNotes = [];

        $(section).find('div.studyNoteGroup').each((j, group) => {
          $(group).find('li.item.studyNote').each((k, note) => {
            // A note is the space-joined text of its non-empty paragraphs
            const noteContent = $(note)
              .find('p')
              .toArray()
              .map((p) => $(p).text().trim())
              .filter((pText) => pText);

            if (noteContent.length > 0) {
              studyNotes.push(noteContent.join(' '));
            }
          });
        });

        if (studyNotes.length > 0) {
          verseNotes[sectionVerse] = studyNotes;
        }
      } catch (error) {
        console.error(`Error processing section: ${error.message}`);
      }
    });

    return verseNotes;
  }

  /**
   * Extract chapter outline from study discover section
   * @param {CheerioAPI} $ - Cheerio instance
   * @param {Cheerio} studyDiscover - Study discover element
   * @returns {Array} Array of outline items
   */
  extractOutline($, studyDiscover) {
    const outline = [];

    // Look for outline structure
    studyDiscover.find('div.outline').each((i, section) => {
      $(section).find('li').each((j, item) => {
        const text = $(item).text().trim();
        if (text) {
          outline.push(text);
        }
      });
    });

    return outline;
  }

  /**
   * Extract research guide articles from li.item.ref-rsg
   * Each "su" (subject) paragraph is combined with the "sk" (sub-item)
   * paragraphs that directly follow it: every sk link becomes an article
   * titled "<su label> <sk text>". A su paragraph with no usable sk links
   * contributes its own links instead.
   * @param {CheerioAPI} $ - Cheerio instance
   * @param {Cheerio} studyDiscover - Study discover element
   * @returns {Array} Array of article objects
   */
  extractResearchGuideArticles($, studyDiscover) {
    const articles = [];

    // NOTE(review): this is a substring test on the class attribute, so any
    // class merely containing "su"/"sk" also matches — confirm whether a
    // whole-token match was intended.
    const classContains = (element, fragment) =>
      ($(element).attr('class') || '').toLowerCase().includes(fragment);

    // Find all li.item.ref-rsg elements
    studyDiscover.find('li.item.ref-rsg').each((itemIndex, item) => {
      const paragraphs = $(item).find('p').toArray();
      let i = 0;

      while (i < paragraphs.length) {
        const paragraph = paragraphs[i];
        const suAnchors = classContains(paragraph, 'su')
          ? $(paragraph).find('a[href]').toArray()
          : [];

        if (suAnchors.length > 0) {
          // Get the su label (combine all anchor texts)
          const suLabel = suAnchors
            .map((a) => $(a).text().trim())
            .filter((text) => text)
            .join(' ');

          // Collect links from the run of "sk" paragraphs that follows
          const subArticles = [];
          let j = i + 1;
          while (j < paragraphs.length && classContains(paragraphs[j], 'sk')) {
            const skAnchors = $(paragraphs[j]).find('a[href]').toArray();
            subArticles.push(
              ...anchorsToArticles($, skAnchors, (text) => `${suLabel} ${text}`.trim())
            );
            j++;
          }

          if (subArticles.length > 0) {
            articles.push(...subArticles);
            // Skip the sk paragraphs that were just consumed
            i = j - 1;
          } else {
            // No sk items were found: use the su anchors directly
            articles.push(...anchorsToArticles($, suAnchors, (text) => text));
          }
        }

        i++;
      }
    });

    return articles;
  }

  /**
   * Extract cross references
   * @param {CheerioAPI} $ - Cheerio instance
   * @param {Cheerio} studyDiscover - Study discover element
   * @returns {Array} Array of cross reference strings
   */
  extractCrossReferences($, studyDiscover) {
    const references = [];

    // Look for cross reference sections
    studyDiscover.find('div.crossRefs').each((i, section) => {
      $(section).find('a').each((j, item) => {
        const refText = $(item).text().trim();
        if (refText) {
          references.push(refText);
        }
      });
    });

    return references;
  }

  /**
   * Get a single verse text (plain text only)
   * @param {number} bookNum - Book number
   * @param {number} chapterNum - Chapter number
   * @param {number} verseNum - Verse number (a numeric string is also accepted)
   * @returns {Promise<Object>} Verse object with text
   * @throws {Error} When the chapter cannot be fetched or the verse is not on the page
   */
  async getSingleVerse(bookNum, chapterNum, verseNum) {
    // verse_num values are integers, so a string such as "14" must be
    // converted before the strict comparison below.
    const target = typeof verseNum === 'string' ? toInt(verseNum) : verseNum;

    const content = await this.extractChapterContent(bookNum, chapterNum);
    const verse = content.verses.find((v) => v.verse_num === target);

    if (!verse) {
      throw new Error(`Verse ${bookNum}:${chapterNum}:${verseNum} not found`);
    }

    return verse;
  }

  /**
   * Get verses with study content
   * @param {number} bookNum - Book number
   * @param {number} chapterNum - Chapter number
   * @param {string|number} verseInput - Verse number or range (e.g., "14" or "14-16")
   * @param {Object} options - Options for field selection and limits
   * @param {string[]} [options.fields] - Any of 'verses', 'combined_text',
   *   'study_notes', 'study_articles', 'cross_references', 'chapter_level'
   * @param {number|null} [options.limit] - Maximum number of entries in `study_articles`
   * @returns {Promise<Object>} Complete study content
   * @throws {Error} When verseInput cannot be parsed or the chapter cannot be fetched
   */
  async getVerseWithStudy(bookNum, chapterNum, verseInput, options = {}) {
    const { fields = ['verses', 'study_notes'], limit = null } = options;

    // Parse verse input (single number or range)
    let verseStart;
    let verseEnd;

    if (typeof verseInput === 'string' && verseInput.includes('-')) {
      const parts = verseInput.split('-');
      verseStart = toInt(parts[0]);
      verseEnd = toInt(parts[1]);
    } else {
      verseStart = toInt(verseInput);
      verseEnd = verseStart;
    }

    // Reject unparsable input before spending a network request on it
    if (Number.isNaN(verseStart) || Number.isNaN(verseEnd)) {
      throw new Error(`Invalid verse reference: ${verseInput}`);
    }

    // Accept ranges written high-to-low ("16-14") instead of matching nothing
    if (verseStart > verseEnd) {
      [verseStart, verseEnd] = [verseEnd, verseStart];
    }

    // Fetch chapter content
    const content = await this.extractChapterContent(bookNum, chapterNum);
    const studyData = content.chapter_study_data;

    // Build result object based on requested fields
    const result = {
      book_num: bookNum,
      book_name: getBookName(bookNum),
      chapter: chapterNum,
      verse_range: verseStart === verseEnd ? `${verseStart}` : `${verseStart}-${verseEnd}`
    };

    // Filter verses in range
    const versesInRange = content.verses.filter(
      (v) => v.verse_num >= verseStart && v.verse_num <= verseEnd
    );

    // Add requested fields
    if (fields.includes('verses')) {
      result.verses = versesInRange;
    }

    if (fields.includes('combined_text')) {
      result.combined_text = versesInRange
        .map((v) => `${v.verse_num}. ${v.verse_text}`)
        .join(' ');
    }

    if (fields.includes('study_notes')) {
      result.study_notes = {};
      for (let v = verseStart; v <= verseEnd; v++) {
        if (content.verse_study_notes[v]) {
          result.study_notes[v] = content.verse_study_notes[v];
        }
      }
    }

    if (fields.includes('study_articles') && studyData) {
      let articles = studyData.study_articles;
      if (limit && articles.length > limit) {
        articles = articles.slice(0, limit);
      }
      result.study_articles = articles;
    }

    if (fields.includes('cross_references') && studyData) {
      result.cross_references = studyData.cross_references;
    }

    if (fields.includes('chapter_level') && studyData) {
      // Chapter-level data is returned whole; `limit` is not applied here
      result.chapter_level = {
        outline: studyData.outline,
        study_articles: studyData.study_articles,
        cross_references: studyData.cross_references
      };
    }

    return result;
  }
}

/**
 * Create a singleton instance for convenience
 */
export const scraper = new WOLScraper();

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/advenimus/jw-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.