import axios from 'axios';
import * as cheerio from 'cheerio';
import Database from 'better-sqlite3';
import pLimit from 'p-limit';
import { join, dirname } from 'path';
import { fileURLToPath, pathToFileURL } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const BASE_URL = 'https://app.leg.wa.gov';
const RCW_URL = `${BASE_URL}/RCW/`;
const DB_PATH = join(__dirname, '../../data/washington-laws.db');
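// NOTE: this scraper assumes the two tables below already exist (created by a
// separate migration/setup script). This sketch reflects only the columns the
// prepared statements use; adjust it to match the project's actual schema.
//
//   CREATE TABLE IF NOT EXISTS rcw (
//     citation TEXT PRIMARY KEY,
//     title_num TEXT, chapter_num TEXT, section_num TEXT,
//     title_name TEXT, chapter_name TEXT, section_name TEXT,
//     full_text TEXT, effective_date TEXT, last_amended TEXT,
//     updated_at TEXT
//   );
//   CREATE TABLE IF NOT EXISTS scraper_progress (
//     type TEXT, title_num TEXT, chapter_num TEXT,
//     status TEXT, error_message TEXT, updated_at TEXT,
//     PRIMARY KEY (type, title_num, chapter_num)
//   );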
// Rate limiting to be respectful. Chapters and sections need separate
// limiters: sharing a single p-limit instance across nested levels would
// deadlock, because chapter tasks hold every slot while awaiting their own
// queued section tasks.
const chapterLimit = pLimit(2); // Max 2 concurrent chapter scrapes
const sectionLimit = pLimit(2); // Max 2 concurrent section requests
const DELAY_MS = 500; // Delay between requests
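// Shared HTTP client. The 30s timeout and the User-Agent string are
// illustrative hardening choices, not values required by leg.wa.gov; without
// a timeout, a single stalled connection can hang the whole run.
const http = axios.create({
  timeout: 30_000,
  headers: { 'User-Agent': 'rcw-scraper (polite, rate-limited research use)' }
});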
function delay(ms: number): Promise<void> {
return new Promise(resolve => setTimeout(resolve, ms));
}
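/**
 * Scrapes the Revised Code of Washington (RCW) from app.leg.wa.gov into a
 * local SQLite database, walking the hierarchy titles → chapters → sections
 * and recording per-title and per-chapter status in scraper_progress.
 */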
class RCWScraper {
private db: Database.Database;
private insertStmt: Database.Statement;
private updateProgressStmt: Database.Statement;
constructor() {
this.db = new Database(DB_PATH);
this.insertStmt = this.db.prepare(`
INSERT OR REPLACE INTO rcw (
citation, title_num, chapter_num, section_num,
title_name, chapter_name, section_name, full_text,
effective_date, last_amended, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
`);
this.updateProgressStmt = this.db.prepare(`
INSERT OR REPLACE INTO scraper_progress (type, title_num, chapter_num, status, error_message, updated_at)
VALUES ('RCW', ?, ?, ?, ?, CURRENT_TIMESTAMP)
`);
}
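  /**
   * Entry point: fetches the RCW index, collects every title link, and
   * scrapes the titles sequentially. Closes the database when done or on error.
   */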
async scrapeAllTitles(): Promise<void> {
console.log('Starting RCW scraper...');
console.log('Fetching list of titles...');
try {
      const response = await http.get(RCW_URL);
const $ = cheerio.load(response.data);
// Find all title links
const titleLinks: { num: string; name: string; url: string }[] = [];
$('a').each((_, element) => {
const href = $(element).attr('href');
const text = $(element).text().trim();
        // Match title links like "default.aspx?cite=1" or "default.aspx?cite=1A"
        if (href && /default\.aspx\?cite=\d+[A-Z]?$/i.test(href)) {
          const match = href.match(/cite=(\d+[A-Z]?)$/i);
          // Dedupe in case the index links the same title more than once
          if (match && !titleLinks.some(t => t.num === match[1])) {
            titleLinks.push({
              num: match[1],
              name: text,
              // Handle both relative and absolute URLs
              url: href.startsWith('http') ? href : `${RCW_URL}${href}`
            });
          }
        }
});
console.log(`Found ${titleLinks.length} titles to scrape`);
// Process each title
for (const title of titleLinks) {
console.log(`\nProcessing Title ${title.num}: ${title.name}`);
await this.scrapeTitle(title.num, title.name, title.url);
await delay(DELAY_MS);
}
console.log('\nRCW scraping completed!');
this.printStats();
} catch (error) {
console.error('Error scraping titles:', error);
throw error;
} finally {
this.db.close();
}
}
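  /**
   * Scrapes one title page: collects its chapter links, then scrapes the
   * chapters concurrently (bounded by chapterLimit) and records progress.
   */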
async scrapeTitle(titleNum: string, titleName: string, titleUrl: string): Promise<void> {
try {
      const response = await http.get(titleUrl);
const $ = cheerio.load(response.data);
// Find all chapter links within this title
const chapterLinks: { num: string; name: string; url: string }[] = [];
$('a').each((_, element) => {
const href = $(element).attr('href');
const text = $(element).text().trim();
        // Skip anchors without an href and PDF links
        if (!href || href.includes('pdf=true')) return;
        // Match chapter links like "default.aspx?cite=1.01"; capture the whole
        // cite so lettered titles (28A, 46A, ...) keep their letter
        const chapterPattern = new RegExp(`cite=(${titleNum}\\.\\d+)`, 'i');
        const match = href.match(chapterPattern);
        if (match && !chapterLinks.some(c => c.num === match[1])) {
          // Handle both relative and absolute URLs
          const url = href.startsWith('http') ? href : `${RCW_URL}${href}`;
          chapterLinks.push({
            num: match[1],
            name: text,
            url
          });
        }
});
console.log(` Found ${chapterLinks.length} chapters in Title ${titleNum}`);
// Process chapters with rate limiting
const chapterPromises = chapterLinks.map(chapter =>
        chapterLimit(async () => {
await this.scrapeChapter(titleNum, titleName, chapter.num, chapter.name, chapter.url);
await delay(DELAY_MS);
})
);
await Promise.all(chapterPromises);
this.updateProgressStmt.run(titleNum, null, 'completed', null);
} catch (error) {
console.error(`Error scraping title ${titleNum}:`, error);
this.updateProgressStmt.run(titleNum, null, 'error', (error as Error).message);
}
}
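  /**
   * Scrapes one chapter page: collects its section links, then scrapes the
   * sections concurrently (bounded by sectionLimit) and records progress.
   */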
async scrapeChapter(
titleNum: string,
titleName: string,
chapterNum: string,
chapterName: string,
chapterUrl: string
): Promise<void> {
try {
      const response = await http.get(chapterUrl);
const $ = cheerio.load(response.data);
// Find all section links within this chapter
const sectionLinks: { num: string; name: string; url: string }[] = [];
$('a').each((_, element) => {
const href = $(element).attr('href');
const text = $(element).text().trim();
        // Skip anchors without an href and PDF links
        if (!href || href.includes('pdf=true')) return;
        // Match section links like "default.aspx?cite=1.01.010"; escape every
        // dot in the chapter number and capture the whole cite so lettered
        // titles keep their letter
        const sectionPattern = new RegExp(`cite=(${chapterNum.replace(/\./g, '\\.')}\\.\\d+)`, 'i');
        const match = href.match(sectionPattern);
        if (match && !sectionLinks.some(s => s.num === match[1])) {
          // Handle both relative and absolute URLs
          const url = href.startsWith('http') ? href : `${RCW_URL}${href}`;
          sectionLinks.push({
            num: match[1],
            name: text,
            url
          });
        }
});
console.log(` Chapter ${chapterNum}: ${sectionLinks.length} sections`);
// Process sections with rate limiting
const sectionPromises = sectionLinks.map(section =>
        sectionLimit(async () => {
await this.scrapeSection(
titleNum, titleName,
chapterNum, chapterName,
section.num, section.name, section.url
);
await delay(DELAY_MS);
})
);
await Promise.all(sectionPromises);
      // Record the owning title too; with a NULL title_num, SQLite treats
      // every progress row as distinct and INSERT OR REPLACE never replaces
      this.updateProgressStmt.run(titleNum, chapterNum, 'completed', null);
    } catch (error) {
      console.error(`Error scraping chapter ${chapterNum}:`, error);
      this.updateProgressStmt.run(titleNum, chapterNum, 'error', (error as Error).message);
}
}
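  /**
   * Scrapes a single section page, extracts its text and statutory history
   * note, and upserts the row into the rcw table.
   */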
async scrapeSection(
titleNum: string,
titleName: string,
chapterNum: string,
chapterName: string,
sectionNum: string,
sectionName: string,
sectionUrl: string
): Promise<void> {
try {
console.log(` Scraping section ${sectionNum}...`);
      const response = await http.get(sectionUrl);
const $ = cheerio.load(response.data);
      // Extract the full text of the section
      // Strip scripts, styles, and page chrome first
      $('script, style, nav, .navigation, .breadcrumb, .footer, .header').remove();
      // Collapse runs of spaces/tabs but keep newlines so paragraph breaks
      // survive, then cap consecutive blank lines; collapsing all whitespace
      // first would leave the newline replace with nothing to match
      const fullText = $('body').text()
        .replace(/[ \t]+/g, ' ')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
// Make sure we have content
if (!fullText || fullText.length < 50) {
console.warn(` Warning: Section ${sectionNum} has very little content (${fullText.length} chars)`);
}
      // Capture the statutory history note (e.g. "[2011 c 336 § 1; ...]"),
      // which is stored whole in the effective_date column
      let effectiveDate: string | undefined;
      const historyMatch = fullText.match(/\[(\d{4}) c \d+ § \d+(?:; )?([^\]]*)\]/);
      if (historyMatch) {
        effectiveDate = historyMatch[0];
      }
      // Save to database; better-sqlite3 throws on failure, so reaching the
      // log below means the row was written
      this.insertStmt.run(
        sectionNum,                          // citation
        titleNum,                            // title_num
        chapterNum,                          // chapter_num
        sectionNum.split('.').pop() ?? null, // section_num
        titleName || '',                     // title_name
        chapterName || '',                   // chapter_name
        sectionName || '',                   // section_name
        fullText || '',                      // full_text
        effectiveDate || null,               // effective_date
        null                                 // last_amended
      );
      console.log(`      ✓ Saved section ${sectionNum}`);
} catch (error) {
console.error(` Error scraping section ${sectionNum}:`, error);
}
}
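  /** Logs the total number of RCW sections currently stored in the rcw table. */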
  private printStats(): void {
    const stats = this.db
      .prepare('SELECT COUNT(*) AS count FROM rcw')
      .get() as { count: number };
    console.log(`\nScraping complete! Total RCW sections: ${stats.count}`);
  }
}
// Run the scraper only when this file is executed directly; pathToFileURL
// handles Windows paths and special characters that a hand-built
// `file://` string does not
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  const scraper = new RCWScraper();
  scraper.scrapeAllTitles().catch(error => {
    console.error(error);
    process.exitCode = 1;
  });
}