Washington Law MCP Server

by ccchow
rcw-scraper.ts (9.19 kB)
import axios from 'axios';
import * as cheerio from 'cheerio';
import Database from 'better-sqlite3';
import pLimit from 'p-limit';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { RCWSection } from '../types.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

const BASE_URL = 'https://app.leg.wa.gov';
const RCW_URL = `${BASE_URL}/RCW/`;
const DB_PATH = join(__dirname, '../../data/washington-laws.db');

// Rate limiting to be respectful
const limit = pLimit(2); // Max 2 concurrent requests
const DELAY_MS = 500;    // Delay between requests

function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

class RCWScraper {
  private db: Database.Database;
  private insertStmt: Database.Statement;
  private updateProgressStmt: Database.Statement;

  constructor() {
    this.db = new Database(DB_PATH);
    this.insertStmt = this.db.prepare(`
      INSERT OR REPLACE INTO rcw (
        citation, title_num, chapter_num, section_num,
        title_name, chapter_name, section_name,
        full_text, effective_date, last_amended, updated_at
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
    `);
    this.updateProgressStmt = this.db.prepare(`
      INSERT OR REPLACE INTO scraper_progress
        (type, title_num, chapter_num, status, error_message, updated_at)
      VALUES ('RCW', ?, ?, ?, ?, CURRENT_TIMESTAMP)
    `);
  }

  async scrapeAllTitles(): Promise<void> {
    console.log('Starting RCW scraper...');
    console.log('Fetching list of titles...');

    try {
      const response = await axios.get(RCW_URL);
      const $ = cheerio.load(response.data);

      // Find all title links
      const titleLinks: { num: string; name: string; url: string }[] = [];
      $('a').each((_, element) => {
        const href = $(element).attr('href');
        const text = $(element).text().trim();

        // Match title links like "default.aspx?cite=1" or "default.aspx?cite=1A"
        if (href && /default\.aspx\?cite=\d+[A-Z]?$/i.test(href)) {
          const match = href.match(/cite=(\d+[A-Z]?)$/i);
          if (match) {
            titleLinks.push({
              num: match[1],
              name: text,
              url: `${RCW_URL}${href}`
            });
          }
        }
      });

      console.log(`Found ${titleLinks.length} titles to scrape`);

      // Process each title
      for (const title of titleLinks) {
        console.log(`\nProcessing Title ${title.num}: ${title.name}`);
        await this.scrapeTitle(title.num, title.name, title.url);
        await delay(DELAY_MS);
      }

      console.log('\nRCW scraping completed!');
      this.printStats();
    } catch (error) {
      console.error('Error scraping titles:', error);
      throw error;
    } finally {
      this.db.close();
    }
  }

  async scrapeTitle(titleNum: string, titleName: string, titleUrl: string): Promise<void> {
    try {
      const response = await axios.get(titleUrl);
      const $ = cheerio.load(response.data);

      // Find all chapter links within this title
      const chapterLinks: { num: string; name: string; url: string }[] = [];
      $('a').each((_, element) => {
        const href = $(element).attr('href');
        const text = $(element).text().trim();

        // Skip PDF links
        if (href && href.includes('pdf=true')) return;

        // Match chapter links like "default.aspx?cite=1.01" or absolute URLs
        const chapterPattern = new RegExp(`cite=${titleNum}\\.\\d+(?:\\.\\d+)?`, 'i');
        if (href && chapterPattern.test(href)) {
          // Capture the full chapter number, including lettered titles like "28A.150";
          // skip duplicates, since chapters are often linked twice (number and name)
          const match = href.match(/cite=(\d+[A-Z]?\.\d+)/i);
          if (match && !chapterLinks.find(c => c.num === match[1])) {
            // Handle both relative and absolute URLs
            const url = href.startsWith('http') ? href : `${RCW_URL}${href}`;
            chapterLinks.push({ num: match[1], name: text, url: url });
          }
        }
      });

      console.log(`  Found ${chapterLinks.length} chapters in Title ${titleNum}`);

      // Process chapters with rate limiting
      const chapterPromises = chapterLinks.map(chapter =>
        limit(async () => {
          await this.scrapeChapter(titleNum, titleName, chapter.num, chapter.name, chapter.url);
          await delay(DELAY_MS);
        })
      );

      await Promise.all(chapterPromises);
      this.updateProgressStmt.run(titleNum, null, 'completed', null);
    } catch (error) {
      console.error(`Error scraping title ${titleNum}:`, error);
      this.updateProgressStmt.run(titleNum, null, 'error', (error as Error).message);
    }
  }

  async scrapeChapter(
    titleNum: string,
    titleName: string,
    chapterNum: string,
    chapterName: string,
    chapterUrl: string
  ): Promise<void> {
    try {
      const response = await axios.get(chapterUrl);
      const $ = cheerio.load(response.data);

      // Find all section links within this chapter
      const sectionLinks: { num: string; name: string; url: string }[] = [];
      $('a').each((_, element) => {
        const href = $(element).attr('href');
        const text = $(element).text().trim();

        // Skip PDF links
        if (href && href.includes('pdf=true')) return;

        // Match section links like "default.aspx?cite=1.01.010";
        // escape every dot in the chapter number, not just the first
        const sectionPattern = new RegExp(`cite=${chapterNum.replace(/\./g, '\\.')}\\.\\d+`, 'i');
        if (href && sectionPattern.test(href)) {
          // Capture the full section number, including lettered titles like "28A.150.010"
          const match = href.match(/cite=(\d+[A-Z]?\.\d+\.\d+)/i);
          if (match && !sectionLinks.find(s => s.num === match[1])) {
            // Handle both relative and absolute URLs
            const url = href.startsWith('http') ? href : `${RCW_URL}${href}`;
            sectionLinks.push({ num: match[1], name: text, url: url });
          }
        }
      });

      console.log(`  Chapter ${chapterNum}: ${sectionLinks.length} sections`);

      // Process sections with rate limiting
      const sectionPromises = sectionLinks.map(section =>
        limit(async () => {
          await this.scrapeSection(
            titleNum, titleName, chapterNum, chapterName,
            section.num, section.name, section.url
          );
          await delay(DELAY_MS);
        })
      );

      await Promise.all(sectionPromises);
      this.updateProgressStmt.run(null, chapterNum, 'completed', null);
    } catch (error) {
      console.error(`Error scraping chapter ${chapterNum}:`, error);
      this.updateProgressStmt.run(null, chapterNum, 'error', (error as Error).message);
    }
  }

  async scrapeSection(
    titleNum: string,
    titleName: string,
    chapterNum: string,
    chapterName: string,
    sectionNum: string,
    sectionName: string,
    sectionUrl: string
  ): Promise<void> {
    try {
      console.log(`    Scraping section ${sectionNum}...`);
      const response = await axios.get(sectionUrl);
      const $ = cheerio.load(response.data);

      // Remove scripts, styles, and page chrome before extracting text
      $('script, style, nav, .navigation, .breadcrumb, .footer, .header').remove();

      // Extract the full text of the section, collapsing all whitespace
      const fullText = $('body').text()
        .replace(/\s+/g, ' ')
        .trim();

      // Make sure we have content
      if (!fullText || fullText.length < 50) {
        console.warn(`    Warning: Section ${sectionNum} has very little content (${fullText.length} chars)`);
      }

      // Extract the session-law history note (e.g. "[2003 c 53 § 1; ...]") if present
      let effectiveDate: string | undefined;
      const effectiveDateMatch = fullText.match(/\[(\d{4}) c \d+ § \d+(?:; )?([^\]]*)\]/);
      if (effectiveDateMatch) {
        effectiveDate = effectiveDateMatch[0];
      }

      // Save to database
      const result = this.insertStmt.run(
        sectionNum,                  // citation
        titleNum,                    // title_num
        chapterNum,                  // chapter_num
        sectionNum.split('.').pop(), // section_num
        titleName || '',             // title_name
        chapterName || '',           // chapter_name
        sectionName || '',           // section_name
        fullText || '',              // full_text
        effectiveDate || null,       // effective_date
        null                         // last_amended
      );

      if (result.changes > 0) {
        console.log(`    ✓ Saved section ${sectionNum}`);
      } else {
        console.warn(`    Warning: Failed to save section ${sectionNum}`);
      }
    } catch (error) {
      console.error(`    Error scraping section ${sectionNum}:`, error);
    }
  }

  private printStats(): void {
    const stats = this.db.prepare(`SELECT COUNT(*) as count FROM rcw`).get() as any;
    console.log(`\nScraping complete! Total RCW sections: ${stats.count}`);
  }
}

// Run the scraper when this file is executed directly
if (import.meta.url === `file://${process.argv[1]}`) {
  const scraper = new RCWScraper();
  scraper.scrapeAllTitles().catch(console.error);
}
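Once the scraper has run, the resulting SQLite database can be queried directly with better-sqlite3. The sketch below is illustrative only: it assumes the schema created by the prepared statements above and the default data/washington-laws.db path, and the lookupSection and failedChapters helpers are hypothetical, not part of this repository.

import Database from 'better-sqlite3';
import { join } from 'path';

// Open the database produced by the scraper (read-only is enough for lookups)
const db = new Database(join('data', 'washington-laws.db'), { readonly: true });

// Hypothetical helper: look up one RCW section by citation, e.g. "1.01.010"
function lookupSection(citation: string) {
  return db.prepare(`
    SELECT citation, title_name, chapter_name, section_name, full_text, effective_date
    FROM rcw
    WHERE citation = ?
  `).get(citation);
}

// Hypothetical helper: list chapters whose last scrape attempt failed
function failedChapters() {
  return db.prepare(`
    SELECT chapter_num, error_message, updated_at
    FROM scraper_progress
    WHERE type = 'RCW' AND status = 'error'
  `).all();
}

console.log(lookupSection('1.01.010'));
console.log(failedChapters());

Because the scraper records per-chapter status in scraper_progress, a query like failedChapters identifies which chapters to retry after a partial run.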
