Skip to main content
Glama
ccchow

Washington Law MCP Server

by ccchow
ralj-scraper.ts8.21 kB
import axios from 'axios'; import * as cheerio from 'cheerio'; import Database from 'better-sqlite3'; import { join, dirname } from 'path'; import { fileURLToPath } from 'url'; import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); const DB_PATH = join(__dirname, '../../data/washington-laws.db'); const BASE_URL = 'https://www.courts.wa.gov'; interface RALJRule { ruleSet: string; ruleNumber: string; ruleName: string; pdfUrl: string; } class RALJScraper { private db: Database.Database; private insertStmt: Database.Statement; constructor() { this.db = new Database(DB_PATH); this.insertStmt = this.db.prepare(` INSERT OR REPLACE INTO court_rules ( rule_set, rule_number, rule_name, full_text, updated_at ) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP) `); } async delay(ms: number): Promise<void> { return new Promise(resolve => setTimeout(resolve, ms)); } async scrapeAll(): Promise<void> { console.log('Starting RALJ (Rules for Appeal of Decisions of Courts of Limited Jurisdiction) scraper...\n'); await this.scrapeRALJ(); this.printStats(); this.db.close(); } async scrapeRALJ(): Promise<void> { console.log('Processing RALJ - Rules for Appeal of Decisions of Courts of Limited Jurisdiction'); try { // Get list of RALJ rules const listUrl = `${BASE_URL}/court_rules/?fa=court_rules.list&group=clj&set=RALJ`; console.log(` Fetching rule list from: ${listUrl}`); const response = await axios.get(listUrl); const $ = cheerio.load(response.data); const rules: RALJRule[] = []; // Find all PDF links for RALJ rules $('a').each((_, element) => { const href = $(element).attr('href'); const text = $(element).text().trim(); // Match PDF links like "../court_rules/pdf/RALJ/CLJ_RALJ_01_01_00.pdf" if (href && href.includes('.pdf') && href.includes('RALJ')) { // Extract rule number from filename (e.g., CLJ_RALJ_01_01_00.pdf -> 1.1) const fileMatch = href.match(/CLJ_RALJ_(\d+)_(\d+)_(\d+)\.pdf/i); if (fileMatch) { // Build rule number (e.g., 1.1 or 1.1a if third part is not 00) let ruleNumber = `${parseInt(fileMatch[1])}.${parseInt(fileMatch[2])}`; // Handle sub-rules if the third part is not 00 if (fileMatch[3] !== '00') { // Some rules might have sub-parts like 1.1(a), but for RALJ they typically don't // We'll handle them as decimal extensions if they exist const subPart = parseInt(fileMatch[3]); if (subPart > 0) { ruleNumber += `.${subPart}`; } } // Build full URL let pdfUrl = href; if (href.startsWith('../')) { pdfUrl = `${BASE_URL}/${href.substring(3)}`; } else if (!href.startsWith('http')) { pdfUrl = `${BASE_URL}${href.startsWith('/') ? '' : '/'}${href}`; } // Extract rule name from text let ruleName = text; // Remove "RALJ X.X" prefix if present ruleName = ruleName.replace(/^RALJ\s+[\d.]+\s*[-–]?\s*/i, '').trim(); if (!ruleName || ruleName === text) { // Try to extract from the full text const nameMatch = text.match(/RALJ\s+[\d.]+\s*[-–]\s*(.+)/i); if (nameMatch) { ruleName = nameMatch[1].trim(); } else { ruleName = `Rule ${ruleNumber}`; } } // Avoid duplicates if (!rules.find(r => r.ruleNumber === ruleNumber)) { rules.push({ ruleSet: 'RALJ', ruleNumber, ruleName, pdfUrl }); } } } }); console.log(` Found ${rules.length} RALJ rules to download and parse`); // Sort rules by rule number for better progress tracking rules.sort((a, b) => { const aParts = a.ruleNumber.split('.'); const bParts = b.ruleNumber.split('.'); for (let i = 0; i < Math.max(aParts.length, bParts.length); i++) { const aNum = parseInt(aParts[i] || '0'); const bNum = parseInt(bParts[i] || '0'); if (aNum !== bNum) return aNum - bNum; } return 0; }); // Download and parse each PDF for (let i = 0; i < rules.length; i++) { const rule = rules[i]; console.log(` Processing RALJ ${rule.ruleNumber}: ${rule.ruleName}...`); try { await this.downloadAndParseRule(rule); } catch (error) { console.error(` Error: ${(error as Error).message}`); } if ((i + 1) % 5 === 0) { console.log(` Processed ${i + 1}/${rules.length} rules`); } await this.delay(300); // Be respectful to the server } console.log(` ✓ Completed RALJ: ${rules.length} rules`); } catch (error) { console.error(`Error scraping RALJ:`, error); } } async downloadAndParseRule(rule: RALJRule): Promise<void> { try { // Download PDF const response = await axios.get(rule.pdfUrl, { responseType: 'arraybuffer', timeout: 30000 }); // Convert to Uint8Array for pdfjs-dist const pdfData = new Uint8Array(response.data); // Load PDF document const loadingTask = getDocument({ data: pdfData }); const pdf = await loadingTask.promise; // Extract text from all pages let fullText = ''; for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) { const page = await pdf.getPage(pageNum); const textContent = await page.getTextContent(); const pageText = textContent.items .map((item: any) => item.str) .join(' '); fullText += pageText + '\n'; } // Clean up the text fullText = fullText .replace(/\r\n/g, '\n') .replace(/\n{3,}/g, '\n\n') .replace(/\s+/g, ' ') .trim(); // Try to extract just the rule content (remove headers/footers) const ruleStart = fullText.search(new RegExp(`RALJ\\s+${rule.ruleNumber.replace(/\./g, '\\.')}`, 'i')); if (ruleStart > 0 && ruleStart < 100) { fullText = fullText.substring(ruleStart); } // Remove common footer/header text fullText = fullText .replace(/Page \d+ of \d+/gi, '') .replace(/Effective \d+\/\d+\/\d+/gi, '') .replace(/\[.*?Reserved\]/gi, '') .trim(); // Try to extract a better rule name from the PDF content if needed if (rule.ruleName === `Rule ${rule.ruleNumber}`) { const nameMatch = fullText.match(new RegExp(`RALJ\\s+${rule.ruleNumber.replace(/\./g, '\\.')}\\s*[-–]?\\s*([A-Z][^.\\n]+)`, 'i')); if (nameMatch) { rule.ruleName = nameMatch[1].trim(); } } // Save to database this.insertStmt.run( rule.ruleSet, rule.ruleNumber, rule.ruleName, fullText ); } catch (error) { throw new Error(`Failed to process PDF for RALJ ${rule.ruleNumber}: ${(error as Error).message}`); } } private printStats(): void { const raljCount = this.db.prepare(` SELECT COUNT(*) as count FROM court_rules WHERE rule_set = 'RALJ' `).get() as any; console.log('\n=== RALJ Scraping Complete ==='); console.log(` RALJ: ${raljCount.count} rules`); const total = this.db.prepare('SELECT COUNT(*) as count FROM court_rules').get() as any; console.log(` Total court rules in database: ${total.count}`); } } // Run the scraper if (import.meta.url === `file://${process.argv[1]}`) { const scraper = new RALJScraper(); scraper.scrapeAll().catch(console.error); }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccchow/washington-law-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server