"""
municode_scraper.py - Scrapes municipal code from Municode.com
=== WHAT IS WEB SCRAPING? ===
Web scraping is the automated extraction of data from websites.
Playwright controls a real browser to handle JavaScript-rendered content.
=== KEY LEARNING: ITERATIVE DEVELOPMENT ===
Scraping is trial and error - you try, inspect results, adjust, repeat.
"""
import asyncio
import json
import re
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page, Browser, TimeoutError as PlaywrightTimeout
from bs4 import BeautifulSoup
class MunicodeScraper:
"""
Scrapes municipal ordinances from Municode.com.
=== CLASS EXPLANATION ===
A class bundles data and functions together.
- __init__: Sets up the object when created
- Methods: Functions that operate on the object's data
- self: Reference to the current instance
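    Illustrative usage (from inside an async function, using this class's
    actual methods):
        scraper = MunicodeScraper("wi/madison")
        ordinances = await scraper.scrape_all(max_sections=10)
        scraper.save_results(ordinances)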
"""
def __init__(self, jurisdiction: str = "wi/madison"):
"""
Initialize the scraper.
Args:
jurisdiction: The state/city path (e.g., "wi/madison")
"""
self.jurisdiction = jurisdiction
self.base_url = f"https://library.municode.com/{jurisdiction}/codes/code_of_ordinances"
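        # e.g. "wi/madison" -> https://library.municode.com/wi/madison/codes/code_of_ordinances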
# Will be set when browser starts
self.browser: Optional[Browser] = None
self.page: Optional[Page] = None
self._playwright = None
# Output directory for scraped data
self.output_dir = Path("data/raw")
self.output_dir.mkdir(parents=True, exist_ok=True)
        # Track URLs we've already seen, to avoid queueing duplicates
self.scraped_urls: set[str] = set()
async def start_browser(self):
"""
Launch the browser.
=== ASYNC/AWAIT EXPLANATION ===
- async def: This function can pause and resume
- await: Pause here until the async operation completes
- This lets us do other things while waiting for slow operations
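        Example: `await self.page.goto(url)` (used later in this class)
        pauses this coroutine until navigation finishes, while the event
        loop stays free to service other tasks.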
"""
print("Starting browser...")
self._playwright = await async_playwright().start()
self.browser = await self._playwright.chromium.launch(
            headless=True  # set headless=False to watch the browser window
)
context = await self.browser.new_context(
viewport={"width": 1920, "height": 1080},
            # A plausible desktop UA; some sites serve stripped-down pages to obvious headless defaults
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"
)
self.page = await context.new_page()
        self.page.set_default_timeout(30000)  # 30-second ceiling for all page operations
print("Browser started!")
async def close_browser(self):
"""Clean up browser resources."""
if self.browser:
await self.browser.close()
if self._playwright:
await self._playwright.stop()
print("Browser closed.")
async def wait_for_content(self):
"""Wait for JavaScript to load content."""
try:
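            # Note: the selector below is a CSS group, so wait_for_selector()
            # resolves as soon as ANY of the comma-separated candidates appears.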
await self.page.wait_for_selector(
".document-frame, .chunk-content, #documentContent, article",
timeout=10000
)
            await asyncio.sleep(1)  # brief grace period for late-rendering content
except PlaywrightTimeout:
print(" Content load timeout, continuing...")
async def get_toc_links(self) -> list[dict]:
"""
Get all section links from the table of contents.
Returns:
List of dicts with 'title' and 'url' keys
"""
print(f"Loading: {self.base_url}")
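        # "domcontentloaded" fires once the HTML is parsed; JavaScript-rendered
        # content arrives later, hence the separate wait_for_content() call.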
await self.page.goto(self.base_url, wait_until="domcontentloaded")
await self.wait_for_content()
html = await self.page.content()
soup = BeautifulSoup(html, "lxml")
links = []
# Find all links with nodeId (Municode's internal navigation)
for a_tag in soup.select("a[href*='nodeId=']"):
href = a_tag.get("href", "")
text = a_tag.get_text(strip=True)
# Skip navigation elements
if not text or len(text) < 3:
continue
if text.lower() in ["expand", "collapse", "next", "previous", "share"]:
continue
# Build full URL
if href.startswith("/"):
full_url = f"https://library.municode.com{href}"
elif href.startswith("http"):
full_url = href
else:
full_url = urljoin(self.base_url, href)
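            # Design note: urljoin() alone would handle all three cases; the
            # explicit branches above just make each case visible.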
if full_url not in self.scraped_urls:
links.append({"title": text[:200], "url": full_url})
self.scraped_urls.add(full_url)
print(f"Found {len(links)} unique links")
return links
async def scrape_section_content(self, url: str) -> Optional[dict]:
"""
Scrape a single ordinance section.
Args:
url: URL of the section to scrape
Returns:
Dict with section data, or None if failed
"""
try:
await self.page.goto(url, wait_until="domcontentloaded")
await self.wait_for_content()
html = await self.page.content()
soup = BeautifulSoup(html, "lxml")
# === EXTRACTION STRATEGY ===
# Try multiple selectors since website structure varies
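            # Selectors like these are best-guesses from inspecting Municode
            # pages; if the site's markup changes, they are the first thing
            # to revisit.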
content_text = ""
# Try main content area
content_area = soup.select_one(".document-frame, #documentContent")
if content_area:
for nav in content_area.select("nav, .toolbar, .share-button"):
nav.decompose()
content_text = content_area.get_text(separator="\n", strip=True)
# Fallback: chunk content
if not content_text or len(content_text) < 100:
chunks = soup.select(".chunk-content")
if chunks:
content_text = "\n\n".join(
c.get_text(separator="\n", strip=True) for c in chunks
)
# Fallback: any main element
if not content_text or len(content_text) < 100:
main = soup.select_one("article, main, .content")
if main:
content_text = main.get_text(separator="\n", strip=True)
            # Bail out if we couldn't extract anything meaningful
            if not content_text or len(content_text) < 50:
                return None
# Extract title
title = ""
title_elem = soup.select_one("h1, .document-title, .chunk-title")
if title_elem:
title = title_elem.get_text(strip=True)
# Extract section number (e.g., "28.04", "9.13(4)")
section_pattern = r"(\d+\.\d+(?:\(\d+\))?(?:\([a-z]\))?)"
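            # Also tolerates a lettered subsection, e.g. "9.13(4)(a)".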
section_number = ""
match = re.search(section_pattern, title)
if match:
section_number = match.group(1)
else:
match = re.search(section_pattern, content_text[:500])
if match:
section_number = match.group(1)
# Extract chapter from breadcrumb
chapter = ""
breadcrumb = soup.select_one(".breadcrumb, [class*='breadcrumb']")
if breadcrumb:
chapter = breadcrumb.get_text(strip=True)
            # Clean up whitespace: collapse extra blank lines and repeated spaces
content_text = re.sub(r'\n{3,}', '\n\n', content_text)
content_text = re.sub(r' {2,}', ' ', content_text)
            return {
                "section_number": section_number,
                "title": title[:300] if title else "Untitled",
                "chapter": chapter[:200],
                "content": content_text[:15000],  # cap very long sections
                "url": url,
                "jurisdiction": self.jurisdiction  # honor the constructor arg instead of hardcoding
            }
except Exception as e:
print(f" Error: {e}")
return None
async def scrape_all(self, max_sections: int = 50) -> list[dict]:
"""
Main scraping function.
Args:
max_sections: Maximum sections to scrape
Returns:
List of scraped ordinance dicts
"""
ordinances = []
try:
await self.start_browser()
# Step 1: Get TOC
print("\n=== Step 1: Getting Table of Contents ===")
toc_links = await self.get_toc_links()
if not toc_links:
print("No links found!")
return []
# Step 2: Scrape sections
print(f"\n=== Step 2: Scraping up to {max_sections} sections ===")
scraped_count = 0
            for link in toc_links:
if scraped_count >= max_sections:
break
print(f"[{scraped_count + 1}/{max_sections}] {link['title'][:50]}...")
result = await self.scrape_section_content(link["url"])
if result and len(result.get("content", "")) > 100:
ordinances.append(result)
scraped_count += 1
print(f" -> Got {len(result['content'])} chars")
await asyncio.sleep(0.5) # Be nice to the server
print(f"\nSuccessfully scraped {len(ordinances)} ordinances!")
finally:
await self.close_browser()
return ordinances
def save_results(self, ordinances: list[dict], filename: str = "madison_ordinances.json"):
"""Save results to JSON file."""
if not ordinances:
print("No ordinances to save.")
return None
output_path = self.output_dir / filename
with open(output_path, "w", encoding="utf-8") as f:
json.dump(ordinances, f, indent=2, ensure_ascii=False)
print(f"\nSaved {len(ordinances)} ordinances to {output_path}")
# Show preview
print("\nSample of scraped content:")
        for i, item in enumerate(ordinances[:3]):
            print(f"\n{i+1}. {item['title'][:60]}")
            print(f" Section: {item['section_number'] or 'N/A'}")
            print(f" Preview: {item['content'][:100]}...")
return output_path
async def main():
"""Main entry point."""
print("=" * 60)
print("Madison Municipal Code Scraper")
print("=" * 60)
scraper = MunicodeScraper("wi/madison")
ordinances = await scraper.scrape_all(max_sections=30)
if ordinances:
scraper.save_results(ordinances)
else:
print("\nNo substantial content found.")
print("The website structure may have changed.")
if __name__ == "__main__":
asyncio.run(main())