SF Permits MCP Server

sf-permits-mcp
scripts

scrape_abs_v2.py•7.74 KiB

#!/usr/bin/env python3 """ Scrape Administrative Bulletins AB-093, AB-110, AB-112 from amlegal.com using Playwright. Uses DOM range extraction between anchor elements for clean content. """ import re import sys import time from pathlib import Path from playwright.sync_api import sync_playwright # Target ABs with their known URLs AB_TARGETS = { "AB-093": { "url": "https://codelibrary.amlegal.com/codes/san_francisco/latest/sf_building/0-0-0-95527#JD_AB-093", "title": "Implementation of Green Building Regulations", "end_anchor": "JD_AB-094", # Next AB on this page }, "AB-110": { "url": "https://codelibrary.amlegal.com/codes/san_francisco/latest/sf_building/0-0-0-96476#JD_AB-110", "title": "Building Facade Inspection and Maintenance", "end_anchor": "JD_AB-111", # Next AB on this page }, "AB-112": { "url": "https://codelibrary.amlegal.com/codes/san_francisco/latest/sf_building/0-0-0-100198#JD_AB-112", "title": "Implementation of All Electric New Construction Regulations", "end_anchor": "JD_AB-113", # Next AB on this page }, } OUTPUT_DIR = Path("/Users/timbrenneman/AIprojects/sf-permits-mcp/data/knowledge/tier3") def clean_text(raw_text: str, ab_id: str) -> str: """Clean the extracted text: normalize whitespace, trim the header line.""" lines = raw_text.split("\n") # The first line is usually "AB-093 Implementation of Green Building..." # which is the section heading. We want to keep that. # Then find "NO. AB-XXX" which is the start of the formal bulletin cleaned = [] prev_blank = False found_no = False for line in lines: stripped = line.strip() # Skip everything before "NO. AB-XXX" (the TOC heading line) if not found_no: if stripped.startswith(f"NO. {ab_id}") or stripped.startswith("NO. AB-"): found_no = True else: continue if not stripped: if not prev_blank: cleaned.append("") prev_blank = True else: # Clean up tab/space formatting line_clean = stripped.replace("\t", " ") cleaned.append(line_clean) prev_blank = False # Remove trailing blanks while cleaned and not cleaned[-1]: cleaned.pop() # Remove "Originally Signed by" duplicates and trailing metadata if present # Also remove the amlegal disclaimer at the end final_lines = [] for line in cleaned: if line.startswith("Disclaimer: This Code of Ordinances"): break if line.startswith("Hosted by: American Legal Publishing"): break if line.startswith("Back to Code Library"): break final_lines.append(line) # Remove trailing blanks again while final_lines and not final_lines[-1]: final_lines.pop() return "\n".join(final_lines) def extract_ab_content(page, ab_id: str, end_anchor: str) -> str: """ Extract AB content using DOM range between the AB's anchor and the next AB's anchor. """ start_anchor_id = f"JD_{ab_id}" # First, list all anchors on the page to verify anchors = page.evaluate(""" (args) => { const anchors = document.querySelectorAll('a[id^="JD_AB-"]'); return Array.from(anchors).map(a => a.id); } """, {}) print(f" Available anchors: {anchors}") # Extract text between the two anchors using DOM Range result = page.evaluate(""" (args) => { const startId = args.startId; const endId = args.endId; const startAnchor = document.querySelector('#' + startId); if (!startAnchor) return {error: 'Start anchor not found: ' + startId}; const endAnchor = document.querySelector('#' + endId); const range = document.createRange(); range.setStartBefore(startAnchor); if (endAnchor) { range.setEndBefore(endAnchor); } else { // No end anchor - go to the end of the parent section // Find the section container let section = startAnchor.closest('.Section') || startAnchor.closest('[id^="section-"]'); if (section) { range.setEndAfter(section); } else { // Last resort: get everything after start in the codecontent const codecontent = document.querySelector('#codecontent'); if (codecontent) { range.setEndAfter(codecontent); } else { return {error: 'Could not determine end boundary'}; } } } const fragment = range.cloneContents(); const div = document.createElement('div'); div.appendChild(fragment); return {text: div.innerText, length: div.innerText.length}; } """, {"startId": start_anchor_id, "endId": end_anchor}) if "error" in result: raise RuntimeError(result["error"]) print(f" Extracted {result['length']} chars via DOM range") return result["text"] def main(): output_dir = OUTPUT_DIR output_dir.mkdir(parents=True, exist_ok=True) with sync_playwright() as p: print("Launching browser...") browser = p.chromium.launch(headless=True) context = browser.new_context( user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) page = context.new_page() results = {} for ab_id, info in AB_TARGETS.items(): print(f"\n{'='*60}") print(f"Scraping {ab_id}: {info['title']}") print(f"{'='*60}") try: print(f" Navigating to {info['url']}") page.goto(info["url"], wait_until="networkidle", timeout=60000) # Wait a moment for React to fully render page.wait_for_timeout(2000) raw_text = extract_ab_content(page, ab_id, info["end_anchor"]) cleaned = clean_text(raw_text, ab_id) outfile = output_dir / f"{ab_id}.txt" outfile.write_text(cleaned, encoding="utf-8") print(f" Cleaned text length: {len(cleaned)} chars") print(f" Saved to: {outfile}") # Show first and last few lines for verification all_lines = cleaned.split("\n") print(f" First 8 lines:") for line in all_lines[:8]: print(f" | {line[:120]}") print(f" ...") print(f" Last 5 lines:") for line in all_lines[-5:]: print(f" | {line[:120]}") results[ab_id] = { "status": "success", "chars": len(cleaned), "file": str(outfile), } # Brief pause between requests time.sleep(2) except Exception as e: print(f" ERROR scraping {ab_id}: {e}") import traceback traceback.print_exc() results[ab_id] = {"status": "error", "error": str(e)} browser.close() # Summary print(f"\n{'='*60}") print("SUMMARY") print(f"{'='*60}") for ab_id, result in results.items(): if result["status"] == "success": print(f" {ab_id}: OK - {result['chars']} chars -> {result['file']}") else: print(f" {ab_id}: FAILED - {result['error']}") if any(r["status"] != "success" for r in results.values()): sys.exit(1) if __name__ == "__main__": main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/tbrennem-source/sf-permits-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

scrape_abs_v2.py•7.74 KiB