"""Dead link spider — crawls internal links and verifies none return 404/500.
Extended from Sprint 67-C:
- Page cap: 200 (up from 100)
- Authenticated admin crawl covers /admin/* pages
- Response time tracking: flags pages >5s
- Separates internal vs external links (does not follow external)
- Summary output at end of crawl
"""
import time
from urllib.parse import urljoin, urlparse
from html.parser import HTMLParser
import pytest
from web.app import app, _rate_buckets
# ---------------------------------------------------------------------------
# HTML link extractor
# ---------------------------------------------------------------------------
class LinkExtractor(HTMLParser):
"""Extract href values from <a> tags."""
def __init__(self):
super().__init__()
self.links: list[str] = []
def handle_starttag(self, tag, attrs):
if tag == "a":
for name, value in attrs:
if name == "href" and value:
self.links.append(value)
def extract_links(html: str) -> list[str]:
"""Extract all href values from HTML."""
parser = LinkExtractor()
parser.feed(html)
return parser.links
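# Illustrative behavior: hrefs are returned verbatim (relative, absolute, or
# fragment-only); filtering happens later in _is_external()/should_crawl().
#   extract_links('<a href="/a">x</a><a href="#top">y</a>')  ->  ["/a", "#top"]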
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def client():
app.config["TESTING"] = True
_rate_buckets.clear()
with app.test_client() as c:
yield c
_rate_buckets.clear()
def _login(client, email="spider-test@test.com"):
"""Login helper for authenticated crawling."""
import src.db as db_mod
if db_mod.BACKEND == "duckdb":
db_mod.init_user_schema()
from web.auth import get_or_create_user, create_magic_token
user = get_or_create_user(email)
token = create_magic_token(user["user_id"])
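    # Visiting the verify URL completes the magic-link flow, which is what
    # authenticates the test client for the rest of the crawl.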
client.get(f"/auth/verify/{token}", follow_redirects=True)
return user
def _login_admin(client, email="spider-admin@test.com"):
"""Login helper for admin crawling."""
import src.db as db_mod
if db_mod.BACKEND == "duckdb":
db_mod.init_user_schema()
from web.auth import get_or_create_user, create_magic_token
user = get_or_create_user(email)
conn = db_mod.get_connection()
try:
conn.execute(
"UPDATE users SET is_admin = TRUE WHERE user_id = ?",
[user["user_id"]],
)
if hasattr(conn, "commit"):
conn.commit()
    except Exception:
        # Best-effort admin promotion: ignore failures here so the helper never
        # breaks test setup on backends where this UPDATE is unsupported.
        pass
finally:
if hasattr(db_mod, "release_connection"):
db_mod.release_connection(conn)
token = create_magic_token(user["user_id"])
client.get(f"/auth/verify/{token}", follow_redirects=True)
return user
# ---------------------------------------------------------------------------
# Skip patterns
# ---------------------------------------------------------------------------
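# Paths matching these prefixes are never enqueued: most either mutate state,
# require identifiers the spider cannot guess, or serve non-HTML payloads.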
SKIP_PREFIXES = (
"/static/",
"/auth/logout",
"/auth/verify/",
"/cron/",
"/admin/feedback/", # POST-heavy admin actions
"/watch/", # POST-only watch management
"/plan-images/", # requires valid session_id
"/plan-session/", # requires valid session_id
"/plan-jobs/", # requires valid job_id
"/analysis/", # requires valid analysis_id
"/report/", # requires valid block/lot
"/portfolio/timeline/", # requires valid block/lot
"/dashboard/bottlenecks/station/", # JSON-only endpoint
"/email/", # email endpoints
"/onboarding/", # POST-only
"/api/", # JSON API endpoints
"/project/", # requires valid project_id
)
SKIP_EXTENSIONS = (".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".xml", ".json")
def _is_external(url: str) -> bool:
"""Check if URL is external (different host)."""
parsed = urlparse(url)
if parsed.scheme and parsed.scheme not in ("", "http", "https"):
return True
if parsed.netloc and parsed.netloc not in ("", "localhost", "localhost:5001"):
return True
if url.startswith("mailto:") or url.startswith("javascript:") or url.startswith("tel:"):
return True
return False
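# Illustrative classifications (the test client serves the app as localhost):
#   _is_external("https://example.com/x")   -> True   (foreign host)
#   _is_external("mailto:ops@example.com")  -> True   (non-HTTP scheme)
#   _is_external("/portfolio")              -> False  (relative, same host)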
def should_crawl(url: str) -> bool:
"""Return True if this URL should be visited by the spider."""
if _is_external(url):
return False
parsed = urlparse(url)
path = parsed.path
    if not path:  # fragment-only links like "#" resolve to an empty path
        return False
if any(path.lower().endswith(ext) for ext in SKIP_EXTENSIONS):
return False
if any(path.startswith(prefix) for prefix in SKIP_PREFIXES):
return False
return True
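# e.g. should_crawl("/static/app.css") -> False (skipped prefix and extension),
#      should_crawl("/methodology")    -> True.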
# ---------------------------------------------------------------------------
# Spider core
# ---------------------------------------------------------------------------
def _crawl(client, seeds: list[str], max_pages: int = 200):
"""Crawl pages starting from seeds. Returns (visited, errors, external, slow)."""
visited: set[str] = set()
to_visit: list[str] = list(seeds)
errors: list[tuple[str, int, str]] = [] # (url, status, referrer)
external_links: set[str] = set()
slow_pages: list[tuple[str, float]] = [] # (url, seconds)
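    # Breadth-first crawl: pop from the front of the queue, fetch the page,
    # record timing and status, then enqueue unvisited internal links.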
while to_visit and len(visited) < max_pages:
url = to_visit.pop(0)
path = urlparse(url).path or url
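        # Deduplicate by path only, so e.g. /search?q=a and /search?q=b count
        # as a single visited page.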
if path in visited:
continue
visited.add(path)
start_time = time.time()
rv = client.get(url, follow_redirects=True)
elapsed = time.time() - start_time
if elapsed > 5.0:
slow_pages.append((url, elapsed))
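        # Only hard failures count as dead links; other statuses (e.g. 401/403
        # on auth-gated pages) pass through unflagged.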
if rv.status_code in (404, 500, 502, 503):
errors.append((url, rv.status_code, "spider"))
continue
content_type = rv.content_type or ""
if "html" not in content_type:
continue
html = rv.data.decode("utf-8", errors="replace")
links = extract_links(html)
for link in links:
if _is_external(link):
external_links.add(link)
continue
resolved = urljoin(url, link)
resolved_path = urlparse(resolved).path
if should_crawl(resolved) and resolved_path not in visited:
to_visit.append(resolved_path)
return visited, errors, external_links, slow_pages
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestDeadLinkSpider:
"""Crawl internal links and verify none return 404/500."""
MAX_PAGES = 200
def test_no_dead_links_anonymous(self, client):
"""Crawl from / as anonymous user. No 404s or 500s."""
visited, errors, external, slow = _crawl(
client, ["/"], max_pages=self.MAX_PAGES,
)
# Print summary
print(f"\n--- Anonymous Spider Summary ---")
print(f"Pages crawled: {len(visited)}")
print(f"External links found: {len(external)}")
print(f"Slow pages (>5s): {len(slow)}")
if slow:
for url, secs in slow:
print(f" {secs:.1f}s {url}")
print(f"Broken links: {len(errors)}")
assert len(visited) > 1, "Spider should have visited at least 2 pages"
if errors:
error_summary = "\n".join(
f" {status} {url}" for url, status, _ in errors
)
pytest.fail(
f"Dead links found ({len(errors)}):\n{error_summary}\n"
f"Visited {len(visited)} pages total."
)
def test_no_dead_links_authenticated(self, client):
"""Crawl from / as authenticated user. No 404s or 500s."""
_login(client)
visited, errors, external, slow = _crawl(
client,
["/", "/account", "/portfolio", "/brief"],
max_pages=self.MAX_PAGES,
)
print(f"\n--- Authenticated Spider Summary ---")
print(f"Pages crawled: {len(visited)}")
print(f"External links found: {len(external)}")
print(f"Slow pages (>5s): {len(slow)}")
if slow:
for url, secs in slow:
print(f" {secs:.1f}s {url}")
print(f"Broken links: {len(errors)}")
assert len(visited) > 3, "Authenticated spider should visit at least 4 pages"
if errors:
error_summary = "\n".join(
f" {status} {url}" for url, status, _ in errors
)
pytest.fail(
f"Dead links found ({len(errors)}):\n{error_summary}\n"
f"Visited {len(visited)} pages total."
)
def test_no_dead_links_admin(self, client):
"""Crawl /admin/* as admin user. No 404s or 500s."""
_login_admin(client)
admin_seeds = [
"/admin/ops",
"/admin/feedback",
"/admin/pipeline",
"/admin/costs",
"/admin/beta-requests",
"/admin/activity",
"/admin/sources",
"/admin/regulatory-watch",
]
visited, errors, external, slow = _crawl(
client, admin_seeds, max_pages=self.MAX_PAGES,
)
print(f"\n--- Admin Spider Summary ---")
print(f"Pages crawled: {len(visited)}")
print(f"External links found: {len(external)}")
print(f"Slow pages (>5s): {len(slow)}")
if slow:
for url, secs in slow:
print(f" {secs:.1f}s {url}")
print(f"Broken links: {len(errors)}")
assert len(visited) > 1, "Admin spider should visit at least 2 pages"
if errors:
error_summary = "\n".join(
f" {status} {url}" for url, status, _ in errors
)
pytest.fail(
f"Dead links found ({len(errors)}):\n{error_summary}\n"
f"Visited {len(visited)} pages total."
)
def test_no_slow_pages(self, client):
"""No page should take >5 seconds to respond."""
visited, errors, external, slow = _crawl(
client, ["/"], max_pages=50,
)
if slow:
slow_summary = "\n".join(
f" {secs:.1f}s {url}" for url, secs in slow
)
pytest.fail(
f"Slow pages found ({len(slow)}):\n{slow_summary}"
)
class TestSpiderCoverage:
"""Verify the spider finds expected pages."""
def test_spider_visits_login(self, client):
"""Spider should reach /auth/login from landing page."""
rv = client.get("/")
html = rv.data.decode()
links = extract_links(html)
        paths = [urlparse(link).path for link in links]
assert "/auth/login" in paths, (
f"/auth/login not linked from landing page. Found links: {paths}"
)
def test_spider_visits_search(self, client):
"""Landing page should have a search form action."""
rv = client.get("/")
html = rv.data.decode()
assert "/search" in html
def test_methodology_page_accessible(self, client):
"""Methodology page returns 200."""
rv = client.get("/methodology")
assert rv.status_code == 200