Skip to main content
Glama
brockwebb

Open Census MCP Server

by brockwebb
build_phrase_lists.py (2.68 kB)
#!/usr/bin/env python3
"""
0-C – Harvest topical and geographic key-phrases from the mini-corpus.

Outputs
-------
data/topical_phrases.txt   one phrase per line (deduped, sorted)
data/geo_phrases.txt       subset of geo-centric phrases
"""
import re, json, string, random
from collections import Counter
from pathlib import Path

ROOT = Path(__file__).resolve().parent
CORPUS_DIR = ROOT / "data" / "mini_corpus"
MANIFEST = ROOT / "data" / "mini_manifest.json"
TOPICAL_OUT = ROOT / "data" / "topical_phrases.txt"
GEO_OUT = ROOT / "data" / "geo_phrases.txt"

# ---------- helpers ----------------------------------------------------------
# Strip every punctuation character EXCEPT the hyphen, so compound tokens
# like "tract-level" survive as a single token.
punct_table = str.maketrans("", "", string.punctuation.replace("-", ""))

STOP = {
    "the", "and", "of", "in", "to", "for", "on", "with", "by", "per", "a", "an",
    "or", "as", "at", "be", "from", "this", "that", "pdf", "acs", "american",
    "community", "survey",
}

GEO_KEYS = {
    "tract", "block", "county", "state", "msa", "cbsa", "puma", "zip", "zcta",
    "urban", "rural", "metro", "micropolitan", "place", "geography", "geographies",
    "region", "division", "csa", "town", "city", "township",
}

# Compiled once at module load instead of once (or per-phrase!) inside the
# filter loop of collect_phrases().
# numeric range / dollar stubs, e.g. "$10,000 to ...", "5–9 ..."
_NUMERIC_STUB_RE = re.compile(r"^\$?\d+[k0-9,]*( to |–|-)")
# raw table ids, e.g. "b19013"
_TABLE_ID_RE = re.compile(r"[a-z_]+\d+")


def tokenize(text: str) -> list[str]:
    """Lowercase *text*, strip punctuation (hyphens kept), and drop empty,
    purely-numeric, and stop-word tokens."""
    text = text.translate(punct_table).lower()
    return [t for t in text.split() if t and not t.isdigit() and t not in STOP]


def ngrams(tokens: list[str], n: int) -> list[str]:
    """Return all space-joined n-grams of *tokens* (empty if len(tokens) < n)."""
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


# ---------- harvest ----------------------------------------------------------
def collect_phrases() -> None:
    """Count 2–4-grams across every file in the mini-corpus, filter out junk
    phrases, and write the deduped, sorted lists to TOPICAL_OUT / GEO_OUT."""
    phrase_counts = Counter()
    for txt_path in CORPUS_DIR.iterdir():
        # iterdir() yields everything in the directory; skip anything that is
        # not a regular file so a stray sub-directory cannot crash the run.
        if not txt_path.is_file():
            continue
        text = txt_path.read_text(encoding="utf-8", errors="ignore")
        toks = tokenize(text)[:2000]      # keep it cheap
        for n in (2, 3, 4):               # bi-, tri-, quad-grams
            phrase_counts.update(ngrams(toks, n))

    # remove obvious numeric range / dollar stubs and raw table ids
    phrases = [
        p for p, c in phrase_counts.items()
        if c >= 2                          # appears at least twice
        and not _NUMERIC_STUB_RE.match(p)
        and not _TABLE_ID_RE.fullmatch(p)
    ]

    topical = sorted(phrases)
    geo = sorted(p for p in phrases if any(g in p for g in GEO_KEYS))

    # Pin utf-8 so the arrows/check-marks below and any non-ASCII phrases
    # round-trip identically regardless of the platform's locale encoding.
    TOPICAL_OUT.write_text("\n".join(topical), encoding="utf-8")
    GEO_OUT.write_text("\n".join(geo), encoding="utf-8")
    print(f"✅ Wrote {len(topical):,} topical phrases → {TOPICAL_OUT}")
    print(f"✅ Wrote {len(geo):,} geo phrases → {GEO_OUT}")


# ---------- main -------------------------------------------------------------
if __name__ == "__main__":
    if not CORPUS_DIR.exists():
        raise SystemExit("❌ Run step 0-B first – mini_corpus not found.")
    collect_phrases()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.