Skip to main content
Glama
brockwebb

Open Census MCP Server

by brockwebb
build_400_concepts.py (24.2 kB)
# knowledge-base/scripts/build_400_concepts.py
"""
Build the definitive 400-concept taxonomy using:
1. Official Census subject structure (subjects.json)
2. Extracted official definitions (definitions_2023.json)
3. Spock's systematic schema-first approach with confidence bucketing
4. Category template files for systematic generation

NOTE: every path below is relative to the current working directory, so the
script is expected to be run from the directory that contains it.
"""

import argparse
import csv
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import yaml

# Input / output locations (relative to the current working directory).
OFFICIAL_SOURCES_DIR = Path("../official_sources")
DEFAULT_SUBJECTS_PATH = OFFICIAL_SOURCES_DIR / "subjects.json"
DEFAULT_DEFINITIONS_PATH = OFFICIAL_SOURCES_DIR / "definitions_2023.json"
UNIVERSE_DEFINITIONS_PATH = OFFICIAL_SOURCES_DIR / "universe_definitions.yaml"
STAT_METHOD_DEFINITIONS_PATH = OFFICIAL_SOURCES_DIR / "stat_method_definitions.yaml"
TEMPLATES_DIR = Path("../concept_templates")
CONCEPTS_DIR = Path("../concepts")
ONTOLOGY_DIR = Path("../ontology")

# Canonical schema - fail fast if any field is missing
CONCEPT_SCHEMA = {
    "id": str,              # cendata URI
    "label": str,           # Median Household Income
    "bucket": str,          # economic / social / demographic / housing
    "universe": str,        # Households / Civilian Labor Force / Population
    "stat_method": str,     # median / mean / rate / ratio / count
    "census_tables": list,  # ["B19013", "B19013A"]
    "definition": str,      # verbatim from PDF
    "source_page": int,     # page number from definitions PDF
    "status": str,          # auto|reviewed|rejected
    "confidence": float,    # 0-1 from LLM
}

VALID_STATUSES = ("auto", "reviewed", "rejected")

# Confidence cut-offs used by ConceptTaxonomyBuilder.bucket_by_confidence.
AUTO_CONFIDENCE_THRESHOLD = 0.9
REVIEW_CONFIDENCE_THRESHOLD = 0.75

# Static Turtle document written by generate_geography_turtle.
GEOGRAPHY_TURTLE = """\
@prefix cendata: <https://raw.githubusercontent.com/yourrepo/census-mcp-server/main/ontology#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

# Official Census Geographic Hierarchy
# Based on Standard Hierarchy of Census Geographic Entities

# Level 0 - Root
cendata:Nation a skos:Concept ;
    rdfs:label "Nation" ;
    rdfs:comment "Singleton United States root geography" .

# Level 1 - Major divisions
cendata:Region a skos:Concept ;
    rdfs:label "Census Region" ;
    rdfs:comment "4 Census regions (Northeast, Midwest, South, West)" ;
    skos:broader cendata:Nation .

cendata:Division a skos:Concept ;
    rdfs:label "Census Division" ;
    rdfs:comment "9 divisions nested in regions" ;
    skos:broader cendata:Region .

cendata:State a skos:Concept ;
    rdfs:label "State" ;
    rdfs:comment "50 states + DC + PR" ;
    skos:broader cendata:Nation .

# Level 2 - County and equivalent
cendata:County a skos:Concept ;
    rdfs:label "County" ;
    rdfs:comment "County or county equivalent" ;
    skos:broader cendata:State .

cendata:Place a skos:Concept ;
    rdfs:label "Place" ;
    rdfs:comment "Incorporated places and CDPs" ;
    skos:broader cendata:State .

# Level 3-5 - Small area
cendata:CensusTract a skos:Concept ;
    rdfs:label "Census Tract" ;
    rdfs:comment "Small statistical subdivisions of counties" ;
    skos:broader cendata:County .

cendata:BlockGroup a skos:Concept ;
    rdfs:label "Block Group" ;
    rdfs:comment "Subdivisions of census tracts" ;
    skos:broader cendata:CensusTract .

cendata:CensusBlock a skos:Concept ;
    rdfs:label "Census Block" ;
    rdfs:comment "Atomic geography - smallest entity" ;
    skos:broader cendata:BlockGroup .
"""


def _matches_schema_type(value, expected_type) -> bool:
    """Return True when *value* is acceptable for a schema field of *expected_type*.

    Differs from a bare ``isinstance`` check in two ways:
    * ``bool`` is never accepted for a numeric field (it is a subclass of ``int``);
    * an ``int`` is accepted where ``float`` is expected, because JSON encodes
      a confidence of exactly 0 or 1 as an integer.
    """
    if isinstance(value, bool):
        return expected_type is bool
    if expected_type is float:
        return isinstance(value, (int, float))
    return isinstance(value, expected_type)


@dataclass
class ConceptRecord:
    """Validated concept record matching canonical schema"""

    id: str
    label: str
    bucket: str
    universe: str
    stat_method: str
    census_tables: List[str]
    definition: str
    source_page: int
    status: str
    confidence: float

    def to_dict(self) -> Dict:
        """Return the record as a plain dict keyed by the schema field names."""
        return asdict(self)

    @classmethod
    def validate(cls, concept_dict: Dict) -> 'ConceptRecord':
        """Validate concept against canonical schema.

        Raises:
            ValueError: a required field is missing, ``confidence`` is outside
                [0, 1] (NaN included), or ``status`` is not a known value.
            TypeError: a field has the wrong type, or the dict carries a key
                that is not part of the schema (raised by the constructor).
        """
        # Check all required fields exist and carry the expected type
        for field_name, expected_type in CONCEPT_SCHEMA.items():
            if field_name not in concept_dict:
                raise ValueError(f"Missing required field: {field_name}")

            value = concept_dict[field_name]
            if not _matches_schema_type(value, expected_type):
                raise TypeError(
                    f"Field {field_name} must be {expected_type.__name__}, "
                    f"got {type(value).__name__}"
                )

        # Validate specific field constraints.
        # The chained comparison is False for NaN, so NaN is rejected too.
        confidence = concept_dict["confidence"]
        if not 0 <= confidence <= 1:
            raise ValueError("Confidence must be between 0 and 1")

        if concept_dict["status"] not in VALID_STATUSES:
            raise ValueError("Status must be one of: auto, reviewed, rejected")

        # Normalise an integer confidence (0 or 1) to float without touching
        # the caller's dict.
        return cls(**{**concept_dict, "confidence": float(confidence)})


def _load_json_or_empty(path: Path, missing_message: str) -> Dict:
    """Load a JSON file, or print *missing_message* and return ``{}`` if absent."""
    if not path.exists():
        print(missing_message)
        return {}
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _write_yaml(path: Path, data: Dict) -> None:
    """Write *data* to *path* as block-style YAML, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, 'w', encoding="utf-8") as f:
        yaml.dump(data, f, default_flow_style=False)


def _print_bucket_summary(category: str, bucketed: Dict, total: Optional[int] = None) -> None:
    """Print the per-bucket counts for one category (plus the total, if given)."""
    print(f"📊 {category.title()} Summary:")
    print(f" • Auto-approved: {len(bucketed['auto'])} concepts")
    print(f" • Review queue: {len(bucketed['review'])} concepts")
    print(f" • Low confidence: {len(bucketed['low_confidence'])} concepts")
    if total is not None:
        print(f" • Total: {total} concepts")


class ConceptTaxonomyBuilder:
    """Build authoritative 400-concept taxonomy with schema validation"""

    def __init__(self, subjects_path: Optional[Path] = None,
                 definitions_path: Optional[Path] = None):
        """Load every input source and set up the category allocation table.

        Args:
            subjects_path: location of subjects.json; defaults to
                ``../official_sources/subjects.json``.
            definitions_path: location of definitions_2023.json; defaults to
                ``../official_sources/definitions_2023.json``.
        """
        self.subjects_path = Path(subjects_path) if subjects_path else DEFAULT_SUBJECTS_PATH
        self.definitions_path = (
            Path(definitions_path) if definitions_path else DEFAULT_DEFINITIONS_PATH
        )

        self.subjects = self._load_subjects()
        self.definitions = self._load_definitions()
        self.universe_definitions = self._load_universe_definitions()
        self.stat_method_definitions = self._load_stat_method_definitions()
        self.category_templates = self._load_category_templates()

        self.taxonomy = {
            "meta": {
                "total_concepts": 400,
                "strategy": "Spock's systematic schema-first approach",
                "sources": [
                    "Census subjects.json",
                    "2023_ACSSubjectDefinitions.pdf",
                    "90% success validation",
                ],
                "schema_version": "1.0",
            },
            "allocation": {
                "core_demographics": 50,
                "housing": 75,
                "economics": 75,
                "education": 50,
                "transportation": 25,
                "health_social": 50,
                "geography": 25,
                "specialized_populations": 50,
            },
        }

    def _load_subjects(self) -> Dict:
        """Load official Census subjects structure"""
        return _load_json_or_empty(
            self.subjects_path,
            "❌ subjects.json not found - create it first",
        )

    def _load_definitions(self) -> Dict:
        """Load extracted official definitions"""
        return _load_json_or_empty(
            self.definitions_path,
            "❌ definitions_2023.json not found - run extract_definitions.py first",
        )

    def _load_universe_definitions(self) -> Dict:
        """Load canonical universe definitions, creating the default file if absent"""
        if UNIVERSE_DEFINITIONS_PATH.exists():
            with open(UNIVERSE_DEFINITIONS_PATH, encoding="utf-8") as f:
                return yaml.safe_load(f)
        # Create default universe definitions
        return self._create_default_universe_definitions()

    def _load_stat_method_definitions(self) -> Dict:
        """Load canonical statistical method definitions, creating the default file if absent"""
        if STAT_METHOD_DEFINITIONS_PATH.exists():
            with open(STAT_METHOD_DEFINITIONS_PATH, encoding="utf-8") as f:
                return yaml.safe_load(f)
        return self._create_default_stat_method_definitions()

    def _load_category_templates(self) -> Dict:
        """Load category template files for concept generation.

        Returns a dict mapping each ``*.json`` file stem in the templates
        directory to its parsed content, or ``{}`` if the directory is missing.
        """
        if not TEMPLATES_DIR.exists():
            print(f"❌ Category templates directory not found: {TEMPLATES_DIR}")
            print(" Create template files for systematic concept generation")
            return {}

        templates = {}
        # Sorted so the load order (and the log output) is deterministic.
        for category_file in sorted(TEMPLATES_DIR.glob("*.json")):
            category_name = category_file.stem
            with open(category_file, encoding="utf-8") as f:
                templates[category_name] = json.load(f)
            concept_count = len(templates[category_name].get('concepts', []))
            print(f"✅ Loaded {category_name} template: {concept_count} concept templates")

        return templates

    def _create_default_universe_definitions(self) -> Dict:
        """Create canonical universe definitions file and return its content"""
        universe_definitions = {
            "universes": {
                "Households": {
                    "definition": "All occupied housing units",
                    "excludes": "Group quarters population",
                    "census_note": "Standard household universe for income, housing costs",
                },
                "Family households": {
                    "definition": "Households with related individuals",
                    "excludes": "Single-person households, unrelated individuals",
                    "census_note": "Subset of households - use for family-specific measures",
                },
                "Population": {
                    "definition": "All persons counted in census/survey",
                    "includes": "Household and group quarters population",
                    "census_note": "Total population universe",
                },
                "Civilian labor force": {
                    "definition": "Civilians 16+ who are employed or actively seeking work",
                    "excludes": "Military, institutionalized, not seeking work",
                    "census_note": "Standard employment universe",
                },
                "Housing units": {
                    "definition": "All residential structures intended for occupancy",
                    "includes": "Occupied and vacant units",
                    "census_note": "Physical housing stock universe",
                },
                "Workers": {
                    "definition": "Employed civilians 16+ with work location data",
                    "excludes": "Unemployed, military, work-from-home varies",
                    "census_note": "Commuting and workplace universe",
                },
                "School-age population": {
                    "definition": "Population 3-24 years old",
                    "includes": "Enrolled and not enrolled",
                    "census_note": "Education enrollment universe",
                },
                "Geographic entity": {
                    "definition": "Census-defined geographic boundaries",
                    "includes": "All official Census geographic levels",
                    "census_note": "Administrative and statistical geography",
                },
            }
        }

        # Save to file
        _write_yaml(UNIVERSE_DEFINITIONS_PATH, universe_definitions)

        print(f"✅ Created universe definitions: {UNIVERSE_DEFINITIONS_PATH}")
        return universe_definitions

    def _create_default_stat_method_definitions(self) -> Dict:
        """Create canonical statistical method definitions file and return its content"""
        stat_definitions = {
            "methods": {
                "median": {
                    "definition": "50th percentile value",
                    "use_cases": "Income, home values, age - skewed distributions",
                    "census_tables": "Most B-tables provide medians",
                },
                "mean": {
                    "definition": "Arithmetic average",
                    "use_cases": "Household size, rooms - symmetric distributions",
                    "census_tables": "Some C-tables, derived calculations",
                },
                "rate": {
                    "definition": "Numerator/denominator expressed as percentage",
                    "use_cases": "Poverty rate, unemployment rate",
                    "calculation": "Detail variable / total variable * 100",
                },
                "ratio": {
                    "definition": "Relationship between two quantities",
                    "use_cases": "Sex ratio, dependency ratio",
                    "calculation": "Variable A / Variable B",
                },
                "count": {
                    "definition": "Simple enumeration",
                    "use_cases": "Population, housing units, establishments",
                    "census_tables": "Total variables, _001 estimates",
                },
                "percentage": {
                    "definition": "Share of total expressed as percentage",
                    "use_cases": "Educational attainment distribution",
                    "calculation": "Category / total * 100",
                },
            }
        }

        # Save to file (the helper also creates the parent directory, matching
        # what the universe-definitions writer does)
        _write_yaml(STAT_METHOD_DEFINITIONS_PATH, stat_definitions)

        print(f"✅ Created statistical method definitions: {STAT_METHOD_DEFINITIONS_PATH}")
        return stat_definitions

    def validate_concept_record(self, concept_dict: Dict) -> ConceptRecord:
        """Validate and create concept record"""
        return ConceptRecord.validate(concept_dict)

    def bucket_by_confidence(self, concepts: List[ConceptRecord], review_cap: int = 50) -> Dict:
        """Bucket concepts by confidence level.

        Returns a dict with three lists:
        * ``"auto"``            - confidence >= 0.9; their ``status`` is set to "auto"
        * ``"review"``          - 0.75 <= confidence < 0.9
        * ``"low_confidence"``  - confidence < 0.75

        When the review queue is longer than *review_cap*, only the
        *review_cap* highest-confidence entries are kept; the rest are dropped
        and do not appear in any bucket.
        """
        auto_concepts = []   # ≥0.9 confidence
        review_queue = []    # 0.75-0.9 confidence
        low_confidence = []  # <0.75 confidence

        for concept in concepts:
            if concept.confidence >= AUTO_CONFIDENCE_THRESHOLD:
                concept.status = "auto"
                auto_concepts.append(concept)
            elif concept.confidence >= REVIEW_CONFIDENCE_THRESHOLD:
                review_queue.append(concept)
            else:
                low_confidence.append(concept)

        # Respect review cap
        if len(review_queue) > review_cap:
            print(f"⚠️ Review queue ({len(review_queue)}) exceeds cap ({review_cap})")
            print(f" Keeping top {review_cap} by confidence")
            review_queue = sorted(
                review_queue, key=lambda c: c.confidence, reverse=True
            )[:review_cap]

        return {
            "auto": auto_concepts,
            "review": review_queue,
            "low_confidence": low_confidence,
        }

    def save_concept_buckets(self, bucketed_concepts: Dict, category: str):
        """Save concept buckets to separate files under ``../concepts``.

        Writes ``<category>.json`` (always), ``<category>_review.txt`` (if the
        review bucket is non-empty) and ``<category>_low_confidence.csv`` (if
        the low-confidence bucket is non-empty).
        """
        CONCEPTS_DIR.mkdir(parents=True, exist_ok=True)

        auto_concepts = bucketed_concepts["auto"]
        review_concepts = bucketed_concepts["review"]
        low_concepts = bucketed_concepts["low_confidence"]

        # Save auto-approved concepts
        auto_data = {
            "meta": {
                "category": category,
                "status": "auto_approved",
                "concept_count": len(auto_concepts),
                "confidence_threshold": "≥0.9",
            },
            "concepts": [c.to_dict() for c in auto_concepts],
        }

        auto_path = CONCEPTS_DIR / f"{category}.json"
        with open(auto_path, 'w', encoding="utf-8") as f:
            json.dump(auto_data, f, indent=2)

        print(f"✅ Saved {len(auto_concepts)} auto-approved {category} concepts")

        # Save review queue as readable text dump
        if review_concepts:
            review_path = CONCEPTS_DIR / f"{category}_review.txt"
            with open(review_path, 'w', encoding="utf-8") as f:
                f.write(f"# {category.title()} Concepts - Review Queue (0.75-0.9 confidence)\n")
                f.write("# Human review required - edit and mark as 'reviewed' when done\n\n")

                for i, concept in enumerate(review_concepts, 1):
                    f.write(f"## {i}. {concept.label} (confidence: {concept.confidence:.2f})\n")
                    f.write(f"ID: {concept.id}\n")
                    f.write(f"Universe: {concept.universe}\n")
                    f.write(f"Stat Method: {concept.stat_method}\n")
                    f.write(f"Tables: {concept.census_tables}\n")
                    f.write(f"Definition: {concept.definition}\n")
                    f.write("Status: NEEDS_REVIEW\n")
                    f.write("-" * 80 + "\n\n")

            print(f"📝 Saved {len(review_concepts)} concepts for review: {review_path}")

        # Save low confidence concepts for debugging
        if low_concepts:
            low_path = CONCEPTS_DIR / f"{category}_low_confidence.csv"
            # newline="" is what the csv module requires of the file object.
            with open(low_path, 'w', encoding="utf-8", newline="") as f:
                f.write("label,confidence,universe,stat_method,definition\n")
                # QUOTE_NONNUMERIC quotes every string and leaves the float
                # confidence bare; embedded quotes are escaped by the writer
                # instead of corrupting the row.
                writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, lineterminator="\n")
                for concept in low_concepts:
                    writer.writerow([
                        concept.label,
                        concept.confidence,
                        concept.universe,
                        concept.stat_method,
                        f"{concept.definition[:100]}...",  # truncated preview
                    ])

            print(f"⚠️ Saved {len(low_concepts)} low-confidence concepts: {low_path}")

    def generate_geography_turtle(self, geography_concepts: List[ConceptRecord]):
        """Generate Turtle RDF for geography concepts.

        The hierarchy written to ``../ontology/census_geography.ttl`` is a
        fixed document; *geography_concepts* is only used for the count in the
        log message.
        """
        # Save Turtle file
        ONTOLOGY_DIR.mkdir(parents=True, exist_ok=True)

        turtle_path = ONTOLOGY_DIR / "census_geography.ttl"
        with open(turtle_path, 'w', encoding="utf-8") as f:
            f.write(GEOGRAPHY_TURTLE)

        print(f"✅ Generated geography ontology: {turtle_path}")
        print(f"🔗 {len(geography_concepts)} geographic concepts with hierarchical relationships")

    def build_concepts_from_template(self, category: str) -> List[ConceptRecord]:
        """Build concepts from category template file.

        Entries that fail schema validation are reported and skipped. Returns
        an empty list when no template is loaded for *category*.
        """
        if category not in self.category_templates:
            print(f"❌ No template found for category: {category}")
            return []

        concept_templates = self.category_templates[category].get("concepts", [])

        validated_concepts = []
        for concept_template in concept_templates:
            try:
                # Work on a copy so the loaded template is not mutated, and
                # default the bucket to the category name when it is absent.
                candidate = dict(concept_template)
                candidate.setdefault("bucket", category)
                validated_concepts.append(ConceptRecord.validate(candidate))
            except (ValueError, TypeError) as e:
                label = (
                    concept_template.get('label', 'unknown')
                    if isinstance(concept_template, dict)
                    else 'unknown'
                )
                print(f"❌ Schema validation failed for {label}: {e}")

        return validated_concepts

    def build_category(self, category: str, review_cap: int = 50) -> Optional[Dict]:
        """Build, bucket and save one category.

        Returns the bucket dict produced by ``bucket_by_confidence``, or
        ``None`` when the template yielded no valid concepts (nothing is
        written in that case).
        """
        # Build from template
        concepts = self.build_concepts_from_template(category)
        if not concepts:
            return None

        # Bucket by confidence
        bucketed = self.bucket_by_confidence(concepts, review_cap)

        # Save to files
        self.save_concept_buckets(bucketed, category)

        # Generate special outputs for geography
        if category == "geography":
            self.generate_geography_turtle(concepts)

        return bucketed

    def build_all_categories(self, review_cap: int = 50) -> Tuple[Dict[str, int], int]:
        """Build all concept categories systematically.

        Returns:
            ``(results, total_concepts)`` where ``results`` maps each category
            in the allocation table to the number of concepts bucketed for it
            and ``total_concepts`` is the sum over all categories.
        """
        results = {}
        total_concepts = 0

        # Build each category from templates
        for category, target_count in self.taxonomy["allocation"].items():
            print(f"\n🔨 Building {category} concepts (target: {target_count})...")

            bucketed = self.build_category(category, review_cap)
            if bucketed is None:
                print(f"⚠️ No concepts generated for {category} - check template file")
                results[category] = 0
                continue

            # Track results
            category_total = (
                len(bucketed["auto"])
                + len(bucketed["review"])
                + len(bucketed["low_confidence"])
            )
            results[category] = category_total
            total_concepts += category_total

            _print_bucket_summary(category, bucketed, total=category_total)

        return results, total_concepts


def main():
    """Build complete 400-concept taxonomy with systematic validation"""
    parser = argparse.ArgumentParser(description="Build 400-concept taxonomy")
    parser.add_argument("--subjects", default=str(DEFAULT_SUBJECTS_PATH))
    parser.add_argument("--defs", default=str(DEFAULT_DEFINITIONS_PATH))
    parser.add_argument("--target", type=int, default=400)
    parser.add_argument("--review_cap", type=int, default=50)
    parser.add_argument(
        "--category",
        choices=[
            "all", "economics", "education", "health_social", "transportation",
            "geography", "specialized_populations", "core_demographics", "housing",
        ],
        default="all",
    )

    args = parser.parse_args()

    print("🏗️ Building 400-Concept Authoritative Taxonomy")
    print("=" * 60)
    print(f"Target: {args.target} concepts")
    print(f"Review cap: {args.review_cap} concepts")
    print(f"Category: {args.category}")

    # Honour --subjects / --defs instead of silently ignoring them.
    builder = ConceptTaxonomyBuilder(
        subjects_path=Path(args.subjects),
        definitions_path=Path(args.defs),
    )

    # Validate schema and universe definitions are available
    if not builder.universe_definitions or not builder.stat_method_definitions:
        print("❌ Missing universe or statistical method definitions")
        return

    # .get() guards against a YAML file that lacks the expected top-level key.
    universe_count = len(builder.universe_definitions.get('universes', {}))
    method_count = len(builder.stat_method_definitions.get('methods', {}))

    print("✅ Schema validation ready")
    print(f"✅ {universe_count} universe definitions loaded")
    print(f"✅ {method_count} statistical methods loaded")
    print(f"✅ {len(builder.category_templates)} category templates loaded")

    # Build concepts
    if args.category == "all":
        print("\n🚀 Building ALL categories systematically...")
        results, total_concepts = builder.build_all_categories(args.review_cap)

        print("\n📈 FINAL SUMMARY:")
        print("=" * 40)
        for category, count in results.items():
            target = builder.taxonomy["allocation"][category]
            # A category counts as on track at 80% of its allocation.
            status = "✅" if count >= target * 0.8 else "⚠️"
            print(f" {status} {category}: {count}/{target} concepts")

        print(f"\n🎯 TOTAL CONCEPTS: {total_concepts}")
        print(f"🎯 TARGET: {args.target}")

        if total_concepts >= args.target:
            print(f"🎉 SUCCESS! Generated {total_concepts} concepts (≥{args.target} target)")
        else:
            print(f"⚠️ Generated {total_concepts} concepts (<{args.target} target)")
            print(" Check template files and run category-specific builds")

    else:
        # Build single category
        print(f"\n🔨 Building {args.category} concepts...")
        bucketed = builder.build_category(args.category, args.review_cap)

        if bucketed is not None:
            _print_bucket_summary(args.category, bucketed)
        else:
            print(f"❌ No concepts generated for {args.category}")

    print("\n💡 Next: Review medium-confidence concepts and run LLM validation")
    print("🎯 Schema validation prevents universe/method drift issues")


if __name__ == "__main__":
    main()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.