Skip to main content
Glama
brockwebb

Open Census MCP Server

by brockwebb
table_resolver.py24.6 kB
#!/usr/bin/env python3 """ Table Resolver - Convert table IDs and concepts to Census variables Handles: 1. Direct table IDs (B19013 → list of variables) 2. Natural language concepts (income distribution → B19013 → variables) 3. Table metadata and variable metadata lookup 4. Fallback to hardcoded common tables if catalog unavailable Architecture: Table catalog first, hardcoded fallbacks second """ import json import logging import re from typing import List, Dict, Any, Optional from pathlib import Path logger = logging.getLogger(__name__) class TableResolver: """Resolve table IDs and concepts to Census variables""" def __init__(self, knowledge_base_dir: Optional[str] = None): self.knowledge_base_dir = knowledge_base_dir or self._find_knowledge_base() self.table_catalog = None self.variable_metadata = None # Try to load table catalog self._load_table_catalog() # Initialize concept mappings (fallback) self._init_concept_mappings() # Initialize common table definitions (fallback) self._init_common_tables() logger.info(f"✅ TableResolver initialized") logger.info(f" Catalog loaded: {'Yes' if self.table_catalog else 'No (using fallbacks)'}") logger.info(f" Knowledge base: {self.knowledge_base_dir}") def _find_knowledge_base(self) -> Optional[str]: """Auto-detect knowledge base directory""" possible_paths = [ Path(__file__).parent.parent.parent / "knowledge-base", Path(__file__).parent.parent / "knowledge-base", Path.cwd() / "knowledge-base" ] for path in possible_paths: if path.exists() and path.is_dir(): return str(path) return None def _load_table_catalog(self): """Load table catalog from knowledge base""" if not self.knowledge_base_dir: logger.warning("⚠️ No knowledge base directory found - using fallbacks only") return catalog_paths = [ Path(self.knowledge_base_dir) / "table-catalog" / "table_catalog_enhanced.json", Path(self.knowledge_base_dir) / "table-catalog" / "table_catalog.json", Path(self.knowledge_base_dir) / "table-catalog" / "table_catalog_with_keywords.json" ] for catalog_path in catalog_paths: if catalog_path.exists(): try: with open(catalog_path, 'r') as f: data = json.load(f) # Handle different catalog formats if isinstance(data, dict): if 'tables' in data: # Structured format self.table_catalog = {t['table_id']: t for t in data['tables']} else: # Direct table mapping format self.table_catalog = data logger.info(f"✅ Loaded table catalog: {catalog_path}") logger.info(f" Tables available: {len(self.table_catalog)}") return except Exception as e: logger.warning(f"⚠️ Failed to load catalog {catalog_path}: {e}") continue logger.warning("⚠️ No table catalog found - using fallbacks only") def _init_concept_mappings(self): """Initialize natural language concept mappings""" self.concept_mappings = { # Income concepts 'income': ['B19013'], 'income distribution': ['B19013'], 'household income': ['B19013'], 'median income': ['B19013'], 'family income': ['B19101'], 'per capita income': ['B19301'], # Housing concepts 'housing': ['B25001', 'B25003'], 'housing units': ['B25001'], 'housing tenure': ['B25003'], 'home value': ['B25077'], 'rent': ['B25064'], 'housing costs': ['B25070', 'B25080'], # Population concepts 'population': ['B01003'], 'total population': ['B01003'], 'age': ['B01001'], 'age and sex': ['B01001'], # Education concepts 'education': ['B15003'], 'educational attainment': ['B15003'], 'school enrollment': ['B14001'], # Employment concepts 'employment': ['B23025'], 'labor force': ['B23025'], 'unemployment': ['B23025'], 'occupation': ['B24010'], # Poverty concepts 'poverty': ['B17001'], 'poverty status': ['B17001'], # Transportation concepts 'commuting': ['B08301'], 'transportation': ['B08301'], 'travel time': ['B08303'] } def _init_common_tables(self): """Initialize common table definitions as fallbacks""" self.common_tables = { 'B01001': { 'table_id': 'B01001', 'title': 'SEX BY AGE', 'universe': 'Total population', 'variables': self._generate_b01001_variables(), 'methodology_notes': 'Age and sex characteristics of the population. Universe includes all persons.' }, 'B01003': { 'table_id': 'B01003', 'title': 'TOTAL POPULATION', 'universe': 'Total population', 'variables': ['B01003_001E', 'B01003_001M'], 'methodology_notes': 'Total population count from ACS sample data.' }, 'B19013': { 'table_id': 'B19013', 'title': 'MEDIAN HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2023 INFLATION-ADJUSTED DOLLARS)', 'universe': 'Households', 'variables': ['B19013_001E', 'B19013_001M'], 'methodology_notes': 'Median household income in inflation-adjusted dollars. Excludes group quarters population.' }, 'B25001': { 'table_id': 'B25001', 'title': 'HOUSING UNITS', 'universe': 'Housing units', 'variables': ['B25001_001E', 'B25001_001M'], 'methodology_notes': 'Total housing units including occupied and vacant units.' }, 'B25003': { 'table_id': 'B25003', 'title': 'TENURE', 'universe': 'Occupied housing units', 'variables': ['B25003_001E', 'B25003_001M', 'B25003_002E', 'B25003_002M', 'B25003_003E', 'B25003_003M'], 'methodology_notes': 'Housing tenure (owner vs renter occupied). Universe is occupied housing units only.' }, 'B25064': { 'table_id': 'B25064', 'title': 'MEDIAN GROSS RENT (DOLLARS)', 'universe': 'Renter-occupied housing units paying cash rent', 'variables': ['B25064_001E', 'B25064_001M'], 'methodology_notes': 'Median gross rent including utilities. Renter-occupied units only.' }, 'B25077': { 'table_id': 'B25077', 'title': 'MEDIAN VALUE (DOLLARS)', 'universe': 'Owner-occupied housing units', 'variables': ['B25077_001E', 'B25077_001M'], 'methodology_notes': 'Median home value. Owner-occupied units only.' }, 'B15003': { 'table_id': 'B15003', 'title': 'EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER', 'universe': 'Population 25 years and over', 'variables': self._generate_b15003_variables(), 'methodology_notes': 'Educational attainment levels. Universe is population 25 years and over.' }, 'B23025': { 'table_id': 'B23025', 'title': 'EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER', 'universe': 'Population 16 years and over', 'variables': self._generate_b23025_variables(), 'methodology_notes': 'Employment status including labor force participation. Population 16 years and over.' }, 'B17001': { 'table_id': 'B17001', 'title': 'POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE', 'universe': 'Population for whom poverty status is determined', 'variables': self._generate_b17001_variables(), 'methodology_notes': 'Poverty status by demographic characteristics. Excludes institutionalized population.' } } def resolve_tables(self, table_ids: List[str]) -> Dict[str, Any]: """ Resolve table IDs or concepts to variables Returns: { 'resolved_tables': [ { 'table_id': 'B19013', 'input': 'income distribution', 'variables': ['B19013_001E', 'B19013_001M'], 'resolution_method': 'concept_mapping' } ], 'unresolved': ['unknown_concept'] } """ resolved_tables = [] unresolved = [] for table_input in table_ids: table_input_clean = table_input.strip() # Try direct table ID first if self._is_table_id(table_input_clean): resolved = self._resolve_direct_table_id(table_input_clean) if resolved: resolved['input'] = table_input_clean resolved_tables.append(resolved) else: unresolved.append(table_input_clean) # Try concept mapping else: resolved = self._resolve_concept(table_input_clean) if resolved: for table_data in resolved: table_data['input'] = table_input_clean resolved_tables.append(table_data) else: unresolved.append(table_input_clean) if unresolved: return { 'error': f'Could not resolve table identifiers: {unresolved}', 'resolved_tables': resolved_tables, 'unresolved': unresolved, 'suggestions': self._suggest_alternatives(unresolved) } return { 'resolved_tables': resolved_tables, 'unresolved': [] } def _is_table_id(self, table_input: str) -> bool: """Check if input looks like a direct table ID""" # Pattern like B19013, S2401, DP05, etc. return bool(re.match(r'^[A-Z]{1,2}\d{5}[A-Z]*$', table_input.upper())) def _resolve_direct_table_id(self, table_id: str) -> Optional[Dict[str, Any]]: """Resolve direct table ID to variables""" table_id_upper = table_id.upper() # Try table catalog first if self.table_catalog and table_id_upper in self.table_catalog: catalog_entry = self.table_catalog[table_id_upper] # Handle different catalog formats variables = [] if isinstance(catalog_entry, dict): if 'variables' in catalog_entry: # Extract variable IDs from variable objects for var in catalog_entry['variables']: if isinstance(var, dict): var_id = var.get('variable_id') or var.get('temporal_id', '').split('.')[-1] if var_id: variables.append(var_id) else: variables.append(str(var)) else: # Look for variable-like keys variables = [k for k in catalog_entry.keys() if re.match(r'^[A-Z]\d{5}_\d{3}[EM]$', k)] if variables: return { 'table_id': table_id_upper, 'variables': variables, 'resolution_method': 'table_catalog' } # Fallback to common tables if table_id_upper in self.common_tables: table_info = self.common_tables[table_id_upper] return { 'table_id': table_id_upper, 'variables': table_info['variables'], 'resolution_method': 'common_table_fallback' } # Generate standard variables for known table patterns generated_vars = self._generate_standard_table_variables(table_id_upper) if generated_vars: return { 'table_id': table_id_upper, 'variables': generated_vars, 'resolution_method': 'pattern_generation' } return None def _resolve_concept(self, concept: str) -> Optional[List[Dict[str, Any]]]: """Resolve natural language concept to table(s)""" concept_lower = concept.lower().strip() # Direct concept mapping if concept_lower in self.concept_mappings: table_ids = self.concept_mappings[concept_lower] resolved = [] for table_id in table_ids: table_data = self._resolve_direct_table_id(table_id) if table_data: table_data['resolution_method'] = 'concept_mapping' resolved.append(table_data) return resolved if resolved else None # Partial matching for concept_key, table_ids in self.concept_mappings.items(): if concept_lower in concept_key or concept_key in concept_lower: resolved = [] for table_id in table_ids: table_data = self._resolve_direct_table_id(table_id) if table_data: table_data['resolution_method'] = 'partial_concept_match' resolved.append(table_data) return resolved if resolved else None return None def _generate_standard_table_variables(self, table_id: str) -> List[str]: """Generate standard variable list for common table patterns""" # Most tables have at least _001E (total) and _001M (margin of error) base_vars = [f"{table_id}_001E", f"{table_id}_001M"] # Income tables (B19xxx) typically have just totals if table_id.startswith('B19'): return base_vars # Population tables (B01xxx) can have many categories if table_id == 'B01001': # Age by sex return self._generate_b01001_variables() elif table_id.startswith('B01'): return base_vars # Housing tables (B25xxx) if table_id == 'B25003': # Tenure return ['B25003_001E', 'B25003_001M', 'B25003_002E', 'B25003_002M', 'B25003_003E', 'B25003_003M'] elif table_id.startswith('B25'): return base_vars # Education tables (B15xxx) if table_id == 'B15003': return self._generate_b15003_variables() elif table_id.startswith('B15'): return base_vars # Employment tables (B23xxx) if table_id == 'B23025': return self._generate_b23025_variables() elif table_id.startswith('B23'): return base_vars # Default: assume basic total + margin of error return base_vars def _generate_b01001_variables(self) -> List[str]: """Generate B01001 (Age by Sex) variables""" variables = [] # Total, Male, Female for base in ['B01001_001', 'B01001_002', 'B01001_026']: variables.extend([f"{base}E", f"{base}M"]) # Age groups for males (003-025) and females (027-049) for i in range(3, 26): # Male age groups variables.extend([f"B01001_{i:03d}E", f"B01001_{i:03d}M"]) for i in range(27, 50): # Female age groups variables.extend([f"B01001_{i:03d}E", f"B01001_{i:03d}M"]) return variables def _generate_b15003_variables(self) -> List[str]: """Generate B15003 (Educational Attainment) variables""" variables = [] # Educational attainment has ~25 categories for i in range(1, 26): variables.extend([f"B15003_{i:03d}E", f"B15003_{i:03d}M"]) return variables def _generate_b23025_variables(self) -> List[str]: """Generate B23025 (Employment Status) variables""" return [ 'B23025_001E', 'B23025_001M', # Total 'B23025_002E', 'B23025_002M', # In labor force 'B23025_003E', 'B23025_003M', # Civilian labor force 'B23025_004E', 'B23025_004M', # Employed 'B23025_005E', 'B23025_005M', # Unemployed 'B23025_006E', 'B23025_006M', # Armed forces 'B23025_007E', 'B23025_007M' # Not in labor force ] def _generate_b17001_variables(self) -> List[str]: """Generate B17001 (Poverty Status) variables""" variables = [] # Poverty status by age and sex has many categories for i in range(1, 60): # B17001 has ~59 variables variables.extend([f"B17001_{i:03d}E", f"B17001_{i:03d}M"]) return variables def get_table_metadata(self, table_id: str) -> Dict[str, Any]: """Get metadata for a table""" table_id_upper = table_id.upper() # Try table catalog first if self.table_catalog and table_id_upper in self.table_catalog: catalog_entry = self.table_catalog[table_id_upper] if isinstance(catalog_entry, dict): return { 'title': catalog_entry.get('title', f'Table {table_id_upper}'), 'universe': catalog_entry.get('universe', 'Unknown'), 'methodology_notes': catalog_entry.get('methodology_notes', ''), 'source': 'table_catalog' } # Fallback to common tables if table_id_upper in self.common_tables: table_info = self.common_tables[table_id_upper] return { 'title': table_info['title'], 'universe': table_info['universe'], 'methodology_notes': table_info['methodology_notes'], 'source': 'common_table_fallback' } # Default metadata return { 'title': f'Table {table_id_upper}', 'universe': 'Unknown', 'methodology_notes': f'Standard Census table {table_id_upper}', 'source': 'default' } def get_variable_metadata(self, variable_id: str) -> Dict[str, Any]: """Get metadata for a specific variable""" variable_id_upper = variable_id.upper() # Extract table ID from variable ID table_id = variable_id_upper.split('_')[0] # Try to find in table catalog if self.table_catalog and table_id in self.table_catalog: catalog_entry = self.table_catalog[table_id] if isinstance(catalog_entry, dict) and 'variables' in catalog_entry: for var in catalog_entry['variables']: if isinstance(var, dict): var_id = var.get('variable_id') or var.get('temporal_id', '').split('.')[-1] if var_id == variable_id_upper: return { 'label': var.get('label', 'Unknown'), 'concept': var.get('concept', ''), 'is_estimate': variable_id_upper.endswith('E'), 'is_margin_error': variable_id_upper.endswith('M'), 'source': 'table_catalog' } # Generate basic metadata is_estimate = variable_id_upper.endswith('E') is_margin = variable_id_upper.endswith('M') # Extract variable number var_parts = variable_id_upper.split('_') if len(var_parts) == 2: var_num = var_parts[1][:3] # Get just the number part if var_num == '001': label_base = 'Total' else: label_base = f'Category {var_num}' if is_estimate: label = f"Estimate!!{label_base}" elif is_margin: label = f"Margin of Error!!{label_base}" else: label = label_base else: label = 'Unknown variable' return { 'label': label, 'concept': f'Variable from table {table_id}', 'is_estimate': is_estimate, 'is_margin_error': is_margin, 'source': 'generated' } def _suggest_alternatives(self, unresolved: List[str]) -> List[str]: """Suggest alternative concepts for unresolved inputs""" suggestions = [] for item in unresolved: item_lower = item.lower() # Suggest related concepts if any(term in item_lower for term in ['money', 'pay', 'wage', 'earn']): suggestions.extend(['income', 'household income', 'median income']) elif any(term in item_lower for term in ['home', 'house', 'residence']): suggestions.extend(['housing', 'housing units', 'home value']) elif any(term in item_lower for term in ['people', 'person', 'resident']): suggestions.extend(['population', 'total population']) elif any(term in item_lower for term in ['school', 'college', 'degree']): suggestions.extend(['education', 'educational attainment']) elif any(term in item_lower for term in ['work', 'job', 'employ']): suggestions.extend(['employment', 'labor force', 'unemployment']) elif any(term in item_lower for term in ['poor', 'poverty']): suggestions.extend(['poverty', 'poverty status']) # Remove duplicates and limit suggestions return list(set(suggestions))[:5] def get_available_concepts(self) -> List[str]: """Get list of available concept mappings""" return sorted(self.concept_mappings.keys()) def get_available_tables(self) -> List[str]: """Get list of available table IDs""" available = set() # From catalog if self.table_catalog: available.update(self.table_catalog.keys()) # From common tables available.update(self.common_tables.keys()) return sorted(available) if __name__ == "__main__": # Test table resolver resolver = TableResolver() test_cases = [ ['B19013'], # Direct table ID ['income distribution'], # Concept ['housing', 'population'], # Multiple concepts ['B25001', 'unknown_concept'], # Mixed valid/invalid ] for test_case in test_cases: print(f"\n🧪 Testing: {test_case}") result = resolver.resolve_tables(test_case) if 'error' in result: print(f"❌ Error: {result['error']}") if result.get('suggestions'): print(f"💡 Suggestions: {result['suggestions']}") else: print(f"✅ Resolved {len(result['resolved_tables'])} tables") for table in result['resolved_tables']: print(f" {table['table_id']}: {len(table['variables'])} variables ({table['resolution_method']})")

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server