
Open Census MCP Server

by brockwebb
extract_coos_variables.py (6.19 kB)
#!/usr/bin/env python3 """ Extract all Census variable IDs from COOS concept JSON files """ import json import glob from pathlib import Path from typing import Set, Dict, List def extract_variables_from_concept_file(file_path: str) -> Set[str]: """Extract all census table variables from a single concept file""" variables = set() try: with open(file_path, 'r') as f: data = json.load(f) concepts = data.get('concepts', []) for concept in concepts: census_tables = concept.get('census_tables', []) # Add all table IDs from this concept for table in census_tables: variables.add(table) print(f" {Path(file_path).name}: {len(variables)} unique variables") return variables except Exception as e: print(f" ERROR reading {file_path}: {e}") return set() def load_full_variable_corpus(variables_file: str) -> Dict: """Load the complete Census variable corpus""" try: import pandas as pd df = pd.read_csv(variables_file) variables_data = {} for _, row in df.iterrows(): var_id = row['variable_id'] variables_data[var_id] = { 'label': row['label'], 'concept': row['concept'], 'predicateType': row['predicateType'], 'group': row['group'], 'survey': row['survey'], 'complexity': row['complexity'], 'table_family': row.get('table_family', var_id[:3] if len(var_id) >= 3 else 'OTHER') } return variables_data except Exception as e: print(f"Error loading variable corpus: {e}") return {} def expand_table_to_variables(table_id: str, variable_corpus: Dict) -> List[str]: """Expand a table ID (like B19013) to all its specific variables (like B19013_001E)""" expanded_vars = [] # Find all variables that start with this table ID for var_id in variable_corpus: if var_id.startswith(table_id): expanded_vars.append(var_id) return expanded_vars def main(): """Extract all COOS variables and expand to specific Census variables""" # Configuration CONCEPT_DIR = Path("../concept_templates") # Your concepts are here VARIABLES_FILE = Path("../complete_2023_acs_variables/complete_variables.csv") OUTPUT_FILE = Path("../coos_variables_extracted.json") print("🔍 Extracting COOS variables from concept files") print("=" * 60) # Load full variable corpus for expansion print("📊 Loading complete variable corpus...") variable_corpus = load_full_variable_corpus(str(VARIABLES_FILE)) print(f" Loaded {len(variable_corpus)} total variables") # Find all concept JSON files concept_files = list(CONCEPT_DIR.glob("*.json")) if not concept_files: print(f"❌ No JSON files found in {CONCEPT_DIR}") print(" Check the path and make sure concept files exist") return print(f"\n🎯 Processing {len(concept_files)} concept files:") # Extract variables from all concept files all_concept_variables = set() concept_count = 0 for file_path in concept_files: print(f"\n📄 Processing {file_path.name}...") file_variables = extract_variables_from_concept_file(str(file_path)) all_concept_variables.update(file_variables) # Count concepts in this file try: with open(file_path, 'r') as f: data = json.load(f) file_concept_count = len(data.get('concepts', [])) concept_count += file_concept_count print(f" Concepts in file: {file_concept_count}") except: pass print(f"\n📊 COOS Extraction Summary:") print(f" Total concept files: {len(concept_files)}") print(f" Total concepts: {concept_count}") print(f" Unique table references: {len(all_concept_variables)}") # Expand table IDs to specific variables print(f"\n🔍 Expanding table IDs to specific variables...") expanded_variables = set() expansion_report = {} for table_id in all_concept_variables: specific_vars = expand_table_to_variables(table_id, 
variable_corpus) expanded_variables.update(specific_vars) expansion_report[table_id] = len(specific_vars) if len(specific_vars) == 0: print(f" ⚠️ No variables found for table: {table_id}") print(f"\n📈 Expansion Results:") print(f" Table IDs processed: {len(all_concept_variables)}") print(f" Total expanded variables: {len(expanded_variables)}") # Show top table expansions print(f"\n🔢 Top table expansions:") sorted_expansions = sorted(expansion_report.items(), key=lambda x: x[1], reverse=True) for table_id, var_count in sorted_expansions[:10]: print(f" {table_id}: {var_count} variables") # Prepare output data output_data = {} for var_id in expanded_variables: if var_id in variable_corpus: output_data[var_id] = variable_corpus[var_id] else: print(f" ⚠️ Variable {var_id} not found in corpus") # Save results with open(OUTPUT_FILE, 'w') as f: json.dump(output_data, f, indent=2) print(f"\n✅ COOS variable extraction complete!") print(f"📁 Saved {len(output_data)} variables to: {OUTPUT_FILE}") # Cost estimate estimated_cost = len(output_data) * 0.008 # $0.008 per variable print(f"\n💰 Estimated enrichment cost: ${estimated_cost:.2f}") print(f"\n🚀 Ready to enrich COOS variables:") print(f" python enhanced_collaborative_enrichment.py \\") print(f" --input-file {OUTPUT_FILE} \\") print(f" --output-file coos_enriched_results.json \\") print(f" --openai-api-key $OPENAI_API_KEY \\") print(f" --claude-api-key $CLAUDE_API_KEY") if __name__ == "__main__": main()
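
For reference, a concept file under ../concept_templates only needs the two fields the extractor reads. The sketch below shows that minimal shape; the "name" label and the table ID are illustrative placeholders, not values taken from the repository.

# Minimal sketch of a concept file that extract_variables_from_concept_file() can parse.
# Only the "concepts" list and each concept's "census_tables" list are read by the script;
# the "name" field and the table ID here are hypothetical placeholders.
import json

example_concept_file = {
    "concepts": [
        {
            "name": "median_household_income",  # hypothetical label, ignored by the extractor
            "census_tables": ["B19013"]         # expanded to B19013_001E, etc. by expand_table_to_variables()
        }
    ]
}

with open("example_concept.json", "w") as f:
    json.dump(example_concept_file, f, indent=2)

Note that the relative paths in main() (../concept_templates, ../complete_2023_acs_variables/complete_variables.csv, ../coos_variables_extracted.json) resolve against the current working directory, so run the script from the directory it lives in or adjust the paths.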

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'
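
The same endpoint can be queried from code. The sketch below uses Python's requests library and assumes the endpoint returns JSON; the response schema is not documented on this page.

# Minimal sketch: fetch this server's metadata from the Glama MCP directory API.
# Assumes a JSON response; the response schema is an assumption, not shown here.
import requests

url = "https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server"
response = requests.get(url, timeout=30)
response.raise_for_status()  # raise on HTTP errors
print(response.json())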

If you have feedback or need assistance with the MCP directory API, please join our Discord server.