Open Census MCP Server

debug_embedding.py•5.45 KiB

#!/usr/bin/env python3 """ Debug what text is actually being embedded for specific variables """ import json import sys from pathlib import Path def load_official_labels(base_path): """Load official Census labels from the comprehensive mapping.""" label_map_path = base_path / "variable_label_map.json" if not label_map_path.exists(): print(f"❌ Label map not found at {label_map_path}") return {} try: with open(label_map_path) as f: label_map = json.load(f) print(f"✅ Loaded {len(label_map)} official Census labels") return label_map except Exception as e: print(f"❌ Could not load label map: {e}") return {} def create_clean_embedding_text(record, official_labels=None): """Create clean embedding text using official Census descriptions.""" variable_id = record.get('variable_id', '') # Use official Census labels when available if official_labels and variable_id in official_labels: official = official_labels[variable_id] label = official['label'] concept = official['concept'] survey = official['survey'] source = "OFFICIAL" else: # Fallback to enriched data if no official mapping label = record.get('label', f'Variable {variable_id}') concept = record.get('concept', 'Unknown') survey = record.get('survey', 'acs5') source = "FALLBACK" # Clean up "Unknown" fallbacks if label == 'Unknown': label = f'Variable {variable_id}' if concept == 'Unknown': concept = 'Census Variable' # Create embedding text with Spock's format text = f"V_ID_{variable_id} {label} {concept} {survey}" return text.strip(), source def main(): # Set up paths base_path = Path("knowledge-base") universe_file = base_path / "2023_ACS_Enriched_Universe_weighted.json" if not universe_file.exists(): print(f"❌ Universe file not found: {universe_file}") sys.exit(1) # Load official labels print("🔍 Loading official Census labels...") official_labels = load_official_labels(base_path) # Load universe data print(f"📁 Loading universe data from {universe_file}...") with open(universe_file) as f: data = json.load(f) # Handle different formats if isinstance(data, dict): if 'variables' in data: variables = list(data['variables'].values()) if isinstance(data['variables'], dict) else data['variables'] else: variables = list(data.values()) else: variables = data print(f"📊 Loaded {len(variables)} variables") # Test specific variables that failed test_variables = [ "B25077_001E", # Should be "Median value (dollars)" "B03003_003E", # Should be "Hispanic or Latino" "B01001_020E", # Should be age 65+ related "B19013_001E" # Should be "Median household income" ] print("\n🎯 Testing specific variables that failed semantic search:") print("=" * 80) found_count = 0 for variable_id in test_variables: # Find the variable in the data variable_record = None for record in variables: if record.get('variable_id') == variable_id: variable_record = record break if variable_record: found_count += 1 text, source = create_clean_embedding_text(variable_record, official_labels) print(f"\n📍 {variable_id}:") print(f" Source: {source}") print(f" Embedding text: {text}") # Show original data for comparison orig_label = variable_record.get('label', 'N/A') orig_concept = variable_record.get('concept', 'N/A') print(f" Original label: {orig_label}") print(f" Original concept: {orig_concept}") # Check if it's in official labels if official_labels and variable_id in official_labels: official_data = official_labels[variable_id] print(f" Official label: {official_data['label']}") print(f" Official concept: {official_data['concept']}") else: print(f" ❌ NOT FOUND in official labels") else: print(f"\n❌ {variable_id}: NOT FOUND in universe data") print(f"\n📈 Summary:") print(f" Found {found_count}/{len(test_variables)} test variables") print(f" Official labels available: {'YES' if official_labels else 'NO'}") if not official_labels: print(f"\n💡 Next step: Run this to generate official labels:") print(f" python knowledge-base/scripts/build_label_concept_map.py \\") print(f" --raw-vars knowledge-base/complete_2023_acs_variables/complete_variables.json \\") print(f" --out knowledge-base/variable_label_map.json") # Sample a few random variables to see the pattern print(f"\n🔬 Sample of first 5 variables for pattern check:") print("-" * 60) for i, record in enumerate(variables[:5]): if isinstance(record, dict) and 'variable_id' in record: text, source = create_clean_embedding_text(record, official_labels) var_id = record['variable_id'] print(f"{i+1}. {var_id} ({source}): {text[:100]}...") if __name__ == "__main__": main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

debug_embedding.py•5.45 KiB