Skip to main content
Glama
brockwebb

Open Census MCP Server

by brockwebb
test_dual_path_rag.pyโ€ข8.04 kB
#!/usr/bin/env python3 """ Test dual-path RAG separation: Variables vs Methodology databases Proves that separation solved the imbalance problem """ import chromadb from pathlib import Path from sentence_transformers import SentenceTransformer def test_dual_path_rag(): """Test methodology retrieval quality in separated vs unified approach""" # Load the embedding model print("Loading sentence transformer model...") embedding_model = SentenceTransformer('all-mpnet-base-v2') # 768-dim print(f"Model loaded: {embedding_model.get_sentence_embedding_dimension()}-dimensional embeddings") # Connect to both databases try: # Variables database variables_client = chromadb.PersistentClient(path=str(Path("variables-db"))) variables_collection = variables_client.get_collection("census_variables") print(f"โœ… Variables DB: {variables_collection.count():,} documents") # Methodology database methodology_client = chromadb.PersistentClient(path=str(Path("methodology-db"))) methodology_collection = methodology_client.get_collection("census_methodology") print(f"โœ… Methodology DB: {methodology_collection.count():,} documents") except Exception as e: print(f"โŒ Failed to connect: {e}") print("Make sure both variables-db/ and methodology-db/ directories exist") return # Test queries that should find methodology, not variables methodology_queries = [ "ACS sampling methodology", "margin of error calculation", "survey design principles", "how does Census collect data", "statistical reliability", "data quality issues" ] print("\n๐Ÿ” Testing Methodology Queries in SEPARATED Databases") print("=" * 70) total_methodology_quality = 0 for query in methodology_queries: print(f"\nQuery: '{query}'") try: # Generate query embedding query_embedding = embedding_model.encode([query]) # Test Variables DB (should have poor/irrelevant results) print(" ๐Ÿ”ข VARIABLES DB:") var_results = variables_collection.query( query_embeddings=query_embedding.tolist(), n_results=3, include=['metadatas', 'distances'] ) if var_results['metadatas'] and var_results['metadatas'][0]: for i, (metadata, distance) in enumerate(zip(var_results['metadatas'][0], var_results['distances'][0])): var_id = metadata.get('temporal_id', 'unknown') label = metadata.get('label', 'unknown')[:40] print(f" {i+1}. {var_id} | {label}... | dist: {distance:.3f}") print(" (Variables shouldn't be relevant for methodology queries)") else: print(" No results") # Test Methodology DB (should have excellent results) print(" ๐Ÿ“„ METHODOLOGY DB:") method_results = methodology_collection.query( query_embeddings=query_embedding.tolist(), n_results=5, include=['metadatas', 'distances'] ) if method_results['metadatas'] and method_results['metadatas'][0]: excellent_results = 0 for i, (metadata, distance) in enumerate(zip(method_results['metadatas'][0], method_results['distances'][0])): source = metadata.get('source_file', 'unknown') category = metadata.get('category', 'unknown') print(f" {i+1}. {category} | {source[:50]}... | dist: {distance:.3f}") # Count high-quality results (distance < 1.0 = good semantic match) if distance < 1.0: excellent_results += 1 print(f" โœ… High-quality results: {excellent_results}/5") total_methodology_quality += excellent_results else: print(" No results") except Exception as e: print(f" โŒ Query failed: {e}") # Overall assessment print(f"\n๐Ÿ“Š SEPARATION QUALITY ASSESSMENT") print("=" * 50) max_possible = len(methodology_queries) * 5 quality_percentage = (total_methodology_quality / max_possible) * 100 print(f"High-quality methodology results: {total_methodology_quality}/{max_possible} ({quality_percentage:.1f}%)") if quality_percentage > 70: print("๐ŸŽ‰ EXCELLENT: Separation dramatically improved methodology retrieval!") elif quality_percentage > 50: print("โœ… GOOD: Separation improved methodology retrieval quality") elif quality_percentage > 30: print("โš ๏ธ MODERATE: Some improvement, but could be better") else: print("โŒ POOR: Separation didn't help much") # Test variable lookup quality print(f"\n๐Ÿ” Testing Variable Lookup in Variables Database") print("=" * 50) variable_queries = [ "B01001_001", # Total population "median household income", "race demographics", "housing occupancy" ] for query in variable_queries: print(f"\nVariable Query: '{query}'") try: query_embedding = embedding_model.encode([query]) results = variables_collection.query( query_embeddings=query_embedding.tolist(), n_results=3, include=['metadatas', 'distances'] ) if results['metadatas'] and results['metadatas'][0]: for i, (metadata, distance) in enumerate(zip(results['metadatas'][0], results['distances'][0])): var_id = metadata.get('temporal_id', 'unknown') label = metadata.get('label', 'unknown')[:50] concept = metadata.get('concept', 'unknown')[:30] print(f" {i+1}. {var_id} | {label}... | {concept} | dist: {distance:.3f}") else: print(" No results") except Exception as e: print(f" โŒ Query failed: {e}") # Database composition analysis print(f"\n๐Ÿ“ˆ Database Composition Analysis") print("=" * 40) # Variables DB composition var_sample = variables_collection.get(limit=100, include=['metadatas']) if var_sample and var_sample.get('metadatas'): var_categories = {} for metadata in var_sample['metadatas']: category = metadata.get('category', 'unknown') var_categories[category] = var_categories.get(category, 0) + 1 print(f"Variables DB categories:") for category, count in var_categories.items(): percentage = (count / len(var_sample['metadatas'])) * 100 print(f" {category}: {percentage:.1f}%") # Methodology DB composition method_sample = methodology_collection.get(limit=100, include=['metadatas']) if method_sample and method_sample.get('metadatas'): method_categories = {} for metadata in method_sample['metadatas']: category = metadata.get('category', 'unknown') method_categories[category] = method_categories.get(category, 0) + 1 print(f"Methodology DB categories:") for category, count in method_categories.items(): percentage = (count / len(method_sample['metadatas'])) * 100 print(f" {category}: {percentage:.1f}%") print(f"\n๐Ÿ’ก CONCLUSION:") print("โœ… Variables DB: Pure entity lookup (no methodology contamination)") print("โœ… Methodology DB: Pure conceptual search (no variable noise)") print("๐ŸŽฏ Separation strategy: VALIDATED") if __name__ == "__main__": test_dual_path_rag()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server