Open Census MCP Server

by brockwebb
test_rag_imbalance.py
#!/usr/bin/env python3 """ Direct ChromaDB test to prove RAG imbalance hypothesis """ import chromadb from pathlib import Path from sentence_transformers import SentenceTransformer def test_rag_imbalance(): """Test methodology vs variable dominance in existing ChromaDB""" # Load the CORRECT embedding model that built the ChromaDB print("Loading sentence transformer model...") embedding_model = SentenceTransformer('all-mpnet-base-v2') # 768-dim print(f"Model loaded: {embedding_model.get_sentence_embedding_dimension()}-dimensional embeddings") # Connect directly to ChromaDB try: client = chromadb.PersistentClient(path=str(Path("vector-db"))) collection = client.get_collection("census_knowledge") print(f"✅ Connected to collection with {collection.count():,} documents") except Exception as e: print(f"❌ Failed to connect: {e}") print("Make sure you're running from knowledge-base/ directory") return # Test queries that should find methodology docs methodology_queries = [ "ACS sampling methodology", "margin of error calculation", "survey design principles", "how does Census collect data", "statistical reliability", "data quality issues" ] print("\n🔍 Testing Methodology Queries") print("=" * 60) total_variable_hits = 0 total_methodology_hits = 0 for query in methodology_queries: print(f"\nQuery: '{query}'") try: # Generate query embedding with correct model query_embedding = embedding_model.encode([query]) results = collection.query( query_embeddings=query_embedding.tolist(), n_results=5, include=['metadatas', 'distances'] ) if not results['metadatas'] or not results['metadatas'][0]: print(" No results") continue variable_count = 0 methodology_count = 0 print(" Top 5 results:") for i, (metadata, distance) in enumerate(zip(results['metadatas'][0], results['distances'][0])): category = metadata.get('category', 'unknown') source = metadata.get('source_file', metadata.get('source', 'unknown')) # Classify result type if (category == 'canonical_variables' or 'canonical_variables' in source or 'Variable:' in metadata.get('title', '')): variable_count += 1 result_type = "🔢 VARIABLE" else: methodology_count += 1 result_type = "📄 METHODOLOGY" print(f" {i+1}. {result_type} | {source[:50]}... 
| dist: {distance:.3f}") total_variable_hits += variable_count total_methodology_hits += methodology_count # Query assessment if variable_count > methodology_count: print(" ⚠️ VARIABLES DOMINATING") elif methodology_count > variable_count: print(" ✅ Methodology ranking well") else: print(" ➡️ Mixed results") except Exception as e: print(f" ❌ Query failed: {e}") # Overall imbalance assessment print(f"\n📊 OVERALL RESULTS") print("=" * 40) print(f"Total variable hits in top-5: {total_variable_hits}") print(f"Total methodology hits in top-5: {total_methodology_hits}") if total_variable_hits > total_methodology_hits * 2: print("🚨 SEVERE IMBALANCE - Variables dominating methodology queries") print(" Recommendation: SEPARATE THE DATABASES") elif total_variable_hits > total_methodology_hits: print("⚠️ MODERATE IMBALANCE - Variables crowding methodology") print(" Recommendation: Consider separation or better ranking") else: print("✅ Reasonable balance maintained") # Collection composition analysis print(f"\n📈 Collection Composition") print("=" * 30) try: # Sample analysis (avoid loading entire collection) sample_size = min(5000, collection.count()) sample = collection.get(limit=sample_size, include=['metadatas']) category_counts = {} for metadata in sample['metadatas']: category = metadata.get('category', 'unknown') category_counts[category] = category_counts.get(category, 0) + 1 total_sampled = len(sample['metadatas']) for category, count in sorted(category_counts.items()): percentage = (count / total_sampled) * 100 print(f" {category}: {count:,} docs ({percentage:.1f}%)") # Calculate variable dominance ratio var_count = category_counts.get('canonical_variables', 0) other_count = total_sampled - var_count if other_count > 0: ratio = var_count / other_count print(f"\n📊 Variables:Other ratio: {ratio:.1f}:1") if ratio > 10: print("🚨 SEVERE IMBALANCE (>10:1) - Separation critical") elif ratio > 5: print("⚠️ HIGH IMBALANCE (>5:1) - Separation recommended") else: print("✅ Manageable ratio - unified approach viable") except Exception as e: print(f"⚠️ Could not analyze composition: {e}") print(f"\n💡 VERDICT:") print("If variables dominated methodology queries above,") print("your hypothesis is CONFIRMED - separate the databases.") if __name__ == "__main__": test_rag_imbalance()
