#!/usr/bin/env python3
"""
Interactive Census MCP Evaluation Session - ROBUST VERSION
No more data loss, handles 3000 char responses properly
Usage:
python interactive_session.py
"""
import json
import re
import traceback
from pathlib import Path
from datetime import datetime
from typing import Dict
from evaluation_db import CensusMCPEvaluator
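# Expects benchmark_queries.json (with a top-level "queries" list) and the
# evaluation database helpers from evaluation_db.py in the working directory.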
class InteractiveSession:
"""Robust interactive evaluation session - no more data loss"""
def __init__(self):
self.evaluator = CensusMCPEvaluator("census_mcp_evaluation.db")
self.state_file = Path("session_state.json")
# Load benchmark queries
with open("benchmark_queries.json", 'r') as f:
data = json.load(f)
self.queries = data['queries']
# Standard prompt template
self.prompt_template = """I'm systematically evaluating this Census MCP system. Please answer the following question and then provide structured evaluation data:
QUESTION: {question}
After providing your answer, please include this structured evaluation section:
=== EVALUATION DATA ===
MCP_TOOL_CALLED: [exact tool name(s) and parameters used]
CENSUS_VARIABLES: [specific variable codes like B19013_001E]
API_CALLS: [exact API call(s) used]
DATA_RETURNED: [key numbers/values from response]
MARGIN_OF_ERROR: [MOE values if provided]
METHODOLOGY_NOTES: [statistical caveats, survey info, limitations]
GEOGRAPHIC_LEVEL: [state/county/place/tract etc.]
SUCCESS: [true/false - did the MCP work as expected]
ISSUES_ENCOUNTERED: [any problems, failures, or limitations]
DISAMBIGUATION_NEEDED: [true/false - was location/question ambiguous]
ROUTING_SUGGESTION: [if other data sources recommended]
CONFIDENCE_LEVEL: [your assessment 0.0-1.0 of response quality]
=== END EVALUATION DATA ==="""
def save_state(self, state: Dict):
"""Save session state with backup"""
try:
with open(self.state_file, 'w') as f:
json.dump(state, f, indent=2)
            # Keep a backup copy so a failed write of one file cannot lose the session
backup_file = self.state_file.with_suffix('.backup.json')
with open(backup_file, 'w') as f:
json.dump(state, f, indent=2)
except Exception as e:
print(f"⚠️ Warning: Could not save state: {e}")
def load_state(self) -> Dict:
"""Load session state with backup fallback"""
for state_file in [self.state_file, self.state_file.with_suffix('.backup.json')]:
if state_file.exists():
try:
with open(state_file, 'r') as f:
return json.load(f)
except Exception as e:
print(f"⚠️ Warning: Could not load {state_file}: {e}")
return {}
def start_new_run(self):
"""Start a new evaluation run"""
print("🏛️ Census MCP Evaluation Session")
print("=" * 50)
run_name = input("Enter run name (e.g., v2.1-semantic): ").strip()
if not run_name:
print("❌ Run name required")
return False
description = input("Enter description (optional): ").strip()
try:
# Create database run
run_id = self.evaluator._create_test_run(run_name, description)
# Initialize state
state = {
'run_name': run_name,
'run_id': run_id,
'current_query_index': 0,
'completed_queries': [],
'started_at': datetime.now().isoformat()
}
self.save_state(state)
print(f"✅ Started evaluation run: {run_name}")
return True
except Exception as e:
print(f"❌ Error creating run: {e}")
return False
def parse_response(self, query: Dict, response_text: str) -> Dict:
"""Parse Claude's response - ROBUST VERSION"""
# Extract evaluation data section
eval_match = re.search(
r'=== EVALUATION DATA ===(.*?)=== END EVALUATION DATA ===',
response_text,
re.DOTALL
)
if not eval_match:
print("⚠️ Warning: No evaluation data section found")
eval_section = ""
else:
eval_section = eval_match.group(1)
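        # eval_section now holds everything between the two markers, or "" if
        # the markers were missing from the paste.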
# Parse fields with robust defaults
def extract_field(field_name: str, default: str = "not provided") -> str:
pattern = rf'{field_name}:\s*(.+?)(?=\n[A-Z_]+:|$)'
match = re.search(pattern, eval_section, re.IGNORECASE | re.DOTALL)
if match:
value = match.group(1).strip().strip('[]')
return value if value else default
return default
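        # For example, given 'CENSUS_VARIABLES: [B19013_001E]\nAPI_CALLS: ...',
        # extract_field("CENSUS_VARIABLES") returns "B19013_001E": the lookahead
        # stops at the next ALL_CAPS field label and surrounding brackets are stripped.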
        # Extract the main answer (everything before the evaluation block)
if eval_match:
main_answer = response_text[:eval_match.start()].strip()
else:
main_answer = response_text.strip()
# Truncate to 3000 chars with smart sentence boundary
if len(main_answer) > 3000:
truncated = main_answer[:3000]
last_period = truncated.rfind('.')
if last_period > 2000: # If we can get substantial content
main_answer = truncated[:last_period + 1] + " [TRUNCATED]"
else:
main_answer = truncated + " [TRUNCATED]"
# Parse evaluation fields
mcp_tool = extract_field("MCP_TOOL_CALLED", "unknown")
census_vars = extract_field("CENSUS_VARIABLES", "unknown")
data_returned = extract_field("DATA_RETURNED", "unknown")
moe = extract_field("MARGIN_OF_ERROR", "not provided")
methodology = extract_field("METHODOLOGY_NOTES", "not provided")
        success_text = extract_field("SUCCESS", "false").lower()
        # Accept bare "true" as well as annotated values like "true - all calls worked"
        success = success_text.startswith(("true", "yes", "1", "success"))
issues = extract_field("ISSUES_ENCOUNTERED", "none reported")
# Parse confidence level robustly
confidence_text = extract_field("CONFIDENCE_LEVEL", "0.5")
try:
confidence_match = re.search(r'(\d+\.?\d*)', confidence_text)
if confidence_match:
confidence = float(confidence_match.group(1))
if confidence > 1.0:
confidence = confidence / 10.0
confidence = max(0.0, min(1.0, confidence))
else:
confidence = 0.5
except (ValueError, AttributeError):
confidence = 0.5
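        # e.g. "0.9", "0.9 (high)", and "9" all normalize to 0.9; anything
        # unparseable falls back to a neutral 0.5.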
# Score the response
scores = self.score_response(query, main_answer, {
'success': success,
'confidence': confidence,
'census_vars': census_vars,
'data_returned': data_returned,
'issues': issues
})
# Return complete test data
return {
'query_id': query['query_id'],
'query_text': query['query_text'],
'query_category': query['query_category'],
'mcp_tool_called': mcp_tool,
'mcp_parameters': json.dumps({"tool": mcp_tool, "success": success}),
'mcp_success': success,
            'final_answer': main_answer,  # capped at 3,000 chars
'census_variables': census_vars,
'margin_of_error': moe,
'methodology_notes': methodology,
'correctness': float(scores['correctness']),
'plan_quality': float(scores['plan_quality']),
'tool_coordination': float(scores['tool_coordination']),
'limitation_handling': float(scores['limitation_handling']),
'disambiguation': float(scores['disambiguation']),
'methodology_guidance': float(scores['methodology_guidance']),
'passed': bool(scores['passed']),
'failure_reason': scores['failure_reason'] or "",
'notes': scores['notes'] or ""
}
def score_response(self, query: Dict, answer: str, eval_data: Dict) -> Dict:
"""Score the response based on category and content"""
category = query['query_category']
scores = {
'correctness': 0.5,
'plan_quality': 0.5,
'tool_coordination': 0.7 if eval_data['success'] else 0.2,
'limitation_handling': 0.5,
'disambiguation': 0.5,
'methodology_guidance': 0.3,
'passed': False,
'failure_reason': '',
'notes': ''
}
# Category-specific scoring
        if category == 'derived_statistic' and 'poverty rate' in query['query_text'].lower():
if '%' in answer and any(num in answer for num in ['31.', '32.', '33.', '34.']):
scores['correctness'] = 1.0
scores['passed'] = True
scores['notes'] = '🎉 SUCCESS: Returns percentage rate (v2.1 fix working!)'
elif '%' in answer:
scores['correctness'] = 0.8
scores['passed'] = True
scores['notes'] = 'Returns percentage but unexpected value'
else:
scores['correctness'] = 0.2
scores['failure_reason'] = 'Still returning count instead of percentage'
elif category == 'basic_demographic':
if eval_data['success'] and '$' in answer and '±' in answer:
scores['correctness'] = 1.0
scores['passed'] = True
scores['notes'] = 'Proper income data with MOE'
        elif category in ('limitation_handling', 'data_boundary'):
if not eval_data['success'] or 'cannot' in answer.lower() or 'error' in answer.lower():
scores['correctness'] = 1.0
scores['limitation_handling'] = 1.0
scores['passed'] = True
scores['notes'] = 'Correctly rejected inappropriate query'
elif category == 'geographic_ambiguity':
if 'clarification' in answer.lower() or 'which' in answer.lower():
scores['disambiguation'] = 1.0
scores['passed'] = True
else:
scores['disambiguation'] = 0.2
scores['failure_reason'] = 'Should have requested clarification'
# Methodology bonus
if any(term in answer.lower() for term in ['acs', 'margin of error', 'confidence', 'survey']):
scores['methodology_guidance'] = 0.8
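        # Illustration (hypothetical answer text): with SUCCESS true, a
        # basic_demographic answer like "$68,500 ± $2,100 (2022 ACS)" scores
        # correctness 1.0, passes, and earns the 0.8 methodology bonus via "acs".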
return scores
def continue_session(self):
"""Continue existing session with bulletproof error handling"""
state = self.load_state()
if not state:
print("No session in progress. Starting new run...")
if not self.start_new_run():
return
state = self.load_state()
        current_index = state.get('current_query_index', 0)
while current_index < len(self.queries):
query = self.queries[current_index]
# Show progress
print(f"\n📊 Progress: {current_index + 1}/{len(self.queries)}")
print(f"🏷️ Category: {query['query_category']}")
print(f"🔍 Query ID: {query['query_id']}")
print("=" * 70)
# Generate prompt
prompt = self.prompt_template.format(question=query['query_text'])
print("📋 Copy this prompt to Claude Desktop:")
print("-" * 50)
print(prompt)
print("-" * 50)
# Get response
print("\n📥 Paste Claude's FULL response below:")
print("📝 Include everything from answer through '=== END EVALUATION DATA ==='")
print("🔄 Type 'DONE' on a new line when finished pasting:")
response_lines = []
while True:
try:
line = input()
if line.strip().upper() == 'DONE':
print("✅ Processing response...")
break
elif line.strip() == "=== END EVALUATION DATA ===":
response_lines.append(line)
print("✅ Detected end marker. Processing...")
break
else:
response_lines.append(line)
except KeyboardInterrupt:
print("\n⚠️ Interrupted. Options:")
print("1. Skip this query")
print("2. Save and exit")
print("3. Continue entering response")
choice = input("Choice (1/2/3): ").strip()
if choice == "1":
current_index += 1
state['current_query_index'] = current_index
self.save_state(state)
break
elif choice == "2":
self.save_state(state)
print("💾 Session saved. Run again to continue.")
return
else:
print("Continuing...")
continue
response_text = '\n'.join(response_lines)
if not response_text.strip():
print("❌ Empty response. Skipping...")
current_index += 1
state['current_query_index'] = current_index
self.save_state(state)
continue
# Process response
try:
parsed_data = self.parse_response(query, response_text)
# Save immediately to prevent data loss
self.evaluator._add_query_test(state['run_id'], parsed_data)
print(f"💾 Saved to database immediately")
# Show result
print(f"\n✅ Processed {query['query_id']}")
print(f" Success: {parsed_data['passed']}")
print(f" Score: {parsed_data['correctness']:.1f}")
print(f" Answer length: {len(parsed_data['final_answer'])} chars")
if parsed_data['failure_reason']:
print(f" Issue: {parsed_data['failure_reason']}")
# Update state
state['completed_queries'].append(query['query_id'])
state['current_query_index'] = current_index + 1
self.save_state(state)
current_index += 1
except Exception as e:
print(f"❌ Error processing response: {e}")
                traceback.print_exc()
print("\nOptions:")
print("1. Retry this query")
print("2. Skip this query")
print("3. Save and exit")
choice = input("Choice (1/2/3): ").strip()
if choice == "2":
current_index += 1
state['current_query_index'] = current_index
self.save_state(state)
elif choice == "3":
self.save_state(state)
print("💾 Session saved.")
return
# Evaluation complete
try:
self.finish_evaluation(state)
except Exception as e:
print(f"⚠️ Error in final scoring: {e}")
print("Your data is saved. View results with:")
print(f"python evaluation_db.py --score-run '{state['run_name']}'")
def finish_evaluation(self, state: Dict):
"""Finish evaluation and show results"""
print("\n🎉 EVALUATION COMPLETE!")
print("=" * 50)
# Update database
self.evaluator._update_run_summary(state['run_id'])
# Show results
self.evaluator.score_run(state['run_name'])
# Clean up
self.state_file.unlink(missing_ok=True)
        self.state_file.with_suffix('.backup.json').unlink(missing_ok=True)
print(f"\n📊 Compare results:")
print(f"python evaluation_db.py --compare-runs baseline {state['run_name']}")
    def show_menu(self):
        """Show main menu"""
        # Loop instead of recursing so long sessions cannot hit Python's
        # recursion limit.
        while True:
            print("\n🏛️ Census MCP Evaluation Session")
            print("=" * 40)
            print("1. Start new evaluation run")
            print("2. Continue existing session")
            print("3. Show session status")
            print("4. Exit")
            choice = input("\nSelect option (1-4): ").strip()
            if choice == "1":
                if self.start_new_run():
                    self.continue_session()
            elif choice == "2":
                self.continue_session()
            elif choice == "3":
                state = self.load_state()
                if state:
                    completed = len(state['completed_queries'])
                    total = len(self.queries)
                    print(f"Run: {state['run_name']}")
                    print(f"Progress: {completed}/{total} queries")
                else:
                    print("No session in progress")
            elif choice == "4":
                print("👋 Goodbye!")
                return
            else:
                print("Invalid choice")
def main():
session = InteractiveSession()
session.show_menu()
if __name__ == "__main__":
main()