Skip to main content
Glama
comprehensive_118th_verification.py16.4 kB
#!/usr/bin/env python3 """ Comprehensive 118th Congress Data Verification and Testing Validates complete dataset ingestion and tests MCP server performance """ import sqlite3 import json import time from pathlib import Path from typing import Dict, List, Optional import logging class Comprehensive118thVerifier: def __init__(self, db_path: str): self.db_path = db_path self._setup_logging() def _setup_logging(self): """Configure logging system""" logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(), logging.FileHandler("logs/comprehensive_118th_verification.log", mode='a', encoding='utf-8') ] ) self.logger = logging.getLogger("Comprehensive118thVerifier") def verify_data_integrity(self) -> Dict: """Verify complete dataset integrity""" self.logger.info("Starting comprehensive data integrity verification") conn = sqlite3.connect(self.db_path) cursor = conn.cursor() verification_results = {} try: # 1. Bill count verification cursor.execute("SELECT COUNT(*) FROM bills WHERE congress = 118") total_bills = cursor.fetchone()[0] verification_results['total_bills'] = total_bills # 2. Bill types distribution cursor.execute(""" SELECT bill_type, COUNT(*) as count FROM bills WHERE congress = 118 GROUP BY bill_type ORDER BY count DESC """) bill_types = dict(cursor.fetchall()) verification_results['bill_types'] = bill_types # 3. Session distribution cursor.execute(""" SELECT session, COUNT(*) as count FROM bills WHERE congress = 118 GROUP BY session ORDER BY session """) sessions = dict(cursor.fetchall()) verification_results['sessions'] = sessions # 4. Content sections verification cursor.execute(""" SELECT COUNT(*) FROM bill_sections bs JOIN bills b ON bs.bill_id = b.bill_id WHERE b.congress = 118 """) total_sections = cursor.fetchone()[0] verification_results['total_sections'] = total_sections # 5. Section types distribution cursor.execute(""" SELECT bs.section_type, COUNT(*) as count FROM bill_sections bs JOIN bills b ON bs.bill_id = b.bill_id WHERE b.congress = 118 GROUP BY bs.section_type ORDER BY count DESC """) section_types = dict(cursor.fetchall()) verification_results['section_types'] = section_types # 6. Sponsor verification cursor.execute(""" SELECT COUNT(DISTINCT sponsor_name_id) FROM bills WHERE congress = 118 AND sponsor_name_id IS NOT NULL """) unique_sponsors = cursor.fetchone()[0] verification_results['unique_sponsors'] = unique_sponsors # 7. Content completeness (bills with sections) cursor.execute(""" SELECT COUNT(DISTINCT b.bill_id) FROM bills b LEFT JOIN bill_sections bs ON b.bill_id = bs.bill_id WHERE b.congress = 118 """) bills_with_content = cursor.fetchone()[0] content_completeness = (bills_with_content / total_bills * 100) if total_bills > 0 else 0 verification_results['content_completeness'] = { 'bills_with_content': bills_with_content, 'total_bills': total_bills, 'completeness_percentage': content_completeness } # 8. Data quality checks quality_checks = {} # Check for missing essential fields cursor.execute(""" SELECT COUNT(*) FROM bills WHERE congress = 118 AND (bill_id IS NULL OR bill_type IS NULL OR bill_number IS NULL) """) quality_checks['missing_essential_fields'] = cursor.fetchone()[0] # Check for duplicate bill_ids cursor.execute(""" SELECT bill_id, COUNT(*) as count FROM bills WHERE congress = 118 GROUP BY bill_id HAVING count > 1 """) duplicates = cursor.fetchall() quality_checks['duplicate_bill_ids'] = len(duplicates) verification_results['data_quality'] = quality_checks except Exception as e: self.logger.error(f"Error during verification: {str(e)}") verification_results['error'] = str(e) finally: conn.close() return verification_results def test_mcp_performance(self) -> Dict: """Test MCP server performance with full dataset""" self.logger.info("Testing MCP server performance with full dataset") performance_results = {} try: # Test 1: Bill listing performance start_time = time.time() conn = sqlite3.connect(self.db_path) cursor = conn.cursor() cursor.execute(""" SELECT bill_id, bill_type, bill_number, title, sponsor_name FROM bills WHERE congress = 118 ORDER BY bill_number LIMIT 100 """) bills = cursor.fetchall() conn.close() query_time = time.time() - start_time performance_results['bill_listing_100'] = { 'records_returned': len(bills), 'query_time_seconds': query_time, 'records_per_second': len(bills) / query_time if query_time > 0 else 0 } # Test 2: Full text search performance start_time = time.time() conn = sqlite3.connect(self.db_path) cursor = conn.cursor() cursor.execute(""" SELECT bill_id, title, official_title FROM bills WHERE congress = 118 AND (title LIKE '%infrastructure%' OR official_title LIKE '%infrastructure%') ORDER BY bill_number LIMIT 50 """) search_results = cursor.fetchall() conn.close() search_time = time.time() - start_time performance_results['infrastructure_search'] = { 'records_found': len(search_results), 'query_time_seconds': search_time, 'records_per_second': len(search_results) / search_time if search_time > 0 else 0 } # Test 3: Content sections retrieval performance start_time = time.time() conn = sqlite3.connect(self.db_path) cursor = conn.cursor() cursor.execute(""" SELECT bs.bill_id, bs.section_type, bs.header, bs.content FROM bill_sections bs JOIN bills b ON bs.bill_id = b.bill_id WHERE b.congress = 118 AND bs.section_type = 'section' ORDER BY bs.bill_id, bs.order_index LIMIT 200 """) sections = cursor.fetchall() conn.close() sections_time = time.time() - start_time performance_results['sections_retrieval_200'] = { 'records_returned': len(sections), 'query_time_seconds': sections_time, 'records_per_second': len(sections) / sections_time if sections_time > 0 else 0 } except Exception as e: self.logger.error(f"Error during performance testing: {str(e)}") performance_results['error'] = str(e) return performance_results def generate_comprehensive_report(self) -> Dict: """Generate comprehensive verification and performance report""" self.logger.info("Generating comprehensive 118th Congress report") # Data integrity verification integrity_results = self.verify_data_integrity() # Performance testing performance_results = self.test_mcp_performance() # Overall assessment overall_assessment = { 'data_integrity': integrity_results, 'performance': performance_results, 'summary': self._generate_summary(integrity_results, performance_results), 'recommendations': self._generate_recommendations(integrity_results, performance_results) } return overall_assessment def _generate_summary(self, integrity: Dict, performance: Dict) -> Dict: """Generate overall summary assessment""" summary = {} # Data completeness assessment total_bills = integrity.get('total_bills', 0) completeness = integrity.get('content_completeness', {}) completeness_pct = completeness.get('completeness_percentage', 0) if completeness_pct >= 90: summary['data_completeness'] = 'EXCELLENT' elif completeness_pct >= 75: summary['data_completeness'] = 'GOOD' elif completeness_pct >= 50: summary['data_completeness'] = 'FAIR' else: summary['data_completeness'] = 'POOR' # Performance assessment avg_query_time = ( performance.get('bill_listing_100', {}).get('query_time_seconds', 0) + performance.get('infrastructure_search', {}).get('query_time_seconds', 0) + performance.get('sections_retrieval_200', {}).get('query_time_seconds', 0) ) / 3 if avg_query_time <= 0.1: summary['performance'] = 'EXCELLENT' elif avg_query_time <= 0.5: summary['performance'] = 'GOOD' elif avg_query_time <= 2.0: summary['performance'] = 'FAIR' else: summary['performance'] = 'POOR' # Overall readiness data_issues = integrity.get('data_quality', {}) total_issues = ( data_issues.get('missing_essential_fields', 0) + data_issues.get('duplicate_bill_ids', 0) ) if summary['data_completeness'] in ['EXCELLENT', 'GOOD'] and summary['performance'] in ['EXCELLENT', 'GOOD'] and total_issues < 10: summary['overall_readiness'] = 'PRODUCTION READY' elif summary['data_completeness'] in ['GOOD', 'FAIR'] and summary['performance'] in ['GOOD', 'FAIR']: summary['overall_readiness'] = 'DEVELOPMENT READY' else: summary['overall_readiness'] = 'NEEDS IMPROVEMENT' return summary def _generate_recommendations(self, integrity: Dict, performance: Dict) -> List[str]: """Generate improvement recommendations""" recommendations = [] # Data quality recommendations quality_issues = integrity.get('data_quality', {}) if quality_issues.get('missing_essential_fields', 0) > 0: recommendations.append(f"Fix {quality_issues['missing_essential_fields']} bills with missing essential fields") if quality_issues.get('duplicate_bill_ids', 0) > 0: recommendations.append(f"Resolve {quality_issues['duplicate_bill_ids']} duplicate bill IDs") # Performance recommendations avg_query_time = ( performance.get('bill_listing_100', {}).get('query_time_seconds', 0) + performance.get('infrastructure_search', {}).get('query_time_seconds', 0) + performance.get('sections_retrieval_200', {}).get('query_time_seconds', 0) ) / 3 if avg_query_time > 1.0: recommendations.append("Optimize database indexes for better query performance") if avg_query_time > 2.0: recommendations.append("Consider database query optimization and connection pooling") # Content recommendations completeness = integrity.get('content_completeness', {}) completeness_pct = completeness.get('completeness_percentage', 0) if completeness_pct < 80: recommendations.append("Improve content extraction to capture more legislative text") return recommendations def save_report(self, report: Dict): """Save comprehensive report to file""" report_file = Path("comprehensive_118th_report.json") try: with open(report_file, 'w') as f: json.dump(report, f, indent=2, default=str) self.logger.info(f"Comprehensive report saved to {report_file}") except Exception as e: self.logger.error(f"Error saving report: {str(e)}") def print_summary(self, report: Dict): """Print formatted summary to console""" print("\n" + "="*80) print("COMPREHENSIVE 118TH CONGRESS DATA VERIFICATION REPORT") print("="*80) # Data integrity summary integrity = report['data_integrity'] print(f"\n📊 DATA INTEGRITY:") print(f" Total Bills: {integrity.get('total_bills', 0):,}") print(f" Total Sections: {integrity.get('total_sections', 0):,}") print(f" Unique Sponsors: {integrity.get('unique_sponsors', 0):,}") bill_types = integrity.get('bill_types', {}) print(f"\n📋 BILL TYPES:") for bill_type, count in list(bill_types.items())[:5]: # Top 5 print(f" {bill_type}: {count:,}") sessions = integrity.get('sessions', {}) print(f"\n📅 SESSIONS:") for session, count in sessions.items(): print(f" Session {session}: {count:,} bills") completeness = integrity.get('content_completeness', {}) print(f"\n✅ CONTENT COMPLETENESS:") print(f" Bills with Content: {completeness.get('bills_with_content', 0):,}") print(f" Completeness: {completeness.get('completeness_percentage', 0):.1f}%") # Performance summary performance = report['performance'] print(f"\n⚡ PERFORMANCE:") for test_name, results in performance.items(): if isinstance(results, dict): print(f" {test_name}:") print(f" Records: {results.get('records_returned', 0):,}") print(f" Query Time: {results.get('query_time_seconds', 0):.3f}s") print(f" Rate: {results.get('records_per_second', 0):.1f} records/s") # Overall assessment summary = report['summary'] print(f"\n🎯 OVERALL ASSESSMENT:") print(f" Data Completeness: {summary.get('data_completeness', 'UNKNOWN')}") print(f" Performance: {summary.get('performance', 'UNKNOWN')}") print(f" Overall Readiness: {summary.get('overall_readiness', 'UNKNOWN')}") # Recommendations recommendations = report.get('recommendations', []) if recommendations: print(f"\n💡 RECOMMENDATIONS:") for i, rec in enumerate(recommendations, 1): print(f" {i}. {rec}") print("="*80) def main(): """Main verification and testing entry point""" verifier = Comprehensive118thVerifier("data/govinfo_downloads.db") # Generate comprehensive report report = verifier.generate_comprehensive_report() # Save report verifier.save_report(report) # Print summary verifier.print_summary(report) if __name__ == "__main__": main()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cbwinslow/opendiscourse_mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server