Skip to main content
Glama
brockwebb

Open Census MCP Server

by brockwebb
create_bulk_variables.py (4.96 kB)
#!/usr/bin/env python3
"""
Create bulk variables list by excluding already processed variables
"""

import argparse
import json
from pathlib import Path

import pandas as pd

# Per-variable rate (USD) used only for the cost estimate printed by main().
COST_PER_VARIABLE_USD = 0.001

# Metadata columns copied from each CSV corpus row into the corpus dict.
CORPUS_FIELDS = (
    'label',
    'concept',
    'predicateType',
    'group',
    'survey',
    'complexity',
)


def _is_csv(file_path):
    """Return True when *file_path* (str or Path) names a ``.csv`` file."""
    return str(file_path).endswith('.csv')


def _value_or_unknown(value, default='Unknown'):
    """Return *value*, or *default* when it is missing (None or NaN).

    pandas represents blank CSV cells as NaN; letting those through would
    make ``json.dump`` emit bare ``NaN`` tokens, which are not valid JSON.
    """
    if value is None or pd.isna(value):
        return default
    return value


def load_variable_ids(file_path):
    """Load variable IDs from various file formats.

    Supported inputs:
      * CSV with a ``variable_id`` column (preferred) or a ``name`` column.
      * JSON object: its keys are the IDs.
      * JSON list: each item is either an ID string or a dict carrying a
        ``variable_id`` field; other items are ignored.

    Args:
        file_path: Path (str or ``pathlib.Path``) of the file to read.

    Returns:
        A set of variable IDs. An empty set is returned, after printing a
        warning, when the file is missing, unreadable or has an unexpected
        layout; this function never raises for those cases.
    """
    if not Path(file_path).exists():
        print(f"⚠️ File not found: {file_path}")
        return set()

    try:
        # Handle CSV files
        if _is_csv(file_path):
            df = pd.read_csv(file_path)
            for column in ('variable_id', 'name'):
                if column in df.columns:
                    return set(df[column].dropna())
            print(f"⚠️ CSV file {file_path} missing 'variable_id' or 'name' column")
            return set()

        # Handle JSON files (explicit encoding: JSON text is UTF-8, and the
        # platform default encoding is not guaranteed to be).
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, dict):
            return set(data.keys())

        if isinstance(data, list):
            # Handle results format with variable_id field
            var_ids = set()
            for item in data:
                if isinstance(item, dict) and 'variable_id' in item:
                    var_ids.add(item['variable_id'])
                elif isinstance(item, str):
                    var_ids.add(item)
            return var_ids

        print(f"⚠️ Unexpected format in {file_path}")
        return set()

    except Exception as e:
        # Deliberate best-effort boundary: a broken exclusion file is
        # reported and treated as "nothing to exclude" rather than aborting.
        print(f"⚠️ Error loading {file_path}: {e}")
        return set()


def load_complete_corpus(file_path):
    """Load complete corpus data in proper format.

    For a ``.csv`` file, each row with a usable ``variable_id`` becomes an
    entry mapping that ID to a dict of the CORPUS_FIELDS columns. Rows whose
    ``variable_id`` is blank are skipped, and blank or absent metadata
    cells are stored as ``'Unknown'``. Any other file is parsed as JSON and
    returned unchanged.

    Args:
        file_path: Path (str or ``pathlib.Path``) of the corpus file.

    Returns:
        For CSV input, a dict keyed by variable ID. For JSON input, whatever
        the document decodes to.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the CSV or JSON content cannot be parsed.
    """
    if _is_csv(file_path):
        df = pd.read_csv(file_path)
        corpus_data = {}
        # to_dict('records') yields plain dicts, so .get() returns None for
        # columns the CSV does not have at all.
        for record in df.to_dict(orient='records'):
            var_id = record.get('variable_id')
            # NaN is truthy, so a plain `if var_id:` would keep blank IDs.
            if var_id is None or pd.isna(var_id) or not var_id:
                continue
            corpus_data[var_id] = {
                field: _value_or_unknown(record.get(field))
                for field in CORPUS_FIELDS
            }
        return corpus_data

    # JSON format
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def main(argv=None):
    """Command-line entry point.

    Loads the complete corpus, removes every variable listed in the COOS
    file and/or the results files, writes the remaining ("bulk") variables
    to the output file as JSON, and prints a summary with a cost estimate.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default), argparse reads ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Create bulk variables by excluding processed ones')
    parser.add_argument('--complete-corpus', required=True, help='Complete variable corpus file')
    parser.add_argument('--exclude-coos', help='COOS variables file')
    parser.add_argument('--exclude-results', action='append',
                        help='Results files to exclude (can use multiple times)')
    parser.add_argument('--output', required=True, help='Output file for bulk variables')

    args = parser.parse_args(argv)

    print("🔍 Creating bulk variables list...")

    # Load complete corpus
    print(f"📊 Loading complete corpus from {args.complete_corpus}")
    try:
        corpus_data = load_complete_corpus(args.complete_corpus)
    except (OSError, ValueError) as exc:
        parser.error(f"could not load complete corpus {args.complete_corpus}: {exc}")
    if not isinstance(corpus_data, dict):
        parser.error(
            f"complete corpus {args.complete_corpus} must map variable IDs to "
            f"metadata, got {type(corpus_data).__name__}"
        )

    all_variables = set(corpus_data.keys())
    print(f" Found {len(all_variables)} total variables")

    # Track exclusions
    excluded = set()

    # Exclude COOS variables
    if args.exclude_coos:
        print(f"🎯 Excluding COOS variables from {args.exclude_coos}")
        coos_vars = load_variable_ids(args.exclude_coos)
        excluded.update(coos_vars)
        print(f" Excluded {len(coos_vars)} COOS variables")

    # Exclude result files
    for results_file in args.exclude_results or []:
        print(f"✅ Excluding processed variables from {results_file}")
        processed_vars = load_variable_ids(results_file)
        excluded.update(processed_vars)
        print(f" Excluded {len(processed_vars)} processed variables")

    # Calculate bulk variables. Filtering the corpus dict (rather than
    # iterating a set difference) keeps the corpus order, so the output
    # file is identical from run to run.
    bulk_data = {
        var_id: info
        for var_id, info in corpus_data.items()
        if var_id not in excluded
    }

    print("\n📈 Summary:")
    print(f" Total variables: {len(all_variables)}")
    print(f" Excluded variables: {len(excluded)}")
    print(f" Bulk variables: {len(bulk_data)}")

    print("💾 Creating bulk variables file...")

    # Save bulk variables
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(bulk_data, f, indent=2)

    print(f"✅ Saved {len(bulk_data)} bulk variables to {args.output}")

    # Estimate cost
    estimated_cost = len(bulk_data) * COST_PER_VARIABLE_USD
    print(f"💰 Estimated cost: ${estimated_cost:.2f}")


if __name__ == "__main__":
    main()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.