#!/usr/bin/env python3
"""Update manual scoring packet from v2 to v3.
Changes:
- Remove D6 from all scoring tables (change total from /12 to /10)
- Replace response content with v3 data
- Update header and add Pipeline Fidelity note
"""
import json
import re
from pathlib import Path
# Load v3 responses, keyed by query_id for direct lookup.
# encoding is pinned to UTF-8: response text may contain non-ASCII
# characters (em dashes, check marks), and the platform default
# (e.g. cp1252 on Windows) would raise UnicodeDecodeError.
queries = {}
with open('results/cqs_responses_20260213_091530.jsonl', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines in the JSONL (json.loads('') raises).
            continue
        pair = json.loads(line)
        queries[pair['query_id']] = pair
# Answer key (blinding map): which lettered response (A/B) is the
# treatment vs the control arm for each query. Raters never see this.
answer_key = {
    'NORM-001': {'A': 'treatment', 'B': 'control'},
    'NORM-008': {'A': 'treatment', 'B': 'control'},
    'NORM-015': {'A': 'control', 'B': 'treatment'},
    'GEO-006': {'A': 'treatment', 'B': 'control'},
    'SML-001': {'A': 'treatment', 'B': 'control'},
    'TMP-002': {'A': 'treatment', 'B': 'control'},
    'MIS-002': {'A': 'treatment', 'B': 'control'},
    'AMB-002': {'A': 'treatment', 'B': 'control'},
    'PER-001a': {'A': 'control', 'B': 'treatment'},
}
# Target queries, processed in answer-key insertion order. Derived from
# the dict (insertion order is guaranteed in Python 3.7+) instead of
# being a second hand-maintained list that could drift out of sync.
target_queries = list(answer_key)
# Load the v2 packet; all edits below are string/regex transforms on this
# one buffer, which is written back at the end. UTF-8 is pinned so the
# markdown (em dashes etc.) reads correctly on any platform default.
with open('docs/test/cqs_manual_scoring_packet.md', encoding='utf-8') as f:
    content = f.read()
# Refresh the packet header (date, battery filename) and the rater
# instructions ("six" -> "five" dimensions). Applied as an ordered list
# of (pattern, replacement) pairs over the packet buffer.
_header_edits = [
    (r'\*\*Date:\*\* \d{4}-\d{2}-\d{2}',
     '**Date:** 2026-02-13'),
    (r'\*\*Battery:\*\* cqs_responses_\d+_\d+\.jsonl',
     '**Battery:** cqs_responses_20260213_091530.jsonl'),
    (r'Score each response independently on the six CQS dimensions',
     'Score each response independently on the five CQS dimensions'),
]
for _pattern, _replacement in _header_edits:
    content = re.sub(_pattern, _replacement, content)
# Update The Six Dimensions table - remove D6 row and add note
# Replacement markdown: the five-dimension (D1-D5) table followed by a
# blockquote explaining that groundedness (formerly D6) is now measured
# by the automated Pipeline Fidelity check. Inserted into the packet
# verbatim (it contains no backslashes, so re.sub replacement-escape
# processing is a no-op on it).
dimensions_table = '''| Dim | Name | What to Look For |
|---|---|---|
| D1 | Source Selection & Fitness | Right product, vintage, geography for the question? |
| D2 | Methodological Soundness | Correct computation, weights, denominators, formulas? |
| D3 | Uncertainty Communication | MOE/SE provided? Reliability assessed? Proper confidence level? |
| D4 | Definitional Accuracy | Official Census concepts used correctly? Period vs point-in-time? |
| D5 | Reproducibility & Traceability | Table IDs, variable codes, FIPS codes — can you replicate? |
> **Note:** Groundedness/faithfulness (formerly D6) is measured separately via
> automated Pipeline Fidelity verification, not by human raters. This metric
> compares response claims against API tool call logs and is reported independently.'''
# Match from the table header row through the end of the D6 row; DOTALL
# lets ".*?" span the intervening rows. NOTE(review): this assumes D6 is
# the table's last row in the v2 packet — any rows after D6 would survive
# outside the replacement. TODO confirm against the v2 layout.
old_dimensions_pattern = r'\| Dim \| Name \| What to Look For \|.*?\| D6 \|[^\n]+\|[^\n]+\|'
content = re.sub(old_dimensions_pattern, dimensions_table, content, flags=re.DOTALL)
# Delete the "D6 = 0 is a gate failure" scoring principle (its whole
# numbered line, including the trailing newline) from the principles list.
content = re.sub(
    r'\d+\.\s+\*\*D6 = 0 is a gate failure\.\*\*[^\n]+\n',
    '',
    content,
)
# With that principle gone, the remaining principles shift up: whatever
# numbers they carried before, they become items 2 and 3.
_renumbering = [
    (r'\d+\.\s+\*\*Score each response independently', '2. **Score each response independently'),
    (r'\d+\.\s+After scoring both', '3. After scoring both'),
]
for _old, _new in _renumbering:
    content = re.sub(_old, _new, content)
# For each query: splice in the v3 responses (blinded per the answer key)
# and convert its scoring tables from the six-dimension (/12) rubric to
# the five-dimension (/10) rubric.
for query_id in target_queries:
    if query_id not in queries:
        print(f"Warning: {query_id} not found in v3 data")
        continue
    pair = queries[query_id]
    treatment_text = pair['treatment']['response_text']
    control_text = pair['control']['response_text']
    # Map treatment/control onto the blinded A/B slots.
    if answer_key[query_id]['A'] == 'treatment':
        response_a, response_b = treatment_text, control_text
    else:
        response_a, response_b = control_text, treatment_text
    # Replace both response bodies inside this query's section. Groups:
    # 1=section head, 2="### Response A" marker, 3=old A body,
    # 4=separator + "### Response B" marker, 5=old B body, 6=scoring head.
    pattern = rf'(## Query \d+ of 9: {re.escape(query_id)}.*?)(### Response A\n\n)(.*?)(\n\n---\n\n### Response B\n\n)(.*?)(\n\n---\n\n### Scoring)'

    def replace_responses(match, a=response_a, b=response_b):
        # a/b are bound as default arguments so the callback does not
        # depend on loop variables at call time. Using a function (not a
        # replacement string) also keeps backslashes in the response
        # text literal instead of being interpreted as escapes.
        return match.group(1) + match.group(2) + a + match.group(4) + b + match.group(6)

    content, n_replaced = re.subn(pattern, replace_responses, content, flags=re.DOTALL)
    if n_replaced == 0:
        # Previously a failed match was silent, shipping stale v2 text
        # while the script still reported success.
        print(f"Warning: response section for {query_id} did not match; packet left unchanged")

    # Rework the scoring tables within this query's section only. The
    # section runs up to the next query header (or end of file).
    query_section_pattern = rf'(## Query \d+ of 9: {re.escape(query_id)}.*?)(## Query \d+ of 9:|$)'

    def update_scoring_tables(match):
        section = match.group(1)
        # Drop the D6 row from both (A and B) scoring tables.
        section = re.sub(r'\| D6: Groundedness \|[^\n]+\n', '', section)
        # With D6 gone, totals shrink from /12 to /10.
        section = re.sub(r'/12', '/10', section)
        # group(2) is the next query's header ('' at end of file);
        # re-append it since the pattern consumed it.
        return section + match.group(2)

    content = re.sub(query_section_pattern, update_scoring_tables, content, flags=re.DOTALL)
# Write the updated packet back in place. UTF-8 is pinned: the response
# text and the packet's own markdown contain non-ASCII characters, and a
# cp1252 default (Windows) would raise UnicodeEncodeError.
with open('docs/test/cqs_manual_scoring_packet.md', 'w', encoding='utf-8') as f:
    f.write(content)
print("✓ Updated scoring packet")
# Regenerate the (rater-hidden) answer key file for the v3 rubric.
answer_key_content = """ANSWER KEY — DO NOT SHARE WITH RATERS
==================================================
RUBRIC VERSION: v3 (5-dimension CQS, D1-D5)
STAGE 1 DATA: cqs_responses_20260213_091530.jsonl
DATE GENERATED: 2026-02-13
NOTES:
- D6 (Groundedness) removed from human scoring. Measured via automated
  Pipeline Fidelity (Stage 3).
- Total score per response: /10 (was /12 in v2)
KNOWN ISSUES:
- PER-001a treatment: Reports ~117K for Bozeman city. Actual city pop ~53K.
  This is Gallatin County data. Known FIPS resolution bug in MCP tool.
Random seed: 42
NORM-001: A=treatment, B=control
NORM-008: A=treatment, B=control
NORM-015: A=control, B=treatment
GEO-006: A=treatment, B=control
SML-001: A=treatment, B=control
TMP-002: A=treatment, B=control
MIS-002: A=treatment, B=control
AMB-002: A=treatment, B=control
PER-001a: A=control, B=treatment
"""
# UTF-8 pinned: the key's banner line contains an em dash, which the
# platform default encoding may not accept.
with open('docs/test/scoring_answer_key.txt', 'w', encoding='utf-8') as f:
    f.write(answer_key_content)
print("✓ Updated answer key")
# Verification: re-read the written packet and spot-check that the D6
# removal and the /12 -> /10 conversion actually landed.
with open('docs/test/cqs_manual_scoring_packet.md', encoding='utf-8') as f:
    updated_content = f.read()
d6_count = updated_content.count('D6')
# Plain string (was an f-string with no placeholders).
print("\n Verification:")
print(f" D6 occurrences: {d6_count} (should be ~1-2 in Pipeline Fidelity note only)")
print(f" /10 occurrences: {updated_content.count('/10')}")
print(f" /12 occurrences: {updated_content.count('/12')} (should be 0)")