"""Pydantic models for CQS evaluation data structures."""
from pydantic import BaseModel
from typing import Literal, Optional
from datetime import datetime
class ToolCall(BaseModel):
    """Record of a single tool call during agent loop.

    Captures the outbound request (tool name + arguments), the parsed
    response payload, and the round-trip latency of the call.
    """
    tool_name: str       # name of the tool that was invoked
    arguments: dict      # arguments sent with the tool call
    result: dict         # parsed JSON from tool response
    latency_ms: float    # wall-clock round-trip time for this call, in milliseconds
class ResponseRecord(BaseModel):
    """Complete record of one response (control, pragmatics, or rag).

    One instance is produced per (query, condition) run and carries the
    full prompt/response text, tool-use telemetry, token/latency accounting,
    and — for the 'rag' condition only — retrieval metadata.
    """
    query_id: str                                     # identifier linking this response to its source query
    condition: Literal["control", "pragmatics", "rag"]  # which experimental arm produced this response
    model: str                                        # model identifier used to generate the response
    system_prompt: str                                # full system prompt sent to the model
    response_text: str                                # final assistant text returned to the user
    # NOTE: Pydantic deep-copies mutable defaults per instance, so bare [] is safe here.
    tool_calls: list[ToolCall] = []                   # every tool invocation made during the agent loop
    pragmatics_returned: list[str] = []               # context_ids extracted from tool results
    total_latency_ms: float                           # end-to-end latency including all tool rounds
    input_tokens: int                                 # prompt-side token count
    output_tokens: int                                # completion-side token count
    timestamp: datetime                               # when this response was recorded
    tools_offered: bool = False                       # True when tools were passed to the API
    tool_rounds_used: int = 0                         # Number of agent loop iterations used
    tool_rounds_exhausted: bool = False               # True if forced synthesis was needed
    # RAG-specific metadata (only populated when condition='rag')
    retrieved_chunks: Optional[list[dict]] = None     # Chunk metadata from retriever
    retrieval_context_chars: Optional[int] = None     # Total chars injected as context
class QueryPair(BaseModel):
    """Paired control + treatment for one query. (V1 — retained for backward compat)

    Superseded by ComparisonPair for V2 pairwise comparisons; kept so that
    previously serialized V1 data still loads.
    """
    query_id: str               # identifier shared by both responses
    query_text: str             # the query shown to both conditions
    category: str               # query category label
    difficulty: str             # query difficulty label
    control: ResponseRecord     # the control-condition response
    treatment: ResponseRecord   # the treatment-condition response
class ComparisonPair(BaseModel):
    """Paired responses for V2 pairwise comparison.

    condition_a and condition_b are named by the comparison
    (e.g., for rag_vs_pragmatics: condition_a=rag, condition_b=pragmatics).
    The *_name fields record which condition filled each slot, since the
    slot assignment varies by comparison.
    """
    query_id: str                 # identifier shared by both responses
    query_text: str               # the query shown to both conditions
    category: str                 # query category label
    difficulty: str               # query difficulty label
    condition_a: ResponseRecord   # response occupying slot A
    condition_b: ResponseRecord   # response occupying slot B
    condition_a_name: str         # "control", "rag", or "pragmatics"
    condition_b_name: str         # "control", "rag", or "pragmatics"
class DimensionScore(BaseModel):
    """Score for a single CQS dimension.

    Holds the judge's numeric score, its self-reported confidence, and a
    free-text justification.
    """
    score: int       # 0, 1, or 2
    confidence: int  # 1-5
    reasoning: str   # judge's free-text justification for the score
class JudgeRecord(BaseModel):
    """Complete record of one judge evaluation.

    Captures the per-dimension scores for both presented responses, the
    overall preference, presentation/blinding metadata (the judge sees
    responses as "A"/"B"; the *_label fields record the de-blinded
    condition names), plus latency/token accounting and the raw judge
    output for debugging failed parses.
    """
    query_id: str                               # query this evaluation covers
    judge_model: str                            # model identifier of the judge
    judge_vendor: str                           # vendor/provider of the judge model
    presentation_order: str                     # "condition_a_first" or "condition_b_first"
    scores_response_a: dict[str, DimensionScore]  # D1-D6 -> DimensionScore
    scores_response_b: dict[str, DimensionScore]  # D1-D6 -> DimensionScore
    preference: str                             # "A" / "B" / "tie"
    preference_reasoning: str                   # judge's free-text rationale for the preference
    response_a_label: str                       # actual condition name: "control", "rag", or "pragmatics"
    response_b_label: str                       # actual condition name: "control", "rag", or "pragmatics"
    comparison: str                             # e.g., "rag_vs_pragmatics", "control_vs_pragmatics", "control_vs_rag"
    latency_ms: float                           # wall-clock latency of the judge call, in milliseconds
    input_tokens: int                           # prompt-side token count for the judge call
    output_tokens: int                          # completion-side token count for the judge call
    timestamp: datetime                         # when this evaluation was recorded
    run_id: str                                 # identifier of the evaluation run this belongs to
    raw_response: str                           # Full judge response for debugging
    parse_success: bool                         # Whether JSON parsing succeeded
    pass_number: int = 1                        # 1-6, which pass this measurement came from