xai-toolkit

xai-mcp
src
xai_toolkit

schemas.py•8.54 KiB

"""Pydantic schemas — single source of truth for tool contracts (ADR-004).

These models define the shape of data flowing between layers:
  explainers.py → narrators.py → server.py

No MCP imports here. These are pure data contracts.
"""

from datetime import datetime, timezone

from pydantic import BaseModel, Field


# --- Explainer outputs (what computation produces) ---


class ShapResult(BaseModel):
    """Output from SHAP computation for a single sample."""

    prediction: int = Field(description="Predicted class (0 or 1)")
    prediction_label: str = Field(description="Human-readable class name")
    probability: float = Field(description="Predicted probability for the positive class")
    base_value: float = Field(description="SHAP base value (average model output)")
    shap_values: dict[str, float] = Field(
        description="Feature name → SHAP value mapping"
    )
    feature_values: dict[str, float] = Field(
        description="Feature name → actual value for this sample"
    )
    feature_names: list[str] = Field(description="All feature names in order")


class FeatureImportance(BaseModel):
    """A single feature's global importance."""

    name: str
    importance: float = Field(description="Mean absolute SHAP value")
    direction: str = Field(description="'positive' or 'negative' average effect")
    mean_shap: float = Field(description="Mean SHAP value (signed)")


class ModelSummary(BaseModel):
    """Summary statistics about a model."""

    model_type: str
    accuracy: float
    n_features: int
    n_train_samples: int
    n_test_samples: int
    target_names: list[str]
    top_features: list[FeatureImportance]


class PartialDependenceResult(BaseModel):
    """Output from partial dependence + ICE computation."""

    feature_name: str
    feature_values: list[float] = Field(description="Grid of feature values")
    predictions: list[float] = Field(description="Mean prediction at each grid point (PDP)")
    ice_curves: list[list[float]] = Field(
        default_factory=list,
        description="Per-sample prediction curves (ICE). Each inner list is one sample's predictions across the grid.",
    )
    feature_min: float
    feature_max: float
    prediction_min: float
    prediction_max: float


class DatasetDescription(BaseModel):
    """Summary statistics about a dataset."""

    n_samples: int
    n_features: int
    feature_names: list[str]
    class_distribution: dict[str, int] = Field(
        description="Class label → count mapping"
    )
    missing_values: int = Field(description="Total missing values across all features")
    feature_stats: dict[str, dict[str, float]] = Field(
        description="Feature name → {mean, std, min, max}"
    )


# --- Tool response (what MCP tools return) — ADR-005 ---


class ToolMetadata(BaseModel):
    """Audit metadata attached to every tool response.

    Every field here answers a question an auditor might ask:
      - model_id          → which model made this prediction?
      - model_type         → what kind of model is it?
      - timestamp          → when was this explanation generated?
      - tool_version       → which version of the toolkit produced this?
      - sample_index       → which sample was explained?
      - dataset_size       → how large was the dataset?
      - data_hash          → was this the exact same data as before?
      - source             → was this computed on-the-fly or from pipeline?
      - detected_type      → how did the pipeline detect the model type?
      - explainer_type     → which SHAP explainer was used?
      - n_rows_explained   → how many samples were explained?

    The pipeline-related fields (detected_type, explainer_type,
    n_rows_explained) are populated when reading from pre-computed Kedro
    pipeline artifacts. They are None for on-the-fly computation.
    This schema is designed to be compatible with the metadata format
    produced by the Kedro explainability pipeline developed by Tamas (xai-xgboost-clf).
    """

    model_id: str
    model_type: str
    timestamp: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    tool_version: str = "0.1.0"
    sample_index: int | None = None
    dataset_size: int | None = None
    data_hash: str | None = Field(
        default=None,
        description=(
            "SHA256 hex digest of the input data used to generate this explanation. "
            "Enables audit trail: same hash guarantees same underlying data. "
            "64 hex characters."
        ),
    )
    source: str = Field(
        default="on_the_fly",
        description=(
            "How this explanation was generated. "
            "'on_the_fly': computed in real-time from model + data (PoC). "
            "'pipeline': read from pre-computed Kedro pipeline artifacts (production)."
        ),
    )
    detected_type: str | None = Field(
        default=None,
        description=(
            "Model type as detected by the pipeline's _detect_model_type(). "
            "E.g., 'xgboost', 'lightgbm', 'tree', 'keras'. "
            "Populated when source='pipeline'; None for on-the-fly."
        ),
    )
    explainer_type: str | None = Field(
        default=None,
        description=(
            "SHAP explainer used by the pipeline. "
            "E.g., 'tree', 'kernel', 'deep', 'auto'. "
            "Populated when source='pipeline'; None for on-the-fly."
        ),
    )
    n_rows_explained: int | None = Field(
        default=None,
        description=(
            "Number of samples the pipeline computed SHAP for. "
            "Populated when source='pipeline'; None for on-the-fly."
        ),
    )


class ToolResponse(BaseModel):
    """Generic response for all tools (ADR-005).

    Every tool returns this same shape: narrative + evidence + metadata.
    This consistency means the LLM always knows what to expect.

    The `grounded` flag is the epistemic label for the consuming LLM:
      - grounded=True  → this answer was computed deterministically from a
                         registered model. It is reproducible and audit-ready.
      - grounded=False → reserved for future use (e.g. a conversational
                         fallback tool). Currently all tool responses are
                         grounded=True by definition: if a tool was called,
                         computation happened.

    The LLM is instructed (via server instructions and copilot-instructions.md)
    to prepend a disclaimer on any response it generates WITHOUT calling a tool.
    That disclaimer is triggered by the *absence* of grounded=True in the
    response — because ungrounded answers don't go through this schema at all.
    """

    narrative: str = Field(description="Plain English interpretation")
    evidence: dict = Field(description="Structured data backing the narrative")
    metadata: ToolMetadata
    plot_base64: str | None = Field(
        default=None, description="Optional base64-encoded PNG"
    )
    grounded: bool = Field(
        default=True,
        description=(
            "True when this response was computed deterministically from a "
            "registered model. Always True for tool responses. Signals to "
            "the LLM that the narrative is audit-ready and should be "
            "presented as a verified result."
        ),
    )


class ErrorResponse(BaseModel):
    """Structured error returned when a tool call fails (D3-S3, S4, S5).

    Follows the same structural pattern as ToolResponse so the LLM
    always receives a predictable shape regardless of success or failure.
    The 'available' field tells the user what valid options exist,
    and 'suggestion' provides a closest-match hint for typos.

    Error codes are SCREAMING_SNAKE_CASE for machine-readability.
    Messages are plain English for human-readability.
    """

    error_code: str = Field(
        description=(
            "Machine-readable error type. One of: "
            "MODEL_NOT_FOUND | SAMPLE_OUT_OF_RANGE | FEATURE_NOT_FOUND | UNKNOWN_ERROR"
        )
    )
    message: str = Field(description="Human-readable explanation of what went wrong")
    available: list[str] = Field(
        default_factory=list,
        description=(
            "Valid options the user can try instead. "
            "For MODEL_NOT_FOUND: list of registered model IDs. "
            "For SAMPLE_OUT_OF_RANGE: valid index range as a string. "
            "For FEATURE_NOT_FOUND: list of all feature names."
        ),
    )
    suggestion: str | None = Field(
        default=None,
        description="Closest-match suggestion (populated for typo-style errors like FEATURE_NOT_FOUND)",
    )

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/florenciakabas/xai-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

schemas.py•8.54 KiB

"""Pydantic schemas — single source of truth for tool contracts (ADR-004).

These models define the shape of data flowing between layers:
  explainers.py → narrators.py → server.py

No MCP imports here. These are pure data contracts.
"""

from datetime import datetime, timezone

from pydantic import BaseModel, Field


# --- Explainer outputs (what computation produces) ---


class ShapResult(BaseModel):
    """Output from SHAP computation for a single sample."""

    prediction: int = Field(description="Predicted class (0 or 1)")
    prediction_label: str = Field(description="Human-readable class name")
    probability: float = Field(description="Predicted probability for the positive class")
    base_value: float = Field(description="SHAP base value (average model output)")
    shap_values: dict[str, float] = Field(
        description="Feature name → SHAP value mapping"
    )
    feature_values: dict[str, float] = Field(
        description="Feature name → actual value for this sample"
    )
    feature_names: list[str] = Field(description="All feature names in order")


class FeatureImportance(BaseModel):
    """A single feature's global importance."""

    name: str
    importance: float = Field(description="Mean absolute SHAP value")
    direction: str = Field(description="'positive' or 'negative' average effect")
    mean_shap: float = Field(description="Mean SHAP value (signed)")


class ModelSummary(BaseModel):
    """Summary statistics about a model."""

    model_type: str
    accuracy: float
    n_features: int
    n_train_samples: int
    n_test_samples: int
    target_names: list[str]
    top_features: list[FeatureImportance]


class PartialDependenceResult(BaseModel):
    """Output from partial dependence + ICE computation."""

    feature_name: str
    feature_values: list[float] = Field(description="Grid of feature values")
    predictions: list[float] = Field(description="Mean prediction at each grid point (PDP)")
    ice_curves: list[list[float]] = Field(
        default_factory=list,
        description="Per-sample prediction curves (ICE). Each inner list is one sample's predictions across the grid.",
    )
    feature_min: float
    feature_max: float
    prediction_min: float
    prediction_max: float


class DatasetDescription(BaseModel):
    """Summary statistics about a dataset."""

    n_samples: int
    n_features: int
    feature_names: list[str]
    class_distribution: dict[str, int] = Field(
        description="Class label → count mapping"
    )
    missing_values: int = Field(description="Total missing values across all features")
    feature_stats: dict[str, dict[str, float]] = Field(
        description="Feature name → {mean, std, min, max}"
    )


# --- Tool response (what MCP tools return) — ADR-005 ---


class ToolMetadata(BaseModel):
    """Audit metadata attached to every tool response.

    Every field here answers a question an auditor might ask:
      - model_id          → which model made this prediction?
      - model_type         → what kind of model is it?
      - timestamp          → when was this explanation generated?
      - tool_version       → which version of the toolkit produced this?
      - sample_index       → which sample was explained?
      - dataset_size       → how large was the dataset?
      - data_hash          → was this the exact same data as before?
      - source             → was this computed on-the-fly or from pipeline?
      - detected_type      → how did the pipeline detect the model type?
      - explainer_type     → which SHAP explainer was used?
      - n_rows_explained   → how many samples were explained?

    The pipeline-related fields (detected_type, explainer_type,
    n_rows_explained) are populated when reading from pre-computed Kedro
    pipeline artifacts. They are None for on-the-fly computation.
    This schema is designed to be compatible with the metadata format
    produced by the Kedro explainability pipeline developed by Tamas (xai-xgboost-clf).
    """

    model_id: str
    model_type: str
    timestamp: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    tool_version: str = "0.1.0"
    sample_index: int | None = None
    dataset_size: int | None = None
    data_hash: str | None = Field(
        default=None,
        description=(
            "SHA256 hex digest of the input data used to generate this explanation. "
            "Enables audit trail: same hash guarantees same underlying data. "
            "64 hex characters."
        ),
    )
    source: str = Field(
        default="on_the_fly",
        description=(
            "How this explanation was generated. "
            "'on_the_fly': computed in real-time from model + data (PoC). "
            "'pipeline': read from pre-computed Kedro pipeline artifacts (production)."
        ),
    )
    detected_type: str | None = Field(
        default=None,
        description=(
            "Model type as detected by the pipeline's _detect_model_type(). "
            "E.g., 'xgboost', 'lightgbm', 'tree', 'keras'. "
            "Populated when source='pipeline'; None for on-the-fly."
        ),
    )
    explainer_type: str | None = Field(
        default=None,
        description=(
            "SHAP explainer used by the pipeline. "
            "E.g., 'tree', 'kernel', 'deep', 'auto'. "
            "Populated when source='pipeline'; None for on-the-fly."
        ),
    )
    n_rows_explained: int | None = Field(
        default=None,
        description=(
            "Number of samples the pipeline computed SHAP for. "
            "Populated when source='pipeline'; None for on-the-fly."
        ),
    )


class ToolResponse(BaseModel):
    """Generic response for all tools (ADR-005).

    Every tool returns this same shape: narrative + evidence + metadata.
    This consistency means the LLM always knows what to expect.

    The `grounded` flag is the epistemic label for the consuming LLM:
      - grounded=True  → this answer was computed deterministically from a
                         registered model. It is reproducible and audit-ready.
      - grounded=False → reserved for future use (e.g. a conversational
                         fallback tool). Currently all tool responses are
                         grounded=True by definition: if a tool was called,
                         computation happened.

    The LLM is instructed (via server instructions and copilot-instructions.md)
    to prepend a disclaimer on any response it generates WITHOUT calling a tool.
    That disclaimer is triggered by the *absence* of grounded=True in the
    response — because ungrounded answers don't go through this schema at all.
    """

    narrative: str = Field(description="Plain English interpretation")
    evidence: dict = Field(description="Structured data backing the narrative")
    metadata: ToolMetadata
    plot_base64: str | None = Field(
        default=None, description="Optional base64-encoded PNG"
    )
    grounded: bool = Field(
        default=True,
        description=(
            "True when this response was computed deterministically from a "
            "registered model. Always True for tool responses. Signals to "
            "the LLM that the narrative is audit-ready and should be "
            "presented as a verified result."
        ),
    )


class ErrorResponse(BaseModel):
    """Structured error returned when a tool call fails (D3-S3, S4, S5).

    Follows the same structural pattern as ToolResponse so the LLM
    always receives a predictable shape regardless of success or failure.
    The 'available' field tells the user what valid options exist,
    and 'suggestion' provides a closest-match hint for typos.

    Error codes are SCREAMING_SNAKE_CASE for machine-readability.
    Messages are plain English for human-readability.
    """

    error_code: str = Field(
        description=(
            "Machine-readable error type. One of: "
            "MODEL_NOT_FOUND | SAMPLE_OUT_OF_RANGE | FEATURE_NOT_FOUND | UNKNOWN_ERROR"
        )
    )
    message: str = Field(description="Human-readable explanation of what went wrong")
    available: list[str] = Field(
        default_factory=list,
        description=(
            "Valid options the user can try instead. "
            "For MODEL_NOT_FOUND: list of registered model IDs. "
            "For SAMPLE_OUT_OF_RANGE: valid index range as a string. "
            "For FEATURE_NOT_FOUND: list of all feature names."
        ),
    )
    suggestion: str | None = Field(
        default=None,
        description="Closest-match suggestion (populated for typo-style errors like FEATURE_NOT_FOUND)",
    )