# Copyright Reexpress AI, Inc. All rights reserved.
# LLM API calls and transformations for MCP server
import torch
import numpy as np
from pydantic import BaseModel
import time
import os
import constants
from transformers import AutoModelForCausalLM, AutoTokenizer
from google import genai
from google.genai import types
model_path = "ibm-granite/granite-3.3-8b-instruct"
try:
    device = str(os.getenv("MCP_SERVER_AGREEMENT_MODEL_DEVICE",
                           default=constants.MCP_SERVER_AGREEMENT_MODEL_DEVICE__DEFAULT))
    MCP_SERVER_AGREEMENT_MODEL_MAX_CHARACTER_LENGTH = int(
        os.getenv("MCP_SERVER_AGREEMENT_MODEL_MAX_CHARACTER_LENGTH",
                  default=constants.MCP_SERVER_AGREEMENT_MODEL_MAX_CHARACTER_LENGTH__DEFAULT))
except Exception:
    # Fall back to the package defaults if either environment variable is missing or malformed.
    device = constants.MCP_SERVER_AGREEMENT_MODEL_DEVICE__DEFAULT
    MCP_SERVER_AGREEMENT_MODEL_MAX_CHARACTER_LENGTH = \
        constants.MCP_SERVER_AGREEMENT_MODEL_MAX_CHARACTER_LENGTH__DEFAULT
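# Load the on-device agreement model and its tokenizer.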
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path
)
# env variables
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
google_client = genai.Client(api_key=GEMINI_API_KEY)  # can alternatively replace with a Vertex AI deployment
GEMINI_MODEL = "gemini-3-pro-preview"
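# Select the GPT-5 provider: an Azure OpenAI deployment (the default) or the standard OpenAI API.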
USE_AZURE_01 = int(os.getenv("USE_AZURE_01", "1"))
if USE_AZURE_01 == 1:
    from openai import AzureOpenAI
    kAPI_VERSION = "2024-12-01-preview"
    client = AzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=kAPI_VERSION,
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
    )
    GPT5_MODEL = os.getenv("GPT_5_2_MODEL_2025_12_11_AZURE_DEPLOYMENT")
else:
    from openai import OpenAI
    client = OpenAI()
    GPT5_MODEL = "gpt-5.2-2025-12-11"
class ResponseVerificationWithConfidenceAndExplanationAndSummary(BaseModel):
    short_summary_of_original_question_and_response: str
    verification_classification: bool
    confidence_in_classification: float
    short_explanation_for_classification_confidence: str
def get_document_attributes_from_gpt5(previous_query_and_response_to_verify_string: str) -> \
        dict[str, str | float | bool]:
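    """Ask GPT-5 to verify the previous query/response pair, returning a structured verification dict.

    The returned dict contains the summary, classification, confidence, explanation, and an error flag
    (constants.LLM_API_ERROR_KEY); on any API failure a low-confidence default is returned instead.
    """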
    # Add a small random delay to avoid synchronized bursts across concurrent API calls.
    time.sleep(torch.abs(torch.randn(1)).item() / constants.SLEEP_CONSTANT)
    try:
        max_tokens = 100000
        messages_structure = [
            {"role": "developer", "content": f"{constants.GPT_5_SYSTEM_MESSAGE.strip()}"},
            {"role": "user",
             "content": f"{previous_query_and_response_to_verify_string}"}
        ]
        completion = client.beta.chat.completions.parse(
            model=GPT5_MODEL,
            messages=messages_structure,
            response_format=ResponseVerificationWithConfidenceAndExplanationAndSummary,
            max_completion_tokens=max_tokens,
            reasoning_effort="high",
            user="sdm_llm_reasoning_branching_v1",
            seed=0
        )
        verification_object = completion.choices[0].message.parsed
        verification_dict = {constants.SHORT_SUMMARY_KEY: verification_object.short_summary_of_original_question_and_response,
                             constants.VERIFICATION_CLASSIFICATION_KEY: verification_object.verification_classification,
                             constants.CONFIDENCE_IN_CLASSIFICATION_KEY: verification_object.confidence_in_classification,
                             constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE_KEY: verification_object.short_explanation_for_classification_confidence,
                             constants.LLM_API_ERROR_KEY: False}
    except Exception:
        verification_dict = {constants.SHORT_SUMMARY_KEY: "",
                             constants.VERIFICATION_CLASSIFICATION_KEY: False,
                             constants.CONFIDENCE_IN_CLASSIFICATION_KEY: 0.01,
                             constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE_KEY: constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE__DEFAULT_ERROR,
                             constants.LLM_API_ERROR_KEY: True}
    return verification_dict
def get_document_attributes_from_gemini_reasoning(previous_query_and_response_to_verify_string: str) -> \
        dict[str, str | float | bool]:
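    """Ask Gemini (high thinking level, with search, code-execution, and URL-context tools) to verify
    the previous query/response pair, returning a structured verification dict; on any API failure a
    low-confidence default with the error flag set is returned instead.
    """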
    # Add a small random delay to avoid synchronized bursts across concurrent API calls.
    time.sleep(torch.abs(torch.randn(1)).item() / constants.SLEEP_CONSTANT)
    try:
        max_tokens = 65535
        response = google_client.models.generate_content(
            model=GEMINI_MODEL,
            contents=previous_query_and_response_to_verify_string,
            config=types.GenerateContentConfig(
                system_instruction=constants.SYSTEM_MESSAGE_WITH_EXPLANATION.strip(),
                thinking_config=types.ThinkingConfig(
                    thinking_level=types.ThinkingLevel.HIGH,
                    include_thoughts=False
                ),
                tools=[
                    types.Tool(
                        google_search=types.GoogleSearch(),
                        code_execution=types.ToolCodeExecution(),
                        url_context=types.UrlContext(),
                    )
                ],
                response_mime_type="application/json",
                response_schema={
                    "type": "OBJECT",
                    "properties": {
                        "verification_classification": {"type": "BOOLEAN"},
                        "confidence_in_classification": {"type": "NUMBER"},
                        "short_explanation_for_classification_confidence": {"type": "STRING"}
                    },
                    "required": ["verification_classification", "confidence_in_classification",
                                 "short_explanation_for_classification_confidence"]
                },
                response_modalities=["TEXT"],
                temperature=1.0,
                max_output_tokens=max_tokens,
                seed=0)
        )
        verification_object = response.parsed
        verification_dict = {constants.VERIFICATION_CLASSIFICATION_KEY:
                                 bool(verification_object[constants.VERIFICATION_CLASSIFICATION_KEY]),
                             constants.CONFIDENCE_IN_CLASSIFICATION_KEY:
                                 float(verification_object[constants.CONFIDENCE_IN_CLASSIFICATION_KEY]),
                             constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE_KEY:
                                 str(verification_object[constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE_KEY]),
                             constants.LLM_API_ERROR_KEY: False}
    except Exception:
        verification_dict = {constants.VERIFICATION_CLASSIFICATION_KEY: False,
                             constants.CONFIDENCE_IN_CLASSIFICATION_KEY: 0.01,
                             constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE_KEY: constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE__DEFAULT_ERROR,
                             constants.LLM_API_ERROR_KEY: True}
    return verification_dict
def get_agreement_model_embedding(document_text: str):
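    """Run the on-device agreement model on the formatted prompt and return
    (embedding, agreement_classification), where agreement_classification is True when the model
    assigns higher probability to "Yes" than to "No" for the next token.
    """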
conv = [{"role": "user",
"content": document_text}]
input_ids = tokenizer.apply_chat_template(conv, return_tensors="pt", thinking=False,
return_dict=True, add_generation_prompt=True).to(device)
outputs = model.generate(
**input_ids,
max_new_tokens=1,
output_hidden_states=True,
return_dict_in_generate=True,
output_scores=True,
)
hidden_states = outputs.hidden_states
scores = outputs.scores
no_id = tokenizer.vocab["No"]
yes_id = tokenizer.vocab["Yes"]
probs = torch.softmax(scores[0], dim=-1)
# average of all (across tokens) final hidden states :: final token hidden state (here this corresponds to the hidden state of the linear layer that determines the No/Yes classification) :: no_prob :: yes_prob
embedding = torch.cat([
torch.mean(hidden_states[0][-1][0], dim=0).unsqueeze(0),
hidden_states[0][-1][0][-1, :].unsqueeze(0),
probs[0:1, no_id].unsqueeze(0),
probs[0:1, yes_id].unsqueeze(0)
], dim=-1)
embedding = [float(x) for x in embedding[0].cpu().numpy().tolist()]
assert len(embedding) == constants.EXPECTED_EMBEDDING_SIZE
agreement_classification = probs[0:1, no_id] < probs[0:1, yes_id]
return embedding, agreement_classification.item()
def get_model_explanations_formatted_as_binary_agreement_prompt(gpt5_model_summary,
                                                                gpt5_model_explanation,
                                                                gemini_model_explanation) -> str:
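    """Format the two models' explanations (and the optional topic summary) as the binary Yes/No
    agreement prompt consumed by the on-device agreement model.
    """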
    if gpt5_model_summary != "":
        topic_string = f"<topic> {gpt5_model_summary} </topic> "
    else:
        topic_string = ""
    formatted_output_string = f"{topic_string}Do the following model explanations agree that the response is correct? <model1_explanation> {gpt5_model_explanation} </model1_explanation> <model2_explanation> {gemini_model_explanation} </model2_explanation> Yes or No?"
    return formatted_output_string
def llm_api_controller(gpt5_model_summary: str, gpt5_model_explanation: str,
                       gemini_model_explanation: str):
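    """Truncate the inputs to the configured character budget, build the agreement prompt, and return
    (agreement_model_embedding, agreement_model_classification); returns (None, None) if the
    on-device model fails.
    """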
    try:
        # Hard truncate by max allowed character count, with strict priority:
        # gpt5_model_explanation first, then gemini_model_explanation, then summary.
        # This is intended to put a hard constraint on memory use of the on-device model.
        # Adjust as applicable via the corresponding environment variable.
        remaining_max_length_counter = MCP_SERVER_AGREEMENT_MODEL_MAX_CHARACTER_LENGTH
        gpt5_model_explanation_filtered = gpt5_model_explanation[0:max(0, remaining_max_length_counter)]
        remaining_max_length_counter -= len(gpt5_model_explanation_filtered)
        gemini_model_explanation_filtered = gemini_model_explanation[0:max(0, remaining_max_length_counter)]
        remaining_max_length_counter -= len(gemini_model_explanation_filtered)
        gpt5_model_summary_filtered = gpt5_model_summary[0:max(0, remaining_max_length_counter)]
        prompt = get_model_explanations_formatted_as_binary_agreement_prompt(gpt5_model_summary_filtered,
                                                                             gpt5_model_explanation_filtered,
                                                                             gemini_model_explanation_filtered)
        agreement_model_embedding, agreement_model_classification = \
            get_agreement_model_embedding(document_text=prompt)
        return agreement_model_embedding, agreement_model_classification
    except Exception:
        return None, None
def get_model_explanations(gpt5_model_verification_dict,
                           gemini_model_verification_dict):
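    """Unpack the two verification dicts into (gpt5 summary, gpt5 classification, gpt5 explanation,
    gemini classification, gemini explanation), with string fields stripped of surrounding whitespace.
    """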
    return gpt5_model_verification_dict[constants.SHORT_SUMMARY_KEY].strip(), \
        gpt5_model_verification_dict[constants.VERIFICATION_CLASSIFICATION_KEY], \
        gpt5_model_verification_dict[constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE_KEY].strip(), \
        gemini_model_verification_dict[constants.VERIFICATION_CLASSIFICATION_KEY], \
        gemini_model_verification_dict[constants.SHORT_EXPLANATION_FOR_CLASSIFICATION_CONFIDENCE_KEY].strip()