Reexpress MCP Server

Official

Overview Schema Related Servers Score Discussions

aux_construct_document_db.py•10.9 KiB

# Copyright Reexpress AI, Inc. All rights reserved.

"""
This constructs the document database for an existing model. The text comes from a HuggingFace datasets dataset.

In this version, not all database instances are from the same version of GPT and Gemini. For GPT, the
training examples are from GPT-5 or GPT-5.2, and for Gemini, the training examples are from Gemini-2.5 or Gemini-3.
"""

import torch

import argparse
import codecs
import json
import time

from datasets import load_from_disk
from datasets import load_dataset

import utils_model
import mcp_utils_sqlite_document_db_controller

# Keys of the per-document flags (read from the train split JSON lines) that record which
# model versions produced output for a document.
GPT5_PRESENT = "gpt5_present"
GEMINI2_5_PRESENT = "gemini2_5_present"
GPT5_2_PRESENT = "gpt5_2_present"
GEMINI3_PRESENT = "gemini3_present"

# Column-name prefixes used for each model in the HF dataset rows.
_GEMINI2_5_COLUMN = "model3"
_GPT5_COLUMN = "model4"
_GPT5_2_COLUMN = "model5"
_GEMINI3_COLUMN = "model6"


def retrieve_model_documents_ids(options):
    """Load the model from ``options.model_dir`` on CPU and return its training document ids as a set.

    Raises:
        AssertionError: if the model's ``train_uuids`` contain duplicates.
    """
    model = utils_model.load_model_torch(options.model_dir, torch.device("cpu"), load_for_inference=True)
    # Build the set once; it is both the uniqueness check and the return value.
    train_uuids_set = set(model.train_uuids)
    assert len(model.train_uuids) == len(train_uuids_set), \
        "Unexpected Error: The training documents ids are not unique."
    print(f"Support set size: {len(model.train_uuids)}")
    return train_uuids_set


def get_agreement_classification(options, train_uuids_set):
    """Read the train split JSON-lines file and collect per-document metadata.

    Every document id in the file must belong to ``train_uuids_set``, must appear only once, and every id
    in ``train_uuids_set`` must appear in the file. ``train_uuids_set`` itself is not modified.

    Args:
        options: parsed arguments; ``options.best_iteration_train_split_file`` is the UTF-8 JSON-lines file.
        train_uuids_set: the set of document ids expected in the file.

    Returns:
        A 3-tuple of dicts keyed by document id:
        (agreement classification as 0/1, label, dict of the four ``*_present`` booleans).

    Raises:
        AssertionError: if the ids in the file do not match ``train_uuids_set`` exactly.
    """
    # We back this out from the embedding, which contains the agreement model's output
    # as the final two indexes.
    document_id_2_agreement_classification_int = {}
    document_id_2_label_int = {}
    document_id_2_model_identity_dict = {}
    # Track coverage on a private copy so the caller's set is left intact.
    remaining_ids = set(train_uuids_set)
    # Built-in open() only splits on \n, \r and \r\n. codecs.open() would additionally split on
    # characters such as U+2028, which may legally appear unescaped inside a JSON string.
    with open(options.best_iteration_train_split_file, encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g., a trailing newline at the end of the file).
                continue
            json_obj = json.loads(line)
            document_id = json_obj["id"]
            label = json_obj["label"]
            no_prob = json_obj["embedding"][-2]
            yes_prob = json_obj["embedding"][-1]
            agreement_classification_int = int(no_prob < yes_prob)
            assert document_id in remaining_ids, \
                f"Line {line_number}: document id {document_id!r} is duplicated or is not in the model's support set."
            remaining_ids.remove(document_id)
            document_id_2_agreement_classification_int[document_id] = agreement_classification_int
            document_id_2_label_int[document_id] = label
            document_id_2_model_identity_dict[document_id] = {
                GPT5_PRESENT: bool(json_obj[GPT5_PRESENT]),
                GEMINI2_5_PRESENT: bool(json_obj[GEMINI2_5_PRESENT]),
                GPT5_2_PRESENT: bool(json_obj[GPT5_2_PRESENT]),
                GEMINI3_PRESENT: bool(json_obj[GEMINI3_PRESENT])
            }
    assert len(remaining_ids) == 0, \
        f"{len(remaining_ids)} support set document ids are missing from the train split file."
    return document_id_2_agreement_classification_int, document_id_2_label_int, document_id_2_model_identity_dict


def get_row_by_id(dataset_dict, document_id_2_hf_idx, target_id):
    """Return ``(row, split_name)`` for ``target_id``, or ``(None, None)`` if no split contains it.

    Splits are searched in the iteration order of ``dataset_dict``; the first match wins.
    ``document_id_2_hf_idx`` maps split name -> {document id -> row index in that split}.
    """
    for split_name in dataset_dict:
        idx = document_id_2_hf_idx[split_name].get(target_id)
        if idx is not None:
            return dataset_dict[split_name][idx], split_name
    return None, None


def _model_fields_from_row(row, model_identity_dict):
    """Select the model 1 (GPT) and model 2 (Gemini) fields for one dataset row.

    Returns:
        ``(fields, model1_label, model2_label)`` where ``fields`` holds the ``model1_*``/``model2_*``
        keyword arguments for ``add_document``.
    """
    # Note the remapping of the model names.
    # Model indices are remapped to match the agreement model's expected ordering
    # as of v2.1.0 of the MCP Server.
    # Model 1: GPT-5.2 if available, else GPT-5
    # Model 2: Gemini-3-pro-preview if available, else Gemini-2.5-pro
    if model_identity_dict[GPT5_2_PRESENT]:
        gpt_column = _GPT5_2_COLUMN
    else:
        assert model_identity_dict[GPT5_PRESENT], "Neither GPT-5.2 nor GPT-5 is marked as present."
        gpt_column = _GPT5_COLUMN
    fields = {
        "model1_summary": row[f"{gpt_column}_short_summary_of_original_question_and_response"],
        "model1_explanation": row[f"{gpt_column}_short_explanation_for_classification_confidence"],
        "model1_classification_int": row[f"{gpt_column}_verification_classification"],
    }
    model1_label = row[gpt_column]

    if model_identity_dict[GEMINI3_PRESENT]:
        gemini_column = _GEMINI3_COLUMN
    else:
        assert model_identity_dict[GEMINI2_5_PRESENT], "Neither Gemini-3 nor Gemini-2.5 is marked as present."
        gemini_column = _GEMINI2_5_COLUMN
    fields["model2_explanation"] = row[f"{gemini_column}_short_explanation_for_classification_confidence"]
    fields["model2_classification_int"] = row[f"{gemini_column}_verification_classification"]
    if gemini_column == _GEMINI3_COLUMN:
        model2_label = row[gemini_column]
    else:
        # Here, we override to use a similar naming convention as the other models
        assert row[gemini_column] == "GEMINI_2_5_PRO_MODEL"
        model2_label = "gemini-2.5-pro"
    return fields, model1_label, model2_label


def init_db(database_file: str, document_id_2_agreement_classification_int, document_id_2_label_int,
            document_id_2_model_identity_dict, expected_support_size: int, dataset_dict, document_id_2_hf_idx,
            dataset_label: str):
    """Create the document database and add one entry per support set document.

    Documents found in ``dataset_dict`` are stored with the model summaries/explanations/classifications,
    question, and response from the dataset row. Documents not found are still added, with empty strings,
    classifications of 0, ``document_source="N/A"``, and ``info="unavailable,unavailable"``.

    Args:
        database_file: path passed to ``DocumentDatabase``.
        document_id_2_agreement_classification_int: document id -> agreement model classification (0/1).
        document_id_2_label_int: document id -> label.
        document_id_2_model_identity_dict: document id -> dict of the ``*_present`` booleans.
        expected_support_size: number of entries the database must contain when done.
        dataset_dict: mapping of split name -> dataset split.
        document_id_2_hf_idx: split name -> {document id -> row index}.
        dataset_label: prefix saved in each entry's ``document_source``.

    Raises:
        AssertionError: if adding a document fails or the final count differs from ``expected_support_size``.
    """
    db = mcp_utils_sqlite_document_db_controller.DocumentDatabase(database_file)
    missing_documents = 0
    for document_id, agreement_classification_int in document_id_2_agreement_classification_int.items():
        label_int = document_id_2_label_int[document_id]
        model_identity_dict = document_id_2_model_identity_dict[document_id]
        row, _ = get_row_by_id(dataset_dict, document_id_2_hf_idx, document_id)
        if row is None:
            missing_documents += 1
            # default values:
            model_fields = {
                "model1_summary": "",
                "model1_explanation": "",
                "model2_explanation": "",
                "model1_classification_int": 0,
                "model2_classification_int": 0,
            }
            model1_label = "unavailable"
            model2_label = "unavailable"
            document_source = "N/A"
            user_question = ""
            ai_response = ""
        else:
            model_fields, model1_label, model2_label = _model_fields_from_row(row, model_identity_dict)
            document_source = f"{dataset_label}: {row['info']}"
            user_question = row['user_question']
            ai_response = row['ai_response']
        # Models 3 and 4 are not populated by this script.
        success = db.add_document(
            document_id=document_id,
            model3_explanation="",
            model4_explanation="",
            model3_classification_int=0,
            model4_classification_int=0,
            agreement_model_classification_int=agreement_classification_int,
            label_int=label_int,
            label_was_updated_int=0,
            document_source=document_source,
            info=f"{model1_label},{model2_label}",
            user_question=user_question,
            ai_response=ai_response,
            **model_fields
        )
        assert success, f"Failed to add document {document_id!r} to the database."
    count = db.get_document_count()
    print(f"Database created with {count} entries.")
    if missing_documents > 0:
        print(f"There were {missing_documents} documents not present in the HF dataset. "
              f"They were added with empty strings for the model explanations and the model classifications were set "
              f"to False.")
    assert count == expected_support_size, \
        f"The database has {count} entries, but {expected_support_size} were expected."


def main():
    """Parse the command-line arguments, load the HF dataset, and build the document database."""
    parser = argparse.ArgumentParser(description="-----[Construct database]-----")
    parser.add_argument("--model_dir", default="", help="model_dir")
    parser.add_argument("--best_iteration_train_split_file", default="", help="best_iteration_train_split")
    parser.add_argument("--database_file", default="", help="database_file")
    parser.add_argument("--load_hf_dataset_from_disk", default=False, action='store_true',
                        help="load_hf_dataset_from_disk")
    parser.add_argument("--input_datasets_file", default="ReexpressAI/OpenVerification1",
                        help="If --load_hf_dataset_from_disk, then --input_datasets_file should be a path to "
                             "a datasets file. Otherwise, --input_datasets_file should be the name of the applicable "
                             "dataset on the HuggingFace Hub.")
    parser.add_argument("--dataset_label", default="OpenVerification1",
                        help="This is saved to the database's document_source field as: "
                             "dataset_label: [additional info]")
    options = parser.parse_args()

    start_time = time.time()
    print("This assumes the agreement model's classification is encoded in the input embedding.")
    if options.load_hf_dataset_from_disk:
        ds = load_from_disk(options.input_datasets_file)
        print(f"Successfully loaded {options.input_datasets_file} from disk.")
    else:
        ds = load_dataset(options.input_datasets_file)
        print(f"Successfully loaded {options.input_datasets_file}, which is the current version from HF Hub.")

    # Index every split once (document id -> row index) so that per-document lookups are dict hits.
    document_id_2_hf_idx = {}
    for split_name in ds:
        split_ids = ds[split_name]['id']
        document_id_2_hf_idx[split_name] = {id_val: idx for idx, id_val in enumerate(split_ids)}

    train_uuids_set = retrieve_model_documents_ids(options)
    expected_support_size = len(train_uuids_set)
    document_id_2_agreement_classification_int, document_id_2_label_int, document_id_2_model_identity_dict = \
        get_agreement_classification(options, train_uuids_set)
    init_db(database_file=options.database_file,
            document_id_2_agreement_classification_int=document_id_2_agreement_classification_int,
            document_id_2_label_int=document_id_2_label_int,
            document_id_2_model_identity_dict=document_id_2_model_identity_dict,
            expected_support_size=expected_support_size,
            dataset_dict=ds,
            document_id_2_hf_idx=document_id_2_hf_idx,
            dataset_label=options.dataset_label)

    cumulative_time = time.time() - start_time
    print(f"Cumulative running time: {cumulative_time}")


if __name__ == "__main__":
    main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ReexpressAI/reexpress_mcp_server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

aux_construct_document_db.py•10.9 KiB