Reexpress MCP Server

by ReexpressAI
utils_preprocess.py
# Copyright Reexpress AI, Inc. All rights reserved.

import data_validator
import utils_model
import constants

import torch
import numpy as np
import json
import codecs


def get_data(filename_with_path):
    """
    Get the preprocessed data
    :param filename_with_path: A filepath to the preprocessed data. See the Tutorial for details.
    :return: A list of dictionaries
    """
    json_list = []
    with codecs.open(filename_with_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            json_obj = json.loads(line)
            json_list.append(json_obj)
    return json_list


def get_metadata_lines_from_json_list(options, json_list, reduce=False, reduce_size=20, use_embeddings=True,
                                      concat_embeddings_to_attributes=False,
                                      calculate_summary_stats=False, is_training=False):
    lines = []
    line_ids = []
    line_id = 0
    labels = []
    original_labels = []
    original_predictions = []
    embeddings = []
    uuids = []
    uuid2idx = {}
    refusals = []
    for json_obj in json_list:
        uuids.append(json_obj["id"])
        uuid2idx[json_obj["id"]] = line_id
        label = int(json_obj['label'])
        # if not data_validator.isKnownValidLabel(label=label, numberOfClasses=numberOfClasses):
        #     print("Currently we do not support ")
        if "refusal" in json_obj:
            refusals.append(json_obj["refusal"])
        if "original_label" in json_obj:
            original_label = int(json_obj["original_label"])
            original_labels.append(original_label)
        # This can be useful for comparing against tasks in which the input is a textual representation
        # of the output, which could (in principle) differ from the calibrated version.
        if "original_prediction" in json_obj:
            original_prediction = int(json_obj["original_prediction"])
            original_predictions.append(original_prediction)
        labels.append(label)
        lines.append(json_obj.get('document', ''))
        line_ids.append(line_id)
        if concat_embeddings_to_attributes:
            embedding = torch.tensor(json_obj["embedding"] + json_obj["attributes"])
        elif use_embeddings:
            embedding = torch.tensor(json_obj["embedding"])
        else:
            embedding = torch.tensor(json_obj["attributes"])
        embeddings.append(embedding.unsqueeze(0))
        line_id += 1
        if reduce and line_id == reduce_size:
            break
    assert len(lines) == len(line_ids)
    embeddings = torch.cat(embeddings, dim=0)
    summary_stats = None
    if calculate_summary_stats:
        if options.do_not_normalize_input_embeddings:
            summary_stats = {
                constants.STORAGE_KEY_SUMMARY_STATS_EMBEDDINGS_training_embedding_mean: 0.0,
                constants.STORAGE_KEY_SUMMARY_STATS_EMBEDDINGS_training_embedding_std: 1.0
            }
        else:
            summary_stats = utils_model.get_embedding_summary_stats(embeddings, is_training)
    print(f"Total existing metadata lines: {len(lines)}")
    return {"lines": lines,
            "line_ids": line_ids,
            "original_labels": original_labels,  # the original task labels, if applicable
            "original_predictions": original_predictions,  # the original LLM prediction, if applicable
            "labels": labels,
            "refusals": refusals,
            "embeddings": embeddings,
            "uuids": uuids,
            "uuid2idx": uuid2idx}, summary_stats


def get_metadata_lines(options, filepath_with_name, reduce=False, reduce_size=20, use_embeddings=True,
                       concat_embeddings_to_attributes=False,
                       calculate_summary_stats=False, is_training=False):
    lines = []
    line_ids = []
    line_id = 0
    labels = []
    original_labels = []
    original_predictions = []
    embeddings = []
    uuids = []
    uuid2idx = {}
    refusals = []
    with codecs.open(filepath_with_name, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            json_obj = json.loads(line)
            uuids.append(json_obj["id"])
            uuid2idx[json_obj["id"]] = line_id
            label = int(json_obj['label'])
            labels.append(label)
            if "refusal" in json_obj:
                refusals.append(json_obj["refusal"])
            if "original_label" in json_obj:
                original_label = int(json_obj["original_label"])
                original_labels.append(original_label)
            # This can be useful for comparing against tasks in which the input is a textual representation
            # of the output, which could (in principle) differ from the calibrated version.
            if "original_prediction" in json_obj:
                original_prediction = int(json_obj["original_prediction"])
                original_predictions.append(original_prediction)
            lines.append(json_obj.get('document', ''))
            line_ids.append(line_id)
            if concat_embeddings_to_attributes:
                embedding = torch.tensor(json_obj["embedding"] + json_obj["attributes"])
            elif use_embeddings:
                embedding = torch.tensor(json_obj["embedding"])
            else:
                embedding = torch.tensor(json_obj["attributes"])
            embeddings.append(embedding.unsqueeze(0))
            line_id += 1
            if reduce and line_id == reduce_size:
                break
    assert len(lines) == len(line_ids)
    embeddings = torch.cat(embeddings, dim=0)
    summary_stats = None
    if calculate_summary_stats:
        if options.do_not_normalize_input_embeddings:
            summary_stats = {
                constants.STORAGE_KEY_SUMMARY_STATS_EMBEDDINGS_training_embedding_mean: 0.0,
                constants.STORAGE_KEY_SUMMARY_STATS_EMBEDDINGS_training_embedding_std: 1.0
            }
        else:
            summary_stats = utils_model.get_embedding_summary_stats(embeddings, is_training)
    print(f"Total existing metadata lines: {len(lines)}")
    return {"lines": lines,
            "line_ids": line_ids,
            "original_labels": original_labels,  # the original task labels, if applicable
            "original_predictions": original_predictions,  # the original LLM prediction, if applicable
            "labels": labels,
            "refusals": refusals,
            "embeddings": embeddings,
            "uuids": uuids,
            "uuid2idx": uuid2idx}, summary_stats
