/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { Action, Genkit, z } from 'genkit';
import { GoogleAuth } from 'google-auth-library';
import { EvaluatorFactory } from './evaluator_factory.js';
/**
* Vertex AI Evaluation metrics. See API documentation for more information.
* https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/evaluation#parameter-list
*/
export enum VertexAIEvaluationMetricType {
// Update genkit/docs/plugins/vertex-ai.md when modifying the list of enums
BLEU = 'BLEU',
ROUGE = 'ROUGE',
  FLUENCY = 'FLUENCY',
SAFETY = 'SAFETY',
GROUNDEDNESS = 'GROUNDEDNESS',
SUMMARIZATION_QUALITY = 'SUMMARIZATION_QUALITY',
SUMMARIZATION_HELPFULNESS = 'SUMMARIZATION_HELPFULNESS',
SUMMARIZATION_VERBOSITY = 'SUMMARIZATION_VERBOSITY',
}
/**
* Evaluation metric config. Use `metricSpec` to define the behavior of the metric.
* The value of `metricSpec` will be included in the request to the API. See the API documentation
* for details on the possible values of `metricSpec` for each metric.
* https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/evaluation#parameter-list
*/
export type VertexAIEvaluationMetricConfig = {
type: VertexAIEvaluationMetricType;
metricSpec: any;
};
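/**
 * A metric may be passed either as a bare `VertexAIEvaluationMetricType` or
 * as a `VertexAIEvaluationMetricConfig` that also carries a `metricSpec`.
 * For example (the spec fields below are illustrative; see the parameter
 * list linked above for the fields each metric accepts):
 *
 *   const rouge: VertexAIEvaluationMetricConfig = {
 *     type: VertexAIEvaluationMetricType.ROUGE,
 *     metricSpec: { rougeType: 'rougeLsum', useStemmer: true },
 *   };
 */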
export type VertexAIEvaluationMetric =
| VertexAIEvaluationMetricType
| VertexAIEvaluationMetricConfig;
function stringify(input: unknown) {
return typeof input === 'string' ? input : JSON.stringify(input);
}
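/**
 * Builds one evaluator action per requested metric. A minimal usage sketch
 * (the project and location values are placeholders):
 *
 *   const evaluators = vertexEvaluators(
 *     ai,
 *     new GoogleAuth(),
 *     [
 *       VertexAIEvaluationMetricType.SAFETY,
 *       {
 *         type: VertexAIEvaluationMetricType.ROUGE,
 *         metricSpec: { rougeType: 'rougeL' },
 *       },
 *     ],
 *     'my-project',
 *     'us-central1'
 *   );
 */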
export function vertexEvaluators(
ai: Genkit,
auth: GoogleAuth,
metrics: VertexAIEvaluationMetric[],
projectId: string,
location: string
): Action[] {
const factory = new EvaluatorFactory(auth, location, projectId);
return metrics.map((metric) => {
const metricType = isConfig(metric) ? metric.type : metric;
const metricSpec = isConfig(metric) ? metric.metricSpec : {};
switch (metricType) {
case VertexAIEvaluationMetricType.BLEU: {
return createBleuEvaluator(ai, factory, metricSpec);
}
case VertexAIEvaluationMetricType.ROUGE: {
return createRougeEvaluator(ai, factory, metricSpec);
}
case VertexAIEvaluationMetricType.FLUENCY: {
return createFluencyEvaluator(ai, factory, metricSpec);
}
case VertexAIEvaluationMetricType.SAFETY: {
return createSafetyEvaluator(ai, factory, metricSpec);
}
case VertexAIEvaluationMetricType.GROUNDEDNESS: {
return createGroundednessEvaluator(ai, factory, metricSpec);
}
case VertexAIEvaluationMetricType.SUMMARIZATION_QUALITY: {
return createSummarizationQualityEvaluator(ai, factory, metricSpec);
}
case VertexAIEvaluationMetricType.SUMMARIZATION_HELPFULNESS: {
return createSummarizationHelpfulnessEvaluator(ai, factory, metricSpec);
}
      case VertexAIEvaluationMetricType.SUMMARIZATION_VERBOSITY: {
        return createSummarizationVerbosityEvaluator(ai, factory, metricSpec);
      }
      default: {
        // Guard against metric types added to the enum but not wired up here,
        // or unexpected values arriving at runtime.
        throw new Error(
          `Unsupported Vertex AI evaluation metric: ${metricType}`
        );
      }
    }
});
}
function isConfig(
config: VertexAIEvaluationMetric
): config is VertexAIEvaluationMetricConfig {
return (config as VertexAIEvaluationMetricConfig).type !== undefined;
}
const BleuResponseSchema = z.object({
bleuResults: z.object({
bleuMetricValues: z.array(z.object({ score: z.number() })),
}),
});
// TODO: Add support for batch inputs
function createBleuEvaluator(
ai: Genkit,
factory: EvaluatorFactory,
metricSpec: any
): Action {
return factory.create(
ai,
{
metric: VertexAIEvaluationMetricType.BLEU,
displayName: 'BLEU',
definition:
'Computes the BLEU score by comparing the output against the ground truth',
responseSchema: BleuResponseSchema,
},
(datapoint) => {
return {
bleuInput: {
metricSpec,
instances: [
{
prediction: stringify(datapoint.output),
reference: datapoint.reference,
},
],
},
};
},
(response) => {
return {
score: response.bleuResults.bleuMetricValues[0].score,
};
}
);
}
const RougeResponseSchema = z.object({
rougeResults: z.object({
rougeMetricValues: z.array(z.object({ score: z.number() })),
}),
});
// TODO: Add support for batch inputs
function createRougeEvaluator(
ai: Genkit,
factory: EvaluatorFactory,
metricSpec: any
): Action {
return factory.create(
ai,
{
metric: VertexAIEvaluationMetricType.ROUGE,
displayName: 'ROUGE',
definition:
'Computes the ROUGE score by comparing the output against the ground truth',
responseSchema: RougeResponseSchema,
},
(datapoint) => {
return {
rougeInput: {
metricSpec,
          instances: [
            {
              prediction: stringify(datapoint.output),
              reference: datapoint.reference,
            },
          ],
},
};
},
(response) => {
return {
score: response.rougeResults.rougeMetricValues[0].score,
};
}
);
}
const FluencyResponseSchema = z.object({
fluencyResult: z.object({
score: z.number(),
explanation: z.string(),
confidence: z.number(),
}),
});
function createFluencyEvaluator(
ai: Genkit,
factory: EvaluatorFactory,
metricSpec: any
): Action {
return factory.create(
ai,
{
metric: VertexAIEvaluationMetricType.FLUENCY,
displayName: 'Fluency',
definition: 'Assesses the language mastery of an output',
responseSchema: FluencyResponseSchema,
},
(datapoint) => {
return {
fluencyInput: {
metricSpec,
instance: {
prediction: stringify(datapoint.output),
},
},
};
},
(response) => {
return {
score: response.fluencyResult.score,
details: {
reasoning: response.fluencyResult.explanation,
},
};
}
);
}
const SafetyResponseSchema = z.object({
safetyResult: z.object({
score: z.number(),
explanation: z.string(),
confidence: z.number(),
}),
});
function createSafetyEvaluator(
ai: Genkit,
factory: EvaluatorFactory,
metricSpec: any
): Action {
return factory.create(
ai,
{
metric: VertexAIEvaluationMetricType.SAFETY,
displayName: 'Safety',
definition: 'Assesses the level of safety of an output',
responseSchema: SafetyResponseSchema,
},
(datapoint) => {
return {
safetyInput: {
metricSpec,
instance: {
prediction: stringify(datapoint.output),
},
},
};
},
(response) => {
return {
score: response.safetyResult.score,
details: {
reasoning: response.safetyResult.explanation,
},
};
}
);
}
const GroundednessResponseSchema = z.object({
groundednessResult: z.object({
score: z.number(),
explanation: z.string(),
confidence: z.number(),
}),
});
function createGroundednessEvaluator(
ai: Genkit,
factory: EvaluatorFactory,
metricSpec: any
): Action {
return factory.create(
ai,
{
metric: VertexAIEvaluationMetricType.GROUNDEDNESS,
displayName: 'Groundedness',
definition:
'Assesses the ability to provide or reference information included only in the context',
responseSchema: GroundednessResponseSchema,
},
(datapoint) => {
return {
groundednessInput: {
metricSpec,
instance: {
prediction: stringify(datapoint.output),
context: datapoint.context?.join('. '),
},
},
};
},
(response) => {
return {
score: response.groundednessResult.score,
details: {
reasoning: response.groundednessResult.explanation,
},
};
}
);
}
const SummarizationQualityResponseSchema = z.object({
summarizationQualityResult: z.object({
score: z.number(),
explanation: z.string(),
confidence: z.number(),
}),
});
function createSummarizationQualityEvaluator(
ai: Genkit,
factory: EvaluatorFactory,
metricSpec: any
): Action {
return factory.create(
ai,
{
metric: VertexAIEvaluationMetricType.SUMMARIZATION_QUALITY,
displayName: 'Summarization quality',
definition: 'Assesses the overall ability to summarize text',
responseSchema: SummarizationQualityResponseSchema,
},
(datapoint) => {
return {
summarizationQualityInput: {
metricSpec,
instance: {
prediction: stringify(datapoint.output),
instruction: stringify(datapoint.input),
context: datapoint.context?.join('. '),
},
},
};
},
(response) => {
return {
score: response.summarizationQualityResult.score,
details: {
reasoning: response.summarizationQualityResult.explanation,
},
};
}
);
}
const SummarizationHelpfulnessResponseSchema = z.object({
summarizationHelpfulnessResult: z.object({
score: z.number(),
explanation: z.string(),
confidence: z.number(),
}),
});
function createSummarizationHelpfulnessEvaluator(
ai: Genkit,
factory: EvaluatorFactory,
metricSpec: any
): Action {
return factory.create(
ai,
{
metric: VertexAIEvaluationMetricType.SUMMARIZATION_HELPFULNESS,
displayName: 'Summarization helpfulness',
      definition:
        'Assesses the ability to provide a summarization that contains the details necessary to substitute for the original text',
responseSchema: SummarizationHelpfulnessResponseSchema,
},
(datapoint) => {
return {
summarizationHelpfulnessInput: {
metricSpec,
instance: {
prediction: stringify(datapoint.output),
instruction: stringify(datapoint.input),
context: datapoint.context?.join('. '),
},
},
};
},
(response) => {
return {
score: response.summarizationHelpfulnessResult.score,
details: {
reasoning: response.summarizationHelpfulnessResult.explanation,
},
};
}
);
}
const SummarizationVerbosityResponseSchema = z.object({
summarizationVerbosityResult: z.object({
score: z.number(),
explanation: z.string(),
confidence: z.number(),
}),
});
function createSummarizationVerbosityEvaluator(
ai: Genkit,
factory: EvaluatorFactory,
metricSpec: any
): Action {
return factory.create(
ai,
{
metric: VertexAIEvaluationMetricType.SUMMARIZATION_VERBOSITY,
displayName: 'Summarization verbosity',
      definition: 'Assesses the ability to provide a succinct summarization',
      responseSchema: SummarizationVerbosityResponseSchema,
},
(datapoint) => {
return {
summarizationVerbosityInput: {
metricSpec,
instance: {
prediction: stringify(datapoint.output),
instruction: stringify(datapoint.input),
context: datapoint.context?.join('. '),
},
},
};
},
(response) => {
return {
score: response.summarizationVerbosityResult.score,
details: {
reasoning: response.summarizationVerbosityResult.explanation,
},
};
}
);
}