Genkit Faithfulness Evaluator

/**
 * Copyright 2024 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { Genkit, ModelArgument, z } from 'genkit';
import { BaseEvalDataPoint, Score } from 'genkit/evaluator';
import path from 'path';
import { getDirName, loadPromptFile, renderText } from './helper.js';

const LongFormResponseSchema = z.object({ statements: z.array(z.string()) });

const NliResponseBaseSchema = z.object({
  statement: z.string(),
  reason: z.string(),
  verdict: z.enum(['0', '1'] as const),
});

type NliResponseBase = z.infer<typeof NliResponseBaseSchema>;

const NliResponseSchema = z.array(NliResponseBaseSchema);

/**
 * Scores how faithful the output is to the provided context, using the
 * supplied LLM as judge. The output is first decomposed into simple,
 * self-contained statements; each statement is then checked against the
 * context with an NLI-style prompt, and the score is the fraction of
 * statements the judge marks as supported.
 */
export async function faithfulnessScore<
  CustomModelOptions extends z.ZodTypeAny,
>(
  ai: Genkit,
  judgeLlm: ModelArgument<CustomModelOptions>,
  dataPoint: BaseEvalDataPoint,
  judgeConfig?: CustomModelOptions
): Promise<Score> {
  try {
    if (!dataPoint.input) {
      throw new Error('Input was not provided');
    }
    if (!dataPoint.output) {
      throw new Error('Output was not provided');
    }
    if (!dataPoint.context?.length) {
      throw new Error('Context was not provided');
    }

    const input =
      typeof dataPoint.input === 'string'
        ? dataPoint.input
        : JSON.stringify(dataPoint.input);
    const output =
      typeof dataPoint.output === 'string'
        ? dataPoint.output
        : JSON.stringify(dataPoint.output);
    const context = dataPoint.context.map((i) => JSON.stringify(i));

    // Step 1: decompose the answer into individual statements.
    const longFormPrompt = await loadPromptFile(
      path.resolve(getDirName(), '../../prompts/faithfulness_long_form.prompt')
    );
    const longFormResponse = await ai.generate({
      model: judgeLlm,
      config: judgeConfig,
      prompt: await renderText(longFormPrompt, {
        question: input,
        answer: output,
      }),
      output: {
        schema: LongFormResponseSchema,
      },
    });
    const parsedLongFormResponse = longFormResponse.output;
    const statements = parsedLongFormResponse?.statements ?? [];
    if (statements.length === 0) {
      throw new Error('No statements returned');
    }

    // Step 2: ask the judge whether each statement is entailed by the context.
    const allStatements = statements.map((s) => `statement: ${s}`).join('\n');
    const allContext = context.join('\n');
    const nliPrompt = await loadPromptFile(
      path.resolve(getDirName(), '../../prompts/faithfulness_nli.prompt')
    );
    const response = await ai.generate({
      model: judgeLlm,
      // Apply the judge config to this call as well, for consistency with step 1.
      config: judgeConfig,
      prompt: await renderText(nliPrompt, {
        context: allContext,
        statements: allStatements,
      }),
      output: {
        schema: NliResponseSchema,
      },
    });
    const parsedResponse = response.output;
    return nliResponseToScore(parsedResponse);
  } catch (err) {
    console.debug(
      `Genkit faithfulness evaluation failed with error ${err} for sample ${JSON.stringify(
        dataPoint
      )}`
    );
    throw err;
  }
}

/** Converts the per-statement NLI verdicts into a single faithfulness score. */
function nliResponseToScore(input: NliResponseBase[] | null): Score {
  if (!input) {
    throw new Error(`Evaluator response empty`);
  }
  const faithfulStatements = input.reduce(
    (total, resp) => total + (resp.verdict === '1' ? 1 : 0),
    0
  );
  return {
    score: faithfulStatements / input.length,
    details: { reasoning: input.map((r) => r.reason).join('; ') },
  };
}
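For context, here is a minimal usage sketch of the exported function. It assumes the file above is consumed as a module (the './faithfulness.js' import path is illustrative), that the '@genkit-ai/googleai' plugin is installed, and that the referenced prompt files ship alongside the module; the model name, test case ID, and sample data are placeholders, not part of this module.

import { genkit } from 'genkit';
import { googleAI } from '@genkit-ai/googleai';
import { faithfulnessScore } from './faithfulness.js'; // illustrative path

const ai = genkit({ plugins: [googleAI()] });

// Hand-built eval sample; BaseEvalDataPoint requires a testCaseId.
const sample = {
  testCaseId: 'example-1',
  input: 'Where is the Eiffel Tower?',
  output: 'The Eiffel Tower stands on the Champ de Mars in Paris.',
  context: [
    'The Eiffel Tower is a wrought-iron tower on the Champ de Mars in Paris, France.',
  ],
};

const score = await faithfulnessScore(
  ai,
  'googleai/gemini-1.5-flash', // placeholder judge model
  sample
);
// score.score is the fraction of statements the judge found supported (0..1).
console.log(score.score, score.details?.reasoning);

Because the judge both extracts statements and renders verdicts, results vary with the judge model; passing a low-temperature judgeConfig is one way to make scores more repeatable.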