Skip to main content
Glama

Genkit MCP

Official
by firebase
parser_test.ts (21.1 kB)
/**
 * Copyright 2024 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { describe, expect, it } from '@jest/globals';
import {
  enrichResultsWithScoring,
  extractMetricSummaries,
} from '../../src/eval';
import {
  EvalMetricSchema,
  EvalStatusEnum,
  type EvalResult,
} from '../../src/types/eval';
import type { EvalFnResponse, EvalResponse } from '../../src/types/evaluator';

describe('parser', () => {
  // Two un-scored eval results; `metrics` starts empty so the enrichment test
  // can check what enrichResultsWithScoring adds.
  const evalRunResults: EvalResult[] = [
    {
      testCaseId: 'case1',
      input: 'Who is bob best friend?',
      output: 'Patrick',
      context: [
        "Bob's best friend is Patrick.",
        'Bob has a friend named Patrick.',
        'Bob has a friend named Sandy.',
      ],
      metrics: [],
      traceIds: ['trace2'],
    },
    {
      testCaseId: 'case2',
      input: 'How many friends does Bob have?',
      output: '2',
      context: [
        "Bob's best friend is Patrick.",
        'Bob has a friend named Patrick.',
        'Bob has a friend named Sandy.',
      ],
      metrics: [],
      traceIds: ['trace2'],
    },
  ];

  // Evaluator responses keyed by evaluator name, one entry per test case.
  const evaluatorOutput: Record<string, EvalResponse> = {
    '/evaluator/genkit/faithfulness': [
      {
        testCaseId: 'case1',
        sampleIndex: 0,
        evaluation: {
          score: 1,
          details: {
            reasoning: 'It looks good to me!',
          },
        },
      },
      {
        testCaseId: 'case2',
        sampleIndex: 1,
        evaluation: {
          score: 1,
          details: {
            reasoning: 'I thought the LLM did a very nice job',
          },
        },
      },
    ],
    '/evaluator/genkit/context_relevancy': [
      {
        testCaseId: 'case1',
        sampleIndex: 0,
        evaluation: {
          score: 1,
          details: {
            reasoning: 'Context was utilized.',
          },
        },
      },
      {
        testCaseId: 'case2',
        sampleIndex: 0,
        evaluation: {
          score: 1,
          details: {
            reasoning: 'Context was utilized.',
          },
        },
      },
    ],
  };

  describe('enrichResultsWithScoring', () => {
    it('Adds scoring data to eval results', () => {
      const results = enrichResultsWithScoring(evaluatorOutput, evalRunResults);
      expect(results).toHaveLength(2);
      results.forEach((result) => {
        expect(result.metrics).toMatchObject([
          EvalMetricSchema.parse({
            evaluator: '/evaluator/genkit/faithfulness',
            score: 1,
          }),
          EvalMetricSchema.parse({
            evaluator: '/evaluator/genkit/context_relevancy',
            score: 1,
          }),
        ]);
        expect(result.traceIds).toHaveLength(1);
      });
    });
  });

  describe('extractMetricSummaries', () => {
    // Single-evaluator baseline with three numeric scores; most tests below
    // derive their input from it via mockScores / reMapScores.
    const simpleEvalOutput: Record<string, EvalResponse> = {
      '/evaluator/genkit/context_relevancy': [
        {
          testCaseId: 'case1',
          evaluation: {
            score: 7,
          },
        },
        {
          testCaseId: 'case2',
          evaluation: {
            score: 10,
          },
        },
        {
          testCaseId: 'case3',
          evaluation: {
            score: 5,
          },
        },
      ],
    };

    describe('simpler scenarios', () => {
      it('mean for simple numeric scores', () => {
        const results = extractMetricSummaries(simpleEvalOutput);

        expect(results).toHaveLength(1);
        const result = results[0];
        expect(result).toEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 0,
          scoreUndefinedCount: 0,
          statusDistribution: { undefined: 3 },
          // 7 + 10 + 5
          averageScore: 22.0 / 3,
        });
      });

      it('scoreDistribution for simple boolean scores', () => {
        const booleanScores = mockScores(simpleEvalOutput, [
          {
            score: true,
          },
          { score: false },
          { score: true },
        ]);

        const results = extractMetricSummaries(booleanScores);

        expect(results).toHaveLength(1);
        const result = results[0];
        expect(result).toEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 0,
          scoreUndefinedCount: 0,
          statusDistribution: { undefined: 3 },
          // True, False, True
          scoreDistribution: { true: 2, false: 1 },
        });
      });

      it('scoreDistribution for simple string scores (under 5)', () => {
        const stringScores = mockScores(simpleEvalOutput, [
          { score: 'TYPE_0' },
          { score: 'TYPE_1' },
          { score: 'TYPE_0' },
        ]);

        const results = extractMetricSummaries(stringScores);

        expect(results).toHaveLength(1);
        const result = results[0];
        expect(result).toEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 0,
          scoreUndefinedCount: 0,
          statusDistribution: { undefined: 3 },
          // TYPE_0, TYPE_1, TYPE_0
          scoreDistribution: { TYPE_0: 2, TYPE_1: 1 },
        });
      });

      it('scoreDistribution for simple string scores (over 5)', () => {
        const extendedSimpleEvalOutput: Record<string, EvalResponse> = {};
        // 2x the simpleEvalOutput to get 6 samples.
        extendedSimpleEvalOutput['/evaluator/genkit/context_relevancy'] = Array(
          2
        )
          .fill(simpleEvalOutput['/evaluator/genkit/context_relevancy'])
          .flat();
        const stringScores = mockScores(extendedSimpleEvalOutput, [
          { score: 'TYPE_0' },
          { score: 'TYPE_1' },
          { score: 'TYPE_2' },
          { score: 'TYPE_3' },
          { score: 'TYPE_4' },
          { score: 'TYPE_5' },
        ]);

        const results = extractMetricSummaries(stringScores);

        expect(results).toHaveLength(1);
        const result = results[0];
        // No scoreDistribution is expected here (six distinct string values).
        expect(result).toEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 6,
          errorCount: 0,
          scoreUndefinedCount: 0,
          statusDistribution: { undefined: 6 },
        });
      });

      it('status distribution for simple numeric scores', () => {
        const scores = [
          {
            score: 0,
            status: EvalStatusEnum.PASS,
          },
          {
            score: 1,
            status: EvalStatusEnum.FAIL,
          },
          {
            score: 2,
          },
        ];
        const withStatus = mockScores(simpleEvalOutput, scores);

        const results = extractMetricSummaries(withStatus);

        expect(results).toHaveLength(1);
        const result = results[0];
        expect(result).toEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 0,
          scoreUndefinedCount: 0,
          statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          // avg(0, 1, 2)
          averageScore: 3.0 / 3,
        });
      });
    });

    describe('edge cases', () => {
      it('metrics if scores are undefined but status available', () => {
        const scores = [
          {
            status: EvalStatusEnum.PASS,
          },
          {
            status: EvalStatusEnum.FAIL,
          },
          {},
        ];
        const undefinedScores = mockScores(simpleEvalOutput, scores);

        const results = extractMetricSummaries(undefinedScores);

        expect(results).toHaveLength(1);
        const result = results[0];
        expect(result).toEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 0,
          scoreUndefinedCount: 3,
          // PASS, FAIL, undefined
          statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
        });
      });

      it('metrics if some scores are undefined with status available', () => {
        const scores = [
          {
            score: 0,
            status: EvalStatusEnum.PASS,
          },
          {
            score: 1,
            status: EvalStatusEnum.FAIL,
          },
          {},
        ];
        const someDefinedScores = mockScores(simpleEvalOutput, scores);

        const results = extractMetricSummaries(someDefinedScores);

        expect(results).toHaveLength(1);
        const result = results[0];
        expect(result).toEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 0,
          scoreUndefinedCount: 1,
          // avg(0, 1)
          averageScore: 1 / 2.0,
          // PASS, FAIL, undefined
          statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
        });
      });

      it('metrics if some scores are undefined, some errors and with status available', () => {
        const scores = [
          {
            score: undefined,
            error: 'some error',
            status: EvalStatusEnum.PASS,
          },
          {
            score: 1,
            status: EvalStatusEnum.FAIL,
          },
          { error: 'some error' },
        ];
        const someDefinedScores = mockScores(simpleEvalOutput, scores);

        const results = extractMetricSummaries(someDefinedScores);

        expect(results).toHaveLength(1);
        const result = results[0];
        expect(result).toEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 2,
          scoreUndefinedCount: 2,
          // avg(1)
          averageScore: 1.0,
          // PASS, FAIL, undefined
          statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
        });
      });
    });

    describe('multiple evaluators grouped', () => {
      // Two evaluators over the same three test cases: one with numeric
      // scores, one with boolean scores.
      const multiOutput: Record<string, EvalResponse> = {
        '/evaluator/genkit/faithfulness': [
          {
            testCaseId: 'case1',
            evaluation: {
              score: 7,
            },
          },
          {
            testCaseId: 'case2',
            evaluation: {
              score: 10,
            },
          },
          {
            testCaseId: 'case3',
            evaluation: {
              score: 5,
            },
          },
        ],
        '/evaluator/genkit/context_relevancy': [
          {
            testCaseId: 'case1',
            evaluation: {
              score: true,
            },
          },
          {
            testCaseId: 'case2',
            evaluation: {
              score: false,
            },
          },
          {
            testCaseId: 'case3',
            evaluation: {
              score: true,
            },
          },
        ],
      };

      it('treats each evaluator separately', () => {
        const results = extractMetricSummaries(multiOutput);

        expect(results).toHaveLength(2);
        expect(results).toContainEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 0,
          scoreUndefinedCount: 0,
          statusDistribution: { undefined: 3 },
          // true, false, true
          scoreDistribution: { true: 2, false: 1 },
        });
        expect(results).toContainEqual({
          evaluator: '/evaluator/genkit/faithfulness',
          testCaseCount: 3,
          errorCount: 0,
          scoreUndefinedCount: 0,
          statusDistribution: { undefined: 3 },
          // avg(7, 10, 5)
          averageScore: 22.0 / 3,
        });
      });

      it('treats each evaluator separately, with errors, status, undefined scores', () => {
        const mockFaithfulness = [
          {
            status: EvalStatusEnum.PASS,
            error: 'some error',
          },
          {
            score: 10,
            status: EvalStatusEnum.FAIL,
          },
          {
            error: 'some error',
          },
        ];
        const mockContextRel = [
          {
            score: 'alpha',
            status: EvalStatusEnum.PASS,
          },
          {
            status: EvalStatusEnum.FAIL,
            error: 'some error',
          },
          {
            score: 'gamma',
          },
        ];
        // Pick the mock list that matches the evaluator being remapped.
        const someDefinedScores = reMapScores(
          multiOutput,
          (response, i, evaluator) => {
            if (evaluator === '/evaluator/genkit/faithfulness') {
              return {
                testCaseId: response.testCaseId,
                evaluation: mockFaithfulness[i],
              };
            } else {
              return {
                testCaseId: response.testCaseId,
                evaluation: mockContextRel[i],
              };
            }
          }
        );

        const results = extractMetricSummaries(someDefinedScores);

        expect(results).toHaveLength(2);
        expect(results).toContainEqual({
          evaluator: '/evaluator/genkit/faithfulness',
          testCaseCount: 3,
          errorCount: 2,
          scoreUndefinedCount: 2,
          statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          // avg(10)
          averageScore: 10.0,
        });
        expect(results).toContainEqual({
          evaluator: '/evaluator/genkit/context_relevancy',
          testCaseCount: 3,
          errorCount: 1,
          scoreUndefinedCount: 1,
          statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          // alpha, gamma
          scoreDistribution: { alpha: 1, gamma: 1 },
        });
      });

      describe('multi-scores', () => {
        it('mix of scores', () => {
          // The second test case carries an array of scores instead of one.
          const mockEvaluations = [
            {
              score: 1,
              status: EvalStatusEnum.PASS,
            },
            [
              {
                score: 1,
                status: EvalStatusEnum.FAIL,
              },
              {
                score: 2,
                status: EvalStatusEnum.PASS,
              },
            ],
            {
              score: undefined,
            },
          ];
          const mixedScores = reMapScores(simpleEvalOutput, (response, i) => ({
            testCaseId: response.testCaseId,
            evaluation: mockEvaluations[i],
          }));

          const results = extractMetricSummaries(mixedScores);

          expect(results).toHaveLength(1);
          const result = results[0];
          expect(result).toEqual({
            evaluator: '/evaluator/genkit/context_relevancy',
            testCaseCount: 3,
            errorCount: 0,
            scoreUndefinedCount: 1,
            // avg(1, 1, 2)
            averageScore: 4.0 / 3,
            // PASS, FAIL, PASS, undefined
            statusDistribution: { undefined: 1, PASS: 2, FAIL: 1 },
          });
        });

        it('scores with IDs', () => {
          const mockEvaluations = [
            [
              {
                score: 5,
                id: 'numeric',
                status: EvalStatusEnum.PASS,
              },
              {
                score: 'YES',
                id: 'enum',
                status: EvalStatusEnum.FAIL,
              },
            ],
            [
              {
                score: 7,
                id: 'numeric',
                status: EvalStatusEnum.FAIL,
              },
              {
                score: 'NO',
                id: 'enum',
                status: EvalStatusEnum.PASS,
              },
            ],
            [
              {
                score: undefined,
                id: 'numeric',
                error: 'somer error',
              },
              {
                score: undefined,
                id: 'enum',
              },
            ],
          ];
          const mixedScores = reMapScores(simpleEvalOutput, (response, i) => ({
            testCaseId: response.testCaseId,
            evaluation: mockEvaluations[i],
          }));

          const results = extractMetricSummaries(mixedScores);

          // One summary per score id, reported as `<evaluator>/<id>`.
          expect(results).toHaveLength(2);
          expect(results).toContainEqual({
            evaluator: '/evaluator/genkit/context_relevancy/numeric',
            testCaseCount: 3,
            errorCount: 1,
            scoreUndefinedCount: 1,
            // avg(5, 7)
            averageScore: 12.0 / 2,
            // PASS, FAIL, undefined
            statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          });
          expect(results).toContainEqual({
            evaluator: '/evaluator/genkit/context_relevancy/enum',
            testCaseCount: 3,
            errorCount: 0,
            scoreUndefinedCount: 1,
            // YES, NO
            scoreDistribution: { YES: 1, NO: 1 },
            // FAIL, PASS, undefined
            statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          });
        });

        it('multi-scores with IDs', () => {
          const mockEvaluations = [
            [
              {
                score: 5,
                id: 'numeric',
                status: EvalStatusEnum.PASS,
              },
              {
                score: 'YES',
                id: 'enum',
                status: EvalStatusEnum.FAIL,
              },
            ],
            [
              {
                score: 7,
                id: 'numeric',
                status: EvalStatusEnum.FAIL,
              },
              {
                score: 'NO',
                id: 'enum',
                status: EvalStatusEnum.PASS,
              },
            ],
            [
              {
                score: undefined,
                id: 'numeric',
                error: 'somer error',
              },
              {
                score: undefined,
                id: 'enum',
              },
            ],
          ];
          // The same mocked evaluations are applied to both evaluators.
          const mixedScores = reMapScores(multiOutput, (response, i) => ({
            testCaseId: response.testCaseId,
            evaluation: mockEvaluations[i],
          }));

          const results = extractMetricSummaries(mixedScores);

          expect(results).toHaveLength(4);
          expect(results).toContainEqual({
            evaluator: '/evaluator/genkit/context_relevancy/numeric',
            testCaseCount: 3,
            errorCount: 1,
            scoreUndefinedCount: 1,
            // avg(5, 7)
            averageScore: 12.0 / 2,
            // PASS, FAIL, undefined
            statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          });
          expect(results).toContainEqual({
            evaluator: '/evaluator/genkit/context_relevancy/enum',
            testCaseCount: 3,
            errorCount: 0,
            scoreUndefinedCount: 1,
            // YES, NO
            scoreDistribution: { YES: 1, NO: 1 },
            // FAIL, PASS, undefined
            statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          });
          expect(results).toContainEqual({
            evaluator: '/evaluator/genkit/faithfulness/numeric',
            testCaseCount: 3,
            errorCount: 1,
            scoreUndefinedCount: 1,
            // avg(5, 7)
            averageScore: 12.0 / 2,
            // PASS, FAIL, undefined
            statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          });
          expect(results).toContainEqual({
            evaluator: '/evaluator/genkit/faithfulness/enum',
            testCaseCount: 3,
            errorCount: 0,
            scoreUndefinedCount: 1,
            // YES, NO
            scoreDistribution: { YES: 1, NO: 1 },
            // FAIL, PASS, undefined
            statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
          });
        });
      });
    });
  });
});

/**
 * Builds a new evaluator-to-responses map by passing every response through
 * `fn`. The input map is not modified.
 *
 * @param scoresMap Responses keyed by evaluator name.
 * @param fn Called with each response, its index within that evaluator's
 *     list, and the evaluator name; its return value replaces the response.
 * @returns A fresh map with the same keys and the transformed responses.
 */
function reMapScores(
  scoresMap: Record<string, EvalResponse>,
  fn: (
    score: EvalFnResponse,
    index: number,
    evaluator?: string
  ) => EvalFnResponse
): Record<string, EvalResponse> {
  const remapped: Record<string, EvalResponse> = {};

  for (const [evaluator, scores] of Object.entries(scoresMap)) {
    remapped[evaluator] = scores.map((score, index) =>
      fn(score, index, evaluator)
    );
  }

  return remapped;
}

/**
 * Builds a new evaluator-to-responses map in which each response keeps its
 * `testCaseId` but has its `evaluation` replaced by a mocked value.
 *
 * @param scoresMap Responses keyed by evaluator name; only `testCaseId` is
 *     read from each response.
 * @param mockedScores Either an array, applied by index to every evaluator
 *     (each entry is shallow-copied), or a record keyed by evaluator name
 *     whose arrays are applied by index to that evaluator (entries are used
 *     as-is, not copied).
 * @returns A fresh map with the same keys and the mocked evaluations.
 */
function mockScores(
  scoresMap: Record<string, EvalResponse>,
  mockedScores: any[] | Record<string, any[]>
): Record<string, EvalResponse> {
  const remapped: Record<string, EvalResponse> = {};

  for (const [evaluator, scores] of Object.entries(scoresMap)) {
    remapped[evaluator] = scores.map((score, index) => {
      const evaluation = Array.isArray(mockedScores)
        ? { ...mockedScores[index] }
        : mockedScores[evaluator][index];
      return {
        testCaseId: score.testCaseId,
        evaluation,
      };
    });
  }

  return remapped;
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/firebase/genkit'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.