parser_test.ts•21.1 kB
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { describe, expect, it } from '@jest/globals';
import {
enrichResultsWithScoring,
extractMetricSummaries,
} from '../../src/eval';
import {
EvalMetricSchema,
EvalStatusEnum,
type EvalResult,
} from '../../src/types/eval';
import type { EvalFnResponse, EvalResponse } from '../../src/types/evaluator';
describe('parser', () => {
const evalRunResults: EvalResult[] = [
{
testCaseId: 'case1',
input: 'Who is bob best friend?',
output: 'Patrick',
context: [
"Bob's best friend is Patrick.",
'Bob has a friend named Patrick.',
'Bob has a friend named Sandy.',
],
metrics: [],
traceIds: ['trace2'],
},
{
testCaseId: 'case2',
input: 'How many friends does Bob have?',
output: '2',
context: [
"Bob's best friend is Patrick.",
'Bob has a friend named Patrick.',
'Bob has a friend named Sandy.',
],
metrics: [],
traceIds: ['trace2'],
},
];
const evaluatorOutput: Record<string, EvalResponse> = {
'/evaluator/genkit/faithfulness': [
{
testCaseId: 'case1',
sampleIndex: 0,
evaluation: {
score: 1,
details: {
reasoning: 'It looks good to me!',
},
},
},
{
testCaseId: 'case2',
sampleIndex: 1,
evaluation: {
score: 1,
details: {
reasoning: 'I thought the LLM did a very nice job',
},
},
},
],
'/evaluator/genkit/context_relevancy': [
{
testCaseId: 'case1',
sampleIndex: 0,
evaluation: {
score: 1,
details: {
reasoning: 'Context was utilized.',
},
},
},
{
testCaseId: 'case2',
sampleIndex: 0,
evaluation: {
score: 1,
details: {
reasoning: 'Context was utilized.',
},
},
},
],
};
describe('enrichResultsWithScoring', () => {
it('Adds scoring data to eval results', () => {
const results = enrichResultsWithScoring(evaluatorOutput, evalRunResults);
expect(results).toHaveLength(2);
results.forEach((result) => {
expect(result.metrics).toMatchObject([
EvalMetricSchema.parse({
evaluator: '/evaluator/genkit/faithfulness',
score: 1,
}),
EvalMetricSchema.parse({
evaluator: '/evaluator/genkit/context_relevancy',
score: 1,
}),
]);
expect(result.traceIds).toHaveLength(1);
});
});
});
describe('extractMetricSummaries', () => {
const simpleEvalOutput: Record<string, EvalResponse> = {
'/evaluator/genkit/context_relevancy': [
{
testCaseId: 'case1',
evaluation: {
score: 7,
},
},
{
testCaseId: 'case2',
evaluation: {
score: 10,
},
},
{
testCaseId: 'case3',
evaluation: {
score: 5,
},
},
],
};
describe('simpler scenarios', () => {
it('mean for simple numeric scores', () => {
const results = extractMetricSummaries(simpleEvalOutput);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 0,
statusDistribution: { undefined: 3 },
// 7 + 10 + 5
averageScore: 22.0 / 3,
});
});
it('scoreDistribution for simple boolean scores', () => {
const booleanScores = mockScores(simpleEvalOutput, [
{
score: true,
},
{ score: false },
{ score: true },
]);
const results = extractMetricSummaries(booleanScores);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 0,
statusDistribution: { undefined: 3 },
// True, False, True
scoreDistribution: { true: 2, false: 1 },
});
});
it('scoreDistribution for simple string scores (under 5)', () => {
const stringScores = mockScores(simpleEvalOutput, [
{ score: 'TYPE_0' },
{ score: 'TYPE_1' },
{ score: 'TYPE_0' },
]);
const results = extractMetricSummaries(stringScores);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 0,
statusDistribution: { undefined: 3 },
// TYPE_0, TYPE_1, TYPE_0
scoreDistribution: { TYPE_0: 2, TYPE_1: 1 },
});
});
it('scoreDistribution for simple string scores (over 5)', () => {
const extendedSimpleEvalOutput: Record<string, EvalResponse> = {};
// 2x the simpleEvalOutput to get 6 samples.
extendedSimpleEvalOutput['/evaluator/genkit/context_relevancy'] = Array(
2
)
.fill(simpleEvalOutput['/evaluator/genkit/context_relevancy'])
.flat();
const stringScores = mockScores(extendedSimpleEvalOutput, [
{ score: 'TYPE_0' },
{ score: 'TYPE_1' },
{ score: 'TYPE_2' },
{ score: 'TYPE_3' },
{ score: 'TYPE_4' },
{ score: 'TYPE_5' },
]);
const results = extractMetricSummaries(stringScores);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 6,
errorCount: 0,
scoreUndefinedCount: 0,
statusDistribution: { undefined: 6 },
});
});
it('status distribution for simple numeric scores', () => {
const scores = [
{
score: 0,
status: EvalStatusEnum.PASS,
},
{
score: 1,
status: EvalStatusEnum.FAIL,
},
{
score: 2,
},
];
const withStatus = mockScores(simpleEvalOutput, scores);
const results = extractMetricSummaries(withStatus);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 0,
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
// avg(0, 1, 2)
averageScore: 3.0 / 3,
});
});
});
describe('edge cases', () => {
it('metrics if scores are undefined but status available', () => {
const scores = [
{
status: EvalStatusEnum.PASS,
},
{
status: EvalStatusEnum.FAIL,
},
{},
];
const undefinedScores = mockScores(simpleEvalOutput, scores);
const results = extractMetricSummaries(undefinedScores);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 3,
// PASS, FAIL, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
});
it('metrics if some scores are undefined with status available', () => {
const scores = [
{
score: 0,
status: EvalStatusEnum.PASS,
},
{
score: 1,
status: EvalStatusEnum.FAIL,
},
{},
];
const someDefinedScores = mockScores(simpleEvalOutput, scores);
const results = extractMetricSummaries(someDefinedScores);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 1,
// avg(0, 1)
averageScore: 1 / 2.0,
// PASS, FAIL, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
});
it('metrics if some scores are undefined, some errors and with status available', () => {
const scores = [
{
score: undefined,
error: 'some error',
status: EvalStatusEnum.PASS,
},
{
score: 1,
status: EvalStatusEnum.FAIL,
},
{ error: 'some error' },
];
const someDefinedScores = mockScores(simpleEvalOutput, scores);
const results = extractMetricSummaries(someDefinedScores);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 2,
scoreUndefinedCount: 2,
// avg(1)
averageScore: 1.0,
// PASS, FAIL, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
});
});
describe('multiple evaluators grouped', () => {
const multiOutput: Record<string, EvalResponse> = {
'/evaluator/genkit/faithfulness': [
{
testCaseId: 'case1',
evaluation: {
score: 7,
},
},
{
testCaseId: 'case2',
evaluation: {
score: 10,
},
},
{
testCaseId: 'case3',
evaluation: {
score: 5,
},
},
],
'/evaluator/genkit/context_relevancy': [
{
testCaseId: 'case1',
evaluation: {
score: true,
},
},
{
testCaseId: 'case2',
evaluation: {
score: false,
},
},
{
testCaseId: 'case3',
evaluation: {
score: true,
},
},
],
};
it('treats each evaluator separately', () => {
const results = extractMetricSummaries(multiOutput);
expect(results).toHaveLength(2);
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 0,
statusDistribution: { undefined: 3 },
// true, false, true
scoreDistribution: { true: 2, false: 1 },
});
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/faithfulness',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 0,
statusDistribution: { undefined: 3 },
// avg(7, 10, 5)
averageScore: 22.0 / 3,
});
});
it('treats each evaluator separately, with errors, status, undefined scores', () => {
const mockFaithfulness = [
{
status: EvalStatusEnum.PASS,
error: 'some error',
},
{
score: 10,
status: EvalStatusEnum.FAIL,
},
{
error: 'some error',
},
];
const mockContextRel = [
{
score: 'alpha',
status: EvalStatusEnum.PASS,
},
{
status: EvalStatusEnum.FAIL,
error: 'some error',
},
{
score: 'gamma',
},
];
const someDefinedScores = reMapScores(
multiOutput,
(response, i, evaluator) => {
if (evaluator === '/evaluator/genkit/faithfulness') {
return {
testCaseId: response.testCaseId,
evaluation: mockFaithfulness[i],
};
} else {
return {
testCaseId: response.testCaseId,
evaluation: mockContextRel[i],
};
}
}
);
const results = extractMetricSummaries(someDefinedScores);
expect(results).toHaveLength(2);
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/faithfulness',
testCaseCount: 3,
errorCount: 2,
scoreUndefinedCount: 2,
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
// avg(10)
averageScore: 10.0,
});
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 1,
scoreUndefinedCount: 1,
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
// alpha, gamma
scoreDistribution: { alpha: 1, gamma: 1 },
});
});
describe('multi-scores', () => {
it('mix of scores', () => {
const mockEvaluations = [
{
score: 1,
status: EvalStatusEnum.PASS,
},
[
{
score: 1,
status: EvalStatusEnum.FAIL,
},
{
score: 2,
status: EvalStatusEnum.PASS,
},
],
{
score: undefined,
},
];
const mixedScores = reMapScores(simpleEvalOutput, (response, i) => ({
testCaseId: response.testCaseId,
evaluation: mockEvaluations[i],
}));
const results = extractMetricSummaries(mixedScores);
expect(results).toHaveLength(1);
const result = results[0];
expect(result).toEqual({
evaluator: '/evaluator/genkit/context_relevancy',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 1,
// avg(1, 1, 2)
averageScore: 4.0 / 3,
// PASS, FAIL, PASS, undefined
statusDistribution: { undefined: 1, PASS: 2, FAIL: 1 },
});
});
it('scores with IDs', () => {
const mockEvaluations = [
[
{
score: 5,
id: 'numeric',
status: EvalStatusEnum.PASS,
},
{
score: 'YES',
id: 'enum',
status: EvalStatusEnum.FAIL,
},
],
[
{
score: 7,
id: 'numeric',
status: EvalStatusEnum.FAIL,
},
{
score: 'NO',
id: 'enum',
status: EvalStatusEnum.PASS,
},
],
[
{
score: undefined,
id: 'numeric',
error: 'somer error',
},
{
score: undefined,
id: 'enum',
},
],
];
const mixedScores = reMapScores(simpleEvalOutput, (response, i) => ({
testCaseId: response.testCaseId,
evaluation: mockEvaluations[i],
}));
const results = extractMetricSummaries(mixedScores);
expect(results).toHaveLength(2);
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/context_relevancy/numeric',
testCaseCount: 3,
errorCount: 1,
scoreUndefinedCount: 1,
// avg(5, 7)
averageScore: 12.0 / 2,
// PASS, FAIL, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/context_relevancy/enum',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 1,
// YES, NO
scoreDistribution: { YES: 1, NO: 1 },
// FAIL, PASS, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
});
it('multi-scores with IDs', () => {
const mockEvaluations = [
[
{
score: 5,
id: 'numeric',
status: EvalStatusEnum.PASS,
},
{
score: 'YES',
id: 'enum',
status: EvalStatusEnum.FAIL,
},
],
[
{
score: 7,
id: 'numeric',
status: EvalStatusEnum.FAIL,
},
{
score: 'NO',
id: 'enum',
status: EvalStatusEnum.PASS,
},
],
[
{
score: undefined,
id: 'numeric',
error: 'somer error',
},
{
score: undefined,
id: 'enum',
},
],
];
const mixedScores = reMapScores(multiOutput, (response, i) => ({
testCaseId: response.testCaseId,
evaluation: mockEvaluations[i],
}));
const results = extractMetricSummaries(mixedScores);
expect(results).toHaveLength(4);
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/context_relevancy/numeric',
testCaseCount: 3,
errorCount: 1,
scoreUndefinedCount: 1,
// avg(5, 7)
averageScore: 12.0 / 2,
// PASS, FAIL, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/context_relevancy/enum',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 1,
// YES, NO
scoreDistribution: { YES: 1, NO: 1 },
// FAIL, PASS, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/faithfulness/numeric',
testCaseCount: 3,
errorCount: 1,
scoreUndefinedCount: 1,
// avg(5, 7)
averageScore: 12.0 / 2,
// PASS, FAIL, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
expect(results).toContainEqual({
evaluator: '/evaluator/genkit/faithfulness/enum',
testCaseCount: 3,
errorCount: 0,
scoreUndefinedCount: 1,
// YES, NO
scoreDistribution: { YES: 1, NO: 1 },
// FAIL, PASS, undefined
statusDistribution: { undefined: 1, PASS: 1, FAIL: 1 },
});
});
});
});
});
});
function reMapScores(
scoresMap: Record<string, EvalResponse>,
fn: (
score: EvalFnResponse,
index: number,
evaluator?: string
) => EvalFnResponse
): Record<string, EvalResponse> {
const remapped: Record<string, EvalResponse> = {};
for (const [evaluator, scores] of Object.entries(scoresMap)) {
remapped[evaluator] = scores.map((score, index) =>
fn(score, index, evaluator)
);
}
return remapped;
}
function mockScores(
scoresMap: Record<string, EvalResponse>,
mockedScores: any[] | Record<string, any[]>
): Record<string, EvalResponse> {
const remapped: Record<string, EvalResponse> = {};
for (const [evaluator, scores] of Object.entries(scoresMap)) {
remapped[evaluator] = scores.map((score, index) => {
const evaluation = Array.isArray(mockedScores)
? { ...mockedScores[index] }
: mockedScores[evaluator][index];
return {
testCaseId: score.testCaseId,
evaluation,
};
});
}
return remapped;
}