// exec.eval.ts
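// Eval: given the prompt below, the model should write a hello-world Python
// script via container_file_write and then execute it via container_exec.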
import { expect } from 'vitest'
import { describeEval } from 'vitest-evals'
import { runTask } from '@repo/eval-tools/src/runTask'
import { checkFactuality } from '@repo/eval-tools/src/scorers'
import { eachModel } from '@repo/eval-tools/src/test-models'
import { initializeClient } from './utils'
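
// Run the same eval once per model under test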
eachModel('$modelName', ({ model }) => {
  describeEval('Runs a python file in a container', {
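    // Each case pairs a user prompt with a natural-language rubric for the scorer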
    data: async () => [
      {
        input: 'Create a hello world python script and run it',
        expected: `The container_file_write tool was called with a path ending in .py. \
Then the container_exec tool was called with python or python3 as one of the arguments`,
      },
    ],
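    // Run the prompt against the MCP client and assert on the recorded tool calls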
    task: async (input) => {
      const client = await initializeClient()
      const { promptOutput, toolCalls } = await runTask(client, model, input)
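      // The model must have executed the script with python/python3 via container_exec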
      expect(toolCalls).toEqual(
        expect.arrayContaining([
          expect.objectContaining({
            type: 'tool-call',
            toolName: 'container_exec',
            args: expect.objectContaining({
              args: expect.stringContaining('python'),
            }),
          }),
        ])
      )
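      // ...and must also have written the script itself via container_file_write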
      expect(toolCalls).toEqual(
        expect.arrayContaining([
          expect.objectContaining({
            type: 'tool-call',
            toolName: 'container_file_write',
            args: expect.objectContaining({
              path: expect.stringContaining('.py'),
            }),
          }),
        ])
      )
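      // Return the model's final answer so the scorers can grade it against `expected`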
      return promptOutput
    },
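    // checkFactuality grades the returned output against `expected`; a threshold
    // of 1 means the eval only passes on a perfect score, within a 60 s timeout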
    scorers: [checkFactuality],
    threshold: 1,
    timeout: 60000,
  })
})