// files.eval.ts • 2.68 kB
import { assert, expect } from 'vitest'
import { describeEval } from 'vitest-evals'
import { z } from 'zod'
import { runTask } from '@repo/eval-tools/src/runTask'
import { checkFactuality } from '@repo/eval-tools/src/scorers'
import { eachModel } from '@repo/eval-tools/src/test-models'
import { initializeClient } from './utils'
eachModel('$modelName', ({ model }) => {
	/**
	 * Eval: asks the model to write `test.txt` containing "asdf", then reads the
	 * file back through the `container_file_read` tool and checks its content.
	 *
	 * The `assert`/`expect` calls in `task` are hard checks; the returned prompt
	 * output is additionally scored by `checkFactuality` against `expected`
	 * with a threshold of 1.
	 */
	describeEval('Runs container file write', {
		data: async () => [
			{
				input: 'write a file named test.txt containing the text "asdf"',
				expected: 'The container_file_write tool was called and the file\'s content is "asdf"',
			},
		],
		task: async (input) => {
			const client = await initializeClient()
			const { promptOutput } = await runTask(client, model, input)

			// Look up the read tool by name; the predicate returns a plain boolean so
			// every code path of the callback yields a value.
			const fileRead = client.listTools().find((tool) => tool.name === 'container_file_read')
			assert(fileRead !== undefined, 'container_file_read tool was not found in client.listTools()')

			// Read the file back using the tool descriptor returned by listTools(),
			// overriding only the call arguments.
			const result = await client.callTool(
				{
					...fileRead,
					arguments: {
						args: { path: 'file://test.txt' },
					},
				},
				// NOTE(review): `as any` sidesteps the result-schema parameter type —
				// confirm whether a properly typed schema can be passed instead.
				z.any() as any,
				{}
			)

			// The read result must be exactly one text resource holding "asdf".
			expect(result.content).toStrictEqual([
				{
					type: 'resource',
					resource: {
						uri: 'file://test.txt',
						mimeType: 'text/plain',
						text: 'asdf',
					},
				},
			])
			return promptOutput
		},
		scorers: [checkFactuality],
		threshold: 1,
		timeout: 60000, // presumably milliseconds — confirm against vitest-evals
	})
	/**
	 * Eval: asks the model to write `test.txt` and then delete it, and checks that
	 * the recorded tool calls contain both a `container_file_write` call and a
	 * `container_file_delete` call whose nested `args.path` is the same.
	 *
	 * NOTE(review): the hard checks below do not assert the relative order of the
	 * two calls; the "and then" part of `expected` is left to `checkFactuality`.
	 */
	describeEval('Runs container file delete', {
		data: async () => [
			{
				input: 'write a file named test.txt, then delete it',
				expected:
					'The container_file_write tool was called and then the container_file_delete tool was called with the same parameters',
			},
		],
		task: async (input) => {
			const client = await initializeClient()
			const { promptOutput, toolCalls } = await runTask(client, model, input)

			// Use the first write call as the source of truth for the path the model
			// chose; the predicate returns a plain boolean.
			const writeCall = toolCalls.find((call) => call.toolName === 'container_file_write')
			const toolArgs = writeCall?.args as { args: { path: string } } | undefined
			assert(toolArgs !== undefined, 'no container_file_write tool call was recorded')

			// The cast above is unchecked, so verify the shape before relying on it;
			// this turns a would-be TypeError into a readable assertion failure.
			const writePath: unknown = toolArgs.args?.path
			assert(
				typeof writePath === 'string',
				'container_file_write tool call has no string args.path'
			)

			// Both the write and the delete call must target the same path.
			for (const toolName of ['container_file_write', 'container_file_delete']) {
				expect(toolCalls).toEqual(
					expect.arrayContaining([
						expect.objectContaining({
							type: 'tool-call',
							toolName,
							args: {
								args: expect.objectContaining({
									path: writePath,
								}),
							},
						}),
					])
				)
			}
			return promptOutput
		},
		scorers: [checkFactuality],
		threshold: 1,
		timeout: 60000, // presumably milliseconds — confirm against vitest-evals
	})
})