tool-eval-config.json•66.6 kB
{
"integrations": [
{
"id": "clickup",
"name": "ClickUp",
"urlHost": "https://api.clickup.com",
"urlPath": "/api",
"documentationUrl": "https://developer.clickup.com/reference/getauthorizeduser",
"openApiUrl": "https://developer.clickup.com/openapi/clickup-api-v2-reference.json",
"credentials": {
"api_token": ""
},
"description": "Project management Software",
"keywords": []
},
{
"id": "stripe",
"name": "Stripe",
"urlHost": "https://api.stripe.com",
"urlPath": "/v1",
"documentationUrl": "https://stripe.com/docs/api",
"openApiUrl": "https://raw.githubusercontent.com/stripe/openapi/master/openapi/spec3.json",
"credentials": {
"api_key": ""
},
"description": "Payment processing and subscription management",
"keywords": [
"customers",
"charges",
"payment_intents",
"products",
"prices",
"subscriptions",
"invoices",
"balance_transactions",
"refunds",
"checkout_sessions",
"line_items",
"payment_methods",
"issuers",
"plans",
"setup_intents",
"payouts",
"transfers",
"balance",
"users",
"emails"
]
},
{
"id": "airtable",
"name": "Airtable",
"urlHost": "https://api.airtable.com",
"urlPath": "/v0",
"documentationUrl": "https://airtable.com/developers/web/api/introduction",
"credentials": {
"api_key": ""
},
"description": "Airtable API",
"keywords": [
"bases",
"tables",
"records",
"fields",
"views",
"formulas",
"attachments",
"comments",
"collaborators",
"metadata",
"schemas",
"api key",
"key"
]
},
{
"id": "clockify",
"name": "Clockify",
"urlHost": "https://api.clockify.me",
"urlPath": "/api/v1",
"documentationUrl": "https://docs.clockify.me/",
"credentials": {
"api_key": ""
},
"description": "Clockify API",
"keywords": [
"time entries",
"projects",
"workspaces",
"users",
"tags",
"clients",
"estimates",
"invoices",
"payments"
]
},
{
"id": "typeform",
"name": "Typeform",
"urlHost": "https://api.typeform.com/",
"urlPath": "",
"documentationUrl": "https://www.typeform.com/developers/get-started/",
"credentials": {
"personal_access_token": ""
},
"description": "Typeform API",
"keywords": [
"forms",
"responses",
"questions",
"fields",
"themes",
"images",
"workspaces",
"logic jumps",
"hidden fields",
"variables",
"calculations",
"insights",
"reports",
"oauth"
]
},
{
"id": "confluence",
"name": "Confluence",
"urlHost": "https://superglue-evals.atlassian.net",
"urlPath": "/wiki/rest/api",
"documentationUrl": "https://docs.atlassian.com/atlassian-confluence/REST/6.6.0/",
"openApiUrl": "https://developer.atlassian.com/cloud/confluence/swagger.json",
"credentials": {
"email": "",
"api_token": ""
},
"description": "Confluence API",
"keywords": [
"spaces",
"pages",
"content",
"attachments",
"comments",
"labels",
"templates",
"blueprints",
"macros",
"restrictions",
"versions",
"ancestors",
"descendants",
"children",
"oauth"
]
},
{
"id": "github",
"name": "GitHub",
"urlHost": "https://api.github.com",
"urlPath": "",
"documentationUrl": "https://docs.github.com/en/rest",
"openApiUrl": "https://raw.githubusercontent.com/github/rest-api-description/main/descriptions/api.github.com/api.github.com.json",
"credentials": {
"api_token": ""
},
"description": "Github API",
"keywords": [
"repositories",
"issues",
"pull_requests",
"commits",
"branches",
"tags",
"releases",
"deployments",
"check_runs",
"actions_artifacts",
"organizations",
"packages",
"collaborators",
"gists",
"milestones"
]
},
{
"id": "linear",
"name": "Linear",
"urlHost": "https://api.linear.app/graphql",
"urlPath": "",
"documentationUrl": "https://linear.app/developers",
"credentials": {
"api_key": ""
},
"description": "Linear API",
"keywords": [
"issues",
"projects",
"cycles",
"teams",
"users",
"comments",
"labels",
"milestones",
"roadmaps",
"workflows",
"states",
"graphql",
"mutations",
"queries",
"api key"
]
},
{
"id": "jira",
"name": "JIRA",
"urlHost": "https://<<jira_organization>>.atlassian.net",
"urlPath": "rest/api/3",
"documentationUrl": "https://developer.atlassian.com/cloud/jira/platform/rest/v3",
"openApiUrl": "https://developer.atlassian.com/cloud/jira/platform/swagger-v3.json",
"credentials": {
"email": "",
"api_token": "",
"organization": ""
},
"description": "JIRA project management API",
"keywords": [
"issues",
"projects",
"boards",
"sprints",
"epics",
"users",
"workflows",
"fields",
"components",
"versions",
"priorities",
"statuses",
"comments",
"attachments",
"jql",
"query",
"search",
"oauth"
]
},
{
"id": "trello",
"name": "Trello",
"urlHost": "https://api.trello.com",
"urlPath": "",
"documentationUrl": "https://developer.atlassian.com/cloud/trello/rest",
"openApiUrl": "https://developer.atlassian.com/cloud/trello/swagger.v3.json",
"icon": "trello",
"credentials": {
"api_key": "",
"api_token": ""
},
"description": "Trello API",
"keywords": [
"boards",
"lists",
"cards",
"members",
"labels",
"checklists",
"attachments",
"comments",
"actions",
"organizations",
"teams",
"power-ups",
"custom fields",
"stickers",
"api key"
]
},
{
"id": "hubspot",
"name": "HubSpot",
"urlHost": "https://api.hubapi.com",
"urlPath": "/crm/v3",
"documentationUrl": "https://developers.hubspot.com/docs/api/overview",
"openApiUrl": "https://api.hubspot.com/public/api/spec/v1/specs",
"credentials": {
"private_app_token": ""
},
"description": "Customer relationship management API",
"keywords": [
"contacts",
"companies",
"deals",
"tickets",
"line_items",
"products",
"associations",
"memberships"
]
},
{
"id": "attio",
"name": "Attio",
"urlHost": "https://api.attio.com/v2",
"urlPath": "",
"documentationUrl": "https://docs.attio.com/rest-api/overview",
"openApiUrl": "https://api.attio.com/openapi/api",
"credentials": {
"api_token": ""
},
"description": "Modern CRM with OpenAPI specification",
"keywords": [
"people",
"objects",
"records",
"lists",
"entries",
"workspace_members",
"notes",
"tasks",
"threads",
"comments",
"sorts",
"api_slug",
"attribute_type",
"record_id",
"workspace_id",
"object_id"
]
},
{
"id": "slack",
"name": "Slack",
"urlHost": "https://api.slack.com",
"urlPath": "",
"documentationUrl": "https://api.slack.com/docs",
"openApiUrl": "https://raw.githubusercontent.com/slackapi/slack-api-specs/master/web-api/slack_web_openapi_v2.json",
"credentials": {
"bot_token": ""
},
"description": "Slack API",
"keywords": [
"channel",
"conversation",
"user",
"file",
"event",
"message",
"workflow_step",
"workflow_published",
"workflow_step_execute",
"usergroup",
"im",
"mpim",
"group",
"check_run",
"apps_permissions_resource"
]
},
{
"id": "postgres-lego",
"name": "LEGO Database",
"urlHost": "postgres://<<postgres-lego_username>>:<<postgres-lego_password>>@<<postgres-lego_host>>:<<postgres-lego_port>>",
"urlPath": "<<postgres-lego_database>>",
"documentationUrl": "",
"credentials": {
"username": "",
"password": "",
"host": "",
"port": "",
"database": ""
},
"description": "PostgreSQL LEGO database for testing",
"keywords": [
"database",
"sql",
"postgres",
"postgresql",
"api key",
"tables"
]
},
{
"id": "eval-postgres",
"name": "Eval PostgreSQL Test Database",
"urlHost": "postgres://<<eval-postgres_username>>:<<eval-postgres_password>>@<<eval-postgres_host>>:<<eval-postgres_port>>",
"urlPath": "<<eval-postgres_database>>",
"documentationUrl": "",
"credentials": {
"username": "",
"password": "",
"host": "",
"port": "",
"database": ""
},
"description": "PostgreSQL test database for evaluations",
"keywords": [
"database",
"sql",
"postgres",
"postgresql",
"test"
]
},
{
"id": "eval-sftp",
"name": "Eval SFTP Test Server",
"urlHost": "sftp://<<eval-sftp_username>>:<<eval-sftp_password>>@<<eval-sftp_host>>:<<eval-sftp_port>>",
"urlPath": "/",
"documentationUrl": "",
"credentials": {
"username": "",
"password": "",
"host": "",
"port": "",
"home": "",
"uploads_dir": ""
},
"description": "SFTP test server for file operations evaluations",
"keywords": [
"sftp",
"ftp",
"file",
"upload",
"download",
"test"
]
},
{
"id": "timbuk2-shopify",
"name": "Timbuk2-shopify",
"urlHost": "https://www.timbuk2.com",
"urlPath": "/products.json",
"documentationUrl": "https://shopify.dev/docs/api/ajax/reference/product",
"credentials": {},
"description": "Public Shopify API demo",
"keywords": [
"products",
"variants",
"collections",
"customers",
"orders",
"fulfillments",
"inventory_items",
"inventory_levels",
"metafields",
"price_rules",
"discount_codes",
"shipping_zones",
"locations",
"gift_cards",
"product_images"
]
},
{
"id": "asana",
"name": "Asana",
"urlHost": "https://app.asana.com/api",
"urlPath": "",
"documentationUrl": "https://developers.asana.com/docs",
"openApiUrl": "https://raw.githubusercontent.com/Asana/openapi/master/defs/asana_oas.yaml",
"credentials": {
"personal_access_token": ""
},
"description": "Asana API",
"keywords": [
"tasks",
"projects",
"workspaces",
"teams",
"portfolios",
"goals",
"sections",
"tags",
"custom fields",
"stories",
"attachments",
"followers",
"assignee",
"due dates",
"query",
"search",
"api key"
]
},
{
"id": "openai",
"name": "OpenAI",
"urlHost": "https://api.openai.com",
"urlPath": "",
"documentationUrl": "https://platform.openai.com/docs/api-reference/introduction",
"openApiUrl": "https://app.stainless.com/api/spec/documented/openai/openapi.documented.yml",
"credentials": {
"api_key": ""
},
"keywords": [
"completions",
"chat",
"models",
"embeddings",
"images",
"audio",
"files",
"fine-tuning",
"assistants",
"threads",
"messages",
"runs",
"moderation",
"usage",
"api key"
]
},
{
"id": "tele2-http",
"name": "Tele2 HTTP and FTP Speedtest",
"urlHost": "http://speedtest.tele2.net",
"urlPath": "",
"documentationUrl": "http://speedtest.tele2.net/",
"credentials": {},
"keywords": [
"ftp",
"speedtest",
"download",
"upload",
"files",
"performance",
"bandwidth"
]
},
{
"id": "berkshire-hathaway-letter",
"name": "Berkshire Hathaway Letter",
"urlHost": "https://www.berkshirehathaway.com",
"urlPath": "/letters/2024ltr.pdf",
"documentationUrl": "https://www.berkshirehathaway.com/letters/2024ltr.pdf",
"credentials": {},
"description": "Berkshire Hathaway 2024 shareholder letter",
"keywords": [
"letter",
"shareholder",
"letter",
"shareholder",
"pdf"
]
},
{
"id": "sec-gov",
"name": "SEC EDGAR",
"urlHost": "https://www.sec.gov",
"urlPath": "/cgi-bin/browse-edgar",
"documentationUrl": "https://www.sec.gov/edgar/searchedgar/companysearch.html",
"credentials": {},
"description": "U.S. Securities and Exchange Commission EDGAR database",
"keywords": [
"filings",
"companies",
"forms",
"10-K",
"10-Q",
"8-K",
"CIK",
"submissions",
"securities",
"financial reports",
"public companies"
]
},
{
"id": "jina-reader",
"name": "Jina Reader API",
"urlHost": "https://r.jina.ai",
"urlPath": "",
"documentationUrl": "https://jina.ai/reader",
"credentials": {},
"description": "Jina AI Reader API for converting web pages to clean markdown",
"keywords": [
"reader",
"markdown",
"web scraping",
"html to markdown",
"content extraction",
"web crawler",
"clean text",
"article extraction"
]
},
{
"id": "open-brewery-db",
"name": "Open Brewery DB",
"urlHost": "https://api.openbrewerydb.org",
"urlPath": "/v1",
"documentationUrl": "https://www.openbrewerydb.org/documentation",
"credentials": {},
"description": "Public API for brewery data across the United States",
"keywords": [
"breweries",
"brewery",
"beer",
"state",
"city",
"search",
"pagination"
]
},
{
"id": "openalex",
"name": "OpenAlex",
"urlHost": "https://api.openalex.org",
"urlPath": "",
"documentationUrl": "https://docs.openalex.org",
"credentials": {},
"description": "Open catalog of scholarly papers, authors, institutions, and more",
"keywords": [
"works",
"authors",
"institutions",
"publications",
"research",
"papers"
]
},
{
"id": "punkapi",
"name": "PunkAPI",
"urlHost": "https://punkapi.online",
"urlPath": "/v3",
"documentationUrl": "https://publicapis.io/punk-api",
"credentials": {},
"description": "BrewDog's DIY Dog beer catalogue API with detailed recipes and information",
"keywords": [
"beer",
"brewery",
"brewdog",
"recipe",
"hops",
"malt",
"ingredients",
"abv",
"ibu"
]
}
],
"tools": [
{
"id": "001-clickup-task-list",
"name": "ClickUp Task List",
"type": "retrieval",
"instruction": "Get all tasks from the ClickUp task list. I want the final output to be a JSON object with the following structure: { \"tasks\": [ { \"name\": \"Task 1\", \"description\": \"Task 1 description\" } ]",
"integrationIds": ["clickup"],
"payload": {
"listId": "901516361522"
},
"validationFunction": "validators/001-clickup-task-list.ts",
"expectedResultDescription": "Should return tasks from a give list ID. Expected to find at least one task named 'First Task' with a description starting with 'Lorem ipsum dolor sit amet'."
},
{
"id": "002-clickup-all-lists",
"name": "ClickUp All Lists",
"type": "retrieval",
"instruction": "Get all lists of all workspaces in ClickUp. Including Lists located in Folders and in Spaces. Return an object of this shape: { lists: Array<{ id: string, name: string, content: string, due_date: string | null, start_date: string | null }> }. Dates should have the format YYYY-MM-DD.",
"integrationIds": ["clickup"],
"payload": {},
"validationFunction": "validators/002-clickup-all-lists.ts",
"expectedResultDescription": "Should return all lists from all workspaces (including Folders and Spaces). Expected to find 3 lists: 'Projekt 1' (with due_date 2025-11-06 and start_date 2025-10-15), 'Projekt 2' (no dates), and 'Test' (no dates). List IDs should be like '901516249723', '901516249722', '901516361522'."
},
{
"id": "003-clickup-all-comments-and-replies",
"name": "ClickUp All Comments and Replies",
"type": "retrieval",
"instruction": "Get all comments and and for each comment its sub-comments for the task with the given taskId. Be aware that you need to iterate over the comments and get the sub-comments for each comment using another API call. Return an object of this shape: { comments: Array<{ id: string, authorId: string, authorName: string, text: string, parentCommentId: string | null, createdAt: string }> }. Convert the create date to YYYY-MM-DD.",
"integrationIds": ["clickup"],
"payload": {
"taskId": "86c5y6t56"
},
"validationFunction": "validators/003-clickup-all-comments-and-replies.ts",
"expectedResultDescription": "Should return all comments and sub-comments for task 86c5y6t56. Expected to find 3 comments total from author 'Max Mustermann': 'Second comment without replies' (top-level), 'One comment' (top-level), and 'Sub comment' (reply to 'One comment'). Comments dated 2025-10-14."
},
{
"id": "004-clockify-time-entries",
"name": "Clockify Time Entries",
"type": "retrieval",
"instruction": "Get all time entries for the authenticated user. I want the final output to be a JSON object with the following structure: { timeEntries: [{id: string,description: string, duration: number, // duration in MS }]}",
"integrationIds": ["clockify"],
"payload": {},
"validationFunction": "validators/004-clockify-time-entries.ts",
"expectedResultDescription": "Should return time entries for the authenticated user. Expected to find 3 entries: one 'Buchhaltung' entry (55 minutes) and two 'Beratungsgespräch' entries (each 1 hour). Durations in milliseconds."
},
{
"id": "005-clockify-projects",
"name": "Clockify Projects",
"type": "retrieval",
"instruction": "Get all projects for the authenticated user's active workspace. I want the final output to be a JSON object with the following structure: { projects: [{id: string, name: string, note: string, billable: boolean, hourlyRate?: number, hourlyRateCurrency?: string }]}. ",
"integrationIds": ["clockify"],
"payload": {},
"validationFunction": "validators/005-clockify-projects.ts",
"expectedResultDescription": "Should return projects from the user's active workspace. Expected to find 2 projects: 'Buchhaltung 2025' (non-billable, with notes) and 'Marketing Project X' (billable at $80/hour). Both in USD currency."
},
{
"id": "006-clockify-projects-all-workspaces",
"name": "Clockify Projects All Workspaces",
"type": "retrieval",
"instruction": "Get all projects the authenticated user has access to. I want the final output to be a JSON object with the following structure: { projects: [{id: string, name: string, note: string, billable: boolean, hourlyRate?: number, hourlyRateCurrency?: string }]}. ",
"integrationIds": ["clockify"],
"payload": {},
"validationFunction": "validators/006-clockify-projects-all-workspaces.ts",
"expectedResultDescription": "Should return projects from ALL workspaces the user has access to (not just active). Expected same 2 projects as active workspace: 'Buchhaltung 2025' and 'Marketing Project X'."
},
{
"id": "007-typeform-get-all-forms",
"name": "Typeform Get All Forms",
"type": "retrieval",
"instruction": "Get all forms with responses (question and answer) for the authenticated user. I want the final output to be a JSON object with the following structure: { forms: [{id: string, name: string, responses: Array<{id: string, answers: Array<{question: string, answer: string}>, submittedAt: number // unix timestamp}>, createdAt: number // unix timestamp }], total: number}. Please make sure to include the questions and answers for each response.",
"integrationIds": ["typeform"],
"payload": {},
"validationFunction": "validators/007-typeform-get-all-forms.ts",
"expectedResultDescription": "Should return all forms with their responses. Expected 2 forms total: 'Coffee Shop Applications' (with 2 responses from Peter Mustermann as Cashier and Max Mustermann as Barista, including questions about name, email, position, experience, and motivation) and 'Just empty' (with no responses)."
},
{
"id": "008-typeform-get-form-by-id",
"name": "Typeform Get Form By Id",
"type": "retrieval",
"instruction": "Get the form with the given id. I want the final output to be a JSON object with the following structure: {id: string, name: string, responses: Array<{id: string, answers: Array<{question: string, answer: string}>, submittedAt: number // unix timestamp}>, createdAt: number // unix timestamp }. Please make sure to include the questions and answers for each response.",
"integrationIds": ["typeform"],
"payload": {
"formId": "lyyDJUcC"
},
"validationFunction": "validators/008-typeform-get-form-by-id.ts",
"expectedResultDescription": "Should return the form with ID lyyDJUcC. Expected to be 'Coffee Shop Applications' form with 2 responses from Peter and Max Mustermann applying for Cashier and Barista position. Response should include answers to questions about name, position, work experience, and motivation."
},
{
"id": "009-typeform-get-all-workspaces",
"name": "Typeform Get All Workspaces",
"type": "retrieval",
"instruction": "Get all workspaces for the authenticated user. I want the final output to be a JSON object with the following structure: { workspaces: [{id: string, name: string, numberOfForms: number}] }.",
"integrationIds": ["typeform"],
"payload": {},
"validationFunction": "validators/009-typeform-get-all-workspaces.ts",
"expectedResultDescription": "Should return all workspaces for the authenticated user. Expected to find 1 workspace named 'My workspace' containing 2 forms."
},
{
"id": "010-confluence-get-all-spaces",
"name": "Confluence Get All Spaces",
"type": "retrieval",
"instruction": "Get all spaces for the authenticated user. I want the final output to be a JSON object with the following structure: { spaces: [{id: number, type: string, name: string, archived: boolean}] }. Please use /wiki/rest/api/... not rest/api/v2/...",
"integrationIds": ["confluence"],
"payload": {},
"validationFunction": "validators/010-confluence-get-all-spaces.ts",
"expectedResultDescription": "Should return all Confluence spaces for the authenticated user. Expected to find 3 spaces: 'Docs' (knowledge_base), 'Max Mustermann' (personal), and 'Projektmanagement' (onboarding, archived)."
},
{
"id": "011-confluence-get-page-content",
"name": "Confluence Get Page Content",
"type": "retrieval",
"instruction": "Get the content of the page with the given id. I want the final output to be a JSON object with the following structure: { content: string }. ",
"integrationIds": ["confluence"],
"payload": {
"pageId": "163855"
},
"validationFunction": "validators/011-confluence-get-page-content.ts",
"expectedResultDescription": "Should return the content of Confluence page for a given ID. Expected content about 'Intern Onboarding' in HTML format."
},
{
"id": "012-confluence-get-all-pages-of-space",
"name": "Confluence Get All Pages Of Space",
"type": "retrieval",
"instruction": "Get all pages for the given space id. I want the final output to be a JSON object with the following structure: { pages: [{id: string, title: string, content: string}] }. ",
"integrationIds": ["confluence"],
"payload": {
"spaceId": "294916"
},
"validationFunction": "validators/012-confluence-get-all-pages-of-space.ts",
"expectedResultDescription": "Should return all pages from Confluence for a given space ID. Expected to find 4 pages: 'Company Guidelines', 'Docs', 'Interns', and 'IT Service Desk'. Each page should contain HTML content."
},
{
"id": "013-github-list-repos",
"name": "GitHub List Repositories",
"type": "retrieval",
"instruction": "List all repositories for the authenticated user. I want the final output to be a JSON object with the following structure: { repositories: [{id: number, name: string, isPublic: boolean}] }. ",
"integrationIds": ["github"],
"payload": {},
"validationFunction": "validators/013-github-list-repos.ts",
"expectedResultDescription": "Should return all repositories for the authenticated user. Expected to find 5 repositories including: 'congenial-tribble' (private), 'expert-octo-doodle' (private), 'strapi-cloud-template-blog-4b5423dbba' (public), 'Test1' (private), and 'vigilant-octo-lamp' (private)."
},
{
"id": "014-github-get-user",
"name": "GitHub Get User",
"type": "retrieval",
"instruction": "Get the user information for the authenticated user. I want the final output to be a JSON object with the following structure: { user: {id: number, username: string, profilePictureUrl: string | null, url: string, createdAt: string } }. ",
"integrationIds": ["github"],
"payload": {},
"validationFunction": "validators/014-github-get-user.ts",
"expectedResultDescription": "Should return authenticated GitHub user information. Expected username 'Evals304' with profile picture URL from avatars.githubusercontent.com, account created on 2025-10-12."
},
{
"id": "015-github-get-user-pull-requests",
"name": "GitHub Get User's Pull Requests",
"type": "retrieval",
"instruction": "Get all pull requests for the authenticated user. I want the final output to be a JSON object with the following structure: { pullRequests: [{id: number, title, string, url: string, updatedAt: string, createdAt: string }] }. ",
"integrationIds": ["github"],
"payload": {},
"validationFunction": "validators/015-github-get-user-pull-requests.ts",
"expectedResultDescription": "Should return all pull requests for the authenticated user. Expected to find 2 PRs: 'Test something' (from congenial-tribble repo) and 'Update README.md' (from vigilant-octo-lamp repo), both created on 2025-10-18."
},
{
"id": "016-linear-get-all-issues",
"name": "Linear Get All Issues",
"type": "retrieval",
"instruction": "Get me all linear issues. I want the final output to be a JSON object with the following structure: { issues: [{id: string, title: string }] }. ",
"integrationIds": ["linear"],
"payload": {},
"validationFunction": "validators/016-linear-get-all-issues.ts",
"expectedResultDescription": "Should return all Linear issues. Expected to find 3 issues with titles: 'Third', 'Second', and 'First'."
},
{
"id": "017-linear-get-issue-by-id",
"name": "Linear Get Issue By Id",
"type": "retrieval",
"instruction": "Get the issue with the given id. I want the final output to be a JSON object with the following structure: { issue: {id: string, title: string }] }. ",
"integrationIds": ["linear"],
"payload": {
"issueId": "91d119b6-8c91-436e-9986-0198cf30cd8e"
},
"validationFunction": "validators/017-linear-get-issue-by-id.ts",
"expectedResultDescription": "Should return the Linear issue with ID 91d119b6-8c91-436e-9986-0198cf30cd8e. Expected title to be 'Third'."
},
{
"id": "018-linear-get-all-projects",
"name": "Linear Get All Projects",
"type": "retrieval",
"instruction": "Get all projects. I want the final output to be a JSON object with the following structure: { projects: [{id: string, name: string }] }. ",
"integrationIds": ["linear"],
"payload": {},
"validationFunction": "validators/018-linear-get-all-projects.ts",
"expectedResultDescription": "Should return all Linear projects. Expected to find 2 projects named 'Empty 2' and 'Empty1'."
},
{
"id": "019-postgres-lego-inventory-analysis",
"name": "Postgres LEGO Inventory Analysis",
"type": "retrieval",
"instruction":"Use the payload to query a specific section of my lego themes list. Return the output in this structure: { themes: [{id: string, name: string}]}",
"integrationIds": ["postgres-lego"],
"payload": {
"page": 2,
"limit": 10
},
"validationFunction": "validators/019-postgres-lego-inventory-analysis.ts",
"expectedResultDescription": "Should return page 2 of LEGO themes (10 items per page). Expected themes from IDs 11-20 and no more."
},
{
"id": "020-postgres-lego-get-colors",
"name": "Postgres LEGO Get Colors",
"type": "retrieval",
"instruction": "Get all colors from the LEGO database. I want the final output to be a JSON object with the following structure: { colors: string[] }. ",
"integrationIds": ["postgres-lego"],
"payload": {},
"validationFunction": "validators/020-postgres-lego-get-colors.ts",
"expectedResultDescription": "Should return all LEGO colors from the database. Expected over 100 colors including basic colors like 'Black', 'Blue', 'Green', 'Red', 'White', 'Yellow', and specialty colors like 'Trans-Dark Blue', 'Chrome Gold', 'Glow In Dark Opaque'. Last color should be '[No Color]'."
},
{
"id": "021-postgres-lego-get-ranked-themes",
"name": "Postgres LEGO Get Ranked Themes",
"type": "retrieval",
"instruction": "Return the top 20 most used LEGO themes from the database, ranked by number of sets (setCount) in each theme, descending. Output: { themes: [{ name: string, setCount: number }] }.",
"integrationIds": ["postgres-lego"],
"payload": {},
"validationFunction": "validators/021-postgres-lego-get-ranked-themes.ts",
"expectedResultDescription": "Should return the top 20 most popular LEGO themes ranked by number of sets (descending). Expected top theme to be 'Gear' with 246 sets, followed by 'Supplemental' with 240 sets, 'Duplo' with 219 sets, and 'City' with 216 sets. Themes like 'Friends', 'Ninjago', and 'Star Wars' should also appear in the top 20."
},
{
"id": "022-timbuk2-paginated-product-fetch",
"name": "Timbuk2 Paginated Product Fetch",
"description": "Tests basic pagination functionality - fetching multiple pages and combining results",
"type": "retrieval",
"instruction": "Fetch the first 3 pages of products from Timbuk2 with 5 products per page. Return a JSON object with structure: { total_fetched: number, products: [{ id: number, title: string, vendor: string }] }",
"integrationIds": ["timbuk2-shopify"],
"payload": {},
"skipValidationFunction": true,
"expectedResultDescription": "Should fetch exactly 15 products across 3 pages. Each product must have numeric id, non-empty title string, and vendor string starting with 'Timbuk2-'. The total_fetched field should equal 15."
},
{
"id": "023-timbuk2-vendor-aggregation",
"name": "Timbuk2 Vendor Aggregation",
"description": "Tests pagination with client-side aggregation and deduplication",
"type": "retrieval",
"instruction": "Get all unique vendors from Timbuk2 products. Fetch multiple pages to ensure you capture all products, using page and limit parameters. Return JSON: { unique_vendors: string[], total_products_scanned: number }",
"integrationIds": ["timbuk2-shopify"],
"payload": {},
"skipValidationFunction": true,
"expectedResultDescription": "Should return an array of unique vendor names (unique_vendors should be a string array with no duplicates, each starting with 'Timbuk2-'). The total_products_scanned should be a positive number indicating how many products were processed to find the vendors."
},
{
"id": "024-timbuk2-price-range-analysis",
"name": "Timbuk2 Price Range Analysis",
"description": "Tests pagination with data transformation and aggregation across variants",
"type": "retrieval",
"instruction": "Analyze product prices from Timbuk2. Fetch products using pagination (page and limit parameters). For each product, extract all variant prices and calculate statistics. Return JSON: { products_analyzed: number, price_range: { min_price: string, max_price: string }, zero_price_products: number }",
"integrationIds": ["timbuk2-shopify"],
"payload": {},
"skipValidationFunction": true,
"expectedResultDescription": "Should return statistics from paginated product data. products_analyzed should be a positive number. price_range should contain min_price and max_price as numeric strings (e.g., '0', '149.99'). zero_price_products should be a number >= 0 representing products with free variants."
},
{
"id": "025-stripe-get-customers",
"name": "Stripe Get Customers",
"description": "This tests pagination. It usually fails because of nested structure.",
"type": "retrieval",
"instruction": "Get all (over the whole account history) my customers from Stripe.",
"integrationIds": ["stripe"],
"payload": {},
"validationFunction": "validators/025-stripe-get-customers.ts",
"expectedResultDescription": "Should return 202 customer names from Stripe using pagination (tests nested data structure handling). Expected names include individuals like 'Daniel Miller', 'Olivia Taylor', 'Charlotte Brown', 'Sophia Lee', 'Jane Smith', and businesses like 'Hans GmbH' and 'Peter GmbH'. Names appear multiple times (e.g., 'Daniel Miller', 'Olivia Taylor', 'Matthew Lee' appear repeatedly)."
},
{
"id": "026-stripe-calculate-revenue",
"name": "Stripe Calculate Revenue",
"description": "This usually fails because of nested structure.",
"type": "retrieval",
"instruction": "Calculate the total revenue from Stripe. I want the final output to be a JSON object with the following structure: { gross_revenue: number, fees: number, net_revenue: number }.",
"integrationIds": ["stripe"],
"payload": {},
"validationFunction": "validators/026-stripe-calculate-revenue.ts",
"expectedResultDescription": "Should calculate total revenue from Stripe transactions (tests nested data aggregation). Expected gross revenue of $5.00, fees of $0.74, and net revenue of $4.26."
},
{
"id": "027-stripe-get-all-products",
"name": "Stripe Get Products",
"description": "This tests pagination. It usually fails because of nested structure.",
"type": "retrieval",
"instruction": "Get a complete list of all products from Stripe. I need the ID and the name",
"integrationIds": ["stripe"],
"payload": {},
"validationFunction": "validators/027-stripe-get-all-products.ts",
"expectedResultDescription": "Should return all 3 Stripe products (tests nested data structure). Expected products: 'Superglue Cap', 'Superglue T-Shirt', and 'Superglue Coffee Mug'. All products should be active with IDs starting with 'prod_TGb'."
},
{
"id": "028-attio-get-15-oldest-people",
"name": "Attio Get 15 Oldest People",
"description": "",
"type": "retrieval",
"instruction": "From Attio, get me the 10 'oldest' people, so the people with the oldest created_at date. I want the final output to be a JSON object with the names of the people and the following structure: { people: string[] }. ",
"integrationIds": ["attio"],
"payload": {},
"skipValidationFunction": true,
"expectedResultDescription": "Should return exactly 10 people from Attio CRM. Expected names (not in order): 'Isabella Davis', 'Mia Miller', 'Ava Martinez', 'Evelyn Martinez', 'Henry Brown', 'Amelia Brown', 'James Taylor', 'Evelyn Jones', 'Alexander Moore', 'Benjamin Davis'."
},
{
"id": "029-attio-get-person-by-id",
"name": "Attio Get Person",
"description": "",
"type": "retrieval",
"instruction": "Get the person with the given id from Attio. I want the final output to be a JSON object with the following structure: { person: { name: string, email: string, job_title: string } }. ",
"integrationIds": ["attio"],
"payload": {
"id": "004e6660-3d14-42ef-9d2d-b1f0167209aa"
},
"validationFunction": "validators/029-attio-get-person-by-id.ts",
"expectedResultDescription": "Should return person with ID 004e6660-3d14-42ef-9d2d-b1f0167209aa from Attio. Expected to be 'Olivia Jackson' with email olivia.jackson@yahoo.com and job title 'Business Analyst'."
},
{
"id": "030-attio-people-no-email",
"name": "Attio People No Email",
"type": "retrieval",
"instruction": "Retrieve all people records from Attio CRM that have an empty email address. Return their names and created_at. Output JSON: { \"people\": [ { \"name\": string, \"created_at\": string } ] }.",
"integrationIds": ["attio"],
"payload": {},
"validationFunction": "validators/030-attio-people-no-email.ts",
"expectedResultDescription": "Should return all Attio people records with empty email addresses. Expected to find 2 people: 'Hans Peter' (created 2025-10-20 00:09:59) and 'Max Mustermann' (created 2025-10-20 00:10:13)."
},
{
"id": "031-openai-structured-output-name-age",
"name": "OpenAI Structured Output Name and Age",
"type": "retrieval",
"instruction": "Call the OpenAI API with an unstructured list of 5 people's names and ages: 'Sarah is 28 years old, Mike 35, Emma age 22, David is 41, and Chris who is 19'. Ask OpenAI to return this data in a structured format with the people sorted alphabetically by name. Use structured output or response_format to enforce the schema. The final transform should propagate the result from the OpenAI request. Output JSON: { \"people\": Array<{ \"name\": string, \"age\": number }> }. Make sure the workflow output by the final transform is exactly this JSON object.",
"integrationIds": ["openai"],
"payload": {},
"validationFunction": "validators/031-openai-structured-output-name-age.ts",
"expectedResultDescription": "Should extract 5 people from unstructured text and sort alphabetically. Expected in order: Chris (19), David (41), Emma (22), Mike (35), and Sarah (28)."
},
{
"id": "032-openai-pdf-analysis-with-input-retrieval",
"name": "OpenAI PDF Analysis with Input Retrieval",
"type": "retrieval",
"instruction": "Call the OpenAI API to analyze a PDF file. First, make a request to the responses API endpoint with a file attachment pointing to this PDF: https://arxiv.org/pdf/2501.14426 and add exactly this prompt: Please provide a one sentence summary of this paper. Then, make a second call to the OpenAI input_items endpoint using the response ID from the first request to retrieve what inputs were provided to that request. The final output should be a list of the content from the second request, showing the inputs that were provided to the first OpenAI request. Final Transform Output JSON: { \"content\": Array<{ \"type\": string, (\"text\": string) | (\"file_url\": string) }> }.",
"integrationIds": ["openai"],
"payload": {},
"validationFunction": "validators/032-openai-pdf-analysis-with-input-retrieval.ts",
"expectedResultDescription": "Should retrieve OpenAI request inputs showing 2 items: one text input with prompt '1-sentence summary' and one file input pointing to arxiv.org/pdf/2501.14426."
},
{
"id": "033-postgres-lego-openai-table-extraction",
"name": "PostgreSQL+OpenAI Structured Output",
"type": "retrieval",
"instruction": "First, query the LEGO PostgreSQL database to get the table schema information. You can use a query like SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' to get all table names. Then, send this schema information to OpenAI and ask it to extract and return a structured output containing only a list of table names. Use OpenAI's structured output feature or response_format to force the output to match the exact schema with a table_names array. Output JSON: { \"table_names\": string[] }. Make sure the final workflow output by the final transform also has this schema: { \"table_names\": string[] }.",
"integrationIds": ["postgres-lego", "openai"],
"payload": {},
"validationFunction": "validators/033-postgres-lego-openai-table-extraction.ts",
"expectedResultDescription": "Should query LEGO database tables and use OpenAI to extract table names. Expected 8 tables: lego_colors, lego_inventories, lego_inventory_parts, lego_inventory_sets, lego_part_categories, lego_parts, lego_sets, and lego_themes."
},
{
"id": "034-tele2-http-download-verification",
"name": "Tele2 HTTP Download",
"type": "retrieval",
"instruction": "Download the 10MB.zip file from the Tele2 speedtest service at http://speedtest.tele2.net/10MB.zip. After the download completes, verify that the file exists and was successfully downloaded. In the final transform, check if the file is present and return a status object indicating success if the file exists, otherwise return failure. Output JSON: { \"status\": \"success\" | \"failure\", \"file\": string }.",
"integrationIds": ["tele2-http"],
"payload": {},
"validationFunction": "validators/034-tele2-http-download-verification.ts",
"expectedResultDescription": "Should download 10MB.zip file from Tele2 speedtest and verify successful download. Expected status 'success' with file name '10MB.zip'."
},
{
"id": "035-tele2-http-upload-berkshire-letter",
"name": "Tele2 HTTP Upload Berkshire Letter",
"type": "retrieval",
"instruction": "Download the Berkshire Hathaway 2024 shareholder letter PDF from https://www.berkshirehathaway.com/letters/2024ltr.pdf and upload it to the Tele2 FTP speedtest server. Return success status if upload works, failure status otherwise.",
"integrationIds": ["tele2-http", "berkshire-hathaway-letter"],
"payload": {},
"validationFunction": "validators/035-tele2-http-upload-berkshire-letter.ts",
"expectedResultDescription": "Should download Berkshire Hathaway 2024 shareholder letter PDF and upload to Tele2 FTP server. Expected status 'success' indicating file was downloaded and uploaded successfully."
},
{
"id": "036-sec-jina-markdown-conversion",
"name": "SEC Filing to Markdown via Jina Reader",
"type": "retrieval",
"instruction": "This workflow has 2 steps. Step 1: Read the SEC EDGAR filing HTML from https://www.sec.gov/ix?doc=/Archives/edgar/data/0000040987/000119312525195442/d28129d8k.htm. Step 2: Call Jina's Reader API (https://r.jina.ai/) to convert the page content to markdown. When calling Jina's API, prepend 'https://r.jina.ai/' to the SEC URL and add the header 'x-respond-with: markdown' to get the response as markdown format. Make sure to retrieve the markdown content from the response. In the final transform, check if the markdown content is present and properly formatted. Return status 'success' if markdown is retrieved successfully, otherwise return 'failure'. Output JSON: { \"status\": \"success\" | \"failure\" }.",
"integrationIds": ["sec-gov", "jina-reader"],
"payload": {},
"validationFunction": "validators/036-sec-jina-markdown-conversion.ts",
"expectedResultDescription": "Should fetch SEC EDGAR filing HTML and convert to markdown using Jina Reader API. Expected status 'success' indicating the HTML was successfully converted to markdown format."
},
{
"id": "037-brewery-db-california-count",
"name": "Open Brewery DB - California Brewery Count",
"description": "Fetch all breweries in California and return total count",
"type": "retrieval",
"instruction": "Count all breweries in California using the Open Brewery DB API. Return a JSON object with this structure: { total_brewery_count: number }",
"integrationIds": [
"open-brewery-db"
],
"payload": {},
"validationFunction": "validators/037-brewery-db-california-count.ts",
"expectedResultDescription": "Should count all breweries in California state. Expected total of 919 breweries."
},
{
"id": "040-punkapi-fetch-all-beers",
"name": "PunkAPI - Fetch All Beers",
"description": "Fetch all BrewDog beers from the catalogue",
"type": "retrieval",
"instruction": "Fetch all beers from the BrewDog PunkAPI catalogue. Return a JSON object with this structure: { total_count: number, first_beer_name: string, last_beer_name: string }.",
"integrationIds": [
"punkapi"
],
"payload": {},
"validationFunction": "validators/040-punkapi-fetch-all-beers.ts",
"expectedResultDescription": "Should fetch all 415 BrewDog beers from PunkAPI catalogue. Expected first beer 'Punk IPA 2007 - 2010' and last beer 'Aplomb Bomb'."
},
{
"id": "041-punkapi-fetch-all-beers-with-pagination-reminder",
"name": "PunkAPI - Fetch All Beers With Pagination Reminder",
"description": "Fetch all BrewDog beers from the catalogue with a reminder to use pagination",
"type": "retrieval",
"instruction": "Fetch all beers from the BrewDog PunkAPI catalogue. Remember to use pagination. Return a JSON object with this structure: { total_count: number, first_beer_name: string, last_beer_name: string }.",
"integrationIds": [
"punkapi"
],
"payload": {},
"validationFunction": "validators/041-punkapi-fetch-all-beers-with-pagination-reminder.ts",
"expectedResultDescription": "Should fetch all 415 BrewDog beers with explicit pagination reminder. Expected first beer 'Punk IPA 2007 - 2010' and last beer 'Aplomb Bomb'."
},
{
"id": "042-slack-post-eval-message",
"name": "Slack Post Eval Message",
"description": "Post a message to the evals channel",
"type": "action",
"instruction": "Post a message into the evals channel with the message 'Hi, this is the superglue bot running some evals'",
"integrationIds": [
"slack"
],
"payload": {},
"validationFunction": "validators/042-slack-post-eval-message.ts",
"expectedResultDescription": "Should successfully post message 'Hi, this is the superglue bot running some evals' to the evals channel. Expected response ok: true with a channel ID."
},
{
"id": "043-jira-workflow-anomaly-analysis",
"name": "JIRA Workflow Anomaly Analysis",
"description": "Complex test: Multi-dimensional analysis with JQL filtering and aggregation",
"type": "retrieval",
"instruction": "Perform a comprehensive workflow analysis on the GTMS project: 1) Search for all issues using JQL 'project=GTMS'. 2) Calculate workload metrics: total issues, assigned vs unassigned count, issues by assignee with their status distribution. 3) Identify workflow anomalies: unassigned issues that are 'In Progress' (should typically have an assignee). 4) Calculate backlog metrics: count of 'To Do' issues, percentage of total. 5) Aggregate by issue type (Task vs Sub-task) and show distribution. Return JSON: { metrics: { total_issues: number, assigned_count: number, unassigned_count: number, backlog_count: number, backlog_percentage: number }, by_assignee: Array<{ assignee: string, issue_count: number, statuses: Array<{ status: string, count: number }> }>, anomalies: { unassigned_in_progress: Array<{ key: string, summary: string }>, count: number }, by_issue_type: Array<{ type: string, count: number }> }. Sort by_assignee by issue_count descending.",
"integrationIds": ["jira"],
"payload": {},
"validationFunction": "validators/063-jira-workflow-anomaly-analysis.ts",
"expectedResultDescription": "Should perform complex multi-dimensional analysis on GTMS issues. Expected: 22 total issues, 3 assigned to Stefan Faistenauer, 19 unassigned. Anomalies: 3 unassigned 'In Progress' issues (GTMS-20, GTMS-3, GTMS-1). Backlog: 10 'To Do' issues (45.45%). Issue types: 15 Tasks, 7 Sub-tasks. Stefan's issues: In Progress (2 issues: GTMS-9, GTMS-7), Launched (1 issue: GTMS-4)."
},
{
"id": "044-slack-get-last-20-messages",
"name": "Slack Get Last 20 Messages",
"description": "Retrieve exactly 20 messages from evals channel",
"type": "retrieval",
"instruction": "Retrieve the last 20 messages from the evals channel. Return a JSON object with this structure: { message_count: number, messages: Array<{type: string, ts: string, text: string}> }. The message_count must equal the number of messages retrieved.",
"integrationIds": [
"slack"
],
"payload": {},
"validationFunction": "validators/044-slack-get-last-20-messages.ts",
"expectedResultDescription": "Should retrieve exactly 20 most recent messages from the 'evals' Slack channel. Expected message_count to be exactly 20."
},
{
"id": "045-slack-get-all-messages",
"name": "Slack Get All Messages",
"description": "Retrieve exactly 1005 messages from the evals channel",
"type": "retrieval",
"instruction": "Retrieve exactly 1005 messages from the evals channel. Return a JSON object with this structure: { message_count: number }. The message_count must equal the number of messages retrieved.",
"integrationIds": [
"slack"
],
"payload": {},
"validationFunction": "validators/045-slack-get-all-messages.ts",
"expectedResultDescription": "Should retrieve exactly 1005 messages from the 'evals' Slack channel (tests pagination handling). Expected message_count to be exactly 1005."
},
{
"id": "046-slack-get-all-messages-with-pagination-reminder",
"name": "Slack Get All Messages",
"description": "Retrieve exactly 1005 messages from the evals channel with a reminder to use pagination",
"type": "retrieval",
"instruction": "Retrieve exactly 1005 messages from the evals channel. Remember to use pagination. Return a JSON object with this structure: { message_count: number }. The message_count must equal the number of messages retrieved.",
"integrationIds": [
"slack"
],
"payload": {},
"validationFunction": "validators/046-slack-get-all-messages-with-pagination-reminder.ts",
"expectedResultDescription": "Should retrieve exactly 1005 messages from the 'evals' Slack channel with explicit pagination reminder. Expected message_count to be exactly 1005."
},
{
"id": "047-csv-aggregate-filter",
"name": "CSV Aggregate and Filter",
"description": "Aggregate sales data by category and filter categories with revenue > $5000",
"type": "retrieval",
"instruction": "Parse the CSV data provided in csv_file. Aggregate total revenue by category (sum of quantity * price for each row). Filter to only include categories where total revenue exceeds $5000. Sort results by revenue descending. Return JSON: { categories: Array<{ category: string, total_revenue: number, item_count: number }> }",
"integrationIds": [],
"payload": {
"csv_file": "sales_data.csv"
},
"validationFunction": "validators/047-csv-aggregate-filter.ts",
"expectedResultDescription": "Should parse CSV, aggregate by category, and filter categories with revenue > $5000. Expected to find Electronics with highest revenue (~$23.5k) and Furniture (~$20k), with accurate item counts for each category."
},
{
"id": "048-csv-top-performers",
"name": "CSV Top Sales Performers",
"description": "Find top 3 salespeople by revenue with region and sales count",
"type": "retrieval",
"instruction": "Parse the CSV data provided in csv_file. Calculate total revenue for each salesperson (sum of quantity * price). Find the top 3 salespeople by total revenue. For each, include their name, total revenue, region (use the most common region for that salesperson), and number of sales transactions. Return JSON: { top_performers: Array<{ name: string, total_revenue: number, region: string, sales_count: number }> }",
"integrationIds": [],
"payload": {
"csv_file": "sales_data.csv"
},
"validationFunction": "validators/048-csv-top-performers.ts",
"expectedResultDescription": "Should identify top 3 salespeople by revenue. Expected Bob Smith, Alice Johnson, and Diana Lee or Charlie Davis in top positions with accurate revenue calculations and sales counts."
},
{
"id": "049-json-flatten-users",
"name": "JSON Flatten User Profiles",
"description": "Flatten nested user profile objects using dot notation",
"type": "retrieval",
"instruction": "Parse the JSON data provided in json_file. Flatten each user object to a single-level object using dot notation for nested fields (e.g., 'address.city', 'preferences.theme'). Do not flatten the orders array - keep it as is. Return JSON: { users: Array<{ id: string, name: string, email: string, 'address.street': string, 'address.city': string, 'address.state': string, 'address.zipcode': string, 'address.country': string, orders: Array<any>, 'preferences.newsletter': boolean, 'preferences.notifications.email': boolean, 'preferences.notifications.sms': boolean, 'preferences.theme': string }> }",
"integrationIds": [],
"payload": {
"json_file": "user_profiles.json"
},
"validationFunction": "validators/049-json-flatten-users.ts",
"expectedResultDescription": "Should flatten nested user objects using dot notation. Expected all 10 users with fields like 'address.city' and 'preferences.theme' at the top level, while preserving orders array structure."
},
{
"id": "050-json-extract-orders",
"name": "JSON Extract and Aggregate Orders",
"description": "Extract all orders across users and calculate totals per user",
"type": "retrieval",
"instruction": "Parse the JSON data provided in json_file. Extract all orders from all users. For each user who has orders, calculate their total order value (sum of all order totals). Return JSON: { order_summary: Array<{ user_id: string, user_name: string, order_count: number, total_value: number }> }. Sort by total_value descending.",
"integrationIds": [],
"payload": {
"json_file": "user_profiles.json"
},
"validationFunction": "validators/050-json-extract-orders.ts",
"expectedResultDescription": "Should extract all orders and calculate totals per user. Expected 9 users with orders (U004 has no orders). Total of 14 orders across all users. John Smith (U001) should have highest total at $1759.96 with 2 orders."
},
{
"id": "051-fixedwidth-parse-structure",
"name": "Fixed-Width Parse to Structured JSON",
"description": "Parse fixed-width customer records into structured JSON",
"type": "retrieval",
"instruction": "Parse the fixed-width text data provided in fixedwidth_file. The format is: customer_id (positions 0-9, 10 chars), name (positions 10-39, 30 chars), age (positions 40-42, 3 chars), city (positions 43-62, 20 chars), balance (positions 63-72, 10 chars). Trim whitespace from all fields. Convert age to integer and balance to float. Return JSON: { customers: Array<{ customer_id: string, name: string, age: number, city: string, balance: number }> }",
"integrationIds": [],
"payload": {
"fixedwidth_file": "customer_records.txt",
"field_widths": [10, 30, 3, 20, 10]
},
"validationFunction": "validators/051-fixedwidth-parse-structure.ts",
"expectedResultDescription": "Should parse fixed-width format correctly. Expected 20 customers with properly extracted fields, trimmed strings, age as integers, and balance as floats. First customer should be Alice Johnson from New York with balance 5250.75."
},
{
"id": "052-multifile-csv-json-join",
"name": "Multi-File CSV-JSON Join",
"description": "Join sales CSV data with user JSON data on salesperson name",
"type": "retrieval",
"instruction": "Parse both csv_file and json_file. From the CSV, aggregate sales by salesperson (total revenue). From the JSON, extract user names and their email addresses. Join the data where the salesperson name matches a user name (use contains matching - check if any user name is contained in or contains the salesperson name). Return JSON: { matched_sales: Array<{ salesperson: string, total_revenue: number, email: string | null }> }. If no email match found, set email to null.",
"integrationIds": [],
"payload": {
"csv_file": "sales_data.csv",
"json_file": "user_profiles.json"
},
"validationFunction": "validators/052-multifile-csv-json-join.ts",
"expectedResultDescription": "Should join CSV sales data with JSON user data. Expected 4 salespeople (Alice Johnson, Bob Smith, Charlie Davis, Diana Lee) with their revenue totals. May or may not match emails depending on name matching logic."
},
{
"id": "053-multifile-all-summary",
"name": "Multi-File Summary Statistics",
"description": "Generate summary statistics from all three file types",
"type": "retrieval",
"instruction": "Parse all three files: csv_file, json_file, and fixedwidth_file. Generate summary statistics for each. Return JSON: { csv_summary: { total_rows: number, total_revenue: number, unique_products: number, date_range: { earliest: string, latest: string } }, json_summary: { total_users: number, users_with_orders: number, total_orders: number, total_order_value: number }, fixedwidth_summary: { total_customers: number, total_balance: number, avg_age: number, unique_cities: number } }",
"integrationIds": [],
"payload": {
"csv_file": "sales_data.csv",
"json_file": "user_profiles.json",
"fixedwidth_file": "customer_records.txt",
"field_widths": [10, 30, 3, 20, 10]
},
"validationFunction": "validators/053-multifile-all-summary.ts",
"expectedResultDescription": "Should generate accurate summary statistics from all three files. CSV: 50 rows, ~$44k revenue, 35 unique products. JSON: 10 users, 9 with orders, 14 total orders, ~$5619.59 total value. Fixed-width: 20 customers, ~$161.5k total balance, avg age ~38.5, 20 unique cities."
},
{
"id": "054-postgres-list-tables",
"name": "PostgreSQL List All Eval Tables",
"description": "Simple test: Query database to list all eval tables",
"type": "retrieval",
"instruction": "Query the PostgreSQL database to retrieve all table names from the public schema that start with 'eval_'. Return a JSON object with this structure: { tables: string[] }",
"integrationIds": ["eval-postgres"],
"payload": {},
"validationFunction": "validators/054-postgres-list-tables.ts",
"expectedResultDescription": "Should return all eval table names from the public schema. Expected tables: eval_customers, eval_employees, eval_orders, eval_products."
},
{
"id": "055-postgres-aggregate-products",
"name": "PostgreSQL Aggregate Products by Category",
"description": "Medium test: Aggregate product data with filtering",
"type": "retrieval",
"instruction": "Query the PostgreSQL eval_products table to aggregate data by category. Calculate the total number of products, average price, and total stock for each category. Return JSON: { categories: Array<{ category: string, product_count: number, avg_price: number, total_stock: number }> }. Sort by product_count descending.",
"integrationIds": ["eval-postgres"],
"payload": {},
"validationFunction": "validators/055-postgres-aggregate-products.ts",
"expectedResultDescription": "Should aggregate products by category. Expected 2 categories: Electronics (5 products, 865 total stock) and Furniture (3 products, 145 total stock) with accurate price and stock calculations."
},
{
"id": "056-postgres-customer-order-analysis",
"name": "PostgreSQL Customer Order Analysis",
"description": "Complex test: Join multiple tables and aggregate order statistics",
"type": "retrieval",
"instruction": "Query the PostgreSQL database to analyze customer orders. Join the eval_orders, eval_customers, and eval_products tables. Calculate: total number of orders, completed orders (status='completed'), failed/pending orders, total revenue from completed orders. Also provide per-customer analysis including customer name, order count, total spent, and most purchased category. Return JSON: { summary: { total_orders: number, completed_orders: number, pending_orders: number, failed_orders: number, total_revenue: number }, by_customer: Array<{ customer_name: string, order_count: number, total_spent: number, most_purchased_category: string }> }. Sort by_customer by total_spent descending.",
"integrationIds": ["eval-postgres"],
"payload": {},
"validationFunction": "validators/056-postgres-customer-order-analysis.ts",
"expectedResultDescription": "Should perform complex joins and aggregations on customer order data. Expected 9 total orders, 7 completed, 1 pending, 1 failed. Total revenue ~$2554.87. John Smith should be the top customer by spending with ~$1959.96."
},
{
"id": "057-sftp-list-directory",
"name": "SFTP List Directory Contents",
"description": "Simple test: List files and directories in home directory",
"type": "retrieval",
"instruction": "Connect to the SFTP server and list all files and directories in the home directory. Return JSON: { files: Array<{ name: string, type: 'file' | 'directory', size?: number }> }",
"integrationIds": ["eval-sftp"],
"payload": {},
"validationFunction": "validators/057-sftp-list-directory.ts",
"expectedResultDescription": "Should list all files and directories in SFTP home directory. Expected to find 'uploads' directory and various configuration files."
},
{
"id": "058-jira-list-projects",
"name": "JIRA List All Projects",
"description": "Simple test: Retrieve all accessible Jira projects",
"type": "retrieval",
"instruction": "Query the JIRA API to retrieve all projects that the authenticated user has access to. For each project, get the project key, name, and project type. Return JSON: { project_count: number, projects: Array<{ key: string, name: string, projectTypeKey: string }> }",
"integrationIds": ["jira"],
"payload": {},
"validationFunction": "validators/061-jira-list-projects.ts",
"expectedResultDescription": "Should return all accessible Jira projects. Expected 3 projects: 'GTMS' (Go to market sample, business), 'LEARNJIRA' (Learn Jira in 10 minutes, software), and 'KAN' (My Kanban Project, software)."
},
{
"id": "059-jira-search-issues-by-status",
"name": "JIRA Search Issues by Status",
"description": "Medium test: Search for issues using JQL and aggregate by status",
"type": "retrieval",
"instruction": "Use the JIRA search API (POST /rest/api/3/search/jql) to find all issues in the GTMS project. Use JQL (Jira Query Language) with the query 'project=GTMS'. Then aggregate the results by status and calculate: total issues found, count per status, and list of issue keys and summaries for 'In Progress' status. Return JSON: { summary: { total_issues: number, total_statuses: number }, by_status: Array<{ status: string, count: number }>, in_progress_issues: Array<{ key: string, summary: string, assignee: string | null }> }. Sort by_status by count descending.",
"integrationIds": ["jira"],
"payload": {},
"validationFunction": "validators/062-jira-search-issues-by-status.ts",
"expectedResultDescription": "Should search all GTMS project issues and aggregate by status. Expected 22 total issues with 4 statuses: 'To Do' (10), 'In Progress' (5), 'Ready for Launch' (4), 'Launched' (3). In Progress issues should include GTMS-20, GTMS-9, GTMS-7, GTMS-3, GTMS-1."
}
],
"enabledTools": "all",
"settings": {
"runOneShotMode": true,
"runSelfHealingMode": true,
"attemptsEachMode": 5,
"maxConcurrentWorkers": 5,
"toolAttemptTimeoutMs": 300000
},
"validationLlmConfig": {
"provider": "openai",
"model": "gpt-5.1"
}
}