Apify MCP Server

dataset.ts•9.78 KiB

import { Ajv } from 'ajv';
import toJsonSchema from 'to-json-schema';
import { z } from 'zod';
import zodToJsonSchema from 'zod-to-json-schema';

import { ApifyClient } from '../apify-client.js';
import { HelperTools } from '../const.js';
import type { InternalTool, ToolEntry } from '../types.js';

// Shared validator instance used to compile the input schema of every tool in this module.
const ajv = new Ajv({ coerceTypes: 'array', strict: false });

/** Input arguments accepted by the `getDataset` tool. */
const getDatasetArgs = z.object({
    datasetId: z.string()
        .min(1)
        .describe('Dataset ID or username~dataset-name.'),
});

/** Input arguments accepted by the `getDatasetItems` tool. */
const getDatasetItemsArgs = z.object({
    datasetId: z.string()
        .min(1)
        .describe('Dataset ID or username~dataset-name.'),
    clean: z.boolean().optional()
        .describe('If true, returns only non-empty items and skips hidden fields (starting with #). Shortcut for skipHidden=true and skipEmpty=true.'),
    offset: z.number().optional()
        .describe('Number of items to skip at the start. Default is 0.'),
    limit: z.number().optional()
        .describe('Maximum number of items to return. No limit by default.'),
    fields: z.string().optional()
        .describe('Comma-separated list of fields to include in results. '
            + 'Fields in output are sorted as specified. '
            + 'For nested objects, use dot notation (e.g. "metadata.url") after flattening.'),
    omit: z.string().optional()
        .describe('Comma-separated list of fields to exclude from results.'),
    desc: z.boolean().optional()
        .describe('If true, results are returned in reverse order (newest to oldest).'),
    flatten: z.string().optional()
        .describe('Comma-separated list of fields which should transform nested objects into flat structures. '
            + 'For example, with flatten="metadata" the object {"metadata":{"url":"hello"}} becomes {"metadata.url":"hello"}. '
            + 'This is required before accessing nested fields with the fields parameter.'),
});

/**
 * Splits a comma-separated argument into a list of trimmed, non-empty names.
 *
 * Blank entries (from an empty string, doubled commas, or a trailing comma) are dropped
 * so that they are never forwarded to the client as field names.
 *
 * @param value - Raw comma-separated string, or `undefined` when the argument was omitted.
 * @returns The parsed names, or `undefined` when the argument was omitted or contained no names.
 */
function parseCommaSeparatedList(value: string | undefined): string[] | undefined {
    if (value === undefined) {
        return undefined;
    }
    const names = value
        .split(',')
        .map((name) => name.trim())
        .filter((name) => name.length > 0);
    return names.length > 0 ? names : undefined;
}

/**
 * Tool that returns the dataset object (metadata) for a given dataset ID.
 *
 * The handler responds with a "not found" text message when the client returns no dataset,
 * otherwise with the dataset object serialized as JSON.
 *
 * https://docs.apify.com/api/v2/dataset-get
 */
export const getDataset: ToolEntry = {
    type: 'internal',
    tool: {
        name: HelperTools.DATASET_GET,
        actorFullName: HelperTools.DATASET_GET,
        description: 'Dataset is a collection of structured data created by an Actor run. '
            + 'Returns information about dataset object with metadata (itemCount, schema, fields, stats). '
            + `Fields describe the structure of the dataset and can be used to filter the data with the ${HelperTools.DATASET_GET_ITEMS} tool. `
            + 'Note: itemCount updates may have 5s delay. '
            + 'The dataset can be accessed with the dataset URL: GET: https://api.apify.com/v2/datasets/:datasetId',
        inputSchema: zodToJsonSchema(getDatasetArgs),
        ajvValidate: ajv.compile(zodToJsonSchema(getDatasetArgs)),
        call: async (toolArgs) => {
            const { args, apifyToken } = toolArgs;
            const parsed = getDatasetArgs.parse(args);
            const client = new ApifyClient({ token: apifyToken });
            const v = await client.dataset(parsed.datasetId).get();
            if (!v) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
            }
            return { content: [{ type: 'text', text: JSON.stringify(v) }] };
        },
    } as InternalTool,
};

/**
 * Tool that lists dataset items, with optional pagination, ordering, field selection
 * and flattening.
 *
 * The comma-separated `fields`, `omit` and `flatten` arguments are converted to arrays
 * before being passed to the client. The handler responds with a "not found" text message
 * when the client returns nothing, otherwise with the listing serialized as JSON.
 *
 * https://docs.apify.com/api/v2/dataset-items-get
 */
export const getDatasetItems: ToolEntry = {
    type: 'internal',
    tool: {
        name: HelperTools.DATASET_GET_ITEMS,
        actorFullName: HelperTools.DATASET_GET_ITEMS,
        description: 'Returns dataset items with pagination support. '
            + 'Items can be sorted (newest to oldest) and filtered (clean mode skips empty items and hidden fields). '
            + 'Supports field selection - include specific fields or exclude unwanted ones using comma-separated lists. '
            + 'For nested objects, you must first flatten them using the flatten parameter before accessing their fields. '
            + 'Example: To get URLs from items like [{"metadata":{"url":"example.com"}}], '
            + 'use flatten="metadata" and then fields="metadata.url". '
            + 'The flattening transforms nested objects into dot-notation format '
            + '(e.g. {"metadata":{"url":"x"}} becomes {"metadata.url":"x"}). '
            + 'Retrieve only the fields you need, reducing the response size and improving performance. '
            + 'The response includes total count, offset, limit, and items array.',
        inputSchema: zodToJsonSchema(getDatasetItemsArgs),
        ajvValidate: ajv.compile(zodToJsonSchema(getDatasetItemsArgs)),
        call: async (toolArgs) => {
            const { args, apifyToken } = toolArgs;
            const parsed = getDatasetItemsArgs.parse(args);
            const client = new ApifyClient({ token: apifyToken });

            // Convert comma-separated strings to arrays, ignoring blank entries.
            const fields = parseCommaSeparatedList(parsed.fields);
            const omit = parseCommaSeparatedList(parsed.omit);
            const flatten = parseCommaSeparatedList(parsed.flatten);

            const v = await client.dataset(parsed.datasetId).listItems({
                clean: parsed.clean,
                offset: parsed.offset,
                limit: parsed.limit,
                fields,
                omit,
                desc: parsed.desc,
                flatten,
            });
            if (!v) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
            }
            return { content: [{ type: 'text', text: JSON.stringify(v) }] };
        },
    } as InternalTool,
};

/**
 * Recursively removes object properties whose value is an empty array.
 *
 * Arrays are mapped element by element and keep their length: an empty array that is
 * itself an array element is preserved; only empty arrays held under an object key are dropped.
 * Primitives and `null` are returned unchanged. The input is not mutated.
 *
 * @param obj - Any JSON-like value.
 * @returns A new value of the same shape without empty-array properties.
 */
function removeEmptyArrays(obj: unknown): unknown {
    if (Array.isArray(obj)) {
        // Process every element; the array itself is always kept.
        return obj.map((item) => removeEmptyArrays(item));
    }

    if (typeof obj !== 'object' || obj === null) {
        // Return primitives and null values as is.
        return obj;
    }

    const keptEntries = Object.entries(obj)
        .map(([key, value]): [string, unknown] => [key, removeEmptyArrays(value)])
        // Exclude the key if the processed value is an empty array.
        .filter(([, value]) => !(Array.isArray(value) && value.length === 0));

    // Object.fromEntries defines own properties, so a key such as "__proto__" is kept as
    // plain data instead of altering the prototype of the result object.
    return Object.fromEntries(keptEntries);
}

/** Input arguments accepted by the `getDatasetSchema` tool. */
const getDatasetSchemaArgs = z.object({
    datasetId: z.string()
        .min(1)
        .describe('Dataset ID or username~dataset-name.'),
    limit: z.number().optional()
        .describe('Maximum number of items to use for schema generation. Default is 5.')
        .default(5),
    clean: z.boolean().optional()
        .describe('If true, uses only non-empty items and skips hidden fields (starting with #). Default is true.')
        .default(true),
    arrayMode: z.enum(['first', 'all']).optional()
        .describe('Strategy for handling arrays. "first" uses first item as template, "all" merges all items. Default is "all".')
        .default('all'),
    additionalProperties: z.boolean().optional()
        .describe('If true, allows additional properties in objects. Default is true.')
        .default(true),
});

/**
 * Tool that generates a JSON schema from a sample of dataset items.
 *
 * The handler fetches up to `limit` items, strips empty-array properties from them and
 * passes the result to `toJsonSchema`. It responds with a text message when the dataset
 * is not found or has no items, otherwise with the schema serialized as JSON.
 */
export const getDatasetSchema: ToolEntry = {
    type: 'internal',
    tool: {
        name: HelperTools.DATASET_SCHEMA_GET,
        actorFullName: HelperTools.DATASET_SCHEMA_GET,
        description: 'Generates a JSON schema from dataset items. '
            + 'The schema describes the structure of the data in the dataset, which can be used for validation, documentation, or data processing. '
            + 'Since the dataset can be large it is convenient to understand the structure of the dataset before getting dataset items.',
        inputSchema: zodToJsonSchema(getDatasetSchemaArgs),
        ajvValidate: ajv.compile(zodToJsonSchema(getDatasetSchemaArgs)),
        call: async (toolArgs) => {
            const { args, apifyToken } = toolArgs;
            const parsed = getDatasetSchemaArgs.parse(args);
            const client = new ApifyClient({ token: apifyToken });

            // Get dataset items
            const datasetResponse = await client.dataset(parsed.datasetId).listItems({
                clean: parsed.clean,
                limit: parsed.limit,
            });

            if (!datasetResponse) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
            }

            const datasetItems = datasetResponse.items;

            if (datasetItems.length === 0) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' is empty.` }] };
            }

            // Clean the dataset items by removing empty arrays
            const cleanedDatasetItems = datasetItems.map((item) => removeEmptyArrays(item));

            // Try to generate schema with full options first
            try {
                const schema = toJsonSchema(cleanedDatasetItems, {
                    arrays: { mode: parsed.arrayMode },
                    objects: { additionalProperties: parsed.additionalProperties },
                });

                return {
                    content: [{
                        type: 'text',
                        text: JSON.stringify(schema),
                    }],
                };
            } catch {
                // Fallback: try with simpler approach. This is a deliberate best-effort retry;
                // an error thrown by the fallback itself is not caught here and propagates to the caller.
                const fallbackSchema = toJsonSchema(cleanedDatasetItems, {
                    arrays: { mode: 'first' },
                });

                return {
                    content: [{ type: 'text', text: JSON.stringify(fallbackSchema) }],
                };
            }
        },
    } as InternalTool,
};

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jirispilka/actors-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

dataset.ts•9.78 KiB