Skip to main content
Glama
dataset.ts (10 kB)
import { Ajv } from 'ajv';
import toJsonSchema from 'to-json-schema';
import { z } from 'zod';
import zodToJsonSchema from 'zod-to-json-schema';

import { ApifyClient } from '../apify-client.js';
import { HelperTools } from '../const.js';
import type { InternalTool, ToolEntry } from '../types.js';

// Shared validator instance used to compile the JSON schema of every tool in this module.
const ajv = new Ajv({ coerceTypes: 'array', strict: false });

/** Arguments accepted by the "get dataset" tool. */
const getDatasetArgs = z.object({
    datasetId: z.string()
        .min(1)
        .describe('Dataset ID or username~dataset-name.'),
});

/** Arguments accepted by the "get dataset items" tool. */
const getDatasetItemsArgs = z.object({
    datasetId: z.string()
        .min(1)
        .describe('Dataset ID or username~dataset-name.'),
    clean: z.boolean().optional()
        .describe('If true, returns only non-empty items and skips hidden fields (starting with #). Shortcut for skipHidden=true and skipEmpty=true.'),
    offset: z.number().optional()
        .describe('Number of items to skip at the start. Default is 0.'),
    limit: z.number().optional()
        .describe('Maximum number of items to return. No limit by default.'),
    fields: z.string().optional()
        .describe('Comma-separated list of fields to include in results. '
            + 'Fields in output are sorted as specified. '
            + 'For nested objects, use dot notation (e.g. "metadata.url") after flattening.'),
    omit: z.string().optional()
        .describe('Comma-separated list of fields to exclude from results.'),
    desc: z.boolean().optional()
        .describe('If true, results are returned in reverse order (newest to oldest).'),
    flatten: z.string().optional()
        .describe('Comma-separated list of fields which should transform nested objects into flat structures. '
            + 'For example, with flatten="metadata" the object {"metadata":{"url":"hello"}} becomes {"metadata.url":"hello"}. '
            + 'This is required before accessing nested fields with the fields parameter.'),
});

/**
 * Splits a comma-separated tool argument into a list of trimmed, non-empty entries.
 *
 * @param value Raw comma-separated string, or undefined when the argument was not supplied.
 * @returns The parsed entries, or undefined when the argument is missing or contains no
 *          usable entry (e.g. `""` or `","`), so that the option is omitted from the request
 *          instead of being sent as a list of empty field names.
 */
function parseCommaSeparatedList(value: string | undefined): string[] | undefined {
    if (value === undefined) {
        return undefined;
    }
    const entries = value
        .split(',')
        .map((entry) => entry.trim())
        .filter((entry) => entry.length > 0);
    return entries.length > 0 ? entries : undefined;
}

/**
 * Tool returning the dataset object for a given dataset ID.
 * Responds with a "not found" text message when the client returns no dataset.
 *
 * https://docs.apify.com/api/v2/dataset-get
 */
export const getDataset: ToolEntry = {
    type: 'internal',
    tool: {
        name: HelperTools.DATASET_GET,
        actorFullName: HelperTools.DATASET_GET,
        description: 'Dataset is a collection of structured data created by an Actor run. '
            + 'Returns information about dataset object with metadata (itemCount, schema, fields, stats). '
            + `Fields describe the structure of the dataset and can be used to filter the data with the ${HelperTools.DATASET_GET_ITEMS} tool. `
            + 'Note: itemCount updates may have 5s delay. '
            + 'The dataset can be accessed with the dataset URL: GET: https://api.apify.com/v2/datasets/:datasetId',
        inputSchema: zodToJsonSchema(getDatasetArgs),
        ajvValidate: ajv.compile(zodToJsonSchema(getDatasetArgs)),
        call: async (toolArgs) => {
            const { args, apifyToken } = toolArgs;
            // parse() throws when the arguments do not match the schema.
            const parsed = getDatasetArgs.parse(args);
            const client = new ApifyClient({ token: apifyToken });
            const v = await client.dataset(parsed.datasetId).get();
            if (!v) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
            }
            return { content: [{ type: 'text', text: JSON.stringify(v) }] };
        },
    } as InternalTool,
};

/**
 * Tool returning items of a dataset, with optional pagination, ordering,
 * field selection/omission and flattening of nested objects.
 *
 * https://docs.apify.com/api/v2/dataset-items-get
 */
export const getDatasetItems: ToolEntry = {
    type: 'internal',
    tool: {
        name: HelperTools.DATASET_GET_ITEMS,
        actorFullName: HelperTools.DATASET_GET_ITEMS,
        description: 'Returns dataset items with pagination support. '
            + 'Items can be sorted (newest to oldest) and filtered (clean mode skips empty items and hidden fields). '
            + 'Supports field selection - include specific fields or exclude unwanted ones using comma-separated lists. '
            + 'For nested objects, you must first flatten them using the flatten parameter before accessing their fields. '
            + 'Example: To get URLs from items like [{"metadata":{"url":"example.com"}}], '
            + 'use flatten="metadata" and then fields="metadata.url". '
            + 'The flattening transforms nested objects into dot-notation format '
            + '(e.g. {"metadata":{"url":"x"}} becomes {"metadata.url":"x"}). '
            + 'Retrieve only the fields you need, reducing the response size and improving performance. '
            + 'The response includes total count, offset, limit, and items array.',
        inputSchema: zodToJsonSchema(getDatasetItemsArgs),
        ajvValidate: ajv.compile(zodToJsonSchema(getDatasetItemsArgs)),
        call: async (toolArgs) => {
            const { args, apifyToken } = toolArgs;
            const parsed = getDatasetItemsArgs.parse(args);
            const client = new ApifyClient({ token: apifyToken });

            // Convert comma-separated strings to arrays, ignoring blank entries
            // produced by stray or trailing commas.
            const fields = parseCommaSeparatedList(parsed.fields);
            const omit = parseCommaSeparatedList(parsed.omit);
            const flatten = parseCommaSeparatedList(parsed.flatten);

            const v = await client.dataset(parsed.datasetId).listItems({
                clean: parsed.clean,
                offset: parsed.offset,
                limit: parsed.limit,
                fields,
                omit,
                desc: parsed.desc,
                flatten,
            });
            if (!v) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
            }
            return { content: [{ type: 'text', text: JSON.stringify(v) }] };
        },
    } as InternalTool,
};

/**
 * Recursively removes object properties whose value is an empty array.
 *
 * Arrays are traversed element by element and keep their length: an empty array
 * that is itself an array element is preserved, only object keys are dropped.
 * Primitives and null are returned unchanged. The input is not mutated; new
 * arrays and objects are built for the result.
 *
 * @param obj Any JSON-like value.
 * @returns A copy of the value without empty-array properties.
 */
function removeEmptyArrays(obj: unknown): unknown {
    if (Array.isArray(obj)) {
        // If the item is an array, recursively call removeEmptyArrays on each element.
        return obj.map((item) => removeEmptyArrays(item));
    }

    if (typeof obj !== 'object' || obj === null) {
        // Return primitives and null values as is.
        return obj;
    }

    // Use reduce to build a new object, excluding keys with empty arrays.
    return Object.entries(obj).reduce((acc, [key, value]) => {
        const processedValue = removeEmptyArrays(value);

        // Exclude the key if the processed value is an empty array.
        if (Array.isArray(processedValue) && processedValue.length === 0) {
            return acc;
        }

        acc[key] = processedValue;
        return acc;
    }, {} as Record<string, unknown>);
}

/** Arguments accepted by the "get dataset schema" tool; every option except datasetId has a default. */
const getDatasetSchemaArgs = z.object({
    datasetId: z.string()
        .min(1)
        .describe('Dataset ID or username~dataset-name.'),
    limit: z.number().optional()
        .describe('Maximum number of items to use for schema generation. Default is 5.')
        .default(5),
    clean: z.boolean().optional()
        .describe('If true, uses only non-empty items and skips hidden fields (starting with #). Default is true.')
        .default(true),
    arrayMode: z.enum(['first', 'all']).optional()
        .describe('Strategy for handling arrays. "first" uses first item as template, "all" merges all items. Default is "all".')
        .default('all'),
    additionalProperties: z.boolean().optional()
        .describe('If true, allows additional properties in objects. Default is true.')
        .default(true),
});

/**
 * Tool generating a JSON schema from a sample of dataset items.
 *
 * Fetches up to `limit` items, strips empty-array properties from them and
 * infers a schema with the requested options. If inference with those options
 * throws, it retries with array mode "first" and default object options.
 */
export const getDatasetSchema: ToolEntry = {
    type: 'internal',
    tool: {
        name: HelperTools.DATASET_SCHEMA_GET,
        actorFullName: HelperTools.DATASET_SCHEMA_GET,
        description: 'Generates a JSON schema from dataset items. '
            + 'The schema describes the structure of the data in the dataset, which can be used for validation, documentation, or data processing. '
            + 'Since the dataset can be large it is convenient to understand the structure of the dataset before getting dataset items.',
        inputSchema: zodToJsonSchema(getDatasetSchemaArgs),
        ajvValidate: ajv.compile(zodToJsonSchema(getDatasetSchemaArgs)),
        call: async (toolArgs) => {
            const { args, apifyToken } = toolArgs;
            const parsed = getDatasetSchemaArgs.parse(args);
            const client = new ApifyClient({ token: apifyToken });

            // Get dataset items
            const datasetResponse = await client.dataset(parsed.datasetId).listItems({
                clean: parsed.clean,
                limit: parsed.limit,
            });

            if (!datasetResponse) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
            }

            const datasetItems = datasetResponse.items;

            if (datasetItems.length === 0) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' is empty.` }] };
            }

            // Clean the dataset items by removing empty arrays
            const cleanedDatasetItems = datasetItems.map((item) => removeEmptyArrays(item));

            // Try to generate schema with full options first
            try {
                const schema = toJsonSchema(cleanedDatasetItems, {
                    arrays: { mode: parsed.arrayMode },
                    objects: { additionalProperties: parsed.additionalProperties },
                });

                return {
                    content: [{
                        type: 'text',
                        text: JSON.stringify(schema),
                    }],
                };
            } catch {
                // Fallback: try with simpler approach. The original error is intentionally
                // discarded; an error thrown by the fallback itself propagates to the caller.
                const fallbackSchema = toJsonSchema(cleanedDatasetItems, {
                    arrays: { mode: 'first' },
                });

                return {
                    content: [{ type: 'text', text: JSON.stringify(fallbackSchema) }],
                };
            }
        },
    } as InternalTool,
};

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jirispilka/actors-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.