// datasets.js
/**
 * Tool definition for the hf_list_datasets MCP tool: lists/searches datasets
 * on the Hugging Face Hub. Every input parameter is optional.
 */
export const listDatasetsToolDefinition = {
    name: "hf_list_datasets",
    description: "Get information from all datasets in the Hub. Supports filtering by search terms, authors, tags, and more. Returns paginated results with dataset metadata including downloads, likes, and tags.",
    inputSchema: {
        type: "object",
        properties: {
            // Free-text, author, and tag filters.
            search: { type: "string", description: "Filter based on substrings for repos and their usernames (e.g., 'pets', 'microsoft')" },
            author: { type: "string", description: "Filter datasets by author or organization (e.g., 'huggingface', 'microsoft')" },
            filter: { type: "string", description: "Filter based on tags (e.g., 'task_categories:text-classification', 'languages:en')" },
            // Ordering and pagination.
            sort: { type: "string", description: "Property to use when sorting (e.g., 'downloads', 'author')" },
            direction: { type: "string", description: "Sort direction: '-1' for descending, anything else for ascending" },
            limit: { type: "number", description: "Limit the number of datasets fetched" },
            // Verbosity toggles.
            full: { type: "boolean", description: "Whether to fetch most dataset data including all tags and files" },
            config: { type: "boolean", description: "Whether to also fetch the repo config" }
        },
        required: []
    }
};
/**
 * Tool definition for the hf_get_dataset_info MCP tool: fetches details for a
 * single dataset repository. Only `repo_id` is required.
 */
export const getDatasetInfoToolDefinition = {
    name: "hf_get_dataset_info",
    description: "Get detailed information for a specific dataset including metadata, files, configuration, and more.",
    inputSchema: {
        type: "object",
        properties: {
            repo_id: { type: "string", description: "Dataset repository ID (e.g., 'squad', 'imdb')" },
            revision: { type: "string", description: "Optional git revision (branch, tag, or commit hash)" },
            full: { type: "boolean", description: "Whether to fetch most dataset data including all tags and files" }
        },
        required: ["repo_id"]
    }
};
/**
 * Tool definition for the hf_get_dataset_parquet MCP tool: lists the
 * auto-converted parquet files for a dataset, optionally narrowed by
 * subset/config, split, and shard number.
 */
export const getDatasetParquetToolDefinition = {
    name: "hf_get_dataset_parquet",
    description: "Get the list of auto-converted parquet files for a dataset. Can specify subset (config) and split to get specific files.",
    inputSchema: {
        type: "object",
        properties: {
            repo_id: { type: "string", description: "Dataset repository ID" },
            subset: { type: "string", description: "Optional dataset subset/config name" },
            split: { type: "string", description: "Optional dataset split (train, test, validation, etc.)" },
            n: { type: "number", description: "Optional shard number to get the nth parquet file" }
        },
        required: ["repo_id"]
    }
};
/**
 * Tool definition for the hf_get_croissant MCP tool: retrieves the Croissant
 * (ML dataset description format) metadata for one dataset.
 */
export const getCroissantToolDefinition = {
    name: "hf_get_croissant",
    description: "Get the Croissant metadata for a dataset. Croissant is a high-level format for machine learning datasets.",
    inputSchema: {
        type: "object",
        properties: {
            repo_id: { type: "string", description: "Dataset repository ID" }
        },
        required: ["repo_id"]
    }
};
/**
 * Tool definition for the hf_get_dataset_tags MCP tool: fetches all dataset
 * tags known to the Hub. Takes no input parameters.
 */
export const getDatasetTagsToolDefinition = {
    name: "hf_get_dataset_tags",
    description: "Gets all available dataset tags hosted in the Hub, organized by type (e.g., task categories, languages, licenses).",
    inputSchema: { type: "object", properties: {}, required: [] }
};
/**
 * Type guard for hf_list_datasets arguments: any non-null object is accepted
 * because every search parameter is optional.
 */
function isDatasetSearchArgs(args) {
    return typeof args === "object" && args !== null;
}
/**
 * Shared guard: a non-null object carrying a string `repo_id` property.
 * Consolidates the identical checks previously duplicated across the
 * info/parquet/croissant validators. (The former `"repo_id" in args` test is
 * implied: a missing property yields `undefined`, which is not a string.)
 */
function hasStringRepoId(args) {
    return (typeof args === "object" &&
        args !== null &&
        typeof args.repo_id === "string");
}
/** Type guard for hf_get_dataset_info arguments. */
function isDatasetInfoArgs(args) {
    return hasStringRepoId(args);
}
/** Type guard for hf_get_dataset_parquet arguments. */
function isDatasetParquetArgs(args) {
    return hasStringRepoId(args);
}
/** Type guard for hf_get_croissant arguments. */
function isCroissantArgs(args) {
    return hasStringRepoId(args);
}
/**
 * Handler for the hf_list_datasets tool.
 *
 * Validates the arguments, forwards them to the client's dataset listing
 * call, and wraps the response in an MCP-style result object. Any failure
 * is reported as an `{ isError: true }` text result rather than a thrown
 * exception.
 */
export async function handleListDatasets(client, args) {
    try {
        if (!isDatasetSearchArgs(args)) {
            throw new Error("Invalid arguments for hf_list_datasets");
        }
        const text = await client.getDatasets(args);
        return { content: [{ type: "text", text }], isError: false };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return { content: [{ type: "text", text: `Error: ${message}` }], isError: true };
    }
}
/**
 * Handler for the hf_get_dataset_info tool.
 *
 * Validates the arguments, fetches details for one dataset repo via the
 * client, and wraps the outcome in an MCP-style result object. Failures are
 * reported as `{ isError: true }` text results rather than thrown exceptions.
 */
export async function handleGetDatasetInfo(client, args) {
    try {
        if (!isDatasetInfoArgs(args)) {
            throw new Error("Invalid arguments for hf_get_dataset_info");
        }
        const { repo_id, revision, full } = args;
        // Only forward `full` when truthy, mirroring the original behavior.
        const text = await client.getDatasetInfo(repo_id, revision, full ? { full } : {});
        return { content: [{ type: "text", text }], isError: false };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return { content: [{ type: "text", text: `Error: ${message}` }], isError: true };
    }
}
/**
 * Handler for the hf_get_dataset_parquet tool.
 *
 * Validates the arguments, asks the client for the dataset's auto-converted
 * parquet file listing (optionally narrowed by subset, split, and shard
 * number), and wraps the outcome in an MCP-style result object. Failures are
 * reported as `{ isError: true }` text results rather than thrown exceptions.
 */
export async function handleGetDatasetParquet(client, args) {
    try {
        if (!isDatasetParquetArgs(args)) {
            throw new Error("Invalid arguments for hf_get_dataset_parquet");
        }
        const { repo_id, subset, split, n } = args;
        const text = await client.getDatasetParquet(repo_id, subset, split, n);
        return { content: [{ type: "text", text }], isError: false };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return { content: [{ type: "text", text: `Error: ${message}` }], isError: true };
    }
}
/**
 * Handler for the hf_get_croissant tool.
 *
 * Validates the arguments, fetches the dataset's Croissant metadata via the
 * client, and wraps the outcome in an MCP-style result object. Failures are
 * reported as `{ isError: true }` text results rather than thrown exceptions.
 */
export async function handleGetCroissant(client, args) {
    try {
        if (!isCroissantArgs(args)) {
            throw new Error("Invalid arguments for hf_get_croissant");
        }
        const text = await client.getDatasetCroissant(args.repo_id);
        return { content: [{ type: "text", text }], isError: false };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return { content: [{ type: "text", text: `Error: ${message}` }], isError: true };
    }
}
/**
 * Handler for the hf_get_dataset_tags tool.
 *
 * The tool takes no inputs; `args` is accepted only to keep a uniform handler
 * signature and is ignored. Fetches all Hub dataset tags via the client and
 * wraps the outcome in an MCP-style result object. Failures are reported as
 * `{ isError: true }` text results rather than thrown exceptions.
 */
export async function handleGetDatasetTags(client, args) {
    try {
        const text = await client.getDatasetTags();
        return { content: [{ type: "text", text }], isError: false };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return { content: [{ type: "text", text: `Error: ${message}` }], isError: true };
    }
}