
Hugging Face Hub MCP Server

by michaelwaves
datasets.ts (9.28 kB)
import { Tool, CallToolResult } from '@modelcontextprotocol/sdk/types.js';
import { HuggingFaceClient } from '../client.js';
import { DatasetSearchArgs, DatasetInfoArgs, DatasetParquetArgs, CroissantArgs } from '../types.js';

/**
 * Tool definition for listing datasets
 */
export const listDatasetsToolDefinition: Tool = {
  name: "hf_list_datasets",
  description: "Get information from all datasets in the Hub. Supports filtering by search terms, authors, tags, and more. " +
    "Returns paginated results with dataset metadata including downloads, likes, and tags.",
  inputSchema: {
    type: "object",
    properties: {
      search: {
        type: "string",
        description: "Filter based on substrings for repos and their usernames (e.g., 'pets', 'microsoft')"
      },
      author: {
        type: "string",
        description: "Filter datasets by author or organization (e.g., 'huggingface', 'microsoft')"
      },
      filter: {
        type: "string",
        description: "Filter based on tags (e.g., 'task_categories:text-classification', 'languages:en')"
      },
      sort: {
        type: "string",
        description: "Property to use when sorting (e.g., 'downloads', 'author')"
      },
      direction: {
        type: "string",
        description: "Sort direction: '-1' for descending, anything else for ascending"
      },
      limit: {
        type: "number",
        description: "Limit the number of datasets fetched"
      },
      full: {
        type: "boolean",
        description: "Whether to fetch most dataset data including all tags and files"
      },
      config: {
        type: "boolean",
        description: "Whether to also fetch the repo config"
      }
    },
    required: []
  }
};

/**
 * Tool definition for getting dataset info
 */
export const getDatasetInfoToolDefinition: Tool = {
  name: "hf_get_dataset_info",
  description: "Get detailed information for a specific dataset including metadata, files, configuration, and more.",
  inputSchema: {
    type: "object",
    properties: {
      repo_id: {
        type: "string",
        description: "Dataset repository ID (e.g., 'squad', 'imdb')"
      },
      revision: {
        type: "string",
        description: "Optional git revision (branch, tag, or commit hash)"
      },
      full: {
        type: "boolean",
        description: "Whether to fetch most dataset data including all tags and files"
      }
    },
    required: ["repo_id"]
  }
};

/**
 * Tool definition for getting dataset parquet files
 */
export const getDatasetParquetToolDefinition: Tool = {
  name: "hf_get_dataset_parquet",
  description: "Get the list of auto-converted parquet files for a dataset. Can specify subset (config) and split to get specific files.",
  inputSchema: {
    type: "object",
    properties: {
      repo_id: {
        type: "string",
        description: "Dataset repository ID"
      },
      subset: {
        type: "string",
        description: "Optional dataset subset/config name"
      },
      split: {
        type: "string",
        description: "Optional dataset split (train, test, validation, etc.)"
      },
      n: {
        type: "number",
        description: "Optional shard number to get the nth parquet file"
      }
    },
    required: ["repo_id"]
  }
};

/**
 * Tool definition for getting Croissant metadata
 */
export const getCroissantToolDefinition: Tool = {
  name: "hf_get_croissant",
  description: "Get the Croissant metadata for a dataset. Croissant is a high-level format for machine learning datasets.",
  inputSchema: {
    type: "object",
    properties: {
      repo_id: {
        type: "string",
        description: "Dataset repository ID"
      }
    },
    required: ["repo_id"]
  }
};

/**
 * Tool definition for getting dataset tags
 */
export const getDatasetTagsToolDefinition: Tool = {
  name: "hf_get_dataset_tags",
  description: "Gets all available dataset tags hosted in the Hub, organized by type (e.g., task categories, languages, licenses).",
  inputSchema: {
    type: "object",
    properties: {},
    required: []
  }
};

function isDatasetSearchArgs(args: unknown): args is DatasetSearchArgs {
  return typeof args === "object" && args !== null;
}

function isDatasetInfoArgs(args: unknown): args is DatasetInfoArgs {
  return (
    typeof args === "object" &&
    args !== null &&
    "repo_id" in args &&
    typeof (args as { repo_id: string }).repo_id === "string"
  );
}

function isDatasetParquetArgs(args: unknown): args is DatasetParquetArgs {
  return (
    typeof args === "object" &&
    args !== null &&
    "repo_id" in args &&
    typeof (args as { repo_id: string }).repo_id === "string"
  );
}

function isCroissantArgs(args: unknown): args is CroissantArgs {
  return (
    typeof args === "object" &&
    args !== null &&
    "repo_id" in args &&
    typeof (args as { repo_id: string }).repo_id === "string"
  );
}

export async function handleListDatasets(client: HuggingFaceClient, args: unknown): Promise<CallToolResult> {
  try {
    if (!isDatasetSearchArgs(args)) {
      throw new Error("Invalid arguments for hf_list_datasets");
    }
    const results = await client.getDatasets(args as Record<string, any>);
    return {
      content: [{ type: "text", text: results }],
      isError: false,
    };
  } catch (error) {
    return {
      content: [
        {
          type: "text",
          text: `Error: ${error instanceof Error ? error.message : String(error)}`,
        },
      ],
      isError: true,
    };
  }
}

export async function handleGetDatasetInfo(client: HuggingFaceClient, args: unknown): Promise<CallToolResult> {
  try {
    if (!isDatasetInfoArgs(args)) {
      throw new Error("Invalid arguments for hf_get_dataset_info");
    }
    const { repo_id, revision, full } = args;
    const params = full ? { full } : {};
    const results = await client.getDatasetInfo(repo_id, revision, params);
    return {
      content: [{ type: "text", text: results }],
      isError: false,
    };
  } catch (error) {
    return {
      content: [
        {
          type: "text",
          text: `Error: ${error instanceof Error ? error.message : String(error)}`,
        },
      ],
      isError: true,
    };
  }
}

export async function handleGetDatasetParquet(client: HuggingFaceClient, args: unknown): Promise<CallToolResult> {
  try {
    if (!isDatasetParquetArgs(args)) {
      throw new Error("Invalid arguments for hf_get_dataset_parquet");
    }
    const { repo_id, subset, split, n } = args;
    const results = await client.getDatasetParquet(repo_id, subset, split, n);
    return {
      content: [{ type: "text", text: results }],
      isError: false,
    };
  } catch (error) {
    return {
      content: [
        {
          type: "text",
          text: `Error: ${error instanceof Error ? error.message : String(error)}`,
        },
      ],
      isError: true,
    };
  }
}

export async function handleGetCroissant(client: HuggingFaceClient, args: unknown): Promise<CallToolResult> {
  try {
    if (!isCroissantArgs(args)) {
      throw new Error("Invalid arguments for hf_get_croissant");
    }
    const { repo_id } = args;
    const results = await client.getDatasetCroissant(repo_id);
    return {
      content: [{ type: "text", text: results }],
      isError: false,
    };
  } catch (error) {
    return {
      content: [
        {
          type: "text",
          text: `Error: ${error instanceof Error ? error.message : String(error)}`,
        },
      ],
      isError: true,
    };
  }
}

export async function handleGetDatasetTags(client: HuggingFaceClient, args: unknown): Promise<CallToolResult> {
  try {
    const results = await client.getDatasetTags();
    return {
      content: [{ type: "text", text: results }],
      isError: false,
    };
  } catch (error) {
    return {
      content: [
        {
          type: "text",
          text: `Error: ${error instanceof Error ? error.message : String(error)}`,
        },
      ],
      isError: true,
    };
  }
}
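
The tool definitions and handlers above are meant to be registered with an MCP server elsewhere in the project. Below is a minimal sketch of that wiring, not the repository's actual entry point: the server name/version, an argument-free HuggingFaceClient constructor, and the './client.js' and './tools/datasets.js' import paths are assumptions and may differ from the real project layout.

// index.ts (hypothetical entry point)
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
import { HuggingFaceClient } from './client.js';
import {
  listDatasetsToolDefinition,
  getDatasetInfoToolDefinition,
  getDatasetParquetToolDefinition,
  getCroissantToolDefinition,
  getDatasetTagsToolDefinition,
  handleListDatasets,
  handleGetDatasetInfo,
  handleGetDatasetParquet,
  handleGetCroissant,
  handleGetDatasetTags,
} from './tools/datasets.js';

// Constructor arguments are assumed; the real client may take an API token or base URL.
const client = new HuggingFaceClient();

const server = new Server(
  { name: "hf-mcp", version: "0.1.0" }, // assumed name/version
  { capabilities: { tools: {} } }
);

// Advertise the dataset tools so MCP clients can discover them.
server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [
    listDatasetsToolDefinition,
    getDatasetInfoToolDefinition,
    getDatasetParquetToolDefinition,
    getCroissantToolDefinition,
    getDatasetTagsToolDefinition,
  ],
}));

// Route incoming tool calls by name to the handlers exported from datasets.ts.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: args } = request.params;
  switch (name) {
    case "hf_list_datasets": return handleListDatasets(client, args);
    case "hf_get_dataset_info": return handleGetDatasetInfo(client, args);
    case "hf_get_dataset_parquet": return handleGetDatasetParquet(client, args);
    case "hf_get_croissant": return handleGetCroissant(client, args);
    case "hf_get_dataset_tags": return handleGetDatasetTags(client, args);
    default:
      return {
        content: [{ type: "text", text: `Unknown tool: ${name}` }],
        isError: true,
      };
  }
});

// Serve over stdio, the usual transport for locally run MCP servers.
const transport = new StdioServerTransport();
await server.connect(transport);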

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/michaelwaves/hf-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.