"""Dataset tools for Kaggle API."""
import os
import re
import json
import shutil
import tempfile
from kaggle_mcp.tools.auth import api, ensure_authenticated
def init_dataset_tools(mcp_instance):
    """Initialize dataset tools with the given MCP instance."""
    @mcp_instance.tool()
    def datasets_list(search: str = "", user: str = "", license_name: str = "all",
                    file_type: str = "all", tags: str = "", sort_by: str = "hotness",
                    size: str = "all", page: int = 1) -> str:
        """List available Kaggle datasets.
        
        Args:
            search: Term(s) to search for
            user: Display datasets by a specific user or organization
            license_name: Display datasets with a specific license (all, cc, gpl, odb, other)
            file_type: Display datasets of a specific file type (all, csv, sqlite, json, bigQuery)
            tags: Tag IDs to filter by (comma-separated)
            sort_by: Sort datasets by (hotness, votes, updated, active)
            size: Filter by dataset size (all, small, medium, large); newer
                Kaggle API clients reject non-default values as deprecated
            page: Page number for results paging
        
        Returns:
            JSON string with dataset details
        """
        authenticated, msg = ensure_authenticated()
        if not authenticated:
            return msg
        
        try:
            # The official KaggleApi client expects `tag_ids` rather than `tags`,
            # and newer client versions reject the deprecated `size` filter, so
            # the default "all" is mapped to None before the call.
            datasets = api.dataset_list(search=search, user=user, license_name=license_name,
                                    file_type=file_type, tag_ids=tags, sort_by=sort_by,
                                    size=None if size == "all" else size, page=page)
            result = []
            
            for ds in datasets:
                result.append({
                    "ref": getattr(ds, 'ref', None),
                    "title": getattr(ds, 'title', None),
                    "size": getattr(ds, 'size', None),
                    "lastUpdated": str(ds.lastUpdated) if hasattr(ds, 'lastUpdated') else None,
                    "downloadCount": getattr(ds, 'downloadCount', None),
                    "voteCount": getattr(ds, 'voteCount', None),
                    "usabilityRating": getattr(ds, 'usabilityRating', None),
                    "description": getattr(ds, 'description', None),
                    "ownerName": getattr(ds, 'ownerName', None),
                    # Tag objects are not JSON-serializable, so keep their string form
                    "tags": [str(t) for t in (getattr(ds, 'tags', None) or [])]
                })
                
            return json.dumps(result, indent=2)
        except Exception as e:
            return f"Error listing datasets: {str(e)}"
    @mcp_instance.tool()
    def dataset_list_files(dataset: str) -> str:
        """List files in a dataset.
        
        Args:
            dataset: Dataset identifier in format <owner>/<dataset-name>
        
        Returns:
            JSON string with file details
        """
        authenticated, msg = ensure_authenticated()
        if not authenticated:
            return msg
        try:
            # KaggleApi.dataset_list_files takes the full '<owner>/<dataset-name>'
            # ref and returns a result object whose `.files` holds the file list
            files = api.dataset_list_files(dataset).files
            result = []
            
            for file in files:
                result.append({
                    "name": file.name,
                    "size": file.size,
                    "creationDate": str(file.creationDate) if hasattr(file, 'creationDate') else None
                })
                
            return json.dumps(result, indent=2)
        except Exception as e:
            return f"Error listing dataset files: {str(e)}"
    @mcp_instance.tool()
    def dataset_download_files(dataset: str, path: str = "", 
                            file_name: str = "", force: bool = False) -> str:
        """Download dataset files.
        
        Args:
            dataset: Dataset identifier in format <owner>/<dataset-name>
            path: Folder where file(s) will be downloaded (defaults to a temp directory)
            file_name: File name, all files downloaded if not provided
            force: Force download even if files exist
        
        Returns:
            Success message or error details
        """
        authenticated, msg = ensure_authenticated()
        if not authenticated:
            return msg
        
        # Create a temporary directory if no path is specified
        use_temp = False
        if not path:
            path = tempfile.mkdtemp()
            use_temp = True
        
        try:
            # The official client's download helpers take the full
            # '<owner>/<dataset-name>' ref rather than separate owner/name arguments
            if file_name:
                api.dataset_download_file(dataset, file_name, path=path, force=force)
                result = f"Downloaded file '{file_name}' to {path}"
            else:
                api.dataset_download_files(dataset, path=path, force=force)
                result = f"Downloaded all dataset files to {path}"
            
            return result
        except Exception as e:
            if use_temp:
                # Clean up the temp directory, including any partially downloaded files
                shutil.rmtree(path, ignore_errors=True)
            return f"Error downloading dataset files: {str(e)}"
    @mcp_instance.tool()
    def dataset_metadata(dataset: str) -> str:
        """Get dataset metadata.
        
        Args:
            dataset: Dataset identifier in format <owner>/<dataset-name>
        
        Returns:
            JSON string with dataset metadata
        """
        authenticated, msg = ensure_authenticated()
        if not authenticated:
            return msg
        try:
            # In the official client, dataset_metadata(dataset, path) writes
            # dataset-metadata.json into `path` and returns that file's path,
            # so download into a temp directory and read it back
            tmp_dir = tempfile.mkdtemp()
            try:
                meta_file = api.dataset_metadata(dataset, path=tmp_dir)
                with open(meta_file) as f:
                    result = json.load(f)
            finally:
                shutil.rmtree(tmp_dir, ignore_errors=True)
            
            return json.dumps(result, indent=2)
        except Exception as e:
            return f"Error getting dataset metadata: {str(e)}"
    @mcp_instance.tool()
    def dataset_create_new(title: str, files_dir: str, license_name: str = "unknown", 
                        description: str = "", is_private: bool = True) -> str:
        """Create a new dataset.
        
        Args:
            title: Title of the dataset
            files_dir: Directory containing files to upload
            license_name: License for the dataset (e.g., 'CC0-1.0', 'CC-BY-SA-4.0')
            description: Dataset description
            is_private: Whether the dataset should be private
            
        Returns:
            Success message or error details
        """
        authenticated, msg = ensure_authenticated()
        if not authenticated:
            return msg
        try:
            # Check if directory exists
            if not os.path.isdir(files_dir):
                return f"Error: Directory not found at {files_dir}"
            
            # Create metadata file
            metadata_path = os.path.join(files_dir, "dataset-metadata.json")
            
            # Generate a URL-safe slug from the title
            slug = re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-')
            
            # Get username from API
            username = api.get_config_value("username")
            
            metadata = {
                "title": title,
                "id": f"{username}/{slug}",
                "licenses": [{"name": license_name}],
                "description": description
            }
            
            with open(metadata_path, "w") as f:
                json.dump(metadata, f, indent=2)
            
            # Create the dataset; the client's `public` flag is the inverse of is_private
            api.dataset_create_new(files_dir, public=not is_private, convert_to_csv=True,
                                dir_mode="tar", quiet=False)
            
            return f"Dataset created successfully: {username}/{slug}"
        except Exception as e:
            return f"Error creating dataset: {str(e)}"
    @mcp_instance.tool()
    def dataset_create_version(dataset: str, files_dir: str, version_notes: str, 
                            convert_to_csv: bool = True, delete_old_versions: bool = False) -> str:
        """Create a new version of an existing dataset.
        
        Args:
            dataset: Dataset identifier in format <owner>/<dataset-name>
            files_dir: Directory containing files to upload
            version_notes: Notes describing the new version
            convert_to_csv: Whether to convert tabular data to CSV
            delete_old_versions: Whether to delete all previous versions
            
        Returns:
            Success message or error details
        """
        authenticated, msg = ensure_authenticated()
        if not authenticated:
            return msg
        try:
            # Check if directory exists
            if not os.path.isdir(files_dir):
                return f"Error: Directory not found at {files_dir}"
            
            # Split dataset reference
            owner, name = dataset.split('/')
            
            # Write a minimal metadata file, but don't clobber an existing one,
            # which may already carry the license, description, etc.
            metadata_path = os.path.join(files_dir, "dataset-metadata.json")
            
            if not os.path.exists(metadata_path):
                metadata = {
                    "id": f"{owner}/{name}",
                    "title": name.replace('-', ' ').title()
                }
                
                with open(metadata_path, "w") as f:
                    json.dump(metadata, f, indent=2)
            
            # Create new version
            api.dataset_create_version(files_dir, version_notes, quiet=False, convert_to_csv=convert_to_csv, 
                                    delete_old_versions=delete_old_versions, dir_mode="tar")
            
            return f"New version of dataset {owner}/{name} created successfully"
        except Exception as e:
            return f"Error creating dataset version: {str(e)}"
    @mcp_instance.tool()
    def dataset_status(dataset: str) -> str:
        """Check the creation status of a dataset.
        
        Args:
            dataset: Dataset identifier in format <owner>/<dataset-name>
        
        Returns:
            Status information
        """
        authenticated, msg = ensure_authenticated()
        if not authenticated:
            return msg
        try:
            # KaggleApi.dataset_status takes the full ref; the official client
            # returns the status as a plain string (e.g. 'ready' or 'error')
            status = api.dataset_status(dataset)
            
            if isinstance(status, str):
                result = {"ref": dataset, "status": status}
            else:
                result = {
                    "ref": getattr(status, 'ref', None),
                    "title": getattr(status, 'title', None),
                    "status": getattr(status, 'status', None),
                    "error": getattr(status, 'error', None),
                    "versionNumber": getattr(status, 'versionNumber', None)
                }
            
            return json.dumps(result, indent=2)
        except Exception as e:
            return f"Error checking dataset status: {str(e)}"
    @mcp_instance.tool()
    def dataset_initialize_metadata(path: str = ".") -> str:
        """Initialize dataset metadata file.
        
        Args:
            path: Directory where metadata file will be created
        
        Returns:
            Success message or error details
        """
        try:
            # Check if directory exists
            if not os.path.isdir(path):
                return f"Error: Directory not found at {path}"
            
            # Initialize metadata
            api.dataset_initialize(folder=path)
            
            metadata_path = os.path.join(path, "dataset-metadata.json")
            if os.path.exists(metadata_path):
                return f"Dataset metadata file initialized at {metadata_path}"
            else:
                return "Failed to initialize metadata file"
        except Exception as e:
            return f"Error initializing dataset metadata: {str(e)}"
    @mcp_instance.tool()
    def dataset_update_metadata(dataset: str, metadata_dict: str) -> str:
        """Update dataset metadata.
        
        Args:
            dataset: Dataset identifier in format <owner>/<dataset-name>
            metadata_dict: JSON string with metadata to update (title, subtitle, description, etc.)
        
        Returns:
            Success message or error details
        """
        authenticated, msg = ensure_authenticated()
        if not authenticated:
            return msg
        try:
            # Parse metadata dictionary
            try:
                metadata = json.loads(metadata_dict)
            except json.JSONDecodeError:
                return "Error: Invalid JSON in metadata_dict"
            
            # In the official client, dataset_metadata_update(dataset, path) reads
            # dataset-metadata.json from `path`, so write the parsed metadata to a
            # temp directory and point the call at it
            tmp_dir = tempfile.mkdtemp()
            try:
                with open(os.path.join(tmp_dir, "dataset-metadata.json"), "w") as f:
                    json.dump(metadata, f, indent=2)
                api.dataset_metadata_update(dataset, tmp_dir)
            finally:
                shutil.rmtree(tmp_dir, ignore_errors=True)
            
            return f"Metadata updated successfully for {dataset}"
        except Exception as e:
            return f"Error updating dataset metadata: {str(e)}"