validate_data
Check CSV data integrity and format to identify issues and warnings for reliable analysis.
Instructions
Validate CSV data integrity and format.
Args:
filename: Name of the CSV file
Returns:
Dictionary with validation results, issues, and warnings
Input Schema
TableJSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| filename | Yes | Name of the CSV file | |
Implementation Reference
# csv_mcp_server/server.py:254-269 (handler)
# MCP tool handler for 'validate_data'. Registers the tool and delegates
# execution to CSVManager's validate_data method, handling exceptions.
#
# NOTE(review): the docstring is kept word-for-word because it matches the
# "Instructions" text published for this tool; presumably the framework
# exposes it to clients, so rewording it would change what they see.
@mcp.tool()
def validate_data(filename: str) -> Dict[str, Any]:
    """
    Validate CSV data integrity and format.

    Args:
        filename: Name of the CSV file

    Returns:
        Dictionary with validation results, issues, and warnings
    """
    try:
        outcome = csv_manager.validate_data(filename)
    except Exception as exc:
        # Any failure is reported as a payload instead of being raised, so
        # the caller always receives a dictionary.
        return {"success": False, "error": str(exc)}
    return outcome
# Core implementation of the validate_data functionality in CSVManager class.
# Performs comprehensive validation including checks for empty rows,
# duplicates, missing values, mixed data types, and long text values.
def validate_data(self, filename: str) -> Dict[str, Any]:
    """Validate CSV data integrity and format.

    Loads the file with ``pd.read_csv`` and runs a series of checks.
    Completely empty rows are recorded as issues; duplicate rows, missing
    values, columns mixing numeric and text values, and very long text
    values are recorded as warnings.

    Args:
        filename: Name of the CSV file; resolved to a path with
            ``self._get_file_path``.

    Returns:
        Dictionary with the keys ``success``, ``filename``, ``total_rows``,
        ``total_columns``, ``issues``, ``warnings`` and ``is_valid``.
        ``is_valid`` is True when no issues were recorded; warnings do not
        affect it.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        Exception: Any error raised while reading or checking the file is
            logged and re-raised unchanged.
    """
    filepath = self._get_file_path(filename)
    if not filepath.exists():
        raise FileNotFoundError(f"CSV file '{filename}' not found")

    try:
        df = pd.read_csv(filepath)
        total_rows = len(df)

        validation_results = {
            "success": True,
            "filename": filename,
            "total_rows": total_rows,
            "total_columns": len(df.columns),
            "issues": [],
            "warnings": [],
        }
        issues = validation_results["issues"]
        warning_messages = validation_results["warnings"]

        # Check for empty rows
        empty_rows = df.isnull().all(axis=1).sum()
        if empty_rows > 0:
            issues.append(f"Found {empty_rows} completely empty rows")

        # Check for duplicate rows
        duplicate_rows = df.duplicated().sum()
        if duplicate_rows > 0:
            warning_messages.append(f"Found {duplicate_rows} duplicate rows")

        # Check for missing values by column. A positive null count implies
        # at least one row, so the division below cannot hit zero.
        null_counts = df.isnull().sum()
        for col, null_count in null_counts.items():
            if null_count > 0:
                percentage = (null_count / total_rows) * 100
                warning_messages.append(
                    f"Column '{col}' has {null_count} missing values ({percentage:.1f}%)"
                )

        # Text-like columns: plain object columns plus pandas' dedicated
        # string dtype, so the text checks below do not depend on which of
        # the two the CSV reader inferred.
        text_columns = [
            col
            for col in df.columns
            if pd.api.types.is_object_dtype(df[col].dtype)
            or pd.api.types.is_string_dtype(df[col].dtype)
        ]

        # Check for columns with mixed data types (if possible)
        for col in text_columns:
            numeric_count = pd.to_numeric(df[col], errors='coerce').notna().sum()
            # Compare against the non-missing values only. Missing cells are
            # already reported above and must not count as "text", otherwise
            # a fully numeric-like column with blanks is flagged as mixed.
            non_null_count = df[col].notna().sum()
            if 0 < numeric_count < non_null_count:
                warning_messages.append(
                    f"Column '{col}' appears to have mixed data types"
                )

        # Check for unusually long text values
        for col in text_columns:
            max_length = df[col].astype(str).str.len().max()
            # For a column without rows max() is NaN and the comparison is
            # False, so nothing is reported.
            if max_length > 1000:
                warning_messages.append(
                    f"Column '{col}' has very long text values (max: {max_length} characters)"
                )

        validation_results["is_valid"] = len(issues) == 0
        return validation_results

    except Exception as e:
        logger.error(f"Failed to validate data: {e}")
        raise