profile_data
Analyzes CSV data to generate detailed profiles, including correlations and outliers, for comprehensive insights during data processing and validation.
Instructions
Generate comprehensive data profile.
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| include_correlations | No | Include correlation analysis in the profile. | True |
| include_outliers | No | Include outlier detection in the profile. | True |
| session_id | Yes | Session identifier for the loaded CSV data. | — |
Implementation Reference
# Core implementation of the profile_data tool handler (analytics.py).
# Performs comprehensive data profiling: overview stats, per-column analysis
# (numeric, datetime, categorical), optional correlations and outliers, and a
# data quality score.

def _pct(part: float, whole: float) -> float:
    """Return part/whole as a percentage rounded to 2 decimals; 0.0 when whole is 0.

    Guards the profile against ZeroDivisionError on an empty (0-row) DataFrame.
    """
    return round(part / whole * 100, 2) if whole else 0.0


def _profile_column(col_data, row_count: int) -> Dict[str, Any]:
    """Build the profile dict for a single column.

    Common stats (dtype, nulls, uniques) plus type-specific analysis:
    numeric statistics, datetime range, or categorical/string frequencies.
    All counts are cast to plain int/float so the result is JSON-serializable.
    """
    null_count = int(col_data.isna().sum())
    unique_count = int(col_data.nunique())
    col_profile: Dict[str, Any] = {
        "dtype": str(col_data.dtype),
        "null_count": null_count,
        "null_percentage": _pct(null_count, row_count),
        "unique_count": unique_count,
        "unique_percentage": _pct(unique_count, row_count),
    }

    if pd.api.types.is_numeric_dtype(col_data):
        # Numeric column analysis. NOTE: these may be NaN on an all-null column.
        col_profile["type"] = "numeric"
        col_profile["statistics"] = {
            "mean": float(col_data.mean()),
            "std": float(col_data.std()),
            "min": float(col_data.min()),
            "max": float(col_data.max()),
            "25%": float(col_data.quantile(0.25)),
            "50%": float(col_data.quantile(0.50)),
            "75%": float(col_data.quantile(0.75)),
            "skewness": float(col_data.skew()),
            "kurtosis": float(col_data.kurt()),
        }
        col_profile["zeros"] = int((col_data == 0).sum())
        col_profile["negative_count"] = int((col_data < 0).sum())
    elif pd.api.types.is_datetime64_any_dtype(col_data):
        # Datetime column analysis: only meaningful when non-null values exist.
        col_profile["type"] = "datetime"
        non_null = col_data.dropna()
        if len(non_null) > 0:
            col_profile["date_range"] = {
                "min": str(non_null.min()),
                "max": str(non_null.max()),
                "range_days": (non_null.max() - non_null.min()).days,
            }
    else:
        # Categorical/text column analysis.
        col_profile["type"] = "categorical"
        value_counts = col_data.value_counts()
        col_profile["most_frequent"] = {
            "value": str(value_counts.index[0]) if len(value_counts) > 0 else None,
            "count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
        }
        # String-specific analysis (only for object dtype).
        if col_data.dtype == 'object':
            str_lengths = col_data.dropna().astype(str).str.len()
            if len(str_lengths) > 0:
                col_profile["string_stats"] = {
                    "min_length": int(str_lengths.min()),
                    "max_length": int(str_lengths.max()),
                    "mean_length": round(str_lengths.mean(), 2),
                }

    return col_profile


async def profile_data(
    session_id: str,
    include_correlations: bool = True,
    include_outliers: bool = True,
    ctx: Context = None
) -> Dict[str, Any]:
    """
    Generate comprehensive data profile.

    Args:
        session_id: Session identifier
        include_correlations: Include correlation analysis
        include_outliers: Include outlier detection
        ctx: FastMCP context

    Returns:
        Dict with complete data profile on success
        ({"success": True, "profile": {...}}), or
        {"success": False, "error": ...} on failure.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df
        row_count = len(df)

        # Cast to int once: numpy int64 is not JSON-serializable, and this
        # avoids computing df.duplicated() twice.
        duplicate_rows = int(df.duplicated().sum())

        profile = {
            "overview": {
                "row_count": row_count,
                "column_count": len(df.columns),
                "memory_usage_mb": round(
                    df.memory_usage(deep=True).sum() / (1024 * 1024), 2
                ),
                "duplicate_rows": duplicate_rows,
                "duplicate_percentage": _pct(duplicate_rows, row_count),
            },
            "columns": {},
        }

        # Analyze each column.
        for col in df.columns:
            profile["columns"][col] = _profile_column(df[col], row_count)

        # Add correlations if requested (needs at least two numeric columns).
        if include_correlations:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) >= 2:
                corr_result = await get_correlation_matrix(session_id, ctx=ctx)
                if corr_result["success"]:
                    profile["correlations"] = corr_result["high_correlations"]

        # Add outlier detection if requested.
        if include_outliers:
            outlier_result = await detect_outliers(session_id, ctx=ctx)
            if outlier_result["success"]:
                profile["outliers"] = {
                    col: {
                        "count": info["outlier_count"],
                        "percentage": info["outlier_percentage"],
                    }
                    for col, info in outlier_result["outliers"].items()
                }

        # Data quality score: share of non-missing cells, as a percentage.
        # An empty DataFrame (0 cells) scores 100.0 rather than dividing by 0.
        total_cells = row_count * len(df.columns)
        missing_cells = int(df.isna().sum().sum())
        quality_score = (
            round((1 - missing_cells / total_cells) * 100, 2) if total_cells else 100.0
        )
        profile["data_quality_score"] = quality_score

        session.record_operation(OperationType.PROFILE, {
            "include_correlations": include_correlations,
            "include_outliers": include_outliers,
        })

        return {
            "success": True,
            "profile": profile,
        }

    except Exception as e:
        logger.error(f"Error profiling data: {str(e)}")
        return {"success": False, "error": str(e)}
# src/csv_editor/server.py:381-389 — MCP tool registration for profile_data.
# Thin wrapper that delegates to the analytics.py implementation.
@mcp.tool
async def profile_data(
    session_id: str,
    include_correlations: bool = True,
    include_outliers: bool = True,
    ctx: Context = None
) -> Dict[str, Any]:
    """Generate comprehensive data profile."""
    # Delegate with explicit keywords so the mapping to the implementation
    # signature stays obvious if parameters are ever added.
    return await _profile_data(
        session_id=session_id,
        include_correlations=include_correlations,
        include_outliers=include_outliers,
        ctx=ctx,
    )