profile_data
Generate a comprehensive data profile of your CSV file, including structure, correlations, and outliers.
Instructions
Generate a comprehensive data profile.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| session_id | Yes | Session identifier | |
| include_correlations | No | Include correlation analysis | True |
| include_outliers | No | Include outlier detection | True |
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
| None declared | | The tool returns a dict with `success` and either `profile` or `error` | |
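No structured output schema is declared, but the handler shown under Implementation Reference returns a dict along these lines. This is an illustrative sketch only: the keys follow the handler code, the values are placeholders, and the column entries are abridged:

```python
# Illustrative shape only; values are placeholders, not real output.
{
    "success": True,
    "profile": {
        "overview": {
            "row_count": 10_000,
            "column_count": 12,
            "memory_usage_mb": 1.85,
            "duplicate_rows": 3,
            "duplicate_percentage": 0.03,
        },
        "columns": {
            "price": {"dtype": "float64", "type": "numeric", "null_count": 5},
            "category": {"dtype": "object", "type": "categorical", "unique_count": 8},
        },
        "correlations": [],   # present only when include_correlations=True
        "outliers": {},       # present only when include_outliers=True
        "data_quality_score": 98.7,
    },
}
# On failure the tool returns {"success": False, "error": "<message>"}.
```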
Implementation Reference
- The main handler function for the profile_data tool. Generates a comprehensive data profile including overview statistics, per-column analysis (numeric/datetime/categorical), optional correlation matrix, optional outlier detection, and a data quality score.
```python
async def profile_data(
    session_id: str,
    include_correlations: bool = True,
    include_outliers: bool = True,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Generate comprehensive data profile.

    Args:
        session_id: Session identifier
        include_correlations: Include correlation analysis
        include_outliers: Include outlier detection
        ctx: FastMCP context

    Returns:
        Dict with complete data profile
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        profile = {
            "overview": {
                "row_count": len(df),
                "column_count": len(df.columns),
                "memory_usage_mb": round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2),
                "duplicate_rows": df.duplicated().sum(),
                "duplicate_percentage": round(df.duplicated().sum() / len(df) * 100, 2),
            },
            "columns": {},
        }

        # Analyze each column
        for col in df.columns:
            col_data = df[col]
            col_profile = {
                "dtype": str(col_data.dtype),
                "null_count": int(col_data.isna().sum()),
                "null_percentage": round(col_data.isna().sum() / len(df) * 100, 2),
                "unique_count": int(col_data.nunique()),
                "unique_percentage": round(col_data.nunique() / len(df) * 100, 2),
            }

            # Numeric column analysis
            if pd.api.types.is_numeric_dtype(col_data):
                col_profile["type"] = "numeric"
                col_profile["statistics"] = {
                    "mean": float(col_data.mean()),
                    "std": float(col_data.std()),
                    "min": float(col_data.min()),
                    "max": float(col_data.max()),
                    "25%": float(col_data.quantile(0.25)),
                    "50%": float(col_data.quantile(0.50)),
                    "75%": float(col_data.quantile(0.75)),
                    "skewness": float(col_data.skew()),
                    "kurtosis": float(col_data.kurt()),
                }
                col_profile["zeros"] = int((col_data == 0).sum())
                col_profile["negative_count"] = int((col_data < 0).sum())

            # Datetime column analysis
            elif pd.api.types.is_datetime64_any_dtype(col_data):
                col_profile["type"] = "datetime"
                non_null = col_data.dropna()
                if len(non_null) > 0:
                    col_profile["date_range"] = {
                        "min": str(non_null.min()),
                        "max": str(non_null.max()),
                        "range_days": (non_null.max() - non_null.min()).days,
                    }

            # Categorical/text column analysis
            else:
                col_profile["type"] = "categorical"
                value_counts = col_data.value_counts()
                col_profile["most_frequent"] = {
                    "value": str(value_counts.index[0]) if len(value_counts) > 0 else None,
                    "count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
                }

                # String-specific analysis
                if col_data.dtype == "object":
                    str_lengths = col_data.dropna().astype(str).str.len()
                    if len(str_lengths) > 0:
                        col_profile["string_stats"] = {
                            "min_length": int(str_lengths.min()),
                            "max_length": int(str_lengths.max()),
                            "mean_length": round(str_lengths.mean(), 2),
                        }

            profile["columns"][col] = col_profile

        # Add correlations if requested
        if include_correlations:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) >= 2:
                corr_result = await get_correlation_matrix(session_id, ctx=ctx)
                if corr_result["success"]:
                    profile["correlations"] = corr_result["high_correlations"]

        # Add outlier detection if requested
        if include_outliers:
            outlier_result = await detect_outliers(session_id, ctx=ctx)
            if outlier_result["success"]:
                profile["outliers"] = {
                    col: {"count": info["outlier_count"], "percentage": info["outlier_percentage"]}
                    for col, info in outlier_result["outliers"].items()
                }

        # Data quality score
        total_cells = len(df) * len(df.columns)
        missing_cells = df.isna().sum().sum()
        quality_score = round((1 - missing_cells / total_cells) * 100, 2)
        profile["data_quality_score"] = quality_score

        session.record_operation(
            OperationType.PROFILE,
            {"include_correlations": include_correlations, "include_outliers": include_outliers},
        )

        return {"success": True, "profile": profile}

    except Exception as e:
        logger.error(f"Error profiling data: {e!s}")
        return {"success": False, "error": str(e)}
```
- `src/csv_editor/server.py:363-371` (registration): Registers profile_data as an MCP tool via the `@mcp.tool` decorator in server.py and delegates the call to the handler imported as `_profile_data` from the analytics module.

```python
@mcp.tool
async def profile_data(
    session_id: str,
    include_correlations: bool = True,
    include_outliers: bool = True,
    ctx: Context = None,
) -> dict[str, Any]:
    """Generate comprehensive data profile."""
    return await _profile_data(session_id, include_correlations, include_outliers, ctx)
```
- `src/csv_editor/server.py:295` (registration): Imports profile_data from the tools.analytics module, aliased as `_profile_data`, for use in the MCP tool registration.

```python
from .tools.analytics import profile_data as _profile_data
```
"profile_data", - A recommendation in the check_data_quality tool that suggests running profile_data for a comprehensive overview of data issues.
"Consider running profile_data to get a comprehensive overview of data issues" )