Skip to main content
Glama

profile_data

Analyzes CSV data to generate detailed profiles, including correlations and outliers, for comprehensive insights during data processing and validation.

Instructions

Generate comprehensive data profile.

Input Schema

JSON Schema

| Name | Required | Description | Default |
|----------------------|-----|------------------------------|------|
| include_correlations | No  | Include correlation analysis | true |
| include_outliers     | No  | Include outlier detection    | true |
| session_id           | Yes | Session identifier           | —    |

Implementation Reference

  • Core implementation of the profile_data tool handler. Performs comprehensive data profiling: overview stats, per-column analysis (numeric, datetime, categorical), optional correlations and outliers, data quality score.
    async def profile_data( session_id: str, include_correlations: bool = True, include_outliers: bool = True, ctx: Context = None ) -> Dict[str, Any]: """ Generate comprehensive data profile. Args: session_id: Session identifier include_correlations: Include correlation analysis include_outliers: Include outlier detection ctx: FastMCP context Returns: Dict with complete data profile """ try: manager = get_session_manager() session = manager.get_session(session_id) if not session or session.df is None: return {"success": False, "error": "Invalid session or no data loaded"} df = session.df profile = { "overview": { "row_count": len(df), "column_count": len(df.columns), "memory_usage_mb": round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2), "duplicate_rows": df.duplicated().sum(), "duplicate_percentage": round(df.duplicated().sum() / len(df) * 100, 2) }, "columns": {} } # Analyze each column for col in df.columns: col_data = df[col] col_profile = { "dtype": str(col_data.dtype), "null_count": int(col_data.isna().sum()), "null_percentage": round(col_data.isna().sum() / len(df) * 100, 2), "unique_count": int(col_data.nunique()), "unique_percentage": round(col_data.nunique() / len(df) * 100, 2) } # Numeric column analysis if pd.api.types.is_numeric_dtype(col_data): col_profile["type"] = "numeric" col_profile["statistics"] = { "mean": float(col_data.mean()), "std": float(col_data.std()), "min": float(col_data.min()), "max": float(col_data.max()), "25%": float(col_data.quantile(0.25)), "50%": float(col_data.quantile(0.50)), "75%": float(col_data.quantile(0.75)), "skewness": float(col_data.skew()), "kurtosis": float(col_data.kurt()) } col_profile["zeros"] = int((col_data == 0).sum()) col_profile["negative_count"] = int((col_data < 0).sum()) # Datetime column analysis elif pd.api.types.is_datetime64_any_dtype(col_data): col_profile["type"] = "datetime" non_null = col_data.dropna() if len(non_null) > 0: col_profile["date_range"] = { "min": str(non_null.min()), "max": 
str(non_null.max()), "range_days": (non_null.max() - non_null.min()).days } # Categorical/text column analysis else: col_profile["type"] = "categorical" value_counts = col_data.value_counts() col_profile["most_frequent"] = { "value": str(value_counts.index[0]) if len(value_counts) > 0 else None, "count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0 } # String-specific analysis if col_data.dtype == 'object': str_lengths = col_data.dropna().astype(str).str.len() if len(str_lengths) > 0: col_profile["string_stats"] = { "min_length": int(str_lengths.min()), "max_length": int(str_lengths.max()), "mean_length": round(str_lengths.mean(), 2) } profile["columns"][col] = col_profile # Add correlations if requested if include_correlations: numeric_cols = df.select_dtypes(include=[np.number]).columns if len(numeric_cols) >= 2: corr_result = await get_correlation_matrix(session_id, ctx=ctx) if corr_result["success"]: profile["correlations"] = corr_result["high_correlations"] # Add outlier detection if requested if include_outliers: outlier_result = await detect_outliers(session_id, ctx=ctx) if outlier_result["success"]: profile["outliers"] = { col: { "count": info["outlier_count"], "percentage": info["outlier_percentage"] } for col, info in outlier_result["outliers"].items() } # Data quality score total_cells = len(df) * len(df.columns) missing_cells = df.isna().sum().sum() quality_score = round((1 - missing_cells / total_cells) * 100, 2) profile["data_quality_score"] = quality_score session.record_operation(OperationType.PROFILE, { "include_correlations": include_correlations, "include_outliers": include_outliers }) return { "success": True, "profile": profile } except Exception as e: logger.error(f"Error profiling data: {str(e)}") return {"success": False, "error": str(e)}
  • MCP tool registration for profile_data. Thin wrapper that delegates to the analytics.py implementation.
    @mcp.tool async def profile_data( session_id: str, include_correlations: bool = True, include_outliers: bool = True, ctx: Context = None ) -> Dict[str, Any]: """Generate comprehensive data profile.""" return await _profile_data(session_id, include_correlations, include_outliers, ctx)

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/santoshray02/csv-editor'

If you have feedback or need assistance with the MCP directory API, please join our Discord server