Skip to main content
Glama
santoshray02

CSV Editor

by santoshray02

profile_data

Generate comprehensive data profiles from CSV files to analyze structure, correlations, and outliers for data quality assessment.

Instructions

Generate comprehensive data profile.

Input Schema

Table / JSON Schema

| Name                 | Required | Description | Default |
|----------------------|----------|-------------|---------|
| session_id           | Yes      |             |         |
| include_correlations | No       |             |         |
| include_outliers     | No       |             |         |

Implementation Reference

  • Core handler function that generates a comprehensive data profile, including overview statistics, per-column analysis (numeric, datetime, categorical), optional correlations and outliers, and data quality score.
    async def profile_data(
        session_id: str,
        include_correlations: bool = True,
        include_outliers: bool = True,
        ctx: Context = None
    ) -> Dict[str, Any]:
        """
        Generate comprehensive data profile.

        Builds an overview (row/column counts, memory usage, duplicates),
        a per-column profile (numeric, datetime, or categorical analysis),
        optional correlation and outlier summaries, and an overall data
        quality score based on the fraction of non-missing cells.

        Args:
            session_id: Session identifier
            include_correlations: Include correlation analysis
            include_outliers: Include outlier detection
            ctx: FastMCP context

        Returns:
            Dict with complete data profile under "profile" on success,
            or {"success": False, "error": ...} on failure.
        """
        try:
            manager = get_session_manager()
            session = manager.get_session(session_id)

            if not session or session.df is None:
                return {"success": False, "error": "Invalid session or no data loaded"}

            df = session.df
            row_count = len(df)
            # Guard denominator: an empty DataFrame must not raise ZeroDivisionError.
            denom = row_count or 1

            # Hoist the duplicate scan (was computed twice) and cast the numpy
            # int64 to a plain int so the result dict is JSON-serializable.
            duplicate_rows = int(df.duplicated().sum())

            profile = {
                "overview": {
                    "row_count": row_count,
                    "column_count": len(df.columns),
                    "memory_usage_mb": round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2),
                    "duplicate_rows": duplicate_rows,
                    "duplicate_percentage": round(duplicate_rows / denom * 100, 2)
                },
                "columns": {}
            }

            # Analyze each column
            for col in df.columns:
                col_data = df[col]
                null_count = int(col_data.isna().sum())
                unique_count = int(col_data.nunique())
                col_profile = {
                    "dtype": str(col_data.dtype),
                    "null_count": null_count,
                    "null_percentage": round(null_count / denom * 100, 2),
                    "unique_count": unique_count,
                    "unique_percentage": round(unique_count / denom * 100, 2)
                }

                # Numeric column analysis
                if pd.api.types.is_numeric_dtype(col_data):
                    col_profile["type"] = "numeric"
                    # Only compute summary statistics when at least one non-null
                    # value exists; otherwise every stat would be NaN, which is
                    # not valid in strict JSON output.
                    if null_count < row_count:
                        col_profile["statistics"] = {
                            "mean": float(col_data.mean()),
                            "std": float(col_data.std()),
                            "min": float(col_data.min()),
                            "max": float(col_data.max()),
                            "25%": float(col_data.quantile(0.25)),
                            "50%": float(col_data.quantile(0.50)),
                            "75%": float(col_data.quantile(0.75)),
                            "skewness": float(col_data.skew()),
                            "kurtosis": float(col_data.kurt())
                        }
                    col_profile["zeros"] = int((col_data == 0).sum())
                    col_profile["negative_count"] = int((col_data < 0).sum())

                # Datetime column analysis
                elif pd.api.types.is_datetime64_any_dtype(col_data):
                    col_profile["type"] = "datetime"
                    non_null = col_data.dropna()
                    if len(non_null) > 0:
                        col_profile["date_range"] = {
                            "min": str(non_null.min()),
                            "max": str(non_null.max()),
                            "range_days": (non_null.max() - non_null.min()).days
                        }

                # Categorical/text column analysis
                else:
                    col_profile["type"] = "categorical"
                    value_counts = col_data.value_counts()
                    col_profile["most_frequent"] = {
                        "value": str(value_counts.index[0]) if len(value_counts) > 0 else None,
                        "count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0
                    }

                    # String-specific analysis
                    if col_data.dtype == 'object':
                        str_lengths = col_data.dropna().astype(str).str.len()
                        if len(str_lengths) > 0:
                            col_profile["string_stats"] = {
                                "min_length": int(str_lengths.min()),
                                "max_length": int(str_lengths.max()),
                                "mean_length": round(str_lengths.mean(), 2)
                            }

                profile["columns"][col] = col_profile

            # Add correlations if requested (needs at least two numeric columns)
            if include_correlations:
                numeric_cols = df.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) >= 2:
                    corr_result = await get_correlation_matrix(session_id, ctx=ctx)
                    if corr_result["success"]:
                        profile["correlations"] = corr_result["high_correlations"]

            # Add outlier detection if requested
            if include_outliers:
                outlier_result = await detect_outliers(session_id, ctx=ctx)
                if outlier_result["success"]:
                    profile["outliers"] = {
                        col: {
                            "count": info["outlier_count"],
                            "percentage": info["outlier_percentage"]
                        }
                        for col, info in outlier_result["outliers"].items()
                    }

            # Data quality score: fraction of non-missing cells, as a percentage.
            # An empty frame has no missing data, so score it 100 rather than
            # dividing by zero.
            total_cells = row_count * len(df.columns)
            missing_cells = int(df.isna().sum().sum())
            quality_score = round((1 - missing_cells / total_cells) * 100, 2) if total_cells else 100.0
            profile["data_quality_score"] = quality_score

            session.record_operation(OperationType.PROFILE, {
                "include_correlations": include_correlations,
                "include_outliers": include_outliers
            })

            return {
                "success": True,
                "profile": profile
            }

        except Exception as e:
            logger.error(f"Error profiling data: {str(e)}")
            return {"success": False, "error": str(e)}
  • FastMCP tool registration for profile_data, acting as a thin wrapper that delegates to the implementation in analytics.py.
    @mcp.tool
    async def profile_data(
        session_id: str,
        include_correlations: bool = True,
        include_outliers: bool = True,
        ctx: Context = None
    ) -> Dict[str, Any]:
        """Generate comprehensive data profile."""
        # Thin wrapper: delegate straight to the implementation in analytics.py,
        # forwarding every argument explicitly by keyword.
        return await _profile_data(
            session_id,
            include_correlations=include_correlations,
            include_outliers=include_outliers,
            ctx=ctx,
        )

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/santoshray02/csv-editor'

If you have feedback or need assistance with the MCP directory API, please join our Discord server