get_column_statistics
Analyze CSV column data to calculate statistics like mean, median, and distribution for data validation and insights.
Instructions
Get detailed statistics for a specific column.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| session_id | Yes | Session identifier | |
| column | Yes | Column name to analyze | |
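
For orientation, here is a hedged sketch of the call arguments and the shape of a successful response for a numeric column. The field names follow the handler shown under Implementation Reference below; the session ID and the values are illustrative placeholders, not real output.

```python
# Hypothetical request arguments (session_id is a placeholder)
arguments = {"session_id": "abc123", "column": "price"}

# For a numeric column, the handler returns a payload shaped like this
# (values are illustrative only):
response = {
    "success": True,
    "statistics": {
        "column": "price",
        "dtype": "float64",
        "total_count": 1000,
        "null_count": 12,
        "null_percentage": 1.2,
        "unique_count": 873,
        "unique_percentage": 87.3,
        "type": "numeric",
        "mean": 19.99,
        "median": 18.5,
        "min": 0.99,
        "max": 199.0,
        # ... plus std, variance, range, sum, skewness, kurtosis,
        # quartiles, iqr, and zero/positive/negative counts
    },
}
```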
Implementation Reference
- The primary handler function (defined in analytics.py) that implements the logic for computing detailed statistics (numeric, categorical, and string) for a specific column in a CSV session's dataframe.

```python
async def get_column_statistics(
    session_id: str, column: str, ctx: Context = None
) -> Dict[str, Any]:
    """
    Get detailed statistics for a specific column.

    Args:
        session_id: Session identifier
        column: Column name to analyze
        ctx: FastMCP context

    Returns:
        Dict with detailed column statistics
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        if column not in df.columns:
            return {"success": False, "error": f"Column '{column}' not found"}

        col_data = df[column]

        result = {
            "column": column,
            "dtype": str(col_data.dtype),
            "total_count": len(col_data),
            "null_count": int(col_data.isna().sum()),
            "null_percentage": round(col_data.isna().sum() / len(col_data) * 100, 2),
            "unique_count": int(col_data.nunique()),
            "unique_percentage": round(col_data.nunique() / len(col_data) * 100, 2),
        }

        # Numeric column statistics
        if pd.api.types.is_numeric_dtype(col_data):
            non_null = col_data.dropna()
            result.update({
                "type": "numeric",
                "mean": float(non_null.mean()),
                "median": float(non_null.median()),
                "mode": float(non_null.mode()[0]) if len(non_null.mode()) > 0 else None,
                "std": float(non_null.std()),
                "variance": float(non_null.var()),
                "min": float(non_null.min()),
                "max": float(non_null.max()),
                "range": float(non_null.max() - non_null.min()),
                "sum": float(non_null.sum()),
                "skewness": float(non_null.skew()),
                "kurtosis": float(non_null.kurt()),
                "25%": float(non_null.quantile(0.25)),
                "50%": float(non_null.quantile(0.50)),
                "75%": float(non_null.quantile(0.75)),
                "iqr": float(non_null.quantile(0.75) - non_null.quantile(0.25)),
                "zero_count": int((col_data == 0).sum()),
                "positive_count": int((col_data > 0).sum()),
                "negative_count": int((col_data < 0).sum()),
            })

        # Categorical column statistics
        else:
            value_counts = col_data.value_counts()
            top_values = value_counts.head(10).to_dict()

            result.update({
                "type": "categorical",
                "most_frequent": str(value_counts.index[0]) if len(value_counts) > 0 else None,
                "most_frequent_count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
                "top_10_values": {str(k): int(v) for k, v in top_values.items()},
            })

            # String-specific stats
            if col_data.dtype == 'object':
                str_data = col_data.dropna().astype(str)
                if len(str_data) > 0:
                    str_lengths = str_data.str.len()
                    result["string_stats"] = {
                        "min_length": int(str_lengths.min()),
                        "max_length": int(str_lengths.max()),
                        "mean_length": round(str_lengths.mean(), 2),
                        "empty_string_count": int((str_data == "").sum()),
                    }

        session.record_operation(OperationType.ANALYZE, {
            "type": "column_statistics",
            "column": column
        })

        return {
            "success": True,
            "statistics": result
        }

    except Exception as e:
        logger.error(f"Error getting column statistics: {str(e)}")
        return {"success": False, "error": str(e)}
```
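
A minimal usage sketch for calling the handler directly in an async context. It assumes a session has already been created and populated elsewhere (for example by a CSV-loading tool); the session ID and column name are placeholders.

```python
import asyncio

async def main() -> None:
    session_id = "abc123"  # placeholder; a real ID comes from the session manager
    result = await get_column_statistics(session_id, "price")

    if result["success"]:
        stats = result["statistics"]
        print(f"{stats['column']}: {stats['type']}, "
              f"{stats['null_percentage']}% null, "
              f"{stats['unique_count']} unique values")
    else:
        print("Error:", result["error"])

asyncio.run(main())
```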
- src/csv_editor/server.py:327-334 (registration): registers the `get_column_statistics` tool with the FastMCP server using the `@mcp.tool` decorator. This wrapper function delegates execution to the actual implementation, imported as `_get_column_statistics` from analytics.py.

```python
@mcp.tool
async def get_column_statistics(
    session_id: str, column: str, ctx: Context = None
) -> Dict[str, Any]:
    """Get detailed statistics for a specific column."""
    return await _get_column_statistics(session_id, column, ctx)
```
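
The wrapper implies an import-and-alias pattern in server.py. A sketch of how that registration side might fit together is shown below; the import path and server name are assumptions, since this section does not show them.

```python
# Sketch of the registration module (import path and server name are assumptions)
from typing import Any, Dict

from fastmcp import Context, FastMCP

from .tools.analytics import get_column_statistics as _get_column_statistics

mcp = FastMCP("csv-editor")  # hypothetical server name

@mcp.tool
async def get_column_statistics(
    session_id: str, column: str, ctx: Context = None
) -> Dict[str, Any]:
    """Get detailed statistics for a specific column."""
    return await _get_column_statistics(session_id, column, ctx)
```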