detect_outliers
Detect outliers in numeric columns of your CSV data using configurable methods like IQR and adjustable thresholds.
Instructions
Detect outliers in numeric columns.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
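| session_id | Yes | Session identifier | |
| columns | No | Columns to check (all numeric columns when omitted) | |
| method | No | Detection method: `iqr` or `zscore` | iqr |
| threshold | No | Threshold for outlier detection (IQR multiplier or z-score cutoff) | 1.5 |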
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
| No arguments | | | |
Implementation Reference
- Core handler function that detects outliers in numeric columns using IQR or Z-Score methods. Validates session/columns, computes outlier boundaries, and returns structured results with counts, percentages, and indices.
def _outlier_counts(row_index, outlier_mask, total_rows: int) -> dict[str, Any]:
    """Summarise a boolean outlier mask as count, percentage and row labels."""
    labels = row_index[outlier_mask].tolist()
    return {
        "outlier_count": len(labels),
        # The percentage is relative to every row of the frame, including rows
        # where this particular column is null.
        "outlier_percentage": round(len(labels) / total_rows * 100, 2),
        "outlier_indices": labels[:100],  # Limit to first 100
    }


def _summarize_iqr_outliers(series, row_index, total_rows: int, threshold: float) -> dict[str, Any]:
    """Flag values outside [Q1 - threshold * IQR, Q3 + threshold * IQR]."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr
    outlier_mask = (series < lower_bound) | (series > upper_bound)
    return {
        "method": "IQR",
        "lower_bound": float(lower_bound),
        "upper_bound": float(upper_bound),
        **_outlier_counts(row_index, outlier_mask, total_rows),
        "q1": float(q1),
        "q3": float(q3),
        "iqr": float(iqr),
    }


def _summarize_zscore_outliers(series, row_index, total_rows: int, threshold: float) -> dict[str, Any]:
    """Flag values whose absolute z-score exceeds ``threshold``."""
    # Compute the moments once; they are needed for both the mask and the report.
    mean = series.mean()
    std = series.std()
    if std > 0:
        outlier_mask = np.abs((series - mean) / std) > threshold
    else:
        # A zero or NaN standard deviation (constant column, or fewer than two
        # non-null values) makes z-scores undefined: report no outliers
        # explicitly instead of relying on NaN comparisons after dividing.
        outlier_mask = np.zeros(len(series), dtype=bool)
    return {
        "method": "Z-Score",
        "threshold": threshold,
        **_outlier_counts(row_index, outlier_mask, total_rows),
        "mean": float(mean),
        "std": float(std),
    }


async def detect_outliers(
    session_id: str,
    columns: list[str] | None = None,
    method: str = "iqr",
    threshold: float = 1.5,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Detect outliers in numeric columns.

    Args:
        session_id: Session identifier
        columns: Columns to check (None for all numeric). Requested columns
            that are not numeric are ignored.
        method: Detection method ('iqr' or 'zscore')
        threshold: IQR multiplier for 'iqr', or absolute z-score cutoff for
            'zscore'. The default of 1.5 is used for both methods, so z-score
            callers usually want to pass a larger value such as 3.
        ctx: FastMCP context (not used by this function)

    Returns:
        Dict with "success" set to True plus the per-column outlier
        information, or "success" set to False plus an "error" message.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        # Select numeric columns
        if columns:
            missing_cols = [col for col in columns if col not in df.columns]
            if missing_cols:
                return {"success": False, "error": f"Columns not found: {missing_cols}"}
            numeric_df = df[columns].select_dtypes(include=[np.number])
        else:
            numeric_df = df.select_dtypes(include=[np.number])

        # DataFrame.empty is also True for a frame that has columns but no
        # rows, so check the two situations separately to report the real cause.
        if numeric_df.shape[1] == 0:
            return {"success": False, "error": "No numeric columns found"}

        total_rows = len(df)
        if total_rows == 0:
            return {"success": False, "error": "No data rows to analyze"}

        summarizers = {
            "iqr": _summarize_iqr_outliers,
            "zscore": _summarize_zscore_outliers,
        }
        summarize = summarizers.get(method)
        if summarize is None:
            return {"success": False, "error": f"Unknown method: {method}"}

        outliers = {
            col: summarize(numeric_df[col], df.index, total_rows, threshold)
            for col in numeric_df.columns
        }

        # Summary statistics
        total_outliers = sum(info["outlier_count"] for info in outliers.values())

        session.record_operation(
            OperationType.ANALYZE,
            {
                "type": "outlier_detection",
                "method": method,
                "threshold": threshold,
                "columns": list(outliers.keys()),
            },
        )

        return {
            "success": True,
            "method": method,
            "threshold": threshold,
            "outliers": outliers,
            "total_outliers": total_outliers,
            "columns_analyzed": list(outliers.keys()),
        }

    except Exception as e:
        logger.error(f"Error detecting outliers: {e!s}")
        return {"success": False, "error": str(e)}

# - src/csv_editor/server.py:351-360 (registration)MCP tool registration using @mcp.tool decorator. Imports the handler from analytics module and delegates to it.
@mcp.tool
async def detect_outliers(
    session_id: str,
    columns: list[str] | None = None,
    method: str = "iqr",
    threshold: float = 1.5,
    ctx: Context = None,
) -> dict[str, Any]:
    """Detect outliers in numeric columns."""
    # NOTE(review): the docstring above appears to be surfaced as the tool's
    # description, so it is kept verbatim.
    # Thin registration shim: all of the work happens in the analytics handler.
    result = await _detect_outliers(
        session_id=session_id,
        columns=columns,
        method=method,
        threshold=threshold,
        ctx=ctx,
    )
    return result

# - src/csv_editor/server.py:80-94 (schema)Tool capability listing in the server info/health check response, documenting detect_outliers as part of data_analysis capabilities.
"data_analysis": [ "get_statistics", "correlation_matrix", "group_by_aggregate", "value_counts", "detect_outliers", "profile_data", ], "data_validation": ["validate_schema", "check_data_quality", "find_anomalies"], "session_management": ["multi_session_support", "session_isolation", "auto_cleanup"], }, "supported_formats": ["csv", "tsv", "json", "excel", "parquet", "html", "markdown"], "max_file_size_mb": int(os.getenv("CSV_MAX_FILE_SIZE", "1024")), "session_timeout_minutes": int(os.getenv("CSV_SESSION_TIMEOUT", "60")), } - src/csv_editor/server.py:289-295 (helper)Import of detect_outliers from the analytics module into the server's registration namespace.
from .tools.analytics import detect_outliers as _detect_outliers from .tools.analytics import get_column_statistics as _get_column_statistics from .tools.analytics import get_correlation_matrix as _get_correlation_matrix from .tools.analytics import get_statistics as _get_statistics from .tools.analytics import get_value_counts as _get_value_counts from .tools.analytics import group_by_aggregate as _group_by_aggregate from .tools.analytics import profile_data as _profile_data - Call to detect_outliers from within profile_data helper to include outlier information in the data profile.
if include_outliers: outlier_result = await detect_outliers(session_id, ctx=ctx) if outlier_result["success"]: profile["outliers"] = { col: {"count": info["outlier_count"], "percentage": info["outlier_percentage"]} for col, info in outlier_result["outliers"].items() }