compute_correlation
Calculate correlation matrices between numeric columns in CSV or SQLite files to identify relationships in tabular data using Pearson, Spearman, or Kendall methods.
Instructions
Compute correlation matrix between numeric columns.
Args:
file_path: Path to CSV or SQLite file
columns: List of columns to include (default: all numeric columns)
method: Correlation method - 'pearson' (default), 'spearman', or 'kendall'
Returns:
Dictionary containing:
- method: Correlation method used
- correlation_matrix: Full correlation matrix
- top_correlations: Top 10 strongest correlations (excluding self-correlations)
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| file_path | Yes | Path to CSV or SQLite file | |
| columns | No | List of columns to include (default: all numeric columns) | |
| method | No | Correlation method: 'pearson', 'spearman', or 'kendall' | pearson |
Implementation Reference
# src/mcp_tabular/server.py:236-297 (handler)
# Primary handler for the 'compute_correlation' tool. Loads the dataset,
# validates inputs, computes the correlation matrix via pandas.corr() with the
# requested method, extracts the top 10 pairwise correlations (excluding
# self-correlations), labels their strength, and returns a structured dict.
@mcp.tool()
def compute_correlation(
    file_path: str,
    columns: list[str] | None = None,
    method: str = "pearson",
) -> dict[str, Any]:
    """
    Compute correlation matrix between numeric columns.

    Args:
        file_path: Path to CSV or SQLite file
        columns: List of columns to include (default: all numeric columns)
        method: Correlation method - 'pearson' (default), 'spearman', or 'kendall'

    Returns:
        Dictionary containing:
        - method: Correlation method used
        - columns_analyzed: Numeric columns included in the matrix
        - correlation_matrix: Full correlation matrix
        - top_correlations: Top 10 strongest correlations (excluding self-correlations)

    Raises:
        ValueError: If the method is unknown, a requested column is missing,
            or fewer than 2 numeric columns are available.
    """
    # Validate the method up front so a typo yields a clear error here
    # instead of an obscure failure deep inside pandas.corr().
    valid_methods = {"pearson", "spearman", "kendall"}
    if method not in valid_methods:
        raise ValueError(
            f"Invalid method: {method!r}. Choose one of {sorted(valid_methods)}"
        )

    df = _load_data(file_path)

    # Restrict to the requested columns (validated), or all numeric columns.
    if columns:
        invalid = [c for c in columns if c not in df.columns]
        if invalid:
            raise ValueError(f"Columns not found: {invalid}")
        numeric_df = df[columns].select_dtypes(include=[np.number])
    else:
        numeric_df = df.select_dtypes(include=[np.number])

    if len(numeric_df.columns) < 2:
        raise ValueError("Need at least 2 numeric columns for correlation")

    corr_matrix = numeric_df.corr(method=method)

    # Collect upper-triangle pairs only: skips the diagonal (self-correlation)
    # and each symmetric duplicate; NaNs (e.g. constant columns) are dropped.
    correlations = []
    cols = corr_matrix.columns
    for i, col1 in enumerate(cols):
        for col2 in cols[i + 1:]:
            corr_value = corr_matrix.loc[col1, col2]
            if not np.isnan(corr_value):
                correlations.append({
                    "column1": col1,
                    "column2": col2,
                    "correlation": round(float(corr_value), 4),
                    "strength": _interpret_correlation(abs(corr_value)),
                })

    # Strongest relationships first, by absolute value.
    correlations.sort(key=lambda x: abs(x["correlation"]), reverse=True)

    return {
        "method": method,
        "columns_analyzed": cols.tolist(),
        "correlation_matrix": corr_matrix.round(4).to_dict(),
        "top_correlations": correlations[:10],
    }
- src/mcp_tabular/server.py:299-311 (helper)Helper function called by compute_correlation to categorize the strength of correlations into 'very_strong', 'strong', 'moderate', 'weak', or 'negligible' based on absolute value thresholds.def _interpret_correlation(value: float) -> str: """Interpret correlation strength.""" if value >= 0.9: return "very_strong" elif value >= 0.7: return "strong" elif value >= 0.5: return "moderate" elif value >= 0.3: return "weak" else: return "negligible"
- src/mcp_tabular/server.py:103-106 (helper)Helper function used by compute_correlation to identify numeric columns in the dataframe for correlation analysis.def _get_numeric_columns(df: pd.DataFrame) -> list[str]: """Get list of numeric column names.""" return df.select_dtypes(include=[np.number]).columns.tolist()
# src/mcp_tabular/server.py:70-101 (helper)
# Shared helper used by compute_correlation to load tabular data from CSV or
# SQLite files into a pandas DataFrame.
def _load_data(file_path: str) -> pd.DataFrame:
    """Load data from CSV or SQLite file.

    Args:
        file_path: Path to a .csv file or a .db/.sqlite/.sqlite3 database.
            For SQLite, the first table listed in sqlite_master is loaded.

    Returns:
        The file contents as a pandas DataFrame.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If the database has no tables or the extension is unknown.
    """
    path = _resolve_path(file_path)
    if not path.exists():
        raise FileNotFoundError(
            f"File not found: {file_path}\n"
            f"Resolved to: {path}\n"
            f"Project root: {_PROJECT_ROOT}\n"
            f"Current working directory: {Path.cwd()}"
        )

    suffix = path.suffix.lower()
    if suffix == ".csv":
        return pd.read_csv(str(path))
    elif suffix in (".db", ".sqlite", ".sqlite3"):
        # For SQLite, load the first table found in the database.
        conn = sqlite3.connect(str(path))
        try:
            tables = pd.read_sql_query(
                "SELECT name FROM sqlite_master WHERE type='table'", conn
            )
            if tables.empty:
                raise ValueError(f"No tables found in SQLite database: {file_path}")
            first_table = tables.iloc[0]["name"]
            # Quote the identifier (doubling any embedded quotes) so unusual
            # table names cannot break or alter the query.
            safe_table = first_table.replace('"', '""')
            return pd.read_sql_query(f'SELECT * FROM "{safe_table}"', conn)
        finally:
            # Always release the connection, including on read/query errors
            # (the original leaked it when read_sql_query raised).
            conn.close()
    else:
        raise ValueError(f"Unsupported file format: {suffix}. Use .csv or .db/.sqlite")