# auto_insights
Generate automated insights from CSV or SQLite datasets to support quick data exploration and understanding.
## Instructions
Automatically generate interesting insights about a dataset. Perfect for quick data exploration and understanding.

Args:

- `file_path`: Path to CSV or SQLite file
- `max_insights`: Maximum number of insights to generate (default 10)

Returns: A dictionary containing the automatically discovered insights.
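As an illustration, here is a minimal sketch of a direct call and how the result can be consumed. The file path and printed values are hypothetical; the result keys (`file`, `total_insights`, `insights`, `recommendation`) and the per-insight fields (`category`, `title`, `insight`, `importance`) come from the handler shown under Implementation Reference.

```python
# Hypothetical invocation; "data/sales.csv" is an assumed example path.
result = auto_insights("data/sales.csv", max_insights=5)

print(result["total_insights"])  # at most 5 insights are returned
for item in result["insights"]:
    # Each insight carries a category, a short title, the insight text, and an importance level.
    print(f"[{item['importance']}] {item['title']}: {item['insight']}")
print(result["recommendation"])
```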
## Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| file_path | Yes | Path to CSV or SQLite file | |
| max_insights | No | Maximum number of insights to generate | 10 |
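For reference, two argument sets that satisfy this schema might look as follows (the path is an assumed example); omitting `max_insights` falls back to the default of 10.

```python
# Illustrative tool-call arguments; the CSV path is hypothetical.
minimal_args = {"file_path": "data/sales.csv"}                    # max_insights defaults to 10
tuned_args = {"file_path": "data/sales.csv", "max_insights": 5}   # cap the number of insights
```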
## Implementation Reference
- `src/mcp_tabular/server.py:1391-1547` (handler): The primary handler function for the `auto_insights` MCP tool. It automatically generates insights about a tabular dataset, covering overview, data quality, distributions, correlations, outliers, and more. Registered via the `@mcp.tool()` decorator on the FastMCP instance.

  ```python
  @mcp.tool()
  def auto_insights(file_path: str, max_insights: int = 10) -> dict[str, Any]:
      """
      Automatically generate interesting insights about a dataset.

      Perfect for quick data exploration and understanding.

      Args:
          file_path: Path to CSV or SQLite file
          max_insights: Maximum number of insights to generate (default 10)

      Returns:
          Dictionary containing automatically discovered insights
      """
      df = _load_data(file_path)
      insights = []

      numeric_cols = _get_numeric_columns(df)
      cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

      # 1. Dataset overview
      insights.append({
          "category": "overview",
          "title": "Dataset Size",
          "insight": f"The dataset contains {len(df):,} rows and {len(df.columns)} columns ({len(numeric_cols)} numeric, {len(cat_cols)} categorical).",
          "importance": "high",
      })

      # 2. Missing data insights
      missing_total = df.isnull().sum().sum()
      if missing_total > 0:
          most_missing_col = df.isnull().sum().idxmax()
          most_missing_pct = df[most_missing_col].isnull().mean() * 100
          insights.append({
              "category": "data_quality",
              "title": "Missing Values Alert",
              "insight": f"Found {missing_total:,} missing values total. Column '{most_missing_col}' has the most missing data ({most_missing_pct:.1f}%).",
              "importance": "high" if most_missing_pct > 20 else "medium",
          })

      # 3. Numeric column insights
      for col in numeric_cols[:5]:  # Limit to first 5
          col_data = df[col].dropna()
          if len(col_data) == 0:
              continue

          # Skewness insight
          skew = col_data.skew()
          if abs(skew) > 1:
              direction = "right (positive)" if skew > 0 else "left (negative)"
              insights.append({
                  "category": "distribution",
                  "title": f"Skewed Distribution: {col}",
                  "insight": f"'{col}' is highly skewed to the {direction} (skewness: {skew:.2f}). Consider log transformation for analysis.",
                  "importance": "medium",
              })

          # Outlier insight
          q1, q3 = col_data.quantile([0.25, 0.75])
          iqr = q3 - q1
          outliers = ((col_data < q1 - 1.5 * iqr) | (col_data > q3 + 1.5 * iqr)).sum()
          if outliers > 0:
              outlier_pct = outliers / len(col_data) * 100
              if outlier_pct > 5:
                  insights.append({
                      "category": "outliers",
                      "title": f"Outliers Detected: {col}",
                      "insight": f"'{col}' has {outliers} outliers ({outlier_pct:.1f}% of data). Range: {col_data.min():.2f} to {col_data.max():.2f}.",
                      "importance": "medium",
                  })

      # 4. Top correlations
      if len(numeric_cols) >= 2:
          corr_matrix = df[numeric_cols].corr()
          for i, col1 in enumerate(corr_matrix.columns):
              for j, col2 in enumerate(corr_matrix.columns):
                  if i < j:
                      corr_val = corr_matrix.loc[col1, col2]
                      if abs(corr_val) > 0.7 and not np.isnan(corr_val):
                          direction = "positive" if corr_val > 0 else "negative"
                          insights.append({
                              "category": "correlation",
                              "title": f"Strong Correlation Found",
                              "insight": f"'{col1}' and '{col2}' have a strong {direction} correlation ({corr_val:.2f}).",
                              "importance": "high",
                          })

      # 5. Categorical column insights
      for col in cat_cols[:3]:  # Limit to first 3
          unique_count = df[col].nunique()
          value_counts = df[col].value_counts()

          if unique_count <= 1:
              insights.append({
                  "category": "data_quality",
                  "title": f"Constant Column: {col}",
                  "insight": f"'{col}' has only {unique_count} unique value(s). Consider removing.",
                  "importance": "medium",
              })
          elif unique_count == len(df):
              insights.append({
                  "category": "data_structure",
                  "title": f"Unique Identifier: {col}",
                  "insight": f"'{col}' has all unique values - likely a primary key/identifier.",
                  "importance": "low",
              })
          else:
              top_value = value_counts.index[0]
              top_pct = value_counts.iloc[0] / len(df) * 100
              if top_pct > 50:
                  insights.append({
                      "category": "distribution",
                      "title": f"Dominant Category: {col}",
                      "insight": f"'{col}' is dominated by '{top_value}' ({top_pct:.1f}% of data).",
                      "importance": "medium",
                  })

      # 6. Date column detection
      for col in df.columns:
          if df[col].dtype == 'object':
              try:
                  parsed = pd.to_datetime(df[col], errors='coerce')
                  if parsed.notna().sum() > len(df) * 0.5:  # More than 50% valid dates
                      date_range = parsed.dropna()
                      insights.append({
                          "category": "temporal",
                          "title": f"Date Column Detected: {col}",
                          "insight": f"'{col}' contains dates from {date_range.min().date()} to {date_range.max().date()} ({(date_range.max() - date_range.min()).days} days span).",
                          "importance": "medium",
                      })
                      break  # Only report first date column
              except:
                  pass

      # 7. Summary statistics insight
      if numeric_cols:
          main_col = numeric_cols[0]
          main_data = df[main_col].dropna()
          if len(main_data) > 0:
              insights.append({
                  "category": "statistics",
                  "title": f"Key Metric Summary: {main_col}",
                  "insight": f"'{main_col}' ranges from {main_data.min():,.2f} to {main_data.max():,.2f} with mean {main_data.mean():,.2f} and median {main_data.median():,.2f}.",
                  "importance": "medium",
              })

      # Sort by importance and limit
      importance_order = {"high": 0, "medium": 1, "low": 2}
      insights.sort(key=lambda x: importance_order.get(x["importance"], 2))
      insights = insights[:max_insights]

      return {
          "file": file_path,
          "total_insights": len(insights),
          "insights": insights,
          "recommendation": "Use describe_dataset() for detailed statistics, or specific tools like detect_anomalies() to investigate further.",
      }
  ```
- `src/mcp_tabular/server.py:103-106` (helper): Helper function used by `auto_insights` to identify numeric columns for analysis.

  ```python
  def _get_numeric_columns(df: pd.DataFrame) -> list[str]:
      """Get list of numeric column names."""
      return df.select_dtypes(include=[np.number]).columns.tolist()
  ```
- `src/mcp_tabular/server.py:70-101` (helper): Core helper function used by `auto_insights` to load and parse CSV or SQLite datasets into a pandas DataFrame.

  ```python
  def _load_data(file_path: str) -> pd.DataFrame:
      """Load data from CSV or SQLite file."""
      path = _resolve_path(file_path)

      if not path.exists():
          raise FileNotFoundError(
              f"File not found: {file_path}\n"
              f"Resolved to: {path}\n"
              f"Project root: {_PROJECT_ROOT}\n"
              f"Current working directory: {Path.cwd()}"
          )

      suffix = path.suffix.lower()

      if suffix == ".csv":
          return pd.read_csv(str(path))
      elif suffix in (".db", ".sqlite", ".sqlite3"):
          # For SQLite, list tables or load first table
          conn = sqlite3.connect(str(path))
          tables = pd.read_sql_query(
              "SELECT name FROM sqlite_master WHERE type='table'", conn
          )
          if tables.empty:
              conn.close()
              raise ValueError(f"No tables found in SQLite database: {file_path}")
          first_table = tables.iloc[0]["name"]
          df = pd.read_sql_query(f"SELECT * FROM {first_table}", conn)
          conn.close()
          return df
      else:
          raise ValueError(f"Unsupported file format: {suffix}. Use .csv or .db/.sqlite")
  ```
- `src/mcp_tabular/server.py:37-40` (registration): Initialization of the FastMCP server instance to which all `@mcp.tool()` decorators register their tools, including `auto_insights`.

  ```python
  mcp = FastMCP(
      "Tabular Data Analysis",
      dependencies=["pandas", "numpy", "scipy"],
  )
  ```
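For local experimentation, the tool can be exercised through any MCP client once the server is running. The snippet below is a minimal launch sketch, assuming the standard FastMCP entry point (`mcp.run()`) and that the server module does not already define its own main block; note that for SQLite inputs, `_load_data` above analyzes only the first table listed in `sqlite_master`.

```python
# Minimal launch sketch, assuming the standard FastMCP entry point.
# The actual server module may already expose its own run/main function.
if __name__ == "__main__":
    mcp.run()  # serves the registered tools, including auto_insights, over the default transport
```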