Skip to main content
Glama
K02D

MCP Tabular Data Analysis Server

by K02D

auto_insights

Automatically generate key insights from CSV or SQLite datasets to support quick data exploration and understanding.

Instructions

Automatically generate interesting insights about a dataset.
Perfect for quick data exploration and understanding.

Args:
    file_path: Path to CSV or SQLite file
    max_insights: Maximum number of insights to generate (default 10)

Returns:
    Dictionary containing automatically discovered insights

Input Schema

Table | JSON Schema

Name | Required | Description | Default
file_path | Yes | Path to CSV or SQLite file | —
max_insights | No | Maximum number of insights to generate | 10

Implementation Reference

  • The main handler function for the 'auto_insights' tool. It loads the dataset, analyzes various aspects like missing values, distributions, correlations, categorical insights, date detection, and generates a list of prioritized insights.
    def auto_insights(file_path: str, max_insights: int = 10) -> dict[str, Any]:
        """
        Automatically generate interesting insights about a dataset.
        Perfect for quick data exploration and understanding.

        Args:
            file_path: Path to CSV or SQLite file
            max_insights: Maximum number of insights to generate (default 10)

        Returns:
            Dictionary containing automatically discovered insights, sorted by
            importance (high, then medium, then low) and capped at max_insights.
        """
        df = _load_data(file_path)
        insights: list[dict[str, Any]] = []

        numeric_cols = _get_numeric_columns(df)
        cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

        # 1. Dataset overview
        insights.append({
            "category": "overview",
            "title": "Dataset Size",
            "insight": f"The dataset contains {len(df):,} rows and {len(df.columns)} columns ({len(numeric_cols)} numeric, {len(cat_cols)} categorical).",
            "importance": "high",
        })

        # 2. Missing data insights (compute per-column counts once and reuse)
        missing_per_col = df.isnull().sum()
        missing_total = missing_per_col.sum()
        if missing_total > 0:
            most_missing_col = missing_per_col.idxmax()
            most_missing_pct = df[most_missing_col].isnull().mean() * 100
            insights.append({
                "category": "data_quality",
                "title": "Missing Values Alert",
                "insight": f"Found {missing_total:,} missing values total. Column '{most_missing_col}' has the most missing data ({most_missing_pct:.1f}%).",
                "importance": "high" if most_missing_pct > 20 else "medium",
            })

        # 3. Numeric column insights (skewness and IQR outliers)
        for col in numeric_cols[:5]:  # Limit to first 5
            col_data = df[col].dropna()
            if len(col_data) == 0:
                continue

            # Skewness insight (|skew| > 1 is conventionally "highly skewed")
            skew = col_data.skew()
            if abs(skew) > 1:
                direction = "right (positive)" if skew > 0 else "left (negative)"
                insights.append({
                    "category": "distribution",
                    "title": f"Skewed Distribution: {col}",
                    "insight": f"'{col}' is highly skewed to the {direction} (skewness: {skew:.2f}). Consider log transformation for analysis.",
                    "importance": "medium",
                })

            # Outlier insight via the 1.5*IQR fence; only reported when >5% of
            # values fall outside, to avoid noise on heavy-tailed data
            q1, q3 = col_data.quantile([0.25, 0.75])
            iqr = q3 - q1
            outliers = ((col_data < q1 - 1.5 * iqr) | (col_data > q3 + 1.5 * iqr)).sum()
            if outliers > 0:
                outlier_pct = outliers / len(col_data) * 100
                if outlier_pct > 5:
                    insights.append({
                        "category": "outliers",
                        "title": f"Outliers Detected: {col}",
                        "insight": f"'{col}' has {outliers} outliers ({outlier_pct:.1f}% of data). Range: {col_data.min():.2f} to {col_data.max():.2f}.",
                        "importance": "medium",
                    })

        # 4. Top correlations (upper triangle only, so each pair reports once)
        if len(numeric_cols) >= 2:
            corr_matrix = df[numeric_cols].corr()
            for i, col1 in enumerate(corr_matrix.columns):
                for j, col2 in enumerate(corr_matrix.columns):
                    if i < j:
                        corr_val = corr_matrix.loc[col1, col2]
                        if abs(corr_val) > 0.7 and not np.isnan(corr_val):
                            direction = "positive" if corr_val > 0 else "negative"
                            insights.append({
                                "category": "correlation",
                                "title": "Strong Correlation Found",
                                "insight": f"'{col1}' and '{col2}' have a strong {direction} correlation ({corr_val:.2f}).",
                                "importance": "high",
                            })

        # 5. Categorical column insights
        for col in cat_cols[:3]:  # Limit to first 3
            unique_count = df[col].nunique()
            value_counts = df[col].value_counts()

            if unique_count <= 1:
                # Constant (or entirely-null) column carries no information
                insights.append({
                    "category": "data_quality",
                    "title": f"Constant Column: {col}",
                    "insight": f"'{col}' has only {unique_count} unique value(s). Consider removing.",
                    "importance": "medium",
                })
            elif unique_count == len(df):
                insights.append({
                    "category": "data_structure",
                    "title": f"Unique Identifier: {col}",
                    "insight": f"'{col}' has all unique values - likely a primary key/identifier.",
                    "importance": "low",
                })
            else:
                top_value = value_counts.index[0]
                top_pct = value_counts.iloc[0] / len(df) * 100
                if top_pct > 50:
                    insights.append({
                        "category": "distribution",
                        "title": f"Dominant Category: {col}",
                        "insight": f"'{col}' is dominated by '{top_value}' ({top_pct:.1f}% of data).",
                        "importance": "medium",
                    })

        # 6. Date column detection: any object column where >50% of values
        # parse as dates is reported (first such column only)
        for col in df.columns:
            if df[col].dtype == 'object':
                try:
                    parsed = pd.to_datetime(df[col], errors='coerce')
                    if parsed.notna().sum() > len(df) * 0.5:  # More than 50% valid dates
                        date_range = parsed.dropna()
                        insights.append({
                            "category": "temporal",
                            "title": f"Date Column Detected: {col}",
                            "insight": f"'{col}' contains dates from {date_range.min().date()} to {date_range.max().date()} ({(date_range.max() - date_range.min()).days} days span).",
                            "importance": "medium",
                        })
                        break  # Only report first date column
                except (ValueError, TypeError, OverflowError):
                    # Column cannot be interpreted as dates; skip it rather
                    # than abort the whole analysis
                    pass

        # 7. Summary statistics insight for the first numeric column
        if numeric_cols:
            main_col = numeric_cols[0]
            main_data = df[main_col].dropna()
            if len(main_data) > 0:
                insights.append({
                    "category": "statistics",
                    "title": f"Key Metric Summary: {main_col}",
                    "insight": f"'{main_col}' ranges from {main_data.min():,.2f} to {main_data.max():,.2f} with mean {main_data.mean():,.2f} and median {main_data.median():,.2f}.",
                    "importance": "medium",
                })

        # Sort by importance (stable sort preserves discovery order within a
        # tier) and truncate to the requested number of insights
        importance_order = {"high": 0, "medium": 1, "low": 2}
        insights.sort(key=lambda x: importance_order.get(x["importance"], 2))
        insights = insights[:max_insights]

        return {
            "file": file_path,
            "total_insights": len(insights),
            "insights": insights,
            "recommendation": "Use describe_dataset() for detailed statistics, or specific tools like detect_anomalies() to investigate further.",
        }
  • Helper function used by auto_insights to identify numeric columns for analysis.
    def _get_numeric_columns(df: pd.DataFrame) -> list[str]:
        """Get list of numeric column names."""
        return df.select_dtypes(include=[np.number]).columns.tolist()
  • Core helper function used by auto_insights to load the dataset from CSV or SQLite files.
    def _load_data(file_path: str) -> pd.DataFrame:
        """Load data from CSV or SQLite file.

        Args:
            file_path: Path to a .csv, .db, .sqlite, or .sqlite3 file,
                resolved via _resolve_path.

        Returns:
            A DataFrame with the CSV contents, or the first table of the
            SQLite database.

        Raises:
            FileNotFoundError: If the resolved path does not exist.
            ValueError: If the extension is unsupported or the SQLite
                database contains no tables.
        """
        path = _resolve_path(file_path)

        if not path.exists():
            raise FileNotFoundError(
                f"File not found: {file_path}\n"
                f"Resolved to: {path}\n"
                f"Project root: {_PROJECT_ROOT}\n"
                f"Current working directory: {Path.cwd()}"
            )

        suffix = path.suffix.lower()

        if suffix == ".csv":
            return pd.read_csv(str(path))
        elif suffix in (".db", ".sqlite", ".sqlite3"):
            # For SQLite, load the first table found in the schema.
            # try/finally guarantees the connection is closed even if a
            # query fails (the original leaked it on error).
            conn = sqlite3.connect(str(path))
            try:
                tables = pd.read_sql_query(
                    "SELECT name FROM sqlite_master WHERE type='table'", conn
                )
                if tables.empty:
                    raise ValueError(f"No tables found in SQLite database: {file_path}")
                first_table = tables.iloc[0]["name"]
                # Double-quote the identifier so table names containing
                # spaces or SQL keywords work; the name comes from
                # sqlite_master, not from user input.
                return pd.read_sql_query(f'SELECT * FROM "{first_table}"', conn)
            finally:
                conn.close()
        else:
            raise ValueError(f"Unsupported file format: {suffix}. Use .csv or .db/.sqlite")
  • The @mcp.tool() decorator registers the auto_insights function as an MCP tool.
    def auto_insights(file_path: str, max_insights: int = 10) -> dict[str, Any]:

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/K02D/mcp-tabular'

If you have feedback or need assistance with the MCP directory API, please join our Discord server