data_summary
Generate a detailed summary of Excel sheet data by analyzing file content. Identifies key statistics and insights for structured data analysis and reporting.
Instructions
Generate a comprehensive summary of the data in an Excel file.
Args:
file_path: Path to the Excel file
sheet_name: Name of the sheet to summarize (for Excel files)
Returns:
Comprehensive data summary as string
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| file_path | Yes | Path to the Excel file | |
| sheet_name | No | Name of the sheet to summarize (for Excel files) | null |
Input Schema (JSON Schema)
{
"properties": {
"file_path": {
"title": "File Path",
"type": "string"
},
"sheet_name": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Sheet Name"
}
},
"required": [
"file_path"
],
"title": "data_summaryArguments",
"type": "object"
}
Implementation Reference
- mcp_excel_server/server.py:557-650 (handler)The handler function for the 'data_summary' MCP tool. It is decorated with @mcp.tool(), which automatically registers it with the name 'data_summary' based on the function name. The function reads the Excel file (or CSV/TSV/JSON), computes comprehensive statistics including file info, data structure, quality metrics, and summaries for numeric, categorical, and datetime columns, then returns a JSON string summary.@mcp.tool() def data_summary(file_path: str, sheet_name: Optional[str] = None) -> str: """ Generate a comprehensive summary of the data in an Excel file. Args: file_path: Path to the Excel file sheet_name: Name of the sheet to summarize (for Excel files) Returns: Comprehensive data summary as string """ try: # Read file _, ext = os.path.splitext(file_path) ext = ext.lower() read_params = {} if ext in ['.xlsx', '.xls', '.xlsm'] and sheet_name is not None: read_params["sheet_name"] = sheet_name if ext in ['.xlsx', '.xls', '.xlsm']: df = pd.read_excel(file_path, **read_params) elif ext == '.csv': df = pd.read_csv(file_path) elif ext == '.tsv': df = pd.read_csv(file_path, sep='\t') elif ext == '.json': df = pd.read_json(file_path) else: return f"Unsupported file extension: {ext}" # Basic file info file_info = { "file_name": os.path.basename(file_path), "file_type": ext, "file_size": f"{os.path.getsize(file_path) / 1024:.2f} KB", "last_modified": datetime.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y-%m-%d %H:%M:%S') } # Data structure data_structure = { "rows": df.shape[0], "columns": df.shape[1], "column_names": list(df.columns), "column_types": {col: str(dtype) for col, dtype in df.dtypes.items()}, "memory_usage": f"{df.memory_usage(deep=True).sum() / 1024:.2f} KB" } # Data quality data_quality = { "missing_values": {col: int(count) for col, count in df.isnull().sum().items()}, "missing_percentage": {col: f"{count/len(df)*100:.2f}%" for col, count in df.isnull().sum().items()}, "duplicate_rows": 
int(df.duplicated().sum()), "unique_values": {col: int(df[col].nunique()) for col in df.columns} } # Statistical summary numeric_cols = df.select_dtypes(include=['number']).columns categorical_cols = df.select_dtypes(include=['object', 'category']).columns datetime_cols = df.select_dtypes(include=['datetime', 'datetime64']).columns statistics = {} if len(numeric_cols) > 0: statistics["numeric"] = df[numeric_cols].describe().to_dict() if len(categorical_cols) > 0: statistics["categorical"] = { col: { "unique_values": int(df[col].nunique()), "top_values": df[col].value_counts().head(5).to_dict() } for col in categorical_cols } if len(datetime_cols) > 0: statistics["datetime"] = { col: { "min": df[col].min().strftime('%Y-%m-%d') if pd.notna(df[col].min()) else None, "max": df[col].max().strftime('%Y-%m-%d') if pd.notna(df[col].max()) else None, "range_days": (df[col].max() - df[col].min()).days if pd.notna(df[col].min()) and pd.notna(df[col].max()) else None } for col in datetime_cols } # Combine all info summary = { "file_info": file_info, "data_structure": data_structure, "data_quality": data_quality, "statistics": statistics } return json.dumps(summary, indent=2, default=str) except Exception as e: return f"Error generating summary: {str(e)}"