
MCP Databricks Server

from databricks.sdk import WorkspaceClient
from databricks.sdk.core import Config
from databricks.sdk.service.catalog import TableInfo, SchemaInfo, ColumnInfo, CatalogInfo
from databricks.sdk.service.sql import StatementResponse, StatementState
from typing import Dict, Any, List
import os
import json
import time
from dotenv import load_dotenv

# Load environment variables from .env file when the module is imported
load_dotenv()

DATABRICKS_HOST = os.environ.get("DATABRICKS_HOST")
DATABRICKS_TOKEN = os.environ.get("DATABRICKS_TOKEN")
DATABRICKS_SQL_WAREHOUSE_ID = os.environ.get("DATABRICKS_SQL_WAREHOUSE_ID")

if not DATABRICKS_HOST or not DATABRICKS_TOKEN:
    raise ImportError(
        "DATABRICKS_HOST and DATABRICKS_TOKEN must be set in environment variables or .env file "
        "for databricks_sdk_utils to initialize."
    )

# Configure and initialize the global SDK client
# Using short timeouts as previously determined to be effective
sdk_config = Config(
    host=DATABRICKS_HOST,
    token=DATABRICKS_TOKEN,
    http_timeout_seconds=30,
    retry_timeout_seconds=60
)
sdk_client = WorkspaceClient(config=sdk_config)

# Cache for job information to avoid redundant API calls
_job_cache = {}
_notebook_cache = {}


def _format_column_details_md(columns: List[ColumnInfo]) -> List[str]:
    """
    Formats a list of ColumnInfo objects into a list of Markdown strings.
    """
    markdown_lines = []
    if not columns:
        markdown_lines.append(" - *No column information available.*")
        return markdown_lines
    for col in columns:
        if not isinstance(col, ColumnInfo):
            print(f"Warning: Encountered an unexpected item in columns list: {type(col)}. Skipping.")
            continue
        col_type = col.type_text or (col.type_name.value if col.type_name and hasattr(col.type_name, 'value') else "N/A")
        nullable_status = "nullable" if col.nullable else "not nullable"
        col_description = f": {col.comment}" if col.comment else ""
        markdown_lines.append(f" - **{col.name}** (`{col_type}`, {nullable_status}){col_description}")
    return markdown_lines


def _get_job_info_cached(job_id: str) -> Dict[str, Any]:
    """Get job information with caching to avoid redundant API calls"""
    if job_id not in _job_cache:
        try:
            job_info = sdk_client.jobs.get(job_id=job_id)
            _job_cache[job_id] = {
                'name': job_info.settings.name if job_info.settings.name else f"Job {job_id}",
                'tasks': []
            }
            # Pre-process all tasks to build notebook mapping
            if job_info.settings.tasks:
                for task in job_info.settings.tasks:
                    if hasattr(task, 'notebook_task') and task.notebook_task:
                        task_info = {
                            'task_key': task.task_key,
                            'notebook_path': task.notebook_task.notebook_path
                        }
                        _job_cache[job_id]['tasks'].append(task_info)
        except Exception as e:
            print(f"Error fetching job {job_id}: {e}")
            _job_cache[job_id] = {
                'name': f"Job {job_id}",
                'tasks': [],
                'error': str(e)
            }
    return _job_cache[job_id]


def _get_notebook_id_cached(notebook_path: str) -> str:
    """Get notebook ID with caching to avoid redundant API calls"""
    if notebook_path not in _notebook_cache:
        try:
            notebook_details = sdk_client.workspace.get_status(notebook_path)
            _notebook_cache[notebook_path] = str(notebook_details.object_id)
        except Exception as e:
            print(f"Error fetching notebook {notebook_path}: {e}")
            _notebook_cache[notebook_path] = None
    return _notebook_cache[notebook_path]


def _resolve_notebook_info_optimized(notebook_id: str, job_id: str) -> Dict[str, Any]:
    """
    Optimized version that resolves notebook info using cached job data.
    Returns dict with notebook_path, notebook_name, job_name, and task_key.
    """
    result = {
        'notebook_id': notebook_id,
        'notebook_path': f"notebook_id:{notebook_id}",
        'notebook_name': f"notebook_id:{notebook_id}",
        'job_id': job_id,
        'job_name': f"Job {job_id}",
        'task_key': None
    }
    # Get cached job info
    job_info = _get_job_info_cached(job_id)
    result['job_name'] = job_info['name']
    # Look for notebook in job tasks
    for task_info in job_info['tasks']:
        notebook_path = task_info['notebook_path']
        cached_notebook_id = _get_notebook_id_cached(notebook_path)
        if cached_notebook_id == notebook_id:
            result['notebook_path'] = notebook_path
            result['notebook_name'] = notebook_path.split('/')[-1]
            result['task_key'] = task_info['task_key']
            break
    return result


def _format_notebook_info_optimized(notebook_info: Dict[str, Any]) -> str:
    """
    Formats notebook information using pre-resolved data.
    """
    lines = []
    if notebook_info['notebook_path'].startswith('/'):
        lines.append(f"**`{notebook_info['notebook_name']}`**")
        lines.append(f" - **Path**: `{notebook_info['notebook_path']}`")
    else:
        lines.append(f"**{notebook_info['notebook_name']}**")
    lines.append(f" - **Job**: {notebook_info['job_name']} (ID: {notebook_info['job_id']})")
    if notebook_info['task_key']:
        lines.append(f" - **Task**: {notebook_info['task_key']}")
    return "\n".join(lines)


def _process_lineage_results(lineage_query_output: Dict[str, Any], main_table_full_name: str) -> Dict[str, Any]:
    """
    Optimized version of lineage processing that batches API calls and uses caching.
    """
    print("Processing lineage results with optimization...")
    start_time = time.time()
    processed_data: Dict[str, Any] = {
        "upstream_tables": [],
        "downstream_tables": [],
        "notebooks_reading": [],
        "notebooks_writing": []
    }
    if not lineage_query_output or lineage_query_output.get("status") != "success" or not isinstance(lineage_query_output.get("data"), list):
        print("Warning: Lineage query output is invalid or not successful. Returning empty lineage.")
        return processed_data

    upstream_set = set()
    downstream_set = set()
    notebooks_reading_dict = {}
    notebooks_writing_dict = {}

    # Collect all unique job IDs first for batch processing
    unique_job_ids = set()
    notebook_job_pairs = []

    for row in lineage_query_output["data"]:
        source_table = row.get("source_table_full_name")
        target_table = row.get("target_table_full_name")
        entity_metadata = row.get("entity_metadata")

        # Parse entity metadata
        notebook_id = None
        job_id = None
        if entity_metadata:
            try:
                if isinstance(entity_metadata, str):
                    metadata_dict = json.loads(entity_metadata)
                else:
                    metadata_dict = entity_metadata
                notebook_id = metadata_dict.get("notebook_id")
                job_info = metadata_dict.get("job_info")
                if job_info:
                    job_id = job_info.get("job_id")
            except (json.JSONDecodeError, AttributeError):
                pass

        # Process table-to-table lineage
        if source_table == main_table_full_name and target_table and target_table != main_table_full_name:
            downstream_set.add(target_table)
        elif target_table == main_table_full_name and source_table and source_table != main_table_full_name:
            upstream_set.add(source_table)

        # Collect notebook-job pairs for batch processing
        if notebook_id and job_id:
            unique_job_ids.add(job_id)
            notebook_job_pairs.append({
                'notebook_id': notebook_id,
                'job_id': job_id,
                'source_table': source_table,
                'target_table': target_table
            })

    # Pre-load all job information in parallel (this is where the optimization happens)
    print(f"Pre-loading {len(unique_job_ids)} unique jobs...")
    batch_start = time.time()
    for job_id in unique_job_ids:
        _get_job_info_cached(job_id)  # This will cache the job info
    batch_time = time.time() - batch_start
    print(f"Job batch loading took {batch_time:.2f} seconds")

    # Now process all notebook-job pairs using cached data
    print(f"Processing {len(notebook_job_pairs)} notebook entries...")
    for pair in notebook_job_pairs:
        notebook_info = _resolve_notebook_info_optimized(pair['notebook_id'], pair['job_id'])
        formatted_info = _format_notebook_info_optimized(notebook_info)
        if pair['source_table'] == main_table_full_name:
            notebooks_reading_dict[pair['notebook_id']] = formatted_info
        elif pair['target_table'] == main_table_full_name:
            notebooks_writing_dict[pair['notebook_id']] = formatted_info

    processed_data["upstream_tables"] = sorted(list(upstream_set))
    processed_data["downstream_tables"] = sorted(list(downstream_set))
    processed_data["notebooks_reading"] = sorted(list(notebooks_reading_dict.values()))
    processed_data["notebooks_writing"] = sorted(list(notebooks_writing_dict.values()))

    total_time = time.time() - start_time
    print(f"Total lineage processing took {total_time:.2f} seconds")
    return processed_data


def clear_lineage_cache():
    """Clear the job and notebook caches to free memory"""
    global _job_cache, _notebook_cache
    _job_cache = {}
    _notebook_cache = {}
    print("Cleared lineage caches")


def _get_table_lineage(table_full_name: str) -> Dict[str, Any]:
    """
    Retrieves table lineage information for a given table using the global SDK client
    and global SQL warehouse ID.
    Now includes notebook and job information with enhanced details.
    """
    if not DATABRICKS_SQL_WAREHOUSE_ID:  # Check before attempting query
        return {"status": "error", "error": "DATABRICKS_SQL_WAREHOUSE_ID is not set. Cannot fetch lineage."}

    lineage_sql_query = f"""
    SELECT
        source_table_full_name,
        target_table_full_name,
        entity_type,
        entity_id,
        entity_run_id,
        entity_metadata,
        created_by,
        event_time
    FROM system.access.table_lineage
    WHERE source_table_full_name = '{table_full_name}'
       OR target_table_full_name = '{table_full_name}'
    ORDER BY event_time DESC
    LIMIT 100;
    """
    print(f"Fetching and processing lineage for table: {table_full_name}")
    # execute_databricks_sql will now use the global warehouse_id
    raw_lineage_output = execute_databricks_sql(lineage_sql_query, wait_timeout='50s')
    return _process_lineage_results(raw_lineage_output, table_full_name)


def _format_single_table_md(table_info: TableInfo, base_heading_level: int, display_columns: bool) -> List[str]:
    """
    Formats the details for a single TableInfo object into a list of Markdown strings.
    Uses a base_heading_level to control Markdown header depth for hierarchical display.
    """
    table_markdown_parts = []
    table_header_prefix = "#" * base_heading_level
    sub_header_prefix = "#" * (base_heading_level + 1)

    table_markdown_parts.append(f"{table_header_prefix} Table: **{table_info.full_name}**")
    if table_info.comment:
        table_markdown_parts.extend(["", f"**Description**: {table_info.comment}"])
    elif base_heading_level == 1:
        table_markdown_parts.extend(["", "**Description**: No description provided."])

    # Process and add partition columns
    partition_column_names: List[str] = []
    if table_info.columns:
        temp_partition_cols: List[tuple[str, int]] = []
        for col in table_info.columns:
            if col.partition_index is not None:
                temp_partition_cols.append((col.name, col.partition_index))
        if temp_partition_cols:
            temp_partition_cols.sort(key=lambda x: x[1])
            partition_column_names = [name for name, index in temp_partition_cols]

    if partition_column_names:
        table_markdown_parts.extend(["", f"{sub_header_prefix} Partition Columns"])
        table_markdown_parts.extend([f"- `{col_name}`" for col_name in partition_column_names])
    elif base_heading_level == 1:
        table_markdown_parts.extend(["", f"{sub_header_prefix} Partition Columns", "- *This table is not partitioned or partition key information is unavailable.*"])

    if display_columns:
        table_markdown_parts.extend(["", f"{sub_header_prefix} Table Columns"])
        if table_info.columns:
            table_markdown_parts.extend(_format_column_details_md(table_info.columns))
        else:
            table_markdown_parts.append(" - *No column information available.*")
    return table_markdown_parts


def execute_databricks_sql(sql_query: str, wait_timeout: str = '50s') -> Dict[str, Any]:
    """
    Executes a SQL query on Databricks using the global SDK client and global SQL warehouse ID.
    """
    if not DATABRICKS_SQL_WAREHOUSE_ID:
        return {"status": "error", "error": "DATABRICKS_SQL_WAREHOUSE_ID is not set. Cannot execute SQL query."}
    try:
        print(f"Executing SQL on warehouse {DATABRICKS_SQL_WAREHOUSE_ID} (timeout: {wait_timeout}):\n{sql_query[:200]}..."
              + (" (truncated)" if len(sql_query) > 200 else ""))
        response: StatementResponse = sdk_client.statement_execution.execute_statement(
            statement=sql_query,
            warehouse_id=DATABRICKS_SQL_WAREHOUSE_ID,  # Use global warehouse ID
            wait_timeout=wait_timeout
        )
        if response.status and response.status.state == StatementState.SUCCEEDED:
            if response.result and response.result.data_array:
                column_names = [col.name for col in response.manifest.schema.columns] if response.manifest and response.manifest.schema and response.manifest.schema.columns else []
                results = [dict(zip(column_names, row)) for row in response.result.data_array]
                return {"status": "success", "row_count": len(results), "data": results}
            else:
                return {"status": "success", "row_count": 0, "data": [], "message": "Query succeeded but returned no data."}
        elif response.status:
            error_message = response.status.error.message if response.status.error else "No error details provided."
            return {"status": "failed", "error": f"Query execution failed with state: {response.status.state.value}", "details": error_message}
        else:
            return {"status": "failed", "error": "Query execution status unknown."}
    except Exception as e:
        return {"status": "error", "error": f"An error occurred during SQL execution: {str(e)}"}


def get_uc_table_details(full_table_name: str, include_lineage: bool = False) -> str:
    """
    Fetches table metadata and optionally lineage, then formats it into a Markdown string.
    Uses the _format_single_table_md helper for core table structure.
    """
    print(f"Fetching metadata for {full_table_name}...")
    try:
        table_info: TableInfo = sdk_client.tables.get(full_name=full_table_name)
    except Exception as e:
        error_details = str(e)
        return f"""# Error: Could Not Retrieve Table Details

**Table:** `{full_table_name}`

**Problem:** Failed to fetch the complete metadata for this table.

**Details:**
```
{error_details}
```"""

    markdown_parts = _format_single_table_md(table_info, base_heading_level=1, display_columns=True)

    if include_lineage:
        markdown_parts.extend(["", "## Lineage Information"])
        if not DATABRICKS_SQL_WAREHOUSE_ID:
            markdown_parts.append("- *Lineage fetching skipped: `DATABRICKS_SQL_WAREHOUSE_ID` environment variable is not set.*")
        else:
            print(f"Fetching lineage for {full_table_name}...")
            lineage_info = _get_table_lineage(full_table_name)

            has_upstream = lineage_info and isinstance(lineage_info.get("upstream_tables"), list) and lineage_info["upstream_tables"]
            has_downstream = lineage_info and isinstance(lineage_info.get("downstream_tables"), list) and lineage_info["downstream_tables"]
            has_notebooks_reading = lineage_info and isinstance(lineage_info.get("notebooks_reading"), list) and lineage_info["notebooks_reading"]
            has_notebooks_writing = lineage_info and isinstance(lineage_info.get("notebooks_writing"), list) and lineage_info["notebooks_writing"]

            if has_upstream:
                markdown_parts.extend(["", "### Upstream Tables (tables this table reads from):"])
                markdown_parts.extend([f"- `{table}`" for table in lineage_info["upstream_tables"]])
            if has_downstream:
                markdown_parts.extend(["", "### Downstream Tables (tables that read from this table):"])
                markdown_parts.extend([f"- `{table}`" for table in lineage_info["downstream_tables"]])
            if has_notebooks_reading:
                markdown_parts.extend(["", "### Notebooks Reading from this Table:"])
                for notebook in lineage_info["notebooks_reading"]:
                    markdown_parts.extend([f"- {notebook}", ""])
            if has_notebooks_writing:
                markdown_parts.extend(["", "### Notebooks Writing to this Table:"])
                for notebook in lineage_info["notebooks_writing"]:
                    markdown_parts.extend([f"- {notebook}", ""])

            if not any([has_upstream, has_downstream, has_notebooks_reading, has_notebooks_writing]):
                if lineage_info and lineage_info.get("status") == "error" and lineage_info.get("error"):
                    markdown_parts.extend(["", "*Note: Could not retrieve complete lineage information.*", f"> *Lineage fetch error: {lineage_info.get('error')}*"])
                elif lineage_info and lineage_info.get("status") != "success" and lineage_info.get("error"):
                    markdown_parts.extend(["", "*Note: Could not retrieve complete lineage information.*", f"> *Lineage fetch error: {lineage_info.get('error')}*"])
                else:
                    markdown_parts.append("- *No table, notebook, or job dependencies found or lineage fetch was not fully successful.*")
    else:
        markdown_parts.extend(["", "## Lineage Information", "- *Lineage fetching skipped as per request.*"])

    return "\n".join(markdown_parts)


def get_uc_schema_details(catalog_name: str, schema_name: str, include_columns: bool = False) -> str:
    """
    Fetches detailed information for a specific schema, optionally including its tables and their columns.
    Uses the global SDK client and the _format_single_table_md helper with appropriate heading levels.
    """
    full_schema_name = f"{catalog_name}.{schema_name}"
    markdown_parts = [f"# Schema Details: **{full_schema_name}**"]
    try:
        print(f"Fetching details for schema: {full_schema_name}...")
        schema_info: SchemaInfo = sdk_client.schemas.get(full_name=full_schema_name)
        description = schema_info.comment if schema_info.comment else "No description provided."
        markdown_parts.append(f"**Description**: {description}")
        markdown_parts.append("")
        markdown_parts.append(f"## Tables in Schema `{schema_name}`")

        tables_iterable = sdk_client.tables.list(catalog_name=catalog_name, schema_name=schema_name)
        tables_list = list(tables_iterable)
        if not tables_list:
            markdown_parts.append("- *No tables found in this schema.*")
        else:
            for i, table_info in enumerate(tables_list):
                if not isinstance(table_info, TableInfo):
                    print(f"Warning: Encountered an unexpected item in tables list: {type(table_info)}")
                    continue
                markdown_parts.extend(_format_single_table_md(
                    table_info,
                    base_heading_level=3,
                    display_columns=include_columns
                ))
                if i < len(tables_list) - 1:
                    markdown_parts.append("\n=============\n")
                else:
                    markdown_parts.append("")
    except Exception as e:
        error_message = f"Failed to retrieve details for schema '{full_schema_name}': {str(e)}"
        print(f"Error in get_uc_schema_details: {error_message}")
        return f"""# Error: Could Not Retrieve Schema Details

**Schema:** `{full_schema_name}`

**Problem:** An error occurred while attempting to fetch schema information.

**Details:**
```
{error_message}
```"""
    return "\n".join(markdown_parts)


def get_uc_catalog_details(catalog_name: str) -> str:
    """
    Fetches and formats a summary of all schemas within a given catalog using the global SDK client.
    """
    markdown_parts = [f"# Catalog Summary: **{catalog_name}**", ""]
    schemas_found_count = 0
    try:
        print(f"Fetching schemas for catalog: {catalog_name} using global sdk_client...")
        # The sdk_client is globally defined in this module
        schemas_iterable = sdk_client.schemas.list(catalog_name=catalog_name)
        # Convert iterator to list to easily check if empty and get a count
        schemas_list = list(schemas_iterable)

        if not schemas_list:
            markdown_parts.append(f"No schemas found in catalog `{catalog_name}`.")
            return "\n".join(markdown_parts)

        schemas_found_count = len(schemas_list)
        markdown_parts.append(f"Showing top {schemas_found_count} schemas found in catalog `{catalog_name}`:")
        markdown_parts.append("")

        for i, schema_info in enumerate(schemas_list):
            if not isinstance(schema_info, SchemaInfo):
                print(f"Warning: Encountered an unexpected item in schemas list: {type(schema_info)}")
                continue
            # Start of a schema item in the list
            schema_name_display = schema_info.full_name if schema_info.full_name else "Unnamed Schema"
            markdown_parts.append(f"## {schema_name_display}")  # Main bullet point for schema name
            description = f"**Description**: {schema_info.comment}" if schema_info.comment else ""
            markdown_parts.append(description)
            markdown_parts.append("")  # Add a blank line for separation between schemas, or remove if too much space
    except Exception as e:
        error_message = f"Failed to retrieve schemas for catalog '{catalog_name}': {str(e)}"
        print(f"Error in get_catalog_summary: {error_message}")
        # Return a structured error message in Markdown
        return f"""# Error: Could Not Retrieve Catalog Summary

**Catalog:** `{catalog_name}`

**Problem:** An error occurred while attempting to fetch schema information.

**Details:**
```
{error_message}
```"""
    markdown_parts.append(f"**Total Schemas Found in `{catalog_name}`**: {schemas_found_count}")
    return "\n".join(markdown_parts)


def get_uc_all_catalogs_summary() -> str:
    """
    Fetches a summary of all available Unity Catalogs, including their names, comments, and types.
    Uses the global SDK client.
    """
    markdown_parts = ["# Available Unity Catalogs", ""]
    catalogs_found_count = 0
    try:
        print("Fetching all catalogs using global sdk_client...")
        catalogs_iterable = sdk_client.catalogs.list()
        catalogs_list = list(catalogs_iterable)

        if not catalogs_list:
            markdown_parts.append("- *No catalogs found or accessible.*")
            return "\n".join(markdown_parts)

        catalogs_found_count = len(catalogs_list)
        markdown_parts.append(f"Found {catalogs_found_count} catalog(s):")
        markdown_parts.append("")

        for catalog_info in catalogs_list:
            if not isinstance(catalog_info, CatalogInfo):
                print(f"Warning: Encountered an unexpected item in catalogs list: {type(catalog_info)}")
                continue
            markdown_parts.append(f"- **`{catalog_info.name}`**")
            description = catalog_info.comment if catalog_info.comment else "No description provided."
            markdown_parts.append(f" - **Description**: {description}")
            catalog_type_str = "N/A"
            if catalog_info.catalog_type and hasattr(catalog_info.catalog_type, 'value'):
                catalog_type_str = catalog_info.catalog_type.value
            elif catalog_info.catalog_type:
                # Fallback if it's not an Enum but has a direct string representation
                catalog_type_str = str(catalog_info.catalog_type)
            markdown_parts.append(f" - **Type**: `{catalog_type_str}`")
            markdown_parts.append("")  # Add a blank line for separation
    except Exception as e:
        error_message = f"Failed to retrieve catalog list: {str(e)}"
        print(f"Error in get_uc_all_catalogs_summary: {error_message}")
        return f"""# Error: Could Not Retrieve Catalog List

**Problem:** An error occurred while attempting to fetch the list of catalogs.

**Details:**
```
{error_message}
```"""
    return "\n".join(markdown_parts)

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/RafaelCartenet/mcp-databricks-server'
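The same lookup can be scripted. Here is a small sketch using Python's third-party requests library; the response payload is treated as opaque JSON, since its exact schema is not documented here:

import requests

# Fetch this server's metadata from the Glama MCP directory API.
url = "https://glama.ai/api/mcp/v1/servers/RafaelCartenet/mcp-databricks-server"
response = requests.get(url, timeout=30)
response.raise_for_status()
print(response.json())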

If you have feedback or need assistance with the MCP directory API, please join our Discord server.