prompts.py
#!/usr/bin/env python3
"""
Databricks MCP Server Prompts

Predefined prompt templates for common Databricks workflows and tasks.
These prompts provide guided interactions for data exploration, analysis,
and optimization.
"""

import logging
from typing import Any, Dict, List, Optional

from mcp.types import GetPromptResult, Prompt, PromptArgument, PromptMessage, TextContent

logger = logging.getLogger(__name__)

# Define available prompts for Databricks workflows
DATABRICKS_PROMPTS: Dict[str, Prompt] = {
    "explore-dataset": Prompt(
        name="explore-dataset",
        description="Get comprehensive insights about a dataset including schema, sample data, and analysis recommendations",
        arguments=[
            PromptArgument(
                name="catalog_name",
                description="Name of the catalog containing the dataset",
                required=True
            ),
            PromptArgument(
                name="schema_name",
                description="Name of the schema containing the dataset",
                required=True
            ),
            PromptArgument(
                name="table_name",
                description="Name of the table/dataset to explore",
                required=True
            ),
            PromptArgument(
                name="analysis_focus",
                description="Specific aspect to focus on (e.g., 'data quality', 'business insights', 'technical structure')",
                required=False
            )
        ]
    ),
    "optimize-query": Prompt(
        name="optimize-query",
        description="Analyze and suggest optimizations for SQL query performance",
        arguments=[
            PromptArgument(
                name="query",
                description="The SQL query to analyze and optimize",
                required=True
            ),
            PromptArgument(
                name="performance_issues",
                description="Any specific performance issues observed (e.g., 'slow execution', 'high memory usage')",
                required=False
            ),
            PromptArgument(
                name="execution_context",
                description="Context about how the query is used (e.g., 'dashboard', 'batch processing', 'real-time')",
                required=False
            )
        ]
    ),
    "investigate-lineage": Prompt(
        name="investigate-lineage",
        description="Investigate data lineage for compliance, debugging, or impact analysis",
        arguments=[
            PromptArgument(
                name="catalog_name",
                description="Name of the catalog",
                required=True
            ),
            PromptArgument(
                name="schema_name",
                description="Name of the schema",
                required=True
            ),
            PromptArgument(
                name="table_name",
                description="Name of the table to investigate",
                required=True
            ),
            PromptArgument(
                name="investigation_purpose",
                description="Purpose of the lineage investigation (e.g., 'compliance audit', 'impact analysis', 'debugging data issues')",
                required=False
            )
        ]
    ),
    "design-data-quality-checks": Prompt(
        name="design-data-quality-checks",
        description="Generate comprehensive data quality validation rules and checks for a table",
        arguments=[
            PromptArgument(
                name="catalog_name",
                description="Name of the catalog",
                required=True
            ),
            PromptArgument(
                name="schema_name",
                description="Name of the schema",
                required=True
            ),
            PromptArgument(
                name="table_name",
                description="Name of the table to create quality checks for",
                required=True
            ),
            PromptArgument(
                name="business_rules",
                description="Any specific business rules or constraints that should be validated",
                required=False
            ),
            PromptArgument(
                name="quality_dimensions",
                description="Specific quality dimensions to focus on (e.g., 'completeness', 'accuracy', 'consistency', 'timeliness')",
                required=False
            )
        ]
    ),
    "generate-documentation": Prompt(
        name="generate-documentation",
        description="Generate comprehensive documentation for a Databricks table or schema",
        arguments=[
            PromptArgument(
                name="catalog_name",
                description="Name of the catalog",
                required=True
            ),
            PromptArgument(
                name="schema_name",
                description="Name of the schema",
                required=True
            ),
            PromptArgument(
                name="table_name",
                description="Name of the table (optional - if not provided, documents the entire schema)",
                required=False
            ),
            PromptArgument(
                name="audience",
                description="Target audience for the documentation (e.g., 'business users', 'data engineers', 'analysts')",
                required=False
            ),
            PromptArgument(
                name="doc_style",
                description="Documentation style preference (e.g., 'technical', 'business-friendly', 'compliance-focused')",
                required=False
            )
        ]
    ),
    "troubleshoot-data-issue": Prompt(
        name="troubleshoot-data-issue",
        description="Systematically troubleshoot data quality or pipeline issues",
        arguments=[
            PromptArgument(
                name="issue_description",
                description="Description of the data issue encountered",
                required=True
            ),
            PromptArgument(
                name="affected_table",
                description="Full table name affected (catalog.schema.table format)",
                required=True
            ),
            PromptArgument(
                name="observed_symptoms",
                description="Specific symptoms observed (e.g., 'missing data', 'incorrect values', 'duplicates')",
                required=False
            ),
            PromptArgument(
                name="timeframe",
                description="When the issue was first noticed or time period affected",
                required=False
            )
        ]
    ),
    "compare-schemas": Prompt(
        name="compare-schemas",
        description="Compare two database schemas or tables for differences and migration planning",
        arguments=[
            PromptArgument(
                name="source_table",
                description="Source table in format catalog.schema.table",
                required=True
            ),
            PromptArgument(
                name="target_table",
                description="Target table in format catalog.schema.table",
                required=True
            ),
            PromptArgument(
                name="comparison_focus",
                description="Aspect to focus comparison on (e.g., 'schema differences', 'data migration', 'compatibility')",
                required=False
            )
        ]
    )
}


class PromptsManager:
    """Manager for Databricks MCP prompts."""

    def __init__(self):
        self.prompts = DATABRICKS_PROMPTS

    def list_prompts(self) -> List[Prompt]:
        """Return a list of all available prompts."""
        return list(self.prompts.values())

    def get_prompt(self, name: str, arguments: Optional[Dict[str, str]] = None) -> GetPromptResult:
        """Generate a specific prompt with arguments."""
        if name not in self.prompts:
            raise ValueError(f"Prompt not found: {name}")

        prompt = self.prompts[name]
        arguments = arguments or {}

        # Validate required arguments
        for arg in prompt.arguments or []:
            if arg.required and arg.name not in arguments:
                raise ValueError(f"Required argument '{arg.name}' missing for prompt '{name}'")

        # Generate prompt content based on the specific prompt type
        messages = self._generate_prompt_messages(name, arguments)

        return GetPromptResult(
            description=prompt.description,
            messages=messages
        )

    def _generate_prompt_messages(self, name: str, arguments: Dict[str, str]) -> List[PromptMessage]:
        """Generate the actual prompt messages based on the prompt type and arguments."""
        if name == "explore-dataset":
            return self._generate_explore_dataset_prompt(arguments)
        elif name == "optimize-query":
            return self._generate_optimize_query_prompt(arguments)
        elif name == "investigate-lineage":
            return self._generate_investigate_lineage_prompt(arguments)
        elif name == "design-data-quality-checks":
            return self._generate_data_quality_prompt(arguments)
        elif name == "generate-documentation":
            return self._generate_documentation_prompt(arguments)
        elif name == "troubleshoot-data-issue":
            return self._generate_troubleshoot_prompt(arguments)
        elif name == "compare-schemas":
            return self._generate_compare_schemas_prompt(arguments)
        else:
            raise ValueError(f"No message generator for prompt: {name}")

    def _generate_explore_dataset_prompt(self, args: Dict[str, str]) -> List[PromptMessage]:
        """Generate dataset exploration prompt."""
        catalog = args["catalog_name"]
        schema = args["schema_name"]
        table = args["table_name"]
        focus = args.get("analysis_focus", "comprehensive analysis")

        prompt_text = f"""I need to explore the dataset `{catalog}.{schema}.{table}` with a focus on {focus}.

Please help me understand this dataset by:

1. **Schema Analysis**: First, describe the table structure including:
   - Column names, data types, and any constraints
   - Key relationships and foreign keys if any
   - Table properties and metadata

2. **Data Sampling**: Show me a representative sample of the data to understand:
   - Typical data patterns and formats
   - Value distributions and ranges
   - Any obvious data quality issues

3. **Data Profiling**: Analyze the dataset for:
   - Data completeness (null values, missing data)
   - Data uniqueness and potential duplicates
   - Statistical summaries for numeric columns
   - Value distributions for categorical columns

4. **Business Context**: Help me understand:
   - What this dataset likely represents
   - Potential use cases and analytical opportunities
   - Relationships to other datasets that might be relevant

5. **Recommendations**: Suggest:
   - Next steps for analysis based on the data characteristics
   - Potential data quality improvements
   - Interesting analytical questions to explore

Please use the available Databricks tools to gather this information systematically."""

        return [
            PromptMessage(
                role="user",
                content=TextContent(type="text", text=prompt_text)
            )
        ]

    def _generate_optimize_query_prompt(self, args: Dict[str, str]) -> List[PromptMessage]:
        """Generate query optimization prompt."""
        query = args["query"]
        issues = args.get("performance_issues", "")
        context = args.get("execution_context", "")

        prompt_text = f"""I need help optimizing this SQL query for better performance:

```sql
{query}
```

{f"**Performance Issues Observed**: {issues}" if issues else ""}
{f"**Execution Context**: {context}" if context else ""}

Please analyze this query and provide optimization recommendations:

1. **Query Analysis**:
   - Identify the main operations (joins, aggregations, filters)
   - Spot potential performance bottlenecks
   - Check for inefficient patterns

2. **Table Information**:
   - Use Databricks tools to get information about the tables involved
   - Check table sizes, partitioning, and indexing
   - Analyze column statistics and data distribution

3. **Optimization Recommendations**:
   - Suggest query rewrite opportunities
   - Recommend indexing strategies
   - Propose partitioning improvements
   - Identify opportunities for caching or materialized views

4. **Execution Plan Analysis**:
   - If possible, analyze the query execution plan
   - Identify expensive operations
   - Suggest alternative approaches

5. **Best Practices**:
   - Apply Databricks-specific optimization techniques
   - Consider Delta Lake optimizations if applicable
   - Recommend monitoring and profiling approaches

Please provide specific, actionable recommendations with explanations."""

        return [
            PromptMessage(
                role="user",
                content=TextContent(type="text", text=prompt_text)
            )
        ]

    def _generate_investigate_lineage_prompt(self, args: Dict[str, str]) -> List[PromptMessage]:
        """Generate lineage investigation prompt."""
        catalog = args["catalog_name"]
        schema = args["schema_name"]
        table = args["table_name"]
        purpose = args.get("investigation_purpose", "understanding data flow")

        prompt_text = f"""I need to investigate the data lineage for `{catalog}.{schema}.{table}` for {purpose}.

Please help me understand the complete data lineage by:

1. **Upstream Dependencies**:
   - Identify all source tables and datasets that feed into this table
   - Map the transformation logic and data flow
   - Understand any external data sources or ingestion processes

2. **Downstream Consumption**:
   - Find all tables, views, and processes that depend on this table
   - Identify any reports, dashboards, or applications using this data
   - Map the complete impact chain

3. **Transformation Analysis**:
   - Document any ETL/ELT processes that transform this data
   - Understand the business logic applied
   - Identify any data quality or validation steps

4. **Impact Assessment**:
   - Assess the potential impact of changes to this table
   - Identify critical dependencies and business processes
   - Recommend testing strategies for any changes

5. **Compliance and Governance**:
   - Check for any data governance policies or classifications
   - Identify sensitive data or compliance requirements
   - Document data retention and lifecycle policies

6. **Documentation and Visualization**:
   - Create a clear lineage diagram or flow chart
   - Document key relationships and dependencies
   - Highlight any risks or recommendations

Please use available tools to gather comprehensive lineage information."""

        return [
            PromptMessage(
                role="user",
                content=TextContent(type="text", text=prompt_text)
            )
        ]

    def _generate_data_quality_prompt(self, args: Dict[str, str]) -> List[PromptMessage]:
        """Generate data quality checks prompt."""
        catalog = args["catalog_name"]
        schema = args["schema_name"]
        table = args["table_name"]
        business_rules = args.get("business_rules", "")
        dimensions = args.get("quality_dimensions", "all standard dimensions")

        prompt_text = f"""I need to design comprehensive data quality checks for `{catalog}.{schema}.{table}` focusing on {dimensions}.

{f"**Business Rules to Consider**: {business_rules}" if business_rules else ""}

Please create data quality validation rules by:

1. **Schema-Based Checks**:
   - First, analyze the table structure and columns
   - Create checks for data type validation
   - Validate column constraints and business rules
   - Check for required fields and null value policies

2. **Completeness Checks**:
   - Identify critical fields that should never be null
   - Create completeness thresholds for each column
   - Design checks for missing or empty records
   - Validate record counts and data freshness

3. **Accuracy and Validity Checks**:
   - Create range checks for numeric columns
   - Validate format patterns for text fields (emails, phone numbers, etc.)
   - Check referential integrity with related tables
   - Validate business logic rules

4. **Consistency Checks**:
   - Cross-field validation rules
   - Consistency with reference data or lookup tables
   - Historical consistency patterns
   - Data standardization checks

5. **Uniqueness and Duplication Checks**:
   - Identify primary key constraints
   - Check for duplicate records
   - Validate unique business identifiers
   - Cross-table duplication checks

6. **Implementation Recommendations**:
   - Provide SQL queries for each quality check
   - Suggest alerting thresholds and monitoring frequency
   - Recommend automated vs. manual validation processes
   - Integration with Databricks data quality tools

Please analyze the table structure first, then provide specific, implementable data quality rules."""

        return [
            PromptMessage(
                role="user",
                content=TextContent(type="text", text=prompt_text)
            )
        ]

    def _generate_documentation_prompt(self, args: Dict[str, str]) -> List[PromptMessage]:
        """Generate documentation prompt."""
        catalog = args["catalog_name"]
        schema = args["schema_name"]
        table = args.get("table_name", "")
        audience = args.get("audience", "technical and business users")
        style = args.get("doc_style", "comprehensive")

        if table:
            target = f"table `{catalog}.{schema}.{table}`"
            scope = "table"
        else:
            target = f"schema `{catalog}.{schema}`"
            scope = "schema"

        prompt_text = f"""I need to generate {style} documentation for {target} targeted at {audience}.

Please create comprehensive documentation by:

1. **Overview Section**:
   - Provide a clear description of what this {scope} contains
   - Explain the business purpose and use cases
   - Document the data source and update frequency
   - Include any important context or background

2. **Technical Specifications**:
   - Document the complete schema structure
   - Include data types, constraints, and relationships
   - Document any partitioning, indexing, or optimization details
   - Include table properties and metadata

3. **Data Dictionary**:
   - Create detailed descriptions for each column
   - Include business definitions and context
   - Document valid values, ranges, and formats
   - Explain any codes, abbreviations, or special values

4. **Usage Guidelines**:
   - Provide examples of common queries and use cases
   - Document best practices for accessing the data
   - Include performance considerations and tips
   - Suggest relevant joins with other tables

5. **Data Quality and Governance**:
   - Document data quality expectations and SLAs
   - Include information about data lineage and sources
   - Document any governance policies or classifications
   - Include contact information for data stewards

6. **Sample Data and Examples**:
   - Provide representative sample data
   - Include example queries for common use cases
   - Show typical analysis patterns

Please gather the necessary information using available tools and create well-structured, user-friendly documentation."""

        return [
            PromptMessage(
                role="user",
                content=TextContent(type="text", text=prompt_text)
            )
        ]

    def _generate_troubleshoot_prompt(self, args: Dict[str, str]) -> List[PromptMessage]:
        """Generate troubleshooting prompt."""
        issue = args["issue_description"]
        table = args["affected_table"]
        symptoms = args.get("observed_symptoms", "")
        timeframe = args.get("timeframe", "")

        prompt_text = f"""I'm experiencing a data issue that needs systematic troubleshooting:

**Issue Description**: {issue}
**Affected Table**: {table}
{f"**Observed Symptoms**: {symptoms}" if symptoms else ""}
{f"**Timeframe**: {timeframe}" if timeframe else ""}

Please help me troubleshoot this systematically:

1. **Initial Investigation**:
   - Gather current information about the affected table
   - Check table structure, properties, and recent changes
   - Analyze recent data patterns and trends
   - Verify access permissions and configuration

2. **Data Quality Analysis**:
   - Sample recent data to identify anomalies
   - Compare current data with historical patterns
   - Check for missing, duplicate, or invalid records
   - Analyze data freshness and completeness

3. **Pipeline Investigation**:
   - Investigate the data lineage and upstream sources
   - Check recent job runs and transformation processes
   - Review any recent changes to ETL/ELT pipelines
   - Analyze error logs and job execution history

4. **Root Cause Analysis**:
   - Identify potential causes based on symptoms
   - Test hypotheses with targeted queries
   - Narrow down the source of the issue
   - Document timeline and contributing factors

5. **Impact Assessment**:
   - Determine scope of data affected
   - Identify downstream dependencies and consumers
   - Assess business impact and urgency
   - Evaluate data recovery requirements

6. **Resolution Recommendations**:
   - Propose immediate remediation steps
   - Suggest long-term fixes to prevent recurrence
   - Recommend monitoring improvements
   - Document lessons learned

Please start with gathering current state information and work systematically through the investigation."""

        return [
            PromptMessage(
                role="user",
                content=TextContent(type="text", text=prompt_text)
            )
        ]

    def _generate_compare_schemas_prompt(self, args: Dict[str, str]) -> List[PromptMessage]:
        """Generate schema comparison prompt."""
        source = args["source_table"]
        target = args["target_table"]
        focus = args.get("comparison_focus", "comprehensive comparison")

        prompt_text = f"""I need to compare two tables for {focus}:

**Source Table**: {source}
**Target Table**: {target}

Please provide a comprehensive comparison by:

1. **Schema Structure Analysis**:
   - Get detailed information about both tables
   - Compare column names, data types, and constraints
   - Identify added, removed, or modified columns
   - Check for differences in table properties and metadata

2. **Data Type Compatibility**:
   - Analyze data type differences and compatibility
   - Identify potential conversion issues
   - Recommend data transformation strategies
   - Flag any breaking changes

3. **Data Size and Distribution**:
   - Compare table sizes and row counts
   - Analyze data distribution patterns
   - Check partitioning and indexing differences
   - Assess performance implications

4. **Sample Data Comparison**:
   - Compare sample data from both tables
   - Look for structural differences in data patterns
   - Identify any obvious data quality differences
   - Check value ranges and formats

5. **Migration Planning**:
   - Recommend migration strategies if applicable
   - Identify required data transformations
   - Suggest validation approaches
   - Estimate migration complexity and risks

6. **Compatibility Assessment**:
   - Evaluate backward/forward compatibility
   - Identify breaking changes for consumers
   - Recommend transition strategies
   - Document required application changes

Please start by gathering detailed information about both tables and provide a structured comparison report."""

        return [
            PromptMessage(
                role="user",
                content=TextContent(type="text", text=prompt_text)
            )
        ]
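
# --- Server wiring sketch ---
# A hedged sketch of how PromptsManager might be registered with the
# low-level Server class from the `mcp` Python SDK. The server name and
# handler names below are illustrative assumptions; the original module
# does not perform this registration itself.
from mcp.server import Server

server = Server("databricks-mcp-server")
prompts_manager = PromptsManager()


@server.list_prompts()
async def handle_list_prompts() -> List[Prompt]:
    """Expose the predefined Databricks prompt templates to MCP clients."""
    return prompts_manager.list_prompts()


@server.get_prompt()
async def handle_get_prompt(name: str, arguments: Optional[Dict[str, str]] = None) -> GetPromptResult:
    """Render the requested prompt template with client-supplied arguments."""
    return prompts_manager.get_prompt(name, arguments)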
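
# --- Usage sketch ---
# A minimal, illustrative demonstration of driving PromptsManager directly.
# The catalog, schema, and table values are hypothetical placeholders.
if __name__ == "__main__":
    manager = PromptsManager()

    # Enumerate the available prompt templates
    for available in manager.list_prompts():
        print(f"{available.name}: {available.description}")

    # Render one template with its required arguments filled in
    result = manager.get_prompt(
        "explore-dataset",
        {
            "catalog_name": "main",  # hypothetical catalog
            "schema_name": "sales",  # hypothetical schema
            "table_name": "orders",  # hypothetical table
            "analysis_focus": "data quality",
        },
    )
    print(result.messages[0].content.text)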
