Umami Analytics MCP Server

  • src
  • analytics_service
import os
import json
from typing import Any, Sequence
import sys
from datetime import datetime

from dotenv import load_dotenv
from mcp.server import Server
from mcp.types import Tool, TextContent, ImageContent, EmbeddedResource

from .api import UmamiClient
from .embeddings import get_chunks
from .crawler import CrawlingAPI


def convert_date_to_unix(date_str: str, end_of_day: bool = False) -> int:
    """
    Convert a date string to Unix timestamp in milliseconds.
    Format should be YYYY-MM-DD or YYYY-MM-DD HH:MM:SS

    The string is parsed as a naive datetime, so ``datetime.timestamp()``
    interprets it in the local timezone of the machine running the server.

    Args:
        date_str (str): Date string in format YYYY-MM-DD or YYYY-MM-DD HH:MM:SS
        end_of_day (bool): If True and time not provided, set time to 23:59:59.999

    Returns:
        int: Unix timestamp in milliseconds

    Raises:
        ValueError: If ``date_str`` matches neither supported format.
    """
    try:
        # Try parsing with time first; an explicit time is always used as given.
        dt = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # If that fails, try just the date
        try:
            dt = datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError as e:
            raise ValueError(
                f"Invalid date format. Please use YYYY-MM-DD or YYYY-MM-DD HH:MM:SS. Error: {str(e)}"
            ) from e
        # Only a date-only value is pushed to the end of the day.
        if end_of_day:
            dt = dt.replace(hour=23, minute=59, second=59, microsecond=999000)

    # round() rather than int(): float error can leave the product a hair
    # below the intended millisecond, and truncation would lose 1 ms.
    return round(dt.timestamp() * 1000)


# Load environment variables
load_dotenv()

# API configuration
API_BASE_URL = os.getenv("UMAMI_API_URL")
API_USERNAME = os.getenv("UMAMI_USERNAME")
API_PASSWORD = os.getenv("UMAMI_PASSWORD")
TEAM_ID = os.getenv("UMAMI_TEAM_ID")

if not all([API_BASE_URL, API_USERNAME, API_PASSWORD, TEAM_ID]):
    raise ValueError("Missing required environment variables")

# Initialize API client
client = UmamiClient(API_BASE_URL)
crawler = CrawlingAPI()

# Ensure client is logged in at startup
if not client.login(API_USERNAME, API_PASSWORD):
    raise RuntimeError("Failed to login to Umami API")

if not client.verify_token():
    raise RuntimeError("Failed to verify Umami API token")

# Create server instance
app = Server("analytics-server")

# Number of events requested per page when paging through the events endpoint.
_EVENTS_PAGE_SIZE = 200

# Tools whose "start_at"/"end_at" date strings are converted to Unix
# milliseconds before the request is dispatched.
_DATE_TOOLS = frozenset({
    "get_tracking_data",
    "get_website_stats",
    "get_session_ids",
    "get_website_metrics",
    "get_docs",
    "get_pageview_series",
})

# Names of the arguments the "Create Dashboard" prompt needs.
_DASHBOARD_PROMPT_ARGS = (
    "Website Name",
    "Start Date (YYYY-MM-DD)",
    "End Date (YYYY-MM-DD)",
    "Timezone",
)


def get_session_ids(website_id, event_name, start_at, end_at):
    """
    Retrieve session IDs for a specific event on a website.

    Pages through ``client.get_events_where`` until the reported ``count`` is
    covered, the API returns an empty page, or a request yields nothing.

    Args:
        website_id (str): ID of the website
        event_name (str | None): Name of the event to filter by, passed to the
            API as ``query``; None means no event filter
        start_at (int): Start of the time range (Unix milliseconds)
        end_at (int): End of the time range (Unix milliseconds)

    Returns:
        list: Unique session IDs associated with the event (unordered)
    """
    session_ids = set()
    page = 1

    while True:
        events_where = client.get_events_where(
            website_id=website_id,
            start_at=start_at,
            end_at=end_at,
            unit="day",
            timezone="UTC",
            query=event_name,
            page=page,
            page_size=_EVENTS_PAGE_SIZE,
        )
        # A falsy response means the request produced nothing usable; stop
        # instead of re-requesting the same page forever.
        if not events_where:
            break

        rows = events_where.get("data") or []
        session_ids.update(event["sessionId"] for event in rows)

        reached_end = (
            _EVENTS_PAGE_SIZE * events_where.get("page", page)
            >= events_where.get("count", 0)
        )
        # An empty page also terminates, guarding against an inconsistent count.
        if reached_end or not rows:
            break
        page += 1

    return list(session_ids)


# --- Shared fragments of the tool descriptions shown to the LLM -------------

_NO_RESULTS_NOTE = (
    "Note: If no results are returned, do not immediately assume there is no data - "
    "verify the unix timestamps are correct and ask the user for specific date ranges "
    "if not provided."
)

_START_AT_PROPERTY = {
    "type": "string",
    "description": (
        "Start date for time range of data to retrieve.\n"
        "Format: YYYY-MM-DD or YYYY-MM-DD HH:MM:SS\n"
        "Examples:\n"
        "- 2024-03-01\n"
        "- 2024-03-01 00:00:00\n"
        "- 2024-01-31\n"
        "Note: If time is not provided, 00:00:00 will be used"
    ),
}

_END_AT_PROPERTY = {
    "type": "string",
    "description": (
        "End date for time range of data to retrieve.\n"
        "Format: YYYY-MM-DD or YYYY-MM-DD HH:MM:SS\n"
        "Examples:\n"
        "- 2024-03-01\n"
        "- 2024-03-01 23:59:59\n"
        "- 2024-01-31\n"
        "Note: If time is not provided, 23:59:59.999 will be used"
    ),
}

_EVENT_CHOICES = (
    "Here are the possible events:\n"
    "- product_details_viewed\n"
    "- product_clicked\n"
    "- user_sign_in\n"
    "- product_added_to_cart\n"
    "- checkout_started\n"
    "- language_changed\n"
    "- checkout_completed\n"
    "If not filtering by an event, set this to None."
)

_GET_WEBSITES_DESCRIPTION = (
    "Retrieve a list of the websites present in the tracking database. "
    "This tool does not require any input.\n"
    "The output of this tool includes the following fields for each website:\n"
    "- id: The unique identifier of the website\n"
    "- name: The name of the website\n"
    "- domain: The URL of the website\n"
    "- shareId: The unique identifier that can connect websites together\n"
    "- resetAt: The date and time when the website was last reset\n"
    "- userId: The unique identifier of the user that owns the website\n"
    "- teamId: The unique identifier of the team that owns the website\n"
    "- createdBy: The unique identifier of the user that created the website\n"
    "- createdAt: The date and time when the website was created\n"
    "- updatedAt: The date and time when the website was last updated\n"
    "- deletedAt: The date and time when the website was deleted\n"
    "- createUser: The unique identifier of the user that created the website, and their username\n"
)

_METRIC_TYPE_DESCRIPTION = (
    "Type of metrics to retrieve. Here are the possible types:\n"
    "- url: The number of visits for each URL on the website "
    "(effectively the number times each page has been visited)\n"
    "- referrer: Where the visitors came from to get to the website\n"
    "- browser: Which browser the visitors used to visit the website\n"
    "- os: Which operating system the visitors used to visit the website\n"
    "- device: Which device the visitors used to visit the website\n"
    "- country: Which country the visitors are from\n"
    "- event: The tally of each event that has occurred on the website\n"
)

_DASHBOARD_PROMPT_TEMPLATE = """You are an analytics expert helping to create a comprehensive dashboard using website tracking data. Follow these steps to create an attractive and engaging dashboard for website: {website_name}, analyzing data from {start_date} to {end_date} in timezone {timezone}.

To begin, get the website id using get_websites and find the id of the website with the name {website_name}. Then use the id to get the other data.

1. OVERVIEW METRICS
First, get the high-level website statistics using get_website_stats:
- Total pageviews
- Unique visitors
- Total visits
- Bounce rate
- Total time spent

2. TIME-BASED ANALYSIS
Use get_pageview_series to analyze traffic patterns:
- Get hourly data for short time ranges (1-7 days)
- Get daily data for medium ranges (8-90 days)
- Get monthly data for long ranges (90+ days)
- Look for patterns in peak usage times
- Identify trends in visitor engagement

3. USER BEHAVIOR METRICS
Use get_website_metrics to analyze:
a) Page Performance (type: "url")
- Most visited pages
- Entry and exit pages
- Time spent per page
b) Traffic Sources (type: "referrer")
- Top referral sources
- Direct vs indirect traffic
- Search engine performance
c) User Technology (types: "browser", "os", "device")
- Browser usage
- Operating system distribution
- Device type preferences
d) Geographic Data (type: "country")
- User distribution by country
- Regional engagement patterns
e) Event Analysis (type: "event")
- Key user interactions
- Conversion events
- User journey milestones

4. ACTIVE USERS
Use get_active_visitors to:
- Monitor current site activity
- Compare with historical averages
- Track real-time engagement

5. USER JOURNEY ANALYSIS
For deeper insights into specific behaviors:
a) Use get_session_ids to identify relevant user sessions
b) Use get_tracking_data to analyze specific user journeys
c) Use get_docs to find patterns in user behavior

6. VISUAL CONTEXT
When needed:
- Use get_screenshot to capture page layouts
- Use get_html to analyze page structure

PRESENTATION GUIDELINES:
1. Start with the most important metrics for your audience
2. Group related metrics together
3. Show trends over time where possible
4. Highlight significant changes or patterns
5. Include context and explanations for metrics
6. Consider different time ranges for different metrics
7. Focus on actionable insights

Remember to:
- Validate all date ranges before analysis
- Consider timezone effects on data
- Look for correlations between different metrics
- Highlight unusual patterns or anomalies
- Provide context for significant changes
- Consider seasonal or temporal factors
- Focus on metrics that drive business decisions

Start by gathering the overview metrics and then proceed through each analysis section systematically. Only create once you are satisfied you have gathered all the data you need. Ensure the dashboard is visually appealing and easy to understand."""


def _string_property(description: str) -> dict[str, Any]:
    """Build a JSON-schema string property with the given description."""
    return {"type": "string", "description": description}


def _object_schema(properties: dict[str, Any], required: list[str] | None = None) -> dict[str, Any]:
    """Build a JSON-schema object; ``required`` is omitted when None."""
    schema: dict[str, Any] = {"type": "object", "properties": properties}
    if required is not None:
        schema["required"] = required
    return schema


# List of tools and their descriptions for LLM
@app.list_tools()
async def list_tools() -> list[Tool]:
    """List available tracking data tools."""
    return [
        Tool(
            name="get_websites",
            description=_GET_WEBSITES_DESCRIPTION,
            inputSchema=_object_schema({}),
        ),
        Tool(
            name="get_tracking_data",
            description=(
                "Get the user journey for a specific session ID within a time range. "
                + _NO_RESULTS_NOTE
            ),
            inputSchema=_object_schema(
                {
                    "website_id": _string_property(
                        "The ID of the website where the user journey is located"
                    ),
                    "start_at": _START_AT_PROPERTY,
                    "end_at": _END_AT_PROPERTY,
                    "session_id": _string_property(
                        "ID of the user session to get tracking data for"
                    ),
                },
                ["website_id", "start_at", "end_at", "session_id"],
            ),
        ),
        Tool(
            name="get_website_stats",
            description=(
                "Get the 5 overivew metrics for a specific website within a time range. "
                "The returned metrics are as follows:\n"
                "- pageviews: The number of total pageviews for the entire website\n"
                "- visitors: The number of unique visitors the website has had\n"
                "- visits: The number of unique visits those visitors have had to the website\n"
                "- bounces: The number of visitors that left the website without interacting with it\n"
                "- totaltime: The total time spent on the website by all visitors\n\n"
                + _NO_RESULTS_NOTE
            ),
            inputSchema=_object_schema(
                {
                    "website_id": _string_property(
                        "ID of the website to get overivew stats for"
                    ),
                    "start_at": _START_AT_PROPERTY,
                    "end_at": _END_AT_PROPERTY,
                },
                ["website_id", "start_at", "end_at"],
            ),
        ),
        Tool(
            name="get_session_ids",
            description=(
                "Get a list of the unique session IDs who visited a specific website "
                "within a time range and perform a specific event.\n"
                "WARNING: due to api limitations, only the first 1000 total session IDs "
                "will be returned by the api. Within those less will be unique. "
                "Do not use this tool to calculate the number of unique visitors - "
                "only use it to get session IDs.\n\n"
                + _NO_RESULTS_NOTE
            ),
            inputSchema=_object_schema(
                {
                    "website_id": _string_property(
                        "ID of the website to get session IDs for"
                    ),
                    "start_at": _START_AT_PROPERTY,
                    "end_at": _END_AT_PROPERTY,
                    "event_name": _string_property(
                        "Name of the event to filter by. " + _EVENT_CHOICES + "\n"
                    ),
                },
                ["website_id", "start_at", "end_at", "event_name"],
            ),
        ),
        Tool(
            name="get_website_metrics",
            description=(
                "Get various metrics for a specific website within a time range and "
                "how many visitors have had each metric. "
                "The metric type is selected by type property.\n\n"
                + _NO_RESULTS_NOTE
            ),
            inputSchema=_object_schema(
                {
                    "website_id": _string_property(
                        "ID of the website to get metrics for"
                    ),
                    "start_at": _START_AT_PROPERTY,
                    "end_at": _END_AT_PROPERTY,
                    "type": _string_property(_METRIC_TYPE_DESCRIPTION),
                },
                ["website_id", "start_at", "end_at", "type"],
            ),
        ),
        Tool(
            name="get_docs",
            description=(
                "Performs the document selection and retrieval part of the RAG pipeline "
                "for user journeys from umami tracking data. "
                "User journey data is retrieved for all users who performed the selected event. "
                "Then the data is then chunked into documents and embedded into a vector database. "
                "Similarity search based of the users question is then used to retrieve "
                "the most relevant documents. "
                "These documents are returned for use in answering the user's question.\n\n"
                + _NO_RESULTS_NOTE
            ),
            inputSchema=_object_schema(
                {
                    "user_question": _string_property(
                        "The user's question to be used to retrieve relevant documents. "
                        "This does not have to be word for word the same as the question "
                        "the user asked, but should allow for the most relevant documents "
                        "to be retrieved."
                    ),
                    "selected_event": _string_property(
                        "The event to filter the session ids by. " + _EVENT_CHOICES
                    ),
                    "website_id": _string_property(
                        "The ID of the website to get user journey data from"
                    ),
                    "start_at": _START_AT_PROPERTY,
                    "end_at": _END_AT_PROPERTY,
                },
                ["user_question", "selected_event", "website_id", "start_at", "end_at"],
            ),
        ),
        Tool(
            name="get_screenshot",
            description="Get a screenshot of a webpage for a specified URL",
            inputSchema=_object_schema(
                {"url": _string_property("URL of the webpage to screenshot for")},
                ["url"],
            ),
        ),
        Tool(
            name="get_html",
            description="Get the HTML code of a webpage for a specified URL",
            inputSchema=_object_schema(
                {"url": _string_property("URL of the webpage to get the HTML code for")},
                ["url"],
            ),
        ),
        Tool(
            name="get_pageview_series",
            description=(
                "Get the pageview data series for a specific website within a time range. "
                "The data is grouped by the specified time unit (hour, day, month) and "
                "includes the number of pageviews and sessions for each time period.\n\n"
                + _NO_RESULTS_NOTE
            ),
            inputSchema=_object_schema(
                {
                    "website_id": _string_property(
                        "ID of the website to get pageview data for"
                    ),
                    "start_at": _START_AT_PROPERTY,
                    "end_at": _END_AT_PROPERTY,
                    "unit": {
                        "type": "string",
                        "description": "Time unit for grouping data (hour, day, or month)",
                        "enum": ["hour", "day", "month"],
                    },
                    "timezone": _string_property(
                        "Timezone for the data (e.g., 'UTC', 'Europe/London')"
                    ),
                },
                ["website_id", "start_at", "end_at", "unit", "timezone"],
            ),
        ),
        Tool(
            name="get_active_visitors",
            description=(
                "Get the current number of active visitors on a specific website. "
                "This provides real-time data about how many visitors are currently "
                "on the website."
            ),
            inputSchema=_object_schema(
                {
                    "website_id": _string_property(
                        "ID of the website to get active visitor data for"
                    )
                },
                ["website_id"],
            ),
        ),
    ]


@app.list_prompts()
async def list_prompts():
    """List available prompts for analytics dashboard creation."""
    return [
        {
            "name": "Create Dashboard",
            "description": "Guide for creating comprehensive analytics dashboards using website metrics and stats",
            "arguments": [
                {
                    "name": "Website Name",
                    "description": "Name of the website to analyze",
                    "required": True,
                },
                {
                    "name": "Start Date (YYYY-MM-DD)",
                    "description": "Start date for analysis (YYYY-MM-DD or YYYY-MM-DD HH:MM:SS)",
                    "required": True,
                },
                {
                    "name": "End Date (YYYY-MM-DD)",
                    "description": "End date for analysis (YYYY-MM-DD or YYYY-MM-DD HH:MM:SS)",
                    "required": True,
                },
                {
                    "name": "Timezone",
                    "description": "Timezone for the analysis (e.g., 'UTC', 'Europe/London')",
                    "required": True,
                },
            ],
        }
    ]


@app.get_prompt()
async def get_prompt(name: str, arguments: Any):
    """Handle prompt requests.

    Args:
        name: Name of the requested prompt; only "Create Dashboard" is known.
        arguments: Mapping holding the prompt arguments "Website Name",
            "Start Date (YYYY-MM-DD)", "End Date (YYYY-MM-DD)" and "Timezone".

    Returns:
        A dict with a single user message containing the dashboard guide.

    Raises:
        ValueError: If the prompt name is unknown or a prompt argument is missing.
    """
    if name != "Create Dashboard":
        raise ValueError(f"Unknown prompt: {name}")

    arguments = arguments or {}
    missing = [key for key in _DASHBOARD_PROMPT_ARGS if key not in arguments]
    if missing:
        # Fail with a readable message instead of a bare KeyError.
        raise ValueError(f"Missing required prompt arguments: {missing}")

    text = _DASHBOARD_PROMPT_TEMPLATE.format(
        website_name=arguments["Website Name"],
        start_date=arguments["Start Date (YYYY-MM-DD)"],
        end_date=arguments["End Date (YYYY-MM-DD)"],
        timezone=arguments["Timezone"],
    )
    return {
        "messages": [
            {
                "role": "user",
                "content": {"type": "text", "text": text},
            }
        ]
    }


def _require_args(arguments: dict[str, Any], required_args: list[str]) -> None:
    """Raise ValueError unless every name in ``required_args`` is in ``arguments``."""
    if not all(arg in arguments for arg in required_args):
        raise ValueError(f"Missing required arguments. Need: {required_args}")


def _json_text(payload: Any) -> list[TextContent]:
    """Wrap ``payload`` as a single pretty-printed JSON text content item."""
    return [TextContent(type="text", text=json.dumps(payload, indent=2))]


def _event_filter(value: Any) -> Any:
    """Map the "no filter" marker (the string "None", or None) to None."""
    return None if value is None or value == "None" else value


@app.call_tool()
async def call_tool(name: str, arguments: Any) -> Sequence[TextContent | ImageContent | EmbeddedResource]:
    """Handle tool calls for tracking data.

    Args:
        name: Name of the tool to run (one of those returned by ``list_tools``).
        arguments: Mapping of tool arguments. For date-based tools, "start_at"
            and "end_at" are date strings that are converted to Unix
            milliseconds before the API is called.

    Returns:
        A list with one ``TextContent`` (or ``ImageContent`` for
        ``get_screenshot``) carrying the result.

    Raises:
        ValueError: On an unknown tool, missing arguments, an invalid date, or
            when a stats/metrics/series/active-visitors request returns None.
        RuntimeError: On authentication failure or when fetching websites,
            tracking data, documents, screenshots or HTML fails.
    """
    # Work on a copy so the date conversion below never mutates the caller's dict.
    arguments = dict(arguments or {})

    # Verify token before making any request
    if not client.verify_token():
        # Try to login again if token is invalid
        if not client.login(API_USERNAME, API_PASSWORD):
            raise RuntimeError("Failed to re-authenticate with Umami API")

    # Convert date strings to Unix timestamps for relevant tools
    if name in _DATE_TOOLS:
        if "start_at" in arguments:
            arguments["start_at"] = convert_date_to_unix(arguments["start_at"], end_of_day=False)
        if "end_at" in arguments:
            arguments["end_at"] = convert_date_to_unix(arguments["end_at"], end_of_day=True)

    if name == "get_websites":
        team_id = arguments.get("team_id", TEAM_ID)
        websites = client.get_websites(team_id)
        if websites is None:
            raise RuntimeError("Failed to fetch websites data")
        return _json_text(websites)

    elif name == "get_session_ids":
        _require_args(arguments, ["website_id", "start_at", "end_at", "event_name"])
        ids = get_session_ids(
            arguments["website_id"],
            _event_filter(arguments["event_name"]),
            arguments["start_at"],
            arguments["end_at"],
        )
        return _json_text(ids)

    elif name == "get_tracking_data":
        _require_args(arguments, ["website_id", "start_at", "end_at", "session_id"])
        user_activity = client.get_user_activity(
            website_id=arguments["website_id"],
            session_id=arguments["session_id"],
            start_at=arguments["start_at"],
            end_at=arguments["end_at"],
        )
        if user_activity is None:
            raise RuntimeError("Failed to fetch tracking data")
        return _json_text(user_activity)

    elif name == "get_website_stats":
        _require_args(arguments, ["website_id", "start_at", "end_at"])
        stats = client.get_website_stats(
            website_id=arguments["website_id"],
            start_at=arguments["start_at"],
            end_at=arguments["end_at"],
        )
        if stats is None:
            raise ValueError(f"Failed to get stats for website {arguments['website_id']}")
        return _json_text(stats)

    elif name == "get_website_metrics":
        _require_args(arguments, ["website_id", "start_at", "end_at", "type"])
        metrics = client.get_website_metrics(
            website_id=arguments["website_id"],
            start_at=arguments["start_at"],
            end_at=arguments["end_at"],
            type=arguments["type"],
        )
        # Report a failed request explicitly, as the sibling tools do, rather
        # than returning the text "null".
        if metrics is None:
            raise ValueError(f"Failed to get metrics for website {arguments['website_id']}")
        return _json_text(metrics)

    elif name == "get_docs":
        _require_args(
            arguments,
            ["user_question", "selected_event", "website_id", "start_at", "end_at"],
        )
        try:
            ids = get_session_ids(
                arguments["website_id"],
                _event_filter(arguments["selected_event"]),
                arguments["start_at"],
                arguments["end_at"],
            )

            # One JSON document per session that has any recorded activity.
            user_activity_list = []
            for session_id in ids:
                user_activity = client.get_user_activity(
                    website_id=arguments["website_id"],
                    session_id=session_id,
                    start_at=arguments["start_at"],
                    end_at=arguments["end_at"],
                )
                if user_activity:
                    user_activity_list.append(json.dumps(user_activity, indent=2))

            docs = await get_chunks(user_activity_list, arguments["user_question"])

            # Convert docs to string format
            docs_text = "\n\n".join(doc.page_content for doc in docs)
            return [
                TextContent(
                    text=docs_text,
                    type="text",
                    mimeType="text/plain",
                )
            ]
        except Exception as e:
            raise RuntimeError(f"Failed to get tracking data: {str(e)}") from e

    elif name == "get_screenshot":
        if "url" not in arguments:
            raise ValueError("Missing required argument: url")
        try:
            screenshot_base64 = await crawler.get_screenshot(arguments["url"])
            if not screenshot_base64:
                raise RuntimeError("No screenshot data returned")
            return [
                ImageContent(
                    type="image",
                    mimeType="image/jpeg",
                    data=screenshot_base64,
                )
            ]
        except TimeoutError as e:
            raise RuntimeError(f"Screenshot timed out: {str(e)}") from e
        except Exception as e:
            raise RuntimeError(f"Failed to get screenshot: {str(e)}") from e

    elif name == "get_html":
        if "url" not in arguments:
            raise ValueError("Missing required argument: url")
        try:
            html = await crawler.get_html(arguments["url"])
            return [TextContent(type="text", text=html)]
        except Exception as e:
            raise RuntimeError(f"Failed to get html: {str(e)}") from e

    elif name == "get_pageview_series":
        _require_args(arguments, ["website_id", "start_at", "end_at", "unit", "timezone"])
        pageview_series = client.get_pageview_series(
            website_id=arguments["website_id"],
            start_at=arguments["start_at"],
            end_at=arguments["end_at"],
            unit=arguments["unit"],
            timezone=arguments["timezone"],
        )
        if pageview_series is None:
            raise ValueError(f"Failed to get pageview series for website {arguments['website_id']}")
        return _json_text(pageview_series)

    elif name == "get_active_visitors":
        if "website_id" not in arguments:
            raise ValueError("Missing required argument: website_id")
        active_data = client.get_active(website_id=arguments["website_id"])
        if active_data is None:
            raise ValueError(f"Failed to get active visitor data for website {arguments['website_id']}")
        return _json_text(active_data)

    else:
        raise ValueError(f"Unknown tool: {name}")


async def main():
    """Run the MCP server over stdio until the streams close."""
    from mcp.server.stdio import stdio_server

    async with stdio_server() as (read_stream, write_stream):
        await app.run(
            read_stream,
            write_stream,
            app.create_initialization_options(),
        )