#!/usr/bin/env python3
"""
CKAN Analysis Components Demo (Simplified)
This script demonstrates the three core analysis components:
- RelevanceScorer: Scores datasets based on query relevance
- UpdateFrequencyAnalyzer: Analyzes dataset update patterns
- SummaryBuilder: Creates structured summaries of CKAN data
Run this script to see how the analysis components work.
"""
import asyncio
import json
import os
from datetime import datetime, timedelta
from dotenv import load_dotenv
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from ckan_mcp.ckan_tools import CkanToolsManager, CkanApiError
from ckan_mcp.types import (
CkanToolsConfig,
RelevanceWeights,
FrequencyThresholds,
CkanPackage,
CkanResource,
CkanTag,
CkanOrganization
)
from ckan_mcp.helpers import RelevanceScorer, UpdateFrequencyAnalyzer, SummaryBuilder
# Load variables from a local .env file (python-dotenv) so that the
# os.getenv() lookups further down can pick them up.
load_dotenv()
def _load_toronto_config() -> dict:
    """Read the Toronto entry from the bundled CKAN selection file.

    Returns:
        A dict holding ``ckan_base_url`` (copied from the entry's
        ``base_url``) merged with every key of the entry's optional
        ``overrides`` mapping.
    """
    script_dir = os.path.dirname(__file__)
    selection_file = os.path.join(
        script_dir, "..", "src", "ckan_mcp", "data", "ckan_config_selection.json"
    )
    with open(selection_file, "r", encoding="utf-8") as handle:
        selection = json.load(handle)

    toronto = selection["countries"]["Canada"]["locations"]["Toronto"]
    settings = {"ckan_base_url": toronto["base_url"]}
    # "overrides" is optional; an absent key contributes nothing.
    settings.update(toronto.get("overrides", {}))
    return settings
def _build_default_config() -> CkanToolsConfig:
    """Create a CKAN config with custom analysis weights.

    The Toronto defaults are read from disk once. ``CKAN_BASE_URL``,
    ``CKAN_SITE_URL`` and ``CKAN_DATASET_URL_TEMPLATE`` take precedence over
    the file values when they are set in the environment, and
    ``CKAN_API_KEY`` is forwarded only when it is non-empty.

    Returns:
        A ``CkanToolsConfig`` carrying the demo's relevance weights and
        frequency thresholds.

    Raises:
        KeyError: If a needed key is missing from the Toronto config and no
            environment override supplies it.
    """
    # Load the file a single time instead of re-parsing it for every key.
    toronto = _load_toronto_config()

    def _env_or_config(env_name: str, config_key: str) -> str:
        # Resolve the file value lazily so an environment override still
        # works when the key is absent from the file.
        override = os.getenv(env_name)
        return toronto[config_key] if override is None else override

    base_url = _env_or_config("CKAN_BASE_URL", "ckan_base_url")
    site_url = _env_or_config("CKAN_SITE_URL", "ckan_site_url")
    dataset_url_template = _env_or_config(
        "CKAN_DATASET_URL_TEMPLATE", "dataset_page_url_template"
    )

    # Custom relevance weights for demonstration
    custom_weights = RelevanceWeights(
        title=15,        # Higher weight for title matches
        description=7,   # Medium weight for description matches
        tags=5,          # Good weight for tag matches
        organization=3,  # Small weight for organization matches
        resource=2,      # Small weight for resource matches
    )

    # Custom frequency thresholds
    custom_thresholds = FrequencyThresholds(
        frequent_days=14,    # Consider updates within 14 days as frequent
        monthly_days=45,     # Consider updates within 45 days as monthly
        quarterly_days=120,  # Consider updates within 120 days as quarterly
    )

    config_kwargs = {
        "ckan_base_url": base_url,
        "ckan_site_url": site_url,
        "dataset_page_url_template": dataset_url_template,
        "action_transport": toronto["action_transport"],
        "datastore_id_alias": toronto["datastore_id_alias"],
        "helper_prompt": toronto["helper_prompt"],
        "relevance_weights": custom_weights,
        "frequency_thresholds": custom_thresholds,
    }

    # Only pass an API key through when one is actually configured.
    api_key = os.getenv("CKAN_API_KEY")
    if api_key:
        config_kwargs["api_key"] = api_key

    return CkanToolsConfig(**config_kwargs)
def create_sample_dataset() -> CkanPackage:
    """Build the fixed traffic-volumes package used by every demo."""
    publisher = CkanOrganization(
        id="city-of-toronto",
        name="city-of-toronto",
        title="City of Toronto",
    )

    # Each sample tag uses the same string for its id and its name.
    tag_names = ("transportation", "traffic", "real-time", "api")
    sample_tags = [CkanTag(id=label, name=label) for label in tag_names]

    csv_resource = CkanResource(
        id="resource-1",
        name="Traffic Volume Data",
        format="CSV",
        size=1048576,
        datastore_active=True,
        url="https://example.com/traffic-data.csv",
    )
    pdf_resource = CkanResource(
        id="resource-2",
        name="API Documentation",
        format="PDF",
        size=524288,
        datastore_active=False,
        url="https://example.com/api-doc.pdf",
    )

    description = (
        "This dataset contains traffic volume counts for various intersections across the City of Toronto.\n"
        "Data is collected through automated traffic counters and updated on a regular basis.\n"
        "The dataset includes information about vehicle types, traffic patterns, and peak hours.\n"
        "This data is essential for transportation planning and traffic management."
    )

    return CkanPackage(
        id="traffic-volumes-toronto",
        name="traffic-volumes-toronto",
        title="Traffic Volumes - Toronto Transportation",
        notes=description,
        tags=sample_tags,
        organization=publisher,
        resources=[csv_resource, pdf_resource],
        metadata_created="2023-01-15T10:30:00Z",
        metadata_modified="2024-11-20T14:45:00Z",
        refresh_rate="daily",
    )
async def demo_relevance_scorer():
    """Demonstrate RelevanceScorer functionality.

    Prints the configured relevance weights, the sample dataset's key
    fields, and the score the manager's scorer assigns to that dataset for
    a handful of single-word queries.
    """
    print("🎯 Demo 1: Relevance Scorer")
    print("=" * 60)

    manager = CkanToolsManager(_build_default_config())
    scorer = manager.scorer

    weights = manager.config.relevance_weights
    print("Configuration:")
    print(f" Title weight: {weights.title}")
    print(f" Description weight: {weights.description}")
    print(f" Tags weight: {weights.tags}")
    print(f" Organization weight: {weights.organization}")
    print(f" Resource weight: {weights.resource}")
    print()

    dataset = create_sample_dataset()
    print(f"Dataset: {dataset.title}")
    # Only the first 100 characters of the description are shown.
    print(f"Description: {dataset.notes[:100]}...")
    print(f"Organization: {dataset.organization.title}")
    print(f"Tags: {', '.join(tag.name for tag in dataset.tags)}")
    print()

    test_queries = ["traffic", "transportation", "toronto", "data", "api", "weather"]
    print("Relevance Scores:")
    print("-" * 30)
    for query in test_queries:
        score = scorer.score(dataset, query)
        print(f"Query: '{query:12}' -> Score: {score:3d}")
    print()
async def demo_update_frequency_analyzer():
    """Demonstrate UpdateFrequencyAnalyzer functionality.

    Prints the configured frequency thresholds, then builds one variant of
    the sample dataset per explicit ``refresh_rate`` value and prints the
    category the manager's analyzer assigns to it.
    """
    print("📊 Demo 2: Update Frequency Analyzer")
    print("=" * 60)

    manager = CkanToolsManager(_build_default_config())
    analyzer = manager.analyzer

    thresholds = manager.config.frequency_thresholds
    print("Configuration:")
    print(f" Frequent threshold: {thresholds.frequent_days} days")
    print(f" Monthly threshold: {thresholds.monthly_days} days")
    print(f" Quarterly threshold: {thresholds.quarterly_days} days")
    print()

    # Test explicit patterns
    explicit_patterns = [
        ("Daily Weather Data", "daily"),
        ("Weekly Crime Statistics", "weekly"),
        ("Monthly Financial Reports", "monthly"),
        ("Quarterly Performance Metrics", "quarterly"),
        ("Annual Population Estimates", "annually"),
        ("Irregular Special Events", "irregular"),
    ]

    print("Explicit Pattern Analysis:")
    print("-" * 40)
    template = create_sample_dataset()
    for name, pattern in explicit_patterns:
        # Copy the model instead of dumping it to a dict and rebuilding it
        # without validation: the copy keeps tags, organization and
        # resources as model instances rather than plain dicts.
        dataset = template.model_copy(
            update={
                "refresh_rate": pattern,
                "name": name.lower().replace(" ", "-"),
            }
        )
        category = analyzer.categorize(dataset)
        print(f"{name:30} -> {category.upper()}")
    print()
async def demo_summary_builder():
    """Demonstrate SummaryBuilder functionality.

    Prints a few fields of the sample dataset, then the package summary and
    one resource summary per resource as produced by the manager's summary
    builder.
    """
    print("📋 Demo 3: Summary Builder")
    print("=" * 60)

    manager = CkanToolsManager(_build_default_config())
    summary_builder = manager.summary_builder
    dataset = create_sample_dataset()

    print("Original Dataset:")
    print("-" * 30)
    print(f"Title: {dataset.title}")
    print(f"Organization: {dataset.organization.title}")
    print(f"Resources: {len(dataset.resources)}")
    print()

    # Create package summary
    package_summary = summary_builder.package(dataset)
    print("Package Summary:")
    print("-" * 20)
    print(f"ID: {package_summary.id}")
    print(f"Title: {package_summary.title}")
    print(f"Description: {package_summary.description}")
    print(f"Organization: {package_summary.organization}")
    print(f"Tags: {package_summary.tags}")
    print(f"Resource Count: {package_summary.resource_count}")
    print(f"Datastore Resources: {package_summary.datastore_resources}")
    print(f"URL: {package_summary.url}")
    print()

    # Create resource summaries
    print("Resource Summaries:")
    print("-" * 25)
    for i, resource in enumerate(dataset.resources, 1):
        resource_summary = summary_builder.resource(resource)
        print(f"Resource {i}:")
        print(f" Name: {resource_summary.name}")
        print(f" Format: {resource_summary.format}")
        # A falsy size (None or 0) is reported as unknown.
        print(f" Size: {resource_summary.size:,} bytes" if resource_summary.size else " Size: Unknown")
        print(f" Datastore Active: {resource_summary.datastore_active}")
        print()
    print()
async def main():
    """Run all analysis demonstrations.

    Prints a banner, warns when ``CKAN_BASE_URL`` is not set, then awaits
    the three demos in order. A ``KeyboardInterrupt`` or any other
    ``Exception`` raised by a demo is reported on stdout (the latter with a
    traceback) instead of being propagated.
    """
    print("🔬 CKAN MCP Server Analysis Components Demo")
    print("=" * 70)
    print("This demonstrates the three core analysis components:")
    print("1. RelevanceScorer - Ranks datasets by query relevance")
    print("2. UpdateFrequencyAnalyzer - Categorizes update patterns")
    print("3. SummaryBuilder - Creates structured summaries")
    print()

    if not os.getenv("CKAN_BASE_URL"):
        print("⚠️ Warning: CKAN_BASE_URL not set, using default")
        print()

    try:
        await demo_relevance_scorer()
        await demo_update_frequency_analyzer()
        await demo_summary_builder()
        print("✅ All analysis demonstrations completed!")
    except KeyboardInterrupt:
        print("\n⏹️ Demonstrations interrupted by user")
    except Exception as e:
        # Top-level boundary of the script: report the failure with its
        # traceback rather than letting the demo crash.
        print(f"\n❌ Error running demonstrations: {e}")
        import traceback
        traceback.print_exc()
# Run the demos only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())