#!/usr/bin/env python3
"""
Raw Model Test - No Pipeline (25 Diverse Websites)
Compare Kimi K2 vs MiniMax M2 output quality without any pipeline processing.
Just system prompt + user prompt → raw output across 25 different site types.
Usage:
python scripts/raw_model_test.py # Run all 25 prompts on both models
python scripts/raw_model_test.py --model kimi # Run all 25 on Kimi only
python scripts/raw_model_test.py --limit 5 # Run first 5 prompts only
python scripts/raw_model_test.py --index 3 # Run only prompt #3
python scripts/raw_model_test.py --concurrency 2 # Run 2 at a time
"""
import argparse
import asyncio
import json
import os
import sys
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
# Add project root to path so `python scripts/raw_model_test.py` can import
# project-local modules regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent.parent))
# 25 Diverse Website Prompts
# Each entry: stable "id" (used in output filenames), "mood"/"accent" brand
# knobs injected into the user prompt, and the business "prompt" brief itself.
TEST_PROMPTS: list[dict[str, str]] = [
    # 1-5: Local Services
    {
        "id": "01_dog_walking",
        "mood": "LIGHT", "accent": "teal",
        "prompt": "Premium dog walking service in Austin. GPS tracking, certified handlers, daily photo updates. $25/walk, monthly packages available. 500+ happy pups served."
    },
    {
        "id": "02_tattoo_studio",
        "mood": "DARK", "accent": "red",
        "prompt": "High-end tattoo studio in Brooklyn. Custom designs, award-winning artists, sterile environment. Consultations required. Featured in Inked Magazine."
    },
    {
        "id": "03_yoga_studio",
        "mood": "LIGHT", "accent": "violet",
        "prompt": "Boutique yoga studio in Denver. Hot yoga, meditation, aerial classes. First class free. 200+ 5-star reviews. Certified instructors."
    },
    {
        "id": "04_auto_detailing",
        "mood": "DARK", "accent": "orange",
        "prompt": "Mobile auto detailing in Miami. Ceramic coating, paint correction, interior restoration. We come to you. Satisfaction guaranteed. 1000+ cars detailed."
    },
    {
        "id": "05_bakery",
        "mood": "LIGHT", "accent": "pink",
        "prompt": "Artisan bakery in Portland. Sourdough, croissants, custom cakes. Organic ingredients, baked fresh daily. Order online, pickup or delivery."
    },
    # 6-10: SaaS/Tech
    {
        "id": "06_analytics_saas",
        "mood": "DARK", "accent": "blue",
        "prompt": "Real-time analytics platform for e-commerce. Track conversions, customer journeys, revenue attribution. Integrates with Shopify, WooCommerce. Free 14-day trial."
    },
    {
        "id": "07_project_management",
        "mood": "LIGHT", "accent": "green",
        "prompt": "AI-powered project management for remote teams. Auto-scheduling, workload balancing, Slack integration. Used by 10K+ teams. SOC2 certified."
    },
    {
        "id": "08_email_marketing",
        "mood": "DARK", "accent": "violet",
        "prompt": "Email marketing automation for creators. Drag-drop builder, AI subject lines, advanced segmentation. Free up to 1000 subscribers. 99.9% deliverability."
    },
    {
        "id": "09_api_platform",
        "mood": "DARK", "accent": "cyan",
        "prompt": "Unified API for payment processing. One integration, 50+ payment methods. 99.99% uptime. PCI compliant. Used by Fortune 500 companies."
    },
    {
        "id": "10_design_tool",
        "mood": "LIGHT", "accent": "orange",
        "prompt": "Browser-based design tool for non-designers. AI-generated layouts, brand kit management, team collaboration. Free tier available. 50K+ users."
    },
    # 11-15: E-commerce/Products
    {
        "id": "11_coffee_subscription",
        "mood": "DARK", "accent": "amber",
        "prompt": "Premium coffee subscription. Single-origin beans, roasted fresh, shipped weekly. Personalized taste profiles. Skip or cancel anytime. $15/bag."
    },
    {
        "id": "12_skincare_brand",
        "mood": "LIGHT", "accent": "rose",
        "prompt": "Clean skincare for sensitive skin. Dermatologist-developed, fragrance-free, cruelty-free. 30-day money-back guarantee. As seen in Vogue."
    },
    {
        "id": "13_fitness_equipment",
        "mood": "DARK", "accent": "red",
        "prompt": "Smart home gym equipment. AI personal trainer, compact design, 500+ workouts. Financing available. 10K+ home gyms shipped."
    },
    {
        "id": "14_pet_food",
        "mood": "LIGHT", "accent": "green",
        "prompt": "Fresh pet food delivered. Vet-formulated, human-grade ingredients, personalized portions. 90% of dogs show improved health in 30 days."
    },
    {
        "id": "15_sustainable_fashion",
        "mood": "LIGHT", "accent": "teal",
        "prompt": "Sustainable fashion marketplace. Verified ethical brands, carbon-neutral shipping, circular fashion program. B-Corp certified."
    },
    # 16-20: Professional Services
    {
        "id": "16_law_firm",
        "mood": "DARK", "accent": "blue",
        "prompt": "Tech startup law firm in San Francisco. IP, fundraising, M&A. Fixed-fee packages. Trusted by 200+ YC companies. Free 30-min consultation."
    },
    {
        "id": "17_accounting",
        "mood": "LIGHT", "accent": "green",
        "prompt": "Accounting for freelancers and creators. Tax prep, quarterly estimates, bookkeeping. Flat monthly fee. Save an average of $3K in taxes."
    },
    {
        "id": "18_therapy_practice",
        "mood": "LIGHT", "accent": "violet",
        "prompt": "Online therapy practice. Licensed therapists, flexible scheduling, insurance accepted. Specializing in anxiety, depression, relationships. HIPAA compliant."
    },
    {
        "id": "19_consulting",
        "mood": "DARK", "accent": "amber",
        "prompt": "Growth consulting for DTC brands. Revenue optimization, customer acquisition, retention strategies. Average 3X ROI for clients. Book a strategy call."
    },
    {
        "id": "20_recruiting",
        "mood": "LIGHT", "accent": "blue",
        "prompt": "Technical recruiting for startups. Senior engineers, product managers, designers. Success-based fees. Average 21-day time-to-hire. 95% retention rate."
    },
    # 21-25: Unique/Creative
    {
        "id": "21_escape_room",
        "mood": "DARK", "accent": "red",
        "prompt": "Immersive escape rooms in Chicago. Horror, mystery, sci-fi themes. 2-8 players. Corporate team building available. #1 rated on TripAdvisor."
    },
    {
        "id": "22_music_lessons",
        "mood": "LIGHT", "accent": "violet",
        "prompt": "Online music lessons for adults. Guitar, piano, voice. Learn songs you love. Flexible scheduling, patient instructors. First lesson free."
    },
    {
        "id": "23_coworking",
        "mood": "LIGHT", "accent": "teal",
        "prompt": "Design-forward coworking in Seattle. Private offices, hot desks, meeting rooms. 24/7 access, craft coffee, community events. Day passes available."
    },
    {
        "id": "24_wedding_planning",
        "mood": "LIGHT", "accent": "rose",
        "prompt": "Luxury wedding planning in Napa Valley. Full-service coordination, vendor relationships, day-of management. 100+ dream weddings. Featured in Martha Stewart."
    },
    {
        "id": "25_arcade_gaming",
        "mood": "DARK", "accent": "teal",
        "prompt": "Premium pixel-art arcade in Minneapolis. Members-only lounge, monthly tournaments, authentic cabinet collection. Free trial, monthly plans. 10K+ happy gamers."
    },
]
from google.oauth2 import service_account
from google.auth.transport.requests import Request as GoogleRequest
import urllib.request
import urllib.error
# Config — Vertex AI MaaS (model-as-a-service) endpoint settings.
PROJECT_ID = "gen-lang-client-0707199026"
REGION = "global"
# OpenAI-compatible chat-completions base URL for Vertex partner models.
VERTEX_BASE = f"https://aiplatform.googleapis.com/v1beta1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/openapi"
# Models — CLI key -> fully-qualified MaaS model id.
MODELS = {
    "kimi": "moonshotai/kimi-k2-thinking-maas",
    "minimax": "minimaxai/minimax-m2-maas",
}
# Load system prompt — expected at <repo-root>/prompts/titan_ui_system_long.txt.
PROMPT_PATH = Path(__file__).parent.parent / "prompts" / "titan_ui_system_long.txt"
def get_token():
    """Return a fresh Vertex AI OAuth2 access token.

    Tries the known key-file locations first; any file that is not a real
    service-account key is skipped. Falls back to Application Default
    Credentials (env var / gcloud / metadata server).

    Returns:
        str: a bearer token for the cloud-platform scope.
    """
    scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    key_paths = [
        Path.home() / ".config/kilo-code/kilo-code-vertex-key.json",
        Path.home() / ".config/gcloud/application_default_credentials.json",
    ]
    for key_path in key_paths:
        if not key_path.exists():
            continue
        # Bug fix: the gcloud ADC file has type "authorized_user", which
        # from_service_account_file() rejects with a ValueError — inspect the
        # "type" field and only load genuine service-account keys here.
        try:
            info = json.loads(key_path.read_text())
        except (OSError, ValueError):
            continue
        if info.get("type") != "service_account":
            continue
        credentials = service_account.Credentials.from_service_account_file(
            str(key_path),
            scopes=scopes,
        )
        credentials.refresh(GoogleRequest())
        return credentials.token
    # Try default credentials (handles the ADC file and metadata server).
    import google.auth
    credentials, _ = google.auth.default(scopes=scopes)
    credentials.refresh(GoogleRequest())
    return credentials.token
def load_system_prompt():
    """Return the TITAN UI system prompt, or a minimal fallback prompt.

    Reads the prompt file at PROMPT_PATH when present; otherwise warns and
    returns a bare-bones generator instruction so the script can still run.
    """
    if not PROMPT_PATH.exists():
        print(f"Warning: System prompt not found at {PROMPT_PATH}")
        return "You are a UI code generator. Output valid JSON with TSX code."
    return PROMPT_PATH.read_text()
def call_model(model_key: str, user_prompt: str, system_prompt: str) -> dict:
    """Send one chat-completion request to a Vertex MaaS model.

    Never raises: returns a dict carrying "success" plus either the model
    "content"/"usage" or an "error" string, always tagged with the model key
    and fully-qualified model id.
    """
    model_id = MODELS[model_key]
    bearer = get_token()
    body = {
        "model": model_id,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "max_tokens": 12000,
        "temperature": 0.85,
    }
    request = urllib.request.Request(
        f"{VERTEX_BASE}/chat/completions",
        data=json.dumps(body).encode(),
        headers={
            "Authorization": f"Bearer {bearer}",
            "Content-Type": "application/json",
        },
        method="POST",
    )
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Calling {model_key.upper()} ({model_id})...")
    print(f"{banner}")
    tagged = {"model": model_key, "model_id": model_id}
    try:
        with urllib.request.urlopen(request, timeout=180) as resp:
            parsed = json.loads(resp.read().decode())
            return {
                **tagged,
                "success": True,
                "content": parsed["choices"][0]["message"]["content"],
                "usage": parsed.get("usage", {}),
            }
    except urllib.error.HTTPError as http_err:
        detail = http_err.read().decode()
        return {
            **tagged,
            "success": False,
            "error": f"HTTP {http_err.code}: {detail[:500]}",
        }
    except Exception as exc:
        return {
            **tagged,
            "success": False,
            "error": str(exc),
        }
def save_output(result: dict, user_prompt: str, output_dir: Path):
    """Persist one model result (success or error) as a timestamped JSON file.

    Returns the Path of the file that was written.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = output_dir / f"{result['model']}_{stamp}.json"
    record = {
        "timestamp": stamp,
        "model": result["model"],
        "model_id": result["model_id"],
        "user_prompt": user_prompt,
        "success": result["success"],
    }
    if not result["success"]:
        record["error"] = result["error"]
    else:
        record["content"] = result["content"]
        record["usage"] = result.get("usage", {})
        # Try to extract just the JSON part (after </think>)
        raw = result["content"]
        if "</think>" in raw:
            record["json_extracted"] = raw.split("</think>", 1)[1].strip()
    destination.write_text(json.dumps(record, indent=2))
    print(f"Saved: {destination}")
    return destination
def build_full_prompt(test_case: dict) -> str:
    """Assemble the full user prompt: output-mode header, brand knobs, brief."""
    sections = [
        "OUTPUT_MODE=TSX_ONLY",
        "Creative risk: high",
        f"Brand mood: {test_case['mood']}",
        f"Brand accent: {test_case['accent']}",
        "Create a landing page for the following business:",
        test_case["prompt"],
        "Make it feel premium and distinctive, not generic. Follow all accessibility requirements.",
    ]
    return "\n".join(sections)
def run_single_test(test_case: dict, model_key: str, system_prompt: str, output_dir: Path) -> dict:
    """Run a single test case for a single model.

    Calls the model, tags the result with the prompt id and test case, and on
    success writes a JSON artifact into *output_dir* (path recorded under
    result["filepath"]). Never raises; failures are reported in the dict.
    """
    prompt_id = test_case["id"]
    user_prompt = build_full_prompt(test_case)
    print(f" [{prompt_id}] {model_key.upper()}...")
    result = call_model(model_key, user_prompt, system_prompt)
    result["prompt_id"] = prompt_id
    result["test_case"] = test_case
    if result["success"]:
        # Save with prompt ID in filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{prompt_id}_{model_key}_{timestamp}.json"
        filepath = output_dir / filename
        output = {
            "timestamp": timestamp,
            "prompt_id": prompt_id,
            "model": model_key,
            "model_id": result["model_id"],
            "test_case": test_case,
            "user_prompt": user_prompt,
            "success": True,
            "content": result["content"],
            "usage": result.get("usage", {}),
        }
        # Extract JSON part (thinking models prefix a <think> block).
        content = result["content"]
        if "</think>" in content:
            json_part = content.split("</think>", 1)[1].strip()
            output["json_extracted"] = json_part
        filepath.write_text(json.dumps(output, indent=2))
        result["filepath"] = str(filepath)
        # Bug fix: this message previously printed the literal placeholder
        # "(unknown)" instead of the path the artifact was saved to.
        print(f" OK - saved to {filepath}")
    else:
        print(f" FAILED: {result['error'][:100]}")
    return result
def main():
    """CLI entry point: run the selected prompts against the selected models.

    Selects a subset of TEST_PROMPTS and models from the CLI flags, fans the
    work out (sequentially or via a thread pool), prints a summary, and writes
    a machine-readable summary JSON into the output directory.
    """
    parser = argparse.ArgumentParser(description="Raw model test - 25 diverse websites")
    parser.add_argument("--model", choices=["kimi", "minimax"], help="Test only one model")
    parser.add_argument("--limit", type=int, help="Limit to first N prompts")
    parser.add_argument("--index", type=int, help="Run only prompt at this index (0-24)")
    parser.add_argument("--concurrency", type=int, default=1, help="How many to run in parallel")
    parser.add_argument("--output-dir", type=Path, default=Path("out/raw_tests"), help="Output directory")
    parser.add_argument("--list", action="store_true", help="List all prompts and exit")
    args = parser.parse_args()
    # List mode: show all prompt ids/briefs without calling any model.
    if args.list:
        print("Available test prompts:")
        for i, tc in enumerate(TEST_PROMPTS):
            print(f" {i:2d}. [{tc['id']}] {tc['mood']}/{tc['accent']} - {tc['prompt'][:60]}...")
        return
    system_prompt = load_system_prompt()
    args.output_dir.mkdir(parents=True, exist_ok=True)
    # Select prompts: --index wins over --limit; default is all 25.
    if args.index is not None:
        prompts = [TEST_PROMPTS[args.index]]
    elif args.limit:
        prompts = TEST_PROMPTS[:args.limit]
    else:
        prompts = TEST_PROMPTS
    # Select models: one via --model, otherwise both.
    if args.model:
        models = [args.model]
    else:
        models = ["kimi", "minimax"]
    total_tasks = len(prompts) * len(models)
    print(f"Running {len(prompts)} prompts × {len(models)} models = {total_tasks} total generations")
    print(f"Output directory: {args.output_dir}")
    print(f"Concurrency: {args.concurrency}")
    print()
    all_results = []
    success_count = 0
    fail_count = 0
    if args.concurrency == 1:
        # Sequential: preserves prompt order in the console output.
        for i, test_case in enumerate(prompts):
            print(f"\n[{i+1}/{len(prompts)}] {test_case['id']}")
            for model_key in models:
                result = run_single_test(test_case, model_key, system_prompt, args.output_dir)
                all_results.append(result)
                if result["success"]:
                    success_count += 1
                else:
                    fail_count += 1
    else:
        # Parallel with ThreadPoolExecutor (threads are fine: the work is
        # HTTP-bound). Results arrive in completion order, not prompt order.
        tasks = [(tc, m) for tc in prompts for m in models]
        with ThreadPoolExecutor(max_workers=args.concurrency) as executor:
            futures = {
                executor.submit(run_single_test, tc, m, system_prompt, args.output_dir): (tc, m)
                for tc, m in tasks
            }
            for future in as_completed(futures):
                result = future.result()
                all_results.append(result)
                if result["success"]:
                    success_count += 1
                else:
                    fail_count += 1
    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total: {total_tasks} | Success: {success_count} | Failed: {fail_count}")
    print()
    # Per-model breakdown
    for model_key in models:
        model_results = [r for r in all_results if r["model"] == model_key]
        model_success = sum(1 for r in model_results if r["success"])
        print(f" {model_key.upper()}: {model_success}/{len(model_results)} succeeded")
    # List failures (truncated error text to keep the console readable)
    failures = [r for r in all_results if not r["success"]]
    if failures:
        print(f"\nFailures:")
        for r in failures:
            print(f" - {r.get('prompt_id', '?')} / {r['model']}: {r.get('error', 'unknown')[:80]}")
    print(f"\nOutputs saved to: {args.output_dir}")
    # Save summary JSON alongside the per-generation artifacts.
    summary_path = args.output_dir / f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    summary = {
        "timestamp": datetime.now().isoformat(),
        "total": total_tasks,
        "success": success_count,
        "failed": fail_count,
        "models": models,
        "prompts_count": len(prompts),
        "results": [
            {
                "prompt_id": r.get("prompt_id"),
                "model": r["model"],
                "success": r["success"],
                "filepath": r.get("filepath"),
                "error": r.get("error"),
            }
            for r in all_results
        ],
    }
    summary_path.write_text(json.dumps(summary, indent=2))
    print(f"Summary saved to: {summary_path}")


if __name__ == "__main__":
    main()