"""Judge module - scores candidates using vision models or heuristics."""
import asyncio
import re
from pathlib import Path
from typing import Any
from titan_factory.config import Config
from titan_factory.providers import Message, ProviderFactory
from titan_factory.schema import (
Candidate,
CandidateStatus,
CreativeDirectorFeedback,
JudgeScore,
PremiumGate,
)
from titan_factory.utils import extract_json_strict, log_error, log_info, log_warning
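# Illustrative usage (hedged sketch; the real call sites live in the pipeline code, which is
# not part of this module):
#
#     candidates = await filter_broken_candidates(candidates, config)   # optional broken-render gate
#     candidates = await score_all_candidates(candidates, config)       # vision or heuristic scoring
#     candidates = await assess_premium_candidates(candidates, config)  # optional premium labeling
#
# All three entry points are failure-tolerant: judge/model errors keep the candidate
# (pass-through score or "not discarded") rather than dropping it.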
# === Vision Judge Prompt ===
JUDGE_SYSTEM_PROMPT = """You are a UI design quality judge. Score the provided screenshots on a scale of 0-10.
SCORING CRITERIA (each worth 2 points):
1. Visual polish: Premium feel, attention to detail, typography quality
2. Layout: Proper spacing, alignment, visual hierarchy, responsive design
3. Branding: Consistent colors, mood, style throughout
4. Usability: Clear CTAs, readable text, intuitive navigation
5. Completeness: All sections present, no broken layouts, no placeholder issues
CRITICAL RULES:
1. Output ONLY valid JSON - no markdown, no explanation, no <think>
2. Be strict but fair - only score 8+ for truly premium designs
3. Note specific issues that could be fixed
OUTPUT FORMAT:
{
"score": 7.5,
"pass": true,
"issues": ["Heading text too small on mobile", "CTA button needs more contrast"],
"highlights": ["Excellent typography choices", "Clean layout"],
"fix_suggestions": ["Increase heading font size by 20%", "Add bg-blue-600 to CTA"]
}
Start with { and end with }."""
JUDGE_USER_PROMPT = """Score these screenshots of a {page_type} page.
SCREENSHOTS PROVIDED (in order):
{viewport_labels}
Expected style:
- Mood: {mood}
- Accent color: {accent}
- Density: {density}
- Style: {style_keywords}
Evaluate the design quality. Pass threshold is {threshold}/10.
Be specific in issues and suggestions. Note which viewport has issues."""
# Viewport dimensions for labeling
VIEWPORT_LABELS = {
"mobile": "Mobile (375×812)",
"tablet": "Tablet (768×1024)",
"desktop": "Desktop (1440×900)",
}
_VISION_MODEL_FALLBACKS = [
# NOTE: Some Vertex projects only expose Gemini 3 Flash as the preview model.
# We keep fallbacks within the Gemini 3 Flash family (per ops preference).
"gemini-3-flash-preview",
]
def _is_model_not_found_error(err: Exception) -> bool:
msg = str(err or "")
lowered = msg.lower()
# Be liberal in detection: providers format 404s differently (space/newline/json).
return ("404" in lowered) and (
"not found" in lowered or "not_supported" in lowered or "not supported" in lowered
)
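# Illustrative matches (made-up wording; real provider messages vary):
#   "404 NOT_FOUND: Publisher Model `...` was not found or your project does not have access"
#   "Error code: 404 - model 'gemini-x' is not supported for generateContent"
# A 404 whose body lacks "not found" / "not supported" deliberately does NOT match, so
# unrelated 404s still raise instead of silently falling back.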
async def _complete_with_vision_fallback(
*,
provider,
messages: list[Message],
model: str,
images: list[bytes],
max_tokens: int,
temperature: float,
) -> "CompletionResponse":
"""Call vision completion with safe model fallbacks when a model ID is not available.
    This prevents a config typo (or auth-mode mismatch) from silently disabling the
    vision-based gates and scoring that depend on this call.
"""
tried: list[str] = []
last_err: Exception | None = None
# Try configured model first, then fallbacks.
model_list = [model] + [m for m in _VISION_MODEL_FALLBACKS if m and m != model]
for m in model_list:
tried.append(m)
try:
return await provider.complete_with_vision(
messages=messages,
model=m,
images=images,
max_tokens=max_tokens,
temperature=temperature,
)
except Exception as e:
last_err = e
if _is_model_not_found_error(e):
continue
raise
raise RuntimeError(
"Vision model not available. Tried: " + ", ".join(tried[:6]) + f". Last error: {last_err}"
)
BROKEN_GATE_SYSTEM_PROMPT = """You are a STRICT website screenshot validator.
Your job is NOT to judge aesthetics. Only decide if the page is clearly BROKEN.
Mark broken=true ONLY when you are highly confident the page is broken, such as:
- runtime error overlay, stack trace, red error screen, "Unhandled Runtime Error"
- "Application error", "Something went wrong", Next.js error overlay
- 404 / "page could not be found"
- blank/empty page with almost no visible content
- obvious missing CSS/layout causing the page to be unusable (e.g. everything overlaps as a single blob)
If the page looks like a real website (even if ugly, boring, or low quality), broken MUST be false.
OUTPUT RULES:
- Output ONLY valid JSON (no markdown, no <think>, no explanation).
- JSON must start with { and end with }.
OUTPUT FORMAT:
{
"broken": false,
"confidence": 0.0,
"reasons": []
}"""
BROKEN_GATE_USER_PROMPT = """Decide whether this page is BROKEN.
Be conservative: if unsure, set broken=false.
Return broken=true only if confidence >= {min_confidence}.
"""
SECTION_CREATIVITY_SYSTEM_PROMPT = """You are a section-level creativity evaluator for website screenshots.
You will be given ONE full-page desktop screenshot and a list of expected section IDs (in order).
Your job is NOT to judge overall aesthetics or whether the page is "premium".
Instead, score each section for how DISTINCTIVE / CREATIVE it looks versus generic/template.
SCORING (0.0 to 1.0):
- 1.0: Distinctive, memorable, has a signature layout moment or motif, cohesive with the page.
- 0.7: Solid and non-generic, some unique structure, visually intentional.
- 0.4: Generic (stacked cards / plain blocks) with minimal uniqueness.
- 0.0: Empty/blank, placeholder, broken-looking, or effectively missing.
IMPORTANT:
- Do NOT mark a section low just because it's minimal; minimal can still be intentional.
- If a section is not visible / unclear, set score=0.5 and confidence<=0.4 with notes "unclear".
- Output ONLY the provided section IDs (no extras) and keep the same order.
- Keep notes short (<= 8 words). Do not include quotes.
OUTPUT RULES:
- Output ONLY valid JSON (no markdown, no <think>, no explanation).
- JSON must start with { and end with }.
- Prefer COMPACT JSON (single-line or minimal whitespace).
OUTPUT FORMAT:
{
"sections": [
{"id": "hero", "score": 0.0, "confidence": 0.0, "notes": "short note"},
{"id": "testimonials", "score": 0.0, "confidence": 0.0, "notes": "short note"}
]
}
"""
SECTION_CREATIVITY_USER_PROMPT = """Score section creativity for this page.
Sections to evaluate (in order):
{sections}
Return scores for EVERY listed section id and ONLY those ids. If unclear/not found, use score=0.5 confidence<=0.4 and notes=unclear.
"""
# === Section creativity aggregation helpers ===
#
# We compute multiple aggregates:
# - avg_all: across all confidently-evaluated sections (legacy behavior)
# - core_avg: excludes utility sections like header/nav/footer/faq (avoids dragging down creativity signal)
# - key_avg: focuses on the "key" conversion sections (hero/features/proof/pricing/etc)
#
# This supports the "creativity is the north star" philosophy without requiring *every*
# utility section (e.g., footer) to be an avant-garde layout moment.
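#
# Worked example (illustrative numbers) for _compute_section_creativity_aggregates below:
#   sections = [
#       {"id": "hero",     "score": 0.9, "confidence": 0.8},
#       {"id": "features", "score": 0.5, "confidence": 0.7},
#       {"id": "footer",   "score": 0.2, "confidence": 0.9},
#   ]
#   -> avg_all ~= 0.53 (all three), core_avg = key_avg = 0.70 (footer excluded as utility),
#      high_count = 1 (only "hero" reaches the 0.7 high-score threshold).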
_CREATIVITY_UTILITY_SUBSTRINGS = (
"header",
"nav",
"navbar",
"footer",
"faq",
"legal",
)
_CREATIVITY_KEY_SUBSTRINGS = (
"hero",
"problem",
"tension",
"how_it_works",
"how-it-works",
"process",
"timeline",
"benefits",
"features",
"social",
"proof",
"testimonials",
"pricing",
"offer",
"final_cta",
"cta",
"comparison",
"listings",
"grid",
)
def _compute_section_creativity_aggregates(
sections: list[dict[str, Any]],
*,
confidence_threshold: float = 0.5,
high_score_threshold: float = 0.7,
) -> dict[str, float | int | None]:
def _is_utility(sid: str) -> bool:
sid_l = sid.lower().strip()
return any(x in sid_l for x in _CREATIVITY_UTILITY_SUBSTRINGS)
def _is_key(sid: str) -> bool:
sid_l = sid.lower().strip()
if _is_utility(sid_l):
return False
return any(x in sid_l for x in _CREATIVITY_KEY_SUBSTRINGS)
any_all: list[float] = []
any_core: list[float] = []
any_key: list[float] = []
conf_all: list[float] = []
conf_core: list[float] = []
conf_key: list[float] = []
high_count = 0
for s in sections:
if not isinstance(s, dict):
continue
sid = str(s.get("id") or "").strip()
if not sid:
continue
try:
score_val = float(s.get("score") or 0.0)
except Exception:
continue
try:
conf_val = float(s.get("confidence") or 0.0)
except Exception:
conf_val = 0.0
        # Clamp defensively to the expected 0.0-1.0 range.
        score_val = max(0.0, min(1.0, score_val))
        conf_val = max(0.0, min(1.0, conf_val))
is_utility = _is_utility(sid)
is_key = _is_key(sid)
any_all.append(score_val)
if not is_utility:
any_core.append(score_val)
if is_key:
any_key.append(score_val)
if conf_val >= confidence_threshold:
conf_all.append(score_val)
if not is_utility:
conf_core.append(score_val)
if score_val >= high_score_threshold:
high_count += 1
if is_key:
conf_key.append(score_val)
def _avg(values: list[float]) -> float | None:
if not values:
return None
return sum(values) / float(len(values))
avg_all = _avg(conf_all) if conf_all else _avg(any_all)
core_avg = _avg(conf_core) if conf_core else _avg(any_core)
key_avg = _avg(conf_key) if conf_key else _avg(any_key)
# If we can't identify key sections, fall back to the more robust core_avg, then avg_all.
if key_avg is None:
key_avg = core_avg if core_avg is not None else avg_all
return {
"avg_all": avg_all,
"core_avg": core_avg,
"key_avg": key_avg,
"high_count": int(high_count),
}
PREMIUM_GATE_SYSTEM_PROMPT = """You are a STRICT premium website quality validator.
Your job is to decide whether the page looks SHIP-READY and PREMIUM.
This is NOT about "does it compile" and NOT about "is it broken". The page may be functional
but still not premium. Premium means: strong visual polish, clear hierarchy, coherent design,
and looks like something a real product team would ship.
Rules:
- Be strict: set premium=true ONLY when the page clearly looks premium.
- If the page is merely "fine" or "basic", premium must be false.
- If you are unsure, premium=false with low confidence.
OUTPUT RULES:
- Output ONLY valid JSON (no markdown, no <think>, no explanation).
- JSON must start with { and end with }.
OUTPUT FORMAT:
{
"premium": false,
"confidence": 0.0,
"issues": [],
"fix_suggestions": []
}"""
PREMIUM_GATE_USER_PROMPT = """Decide whether this page is premium/ship-ready.
Be strict: premium=true only if you'd ship it as a "premium" site without redesign.
If it's basic, generic, or under-designed, premium=false.
Keep the output SHORT to avoid truncation:
- issues: max 4 items (short phrases)
- fix_suggestions: max 4 items (short phrases)
Do not include any additional keys. Do not include any text outside the JSON.
"""
# === CREATIVE DIRECTOR MODE ===
# Qualitative feedback instead of numeric scoring.
# Encourages creative risk-taking and emergence.
CREATIVE_DIRECTOR_SYSTEM_PROMPT = """You are a world-class creative director reviewing a website design.
Your job is NOT to give a numeric score. Instead, provide rich qualitative feedback
that helps improve the design while PRESERVING creative emergence and risk-taking.
IMPORTANT PHILOSOPHY:
- "Different" is NOT "broken". Unconventional designs may be intentional.
- Creative risk-taking should be ENCOURAGED, not punished.
- Focus on production readiness, not personal taste.
- A simple, minimal design is valid if executed well.
- An experimental, bold design is valid if it works.
WHAT MAKES SOMETHING "SHIPPABLE":
- No critical errors (blank page, runtime errors, missing content)
- Core functionality is present (navigation works, CTAs visible)
- Text is readable, layout is coherent
- Responsive across viewports
- NO EMOJI CHARACTERS (🚀❌✅🎯💡 etc.) - emojis look cheap/unprofessional
EMOJI CHECK: If you see ANY emoji characters in the UI (buttons, headings, cards, etc.),
add "Remove emoji characters - use SVG icons instead" to missing_for_production.
Emojis are a hard requirement violation.
WHAT DOES NOT MAKE SOMETHING "NOT SHIPPABLE":
- Unusual color choices (that's creative expression)
- Unconventional layouts (that's experimentation)
- Bold typography (that's design intent)
- Minimal/sparse design (that's a valid style)
OUTPUT RULES:
- Output ONLY valid JSON (no markdown, no <think>, no explanation)
- JSON must start with { and end with }
- Be GENEROUS with creative choices
- Be SPECIFIC with production issues
OUTPUT FORMAT:
{
"shippable": true,
"obviously_broken": false,
"preserve": ["list of creative choices to keep"],
"missing_for_production": ["only critical issues"],
"creative_elevations": ["suggestions from a master designer"],
"appropriate_for_type": true,
"type_feedback": "brief note on layout/length fit"
}"""
CREATIVE_DIRECTOR_USER_PROMPT = """Review these screenshots as a creative director.
SITE TYPE: {page_type}
INTENDED STYLE: {mood} mood, {accent} accent, {style_keywords}
DENSITY: {density}
SCREENSHOTS PROVIDED:
{viewport_labels}
Answer these questions through your JSON output:
1. SHIPPABLE: Could this go live as-is? (not "is it perfect" - just "is it ready")
2. OBVIOUSLY BROKEN: Is there a critical error making this unusable?
3. PRESERVE: What creative choices are working? What should we NOT change?
4. MISSING FOR PRODUCTION: Only list things that MUST be fixed (not preferences)
5. CREATIVE ELEVATIONS: How would a master designer take this to the next level?
6. APPROPRIATE FOR TYPE: Does the layout/length fit this type of site?
Be generous with creative expression. Be specific with production issues.
Remember: unconventional ≠ broken."""
async def get_creative_director_feedback(
candidate: Candidate,
config: Config,
) -> CreativeDirectorFeedback | None:
"""Get qualitative creative director feedback instead of numeric score.
This replaces the traditional judge scoring with rich qualitative feedback
that guides refinement while preserving creative emergence.
Args:
candidate: Rendered candidate with screenshots
config: Application configuration
Returns:
CreativeDirectorFeedback or None if feedback fails
"""
# Accept both RENDERED and SCORED candidates (scoring happens before CD feedback)
if candidate.status not in (CandidateStatus.RENDERED, CandidateStatus.SCORED):
log_error(f"Candidate {candidate.id}: Cannot get feedback, not rendered (status: {candidate.status})")
return None
if not config.vision_judge.model:
log_warning("No vision model configured for creative director feedback")
return None
provider = ProviderFactory.get(config.vision_judge.provider, config)
# Load screenshots
images = []
viewport_labels = []
for i, viewport in enumerate(["mobile", "tablet", "desktop"], 1):
path = candidate.screenshot_paths.get(viewport)
if not path:
continue
try:
with open(path, "rb") as f:
images.append(f.read())
label = VIEWPORT_LABELS.get(viewport, viewport)
viewport_labels.append(f"Image {i}: {label}")
except Exception as e:
log_error(f"Failed to load screenshot {path}: {e}")
if not images:
return None
# Build prompt
ui_spec = candidate.ui_spec
user_prompt = CREATIVE_DIRECTOR_USER_PROMPT.format(
page_type=ui_spec.page_type if ui_spec else "unknown",
mood=ui_spec.brand.mood if ui_spec else "unknown",
accent=ui_spec.brand.accent if ui_spec else "unknown",
density=ui_spec.brand.density if ui_spec else "balanced",
style_keywords=", ".join(ui_spec.brand.style_keywords) if ui_spec else "",
viewport_labels="\n".join(viewport_labels),
)
messages = [
Message(role="system", content=CREATIVE_DIRECTOR_SYSTEM_PROMPT),
Message(role="user", content=user_prompt),
]
try:
response = await _complete_with_vision_fallback(
provider=provider,
messages=messages,
model=config.vision_judge.model,
images=images,
max_tokens=int(config.vision_judge.max_tokens or 1500),
temperature=float(config.vision_judge.temperature or 0.0),
)
data = extract_json_strict(response.content)
if isinstance(data, list):
data = next((x for x in data if isinstance(x, dict)), None)
if data is None:
raise ValueError("Creative director returned list without object")
if not isinstance(data, dict):
raise ValueError(f"Expected JSON object, got {type(data).__name__}")
def _as_str_list(value) -> list[str]:
if value is None:
return []
if isinstance(value, list):
return [str(v) for v in value if str(v).strip()]
return [str(value)]
feedback = CreativeDirectorFeedback(
shippable=bool(data.get("shippable", False)),
obviously_broken=bool(data.get("obviously_broken", False)),
preserve=_as_str_list(data.get("preserve", [])),
missing_for_production=_as_str_list(data.get("missing_for_production", [])),
creative_elevations=_as_str_list(data.get("creative_elevations", [])),
appropriate_for_type=bool(data.get("appropriate_for_type", True)),
type_feedback=str(data.get("type_feedback", "")),
)
status = "SHIPPABLE" if feedback.shippable else "NEEDS WORK"
if feedback.obviously_broken:
status = "BROKEN"
log_info(f"Candidate {candidate.id}: Creative Director says {status}")
if feedback.preserve:
log_info(f" Preserve: {feedback.preserve[:2]}")
if feedback.missing_for_production:
log_info(f" Missing: {feedback.missing_for_production[:2]}")
return feedback
except Exception as e:
log_error(f"Candidate {candidate.id}: Creative director feedback failed - {e}")
return None
async def assess_premium_candidates(
candidates: list[Candidate],
config: Config,
) -> list[Candidate]:
"""Optionally label candidates as premium/ship-ready using the configured vision model.
This does NOT discard candidates. It's used to:
- audit quality distribution when skip_judge=true
- decide whether to trigger an automatic polish pass (if enabled)
    On judge/model failures, the candidate is simply left with premium_gate=None,
    so nothing is accidentally filtered or polished.
"""
if not getattr(config.pipeline, "premium_vision_gate_enabled", False):
return candidates
if not config.vision_judge.model:
log_warning("premium_vision_gate_enabled=true but no vision_judge.model configured; skipping")
return candidates
to_check = [c for c in candidates if c.status == CandidateStatus.RENDERED]
if not to_check:
return candidates
provider = ProviderFactory.get(config.vision_judge.provider, config)
sem = asyncio.Semaphore(max(1, int(getattr(config.budget, "concurrency_gemini", 2) or 2)))
async def _check_one(candidate: Candidate) -> None:
desktop_path = candidate.screenshot_paths.get("desktop") if candidate.screenshot_paths else None
if not desktop_path:
return
try:
with open(desktop_path, "rb") as f:
img_bytes = f.read()
except Exception as e:
log_warning(f"Candidate {candidate.id}: Failed to read screenshot for premium gate: {e}")
return
messages = [
Message(role="system", content=PREMIUM_GATE_SYSTEM_PROMPT),
Message(role="user", content=PREMIUM_GATE_USER_PROMPT),
]
async with sem:
try:
resp = await provider.complete_with_vision(
messages=messages,
model=config.vision_judge.model,
images=[img_bytes],
# Keep small to reduce truncation risk; we only need a boolean + short lists.
max_tokens=min(350, int(config.vision_judge.max_tokens or 350)),
temperature=0.0,
)
except Exception as e:
log_warning(f"Candidate {candidate.id}: Premium vision gate failed (skipping): {e}")
return
try:
data = extract_json_strict(resp.content)
except Exception as e:
log_warning(f"Candidate {candidate.id}: Premium gate returned non-JSON (skipping): {e}")
return
if not isinstance(data, dict):
return
premium_raw = data.get("premium", False)
premium = bool(premium_raw) if isinstance(premium_raw, (bool, int)) else str(premium_raw).lower() == "true"
conf_raw = data.get("confidence", 0.0)
try:
confidence = float(conf_raw or 0.0)
except Exception:
confidence = 0.0
confidence = max(0.0, min(1.0, confidence))
issues = data.get("issues", [])
if not isinstance(issues, list):
issues = [str(issues)]
issues_str = [str(i) for i in issues if str(i).strip()][:6]
fixes = data.get("fix_suggestions", [])
if not isinstance(fixes, list):
fixes = [str(fixes)]
fixes_str = [str(i) for i in fixes if str(i).strip()][:6]
candidate.premium_gate = PremiumGate(
premium=premium,
confidence=confidence,
issues=issues_str,
fix_suggestions=fixes_str,
)
log_info(f"Premium vision gate: labeling {len(to_check)} rendered candidates...")
results = await asyncio.gather(*[_check_one(c) for c in to_check], return_exceptions=True)
for cand, res in zip(to_check, results):
if isinstance(res, BaseException):
log_warning(f"Candidate {cand.id}: Premium vision gate error (keeping): {res}")
return candidates
async def filter_broken_candidates(
candidates: list[Candidate],
config: Config,
) -> list[Candidate]:
"""Optionally discard visually broken renders using the configured vision model.
This is designed for "accept-all" pipelines:
- It does NOT score or pick winners.
- It only discards candidates that are clearly broken even though they rendered.
If the vision model fails, we default to NOT discarding (avoid false positives).
Args:
candidates: Candidates (some may be rendered)
config: Application configuration
Returns:
Updated candidates list (broken renders marked DISCARDED)
"""
if not getattr(config.pipeline, "broken_vision_gate_enabled", False):
return candidates
if not config.vision_judge.model:
log_warning("broken_vision_gate_enabled=true but no vision_judge.model configured; skipping")
return candidates
to_check = [c for c in candidates if c.status == CandidateStatus.RENDERED]
if not to_check:
return candidates
provider = ProviderFactory.get(config.vision_judge.provider, config)
min_conf = float(getattr(config.pipeline, "broken_vision_gate_min_confidence", 0.85) or 0.85)
async def _check_one(candidate: Candidate) -> None:
desktop_path = candidate.screenshot_paths.get("desktop") if candidate.screenshot_paths else None
if not desktop_path:
return
try:
with open(desktop_path, "rb") as f:
img_bytes = f.read()
except Exception as e:
log_warning(f"Candidate {candidate.id}: Failed to read screenshot for broken gate: {e}")
return
# Deterministic blank-page heuristic:
        # Truly blank/empty desktop screenshots compress to very small files (single-color PNGs).
# This catches the core bug where blank pages pass axe/Lighthouse because there's
# effectively nothing to audit.
#
# 1440×900 blank pages we observed were ~7–10 KB. Real pages are typically 100 KB+.
if len(img_bytes) < 20_000:
candidate.status = CandidateStatus.DISCARDED
candidate.error = (
f"Broken render heuristic: desktop screenshot too small ({len(img_bytes)} bytes)"
)
log_warning(
f"Candidate {candidate.id}: Discarded as broken (tiny screenshot: {len(img_bytes)} bytes)"
)
return
messages = [
Message(role="system", content=BROKEN_GATE_SYSTEM_PROMPT),
Message(role="user", content=BROKEN_GATE_USER_PROMPT.format(min_confidence=min_conf)),
]
# Some Gemini "Flash preview" models allocate a hidden "thoughts" budget.
# Keep max_tokens high enough that we still receive the JSON output.
primary_model = config.vision_judge.model
resp = None
data = None
try:
resp = await _complete_with_vision_fallback(
provider=provider,
messages=messages,
model=primary_model,
images=[img_bytes],
max_tokens=min(1200, int(config.vision_judge.max_tokens or 1200)),
temperature=0.0,
)
data = extract_json_strict(resp.content)
except Exception:
# One retry: request strict JSON only.
try:
retry_messages = messages + [
Message(
role="user",
content=(
"Your last output was not valid JSON. Re-output ONLY the JSON object.\n"
"No markdown. No extra keys. Start with { and end with }."
),
)
]
resp2 = await _complete_with_vision_fallback(
provider=provider,
messages=retry_messages,
model=primary_model,
images=[img_bytes],
max_tokens=min(1200, int(config.vision_judge.max_tokens or 1200)),
temperature=0.0,
)
data = extract_json_strict(resp2.content)
resp = resp2
except Exception:
# Do not discard on judge failure
log_warning(
f"Candidate {candidate.id}: Broken vision gate failed (keeping)"
)
return
if not isinstance(data, dict):
return
broken_raw = data.get("broken", False)
broken = bool(broken_raw) if isinstance(broken_raw, (bool, int)) else str(broken_raw).lower() == "true"
conf_raw = data.get("confidence", 0.0)
try:
confidence = float(conf_raw or 0.0)
except Exception:
confidence = 0.0
reasons = data.get("reasons", [])
if not isinstance(reasons, list):
reasons = [str(reasons)]
reasons_str = [str(r) for r in reasons if str(r).strip()]
if broken and confidence >= min_conf:
candidate.status = CandidateStatus.DISCARDED
candidate.error = (
f"Vision broken gate: broken=true confidence={confidence:.2f} "
+ (f"reasons={reasons_str[:5]}" if reasons_str else "")
)
log_warning(f"Candidate {candidate.id}: Discarded as broken (confidence {confidence:.2f})")
else:
# Keep candidate; optionally store a marker in score_details for auditing
if candidate.score_details is None:
candidate.score_details = JudgeScore(
score=0.0,
passing=True,
issues=reasons_str[:5] if broken else [],
highlights=[],
fix_suggestions=[],
)
log_info(f"Broken vision gate: checking {len(to_check)} rendered candidates...")
results = await asyncio.gather(*[_check_one(c) for c in to_check], return_exceptions=True)
for cand, res in zip(to_check, results):
if isinstance(res, BaseException):
log_warning(f"Candidate {cand.id}: Broken vision gate error (keeping): {res}")
return candidates
async def assess_section_creativity(
candidate: Candidate,
config: Config,
) -> list[dict] | None:
"""Evaluate section-level creativity scores for a rendered candidate.
This is used in skip_judge mode to selectively refine weak sections without
running full scoring/winner selection.
Returns:
List of dicts: [{"id": str, "score": float, "confidence": float, "notes": str}, ...]
or None if unavailable.
"""
if candidate.status != CandidateStatus.RENDERED:
return None
if not config.vision_judge.model:
log_warning("creativity_refinement_enabled=true but no vision_judge.model configured; skipping")
return None
desktop_path = candidate.screenshot_paths.get("desktop") if candidate.screenshot_paths else None
if not desktop_path:
return None
try:
with open(desktop_path, "rb") as f:
img_bytes = f.read()
except Exception as e:
log_warning(f"Candidate {candidate.id}: Failed to read screenshot for creativity eval: {e}")
return None
# Build section list from UI_SPEC when available; fall back to a safe default list.
section_ids: list[str] = []
ui_spec = candidate.ui_spec
if ui_spec and getattr(ui_spec, "layout", None) and getattr(ui_spec.layout, "sections", None):
for s in ui_spec.layout.sections:
sid = str(getattr(s, "id", "") or "").strip()
if sid:
section_ids.append(sid)
if not section_ids:
section_ids = ["hero", "features", "testimonials", "faq", "footer"]
sections_text = "\n".join(f"- {sid}" for sid in section_ids[:12])
messages = [
Message(role="system", content=SECTION_CREATIVITY_SYSTEM_PROMPT),
Message(role="user", content=SECTION_CREATIVITY_USER_PROMPT.format(sections=sections_text)),
]
provider = ProviderFactory.get(config.vision_judge.provider, config)
primary_model = config.vision_judge.model
async def _call(model: str, *, retry_json_only: bool) -> "CompletionResponse":
call_messages = messages
if retry_json_only:
call_messages = messages + [
Message(
role="user",
content=(
"Re-output ONLY the JSON object. No markdown. No extra keys. "
"Start with { and end with }."
),
)
]
return await _complete_with_vision_fallback(
provider=provider,
messages=call_messages,
model=model,
images=[img_bytes],
# Keep section-level creativity scoring deterministic and fast.
# This is an evaluator, not a generator; higher temperature does not increase creativity.
max_tokens=min(4000, int(config.vision_judge.max_tokens or 2000)),
temperature=0.0,
)
resp = None
data = None
try:
resp = await _call(primary_model, retry_json_only=False)
data = extract_json_strict(resp.content)
except asyncio.CancelledError as e:
log_warning(f"Candidate {candidate.id}: Section creativity eval cancelled (skipping): {e}")
return None
except Exception as e:
# Retry once: ask for strict JSON.
try:
resp2 = await _call(primary_model, retry_json_only=True)
data = extract_json_strict(resp2.content)
resp = resp2
except asyncio.CancelledError as e2:
log_warning(f"Candidate {candidate.id}: Section creativity eval cancelled (skipping): {e2}")
return None
except Exception as e2:
if data is None:
log_warning(
f"Candidate {candidate.id}: Section creativity eval returned non-JSON: {e2}"
)
return None
sections = None
if isinstance(data, dict):
sections = data.get("sections")
elif isinstance(data, list):
# Some models output a bare list; accept it.
sections = data
if not isinstance(sections, list):
try:
keys = list(data.keys())[:10] if isinstance(data, dict) else []
except Exception:
keys = []
log_warning(
f"Candidate {candidate.id}: Section creativity JSON missing sections list "
f"(type={type(data).__name__}, keys={keys})"
)
return None
out: list[dict] = []
for item in sections[:20]:
if not isinstance(item, dict):
continue
sid = str(item.get("id") or "").strip()
if not sid:
continue
try:
score = float(item.get("score") or 0.0)
except Exception:
score = 0.0
try:
conf = float(item.get("confidence") or 0.0)
except Exception:
conf = 0.0
notes = str(item.get("notes") or "").strip()
out.append(
{
"id": sid,
"score": max(0.0, min(1.0, score)),
"confidence": max(0.0, min(1.0, conf)),
"notes": notes[:120],
}
)
# Ensure deterministic ordering by the requested ids when possible.
idx = {sid: i for i, sid in enumerate(section_ids)}
out.sort(key=lambda d: idx.get(str(d.get("id") or ""), 10_000))
return out
async def score_candidate(
candidate: Candidate,
config: Config,
) -> JudgeScore:
"""Score a candidate using vision model or heuristics.
Args:
candidate: Rendered candidate with screenshots
config: Application configuration
Returns:
JudgeScore with score and feedback
"""
if candidate.status != CandidateStatus.RENDERED:
log_error(f"Candidate {candidate.id}: Cannot score, not rendered")
return JudgeScore(
score=0,
passing=False,
issues=["Candidate not rendered"],
)
# Check if vision model is configured
if config.vision_judge.model:
return await _score_with_vision(candidate, config)
else:
log_warning("No vision model configured, using heuristic scorer")
return _score_with_heuristics(candidate, config)
async def _score_with_vision(candidate: Candidate, config: Config) -> JudgeScore:
"""Score using vision model.
Args:
candidate: Candidate to score
config: Application configuration
Returns:
JudgeScore from vision model
"""
provider = ProviderFactory.get(config.vision_judge.provider, config)
# Load screenshots with viewport labels
images = []
viewport_labels = []
# Process in consistent order: mobile, tablet, desktop
for i, viewport in enumerate(["mobile", "tablet", "desktop"], 1):
path = candidate.screenshot_paths.get(viewport)
if not path:
continue
try:
with open(path, "rb") as f:
images.append(f.read())
label = VIEWPORT_LABELS.get(viewport, viewport)
viewport_labels.append(f"Image {i}: {label}")
except Exception as e:
log_error(f"Failed to load screenshot {path}: {e}")
if not images:
return JudgeScore(
score=0,
passing=False,
issues=["No screenshots available"],
)
# Build prompt with viewport labels
ui_spec = candidate.ui_spec
user_prompt = JUDGE_USER_PROMPT.format(
page_type=ui_spec.page_type if ui_spec else "unknown",
mood=ui_spec.brand.mood if ui_spec else "unknown",
accent=ui_spec.brand.accent if ui_spec else "unknown",
density=ui_spec.brand.density if ui_spec else "balanced",
style_keywords=", ".join(ui_spec.brand.style_keywords) if ui_spec else "",
threshold=config.pipeline.vision_score_threshold,
viewport_labels="\n".join(viewport_labels),
)
messages = [
Message(role="system", content=JUDGE_SYSTEM_PROMPT),
Message(role="user", content=user_prompt),
]
def _as_str_list(value) -> list[str]:
if value is None:
return []
if isinstance(value, list):
return [str(v) for v in value if str(v).strip()]
return [str(value)]
try:
response = await _complete_with_vision_fallback(
provider=provider,
messages=messages,
model=config.vision_judge.model,
images=images,
max_tokens=int(config.vision_judge.max_tokens or 1500),
temperature=float(config.vision_judge.temperature or 0.0),
)
try:
score_data = extract_json_strict(response.content)
except Exception:
# Retry once with a strict JSON-only instruction to reduce parsing failures
# from models that include prefacing text.
response2 = await _complete_with_vision_fallback(
provider=provider,
messages=messages
+ [
Message(
role="user",
content=(
"Re-output ONLY the JSON object. No markdown. No extra keys. "
"Start with { and end with }."
),
)
],
model=config.vision_judge.model,
images=images,
max_tokens=int(config.vision_judge.max_tokens or 1500),
temperature=float(config.vision_judge.temperature or 0.0),
)
score_data = extract_json_strict(response2.content)
# Some models occasionally wrap the object in a JSON list.
# Normalize to a dict payload.
if isinstance(score_data, list):
first_obj = next((x for x in score_data if isinstance(x, dict)), None)
if first_obj is None:
raise ValueError("Judge returned a JSON list without an object payload")
score_data = first_obj
if not isinstance(score_data, dict):
raise ValueError(f"Expected JSON object, got {type(score_data).__name__}")
# Normalize field names
passing_raw = score_data.get("pass", score_data.get("passing", False))
passing = bool(passing_raw) if isinstance(passing_raw, (bool, int)) else str(passing_raw).lower() == "true"
score = JudgeScore(
score=float(score_data.get("score", 0) or 0),
passing=passing,
issues=_as_str_list(score_data.get("issues", [])),
highlights=_as_str_list(score_data.get("highlights", [])),
fix_suggestions=_as_str_list(score_data.get("fix_suggestions", [])),
)
log_info(
f"Candidate {candidate.id}: Vision score {score.score:.1f} "
f"({'PASS' if score.passing else 'FAIL'})"
)
return score
except Exception as e:
log_error(f"Candidate {candidate.id}: Vision scoring failed - {e}")
return JudgeScore(
score=0,
passing=False,
issues=[f"Vision scoring error: {e}"],
)
def _score_with_heuristics(candidate: Candidate, config: Config) -> JudgeScore:
"""Score using HTML/CSS heuristics (fallback).
This is a basic fallback when no vision model is available.
It checks code quality indicators rather than visual quality.
Args:
candidate: Candidate to score
config: Application configuration
Returns:
Heuristic-based JudgeScore
"""
issues = []
highlights = []
score = 5.0 # Start at middle
# Get main page content
main_file = None
for f in candidate.files:
if f.path == "app/page.tsx":
main_file = f
break
if not main_file:
return JudgeScore(
score=0,
passing=False,
issues=["No app/page.tsx found"],
)
content = main_file.content
# Check for TypeScript typing
if ": React.FC" in content or "interface " in content or "type " in content:
score += 0.5
highlights.append("Good TypeScript usage")
# Check for Tailwind classes
tailwind_patterns = [
r"className=[\"'][^\"']*flex",
r"className=[\"'][^\"']*grid",
r"className=[\"'][^\"']*gap-",
r"className=[\"'][^\"']*p[xy]?-",
r"className=[\"'][^\"']*m[xy]?-",
]
tailwind_count = sum(1 for p in tailwind_patterns if re.search(p, content))
if tailwind_count >= 3:
score += 1.0
highlights.append("Good Tailwind usage")
elif tailwind_count == 0:
score -= 1.0
issues.append("Limited Tailwind usage detected")
# Check for responsive classes
if "sm:" in content or "md:" in content or "lg:" in content:
score += 0.5
highlights.append("Responsive design implemented")
else:
score -= 0.5
issues.append("Missing responsive breakpoints")
# Check for semantic HTML
semantic_tags = ["<header", "<main", "<section", "<footer", "<nav", "<article"]
semantic_count = sum(1 for tag in semantic_tags if tag in content)
if semantic_count >= 3:
score += 0.5
highlights.append("Good semantic HTML")
elif semantic_count == 0:
issues.append("Missing semantic HTML elements")
# Check for accessibility
if "aria-" in content or "role=" in content:
score += 0.5
highlights.append("ARIA attributes present")
if 'alt="' in content or "alt={" in content:
score += 0.3
highlights.append("Image alt attributes present")
# Check for proper sections
section_keywords = ["hero", "feature", "testimonial", "pricing", "faq", "cta"]
section_count = sum(1 for kw in section_keywords if kw.lower() in content.lower())
if section_count >= 4:
score += 1.0
highlights.append(f"Good section coverage ({section_count}/6)")
elif section_count < 2:
score -= 1.0
issues.append("Missing expected page sections")
# Check file size (too small = incomplete, too large = bloated)
file_size = len(content)
if file_size < 1000:
score -= 1.5
issues.append("Code seems incomplete (too short)")
elif file_size > 15000:
score -= 0.5
issues.append("Code may be overly complex")
elif 3000 < file_size < 10000:
score += 0.5
highlights.append("Appropriate code size")
# Check for hardcoded lorem ipsum or placeholder
if "lorem ipsum" in content.lower():
score -= 0.5
issues.append("Contains lorem ipsum placeholder text")
# Check for dark mode implementation if specified
if candidate.ui_spec and candidate.ui_spec.brand.mood == "dark":
dark_indicators = ["bg-gray-900", "bg-slate-900", "bg-zinc-900", "bg-black", "dark:"]
if any(ind in content for ind in dark_indicators):
score += 0.5
highlights.append("Dark theme implemented")
else:
score -= 0.5
issues.append("Dark theme not properly implemented")
# Clamp score
score = max(0, min(10, score))
# Determine pass/fail
threshold = config.pipeline.vision_score_threshold
passing = score >= threshold
log_info(
f"Candidate {candidate.id}: Heuristic score {score:.1f} "
f"({'PASS' if passing else 'FAIL'})"
)
return JudgeScore(
score=score,
passing=passing,
issues=issues,
highlights=highlights,
fix_suggestions=[
f"Fix: {issue}" for issue in issues[:3]
],
)
async def score_all_candidates(
candidates: list[Candidate],
config: Config,
) -> list[Candidate]:
"""Score all rendered candidates.
Args:
candidates: List of candidates
config: Application configuration
Returns:
Candidates with scores
"""
to_score = [c for c in candidates if c.status == CandidateStatus.RENDERED]
if not to_score:
log_info("No candidates to score")
return candidates
log_info(f"Scoring {len(to_score)} candidates...")
creativity_weight = float(getattr(config.pipeline, "selection_creativity_weight", 0.4) or 0.0)
need_creativity = bool(getattr(config.pipeline, "refinement_skip_for_high_creativity", True)) or (
creativity_weight > 0.0
)
for candidate in to_score:
# Best-effort: compute section-level creativity metrics for downstream selection/refinement.
# Keep this failure-tolerant so judge outages don't block the pipeline.
if need_creativity:
try:
sections = await assess_section_creativity(candidate, config)
except Exception as e:
sections = None
log_warning(f"Candidate {candidate.id}: Section creativity eval failed (continuing): {e}")
if sections:
candidate.section_creativity = [s for s in sections if isinstance(s, dict)]
agg = _compute_section_creativity_aggregates(candidate.section_creativity)
candidate.section_creativity_avg = (
float(agg["avg_all"]) if agg.get("avg_all") is not None else None
)
candidate.section_creativity_core_avg = (
float(agg["core_avg"]) if agg.get("core_avg") is not None else None
)
candidate.section_creativity_key_avg = (
float(agg["key_avg"]) if agg.get("key_avg") is not None else None
)
try:
candidate.section_creativity_high_count = int(agg.get("high_count") or 0)
except Exception:
candidate.section_creativity_high_count = None
try:
score = await score_candidate(candidate, config)
candidate.score = score.score
candidate.score_details = score
candidate.status = CandidateStatus.SCORED
except Exception as e:
log_error(f"Candidate {candidate.id}: Scoring failed - {e}")
# If vision scoring fails, fall back to a pass-through score so
# build-passed candidates are not discarded.
candidate.error = str(e)
candidate.score = 0
candidate.score_details = JudgeScore(
score=0,
passing=True,
issues=[f"Vision scoring error: {e}"],
highlights=[],
fix_suggestions=[],
)
candidate.status = CandidateStatus.SCORED
return candidates