
judge_workplan

Compares code changes between two git refs against a workplan to evaluate implementation adherence, quality, and completeness, and to suggest improvements.

Instructions

Triggers an asynchronous code judgement comparing two git refs against a workplan.

This tool will:

  1. Create a sub-issue linked to the workplan immediately

  2. Launch a background AI process to analyze the code changes

  3. Update the sub-issue with the judgement once complete

The judgement will evaluate:

  • Whether the implementation follows the workplan

  • Code quality and completeness

  • Missing or incomplete items

  • Suggestions for improvement

Supports comparing:

  • Branches (e.g., feature-branch vs main)

  • Commits (e.g., abc123 vs def456)

  • PR changes (automatically uses PR's base and head)

Returns the sub-issue URL immediately.
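
The sketch below shows one way to invoke this tool from a client, assuming the official `mcp` Python SDK over stdio. The launch command (`uvx yellhorn-mcp`), the environment variables the server needs, and the workplan issue number are assumptions; substitute whatever applies to your setup.

```python
# Minimal sketch of calling judge_workplan via the MCP Python SDK (stdio transport).
# The server launch command and issue number below are assumptions, not part of the docs.
import asyncio
import json

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    server = StdioServerParameters(command="uvx", args=["yellhorn-mcp"])  # assumed launch command
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Compare the current HEAD against main for workplan issue #123 (hypothetical number)
            result = await session.call_tool(
                "judge_workplan",
                arguments={"issue_number": "123", "base_ref": "main", "head_ref": "HEAD"},
            )
            # The tool returns a JSON string such as {"subissue_url": ..., "subissue_number": ...}
            payload = json.loads(result.content[0].text)
            print(payload["subissue_url"])


asyncio.run(main())
```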

Input Schema

| Name | Required | Description | Default |
| --- | --- | --- | --- |
| issue_number | Yes | The workplan issue number to judge against. | |
| base_ref | No | The base git reference to compare from. | main |
| head_ref | No | The head git reference to compare to. | HEAD |
| codebase_reasoning | No | Codebase analysis mode: "full", "lsp", "file_structure", or "none". | full |
| debug | No | If true, adds a comment with the full prompt used for generation. | false |
| disable_search_grounding | No | If true, disables Google Search Grounding for this request. | false |
| subissue_to_update | No | Existing judgement sub-issue number to update instead of creating a new one. | |
| pr_url | No | Pull request URL whose diff is judged instead of the base/head refs. | |
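
For illustration, here are example argument payloads and the return shapes produced by the handler below (issue numbers and refs are hypothetical):

```python
# Example argument payloads for judge_workplan (issue numbers and refs are hypothetical).
# Only issue_number is required; the remaining fields fall back to the defaults above.
minimal_args = {"issue_number": "42"}

full_args = {
    "issue_number": "42",
    "base_ref": "main",            # swapped for the repo's default branch if that is not "main"
    "head_ref": "feature-branch",
    "codebase_reasoning": "lsp",   # "full", "lsp", "file_structure", or "none"
    "debug": True,                 # attach the full generation prompt as a sub-issue comment
}

# On success the tool returns a JSON string shaped like:
#   {"subissue_url": "https://github.com/<owner>/<repo>/issues/124", "subissue_number": "124"}
# If there is no diff between the refs it instead returns:
#   {"error": "No changes found between main and feature-branch",
#    "base_commit": "<sha>", "head_commit": "<sha>"}
```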

Implementation Reference

  • Main MCP tool handler for 'judge_workplan'. Orchestrates fetching the workplan issue body, generating the git diff between the refs, creating a placeholder judgement sub-issue, and launching the asynchronous LLM judgement process via process_judgement_async.
```python
async def judge_workplan(
    ctx: Context,
    issue_number: str,
    base_ref: str = "main",
    head_ref: str = "HEAD",
    codebase_reasoning: str = "full",
    debug: bool = False,
    disable_search_grounding: bool = False,
    subissue_to_update: str | None = None,
    pr_url: str | None = None,
) -> str:
    """Triggers an asynchronous code judgement for changes against a workplan.

    Args:
        ctx: Server context.
        issue_number: The workplan issue number to judge against.
        base_ref: The base git reference (default: "main").
        head_ref: The head git reference (default: "HEAD").
        codebase_reasoning: Reasoning mode for codebase analysis:
            - "full": Include complete file contents and full diff
            - "lsp": Include function signatures and diff of changed functions
            - "file_structure": Include only file structure and list of changed files
            - "none": No codebase context, only diff summary
        debug: If True, adds a comment with the full prompt used for generation.
        disable_search_grounding: If True, disables Google Search Grounding.

    Returns:
        JSON string containing the sub-issue URL and number.

    Raises:
        YellhornMCPError: If judgement creation fails.
    """
    original_search_grounding = True
    try:
        repo_path: Path = ctx.request_context.lifespan_context["repo_path"]
        model = ctx.request_context.lifespan_context["model"]
        llm_manager = ctx.request_context.lifespan_context.get("llm_manager")
        reasoning_effort = ctx.request_context.lifespan_context.get("reasoning_effort")

        # Handle search grounding override if specified
        original_search_grounding = ctx.request_context.lifespan_context.get(
            "use_search_grounding", True
        )
        if disable_search_grounding:
            ctx.request_context.lifespan_context["use_search_grounding"] = False
            await ctx.log(
                level="info",
                message="Search grounding temporarily disabled for this request",
            )

        # Use default branch if base_ref is "main" but the repo uses "master"
        if base_ref == "main":
            default_branch = await get_default_branch(repo_path)
            if default_branch != "main":
                await ctx.log(
                    level="info",
                    message=f"Using default branch '{default_branch}' instead of 'main'",
                )
                base_ref = default_branch

        # Check if issue_number is a PR URL
        if issue_number.startswith("http") and "/pull/" in issue_number:
            # This is a PR URL, we need to extract the diff and find the related workplan
            pr_diff = await get_github_pr_diff(repo_path, issue_number)

            # Extract PR number for finding related workplan
            import re

            pr_match = re.search(r"/pull/(\d+)", issue_number)
            if not pr_match:
                raise YellhornMCPError(f"Invalid PR URL: {issue_number}")
            pr_number = pr_match.group(1)

            # Try to find workplan issue number in PR description or title
            # For now, we'll ask the user to provide the workplan issue number
            raise YellhornMCPError(
                f"PR URL detected. Please provide the workplan issue number instead of PR URL. "
                f"You can find the workplan issue number in the PR description."
            )

        # Fetch the workplan
        workplan = await get_issue_body(repo_path, issue_number)

        # Handle PR URL or git refs for diff generation
        if pr_url:
            # Use PR diff instead of git refs
            diff = await get_github_pr_diff(repo_path, pr_url)
            # For PR, use placeholder commit hashes
            base_commit_hash = "pr_base"
            head_commit_hash = "pr_head"
        else:
            # Resolve git references to commit hashes
            base_commit_hash = await run_git_command(
                repo_path,
                ["rev-parse", base_ref],
                ctx.request_context.lifespan_context.get("git_command_func"),
            )
            head_commit_hash = await run_git_command(
                repo_path,
                ["rev-parse", head_ref],
                ctx.request_context.lifespan_context.get("git_command_func"),
            )

            # Generate diff for review
            diff = await get_git_diff(
                repo_path,
                base_ref,
                head_ref,
                codebase_reasoning,
                ctx.request_context.lifespan_context.get("git_command_func"),
            )

        # Check if diff is empty or only contains the header for file_structure mode
        is_empty = not diff.strip() or (
            codebase_reasoning in ["file_structure", "none"]
            and diff.strip() == f"Changed files between {base_ref} and {head_ref}:"
        )
        if is_empty:
            # No changes to judge
            return json.dumps(
                {
                    "error": f"No changes found between {base_ref} and {head_ref}",
                    "base_commit": base_commit_hash,
                    "head_commit": head_commit_hash,
                }
            )

        # Extract URLs from the workplan
        submitted_urls = extract_urls(workplan)

        # Create a placeholder sub-issue immediately
        submission_metadata = SubmissionMetadata(
            status="Generating judgement...",
            model_name=model,
            search_grounding_enabled=ctx.request_context.lifespan_context.get(
                "use_search_grounding", False
            ),
            yellhorn_version=__version__,
            submitted_urls=submitted_urls if submitted_urls else None,
            codebase_reasoning_mode=codebase_reasoning,
            timestamp=datetime.now(timezone.utc),
        )
        submission_comment = format_submission_comment(submission_metadata)
        placeholder_body = f"Parent workplan: #{issue_number}\n\n## Status\nGenerating judgement...\n\n{submission_comment}"
        judgement_title = f"Judgement for #{issue_number}: {head_ref} vs {base_ref}"

        # Create or update the sub-issue
        if subissue_to_update:
            # Update existing subissue
            subissue_number = subissue_to_update
            subissue_url = f"https://github.com/{repo_path.name}/issues/{subissue_number}"
            await update_github_issue(repo_path, subissue_number, placeholder_body)
        else:
            # Create new sub-issue
            from yellhorn_mcp.integrations.github_integration import create_judgement_subissue

            subissue_url = await create_judgement_subissue(
                repo_path, issue_number, judgement_title, placeholder_body
            )

            # Extract sub-issue number from URL
            import re

            issue_match = re.search(r"/issues/(\d+)", subissue_url)
            subissue_number = issue_match.group(1) if issue_match else None

        await ctx.log(
            level="info",
            message=f"Created judgement sub-issue: {subissue_url}",
        )

        # Launch background task to generate judgement
        await ctx.log(
            level="info",
            message=f"Launching background task to generate judgement with AI model {model}",
        )

        # Prepare metadata for async processing
        start_time = datetime.now(timezone.utc)

        asyncio.create_task(
            process_judgement_async(
                repo_path,
                llm_manager,
                model,
                workplan,
                diff,
                base_ref,
                head_ref,
                base_commit_hash,
                head_commit_hash,
                issue_number,
                subissue_to_update=subissue_number,
                debug=debug,
                codebase_reasoning=codebase_reasoning,
                disable_search_grounding=disable_search_grounding,
                reasoning_effort=reasoning_effort,
                _meta={
                    "original_search_grounding": original_search_grounding,
                    "start_time": start_time,
                    "submitted_urls": submitted_urls,
                },
                ctx=ctx,
                github_command_func=ctx.request_context.lifespan_context.get("github_command_func"),
                git_command_func=ctx.request_context.lifespan_context.get("git_command_func"),
            )
        )

        # Restore original search grounding setting if modified
        if disable_search_grounding:
            ctx.request_context.lifespan_context["use_search_grounding"] = original_search_grounding

        # Return the sub-issue URL and number as JSON
        return json.dumps({"subissue_url": subissue_url, "subissue_number": subissue_number})

    except Exception as e:
        # Restore original search grounding setting on error
        if disable_search_grounding:
            try:
                ctx.request_context.lifespan_context["use_search_grounding"] = (
                    original_search_grounding
                )
            except NameError:
                pass  # original_search_grounding was not defined yet
        raise YellhornMCPError(f"Failed to create judgement: {str(e)}")
```
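
The core idea of the handler is the placeholder-then-background-task pattern: publish a visible placeholder (the sub-issue), hand the slow LLM work to a fire-and-forget asyncio task, and return a reference immediately. The following is a stripped-down illustration of that pattern, not yellhorn-mcp's actual code.

```python
# Stripped-down illustration of the pattern judge_workplan uses: create a placeholder,
# schedule the slow work with asyncio.create_task, and return without awaiting it.
import asyncio
import json


async def slow_judgement(placeholder_id: int) -> None:
    await asyncio.sleep(2)  # stands in for the LLM call and the GitHub sub-issue update
    print(f"judgement written to placeholder #{placeholder_id}")


async def handle_request() -> str:
    placeholder_id = 124  # stands in for the sub-issue created up front
    asyncio.create_task(slow_judgement(placeholder_id))  # launched, not awaited here
    return json.dumps({"subissue_number": placeholder_id})  # caller gets the reference at once


async def main() -> None:
    print(await handle_request())  # returns immediately
    await asyncio.sleep(3)         # keep the loop alive so the background task can finish


asyncio.run(main())
```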
  • FastMCP tool registration decorator that binds the judge_workplan function to the tool name 'judge_workplan', with the full description and the input parameters inferred from the function signature.
```python
@mcp.tool(
    name="judge_workplan",
    description="""Triggers an asynchronous code judgement comparing two git refs against a workplan.

This tool will:
1. Create a sub-issue linked to the workplan immediately
2. Launch a background AI process to analyze the code changes
3. Update the sub-issue with the judgement once complete

The judgement will evaluate:
- Whether the implementation follows the workplan
- Code quality and completeness
- Missing or incomplete items
- Suggestions for improvement

Supports comparing:
- Branches (e.g., feature-branch vs main)
- Commits (e.g., abc123 vs def456)
- PR changes (automatically uses PR's base and head)

Returns the sub-issue URL immediately.""",
)
```
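
As a generic FastMCP illustration (not yellhorn-mcp's actual server module), the sketch below shows how a decorator like the one above yields the Input Schema: parameter names, type hints, and defaults on the decorated function become the tool's JSON schema automatically. The server name and tool shown here are hypothetical.

```python
# Generic FastMCP sketch: the decorated function's signature drives the tool's input schema.
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("demo-server")


@mcp.tool(name="echo_refs", description="Echo the refs that would be compared.")
def echo_refs(issue_number: str, base_ref: str = "main", head_ref: str = "HEAD") -> str:
    # issue_number becomes a required string; base_ref/head_ref become optional with defaults,
    # mirroring how judge_workplan's schema is derived from its signature.
    return f"#{issue_number}: {base_ref}..{head_ref}"


if __name__ == "__main__":
    mcp.run()
```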
  • Core asynchronous helper that performs the LLM call to judge the code diff against the workplan, formats the response with metadata, handles citations/search grounding, calculates costs, and updates the GitHub judgement sub-issue.
````python
async def process_judgement_async(
    repo_path: Path,
    llm_manager: LLMManager,
    model: str,
    workplan_content: str,
    diff_content: str,
    base_ref: str,
    head_ref: str,
    base_commit_hash: str,
    head_commit_hash: str,
    parent_workplan_issue_number: str,
    subissue_to_update: str | None = None,
    debug: bool = False,
    codebase_reasoning: str = "full",
    disable_search_grounding: bool = False,
    _meta: dict[str, object] | None = None,
    ctx: Context | None = None,
    github_command_func: Callable | None = None,
    git_command_func: Callable | None = None,
    reasoning_effort: ReasoningEffort | None = None,
) -> None:
    """Judge a code diff against a workplan asynchronously.

    Args:
        repo_path: Path to the repository.
        llm_manager: LLM Manager instance for API calls.
        model: Model name to use (Gemini or OpenAI).
        workplan_content: The original workplan content.
        diff_content: The code diff to judge.
        base_ref: Base reference name.
        head_ref: Head reference name.
        base_commit_hash: Base commit hash.
        head_commit_hash: Head commit hash.
        parent_workplan_issue_number: Parent workplan issue number.
        subissue_to_update: Optional existing sub-issue to update.
        debug: If True, add a comment with the full prompt.
        codebase_reasoning: Mode for codebase context.
        disable_search_grounding: If True, disables search grounding.
        _meta: Optional metadata from the caller.
        ctx: Optional context for logging.
        github_command_func: Optional GitHub command function (for mocking).
        git_command_func: Optional Git command function (for mocking).
        reasoning_effort: Optional reasoning effort to apply for supported models.
    """
    try:
        # Construct prompt
        prompt = f"""You are an expert software reviewer tasked with judging whether a code diff successfully implements a given workplan.

# Original Workplan
{workplan_content}

# Code Diff
{diff_content}

# Task
Review the code diff against the original workplan and provide a detailed judgement. Consider:

1. **Completeness**: Does the diff implement all the steps and requirements outlined in the workplan?
2. **Correctness**: Is the implementation technically correct and does it follow best practices?
3. **Missing Elements**: What parts of the workplan, if any, were not addressed?
4. **Additional Changes**: Were there any changes made that weren't part of the original workplan?
5. **Quality**: Comment on code quality, testing, documentation, and any potential issues.

The diff represents changes between '{base_ref}' and '{head_ref}'.

Structure your response with these clear sections:

## Judgement Summary
Provide a clear verdict: APPROVED, NEEDS_WORK, or INCOMPLETE, followed by a brief explanation.

## Implementation Analysis
Detail what was successfully implemented from the workplan.

## Missing or Incomplete Items
List specific items from the workplan that were not addressed or were only partially implemented.

## Code Quality Assessment
Evaluate the quality of the implementation including:
- Code style and consistency
- Error handling
- Test coverage
- Documentation

## Recommendations
Provide specific, actionable recommendations for improvement.

## References
Extract any URLs mentioned in the workplan or that would be helpful for understanding the implementation and list them here. This ensures important links are preserved.

IMPORTANT: Respond *only* with the Markdown content for the judgement. Do *not* wrap your entire response in a single Markdown code block (```). Start directly with the '## Judgement Summary' heading.
"""

        # Check if we should use search grounding
        use_search_grounding = not disable_search_grounding
        if _meta and "original_search_grounding" in _meta:
            use_search_grounding = (
                _meta["original_search_grounding"] and not disable_search_grounding
            )

        # Prepare optional generation config for the LLM call
        generation_config = None
        is_openai_model = llm_manager._is_openai_model(model)

        # Handle search grounding for Gemini models
        if not is_openai_model and use_search_grounding:
            if ctx:
                await ctx.log(
                    level="info",
                    message=f"Attempting to enable search grounding for model {model}",
                )
            try:
                from google.genai.types import GenerateContentConfig

                from yellhorn_mcp.utils.search_grounding_utils import _get_gemini_search_tools

                search_tools = _get_gemini_search_tools(model)
                if search_tools:
                    generation_config = GenerateContentConfig(tools=search_tools)
                    if ctx:
                        await ctx.log(
                            level="info",
                            message=f"Search grounding enabled for model {model}",
                        )
            except ImportError:
                if ctx:
                    await ctx.log(
                        level="warning",
                        message="GenerateContentConfig not available, skipping search grounding",
                    )

        # Call LLM through the manager with citation support
        effective_reasoning: ReasoningEffort | None = None
        if is_openai_model:
            # OpenAI models don't support citations
            if reasoning_effort is not None:
                usage_result: UsageResult = await llm_manager.call_llm_with_usage(
                    prompt=prompt,
                    model=model,
                    temperature=0.0,
                    ctx=ctx,
                    generation_config=generation_config,
                    reasoning_effort=reasoning_effort,
                )
            else:
                usage_result = await llm_manager.call_llm_with_usage(
                    prompt=prompt,
                    model=model,
                    temperature=0.0,
                    ctx=ctx,
                    generation_config=generation_config,
                )
            usage_metadata: UsageMetadata = usage_result["usage_metadata"]
            content_value = usage_result["content"]
            judgement_content = (
                content_value if isinstance(content_value, str) else str(content_value)
            )
            effective_reasoning = usage_result.get("reasoning_effort")
            completion_metadata = CompletionMetadata(
                model_name=model,
                status="✅ Judgement generated successfully",
                generation_time_seconds=0.0,  # Will be calculated below
                input_tokens=usage_metadata.prompt_tokens,
                output_tokens=usage_metadata.completion_tokens,
                total_tokens=usage_metadata.total_tokens,
                timestamp=datetime.now(timezone.utc),
            )
        else:
            # Gemini models - use citation-aware call
            if reasoning_effort is not None:
                citation_result: CitationResult = await llm_manager.call_llm_with_citations(
                    prompt=prompt,
                    model=model,
                    temperature=0.0,
                    ctx=ctx,
                    generation_config=generation_config,
                    reasoning_effort=reasoning_effort,
                )
            else:
                citation_result = await llm_manager.call_llm_with_citations(
                    prompt=prompt,
                    model=model,
                    temperature=0.0,
                    ctx=ctx,
                    generation_config=generation_config,
                )
            content_val = citation_result.get("content", "")
            judgement_content = content_val if isinstance(content_val, str) else str(content_val)
            usage_metadata = citation_result.get("usage_metadata", UsageMetadata())

            # Process citations if available
            grounding_metadata = citation_result.get("grounding_metadata")
            if grounding_metadata is not None:
                from yellhorn_mcp.utils.search_grounding_utils import add_citations_from_metadata

                judgement_content = add_citations_from_metadata(
                    judgement_content, cast(GroundingMetadata, grounding_metadata)
                )

            # Create completion metadata
            if isinstance(grounding_metadata, GroundingMetadata):
                sr_used = (
                    len(grounding_metadata.grounding_chunks)
                    if grounding_metadata.grounding_chunks is not None
                    else None
                )
            else:
                sr_used = None
            effective_reasoning = None
            completion_metadata = CompletionMetadata(
                model_name=model,
                status="✅ Judgement generated successfully",
                generation_time_seconds=0.0,  # Will be calculated below
                input_tokens=usage_metadata.prompt_tokens,
                output_tokens=usage_metadata.completion_tokens,
                total_tokens=usage_metadata.total_tokens,
                search_results_used=sr_used,
                timestamp=datetime.now(timezone.utc),
            )

        if not judgement_content:
            api_name = "OpenAI" if is_openai_model else "Gemini"
            raise YellhornMCPError(
                f"Failed to generate judgement: Received an empty response from {api_name} API."
            )

        # Calculate generation time if we have metadata
        if (
            completion_metadata
            and _meta
            and "start_time" in _meta
            and isinstance(_meta["start_time"], datetime)
        ):
            generation_time = (datetime.now(timezone.utc) - _meta["start_time"]).total_seconds()
            completion_metadata.generation_time_seconds = generation_time
            completion_metadata.timestamp = datetime.now(timezone.utc)

        # Calculate cost if we have token counts
        if (
            completion_metadata
            and completion_metadata.input_tokens
            and completion_metadata.output_tokens
        ):
            completion_metadata.estimated_cost = calculate_cost(
                model,
                int(completion_metadata.input_tokens or 0),
                int(completion_metadata.output_tokens or 0),
                effective_reasoning.value if effective_reasoning else None,
            )

        # Add context size
        if completion_metadata:
            completion_metadata.context_size_chars = len(prompt)

        # Construct metadata section for the final body
        metadata_section = f"""## Comparison Metadata
- **Workplan Issue**: `#{parent_workplan_issue_number}`
- **Base Ref**: `{base_ref}` (Commit: `{base_commit_hash}`)
- **Head Ref**: `{head_ref}` (Commit: `{head_commit_hash}`)
- **Codebase Reasoning Mode**: `{codebase_reasoning}`
- **AI Model**: `{model}`
"""

        # Add parent issue link at the top
        parent_link = f"Parent workplan: #{parent_workplan_issue_number}\n\n"

        # Construct the full body (no metrics in body)
        full_body = f"{parent_link}{metadata_section}{judgement_content}"

        # Construct title
        judgement_title = f"Judgement for #{parent_workplan_issue_number}: {head_ref} vs {base_ref}"

        # Create or update the sub-issue
        if subissue_to_update:
            # Update existing issue
            await update_github_issue(
                repo_path=repo_path,
                issue_number=subissue_to_update,
                title=judgement_title,
                body=full_body,
                github_command_func=github_command_func,
            )
            # Construct the URL for the updated issue
            repo_info = await run_git_command(
                repo_path, ["remote", "get-url", "origin"], git_command_func
            )
            # Clean up the repo URL to get the proper format
            if repo_info.endswith(".git"):
                repo_info = repo_info[:-4]
            if repo_info.startswith("git@github.com:"):
                repo_info = repo_info.replace("git@github.com:", "https://github.com/")
            subissue_url = f"{repo_info}/issues/{subissue_to_update}"
        else:
            subissue_url = await create_judgement_subissue(
                repo_path,
                parent_workplan_issue_number,
                judgement_title,
                full_body,
                github_command_func=github_command_func,
            )

        if ctx:
            await ctx.log(
                level="info",
                message=f"Successfully created judgement sub-issue: {subissue_url}",
            )

        # Add debug comment if requested
        if debug:
            # Extract issue number from URL
            issue_match = re.search(r"/issues/(\d+)", subissue_url)
            if issue_match:
                sub_issue_number = issue_match.group(1)
                debug_comment = f"<details>\n<summary>Debug: Full prompt used for generation</summary>\n\n```\n{prompt}\n```\n</details>"
                await add_issue_comment(
                    repo_path,
                    sub_issue_number,
                    debug_comment,
                    github_command_func=github_command_func,
                )

        # Add completion comment to the PARENT issue, not the sub-issue
        if completion_metadata and _meta:
            _urls_obj = _meta.get("submitted_urls")
            urls = (
                [u for u in _urls_obj if isinstance(u, str)]
                if isinstance(_urls_obj, list)
                else None
            )
            _ts_obj = _meta.get("start_time")
            ts = _ts_obj if isinstance(_ts_obj, datetime) else datetime.now(timezone.utc)
            submission_metadata = SubmissionMetadata(
                status="Generating judgement...",
                model_name=model,
                search_grounding_enabled=not disable_search_grounding,
                yellhorn_version=__version__,
                submitted_urls=urls,
                codebase_reasoning_mode=codebase_reasoning,
                timestamp=ts,
            )

            # Post completion comment to the sub-issue
            completion_comment = format_completion_comment(completion_metadata)

            # Extract sub-issue number from URL or use the provided one
            if subissue_to_update:
                sub_issue_number = subissue_to_update
            else:
                # Extract issue number from URL
                issue_match = re.search(r"/issues/(\d+)", subissue_url)
                if issue_match:
                    sub_issue_number = issue_match.group(1)
                else:
                    # Fallback to parent if we can't extract sub-issue number
                    sub_issue_number = parent_workplan_issue_number

            await add_issue_comment(
                repo_path,
                sub_issue_number,
                completion_comment,
                github_command_func=github_command_func,
            )

    except Exception as e:
        error_msg = f"Error processing judgement: {str(e)}"
        if ctx:
            await ctx.log(level="error", message=error_msg)

        # Try to add error comment to parent issue
        try:
            error_comment = f"❌ **Error generating judgement**\n\n{str(e)}"
            await add_issue_comment(
                repo_path,
                parent_workplan_issue_number,
                error_comment,
                github_command_func=github_command_func,
            )
        except Exception:
            # If we can't even add a comment, just log
            if ctx:
                await ctx.log(
                    level="error",
                    message=f"Failed to add error comment to issue: {str(e)}",
                )

        # Re-raise as YellhornMCPError to signal failure outward
        raise YellhornMCPError(error_msg)
````
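
Because the prompt pins the output to a "## Judgement Summary" section with an APPROVED, NEEDS_WORK, or INCOMPLETE verdict, a caller could extract the verdict from the finished sub-issue body with a small amount of string handling. The sketch below is hypothetical downstream post-processing, not part of yellhorn-mcp.

```python
# Hypothetical post-processing sketch: pull the verdict out of a judgement body
# that follows the "## Judgement Summary" structure defined in the prompt above.
import re

VERDICTS = ("APPROVED", "NEEDS_WORK", "INCOMPLETE")


def extract_verdict(judgement_markdown: str) -> str | None:
    """Return the first verdict keyword found in the Judgement Summary section, if any."""
    match = re.search(
        r"## Judgement Summary\s+(.*?)(?=\n## |\Z)", judgement_markdown, flags=re.DOTALL
    )
    if not match:
        return None
    summary = match.group(1)
    for verdict in VERDICTS:
        if verdict in summary:
            return verdict
    return None


sample = "## Judgement Summary\nNEEDS_WORK - tests are missing.\n\n## Implementation Analysis\n..."
print(extract_verdict(sample))  # -> "NEEDS_WORK"
```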

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/msnidal/yellhorn-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.