track: reasoning-agents
track_display_name: "🧠 Reasoning Agents"
notes: "Defines track-specific criteria adapted from the common overall criteria."
scoring_policy:
evidence_required: true
min_evidence_per_criterion: 1
differentiation_rules:
- "Score 8+ requires citing specific reasoning patterns (CoT, ReAct, etc.) with code evidence"
- "Score 9+ requires demonstrable self-correction or multi-agent coordination"
- "Score 5-6 is the DEFAULT when evidence is ambiguous — do NOT default to 7+"
- "Foundry utilization depth matters: basic API call = 4-6, advanced features = 7+"
red_flags:
- signal: "No reasoning chain visible — just a single prompt-response exchange"
max_reasoning_score: 4
- signal: "No Foundry usage evidence"
max_accuracy_score: 4
- signal: "Repository is private or inaccessible"
max_accuracy_score: 5
- signal: "Hardcoded responses or mock data only"
max_accuracy_score: 3
bonus_signals:
- signal: "Self-reflection or evaluation loop implemented"
bonus_criteria: "Reasoning & Multi-step Thinking"
min_score: 8
- signal: "Multi-agent architecture with clear role separation"
bonus_criteria: "Technical Implementation"
min_score: 7
- signal: "Grounding with external knowledge sources"
bonus_criteria: "Accuracy & Relevance"
min_score: 7
criteria:
- name: "Accuracy & Relevance"
weight: 0.25
description: "Does the submission accurately meet the scenario requirements? Degree of Foundry utilization. Suitability as a communication team support agent."
scoring_guide:
"1-3": "Significantly deviates from scenario requirements. Minimal Foundry usage."
"4-6": "Basic scenario coverage. Foundry is used but utilization is limited."
"7-9": "Appropriately addresses the scenario. Effectively uses Foundry models and tools."
"10": "Delivers value beyond the scenario. Maximizes advanced Foundry capabilities."
evidence_signals:
positive:
- "Foundry model deployment or API calls visible in code"
- "Scenario requirements mapped to implemented features"
- "Communication team use case clearly addressed"
- "Grounding with real data sources"
negative:
- "Scenario requirements not addressed"
- "Generic chatbot with no Foundry integration"
- "Only placeholder or mock responses"
- name: "Reasoning & Multi-step Thinking"
weight: 0.25
description: "Quality of reasoning. Implementation of reasoning patterns such as Chain-of-Thought, ReAct, Self-Reflection."
scoring_guide:
"1-3": "Single-step processing only. No reasoning patterns implemented."
"4-6": "Basic Chain-of-Thought. Limited multi-step reasoning."
"7-9": "Combines multiple reasoning patterns. Self-correction mechanism present."
"10": "Advanced reasoning patterns fully implemented. Multi-agent architecture. Evaluation metrics configured."
evidence_signals:
positive:
- "CoT prompting with explicit reasoning steps in output"
- "ReAct loop: think → act → observe cycle in code"
- "Self-reflection: agent reviews and corrects own output"
- "Multi-step workflow with intermediate state management"
- "Evaluation metrics (accuracy, relevance scores) tracked"
negative:
- "Single API call returning final answer directly"
- "No visible reasoning chain in agent logic"
- "Hardcoded decision trees without LLM reasoning"
- name: "Creativity & Originality"
weight: 0.20
description: "Originality of the idea. Innovative approach to social media content generation."
scoring_guide:
"1-3": "Template-based simple generation. No originality."
"4-6": "Some creative approaches. Customized prompt design."
"7-9": "Unique content strategy. Brand-specific advanced generation."
"10": "Innovative content generation pipeline. Industry-leading approach."
evidence_signals:
positive:
- "Novel approach not seen in other submissions"
- "Custom content strategy tailored to specific brand/use case"
- "Unique data pipeline or processing architecture"
- "Original combination of reasoning techniques"
negative:
- "Standard tutorial-level implementation"
- "Nearly identical approach to another submission"
- "No differentiation from basic Foundry quickstart"
- name: "User Experience & Presentation"
weight: 0.15
description: "User experience. Demo quality. Documentation completeness."
scoring_guide:
"1-3": "No demo. Insufficient documentation."
"4-6": "Screenshots provided. Basic README."
"7-9": "Interactive demo. Detailed setup instructions. Sample inputs/outputs provided."
"10": "Production quality. Comprehensive documentation. Video demo."
evidence_signals:
positive:
- "Sample input/output pairs showing reasoning process"
- "Architecture diagram showing agent flow"
- "Video demonstrating multi-step reasoning"
- "Clear setup guide with prerequisites listed"
negative:
- "No demo or only static screenshots"
- "README lacks setup instructions"
- "No sample inputs/outputs provided"
- name: "Technical Implementation"
weight: 0.15
description: "Technical completeness. Code quality. Error handling. Grounding."
scoring_guide:
"1-3": "Minimal implementation. No error handling."
"4-6": "Basic setup. Some error handling."
"7-9": "Robust implementation. External tool integration. Proper error handling."
"10": "Production quality. Monitoring and safety measures complete. Grounding knowledge integrated."
evidence_signals:
positive:
- "Clean code structure with separation of concerns"
- "External tool/API integration beyond basic LLM calls"
- "Error handling with graceful fallbacks"
- "Monitoring, logging, or observability setup"
- "Automated tests present"
negative:
- "Single-file implementation with no structure"
- "No error handling around API calls"
- "Hardcoded credentials or configuration"