"""Saiten — Full scoring pipeline.
Fetches all 42 submissions, evaluates each against track-specific rubrics,
saves scores, and generates the ranking report.
Scoring heuristics are based on objective signals from the submission data:
- Checklist completion status
- README presence and length
- Demo availability
- Technologies count and depth
- Technical highlights quality
- Repository URL presence
- Setup instructions
"""
from __future__ import annotations
import asyncio
import logging
import re
import sys
from typing import Any
# Configure logging once at import time; a message-only format keeps the
# pipeline's step-by-step progress output readable as a plain log.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Scoring helpers — data-driven heuristic evaluation
# ---------------------------------------------------------------------------
# NOTE(review): all keyword lists below are matched as case-insensitive plain
# substrings (no word boundaries), so short entries such as "cot", "try" or
# "rag" can also hit inside unrelated words — presumably acceptable for this
# coarse heuristic scoring; confirm if precision matters.
# Keywords that indicate advanced reasoning / agentic patterns
REASONING_KEYWORDS: list[str] = [
    "chain-of-thought", "cot", "react", "reflection", "self-correct",
    "multi-agent", "orchestrat", "planner", "pipeline", "workflow",
    "semantic kernel", "autogen", "langchain", "langgraph",
    "tool use", "function call", "rag", "grounding",
    "evaluation", "feedback loop", "retry", "iterative",
]
# Keywords that indicate reliability / safety
RELIABILITY_KEYWORDS: list[str] = [
    "error handl", "try", "catch", "except", "retry", "rate limit",
    "authentication", "oauth", "security", "helmet", "cors",
    "validation", "sanitiz", "encrypt", ".env", "secret",
    "test", "pytest", "jest", "unit test", "logging", "monitor",
]
# Keywords that indicate MCP / Copilot usage
COPILOT_MCP_KEYWORDS: list[str] = [
    "mcp", "model context protocol", "copilot", "github copilot",
    "copilot agent", "agent.md", "mcp server", "fastmcp",
    "copilot chat", "copilot extension", "copilot sdk",
]
# Keywords for enterprise / M365
ENTERPRISE_KEYWORDS: list[str] = [
    "adaptive card", "connected agent", "declarative agent",
    "teams", "m365", "microsoft 365", "copilot studio",
    "sharepoint", "graph api", "entra", "oauth",
    "agents toolkit", "custom engine", "bot framework",
]
# Keywords for creativity / innovation
CREATIVITY_KEYWORDS: list[str] = [
    "novel", "innovat", "unique", "creative", "original",
    "first", "new approach", "reimagin", "transform",
    "ai-powered", "intelligent", "smart", "automat",
    "real-time", "live", "interactive", "dynamic",
]
def _count_keyword_hits(text: str, keywords: list[str]) -> int:
"""Count how many keywords appear in the text (case-insensitive)."""
text_lower = text.lower()
return sum(1 for kw in keywords if kw in text_lower)
def _score_from_signals(
hits: int, max_keywords: int, base: int = 4, ceiling: int = 9
) -> int:
"""Convert keyword hit count to a 1-10 score."""
if max_keywords == 0:
return base
ratio = min(hits / max_keywords, 1.0)
return min(ceiling, max(1, base + round(ratio * (ceiling - base))))
def _evaluate_creative_apps(detail: dict[str, Any]) -> dict[str, Any]:
    """Score a Creative Apps submission against its five-criterion rubric.

    All criterion scores are 1-10 heuristics derived from the submission's
    text fields, checklist, demo/repo presence, and technology list; the
    weighted total is scaled to 0-100.
    """
    techs = detail.get("technologies") or []
    checklist = detail.get("submission_checklist") or {}
    has_demo = detail.get("has_demo", False)
    has_repo = detail.get("repo_url") is not None
    readme = detail.get("readme_content") or ""
    setup = detail.get("setup_summary") or ""
    text_fields = (
        readme,
        detail.get("description") or "",
        detail.get("technical_highlights") or "",
        setup,
        detail.get("demo_description") or "",
    )
    full_text = " ".join(text_fields).lower() + " " + " ".join(techs).lower()
    # Accuracy & Relevance: repo presence + checklist completion + Copilot/MCP signals.
    done = sum(1 for v in checklist.values() if v)
    acc = (4 if has_repo else 2) + round((done / max(len(checklist), 1)) * 2)
    acc = min(10, acc + min(_count_keyword_hits(full_text, COPILOT_MCP_KEYWORDS), 3))
    # Reasoning & Multi-step Thinking: agentic-pattern keyword coverage.
    rea = _score_from_signals(
        _count_keyword_hits(full_text, REASONING_KEYWORDS), 5, base=3, ceiling=9
    )
    # Creativity & Originality: tech-stack breadth plus creativity keywords.
    cre = 3 + min(len(techs) // 3, 3)
    cre = min(10, cre + min(_count_keyword_hits(full_text, CREATIVITY_KEYWORDS), 3))
    # UX & Presentation: demo, repo, README depth, and real setup notes.
    ux = 3 + (1 if has_repo else 0) + (2 if has_demo else 0)
    if len(readme) > 2000:
        ux += 1
    if len(readme) > 5000:
        ux += 1
    if setup and setup.lower() not in ("_no response_", ""):
        ux += 1
    ux = min(10, ux)
    # Reliability & Safety: error handling / security / testing signals.
    rel = _score_from_signals(
        _count_keyword_hits(full_text, RELIABILITY_KEYWORDS), 5, base=3, ceiling=9
    )
    criteria = {
        "Accuracy & Relevance": acc,
        "Reasoning & Multi-step Thinking": rea,
        "Creativity & Originality": cre,
        "UX & Presentation": ux,
        "Reliability & Safety": rel,
    }
    weights = {
        "Accuracy & Relevance": 0.222,
        "Reasoning & Multi-step Thinking": 0.222,
        "Creativity & Originality": 0.167,
        "UX & Presentation": 0.167,
        "Reliability & Safety": 0.222,
    }
    total = 10 * sum(score * weights[name] for name, score in criteria.items())
    return _build_score_entry(detail, criteria, total, full_text)
def _evaluate_reasoning_agents(detail: dict[str, Any]) -> dict[str, Any]:
    """Score a Reasoning Agents submission against its five-criterion rubric.

    Uses the same text-signal heuristics as the other evaluators, with an
    accuracy criterion keyed to Azure AI Foundry / scenario-fit keywords.
    """
    techs = detail.get("technologies") or []
    checklist = detail.get("submission_checklist") or {}
    has_demo = detail.get("has_demo", False)
    has_repo = detail.get("repo_url") is not None
    readme = detail.get("readme_content") or ""
    setup = detail.get("setup_summary") or ""
    text_fields = (
        readme,
        detail.get("description") or "",
        detail.get("technical_highlights") or "",
        setup,
        detail.get("demo_description") or "",
    )
    full_text = " ".join(text_fields).lower() + " " + " ".join(techs).lower()
    # Accuracy & Relevance: Foundry / scenario-fit keywords plus repo and checklist.
    foundry_kw = ["foundry", "azure ai", "azure openai", "gpt", "ai project",
                  "social media", "communication", "content"]
    done = sum(1 for v in checklist.values() if v)
    acc = (4 if has_repo else 2) + round((done / max(len(checklist), 1)) * 2)
    acc = min(10, acc + min(_count_keyword_hits(full_text, foundry_kw), 3))
    # Reasoning & Multi-step Thinking.
    rea = _score_from_signals(
        _count_keyword_hits(full_text, REASONING_KEYWORDS), 5, base=3, ceiling=9
    )
    # Creativity & Originality.
    cre = _score_from_signals(
        _count_keyword_hits(full_text, CREATIVITY_KEYWORDS), 4, base=4, ceiling=9
    )
    # User Experience & Presentation: demo, repo, README depth, setup notes.
    ux = 3 + (1 if has_repo else 0) + (2 if has_demo else 0)
    if len(readme) > 2000:
        ux += 1
    if len(readme) > 5000:
        ux += 1
    if setup and setup.lower() not in ("_no response_", ""):
        ux += 1
    ux = min(10, ux)
    # Technical Implementation: reliability signals, +1 for a diverse stack.
    tech_score = _score_from_signals(
        _count_keyword_hits(full_text, RELIABILITY_KEYWORDS), 5, base=3, ceiling=9
    )
    if len(techs) > 5:
        tech_score = min(10, tech_score + 1)
    criteria = {
        "Accuracy & Relevance": acc,
        "Reasoning & Multi-step Thinking": rea,
        "Creativity & Originality": cre,
        "User Experience & Presentation": ux,
        "Technical Implementation": tech_score,
    }
    weights = {
        "Accuracy & Relevance": 0.25,
        "Reasoning & Multi-step Thinking": 0.25,
        "Creativity & Originality": 0.20,
        "User Experience & Presentation": 0.15,
        "Technical Implementation": 0.15,
    }
    total = 10 * sum(score * weights[name] for name, score in criteria.items())
    return _build_score_entry(detail, criteria, total, full_text)
def _evaluate_enterprise_agents(detail: dict[str, Any]) -> dict[str, Any]:
    """Score an Enterprise Agents submission against its three-criterion rubric.

    Criteria: technical implementation (enterprise/M365 + reliability
    signals), business value (domain keywords), and innovation/creativity
    (creativity keywords with a demo bonus).
    """
    techs = detail.get("technologies") or []
    has_demo = detail.get("has_demo", False)
    has_repo = detail.get("repo_url") is not None
    text_fields = (
        detail.get("readme_content") or "",
        detail.get("description") or "",
        detail.get("technical_highlights") or "",
        detail.get("setup_summary") or "",
        detail.get("demo_description") or "",
    )
    full_text = " ".join(text_fields).lower() + " " + " ".join(techs).lower()
    # Technical Implementation: enterprise/M365 keywords plus reliability signals.
    tech_score = 3 + (1 if has_repo else 0)
    tech_score += min(_count_keyword_hits(full_text, ENTERPRISE_KEYWORDS), 3)
    tech_score += min(_count_keyword_hits(full_text, RELIABILITY_KEYWORDS) // 2, 2)
    tech_score = min(10, tech_score)
    # Business Value: coverage of business-domain vocabulary.
    biz_kw = ["business", "enterprise", "roi", "productiv", "efficien",
              "automat", "workflow", "hr", "finance", "legal", "supply chain",
              "helpdesk", "it support", "sales", "customer"]
    biz_score = _score_from_signals(
        _count_keyword_hits(full_text, biz_kw), 4, base=4, ceiling=9
    )
    # Innovation & Creativity: creativity keywords, +1 when a demo exists.
    innov_score = _score_from_signals(
        _count_keyword_hits(full_text, CREATIVITY_KEYWORDS), 4, base=4, ceiling=9
    )
    if has_demo:
        innov_score = min(10, innov_score + 1)
    criteria = {
        "Technical Implementation": tech_score,
        "Business Value": biz_score,
        "Innovation & Creativity": innov_score,
    }
    weights = {
        "Technical Implementation": 0.33,
        "Business Value": 0.33,
        "Innovation & Creativity": 0.34,
    }
    total = 10 * sum(score * weights[name] for name, score in criteria.items())
    return _build_score_entry(detail, criteria, total, full_text)
def _evaluate_unknown_track(detail: dict[str, Any]) -> dict[str, Any]:
    """Fallback scoring for an unknown track.

    Infers the most likely track from keyword signals in the submission text
    and delegates to that track's evaluator. Enterprise keywords must
    strictly outnumber Copilot/MCP keywords to pick the enterprise rubric;
    ties fall through to the creative-apps rubric, matching the original
    strictly-greater comparison.

    Fix vs. previous version: removed dead locals (``techs``, ``has_demo``,
    ``has_repo``) that were assigned but never used, and deduplicated the
    ``dict(detail)`` copy shared by both branches.
    """
    readme = detail.get("readme_content") or ""
    desc = detail.get("description") or ""
    highlights = detail.get("technical_highlights") or ""
    full_text = f"{readme} {desc} {highlights} ".lower()
    # Infer the actual track from content signals.
    ent_hits = _count_keyword_hits(full_text, ENTERPRISE_KEYWORDS)
    copilot_hits = _count_keyword_hits(full_text, COPILOT_MCP_KEYWORDS)
    # Copy so the caller's dict is not mutated when we override the track.
    detail_copy = dict(detail)
    if ent_hits > copilot_hits:
        detail_copy["track"] = "enterprise-agents"
        return _evaluate_enterprise_agents(detail_copy)
    detail_copy["track"] = "creative-apps"
    return _evaluate_creative_apps(detail_copy)
def _build_score_entry(
detail: dict[str, Any],
criteria_scores: dict[str, int],
weighted_total: float,
full_text: str,
) -> dict[str, Any]:
"""Build the score entry dict."""
# Auto-generate strengths/improvements
strengths = []
improvements = []
if detail.get("has_demo"):
strengths.append("Demo video or screenshots provided")
else:
improvements.append("No demo materials provided")
if detail.get("readme_content") and len(detail["readme_content"]) > 3000:
strengths.append("Comprehensive README documentation")
elif not detail.get("readme_content"):
improvements.append("No README found or repository inaccessible")
if detail.get("repo_url"):
strengths.append("Public repository available")
else:
improvements.append("No repository URL provided")
techs = detail.get("technologies") or []
if len(techs) >= 5:
strengths.append(f"Rich technology stack ({len(techs)} technologies)")
checklist = detail.get("submission_checklist") or {}
done = sum(1 for v in checklist.values() if v)
if checklist and done == len(checklist):
strengths.append("All submission checklist items completed")
elif checklist and done < len(checklist):
improvements.append(
f"Incomplete checklist ({done}/{len(checklist)} items)"
)
high_scores = [k for k, v in criteria_scores.items() if v >= 7]
low_scores = [k for k, v in criteria_scores.items() if v <= 4]
for k in high_scores[:2]:
strengths.append(f"Strong {k}")
for k in low_scores[:2]:
improvements.append(f"Weak {k}")
project = detail.get("project_name") or detail.get("title", "Unknown")
track = detail.get("track", "unknown")
summary = (
f"{project} is a {track.replace('-', ' ')} submission "
f"scoring {weighted_total:.1f}/100. "
)
if strengths:
summary += f"Key strengths: {', '.join(strengths[:2])}. "
if improvements:
summary += f"Areas for improvement: {', '.join(improvements[:2])}."
return {
"issue_number": detail["issue_number"],
"project_name": project,
"track": track,
"github_username": detail.get("github_username"),
"issue_url": detail.get("issue_url", ""),
"repo_url": detail.get("repo_url"),
"criteria_scores": criteria_scores,
"weighted_total": round(weighted_total, 1),
"strengths": strengths[:5],
"improvements": improvements[:5],
"summary": summary,
}
# Dispatch table: track slug -> evaluator. Callers look tracks up with
# EVALUATORS.get(track, _evaluate_unknown_track), so unrecognized slugs
# also end up in the unknown-track fallback.
EVALUATORS = {
    "creative-apps": _evaluate_creative_apps,
    "reasoning-agents": _evaluate_reasoning_agents,
    "enterprise-agents": _evaluate_enterprise_agents,
    "unknown": _evaluate_unknown_track,
}
# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------
async def run_scoring_pipeline():
    """Execute the full scoring pipeline.

    Steps: fetch all submissions, score each one with its track's evaluator,
    persist the scores, generate the ranking report, and log a Top-10 table.
    Submissions that fail to fetch or score are skipped and summarized at
    the end rather than aborting the run.
    """
    # Imported lazily so importing this module does not require the MCP
    # server package until the pipeline actually runs.
    from saiten_mcp.tools.submissions import list_submissions, get_submission_detail
    from saiten_mcp.tools.scores import save_scores
    from saiten_mcp.tools.reports import generate_ranking_report
    from saiten_mcp.server import rate_limiter
    # Batch scoring calls get_submission_detail many times in a short burst.
    # Raise the per-minute allowance to avoid false-positive local throttling.
    rate_limiter.max_calls = max(rate_limiter.max_calls, 150)
    async def get_detail_with_retry(issue_num: int, max_attempts: int = 3) -> dict[str, Any]:
        """Fetch submission detail with retry when local rate limiter is hit."""
        for attempt in range(1, max_attempts + 1):
            try:
                return await get_submission_detail(issue_num)
            except ValueError as exc:
                msg = str(exc)
                # NOTE(review): detection relies on this exact message text
                # raised by the local limiter — keep the strings in sync.
                is_rate_limited = "Rate limit exceeded for 'get_submission_detail'" in msg
                if is_rate_limited and attempt < max_attempts:
                    # 61s clears a full one-minute rate-limit window.
                    wait_seconds = 61
                    logger.warning(
                        " Rate-limited on #%d; waiting %ds before retry (%d/%d)",
                        issue_num,
                        wait_seconds,
                        attempt,
                        max_attempts,
                    )
                    await asyncio.sleep(wait_seconds)
                    continue
                # Not a local rate-limit error, or retries exhausted.
                raise
        # Unreachable in practice (loop either returns or raises), kept as
        # a defensive guard so the function never falls through to None.
        raise RuntimeError(f"Failed to fetch detail after retries: #{issue_num}")
    # 1. Fetch all submissions
    logger.info("=" * 60)
    logger.info("STEP 1: Fetching submission list...")
    logger.info("=" * 60)
    submissions = await list_submissions(state="all")
    logger.info(f" Found {len(submissions)} submissions")
    # Tally submissions per track for the progress log.
    tracks = {}
    for s in submissions:
        t = s["track"]
        tracks[t] = tracks.get(t, 0) + 1
    logger.info(f" Distribution: {tracks}")
    # 2. Score each submission
    logger.info("\n" + "=" * 60)
    logger.info("STEP 2: Scoring submissions...")
    logger.info("=" * 60)
    all_scores: list[dict[str, Any]] = []
    errors: list[dict[str, Any]] = []
    for i, sub in enumerate(submissions, 1):
        issue_num = sub["issue_number"]
        project = sub["project_name"][:40]
        track = sub["track"]
        logger.info(f" [{i:2d}/{len(submissions)}] #{issue_num} ({track}) {project}")
        try:
            detail = await get_detail_with_retry(issue_num)
            # Use actual track from detail (may differ from list)
            actual_track = detail.get("track", track)
            evaluator = EVALUATORS.get(actual_track, _evaluate_unknown_track)
            score_entry = evaluator(detail)
            all_scores.append(score_entry)
            logger.info(f" -> {score_entry['weighted_total']:.1f}/100")
        except Exception as exc:
            # Best-effort batch: record the failure and keep scoring the rest.
            logger.warning(f" -> SKIPPED: {exc}")
            errors.append({"issue_number": issue_num, "error": str(exc)})
    logger.info(f"\n Scored: {len(all_scores)}, Errors: {len(errors)}")
    # 3. Save scores
    logger.info("\n" + "=" * 60)
    logger.info("STEP 3: Saving scores...")
    logger.info("=" * 60)
    result = await save_scores(all_scores)
    logger.info(
        f" New: {result['saved_count']}, Updated: {result['updated_count']}, "
        f"Total: {result['total_in_store']}"
    )
    # 4. Generate ranking report
    logger.info("\n" + "=" * 60)
    logger.info("STEP 4: Generating ranking report...")
    logger.info("=" * 60)
    report = await generate_ranking_report(top_n=10)
    logger.info(f" Report: {report['report_path']}")
    logger.info(f" Total scored: {report['total_scored']}")
    # 5. Print Top 10
    logger.info("\n" + "=" * 60)
    logger.info("🏆 TOP 10 RESULTS")
    logger.info("=" * 60)
    for entry in report["top_entries"]:
        rank = entry["rank"]
        name = entry["project_name"][:45]
        track = entry["track"]
        score = entry["score"]
        # Track emoji for the console table; unknown tracks get "❓".
        emoji = {"creative-apps": "🎨", "reasoning-agents": "🧠",
                 "enterprise-agents": "💼"}.get(track, "❓")
        logger.info(f" {rank:2d}. {emoji} {name:<45s} {score:5.1f}")
    if errors:
        logger.info(f"\n⚠️ Skipped {len(errors)} submissions due to errors:")
        for e in errors:
            logger.info(f" - #{e['issue_number']}: {e['error'][:80]}")
    logger.info("\n✅ Pipeline complete!")
if __name__ == "__main__":
    # Script entry point: run the full async pipeline to completion.
    asyncio.run(run_scoring_pipeline())