# Chain of Draft (CoD) MCP Server

""" Analytics service for the Chain of Draft MCP server. Tracks performance metrics for different reasoning approaches. """ import datetime from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, JSON, func from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker import os Base = declarative_base() class InferenceRecord(Base): """Database model for tracking inference performance.""" __tablename__ = 'inference_records' id = Column(Integer, primary_key=True) timestamp = Column(DateTime, default=datetime.datetime.utcnow) problem_id = Column(String) problem_text = Column(String) domain = Column(String) approach = Column(String) # "CoD" or "CoT" word_limit = Column(Integer) tokens_used = Column(Integer) execution_time_ms = Column(Float) reasoning_steps = Column(String) answer = Column(String) expected_answer = Column(String, nullable=True) is_correct = Column(Integer, nullable=True) # 1=correct, 0=incorrect, null=unknown meta_data = Column(JSON, nullable=True) # Changed from metadata to meta_data to avoid SQLAlchemy reserved keyword class AnalyticsService: """Service for tracking and analyzing inference performance.""" def __init__(self, db_url=None): """Initialize the analytics service with a database connection.""" if db_url is None: # Default to SQLite in the current directory db_url = os.environ.get("COD_DB_URL", "sqlite:///cod_analytics.db") self.engine = create_engine(db_url) Base.metadata.create_all(self.engine) self.Session = sessionmaker(bind=self.engine) async def record_inference(self, problem, domain, approach, word_limit, tokens_used, execution_time, reasoning, answer, expected_answer=None, metadata=None): """Record a new inference with performance metrics.""" session = self.Session() try: # Simple hash function for problem ID problem_id = str(abs(hash(problem)) % (10 ** 10)) record = InferenceRecord( problem_id=problem_id, problem_text=problem, domain=domain, approach=approach, 
word_limit=word_limit, tokens_used=tokens_used, execution_time_ms=execution_time, reasoning_steps=reasoning, answer=answer, expected_answer=expected_answer, is_correct=self._check_correctness(answer, expected_answer) if expected_answer else None, meta_data=metadata ) session.add(record) session.commit() return record.id finally: session.close() def _check_correctness(self, answer, expected_answer): """Check if an answer is correct.""" # Basic string comparison - could be improved with more sophisticated matching if not answer or not expected_answer: return None return 1 if answer.strip().lower() == expected_answer.strip().lower() else 0 async def get_performance_by_domain(self, domain=None): """Get performance statistics by domain.""" session = self.Session() try: query = session.query( InferenceRecord.domain, InferenceRecord.approach, func.avg(InferenceRecord.tokens_used).label("avg_tokens"), func.avg(InferenceRecord.execution_time_ms).label("avg_time"), func.avg(InferenceRecord.is_correct).label("accuracy"), func.count(InferenceRecord.id).label("count") ).group_by(InferenceRecord.domain, InferenceRecord.approach) if domain: query = query.filter(InferenceRecord.domain == domain) results = query.all() return [ { "domain": r.domain, "approach": r.approach, "avg_tokens": r.avg_tokens, "avg_time_ms": r.avg_time, "accuracy": r.accuracy if r.accuracy is not None else None, "count": r.count } for r in results ] finally: session.close() async def get_token_reduction_stats(self): """Calculate token reduction statistics for CoD vs CoT.""" session = self.Session() try: domains = session.query(InferenceRecord.domain).distinct().all() results = [] for domain_row in domains: domain = domain_row[0] # Get average tokens for CoD and CoT approaches in this domain cod_avg = session.query(func.avg(InferenceRecord.tokens_used)).filter( InferenceRecord.domain == domain, InferenceRecord.approach == "CoD" ).scalar() or 0 cot_avg = 
session.query(func.avg(InferenceRecord.tokens_used)).filter( InferenceRecord.domain == domain, InferenceRecord.approach == "CoT" ).scalar() or 0 if cot_avg > 0: reduction_percentage = (1 - (cod_avg / cot_avg)) * 100 else: reduction_percentage = 0 results.append({ "domain": domain, "cod_avg_tokens": cod_avg, "cot_avg_tokens": cot_avg, "reduction_percentage": reduction_percentage }) return results finally: session.close() async def get_accuracy_comparison(self): """Compare accuracy between CoD and CoT approaches.""" session = self.Session() try: domains = session.query(InferenceRecord.domain).distinct().all() results = [] for domain_row in domains: domain = domain_row[0] # Get accuracy for CoD and CoT approaches in this domain cod_accuracy = session.query(func.avg(InferenceRecord.is_correct)).filter( InferenceRecord.domain == domain, InferenceRecord.approach == "CoD", InferenceRecord.is_correct.isnot(None) ).scalar() cot_accuracy = session.query(func.avg(InferenceRecord.is_correct)).filter( InferenceRecord.domain == domain, InferenceRecord.approach == "CoT", InferenceRecord.is_correct.isnot(None) ).scalar() results.append({ "domain": domain, "cod_accuracy": cod_accuracy, "cot_accuracy": cot_accuracy, "accuracy_difference": (cod_accuracy - cot_accuracy) if cod_accuracy and cot_accuracy else None }) return results finally: session.close()