mcp-reasoner

import { v4 as uuidv4 } from 'uuid';
import { ThoughtNode, ReasoningRequest, ReasoningResponse, CONFIG } from '../../types.js';
import { MonteCarloTreeSearchStrategy } from '../mcts.js';

interface PolicyGuidedNode extends ThoughtNode {
  visits: number;
  totalReward: number;
  untriedActions?: string[];
  policyScore: number;                    // Policy network prediction
  valueEstimate: number;                  // Value network estimate
  priorActionProbs: Map<string, number>;  // Action probabilities
  puct?: number;                          // PUCT score for selection
  actionHistory?: string[];               // Track sequence of actions
  noveltyScore?: number;                  // Measure of thought novelty
}

interface PolicyMetrics {
  averagePolicyScore: number;
  averageValueEstimate: number;
  actionDistribution: { [action: string]: number };
  explorationStats: {
    temperature: number;
    explorationRate: number;
    noveltyBonus: number;
  };
  convergenceMetrics: {
    policyEntropy: number;
    valueStability: number;
  };
}

export class MCTS002AlphaStrategy extends MonteCarloTreeSearchStrategy {
  private readonly temperature: number;
  private explorationRate: number;
  private readonly learningRate: number;
  private readonly noveltyBonus: number;
  private policyMetrics: PolicyMetrics;
  protected readonly simulationCount: number;

  constructor(stateManager: any, numSimulations: number = CONFIG.numSimulations) {
    super(stateManager, numSimulations);
    this.temperature = 1.0;
    this.explorationRate = Math.sqrt(2);
    this.learningRate = 0.1;
    this.noveltyBonus = 0.2;
    this.simulationCount = numSimulations;
    this.policyMetrics = this.initializePolicyMetrics();
  }

  private initializePolicyMetrics(): PolicyMetrics {
    return {
      averagePolicyScore: 0,
      averageValueEstimate: 0,
      actionDistribution: {},
      explorationStats: {
        temperature: this.temperature,
        explorationRate: this.explorationRate,
        noveltyBonus: this.noveltyBonus
      },
      convergenceMetrics: {
        policyEntropy: 0,
        valueStability: 0
      }
    };
  }
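  /*
   * Per-thought pipeline implemented by processThought below:
   *   1. Delegate to the base MCTS strategy for the baseline response.
   *   2. Build a PolicyGuidedNode and score it (novelty, policy, value).
   *   3. Link the node to its parent and refresh the policy metrics.
   *   4. If more thoughts are needed, run the policy-guided search.
   *   5. Re-score the current path and fold the result into the response.
   */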
  public async processThought(request: ReasoningRequest): Promise<ReasoningResponse> {
    // Get base MCTS response
    const baseResponse = await super.processThought(request);

    const nodeId = uuidv4();
    const parentNode = request.parentId
      ? await this.getNode(request.parentId) as PolicyGuidedNode
      : undefined;

    const node: PolicyGuidedNode = {
      id: nodeId,
      thought: request.thought,
      depth: request.thoughtNumber - 1,
      score: 0,
      children: [],
      parentId: request.parentId,
      isComplete: !request.nextThoughtNeeded,
      visits: 0,
      totalReward: 0,
      untriedActions: [],
      policyScore: 0,
      valueEstimate: 0,
      priorActionProbs: new Map(),
      actionHistory: parentNode
        ? [...(parentNode.actionHistory || []), this.extractAction(request.thought)]
        : [this.extractAction(request.thought)]
    };

    // Initialize node with policy guidance. Novelty is computed first because
    // both the policy score and the value estimate read it.
    node.score = this.evaluateThought(node, parentNode);
    node.visits = 1;
    node.totalReward = node.score;
    node.noveltyScore = this.calculateNovelty(node);
    node.policyScore = this.calculatePolicyScore(node, parentNode);
    node.valueEstimate = this.estimateValue(node);
    await this.saveNode(node);

    // Update parent if it exists
    if (parentNode) {
      parentNode.children.push(node.id);
      await this.saveNode(parentNode);
      await this.updatePolicyMetrics(node, parentNode);
    }

    // Run policy-guided search
    if (!node.isComplete) {
      await this.runPolicyGuidedSearch(node);
    }

    // Calculate enhanced path statistics
    const currentPath = await this.stateManager.getPath(nodeId);
    const enhancedScore = this.calculatePolicyEnhancedScore(currentPath);

    return {
      ...baseResponse,
      score: enhancedScore,
      bestScore: Math.max(baseResponse.bestScore || 0, enhancedScore)
    };
  }

  private extractAction(thought: string): string {
    // Simple action extraction based on the first few words
    return thought.split(/\s+/).slice(0, 3).join('_').toLowerCase();
  }

  private calculatePolicyScore(node: PolicyGuidedNode, parent?: PolicyGuidedNode): number {
    // Combine multiple policy factors
    const depthFactor = Math.exp(-0.1 * node.depth);
    const parentAlignment = parent ? this.thoughtCoherence(node.thought, parent.thought) : 1;
    const noveltyBonus = node.noveltyScore || 0;
    return 0.4 * depthFactor + 0.4 * parentAlignment + 0.2 * noveltyBonus;
  }

  private estimateValue(node: PolicyGuidedNode): number {
    // Combine immediate score with future potential
    const immediateValue = node.score;
    const depthPotential = 1 - node.depth / CONFIG.maxDepth;
    const noveltyValue = node.noveltyScore || 0;
    return 0.5 * immediateValue + 0.3 * depthPotential + 0.2 * noveltyValue;
  }

  private calculateNovelty(node: PolicyGuidedNode): number {
    // Measure thought novelty based on action history
    const uniqueActions = new Set(node.actionHistory).size;
    const historyLength = node.actionHistory?.length || 1;
    const uniquenessRatio = uniqueActions / historyLength;

    // Combine with linguistic novelty (rough proxy: punctuation and connectives)
    const complexityScore =
      (node.thought.match(/[.!?;]|therefore|because|if|then/g) || []).length / 10;

    return 0.7 * uniquenessRatio + 0.3 * complexityScore;
  }

  private thoughtCoherence(thought1: string, thought2: string): number {
    // Jaccard similarity between the two thoughts' word sets
    const words1 = new Set(thought1.toLowerCase().split(/\W+/));
    const words2 = new Set(thought2.toLowerCase().split(/\W+/));
    const intersection = new Set([...words1].filter(x => words2.has(x)));
    const union = new Set([...words1, ...words2]);
    return intersection.size / union.size;
  }

  private async runPolicyGuidedSearch(node: PolicyGuidedNode): Promise<void> {
    for (let i = 0; i < this.simulationCount; i++) {
      const selectedNode = await this.selectWithPUCT(node);
      const expandedNode = await this.expandWithPolicy(selectedNode);
      const reward = await this.simulateWithValueGuidance(expandedNode);
      await this.backpropagateWithPolicyUpdate(expandedNode, reward);

      // Adapt exploration rate based on how the expanded node performed
      this.adaptExplorationRate(expandedNode);
    }
  }

  private async selectWithPUCT(root: PolicyGuidedNode): Promise<PolicyGuidedNode> {
    let node = root;
    while (node.children.length > 0) {
      const children = await Promise.all(
        node.children.map(id => this.getNode(id))
      ) as PolicyGuidedNode[];
      node = this.selectBestPUCTChild(children);
    }
    return node;
  }
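  /*
   * Selection score used below (a PUCT-style variant, not the canonical
   * AlphaZero formula):
   *
   *   puct = valueEstimate
   *        + sqrt(ln(totalVisits) / visits) * policyScore * explorationRate
   *        + noveltyScore * noveltyBonus
   *
   * Worked example with illustrative numbers (assumed, not from a real run):
   * for valueEstimate = 0.5, totalVisits = 10, visits = 2, policyScore = 0.6,
   * explorationRate = sqrt(2) ≈ 1.41, noveltyScore = 0.3, noveltyBonus = 0.2:
   *   puct ≈ 0.5 + 1.07 * 0.6 * 1.41 + 0.06 ≈ 1.47
   */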
  private selectBestPUCTChild(nodes: PolicyGuidedNode[]): PolicyGuidedNode {
    const totalVisits = nodes.reduce((sum, node) => sum + node.visits, 0);

    // Score every child first so the comparison below never sees an
    // uninitialized puct value (including the first reduce candidate).
    for (const node of nodes) {
      const exploitation = node.valueEstimate;
      const exploration = Math.sqrt(Math.log(totalVisits) / node.visits);
      const policyTerm = node.policyScore * this.explorationRate;
      const noveltyBonus = (node.noveltyScore || 0) * this.noveltyBonus;
      node.puct = exploitation + exploration * policyTerm + noveltyBonus;
    }

    return nodes.reduce((best, node) =>
      (node.puct || 0) > (best.puct || 0) ? node : best
    );
  }

  private async expandWithPolicy(node: PolicyGuidedNode): Promise<PolicyGuidedNode> {
    if (node.isComplete) return node;

    const newNode: PolicyGuidedNode = {
      ...node,
      id: uuidv4(),
      depth: node.depth + 1,
      parentId: node.id,
      children: [],
      visits: 1,
      totalReward: 0,
      policyScore: 0,
      valueEstimate: 0,
      priorActionProbs: new Map(),
      actionHistory: [...(node.actionHistory || [])]
    };

    // As in processThought, novelty feeds the policy and value estimates.
    newNode.noveltyScore = this.calculateNovelty(newNode);
    newNode.policyScore = this.calculatePolicyScore(newNode, node);
    newNode.score = this.evaluateThought(newNode, node);
    newNode.valueEstimate = this.estimateValue(newNode);

    await this.saveNode(newNode);
    return newNode;
  }

  private async simulateWithValueGuidance(node: PolicyGuidedNode): Promise<number> {
    let current = node;
    let totalReward = 0;
    let depth = 0;

    while (!current.isComplete && depth < CONFIG.maxDepth) {
      const reward = current.valueEstimate;
      totalReward += reward;
      current = await this.expandWithPolicy(current);
      depth++;
    }

    // Guard against division by zero when the starting node is already complete.
    return depth > 0 ? totalReward / depth : node.valueEstimate;
  }
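  /*
   * Backpropagation walks parent links up to the root. Each ancestor's value
   * estimate is nudged toward the simulation reward with a simple
   * temporal-difference step, V <- (1 - alpha) * V + alpha * reward, and the
   * parent's prior probability for the action taken at each node is updated
   * with the same learning rate (alpha = learningRate = 0.1).
   */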
  private async backpropagateWithPolicyUpdate(
    node: PolicyGuidedNode,
    reward: number
  ): Promise<void> {
    let current: PolicyGuidedNode | undefined = node;
    while (current) {
      current.visits++;
      current.totalReward += reward;

      // Update the value estimate with a temporal-difference step
      current.valueEstimate =
        (1 - this.learningRate) * current.valueEstimate +
        this.learningRate * reward;

      // Update the parent's action probabilities
      if (current.parentId) {
        const parentNode = await this.getNode(current.parentId) as PolicyGuidedNode;
        const actionKey = this.extractAction(current.thought);
        const currentProb = parentNode.priorActionProbs.get(actionKey) || 0;
        const newProb = currentProb + this.learningRate * (reward - currentProb);
        parentNode.priorActionProbs.set(actionKey, newProb);
        await this.saveNode(parentNode);
      }

      await this.saveNode(current);
      current = current.parentId
        ? await this.getNode(current.parentId) as PolicyGuidedNode
        : undefined;
    }
  }

  private adaptExplorationRate(node: PolicyGuidedNode): void {
    const successRate = node.totalReward / node.visits;
    const targetRate = 0.6;

    if (successRate > targetRate) {
      // Reduce exploration when doing well
      this.explorationRate = Math.max(0.5, this.explorationRate * 0.95);
    } else {
      // Increase exploration when results are poor
      this.explorationRate = Math.min(2.0, this.explorationRate / 0.95);
    }
  }

  private async updatePolicyMetrics(node: PolicyGuidedNode, parent: PolicyGuidedNode): Promise<void> {
    // Update running averages
    this.policyMetrics.averagePolicyScore =
      (this.policyMetrics.averagePolicyScore + node.policyScore) / 2;
    this.policyMetrics.averageValueEstimate =
      (this.policyMetrics.averageValueEstimate + node.valueEstimate) / 2;

    // Update action distribution
    const action = this.extractAction(node.thought);
    this.policyMetrics.actionDistribution[action] =
      (this.policyMetrics.actionDistribution[action] || 0) + 1;

    // Update exploration stats
    this.policyMetrics.explorationStats = {
      temperature: this.temperature,
      explorationRate: this.explorationRate,
      noveltyBonus: this.noveltyBonus
    };

    // Calculate policy entropy and value stability
    const probs = Array.from(parent.priorActionProbs.values());
    this.policyMetrics.convergenceMetrics = {
      policyEntropy: this.calculateEntropy(probs),
      valueStability: Math.abs(node.valueEstimate - parent.valueEstimate)
    };
  }

  private calculateEntropy(probs: number[]): number {
    const sum = probs.reduce((a, b) => a + b, 0);
    if (sum === 0) return 0; // no recorded actions yet
    return -probs.reduce((acc, p) => {
      const norm = p / sum;
      return acc + norm * Math.log2(norm + 1e-10);
    }, 0);
  }

  private calculatePolicyEnhancedScore(path: ThoughtNode[]): number {
    if (path.length === 0) return 0;
    return path.reduce((acc, node) => {
      const policyNode = node as PolicyGuidedNode;
      const baseScore = node.score;
      const policyBonus = policyNode.policyScore || 0;
      const valueBonus = policyNode.valueEstimate || 0;
      const noveltyBonus = (policyNode.noveltyScore || 0) * this.noveltyBonus;
      return acc + (baseScore + policyBonus + valueBonus + noveltyBonus) / 4;
    }, 0) / path.length;
  }

  public async getMetrics(): Promise<any> {
    const baseMetrics = await super.getMetrics();
    const nodes = await this.stateManager.getAllNodes() as PolicyGuidedNode[];

    // Calculate additional policy-specific metrics; guard against an empty tree
    const currentNode = nodes[nodes.length - 1];
    const policyStats = {
      currentNode: currentNode ? {
        policyScore: currentNode.policyScore,
        valueEstimate: currentNode.valueEstimate,
        noveltyScore: currentNode.noveltyScore,
        actionHistory: currentNode.actionHistory
      } : null,
      averages: nodes.length > 0 ? {
        policyScore: nodes.reduce((sum, n) => sum + (n.policyScore || 0), 0) / nodes.length,
        valueEstimate: nodes.reduce((sum, n) => sum + (n.valueEstimate || 0), 0) / nodes.length,
        noveltyScore: nodes.reduce((sum, n) => sum + (n.noveltyScore || 0), 0) / nodes.length
      } : null,
      policyMetrics: this.policyMetrics
    };

    return {
      ...baseMetrics,
      name: 'MCTS-002-Alpha (Policy Enhanced)',
      temperature: this.temperature,
      explorationRate: this.explorationRate,
      learningRate: this.learningRate,
      policyStats
    };
  }
}
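// --- Usage sketch (illustrative, not part of the strategy) ---
// A minimal sketch assuming an in-memory state manager that exposes the
// getNode/saveNode/getPath/getAllNodes calls made above. The concrete
// StateManager in this repo and the full ReasoningRequest shape may differ,
// and the base MonteCarloTreeSearchStrategy may require more from the state
// manager than is stubbed here. Uses the MCTS002AlphaStrategy class above.

async function demo(): Promise<void> {
  const nodes = new Map<string, any>();
  const stateManager = {
    getNode: async (id: string) => nodes.get(id),
    saveNode: async (node: any) => { nodes.set(node.id, node); },
    getPath: async (id: string) => {
      // Walk parent links back to the root; return the path root-first.
      const path: any[] = [];
      for (let n = nodes.get(id); n; n = n.parentId ? nodes.get(n.parentId) : undefined) {
        path.unshift(n);
      }
      return path;
    },
    getAllNodes: async () => [...nodes.values()]
  };

  const strategy = new MCTS002AlphaStrategy(stateManager, 25);
  const response = await strategy.processThought({
    thought: 'Break the problem into independent subgoals',
    thoughtNumber: 1,
    nextThoughtNeeded: true
  } as any); // only the fields this strategy reads are filled in

  console.log(response.score, await strategy.getMetrics());
}

demo().catch(console.error);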