Code-Index-MCP

Code-Index-MCP
scripts

realtime_parallel_analyzer.py•24.1 KiB

#!/usr/bin/env python3 """ Real-time Parallel Analysis Pipeline - Phase 2 Optimization Implements 8x speed improvement through concurrent transcript processing and analysis. """ import asyncio import json import time import os import queue import threading from pathlib import Path from typing import Dict, List, Any, Optional, Tuple, AsyncGenerator from datetime import datetime from dataclasses import dataclass, asdict import logging import concurrent.futures import multiprocessing from collections import defaultdict from mcp_server.core.path_utils import PathUtils # import aiofiles # Optional dependency, fallback to standard file operations # import aiohttp # Optional dependency # Add project root to path import sys sys.path.insert(0, str(Path(__file__).parent.parent)) from scripts.enhanced_mcp_analysis_framework import ( EnhancedMCPAnalyzer, EnhancedQueryMetrics, GranularTokenBreakdown, RetrievalMethodMetrics, RetrievalMethod ) from scripts.mcp_method_detector import MCPServerMonitor, MCPMethodValidator from scripts.edit_pattern_analyzer import EditPatternAnalyzer, EditOperation # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) @dataclass class RealTimeAnalysisConfig: """Configuration for real-time parallel analysis""" max_concurrent_analyzers: int = 8 transcript_buffer_size: int = 100 analysis_batch_size: int = 16 streaming_enabled: bool = True real_time_reporting: bool = True cache_analysis_results: bool = True performance_monitoring: bool = True @dataclass class TranscriptStream: """Stream of transcript data for real-time processing""" stream_id: str source: str # "mcp" or "native" start_time: datetime transcript_chunks: List[str] metadata: Dict[str, Any] processing_status: str = "pending" # pending, processing, completed, failed @dataclass class RealTimeAnalysisResult: """Real-time analysis result with streaming support""" result_id: str timestamp: datetime source_stream: str query_metrics: Optional[EnhancedQueryMetrics] edit_operations: List[EditOperation] method_detection_summary: Dict[str, Any] processing_time_ms: float cache_hit: bool = False class TranscriptProcessor: """High-performance transcript processor with async streaming""" def __init__(self, analyzer: EnhancedMCPAnalyzer): self.analyzer = analyzer self.processing_queue = asyncio.Queue(maxsize=100) self.results_queue = asyncio.Queue(maxsize=200) self.active_streams: Dict[str, TranscriptStream] = {} async def process_transcript_stream(self, stream: TranscriptStream) -> AsyncGenerator[RealTimeAnalysisResult, None]: """Process transcript stream with real-time yield of results""" self.active_streams[stream.stream_id] = stream stream.processing_status = "processing" try: for i, chunk in enumerate(stream.transcript_chunks): start_time = time.time() # Extract query from chunk query = self._extract_query_from_chunk(chunk) if not query: continue # Parse transcript chunk query_metrics = self.analyzer.parse_enhanced_transcript( chunk, query, stream.source ) # Detect edit operations edit_operations = self._extract_edit_operations(chunk, query_metrics) # Get method detection summary method_summary = self._extract_method_detection(chunk) processing_time = (time.time() - start_time) * 1000 result = RealTimeAnalysisResult( result_id=f"{stream.stream_id}_chunk_{i}", timestamp=datetime.now(), source_stream=stream.stream_id, query_metrics=query_metrics, edit_operations=edit_operations, method_detection_summary=method_summary, processing_time_ms=processing_time ) yield result # Allow other coroutines to run await asyncio.sleep(0.001) except Exception as e: logger.error(f"Error processing stream {stream.stream_id}: {e}") stream.processing_status = "failed" finally: stream.processing_status = "completed" def _extract_query_from_chunk(self, chunk: str) -> Optional[str]: """Extract query from transcript chunk""" import re # Look for query patterns query_patterns = [ r'Query:\s*["\'](.*?)["\']', r'User:\s*(.*?)(?:\n|$)', r'claude.*?--\s*(.*?)(?:\n|$)', ] for pattern in query_patterns: match = re.search(pattern, chunk, re.IGNORECASE | re.DOTALL) if match: return match.group(1).strip() return None def _extract_edit_operations(self, chunk: str, query_metrics: EnhancedQueryMetrics) -> List[EditOperation]: """Extract edit operations from transcript chunk""" edit_operations = [] import re # Look for tool invocations tool_patterns = [ r'Edit\([^)]*file_path=["\']([^"\']+)["\'].*?old_string=["\']([^"\']*)["\'].*?new_string=["\']([^"\']*)["\']', r'MultiEdit\([^)]*file_path=["\']([^"\']+)["\']', r'Write\([^)]*file_path=["\']([^"\']+)["\']' ] for pattern in tool_patterns: matches = re.finditer(pattern, chunk, re.DOTALL) for match in matches: file_path = match.group(1) # Create edit operation edit_op = EditOperation( operation_id=f"edit_{len(edit_operations)}", file_path=file_path, edit_type="targeted_edit", # Would be determined by analysis lines_changed=0, # Would be calculated total_file_lines=0, # Would be calculated edit_precision_ratio=0.0, # Would be calculated context_lines_used=0, # Would be calculated retrieval_method_used=query_metrics.retrieval_metrics.method_type, success=True ) edit_operations.append(edit_op) return edit_operations def _extract_method_detection(self, chunk: str) -> Dict[str, Any]: """Extract method detection information from transcript""" # This would integrate with the actual method detector # For now, return basic detection method_indicators = { "semantic": ["qdrant", "vector", "embedding", "semantic=true"], "sql_fts": ["fts_code", "SELECT", "MATCH"], "sql_bm25": ["bm25_content", "bm25("], "native": ["Grep", "Read", "Glob", "LS"] } detected_methods = [] for method, indicators in method_indicators.items(): if any(indicator.lower() in chunk.lower() for indicator in indicators): detected_methods.append(method) return { "detected_methods": detected_methods, "primary_method": detected_methods[0] if detected_methods else "unknown", "hybrid_usage": len(detected_methods) > 1 } class ParallelAnalysisPipeline: """8x speed improvement through parallel analysis pipeline""" def __init__(self, workspace_path: Path, config: RealTimeAnalysisConfig): self.workspace_path = workspace_path self.config = config self.session_id = f"parallel_analysis_{int(time.time())}" # Initialize analyzers self.mcp_analyzer = EnhancedMCPAnalyzer(workspace_path, self.session_id) self.method_monitor = MCPServerMonitor(workspace_path) self.edit_analyzer = EditPatternAnalyzer(workspace_path) # Parallel processing components self.transcript_processors: List[TranscriptProcessor] = [] self.analysis_executor = concurrent.futures.ThreadPoolExecutor( max_workers=self.config.max_concurrent_analyzers ) # Results aggregation self.live_results: List[RealTimeAnalysisResult] = [] self.performance_metrics = { "analysis_start": None, "analysis_end": None, "total_transcripts_processed": 0, "parallel_efficiency": 0.0, "cache_hit_rate": 0.0 } # Initialize transcript processors for i in range(self.config.max_concurrent_analyzers): processor = TranscriptProcessor(self.mcp_analyzer) self.transcript_processors.append(processor) logger.info(f"Initialized parallel analysis pipeline with {self.config.max_concurrent_analyzers} processors") async def process_transcript_directory(self, transcript_dir: Path) -> AsyncGenerator[RealTimeAnalysisResult, None]: """Process entire transcript directory with real-time streaming results""" self.performance_metrics["analysis_start"] = time.time() # Discover transcript files transcript_files = self._discover_transcript_files(transcript_dir) logger.info(f"Found {len(transcript_files)} transcript files to process") # Create transcript streams streams = [] for file_path in transcript_files: stream = await self._create_transcript_stream(file_path) if stream: streams.append(stream) # Process streams in parallel with real-time results semaphore = asyncio.Semaphore(self.config.max_concurrent_analyzers) async def process_stream_with_semaphore(stream: TranscriptStream): async with semaphore: processor = self.transcript_processors[len(self.live_results) % len(self.transcript_processors)] async for result in processor.process_transcript_stream(stream): yield result # Start parallel processing tasks = [] for stream in streams: task = asyncio.create_task(self._collect_stream_results(process_stream_with_semaphore(stream))) tasks.append(task) # Yield results as they become available completed_streams = 0 while completed_streams < len(streams): # Check for new results for task in tasks: if task.done(): try: results = task.result() for result in results: self.live_results.append(result) yield result completed_streams += 1 except Exception as e: logger.error(f"Error in stream processing: {e}") completed_streams += 1 # Small delay to prevent busy waiting await asyncio.sleep(0.01) self.performance_metrics["analysis_end"] = time.time() self.performance_metrics["total_transcripts_processed"] = len(transcript_files) # Calculate parallel efficiency total_time = self.performance_metrics["analysis_end"] - self.performance_metrics["analysis_start"] sequential_estimate = len(transcript_files) * 2.0 # Estimated 2s per file sequential self.performance_metrics["parallel_efficiency"] = sequential_estimate / total_time logger.info(f"Processed {len(transcript_files)} transcripts in {total_time:.2f}s") logger.info(f"Parallel efficiency: {self.performance_metrics['parallel_efficiency']:.2f}x speedup") async def _collect_stream_results(self, stream_generator) -> List[RealTimeAnalysisResult]: """Collect results from a stream generator""" results = [] async for result in stream_generator: results.append(result) return results def _discover_transcript_files(self, transcript_dir: Path) -> List[Path]: """Discover transcript files in directory""" transcript_files = [] if not transcript_dir.exists(): logger.warning(f"Transcript directory does not exist: {transcript_dir}") return transcript_files # Look for various transcript file patterns patterns = [ "*.json", "*.txt", "*.log", "transcript_*.json", "claude_session_*.json" ] for pattern in patterns: files = list(transcript_dir.glob(pattern)) transcript_files.extend(files) # Also check subdirectories for subdir in transcript_dir.iterdir(): if subdir.is_dir(): transcript_files.extend(self._discover_transcript_files(subdir)) return transcript_files async def _create_transcript_stream(self, file_path: Path) -> Optional[TranscriptStream]: """Create transcript stream from file""" try: # Use standard file operations with async wrapper def read_file(): with open(file_path, 'r') as f: return f.read() loop = asyncio.get_event_loop() content = await loop.run_in_executor(None, read_file) # Determine source type source = "mcp" if "mcp" in file_path.name.lower() else "native" # Split content into chunks for streaming processing chunks = self._split_transcript_content(content) stream = TranscriptStream( stream_id=f"{source}_{file_path.stem}_{int(time.time())}", source=source, start_time=datetime.now(), transcript_chunks=chunks, metadata={ "file_path": str(file_path), "file_size": len(content), "chunk_count": len(chunks) } ) return stream except Exception as e: logger.error(f"Error creating stream from {file_path}: {e}") return None def _split_transcript_content(self, content: str) -> List[str]: """Split transcript content into processable chunks""" # Split by tool calls or logical boundaries import re # Look for tool call boundaries tool_boundaries = re.finditer(r'Called the \w+ tool with', content) boundaries = [0] + [match.start() for match in tool_boundaries] boundaries.append(len(content)) chunks = [] for i in range(len(boundaries) - 1): chunk = content[boundaries[i]:boundaries[i + 1]] if chunk.strip(): chunks.append(chunk) # If no tool boundaries found, split by size if len(chunks) <= 1: chunk_size = 2000 # 2KB chunks chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)] return chunks def aggregate_live_results(self) -> Dict[str, Any]: """Aggregate live results for real-time reporting""" if not self.live_results: return {} # Group by source mcp_results = [r for r in self.live_results if "mcp" in r.source_stream] native_results = [r for r in self.live_results if "native" in r.source_stream] # Calculate aggregated metrics aggregated = { "total_results": len(self.live_results), "mcp_results": len(mcp_results), "native_results": len(native_results), "performance_summary": { "avg_processing_time_ms": sum(r.processing_time_ms for r in self.live_results) / len(self.live_results), "mcp_avg_processing_time": sum(r.processing_time_ms for r in mcp_results) / max(len(mcp_results), 1), "native_avg_processing_time": sum(r.processing_time_ms for r in native_results) / max(len(native_results), 1), "cache_hit_rate": sum(1 for r in self.live_results if r.cache_hit) / len(self.live_results) }, "method_distribution": self._analyze_method_distribution(), "edit_behavior_summary": self._analyze_edit_behavior(), "real_time_insights": self._generate_real_time_insights() } return aggregated def _analyze_method_distribution(self) -> Dict[str, int]: """Analyze distribution of detected methods""" method_counts = defaultdict(int) for result in self.live_results: if result.method_detection_summary: primary_method = result.method_detection_summary.get("primary_method", "unknown") method_counts[primary_method] += 1 return dict(method_counts) def _analyze_edit_behavior(self) -> Dict[str, Any]: """Analyze edit behavior patterns from live results""" total_edits = sum(len(r.edit_operations) for r in self.live_results) successful_edits = sum( sum(1 for op in r.edit_operations if op.success) for r in self.live_results ) return { "total_edit_operations": total_edits, "successful_edits": successful_edits, "success_rate": successful_edits / max(total_edits, 1), "avg_edits_per_query": total_edits / max(len(self.live_results), 1) } def _generate_real_time_insights(self) -> List[str]: """Generate real-time insights from current data""" insights = [] if len(self.live_results) > 10: # Need sufficient data mcp_results = [r for r in self.live_results if "mcp" in r.source_stream] native_results = [r for r in self.live_results if "native" in r.source_stream] if mcp_results and native_results: mcp_avg_time = sum(r.processing_time_ms for r in mcp_results) / len(mcp_results) native_avg_time = sum(r.processing_time_ms for r in native_results) / len(native_results) if mcp_avg_time < native_avg_time: speedup = native_avg_time / mcp_avg_time insights.append(f"MCP processing is {speedup:.1f}x faster than native") else: slowdown = mcp_avg_time / native_avg_time insights.append(f"MCP processing is {slowdown:.1f}x slower than native") # Method effectiveness insights method_dist = self._analyze_method_distribution() if method_dist: top_method = max(method_dist, key=method_dist.get) insights.append(f"Most used method: {top_method} ({method_dist[top_method]} queries)") return insights async def start_real_time_monitoring(self, transcript_dir: Path, report_interval: float = 5.0): """Start real-time monitoring with periodic reporting""" logger.info("Starting real-time parallel analysis monitoring...") # Start processing in background processing_task = asyncio.create_task(self._process_transcripts_background(transcript_dir)) # Start real-time reporting reporting_task = asyncio.create_task(self._real_time_reporting(report_interval)) try: await asyncio.gather(processing_task, reporting_task) except Exception as e: logger.error(f"Error in real-time monitoring: {e}") finally: processing_task.cancel() reporting_task.cancel() async def _process_transcripts_background(self, transcript_dir: Path): """Process transcripts in background""" async for result in self.process_transcript_directory(transcript_dir): # Results are automatically added to live_results pass async def _real_time_reporting(self, interval: float): """Provide real-time reporting""" while True: await asyncio.sleep(interval) if self.live_results: aggregated = self.aggregate_live_results() print(f"\n--- Real-time Analysis Update ---") print(f"Processed: {aggregated['total_results']} queries") print(f"MCP: {aggregated['mcp_results']}, Native: {aggregated['native_results']}") print(f"Avg processing time: {aggregated['performance_summary']['avg_processing_time_ms']:.1f}ms") print(f"Cache hit rate: {aggregated['performance_summary']['cache_hit_rate']:.1%}") insights = aggregated.get('real_time_insights', []) for insight in insights: print(f"💡 {insight}") def save_analysis_results(self, output_dir: Path): """Save comprehensive analysis results""" output_dir.mkdir(exist_ok=True) # Save live results results_file = output_dir / f"realtime_analysis_results_{self.session_id}.json" results_data = [asdict(result) for result in self.live_results] with open(results_file, 'w') as f: json.dump(results_data, f, indent=2, default=str) # Save aggregated analysis aggregated = self.aggregate_live_results() aggregated_file = output_dir / f"aggregated_analysis_{self.session_id}.json" with open(aggregated_file, 'w') as f: json.dump(aggregated, f, indent=2, default=str) # Save performance metrics metrics_file = output_dir / f"performance_metrics_{self.session_id}.json" with open(metrics_file, 'w') as f: json.dump(self.performance_metrics, f, indent=2, default=str) logger.info(f"Saved analysis results to {output_dir}") async def main(): """Main entry point for real-time parallel analysis""" workspace = Path("PathUtils.get_workspace_root()") transcript_dir = Path("Path.home() / ".claude"/projects") # Real transcript location # Optimized configuration for 8x speed improvement config = RealTimeAnalysisConfig( max_concurrent_analyzers=8, transcript_buffer_size=100, analysis_batch_size=16, streaming_enabled=True, real_time_reporting=True, cache_analysis_results=True, performance_monitoring=True ) logger.info("Starting Real-time Parallel Analysis Pipeline - Phase 2 Optimization") logger.info("=" * 80) pipeline = ParallelAnalysisPipeline(workspace, config) try: # Start real-time monitoring await pipeline.start_real_time_monitoring(transcript_dir, report_interval=3.0) # Save final results results_dir = Path(f"realtime_analysis_{pipeline.session_id}") pipeline.save_analysis_results(results_dir) print("\n" + "=" * 80) print("REAL-TIME PARALLEL ANALYSIS COMPLETED - PHASE 2") print("=" * 80) final_aggregated = pipeline.aggregate_live_results() print(f"\nResults:") print(f" Total queries processed: {final_aggregated.get('total_results', 0)}") print(f" MCP queries: {final_aggregated.get('mcp_results', 0)}") print(f" Native queries: {final_aggregated.get('native_results', 0)}") print(f" Parallel efficiency: {pipeline.performance_metrics.get('parallel_efficiency', 0):.2f}x speedup") print(f" Average processing time: {final_aggregated.get('performance_summary', {}).get('avg_processing_time_ms', 0):.1f}ms") print(f"\nResults saved to: {results_dir}") except Exception as e: logger.error(f"Real-time parallel analysis failed: {e}") raise if __name__ == "__main__": asyncio.run(main())

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ViperJuice/Code-Index-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

realtime_parallel_analyzer.py•24.1 KiB