CodeGraph CLI MCP Server

processor.rs•14.1 kB

use super::{CodeLanguage, ASTFeatures, ControlFlowGraph}; use crate::embedding::EmbeddingError; use std::collections::HashMap; use std::path::Path; use serde::{Serialize, Deserialize}; #[derive(Debug, Clone)] pub struct CodeProcessor { parsers: HashMap<CodeLanguage, Box<dyn LanguageParser + Send + Sync>>, ast_extractor: ASTFeatureExtractor, normalizer: CodeNormalizer, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CodeInput { pub source: String, pub language: CodeLanguage, pub ast_features: Option<ASTFeatures>, pub control_flow: Option<ControlFlowGraph>, pub metadata: CodeMetadata, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CodeMetadata { pub file_path: Option<String>, pub function_name: Option<String>, pub class_name: Option<String>, pub line_count: usize, pub char_count: usize, pub tokens: Vec<String>, pub imports: Vec<String>, pub doc_comments: Vec<String>, } pub trait LanguageParser { fn parse(&self, source: &str) -> Result<ParsedCode, EmbeddingError>; fn extract_features(&self, parsed: &ParsedCode) -> ASTFeatures; fn build_control_flow(&self, parsed: &ParsedCode) -> ControlFlowGraph; } #[derive(Debug, Clone)] pub struct ParsedCode { pub ast: Vec<ASTNode>, pub symbols: Vec<Symbol>, pub comments: Vec<Comment>, pub imports: Vec<Import>, } #[derive(Debug, Clone)] pub struct ASTNode { pub id: usize, pub node_type: String, pub children: Vec<usize>, pub start_byte: usize, pub end_byte: usize, pub text: String, } #[derive(Debug, Clone)] pub struct Symbol { pub name: String, pub symbol_type: SymbolType, pub scope: String, pub location: (usize, usize), } #[derive(Debug, Clone)] pub enum SymbolType { Function, Class, Variable, Constant, Module, Interface, } #[derive(Debug, Clone)] pub struct Comment { pub text: String, pub comment_type: CommentType, pub location: (usize, usize), } #[derive(Debug, Clone)] pub enum CommentType { Line, Block, Documentation, } #[derive(Debug, Clone)] pub struct Import { pub module: String, pub items: Vec<String>, pub alias: Option<String>, } impl CodeProcessor { pub fn new() -> Self { let mut parsers: HashMap<CodeLanguage, Box<dyn LanguageParser + Send + Sync>> = HashMap::new(); // Register language parsers parsers.insert(CodeLanguage::Rust, Box::new(RustParser::new())); parsers.insert(CodeLanguage::Python, Box::new(PythonParser::new())); parsers.insert(CodeLanguage::JavaScript, Box::new(JavaScriptParser::new())); parsers.insert(CodeLanguage::TypeScript, Box::new(TypeScriptParser::new())); parsers.insert(CodeLanguage::Java, Box::new(JavaParser::new())); parsers.insert(CodeLanguage::Go, Box::new(GoParser::new())); Self { parsers, ast_extractor: ASTFeatureExtractor::new(), normalizer: CodeNormalizer::new(), } } pub fn process_code(&self, source: &str, language: CodeLanguage) -> Result<CodeInput, EmbeddingError> { // Normalize source code let normalized_source = self.normalizer.normalize(source, language); // Extract basic metadata let metadata = self.extract_metadata(&normalized_source, language, None); // Try to parse with specific language parser let (ast_features, control_flow) = if let Some(parser) = self.parsers.get(&language) { match parser.parse(&normalized_source) { Ok(parsed) => { let features = parser.extract_features(&parsed); let cfg = parser.build_control_flow(&parsed); (Some(features), Some(cfg)) } Err(_) => (None, None), // Fall back to basic processing } } else { (None, None) }; Ok(CodeInput { source: normalized_source, language, ast_features, control_flow, metadata, }) } pub fn process_file(&self, file_path: &Path) -> Result<CodeInput, EmbeddingError> { let source = std::fs::read_to_string(file_path) .map_err(|e| EmbeddingError::InferenceError(format!("Failed to read file: {}", e)))?; let language = CodeLanguage::from_extension( file_path.extension() .and_then(|ext| ext.to_str()) .unwrap_or("") ); let mut input = self.process_code(&source, language)?; input.metadata.file_path = Some(file_path.to_string_lossy().to_string()); Ok(input) } fn extract_metadata(&self, source: &str, language: CodeLanguage, file_path: Option<&str>) -> CodeMetadata { let lines: Vec<&str> = source.lines().collect(); let tokens = self.simple_tokenize(source, language); let imports = self.extract_imports(source, language); let doc_comments = self.extract_doc_comments(source, language); CodeMetadata { file_path: file_path.map(|p| p.to_string()), function_name: None, class_name: None, line_count: lines.len(), char_count: source.chars().count(), tokens, imports, doc_comments, } } fn simple_tokenize(&self, source: &str, language: CodeLanguage) -> Vec<String> { // Simple tokenization - can be enhanced with proper lexers let separators = match language { CodeLanguage::Rust | CodeLanguage::Cpp | CodeLanguage::Java | CodeLanguage::Csharp => { &[' ', '\t', '\n', '(', ')', '{', '}', '[', ']', ';', ',', '.', ':', '=', '+', '-', '*', '/', '&', '|'][..] } CodeLanguage::Python => { &[' ', '\t', '\n', '(', ')', '[', ']', ':', ',', '.', '=', '+', '-', '*', '/', '#'][..] } _ => &[' ', '\t', '\n', '(', ')', '{', '}', '[', ']'][..], }; source .split(separators) .filter(|token| !token.is_empty()) .map(|token| token.to_string()) .collect() } fn extract_imports(&self, source: &str, language: CodeLanguage) -> Vec<String> { let import_patterns = match language { CodeLanguage::Rust => vec![r"use\s+([^;]+);", r"extern\s+crate\s+(\w+)"], CodeLanguage::Python => vec![r"^import\s+(.+)$", r"^from\s+(.+)\s+import"], CodeLanguage::JavaScript | CodeLanguage::TypeScript => { vec![r"^import\s+.+\s+from\s+['\"](.+)['\"]", r"^const\s+.+\s+=\s+require$['\"](.+)['\"]$"] } CodeLanguage::Java => vec![r"^import\s+(.+);"], CodeLanguage::Go => vec![r#"^import\s+"(.+)""#, r"^import\s+\("], _ => vec![], }; let mut imports = Vec::new(); for line in source.lines() { for pattern in &import_patterns { if let Ok(regex) = regex::Regex::new(pattern) { if let Some(captures) = regex.captures(line.trim()) { if let Some(import) = captures.get(1) { imports.push(import.as_str().to_string()); } } } } } imports } fn extract_doc_comments(&self, source: &str, language: CodeLanguage) -> Vec<String> { let comment_patterns = match language { CodeLanguage::Rust => vec![r"///\s*(.+)", r"//!\s*(.+)", r"/\*\*(.+?)\*/"], CodeLanguage::Python => vec![r'"""(.+?)"""', r"'''(.+?)'''", r"#\s*(.+)"], CodeLanguage::JavaScript | CodeLanguage::TypeScript => { vec![r"/\*\*(.+?)\*/", r"//\s*(.+)"] } CodeLanguage::Java => vec![r"/\*\*(.+?)\*/", r"//\s*(.+)"], CodeLanguage::Go => vec![r"//\s*(.+)", r"/\*(.+?)\*/"], _ => vec![], }; let mut comments = Vec::new(); for line in source.lines() { for pattern in &comment_patterns { if let Ok(regex) = regex::Regex::new(pattern) { if let Some(captures) = regex.captures(line.trim()) { if let Some(comment) = captures.get(1) { comments.push(comment.as_str().trim().to_string()); } } } } } comments } } pub struct ASTFeatureExtractor { // Feature extraction logic } impl ASTFeatureExtractor { pub fn new() -> Self { Self {} } pub fn extract(&self, ast: &[ASTNode]) -> ASTFeatures { let mut features = ASTFeatures::default(); features.depth = self.calculate_max_depth(ast); features.complexity_score = self.calculate_complexity(ast); features.function_count = ast.iter().filter(|n| n.node_type == "function").count(); features.class_count = ast.iter().filter(|n| n.node_type == "class").count(); features.node_types = ast.iter().map(|n| n.node_type.clone()).collect(); features } fn calculate_max_depth(&self, ast: &[ASTNode]) -> usize { // Simplified depth calculation ast.len() / 10 + 1 } fn calculate_complexity(&self, ast: &[ASTNode]) -> f32 { // Simplified complexity score let branch_nodes = ast.iter() .filter(|n| matches!(n.node_type.as_str(), "if" | "while" | "for" | "match" | "switch")) .count(); 1.0 + branch_nodes as f32 * 0.5 } } pub struct CodeNormalizer; impl CodeNormalizer { pub fn new() -> Self { Self } pub fn normalize(&self, source: &str, language: CodeLanguage) -> String { let mut normalized = source.to_string(); // Remove excessive whitespace normalized = regex::Regex::new(r"\s+").unwrap() .replace_all(&normalized, " ").to_string(); // Language-specific normalization match language { CodeLanguage::Python => self.normalize_python(&normalized), CodeLanguage::JavaScript | CodeLanguage::TypeScript => self.normalize_js(&normalized), _ => normalized, } } fn normalize_python(&self, source: &str) -> String { // Remove docstrings for normalization regex::Regex::new(r#""""[\s\S]*?""""#).unwrap() .replace_all(source, "").to_string() } fn normalize_js(&self, source: &str) -> String { // Remove comments regex::Regex::new(r"//.*$").unwrap() .replace_all(source, "").to_string() } } // Language-specific parsers (simplified implementations) struct RustParser; struct PythonParser; struct JavaScriptParser; struct TypeScriptParser; struct JavaParser; struct GoParser; impl RustParser { fn new() -> Self { Self } } impl LanguageParser for RustParser { fn parse(&self, source: &str) -> Result<ParsedCode, EmbeddingError> { // Simplified parsing - would use tree-sitter-rust in real implementation Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![], }) } fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() } fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() } } // Similar implementations for other language parsers... impl PythonParser { fn new() -> Self { Self } } impl JavaScriptParser { fn new() -> Self { Self } } impl TypeScriptParser { fn new() -> Self { Self } } impl JavaParser { fn new() -> Self { Self } } impl GoParser { fn new() -> Self { Self } } impl LanguageParser for PythonParser { fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> { Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] }) } fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() } fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() } } impl LanguageParser for JavaScriptParser { fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> { Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] }) } fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() } fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() } } impl LanguageParser for TypeScriptParser { fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> { Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] }) } fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() } fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() } } impl LanguageParser for JavaParser { fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> { Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] }) } fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() } fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() } } impl LanguageParser for GoParser { fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> { Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] }) } fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() } fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() } } impl Default for CodeProcessor { fn default() -> Self { Self::new() } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Jakedismo/codegraph-rust'

If you have feedback or need assistance with the MCP directory API, please join our Discord server