use crate::LanguageRegistry;
use codegraph_core::{CodeGraphError, Language, Result};
use dashmap::DashMap;
use parking_lot::Mutex;
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use tracing::instrument;
use tree_sitter::{Node, Parser, Tree};
/// Configuration for text processing operations
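///
/// Chunk sizes and the overlap window are byte counts. A configuration
/// sketch (values are illustrative, not recommendations):
///
/// ```ignore
/// let config = TextProcessorConfig {
///     max_chunk_size: 2048, // split anything larger (bytes)
///     min_chunk_size: 64,   // skip fragments smaller than this (bytes)
///     overlap_size: 32,     // bytes of context carried across boundaries
///     ..Default::default()
/// };
/// let processor = TextProcessor::new(config);
/// ```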
#[derive(Debug, Clone)]
pub struct TextProcessorConfig {
pub max_chunk_size: usize,
pub min_chunk_size: usize,
pub overlap_size: usize,
pub preserve_semantic_boundaries: bool,
pub enable_deduplication: bool,
pub normalization_level: NormalizationLevel,
}
impl Default for TextProcessorConfig {
fn default() -> Self {
Self {
max_chunk_size: 1000,
min_chunk_size: 100,
overlap_size: 50,
preserve_semantic_boundaries: true,
enable_deduplication: true,
normalization_level: NormalizationLevel::Standard,
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum NormalizationLevel {
None,
Basic, // Whitespace normalization
Standard, // Basic + case normalization for identifiers
Aggressive, // Standard + comment removal + formatting
}
/// A semantic chunk of text with context information
#[derive(Debug, Clone, PartialEq)]
pub struct TextChunk {
pub content: String,
pub start_byte: usize,
pub end_byte: usize,
pub start_line: usize,
pub end_line: usize,
pub language: Option<Language>,
pub chunk_type: ChunkType,
pub semantic_level: u8,
pub context_before: Option<String>,
pub context_after: Option<String>,
pub hash: String,
}
#[derive(Debug, Clone, PartialEq)]
pub enum ChunkType {
Function,
Class,
Module,
Comment,
Import,
Text,
Code,
}
/// Token with language-aware information
#[derive(Debug, Clone, PartialEq)]
pub struct LanguageToken {
pub content: String,
pub token_type: TokenType,
pub start_byte: usize,
pub end_byte: usize,
pub line: usize,
pub column: usize,
pub language: Option<Language>,
}
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
Identifier,
Keyword,
String,
Number,
Comment,
Operator,
Punctuation,
Whitespace,
Unknown,
}
/// Context extraction result with metadata
#[derive(Debug, Clone)]
pub struct ContextExtraction {
pub primary_content: String,
pub surrounding_context: Vec<String>,
pub semantic_relationships: Vec<TextSemanticRelationship>,
pub importance_score: f32,
}
#[derive(Debug, Clone)]
pub struct TextSemanticRelationship {
pub relation_type: RelationType,
pub target_content: String,
pub confidence: f32,
}
#[derive(Debug, Clone, PartialEq)]
pub enum RelationType {
Calls,
Defines,
Uses,
References,
Contains,
Implements,
Extends,
}
/// Statistics about text processing operations
#[derive(Debug, Clone)]
pub struct ProcessingStatistics {
pub total_chunks: usize,
pub deduplicated_chunks: usize,
pub total_tokens: usize,
pub processing_time_ms: u64,
pub bytes_processed: usize,
pub language_distribution: HashMap<Language, usize>,
}
/// Main text processor for code tokenization and chunking
pub struct TextProcessor {
config: TextProcessorConfig,
language_registry: Arc<LanguageRegistry>,
deduplication_cache: Arc<DashMap<String, bool>>,
normalization_patterns: Arc<DashMap<Language, Vec<Regex>>>,
compiled_regex_cache: Arc<DashMap<String, Regex>>,
keyword_cache: Arc<DashMap<Language, HashSet<&'static str>>>,
parser_pool: Arc<Mutex<Vec<Parser>>>,
}
impl TextProcessor {
pub fn new(config: TextProcessorConfig) -> Self {
let processor = Self {
config,
language_registry: Arc::new(LanguageRegistry::new()),
deduplication_cache: Arc::new(DashMap::new()),
normalization_patterns: Arc::new(DashMap::new()),
compiled_regex_cache: Arc::new(DashMap::new()),
keyword_cache: Arc::new(DashMap::new()),
parser_pool: Arc::new(Mutex::new(Vec::new())),
};
processor.initialize_normalization_patterns();
processor.initialize_keyword_cache();
processor
}
pub fn with_language_registry(mut self, registry: Arc<LanguageRegistry>) -> Self {
self.language_registry = registry;
self
}
/// Language-aware tokenization for multiple programming languages
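    ///
    /// Tree-sitter is tried first with a pooled parser; when no grammar is
    /// registered for the language, the regex fallback below is used. A usage
    /// sketch (marked `ignore`: it needs a tokio runtime and a constructed
    /// `processor`):
    ///
    /// ```ignore
    /// let tokens = processor
    ///     .tokenize_language_aware("fn main() {}", Some(Language::Rust))
    ///     .await?;
    /// assert!(tokens.iter().any(|t| t.token_type == TokenType::Keyword));
    /// ```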
#[instrument(skip(self, content))]
pub async fn tokenize_language_aware(
&self,
content: &str,
language: Option<Language>,
) -> Result<Vec<LanguageToken>> {
let language = language.unwrap_or(Language::Other("text".to_string()));
let registry = self.language_registry.clone();
let regex_cache = self.compiled_regex_cache.clone();
let keyword_cache = self.keyword_cache.clone();
let parser_pool = self.parser_pool.clone();
let content = content.to_string();
tokio::task::spawn_blocking(move || {
let mut tokens = Vec::with_capacity(content.len() / 10); // Pre-allocate with estimate
// Try to get parser from pool first
let parser_opt = {
let mut pool = parser_pool.lock();
pool.pop()
};
// Try to use tree-sitter for structured tokenization if language is supported
if let Some(mut parser) = parser_opt.or_else(|| registry.create_parser(&language)) {
if let Some(tree) = parser.parse(&content, None) {
Self::extract_tokens_from_tree_optimized(
&tree,
&content,
&language,
&mut tokens,
&keyword_cache,
);
// Return parser to pool
let mut pool = parser_pool.lock();
if pool.len() < 10 {
// Limit pool size
pool.push(parser);
}
return Ok(tokens);
}
// Return parser to pool even if parsing failed
let mut pool = parser_pool.lock();
if pool.len() < 10 {
pool.push(parser);
}
}
// Fallback to optimized regex-based tokenization
Self::extract_tokens_regex_optimized(
&content,
&language,
&mut tokens,
                &regex_cache,
&keyword_cache,
);
Ok(tokens)
})
.await
.map_err(|e| CodeGraphError::Parse(e.to_string()))?
}
/// Optimized tree-based token extraction with reduced allocations
fn extract_tokens_from_tree_optimized(
tree: &Tree,
content: &str,
language: &Language,
tokens: &mut Vec<LanguageToken>,
keyword_cache: &DashMap<Language, HashSet<&'static str>>,
) {
let root = tree.root_node();
let content_bytes = content.as_bytes();
// Stack-based traversal for better performance
let mut stack = vec![root];
while let Some(node) = stack.pop() {
if node.is_error() {
continue;
}
// Process leaf nodes
if node.child_count() == 0 {
let start_byte = node.start_byte();
let end_byte = node.end_byte();
if let Ok(node_text) = std::str::from_utf8(&content_bytes[start_byte..end_byte]) {
let token_type = Self::classify_token_type_optimized(
node.kind(),
node_text,
language,
keyword_cache,
);
let start_position = node.start_position();
tokens.push(LanguageToken {
content: node_text.to_string(),
token_type,
start_byte,
end_byte,
line: start_position.row,
column: start_position.column,
language: Some(language.clone()),
});
}
} else {
// Add children to stack (in reverse order for proper traversal)
for i in (0..node.child_count()).rev() {
if let Some(child) = node.child(i) {
stack.push(child);
}
}
}
}
}
fn extract_tokens_from_tree(
tree: &Tree,
content: &str,
language: &Language,
tokens: &mut Vec<LanguageToken>,
) {
let root = tree.root_node();
let mut cursor = root.walk();
let content_bytes = content.as_bytes();
loop {
let node = cursor.node();
// Skip error nodes
if node.is_error() {
if !cursor.goto_next_sibling() {
break;
}
continue;
}
// Only process leaf nodes (actual tokens)
if node.child_count() == 0 {
let start_byte = node.start_byte();
let end_byte = node.end_byte();
if let Ok(node_text) = std::str::from_utf8(&content_bytes[start_byte..end_byte]) {
let token_type = Self::classify_token_type(node.kind(), node_text, language);
let start_position = node.start_position();
tokens.push(LanguageToken {
content: node_text.to_string(),
token_type,
start_byte,
end_byte,
line: start_position.row,
column: start_position.column,
language: Some(language.clone()),
});
}
}
// Navigate the tree
if cursor.goto_first_child() {
continue;
}
while !cursor.goto_next_sibling() {
if !cursor.goto_parent() {
return;
}
}
}
}
    fn extract_tokens_regex(content: &str, language: &Language, tokens: &mut Vec<LanguageToken>) {
        // Language-agnostic regex patterns for basic tokenization, compiled
        // once up front instead of once per line
        let patterns: Vec<(Regex, TokenType)> = [
            (r"//[^\r\n]*", TokenType::Comment),                    // Line comments
            (r"/\*[\s\S]*?\*/", TokenType::Comment),                // Block comments
            (r"#[^\r\n]*", TokenType::Comment),                     // Python/Shell comments
            (r#""([^"\\]|\\.)*""#, TokenType::String),              // Double-quoted strings
            (r"'([^'\\]|\\.)*'", TokenType::String),                // Single-quoted strings
            (r"`([^`\\]|\\.)*`", TokenType::String),                // Backtick strings
            (r"\b\d+\.?\d*\b", TokenType::Number),                  // Numbers
            (r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", TokenType::Identifier), // Identifiers
            (r"[+\-*/%=<>!&|^~]", TokenType::Operator),             // Operators
            (r"[{}()\[\];,.]", TokenType::Punctuation),             // Punctuation
            (r"\s+", TokenType::Whitespace),                        // Whitespace
        ]
        .into_iter()
        .map(|(pattern, token_type)| (Regex::new(pattern).unwrap(), token_type))
        .collect();
        let keywords = Self::get_language_keywords(language);
        // Track the byte offset of each line start instead of re-scanning the
        // whole prefix for every line
        let mut line_start_byte = 0;
        for (i, line) in content.lines().enumerate() {
            for (re, default_type) in &patterns {
                for mat in re.find_iter(line) {
                    let token_content = mat.as_str();
                    let mut token_type = default_type.clone();
                    // Check if identifier is actually a keyword
                    if token_type == TokenType::Identifier && keywords.contains(token_content) {
                        token_type = TokenType::Keyword;
                    }
                    tokens.push(LanguageToken {
                        content: token_content.to_string(),
                        token_type,
                        start_byte: line_start_byte + mat.start(),
                        end_byte: line_start_byte + mat.end(),
                        line: i,
                        column: mat.start(),
                        language: Some(language.clone()),
                    });
                }
            }
            line_start_byte += line.len() + 1; // +1 for the trailing newline
        }
        // Sort tokens by position
        tokens.sort_by_key(|t| (t.line, t.column));
    }
/// Optimized regex-based tokenization with caching and reduced allocations
fn extract_tokens_regex_optimized(
content: &str,
language: &Language,
tokens: &mut Vec<LanguageToken>,
regex_cache: &DashMap<String, Regex>,
keyword_cache: &DashMap<Language, HashSet<&'static str>>,
) {
// Get or create compiled regexes with caching
let patterns = vec![
("line_comment", r"//[^\r\n]*", TokenType::Comment),
("block_comment", r"/\*[\s\S]*?\*/", TokenType::Comment),
("python_comment", r"#[^\r\n]*", TokenType::Comment),
("double_string", r#""([^"\\]|\\.)*""#, TokenType::String),
("single_string", r"'([^'\\]|\\.)*'", TokenType::String),
("backtick_string", r"`([^`\\]|\\.)*`", TokenType::String),
("number", r"\b\d+\.?\d*\b", TokenType::Number),
(
"identifier",
r"\b[a-zA-Z_][a-zA-Z0-9_]*\b",
TokenType::Identifier,
),
("operator", r"[+\-*/%=<>!&|^~]", TokenType::Operator),
("punctuation", r"[{}()\[\];,.]", TokenType::Punctuation),
("whitespace", r"\s+", TokenType::Whitespace),
];
let keywords = keyword_cache
.get(language)
.map(|entry| entry.value().clone())
.unwrap_or_default();
// Process content line by line for better memory efficiency
let mut byte_offset = 0;
for (line_idx, line) in content.lines().enumerate() {
for (pattern_name, pattern_str, default_type) in &patterns {
let regex = regex_cache
.entry(pattern_name.to_string())
.or_insert_with(|| {
Regex::new(pattern_str).unwrap_or_else(|_| Regex::new(r"\w+").unwrap())
});
for mat in regex.find_iter(line) {
let token_content = mat.as_str();
let mut token_type = default_type.clone();
// Check if identifier is actually a keyword
if token_type == TokenType::Identifier && keywords.contains(token_content) {
token_type = TokenType::Keyword;
}
tokens.push(LanguageToken {
content: token_content.to_string(),
token_type,
start_byte: byte_offset + mat.start(),
end_byte: byte_offset + mat.end(),
line: line_idx,
column: mat.start(),
language: Some(language.clone()),
});
}
}
byte_offset += line.len() + 1; // +1 for newline
}
// Sort tokens by position
tokens.sort_by_key(|t| (t.line, t.column));
}
/// Optimized token classification with caching
fn classify_token_type_optimized(
node_kind: &str,
content: &str,
language: &Language,
keyword_cache: &DashMap<Language, HashSet<&'static str>>,
) -> TokenType {
// Fast path for common node types
match node_kind {
"comment" | "line_comment" | "block_comment" => return TokenType::Comment,
"string" | "string_literal" | "raw_string_literal" => return TokenType::String,
"number" | "integer" | "float" | "decimal" => return TokenType::Number,
"identifier" => {
// Check against cached keywords
if let Some(keywords) = keyword_cache.get(language) {
if keywords.contains(content) {
return TokenType::Keyword;
}
}
return TokenType::Identifier;
}
kind if kind.contains("keyword") => return TokenType::Keyword,
_ => {}
}
// Operator classification
if matches!(
node_kind,
"+" | "-"
| "*"
| "/"
| "="
| "=="
| "!="
| "<"
| ">"
| "<="
| ">="
| "&&"
| "||"
| "!"
| "&"
| "|"
| "^"
| "~"
| "<<"
| ">>"
| "+="
| "-="
| "*="
| "/="
| "%="
| "&="
| "|="
| "^="
| "<<="
| ">>="
) {
return TokenType::Operator;
}
// Punctuation classification
if matches!(
node_kind,
"(" | ")" | "[" | "]" | "{" | "}" | ";" | "," | "."
) {
return TokenType::Punctuation;
}
// Fallback classification
if content.chars().all(|c| c.is_whitespace()) {
TokenType::Whitespace
} else {
TokenType::Unknown
}
}
fn classify_token_type(node_kind: &str, content: &str, language: &Language) -> TokenType {
match node_kind {
"comment" | "line_comment" | "block_comment" => TokenType::Comment,
"string" | "string_literal" | "raw_string_literal" => TokenType::String,
"number" | "integer" | "float" | "decimal" => TokenType::Number,
"identifier" => TokenType::Identifier,
kind if kind.contains("keyword") => TokenType::Keyword,
kind if matches!(
kind,
"+" | "-"
| "*"
| "/"
| "="
| "=="
| "!="
| "<"
| ">"
| "<="
| ">="
| "&&"
| "||"
| "!"
| "&"
| "|"
| "^"
| "~"
| "<<"
| ">>"
| "+="
| "-="
| "*="
| "/="
| "%="
| "&="
| "|="
| "^="
| "<<="
| ">>="
) =>
{
TokenType::Operator
}
kind if matches!(kind, "(" | ")" | "[" | "]" | "{" | "}" | ";" | "," | ".") => {
TokenType::Punctuation
}
_ => {
// Additional language-specific classification
let keywords = Self::get_language_keywords(language);
if keywords.contains(content) {
TokenType::Keyword
} else if content.chars().all(|c| c.is_whitespace()) {
TokenType::Whitespace
} else {
TokenType::Unknown
}
}
}
}
fn get_language_keywords(language: &Language) -> HashSet<&'static str> {
match language {
Language::Rust => [
"fn", "let", "mut", "const", "static", "if", "else", "match", "for", "while",
"loop", "break", "continue", "return", "struct", "enum", "impl", "trait", "mod",
"use", "pub", "crate", "super", "self", "Self", "async", "await", "move", "ref",
"where", "type", "as", "true", "false", "unsafe", "extern", "dyn",
]
.into_iter()
.collect(),
Language::Python => [
"def", "class", "if", "elif", "else", "for", "while", "try", "except", "finally",
"with", "as", "import", "from", "return", "yield", "lambda", "and", "or", "not",
"in", "is", "True", "False", "None", "pass", "break", "continue", "global",
"nonlocal", "assert", "raise", "del",
]
.into_iter()
.collect(),
Language::JavaScript | Language::TypeScript => [
"function",
"var",
"let",
"const",
"if",
"else",
"for",
"while",
"do",
"switch",
"case",
"default",
"break",
"continue",
"return",
"try",
"catch",
"finally",
"throw",
"new",
"this",
"typeof",
"instanceof",
"in",
"true",
"false",
"null",
"undefined",
"class",
"extends",
"super",
"import",
"export",
"async",
"await",
]
.into_iter()
.collect(),
Language::Go => [
"func",
"var",
"const",
"if",
"else",
"for",
"switch",
"case",
"default",
"break",
"continue",
"return",
"go",
"defer",
"chan",
"select",
"type",
"struct",
"interface",
"map",
"package",
"import",
"true",
"false",
"nil",
"range",
]
.into_iter()
.collect(),
Language::Java => [
"public",
"private",
"protected",
"static",
"final",
"abstract",
"class",
"interface",
"extends",
"implements",
"if",
"else",
"for",
"while",
"do",
"switch",
"case",
"default",
"break",
"continue",
"return",
"try",
"catch",
"finally",
"throw",
"throws",
"new",
"this",
"super",
"import",
"package",
"true",
"false",
"null",
]
.into_iter()
.collect(),
Language::Cpp => [
"auto",
"bool",
"char",
"const",
"double",
"float",
"int",
"long",
"short",
"signed",
"unsigned",
"void",
"class",
"struct",
"public",
"private",
"protected",
"virtual",
"static",
"extern",
"inline",
"if",
"else",
"for",
"while",
"do",
"switch",
"case",
"default",
"break",
"continue",
"return",
"try",
"catch",
"throw",
"new",
"delete",
"this",
"true",
"false",
"nullptr",
"template",
"typename",
"namespace",
"using",
]
.into_iter()
.collect(),
_ => HashSet::new(),
}
}
/// Semantic text chunking with context preservation
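    ///
    /// Supported languages are chunked along tree-sitter node boundaries
    /// (functions, types, modules, imports); anything else falls back to
    /// line-based chunking bounded by `min_chunk_size`/`max_chunk_size`.
    /// Usage sketch (marked `ignore`: it needs a tokio runtime):
    ///
    /// ```ignore
    /// let chunks = processor.chunk_semantic(source, Some(Language::Rust)).await?;
    /// for chunk in &chunks {
    ///     println!("{:?} lines {}..={}", chunk.chunk_type, chunk.start_line, chunk.end_line);
    /// }
    /// ```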
pub async fn chunk_semantic(
&self,
content: &str,
language: Option<Language>,
) -> Result<Vec<TextChunk>> {
let language = language.unwrap_or(Language::Other("text".to_string()));
let registry = self.language_registry.clone();
let content = content.to_string();
let config = self.config.clone();
tokio::task::spawn_blocking(move || {
let mut chunks = Vec::new();
// Try tree-sitter based chunking for supported languages
if let Some(mut parser) = registry.create_parser(&language) {
if let Some(tree) = parser.parse(&content, None) {
Self::extract_semantic_chunks_from_tree(
&tree,
&content,
&language,
&config,
&mut chunks,
);
return Ok(chunks);
}
}
// Fallback to text-based chunking
Self::extract_chunks_text_based(&content, &language, &config, &mut chunks);
Ok(chunks)
})
.await
.map_err(|e| CodeGraphError::Parse(e.to_string()))?
}
fn extract_semantic_chunks_from_tree(
tree: &Tree,
content: &str,
language: &Language,
config: &TextProcessorConfig,
chunks: &mut Vec<TextChunk>,
) {
let root = tree.root_node();
let mut cursor = root.walk();
let content_bytes = content.as_bytes();
// Track visited nodes to avoid duplicates
let mut processed_ranges = Vec::new();
        'outer: loop {
            let node = cursor.node();
            // Skip error nodes and already processed ranges, moving past the
            // whole subtree instead of revisiting the parent (which could
            // otherwise loop forever on a childless error node)
            if node.is_error()
                || Self::is_range_processed(&processed_ranges, node.start_byte(), node.end_byte())
            {
                while !cursor.goto_next_sibling() {
                    if !cursor.goto_parent() {
                        break 'outer;
                    }
                }
                continue;
            }
// Check if this is a semantically meaningful chunk
if Self::is_semantic_boundary(&node, language) {
let start_byte = node.start_byte();
let end_byte = node.end_byte();
let chunk_size = end_byte - start_byte;
// Only create chunks within size limits
if chunk_size >= config.min_chunk_size && chunk_size <= config.max_chunk_size {
if let Ok(chunk_content) =
std::str::from_utf8(&content_bytes[start_byte..end_byte])
{
let chunk_type = Self::determine_chunk_type(&node, language);
let semantic_level = Self::calculate_semantic_level(&node);
let start_position = node.start_position();
let end_position = node.end_position();
// Extract context if configured
let (context_before, context_after) = if config.preserve_semantic_boundaries
{
Self::extract_surrounding_context(
content, start_byte, end_byte, &config,
)
} else {
(None, None)
};
let chunk = TextChunk {
content: chunk_content.trim().to_string(),
start_byte,
end_byte,
start_line: start_position.row,
end_line: end_position.row,
language: Some(language.clone()),
chunk_type,
semantic_level,
context_before,
context_after,
hash: Self::compute_hash(chunk_content),
};
chunks.push(chunk);
processed_ranges.push((start_byte, end_byte));
}
} else if chunk_size > config.max_chunk_size {
// Split large chunks recursively
Self::split_large_chunk(
&node,
content,
language,
config,
chunks,
&mut processed_ranges,
);
}
}
// Navigate the tree
if cursor.goto_first_child() {
continue;
}
            while !cursor.goto_next_sibling() {
                if !cursor.goto_parent() {
                    // End of traversal: break (not return) so the sorting and
                    // overlap passes below still run
                    break 'outer;
                }
            }
}
// Sort chunks by position
chunks.sort_by_key(|c| c.start_byte);
// Apply overlap if configured
if config.overlap_size > 0 {
Self::apply_overlap(chunks, content, config.overlap_size);
}
}
fn extract_chunks_text_based(
content: &str,
language: &Language,
config: &TextProcessorConfig,
chunks: &mut Vec<TextChunk>,
) {
let lines: Vec<&str> = content.lines().collect();
let mut current_chunk = String::new();
let mut chunk_start_byte = 0;
let mut chunk_start_line = 0;
let mut current_byte_offset = 0;
for (line_idx, line) in lines.iter().enumerate() {
let line_with_newline = format!("{}\n", line);
let line_bytes = line_with_newline.as_bytes().len();
// Check if adding this line would exceed max size
let potential_size = current_chunk.len() + line_bytes;
if potential_size > config.max_chunk_size
&& current_chunk.len() >= config.min_chunk_size
{
// Create chunk from accumulated content
if !current_chunk.trim().is_empty() {
let chunk = TextChunk {
content: current_chunk.trim().to_string(),
start_byte: chunk_start_byte,
end_byte: current_byte_offset,
start_line: chunk_start_line,
end_line: line_idx.saturating_sub(1),
language: Some(language.clone()),
chunk_type: ChunkType::Text,
semantic_level: 1,
context_before: None,
context_after: None,
                        hash: Self::compute_hash(&current_chunk),
};
chunks.push(chunk);
}
// Start new chunk
current_chunk.clear();
chunk_start_byte = current_byte_offset;
chunk_start_line = line_idx;
}
current_chunk.push_str(&line_with_newline);
current_byte_offset += line_bytes;
}
// Add final chunk if it has content
if !current_chunk.trim().is_empty() && current_chunk.len() >= config.min_chunk_size {
let chunk = TextChunk {
content: current_chunk.trim().to_string(),
start_byte: chunk_start_byte,
end_byte: current_byte_offset,
start_line: chunk_start_line,
end_line: lines.len().saturating_sub(1),
language: Some(language.clone()),
chunk_type: ChunkType::Text,
semantic_level: 1,
context_before: None,
context_after: None,
                hash: Self::compute_hash(&current_chunk),
};
chunks.push(chunk);
}
}
fn is_semantic_boundary(node: &Node, language: &Language) -> bool {
match language {
Language::Rust => {
matches!(
node.kind(),
"function_item"
| "struct_item"
| "enum_item"
| "impl_item"
| "trait_item"
| "mod_item"
| "use_declaration"
| "const_item"
| "static_item"
)
}
Language::Python => {
matches!(
node.kind(),
"function_definition"
| "class_definition"
| "import_statement"
| "import_from_statement"
| "decorated_definition"
)
}
Language::JavaScript | Language::TypeScript => {
matches!(
node.kind(),
"function_declaration"
| "arrow_function"
| "class_declaration"
| "method_definition"
| "import_statement"
| "export_statement"
)
}
Language::Go => {
matches!(
node.kind(),
"function_declaration"
| "method_declaration"
| "type_declaration"
| "var_declaration"
| "const_declaration"
| "package_clause"
| "import_declaration"
)
}
Language::Java => {
matches!(
node.kind(),
"method_declaration"
| "class_declaration"
| "interface_declaration"
| "constructor_declaration"
| "import_declaration"
| "package_declaration"
)
}
Language::Cpp => {
matches!(
node.kind(),
"function_definition"
| "class_specifier"
| "struct_specifier"
| "namespace_definition"
| "preproc_include"
| "declaration"
)
}
_ => {
// For unknown languages, use structural indicators
node.child_count() > 2 && node.byte_range().len() > 50
}
}
}
fn determine_chunk_type(node: &Node, language: &Language) -> ChunkType {
let kind = node.kind();
match language {
Language::Rust => match kind {
"function_item" => ChunkType::Function,
"struct_item" | "enum_item" => ChunkType::Class,
"mod_item" => ChunkType::Module,
"use_declaration" => ChunkType::Import,
"line_comment" | "block_comment" => ChunkType::Comment,
_ => ChunkType::Code,
},
Language::Python => match kind {
"function_definition" => ChunkType::Function,
"class_definition" => ChunkType::Class,
"import_statement" | "import_from_statement" => ChunkType::Import,
"comment" => ChunkType::Comment,
_ => ChunkType::Code,
},
_ => {
if kind.contains("function") {
ChunkType::Function
} else if kind.contains("class") {
ChunkType::Class
} else if kind.contains("import") {
ChunkType::Import
} else if kind.contains("comment") {
ChunkType::Comment
} else {
ChunkType::Code
}
}
}
}
fn calculate_semantic_level(node: &Node) -> u8 {
let mut level = 0;
let mut current = node.parent();
while let Some(parent) = current {
level += 1;
current = parent.parent();
}
level.min(255) as u8
}
fn extract_surrounding_context(
content: &str,
start_byte: usize,
end_byte: usize,
config: &TextProcessorConfig,
) -> (Option<String>, Option<String>) {
let context_size = config.overlap_size;
let context_before = if start_byte > context_size {
let before_start = start_byte - context_size;
std::str::from_utf8(&content.as_bytes()[before_start..start_byte])
.ok()
.map(|s| s.to_string())
} else {
None
};
let context_after = if end_byte + context_size < content.len() {
let after_end = end_byte + context_size;
std::str::from_utf8(&content.as_bytes()[end_byte..after_end])
.ok()
.map(|s| s.to_string())
} else {
None
};
(context_before, context_after)
}
fn split_large_chunk(
node: &Node,
content: &str,
language: &Language,
config: &TextProcessorConfig,
chunks: &mut Vec<TextChunk>,
processed_ranges: &mut Vec<(usize, usize)>,
) {
// Recursively process child nodes for large chunks
let mut cursor = node.walk();
if cursor.goto_first_child() {
loop {
let child = cursor.node();
if Self::is_semantic_boundary(&child, language) {
let start_byte = child.start_byte();
let end_byte = child.end_byte();
let chunk_size = end_byte - start_byte;
if chunk_size >= config.min_chunk_size && chunk_size <= config.max_chunk_size {
if let Ok(chunk_content) =
std::str::from_utf8(&content.as_bytes()[start_byte..end_byte])
{
let chunk_type = Self::determine_chunk_type(&child, language);
let semantic_level = Self::calculate_semantic_level(&child);
let start_position = child.start_position();
let end_position = child.end_position();
let chunk = TextChunk {
content: chunk_content.trim().to_string(),
start_byte,
end_byte,
start_line: start_position.row,
end_line: end_position.row,
language: Some(language.clone()),
chunk_type,
semantic_level,
context_before: None,
context_after: None,
hash: Self::compute_hash(chunk_content),
};
chunks.push(chunk);
processed_ranges.push((start_byte, end_byte));
}
}
}
if !cursor.goto_next_sibling() {
break;
}
}
}
}
fn is_range_processed(processed_ranges: &[(usize, usize)], start: usize, end: usize) -> bool {
processed_ranges
.iter()
.any(|(s, e)| start >= *s && end <= *e)
}
fn apply_overlap(chunks: &mut Vec<TextChunk>, content: &str, overlap_size: usize) {
// Add overlapping content between adjacent chunks
for i in 1..chunks.len() {
let prev_end = chunks[i - 1].end_byte;
let curr_start = chunks[i].start_byte;
if curr_start > prev_end && curr_start - prev_end > overlap_size {
// Add overlap to previous chunk's end
if let Ok(overlap_content) =
std::str::from_utf8(&content.as_bytes()[prev_end..prev_end + overlap_size])
{
chunks[i - 1].content.push_str("\n...\n");
chunks[i - 1].content.push_str(overlap_content.trim());
}
// Add overlap to current chunk's beginning
if let Ok(overlap_content) =
std::str::from_utf8(&content.as_bytes()[curr_start - overlap_size..curr_start])
{
let mut new_content = String::new();
new_content.push_str(overlap_content.trim());
new_content.push_str("\n...\n");
new_content.push_str(&chunks[i].content);
chunks[i].content = new_content;
}
}
}
}
fn compute_hash(content: &str) -> String {
use sha2::{Digest, Sha256};
format!("{:x}", Sha256::digest(content.as_bytes()))
}
/// Context extraction algorithms for relevant embedding context
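    ///
    /// Relationships (calls, definitions, type references) are mined from a
    /// fresh parse of `full_content`; when no grammar is available, only the
    /// surrounding lines are collected. Usage sketch (marked `ignore`: it
    /// needs a tokio runtime):
    ///
    /// ```ignore
    /// let ctx = processor.extract_context(&chunk, source).await?;
    /// println!("importance: {:.2}", ctx.importance_score);
    /// ```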
pub async fn extract_context(
&self,
chunk: &TextChunk,
full_content: &str,
) -> Result<ContextExtraction> {
let language = chunk
.language
.clone()
.unwrap_or(Language::Other("text".to_string()));
let registry = self.language_registry.clone();
let chunk_content = chunk.content.clone();
let full_content = full_content.to_string();
let chunk_start = chunk.start_byte;
let chunk_end = chunk.end_byte;
tokio::task::spawn_blocking(move || {
let mut relationships = Vec::new();
let mut surrounding_context = Vec::new();
// Extract relationships based on tree-sitter analysis
if let Some(mut parser) = registry.create_parser(&language) {
if let Some(tree) = parser.parse(&full_content, None) {
Self::extract_semantic_relationships(
&tree,
&full_content,
chunk_start,
chunk_end,
&language,
&mut relationships,
);
}
}
// Extract surrounding context
Self::extract_surrounding_context_detailed(
&full_content,
chunk_start,
chunk_end,
&mut surrounding_context,
);
// Calculate importance score
let importance_score =
Self::calculate_importance_score(&chunk_content, &relationships, &language);
Ok(ContextExtraction {
primary_content: chunk_content,
surrounding_context,
semantic_relationships: relationships,
importance_score,
})
})
.await
.map_err(|e| CodeGraphError::Parse(e.to_string()))?
}
fn extract_semantic_relationships(
tree: &Tree,
content: &str,
chunk_start: usize,
chunk_end: usize,
language: &Language,
relationships: &mut Vec<TextSemanticRelationship>,
) {
let root = tree.root_node();
let content_bytes = content.as_bytes();
// Find the node containing our chunk
let chunk_node = Self::find_node_at_range(&root, chunk_start, chunk_end);
if let Some(node) = chunk_node {
// Find function calls within the chunk
Self::find_function_calls(&node, content_bytes, language, relationships);
// Find variable uses and definitions
Self::find_variable_relationships(&node, content_bytes, language, relationships);
// Find type relationships
Self::find_type_relationships(&node, content_bytes, language, relationships);
}
}
fn find_node_at_range<'a>(node: &Node<'a>, start: usize, end: usize) -> Option<Node<'a>> {
let node_start = node.start_byte();
let node_end = node.end_byte();
// Check if this node contains our range
if node_start <= start && node_end >= end {
// Check children for more specific match
let mut cursor = node.walk();
if cursor.goto_first_child() {
loop {
let child = cursor.node();
if let Some(child_match) = Self::find_node_at_range(&child, start, end) {
return Some(child_match);
}
if !cursor.goto_next_sibling() {
break;
}
}
}
Some(*node)
} else {
None
}
}
fn find_function_calls(
node: &Node,
content_bytes: &[u8],
language: &Language,
relationships: &mut Vec<TextSemanticRelationship>,
) {
let mut cursor = node.walk();
loop {
let current = cursor.node();
// Check for function call patterns based on language
let is_call = match language {
Language::Rust => matches!(current.kind(), "call_expression"),
Language::Python => matches!(current.kind(), "call"),
Language::JavaScript | Language::TypeScript => {
matches!(current.kind(), "call_expression")
}
Language::Go => matches!(current.kind(), "call_expression"),
Language::Java => matches!(current.kind(), "method_invocation"),
Language::Cpp => matches!(current.kind(), "call_expression"),
_ => current.kind().contains("call"),
};
if is_call {
if let Ok(call_text) =
std::str::from_utf8(&content_bytes[current.start_byte()..current.end_byte()])
{
relationships.push(TextSemanticRelationship {
relation_type: RelationType::Calls,
target_content: call_text.trim().to_string(),
confidence: 0.9,
});
}
}
// Navigate tree
if cursor.goto_first_child() {
continue;
}
while !cursor.goto_next_sibling() {
if !cursor.goto_parent() {
return;
}
}
}
}
fn find_variable_relationships(
node: &Node,
content_bytes: &[u8],
language: &Language,
relationships: &mut Vec<TextSemanticRelationship>,
) {
let mut cursor = node.walk();
loop {
let current = cursor.node();
// Check for variable usage patterns
let is_identifier = match language {
Language::Rust => matches!(current.kind(), "identifier"),
Language::Python => matches!(current.kind(), "identifier"),
Language::JavaScript | Language::TypeScript => {
matches!(current.kind(), "identifier")
}
Language::Go => matches!(current.kind(), "identifier"),
Language::Java => matches!(current.kind(), "identifier"),
Language::Cpp => matches!(current.kind(), "identifier"),
_ => current.kind() == "identifier",
};
if is_identifier {
if let Ok(id_text) =
std::str::from_utf8(&content_bytes[current.start_byte()..current.end_byte()])
{
// Determine if this is a definition or use based on parent context
if let Some(parent) = current.parent() {
let relation_type = match parent.kind() {
kind if kind.contains("declaration") || kind.contains("definition") => {
RelationType::Defines
}
_ => RelationType::Uses,
};
relationships.push(TextSemanticRelationship {
relation_type,
target_content: id_text.trim().to_string(),
confidence: 0.7,
});
}
}
}
// Navigate tree
if cursor.goto_first_child() {
continue;
}
while !cursor.goto_next_sibling() {
if !cursor.goto_parent() {
return;
}
}
}
}
fn find_type_relationships(
node: &Node,
content_bytes: &[u8],
language: &Language,
relationships: &mut Vec<TextSemanticRelationship>,
) {
let mut cursor = node.walk();
loop {
let current = cursor.node();
// Check for type relationships
let is_type_ref = match language {
Language::Rust => matches!(current.kind(), "type_identifier" | "generic_type"),
Language::Python => matches!(current.kind(), "type"),
Language::JavaScript | Language::TypeScript => {
matches!(current.kind(), "type_identifier")
}
Language::Go => matches!(current.kind(), "type_identifier"),
Language::Java => matches!(current.kind(), "type_identifier"),
Language::Cpp => matches!(current.kind(), "type_identifier"),
_ => current.kind().contains("type"),
};
if is_type_ref {
if let Ok(type_text) =
std::str::from_utf8(&content_bytes[current.start_byte()..current.end_byte()])
{
relationships.push(TextSemanticRelationship {
relation_type: RelationType::References,
target_content: type_text.trim().to_string(),
confidence: 0.8,
});
}
}
// Navigate tree
if cursor.goto_first_child() {
continue;
}
while !cursor.goto_next_sibling() {
if !cursor.goto_parent() {
return;
}
}
}
}
fn extract_surrounding_context_detailed(
content: &str,
chunk_start: usize,
chunk_end: usize,
context: &mut Vec<String>,
) {
let lines: Vec<&str> = content.lines().collect();
        let context_radius = 3; // Number of lines before and after
        // Add lines before (the chunk starts on line index `chunk_start_line`)
        let chunk_start_line = content[..chunk_start].lines().count();
        let start_line = chunk_start_line.saturating_sub(context_radius);
        for i in start_line..chunk_start_line {
            if i < lines.len() {
                context.push(format!("BEFORE: {}", lines[i]));
            }
        }
        // Add lines after (the chunk ends on line index `chunk_end_line - 1`,
        // so the following lines start at `chunk_end_line`)
        let chunk_end_line = content[..chunk_end].lines().count();
        let end_line = (chunk_end_line + context_radius).min(lines.len());
        for i in chunk_end_line..end_line {
            if i < lines.len() {
                context.push(format!("AFTER: {}", lines[i]));
            }
        }
}
fn calculate_importance_score(
content: &str,
relationships: &[TextSemanticRelationship],
language: &Language,
) -> f32 {
let mut score = 0.5; // Base score
// Boost score based on content characteristics
if content.contains("fn ") || content.contains("function ") || content.contains("def ") {
score += 0.3; // Functions are important
}
if content.contains("class ")
|| content.contains("struct ")
|| content.contains("interface ")
{
score += 0.4; // Type definitions are very important
}
if content.contains("pub ") || content.contains("public ") || content.contains("export ") {
score += 0.2; // Public APIs are important
}
// Boost based on number of relationships
score += (relationships.len() as f32 * 0.1).min(0.3);
// Language-specific adjustments
match language {
Language::Rust => {
if content.contains("unsafe") || content.contains("impl") {
score += 0.2;
}
}
Language::Python => {
if content.contains("@") || content.contains("__") {
score += 0.1;
}
}
_ => {}
}
score.min(1.0)
}
/// Deduplication strategies and text normalization
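    ///
    /// Chunks are normalized according to `normalization_level`, re-hashed,
    /// and filtered by content hash within the batch. Usage sketch (marked
    /// `ignore`: it needs a tokio runtime):
    ///
    /// ```ignore
    /// let unique = processor.deduplicate_and_normalize(chunks).await?;
    /// ```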
pub async fn deduplicate_and_normalize(
&self,
chunks: Vec<TextChunk>,
) -> Result<Vec<TextChunk>> {
if !self.config.enable_deduplication {
return Ok(chunks);
}
let config = self.config.clone();
tokio::task::spawn_blocking(move || {
let mut normalized_chunks = Vec::new();
let mut seen_hashes = std::collections::HashSet::new();
for mut chunk in chunks {
// Normalize content based on configuration
chunk.content = Self::normalize_content(
&chunk.content,
&chunk.language,
&config.normalization_level,
);
// Recompute hash after normalization
chunk.hash = Self::compute_hash(&chunk.content);
// Check for duplicates
if !seen_hashes.contains(&chunk.hash) {
seen_hashes.insert(chunk.hash.clone());
normalized_chunks.push(chunk);
}
}
Ok(normalized_chunks)
})
.await
.map_err(|e| CodeGraphError::Parse(e.to_string()))?
}
fn normalize_content(
content: &str,
language: &Option<Language>,
level: &NormalizationLevel,
) -> String {
match level {
NormalizationLevel::None => content.to_string(),
NormalizationLevel::Basic => {
// Basic whitespace normalization
content
.lines()
.map(|line| line.trim())
.filter(|line| !line.is_empty())
.collect::<Vec<_>>()
.join("\n")
}
NormalizationLevel::Standard => {
let mut normalized =
Self::normalize_content(content, language, &NormalizationLevel::Basic);
// Language-specific normalization
if let Some(lang) = language {
normalized = Self::apply_language_normalization(normalized, lang);
}
normalized
}
NormalizationLevel::Aggressive => {
let mut normalized =
Self::normalize_content(content, language, &NormalizationLevel::Standard);
// Remove comments
normalized = Self::remove_comments(&normalized, language);
// Normalize formatting
normalized = Self::normalize_formatting(&normalized);
normalized
}
}
}
fn apply_language_normalization(content: String, language: &Language) -> String {
match language {
Language::Rust => {
// Normalize Rust-specific patterns
content
.replace(" ", " ")
.replace("{ ", "{")
.replace(" }", "}")
}
Language::Python => {
// Python-specific normalization
content.replace(" ", "\t") // Convert spaces to tabs
}
_ => content,
}
}
fn remove_comments(content: &str, language: &Option<Language>) -> String {
if let Some(lang) = language {
match lang {
Language::Rust
| Language::Cpp
| Language::JavaScript
| Language::TypeScript
| Language::Go
| Language::Java => {
let re_line = Regex::new(r"//.*").unwrap();
let re_block = Regex::new(r"/\*[\s\S]*?\*/").unwrap();
let no_line = re_line.replace_all(content, "");
re_block.replace_all(&no_line, "").to_string()
}
Language::Python => {
let re = Regex::new(r"#.*").unwrap();
re.replace_all(content, "").to_string()
}
_ => content.to_string(),
}
} else {
content.to_string()
}
}
fn normalize_formatting(content: &str) -> String {
// Remove extra whitespace and normalize line breaks
let re = Regex::new(r"\s+").unwrap();
re.replace_all(content.trim(), " ").to_string()
}
/// Get processing statistics
pub fn get_statistics(&self) -> ProcessingStatistics {
// For now, return empty statistics
// In a real implementation, this would track actual processing metrics
ProcessingStatistics {
total_chunks: 0,
deduplicated_chunks: 0,
total_tokens: 0,
processing_time_ms: 0,
bytes_processed: 0,
language_distribution: HashMap::new(),
}
}
/// Clear deduplication cache
pub async fn clear_cache(&self) {
self.deduplication_cache.clear();
}
fn initialize_normalization_patterns(&self) {
// Initialize language-specific normalization patterns for preprocessing
for language in &[
Language::Rust,
Language::Python,
Language::JavaScript,
Language::TypeScript,
Language::Go,
Language::Java,
Language::Cpp,
] {
let patterns = match language {
Language::Rust => vec![
Regex::new(r"\s+").unwrap(), // Multiple whitespace
Regex::new(r"//.*").unwrap(), // Line comments
],
Language::Python => vec![
Regex::new(r"#.*").unwrap(), // Comments
Regex::new(r"\s+").unwrap(), // Multiple whitespace
],
_ => vec![
Regex::new(r"\s+").unwrap(), // Basic whitespace normalization
],
};
self.normalization_patterns
.insert(language.clone(), patterns);
}
}
/// Initialize keyword cache for fast keyword lookups
fn initialize_keyword_cache(&self) {
// Cache static keywords for each language to avoid repeated allocations
let rust_keywords: HashSet<&'static str> = vec![
"as", "break", "const", "continue", "crate", "else", "enum", "extern", "false", "fn",
"for", "if", "impl", "in", "let", "loop", "match", "mod", "move", "mut", "pub", "ref",
"return", "self", "Self", "static", "struct", "super", "trait", "true", "type",
"unsafe", "use", "where", "while", "async", "await", "dyn",
]
.into_iter()
.collect();
let python_keywords: HashSet<&'static str> = vec![
"and", "as", "assert", "break", "class", "continue", "def", "del", "elif", "else",
"except", "exec", "finally", "for", "from", "global", "if", "import", "in", "is",
"lambda", "not", "or", "pass", "print", "raise", "return", "try", "while", "with",
"yield", "async", "await",
]
.into_iter()
.collect();
let js_keywords: HashSet<&'static str> = vec![
"async",
"await",
"break",
"case",
"catch",
"class",
"const",
"continue",
"debugger",
"default",
"delete",
"do",
"else",
"export",
"extends",
"finally",
"for",
"function",
"if",
"import",
"in",
"instanceof",
"let",
"new",
"return",
"super",
"switch",
"this",
"throw",
"try",
"typeof",
"var",
"void",
"while",
"with",
"yield",
]
.into_iter()
.collect();
let go_keywords: HashSet<&'static str> = vec![
"break",
"case",
"chan",
"const",
"continue",
"default",
"defer",
"else",
"fallthrough",
"for",
"func",
"go",
"goto",
"if",
"import",
"interface",
"map",
"package",
"range",
"return",
"select",
"struct",
"switch",
"type",
"var",
]
.into_iter()
.collect();
let java_keywords: HashSet<&'static str> = vec![
"abstract",
"assert",
"boolean",
"break",
"byte",
"case",
"catch",
"char",
"class",
"const",
"continue",
"default",
"do",
"double",
"else",
"enum",
"extends",
"final",
"finally",
"float",
"for",
"goto",
"if",
"implements",
"import",
"instanceof",
"int",
"interface",
"long",
"native",
"new",
"package",
"private",
"protected",
"public",
"return",
"short",
"static",
"strictfp",
"super",
"switch",
"synchronized",
"this",
"throw",
"throws",
"transient",
"try",
"void",
"volatile",
"while",
]
.into_iter()
.collect();
let cpp_keywords: HashSet<&'static str> = vec![
"alignas",
"alignof",
"and",
"and_eq",
"asm",
"atomic_cancel",
"atomic_commit",
"atomic_noexcept",
"auto",
"bitand",
"bitor",
"bool",
"break",
"case",
"catch",
"char",
"char8_t",
"char16_t",
"char32_t",
"class",
"compl",
"concept",
"const",
"consteval",
"constexpr",
"constinit",
"const_cast",
"continue",
"co_await",
"co_return",
"co_yield",
"decltype",
"default",
"delete",
"do",
"double",
"dynamic_cast",
"else",
"enum",
"explicit",
"export",
"extern",
"false",
"float",
"for",
"friend",
"goto",
"if",
"inline",
"int",
"long",
"mutable",
"namespace",
"new",
"noexcept",
"not",
"not_eq",
"nullptr",
"operator",
"or",
"or_eq",
"private",
"protected",
"public",
"reflexpr",
"register",
"reinterpret_cast",
"requires",
"return",
"short",
"signed",
"sizeof",
"static",
"static_assert",
"static_cast",
"struct",
"switch",
"synchronized",
"template",
"this",
"thread_local",
"throw",
"true",
"try",
"typedef",
"typeid",
"typename",
"union",
"unsigned",
"using",
"virtual",
"void",
"volatile",
"wchar_t",
"while",
"xor",
"xor_eq",
]
.into_iter()
.collect();
// Cache all keywords
self.keyword_cache.insert(Language::Rust, rust_keywords);
self.keyword_cache.insert(Language::Python, python_keywords);
self.keyword_cache
.insert(Language::JavaScript, js_keywords.clone());
self.keyword_cache.insert(Language::TypeScript, js_keywords); // TypeScript shares JS keywords
self.keyword_cache.insert(Language::Go, go_keywords);
self.keyword_cache.insert(Language::Java, java_keywords);
self.keyword_cache.insert(Language::Cpp, cpp_keywords);
}
    fn compute_chunk_hash(&self, content: &str) -> String {
        // Delegate to the shared hashing helper so chunk hashes stay consistent
        Self::compute_hash(content)
    }
fn determine_semantic_level(&self, node: Node, language: &Language) -> u8 {
// Determine semantic level based on tree-sitter node depth and type
let mut level = 0;
let mut current = Some(node);
while let Some(n) = current {
level += 1;
current = n.parent();
}
// Adjust level based on node type importance
match node.kind() {
"function_item" | "function_definition" | "function_declaration" => level + 2,
"struct_item" | "class_definition" | "class_declaration" => level + 3,
"impl_item" | "interface_declaration" => level + 2,
"mod_item" | "module" => level + 1,
_ => level,
}
.min(255) as u8
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_text_processor_creation() {
let config = TextProcessorConfig::default();
let processor = TextProcessor::new(config);
// Basic creation test
assert!(processor.deduplication_cache.is_empty());
}
#[tokio::test]
async fn test_language_aware_tokenization() {
let processor = TextProcessor::new(TextProcessorConfig::default());
let rust_code = "fn main() { println!(\"Hello, world!\"); }";
        let tokens = processor
            .tokenize_language_aware(rust_code, Some(Language::Rust))
            .await;
        assert!(tokens.is_ok());
}
#[tokio::test]
async fn test_semantic_chunking() {
let processor = TextProcessor::new(TextProcessorConfig::default());
let code = "
struct User {
name: String,
age: u32,
}
impl User {
fn new(name: String, age: u32) -> Self {
Self { name, age }
}
}";
        let chunks = processor.chunk_semantic(code, Some(Language::Rust)).await;
        assert!(chunks.is_ok());
}
#[tokio::test]
async fn test_context_extraction() {
let processor = TextProcessor::new(TextProcessorConfig::default());
let chunk = TextChunk {
content: "fn new(name: String, age: u32) -> Self".to_string(),
start_byte: 0,
end_byte: 36,
start_line: 1,
end_line: 1,
language: Some(Language::Rust),
chunk_type: ChunkType::Function,
semantic_level: 2,
context_before: None,
context_after: None,
hash: "test_hash".to_string(),
};
let full_content =
"impl User { fn new(name: String, age: u32) -> Self { Self { name, age } } }";
        let context = processor.extract_context(&chunk, full_content).await;
        assert!(context.is_ok());
}
#[tokio::test]
async fn test_deduplication() {
let processor = TextProcessor::new(TextProcessorConfig::default());
let chunks = vec![
TextChunk {
content: "println!(\"test\");".to_string(),
start_byte: 0,
end_byte: 17,
start_line: 1,
end_line: 1,
language: Some(Language::Rust),
chunk_type: ChunkType::Code,
semantic_level: 1,
context_before: None,
context_after: None,
hash: processor.compute_chunk_hash("println!(\"test\");"),
},
TextChunk {
content: "println!(\"test\");".to_string(), // Duplicate
start_byte: 18,
end_byte: 35,
start_line: 2,
end_line: 2,
language: Some(Language::Rust),
chunk_type: ChunkType::Code,
semantic_level: 1,
context_before: None,
context_after: None,
hash: processor.compute_chunk_hash("println!(\"test\");"),
},
];
        let deduplicated = processor
            .deduplicate_and_normalize(chunks)
            .await
            .expect("deduplication should succeed");
        assert_eq!(deduplicated.len(), 1, "identical chunks should collapse to one");
}
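    // A runnable sketch of the line-based fallback chunker, called directly
    // since private associated functions are visible to this test module.
    // It checks that no emitted chunk exceeds the configured maximum size.
    #[test]
    fn test_text_based_chunking_respects_max_size() {
        let config = TextProcessorConfig::default();
        // 100 lines of 33 bytes each, well past the default max_chunk_size
        let content = "line of text that pads the chunk\n".repeat(100);
        let mut chunks = Vec::new();
        TextProcessor::extract_chunks_text_based(
            &content,
            &Language::Other("text".to_string()),
            &config,
            &mut chunks,
        );
        assert!(!chunks.is_empty());
        assert!(chunks
            .iter()
            .all(|c| c.content.len() <= config.max_chunk_size));
    }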
#[tokio::test]
async fn test_configuration_options() {
let config = TextProcessorConfig {
max_chunk_size: 500,
min_chunk_size: 50,
overlap_size: 25,
preserve_semantic_boundaries: true,
enable_deduplication: false,
normalization_level: NormalizationLevel::Aggressive,
};
let processor = TextProcessor::new(config.clone());
assert_eq!(processor.config.max_chunk_size, 500);
assert_eq!(
processor.config.normalization_level,
NormalizationLevel::Aggressive
);
}
#[test]
fn test_chunk_hash_computation() {
let processor = TextProcessor::new(TextProcessorConfig::default());
let hash1 = processor.compute_chunk_hash("test content");
let hash2 = processor.compute_chunk_hash("test content");
let hash3 = processor.compute_chunk_hash("different content");
assert_eq!(hash1, hash2);
assert_ne!(hash1, hash3);
}
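    // A small sketch of NormalizationLevel::Basic: lines are trimmed and
    // blank lines dropped, so formatting-only variants hash identically.
    #[test]
    fn test_basic_normalization() {
        let input = "  fn main() {  \n\n    }  ";
        let normalized = TextProcessor::normalize_content(
            input,
            &Some(Language::Rust),
            &NormalizationLevel::Basic,
        );
        assert_eq!(normalized, "fn main() {\n}");
    }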
#[test]
fn test_chunk_type_detection() {
// Test chunk type classification logic
assert_eq!(ChunkType::Function, ChunkType::Function);
assert_ne!(ChunkType::Function, ChunkType::Class);
}
#[test]
fn test_token_type_classification() {
// Test token type classification
assert_eq!(TokenType::Identifier, TokenType::Identifier);
assert_ne!(TokenType::Keyword, TokenType::Identifier);
}
#[test]
fn test_normalization_levels() {
assert_ne!(NormalizationLevel::None, NormalizationLevel::Basic);
assert_ne!(NormalizationLevel::Standard, NormalizationLevel::Aggressive);
}
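    // A runnable sketch of the regex fallback tokenizer: identifiers that hit
    // the keyword cache are reclassified as keywords and numbers are detected.
    // The caches here are built locally rather than via TextProcessor::new.
    #[test]
    fn test_regex_fallback_tokenization() {
        let regex_cache: DashMap<String, Regex> = DashMap::new();
        let keyword_cache: DashMap<Language, HashSet<&'static str>> = DashMap::new();
        keyword_cache.insert(Language::Rust, ["fn", "let"].into_iter().collect());
        let mut tokens = Vec::new();
        TextProcessor::extract_tokens_regex_optimized(
            "let x = 42;",
            &Language::Rust,
            &mut tokens,
            &regex_cache,
            &keyword_cache,
        );
        assert!(tokens
            .iter()
            .any(|t| t.token_type == TokenType::Keyword && t.content == "let"));
        assert!(tokens
            .iter()
            .any(|t| t.token_type == TokenType::Number && t.content == "42"));
    }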
}