Skip to main content
Glama

CodeGraph CLI MCP Server

by Jakedismo
pattern_matcher.rs9.36 kB
// ABOUTME: Aho-Corasick pattern matcher for fast multi-pattern code analysis // ABOUTME: Provides sub-microsecond pattern matching without training requirements use aho_corasick::{AhoCorasick, AhoCorasickBuilder}; use codegraph_core::{EdgeRelationship, EdgeType, ExtractionResult}; use std::collections::HashMap; use tracing::debug; /// Fast pattern matcher using Aho-Corasick automaton (50-500ns per search) pub struct PatternMatcher { /// Compiled pattern automaton automaton: AhoCorasick, /// Pattern metadata (pattern index → (pattern name, edge type)) patterns: Vec<(String, EdgeType)>, } impl PatternMatcher { /// Create pattern matcher from common code patterns pub fn new() -> Self { // Common code patterns across languages // Language-scoped patterns to avoid useless matches let pattern_configs = vec![ // Rust patterns ("rust:std::", EdgeType::Uses), ("rust:derive(", EdgeType::Uses), ("rust:impl ", EdgeType::Implements), ("rust:trait ", EdgeType::Defines), ("rust:async fn", EdgeType::Defines), ("rust:pub fn", EdgeType::Defines), ("rust:use ", EdgeType::Uses), // TypeScript / JavaScript ("ts:import ", EdgeType::Uses), ("ts:export ", EdgeType::Defines), ("ts:async ", EdgeType::Defines), ("ts:interface ", EdgeType::Defines), ("ts:class ", EdgeType::Defines), ("ts:extends ", EdgeType::Extends), ("ts:implements ", EdgeType::Implements), ("js:import ", EdgeType::Uses), ("js:export ", EdgeType::Defines), ("js:class ", EdgeType::Defines), ("js:extends ", EdgeType::Extends), // Python ("py:from ", EdgeType::Uses), ("py:import ", EdgeType::Uses), ("py:class ", EdgeType::Defines), ("py:def ", EdgeType::Defines), ("py:async def", EdgeType::Defines), // Go ("go:package ", EdgeType::Defines), ("go:import ", EdgeType::Uses), ("go:func ", EdgeType::Defines), ("go:type ", EdgeType::Defines), ("go:interface ", EdgeType::Defines), ]; let patterns: Vec<String> = pattern_configs.iter().map(|(p, _)| p.to_string()).collect(); let pattern_metadata: Vec<(String, EdgeType)> = pattern_configs .iter() .map(|(p, e)| (p.to_string(), e.clone())) .collect(); let automaton = AhoCorasickBuilder::new() .match_kind(aho_corasick::MatchKind::LeftmostLongest) .build(&patterns) .expect("Failed to build Aho-Corasick automaton"); debug!( "Initialized PatternMatcher with {} patterns", patterns.len() ); Self { automaton, patterns: pattern_metadata, } } /// Enhance extraction result with pattern-based edges AND node enrichment (50-500ns per file) pub fn enhance_extraction( &self, mut result: ExtractionResult, content: &str, ) -> ExtractionResult { // Prefix content with language marker to gate matches (e.g., "rust:", "py:") let mut gated_content = String::with_capacity(content.len() + 4); let lang_prefix = match result.nodes.first().and_then(|n| n.language.as_ref()) { Some(codegraph_core::Language::Rust) => "rust:", Some(codegraph_core::Language::TypeScript) => "ts:", Some(codegraph_core::Language::JavaScript) => "js:", Some(codegraph_core::Language::Python) => "py:", Some(codegraph_core::Language::Go) => "go:", _ => "", }; gated_content.push_str(lang_prefix); gated_content.push_str(content); // Find all pattern matches in content (SIMD-accelerated, sub-microsecond) let matches: Vec<_> = self.automaton.find_iter(&gated_content).collect(); if matches.is_empty() { return result; } // Collect pattern statistics for node enrichment let mut new_edges = Vec::new(); let mut pattern_counts: HashMap<usize, usize> = HashMap::new(); let mut pattern_names = Vec::new(); for m in &matches { let pattern_idx = m.pattern().as_usize(); *pattern_counts.entry(pattern_idx).or_insert(0) += 1; // Track pattern names for node metadata enrichment if let Some((pattern_name, _)) = self.patterns.get(pattern_idx) { if !pattern_names.contains(pattern_name) { pattern_names.push(pattern_name.clone()); } } } // Pick a representative node (file/module node, else longest node content) let representative_node = result .nodes .iter() .find(|n| { matches!(n.node_type, Some(codegraph_core::NodeType::Module)) || n.node_type.is_none() }) .or_else(|| { result .nodes .iter() .max_by_key(|n| n.content.as_ref().map(|c| c.len()).unwrap_or(0)) }) .cloned(); // Generate edges based on pattern frequency (top-k per file, capped per pattern) if let Some(rep) = representative_node { // Sort patterns by count desc let mut freq: Vec<(usize, usize)> = pattern_counts.into_iter().collect(); freq.sort_by(|a, b| b.1.cmp(&a.1)); let top_k = 5usize; let max_edges_per_file = 25usize; let max_per_pattern = 5usize; let mut edges_added = 0usize; for (pattern_idx, count) in freq.into_iter().take(top_k) { if edges_added >= max_edges_per_file { break; } if let Some((pattern_name, edge_type)) = self.patterns.get(pattern_idx) { let edge_reps = count.min(max_per_pattern); let mut metadata = HashMap::new(); metadata.insert("pattern".to_string(), pattern_name.clone()); metadata.insert("pattern_count".to_string(), count.to_string()); metadata.insert( "fast_ml_enhancement".to_string(), "pattern_match".to_string(), ); for _ in 0..edge_reps { if edges_added >= max_edges_per_file { break; } new_edges.push(EdgeRelationship { from: rep.id, to: pattern_name.clone(), edge_type: edge_type.clone(), metadata: metadata.clone(), span: None, }); edges_added += 1; } } } } let enhancement_count = new_edges.len(); if enhancement_count > 0 { debug!( "⚡ PatternMatcher: Added {} pattern-based edges (found {} total matches)", enhancement_count, matches.len() ); result.edges.extend(new_edges); } // Enrich file-level nodes with pattern context for better embeddings // This helps SOTA embedding models understand the code's characteristics if !pattern_names.is_empty() && !result.nodes.is_empty() { // Find file-level or module-level nodes to enrich for node in &mut result.nodes { // Enrich nodes that represent the file or module scope if matches!(node.node_type, Some(codegraph_core::NodeType::Module)) || node.node_type.is_none() { // Add pattern context to metadata node.metadata .attributes .insert("fast_ml_patterns".to_string(), pattern_names.join(", ")); node.metadata.attributes.insert( "fast_ml_pattern_count".to_string(), matches.len().to_string(), ); break; // Only enrich the first file-level node } } } result } /// Get pattern statistics pub fn pattern_count(&self) -> usize { self.patterns.len() } } impl Default for PatternMatcher { fn default() -> Self { Self::new() } } #[cfg(test)] mod tests { use super::*; use codegraph_core::CodeNode; #[test] fn test_pattern_matching_speed() { let matcher = PatternMatcher::new(); let content = r#" use std::collections::HashMap; pub fn test() {} impl MyTrait for Foo {} "#; let result = ExtractionResult { nodes: vec![CodeNode::new_test()], edges: vec![], }; let enhanced = matcher.enhance_extraction(result, content); assert!(enhanced.edges.len() > 0); } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Jakedismo/codegraph-rust'

If you have feedback or need assistance with the MCP directory API, please join our Discord server