processor.rs•14.1 kB
use super::{CodeLanguage, ASTFeatures, ControlFlowGraph};
use crate::embedding::EmbeddingError;
use std::collections::HashMap;
use std::path::Path;
use serde::{Serialize, Deserialize};
#[derive(Debug, Clone)]
pub struct CodeProcessor {
parsers: HashMap<CodeLanguage, Box<dyn LanguageParser + Send + Sync>>,
ast_extractor: ASTFeatureExtractor,
normalizer: CodeNormalizer,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeInput {
pub source: String,
pub language: CodeLanguage,
pub ast_features: Option<ASTFeatures>,
pub control_flow: Option<ControlFlowGraph>,
pub metadata: CodeMetadata,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeMetadata {
pub file_path: Option<String>,
pub function_name: Option<String>,
pub class_name: Option<String>,
pub line_count: usize,
pub char_count: usize,
pub tokens: Vec<String>,
pub imports: Vec<String>,
pub doc_comments: Vec<String>,
}
pub trait LanguageParser {
fn parse(&self, source: &str) -> Result<ParsedCode, EmbeddingError>;
fn extract_features(&self, parsed: &ParsedCode) -> ASTFeatures;
fn build_control_flow(&self, parsed: &ParsedCode) -> ControlFlowGraph;
}
#[derive(Debug, Clone)]
pub struct ParsedCode {
pub ast: Vec<ASTNode>,
pub symbols: Vec<Symbol>,
pub comments: Vec<Comment>,
pub imports: Vec<Import>,
}
#[derive(Debug, Clone)]
pub struct ASTNode {
pub id: usize,
pub node_type: String,
pub children: Vec<usize>,
pub start_byte: usize,
pub end_byte: usize,
pub text: String,
}
#[derive(Debug, Clone)]
pub struct Symbol {
pub name: String,
pub symbol_type: SymbolType,
pub scope: String,
pub location: (usize, usize),
}
#[derive(Debug, Clone)]
pub enum SymbolType {
Function,
Class,
Variable,
Constant,
Module,
Interface,
}
#[derive(Debug, Clone)]
pub struct Comment {
pub text: String,
pub comment_type: CommentType,
pub location: (usize, usize),
}
#[derive(Debug, Clone)]
pub enum CommentType {
Line,
Block,
Documentation,
}
#[derive(Debug, Clone)]
pub struct Import {
pub module: String,
pub items: Vec<String>,
pub alias: Option<String>,
}
impl CodeProcessor {
pub fn new() -> Self {
let mut parsers: HashMap<CodeLanguage, Box<dyn LanguageParser + Send + Sync>> = HashMap::new();
// Register language parsers
parsers.insert(CodeLanguage::Rust, Box::new(RustParser::new()));
parsers.insert(CodeLanguage::Python, Box::new(PythonParser::new()));
parsers.insert(CodeLanguage::JavaScript, Box::new(JavaScriptParser::new()));
parsers.insert(CodeLanguage::TypeScript, Box::new(TypeScriptParser::new()));
parsers.insert(CodeLanguage::Java, Box::new(JavaParser::new()));
parsers.insert(CodeLanguage::Go, Box::new(GoParser::new()));
Self {
parsers,
ast_extractor: ASTFeatureExtractor::new(),
normalizer: CodeNormalizer::new(),
}
}
pub fn process_code(&self, source: &str, language: CodeLanguage) -> Result<CodeInput, EmbeddingError> {
// Normalize source code
let normalized_source = self.normalizer.normalize(source, language);
// Extract basic metadata
let metadata = self.extract_metadata(&normalized_source, language, None);
// Try to parse with specific language parser
let (ast_features, control_flow) = if let Some(parser) = self.parsers.get(&language) {
match parser.parse(&normalized_source) {
Ok(parsed) => {
let features = parser.extract_features(&parsed);
let cfg = parser.build_control_flow(&parsed);
(Some(features), Some(cfg))
}
Err(_) => (None, None), // Fall back to basic processing
}
} else {
(None, None)
};
Ok(CodeInput {
source: normalized_source,
language,
ast_features,
control_flow,
metadata,
})
}
pub fn process_file(&self, file_path: &Path) -> Result<CodeInput, EmbeddingError> {
let source = std::fs::read_to_string(file_path)
.map_err(|e| EmbeddingError::InferenceError(format!("Failed to read file: {}", e)))?;
let language = CodeLanguage::from_extension(
file_path.extension()
.and_then(|ext| ext.to_str())
.unwrap_or("")
);
let mut input = self.process_code(&source, language)?;
input.metadata.file_path = Some(file_path.to_string_lossy().to_string());
Ok(input)
}
fn extract_metadata(&self, source: &str, language: CodeLanguage, file_path: Option<&str>) -> CodeMetadata {
let lines: Vec<&str> = source.lines().collect();
let tokens = self.simple_tokenize(source, language);
let imports = self.extract_imports(source, language);
let doc_comments = self.extract_doc_comments(source, language);
CodeMetadata {
file_path: file_path.map(|p| p.to_string()),
function_name: None,
class_name: None,
line_count: lines.len(),
char_count: source.chars().count(),
tokens,
imports,
doc_comments,
}
}
fn simple_tokenize(&self, source: &str, language: CodeLanguage) -> Vec<String> {
// Simple tokenization - can be enhanced with proper lexers
let separators = match language {
CodeLanguage::Rust | CodeLanguage::Cpp | CodeLanguage::Java | CodeLanguage::Csharp => {
&[' ', '\t', '\n', '(', ')', '{', '}', '[', ']', ';', ',', '.', ':', '=', '+', '-', '*', '/', '&', '|'][..]
}
CodeLanguage::Python => {
&[' ', '\t', '\n', '(', ')', '[', ']', ':', ',', '.', '=', '+', '-', '*', '/', '#'][..]
}
_ => &[' ', '\t', '\n', '(', ')', '{', '}', '[', ']'][..],
};
source
.split(separators)
.filter(|token| !token.is_empty())
.map(|token| token.to_string())
.collect()
}
fn extract_imports(&self, source: &str, language: CodeLanguage) -> Vec<String> {
let import_patterns = match language {
CodeLanguage::Rust => vec![r"use\s+([^;]+);", r"extern\s+crate\s+(\w+)"],
CodeLanguage::Python => vec![r"^import\s+(.+)$", r"^from\s+(.+)\s+import"],
CodeLanguage::JavaScript | CodeLanguage::TypeScript => {
vec['\"]", r"^const\s+.+\s+=\s+require\(['\"](.+)['\"]\)"]
}
CodeLanguage::Java => vec![r"^import\s+(.+);"],
CodeLanguage::Go => vec![r#"^import\s+"(.+)""#, r"^import\s+\("],
_ => vec![],
};
let mut imports = Vec::new();
for line in source.lines() {
for pattern in &import_patterns {
if let Ok(regex) = regex::Regex::new(pattern) {
if let Some(captures) = regex.captures(line.trim()) {
if let Some(import) = captures.get(1) {
imports.push(import.as_str().to_string());
}
}
}
}
}
imports
}
fn extract_doc_comments(&self, source: &str, language: CodeLanguage) -> Vec<String> {
let comment_patterns = match language {
CodeLanguage::Rust => vec![r"///\s*(.+)", r"//!\s*(.+)", r"/\*\*(.+?)\*/"],
CodeLanguage::Python => vec![r'"""(.+?)"""', r"'''(.+?)'''", r"#\s*(.+)"],
CodeLanguage::JavaScript | CodeLanguage::TypeScript => {
vec![r"/\*\*(.+?)\*/", r"//\s*(.+)"]
}
CodeLanguage::Java => vec![r"/\*\*(.+?)\*/", r"//\s*(.+)"],
CodeLanguage::Go => vec![r"//\s*(.+)", r"/\*(.+?)\*/"],
_ => vec![],
};
let mut comments = Vec::new();
for line in source.lines() {
for pattern in &comment_patterns {
if let Ok(regex) = regex::Regex::new(pattern) {
if let Some(captures) = regex.captures(line.trim()) {
if let Some(comment) = captures.get(1) {
comments.push(comment.as_str().trim().to_string());
}
}
}
}
}
comments
}
}
pub struct ASTFeatureExtractor {
// Feature extraction logic
}
impl ASTFeatureExtractor {
pub fn new() -> Self {
Self {}
}
pub fn extract(&self, ast: &[ASTNode]) -> ASTFeatures {
let mut features = ASTFeatures::default();
features.depth = self.calculate_max_depth(ast);
features.complexity_score = self.calculate_complexity(ast);
features.function_count = ast.iter().filter(|n| n.node_type == "function").count();
features.class_count = ast.iter().filter(|n| n.node_type == "class").count();
features.node_types = ast.iter().map(|n| n.node_type.clone()).collect();
features
}
fn calculate_max_depth(&self, ast: &[ASTNode]) -> usize {
// Simplified depth calculation
ast.len() / 10 + 1
}
fn calculate_complexity(&self, ast: &[ASTNode]) -> f32 {
// Simplified complexity score
let branch_nodes = ast.iter()
.filter(|n| matches!(n.node_type.as_str(), "if" | "while" | "for" | "match" | "switch"))
.count();
1.0 + branch_nodes as f32 * 0.5
}
}
pub struct CodeNormalizer;
impl CodeNormalizer {
pub fn new() -> Self {
Self
}
pub fn normalize(&self, source: &str, language: CodeLanguage) -> String {
let mut normalized = source.to_string();
// Remove excessive whitespace
normalized = regex::Regex::new(r"\s+").unwrap()
.replace_all(&normalized, " ").to_string();
// Language-specific normalization
match language {
CodeLanguage::Python => self.normalize_python(&normalized),
CodeLanguage::JavaScript | CodeLanguage::TypeScript => self.normalize_js(&normalized),
_ => normalized,
}
}
fn normalize_python(&self, source: &str) -> String {
// Remove docstrings for normalization
regex::Regex::new(r#""""[\s\S]*?""""#).unwrap()
.replace_all(source, "").to_string()
}
fn normalize_js(&self, source: &str) -> String {
// Remove comments
regex::Regex::new(r"//.*$").unwrap()
.replace_all(source, "").to_string()
}
}
// Language-specific parsers (simplified implementations)
struct RustParser;
struct PythonParser;
struct JavaScriptParser;
struct TypeScriptParser;
struct JavaParser;
struct GoParser;
impl RustParser {
fn new() -> Self { Self }
}
impl LanguageParser for RustParser {
fn parse(&self, source: &str) -> Result<ParsedCode, EmbeddingError> {
// Simplified parsing - would use tree-sitter-rust in real implementation
Ok(ParsedCode {
ast: vec![],
symbols: vec![],
comments: vec![],
imports: vec![],
})
}
fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures {
ASTFeatures::default()
}
fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph {
ControlFlowGraph::default()
}
}
// Similar implementations for other language parsers...
impl PythonParser { fn new() -> Self { Self } }
impl JavaScriptParser { fn new() -> Self { Self } }
impl TypeScriptParser { fn new() -> Self { Self } }
impl JavaParser { fn new() -> Self { Self } }
impl GoParser { fn new() -> Self { Self } }
impl LanguageParser for PythonParser {
fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> {
Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] })
}
fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() }
fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() }
}
impl LanguageParser for JavaScriptParser {
fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> {
Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] })
}
fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() }
fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() }
}
impl LanguageParser for TypeScriptParser {
fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> {
Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] })
}
fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() }
fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() }
}
impl LanguageParser for JavaParser {
fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> {
Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] })
}
fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() }
fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() }
}
impl LanguageParser for GoParser {
fn parse(&self, _source: &str) -> Result<ParsedCode, EmbeddingError> {
Ok(ParsedCode { ast: vec![], symbols: vec![], comments: vec![], imports: vec![] })
}
fn extract_features(&self, _parsed: &ParsedCode) -> ASTFeatures { ASTFeatures::default() }
fn build_control_flow(&self, _parsed: &ParsedCode) -> ControlFlowGraph { ControlFlowGraph::default() }
}
impl Default for CodeProcessor {
fn default() -> Self {
Self::new()
}
}