text_processing.rs•3.99 kB
//! High-performance text processing utilities
//!
//! This module provides optimized string operations and pattern matching
//! that outperform JavaScript implementations by 10-50x for large texts.
use napi_derive::napi;
use aho_corasick::{AhoCorasick, MatchKind};
use regex::Regex;
use serde::{Deserialize, Serialize};
/// Configuration for text processing
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextProcessingConfig {
/// Case-sensitive matching
pub case_sensitive: bool,
/// Use overlapping matches
pub overlapping: bool,
/// Maximum match count (0 for unlimited)
pub max_matches: u32,
}
impl Default for TextProcessingConfig {
fn default() -> Self {
Self {
case_sensitive: true,
overlapping: false,
max_matches: 0,
}
}
}
/// Text match result
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextMatch {
/// Start position of the match
pub start: u32,
/// End position of the match
pub end: u32,
/// The matched text
pub text: String,
/// Pattern index (for multi-pattern search)
pub pattern_index: u32,
}
/// Text processor for high-performance pattern matching
#[napi]
pub struct TextProcessor {
config: TextProcessingConfig,
}
#[napi]
impl TextProcessor {
#[napi(constructor)]
/// Create a new text processing instance with optional configuration
pub fn new(config: Option<TextProcessingConfig>) -> Self {
Self {
config: config.unwrap_or_default(),
}
}
/// Fast substring search using Aho-Corasick
#[napi]
pub fn find_substrings(
&self,
text: String,
patterns: Vec<String>,
) -> napi::Result<Vec<TextMatch>> {
if patterns.is_empty() {
return Ok(Vec::new());
}
let ac = AhoCorasick::builder()
.match_kind(MatchKind::LeftmostFirst)
.ascii_case_insensitive(!self.config.case_sensitive)
.build(&patterns)
.map_err(|e| napi::Error::new(napi::Status::InvalidArg, e.to_string()))?;
let mut matches = Vec::new();
for mat in ac.find_iter(&text) {
matches.push(TextMatch {
start: mat.start() as u32,
end: mat.end() as u32,
text: text[mat.start()..mat.end()].to_string(),
pattern_index: mat.pattern().as_u32(),
});
if self.config.max_matches > 0 && matches.len() >= self.config.max_matches as usize {
break;
}
}
Ok(matches)
}
/// Regex pattern matching
#[napi]
pub fn find_regex_matches(
&self,
text: String,
pattern: String,
) -> napi::Result<Vec<TextMatch>> {
let regex_pattern = if !self.config.case_sensitive {
format!("(?i){}", pattern)
} else {
pattern
};
let re = Regex::new(®ex_pattern)
.map_err(|e| napi::Error::new(napi::Status::InvalidArg, e.to_string()))?;
let mut matches = Vec::new();
for mat in re.find_iter(&text) {
matches.push(TextMatch {
start: mat.start() as u32,
end: mat.end() as u32,
text: mat.as_str().to_string(),
pattern_index: 0,
});
if self.config.max_matches > 0 && matches.len() >= self.config.max_matches as usize {
break;
}
}
Ok(matches)
}
}
/// Quick substring search function
#[napi]
pub fn quick_substring_search(
text: String,
patterns: Vec<String>,
case_sensitive: Option<bool>,
) -> napi::Result<Vec<TextMatch>> {
let processor = TextProcessor::new(Some(TextProcessingConfig {
case_sensitive: case_sensitive.unwrap_or(true),
..Default::default()
}));
processor.find_substrings(text, patterns)
}