Pomera AI Commander

optimized_pattern_engine.py•12.9 KiB

""" Optimized Pattern Engine for fast text analysis with minimal regex usage. Provides specialized algorithms optimized for different text sizes with Unicode awareness. """ import re import unicodedata from typing import Dict, Optional, Tuple, List from dataclasses import dataclass from functools import lru_cache @dataclass class TextStructure: """Detailed text structure analysis.""" char_count: int = 0 word_count: int = 0 sentence_count: int = 0 line_count: int = 0 paragraph_count: int = 0 whitespace_count: int = 0 punctuation_count: int = 0 # Performance metadata processing_method: str = "fast" # fast, standard, regex processing_time_ms: float = 0.0 class OptimizedPatternEngine: """ High-performance pattern engine that minimizes regex usage. Uses string-based counting methods where possible and compiled regex patterns with caching. """ # Sentence ending punctuation SENTENCE_ENDINGS = frozenset('.!?') # Common whitespace characters WHITESPACE_CHARS = frozenset(' \t\n\r\f\v') # Word boundary characters (optimized set) WORD_BOUNDARIES = frozenset(' \t\n\r\f\v.,;:!?()[]{}"\'-—–') def __init__(self): """Initialize the optimized pattern engine with compiled regex patterns.""" # Compiled regex patterns (cached for performance) self._word_pattern = re.compile(r'\b\w+\b', re.UNICODE) self._sentence_pattern = re.compile(r'[.!?]+(?:\s|$)', re.UNICODE) self._paragraph_pattern = re.compile(r'\n\s*\n', re.UNICODE) self._whitespace_pattern = re.compile(r'\s+', re.UNICODE) # Complex sentence pattern for edge cases self._complex_sentence_pattern = re.compile( r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$', re.UNICODE ) # Pattern cache for dynamic patterns self._pattern_cache: Dict[str, re.Pattern] = {} self._cache_max_size = 50 def count_words_fast(self, text: str) -> int: """ Fast word counting using string-based methods. Optimized for performance with minimal regex usage. 
Args: text: Text to analyze Returns: Word count """ if not text: return 0 # For small text, use simple split (fastest) if len(text) < 500: return self._count_words_simple(text) # For all other sizes, use regex (fastest overall) return len(self._word_pattern.findall(text)) def _count_words_simple(self, text: str) -> int: """Simple word counting for small text.""" # Split on whitespace and filter empty strings return len([word for word in text.split() if word]) def _count_words_optimized(self, text: str) -> int: """ Optimized word counting using character scanning. Handles Unicode word boundaries correctly. """ word_count = 0 in_word = False for char in text: if char in self.WORD_BOUNDARIES or char.isspace(): if in_word: word_count += 1 in_word = False elif char.isalnum() or char == '_': in_word = True # Count last word if text ends with a word character if in_word: word_count += 1 return word_count def count_sentences_fast(self, text: str) -> int: """ Fast sentence counting with minimal regex usage. Args: text: Text to analyze Returns: Sentence count """ if not text: return 0 # Use compiled regex for all sizes (fastest and most accurate) return len(self._sentence_pattern.findall(text)) def _count_sentences_simple(self, text: str) -> int: """ Simple sentence counting using character scanning. Looks for sentence-ending punctuation followed by whitespace or end of text. 
""" sentence_count = 0 text_len = len(text) i = 0 while i < text_len: char = text[i] # Check for sentence ending punctuation if char in self.SENTENCE_ENDINGS: # Look ahead to confirm it's a sentence boundary # (followed by whitespace, uppercase, or end of text) if i + 1 >= text_len: sentence_count += 1 break next_char = text[i + 1] if next_char.isspace(): sentence_count += 1 # Skip consecutive punctuation while i + 1 < text_len and text[i + 1] in self.SENTENCE_ENDINGS: i += 1 i += 1 return max(sentence_count, 1 if text.strip() else 0) def count_lines_fast(self, text: str) -> int: """ Fast line counting using string method. Args: text: Text to analyze Returns: Line count """ if not text: return 0 # Simple and fast: count newlines and add 1 line_count = text.count('\n') + 1 # Adjust if text ends with newline if text.endswith('\n'): line_count -= 1 return max(line_count, 1 if text.strip() else 0) def count_paragraphs_fast(self, text: str) -> int: """ Fast paragraph counting. Args: text: Text to analyze Returns: Paragraph count """ if not text.strip(): return 0 # For small text, use simple method if len(text) < 5000: return self._count_paragraphs_simple(text) # For larger text, use regex paragraphs = self._paragraph_pattern.split(text) return len([p for p in paragraphs if p.strip()]) def _count_paragraphs_simple(self, text: str) -> int: """Simple paragraph counting by looking for blank lines.""" lines = text.split('\n') paragraph_count = 0 in_paragraph = False for line in lines: if line.strip(): if not in_paragraph: paragraph_count += 1 in_paragraph = True else: in_paragraph = False return paragraph_count def count_characters_unicode_aware(self, text: str) -> Tuple[int, int]: """ Count characters with Unicode awareness. 
Args: text: Text to analyze Returns: Tuple of (character_count, byte_count) """ if not text: return (0, 0) # Character count (Unicode code points) char_count = len(text) # Byte count (UTF-8 encoding) byte_count = len(text.encode('utf-8')) return (char_count, byte_count) def analyze_text_structure(self, text: str) -> TextStructure: """ Comprehensive text structure analysis using optimized methods. Args: text: Text to analyze Returns: TextStructure with detailed analysis """ import time start_time = time.time() structure = TextStructure() if not text: return structure # Determine processing method based on text size text_size = len(text) if text_size < 1000: structure.processing_method = "fast" elif text_size < 50000: structure.processing_method = "standard" else: structure.processing_method = "regex" # Character counts char_count, byte_count = self.count_characters_unicode_aware(text) structure.char_count = byte_count # Use byte count for consistency # Line count (always fast) structure.line_count = self.count_lines_fast(text) # Word count (optimized based on size) structure.word_count = self.count_words_fast(text) # Sentence count (optimized based on size) structure.sentence_count = self.count_sentences_fast(text) # Paragraph count (optimized based on size) structure.paragraph_count = self.count_paragraphs_fast(text) # Whitespace count (fast string method) structure.whitespace_count = sum(1 for c in text if c.isspace()) # Punctuation count (fast character check) structure.punctuation_count = sum( 1 for c in text if unicodedata.category(c).startswith('P') ) # Record processing time structure.processing_time_ms = (time.time() - start_time) * 1000 return structure @lru_cache(maxsize=100) def get_compiled_pattern(self, pattern: str, flags: int = 0) -> re.Pattern: """ Get a compiled regex pattern with caching. 
Args: pattern: Regex pattern string flags: Regex flags Returns: Compiled regex pattern """ cache_key = f"{pattern}_{flags}" if cache_key not in self._pattern_cache: # Compile and cache the pattern compiled = re.compile(pattern, flags) # Manage cache size if len(self._pattern_cache) >= self._cache_max_size: # Remove oldest entry (simple FIFO) oldest_key = next(iter(self._pattern_cache)) del self._pattern_cache[oldest_key] self._pattern_cache[cache_key] = compiled return self._pattern_cache[cache_key] def find_all_optimized(self, pattern: str, text: str, flags: int = 0) -> List[str]: """ Find all matches using cached compiled pattern. Args: pattern: Regex pattern text: Text to search flags: Regex flags Returns: List of matches """ compiled_pattern = self.get_compiled_pattern(pattern, flags) return compiled_pattern.findall(text) def count_pattern_optimized(self, pattern: str, text: str, flags: int = 0) -> int: """ Count pattern matches using cached compiled pattern. Args: pattern: Regex pattern text: Text to search flags: Regex flags Returns: Match count """ matches = self.find_all_optimized(pattern, text, flags) return len(matches) def is_unicode_text(self, text: str) -> bool: """ Check if text contains non-ASCII Unicode characters. Args: text: Text to check Returns: True if text contains Unicode characters beyond ASCII """ if not text: return False # Fast check: if all characters are ASCII, no Unicode try: text.encode('ascii') return False except UnicodeEncodeError: return True def normalize_unicode(self, text: str, form: str = 'NFC') -> str: """ Normalize Unicode text for consistent processing. Args: text: Text to normalize form: Normalization form (NFC, NFD, NFKC, NFKD) Returns: Normalized text """ return unicodedata.normalize(form, text) def clear_pattern_cache(self): """Clear the pattern cache.""" self._pattern_cache.clear() # Clear LRU cache self.get_compiled_pattern.cache_clear() def get_cache_info(self) -> Dict[str, int]: """ Get pattern cache information. 
Returns: Dictionary with cache statistics """ lru_info = self.get_compiled_pattern.cache_info() return { 'pattern_cache_size': len(self._pattern_cache), 'pattern_cache_max_size': self._cache_max_size, 'lru_cache_hits': lru_info.hits, 'lru_cache_misses': lru_info.misses, 'lru_cache_size': lru_info.currsize, 'lru_cache_max_size': lru_info.maxsize } # Global instance _global_pattern_engine: Optional[OptimizedPatternEngine] = None def get_pattern_engine() -> OptimizedPatternEngine: """ Get the global optimized pattern engine instance. Returns: Global OptimizedPatternEngine instance """ global _global_pattern_engine if _global_pattern_engine is None: _global_pattern_engine = OptimizedPatternEngine() return _global_pattern_engine

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/matbanik/Pomera-AI-Commander'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

optimized_pattern_engine.py•12.9 KiB