MCP Code Indexer

  • mcp_code_indexer
""" 代码压缩器模块 提供代码压缩、规范化和简化功能 """ import re from typing import Dict, List, Set, Optional, Tuple import logging from dataclasses import dataclass from enum import Enum logger = logging.getLogger(__name__) class NormalizationLevel(Enum): """规范化级别""" MINIMAL = "minimal" # 最小化改动 NORMAL = "normal" # 标准规范化 AGGRESSIVE = "aggressive" # 激进规范化 @dataclass class CompressionOptions: """压缩选项""" remove_comments: bool = True remove_empty_lines: bool = True normalize_whitespace: bool = True normalize_names: bool = False combine_imports: bool = True remove_unused: bool = True minify_strings: bool = False normalize_level: NormalizationLevel = NormalizationLevel.NORMAL class CodeCompressor: """代码压缩器类""" def __init__(self): """初始化代码压缩器""" self.name_mapping: Dict[str, str] = {} self.used_names: Set[str] = set() self.preserved_names: Set[str] = { 'self', 'cls', 'super', 'None', 'True', 'False', '__init__', '__main__', '__name__', '__file__' } def compress(self, content: str, language: str, options: Optional[CompressionOptions] = None) -> str: """ 压缩代码 Args: content: 代码内容 language: 编程语言 options: 压缩选项 Returns: 压缩后的代码 """ if options is None: options = CompressionOptions() # 保存重要注释 preserved_comments = self._extract_important_comments(content) # 移除注释和空行 if options.remove_comments: content = self._remove_comments(content, language) if options.remove_empty_lines: content = self._remove_empty_lines(content) # 规范化代码 if options.normalize_whitespace: content = self._normalize_whitespace(content) if options.normalize_names: content = self._normalize_names(content, language) # 合并导入语句 if options.combine_imports: content = self._combine_imports(content, language) # 移除未使用的代码 if options.remove_unused: content = self._remove_unused_code(content, language) # 最小化字符串 if options.minify_strings: content = self._minify_strings(content, language) # 恢复重要注释 content = self._restore_important_comments(content, preserved_comments) return content def normalize(self, content: str, language: str, level: NormalizationLevel = NormalizationLevel.NORMAL) -> str: """ 规范化代码 Args: content: 代码内容 language: 编程语言 level: 规范化级别 Returns: 规范化后的代码 """ # 基于级别选择规范化选项 options = CompressionOptions( remove_comments=level == NormalizationLevel.AGGRESSIVE, remove_empty_lines=level != NormalizationLevel.MINIMAL, normalize_whitespace=True, normalize_names=level == NormalizationLevel.AGGRESSIVE, combine_imports=level != NormalizationLevel.MINIMAL, remove_unused=level == NormalizationLevel.AGGRESSIVE, minify_strings=False, normalize_level=level ) return self.compress(content, language, options) def _extract_important_comments(self, content: str) -> List[Tuple[int, str]]: """提取重要注释""" important_patterns = [ r'#\s*TODO', r'#\s*FIXME', r'#\s*NOTE', r'#\s*IMPORTANT', r'""".*?TODO.*?"""', r"'''.*?TODO.*?'''", ] important_comments = [] lines = content.split('\n') for i, line in enumerate(lines): for pattern in important_patterns: if re.search(pattern, line, re.IGNORECASE): important_comments.append((i + 1, line)) break return important_comments def _remove_comments(self, content: str, language: str) -> str: """移除注释""" if language in ['python']: # 移除单行注释 content = re.sub(r'#.*$', '', content, flags=re.MULTILINE) # 移除多行注释 content = re.sub(r'""".*?"""', '', content, flags=re.DOTALL) content = re.sub(r"'''.*?'''", '', content, flags=re.DOTALL) elif language in ['javascript', 'typescript']: # 移除单行注释 content = re.sub(r'//.*$', '', content, flags=re.MULTILINE) # 移除多行注释 content = re.sub(r'/\*.*?\*/', '', content, flags=re.DOTALL) return content def _remove_empty_lines(self, content: 
    def _remove_empty_lines(self, content: str) -> str:
        """Remove empty lines."""
        lines = content.split('\n')
        non_empty_lines = [line for line in lines if line.strip()]
        return '\n'.join(non_empty_lines)

    def _normalize_whitespace(self, content: str) -> str:
        """Normalize whitespace."""
        lines = content.split('\n')
        normalized_lines = []
        for line in lines:
            # Convert tabs to spaces and strip trailing whitespace
            line = line.replace('\t', '    ')
            line = line.rstrip()
            # Record the indentation before interior whitespace is collapsed
            indent = len(line) - len(line.lstrip())
            body = line.lstrip()
            # Ensure operators are surrounded by spaces
            body = re.sub(r'([=+\-*/<>!]+)', r' \1 ', body)
            # Collapse runs of whitespace
            body = re.sub(r'\s+', ' ', body).strip()
            # Re-apply the original indentation
            normalized_lines.append(' ' * indent + body)
        return '\n'.join(normalized_lines)

    def _normalize_names(self, content: str, language: str) -> str:
        """Normalize (shorten) identifier names."""
        # Match identifiers
        if language in ['python']:
            pattern = r'\b[a-zA-Z_]\w*\b'
        else:
            pattern = r'\b[a-zA-Z_$]\w*\b'

        def replace_name(match):
            name = match.group(0)
            if name in self.preserved_names:
                return name
            if name not in self.name_mapping:
                new_name = self._generate_name(len(self.name_mapping))
                self.name_mapping[name] = new_name
            return self.name_mapping[name]

        return re.sub(pattern, replace_name, content)

    def _generate_name(self, index: int) -> str:
        """Generate a short name (bijective base-26: a, b, ..., z, aa, ab, ...)."""
        chars = 'abcdefghijklmnopqrstuvwxyz'
        base = len(chars)
        if index < base:
            return chars[index]
        name = ''
        while index >= 0:
            name = chars[index % base] + name
            index = index // base - 1
        return name

    def _combine_imports(self, content: str, language: str) -> str:
        """Combine import statements."""
        if language == 'python':
            # Collect all import statements
            import_pattern = (
                r'^(?:from\s+[\w.]+\s+)?import\s+'
                r'(?:[\w.]+(?:\s+as\s+\w+)?(?:\s*,\s*[\w.]+(?:\s+as\s+\w+)?)*)'
            )
            imports = re.finditer(import_pattern, content, re.MULTILINE)

            # Group imports by module
            grouped_imports = {}
            for match in imports:
                import_stmt = match.group(0)
                if import_stmt.startswith('from'):
                    module = re.match(r'from\s+([\w.]+)', import_stmt).group(1)
                    if module not in grouped_imports:
                        grouped_imports[module] = []
                    grouped_imports[module].append(import_stmt)
                else:
                    if 'direct' not in grouped_imports:
                        grouped_imports['direct'] = []
                    grouped_imports['direct'].append(import_stmt)

            # Merge statements that import from the same module
            new_imports = []
            for module, stmts in grouped_imports.items():
                if module == 'direct':
                    new_imports.extend(stmts)
                else:
                    imported_names = []
                    for stmt in stmts:
                        imported_names.extend(re.findall(
                            r'import\s+((?:[\w.]+(?:\s+as\s+\w+)?(?:\s*,\s*)?)+)', stmt))
                    new_imports.append(f"from {module} import {', '.join(imported_names)}")

            # Drop the original import statements and prepend the merged ones
            content = re.sub(import_pattern + r'\n?', '', content, flags=re.MULTILINE)
            return '\n'.join(new_imports) + '\n\n' + content.lstrip()
        return content

    def _remove_unused_code(self, content: str, language: str) -> str:
        """Remove unused code."""
        # TODO: implement detection and removal of unused code
        return content

    def _minify_strings(self, content: str, language: str) -> str:
        """Minify string literals."""
        def shorten_string(match):
            string = match.group(0)
            # Keep the string's quote character
            quote = string[0]
            # Collapse whitespace inside the string
            body = string[1:-1]
            body = re.sub(r'\s+', ' ', body)
            return quote + body + quote

        # Handle double- and single-quoted strings
        content = re.sub(r'"[^"\\]*(?:\\.[^"\\]*)*"', shorten_string, content)
        content = re.sub(r"'[^'\\]*(?:\\.[^'\\]*)*'", shorten_string, content)
        return content

    def _restore_important_comments(self, content: str,
                                    comments: List[Tuple[int, str]]) -> str:
        """Restore the important comments extracted before compression."""
        if not comments:
            return content
        lines = content.split('\n')
        # Insert comments in descending line-number order so earlier
        # insertions do not shift the positions of later ones
        for line_num, comment in sorted(comments, reverse=True):
            if line_num <= len(lines):
                lines.insert(line_num - 1, comment)
            else:
                lines.append(comment)
        return '\n'.join(lines)
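
A minimal usage sketch of the compressor above. The import path (mcp_code_indexer.code_compressor) and the sample snippet are illustrative assumptions, not confirmed parts of the package; adjust the import to wherever CodeCompressor is actually defined.

# Usage sketch -- the import path below is an assumption, not a confirmed API.
from mcp_code_indexer.code_compressor import (  # hypothetical module path
    CodeCompressor,
    CompressionOptions,
)

SAMPLE = '''\
import os
import sys

# TODO: handle missing paths
def read_file(path):
    """Read a file and return its text."""
    with open(path) as handle:
        return handle.read()
'''

compressor = CodeCompressor()

# Default options: strip comments and blank lines, merge imports, keep names.
compact = compressor.compress(SAMPLE, language="python")

# Custom options: additionally shorten identifier names (a, b, c, ...).
renamed = compressor.compress(
    SAMPLE,
    language="python",
    options=CompressionOptions(normalize_names=True),
)

print(compact)
print(renamed)

Note that the TODO comment survives both runs because compress() extracts important comments first and re-inserts them at the end of the pipeline.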
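
The normalize() entry point maps a NormalizationLevel onto a CompressionOptions preset and delegates to compress(). A short sketch of the three levels follows; the input snippet is made up for illustration and the import path is again a hypothetical assumption.

from mcp_code_indexer.code_compressor import (  # hypothetical module path
    CodeCompressor,
    NormalizationLevel,
)

snippet = "x=1\t+ 2\n\n\ny =  x*3   # temporary value\n"
compressor = CodeCompressor()

# MINIMAL: whitespace normalization only; comments and blank lines survive.
print(compressor.normalize(snippet, "python", NormalizationLevel.MINIMAL))

# NORMAL (the default): also drops blank lines and merges imports.
print(compressor.normalize(snippet, "python"))

# AGGRESSIVE: additionally strips comments, renames identifiers, and calls
# the (currently no-op) unused-code removal pass.
print(compressor.normalize(snippet, "python", NormalizationLevel.AGGRESSIVE))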