FS-MCP Server

text_detector.py•11.4 KiB

"""
Smart text detector: decide whether a file is a text file.
"""

import codecs
import logging
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, Optional, Union

try:
    import magic
    HAS_MAGIC = True
except ImportError:
    HAS_MAGIC = False

from .config_manager import get_config_manager


class TextDetector:
    """Heuristic detector that classifies files as text or binary.

    Detection combines a configured list of binary extensions, MIME type
    lookup (python-magic when importable, otherwise ``mimetypes``) and an
    inspection of the first ``sample_bytes`` bytes of the file.
    """

    def __init__(self):
        """Load detection settings from the shared config manager."""
        current_config_manager = get_config_manager()
        self.config = current_config_manager.get_text_detection_config()

        # Number of leading bytes read from a file for content analysis.
        self.sample_bytes = self.config.get("sample_bytes", 8192)
        # Maximum tolerated share of control characters in decoded text.
        self.max_binary_ratio = self.config.get("max_binary_ratio", 0.1)
        # Whether MIME detection may short-circuit the content analysis.
        self.use_mime_detection = self.config.get("use_mime_detection", True)

        # Extensions that are known to be binary formats.
        self.binary_extensions = set(current_config_manager.get_binary_extensions())

        # MIME prefixes that are treated as text.
        self.text_mime_types = {
            'text/',
            'application/json',
            'application/xml',
            'application/javascript',
            'application/x-sh',
            'application/x-csh',
            'application/x-python-code'
        }

    def is_text_file(self, file_path: Union[str, Path]) -> bool:
        """
        Decide whether a file is a text file.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            bool: True if the file is considered text, False otherwise
            (including when the path is missing or is not a regular file).
        """
        file_path = Path(file_path)

        # 1./2. Only regular files qualify. This rejects missing paths and
        # directories, and also FIFOs, sockets and device nodes, which report
        # a size of 0 and would otherwise be mistaken for empty text files.
        if not file_path.is_file():
            return False

        # 3. A known binary extension is decisive.
        if file_path.suffix.lower() in self.binary_extensions:
            return False

        # 4. An empty file is treated as text.
        try:
            if file_path.stat().st_size == 0:
                return True
        except OSError:
            return False

        # 5. A positive MIME match is decisive; a negative or unknown result
        # falls through to the content analysis below.
        if self.use_mime_detection and self._is_text_by_mime(file_path):
            return True

        # 6. Inspect the file content.
        return self._is_text_by_content(file_path)

    def _is_text_by_mime(self, file_path: Path) -> Optional[bool]:
        """
        Classify a file by its MIME type.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            Optional[bool]: True/False when a MIME type was obtained, None when
            no MIME type could be determined or detection raised.
        """
        try:
            # Prefer python-magic when it is importable.
            if HAS_MAGIC:
                mime_type = magic.from_file(str(file_path), mime=True)
                return self._is_text_mime_type(mime_type)

            # Fall back to the standard library's mimetypes lookup.
            mime_type, _ = mimetypes.guess_type(str(file_path))
            if mime_type:
                return self._is_text_mime_type(mime_type)

        except Exception as e:
            # Best effort: a failing MIME lookup must not abort detection.
            logging.debug(f"MIME类型检测失败: {e}")

        return None

    def _is_text_mime_type(self, mime_type: str) -> bool:
        """
        Check whether a MIME type is one of the known text types.

        Args:
            mime_type: MIME type string; may be empty or None.

        Returns:
            bool: True if the lower-cased MIME type starts with any entry of
            ``self.text_mime_types``.
        """
        if not mime_type:
            return False

        # str.startswith accepts a tuple of candidate prefixes.
        return mime_type.lower().startswith(tuple(self.text_mime_types))

    def _is_text_by_content(self, file_path: Path) -> bool:
        """
        Classify a file by inspecting its leading bytes.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            bool: True if the sample looks like text (an empty sample counts
            as text), False if it looks binary or the file cannot be read.
        """
        try:
            with open(file_path, 'rb') as f:
                # Read only the beginning of the file.
                sample = f.read(self.sample_bytes)

            if not sample:
                return True  # An empty file is treated as text.

            return self._analyze_bytes(sample)

        except OSError as e:
            logging.debug(f"读取文件内容失败: {e}")
            return False

    def _try_decode(self, data: bytes, encoding: str) -> Optional[str]:
        """
        Strictly decode a sample, tolerating a character cut at the sample end.

        A sample that fills the whole ``sample_bytes`` window may end in the
        middle of a multi-byte character. An incremental decoder with
        ``final=False`` keeps such an incomplete tail pending instead of
        raising, while invalid bytes elsewhere still raise. Shorter samples
        are decoded with ``final=True`` so a dangling tail is still an error.

        Args:
            data: Raw bytes read from the start of a file.
            encoding: Name of the codec to try.

        Returns:
            Optional[str]: The decoded text, or None if ``data`` is not valid
            in ``encoding``.
        """
        sample_may_be_cut = len(data) >= self.sample_bytes
        decoder = codecs.getincrementaldecoder(encoding)()
        try:
            return decoder.decode(data, final=not sample_may_be_cut)
        except UnicodeDecodeError:
            return None

    def _analyze_bytes(self, data: bytes) -> bool:
        """
        Decide whether a byte sample is text.

        Args:
            data: Raw bytes to analyze.

        Returns:
            bool: True if the bytes look like text.
        """
        if not data:
            return True

        # A NUL byte strongly suggests a binary file.
        if b'\x00' in data:
            return False

        # Try UTF-8 first; on success judge the decoded characters.
        text = self._try_decode(data, 'utf-8')
        if text is not None:
            return self._analyze_text_content(text)

        # Try other common encodings.
        # NOTE(review): 'latin1' maps every byte value, so it always succeeds;
        # 'cp1252' and the byte-pattern fallback below are only reached if
        # this list is changed.
        for encoding in ['gbk', 'gb2312', 'latin1', 'cp1252']:
            text = self._try_decode(data, encoding)
            if text is not None:
                return self._analyze_text_content(text)

        # Nothing decoded: fall back to raw byte statistics.
        return self._analyze_byte_patterns(data)

    def _analyze_text_content(self, text: str) -> bool:
        """
        Judge decoded text by its share of control characters.

        Args:
            text: Decoded text.

        Returns:
            bool: True if the text is empty or the ratio of control characters
            (code point below 32, excluding tab, CR and LF) does not exceed
            ``self.max_binary_ratio``.
        """
        total_chars = len(text)
        if total_chars == 0:
            return True

        control_chars = sum(
            1 for char in text if ord(char) < 32 and char not in '\t\r\n'
        )

        # Too many control characters suggests a binary file.
        return control_chars / total_chars <= self.max_binary_ratio

    def _analyze_byte_patterns(self, data: bytes) -> bool:
        """
        Judge raw bytes by the share of printable ASCII and high bytes.

        Args:
            data: Raw bytes to analyze.

        Returns:
            bool: True if the data is empty, if at least 70% of the bytes are
            printable ASCII (32-126), or if at least 30% are printable ASCII
            and at least one byte is >= 128.
        """
        total_bytes = len(data)
        if total_bytes == 0:
            return True

        ascii_chars = sum(1 for byte in data if 32 <= byte <= 126)
        high_bytes = sum(1 for byte in data if byte >= 128)
        ascii_ratio = ascii_chars / total_bytes

        # Mostly printable ASCII: very likely text.
        if ascii_ratio >= 0.7:
            return True

        # High bytes mixed with a fair share of ASCII: possibly text in a
        # multi-byte or 8-bit encoding.
        if high_bytes > 0 and ascii_ratio >= 0.3:
            return True

        return False

    def get_file_encoding(self, file_path: Union[str, Path]) -> Optional[str]:
        """
        Try to detect the encoding of a file from its leading bytes.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            Optional[str]: 'utf-8-sig' for UTF-8 with a BOM, 'utf-8' for UTF-8
            without one, otherwise the first fallback encoding that decodes
            the sample. None if the path is not a regular file, the file is
            empty, or it cannot be read.
        """
        file_path = Path(file_path)
        if not file_path.is_file():
            return None

        try:
            with open(file_path, 'rb') as f:
                sample = f.read(self.sample_bytes)
        except OSError:
            return None

        if not sample:
            return None  # An empty file carries no encoding information.

        # UTF-8: 'utf-8-sig' decodes input with or without a BOM, so the BOM
        # has to be checked explicitly to tell the two apart.
        if self._try_decode(sample, 'utf-8') is not None:
            return 'utf-8-sig' if sample.startswith(codecs.BOM_UTF8) else 'utf-8'

        # Simple heuristic over other common encodings.
        # TODO: a dedicated detection library such as chardet would be more
        # accurate, at the cost of an extra dependency.
        common_encodings = ['gbk', 'gb2312', 'latin1', 'cp1252', 'iso-8859-1']
        for enc in common_encodings:
            if self._try_decode(sample, enc) is not None:
                return enc

        # Ask python-magic for a charset, if it is available.
        # NOTE(review): 'latin1' above accepts every byte sequence, so this
        # branch is not reached as the list stands - confirm intended order.
        if HAS_MAGIC:
            try:
                mime_info = magic.from_file(str(file_path))
                if 'charset=' in mime_info:
                    encoding = mime_info.split('charset=')[-1].split(';')[0].strip()
                    # magic may report non-encodings such as "binary".
                    if encoding.lower() not in ['binary', 'unknown-8bit']:
                        return encoding.lower()
            except Exception as e:
                # Best effort: the magic library may fail on some files.
                logging.debug("magic charset detection failed: %s", e)

        return None

    def get_detection_info(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Collect detailed detection information about a path.

        Args:
            file_path: Path to inspect.

        Returns:
            Dict[str, Any]: Dictionary with the keys 'path', 'exists',
            'is_file', 'is_dir', 'size', 'mime_type', 'encoding' and
            'is_text'. For a missing path or a non-file, 'is_text' is False
            and 'size', 'mime_type' and 'encoding' stay None.
        """
        file_path_obj = Path(file_path)
        info = {
            'path': str(file_path_obj),
            'exists': file_path_obj.exists(),
            'is_file': file_path_obj.is_file(),
            'is_dir': file_path_obj.is_dir(),
            'size': None,
            'mime_type': None,
            'encoding': None,
            'is_text': None,
        }

        if not info['exists'] or not info['is_file']:
            info['is_text'] = False  # Missing or not a file: not text.
            return info

        # The file may disappear or become unreadable after the checks above;
        # leave 'size' as None in that case instead of raising.
        try:
            info['size'] = file_path_obj.stat().st_size
        except OSError as e:
            logging.debug("stat failed for %s: %s", file_path_obj, e)

        # MIME type: python-magic first, then the extension-based guess.
        if HAS_MAGIC:
            try:
                info['mime_type'] = magic.from_file(str(file_path_obj), mime=True)
            except Exception as e:
                # Best effort: the magic library may fail on some files.
                logging.debug("magic MIME detection failed: %s", e)

        if not info['mime_type']:
            guessed_mime, _ = mimetypes.guess_type(str(file_path_obj))
            info['mime_type'] = guessed_mime

        # Encoding and final text/binary verdict.
        info['encoding'] = self.get_file_encoding(file_path_obj)
        info['is_text'] = self.is_text_file(file_path_obj)

        return info


# Storage for the lazily created singleton instance.
_text_detector_instance = None


def get_text_detector() -> TextDetector:
    """
    Return the shared TextDetector instance, creating it on first call.
    """
    global _text_detector_instance
    if _text_detector_instance is None:
        _text_detector_instance = TextDetector()
    return _text_detector_instance

# The old eagerly created module-level instance was removed; use
# get_text_detector() instead.

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/boleyn/fs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

text_detector.py•11.4 KiB