Local Search MCP Server

document_analyzer.py•8.67 KiB

""" Document Analyzer Module for Local Search MCP Provides quality scoring, language detection, structure analysis, and issue detection. """ import re from typing import Dict, List, Optional, Tuple from dataclasses import dataclass from enum import Enum import langdetect from langdetect import DetectorFactory # Set seed for consistent language detection DetectorFactory.seed = 0 class DocumentType(Enum): """Document type classification.""" MARKDOWN = "markdown" CODE_PYTHON = "code_python" CODE_JAVASCRIPT = "code_javascript" CODE_TYPESCRIPT = "code_typescript" PLAIN_TEXT = "plain_text" UNKNOWN = "unknown" @dataclass class DocumentAnalysis: """Results of document analysis.""" quality_score: float # 0-1 scale language: str # ISO language code (e.g., "en", "ja") language_confidence: float # 0-1 scale document_type: DocumentType char_count: int word_count: int line_count: int avg_line_length: float issues: List[str] # List of detected issues recommendations: List[str] # List of recommendations class DocumentAnalyzer: """ Analyzes documents for quality, language, structure, and potential issues. """ # Quality scoring thresholds MIN_CONTENT_LENGTH = 50 IDEAL_AVG_LINE_LENGTH = 80 MAX_AVG_LINE_LENGTH = 200 def analyze(self, text: str, file_path: Optional[str] = None) -> DocumentAnalysis: """ Perform comprehensive document analysis. 
Args: text: Document text content file_path: Optional file path for type detection Returns: DocumentAnalysis object with all analysis results """ # Basic metrics char_count = len(text) lines = text.split('\n') line_count = len(lines) word_count = len(text.split()) avg_line_length = char_count / line_count if line_count > 0 else 0 # Document type detection doc_type = self._detect_document_type(text, file_path) # Language detection language, lang_confidence = self._detect_language(text) # Issue detection issues = self._detect_issues(text, char_count, word_count, avg_line_length) # Quality scoring quality_score = self._calculate_quality_score( text, char_count, word_count, avg_line_length, issues ) # Generate recommendations recommendations = self._generate_recommendations( quality_score, issues, doc_type, language ) return DocumentAnalysis( quality_score=quality_score, language=language, language_confidence=lang_confidence, document_type=doc_type, char_count=char_count, word_count=word_count, line_count=line_count, avg_line_length=avg_line_length, issues=issues, recommendations=recommendations ) def _detect_document_type( self, text: str, file_path: Optional[str] = None ) -> DocumentType: """Detect document type from content and file extension.""" if file_path: ext = file_path.lower().split('.')[-1] if ext in ['md', 'markdown']: return DocumentType.MARKDOWN elif ext == 'py': return DocumentType.CODE_PYTHON elif ext == 'js': return DocumentType.CODE_JAVASCRIPT elif ext == 'ts': return DocumentType.CODE_TYPESCRIPT # Content-based detection if re.search(r'^#{1,6}\s+', text, re.MULTILINE): return DocumentType.MARKDOWN elif re.search(r'\bdef\s+\w+\s*\(|\bclass\s+\w+\s*[:(]', text): return DocumentType.CODE_PYTHON elif re.search(r'\bfunction\s+\w+\s*\(|\bconst\s+\w+\s*=', text): return DocumentType.CODE_JAVASCRIPT return DocumentType.PLAIN_TEXT def _detect_language(self, text: str) -> Tuple[str, float]: """ Detect language of the text. 
Returns: Tuple of (language_code, confidence) """ # Remove code blocks and URLs for better detection clean_text = re.sub(r'```[\s\S]*?```', '', text) clean_text = re.sub(r'http[s]?://\S+', '', clean_text) clean_text = re.sub(r'`[^`]+`', '', clean_text) try: # Detect language lang = langdetect.detect(clean_text) # Get confidence (langdetect doesn't provide confidence directly) # We use a heuristic based on text length confidence = min(1.0, len(clean_text) / 500) # Higher confidence for longer texts return lang, confidence except: return "unknown", 0.0 def _detect_issues( self, text: str, char_count: int, word_count: int, avg_line_length: float ) -> List[str]: """Detect potential issues in the document.""" issues = [] # Check minimum content length if char_count < self.MIN_CONTENT_LENGTH: issues.append(f"Content too short ({char_count} chars, minimum {self.MIN_CONTENT_LENGTH})") # Check for very long lines if avg_line_length > self.MAX_AVG_LINE_LENGTH: issues.append(f"Average line length too long ({avg_line_length:.0f} chars)") # Check for repetitive content lines = text.split('\n') unique_lines = set(lines) if len(lines) > 10 and len(unique_lines) / len(lines) < 0.5: issues.append("High line repetition detected") # Check for excessive whitespace whitespace_ratio = len(re.findall(r'\s', text)) / len(text) if len(text) > 0 else 0 if whitespace_ratio > 0.5: issues.append(f"Excessive whitespace ({whitespace_ratio*100:.1f}%)") # Check for very low word count if word_count < 10 and char_count > self.MIN_CONTENT_LENGTH: issues.append("Very low word count (possible non-textual content)") return issues def _calculate_quality_score( self, text: str, char_count: int, word_count: int, avg_line_length: float, issues: List[str] ) -> float: """ Calculate document quality score (0-1). 
Factors: - Content length - Line length appropriateness - Issue count - Content density """ score = 1.0 # Penalty for short content if char_count < self.MIN_CONTENT_LENGTH: score *= char_count / self.MIN_CONTENT_LENGTH # Penalty for line length issues if avg_line_length > self.MAX_AVG_LINE_LENGTH: penalty = (avg_line_length - self.MAX_AVG_LINE_LENGTH) / self.MAX_AVG_LINE_LENGTH score *= max(0.5, 1.0 - penalty) # Penalty for each issue issue_penalty = 0.1 * len(issues) score *= max(0.3, 1.0 - issue_penalty) # Bonus for good content density (words per character) if char_count > 0: density = word_count / char_count if 0.15 <= density <= 0.25: # Typical for good text score *= 1.1 return min(1.0, max(0.0, score)) def _generate_recommendations( self, quality_score: float, issues: List[str], doc_type: DocumentType, language: str ) -> List[str]: """Generate recommendations based on analysis.""" recommendations = [] if quality_score < 0.5: recommendations.append("Consider improving document quality before indexing") if "Content too short" in str(issues): recommendations.append("Add more content for better search results") if "Average line length too long" in str(issues): recommendations.append("Break long lines for better readability") if "High line repetition detected" in str(issues): recommendations.append("Remove repetitive content") # Type-specific recommendations if doc_type == DocumentType.MARKDOWN and quality_score > 0.7: recommendations.append("Use Markdown chunking strategy for best results") elif doc_type in [DocumentType.CODE_PYTHON, DocumentType.CODE_JAVASCRIPT, DocumentType.CODE_TYPESCRIPT]: recommendations.append("Use code-aware chunking strategy") # Language-specific recommendations if language == "ja": recommendations.append("Japanese detected: chunk size will be adjusted (1.2x)") elif language == "unknown": recommendations.append("Language detection failed: using default chunking") if not recommendations: recommendations.append("Document quality is good, 
ready for indexing") return recommendations

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/yuzuponikemi/localsearch-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

document_analyzer.py•8.67 KiB