"""
Comprehensive tests for plain text document processing features.
This test suite covers:
- Natural language processing features
- Paragraph detection and sentence splitting
- Topic extraction and keyword analysis
- Various text formats and edge cases
- Integration with search capabilities
"""
import re
import sys

import pytest

sys.path.insert(0, "/app")

# Import the PlainText plugin and related classes
from mcp_server.document_processing import DocumentChunk
from mcp_server.plugins.plaintext_plugin.nlp_processor import NLPProcessor, TextType
from mcp_server.plugins.plaintext_plugin.paragraph_detector import ParagraphDetector
from mcp_server.plugins.plaintext_plugin.plugin import PlainTextPlugin
from mcp_server.plugins.plaintext_plugin.sentence_splitter import SentenceSplitter
from mcp_server.plugins.plaintext_plugin.topic_extractor import TopicExtractor
from mcp_server.storage.sqlite_store import SQLiteStore
class TestNLPProcessor:
"""Test natural language processing functionality."""
@pytest.fixture
def processor(self):
"""Create an NLPProcessor instance."""
return NLPProcessor()
def test_text_type_detection(self, processor):
"""Test automatic text type detection."""
# Technical text
technical_text = """
This document describes the implementation of a REST API using Python Flask.
The authentication mechanism uses JWT tokens for secure access.
Database operations are performed using SQLAlchemy ORM.
Error handling follows HTTP status code conventions.
"""
analysis = processor.analyze_text(technical_text)
assert analysis.text_type in [TextType.TECHNICAL, TextType.INSTRUCTIONAL]
# Narrative text
narrative_text = """
Once upon a time, in a small village nestled between rolling hills,
there lived a young woman named Elena. She had always dreamed of adventure
beyond the confines of her quiet hometown. Every morning, she would
gaze at the distant mountains and wonder what mysteries lay beyond.
"""
analysis = processor.analyze_text(narrative_text)
assert analysis.text_type == TextType.NARRATIVE
# Instructional text
instructional_text = """
To install the software, follow these steps:
1. Download the installer from the official website
2. Run the installer as administrator
3. Accept the license agreement
4. Choose the installation directory
5. Click 'Install' to begin the process
6. Restart your computer when prompted
Warning: Make sure to backup your data before installation.
"""
analysis = processor.analyze_text(instructional_text)
assert analysis.text_type == TextType.INSTRUCTIONAL
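    # Hedged sketch, not part of the original suite: a smoke test that
    # analyze_text yields a usable analysis object for generic prose.
    # Only invariants the assertions above already rely on are checked.
    def test_analysis_smoke(self, processor):
        """Smoke-check the analysis object on generic prose (hedged sketch)."""
        text = "The quick brown fox jumps over the lazy dog. It runs fast."
        analysis = processor.analyze_text(text)
        assert analysis.text_type is not None
        assert analysis.sentence_count >= 1
        assert isinstance(analysis.readability_score, (int, float))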
def test_readability_analysis(self, processor):
"""Test readability score calculation."""
# Simple text
simple_text = "This is easy to read. Short sentences. Simple words."
analysis = processor.analyze_text(simple_text)
assert analysis.readability_score > 80 # Should be high readability
# Complex text
complex_text = """
The implementation of sophisticated machine learning algorithms necessitates
comprehensive understanding of mathematical foundations, statistical principles,
and computational complexity considerations that significantly impact performance
optimization strategies in distributed computing environments.
"""
analysis = processor.analyze_text(complex_text)
assert analysis.readability_score < 50 # Should be lower readability
def test_sentence_statistics(self, processor):
"""Test sentence-level statistics calculation."""
text = """
This is the first sentence. This is a longer sentence with more words
to test the average calculation. Short one. Another sentence of moderate
length for testing purposes.
"""
analysis = processor.analyze_text(text)
assert analysis.sentence_count == 4
assert analysis.avg_sentence_length > 0
assert 5 <= analysis.avg_sentence_length <= 15 # Reasonable range
def test_vocabulary_analysis(self, processor):
"""Test vocabulary richness calculation."""
# Repetitive text
repetitive_text = "The cat sat on the mat. The cat was on the mat. The mat had the cat."
analysis = processor.analyze_text(repetitive_text)
assert analysis.vocabulary_richness < 0.7 # Lower vocabulary richness
# Diverse text
diverse_text = """
The magnificent elephant wandered through the pristine forest.
Crystalline waterfalls cascaded down granite cliffs while
exotic birds performed aerial ballet above emerald canopies.
"""
analysis = processor.analyze_text(diverse_text)
assert analysis.vocabulary_richness > 0.8 # Higher vocabulary richness
def test_topic_extraction(self, processor):
"""Test topic extraction from text."""
text = """
Python is a powerful programming language used for web development,
data science, and machine learning. Django and Flask are popular
web frameworks for Python. NumPy and Pandas are essential libraries
for data analysis. Scikit-learn provides machine learning algorithms.
"""
analysis = processor.analyze_text(text)
assert len(analysis.topics) > 0
# Check if relevant topics are identified
topic_keywords = [kw for topic in analysis.topics for kw in topic.keywords]
expected_keywords = ["python", "programming", "web", "data", "machine learning"]
found_keywords = sum(
1
for kw in expected_keywords
if any(kw.lower() in keyword.lower() for keyword in topic_keywords)
)
assert found_keywords >= 2 # Should find at least 2 relevant keywords
def test_key_phrases_extraction(self, processor):
"""Test key phrase extraction."""
text = """
Artificial intelligence and machine learning are transforming industries.
Natural language processing enables computers to understand human language.
Deep learning networks can process vast amounts of data efficiently.
"""
analysis = processor.analyze_text(text)
assert len(analysis.key_phrases) > 0
# Check for multi-word phrases
multi_word_phrases = [phrase for phrase in analysis.key_phrases if " " in phrase]
assert len(multi_word_phrases) > 0
def test_semantic_chunking(self, processor):
"""Test semantic-aware text chunking."""
text = """
Introduction to Machine Learning
Machine learning is a subset of artificial intelligence that enables
computers to learn and improve from experience without being explicitly
programmed. It focuses on developing algorithms that can access data
and use it to learn for themselves.
Types of Machine Learning
There are three main types of machine learning: supervised learning,
unsupervised learning, and reinforcement learning. Each type has its
own characteristics and applications.
Supervised Learning
Supervised learning uses labeled training data to learn a mapping
function from input variables to output variables. Common algorithms
include linear regression, decision trees, and neural networks.
"""
chunks = processor.extract_semantic_chunks(text, target_size=200)
assert len(chunks) > 1
# Verify chunks are meaningful
for chunk in chunks:
assert len(chunk.strip()) > 50 # Not too small
assert len(chunk.strip()) < 500 # Not too large
# Verify semantic coherence (chunks should end at natural boundaries)
for chunk in chunks[:-1]: # All but last chunk
assert chunk.strip().endswith((".", "!", "?", ":")) or "\n\n" in chunk
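    # Hedged sketch (assumption: extract_semantic_chunks re-segments the input
    # rather than rewriting it): chunking should never produce empty chunks,
    # even with a small target size.
    def test_semantic_chunking_no_empty_chunks(self, processor):
        """Chunks should be non-empty for small target sizes (hedged sketch)."""
        text = "First point made here. Second point follows. A third point closes."
        chunks = processor.extract_semantic_chunks(text, target_size=50)
        assert len(chunks) >= 1
        assert all(chunk.strip() for chunk in chunks)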
def test_structured_content_extraction(self, processor):
"""Test extraction of structured content elements."""
text = """
Project Setup Instructions
Prerequisites:
- Python 3.8 or higher
- Git version control
- Text editor or IDE
Installation Steps:
1. Clone the repository
2. Create virtual environment
3. Install dependencies
4. Configure settings
Common Issues:
• Permission denied errors
• Missing dependencies
• Configuration problems
Tips for Success:
✓ Always backup your work
✓ Read documentation carefully
✓ Test in development environment first
"""
structured = processor.extract_structured_content(text)
assert "lists" in structured
assert len(structured["lists"]) >= 3
# Check for different list types
list_types = [lst["type"] for lst in structured["lists"]]
assert "bulleted" in list_types or "hyphenated" in list_types
assert "numbered" in list_types
class TestParagraphDetector:
"""Test paragraph detection functionality."""
@pytest.fixture
def detector(self):
"""Create a ParagraphDetector instance."""
return ParagraphDetector()
def test_basic_paragraph_detection(self, detector):
"""Test basic paragraph detection."""
text = """This is the first paragraph.
It contains multiple sentences on different lines.
This is the second paragraph.
This is the third paragraph with a single line."""
paragraphs = detector.detect_paragraphs(text)
assert len(paragraphs) == 3
# Check first paragraph
assert "first paragraph" in paragraphs[0].text
assert "multiple sentences" in paragraphs[0].text
# Check line numbers
assert paragraphs[0].start_line == 0
assert paragraphs[1].start_line > paragraphs[0].end_line
assert paragraphs[2].start_line > paragraphs[1].end_line
def test_code_block_detection(self, detector):
"""Test detection of code blocks as special paragraphs."""
text = """This is regular text.
# This is an indented code block
def function():
return "code"
Back to regular text.
```python
# This is a fenced code block
print("Hello, World!")
```
More regular text."""
paragraphs = detector.detect_paragraphs(text)
# Find code paragraphs
code_paragraphs = [p for p in paragraphs if p.is_code_block]
assert len(code_paragraphs) >= 1
# Verify code content
indented_code = next((p for p in code_paragraphs if "def function" in p.text), None)
if indented_code:
assert indented_code.is_code_block
def test_list_detection(self, detector):
"""Test detection of list items."""
text = """Regular paragraph before list.
- First bullet item
- Second bullet item
- Nested item
- Third bullet item
1. First numbered item
2. Second numbered item
a. Nested letter item
b. Another nested item
Regular paragraph after list."""
paragraphs = detector.detect_paragraphs(text)
# Find list paragraphs
list_paragraphs = [p for p in paragraphs if p.is_list_item]
assert len(list_paragraphs) >= 6 # Should detect most list items
# Check list types
bullet_items = [p for p in list_paragraphs if p.text.strip().startswith("-")]
numbered_items = [p for p in list_paragraphs if re.match(r"^\d+\.", p.text.strip())]
assert len(bullet_items) >= 3
assert len(numbered_items) >= 2
def test_heading_detection(self, detector):
"""Test detection of headings in plain text."""
text = """MAIN TITLE
This is content under the main title.
Subtitle in Title Case
More content here.
Another Section
===============
Content with underline heading.
Yet Another Section
-------------------
Content with another underline style."""
paragraphs = detector.detect_paragraphs(text)
# Check for potential headings
potential_headings = []
for p in paragraphs:
if (
len(p.text.split()) <= 5
and not p.text.endswith(".")
and (p.text.isupper() or p.text.istitle())
):
potential_headings.append(p)
assert len(potential_headings) >= 2
def test_empty_line_handling(self, detector):
"""Test handling of multiple empty lines."""
text = """First paragraph.
Second paragraph after multiple empty lines.
Third paragraph."""
paragraphs = detector.detect_paragraphs(text)
assert len(paragraphs) == 3
assert "First paragraph" in paragraphs[0].text
assert "Second paragraph" in paragraphs[1].text
assert "Third paragraph" in paragraphs[2].text
def test_mixed_content_detection(self, detector):
"""Test detection in mixed content with various elements."""
text = """# Markdown-style heading
Regular paragraph with **markdown** formatting.
- List item one
- List item two
Code block here
with multiple lines
> Blockquote text
> spanning multiple lines
Regular paragraph at the end."""
paragraphs = detector.detect_paragraphs(text)
assert len(paragraphs) >= 5
# Should handle mixed content gracefully
content_types = []
for p in paragraphs:
if p.is_code_block:
content_types.append("code")
elif p.is_list_item:
content_types.append("list")
elif p.text.startswith("#"):
content_types.append("heading")
elif p.text.startswith(">"):
content_types.append("quote")
else:
content_types.append("text")
assert "text" in content_types
assert len(set(content_types)) >= 2 # Should detect multiple types
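    # Hedged sketch mirroring the ordering assertions in
    # test_basic_paragraph_detection: paragraphs should come back in document
    # order without overlapping line ranges.
    def test_paragraph_ordering(self, detector):
        """Detected paragraphs should be ordered and non-overlapping."""
        text = "Alpha paragraph.\n\nBeta paragraph.\n\nGamma paragraph."
        paragraphs = detector.detect_paragraphs(text)
        assert len(paragraphs) == 3
        for prev, curr in zip(paragraphs, paragraphs[1:]):
            assert curr.start_line > prev.end_line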
class TestSentenceSplitter:
"""Test sentence splitting functionality."""
@pytest.fixture
def splitter(self):
"""Create a SentenceSplitter instance."""
return SentenceSplitter()
def test_basic_sentence_splitting(self, splitter):
"""Test basic sentence splitting."""
text = "This is the first sentence. This is the second sentence! Is this a question?"
sentences = splitter.split_sentences(text)
assert len(sentences) == 3
assert sentences[0].strip() == "This is the first sentence."
assert sentences[1].strip() == "This is the second sentence!"
assert sentences[2].strip() == "Is this a question?"
def test_abbreviation_handling(self, splitter):
"""Test handling of abbreviations and acronyms."""
text = "Dr. Smith works at NASA. The U.S. government funds research. Mr. Jones disagrees."
sentences = splitter.split_sentences(text)
# Should not split at abbreviations
assert len(sentences) == 3
assert "Dr. Smith works at NASA" in sentences[0]
assert "U.S. government" in sentences[1]
assert "Mr. Jones" in sentences[2]
def test_decimal_number_handling(self, splitter):
"""Test handling of decimal numbers."""
text = "The price is $19.99 for the item. Version 2.0 was released. It costs 3.14 dollars."
sentences = splitter.split_sentences(text)
# Should not split at decimal points
assert len(sentences) == 3
assert "$19.99" in sentences[0]
assert "Version 2.0" in sentences[1]
assert "3.14 dollars" in sentences[2]
def test_quotation_handling(self, splitter):
"""Test handling of sentences with quotations."""
text = 'He said, "This is important." She replied, "I agree completely!" They nodded.'
sentences = splitter.split_sentences(text)
assert len(sentences) == 3
assert 'He said, "This is important."' in sentences[0]
assert 'She replied, "I agree completely!"' in sentences[1]
assert "They nodded" in sentences[2]
def test_complex_punctuation(self, splitter):
"""Test handling of complex punctuation patterns."""
text = "What?! You can't be serious... Well, I suppose it's possible. (But unlikely.)"
sentences = splitter.split_sentences(text)
# Should handle complex punctuation
assert len(sentences) >= 2
assert "What?!" in sentences[0]
assert "serious..." in sentences[0] or "serious" in sentences[0]
def test_code_and_urls(self, splitter):
"""Test handling of code snippets and URLs."""
text = "Visit www.example.com for info. Use object.method() in code. The file is config.json format."
sentences = splitter.split_sentences(text)
# Should not split at dots in URLs or code
assert len(sentences) == 3
assert "www.example.com" in sentences[0]
assert "object.method()" in sentences[1]
assert "config.json" in sentences[2]
class TestTopicExtractor:
"""Test topic and keyword extraction functionality."""
@pytest.fixture
def extractor(self):
"""Create a TopicExtractor instance."""
return TopicExtractor()
def test_keyword_extraction(self, extractor):
"""Test basic keyword extraction."""
text = """
Python programming is essential for data science and machine learning.
Libraries like NumPy, Pandas, and Scikit-learn are fundamental tools.
Statistical analysis and algorithmic thinking are crucial skills.
"""
keywords = extractor.extract_keywords(text, max_keywords=10)
assert len(keywords) > 0
assert len(keywords) <= 10
# Verify format (keyword, score)
for keyword, score in keywords:
assert isinstance(keyword, str)
assert isinstance(score, (int, float))
assert len(keyword) > 0
# Check for relevant keywords
keyword_list = [kw.lower() for kw, _ in keywords]
expected = ["python", "programming", "data", "machine", "learning"]
found = sum(1 for exp in expected if any(exp in kw for kw in keyword_list))
assert found >= 2
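    # Hedged sketch: extract_keywords is exercised above with max_keywords=10;
    # re-check that the cap also holds for a smaller limit.
    def test_keyword_limit_respected(self, extractor):
        """The max_keywords cap should hold for small limits (hedged sketch)."""
        text = "Python programming supports data science, web development, and automation."
        keywords = extractor.extract_keywords(text, max_keywords=3)
        assert len(keywords) <= 3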
def test_phrase_extraction(self, extractor):
"""Test extraction of multi-word phrases."""
text = """
Natural language processing enables advanced text analysis.
Machine learning algorithms can identify semantic patterns.
Deep neural networks process complex linguistic structures.
"""
phrases = extractor.extract_phrases(text, max_phrases=8)
assert len(phrases) > 0
# Check for multi-word phrases
multi_word = [phrase for phrase, _ in phrases if " " in phrase]
assert len(multi_word) > 0
# Check for relevant technical phrases
phrase_list = [phrase.lower() for phrase, _ in phrases]
expected_phrases = ["natural language", "machine learning", "neural networks"]
found_phrases = sum(
1 for exp in expected_phrases if any(exp in phrase for phrase in phrase_list)
)
assert found_phrases >= 1
def test_technical_term_extraction(self, extractor):
"""Test extraction of technical terms."""
text = """
REST API endpoints use HTTP methods like GET, POST, PUT, DELETE.
JSON serialization handles data transfer between client and server.
Authentication mechanisms include JWT tokens and OAuth2 flows.
Database transactions ensure ACID compliance in PostgreSQL.
"""
technical_terms = extractor.extract_technical_terms(text)
assert len(technical_terms) > 0
# Check for technical terms
terms_lower = [term.lower() for term in technical_terms]
expected_terms = ["api", "http", "json", "jwt", "oauth", "database", "postgresql"]
found_terms = sum(1 for exp in expected_terms if any(exp in term for term in terms_lower))
assert found_terms >= 3
def test_topic_clustering(self, extractor):
"""Test clustering of related keywords into topics."""
text = """
Web development involves HTML, CSS, and JavaScript programming.
Frontend frameworks like React and Vue.js create interactive interfaces.
Backend services use Node.js, Python Flask, or Django frameworks.
Database systems like MongoDB and PostgreSQL store application data.
DevOps practices include Docker containerization and AWS deployment.
"""
topics = extractor.extract_topics(text, num_topics=3)
assert len(topics) > 0
assert len(topics) <= 3
# Check topic structure
for topic in topics:
assert hasattr(topic, "keywords")
assert hasattr(topic, "score")
assert len(topic.keywords) > 0
assert isinstance(topic.score, (int, float))
# Verify topic coherence
all_keywords = [kw for topic in topics for kw in topic.keywords]
web_related = ["web", "html", "css", "javascript", "frontend", "backend"]
found_web = sum(1 for kw in all_keywords if any(web in kw.lower() for web in web_related))
assert found_web >= 2
def test_named_entity_extraction(self, extractor):
"""Test extraction of named entities."""
text = """
Microsoft Azure and Amazon Web Services are major cloud providers.
Google Cloud Platform competes with these services globally.
IBM Watson provides artificial intelligence capabilities.
Tesla uses machine learning for autonomous vehicle development.
"""
entities = extractor.extract_named_entities(text)
assert len(entities) > 0
# Check for company names
entity_list = [entity.lower() for entity in entities]
companies = ["microsoft", "amazon", "google", "ibm", "tesla"]
found_companies = sum(1 for comp in companies if any(comp in ent for ent in entity_list))
assert found_companies >= 2
def test_sentiment_analysis(self, extractor):
"""Test basic sentiment analysis."""
positive_text = (
"This is an excellent product with amazing features and outstanding performance."
)
negative_text = (
"This is a terrible product with awful quality and horrible user experience."
)
neutral_text = "This product has standard features and average performance characteristics."
pos_sentiment = extractor.analyze_sentiment(positive_text)
neg_sentiment = extractor.analyze_sentiment(negative_text)
neu_sentiment = extractor.analyze_sentiment(neutral_text)
# Verify sentiment scores
assert pos_sentiment > neu_sentiment
assert neg_sentiment < neu_sentiment
assert -1 <= pos_sentiment <= 1
assert -1 <= neg_sentiment <= 1
assert -1 <= neu_sentiment <= 1
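    # Hedged sketch: the -1..1 bounds asserted in test_sentiment_analysis
    # should also hold for very short inputs.
    def test_sentiment_bounds_on_short_input(self, extractor):
        """Sentiment scores should stay within [-1, 1] (hedged sketch)."""
        for text in ["Good.", "Bad.", "Okay."]:
            score = extractor.analyze_sentiment(text)
            assert -1 <= score <= 1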
class TestPlainTextPlugin:
"""Test the complete PlainText plugin functionality."""
@pytest.fixture
def temp_db(self, tmp_path):
"""Create a temporary database."""
db_path = tmp_path / "test.db"
return SQLiteStore(str(db_path))
@pytest.fixture
def plugin(self, temp_db):
"""Create a PlainTextPlugin instance."""
language_config = {
"name": "plaintext",
"code": "txt",
"extensions": [".txt", ".text", ".md", ".rst", ".log"],
"language": "plaintext",
}
return PlainTextPlugin(language_config, sqlite_store=temp_db, enable_semantic=False)
@pytest.fixture
def sample_text_file(self, tmp_path):
"""Create a sample text file."""
content = """Project Documentation
Introduction
This document provides comprehensive information about our software project.
The project aims to create a robust and scalable application using modern
development practices and technologies.
Technical Architecture
The system follows a microservices architecture pattern with the following components:
- API Gateway for request routing
- Authentication service for user management
- Business logic services for core functionality
- Database layer for data persistence
- Monitoring and logging infrastructure
Implementation Details
Backend Development:
The backend is implemented using Python with FastAPI framework.
Database operations use SQLAlchemy ORM for data modeling.
Redis is used for caching and session management.
Celery handles asynchronous task processing.
Frontend Development:
The user interface is built with React and TypeScript.
State management uses Redux toolkit for predictable updates.
Material-UI provides consistent design components.
Webpack bundles the application for production deployment.
Deployment Strategy
The application is containerized using Docker containers.
Kubernetes orchestrates the container deployment and scaling.
CI/CD pipelines automate testing and deployment processes.
AWS cloud infrastructure provides hosting and scalability.
Performance Considerations
Load balancing distributes traffic across multiple instances.
Database indexing optimizes query performance.
Caching strategies reduce response times significantly.
CDN delivers static assets with minimal latency.
Security Measures
Authentication uses JWT tokens with refresh mechanisms.
API endpoints implement rate limiting and validation.
HTTPS encryption protects data transmission.
Regular security audits identify potential vulnerabilities.
Conclusion
This architecture provides a solid foundation for scalable application development.
The chosen technologies and patterns support maintainability and performance.
Future enhancements can be implemented without major architectural changes.
"""
file_path = tmp_path / "project_docs.txt"
file_path.write_text(content)
return file_path, content
def test_file_support(self, plugin):
"""Test file extension support."""
assert plugin.supports("document.txt")
assert plugin.supports("readme.text")
assert plugin.supports("notes.md")
assert plugin.supports("guide.rst")
assert plugin.supports("application.log")
assert not plugin.supports("script.py")
assert not plugin.supports("data.json")
def test_metadata_extraction(self, plugin, sample_text_file):
"""Test metadata extraction from plain text."""
file_path, content = sample_text_file
metadata = plugin.extract_metadata(content, file_path)
# Verify basic metadata
assert metadata.title == "Project Documentation" # First heading
assert metadata.document_type in ["technical", "instructional", "mixed"]
assert metadata.language == "en"
assert len(metadata.tags) > 0
# Verify NLP-derived metadata
assert "readability_score" in metadata.custom
assert "avg_sentence_length" in metadata.custom
assert "vocabulary_richness" in metadata.custom
assert "topics" in metadata.custom
def test_structure_extraction(self, plugin, sample_text_file):
"""Test document structure extraction."""
file_path, content = sample_text_file
structure = plugin.extract_structure(content, file_path)
# Verify structure components
assert len(structure.sections) >= 5
assert len(structure.headings) >= 5
assert structure.outline is not None
# Check for main sections
section_titles = [s.get("title", "") for s in structure.sections]
expected_sections = ["Introduction", "Technical Architecture", "Implementation Details"]
found_sections = sum(
1 for exp in expected_sections if any(exp in title for title in section_titles)
)
assert found_sections >= 2
# Verify heading hierarchy
heading_levels = [h["level"] for h in structure.headings]
assert min(heading_levels) == 1 # Should have top-level headings
assert len(set(heading_levels)) >= 2 # Should have multiple levels
def test_content_parsing(self, plugin, sample_text_file):
"""Test content parsing and cleaning."""
file_path, content = sample_text_file
parsed_content = plugin.parse_content(content, file_path)
# Verify content cleaning
assert len(parsed_content) > 0
assert "Project Documentation" in parsed_content
assert "microservices architecture" in parsed_content
# Verify encoding fixes and normalization
assert "\r\n" not in parsed_content # Should normalize line endings
assert "\n\n\n" not in parsed_content # Should remove excessive blank lines
def test_intelligent_chunking(self, plugin, sample_text_file):
"""Test NLP-aware intelligent chunking."""
file_path, content = sample_text_file
# Extract structure first
structure = plugin.extract_structure(content, file_path)
# Perform intelligent chunking
chunks = plugin._intelligent_chunk(content, structure)
# Verify chunks
assert len(chunks) > 0
assert all(isinstance(chunk, DocumentChunk) for chunk in chunks)
# Verify chunk content
for chunk in chunks:
assert len(chunk.content) > 50 # Meaningful content
assert chunk.chunk_index >= 0
assert chunk.metadata is not None
# Verify semantic coherence
total_content = "".join(chunk.content for chunk in chunks)
assert "Project Documentation" in total_content
assert "microservices" in total_content
def test_search_functionality(self, plugin, sample_text_file):
"""Test enhanced search with NLP understanding."""
file_path, content = sample_text_file
# Index the content first
plugin.indexFile(file_path, content)
# Test search queries
queries = [
"microservices architecture",
"Python FastAPI",
"Docker containers",
"security measures",
]
for query in queries:
results = plugin.search(query, opts={"limit": 5})
# Should return relevant results
assert isinstance(results, list)
# If results found, verify relevance
if results:
for result in results:
assert "file" in result
assert "snippet" in result
assert "relevance" in result
assert result["relevance"] > 0
def test_technical_text_processing(self, plugin, tmp_path):
"""Test processing of technical documentation."""
technical_content = """API Reference Guide
Authentication
All API requests require authentication using Bearer tokens.
Include the token in the Authorization header:
Authorization: Bearer YOUR_TOKEN_HERE
Rate limiting applies to all endpoints (100 requests per minute).
Endpoints
GET /api/users
Retrieves a list of all users in the system.
Parameters:
- limit (integer): Maximum number of results (default: 20)
- offset (integer): Number of results to skip (default: 0)
- filter (string): Filter expression for results
Response:
Returns JSON array of user objects with following fields:
- id (integer): Unique user identifier
- username (string): User login name
- email (string): User email address
- created_at (timestamp): Account creation date
Error Codes:
- 400: Bad Request - Invalid parameters
- 401: Unauthorized - Missing or invalid token
- 429: Too Many Requests - Rate limit exceeded
- 500: Internal Server Error - Server processing error
POST /api/users
Creates a new user account.
Request Body:
{
"username": "string",
"email": "string",
"password": "string"
}
"""
file_path = tmp_path / "api_guide.txt"
file_path.write_text(technical_content)
# Test technical text processing
metadata = plugin.extract_metadata(technical_content, file_path)
assert metadata.document_type == "technical"
# Should extract technical terms
assert any("api" in tag.lower() for tag in metadata.tags)
structure = plugin.extract_structure(technical_content, file_path)
assert len(structure.sections) >= 3 # Should identify main sections
def test_narrative_text_processing(self, plugin, tmp_path):
"""Test processing of narrative text."""
narrative_content = """The Journey Home
Chapter 1: Departure
Sarah stood at the train station platform, clutching her worn leather suitcase
with both hands. The morning mist swirled around her ankles as the old steam
engine approached with a thunderous roar. She had been planning this journey
for months, saving every penny to buy the ticket that would take her back
to the small town where she grew up.
The conductor, a kindly old man with silver whiskers, helped her aboard
and found her a window seat in the nearly empty car. As the train pulled
away from the station, Sarah watched the city skyline fade into the distance.
Buildings gave way to rolling hills dotted with farmhouses and red barns.
Chapter 2: Memories
During the long ride, Sarah's mind wandered to her childhood memories.
She remembered running through fields of sunflowers with her sister Emma,
building forts in the old oak tree behind their house, and the smell of
her grandmother's apple pie cooling on the kitchen windowsill.
Those were simpler times, when her biggest worry was whether it would rain
on the day of the annual summer fair. Now, at thirty-five, she carried
the weight of failed relationships, career disappointments, and the recent
loss of her father. The journey home felt like both an escape and a return
to something she had lost along the way.
"""
file_path = tmp_path / "story.txt"
file_path.write_text(narrative_content)
# Test narrative text processing
metadata = plugin.extract_metadata(narrative_content, file_path)
assert metadata.document_type == "narrative"
# Should have reasonable readability for narrative
assert metadata.custom["readability_score"] > 60
structure = plugin.extract_structure(narrative_content, file_path)
chapter_sections = [
s for s in structure.sections if "chapter" in s.get("title", "").lower()
]
assert len(chapter_sections) >= 2
def test_instructional_text_processing(self, plugin, tmp_path):
"""Test processing of instructional text."""
instructional_content = """How to Install Python on Your Computer
Prerequisites
Before installing Python, make sure you have:
- Administrator access to your computer
- At least 100 MB of free disk space
- Internet connection for downloading
Step-by-Step Installation
Windows Installation:
1. Go to the official Python website (python.org)
2. Click on the "Downloads" tab
3. Select the latest version for Windows
4. Run the downloaded installer
5. Check "Add Python to PATH" during installation
6. Click "Install Now" to begin the process
7. Wait for the installation to complete
8. Click "Close" when finished
macOS Installation:
1. Download the macOS installer from python.org
2. Double-click the downloaded .pkg file
3. Follow the installation wizard prompts
4. Enter your password when requested
5. Complete the installation process
Linux Installation:
Most Linux distributions include Python by default.
To install the latest version:
1. Open terminal
2. Update package manager: sudo apt update
3. Install Python: sudo apt install python3
4. Verify installation: python3 --version
Verification Steps
To verify Python is installed correctly:
1. Open command prompt or terminal
2. Type: python --version
3. You should see the version number displayed
4. Type: python to start the Python interpreter
5. Try: print("Hello, World!")
6. Type: exit() to quit the interpreter
Troubleshooting
Common Issues:
Problem: "Python is not recognized as an internal or external command"
Solution: Add Python to your system PATH environment variable
Problem: Permission denied errors during installation
Solution: Run installer as administrator
Problem: Installation fails with error messages
Solution: Temporarily disable antivirus software during installation
Warning: Always download Python from the official website to avoid malware.
Tip: Consider using a virtual environment for your Python projects.
"""
file_path = tmp_path / "python_install.txt"
file_path.write_text(instructional_content)
# Test instructional text processing
metadata = plugin.extract_metadata(instructional_content, file_path)
assert metadata.document_type == "instructional"
structure = plugin.extract_structure(instructional_content, file_path)
# Should identify step-by-step structure
section_titles = [s.get("title", "") for s in structure.sections]
expected_sections = ["Prerequisites", "Installation", "Verification", "Troubleshooting"]
found = sum(
1
for exp in expected_sections
if any(exp.lower() in title.lower() for title in section_titles)
)
assert found >= 2
def test_mixed_content_processing(self, plugin, tmp_path):
"""Test processing of mixed content types."""
mixed_content = """Product Requirements Document
Executive Summary
This document outlines the requirements for the new mobile application.
The app will provide users with real-time weather information and alerts.
User Stories
As a user, I want to:
- View current weather conditions for my location
- Receive severe weather alerts and notifications
- Check hourly and weekly weather forecasts
- Save multiple locations for quick access
- Share weather information with friends
Technical Specifications
The application will be developed using React Native framework.
Backend services will use Node.js with MongoDB database.
Real-time data will be sourced from OpenWeatherMap API.
API Integration:
```
GET /weather/current?lat={latitude}&lon={longitude}
Response: {
"temperature": 72,
"humidity": 65,
"conditions": "partly cloudy"
}
```
Performance Requirements:
- App startup time: < 3 seconds
- API response time: < 500ms
- Battery usage: minimal impact
Installation Guide
Development Setup:
1. Install Node.js (version 14 or higher)
2. Install React Native CLI
3. Clone the repository
4. Run npm install
5. Configure API keys
6. Start the development server
Warning: Never commit API keys to version control.
Once upon a time, there was a weather app that changed everything...
Just kidding! This is a technical document, not a fairy tale.
Conclusion
The weather application will provide valuable functionality to users
while maintaining high performance and reliability standards.
"""
file_path = tmp_path / "mixed_doc.txt"
file_path.write_text(mixed_content)
# Test mixed content processing
metadata = plugin.extract_metadata(mixed_content, file_path)
assert metadata.document_type in ["mixed", "technical", "instructional"]
structure = plugin.extract_structure(mixed_content, file_path)
assert len(structure.sections) >= 4
# Should handle various content types
chunks = plugin._intelligent_chunk(mixed_content, structure)
assert len(chunks) > 1
def test_large_document_handling(self, plugin, tmp_path):
"""Test handling of large plain text documents."""
# Generate large content
sections = []
for i in range(100):
section = f"""Section {i}
This is section number {i} of the large document. """ + (
"Lorem ipsum dolor sit amet. " * 10
)
sections.append(section)
large_content = "Large Document\n\n" + "\n\n".join(sections)
file_path = tmp_path / "large_doc.txt"
file_path.write_text(large_content)
# Test processing
import time
start_time = time.time()
metadata = plugin.extract_metadata(large_content, file_path)
structure = plugin.extract_structure(large_content, file_path)
chunks = plugin._intelligent_chunk(large_content, structure)
end_time = time.time()
elapsed = end_time - start_time
# Should complete in reasonable time
assert elapsed < 30.0, f"Processing took too long: {elapsed:.2f}s"
# Verify results
assert metadata is not None
assert len(structure.sections) > 50
assert len(chunks) > 10
def test_encoding_edge_cases(self, plugin, tmp_path):
"""Test handling of various text encodings and edge cases."""
edge_case_content = """Special Characters Test
This document contains various special characters and encodings:
Unicode characters: café, naïve, résumé, Björk, 北京
Punctuation: "smart quotes", 'apostrophes', em—dashes, ellipses…
Symbols: ©2024, ®trademark, ™symbol, §section, ¶paragraph
Mathematics: α + β = γ, ∑, ∫, √, ≈, ≠, ≤, ≥
Currency: $100, €50, £25, ¥1000, ₹500
URLs and Emails:
- https://www.example.com/path?param=value
- user@domain.co.uk
- ftp://files.example.org/resource
Code snippets:
```
function test() {
return "Hello, World!";
}
```
File paths:
- C:\\Users\\Name\\Documents\\file.txt
- /home/user/documents/file.txt
- ~/Desktop/project/
Weird spacing and tabs mixed with spaces.
Empty lines:
Multiple empty lines should be handled gracefully.
The end."""
file_path = tmp_path / "edge_cases.txt"
file_path.write_text(edge_case_content, encoding="utf-8")
# Test processing with edge cases
try:
metadata = plugin.extract_metadata(edge_case_content, file_path)
structure = plugin.extract_structure(edge_case_content, file_path)
parsed_content = plugin.parse_content(edge_case_content, file_path)
# Should handle gracefully
assert metadata is not None
assert structure is not None
assert len(parsed_content) > 0
# Should preserve important content
assert "Special Characters" in parsed_content
assert "Unicode characters" in parsed_content
assert "example.com" in parsed_content
except Exception as e:
pytest.fail(f"Should handle edge cases gracefully: {e}")
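    # Hedged sketch (assumption: the plugin tolerates minimal input): a
    # one-line document should still produce metadata and parsed content
    # without raising.
    def test_minimal_document(self, plugin, tmp_path):
        """A one-line document should process without errors (hedged sketch)."""
        content = "Just one line of plain text."
        file_path = tmp_path / "minimal.txt"
        file_path.write_text(content)
        metadata = plugin.extract_metadata(content, file_path)
        parsed = plugin.parse_content(content, file_path)
        assert metadata is not None
        assert len(parsed) > 0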
if __name__ == "__main__":
pytest.main([__file__, "-v"])