RAGStack-Lambda

Overview Schema Related Servers Score Discussions

test_text_extractor_base.py•9.31 KiB

"""Unit tests for base extractor class and ExtractionResult.""" import pytest from ragstack_common.text_extractors.base import ( BaseExtractor, ExtractionResult, ) class TestExtractionResult: """Tests for ExtractionResult dataclass.""" def test_creation_with_all_fields(self): """Test creating ExtractionResult with all fields.""" result = ExtractionResult( markdown="# Title\n\nContent here", file_type="txt", title="Test Document", word_count=5, structural_metadata={"lines": 3}, parse_warning=None, ) assert result.markdown == "# Title\n\nContent here" assert result.file_type == "txt" assert result.title == "Test Document" assert result.word_count == 5 assert result.structural_metadata == {"lines": 3} assert result.parse_warning is None def test_creation_with_parse_warning(self): """Test creating ExtractionResult with a parse warning.""" result = ExtractionResult( markdown="Raw content", file_type="json", title="malformed.json", word_count=2, structural_metadata={}, parse_warning="Invalid JSON: Expecting property name", ) assert result.parse_warning == "Invalid JSON: Expecting property name" def test_structural_metadata_can_be_empty(self): """Test that structural_metadata can be an empty dict.""" result = ExtractionResult( markdown="Content", file_type="txt", title="file.txt", word_count=1, structural_metadata={}, parse_warning=None, ) assert result.structural_metadata == {} class TestBaseExtractorHelpers: """Tests for BaseExtractor helper methods.""" def test_generate_frontmatter_simple(self): """Test frontmatter generation with simple values.""" metadata = { "source_file": "test.txt", "file_type": "txt", "word_count": 100, } frontmatter = BaseExtractor._generate_frontmatter(metadata) assert frontmatter.startswith("---\n") assert frontmatter.endswith("---\n") assert "source_file: test.txt" in frontmatter assert "file_type: txt" in frontmatter assert "word_count: 100" in frontmatter def test_generate_frontmatter_with_quotes(self): """Test frontmatter generation 
escapes quotes in values.""" metadata = { "title": 'Document with "quotes" inside', } frontmatter = BaseExtractor._generate_frontmatter(metadata) # YAML should escape or quote the string properly assert "quotes" in frontmatter assert frontmatter.startswith("---\n") def test_generate_frontmatter_with_colons(self): """Test frontmatter generation handles colons in values.""" metadata = { "url": "https://example.com/path", } frontmatter = BaseExtractor._generate_frontmatter(metadata) assert "https://example.com/path" in frontmatter or "example.com" in frontmatter def test_generate_frontmatter_with_list(self): """Test frontmatter generation handles list values.""" metadata = { "columns": ["name", "age", "city"], } frontmatter = BaseExtractor._generate_frontmatter(metadata) assert "columns:" in frontmatter assert "name" in frontmatter def test_generate_frontmatter_with_none(self): """Test frontmatter generation handles None values.""" metadata = { "source_file": "test.txt", "parse_warning": None, } frontmatter = BaseExtractor._generate_frontmatter(metadata) assert "source_file: test.txt" in frontmatter assert "null" in frontmatter.lower() or "parse_warning:" in frontmatter def test_count_words_simple(self): """Test word count with simple text.""" text = "one two three four five" assert BaseExtractor._count_words(text) == 5 def test_count_words_with_punctuation(self): """Test word count ignores punctuation properly.""" text = "Hello, world! This is a test." 
assert BaseExtractor._count_words(text) == 6 def test_count_words_with_multiple_spaces(self): """Test word count handles multiple spaces.""" text = "word1 word2 word3" assert BaseExtractor._count_words(text) == 3 def test_count_words_with_newlines(self): """Test word count handles newlines.""" text = "line one\nline two\nline three" assert BaseExtractor._count_words(text) == 6 def test_count_words_empty_string(self): """Test word count with empty string.""" assert BaseExtractor._count_words("") == 0 def test_count_words_whitespace_only(self): """Test word count with whitespace only.""" assert BaseExtractor._count_words(" \n\t ") == 0 def test_decode_content_utf8(self): """Test decoding valid UTF-8 content.""" content = "Hello, World! 你好".encode() decoded = BaseExtractor._decode_content(content) assert decoded == "Hello, World! 你好" def test_decode_content_latin1_fallback(self): """Test decoding falls back to latin-1 for invalid UTF-8.""" # Create content that's valid latin-1 but invalid UTF-8 content = "café".encode("latin-1") decoded = BaseExtractor._decode_content(content) # Should decode without raising, may use replacement or latin-1 assert decoded is not None assert len(decoded) > 0 def test_decode_content_empty(self): """Test decoding empty bytes.""" assert BaseExtractor._decode_content(b"") == "" def test_extract_title_from_filename_simple(self): """Test extracting title from simple filename.""" assert BaseExtractor._extract_title_from_filename("document.txt") == "document" def test_extract_title_from_filename_multiple_dots(self): """Test extracting title from filename with multiple dots.""" assert BaseExtractor._extract_title_from_filename("my.file.name.txt") == "my.file.name" def test_extract_title_from_filename_no_extension(self): """Test extracting title from filename without extension.""" assert BaseExtractor._extract_title_from_filename("README") == "README" def test_extract_title_from_filename_with_path(self): """Test extracting title handles path-like 
filenames.""" result = BaseExtractor._extract_title_from_filename("path/to/document.txt") # Should extract just the filename part without extension assert "document" in result def test_extract_title_from_filename_underscores(self): """Test title extraction converts underscores to spaces.""" result = BaseExtractor._extract_title_from_filename("my_document_name.txt") # Implementation may or may not convert underscores assert "my" in result.lower() def test_extract_title_from_filename_hyphens(self): """Test title extraction handles hyphens.""" result = BaseExtractor._extract_title_from_filename("my-document-name.txt") assert "my" in result.lower() class TestBaseExtractorAbstract: """Tests verifying BaseExtractor is properly abstract.""" def test_cannot_instantiate_directly(self): """Test that BaseExtractor cannot be instantiated directly.""" with pytest.raises(TypeError): BaseExtractor() def test_extract_method_is_abstract(self): """Test that extract method must be implemented by subclasses.""" # Create a minimal subclass that doesn't implement extract class IncompleteExtractor(BaseExtractor): pass with pytest.raises(TypeError): IncompleteExtractor() class TestConcreteExtractorSubclass: """Tests using a concrete subclass to verify base functionality works.""" def test_subclass_with_extract_implementation(self): """Test that a properly implemented subclass works.""" class SimpleExtractor(BaseExtractor): def extract(self, content: bytes, filename: str): text = self._decode_content(content) title = self._extract_title_from_filename(filename) word_count = self._count_words(text) metadata = { "source_file": filename, "file_type": "txt", "word_count": word_count, } frontmatter = self._generate_frontmatter(metadata) markdown = f"{frontmatter}\n{text}" return ExtractionResult( markdown=markdown, file_type="txt", title=title, word_count=word_count, structural_metadata={"char_count": len(text)}, parse_warning=None, ) extractor = SimpleExtractor() result = 
extractor.extract(b"Hello world", "test.txt") assert isinstance(result, ExtractionResult) assert result.file_type == "txt" assert result.title == "test" assert result.word_count == 2 assert "Hello world" in result.markdown assert "---" in result.markdown if __name__ == "__main__": pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/HatmanStack/RAGStack-Lambda'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_text_extractor_base.py•9.31 KiB