RAGStack-Lambda

Overview Schema Related Servers Score Discussions

test_text_extractor_registry.py•6.59 KiB

"""Integration tests covering the text extractor registry and its public API.

Every test goes through names exported by ``ragstack_common.text_extractors``
(``extract_text``, ``ContentSniffer``, ``ExtractionResult``) and checks the
``file_type`` / ``markdown`` reported for small sample payloads.
"""

import io

import pytest

from ragstack_common.text_extractors import (
    ContentSniffer,
    ExtractionResult,
    extract_text,
)
from tests.fixtures.text_extractor_samples import (
    CSV_STANDARD,
    EMAIL_SIMPLE,
    FULL_HTML_PAGE,
    JSON_SIMPLE_OBJECT,
    SIMPLE_TEXT,
    XML_SIMPLE,
)

# The binary-format tests need optional third-party packages. Each guard
# records whether the package imported and leaves a ``None`` placeholder
# behind when it did not, so the module itself always imports cleanly.
try:
    from docx import Document
except ImportError:
    Document = None
    HAS_DOCX = False
else:
    HAS_DOCX = True

try:
    from openpyxl import Workbook
except ImportError:
    Workbook = None
    HAS_OPENPYXL = False
else:
    HAS_OPENPYXL = True

try:
    from ebooklib import epub
except ImportError:
    epub = None
    HAS_EBOOKLIB = False
else:
    HAS_EBOOKLIB = True


class TestExtractTextFunction:
    """Exercise ``extract_text()`` with text-based sample payloads."""

    def test_extracts_txt(self):
        """A ``.txt`` payload yields an ExtractionResult typed ``txt``."""
        payload = SIMPLE_TEXT.encode()
        outcome = extract_text(payload, "document.txt")

        assert isinstance(outcome, ExtractionResult)
        assert outcome.file_type == "txt"
        rendered = outcome.markdown.lower()
        assert "simple text file" in rendered

    def test_extracts_html(self):
        """An ``.html`` payload is typed ``html`` and keeps its heading text."""
        payload = FULL_HTML_PAGE.encode()
        outcome = extract_text(payload, "page.html")

        assert outcome.file_type == "html"
        assert "Main Heading" in outcome.markdown

    def test_extracts_csv(self):
        """A ``.csv`` payload is typed ``csv`` and mentions ``name``."""
        payload = CSV_STANDARD.encode()
        outcome = extract_text(payload, "data.csv")

        assert outcome.file_type == "csv"
        rendered = outcome.markdown.lower()
        assert "name" in rendered

    def test_extracts_json(self):
        """A ``.json`` payload is typed ``json`` and mentions ``name``."""
        payload = JSON_SIMPLE_OBJECT.encode()
        outcome = extract_text(payload, "config.json")

        assert outcome.file_type == "json"
        rendered = outcome.markdown.lower()
        assert "name" in rendered

    def test_extracts_xml(self):
        """An ``.xml`` payload is typed ``xml`` and mentions ``root``."""
        payload = XML_SIMPLE.encode()
        outcome = extract_text(payload, "data.xml")

        assert outcome.file_type == "xml"
        rendered = outcome.markdown.lower()
        assert "root" in rendered

    def test_extracts_email(self):
        """An ``.eml`` payload is typed ``eml`` and keeps the sender address."""
        payload = EMAIL_SIMPLE.encode()
        outcome = extract_text(payload, "message.eml")

        assert outcome.file_type == "eml"
        assert "sender@example.com" in outcome.markdown

    def test_content_sniffing_overrides_extension(self):
        """JSON content under a ``.txt`` name is still reported as ``json``."""
        # The extension says plain text; the bytes are JSON.
        payload = JSON_SIMPLE_OBJECT.encode()
        outcome = extract_text(payload, "data.txt")

        # The detected content type is expected to win over the extension.
        assert outcome.file_type == "json"

    def test_fallback_to_text_for_unknown(self):
        """An unrecognised extension with plain content falls back to ``txt``."""
        outcome = extract_text(b"Just plain content", "file.xyz")

        assert outcome.file_type == "txt"

    def test_handles_binary_content(self):
        """Arbitrary bytes still produce an ExtractionResult."""
        # Every byte value once: not expected to match any known format.
        every_byte = bytes(range(256))
        outcome = extract_text(every_byte, "binary.bin")

        assert isinstance(outcome, ExtractionResult)


class TestExtractTextBinaryFormats:
    """Exercise ``extract_text()`` with generated DOCX, XLSX and EPUB files."""

    def _create_minimal_docx(self) -> bytes:
        """Return the bytes of a DOCX holding a single paragraph.

        Skips the calling test when python-docx is not installed.
        """
        if not HAS_DOCX:
            pytest.skip("python-docx not installed")

        document = Document()
        document.add_paragraph("Test content")

        sink = io.BytesIO()
        document.save(sink)
        return sink.getvalue()

    def _create_minimal_xlsx(self) -> bytes:
        """Return the bytes of an XLSX with a header row and one data row.

        Skips the calling test when openpyxl is not installed.
        """
        if not HAS_OPENPYXL:
            pytest.skip("openpyxl not installed")

        workbook = Workbook()
        sheet = workbook.active
        for row in (["Header"], ["Data"]):
            sheet.append(row)

        sink = io.BytesIO()
        workbook.save(sink)
        return sink.getvalue()

    def _create_minimal_epub(self) -> bytes:
        """Return the bytes of an EPUB containing one short chapter.

        Skips the calling test when ebooklib is not installed.
        """
        if not HAS_EBOOKLIB:
            pytest.skip("ebooklib not installed")

        book = epub.EpubBook()
        book.set_identifier("test-id")
        book.set_title("Test Book")
        book.set_language("en")

        first_chapter = epub.EpubHtml(title="Ch1", file_name="ch1.xhtml")
        first_chapter.content = "<html><body><p>Content</p></body></html>"

        book.add_item(first_chapter)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = ["nav", first_chapter]

        sink = io.BytesIO()
        epub.write_epub(sink, book)
        return sink.getvalue()

    @pytest.mark.skipif(not HAS_DOCX, reason="python-docx not installed")
    def test_extracts_docx(self):
        """A generated DOCX under a ``.docx`` name is typed ``docx``."""
        payload = self._create_minimal_docx()
        outcome = extract_text(payload, "document.docx")

        assert outcome.file_type == "docx"

    @pytest.mark.skipif(not HAS_OPENPYXL, reason="openpyxl not installed")
    def test_extracts_xlsx(self):
        """A generated XLSX under a ``.xlsx`` name is typed ``xlsx``."""
        payload = self._create_minimal_xlsx()
        outcome = extract_text(payload, "workbook.xlsx")

        assert outcome.file_type == "xlsx"

    @pytest.mark.skipif(not HAS_EBOOKLIB, reason="ebooklib not installed")
    def test_extracts_epub(self):
        """A generated EPUB under a ``.epub`` name is typed ``epub``."""
        payload = self._create_minimal_epub()
        outcome = extract_text(payload, "book.epub")

        assert outcome.file_type == "epub"


class TestContentSnifferExport:
    """Check that ``ContentSniffer`` is usable from the public package."""

    def test_sniffer_is_exported(self):
        """Sniffing the JSON sample reports ``json`` with confidence >= 0.8."""
        payload = JSON_SIMPLE_OBJECT.encode()
        detected_type, score = ContentSniffer().sniff(payload)

        assert detected_type == "json"
        assert score >= 0.8


class TestExtractionResultExport:
    """Check that ``ExtractionResult`` is usable from the public package."""

    def test_extraction_result_is_exported(self):
        """ExtractionResult can be built directly from keyword fields."""
        fields = {
            "markdown": "# Test",
            "file_type": "txt",
            "title": "Test",
            "word_count": 1,
            "structural_metadata": {},
            "parse_warning": None,
        }
        built = ExtractionResult(**fields)

        assert isinstance(built, ExtractionResult)


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/HatmanStack/RAGStack-Lambda'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_text_extractor_registry.py•6.59 KiB