RAGStack-Lambda

Overview Schema Related Servers Score Discussions

test_epub_extractor.py•5.17 KiB

"""Unit tests for EPUB extractor.""" import io import pytest try: from ebooklib import epub except ImportError: epub = None from ragstack_common.text_extractors.base import ExtractionResult from ragstack_common.text_extractors.epub_extractor import EpubExtractor def create_minimal_epub( title: str = "Test Book", author: str = "Test Author", chapters: list[tuple[str, str]] | None = None, ) -> bytes: """Create a minimal EPUB file for testing.""" if epub is None: pytest.skip("ebooklib not installed") book = epub.EpubBook() book.set_identifier("test-id-12345") book.set_title(title) book.set_language("en") book.add_author(author) if chapters is None: chapters = [ ("Chapter 1", "<h1>Chapter 1</h1><p>First chapter content.</p>"), ("Chapter 2", "<h1>Chapter 2</h1><p>Second chapter content.</p>"), ] spine = ["nav"] for i, (chapter_title, content) in enumerate(chapters): chapter = epub.EpubHtml(title=chapter_title, file_name=f"chap_{i + 1}.xhtml", lang="en") chapter.content = f"<html><body>{content}</body></html>" book.add_item(chapter) spine.append(chapter) # Add navigation book.toc = [ epub.Link(f"chap_{i + 1}.xhtml", title, f"chap{i + 1}") for i, (title, _) in enumerate(chapters) ] book.add_item(epub.EpubNcx()) book.add_item(epub.EpubNav()) book.spine = spine # Write to bytes buffer = io.BytesIO() epub.write_epub(buffer, book) return buffer.getvalue() @pytest.fixture def minimal_epub() -> bytes: """Fixture for minimal EPUB file.""" return create_minimal_epub() @pytest.fixture def corrupted_epub() -> bytes: """Fixture for corrupted EPUB (invalid ZIP).""" return b"not a valid zip file" class TestEpubExtractor: """Tests for EpubExtractor.""" def test_extracts_epub(self, minimal_epub): """Test extraction of EPUB file.""" extractor = EpubExtractor() result = extractor.extract(minimal_epub, "book.epub") assert isinstance(result, ExtractionResult) assert result.file_type == "epub" def test_extracts_title(self, minimal_epub): """Test extraction of book title.""" extractor = EpubExtractor() result = extractor.extract(minimal_epub, "book.epub") assert result.title == "Test Book" def test_extracts_author(self, minimal_epub): """Test extraction of author.""" extractor = EpubExtractor() result = extractor.extract(minimal_epub, "book.epub") assert result.structural_metadata.get("author") == "Test Author" def test_extracts_chapters(self, minimal_epub): """Test extraction of chapter content.""" extractor = EpubExtractor() result = extractor.extract(minimal_epub, "book.epub") assert "Chapter 1" in result.markdown assert "Chapter 2" in result.markdown def test_extracts_chapter_content(self, minimal_epub): """Test that chapter content is extracted.""" extractor = EpubExtractor() result = extractor.extract(minimal_epub, "book.epub") assert "First chapter content" in result.markdown assert "Second chapter content" in result.markdown def test_generates_frontmatter(self, minimal_epub): """Test that frontmatter is generated correctly.""" extractor = EpubExtractor() result = extractor.extract(minimal_epub, "book.epub") assert result.markdown.startswith("---\n") assert "source_file: book.epub" in result.markdown assert "file_type: epub" in result.markdown def test_structural_metadata_includes_chapter_count(self, minimal_epub): """Test structural metadata includes chapter count.""" extractor = EpubExtractor() result = extractor.extract(minimal_epub, "book.epub") assert "chapter_count" in result.structural_metadata # Chapter count includes nav documents, so >= 2 content chapters assert result.structural_metadata["chapter_count"] >= 2 def test_corrupted_epub_falls_back(self, corrupted_epub): """Test that corrupted EPUB falls back with warning.""" extractor = EpubExtractor() result = extractor.extract(corrupted_epub, "broken.epub") assert isinstance(result, ExtractionResult) assert result.parse_warning is not None def test_custom_book(self): """Test extraction of custom book.""" epub_bytes = create_minimal_epub( title="Custom Title", author="Custom Author", chapters=[ ("Introduction", "<p>Welcome to the book.</p>"), ("Main Content", "<p>This is the main content.</p>"), ("Conclusion", "<p>The end.</p>"), ], ) extractor = EpubExtractor() result = extractor.extract(epub_bytes, "custom.epub") assert result.title == "Custom Title" assert result.structural_metadata.get("author") == "Custom Author" # Chapter count includes nav documents, so >= 3 content chapters assert result.structural_metadata.get("chapter_count") >= 3 if __name__ == "__main__": pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/HatmanStack/RAGStack-Lambda'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_epub_extractor.py•5.17 KiB