"""Integration tests for PDF document indexing and semantic search.
These tests validate the complete PDF processing flow:
1. Process PDF with PyMuPDFProcessor
2. Chunk extracted text with page numbers
3. Index chunks into Qdrant with metadata
4. Perform semantic search on PDF content
5. Verify page numbers and metadata are preserved
"""
import pymupdf
import pytest
from qdrant_client import AsyncQdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from nextcloud_mcp_server.document_processors.pymupdf import PyMuPDFProcessor
from nextcloud_mcp_server.embedding import SimpleEmbeddingProvider
from nextcloud_mcp_server.vector.document_chunker import (
ChunkWithPosition,
RecursiveCharacterTextSplitter,
)
# Apply the ``integration`` marker to every test collected from this module.
pytestmark = pytest.mark.integration
def create_test_pdf() -> bytes:
    """Create a small three-page test PDF and return it as raw bytes.

    Each page is A4-sized and holds one chapter of a fictional Nextcloud
    administration guide (introduction, installation, configuration), with
    the text inserted at point (50, 50).

    Returns:
        The serialized PDF document.
    """
    page_width, page_height = 595, 842  # A4 size
    text_origin = (50, 50)
    page_texts = (
        # Page 1: Introduction
        (
            "Nextcloud Administration Guide\n\n"
            "Chapter 1: Introduction\n\n"
            "Nextcloud is a self-hosted file sharing and collaboration platform. "
            "It provides secure file storage, sharing, and synchronization across devices. "
            "This guide covers installation, configuration, and maintenance of Nextcloud."
        ),
        # Page 2: Installation
        (
            "Chapter 2: Installation\n\n"
            "System Requirements:\n"
            "- PHP 8.0 or higher\n"
            "- MySQL 8.0 or MariaDB 10.5\n"
            "- Apache or Nginx web server\n\n"
            "Installation steps:\n"
            "1. Download Nextcloud package\n"
            "2. Extract to web server directory\n"
            "3. Configure database connection\n"
            "4. Run installation wizard"
        ),
        # Page 3: Configuration
        (
            "Chapter 3: Configuration\n\n"
            "Database Configuration:\n"
            "Edit config/config.php to set database parameters. "
            "Configure database host, username, password, and database name. "
            "For optimal performance, use MySQL or MariaDB.\n\n"
            "Security Settings:\n"
            "Enable HTTPS, configure trusted domains, and set up firewall rules."
        ),
    )

    doc = pymupdf.open()
    try:
        for text in page_texts:
            page = doc.new_page(width=page_width, height=page_height)
            page.insert_text(text_origin, text)
        # Convert to bytes
        return doc.tobytes()
    finally:
        # Close the document even when building or serializing a page fails,
        # so a failing test does not leak the open document.
        doc.close()
@pytest.fixture
async def simple_embedding_provider():
    """Provide a ``SimpleEmbeddingProvider`` configured for 384-dimensional vectors."""
    provider = SimpleEmbeddingProvider(dimension=384)
    return provider
@pytest.fixture
async def qdrant_test_client():
    """Yield an ``AsyncQdrantClient`` opened on ``":memory:"`` and close it afterwards."""
    in_memory_client = AsyncQdrantClient(":memory:")
    yield in_memory_client
    # Teardown: release the client once the test has finished.
    await in_memory_client.close()
@pytest.fixture
async def test_collection(qdrant_test_client: AsyncQdrantClient):
    """Create a test collection in Qdrant and yield its name.

    The collection uses 384-dimensional vectors compared by cosine distance.
    After the test it is deleted on a best-effort basis: a failed deletion is
    logged with its traceback instead of being silently discarded, and never
    fails the test.

    Args:
        qdrant_test_client: Client used to create and delete the collection.

    Yields:
        The name of the created collection.
    """
    import logging

    collection_name = "test_pdf_indexing"

    # Create collection
    await qdrant_test_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )

    yield collection_name

    # Cleanup (best-effort): report problems but do not break the test run.
    try:
        await qdrant_test_client.delete_collection(collection_name)
    except Exception:
        logging.getLogger(__name__).warning(
            "Failed to delete test collection %r", collection_name, exc_info=True
        )
@pytest.fixture
def pymupdf_processor():
    """Provide a ``PyMuPDFProcessor`` with image extraction turned off."""
    processor = PyMuPDFProcessor(extract_images=False)
    return processor
async def test_pymupdf_processor_extracts_text_and_metadata(pymupdf_processor):
    """Process the generated PDF and check the extracted text and metadata."""
    filename = "test-admin-guide.pdf"
    result = await pymupdf_processor.process(
        content=create_test_pdf(),
        content_type="application/pdf",
        filename=filename,
    )

    # Result structure
    assert result.success is True
    assert result.processor == "pymupdf"
    assert result.text is not None
    assert len(result.text) > 0

    # Content from every page must show up in the extracted text
    expected_snippets = (
        "Nextcloud Administration Guide",
        "Chapter 1: Introduction",
        "Chapter 2: Installation",
        "Chapter 3: Configuration",
        "PHP 8.0 or higher",
        "MySQL",
    )
    for snippet in expected_snippets:
        assert snippet in result.text

    # Metadata
    metadata = result.metadata
    assert metadata is not None
    assert metadata["page_count"] == 3
    assert metadata["filename"] == filename
    assert "format" in metadata
async def test_document_chunker_preserves_page_numbers():
    """Check that ``ChunkWithPosition`` keeps the page number it was built with."""
    # One chunk per page: (text, start_offset, end_offset, page_number)
    chunk_specs = (
        ("Chapter 1 content on page 1", 0, 28, 1),
        ("Chapter 2 content on page 2", 29, 57, 2),
        ("Chapter 3 content on page 3", 58, 86, 3),
    )
    chunks = [
        ChunkWithPosition(
            text=text,
            start_offset=start,
            end_offset=end,
            page_number=page,
        )
        for text, start, end, page in chunk_specs
    ]

    # Each chunk must report the page number it was created with
    for chunk, (_, _, _, expected_page) in zip(chunks, chunk_specs):
        assert chunk.page_number == expected_page
async def test_pdf_indexing_and_search_flow(
    pymupdf_processor: PyMuPDFProcessor,
    qdrant_test_client: AsyncQdrantClient,
    test_collection: str,
    simple_embedding_provider: SimpleEmbeddingProvider,
):
    """Run the whole pipeline: process a PDF, chunk it, index it and search it."""
    doc_path = "/Documents/admin-guide.pdf"

    def simulated_page(text: str) -> int:
        # Page numbers are not tracked through the splitter here, so they are
        # guessed from chapter keywords found in the chunk text.
        if "Chapter 2" in text or "Installation" in text:
            return 2
        if "Chapter 3" in text or "Configuration" in text:
            return 3
        return 1

    async def search(text: str):
        # Embed the query and fetch the three best matching points.
        vector = await simple_embedding_provider.embed(text)
        return await qdrant_test_client.query_points(
            collection_name=test_collection,
            query=vector,
            limit=3,
            score_threshold=0.0,
        )

    # Step 1: extract text and metadata with PyMuPDF
    result = await pymupdf_processor.process(
        content=create_test_pdf(),
        content_type="application/pdf",
        filename=doc_path,
    )
    assert result.success is True
    assert result.metadata["page_count"] == 3

    # Step 2: split the extracted text into chunks
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=50
    ).split_text(result.text)
    assert len(chunks) > 0

    # Step 3: embed every chunk and index it with the PDF metadata
    points = []
    for chunk_index, chunk_text in enumerate(chunks):
        vector = await simple_embedding_provider.embed(chunk_text)
        payload = {
            "user_id": "admin",
            "doc_id": doc_path,
            "doc_type": "file",
            "title": "Nextcloud Administration Guide",
            "file_path": doc_path,
            "mime_type": "application/pdf",
            "page_number": simulated_page(chunk_text),
            "page_count": result.metadata["page_count"],
            "chunk_index": chunk_index,
            "excerpt": chunk_text[:200],
        }
        points.append(PointStruct(id=chunk_index, vector=vector, payload=payload))

    await qdrant_test_client.upsert(
        collection_name=test_collection, points=points, wait=True
    )

    # Step 4: search for installation instructions
    response = await search("how to install Nextcloud system requirements")
    assert len(response.points) > 0

    # The best hit should come from the installation chapter (page 2)
    top_payload = response.points[0].payload
    assert top_payload["doc_type"] == "file"
    assert top_payload["file_path"] == doc_path
    assert "Installation" in top_payload["excerpt"] or top_payload["page_number"] == 2

    # Page metadata must survive the round trip through Qdrant
    assert top_payload["page_number"] in (1, 2, 3)
    assert top_payload["page_count"] == 3

    # Step 5: search for configuration
    response = await search("database configuration settings MySQL")
    assert len(response.points) > 0

    # One of the two best hits should be the configuration chapter (page 3)
    found_config = False
    for point in response.points[:2]:
        if "Configuration" in point.payload["excerpt"] or point.payload["page_number"] == 3:
            found_config = True
            break
    assert found_config
async def test_pdf_search_with_filters(
    pymupdf_processor: PyMuPDFProcessor,
    qdrant_test_client: AsyncQdrantClient,
    test_collection: str,
    simple_embedding_provider: SimpleEmbeddingProvider,
):
    """Test PDF search with metadata filters.

    Besides the PDF chunks, a decoy point with ``doc_type == "note"`` is
    indexed using the query's own embedding as its vector. Without the filter
    it would be the best match, so the test only passes when the ``doc_type``
    filter is really applied.
    """
    from qdrant_client.models import FieldCondition, Filter, MatchValue

    # Process the PDF
    pdf_bytes = create_test_pdf()
    result = await pymupdf_processor.process(
        content=pdf_bytes,
        content_type="application/pdf",
        filename="/Documents/admin-guide.pdf",
    )
    assert result.success is True

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(result.text)

    query = "Nextcloud installation"
    query_embedding = await simple_embedding_provider.embed(query)

    # Index the PDF chunks with metadata
    points = []
    for idx, chunk_text in enumerate(chunks):
        embedding = await simple_embedding_provider.embed(chunk_text)
        points.append(
            PointStruct(
                id=idx,
                vector=embedding,
                payload={
                    "user_id": "admin",
                    "doc_id": "/Documents/admin-guide.pdf",
                    "doc_type": "file",
                    "mime_type": "application/pdf",
                    "excerpt": chunk_text[:200],
                },
            )
        )

    # Decoy: a non-file document whose vector equals the query embedding.
    # It can only be absent from the results if the filter excludes it.
    decoy_id = len(chunks)
    points.append(
        PointStruct(
            id=decoy_id,
            vector=query_embedding,
            payload={
                "user_id": "admin",
                "doc_id": "decoy-note",
                "doc_type": "note",
                "mime_type": "text/markdown",
                "excerpt": query,
            },
        )
    )

    await qdrant_test_client.upsert(
        collection_name=test_collection, points=points, wait=True
    )

    # Search with a filter that only admits file documents
    response = await qdrant_test_client.query_points(
        collection_name=test_collection,
        query=query_embedding,
        query_filter=Filter(
            must=[FieldCondition(key="doc_type", match=MatchValue(value="file"))]
        ),
        limit=3,
    )

    # All results should be from file documents, and the decoy must be gone
    assert len(response.points) > 0
    assert decoy_id not in {hit.id for hit in response.points}
    for hit in response.points:
        assert hit.payload["doc_type"] == "file"
        assert hit.payload["mime_type"] == "application/pdf"
async def test_pymupdf_health_check(pymupdf_processor: PyMuPDFProcessor):
    """The processor's ``health_check`` coroutine must return ``True``."""
    assert (await pymupdf_processor.health_check()) is True
async def test_pymupdf_supports_pdf_mime_type(pymupdf_processor: PyMuPDFProcessor):
    """The processor must list ``application/pdf`` and be named ``pymupdf``."""
    pdf_mime_type = "application/pdf"
    expected_name = "pymupdf"
    assert pdf_mime_type in pymupdf_processor.supported_mime_types
    assert pymupdf_processor.name == expected_name