Archive Agent

test_chunk.py•9.46 KiB

# tests/data/test_chunk.py
# Copyright © 2025 Dr.-Ing. Paul Wilhelm <paul@wilhelm.dev>
# This file is part of Archive Agent. See LICENSE for details.

import logging
from pathlib import Path
from typing import List
from unittest.mock import Mock

from archive_agent.ai.AiManagerFactory import AiManagerFactory
from archive_agent.ai.AiManager import AiManager
from archive_agent.data.DocumentContent import DocumentContent
from archive_agent.data.chunk import (
    get_sentences_with_reference_ranges,
    get_chunks_with_reference_ranges,
    SentenceWithRange,
)
from archive_agent.ai.chunk.AiChunk import ChunkSchema, ChunkItem
from archive_agent.util.text_util import splitlines_exact
from archive_agent.core.ProgressManager import ProgressManager, ProgressInfo

logger = logging.getLogger(__name__)

# Fixture files live next to this module (tests/data/test_data); resolving them from
# `__file__` keeps the tests independent of the directory pytest is launched from.
TEST_DATA_DIR = Path(__file__).resolve().parent / "test_data"


def test_split_sentences_output():
    """
    Verify that `get_sentences_with_reference_ranges` preserves text structure for a real file.

    Loads a test file (`test_unsanitized.txt`) and checks that its joined sentences match
    a sanitized version (`test_sanitized.txt`).

    Tests: Structure preservation (paragraph breaks as "").
    """
    raw_text = (TEST_DATA_DIR / "test_unsanitized.txt").read_text(encoding="utf-8").strip()
    expect_text = (TEST_DATA_DIR / "test_sanitized.txt").read_text(encoding="utf-8").strip()

    # One reference number per input line, counting up from 0.
    line_count = len(splitlines_exact(raw_text))
    doc_content = DocumentContent.from_text(text=raw_text, lines_per_line=list(range(line_count)))

    result = get_sentences_with_reference_ranges(doc_content)
    joined_text = "\n".join([s.text for s in result]).strip()

    # Printed so a failing run shows both sides in the captured output.
    print(f"\n{joined_text=}\n")
    print(f"\n{expect_text=}\n")

    assert joined_text == expect_text


def test_split_sentences_simple():
    """
    Test `get_sentences_with_reference_ranges` with a simple text.

    Input: Text with two sentences in one paragraph, a break, and a third sentence.
    Expected: Sentences joined in first paragraph, break as "", third sentence separate.

    Tests: Paragraph breaks, sentence joining by spaCy.
    """
    raw_text = "A.\nB.\n\nC."
    doc_content = DocumentContent.from_text(
        text=raw_text,
        lines_per_line=[1, 2, 3, 4],
    )

    result = get_sentences_with_reference_ranges(doc_content)

    expected = [
        SentenceWithRange("A. B.", (1, 2)),
        SentenceWithRange("", (0, 0)),
        SentenceWithRange("C.", (4, 4)),
    ]
    assert result == expected


def test_split_sentences_with_references_spanned():
    """
    Test `get_sentences_with_reference_ranges` with a sentence spanning multiple lines.

    Input: Text with a single-line sentence, a multi-line sentence, a break, and another
    sentence; references [1,2,3,4].
    Expected: Sentences with min-max ranges (e.g., (1,2) for spanned), break as "" (0,0).

    Tests: Reference aggregation, multi-line sentences, monotonic references.
    """
    doc_content = DocumentContent.from_text(
        text="First. Second spans\nlines.\n\nThird.",
        lines_per_line=[1, 2, 3, 4],
    )

    result = get_sentences_with_reference_ranges(doc_content)

    expected = [
        SentenceWithRange("First.", (1, 1)),
        SentenceWithRange("Second spans lines.", (1, 2)),
        SentenceWithRange("", (0, 0)),
        SentenceWithRange("Third.", (4, 4)),
    ]
    assert result == expected


def test_split_sentences_markdown_lists():
    """
    Test `get_sentences_with_reference_ranges` with Markdown list items as separate paragraphs.

    Input: Text with a paragraph, break, and two list items; references [1,2,3,4].
    Expected: Paragraph, break, each list item as separate sentence, breaks between,
    with correct ranges.

    Tests: Markdown list handling, paragraph breaks, reference assignment.
    """
    doc_content = DocumentContent.from_text(
        text="Para.\n\n- Item1.\n- Item2.",
        lines_per_line=[1, 2, 3, 4],
    )

    result = get_sentences_with_reference_ranges(doc_content)

    expected = [
        SentenceWithRange("Para.", (1, 1)),
        SentenceWithRange("", (0, 0)),
        SentenceWithRange("- Item1.", (3, 3)),
        SentenceWithRange("", (0, 0)),
        SentenceWithRange("- Item2.", (4, 4)),
    ]
    assert result == expected


def test_split_sentences_empty_or_blanks():
    """
    Test `get_sentences_with_reference_ranges` with empty or blank-only input.

    Input: Empty string or multiple blank lines.
    Expected: Empty list for both cases.

    Tests: Edge cases for empty input, blank line handling.
    """
    empty_content = DocumentContent.from_text(
        text="",
        lines_per_line=[1],
    )
    result_empty = get_sentences_with_reference_ranges(empty_content)
    assert result_empty == []

    blank_content = DocumentContent.from_text(
        text="\n\n",
        lines_per_line=[1, 2, 3],
    )
    result_blanks = get_sentences_with_reference_ranges(blank_content)
    assert result_blanks == []


# Mock factory that returns a mock AI manager - the actual AI instance isn't used in our tests
def create_mock_ai_factory() -> Mock:
    """Create a mock AiManagerFactory whose `get_ai()` returns a mock AiManager."""
    mock_ai = Mock(spec=AiManager)
    mock_factory = Mock(spec=AiManagerFactory)
    mock_factory.get_ai.return_value = mock_ai
    return mock_factory


def create_mock_progress_info() -> ProgressInfo:
    """Create a real ProgressInfo backed by a mocked ProgressManager for testing."""
    mock_progress_manager = Mock(spec=ProgressManager)
    mock_progress_manager.set_total = Mock()
    mock_progress_manager.update_task = Mock()
    return ProgressInfo(progress_manager=mock_progress_manager, parent_key="test_parent_key")


# noinspection PyUnusedLocal
def dummy_chunk_callback(ai: AiManager, block_of_sentences: List[str]) -> ChunkSchema:
    """
    Simulate AI chunking with a fixed, single-chunk output.

    Input: List of sentences (the `ai` argument is ignored).
    Output: ChunkSchema with one chunk starting at line 1, with a header, or empty if no sentences.

    Used to test chunking logic deterministically.
    """
    chunk_items = [ChunkItem(start_line=1, header="Header 1")] if block_of_sentences else []
    return ChunkSchema(chunk_items=chunk_items)


def _run_chunking(sentences_with_ranges: List[SentenceWithRange], chunk_lines_block: int):
    """
    Call `get_chunks_with_reference_ranges` with the collaborators shared by all chunking tests.

    Every chunking test uses the same mocked AI factory, the deterministic
    `dummy_chunk_callback`, the file path "test.txt", a mocked progress info, and the
    module logger; only the sentences and the block size vary.

    :param sentences_with_ranges: Sentences (with reference ranges) to be chunked.
    :param chunk_lines_block: Block size passed through to the chunker.
    :return: Whatever `get_chunks_with_reference_ranges` returns (the tests read
             `.text` and `.reference_range` from each element).
    """
    return get_chunks_with_reference_ranges(
        create_mock_ai_factory(),
        sentences_with_ranges,
        dummy_chunk_callback,
        chunk_lines_block,
        "test.txt",
        create_mock_progress_info(),
        logger,
        True,  # NOTE(review): positional flag; meaning is defined by the callee — confirm
    )


def test_generate_chunks_with_ranges_basic_no_carry():
    """
    Test `get_chunks_with_reference_ranges` with a small block, no carry-over.

    Input: Two sentences with ranges (1,1), (2,2); block size 2.
    Expected: One chunk with all sentences, aggregated range (1,2), formatted with header.

    Tests: Basic chunking, range aggregation, formatting.
    """
    sentences_with_ranges = [
        SentenceWithRange("S1", (1, 1)),
        SentenceWithRange("S2", (2, 2)),
    ]

    result = _run_chunking(sentences_with_ranges, chunk_lines_block=2)

    assert len(result) == 1
    assert result[0].reference_range == (1, 2)
    assert "Header 1" in result[0].text
    assert "S1\nS2" in result[0].text


def test_generate_chunks_with_ranges_with_carry():
    """
    Test `get_chunks_with_reference_ranges` with multiple blocks and carry-over.

    Input: Three sentences with ranges (1,1), (2,2), (3,3); block size 2.
    Expected: One chunk grouping all sentences (due to dummy callback), range (1,3).

    Tests: Block grouping, carry-over handling, range aggregation.
    """
    sentences_with_ranges = [
        SentenceWithRange("S1", (1, 1)),
        SentenceWithRange("S2", (2, 2)),
        SentenceWithRange("S3", (3, 3)),
    ]

    result = _run_chunking(sentences_with_ranges, chunk_lines_block=2)

    assert len(result) == 1
    assert result[0].reference_range == (1, 3)
    assert "Header 1" in result[0].text
    assert "S1\nS2\nS3" in result[0].text


def test_generate_chunks_with_ranges_ignores_zeros_in_agg():
    """
    Test `get_chunks_with_reference_ranges` with a break (0,0) in sentences.

    Input: Two sentences with (1,1), (2,2), separated by a break (0,0); block size 3.
    Expected: One chunk with sentences, range (1,2) ignoring (0,0), break preserved.

    Tests: Sentinel (0,0) filtering, paragraph break preservation.
    """
    sentences_with_ranges = [
        SentenceWithRange("S1", (1, 1)),
        SentenceWithRange("", (0, 0)),
        SentenceWithRange("S2", (2, 2)),
    ]

    result = _run_chunking(sentences_with_ranges, chunk_lines_block=3)

    assert len(result) == 1
    assert result[0].reference_range == (1, 2)
    assert "Header 1" in result[0].text
    assert "S1\n\nS2" in result[0].text


def test_generate_chunks_with_ranges_empty():
    """
    Test `get_chunks_with_reference_ranges` with empty input.

    Input: Empty list of sentences; block size 1.
    Expected: Empty list of chunks.

    Tests: Edge case for empty input handling.
    """
    sentences_with_ranges: List[SentenceWithRange] = []

    result = _run_chunking(sentences_with_ranges, chunk_lines_block=1)

    assert result == []

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shredEngineer/Archive-Agent'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_chunk.py•9.46 KiB