# test_content_splitting.py
# Copyright 2024 Heinrich Krupp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for content splitting and backend-specific length limits.
Tests cover:
- Content splitting utility functions
- Backend limit enforcement
- Automatic chunking with metadata
- Boundary preservation (sentences, paragraphs, code blocks)
- Overlap between chunks for context preservation
"""
import pytest
from src.mcp_memory_service.utils.content_splitter import (
    split_content,
    estimate_chunks_needed,
    validate_chunk_lengths,
    _find_best_split_point
)
class TestContentSplitter:
    """Exercise the helpers imported from the content_splitter utility module."""

    def test_split_short_content(self):
        """Input below max_length must come back as one unchanged chunk."""
        text = "This is a short sentence."
        pieces = split_content(text, max_length=100)
        assert len(pieces) == 1
        assert pieces[0] == text

    def test_split_long_content_character_mode(self):
        """Check plain character splitting with boundary preservation off."""
        limit = 100
        shared = 10
        pieces = split_content(
            "a" * 500,
            max_length=limit,
            preserve_boundaries=False,
            overlap=shared,
        )
        # 500 characters at a limit of 100 must yield several chunks.
        assert len(pieces) > 1
        # No individual chunk may exceed the limit.
        for piece in pieces:
            assert len(piece) <= limit
        # The second chunk has to begin with the tail of the first one.
        first, second = pieces[0], pieces[1]
        assert second.startswith(first[-shared:])

    def test_split_preserves_paragraphs(self):
        """Check that splits land on paragraph breaks when boundaries are preserved."""
        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        pieces = split_content(text, max_length=30, preserve_boundaries=True)
        assert len(pieces) >= 2
        # Every chunk except the last must end on a period or contain a
        # paragraph break, i.e. it was not cut in the middle of a paragraph.
        leading = pieces[:-1]
        assert all(
            piece.strip().endswith('.') or '\n\n' in piece
            for piece in leading
        )

    def test_split_preserves_sentences(self):
        """Check that splits land on sentence ends when no paragraph break exists."""
        text = "First sentence. Second sentence. Third sentence. Fourth sentence."
        pieces = split_content(text, max_length=40, preserve_boundaries=True)
        assert len(pieces) >= 2
        # At most one chunk is allowed to end on something other than a period.
        closed_by_period = [piece for piece in pieces if piece.strip().endswith('.')]
        assert len(closed_by_period) >= len(pieces) - 1

    def test_split_preserves_words(self):
        """Check that splits land on word boundaries when no sentence end exists."""
        text = "word1 word2 word3 word4 word5 word6 word7 word8"
        pieces = split_content(text, max_length=25, preserve_boundaries=True)
        assert len(pieces) >= 2
        final_piece = pieces[-1]
        # Each non-final chunk must end with a space, unless it happens to be
        # equal to the final chunk.
        # NOTE(review): the equality escape hatch is evaluated for non-final
        # chunks only, so it rarely applies — confirm this is the intended check.
        for piece in pieces[:-1]:
            if not piece.endswith(' '):
                assert piece == final_piece

    def test_split_overlap(self):
        """Check that neighbouring chunks share some trailing/leading words."""
        text = "The quick brown fox jumps over the lazy dog. " * 10
        pieces = split_content(text, max_length=100, preserve_boundaries=True, overlap=20)
        assert len(pieces) > 1
        # Walk over consecutive pairs: at least one of the first three words
        # found in the last 20 characters of a chunk must occur in the next one.
        for current, following in zip(pieces, pieces[1:]):
            tail_words = current[-20:].split()[:3]
            assert any(word in following for word in tail_words)

    def test_estimate_chunks_needed(self):
        """Check estimate_chunks_needed against a table of expected counts."""
        # (content_length, max_length, expected) with the default overlap.
        plain_cases = (
            (0, 100, 0),
            (100, 100, 1),
            (200, 100, 2),
            (250, 100, 3),
        )
        for length, limit, expected in plain_cases:
            assert estimate_chunks_needed(length, limit) == expected

        # (content_length, max_length, overlap, expected).
        overlap_cases = (
            (100, 100, 10, 1),   # fits in one chunk
            (150, 100, 10, 2),   # first chunk 100, second covers the remaining 50
            (200, 100, 50, 3),   # effective chunk size is 50
            (100, 100, 100, 1),  # edge case: overlap equal to max_length
            (100, 100, 150, 1),  # edge case: overlap larger than max_length
        )
        for length, limit, shared, expected in overlap_cases:
            assert estimate_chunks_needed(length, limit, overlap=shared) == expected

    def test_validate_chunk_lengths(self):
        """Check that validation accepts short chunks and rejects an oversized one."""
        within_limit = ["short", "also short", "still short"]
        over_limit = ["short", "this is way too long for the limit", "short"]
        accepted = validate_chunk_lengths(within_limit, max_length=50)
        rejected = validate_chunk_lengths(over_limit, max_length=20)
        assert accepted is True
        assert rejected is False

    def test_find_best_split_point_paragraph(self):
        """Check that the chosen split point sits right after a paragraph break."""
        text = "First para.\n\nSecond para.\n\nThird para."
        cut = _find_best_split_point(text, max_length=25)
        # The two characters immediately before the cut must be the break.
        just_before_cut = text[cut - 2:cut]
        assert just_before_cut == '\n\n'

    def test_find_best_split_point_sentence(self):
        """Check that a sentence end is included when no paragraph break exists."""
        text = "First sentence. Second sentence. Third sentence."
        cut = _find_best_split_point(text, max_length=30)
        # The text before the cut must contain at least one sentence boundary.
        head = text[:cut]
        assert '. ' in head

    def test_split_empty_content(self):
        """An empty string must produce an empty list of chunks."""
        pieces = split_content("", max_length=100)
        assert pieces == []

    def test_split_exact_length(self):
        """Content whose length equals max_length must stay in one chunk."""
        text = "a" * 100
        pieces = split_content(text, max_length=100)
        assert len(pieces) == 1
        assert pieces[0] == text

    def test_split_code_blocks(self):
        """Check that chunks of Python-like source keep functions intact."""
        source = """def function_one():
    return True
def function_two():
    return False
def function_three():
    return None"""
        pieces = split_content(source, max_length=60, preserve_boundaries=True)
        assert len(pieces) >= 2
        # Any chunk holding a function header must also hold a return
        # statement; only a chunk equal to the final one is exempt.
        holding_defs = [piece for piece in pieces if 'def ' in piece]
        for piece in holding_defs:
            assert 'return' in piece or piece == pieces[-1]
class TestBackendLimits:
    """Compare each storage backend's content length limit with the config module."""

    def test_cloudflare_limit(self):
        """CloudflareStorage's class-level limit must equal the config constant."""
        from src.mcp_memory_service.storage.cloudflare import CloudflareStorage
        from src.mcp_memory_service.config import CLOUDFLARE_MAX_CONTENT_LENGTH

        class_limit = CloudflareStorage._MAX_CONTENT_LENGTH
        assert class_limit == CLOUDFLARE_MAX_CONTENT_LENGTH

    def test_chromadb_limit(self):
        """ChromaMemoryStorage's class-level limit must equal the config constant."""
        from src.mcp_memory_service.storage.chroma import ChromaMemoryStorage
        from src.mcp_memory_service.config import CHROMADB_MAX_CONTENT_LENGTH

        class_limit = ChromaMemoryStorage._MAX_CONTENT_LENGTH
        assert class_limit == CHROMADB_MAX_CONTENT_LENGTH

    def test_sqlitevec_unlimited(self):
        """A SQLite-vec storage instance must report the configured limit."""
        import os
        import tempfile
        from src.mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
        from src.mcp_memory_service.config import SQLITEVEC_MAX_CONTENT_LENGTH

        # A throwaway directory hosts the database file, so nothing outlives
        # the test.
        with tempfile.TemporaryDirectory() as scratch_dir:
            database_file = os.path.join(scratch_dir, "test.db")
            backend = SqliteVecMemoryStorage(db_path=database_file)
            # The instance property must mirror the config value.
            assert backend.max_content_length == SQLITEVEC_MAX_CONTENT_LENGTH
            assert backend.supports_chunking is True

    def test_hybrid_follows_config(self):
        """A Hybrid storage instance must report the configured hybrid limit."""
        import os
        import tempfile
        from src.mcp_memory_service.storage.hybrid import HybridMemoryStorage
        from src.mcp_memory_service.config import HYBRID_MAX_CONTENT_LENGTH

        with tempfile.TemporaryDirectory() as scratch_dir:
            database_file = os.path.join(scratch_dir, "test.db")
            # cloudflare_config=None: this test runs without cloud sync.
            backend = HybridMemoryStorage(
                sqlite_db_path=database_file,
                cloudflare_config=None,
            )
            assert backend.max_content_length == HYBRID_MAX_CONTENT_LENGTH
            assert backend.supports_chunking is True
class TestConfigurationConstants:
    """Check the content-limit constants exposed by the config module."""

    def test_config_constants_exist(self):
        """Every content-limit constant must be importable with the expected value or type."""
        from src.mcp_memory_service.config import (
            CLOUDFLARE_MAX_CONTENT_LENGTH,
            CHROMADB_MAX_CONTENT_LENGTH,
            SQLITEVEC_MAX_CONTENT_LENGTH,
            HYBRID_MAX_CONTENT_LENGTH,
            ENABLE_AUTO_SPLIT,
            CONTENT_SPLIT_OVERLAP,
            CONTENT_PRESERVE_BOUNDARIES,
        )

        # Fixed numeric limits expected for the bounded backends.
        expected_limits = (
            (CLOUDFLARE_MAX_CONTENT_LENGTH, 800),
            (CHROMADB_MAX_CONTENT_LENGTH, 1500),
        )
        for actual, expected in expected_limits:
            assert actual == expected

        # None stands for "unlimited" on the SQLite-vec backend.
        assert SQLITEVEC_MAX_CONTENT_LENGTH is None
        # The hybrid backend must share the Cloudflare limit.
        assert HYBRID_MAX_CONTENT_LENGTH == CLOUDFLARE_MAX_CONTENT_LENGTH

        # The splitting switches are only checked for their type.
        expected_types = (
            (ENABLE_AUTO_SPLIT, bool),
            (CONTENT_SPLIT_OVERLAP, int),
            (CONTENT_PRESERVE_BOUNDARIES, bool),
        )
        for value, wanted_type in expected_types:
            assert isinstance(value, wanted_type)

    def test_config_validation(self):
        """The configured limits and overlap must fall into sensible ranges."""
        from src.mcp_memory_service.config import (
            CLOUDFLARE_MAX_CONTENT_LENGTH,
            CHROMADB_MAX_CONTENT_LENGTH,
            CONTENT_SPLIT_OVERLAP,
        )

        # Both bounded limits have to be strictly positive.
        for limit in (CLOUDFLARE_MAX_CONTENT_LENGTH, CHROMADB_MAX_CONTENT_LENGTH):
            assert limit > 0
        # ChromaDB is expected to allow more content than Cloudflare
        # (the original author's rationale: larger model).
        assert CHROMADB_MAX_CONTENT_LENGTH > CLOUDFLARE_MAX_CONTENT_LENGTH
        # The overlap must stay within 0..500 inclusive.
        overlap = CONTENT_SPLIT_OVERLAP
        assert 0 <= overlap <= 500
if __name__ == "__main__":
    # Direct execution: hand this file to pytest with verbose output.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)