# test_search_repository.py
"""Tests for the SearchRepository."""
from datetime import datetime, timezone
import pytest
import pytest_asyncio
from sqlalchemy import text
from basic_memory import db
from basic_memory.models import Entity
from basic_memory.models.project import Project
from basic_memory.repository.search_repository import SearchRepository, SearchIndexRow
from basic_memory.schemas.search import SearchItemType
@pytest_asyncio.fixture
async def search_entity(session_maker, test_project: Project):
    """Add an Entity belonging to ``test_project`` and return it.

    The returned entity supplies the id, title, permalink, file path and
    timestamps that the tests copy into their ``SearchIndexRow`` objects.
    """
    async with db.scoped_session(session_maker) as session:
        entity_fields = dict(
            project_id=test_project.id,
            title="Search Test Entity",
            entity_type="test",
            permalink="test/search-test-entity",
            file_path="test/search_test_entity.md",
            content_type="text/markdown",
            created_at=datetime.now(timezone.utc),
            updated_at=datetime.now(timezone.utc),
        )
        record = Entity(**entity_fields)
        session.add(record)
        # Flush before returning; callers read ``record.id`` right away.
        await session.flush()
        return record
@pytest_asyncio.fixture
async def second_project(project_repository):
    """Create and return an additional project used by the isolation tests."""
    created = await project_repository.create(
        {
            "name": "Second Test Project",
            "description": "Another project for testing",
            "path": "/second/project/path",
            "is_active": True,
            "is_default": None,
        }
    )
    return created
@pytest_asyncio.fixture
async def second_project_repository(session_maker, second_project):
    """Return a SearchRepository bound to the second project's id."""
    other_project_id = second_project.id
    return SearchRepository(session_maker, project_id=other_project_id)
@pytest_asyncio.fixture
async def second_entity(session_maker, second_project: Project):
    """Add an Entity belonging to ``second_project`` and return it."""
    async with db.scoped_session(session_maker) as session:
        other_entity = Entity(
            **{
                "project_id": second_project.id,
                "title": "Second Project Entity",
                "entity_type": "test",
                "permalink": "test/second-project-entity",
                "file_path": "test/second_project_entity.md",
                "content_type": "text/markdown",
                "created_at": datetime.now(timezone.utc),
                "updated_at": datetime.now(timezone.utc),
            }
        )
        session.add(other_entity)
        # Flush before returning; callers read ``other_entity.id`` right away.
        await session.flush()
        return other_entity
@pytest.mark.asyncio
async def test_init_search_index(search_repository):
    """init_search_index() should leave a ``search_index`` table in the database."""
    await search_repository.init_search_index()

    # Ask SQLite's catalog whether the table exists.
    async with db.scoped_session(search_repository.session_maker) as session:
        lookup = text(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='search_index';"
        )
        rows = await session.execute(lookup)
        table_name = rows.scalar()
        assert table_name == "search_index"
@pytest.mark.asyncio
async def test_index_item(search_repository, search_entity):
    """An indexed row should be findable by text and carry the repository's project_id."""
    # Build the index row from the entity's own fields.
    row_fields = dict(
        id=search_entity.id,
        type=SearchItemType.ENTITY.value,
        title=search_entity.title,
        content_stems="search test entity content",
        content_snippet="This is a test entity for search",
        permalink=search_entity.permalink,
        file_path=search_entity.file_path,
        entity_id=search_entity.id,
        metadata={"entity_type": search_entity.entity_type},
        created_at=search_entity.created_at,
        updated_at=search_entity.updated_at,
        project_id=search_repository.project_id,
    )
    await search_repository.index_item(SearchIndexRow(**row_fields))

    hits = await search_repository.search(search_text="search test")

    # Exactly one match, and it is the row that was just indexed.
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit.title == search_entity.title
    assert only_hit.project_id == search_repository.project_id
@pytest.mark.asyncio
async def test_project_isolation(
    search_repository, second_project_repository, search_entity, second_entity
):
    """Each repository should only return rows indexed under its own project."""
    first_row = SearchIndexRow(
        id=search_entity.id,
        type=SearchItemType.ENTITY.value,
        title=search_entity.title,
        content_stems="unique first project content",
        content_snippet="This is a test entity in the first project",
        permalink=search_entity.permalink,
        file_path=search_entity.file_path,
        entity_id=search_entity.id,
        metadata={"entity_type": search_entity.entity_type},
        created_at=search_entity.created_at,
        updated_at=search_entity.updated_at,
        project_id=search_repository.project_id,
    )
    second_row = SearchIndexRow(
        id=second_entity.id,
        type=SearchItemType.ENTITY.value,
        title=second_entity.title,
        content_stems="unique second project content",
        content_snippet="This is a test entity in the second project",
        permalink=second_entity.permalink,
        file_path=second_entity.file_path,
        entity_id=second_entity.id,
        metadata={"entity_type": second_entity.entity_type},
        created_at=second_entity.created_at,
        updated_at=second_entity.updated_at,
        project_id=second_project_repository.project_id,
    )

    # Each row goes through the repository that owns its project.
    await search_repository.index_item(first_row)
    await second_project_repository.index_item(second_row)

    # The first project finds its own content.
    own_first = await search_repository.search(search_text="unique first")
    assert len(own_first) == 1
    assert own_first[0].title == search_entity.title
    assert own_first[0].project_id == search_repository.project_id

    # The second project finds its own content.
    own_second = await second_project_repository.search(search_text="unique second")
    assert len(own_second) == 1
    assert own_second[0].title == second_entity.title
    assert own_second[0].project_id == second_project_repository.project_id

    # Neither project can see the other's content.
    leaked_into_first = await search_repository.search(search_text="unique second")
    assert len(leaked_into_first) == 0
    leaked_into_second = await second_project_repository.search(search_text="unique first")
    assert len(leaked_into_second) == 0
@pytest.mark.asyncio
async def test_delete_by_permalink(search_repository, search_entity):
    """delete_by_permalink() should remove an indexed row from this repository's results.

    Only a single project is exercised; cross-project behaviour is not checked here.
    """
    query = "content to delete"
    row_fields = dict(
        id=search_entity.id,
        type=SearchItemType.ENTITY.value,
        title=search_entity.title,
        content_stems="content to delete",
        content_snippet="This content should be deleted",
        permalink=search_entity.permalink,
        file_path=search_entity.file_path,
        entity_id=search_entity.id,
        metadata={"entity_type": search_entity.entity_type},
        created_at=search_entity.created_at,
        updated_at=search_entity.updated_at,
        project_id=search_repository.project_id,
    )
    await search_repository.index_item(SearchIndexRow(**row_fields))

    # The row is searchable before deletion.
    before = await search_repository.search(search_text=query)
    assert len(before) == 1

    await search_repository.delete_by_permalink(search_entity.permalink)

    # The same query now comes back empty.
    after = await search_repository.search(search_text=query)
    assert len(after) == 0
@pytest.mark.asyncio
async def test_delete_by_entity_id(search_repository, search_entity):
    """delete_by_entity_id() should remove an indexed row from this repository's results.

    Only a single project is exercised; cross-project behaviour is not checked here.
    """
    query = "entity to delete"
    row_fields = dict(
        id=search_entity.id,
        type=SearchItemType.ENTITY.value,
        title=search_entity.title,
        content_stems="entity to delete",
        content_snippet="This entity should be deleted",
        permalink=search_entity.permalink,
        file_path=search_entity.file_path,
        entity_id=search_entity.id,
        metadata={"entity_type": search_entity.entity_type},
        created_at=search_entity.created_at,
        updated_at=search_entity.updated_at,
        project_id=search_repository.project_id,
    )
    await search_repository.index_item(SearchIndexRow(**row_fields))

    # The row is searchable before deletion.
    before = await search_repository.search(search_text=query)
    assert len(before) == 1

    await search_repository.delete_by_entity_id(search_entity.id)

    # The same query now comes back empty.
    after = await search_repository.search(search_text=query)
    assert len(after) == 0
@pytest.mark.asyncio
async def test_to_insert_includes_project_id(search_repository):
    """SearchIndexRow.to_insert() should carry the row's project_id in its output."""
    row_fields = dict(
        id=1234,
        type=SearchItemType.ENTITY.value,
        title="Test Title",
        content_stems="test content",
        content_snippet="test snippet",
        permalink="test/permalink",
        file_path="test/file.md",
        metadata={"test": "metadata"},
        created_at=datetime.now(timezone.utc),
        updated_at=datetime.now(timezone.utc),
        project_id=search_repository.project_id,
    )
    payload = SearchIndexRow(**row_fields).to_insert()

    # The key must be present and hold the repository's project id.
    assert "project_id" in payload
    assert payload["project_id"] == search_repository.project_id
def test_directory_property():
    """SearchIndexRow.directory should be derived from file_path.

    Cases covered: a nested path, a root-level file, and an empty path on a
    non-entity row.
    """
    # (row id, item type, file_path, expected directory)
    cases = (
        (1, SearchItemType.ENTITY.value, "projects/notes/ideas.md", "/projects/notes"),
        (2, SearchItemType.ENTITY.value, "README.md", "/"),
        (3, SearchItemType.OBSERVATION.value, "", ""),
    )
    for row_id, item_type, path, expected_directory in cases:
        row = SearchIndexRow(
            id=row_id,
            type=item_type,
            file_path=path,
            created_at=datetime.now(timezone.utc),
            updated_at=datetime.now(timezone.utc),
            project_id=1,
        )
        assert row.directory == expected_directory
class TestSearchTermPreparation:
    """Test cases for FTS5 search term preparation.

    The unit tests pin the exact strings returned by the repository's
    term-preparation helpers (``_prepare_search_term``, ``_prepare_boolean_query``,
    ``_prepare_parenthetical_term``, ``_needs_quoting``, ``_prepare_single_term``).
    The async tests run ``search()`` end to end and mostly assert that it
    returns a list instead of raising.
    """
    def test_simple_terms_get_prefix_wildcard(self, search_repository):
        """Simple alphanumeric terms should get prefix matching."""
        assert search_repository._prepare_search_term("hello") == "hello*"
        assert search_repository._prepare_search_term("project") == "project*"
        assert search_repository._prepare_search_term("test123") == "test123*"
    def test_terms_with_existing_wildcard_unchanged(self, search_repository):
        """Terms that already contain * should remain unchanged."""
        assert search_repository._prepare_search_term("hello*") == "hello*"
        assert search_repository._prepare_search_term("test*world") == "test*world"
    def test_boolean_operators_preserved(self, search_repository):
        """Boolean operators should be preserved without modification."""
        # Uppercase AND / OR / NOT and parentheses pass through verbatim,
        # with no prefix wildcard appended.
        assert search_repository._prepare_search_term("hello AND world") == "hello AND world"
        assert search_repository._prepare_search_term("cat OR dog") == "cat OR dog"
        assert (
            search_repository._prepare_search_term("project NOT meeting") == "project NOT meeting"
        )
        assert (
            search_repository._prepare_search_term("(hello AND world) OR test")
            == "(hello AND world) OR test"
        )
    def test_hyphenated_terms_with_boolean_operators(self, search_repository):
        """Hyphenated terms with Boolean operators should be properly quoted."""
        # Test the specific case from the GitHub issue
        result = search_repository._prepare_search_term("tier1-test AND unicode")
        assert result == '"tier1-test" AND unicode'
        # Test other hyphenated Boolean combinations
        assert (
            search_repository._prepare_search_term("multi-word OR single")
            == '"multi-word" OR single'
        )
        assert (
            search_repository._prepare_search_term("well-formed NOT badly-formed")
            == '"well-formed" NOT "badly-formed"'
        )
        assert (
            search_repository._prepare_search_term("test-case AND (hello OR world)")
            == '"test-case" AND (hello OR world)'
        )
        # Test mixed special characters with Boolean operators
        assert (
            search_repository._prepare_search_term("config.json AND test-file")
            == '"config.json" AND "test-file"'
        )
        assert (
            search_repository._prepare_search_term("C++ OR python-script")
            == '"C++" OR "python-script"'
        )
    def test_programming_terms_should_work(self, search_repository):
        """Programming-related terms with special chars should be searchable."""
        # These should be quoted to handle special characters safely;
        # the prefix wildcard is appended after the closing quote.
        assert search_repository._prepare_search_term("C++") == '"C++"*'
        assert search_repository._prepare_search_term("function()") == '"function()"*'
        assert search_repository._prepare_search_term("email@domain.com") == '"email@domain.com"*'
        assert search_repository._prepare_search_term("array[index]") == '"array[index]"*'
        assert search_repository._prepare_search_term("config.json") == '"config.json"*'
    def test_malformed_fts5_syntax_quoted(self, search_repository):
        """Malformed FTS5 syntax should be quoted to prevent errors."""
        # Multiple operators without proper syntax
        assert search_repository._prepare_search_term("+++invalid+++") == '"+++invalid+++"*'
        assert search_repository._prepare_search_term("!!!error!!!") == '"!!!error!!!"*'
        # This input contains a literal '*', yet it is still expected to be quoted.
        assert search_repository._prepare_search_term("@#$%^&*()") == '"@#$%^&*()"*'
    def test_quoted_strings_handled_properly(self, search_repository):
        """Strings with quotes should have quotes escaped."""
        # Embedded double quotes are doubled; single quotes are left as-is.
        assert search_repository._prepare_search_term('say "hello"') == '"say ""hello"""*'
        assert search_repository._prepare_search_term("it's working") == '"it\'s working"*'
    def test_file_paths_no_prefix_wildcard(self, search_repository):
        """File paths should not get prefix wildcards."""
        # With is_prefix=False the term is quoted but no trailing '*' is added.
        assert (
            search_repository._prepare_search_term("config.json", is_prefix=False)
            == '"config.json"'
        )
        assert (
            search_repository._prepare_search_term("docs/readme.md", is_prefix=False)
            == '"docs/readme.md"'
        )
    def test_spaces_handled_correctly(self, search_repository):
        """Terms with spaces should use boolean AND for word order independence."""
        assert search_repository._prepare_search_term("hello world") == "hello* AND world*"
        assert (
            search_repository._prepare_search_term("project planning") == "project* AND planning*"
        )
    def test_version_strings_with_dots_handled_correctly(self, search_repository):
        """Version strings with dots should be quoted to prevent FTS5 syntax errors."""
        # This reproduces the bug where "Basic Memory v0.13.0b2" becomes "Basic* AND Memory* AND v0.13.0b2*"
        # which causes FTS5 syntax errors because v0.13.0b2* is not valid FTS5 syntax
        result = search_repository._prepare_search_term("Basic Memory v0.13.0b2")
        # Should be quoted because of dots in v0.13.0b2
        assert result == '"Basic Memory v0.13.0b2"*'
    def test_mixed_special_characters_in_multi_word_queries(self, search_repository):
        """Multi-word queries with special characters in any word should be fully quoted."""
        # Any word containing special characters should cause the entire phrase to be quoted
        assert search_repository._prepare_search_term("config.json file") == '"config.json file"*'
        assert (
            search_repository._prepare_search_term("user@email.com account")
            == '"user@email.com account"*'
        )
        # Lowercase "and" is expected to stay inside the quoted phrase as an
        # ordinary word, not to be handled as a Boolean operator.
        assert search_repository._prepare_search_term("node.js and react") == '"node.js and react"*'
    @pytest.mark.asyncio
    async def test_search_with_special_characters_returns_results(self, search_repository):
        """Integration test: search with special characters should work gracefully.

        Nothing is indexed by this test; each call is only checked to return a
        list rather than raise.
        """
        # This test ensures the search doesn't crash with FTS5 syntax errors
        # These should all return empty results gracefully, not crash
        results1 = await search_repository.search(search_text="C++")
        assert isinstance(results1, list)  # Should not crash
        results2 = await search_repository.search(search_text="function()")
        assert isinstance(results2, list)  # Should not crash
        results3 = await search_repository.search(search_text="+++malformed+++")
        assert isinstance(results3, list)  # Should not crash, return empty results
        results4 = await search_repository.search(search_text="email@domain.com")
        assert isinstance(results4, list)  # Should not crash
    @pytest.mark.asyncio
    async def test_boolean_search_still_works(self, search_repository):
        """Boolean search operations should continue to work."""
        # These should not crash. Only the return type is asserted; the Boolean
        # semantics of the matches are not verified by this test.
        results1 = await search_repository.search(search_text="hello AND world")
        assert isinstance(results1, list)
        results2 = await search_repository.search(search_text="cat OR dog")
        assert isinstance(results2, list)
        results3 = await search_repository.search(search_text="project NOT meeting")
        assert isinstance(results3, list)
    @pytest.mark.asyncio
    async def test_permalink_match_exact_with_slash(self, search_repository):
        """Test exact permalink matching with slash (line 249 coverage).

        NOTE(review): "line 249" refers to the repository module and may be
        stale - confirm against the current source.
        """
        # This tests the exact match path: if "/" in permalink_text:
        results = await search_repository.search(permalink_match="test/path")
        assert isinstance(results, list)
        # Intended path: exact equality matching for paths with slashes
        # (not asserted here beyond the return type)
    @pytest.mark.asyncio
    async def test_permalink_match_simple_term(self, search_repository):
        """Test permalink matching with simple term (no slash)."""
        # This tests the simple term path that goes through _prepare_search_term
        results = await search_repository.search(permalink_match="simpleterm")
        assert isinstance(results, list)
        # Intended path: FTS5 MATCH for simple terms
        # (not asserted here beyond the return type)
    @pytest.mark.asyncio
    async def test_fts5_error_handling_database_error(self, search_repository):
        """Test that non-FTS5 database errors are properly re-raised."""
        import unittest.mock
        # Mock the scoped_session to raise a non-FTS5 error
        with unittest.mock.patch("basic_memory.db.scoped_session") as mock_scoped_session:
            mock_session = unittest.mock.AsyncMock()
            # Make `async with scoped_session(...) as session` yield the mock session.
            mock_scoped_session.return_value.__aenter__.return_value = mock_session
            # Simulate a database error that's NOT an FTS5 syntax error
            mock_session.execute.side_effect = Exception("Database connection failed")
            # This should re-raise the exception (not return empty list)
            with pytest.raises(Exception, match="Database connection failed"):
                await search_repository.search(search_text="test")
    @pytest.mark.asyncio
    async def test_version_string_search_integration(self, search_repository, search_entity):
        """Integration test: searching for version strings should work without FTS5 errors."""
        # Index an entity with version information
        search_row = SearchIndexRow(
            id=search_entity.id,
            type=SearchItemType.ENTITY.value,
            title="Basic Memory v0.13.0b2 Release",
            content_stems="basic memory version 0.13.0b2 beta release notes features",
            content_snippet="Basic Memory v0.13.0b2 is a beta release with new features",
            permalink=search_entity.permalink,
            file_path=search_entity.file_path,
            entity_id=search_entity.id,
            metadata={"entity_type": search_entity.entity_type},
            created_at=search_entity.created_at,
            updated_at=search_entity.updated_at,
            project_id=search_repository.project_id,
        )
        await search_repository.index_item(search_row)
        # This should not cause FTS5 syntax errors and should find the entity
        results = await search_repository.search(search_text="Basic Memory v0.13.0b2")
        assert len(results) == 1
        assert results[0].title == "Basic Memory v0.13.0b2 Release"
        # Test other version-like patterns
        results2 = await search_repository.search(search_text="v0.13.0b2")
        # Expected to match the single indexed row.
        # NOTE(review): the original comment credits content_stems, but the title
        # also contains "v0.13.0b2" - which column matches is not pinned here.
        assert len(results2) == 1
        # Test with other problematic patterns
        results3 = await search_repository.search(search_text="node.js version")
        assert isinstance(results3, list)  # Should not crash
    @pytest.mark.asyncio
    async def test_wildcard_only_search(self, search_repository, search_entity):
        """Test that wildcard-only search '*' doesn't cause FTS5 errors (line 243 coverage).

        NOTE(review): "line 243" refers to the repository module and may be
        stale - confirm against the current source.
        """
        # Index an entity for testing
        search_row = SearchIndexRow(
            id=search_entity.id,
            type=SearchItemType.ENTITY.value,
            title="Test Entity",
            content_stems="test entity content",
            content_snippet="This is a test entity",
            permalink=search_entity.permalink,
            file_path=search_entity.file_path,
            entity_id=search_entity.id,
            metadata={"entity_type": search_entity.entity_type},
            created_at=search_entity.created_at,
            updated_at=search_entity.updated_at,
            project_id=search_repository.project_id,
        )
        await search_repository.index_item(search_row)
        # Test wildcard-only search - should not crash and should return results
        results = await search_repository.search(search_text="*")
        assert isinstance(results, list)  # Should not crash
        assert len(results) >= 1  # Should return all results, including our test entity
        # Test empty string search - should also not crash
        results_empty = await search_repository.search(search_text="")
        assert isinstance(results_empty, list)  # Should not crash
        # Test whitespace-only search
        results_whitespace = await search_repository.search(search_text="   ")
        assert isinstance(results_whitespace, list)  # Should not crash
    def test_boolean_query_empty_parts_coverage(self, search_repository):
        """Test Boolean query parsing with empty parts (line 143 coverage).

        NOTE(review): "line 143" refers to the repository module and may be
        stale - confirm against the current source.
        """
        # Create queries that will result in empty parts after splitting.
        # Only the surviving terms are checked, not the full output string.
        result1 = search_repository._prepare_boolean_query(
            "hello AND  AND world"
        )  # Double operator
        assert "hello" in result1 and "world" in result1
        result2 = search_repository._prepare_boolean_query("  OR test")  # Leading operator
        assert "test" in result2
        result3 = search_repository._prepare_boolean_query("test OR  ")  # Trailing operator
        assert "test" in result3
    def test_parenthetical_term_quote_escaping(self, search_repository):
        """Test quote escaping in parenthetical terms (lines 190-191 coverage).

        NOTE(review): "lines 190-191" refer to the repository module and may be
        stale - confirm against the current source.
        """
        # Test term with quotes that needs escaping
        result = search_repository._prepare_parenthetical_term('(say "hello" world)')
        # Should escape quotes by doubling them
        assert '""hello""' in result
        # Test term with single quotes (expected to appear unmodified)
        result2 = search_repository._prepare_parenthetical_term("(it's working)")
        assert "it's working" in result2
    def test_needs_quoting_empty_input(self, search_repository):
        """Test _needs_quoting with empty inputs (line 207 coverage).

        NOTE(review): "line 207" refers to the repository module and may be
        stale - confirm against the current source.
        """
        # Test empty string
        assert not search_repository._needs_quoting("")
        # Test whitespace-only string
        assert not search_repository._needs_quoting("   ")
        # Test a tab-only string (another whitespace-only input; None itself is not tested)
        assert not search_repository._needs_quoting("\t")
    def test_prepare_single_term_empty_input(self, search_repository):
        """Test _prepare_single_term with empty inputs (line 227 coverage).

        NOTE(review): "line 227" refers to the repository module and may be
        stale - confirm against the current source.
        """
        # Test empty string
        result1 = search_repository._prepare_single_term("")
        assert result1 == ""
        # Test whitespace-only string
        result2 = search_repository._prepare_single_term("   ")
        assert result2 == "   "  # Should return as-is
        # Test string that becomes empty after strip
        result3 = search_repository._prepare_single_term("\t\n")
        assert result3 == "\t\n"  # Should return original