"""Tests for text processor utilities."""
from mcp_server_builder.utils.doc_fetcher import Page
from mcp_server_builder.utils.text_processor import (
format_display_title,
index_title_variants,
make_snippet,
normalize,
normalize_for_comparison,
title_from_url,
)
class TestNormalize:
    """Tests for the ``normalize`` whitespace-cleanup helper."""

    def test_collapses_whitespace(self) -> None:
        """A run of several spaces between words is reduced to one space."""
        # The input must contain consecutive spaces: with only a single
        # space the expected value equals the input, so the assertion would
        # pass even if normalize() left its argument untouched.
        assert normalize("hello   world") == "hello world"

    def test_strips_edges(self) -> None:
        """Whitespace at the start and end of the input is dropped."""
        assert normalize(" hello ") == "hello"

    def test_handles_newlines(self) -> None:
        """A newline between words comes back as a single space."""
        assert normalize("hello\nworld") == "hello world"

    def test_empty_string(self) -> None:
        """The empty string is returned unchanged."""
        assert normalize("") == ""
class TestTitleFromUrl:
    """Checks on the title that ``title_from_url`` derives from a URL."""

    def test_extracts_slug(self) -> None:
        """A hyphenated last path segment becomes a title-cased phrase."""
        derived = title_from_url("https://example.com/docs/getting-started")
        assert derived == "Getting Started"

    def test_handles_underscores(self) -> None:
        """Underscores in the slug are rendered as spaces."""
        derived = title_from_url("https://example.com/api_reference")
        assert derived == "Api Reference"

    def test_removes_index_files(self) -> None:
        """The word 'index' from an index.html path is not in the title."""
        derived = title_from_url("https://example.com/docs/index.html")
        assert "index" not in derived.lower()

    def test_fallback_to_documentation(self) -> None:
        """A URL with no path segments still produces a non-empty title."""
        derived = title_from_url("https://example.com/")
        # Only truthiness is checked; the exact fallback text is not pinned.
        assert derived
class TestFormatDisplayTitle:
    """Checks on which title source ``format_display_title`` picks."""

    def test_prefers_curated_title(self) -> None:
        """A curated entry for the URL wins over the extracted title."""
        page_url = "https://example.com/page"
        curated = {page_url: "Curated Title"}
        chosen = format_display_title(page_url, "Extracted Title", curated)
        assert chosen == "Curated Title"

    def test_uses_extracted_when_no_curated(self) -> None:
        """With an empty curated mapping the extracted title is returned."""
        page_url = "https://example.com/page"
        chosen = format_display_title(page_url, "Extracted Title", {})
        assert chosen == "Extracted Title"

    def test_uses_url_when_no_extracted(self) -> None:
        """With no extracted title the result is derived from the URL."""
        page_url = "https://example.com/my-page"
        chosen = format_display_title(page_url, None, {})
        # Deliberately loose: accepts any casing of the slug's first word.
        assert "My Page" in chosen or "my" in chosen.lower()

    def test_rejects_generic_titles(self) -> None:
        """An extracted title of 'index' is not used as the display title."""
        page_url = "https://example.com/docs"
        chosen = format_display_title(page_url, "index", {})
        assert chosen.lower() != "index"
class TestIndexTitleVariants:
    """Checks on the output of ``index_title_variants``."""

    def test_includes_original(self) -> None:
        """The title passed in is still present in the result."""
        title = "Hello World"
        variants = index_title_variants(title, "https://example.com/hello")
        assert "Hello World" in variants

    def test_numeric_to_word(self) -> None:
        """For 'Agent2Agent' the lowercased result contains 'to'."""
        title = "Agent2Agent"
        variants = index_title_variants(title, "https://example.com/a2a")
        assert "to" in variants.lower()
class TestNormalizeForComparison:
    """Checks on the canonical form built by ``normalize_for_comparison``."""

    def test_lowercase(self) -> None:
        """Mixed-case input comes back entirely in lower case."""
        assert normalize_for_comparison("Hello WORLD") == "hello world"

    def test_removes_punctuation(self) -> None:
        """Neither the comma nor the exclamation mark survives."""
        cleaned = normalize_for_comparison("Hello, World!")
        for mark in (",", "!"):
            assert mark not in cleaned
class TestMakeSnippet:
    """Checks on the snippet text produced by ``make_snippet``."""

    @staticmethod
    def _page_with(content: str) -> Page:
        """Build the fixture page shared by these tests around *content*."""
        return Page(url="https://test.com", title="Test", content=content)

    def test_returns_title_for_none_page(self) -> None:
        """Passing None as the page yields the supplied title."""
        assert make_snippet(None, "Fallback Title") == "Fallback Title"

    def test_returns_title_for_empty_content(self) -> None:
        """A page whose content is empty yields the supplied title."""
        snippet = make_snippet(self._page_with(""), "Fallback Title")
        assert snippet == "Fallback Title"

    def test_extracts_first_paragraph(self) -> None:
        """The snippet contains text from the first body paragraph."""
        markdown = (
            "# Heading\n"
            "This is the first paragraph that should be extracted.\n"
            "## Another heading\n"
            "More content here.\n"
        )
        snippet = make_snippet(self._page_with(markdown), "Test")
        assert "first paragraph" in snippet

    def test_truncates_long_snippets(self) -> None:
        """Overlong content is cut to max_chars and ends with an ellipsis."""
        oversized = self._page_with("A" * 500)
        snippet = make_snippet(oversized, "Test", max_chars=100)
        assert len(snippet) <= 100
        assert snippet.endswith("…")

    def test_skips_headings(self) -> None:
        """The snippet does not start with a markdown heading marker."""
        markdown = (
            "# Main Heading\n"
            "## Sub Heading\n"
            "This is the actual content.\n"
        )
        snippet = make_snippet(self._page_with(markdown), "Test")
        assert not snippet.startswith("#")