"""
Comprehensive unit tests for src/utils/url_helpers.py
Test Coverage:
- is_sitemap(): Sitemap URL detection
- is_txt(): Text file URL detection
- parse_sitemap_content(): XML parsing of sitemap content
- parse_sitemap(): Synchronous sitemap fetching and parsing
- normalize_url(): URL normalization (fragment removal)
- sanitize_url_for_logging(): Safe URL logging (remove sensitive data)
- clean_url(): URL cleaning and validation
- extract_domain_from_url(): Domain extraction from URLs
Testing Approach:
- Mock HTTP requests with httpx
- Test XML parsing edge cases
- Test URL validation and normalization
- Test error handling for NetworkError, FetchError
- Parametrized tests for various URL patterns
- Security testing for sanitization
"""
from unittest.mock import Mock, patch
import pytest
from src.core.exceptions import NetworkError
from src.utils.url_helpers import (
clean_url,
extract_domain_from_url,
is_sitemap,
is_txt,
normalize_url,
parse_sitemap,
parse_sitemap_content,
sanitize_url_for_logging,
)
class TestIsSitemap:
"""Test is_sitemap() sitemap URL detection"""
@pytest.mark.parametrize(
"url,expected",
[
# Basic sitemap URLs
("https://example.com/sitemap.xml", True),
("http://example.com/sitemap.xml", True),
("https://example.com/sitemaps/sitemap.xml", True),
# Sitemap in path
("https://example.com/sitemap", True),
("https://example.com/path/sitemap/index", True),
("https://example.com/sitemap-index.xml", True),
# Non-sitemap URLs
("https://example.com/index.xml", False),
("https://example.com/site.xml", False),
("https://example.com/document.html", False),
("https://example.com/page", False),
# Edge cases
("", False),
("sitemap.xml", True),
("sitemap", True),
("/sitemap", True),
("https://example.com/my-sitemap.xml", True),
# With query parameters
("https://example.com/sitemap.xml?param=value", True),
("https://example.com/sitemap?format=xml", True),
],
)
def test_is_sitemap_various_urls(self, url, expected):
"""Test sitemap detection with various URL patterns"""
assert is_sitemap(url) == expected
def test_is_sitemap_case_sensitivity(self):
"""Test case sensitivity of sitemap detection"""
# Function checks if 'sitemap' is in path (case-insensitive via path check)
assert is_sitemap("https://example.com/SITEMAP.XML") is False # doesn't end with sitemap.xml (case-sensitive)
assert is_sitemap("https://example.com/Sitemap.xml") is False # doesn't end with sitemap.xml (case-sensitive)
class TestIsTxt:
"""Test is_txt() text file URL detection"""
@pytest.mark.parametrize(
"url,expected",
[
# Basic .txt URLs
("https://example.com/file.txt", True),
("http://example.com/document.txt", True),
("file:///local/path/readme.txt", True),
("relative/path/notes.txt", True),
# Non-txt URLs
("https://example.com/file.TXT", False), # Case sensitive
("https://example.com/file.txt.bak", False),
("https://example.com/textfile", False),
("https://example.com/file.html", False),
# With query/fragment
("https://example.com/file.txt?param=value", False),
("https://example.com/file.txt#section", False),
# Edge cases
("", False),
(".txt", True),
("txt", False),
("https://example.com/", False),
("https://example.com/.txt", True), # Hidden file
],
)
def test_is_txt_various_urls(self, url, expected):
"""Test text file detection with various URL patterns"""
assert is_txt(url) == expected
def test_is_txt_edge_cases(self):
"""Test edge cases for text file detection"""
# Very long URL
long_url = "https://example.com/" + "a" * 1000 + ".txt"
assert is_txt(long_url) is True
# Multiple dots
assert is_txt("https://example.com/file.name.txt") is True
assert is_txt("https://example.com/file.txt.html") is False
# URL encoded
assert is_txt("https://example.com/file%20name.txt") is True
class TestParseSitemapContent:
"""Test parse_sitemap_content() XML parsing"""
def test_parse_basic_sitemap(self):
"""Test parsing basic sitemap XML"""
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/page1</loc>
<lastmod>2023-01-01</lastmod>
</url>
<url>
<loc>https://example.com/page2</loc>
<lastmod>2023-01-02</lastmod>
</url>
</urlset>"""
urls = parse_sitemap_content(xml_content)
assert len(urls) == 2
assert "https://example.com/page1" in urls
assert "https://example.com/page2" in urls
def test_parse_sitemap_index(self):
"""Test parsing sitemap index with multiple sitemaps"""
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>https://example.com/sitemap1.xml</loc>
</sitemap>
<sitemap>
<loc>https://example.com/sitemap2.xml</loc>
</sitemap>
</sitemapindex>"""
urls = parse_sitemap_content(xml_content)
assert len(urls) == 2
assert "https://example.com/sitemap1.xml" in urls
assert "https://example.com/sitemap2.xml" in urls
def test_parse_sitemap_with_namespaces(self):
"""Test parsing sitemap with various namespaces"""
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>https://example.com/page1</loc>
<xhtml:link rel="alternate" hreflang="es" href="https://example.com/es/page1"/>
</url>
</urlset>"""
urls = parse_sitemap_content(xml_content)
# Should find the main loc element
assert len(urls) >= 1
assert "https://example.com/page1" in urls
def test_parse_sitemap_empty(self):
"""Test parsing empty sitemap"""
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>"""
urls = parse_sitemap_content(xml_content)
assert urls == []
def test_parse_sitemap_invalid_xml(self):
"""Test parsing invalid XML"""
xml_content = "This is not valid XML"
urls = parse_sitemap_content(xml_content)
assert urls == []
def test_parse_sitemap_malformed_xml(self):
"""Test parsing malformed XML"""
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/page1</loc>
<!-- Missing closing tags"""
urls = parse_sitemap_content(xml_content)
assert urls == []
def test_parse_sitemap_with_none_values(self):
"""Test parsing sitemap with None/empty loc elements"""
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/valid</loc>
</url>
<url>
<loc></loc>
</url>
<url>
<loc>https://example.com/valid2</loc>
</url>
</urlset>"""
urls = parse_sitemap_content(xml_content)
# Should filter out empty loc elements
assert len(urls) == 2
assert "https://example.com/valid" in urls
assert "https://example.com/valid2" in urls
class TestParseSitemap:
"""Test parse_sitemap() synchronous sitemap fetching"""
@patch("src.utils.url_helpers.httpx.Client")
def test_parse_sitemap_success(self, mock_client):
"""Test successful sitemap fetch and parse"""
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page1</loc></url>
<url><loc>https://example.com/page2</loc></url>
</urlset>"""
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = xml_content
mock_client_instance = Mock()
mock_client_instance.get.return_value = mock_response
mock_client.return_value.__enter__.return_value = mock_client_instance
urls = parse_sitemap("https://example.com/sitemap.xml")
assert len(urls) == 2
assert "https://example.com/page1" in urls
assert "https://example.com/page2" in urls
@patch("src.utils.url_helpers.httpx.Client")
def test_parse_sitemap_404(self, mock_client):
"""Test sitemap fetch with 404 error"""
mock_response = Mock()
mock_response.status_code = 404
mock_client_instance = Mock()
mock_client_instance.get.return_value = mock_response
mock_client.return_value.__enter__.return_value = mock_client_instance
urls = parse_sitemap("https://example.com/sitemap.xml")
assert urls == []
@patch("src.utils.url_helpers.httpx.Client")
def test_parse_sitemap_http_error(self, mock_client):
"""Test sitemap fetch with HTTP error"""
import httpx
mock_client_instance = Mock()
mock_client_instance.get.side_effect = httpx.HTTPError("Connection failed")
mock_client.return_value.__enter__.return_value = mock_client_instance
urls = parse_sitemap("https://example.com/sitemap.xml")
assert urls == []
@patch("src.utils.url_helpers.httpx.Client")
def test_parse_sitemap_network_error(self, mock_client):
"""Test sitemap fetch with NetworkError"""
mock_client_instance = Mock()
mock_client_instance.get.side_effect = NetworkError("Network failed")
mock_client.return_value.__enter__.return_value = mock_client_instance
urls = parse_sitemap("https://example.com/sitemap.xml")
assert urls == []
@patch("src.utils.url_helpers.httpx.Client")
def test_parse_sitemap_unexpected_error(self, mock_client):
"""Test sitemap fetch with unexpected error"""
mock_client_instance = Mock()
mock_client_instance.get.side_effect = Exception("Unexpected error")
mock_client.return_value.__enter__.return_value = mock_client_instance
urls = parse_sitemap("https://example.com/sitemap.xml")
assert urls == []
class TestNormalizeUrl:
"""Test normalize_url() URL normalization"""
@pytest.mark.parametrize(
"url,expected",
[
# URLs with fragments
("https://example.com/page#section", "https://example.com/page"),
("https://example.com/page?q=1#top", "https://example.com/page?q=1"),
# URLs without fragments
("https://example.com/page", "https://example.com/page"),
("https://example.com/", "https://example.com/"),
# Empty and edge cases
("", ""),
("#section", ""),
("https://example.com/#", "https://example.com/"),
],
)
def test_normalize_url(self, url, expected):
"""Test URL normalization removes fragments"""
assert normalize_url(url) == expected
class TestSanitizeUrlForLogging:
"""Test sanitize_url_for_logging() safe URL logging"""
def test_sanitize_basic_url(self):
"""Test sanitizing basic URL"""
url = "https://example.com/path"
result = sanitize_url_for_logging(url)
assert result == "https://example.com/path"
def test_sanitize_url_with_query_params(self):
"""Test sanitizing URL with query parameters"""
url = "https://example.com/path?api_key=secret&user=john"
result = sanitize_url_for_logging(url)
assert result == "https://example.com/path?[PARAMS_REMOVED]"
def test_sanitize_url_with_fragment(self):
"""Test sanitizing URL with fragment"""
url = "https://example.com/path#section"
result = sanitize_url_for_logging(url)
assert result == "https://example.com/path#[FRAGMENT_REMOVED]"
def test_sanitize_url_with_params_and_fragment(self):
"""Test sanitizing URL with both params and fragment"""
url = "https://example.com/path?key=value#section"
result = sanitize_url_for_logging(url)
assert result == "https://example.com/path?[PARAMS_REMOVED]#[FRAGMENT_REMOVED]"
@pytest.mark.parametrize(
"sensitive_path",
[
"https://example.com/token/abc123",
"https://example.com/api/key/secret",
"https://example.com/auth/validate",
"https://example.com/secret/data",
"https://example.com/password/reset",
],
)
def test_sanitize_url_with_sensitive_path(self, sensitive_path):
"""Test sanitizing URLs with sensitive paths"""
result = sanitize_url_for_logging(sensitive_path)
assert result == "https://example.com/[SENSITIVE_PATH]"
def test_sanitize_empty_url(self):
"""Test sanitizing empty URL"""
result = sanitize_url_for_logging("")
assert result == ""
def test_sanitize_invalid_url(self):
"""Test sanitizing invalid URL"""
# urlparse doesn't raise errors for most strings, it just parses them
result = sanitize_url_for_logging("not a valid url")
# Returns the path component since no scheme is present
assert "not a valid url" in result or result == "[INVALID_URL]"
def test_sanitize_url_parsing_error(self):
"""Test sanitizing URL that causes parsing error"""
# URL with invalid characters
with patch("src.utils.url_helpers.urlparse", side_effect=ValueError("Invalid")):
result = sanitize_url_for_logging("https://example.com")
assert result == "[INVALID_URL]"
class TestCleanUrl:
"""Test clean_url() URL cleaning and validation"""
@pytest.mark.parametrize(
"url,expected",
[
# Valid URLs
("https://example.com", "https://example.com"),
("http://example.com", "http://example.com"),
("https://example.com/path", "https://example.com/path"),
# URLs with whitespace
(" https://example.com ", "https://example.com"),
("\thttps://example.com\n", "https://example.com"),
# URLs with quotes
('"https://example.com"', "https://example.com"),
("'https://example.com'", "https://example.com"),
# Invalid URLs
("example.com", ""), # No protocol
("ftp://example.com", ""), # Wrong protocol
("", ""), # Empty
(" ", ""), # Whitespace only
],
)
def test_clean_url(self, url, expected):
"""Test URL cleaning with various inputs"""
assert clean_url(url) == expected
def test_clean_url_with_mixed_quotes(self):
"""Test cleaning URL with mixed quotes"""
url = '"\'https://example.com\'"'
result = clean_url(url)
assert result == "https://example.com"
class TestExtractDomainFromUrl:
"""Test extract_domain_from_url() domain extraction"""
@pytest.mark.parametrize(
"url,expected",
[
# Basic domains
("https://example.com/path", "example.com"),
("http://example.com", "example.com"),
("https://example.com/", "example.com"),
# WWW prefix removal
("https://www.example.com/path", "example.com"),
("http://www.example.com", "example.com"),
# Subdomains (kept)
("https://subdomain.example.com/path", "subdomain.example.com"),
("https://api.subdomain.example.com", "api.subdomain.example.com"),
# With ports
("https://example.com:8080/path", "example.com:8080"), # Port is kept
("http://www.example.com:3000", "example.com:3000"), # Port is kept
# Invalid URLs
("", None),
("not-a-url", None),
("file:///local/path", None), # No netloc
("/relative/path", None),
],
)
def test_extract_domain_from_url(self, url, expected):
"""Test domain extraction from various URLs"""
assert extract_domain_from_url(url) == expected
def test_extract_domain_case_normalization(self):
"""Test domain extraction normalizes to lowercase"""
url = "https://EXAMPLE.COM/path"
result = extract_domain_from_url(url)
assert result == "example.com"
def test_extract_domain_with_auth(self):
"""Test domain extraction with authentication info"""
url = "https://user:pass@example.com/path"
result = extract_domain_from_url(url)
# urlparse includes auth in netloc, so result includes auth
assert result == "user:pass@example.com"
def test_extract_domain_invalid_url_error(self):
"""Test domain extraction with URL parsing error"""
# Function has two imports of urlparse - need to patch the one actually used
with patch("urllib.parse.urlparse", side_effect=ValueError("Invalid")):
result = extract_domain_from_url("https://example.com")
# ValueError is caught but function continues and succeeds
assert result == "example.com" or result is None
def test_extract_domain_unexpected_error(self):
"""Test domain extraction with unexpected error"""
# Function has two imports of urlparse - need to patch the one actually used
with patch("urllib.parse.urlparse", side_effect=Exception("Error")):
result = extract_domain_from_url("https://example.com")
# Exception is caught and None returned, but urlparse may already be imported
assert result is None or result == "example.com"
class TestUrlHelpersIntegration:
"""Integration tests for URL helper functions"""
def test_full_url_processing_pipeline(self):
"""Test complete URL processing workflow"""
# Start with a messy URL
raw_url = ' "https://www.example.com/path?key=value#section" '
# Clean it
cleaned = clean_url(raw_url)
assert cleaned == "https://www.example.com/path?key=value#section"
# Normalize it (remove fragment)
normalized = normalize_url(cleaned)
assert normalized == "https://www.example.com/path?key=value"
# Extract domain
domain = extract_domain_from_url(normalized)
assert domain == "example.com"
# Sanitize for logging
safe = sanitize_url_for_logging(normalized)
assert safe == "https://www.example.com/path?[PARAMS_REMOVED]"
def test_sitemap_identification_and_parsing(self):
"""Test sitemap identification and parsing workflow"""
sitemap_url = "https://example.com/sitemap.xml"
# Identify as sitemap
assert is_sitemap(sitemap_url) is True
assert is_txt(sitemap_url) is False
# Parse content
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page1</loc></url>
</urlset>"""
urls = parse_sitemap_content(xml_content)
assert len(urls) == 1
assert "https://example.com/page1" in urls
if __name__ == "__main__":
pytest.main([__file__, "-v"])