"""Unit tests for URL discovery logic."""
import pytest
from ragstack_common.scraper.discovery import (
extract_links,
filter_discovered_urls,
get_url_depth,
matches_patterns,
normalize_url,
should_crawl,
)
from ragstack_common.scraper.models import ScrapeConfig, ScrapeScope


class TestNormalizeUrl:
    """Tests for normalize_url function."""

    def test_removes_fragments(self):
        url = "https://example.com/page#section"
        assert normalize_url(url) == "https://example.com/page"

    def test_normalizes_trailing_slash(self):
        url = "https://example.com/page/"
        assert normalize_url(url) == "https://example.com/page"

    def test_preserves_root_trailing_slash(self):
        # The root path may keep or drop its trailing slash; either form
        # is acceptable.
        url = "https://example.com/"
        result = normalize_url(url)
        assert result in ("https://example.com/", "https://example.com")

    def test_lowercases_hostname(self):
        url = "https://EXAMPLE.COM/Page"
        result = normalize_url(url)
        assert "example.com" in result
        # Path case should be preserved.
        assert "/Page" in result

    def test_preserves_query_params(self):
        url = "https://example.com/page?foo=bar"
        assert normalize_url(url) == "https://example.com/page?foo=bar"

    def test_complex_url(self):
        url = "https://EXAMPLE.COM/path/to/page/?query=value#fragment"
        result = normalize_url(url)
        assert result == "https://example.com/path/to/page?query=value"
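
    def test_default_port_handling(self):
        # Hedged addition: whether normalize_url strips default ports is not
        # pinned down elsewhere in this suite, so this accepts either form
        # and documents the question rather than constraining the answer.
        url = "https://example.com:443/page"
        result = normalize_url(url)
        assert result in (
            "https://example.com/page",
            "https://example.com:443/page",
        )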


class TestExtractLinks:
    """Tests for extract_links function."""

    def test_extracts_anchor_hrefs(self):
        html = '<html><body><a href="https://example.com/page1">Link</a></body></html>'
        links = extract_links(html, "https://example.com/")
        assert "https://example.com/page1" in links

    def test_resolves_root_relative_links(self):
        html = '<html><body><a href="/page">Link</a></body></html>'
        links = extract_links(html, "https://example.com/base/")
        assert "https://example.com/page" in links

    def test_resolves_relative_paths(self):
        html = '<html><body><a href="subpage">Link</a></body></html>'
        links = extract_links(html, "https://example.com/docs/")
        assert "https://example.com/docs/subpage" in links

    def test_ignores_fragment_only_links(self):
        html = '<html><body><a href="#section">Link</a></body></html>'
        links = extract_links(html, "https://example.com/")
        assert len(links) == 0

    def test_ignores_javascript_links(self):
        html = '<html><body><a href="javascript:void(0)">Link</a></body></html>'
        links = extract_links(html, "https://example.com/")
        assert len(links) == 0

    def test_ignores_mailto_links(self):
        html = '<html><body><a href="mailto:test@example.com">Link</a></body></html>'
        links = extract_links(html, "https://example.com/")
        assert len(links) == 0

    def test_extracts_multiple_links(self):
        html = """
        <html><body>
            <a href="/page1">Link 1</a>
            <a href="/page2">Link 2</a>
            <a href="https://other.com">External</a>
        </body></html>
        """
        links = extract_links(html, "https://example.com/")
        assert len(links) == 3
        assert "https://example.com/page1" in links
        assert "https://example.com/page2" in links
        assert "https://other.com" in links

    def test_deduplicates_links(self):
        html = """
        <html><body>
            <a href="/page">Link 1</a>
            <a href="/page">Link 2</a>
            <a href="/page/">Link 3</a>
        </body></html>
        """
        links = extract_links(html, "https://example.com/")
        # All three hrefs normalize to the same URL, so exactly one entry
        # should remain.
        assert links == ["https://example.com/page"]

    def test_empty_html(self):
        links = extract_links("", "https://example.com/")
        assert links == []
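
    def test_returns_only_http_links(self):
        # Hedged addition: the tests above pin down javascript: and mailto:
        # but not other non-web schemes such as tel:. This assumes anything
        # extract_links returns uses an http(s) scheme; treat it as a sketch
        # of the intended contract rather than documented behavior.
        html = '<html><body><a href="tel:+15551234567">Call</a></body></html>'
        links = extract_links(html, "https://example.com/")
        assert all(link.startswith(("http://", "https://")) for link in links)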


class TestShouldCrawl:
    """Tests for should_crawl function."""

    def test_same_hostname_allowed(self):
        config = ScrapeConfig(scope=ScrapeScope.HOSTNAME)
        assert should_crawl(
            "https://example.com/page",
            "https://example.com/",
            config,
        )

    def test_different_hostname_blocked(self):
        config = ScrapeConfig(scope=ScrapeScope.HOSTNAME)
        assert not should_crawl(
            "https://other.com/page",
            "https://example.com/",
            config,
        )

    def test_subpages_scope_same_path(self):
        config = ScrapeConfig(scope=ScrapeScope.SUBPAGES)
        assert should_crawl(
            "https://example.com/docs/page",
            "https://example.com/docs/",
            config,
        )

    def test_subpages_scope_different_path(self):
        config = ScrapeConfig(scope=ScrapeScope.SUBPAGES)
        assert not should_crawl(
            "https://example.com/blog/post",
            "https://example.com/docs/",
            config,
        )

    def test_subpages_scope_exact_base(self):
        config = ScrapeConfig(scope=ScrapeScope.SUBPAGES)
        assert should_crawl(
            "https://example.com/docs",
            "https://example.com/docs",
            config,
        )

    def test_domain_scope_includes_subdomains(self):
        config = ScrapeConfig(scope=ScrapeScope.DOMAIN)
        assert should_crawl(
            "https://blog.example.com/post",
            "https://example.com/",
            config,
        )

    def test_domain_scope_different_domain(self):
        config = ScrapeConfig(scope=ScrapeScope.DOMAIN)
        assert not should_crawl(
            "https://other.com/page",
            "https://example.com/",
            config,
        )

    def test_include_patterns(self):
        config = ScrapeConfig(
            scope=ScrapeScope.HOSTNAME,
            include_patterns=["*/docs/*"],
        )
        assert should_crawl(
            "https://example.com/docs/page",
            "https://example.com/",
            config,
        )
        assert not should_crawl(
            "https://example.com/blog/page",
            "https://example.com/",
            config,
        )

    def test_exclude_patterns(self):
        config = ScrapeConfig(
            scope=ScrapeScope.HOSTNAME,
            exclude_patterns=["*/login*", "*/admin/*"],
        )
        assert should_crawl(
            "https://example.com/docs/page",
            "https://example.com/",
            config,
        )
        assert not should_crawl(
            "https://example.com/login",
            "https://example.com/",
            config,
        )
        assert not should_crawl(
            "https://example.com/admin/dashboard",
            "https://example.com/",
            config,
        )
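
    def test_exclude_overrides_include(self):
        # Hedged addition: this assumes exclude_patterns take precedence
        # over include_patterns when both match, the conventional allow/deny
        # ordering. Adjust if should_crawl resolves the conflict differently.
        config = ScrapeConfig(
            scope=ScrapeScope.HOSTNAME,
            include_patterns=["*/docs/*"],
            exclude_patterns=["*/docs/private/*"],
        )
        assert not should_crawl(
            "https://example.com/docs/private/page",
            "https://example.com/",
            config,
        )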


class TestMatchesPatterns:
    """Tests for matches_patterns function."""

    def test_matches_glob_star(self):
        assert matches_patterns("https://example.com/docs/page", ["*/docs/*"])

    def test_matches_exact(self):
        assert matches_patterns("https://example.com/page", ["https://example.com/page"])

    def test_no_match(self):
        assert not matches_patterns("https://example.com/page", ["*/other/*"])

    def test_multiple_patterns(self):
        patterns = ["*/docs/*", "*/api/*"]
        assert matches_patterns("https://example.com/api/v1", patterns)
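
    def test_empty_pattern_list(self):
        # Hedged addition: assumes an empty pattern list matches nothing
        # (any() over zero patterns), rather than acting as a match-all; a
        # "crawl everything" default would then live in should_crawl instead.
        assert not matches_patterns("https://example.com/page", [])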


class TestGetUrlDepth:
    """Tests for get_url_depth function."""

    def test_base_url_depth_zero(self):
        assert get_url_depth("https://example.com/", "https://example.com/") == 0

    def test_subpage_depth(self):
        assert get_url_depth("https://example.com/docs", "https://example.com/") == 1
        assert get_url_depth("https://example.com/docs/page", "https://example.com/") == 2

    def test_depth_relative_to_base(self):
        assert (
            get_url_depth(
                "https://example.com/docs/api/v1",
                "https://example.com/docs",
            )
            == 2
        )

    def test_different_hostname_zero_depth(self):
        assert get_url_depth("https://other.com/page", "https://example.com/") == 0
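
    def test_depth_ignores_trailing_slash(self):
        # Hedged addition: assumes URLs are normalized before path segments
        # are counted, so a trailing slash does not add a segment.
        assert get_url_depth("https://example.com/docs/", "https://example.com/") == 1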


class TestFilterDiscoveredUrls:
    """Tests for filter_discovered_urls function."""

    def test_filters_visited(self):
        config = ScrapeConfig(scope=ScrapeScope.HOSTNAME)
        urls = ["https://example.com/page1", "https://example.com/page2"]
        visited = {"https://example.com/page1"}
        filtered = filter_discovered_urls(urls, "https://example.com/", config, visited)
        assert "https://example.com/page1" not in filtered
        assert "https://example.com/page2" in filtered

    def test_filters_out_of_scope(self):
        config = ScrapeConfig(scope=ScrapeScope.HOSTNAME)
        urls = ["https://example.com/page", "https://other.com/page"]
        filtered = filter_discovered_urls(urls, "https://example.com/", config, set())
        assert "https://example.com/page" in filtered
        assert "https://other.com/page" not in filtered

    def test_empty_urls(self):
        config = ScrapeConfig()
        filtered = filter_discovered_urls([], "https://example.com/", config, set())
        assert filtered == []
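
    def test_preserves_discovery_order(self):
        # Hedged addition: assumes filtering preserves the input order, which
        # a breadth-first crawler consuming this list would rely on. The
        # inputs are pre-normalized so normalization cannot reorder them.
        config = ScrapeConfig(scope=ScrapeScope.HOSTNAME)
        urls = ["https://example.com/b", "https://example.com/a"]
        filtered = filter_discovered_urls(urls, "https://example.com/", config, set())
        assert filtered == urls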


if __name__ == "__main__":
    pytest.main([__file__, "-v"])