#!/usr/bin/env python3
"""Final test to verify URL validation catches problematic URLs."""
import sys
from pathlib import Path

# Add the project root to sys.path so the `src` package import below resolves
sys.path.insert(0, str(Path(__file__).parent))

from src.utils.validation import validate_urls_for_crawling
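
# NOTE: based on how the result is used below, validate_urls_for_crawling is
# assumed to return a dict with a boolean "valid" key, an "error" message when
# validation fails, and a list of normalized "urls" when it succeeds.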


def test_edge_cases():
    """Test edge cases that might cause the original error."""
    print("=" * 60)
    print("TESTING URL VALIDATION FOR CRAWL4AI COMPATIBILITY")
    print("=" * 60)

    # Test cases that should PASS
    valid_test_cases = [
        ["https://example.com"],
        ["https://www.iana.org/help/example-domains"],
        ["http://httpbin.org/html"],
        ["https://example.com", "https://httpbin.org/html"],
        ["file:///path/to/local/file"],
        ["raw:some-content"],
    ]
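
    # Note: the validator is expected to accept only the schemes exercised
    # above (http/https, file://, raw:); anything else should be rejected
    # before it can trigger crawl4ai's "URL must start with..." error.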
print("\n✅ VALID URL TESTS:")
for i, urls in enumerate(valid_test_cases, 1):
result = validate_urls_for_crawling(urls)
status = "✅ PASS" if result["valid"] else "❌ FAIL"
print(f" Test {i}: {status} - {urls}")
if not result["valid"]:
print(f" Error: {result['error']}")
# Test cases that should FAIL (and be caught by our validation)
invalid_test_cases = [
["ftp://invalid.com"], # Unsupported protocol
["javascript:alert(1)"], # Dangerous protocol
["www.example.com"], # Missing protocol
["https://"], # Missing domain
["https:///path"], # Missing domain with path
[""], # Empty URL
["https://example.com", "ftp://bad.com"], # Mixed valid/invalid
["telnet://old.protocol.com"], # Old protocol
["data:text/html,<h1>test</h1>"], # Data URL (not supported)
]
print("\n❌ INVALID URL TESTS (should be caught):")
for i, urls in enumerate(invalid_test_cases, 1):
result = validate_urls_for_crawling(urls)
status = "✅ CAUGHT" if not result["valid"] else "❌ MISSED"
print(f" Test {i}: {status} - {urls}")
if not result["valid"]:
print(f" Error: {result['error']}")
else:
print(" ⚠️ This should have been rejected!")

    # Test the specific URLs from the original error report
    print("\n🔍 ORIGINAL ERROR CASE TESTS:")
    original_urls = [
        ["https://example.com", "https://www.iana.org/help/example-domains"],
    ]
    for i, urls in enumerate(original_urls, 1):
        result = validate_urls_for_crawling(urls)
        status = "✅ VALID" if result["valid"] else "❌ INVALID"
        print(f" Original Test {i}: {status} - {urls}")
        if result["valid"]:
            print(f" Normalized URLs: {result['urls']}")
        else:
            print(f" Error: {result['error']}")

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print("✅ URL validation is now in place")
    print("✅ Invalid URLs will be caught before reaching crawl4ai")
    print("✅ Clear error messages are provided for debugging")
    print("✅ Original problematic URLs are properly validated")
    print("\nThe fix should prevent the 'URL must start with...' error from crawl4ai")


if __name__ == "__main__":
    test_edge_cases()