"""Recursive link crawling utilities.
This module provides recursive internal link crawling functionality
for discovering and crawling related pages within a website.
"""
import logging
from typing import Any

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    MemoryAdaptiveDispatcher,
)
from crawl4ai.async_logger import AsyncLoggerBase
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

from src.utils.url_helpers import normalize_url

from .memory import track_memory
logger = logging.getLogger(__name__)
async def crawl_recursive_internal_links(
browser_config: BrowserConfig,
start_urls: list[str],
dispatcher: MemoryAdaptiveDispatcher,
max_depth: int = 3,
crawl4ai_logger: AsyncLoggerBase | None = None,
) -> list[dict[str, Any]]:
"""Recursively crawl internal links from start URLs up to a maximum depth.

    Args:
        browser_config: BrowserConfig used to create the crawler instance
        start_urls: List of starting URLs to crawl
        dispatcher: Shared MemoryAdaptiveDispatcher for global concurrency control
        max_depth: Maximum recursion depth
        crawl4ai_logger: Optional crawl4ai logger passed through to AsyncWebCrawler

    Returns:
        List of dictionaries with "url" and "markdown" keys
    """
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
stream=False,
excluded_tags=["nav", "footer", "header", "aside", "script", "style"],
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.4,
threshold_type="fixed",
min_word_threshold=20,
),
),
)
    # Use the shared dispatcher supplied by the caller for global concurrency control
visited = set()
current_urls = {normalize_url(u) for u in start_urls}
results_all = []
crawler_kwargs: dict[str, Any] = {"config": browser_config}
if crawl4ai_logger is not None:
crawler_kwargs["logger"] = crawl4ai_logger
async with AsyncWebCrawler(**crawler_kwargs) as crawler:
for depth in range(max_depth):
            urls_to_crawl = [url for url in current_urls if url not in visited]
if not urls_to_crawl:
break
async with track_memory(
f"recursive_crawl(depth={depth}, urls={len(urls_to_crawl)})",
) as mem_ctx:
result_container = await crawler.arun_many(
urls=urls_to_crawl,
config=run_config,
dispatcher=dispatcher,
)
assert isinstance(result_container, list), "Expected list in batch mode"
results = result_container
mem_ctx["results"] = results
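            # Record successful pages and queue newly discovered internal links
            # for the next depth level.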
next_level_urls = set()
for result in results:
norm_url = normalize_url(result.url)
visited.add(norm_url)
if result.success and result.markdown:
content = (
result.markdown.fit_markdown
if result.markdown.fit_markdown
else result.markdown.raw_markdown
)
if content:
results_all.append({"url": result.url, "markdown": content})
for link in result.links.get("internal", []):
next_url = normalize_url(link["href"])
if next_url not in visited:
next_level_urls.add(next_url)
current_urls = next_level_urls
return results_all
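
# Illustrative usage sketch (assumptions, not part of the library surface):
# the default BrowserConfig / MemoryAdaptiveDispatcher settings and the
# placeholder start URL below are examples only; adjust for your environment.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        pages = await crawl_recursive_internal_links(
            browser_config=BrowserConfig(headless=True),
            start_urls=["https://example.com"],
            dispatcher=MemoryAdaptiveDispatcher(),
            max_depth=2,
        )
        for page in pages:
            logger.info("Crawled %s (%d chars)", page["url"], len(page["markdown"]))

    asyncio.run(_demo())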