MCP Server Builder

Overview Inspect Schema Related Servers Score Discussions

MIT License

doc_fetcher.py•4.68 kB

"""Document fetching and parsing utilities."""

import html
import re
import urllib.request
from typing import Final

from pydantic import BaseModel, Field

from ..config import doc_config
from .url_validator import URLValidationError, validate_urls

# Regex patterns for parsing
_MD_LINK: Final[re.Pattern[str]] = re.compile(r"\[([^\]]+)\]\(([^\)]+)\)")
_HTML_BLOCK: Final[re.Pattern[str]] = re.compile(r"(?is)<(script|style|noscript).*?>.*?</\1>")
_TAG: Final[re.Pattern[str]] = re.compile(r"(?s)<[^>]+>")
_TITLE_TAG: Final[re.Pattern[str]] = re.compile(r"(?is)<title[^>]*>(.*?)</title>")
_H1_TAG: Final[re.Pattern[str]] = re.compile(r"(?is)<h1[^>]*>(.*?)</h1>")
_META_OG: Final[re.Pattern[str]] = re.compile(
    r'(?is)<meta[^>]+property=["\']og:title["\'][^>]+content=["\'](.*?)["\']'
)


class Page(BaseModel):
    """Represents a fetched and cleaned documentation page.

    Attributes:
        url: The source URL of the page
        title: Extracted or derived title of the page
        content: Cleaned text content of the page
    """

    url: str = Field(description="The source URL of the page")
    title: str = Field(description="Page title (extracted or derived)")
    content: str = Field(description="Cleaned text content of the page")


def _get(url: str) -> str:
    """Fetch content from a URL with proper headers and timeout.

    The body is decoded with the charset declared in the response's
    Content-Type header. UTF-8 is used when no charset is declared or when
    the declared one is not a usable text encoding. Undecodable bytes are
    dropped rather than raising.

    Args:
        url: The URL to fetch

    Returns:
        The decoded text content of the response

    Raises:
        urllib.error.URLError: If the request fails
    """
    req = urllib.request.Request(url, headers={"User-Agent": doc_config.user_agent})
    with urllib.request.urlopen(req, timeout=doc_config.timeout) as r:  # noqa: S310
        body = r.read()
        declared = r.headers.get_content_charset()

    # get_content_charset() returns None when the header names no charset.
    charset = declared if isinstance(declared, str) and declared else "utf-8"
    try:
        return body.decode(charset, errors="ignore")
    except (LookupError, ValueError):
        # Unknown or non-text codec name sent by the server: fall back to UTF-8.
        return body.decode("utf-8", errors="ignore")


def parse_llms_txt(url: str) -> list[tuple[str, str]]:
    """Parse an llms.txt file and extract document links.

    Every markdown link ``[title](target)`` in the file is considered. Links
    whose target is rejected by ``validate_urls`` are skipped.

    Args:
        url: URL of the llms.txt file to parse

    Returns:
        List of (title, url) tuples extracted from markdown links

    Raises:
        urllib.error.URLError: If fetching the llms.txt file fails
    """
    # NOTE(review): unlike fetch_and_clean, the index URL itself is fetched
    # without going through validate_urls - confirm callers validate it.
    txt = _get(url)
    links: list[tuple[str, str]] = []

    for match in _MD_LINK.finditer(txt):
        doc_url = match.group(2).strip()
        # A link whose text is only whitespace falls back to its target.
        title = match.group(1).strip() or doc_url
        try:
            validated_urls = validate_urls(doc_url)
            links.append((title, validated_urls[0]))
        except URLValidationError:
            # Skip invalid URLs silently
            continue

    return links


def _html_to_text(raw_html: str) -> str:
    """Convert HTML to plain text using stdlib only.

    Args:
        raw_html: Raw HTML content to convert

    Returns:
        Plain text with HTML tags removed and entities unescaped
    """
    # Remove script/style/noscript blocks together with their contents
    stripped = _HTML_BLOCK.sub("", raw_html)
    # Drop the remaining tags, leaving a space so adjacent words stay apart
    stripped = _TAG.sub(" ", stripped)
    # Unescape HTML entities
    stripped = html.unescape(stripped)
    # Trim every line and remove the empty ones
    lines = [ln.strip() for ln in stripped.splitlines()]
    return "\n".join(ln for ln in lines if ln)


def _normalize_title(fragment: str) -> str:
    """Unescape entities and collapse whitespace runs in a title fragment."""
    return " ".join(html.unescape(fragment).split())


def _extract_html_title(raw_html: str) -> str | None:
    """Extract title from HTML content using multiple strategies.

    The strategies are tried in order: the ``<title>`` tag, the ``og:title``
    meta tag, then the first ``<h1>``. A candidate that is empty after
    cleaning is ignored so that the next strategy still gets a chance.

    Args:
        raw_html: Raw HTML content to extract title from

    Returns:
        Extracted title string, or None if no non-empty title found
    """
    # Try <title> tag
    match = _TITLE_TAG.search(raw_html)
    if match:
        title = _normalize_title(match.group(1))
        if title:
            return title

    # Try og:title meta tag
    match = _META_OG.search(raw_html)
    if match:
        title = _normalize_title(match.group(1))
        if title:
            return title

    # Try <h1> tag; headings may contain nested markup, so drop tags first
    match = _H1_TAG.search(raw_html)
    if match:
        title = _normalize_title(_TAG.sub(" ", match.group(1)))
        if title:
            return title

    return None


def fetch_and_clean(page_url: str) -> Page:
    """Fetch a web page and return cleaned content.

    HTML responses are reduced to plain text and titled from their markup
    when possible. Anything else is returned verbatim. When no title can be
    extracted, the last path segment of the URL is used, or the whole URL if
    that segment is empty.

    Args:
        page_url: URL of the page to fetch

    Returns:
        Page object with URL, title, and cleaned content

    Raises:
        URLValidationError: If the URL is not allowed
        urllib.error.URLError: If the request fails
    """
    validated_url = validate_urls(page_url)[0]
    raw = _get(validated_url)
    fallback_title = validated_url.rsplit("/", 1)[-1] or validated_url

    # Check if it's HTML content
    lower = raw.lower()
    if any(marker in lower for marker in ("<html", "<head", "<body")):
        return Page(
            url=validated_url,
            title=_extract_html_title(raw) or fallback_title,
            content=_html_to_text(raw),
        )

    # Plain text (e.g., markdown)
    return Page(url=validated_url, title=fallback_title, content=raw)

Latest Blog Posts

The State of MCP in 2025: Who's Building What and Why It Matters
By punkpeye on December 7, 2025.
mcp
startups
MCP hosting with persistent storage
By punkpeye on December 6, 2025.
changelog
The Operational Metrics and Agent Analytics Driving Successful Model Context Protocol (MCP) Servers
By Om-Shree-0709 on December 6, 2025.
observability
mcp
Agent-Analytics

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/praveenc/mcp-server-builder'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.