"""Extract links from HTML pages."""
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from .models import LinkInfo
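
# LinkInfo is imported from a sibling module; judging from how it is
# constructed below, it is assumed to be a simple record with url,
# reference, and element_type string fields, e.g. a dataclass such as:
#
#     @dataclass(frozen=True)
#     class LinkInfo:
#         url: str
#         reference: str
#         element_type: str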


class LinkExtractor:
    """Extract various types of links from HTML content."""

    def extract_links(self, html: str, base_url: str) -> list[LinkInfo]:
        """
        Extract all links from HTML content.

        Args:
            html: The HTML content to parse
            base_url: The base URL for resolving relative links

        Returns:
            List of LinkInfo objects containing link details
        """
        soup = BeautifulSoup(html, "html.parser")
        links = []

        # Extract hyperlinks (<a href>)
        for tag in soup.find_all("a", href=True):
            href = tag["href"]
            absolute_url = urljoin(base_url, href)
            reference = tag.get_text(strip=True) or "<a>"
            links.append(
                LinkInfo(url=absolute_url, reference=reference, element_type="a")
            )

        # Extract image sources (<img src>)
        for tag in soup.find_all("img", src=True):
            src = tag["src"]
            absolute_url = urljoin(base_url, src)
            alt_text = tag.get("alt", "").strip()
            reference = f'<img alt="{alt_text}">' if alt_text else "<img>"
            links.append(
                LinkInfo(url=absolute_url, reference=reference, element_type="img")
            )

        # Extract script sources (<script src>)
        for tag in soup.find_all("script", src=True):
            src = tag["src"]
            absolute_url = urljoin(base_url, src)
            links.append(
                LinkInfo(url=absolute_url, reference="<script>", element_type="script")
            )

        # Extract stylesheet and other <link href> targets
        for tag in soup.find_all("link", href=True):
            href = tag["href"]
            absolute_url = urljoin(base_url, href)
            # BeautifulSoup exposes rel as a list of values (it is a
            # multi-valued attribute); guard against None and the empty
            # list produced by rel="" to avoid an IndexError.
            rel_values = tag.get("rel") or []
            if isinstance(rel_values, str):
                rel_values = [rel_values]
            rel = rel_values[0] if rel_values else ""
            reference = f'<link rel="{rel}">' if rel else "<link>"
            links.append(
                LinkInfo(url=absolute_url, reference=reference, element_type="link")
            )

        # Extract video sources
        for tag in soup.find_all("video"):
            # Check for a src attribute on the <video> tag itself
            if tag.get("src"):
                absolute_url = urljoin(base_url, tag["src"])
                links.append(
                    LinkInfo(
                        url=absolute_url, reference="<video>", element_type="video"
                    )
                )
            # Check for <source> children
            for source in tag.find_all("source", src=True):
                absolute_url = urljoin(base_url, source["src"])
                links.append(
                    LinkInfo(
                        url=absolute_url,
                        reference="<video><source>",
                        element_type="video",
                    )
                )

        # Extract audio sources
        for tag in soup.find_all("audio"):
            # Check for a src attribute on the <audio> tag itself
            if tag.get("src"):
                absolute_url = urljoin(base_url, tag["src"])
                links.append(
                    LinkInfo(
                        url=absolute_url, reference="<audio>", element_type="audio"
                    )
                )
            # Check for <source> children
            for source in tag.find_all("source", src=True):
                absolute_url = urljoin(base_url, source["src"])
                links.append(
                    LinkInfo(
                        url=absolute_url,
                        reference="<audio><source>",
                        element_type="audio",
                    )
                )

        # Extract iframe sources (<iframe src>)
        for tag in soup.find_all("iframe", src=True):
            src = tag["src"]
            absolute_url = urljoin(base_url, src)
            links.append(
                LinkInfo(url=absolute_url, reference="<iframe>", element_type="iframe")
            )

        return links

    def is_internal_link(self, url: str, domain: str) -> bool:
        """
        Check if a URL belongs to the same domain.

        Args:
            url: The URL to check
            domain: The domain to compare against (a full URL such as
                "https://example.com" or a bare host like "example.com")

        Returns:
            True if the URL is internal to the domain
        """
        parsed = urlparse(url)
        # A bare host like "example.com" parses as a path, not a netloc,
        # so prefix "//" to make urlparse treat it as a network location.
        parsed_domain = urlparse(domain if "//" in domain else f"//{domain}")
        # Compare netloc (host + port)
        return parsed.netloc == parsed_domain.netloc
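

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library proper. Because this
    # module uses a relative import, run it as part of its package, e.g.
    # `python -m <your_package>.link_extractor` (module name assumed).
    sample_html = """
    <html><body>
      <a href="/about">About us</a>
      <img src="logo.png" alt="Logo">
      <script src="https://cdn.example.com/app.js"></script>
    </body></html>
    """
    extractor = LinkExtractor()
    for link in extractor.extract_links(sample_html, "https://example.com/"):
        scope = (
            "internal"
            if extractor.is_internal_link(link.url, "https://example.com")
            else "external"
        )
        print(f"{link.element_type:<8}{scope:<10}{link.url}")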