Skip to main content
Glama
davinoishi

Broken Link Checker MCP Server

by davinoishi
link_extractor.py4.58 kB
"""Extract links from HTML pages.""" from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from .models import LinkInfo class LinkExtractor: """Extract various types of links from HTML content.""" def extract_links(self, html: str, base_url: str) -> list[LinkInfo]: """ Extract all links from HTML content. Args: html: The HTML content to parse base_url: The base URL for resolving relative links Returns: List of LinkInfo objects containing link details """ soup = BeautifulSoup(html, "html.parser") links = [] # Extract hyperlinks (a href) for tag in soup.find_all("a", href=True): href = tag["href"] absolute_url = urljoin(base_url, href) reference = tag.get_text(strip=True) or "<a>" links.append( LinkInfo(url=absolute_url, reference=reference, element_type="a") ) # Extract image sources (img src) for tag in soup.find_all("img", src=True): src = tag["src"] absolute_url = urljoin(base_url, src) alt_text = tag.get("alt", "").strip() reference = f'<img alt="{alt_text}">' if alt_text else "<img>" links.append( LinkInfo(url=absolute_url, reference=reference, element_type="img") ) # Extract script sources (script src) for tag in soup.find_all("script", src=True): src = tag["src"] absolute_url = urljoin(base_url, src) links.append( LinkInfo(url=absolute_url, reference="<script>", element_type="script") ) # Extract stylesheet links (link href) for tag in soup.find_all("link", href=True): href = tag["href"] absolute_url = urljoin(base_url, href) rel = tag.get("rel", [""])[0] if isinstance(tag.get("rel"), list) else tag.get("rel", "") reference = f'<link rel="{rel}">' if rel else "<link>" links.append( LinkInfo(url=absolute_url, reference=reference, element_type="link") ) # Extract video sources for tag in soup.find_all("video"): # Check for src attribute if tag.get("src"): absolute_url = urljoin(base_url, tag["src"]) links.append( LinkInfo( url=absolute_url, reference="<video>", element_type="video" ) ) # Check for source children for source in 
tag.find_all("source", src=True): absolute_url = urljoin(base_url, source["src"]) links.append( LinkInfo( url=absolute_url, reference="<video><source>", element_type="video", ) ) # Extract audio sources for tag in soup.find_all("audio"): # Check for src attribute if tag.get("src"): absolute_url = urljoin(base_url, tag["src"]) links.append( LinkInfo( url=absolute_url, reference="<audio>", element_type="audio" ) ) # Check for source children for source in tag.find_all("source", src=True): absolute_url = urljoin(base_url, source["src"]) links.append( LinkInfo( url=absolute_url, reference="<audio><source>", element_type="audio", ) ) # Extract iframe sources for tag in soup.find_all("iframe", src=True): src = tag["src"] absolute_url = urljoin(base_url, src) links.append( LinkInfo(url=absolute_url, reference="<iframe>", element_type="iframe") ) return links def is_internal_link(self, url: str, domain: str) -> bool: """ Check if a URL belongs to the same domain. Args: url: The URL to check domain: The domain to compare against Returns: True if the URL is internal to the domain """ parsed = urlparse(url) parsed_domain = urlparse(domain) # Compare netloc (domain + port) return parsed.netloc == parsed_domain.netloc

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/davinoishi/BLC-ground'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.