"""Extract links from HTML pages."""
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from .models import LinkInfo
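
# LinkInfo is imported from a sibling module; judging from how it is
# constructed below, it is assumed to be a simple record with url,
# reference, and element_type string fields, e.g. a dataclass such as:
#
#     @dataclass(frozen=True)
#     class LinkInfo:
#         url: str
#         reference: str
#         element_type: str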


class LinkExtractor:
    """Extract various types of links from HTML content."""

    def extract_links(self, html: str, base_url: str) -> list[LinkInfo]:
        """
        Extract all links from HTML content.

        Args:
            html: The HTML content to parse
            base_url: The base URL for resolving relative links

        Returns:
            List of LinkInfo objects containing link details
        """
        soup = BeautifulSoup(html, "html.parser")
        links = []

        # Extract hyperlinks (<a href>)
        for tag in soup.find_all("a", href=True):
            href = tag["href"]
            absolute_url = urljoin(base_url, href)
            reference = tag.get_text(strip=True) or "<a>"
            links.append(
                LinkInfo(url=absolute_url, reference=reference, element_type="a")
            )

        # Extract image sources (<img src>)
        for tag in soup.find_all("img", src=True):
            src = tag["src"]
            absolute_url = urljoin(base_url, src)
            alt_text = tag.get("alt", "").strip()
            reference = f'<img alt="{alt_text}">' if alt_text else "<img>"
            links.append(
                LinkInfo(url=absolute_url, reference=reference, element_type="img")
            )

        # Extract script sources (<script src>)
        for tag in soup.find_all("script", src=True):
            src = tag["src"]
            absolute_url = urljoin(base_url, src)
            links.append(
                LinkInfo(url=absolute_url, reference="<script>", element_type="script")
            )

        # Extract stylesheet and other <link href> targets
        for tag in soup.find_all("link", href=True):
            href = tag["href"]
            absolute_url = urljoin(base_url, href)
            # BeautifulSoup exposes rel as a list of values (it is a
            # multi-valued attribute); guard against None and the empty
            # list produced by rel="" to avoid an IndexError.
            rel_values = tag.get("rel") or []
            if isinstance(rel_values, str):
                rel_values = [rel_values]
            rel = rel_values[0] if rel_values else ""
            reference = f'<link rel="{rel}">' if rel else "<link>"
            links.append(
                LinkInfo(url=absolute_url, reference=reference, element_type="link")
            )

        # Extract video sources
        for tag in soup.find_all("video"):
            # Check for a src attribute on the <video> tag itself
            if tag.get("src"):
                absolute_url = urljoin(base_url, tag["src"])
                links.append(
                    LinkInfo(
                        url=absolute_url, reference="<video>", element_type="video"
                    )
                )
            # Check for <source> children
            for source in tag.find_all("source", src=True):
                absolute_url = urljoin(base_url, source["src"])
                links.append(
                    LinkInfo(
                        url=absolute_url,
                        reference="<video><source>",
                        element_type="video",
                    )
                )

        # Extract audio sources
        for tag in soup.find_all("audio"):
            # Check for a src attribute on the <audio> tag itself
            if tag.get("src"):
                absolute_url = urljoin(base_url, tag["src"])
                links.append(
                    LinkInfo(
                        url=absolute_url, reference="<audio>", element_type="audio"
                    )
                )
            # Check for <source> children
            for source in tag.find_all("source", src=True):
                absolute_url = urljoin(base_url, source["src"])
                links.append(
                    LinkInfo(
                        url=absolute_url,
                        reference="<audio><source>",
                        element_type="audio",
                    )
                )

        # Extract iframe sources (<iframe src>)
        for tag in soup.find_all("iframe", src=True):
            src = tag["src"]
            absolute_url = urljoin(base_url, src)
            links.append(
                LinkInfo(url=absolute_url, reference="<iframe>", element_type="iframe")
            )

        return links

    def is_internal_link(self, url: str, domain: str) -> bool:
        """
        Check if a URL belongs to the same domain.

        Args:
            url: The URL to check
            domain: The domain to compare against (a full URL such as
                "https://example.com" or a bare host like "example.com")

        Returns:
            True if the URL is internal to the domain
        """
        parsed = urlparse(url)
        # A bare host like "example.com" parses as a path, not a netloc,
        # so prefix "//" to make urlparse treat it as a network location.
        parsed_domain = urlparse(domain if "//" in domain else f"//{domain}")
        # Compare netloc (host + port)
        return parsed.netloc == parsed_domain.netloc
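

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library proper. Because this
    # module uses a relative import, run it as part of its package, e.g.
    # `python -m <your_package>.link_extractor` (module name assumed).
    sample_html = """
    <html><body>
      <a href="/about">About us</a>
      <img src="logo.png" alt="Logo">
      <script src="https://cdn.example.com/app.js"></script>
    </body></html>
    """
    extractor = LinkExtractor()
    for link in extractor.extract_links(sample_html, "https://example.com/"):
        scope = (
            "internal"
            if extractor.is_internal_link(link.url, "https://example.com")
            else "external"
        )
        print(f"{link.element_type:<8}{scope:<10}{link.url}")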