"""Tests for link extractor."""
import pytest
from src.link_extractor import LinkExtractor
class TestLinkExtractor:
    """Test the LinkExtractor class.

    Each ``test_extract_*`` method feeds a small HTML document to
    ``LinkExtractor.extract_links`` and checks the entries reported for one
    ``element_type``. Relative URLs in the fixtures are expected to come
    back resolved against ``BASE_URL``.
    """

    # Base URL passed to every extract_links call in this class.
    BASE_URL = "https://test.com"

    def setup_method(self) -> None:
        """Set up test fixtures: a fresh extractor for every test method."""
        self.extractor = LinkExtractor()

    def _links_of_type(self, html: str, element_type: str) -> list:
        """Extract links from *html* and keep those with the given element type.

        Args:
            html: HTML document handed to ``extract_links``.
            element_type: Value compared against each link's ``element_type``.

        Returns:
            The matching links, in the order ``extract_links`` produced them.
        """
        links = self.extractor.extract_links(html, self.BASE_URL)
        return [link for link in links if link.element_type == element_type]

    def test_extract_hyperlinks(self) -> None:
        """Test extraction of hyperlinks (absolute and relative ``<a href>``)."""
        # Fixtures are spelled as concatenated literals so their exact value
        # does not depend on how this file is indented.
        html = (
            "\n"
            "<html>\n"
            "<body>\n"
            '<a href="https://example.com">Example</a>\n'
            '<a href="/relative/path">Relative Link</a>\n'
            "</body>\n"
            "</html>\n"
        )

        hyperlinks = self._links_of_type(html, "a")

        assert len(hyperlinks) == 2
        assert hyperlinks[0].url == "https://example.com"
        assert hyperlinks[0].reference == "Example"
        assert hyperlinks[1].url == "https://test.com/relative/path"

    def test_extract_images(self) -> None:
        """Test extraction of image sources from ``<img src>``."""
        html = (
            "\n"
            "<html>\n"
            "<body>\n"
            '<img src="https://example.com/image.png" alt="Test Image">\n'
            '<img src="/images/local.jpg">\n'
            "</body>\n"
            "</html>\n"
        )

        images = self._links_of_type(html, "img")

        assert len(images) == 2
        assert images[0].url == "https://example.com/image.png"
        # The reference of an image is expected to carry its alt attribute.
        assert 'alt="Test Image"' in images[0].reference
        assert images[1].url == "https://test.com/images/local.jpg"

    def test_extract_scripts(self) -> None:
        """Test extraction of script sources from ``<script src>``."""
        html = (
            "\n"
            "<html>\n"
            "<head>\n"
            '<script src="https://cdn.example.com/script.js"></script>\n'
            '<script src="/js/app.js"></script>\n'
            "</head>\n"
            "</html>\n"
        )

        scripts = self._links_of_type(html, "script")

        assert len(scripts) == 2
        assert scripts[0].url == "https://cdn.example.com/script.js"
        assert scripts[1].url == "https://test.com/js/app.js"

    def test_extract_stylesheets(self) -> None:
        """Test extraction of stylesheet links from ``<link href>``."""
        html = (
            "\n"
            "<html>\n"
            "<head>\n"
            '<link rel="stylesheet" href="https://cdn.example.com/style.css">\n'
            '<link rel="stylesheet" href="/css/main.css">\n'
            "</head>\n"
            "</html>\n"
        )

        stylesheets = self._links_of_type(html, "link")

        assert len(stylesheets) == 2
        assert stylesheets[0].url == "https://cdn.example.com/style.css"
        assert stylesheets[1].url == "https://test.com/css/main.css"

    def test_extract_video(self) -> None:
        """Test extraction of video sources.

        Covers both ``<video src>`` and a ``<source src>`` nested inside a
        ``<video>``; both are expected to be reported as type ``"video"``.
        """
        html = (
            "\n"
            "<html>\n"
            "<body>\n"
            '<video src="https://example.com/video.mp4"></video>\n'
            "<video>\n"
            '<source src="/videos/local.mp4">\n'
            "</video>\n"
            "</body>\n"
            "</html>\n"
        )

        videos = self._links_of_type(html, "video")

        assert len(videos) == 2
        assert videos[0].url == "https://example.com/video.mp4"
        assert videos[1].url == "https://test.com/videos/local.mp4"

    def test_extract_audio(self) -> None:
        """Test extraction of audio sources.

        Covers both ``<audio src>`` and a ``<source src>`` nested inside an
        ``<audio>``; both are expected to be reported as type ``"audio"``.
        """
        html = (
            "\n"
            "<html>\n"
            "<body>\n"
            '<audio src="https://example.com/audio.mp3"></audio>\n'
            "<audio>\n"
            '<source src="/audio/local.mp3">\n'
            "</audio>\n"
            "</body>\n"
            "</html>\n"
        )

        audios = self._links_of_type(html, "audio")

        assert len(audios) == 2
        assert audios[0].url == "https://example.com/audio.mp3"
        assert audios[1].url == "https://test.com/audio/local.mp3"

    def test_extract_iframe(self) -> None:
        """Test extraction of iframe sources from ``<iframe src>``."""
        html = (
            "\n"
            "<html>\n"
            "<body>\n"
            '<iframe src="https://example.com/embed"></iframe>\n'
            "</body>\n"
            "</html>\n"
        )

        iframes = self._links_of_type(html, "iframe")

        assert len(iframes) == 1
        assert iframes[0].url == "https://example.com/embed"

    def test_is_internal_link(self) -> None:
        """Test internal link detection.

        Same-host URLs count as internal with or without a trailing slash on
        the base; a different host and a subdomain of the base host are both
        expected to be treated as external.
        """
        assert self.extractor.is_internal_link(
            "https://example.com/page", "https://example.com"
        )
        assert self.extractor.is_internal_link(
            "https://example.com/page", "https://example.com/"
        )
        assert not self.extractor.is_internal_link(
            "https://other.com/page", "https://example.com"
        )
        assert not self.extractor.is_internal_link(
            "https://sub.example.com/page", "https://example.com"
        )