"""Tests for HTML scanner and analyzer."""

from pathlib import Path

import pytest

from scantool.analyzers.html_analyzer import HTMLAnalyzer
from scantool.scanners.html_scanner import HTMLScanner


@pytest.fixture
def html_scanner():
    """Create an HTML scanner instance."""
    return HTMLScanner()


@pytest.fixture
def html_analyzer():
    """Create an HTML analyzer instance."""
    return HTMLAnalyzer()


@pytest.fixture
def basic_html():
    """Load basic.html test file."""
    path = Path(__file__).parent / "basic.html"
    return path.read_bytes()


@pytest.fixture
def edge_cases_html():
    """Load edge_cases.html test file."""
    path = Path(__file__).parent / "edge_cases.html"
    return path.read_bytes()


@pytest.fixture
def broken_html():
    """Load broken.html test file."""
    path = Path(__file__).parent / "broken.html"
    return path.read_bytes()
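
# The fixture files basic.html, edge_cases.html, and broken.html are expected to
# sit next to this module (resolved via Path(__file__).parent). The scanner is
# fed the raw bytes, while the analyzer tests decode the same content to text first.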


class TestHTMLScanner:
    """Tests for HTMLScanner."""

    def test_get_extensions(self):
        """Test supported file extensions."""
        extensions = HTMLScanner.get_extensions()
        assert ".html" in extensions
        assert ".htm" in extensions
        assert ".xhtml" in extensions

    def test_get_language_name(self):
        """Test language name."""
        assert HTMLScanner.get_language_name() == "HTML"
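
    # app.min.html follows the common *.min.* naming for minified assets;
    # should_skip() filters such files out while keeping regular pages.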

    def test_should_skip_minified(self):
        """Test skipping minified files."""
        assert HTMLScanner.should_skip("app.min.html") is True
        assert HTMLScanner.should_skip("index.html") is False

    def test_scan_basic_doctype(self, html_scanner, basic_html):
        """Test DOCTYPE extraction."""
        structures = html_scanner.scan(basic_html)
        assert structures is not None
        doctypes = [s for s in structures if s.type == "doctype"]
        assert len(doctypes) == 1
        assert doctypes[0].name == "DOCTYPE"

    def test_scan_basic_sections(self, html_scanner, basic_html):
        """Test semantic section extraction."""
        structures = html_scanner.scan(basic_html)
        assert structures is not None
        sections = [s for s in structures if s.type == "section"]
        section_names = [s.name for s in sections]
        # Check main semantic sections
        assert "main-header" in section_names
        assert "content" in section_names
        assert "main-footer" in section_names

    def test_scan_basic_headings(self, html_scanner, basic_html):
        """Test heading extraction."""
        structures = html_scanner.scan(basic_html)
        assert structures is not None

        # Find headings recursively
        def find_headings(nodes):
            headings = []
            for node in nodes:
                if node.type == "heading":
                    headings.append(node)
                headings.extend(find_headings(node.children))
            return headings

        headings = find_headings(structures)
        assert len(headings) >= 3  # h1, h2, h2
        h1_headings = [h for h in headings if h.signature == "H1"]
        assert len(h1_headings) >= 1
        assert "Welcome" in h1_headings[0].name
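
    # Forms are summarized as "METHOD action" in the signature, and the
    # lowercase method also appears in the node's modifiers.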

    def test_scan_basic_forms(self, html_scanner, basic_html):
        """Test form extraction."""
        structures = html_scanner.scan(basic_html)
        assert structures is not None

        # Find forms recursively
        def find_forms(nodes):
            forms = []
            for node in nodes:
                if node.type == "form":
                    forms.append(node)
                forms.extend(find_forms(node.children))
            return forms

        forms = find_forms(structures)
        assert len(forms) >= 1
        contact_form = next((f for f in forms if f.name == "contact-form"), None)
        assert contact_form is not None
        assert contact_form.signature == "POST /api/contact"
        assert "post" in contact_form.modifiers

    def test_scan_basic_lists(self, html_scanner, basic_html):
        """Test list extraction."""
        structures = html_scanner.scan(basic_html)
        assert structures is not None

        # Find lists recursively
        def find_lists(nodes):
            lists = []
            for node in nodes:
                if node.type == "list":
                    lists.append(node)
                lists.extend(find_lists(node.children))
            return lists

        lists = find_lists(structures)
        assert len(lists) >= 2  # ul and ol
        feature_list = next((lst for lst in lists if lst.name == "feature-list"), None)
        assert feature_list is not None
        assert "unordered" in feature_list.modifiers

    def test_scan_scripts_and_styles(self, html_scanner, basic_html):
        """Test script and style extraction."""
        structures = html_scanner.scan(basic_html)
        assert structures is not None
        scripts = [s for s in structures if s.type == "script"]
        styles = [s for s in structures if s.type == "style"]
        assert len(scripts) >= 1
        assert any("app.js" in s.name for s in scripts)
        assert len(styles) >= 1

    def test_scan_edge_cases_empty_heading(self, html_scanner, edge_cases_html):
        """Test handling of empty headings."""
        structures = html_scanner.scan(edge_cases_html)
        assert structures is not None

        def find_headings(nodes):
            headings = []
            for node in nodes:
                if node.type == "heading":
                    headings.append(node)
                headings.extend(find_headings(node.children))
            return headings

        headings = find_headings(structures)
        empty_h1 = [h for h in headings if "(empty h1)" in h.name]
        assert len(empty_h1) >= 1

    def test_scan_edge_cases_truncated_heading(self, html_scanner, edge_cases_html):
        """Test heading truncation for long text."""
        structures = html_scanner.scan(edge_cases_html)
        assert structures is not None

        def find_headings(nodes):
            headings = []
            for node in nodes:
                if node.type == "heading":
                    headings.append(node)
                headings.extend(find_headings(node.children))
            return headings

        headings = find_headings(structures)
        long_headings = [h for h in headings if "..." in h.name]
        assert len(long_headings) >= 1
        assert len(long_headings[0].name) <= 54  # truncated to ~50 chars plus "..."

    def test_scan_edge_cases_nested_sections(self, html_scanner, edge_cases_html):
        """Test nested semantic sections."""
        structures = html_scanner.scan(edge_cases_html)
        assert structures is not None

        # Find the nested article
        def find_by_name(nodes, name):
            for node in nodes:
                if node.name == name:
                    return node
                result = find_by_name(node.children, name)
                if result:
                    return result
            return None

        article = find_by_name(structures, "nested-article")
        assert article is not None
        assert len(article.children) > 0

    def test_scan_edge_cases_multiple_forms(self, html_scanner, edge_cases_html):
        """Test multiple forms extraction."""
        structures = html_scanner.scan(edge_cases_html)
        assert structures is not None

        def find_forms(nodes):
            forms = []
            for node in nodes:
                if node.type == "form":
                    forms.append(node)
                forms.extend(find_forms(node.children))
            return forms

        forms = find_forms(structures)
        assert len(forms) >= 3  # search-form, newsletter, anonymous
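
    # Tables report their dimensions in the signature; the assertion below
    # accepts either a "4x4" or a "4 rows" style summary.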

    def test_scan_edge_cases_table(self, html_scanner, edge_cases_html):
        """Test table extraction with dimensions."""
        structures = html_scanner.scan(edge_cases_html)
        assert structures is not None

        def find_tables(nodes):
            tables = []
            for node in nodes:
                if node.type == "table":
                    tables.append(node)
                tables.extend(find_tables(node.children))
            return tables

        tables = find_tables(structures)
        data_table = next((t for t in tables if t.name == "data-table"), None)
        assert data_table is not None
        assert "4x4" in data_table.signature or "4 rows" in data_table.signature

    def test_scan_edge_cases_element_with_id(self, html_scanner, edge_cases_html):
        """Test element extraction for landmarks with id."""
        structures = html_scanner.scan(edge_cases_html)
        assert structures is not None

        def find_elements(nodes):
            elements = []
            for node in nodes:
                if node.type == "element":
                    elements.append(node)
                elements.extend(find_elements(node.children))
            return elements

        elements = find_elements(structures)
        important = next((e for e in elements if e.name == "important-element"), None)
        assert important is not None

    def test_scan_broken_html_fallback(self, html_scanner, broken_html):
        """Test fallback parsing for broken HTML."""
        # Should not crash and should extract what it can
        structures = html_scanner.scan(broken_html)
        assert structures is not None

        # tree-sitter-html is very tolerant, so it may well parse this file
        # successfully. What matters is that the scan does not crash and that
        # either tree-sitter or the fallback parser returns some structure.
        def count_all(nodes):
            count = len(nodes)
            for node in nodes:
                count += count_all(node.children)
            return count

        total = count_all(structures)
        assert total >= 1  # at minimum the DOCTYPE should be found


class TestHTMLAnalyzer:
    """Tests for HTMLAnalyzer."""

    def test_get_extensions(self):
        """Test supported file extensions."""
        extensions = HTMLAnalyzer.get_extensions()
        assert ".html" in extensions
        assert ".htm" in extensions

    def test_get_language_name(self):
        """Test language name."""
        assert HTMLAnalyzer.get_language_name() == "HTML"
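
    # extract_imports() takes the decoded HTML text and returns import records
    # whose import_type distinguishes stylesheets, icons, scripts, and CSS
    # @import targets, with the referenced path in target_module.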

    def test_extract_imports_link(self, html_analyzer, basic_html):
        """Test stylesheet and icon import extraction."""
        content = basic_html.decode("utf-8")
        imports = html_analyzer.extract_imports("test.html", content)

        # Find stylesheet import
        stylesheets = [i for i in imports if i.import_type == "stylesheet"]
        assert len(stylesheets) >= 1
        assert any("main.css" in i.target_module for i in stylesheets)

        # Find icon import
        icons = [i for i in imports if i.import_type == "icon"]
        assert len(icons) >= 1

    def test_extract_imports_script(self, html_analyzer, basic_html):
        """Test script import extraction."""
        content = basic_html.decode("utf-8")
        imports = html_analyzer.extract_imports("test.html", content)
        scripts = [i for i in imports if i.import_type == "script"]
        assert len(scripts) >= 1
        assert any("app.js" in i.target_module for i in scripts)

    def test_extract_imports_css_import(self, html_analyzer, basic_html):
        """Test CSS @import extraction from style blocks."""
        content = basic_html.decode("utf-8")
        imports = html_analyzer.extract_imports("test.html", content)
        css_imports = [i for i in imports if i.import_type == "css_import"]
        assert len(css_imports) >= 1
        assert any("buttons.css" in i.target_module for i in css_imports)

    def test_extract_imports_ignores_external(self, html_analyzer, edge_cases_html):
        """Test that external URLs are ignored."""
        content = edge_cases_html.decode("utf-8")
        imports = html_analyzer.extract_imports("test.html", content)
        # Should not include CDN URLs
        for imp in imports:
            assert "cdn.example.com" not in imp.target_module
            assert "cdnjs.cloudflare.com" not in imp.target_module
            assert "googleapis.com" not in imp.target_module
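
    # find_entry_points() reports entry-point records with a type field:
    # html_entry for index files such as index.html, html_document when a
    # DOCTYPE is present, and responsive_page when a viewport meta tag is found.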

    def test_find_entry_points_index(self, html_analyzer, basic_html):
        """Test entry point detection for index files."""
        content = basic_html.decode("utf-8")
        entry_points = html_analyzer.find_entry_points("index.html", content)
        html_entries = [e for e in entry_points if e.type == "html_entry"]
        assert len(html_entries) >= 1

    def test_find_entry_points_doctype(self, html_analyzer, basic_html):
        """Test DOCTYPE detection as entry point."""
        content = basic_html.decode("utf-8")
        entry_points = html_analyzer.find_entry_points("page.html", content)
        doctypes = [e for e in entry_points if e.type == "html_document"]
        assert len(doctypes) >= 1

    def test_find_entry_points_viewport(self, html_analyzer, basic_html):
        """Test viewport meta detection."""
        content = basic_html.decode("utf-8")
        entry_points = html_analyzer.find_entry_points("page.html", content)
        viewports = [e for e in entry_points if e.type == "responsive_page"]
        assert len(viewports) >= 1
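
    # classify_file() buckets pages by role: index pages are grouped under
    # entry_points, while error pages such as 404.html land in utilities.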

    def test_classify_file_index(self, html_analyzer):
        """Test file classification for index files."""
        cluster = html_analyzer.classify_file("index.html", "<!DOCTYPE html>")
        assert cluster == "entry_points"

    def test_classify_file_error(self, html_analyzer):
        """Test file classification for error pages."""
        cluster = html_analyzer.classify_file("404.html", "<!DOCTYPE html>")
        assert cluster == "utilities"

    def test_should_analyze_normal(self, html_analyzer):
        """Test normal files should be analyzed."""
        assert html_analyzer.should_analyze("page.html") is True

    def test_should_analyze_minified(self, html_analyzer):
        """Test minified files should be skipped."""
        assert html_analyzer.should_analyze("app.min.html") is False

    def test_is_low_value_small_file(self, html_analyzer):
        """Test small files are low value."""
        assert html_analyzer.is_low_value_for_inventory("stub.html", 50) is True

    def test_is_low_value_error_page(self, html_analyzer):
        """Test error pages are low value."""
        assert html_analyzer.is_low_value_for_inventory("404.html", 500) is True