Scantool - File Scanner MCP

Overview Schema Related Servers Score Discussions

html_scanner.py•20.6 KiB

"""HTML language scanner with structure and metadata extraction."""

import re
from bisect import bisect_left
from typing import Optional

import tree_sitter_html
from tree_sitter import Language, Node, Parser

from .base import BaseScanner, StructureNode

# Semantic HTML5 elements that define document structure
SEMANTIC_SECTIONS = {
    "header", "nav", "main", "footer", "article", "aside", "section"
}

# Form-related elements
FORM_ELEMENTS = {"form"}
FORM_CONTROLS = {"input", "select", "textarea", "button", "fieldset", "label"}

# Heading elements
HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}

# List elements
LIST_ELEMENTS = {"ul", "ol", "dl"}

# Table elements
TABLE_ELEMENTS = {"table"}

# Media elements
MEDIA_ELEMENTS = {"img", "video", "audio", "picture", "canvas", "svg"}

# Elements with external resources
RESOURCE_ELEMENTS = {"script", "style", "link"}


class HTMLScanner(BaseScanner):
    """Scanner for HTML files with semantic structure extraction.

    Parses HTML with tree-sitter and emits a tree of ``StructureNode``
    objects for doctype, semantic sections, forms, headings, lists, tables,
    script/style elements and any other element carrying an ``id``.
    A regex-based fallback is available for input that cannot be parsed.
    """

    def __init__(self, **kwargs):
        """Initialise the base scanner and a tree-sitter HTML parser."""
        super().__init__(**kwargs)
        self.parser = Parser()
        self.parser.language = Language(tree_sitter_html.language())

    @classmethod
    def get_extensions(cls) -> list[str]:
        """Return the file extensions handled by this scanner."""
        return [".html", ".htm", ".xhtml"]

    @classmethod
    def get_language_name(cls) -> str:
        """Return the display name of the scanned language."""
        return "HTML"

    @classmethod
    def should_skip(cls, filename: str) -> bool:
        """Skip minified and generated HTML files.

        Args:
            filename: File name (or path) to test.

        Returns:
            True when the name ends with ``.min.html`` or contains one of the
            generated/cache markers; matching is case-insensitive.
        """
        # Lower-case once so the minified check agrees with the marker check.
        lowered = filename.lower()
        if lowered.endswith(".min.html"):
            return True

        # Skip common generated/template cache files
        generated_markers = (".cache.", ".generated.", ".compiled.")
        return any(marker in lowered for marker in generated_markers)

    def scan(self, source_code: bytes) -> Optional[list[StructureNode]]:
        """Scan HTML source code and extract structure with metadata.

        Args:
            source_code: Raw bytes of the HTML document.

        Returns:
            The extracted top-level structure nodes, the regex fallback
            result when parsing is unusable and ``fallback_on_errors`` is
            set, or None otherwise.
        """
        try:
            tree = self.parser.parse(source_code)

            # Check for excessive errors
            if self._should_use_fallback(tree.root_node):
                if self.fallback_on_errors:
                    return self._fallback_extract(source_code)
                return None

            return self._extract_structure(tree.root_node, source_code)

        except Exception as e:
            # Deliberate best-effort boundary: any failure while parsing or
            # walking the tree degrades to the fallback (or None).
            if self.show_errors:
                print(f"HTML parsing error: {e}")
            if self.fallback_on_errors:
                return self._fallback_extract(source_code)
            return None

    def _extract_structure(
        self, root: Node, source_code: bytes
    ) -> list[StructureNode]:
        """Extract structure from HTML document.

        Args:
            root: Root node of the parsed tree.
            source_code: Bytes the tree was parsed from.

        Returns:
            Top-level structure nodes; nested structures are attached to the
            ``children`` list of their nearest extracted ancestor.
        """
        structures = []

        def traverse(node: Node, parent_list: list):
            """Recursively traverse and extract meaningful structures."""
            if node.type == "ERROR":
                # Continue traversing children of ERROR nodes to find valid
                # structures. Tree-sitter often wraps valid HTML in ERROR
                # when there's a syntax issue.
                for child in node.children:
                    traverse(child, parent_list)
                return

            # DOCTYPE declaration
            if node.type == "doctype":
                parent_list.append(StructureNode(
                    type="doctype",
                    name="DOCTYPE",
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1
                ))

            # Element nodes
            elif node.type == "element":
                element_node = self._extract_element(node, source_code)
                if element_node:
                    parent_list.append(element_node)
                    # Traverse children for nested structures
                    for child in node.children:
                        traverse(child, element_node.children)
                else:
                    # Not a structural element, continue traversing so its
                    # descendants attach to the current parent instead.
                    for child in node.children:
                        traverse(child, parent_list)

            # Script and style elements (self-contained)
            elif node.type in ("script_element", "style_element"):
                resource_node = self._extract_resource_element(
                    node, source_code
                )
                if resource_node:
                    parent_list.append(resource_node)

            # Continue traversing for other node types
            else:
                for child in node.children:
                    traverse(child, parent_list)

        traverse(root, structures)
        return structures

    def _extract_element(
        self, node: Node, source_code: bytes
    ) -> Optional[StructureNode]:
        """Extract a structural HTML element.

        Returns:
            A node for section, form, heading, list and table elements, or
            for any other element with a non-empty ``id``; None otherwise.
        """
        tag_name = self._get_tag_name(node, source_code)
        if not tag_name:
            return None

        tag_lower = tag_name.lower()

        # Determine if this is a structural element worth extracting
        if tag_lower in SEMANTIC_SECTIONS:
            return self._create_section_node(node, source_code, tag_lower)
        elif tag_lower in FORM_ELEMENTS:
            return self._create_form_node(node, source_code)
        elif tag_lower in HEADING_ELEMENTS:
            return self._create_heading_node(node, source_code, tag_lower)
        elif tag_lower in LIST_ELEMENTS:
            return self._create_list_node(node, source_code, tag_lower)
        elif tag_lower in TABLE_ELEMENTS:
            return self._create_table_node(node, source_code)

        # Check for elements with id attribute (landmarks)
        attrs = self._extract_attributes(node, source_code)
        if attrs.get("id"):
            return self._create_landmark_node(
                node, source_code, tag_lower, attrs
            )

        return None

    def _get_tag_name(self, node: Node, source_code: bytes) -> Optional[str]:
        """Get the tag name from an element node.

        Looks inside the element's ``start_tag`` or ``self_closing_tag``
        child; returns None when no ``tag_name`` node is found.
        """
        for child in node.children:
            if child.type in ("start_tag", "self_closing_tag"):
                for tag_child in child.children:
                    if tag_child.type == "tag_name":
                        return self._get_node_text(tag_child, source_code)
        return None

    def _extract_attributes(
        self, node: Node, source_code: bytes
    ) -> dict[str, str]:
        """Extract attributes from an element.

        Returns:
            Mapping of lower-cased attribute name to its value with
            surrounding quotes stripped; valueless attributes map to ``""``.
        """
        attrs = {}
        for child in node.children:
            if child.type in ("start_tag", "self_closing_tag"):
                for attr_node in child.children:
                    if attr_node.type == "attribute":
                        name = None
                        value = None
                        for attr_child in attr_node.children:
                            if attr_child.type == "attribute_name":
                                name = self._get_node_text(
                                    attr_child, source_code
                                )
                            elif attr_child.type in (
                                "attribute_value", "quoted_attribute_value"
                            ):
                                value = self._get_node_text(
                                    attr_child, source_code
                                ).strip('"\'')
                        if name:
                            attrs[name.lower()] = value or ""
        return attrs

    def _create_section_node(
        self, node: Node, source_code: bytes, tag_name: str
    ) -> StructureNode:
        """Create a node for semantic section elements.

        The name is the ``id``, else the ``aria-label``, else the tag name;
        the signature lists ``#id`` and up to three ``.class`` entries.
        """
        attrs = self._extract_attributes(node, source_code)
        name = attrs.get("id") or attrs.get("aria-label") or tag_name

        signature_parts = []
        if attrs.get("id"):
            signature_parts.append(f"#{attrs['id']}")
        if attrs.get("class"):
            classes = attrs["class"].split()[:3]  # First 3 classes
            signature_parts.extend(f".{c}" for c in classes)

        return StructureNode(
            type="section",
            name=name,
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            signature=" ".join(signature_parts) if signature_parts else None,
            docstring=attrs.get("aria-label") or attrs.get("title"),
            modifiers=[tag_name],
            children=[]
        )

    def _create_form_node(
        self, node: Node, source_code: bytes
    ) -> StructureNode:
        """Create a node for form elements.

        The signature is ``"<METHOD> <action>"`` and ``complexity`` carries
        the number of form controls found inside the form.
        """
        attrs = self._extract_attributes(node, source_code)
        name = attrs.get("id") or attrs.get("name") or "form"
        # `or` rather than a .get default: a present-but-empty method
        # attribute would otherwise yield an empty method and signature.
        method = (attrs.get("method") or "GET").upper()
        action = attrs.get("action", "#")

        # Count form controls
        control_count = self._count_form_controls(node, source_code)

        return StructureNode(
            type="form",
            name=name,
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            signature=f"{method} {action}",
            docstring=attrs.get("aria-label") or attrs.get("title"),
            modifiers=[method.lower()],
            complexity={"fields": control_count},
            children=[]
        )

    def _count_form_controls(self, node: Node, source_code: bytes) -> int:
        """Count form control elements within a form.

        Walks every descendant of ``node`` and counts elements whose tag is
        in ``FORM_CONTROLS``.
        """
        count = 0

        def count_controls(n: Node):
            nonlocal count
            if n.type == "element":
                tag = self._get_tag_name(n, source_code)
                if tag and tag.lower() in FORM_CONTROLS:
                    count += 1
            for child in n.children:
                count_controls(child)

        count_controls(node)
        return count

    def _create_heading_node(
        self, node: Node, source_code: bytes, tag_name: str
    ) -> StructureNode:
        """Create a node for heading elements.

        The name is the heading text truncated to 50 characters (with
        ``...`` appended), or ``(empty hN)`` when there is no text. The
        element ``id``, if any, is stored in ``docstring``.
        """
        level = int(tag_name[1])  # h1 -> 1, h2 -> 2, etc.
        text = self._extract_text_content(node, source_code)
        attrs = self._extract_attributes(node, source_code)

        name = text[:50] + "..." if len(text) > 50 else text
        if not name.strip():
            name = f"(empty {tag_name})"

        return StructureNode(
            type="heading",
            name=name,
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            signature=f"H{level}",
            docstring=attrs.get("id"),
            children=[]
        )

    def _create_list_node(
        self, node: Node, source_code: bytes, tag_name: str
    ) -> StructureNode:
        """Create a node for list elements.

        The signature is ``"<n> items"`` and the single modifier is
        ``ordered``, ``unordered`` or ``definition``.
        """
        attrs = self._extract_attributes(node, source_code)
        list_type = "ordered" if tag_name == "ol" else (
            "definition" if tag_name == "dl" else "unordered"
        )

        # Count list items
        item_count = self._count_list_items(node, tag_name)

        name = attrs.get("id") or f"{list_type} list"

        return StructureNode(
            type="list",
            name=name,
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            signature=f"{item_count} items",
            modifiers=[list_type],
            children=[]
        )

    def _count_list_items(self, node: Node, tag_name: str) -> int:
        """Count items in a list.

        Counts ``dt`` elements for ``dl`` lists and ``li`` elements
        otherwise, across all descendants of ``node`` (items of nested lists
        are included).

        Args:
            node: The list element.
            tag_name: Lower-cased tag of the list (``ul``, ``ol`` or ``dl``).

        Returns:
            Number of matching item elements.
        """
        item_tag = "dt" if tag_name == "dl" else "li"
        count = 0

        def start_tag_name(element: Node) -> Optional[str]:
            """Return the lower-cased tag name of the element's start tag."""
            for child in element.children:
                if child.type != "start_tag":
                    continue
                for tag_child in child.children:
                    if tag_child.type == "tag_name":
                        # Read the name from the node's own bytes. The old
                        # code passed the enclosing element's bytes to
                        # `_get_node_text` (defined outside this file);
                        # NOTE(review): if that helper slices by absolute
                        # byte offsets, the old lookup was wrong for any
                        # element not starting at byte 0 - confirm.
                        raw = tag_child.text
                        if raw is None:
                            return None
                        return raw.decode("utf-8", errors="replace").lower()
            return None

        def count_items(n: Node):
            nonlocal count
            if n.type == "element" and start_tag_name(n) == item_tag:
                count += 1
            for child in n.children:
                count_items(child)

        count_items(node)
        return count

    def _create_table_node(
        self, node: Node, source_code: bytes
    ) -> StructureNode:
        """Create a node for table elements.

        The signature is ``"<rows>x<cols>"``, or ``"<rows> rows"`` when no
        cells were found.
        """
        attrs = self._extract_attributes(node, source_code)
        name = attrs.get("id") or "table"

        # Count rows and columns
        rows, cols = self._count_table_dimensions(node, source_code)

        return StructureNode(
            type="table",
            name=name,
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            signature=f"{rows}x{cols}" if cols > 0 else f"{rows} rows",
            docstring=attrs.get("aria-label") or attrs.get("summary"),
            children=[]
        )

    def _count_table_dimensions(
        self, node: Node, source_code: bytes
    ) -> tuple[int, int]:
        """Count rows and columns in a table.

        Returns:
            ``(rows, max_cols)`` where ``rows`` is the number of ``tr``
            descendants and ``max_cols`` is the largest number of direct
            ``td``/``th`` children in any single row.
        """
        rows = 0
        max_cols = 0

        def traverse(n: Node):
            nonlocal rows, max_cols
            tag = (
                self._get_tag_name(n, source_code)
                if n.type == "element" else None
            )

            if tag and tag.lower() == "tr":
                rows += 1
                cols = 0
                for child in n.children:
                    child_tag = (
                        self._get_tag_name(child, source_code)
                        if child.type == "element" else None
                    )
                    if child_tag and child_tag.lower() in ("td", "th"):
                        cols += 1
                max_cols = max(max_cols, cols)

            for child in n.children:
                traverse(child)

        traverse(node)
        return rows, max_cols

    def _create_landmark_node(
        self, node: Node, source_code: bytes, tag_name: str, attrs: dict
    ) -> StructureNode:
        """Create a node for elements with id (landmarks).

        The signature is ``"<tag>"`` followed by up to two ``.class``
        entries.
        """
        name = attrs.get("id", tag_name)

        signature_parts = [f"<{tag_name}>"]
        if attrs.get("class"):
            classes = attrs["class"].split()[:2]
            signature_parts.extend(f".{c}" for c in classes)

        return StructureNode(
            type="element",
            name=name,
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            signature=" ".join(signature_parts),
            docstring=attrs.get("title") or attrs.get("aria-label"),
            modifiers=[tag_name],
            children=[]
        )

    def _extract_resource_element(
        self, node: Node, source_code: bytes
    ) -> Optional[StructureNode]:
        """Extract script or style element.

        Scripts are named after the last path segment of ``src`` (or
        ``inline``) and carry ``type``/``async``/``defer`` modifiers; styles
        are named after their ``id`` (or ``style``) and carry their ``media``
        value.
        """
        is_script = node.type == "script_element"
        element_type = "script" if is_script else "style"

        # Reuse the shared attribute parser instead of a duplicated loop.
        attrs = self._extract_attributes(node, source_code)

        if is_script:
            src = attrs.get("src", "")
            name = src.split("/")[-1] if src else "inline"
            signature = src if src else "inline script"
            modifiers = []
            if attrs.get("type"):
                modifiers.append(attrs["type"])
            # Boolean attributes are stored as "", so test for presence.
            if attrs.get("async") is not None:
                modifiers.append("async")
            if attrs.get("defer") is not None:
                modifiers.append("defer")
        else:
            media = attrs.get("media", "")
            # `or` so an empty id falls back like the other node builders.
            name = attrs.get("id") or "style"
            signature = f"media={media}" if media else None
            modifiers = [media] if media else []

        return StructureNode(
            type=element_type,
            name=name,
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            signature=signature,
            modifiers=modifiers,
            children=[]
        )

    def _extract_text_content(self, node: Node, source_code: bytes) -> str:
        """Extract text content from an element, excluding nested tags.

        Joins the text of every ``text`` descendant with single spaces and
        strips the result.
        """
        text_parts = []

        def collect_text(n: Node):
            if n.type == "text":
                text_parts.append(self._get_node_text(n, source_code))
            for child in n.children:
                collect_text(child)

        collect_text(node)
        return " ".join(text_parts).strip()

    def _fallback_extract(self, source_code: bytes) -> list[StructureNode]:
        """Regex-based extraction for malformed HTML files.

        Finds the DOCTYPE, headings, forms, semantic sections and scripts
        with a ``src``. Results are flat (no nesting), grouped by kind in
        that order, and every node spans only the line its match starts on.

        Args:
            source_code: Raw bytes of the HTML document; decoded as UTF-8
                with undecodable bytes replaced.

        Returns:
            The list of extracted nodes (possibly empty).
        """
        text = source_code.decode("utf-8", errors="replace")
        structures = []

        # Index newline positions once so each match's line number is a
        # binary search rather than a rescan of the whole prefix.
        newline_offsets = [i for i, ch in enumerate(text) if ch == "\n"]

        def line_of(offset: int) -> int:
            """Return the 1-based line number containing ``offset``."""
            return bisect_left(newline_offsets, offset) + 1

        def attr_value(attrs_str: str, attr: str) -> Optional[str]:
            """Return the quoted value of ``attr`` in a tag's attribute text.

            The lookbehind stops ``id`` from matching inside names such as
            ``data-id`` or ``valid``.
            """
            found = re.search(
                r'(?<![\w-])' + re.escape(attr) + r'=["\']([^"\']+)["\']',
                attrs_str,
                re.IGNORECASE,
            )
            return found.group(1) if found else None

        # Tag names must end at a boundary so that e.g. <formula>, <navbar>
        # or <form-field> are not mistaken for <form> / <nav>.
        name_end = r'(?![\w-])'

        # Find DOCTYPE
        doctype_match = re.search(r'<!DOCTYPE[^>]*>', text, re.IGNORECASE)
        if doctype_match:
            line_num = line_of(doctype_match.start())
            structures.append(StructureNode(
                type="doctype",
                name="DOCTYPE",
                start_line=line_num,
                end_line=line_num
            ))

        # Find headings
        heading_pattern = r'<(h[1-6])' + name_end + r'[^>]*>(.*?)</\1>'
        for match in re.finditer(
            heading_pattern, text, re.IGNORECASE | re.DOTALL
        ):
            tag = match.group(1).lower()
            # Drop any nested markup from the heading text.
            content = re.sub(r'<[^>]+>', '', match.group(2)).strip()
            line_num = line_of(match.start())
            level = int(tag[1])

            name = content[:50] + "..." if len(content) > 50 else content
            if not name:
                name = f"(empty {tag})"

            structures.append(StructureNode(
                type="heading",
                name=name,
                start_line=line_num,
                end_line=line_num,
                signature=f"H{level}"
            ))

        # Find forms
        form_pattern = r'<form' + name_end + r'([^>]*)>'
        for match in re.finditer(form_pattern, text, re.IGNORECASE):
            attrs_str = match.group(1)
            line_num = line_of(match.start())

            # Extract id/method/action from attributes
            name = attr_value(attrs_str, "id") or "form"
            method = (attr_value(attrs_str, "method") or "GET").upper()
            action = attr_value(attrs_str, "action") or "#"

            structures.append(StructureNode(
                type="form",
                name=name,
                start_line=line_num,
                end_line=line_num,
                signature=f"{method} {action}",
                modifiers=[method.lower()]
            ))

        # Find semantic sections
        section_pattern = (
            r'<(header|nav|main|footer|article|aside|section)'
            + name_end + r'([^>]*)>'
        )
        for match in re.finditer(section_pattern, text, re.IGNORECASE):
            tag = match.group(1).lower()
            attrs_str = match.group(2)
            line_num = line_of(match.start())

            name = attr_value(attrs_str, "id") or tag

            structures.append(StructureNode(
                type="section",
                name=name,
                start_line=line_num,
                end_line=line_num,
                modifiers=[tag]
            ))

        # Find scripts with src
        script_pattern = r'<script' + name_end + r'([^>]*)>'
        for match in re.finditer(script_pattern, text, re.IGNORECASE):
            attrs_str = match.group(1)
            line_num = line_of(match.start())

            src = attr_value(attrs_str, "src")
            if src:
                name = src.split("/")[-1]
                structures.append(StructureNode(
                    type="script",
                    name=name,
                    start_line=line_num,
                    end_line=line_num,
                    signature=src
                ))

        return structures

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mariusei/file-scanner-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

html_scanner.py•20.6 KiB