Code-Index-MCP

section_extractor.py•9.83 KiB

""" Section extractor for hierarchical document structure extraction. """ import logging import re from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) class SectionExtractor: """Extract hierarchical sections from Markdown documents.""" def __init__(self): """Initialize the section extractor.""" self.heading_levels = ["h1", "h2", "h3", "h4", "h5", "h6"] def extract(self, ast: Dict[str, Any], content: str) -> List[Dict[str, Any]]: """Extract sections from the AST.""" sections = [] current_section_stack = [] content_lines = content.split("\n") for node in ast.get("children", []): if node.get("type") == "heading": # Process heading level = node.get("depth", 1) title = self._extract_text(node) start_line = node.get("position", {}).get("start", {}).get("line", 1) - 1 # Create section object section = { "id": self._generate_section_id(title), "title": title, "level": level, "start_line": start_line, "content": "", "subsections": [], "metadata": { "word_count": 0, "code_blocks": 0, "links": 0, "images": 0, }, } # Update section hierarchy while current_section_stack and current_section_stack[-1]["level"] >= level: # Close previous sections at same or higher level closed_section = current_section_stack.pop() self._finalize_section(closed_section, content_lines) if current_section_stack: current_section_stack[-1]["subsections"].append(closed_section) else: sections.append(closed_section) # Add new section to stack current_section_stack.append(section) elif current_section_stack: # Add content to current section self._add_content_to_section(current_section_stack[-1], node) # Close remaining sections while current_section_stack: closed_section = current_section_stack.pop() self._finalize_section(closed_section, content_lines) if current_section_stack: current_section_stack[-1]["subsections"].append(closed_section) else: sections.append(closed_section) return sections def extract_flat(self, ast: Dict[str, Any]) -> List[Dict[str, Any]]: """Extract a 
flat list of sections without hierarchy.""" sections = [] for node in ast.get("children", []): if node.get("type") == "heading": level = node.get("depth", 1) title = self._extract_text(node) section = { "id": self._generate_section_id(title), "title": title, "level": level, "type": f"h{level}", "position": node.get("position", {}), } sections.append(section) return sections def get_all_sections_flat(self, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Get all sections in a flat list, including nested ones.""" flat_sections = [] def flatten(section_list): for section in section_list: flat_sections.append(section) if section.get("subsections"): flatten(section["subsections"]) flatten(sections) return flat_sections def find_section_by_heading( self, sections: List[Dict[str, Any]], heading: str ) -> Optional[Dict[str, Any]]: """Find a section by its heading text.""" heading_lower = heading.lower() def search_sections( section_list: List[Dict[str, Any]], ) -> Optional[Dict[str, Any]]: for section in section_list: if section["title"].lower() == heading_lower: return section # Search subsections found = search_sections(section.get("subsections", [])) if found: return found return None return search_sections(sections) def get_section_path(self, sections: List[Dict[str, Any]], target_id: str) -> List[str]: """Get the path to a section (list of parent section titles).""" _ = [] def find_path( section_list: List[Dict[str, Any]], current_path: List[str] ) -> Optional[List[str]]: for section in section_list: new_path = current_path + [section["title"]] if section["id"] == target_id: return new_path # Search subsections found = find_path(section.get("subsections", []), new_path) if found: return found return None result = find_path(sections, []) return result or [] def get_table_of_contents( self, sections: List[Dict[str, Any]], max_depth: int = 3 ) -> List[Dict[str, Any]]: """Generate a table of contents from sections.""" toc = [] def process_sections(section_list: 
List[Dict[str, Any]], depth: int = 1): for section in section_list: if depth <= max_depth: toc_entry = { "id": section["id"], "title": section["title"], "level": section["level"], "depth": depth, } toc.append(toc_entry) # Process subsections if section.get("subsections"): process_sections(section["subsections"], depth + 1) process_sections(sections) return toc def get_section_content(self, section: Dict[str, Any], include_subsections: bool = True) -> str: """Get the full content of a section.""" content_parts = [section.get("content", "")] if include_subsections: for subsection in section.get("subsections", []): subsection_content = self.get_section_content(subsection, True) if subsection_content: content_parts.append(subsection_content) return "\n\n".join(filter(None, content_parts)) def _extract_text(self, node: Dict[str, Any]) -> str: """Extract text content from an AST node.""" if node.get("type") == "text": return node.get("value", "") text_parts = [] for child in node.get("children", []): text_parts.append(self._extract_text(child)) return " ".join(text_parts).strip() def _generate_section_id(self, title: str) -> str: """Generate a URL-friendly section ID from title.""" # Convert to lowercase id_str = title.lower() # Replace spaces and special characters with hyphens id_str = re.sub(r"[^\w\s-]", "", id_str) id_str = re.sub(r"[-\s]+", "-", id_str) # Remove leading/trailing hyphens id_str = id_str.strip("-") return id_str def _add_content_to_section(self, section: Dict[str, Any], node: Dict[str, Any]): """Add content metadata to a section.""" node_type = node.get("type", "") if node_type == "code": section["metadata"]["code_blocks"] += 1 elif node_type == "paragraph": # Count words text = self._extract_text(node) section["metadata"]["word_count"] += len(text.split()) # Count links and images self._count_inline_elements(node, section["metadata"]) elif node_type == "list": # Count words in list items for item in node.get("children", []): text = 
self._extract_text(item) section["metadata"]["word_count"] += len(text.split()) self._count_inline_elements(item, section["metadata"]) def _count_inline_elements(self, node: Dict[str, Any], metadata: Dict[str, Any]): """Count inline elements in a node.""" if node.get("type") == "link": metadata["links"] += 1 elif node.get("type") == "image": metadata["images"] += 1 for child in node.get("children", []): self._count_inline_elements(child, metadata) def _finalize_section(self, section: Dict[str, Any], content_lines: List[str]): """Finalize a section by extracting its content.""" start_line = section["start_line"] # Find the end line (start of next section at same or higher level) end_line = len(content_lines) # Extract content lines (excluding the heading itself) if start_line + 1 < end_line: section_lines = content_lines[start_line + 1 : end_line] # Remove empty lines at start and end while section_lines and not section_lines[0].strip(): section_lines.pop(0) while section_lines and not section_lines[-1].strip(): section_lines.pop() section["content"] = "\n".join(section_lines) def merge_sections( self, sections1: List[Dict[str, Any]], sections2: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: """Merge two lists of sections, avoiding duplicates.""" merged = sections1.copy() existing_ids = {s["id"] for s in sections1} for section in sections2: if section["id"] not in existing_ids: merged.append(section) return merged

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ViperJuice/Code-Index-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

section_extractor.py•9.83 KiB