RAG Document Server

hierarchical_chunker.py•7.95 kB

"""Hierarchical chunker that preserves markdown document structure.""" import re from typing import List, Optional from utils.frontmatter_parser import FrontmatterParser from utils.markdown_parser import MarkdownParser from utils.text_chunker import TextChunker class HierarchicalChunker: """Chunk documents while preserving hierarchical structure.""" def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200): """ Initialize the hierarchical chunker. Args: chunk_size: Maximum size of each chunk in characters chunk_overlap: Number of overlapping characters between chunks """ self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.text_chunker = TextChunker(chunk_size, chunk_overlap) def chunk_markdown( self, markdown_text: str, path_prefix: Optional[str] = None ) -> List[dict]: """ Chunk markdown text while preserving section hierarchy. Args: markdown_text: The markdown document text path_prefix: Optional filesystem path prefix to prepend to section paths Returns: List of dicts with 'text', 'section_path', 'section_level' """ if not markdown_text or not markdown_text.strip(): return [] # Parse frontmatter frontmatter, content = FrontmatterParser.parse(markdown_text) frontmatter_title = FrontmatterParser.get_title(frontmatter) # Parse markdown structure from content (without frontmatter) sections = MarkdownParser.parse(content) if not sections: # Fallback to basic text chunking chunks = self.text_chunker.split_text(content) # Build path with filesystem + frontmatter base_path = self._build_base_path(path_prefix, frontmatter_title) return [ { "text": chunk, "section_path": base_path, "section_level": 0, } for chunk in chunks ] result_chunks = [] for section in sections: # Build full section path: filesystem > frontmatter > markdown headers full_section_path = self._build_full_section_path( path_prefix, frontmatter_title, section.breadcrumb ) # Get full section text including header section_header = "#" * section.level + " " + section.title 
section_full_text = section_header + "\n\n" + section.content # Check if section fits in one chunk if len(section_full_text) <= self.chunk_size: result_chunks.append( { "text": section_full_text.strip(), "section_path": full_section_path, "section_level": section.level, } ) else: # Section is too large, need to split it # Split content while keeping section context content_chunks = self._split_large_section( section_header, section.content, full_section_path, section.level ) result_chunks.extend(content_chunks) return result_chunks def _build_base_path( self, path_prefix: Optional[str], frontmatter_title: Optional[str] ) -> str: """Build base path from filesystem and frontmatter.""" parts = [] if path_prefix: parts.append(path_prefix) if frontmatter_title: parts.append(frontmatter_title) return " > ".join(parts) if parts else "Document" def _build_full_section_path( self, path_prefix: Optional[str], frontmatter_title: Optional[str], section_breadcrumb: str, ) -> str: """Build full section path combining all levels of hierarchy.""" parts = [] # Add filesystem path if path_prefix: parts.append(path_prefix) # Add frontmatter title (if it's not redundant with first section) if frontmatter_title: # Check if section breadcrumb starts with frontmatter title # to avoid redundancy like "Quickstart > Quickstart > Prerequisites" first_section = section_breadcrumb.split(" > ")[0] if frontmatter_title.lower() != first_section.lower(): parts.append(frontmatter_title) # Add markdown section breadcrumb parts.append(section_breadcrumb) return " > ".join(parts) def _split_large_section( self, header: str, content: str, breadcrumb: str, level: int ) -> List[dict]: """ Split a large section into multiple chunks while maintaining context. 
Args: header: Section header text (e.g., "## Installation") content: Section content breadcrumb: Full section path level: Header level Returns: List of chunk dictionaries """ chunks = [] # Calculate space available for content (reserve space for header) header_with_spacing = header + "\n\n" available_size = self.chunk_size - len(header_with_spacing) if available_size <= 0: # Header itself is too long, just split the content content_chunks = self.text_chunker.split_text(content) return [ { "text": chunk, "section_path": breadcrumb, "section_level": level, } for chunk in content_chunks ] # Split content into smaller pieces content_parts = self.text_chunker.split_text(content) for part in content_parts: # Prepend header to each chunk for context chunk_text = header_with_spacing + part chunks.append( { "text": chunk_text.strip(), "section_path": breadcrumb, "section_level": level, } ) return chunks def chunk_with_metadata( self, text: str, doc_id: str, is_markdown: bool = True, extra_metadata: dict = None ) -> List[dict]: """ Chunk text with full metadata. 
Args: text: The text to chunk doc_id: Document identifier is_markdown: Whether to use markdown-aware chunking extra_metadata: Additional metadata to include (should include 'path_structure') Returns: List of dictionaries with chunk text and metadata """ extra_metadata = extra_metadata or {} # Extract filesystem path structure from metadata path_structure = extra_metadata.get("path_structure") if is_markdown: chunks = self.chunk_markdown(text, path_prefix=path_structure) else: # Fallback to basic text chunking for non-markdown text_chunks = self.text_chunker.split_text(text) base_path = path_structure if path_structure else "Document" chunks = [ { "text": chunk, "section_path": base_path, "section_level": 0, } for chunk in text_chunks ] # Add full metadata to each chunk return [ { "text": chunk["text"], "doc_id": doc_id, "chunk_index": idx, "total_chunks": len(chunks), "section_path": chunk["section_path"], "section_level": chunk["section_level"], **extra_metadata, } for idx, chunk in enumerate(chunks) ]

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jaimeferj/mcp-rag-docs'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.