"""
Documentation Indexer for Markdown and Other Documentation Files

This module provides specialized indexing for documentation files,
particularly markdown, with section-based chunking and metadata extraction.
"""
import re
from typing import List, Dict, Any, Optional, Tuple
from pathlib import Path
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
class DocumentationIndexer:
"""
Specialized indexer for markdown and other documentation files.
Chunks documents by sections (headings) and extracts rich metadata
including heading hierarchy, code blocks, and links.
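
    Example (illustrative sketch; "docs/guide.md" is a placeholder path,
    not a file shipped with this module):

        indexer = DocumentationIndexer(chunk_size=2000, chunk_overlap=400)
        if indexer.is_supported("docs/guide.md"):
            chunks = indexer.index_file("docs/guide.md")
            for chunk in chunks:
                print(chunk['metadata'].get('heading'), len(chunk['content']))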
"""
def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 400):
"""
Initialize the documentation indexer.
Args:
chunk_size: Target size for chunks (characters)
chunk_overlap: Overlap between chunks for context
"""
# Validate parameters to prevent division by zero
if chunk_size <= 0:
raise ValueError("chunk_size must be positive")
if chunk_overlap < 0:
raise ValueError("chunk_overlap must be non-negative")
if chunk_overlap >= chunk_size:
raise ValueError("chunk_overlap must be less than chunk_size")
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.supported_extensions = {'.md', '.markdown', '.rst', '.txt', '.mdx'}
# Regex patterns for markdown parsing
self.heading_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
self.code_block_pattern = re.compile(r'```(\w*)\n(.*?)\n```', re.DOTALL)
self.inline_code_pattern = re.compile(r'`([^`]+)`')
self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
self.frontmatter_pattern = re.compile(r'^---\n(.*?)\n---\n', re.DOTALL)
def is_supported(self, file_path: str) -> bool:
"""Check if file type is supported for documentation indexing."""
return Path(file_path).suffix.lower() in self.supported_extensions
def index_file(self, file_path: str) -> List[Dict[str, Any]]:
"""
Index a documentation file and return chunks with metadata.
Args:
file_path: Path to the documentation file
Returns:
List of chunks with metadata
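
            Each chunk is a dict of the form shown below (values illustrative;
            the full set of metadata keys comes from _extract_file_metadata
            and _parse_sections):

                {
                    'content': '## Installation ...',
                    'metadata': {
                        'heading': 'Installation',
                        'heading_hierarchy': ['Project Guide', 'Installation'],
                        'heading_level': 2,
                        'chunk_type': 'section',
                        'chunk_index': 1,
                        ...
                    }
                }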
"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
except Exception as e:
logger.error(f"Error reading documentation file {file_path}: {e}")
return []
# Handle empty files gracefully
if not content or not content.strip():
logger.warning(f"Documentation file {file_path} is empty")
return []
# Extract metadata
file_metadata = self._extract_file_metadata(file_path, content)
# Parse markdown structure
sections = self._parse_sections(content)
# Create chunks
chunks = []
for section in sections:
chunk_metadata = {
**file_metadata,
**section['metadata'],
'chunk_index': len(chunks),
'chunk_type': section['type']
}
# Split large sections if needed
if len(section['content']) > self.chunk_size:
sub_chunks = self._split_large_section(section['content'], chunk_metadata)
chunks.extend(sub_chunks)
else:
chunks.append({
'content': section['content'],
'metadata': chunk_metadata
})
return chunks
def _extract_file_metadata(self, file_path: str, content: str) -> Dict[str, Any]:
"""Extract file-level metadata."""
path = Path(file_path)
# Extract title (first # heading or filename)
title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
title = title_match.group(1) if title_match else path.stem.replace('-', ' ').title()
# Extract frontmatter if present
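        # A frontmatter block is leading metadata delimited by '---' lines, e.g.:
        #   ---
        #   title: Getting Started
        #   tags: docs, setup
        #   ---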
frontmatter = {}
frontmatter_match = self.frontmatter_pattern.match(content)
if frontmatter_match:
            # Simple "key: value" parsing (could be replaced with a YAML parser)
for line in frontmatter_match.group(1).split('\n'):
if ':' in line:
key, value = line.split(':', 1)
frontmatter[key.strip()] = value.strip()
# Count code blocks and languages
code_blocks = self.code_block_pattern.findall(content)
code_languages = list(set(lang for lang, _ in code_blocks if lang))
# Extract links
links = self.link_pattern.findall(content)
internal_links = [link for _, link in links if not link.startswith('http')]
external_links = [link for _, link in links if link.startswith('http')]
return {
'file_path': file_path,
'file_name': path.name,
'doc_type': path.suffix[1:], # Remove the dot
'title': title,
'frontmatter': frontmatter,
'has_code_blocks': len(code_blocks) > 0,
'code_languages': code_languages,
'code_block_count': len(code_blocks),
'internal_links': internal_links,
'external_links': external_links,
'modified_at': datetime.fromtimestamp(path.stat().st_mtime).isoformat()
}
def _parse_sections(self, content: str) -> List[Dict[str, Any]]:
"""Parse markdown content into sections based on headings."""
sections = []
# Find all headings with their positions
headings = []
for match in self.heading_pattern.finditer(content):
level = len(match.group(1))
title = match.group(2)
start = match.start()
headings.append({
'level': level,
'title': title,
'start': start,
'full_match': match.group(0)
})
# If no headings, treat entire content as one section
if not headings:
sections.append({
'content': content.strip(),
'type': 'document',
'metadata': {
'heading': None,
'heading_hierarchy': [],
'heading_level': 0
}
})
return sections
# Build heading hierarchy and extract sections
current_hierarchy = []
for i, heading in enumerate(headings):
            # Update hierarchy: drop entries at this heading's level and deeper,
            # then append the current heading at its level
            level = heading['level']
            current_hierarchy = current_hierarchy[:level - 1]
            current_hierarchy.append(heading['title'])
# Determine section end
section_start = heading['start']
section_end = headings[i+1]['start'] if i+1 < len(headings) else len(content)
# Extract section content (including the heading)
section_content = content[section_start:section_end].strip()
# Extract any code blocks in this section
section_code_blocks = self.code_block_pattern.findall(section_content)
section_code_languages = list(set(lang for lang, _ in section_code_blocks if lang))
sections.append({
'content': section_content,
'type': 'section',
'metadata': {
'heading': heading['title'],
'heading_hierarchy': current_hierarchy.copy(),
'heading_level': level,
'has_code_blocks': len(section_code_blocks) > 0,
'section_code_languages': section_code_languages
}
})
# Add any content before the first heading
if headings and headings[0]['start'] > 0:
preamble = content[:headings[0]['start']].strip()
if preamble:
sections.insert(0, {
'content': preamble,
'type': 'preamble',
'metadata': {
'heading': 'Introduction',
'heading_hierarchy': [],
'heading_level': 0
}
})
return sections
def _split_large_section(self, content: str, base_metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Split a large section into smaller chunks while preserving context."""
chunks = []
# Try to split at paragraph boundaries
paragraphs = content.split('\n\n')
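        # Note: a single paragraph longer than chunk_size is kept intact, so an
        # individual chunk can still exceed the target size in that case.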
current_chunk = ""
for paragraph in paragraphs:
# If adding this paragraph would exceed chunk size, start a new chunk
if current_chunk and len(current_chunk) + len(paragraph) + 2 > self.chunk_size:
chunks.append({
'content': current_chunk.strip(),
'metadata': {
**base_metadata,
'chunk_index': base_metadata.get('chunk_index', 0) + len(chunks),
'is_partial': True
}
})
# Start new chunk with overlap
if len(current_chunk) > self.chunk_overlap:
overlap_text = current_chunk[-self.chunk_overlap:]
current_chunk = overlap_text + "\n\n" + paragraph
else:
current_chunk = paragraph
else:
current_chunk += ("\n\n" if current_chunk else "") + paragraph
# Add final chunk
if current_chunk:
chunks.append({
'content': current_chunk.strip(),
'metadata': {
**base_metadata,
'chunk_index': base_metadata.get('chunk_index', 0) + len(chunks),
'is_partial': len(chunks) > 0
}
})
return chunks
def extract_summary(self, content: str, max_length: int = 200) -> str:
"""Extract a summary from the beginning of the content."""
# Remove headings and code blocks for summary
clean_content = re.sub(self.heading_pattern, '', content)
clean_content = re.sub(self.code_block_pattern, '', clean_content)
clean_content = re.sub(r'\s+', ' ', clean_content).strip()
if len(clean_content) <= max_length:
return clean_content
# Try to cut at sentence boundary
sentences = clean_content.split('. ')
summary = ""
for sentence in sentences:
if len(summary) + len(sentence) + 2 <= max_length:
summary += sentence + ". "
else:
break
return summary.strip() or clean_content[:max_length] + "..."
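

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): index a documentation file
    # passed on the command line and print one line per chunk.
    import sys

    if len(sys.argv) != 2:
        print(f"usage: {sys.argv[0]} <path-to-documentation-file>")
        sys.exit(1)

    doc_path = sys.argv[1]
    indexer = DocumentationIndexer()
    if not indexer.is_supported(doc_path):
        print(f"Unsupported file type: {doc_path}")
        sys.exit(1)

    for chunk in indexer.index_file(doc_path):
        meta = chunk['metadata']
        heading = meta.get('heading') or meta.get('title', '')
        print(f"[{meta['chunk_type']}] {heading}: {len(chunk['content'])} chars")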