RBT Document Editor

Overview Schema Related Servers Score Discussions

parser.py•9.42 KiB

import re import yaml import datetime from typing import List, Dict, Any, Optional from dataclasses import dataclass, field # @REQ: REQ-md-json-converter # @BP: BP-md-json-converter # @TASK: TASK-md-json-converter-001-Parser # --- Intermediate Data Structures --- @dataclass class ParsedBlock: id: str type: str title: Optional[str] = None content: Optional[str] = None items: List[str] = field(default_factory=list) @dataclass class ParsedSection: id: str title: str level: int summary: Optional[str] = None blocks: List[ParsedBlock] = field(default_factory=list) # Temp fields for building hierarchy parent_id: Optional[str] = None @dataclass class ParsedData: metadata: Dict[str, Any] = field(default_factory=dict) info: Dict[str, Any] = field(default_factory=dict) main_title: str = "" sections: List[ParsedSection] = field(default_factory=list) class MarkdownParser: """Parses the custom Markdown format into an intermediate structure.""" def _convert_dates_to_iso(self, data: Dict[str, Any]) -> Dict[str, Any]: """Recursively converts date/datetime objects in a dictionary to ISO 8601 strings.""" if data is None: return None for key, value in data.items(): if isinstance(value, (datetime.date, datetime.datetime)): data[key] = value.isoformat() elif isinstance(value, dict): data[key] = self._convert_dates_to_iso(value) return data def parse(self, md_text: str) -> ParsedData: """Main method to parse the entire Markdown string.""" parsed_data = ParsedData() # 1. Split content into YAML, Info, and Main parts try: _, yaml_content, main_content = md_text.split('---\n', 2) parsed_data.metadata = yaml.safe_load(yaml_content) parsed_data.metadata = self._convert_dates_to_iso(parsed_data.metadata) except (ValueError, yaml.YAMLError) as e: raise ValueError(f"Error parsing YAML front matter: {e}") lines = main_content.strip().split('\n') # 2. Parse Info Section if lines and lines[0] == '': lines.pop(0) info_lines = [] while lines and lines[0].startswith('> '): info_lines.append(lines.pop(0)[2:]) parsed_data.info = yaml.safe_load('\n'.join(info_lines)) parsed_data.info = self._convert_dates_to_iso(parsed_data.info) if lines and not lines[0].strip(): lines.pop(0) # Pop blank line after info # 3. Parse Main Title and Root Section Comment if lines and lines[0] == '': lines.pop(0) # Consume the root section comment if lines and lines[0].startswith('# '): parsed_data.main_title = lines.pop(0).lstrip('# ').strip() if lines and not lines[0].strip(): lines.pop(0) # Pop blank line after title else: raise ValueError("Root section comment must be followed by a main title (#).") else: raise ValueError("Document must start with a root section comment ().") # 4. Parse Sections and Blocks current_section: Optional[ParsedSection] = None current_block_meta: Optional[Dict[str, str]] = None content_buffer: List[str] = [] i = 0 while i < len(lines): line = lines[i] comment_match = re.match(r'^$', line) if comment_match: # Finalize previous block if it exists and buffer is not empty if current_block_meta and content_buffer: self._finalize_block(content_buffer, current_block_meta, current_section) content_buffer.clear() meta_string = comment_match.group(1) meta = dict(re.findall(r'(\w+):\s*([^,]+(?:\s*[^,]+)*)', meta_string)) # Check if next line is a section heading next_line_idx = i + 1 next_line_is_section_heading = (next_line_idx < len(lines)) and re.match(r'^#+\s', lines[next_line_idx]) if next_line_is_section_heading: # This comment is for a section current_section = self._create_section(meta, lines[next_line_idx]) parsed_data.sections.append(current_section) current_block_meta = None # Ensure block meta is cleared for a new section i += 1 # Consume the section title line # Check for summary after section title summary_line_idx = i + 1 if summary_line_idx < len(lines): summary_line = lines[summary_line_idx].strip() summary_match = re.match(r'^\[SUMMARY:(.*?)\]$', summary_line) if summary_match: current_section.summary = summary_match.group(1).strip() i += 1 # Consume the summary line else: # This comment is for a block current_block_meta = meta # The line after the comment is the start of the block content # We don't consume it here, it will be consumed by the main loop elif re.match(r'^##\s', line): # Heading without id comment - auto-generate section # Finalize previous block first if current_block_meta and content_buffer: self._finalize_block(content_buffer, current_block_meta, current_section) content_buffer.clear() current_block_meta = None # Auto-generate section id from heading text heading_text = line.lstrip('#').strip() auto_id = self._generate_section_id(heading_text) auto_meta = {'id': auto_id} current_section = self._create_section(auto_meta, line) parsed_data.sections.append(current_section) elif current_section: is_summary = re.match(r'^\[SUMMARY:(.*?)\]$', line) if is_summary: current_section.summary = is_summary.group(1).strip() # Only append content if we are inside a section and it's not a heading elif not re.match(r'^#+\s', line): content_buffer.append(line) i += 1 # Move to the next line # Finalize the very last block if any content remains if current_block_meta and content_buffer: self._finalize_block(content_buffer, current_block_meta, current_section) return parsed_data def _generate_section_id(self, heading_text: str) -> str: """ Generate a section ID from heading text (auto-generate for sections without id comment). Examples: "Section 1" -> "sec-section-1" "1. 任務目標與前置" -> "sec-1-任務目標與前置" "Implementation & Testing" -> "sec-implementation-testing" """ # Convert to lowercase slug = heading_text.lower() # Replace spaces and underscores with hyphens slug = re.sub(r'[\s_]+', '-', slug) # Remove special characters except hyphens, alphanumerics, and unicode letters slug = re.sub(r'[^\w\-]', '', slug, flags=re.UNICODE) # Remove multiple consecutive hyphens slug = re.sub(r'-+', '-', slug) # Strip leading/trailing hyphens slug = slug.strip('-') # Add "sec-" prefix if not already present if not slug.startswith('sec-'): slug = f"sec-{slug}" return slug def _create_section(self, meta: Dict[str, str], title_line: str) -> ParsedSection: level = title_line.find(' ') title = title_line.lstrip('# ').strip() return ParsedSection(id=meta['id'], title=title, level=level) def _finalize_block(self, buffer: List[str], meta: Optional[Dict[str, str]], section: Optional[ParsedSection]): if not meta or not section: return content_str = '\n'.join(buffer).strip() if not content_str: return block = ParsedBlock(id=meta['id'], type=meta['type']) # Extract title if it exists (must be first line of content_str) lines = content_str.split('\n') first_line = lines[0].strip() # Check for bold title pattern: **Title** or **Title**: title_match = re.match(r'^\*\*(.*?)\*\*(:?)$', first_line) if title_match: block.title = title_match.group(1).strip() lines.pop(0) content_str = '\n'.join(lines).strip() # Process content based on type if block.type == 'list': # Remove all leading '-' and whitespace (handles nested lists) block.items = [ re.sub(r'^[\s\-]+', '', line).strip() for line in content_str.split('\n') if line.strip() ] elif block.type == 'code': # For code, strip the ``` markers if content_str.startswith('```') and content_str.endswith('```'): content_str = '\n'.join(content_str.split('\n')[1:-1]).strip() block.content = content_str elif block.type in ['table', 'paragraph']: block.content = content_str section.blocks.append(block)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/leo7nel23/KnowkedgeSmith-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

parser.py•9.42 KiB