Skip to main content
Glama
parser.py (21 kB)
""" Markdown parser for slash commands. This module provides comprehensive markdown parsing capabilities including: - Command block extraction - Frontmatter parsing - Code block processing - Metadata extraction """ import re import yaml import json from pathlib import Path from typing import Dict, Any, List, Optional, Tuple, Union from dataclasses import dataclass, field from datetime import datetime from enum import Enum import structlog from ..utils.logging import get_logger from ..utils.errors import ValidationError, SystemError logger = get_logger(__name__) class BlockType(Enum): """Types of blocks that can be extracted from markdown.""" COMMAND = "command" CODE = "code" TEXT = "text" FRONTMATTER = "frontmatter" HEADING = "heading" LIST = "list" QUOTE = "quote" class CommandBlockType(Enum): """Types of command blocks.""" SLASH_COMMAND = "slash_command" # /command_name AT_COMMAND = "@command_name" # @command_name HASH_COMMAND = "hash_command" # #command_name CUSTOM = "custom" # Custom pattern @dataclass class FrontmatterData: """Parsed frontmatter data from markdown.""" raw_content: str parsed_data: Dict[str, Any] = field(default_factory=dict) format: str = "yaml" # yaml, json, toml start_line: int = 0 end_line: int = 0 def get(self, key: str, default: Any = None) -> Any: """Get value from frontmatter data.""" return self.parsed_data.get(key, default) def has(self, key: str) -> bool: """Check if key exists in frontmatter.""" return key in self.parsed_data def to_dict(self) -> Dict[str, Any]: """Convert to dictionary.""" return { "raw_content": self.raw_content, "parsed_data": self.parsed_data, "format": self.format, "start_line": self.start_line, "end_line": self.end_line } @dataclass class CommandBlock: """A command block extracted from markdown.""" command_type: CommandBlockType command_name: str arguments: List[str] = field(default_factory=list) options: Dict[str, Any] = field(default_factory=dict) content: str = "" line_number: int = 0 column_number: int = 0 
# Context information raw_text: str = "" surrounding_context: str = "" metadata: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> Dict[str, Any]: """Convert to dictionary.""" return { "command_type": self.command_type.value, "command_name": self.command_name, "arguments": self.arguments, "options": self.options, "content": self.content, "line_number": self.line_number, "column_number": self.column_number, "raw_text": self.raw_text, "surrounding_context": self.surrounding_context, "metadata": self.metadata } @dataclass class MarkdownBlock: """A generic block of markdown content.""" block_type: BlockType content: str line_start: int line_end: int metadata: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> Dict[str, Any]: """Convert to dictionary.""" return { "block_type": self.block_type.value, "content": self.content, "line_start": self.line_start, "line_end": self.line_end, "metadata": self.metadata } class MarkdownParser: """Comprehensive markdown parser for slash commands.""" # Command patterns SLASH_COMMAND_PATTERN = re.compile(r'^/(\w+)(?:\s+(.*))?$', re.MULTILINE) AT_COMMAND_PATTERN = re.compile(r'^@(\w+)(?:\s+(.*))?$', re.MULTILINE) HASH_COMMAND_PATTERN = re.compile(r'^#(\w+)(?:\s+(.*))?$', re.MULTILINE) # Frontmatter patterns YAML_FRONTMATTER_PATTERN = re.compile(r'^---\n(.*?)\n---', re.DOTALL) JSON_FRONTMATTER_PATTERN = re.compile(r'^```json\n(.*?)\n```', re.DOTALL) TOML_FRONTMATTER_PATTERN = re.compile(r'^\+\+\+\n(.*?)\n\+\+\+', re.DOTALL) # Code block patterns CODE_BLOCK_PATTERN = re.compile(r'^```(\w*)\n(.*?)\n```', re.DOTALL | re.MULTILINE) INLINE_CODE_PATTERN = re.compile(r'`([^`]+)`') # Structure patterns HEADING_PATTERN = re.compile(r'^(#{1,6})\s+(.*)$', re.MULTILINE) LIST_PATTERN = re.compile(r'^(\s*)[-*+]\s+(.*)$', re.MULTILINE) QUOTE_PATTERN = re.compile(r'^>\s+(.*)$', re.MULTILINE) def __init__(self): """Initialize markdown parser.""" self.custom_patterns: Dict[str, re.Pattern] = {} self.options = { 
'extract_frontmatter': True, 'extract_code_blocks': True, 'extract_commands': True, 'include_context': True, 'context_lines': 2 } def add_custom_pattern(self, name: str, pattern: Union[str, re.Pattern]) -> None: """Add a custom command pattern.""" if isinstance(pattern, str): pattern = re.compile(pattern, re.MULTILINE) self.custom_patterns[name] = pattern logger.debug("custom_pattern_added", name=name) def set_options(self, **options) -> None: """Set parser options.""" self.options.update(options) logger.debug("parser_options_updated", options=options) def parse(self, content: str, file_path: Optional[Path] = None) -> Dict[str, Any]: """ Parse markdown content comprehensively. Args: content: Markdown content to parse file_path: Optional file path for context Returns: Dictionary containing parsed elements """ lines = content.split('\n') result = { 'content': content, 'file_path': str(file_path) if file_path else None, 'line_count': len(lines), 'frontmatter': None, 'command_blocks': [], 'code_blocks': [], 'markdown_blocks': [], 'metadata': { 'parsed_at': datetime.utcnow().isoformat(), 'parser_version': '1.0.0' } } try: # Extract frontmatter first if self.options['extract_frontmatter']: frontmatter = self._extract_frontmatter(content) if frontmatter: result['frontmatter'] = frontmatter.to_dict() # Remove frontmatter from content for further processing content = self._remove_frontmatter(content, frontmatter) # Extract command blocks if self.options['extract_commands']: command_blocks = self._extract_command_blocks(content, lines) result['command_blocks'] = [block.to_dict() for block in command_blocks] # Extract code blocks if self.options['extract_code_blocks']: code_blocks = self._extract_code_blocks(content, lines) result['code_blocks'] = code_blocks # Extract markdown structure markdown_blocks = self._extract_markdown_blocks(content, lines) result['markdown_blocks'] = [block.to_dict() for block in markdown_blocks] logger.info( "markdown_parsed", file_path=file_path, 
command_count=len(result['command_blocks']), code_block_count=len(result['code_blocks']), block_count=len(result['markdown_blocks']) ) return result except Exception as e: logger.error( "markdown_parse_error", file_path=file_path, error=str(e), exc_info=True ) raise SystemError(f"Failed to parse markdown: {e}") from e def parse_file(self, file_path: Path) -> Dict[str, Any]: """Parse markdown file.""" try: content = file_path.read_text(encoding='utf-8') return self.parse(content, file_path) except Exception as e: logger.error( "file_parse_error", file_path=file_path, error=str(e) ) raise SystemError(f"Failed to parse file {file_path}: {e}") from e def _extract_frontmatter(self, content: str) -> Optional[FrontmatterData]: """Extract frontmatter from markdown content.""" # Try YAML frontmatter (most common) match = self.YAML_FRONTMATTER_PATTERN.match(content) if match: raw_content = match.group(1) try: parsed_data = yaml.safe_load(raw_content) return FrontmatterData( raw_content=raw_content, parsed_data=parsed_data or {}, format="yaml", start_line=0, end_line=content[:match.end()].count('\n') ) except yaml.YAMLError as e: logger.warning("yaml_frontmatter_parse_error", error=str(e)) # Try JSON frontmatter match = self.JSON_FRONTMATTER_PATTERN.match(content) if match: raw_content = match.group(1) try: parsed_data = json.loads(raw_content) return FrontmatterData( raw_content=raw_content, parsed_data=parsed_data, format="json", start_line=0, end_line=content[:match.end()].count('\n') ) except json.JSONDecodeError as e: logger.warning("json_frontmatter_parse_error", error=str(e)) # Try TOML frontmatter match = self.TOML_FRONTMATTER_PATTERN.match(content) if match: raw_content = match.group(1) try: import toml parsed_data = toml.loads(raw_content) return FrontmatterData( raw_content=raw_content, parsed_data=parsed_data, format="toml", start_line=0, end_line=content[:match.end()].count('\n') ) except Exception as e: logger.warning("toml_frontmatter_parse_error", 
error=str(e)) return None def _remove_frontmatter(self, content: str, frontmatter: FrontmatterData) -> str: """Remove frontmatter from content.""" lines = content.split('\n') return '\n'.join(lines[frontmatter.end_line + 1:]) def _extract_command_blocks(self, content: str, lines: List[str]) -> List[CommandBlock]: """Extract command blocks from content.""" command_blocks = [] # Extract slash commands for match in self.SLASH_COMMAND_PATTERN.finditer(content): command_name = match.group(1) args_str = match.group(2) or "" line_num = content[:match.start()].count('\n') # Parse arguments and options arguments, options = self._parse_command_args(args_str) # Get surrounding context context = self._get_surrounding_context(lines, line_num) command_blocks.append(CommandBlock( command_type=CommandBlockType.SLASH_COMMAND, command_name=command_name, arguments=arguments, options=options, content=args_str, line_number=line_num, column_number=match.start() - content.rfind('\n', 0, match.start()) - 1, raw_text=match.group(0), surrounding_context=context )) # Extract @ commands for match in self.AT_COMMAND_PATTERN.finditer(content): command_name = match.group(1) args_str = match.group(2) or "" line_num = content[:match.start()].count('\n') arguments, options = self._parse_command_args(args_str) context = self._get_surrounding_context(lines, line_num) command_blocks.append(CommandBlock( command_type=CommandBlockType.AT_COMMAND, command_name=command_name, arguments=arguments, options=options, content=args_str, line_number=line_num, column_number=match.start() - content.rfind('\n', 0, match.start()) - 1, raw_text=match.group(0), surrounding_context=context )) # Extract # commands (but not headings) for match in self.HASH_COMMAND_PATTERN.finditer(content): # Skip if this looks like a heading if match.group(0).startswith('##') or ' ' in match.group(1): continue command_name = match.group(1) args_str = match.group(2) or "" line_num = content[:match.start()].count('\n') arguments, options = 
self._parse_command_args(args_str) context = self._get_surrounding_context(lines, line_num) command_blocks.append(CommandBlock( command_type=CommandBlockType.HASH_COMMAND, command_name=command_name, arguments=arguments, options=options, content=args_str, line_number=line_num, column_number=match.start() - content.rfind('\n', 0, match.start()) - 1, raw_text=match.group(0), surrounding_context=context )) # Extract custom pattern commands for pattern_name, pattern in self.custom_patterns.items(): for match in pattern.finditer(content): line_num = content[:match.start()].count('\n') context = self._get_surrounding_context(lines, line_num) command_blocks.append(CommandBlock( command_type=CommandBlockType.CUSTOM, command_name=pattern_name, arguments=[], options={}, content=match.group(0), line_number=line_num, column_number=match.start() - content.rfind('\n', 0, match.start()) - 1, raw_text=match.group(0), surrounding_context=context, metadata={'pattern_name': pattern_name} )) return command_blocks def _extract_code_blocks(self, content: str, lines: List[str]) -> List[Dict[str, Any]]: """Extract code blocks from content.""" code_blocks = [] for match in self.CODE_BLOCK_PATTERN.finditer(content): language = match.group(1) or "text" code_content = match.group(2) line_start = content[:match.start()].count('\n') line_end = content[:match.end()].count('\n') code_blocks.append({ 'language': language, 'content': code_content, 'line_start': line_start, 'line_end': line_end, 'raw_text': match.group(0) }) return code_blocks def _extract_markdown_blocks(self, content: str, lines: List[str]) -> List[MarkdownBlock]: """Extract structured markdown blocks.""" blocks = [] # Extract headings for match in self.HEADING_PATTERN.finditer(content): level = len(match.group(1)) title = match.group(2) line_num = content[:match.start()].count('\n') blocks.append(MarkdownBlock( block_type=BlockType.HEADING, content=title, line_start=line_num, line_end=line_num, metadata={'level': level, 'raw': 
match.group(0)} )) # Extract lists for match in self.LIST_PATTERN.finditer(content): indent = len(match.group(1)) item_content = match.group(2) line_num = content[:match.start()].count('\n') blocks.append(MarkdownBlock( block_type=BlockType.LIST, content=item_content, line_start=line_num, line_end=line_num, metadata={'indent_level': indent // 2, 'raw': match.group(0)} )) # Extract quotes for match in self.QUOTE_PATTERN.finditer(content): quote_content = match.group(1) line_num = content[:match.start()].count('\n') blocks.append(MarkdownBlock( block_type=BlockType.QUOTE, content=quote_content, line_start=line_num, line_end=line_num, metadata={'raw': match.group(0)} )) return blocks def _parse_command_args(self, args_str: str) -> Tuple[List[str], Dict[str, Any]]: """Parse command arguments and options.""" if not args_str.strip(): return [], {} # Simple argument parsing - can be enhanced parts = args_str.split() arguments = [] options = {} i = 0 while i < len(parts): part = parts[i] # Handle --option=value or --option value if part.startswith('--'): if '=' in part: key, value = part[2:].split('=', 1) options[key] = self._parse_value(value) else: key = part[2:] if i + 1 < len(parts) and not parts[i + 1].startswith('-'): options[key] = self._parse_value(parts[i + 1]) i += 1 else: options[key] = True # Handle -o value elif part.startswith('-') and len(part) == 2: key = part[1:] if i + 1 < len(parts) and not parts[i + 1].startswith('-'): options[key] = self._parse_value(parts[i + 1]) i += 1 else: options[key] = True # Regular argument else: arguments.append(part) i += 1 return arguments, options def _parse_value(self, value: str) -> Any: """Parse a string value to appropriate type.""" # Try boolean if value.lower() in ('true', 'yes', '1'): return True elif value.lower() in ('false', 'no', '0'): return False # Try number try: if '.' 
in value: return float(value) else: return int(value) except ValueError: pass # Return as string return value def _get_surrounding_context(self, lines: List[str], line_num: int) -> str: """Get surrounding context lines.""" if not self.options['include_context']: return "" context_lines = self.options['context_lines'] start = max(0, line_num - context_lines) end = min(len(lines), line_num + context_lines + 1) return '\n'.join(lines[start:end]) def extract_commands_only(self, content: str) -> List[CommandBlock]: """Extract only command blocks from content.""" lines = content.split('\n') return self._extract_command_blocks(content, lines) def extract_frontmatter_only(self, content: str) -> Optional[FrontmatterData]: """Extract only frontmatter from content.""" return self._extract_frontmatter(content) def validate_command_syntax(self, command_block: CommandBlock) -> List[str]: """Validate command syntax and return list of errors.""" errors = [] # Basic validation if not command_block.command_name: errors.append("Command name is required") if not command_block.command_name.isalnum() and '_' not in command_block.command_name: errors.append("Command name must be alphanumeric with underscores") # Command-specific validation can be added here return errors # Export public API __all__ = [ 'MarkdownParser', 'CommandBlock', 'FrontmatterData', 'MarkdownBlock', 'BlockType', 'CommandBlockType' ]

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/krzemienski/shannon-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.