Memex
by narphorium
import re
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class MarkdownSection:
path: str
index: int
level: int # Header level (1-6)
header: str # The header text
content: str # Full content including header
parent_index: Optional[int] # Index of parent section
child_indices: List[int] # Indices of child sections
def split_markdown_sections(markdown_text: str, path: str) -> List[MarkdownSection]:
"""
Split markdown text into sections based on headers while maintaining hierarchy.
Returns a list of MarkdownSection objects.
Example:
# Header 1
Content 1
## Subheader 1
Content 2
# Header 2
Content 3
Will create 3 sections with appropriate parent-child relationships.
"""
# Regex to match markdown headers (# Header)
header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
# Find all headers with their positions
headers = [(match.group(1), match.group(2), match.start())
for match in header_pattern.finditer(markdown_text)]
if not headers:
# If no headers found, treat entire text as one section
return [MarkdownSection(
path=path,
index=0,
level=0,
header="",
content=markdown_text,
parent_index=None,
child_indices=[],
)]
# Create sections
sections = []
section_stack = []
for i, (hashes, header_text, start_pos) in enumerate(headers):
level = len(hashes)
# Get section content (from this header to next header or end)
end_pos = headers[i + 1][2] if i < len(headers) - 1 else len(markdown_text)
content = markdown_text[start_pos:end_pos].strip()
# Create new section
section = MarkdownSection(
path=path,
index=i,
level=level,
header=header_text.strip(),
content=content,
parent_index=None,
child_indices=[],
)
# Update parent-child relationships
while section_stack and section_stack[-1].level >= level:
section_stack.pop()
if section_stack:
parent = section_stack[-1]
section.parent_index = parent.index
parent.child_indices.append(section.index)
section_stack.append(section)
sections.append(section)
return sections