"""
Natural language task list parser
Parses various task list formats (bullets, numbers, checkboxes) and extracts metadata.
"""
import re
from typing import Any, Optional
def parse_natural_language_task_list(
    text: str,
    default_priority: str = "medium",
    default_timeframe: Optional[str] = None,
) -> dict[str, Any]:
    """
    Parse natural language task list into structured todos

    Supports multiple formats and extracts metadata from inline markers.
    A line ending in ":" is treated as a section header: it is not a task
    itself, but the context derived from it (via extract_section_context)
    applies to every task line until the next header.

    Args:
        text: The raw task list, one task or section header per line.
        default_priority: Priority passed to the line parser as the fallback.
        default_timeframe: Timeframe passed to the line parser as the fallback.

    Returns:
        A dict with three keys:
        "parsed_todos" (list of todo dicts from parse_single_task_line),
        "parse_summary" (human-readable summary string) and
        "unparseable_lines" (stripped lines the line parser rejected).
    """
    parsed_todos: list[dict[str, Any]] = []
    unparseable_lines: list[str] = []
    section_context: dict[str, Any] = {}  # Context from the most recent header

    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        # Section header: some text followed by a final colon. The whole
        # header, including any "(due 11/18)" suffix, is handed to
        # extract_section_context so that its due-date detection can fire;
        # stripping the suffix first would hide the date from it.
        if len(line) > 1 and line.endswith(":"):
            section_context = extract_section_context(line[:-1].strip())
            continue

        # Anything else is a candidate task line
        parsed = parse_single_task_line(
            line, default_priority, default_timeframe, section_context
        )
        if parsed:
            parsed_todos.append(parsed)
        else:
            unparseable_lines.append(line)

    # Generate summary
    summary_parts = [f"Parsed {len(parsed_todos)} tasks"]
    if unparseable_lines:
        summary_parts.append(f"{len(unparseable_lines)} lines could not be parsed")

    # Count by priority, in order of first appearance
    priority_counts: dict[str, int] = {}
    for todo in parsed_todos:
        priority = todo.get("priority", "medium")
        priority_counts[priority] = priority_counts.get(priority, 0) + 1
    if priority_counts:
        breakdown = ", ".join(f"{p}={c}" for p, c in priority_counts.items())
        summary_parts.append(f"Priority breakdown: {breakdown}")

    return {
        "parsed_todos": parsed_todos,
        "parse_summary": ". ".join(summary_parts),
        "unparseable_lines": unparseable_lines,
    }
def parse_single_task_line(
    line: str,
    default_priority: str,
    default_timeframe: Optional[str],
    section_context: dict[str, Any],
) -> Optional[dict[str, Any]]:
    """
    Parse a single task line and extract metadata

    List prefixes (bullets, numbers, empty checkboxes) are stripped, then
    inline markers are pulled out of the text in a fixed order: priority,
    timeframe, time estimate, energy, theme and the "quick" flag. Whatever
    text remains becomes the title.

    Args:
        line: One non-empty line of the task list.
        default_priority: Priority used when neither the section context
            nor an inline marker sets one.
        default_timeframe: Timeframe used when neither the section context
            nor an inline marker sets one.
        section_context: Values inherited from the enclosing section
            ("priority", "timeframe", "theme_tag"); inline markers win.

    Returns:
        The todo dict. It always has "priority", "timeframe" and "title",
        and optionally "theme_tag", "time_estimate" (minutes),
        "energy_required" and "quick". Returns None for completed
        checkboxes and for lines with no title text left after extraction.
    """
    # Skip completed checkboxes, whichever list prefix precedes them
    if re.match(r"^(?:[-*•]|\d+\.)?\s*\[x\]", line, re.IGNORECASE):
        return None

    # Remove common prefixes
    line = re.sub(r"^[-*•]\s*", "", line)  # Bullet points
    line = re.sub(r"^\d+\.\s*", "", line)  # Numbered lists
    line = re.sub(r"^\[\s*\]\s*", "", line)  # Checkboxes

    # Initialize with defaults and section context
    todo: dict[str, Any] = {
        "priority": section_context.get("priority", default_priority),
        "timeframe": section_context.get("timeframe", default_timeframe),
    }
    # Inherit theme from section context if present
    if "theme_tag" in section_context:
        todo["theme_tag"] = section_context["theme_tag"]

    # Priority markers. The \b after URGENT keeps ordinary words such as
    # "Urgently" from being read as a marker and truncated.
    priority_patterns = [
        (r"^(?:!{2,}|URGENT\b:?)\s*", "high"),
        (r"#high\b|#urgent\b", "high"),
        (r"#low\b", "low"),
        (r"#medium\b", "medium"),
    ]
    match, priority, line = _extract_first_marker(line, priority_patterns)
    if match is not None:
        todo["priority"] = priority

    # Timeframe markers. q1..q4 must be a standalone word so that tokens
    # like "faq2" are left alone.
    timeframe_patterns = [
        (r"\bthis\s+week\b", "this_week"),
        (r"\bnext\s+sprint\b", "next_sprint"),
        (r"\bthis\s+month\b", "this_month"),
        (r"\bthis\s+quarter\b|\bq[1-4]\b", "this_quarter"),
        (r"\bsomeday\b|\blater\b", "someday"),
    ]
    match, timeframe, line = _extract_first_marker(line, timeframe_patterns)
    if match is not None:
        todo["timeframe"] = timeframe

    # Time estimates, normalised to whole minutes. The hours form accepts
    # "(2h)", "(2hr)", "(2hrs)", "(2 hour)" and "(2 hours)".
    time_patterns = [
        (
            r"\((\d+(?:\.\d+)?)\s*h(?:(?:ou)?rs?)?\)",
            lambda m: int(float(m.group(1)) * 60),
        ),
        (r"\((\d+)\s*min(?:ute)?s?\)", lambda m: int(m.group(1))),
        (r"~(\d+)h\b", lambda m: int(m.group(1)) * 60),
        (r"~(\d+)min\b", lambda m: int(m.group(1))),
    ]
    match, to_minutes, line = _extract_first_marker(line, time_patterns)
    if match is not None:
        todo["time_estimate"] = to_minutes(match)

    # Energy markers
    energy_patterns = [
        (r"\[high\s+energy\]|\[deep\s+work\]", "high"),
        (r"\[low\s+(?:energy|effort)\]|\[easy\]", "low"),
        (r"\[medium\s+energy\]", "medium"),
    ]
    match, energy, line = _extract_first_marker(line, energy_patterns)
    if match is not None:
        todo["energy_required"] = energy

    # Theme tags. The lookbehind stops "admin" from matching as the tail
    # of a longer word such as "sysadmin".
    theme_patterns = [
        (r"#sprint\b", "sprint_work"),
        (r"#strategic\b", "strategic"),
        (r"(?<!\w)@?#?admin\b", "admin"),
        (r"#learning\b", "learning"),
    ]
    match, theme, line = _extract_first_marker(line, theme_patterns)
    if match is not None:
        todo["theme_tag"] = theme

    # Quick marker
    if re.search(r"\bquick\b", line, re.IGNORECASE):
        todo["quick"] = True
        line = re.sub(r"\bquick\b", "", line, flags=re.IGNORECASE).strip()

    # Clean up line and use as title
    line = re.sub(r"\s+", " ", line).strip()  # Collapse whitespace
    line = re.sub(r"^[-:]+\s*", "", line)  # Remove leading separators
    if not line:
        return None  # No title left after extraction
    todo["title"] = line
    return todo


def _extract_first_marker(
    line: str, patterns: list[tuple[str, Any]]
) -> tuple[Optional[re.Match], Any, str]:
    """
    Find the first pattern (in list order) that occurs in the line

    Matching is case-insensitive. On a hit, every occurrence of that
    pattern is removed from the line and the result is stripped.

    Returns:
        (match, payload, cleaned_line) for the first pattern that matches,
        or (None, None, line) with the line untouched when none does.
    """
    for pattern, payload in patterns:
        match = re.search(pattern, line, re.IGNORECASE)
        if match is not None:
            cleaned = re.sub(pattern, "", line, flags=re.IGNORECASE).strip()
            return match, payload, cleaned
    return None, None, line
def extract_section_context(section_title: str) -> dict[str, Any]:
    """
    Derive inherited task settings from a section heading

    Headings such as "Sprint work (due 11/18):" or "Strategic (this month):"
    are scanned for keywords. The returned dict may contain "theme_tag",
    "priority" and "timeframe"; a key is absent when the heading gives no
    hint for it.
    """
    lowered = section_title.lower()
    settings: dict[str, Any] = {}

    # Theme keywords, checked in order; only the first matching rule applies.
    theme_rules = (
        # Sprint work is usually high priority
        (("sprint",), {"theme_tag": "sprint_work", "priority": "high"}),
        (("strategic",), {"theme_tag": "strategic"}),
        (("admin", "quick"), {"theme_tag": "admin", "priority": "low"}),
        (("learning",), {"theme_tag": "learning"}),
    )
    for keywords, implied in theme_rules:
        if any(word in lowered for word in keywords):
            settings.update(implied)
            break

    # Timeframe keywords, same first-match-wins rule. The bare word
    # already covers the longer phrase listed before it.
    timeframe_rules = (
        (("this week", "week"), "this_week"),
        (("this month", "month"), "this_month"),
        (("someday", "backlog"), "someday"),
    )
    for keywords, timeframe in timeframe_rules:
        if any(word in lowered for word in keywords):
            settings["timeframe"] = timeframe
            break

    # An explicit due date overrides both: assume due dates are urgent
    has_due_date = re.search(r"due\s+([\d/-]+)", section_title, re.IGNORECASE)
    if has_due_date is not None:
        settings["timeframe"] = "this_week"
        settings["priority"] = "high"

    return settings