# We provide all the information about MCP servers via our MCP API:
#   curl -X GET 'https://glama.ai/api/mcp/v1/servers/89jobrien/mcp-joecc'
# If you have feedback or need assistance with the MCP directory API, please join our Discord server.
"""Markdown adapter for parsing local TO-DO.md and TODO.md files.
Implements the TaskAdapter interface for local markdown task files.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from loguru import logger
from mcp_task_aggregator.adapters.base import TaskAdapter
from mcp_task_aggregator.models import (
ExternalTaskMetadata,
MarkdownMetadata,
Tag,
Todo,
TodoSource,
TodoStatus,
)
@dataclass
class MarkdownConfig:
"""Configuration for Markdown adapter."""
search_paths: list[Path]
file_patterns: list[str] = None
def __post_init__(self) -> None:
"""Set default file patterns if not provided."""
if self.file_patterns is None:
self.file_patterns = ["TO-DO.md", "TODO.md", "todo.md", "to-do.md"]
# Checkbox state mapping to internal TodoStatus.
# States not listed here fall back to TodoStatus.TODO (see map_status below).
CHECKBOX_STATUS_MAP: dict[str, TodoStatus] = {
    "[ ]": TodoStatus.TODO,
    "[x]": TodoStatus.DONE,
    "[X]": TodoStatus.DONE,  # uppercase variant of done
    "[-]": TodoStatus.CANCELLED,
    "[~]": TodoStatus.IN_PROGRESS,
    "[>]": TodoStatus.BLOCKED,
    "[?]": TodoStatus.IN_REVIEW,
}

# Regex pattern for markdown checkboxes; the state class below must stay in
# sync with the keys of CHECKBOX_STATUS_MAP.
CHECKBOX_PATTERN = re.compile(
    r"^(\s*)"  # Leading whitespace (capture group 1 - indent)
    r"[-*+]\s+"  # List marker (-, *, or +) followed by space
    r"(\[[ xX\-~>?]\])"  # Checkbox state (capture group 2)
    r"\s+"  # Space after checkbox
    r"(.+)$"  # Task content (capture group 3)
)

# Regex pattern for priority markers like (P1), (P2), etc. (digits 0-5 only)
PRIORITY_PATTERN = re.compile(r"\(P([0-5])\)")

# Regex pattern for due date markers like (due:2024-12-25), ISO date shape
DUE_DATE_PATTERN = re.compile(r"\(due:(\d{4}-\d{2}-\d{2})\)")

# Regex pattern for tags like #tag or @context; must start the line or
# follow whitespace so mid-word '#'/'@' are not picked up
TAG_PATTERN = re.compile(r"(?:^|\s)([#@]\w+)")
class MarkdownAdapter(TaskAdapter):
    """Adapter for parsing tasks from local markdown files.

    Scans the configured search paths for todo-style markdown files,
    extracts checkbox list items, and normalizes them into Todo models.
    """

    def __init__(self, config: MarkdownConfig) -> None:
        """Initialize Markdown adapter.

        Args:
            config: Configuration with search paths and file patterns.
        """
        self.config = config

    def fetch_tasks(self, **kwargs: Any) -> list[dict[str, Any]]:
        """Fetch tasks from markdown files.

        Args:
            **kwargs: Optional parameters:
                - search_paths: Override default search paths.
                - file_patterns: Override default file patterns.

        Returns:
            List of raw task data as dictionaries.
        """
        search_paths = kwargs.get("search_paths", self.config.search_paths)
        file_patterns = kwargs.get("file_patterns", self.config.file_patterns)
        logger.info(f"Searching for markdown todo files in {len(search_paths)} paths")
        tasks: list[dict[str, Any]] = []
        for search_path in search_paths:
            path = Path(search_path)
            if not path.exists():
                logger.warning(f"Search path does not exist: {path}")
                continue
            found_files = self._find_todo_files(path, file_patterns)
            logger.debug(f"Found {len(found_files)} todo files in {path}")
            for file_path in found_files:
                tasks.extend(self._parse_file(file_path))
        logger.info(f"Fetched {len(tasks)} tasks from markdown files")
        return tasks

    def _find_todo_files(self, search_path: Path, patterns: list[str]) -> list[Path]:
        """Find todo files matching patterns in a directory.

        Args:
            search_path: Directory (or direct file path) to search.
            patterns: File name patterns to match.

        Returns:
            Sorted, de-duplicated list of matching file paths.
        """
        found_files: list[Path] = []
        if search_path.is_file():
            # Direct file path provided: accept it when it matches a
            # pattern or is any markdown file.
            if search_path.name in patterns or search_path.suffix == ".md":
                found_files.append(search_path)
        else:
            for pattern in patterns:
                # A single recursive glob suffices: "**/pattern" matches the
                # top-level directory as well as all subdirectories, so the
                # previous extra non-recursive glob only produced duplicates.
                found_files.extend(search_path.glob(f"**/{pattern}"))
        # Deduplicate (overlapping patterns can match the same file) and
        # sort for deterministic ordering.
        return sorted(set(found_files))

    def _parse_file(self, file_path: Path) -> list[dict[str, Any]]:
        """Parse a markdown file for checkbox tasks.

        Args:
            file_path: Path to the markdown file.

        Returns:
            List of raw task dictionaries; tasks parsed before a read
            error are still returned.
        """
        tasks: list[dict[str, Any]] = []
        current_heading: str | None = None
        try:
            file_text = file_path.read_text(encoding="utf-8")
            for line_number, line in enumerate(file_text.splitlines(), start=1):
                # Track the most recent heading to give each task its
                # section context.
                if line.startswith("#"):
                    current_heading = line.lstrip("#").strip()
                    continue
                match = CHECKBOX_PATTERN.match(line)
                if not match:
                    continue
                # Distinct name for the matched task text -- the original
                # reused "content", clobbering the whole-file text variable.
                task_text = match.group(3)
                tasks.append(
                    {
                        "file_path": str(file_path.resolve()),
                        "line_number": line_number,
                        "checkbox_state": match.group(2),
                        "indent_level": len(match.group(1)),
                        "parent_heading": current_heading,
                        "raw_line": line,
                        "content": task_text.strip(),
                    }
                )
        except Exception as e:
            # Per-file best-effort boundary: log and return what was parsed
            # so one unreadable file does not abort the whole scan.
            logger.error(f"Error parsing {file_path}: {e}")
        return tasks

    def normalize_task(self, raw_task: dict[str, Any]) -> Todo:
        """Normalize a markdown task to the internal Todo model.

        Args:
            raw_task: Raw task data from file parsing.

        Returns:
            Normalized Todo instance.
        """
        content = raw_task["content"]
        status = self.map_status(raw_task["checkbox_state"])
        priority = self._extract_priority(content)
        due_date = self._extract_due_date(content)
        # Strip inline metadata markers so the stored content is clean.
        clean_content = self._clean_content(content)
        # "path:line" uniquely identifies a task on the local filesystem.
        source_id = f"{raw_task['file_path']}:{raw_task['line_number']}"
        # file:// URL; file_path is an absolute (resolved) path.
        source_url = f"file://{raw_task['file_path']}#L{raw_task['line_number']}"
        markdown_metadata = MarkdownMetadata(
            file_path=raw_task["file_path"],
            line_number=raw_task["line_number"],
            checkbox_state=raw_task["checkbox_state"],
            indent_level=raw_task["indent_level"],
            parent_heading=raw_task["parent_heading"],
            raw_line=raw_task["raw_line"],
        )
        external_metadata = ExternalTaskMetadata(
            markdown=markdown_metadata,
            raw_response=raw_task,
            # NOTE(review): naive local timestamp -- confirm whether the
            # model expects timezone-aware datetimes.
            fetched_at=datetime.now(),
        )
        tags = self._build_tags(raw_task)
        sync_hash = self.generate_sync_hash(raw_task)
        return Todo(
            content=clean_content,
            status=status,
            priority=priority,
            due_date=due_date,
            source_system=TodoSource.MARKDOWN,
            source_id=source_id,
            source_url=source_url,
            external_metadata=external_metadata,
            sync_hash=sync_hash,
            tags=tags,
        )

    def map_status(self, external_status: str) -> TodoStatus:
        """Map checkbox state to internal TodoStatus.

        Args:
            external_status: Checkbox state string (e.g., "[ ]", "[x]").

        Returns:
            Corresponding TodoStatus; unknown states default to TODO.
        """
        return CHECKBOX_STATUS_MAP.get(external_status, TodoStatus.TODO)

    def _extract_priority(self, content: str) -> int:
        """Extract priority from task content.

        Looks for markers like (P1), (P2), etc.

        Args:
            content: Task content string.

        Returns:
            Priority integer (0-5); 0 when no marker is present.
        """
        match = PRIORITY_PATTERN.search(content)
        return int(match.group(1)) if match else 0

    def _extract_due_date(self, content: str) -> datetime | None:
        """Extract due date from task content.

        Looks for markers like (due:2024-12-25).

        Args:
            content: Task content string.

        Returns:
            datetime if a valid date marker is found, None otherwise.
        """
        match = DUE_DATE_PATTERN.search(content)
        if match:
            try:
                return datetime.strptime(match.group(1), "%Y-%m-%d")
            except ValueError:
                # Pattern matched the ISO shape but not a real date
                # (e.g. month 13); treat as no due date.
                pass
        return None

    def _clean_content(self, content: str) -> str:
        """Clean task content by removing metadata markers.

        Removes priority markers like (P1) and due date markers, then
        collapses runs of whitespace.

        Args:
            content: Raw task content.

        Returns:
            Cleaned content string.
        """
        content = PRIORITY_PATTERN.sub("", content)
        content = DUE_DATE_PATTERN.sub("", content)
        return " ".join(content.split())

    def _build_tags(self, raw_task: dict[str, Any]) -> list[Tag]:
        """Build tags list from markdown task data.

        Extracts:
            - `markdown` tag for all markdown tasks
            - `file:<filename>` for the source file
            - `section:<heading>` when the task sits under a heading
            - `#tag` hashtags from content (stored without the `#`)
            - `@context` tags from content (stored as `context:<name>`)

        Args:
            raw_task: Raw task data from file parsing.

        Returns:
            List of Tag objects.
        """
        tags = [Tag(name="markdown")]
        file_path = Path(raw_task["file_path"])
        tags.append(Tag(name=f"file:{file_path.name}"))
        if raw_task.get("parent_heading"):
            # Lowercase and hyphenate the heading for a tag-safe name.
            heading_tag = raw_task["parent_heading"].lower().replace(" ", "-")
            tags.append(Tag(name=f"section:{heading_tag}"))
        content = raw_task["content"]
        for match in TAG_PATTERN.finditer(content):
            tag_name = match.group(1)
            if tag_name.startswith("@"):
                tags.append(Tag(name=f"context:{tag_name[1:]}"))
            elif tag_name.startswith("#"):
                tags.append(Tag(name=tag_name[1:]))  # Remove # prefix
        return tags