docshelf_add_document

Idempotent

Add a PDF or Markdown file to a categorized document shelf. Automatically converts PDFs to Markdown, splits large documents by headings, and updates the navigation index.

Instructions

Add a PDF or Markdown file to the shelf and refresh INDEX.md.

.pdf is converted to Markdown (pymupdf4llm by default; pass quality='high' to use marker-pdf).
Documents larger than 50 KB with multiple H2 headings are split into one file per section (turn this off with split=False).
INDEX.md is regenerated automatically. The caller still owns the git commit / push step.

Input Schema

Table | JSON Schema

Name	Required	Description	Default
`params`	Yes

Output Schema

Table | JSON Schema

Name	Required	Description	Default
`result`	Yes

Implementation Reference

src/docshelf_mcp/core/shelf.py:164-248 (handler)

The core implementation of add_document on the Shelf class. Handles PDF conversion, Markdown writing, splitting, metadata updates, and index rebuild.

def add_document(
    self,
    source: Path | str,
    *,
    category: str,
    title: str,
    description: str = "",
    split: bool = True,
    quality: Quality = "fast",
) -> AddResult:
    """Add (or replace) a document in the shelf.

    The source is turned into Markdown (PDFs are converted, ``.md`` files are
    read as UTF-8), passed through ``clean_markdown`` and written to
    ``<root>/docs/<category-slug>/<title-slug>.md``. If the cleaned text does
    not start with a heading, a ``# <title>`` heading is prepended. The
    category metadata is then updated and INDEX.md is rebuilt.

    Any split directory left behind by an earlier call for the same
    document is removed whenever this call does not produce a split, so
    the on-disk sections never disagree with the returned ``was_split``.

    Args:
        source: Path to a ``.pdf`` or ``.md`` file.
        category: Category bucket (e.g. ``"laptops"``). Created if missing.
        title: Human-readable title — used in the INDEX entry and,
            slugified, as the file name.
        description: Short description (one sentence). Empty by default.
        split: If True (default) and the document is large enough, split it
            by H2 into a sibling subdirectory.
        quality: PDF conversion quality preset (``"fast"`` or ``"high"``).

    Returns:
        :class:`AddResult` with the on-disk paths.

    Raises:
        FileNotFoundError: ``source`` doesn't exist.
        ValueError: ``source`` is not a .pdf or .md file.
    """
    source = Path(source).expanduser().resolve()
    if not source.exists():
        raise FileNotFoundError(f"Source not found: {source}")

    suffix = source.suffix.lower()
    if suffix not in {".pdf", ".md"}:
        raise ValueError(
            f"Unsupported source type {suffix!r}; expected .pdf or .md"
        )

    # Fall back to fixed names when slugification leaves nothing usable.
    category_slug = slugify(category, max_len=80) or "uncategorized"
    category_dir = self.root / "docs" / category_slug
    category_dir.mkdir(parents=True, exist_ok=True)

    doc_stem = slugify(title, max_len=80) or "document"
    doc_path = category_dir / f"{doc_stem}.md"

    converted_from_pdf = suffix == ".pdf"
    if converted_from_pdf:
        raw_md = pdf_to_markdown(source, quality=quality)
    else:
        # errors="replace" keeps badly encoded files importable.
        raw_md = source.read_text(encoding="utf-8", errors="replace")

    cleaned = clean_markdown(raw_md)
    if not cleaned.lstrip().startswith("#"):
        # Guarantee the document opens with a heading.
        cleaned = f"# {title}\n\n{cleaned}"
    doc_path.write_text(cleaned, encoding="utf-8")

    section_paths: list[Path] = []
    was_split = False
    split_dir = category_dir / doc_stem
    if split and should_split(cleaned, self.config.split_threshold_bytes):
        sections = split_by_h2(cleaned)
        # A single section is not worth a subdirectory.
        if len(sections) >= 2:
            section_paths = write_split_files(sections, split_dir)
            was_split = True

    if not was_split and split_dir.is_dir():
        # No split was produced this time (splitting disabled, document too
        # small, or fewer than two sections) — wipe the stale split so old
        # section files do not linger next to the replaced document.
        import shutil

        shutil.rmtree(split_dir)

    # Record title/description in .meta.json for the indexer.
    self._update_category_meta(category_dir, doc_path.name, title, description)

    # Auto-rebuild INDEX.md so the on-disk state and the index stay in sync.
    # Callers that need batch performance can short-circuit by going one
    # layer down (write files manually, then call rebuild_index once).
    self.rebuild_index()

    return AddResult(
        document_path=doc_path,
        section_paths=section_paths,
        was_split=was_split,
        converted_from_pdf=converted_from_pdf,
    )

src/docshelf_mcp/tools.py:193-218 (helper)

The thin wrapper in tools.py that resolves the shelf, calls Shelf.add_document, rebuilds the index, and returns a serializable dict response.

def add_document(params: AddDocumentInput) -> dict:
    """Implementation of the ``add_document`` MCP tool.

    Resolves the shelf from ``params.shelf_path``, delegates the actual work
    to the shelf's ``add_document`` method, and flattens the result into a
    plain dict whose paths are expressed relative to the shelf root.

    Args:
        params: Validated tool input.

    Returns:
        A dict with ``status``, ``shelf_root``, ``document_path``,
        ``section_paths``, ``was_split``, ``section_count``,
        ``converted_from_pdf``, ``index_path`` and a ``next_steps`` hint
        reminding the caller to commit the change.
    """
    shelf = _resolve_shelf(params.shelf_path)
    # The shelf's add_document() already rebuilds INDEX.md before returning,
    # so no second rebuild_index() call is needed here.
    result = shelf.add_document(
        params.source_path,
        category=params.category,
        title=params.title,
        description=params.description,
        split=params.split,
        quality=params.quality,
    )
    return {
        "status": "ok",
        "shelf_root": str(shelf.root),
        "document_path": str(result.document_path.relative_to(shelf.root)),
        "section_paths": [str(p.relative_to(shelf.root)) for p in result.section_paths],
        "was_split": result.was_split,
        "section_count": len(result.section_paths),
        "converted_from_pdf": result.converted_from_pdf,
        "index_path": "INDEX.md",
        "next_steps": (
            f"Commit the changes ('git add . && git commit -m \"docs: add {params.title}\"') "
            "to make the new entry visible via raw URLs."
        ),
    }

src/docshelf_mcp/tools.py:60-103 (schema)

Pydantic model AddDocumentInput with all input fields (source_path, category, title, description, split, quality, shelf_path) and validation.

class AddDocumentInput(_BaseInput):
    """Input for ``add_document``."""

    # Required fields come first; each one rejects the empty string.
    source_path: str = Field(
        ...,
        min_length=1,
        description=(
            "Absolute path to the source .pdf or .md file on disk."
        ),
    )
    category: str = Field(
        ...,
        min_length=1,
        max_length=80,
        description=(
            "Category bucket — e.g. 'laptops', 'recipes', 'research-papers'. "
            "Created if missing."
        ),
    )
    title: str = Field(
        ...,
        min_length=1,
        max_length=200,
        description=(
            "Human-readable document title. "
            "Used as the INDEX entry and (slugified) as the filename."
        ),
    )

    # Optional fields, each with an explicit default.
    description: str = Field(
        default="",
        max_length=500,
        description=(
            "Optional one-sentence description shown next to the entry in "
            "INDEX.md."
        ),
    )
    split: bool = Field(
        default=True,
        description=(
            "Auto-split large documents (>50 KB) by H2 heading. Recommended "
            "unless the source is already small."
        ),
    )
    quality: Quality = Field(
        default="fast",
        description=(
            "PDF conversion quality: 'fast' (pymupdf4llm, default) or 'high' "
            "(marker-pdf, requires optional install)."
        ),
    )
    shelf_path: str | None = Field(
        default=None,
        description=(
            "Path to the shelf root directory. Defaults to $DOCSHELF_ROOT or "
            "the server's working directory."
        ),
    )

src/docshelf_mcp/server.py:68-92 (registration)

MCP tool registration via @mcp.tool decorator with name='docshelf_add_document', annotations, and delegation to tools.add_document.

@mcp.tool(
    name="docshelf_add_document",
    annotations={
        "title": "Add a document to the shelf",
        "readOnlyHint": False,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
def add_document(params: t.AddDocumentInput) -> str:
    """Add a PDF or Markdown file to the shelf and refresh INDEX.md.

    * ``.pdf`` is converted to Markdown (``pymupdf4llm`` by default; pass
      ``quality='high'`` to use ``marker-pdf``).
    * Documents larger than 50 KB with multiple H2 headings are split into
      one file per section (turn this off with ``split=False``).
    * INDEX.md is regenerated automatically. The caller still owns the git
      commit / push step.
    """
    # NOTE(review): the docstring above is presumably what clients see as the
    # tool's instructions — keep its wording stable; confirm before editing.
    try:
        # Serialization stays inside the try so that a failure to encode the
        # result is reported the same way as a failure of the tool itself.
        payload = t.add_document(params)
        return _serialize(payload)
    except Exception as exc:
        # Tool boundary: log the traceback and hand back a structured error
        # instead of letting the exception propagate.
        logger.exception("add_document failed")
        failure = {
            "status": "error",
            "error": str(exc),
            "type": type(exc).__name__,
        }
        return _serialize(failure)

src/docshelf_mcp/core/shelf.py:77-84 (helper)

AddResult dataclass returned by Shelf.add_document containing document_path, section_paths, was_split, and converted_from_pdf.

@dataclass
class AddResult:
    """Outcome of :meth:`Shelf.add_document`.

    Bundles the paths written for one document together with two flags
    describing how the document was processed.
    """

    # Path of the main Markdown file written for the document.
    document_path: Path
    # Paths of the per-section files; empty when the document was not split.
    section_paths: list[Path]
    # True when the document was split into two or more section files.
    was_split: bool
    # True when the source was a .pdf that had to be converted to Markdown.
    converted_from_pdf: bool

docshelf-mcp

docshelf_add_document

Instructions

Input Schema

Output Schema

Implementation Reference

Tool Definition Quality

Other Tools

Latest Blog Posts

MCP directory API