docshelf_add_document
Add a PDF or Markdown file to a categorized document shelf. Automatically converts PDFs to Markdown, splits large documents by headings, and updates the navigation index.
Instructions
Add a PDF or Markdown file to the shelf and refresh INDEX.md.
- `.pdf` is converted to Markdown (`pymupdf4llm` by default; pass `quality='high'` to use `marker-pdf`).
- Documents larger than 50 KB with multiple H2 headings are split into one file per section (turn this off with `split=False`).
- INDEX.md is regenerated automatically. The caller still owns the git commit / push step.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| params | Yes | | |
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
| result | Yes | | |
Implementation Reference
- src/docshelf_mcp/core/shelf.py:164-248 (handler) The core implementation of add_document on the Shelf class. Handles PDF conversion, Markdown writing, splitting, metadata updates, and index rebuild.
def add_document( self, source: Path | str, *, category: str, title: str, description: str = "", split: bool = True, quality: Quality = "fast", ) -> AddResult: """Add (or replace) a document in the shelf. Args: source: Path to a ``.pdf`` or ``.md`` file. category: Category bucket (e.g. ``"laptops"``). Created if missing. title: Human-readable title — used in the INDEX entry. description: Short description (one sentence). Empty by default. split: If True (default) and the document is large enough, split it by H2 into a sibling subdirectory. quality: PDF conversion quality preset (``"fast"`` or ``"high"``). Returns: :class:`AddResult` with the on-disk paths. Raises: FileNotFoundError: ``source`` doesn't exist. ValueError: ``source`` is not a .pdf or .md file. """ source = Path(source).expanduser().resolve() if not source.exists(): raise FileNotFoundError(f"Source not found: {source}") suffix = source.suffix.lower() if suffix not in {".pdf", ".md"}: raise ValueError( f"Unsupported source type {suffix!r}; expected .pdf or .md" ) category_slug = slugify(category, max_len=80) or "uncategorized" category_dir = self.root / "docs" / category_slug category_dir.mkdir(parents=True, exist_ok=True) doc_stem = slugify(title, max_len=80) or "document" doc_path = category_dir / f"{doc_stem}.md" if suffix == ".pdf": raw_md = pdf_to_markdown(source, quality=quality) converted_from_pdf = True else: raw_md = source.read_text(encoding="utf-8", errors="replace") converted_from_pdf = False cleaned = clean_markdown(raw_md) if not cleaned.lstrip().startswith("#"): cleaned = f"# {title}\n\n{cleaned}" doc_path.write_text(cleaned, encoding="utf-8") section_paths: list[Path] = [] was_split = False split_dir = category_dir / doc_stem if split and should_split(cleaned, self.config.split_threshold_bytes): sections = split_by_h2(cleaned) if len(sections) >= 2: section_paths = write_split_files(sections, split_dir) was_split = True elif split_dir.is_dir(): # Document is no longer large enough — wipe 
the stale split. import shutil shutil.rmtree(split_dir) # Record title/description in .meta.json for the indexer. self._update_category_meta(category_dir, doc_path.name, title, description) # Auto-rebuild INDEX.md so the on-disk state and the index stay in sync. # Callers that need batch performance can short-circuit by going one # layer down (write files manually, then call rebuild_index once). self.rebuild_index() return AddResult( document_path=doc_path, section_paths=section_paths, was_split=was_split, converted_from_pdf=converted_from_pdf, ) - src/docshelf_mcp/tools.py:193-218 (helper)The thin wrapper in tools.py that resolves the shelf, calls Shelf.add_document, rebuilds the index, and returns a serializable dict response.
def add_document(params: AddDocumentInput) -> dict: """Implementation of the ``add_document`` MCP tool.""" shelf = _resolve_shelf(params.shelf_path) result = shelf.add_document( params.source_path, category=params.category, title=params.title, description=params.description, split=params.split, quality=params.quality, ) shelf.rebuild_index() return { "status": "ok", "shelf_root": str(shelf.root), "document_path": str(result.document_path.relative_to(shelf.root)), "section_paths": [str(p.relative_to(shelf.root)) for p in result.section_paths], "was_split": result.was_split, "section_count": len(result.section_paths), "converted_from_pdf": result.converted_from_pdf, "index_path": "INDEX.md", "next_steps": ( f"Commit the changes ('git add . && git commit -m \"docs: add {params.title}\"') " "to make the new entry visible via raw URLs." ), } - src/docshelf_mcp/tools.py:60-103 (schema)Pydantic model AddDocumentInput with all input fields (source_path, category, title, description, split, quality, shelf_path) and validation.
class AddDocumentInput(_BaseInput): """Input for ``add_document``.""" source_path: str = Field( ..., description="Absolute path to the source .pdf or .md file on disk.", min_length=1, ) category: str = Field( ..., description="Category bucket — e.g. 'laptops', 'recipes', 'research-papers'. " "Created if missing.", min_length=1, max_length=80, ) title: str = Field( ..., description="Human-readable document title. Used as the INDEX entry and " "(slugified) as the filename.", min_length=1, max_length=200, ) description: str = Field( default="", description="Optional one-sentence description shown next to the entry " "in INDEX.md.", max_length=500, ) split: bool = Field( default=True, description="Auto-split large documents (>50 KB) by H2 heading. " "Recommended unless the source is already small.", ) quality: Quality = Field( default="fast", description="PDF conversion quality: 'fast' (pymupdf4llm, default) or " "'high' (marker-pdf, requires optional install).", ) shelf_path: str | None = Field( default=None, description="Path to the shelf root directory. Defaults to $DOCSHELF_ROOT " "or the server's working directory.", ) - src/docshelf_mcp/server.py:68-92 (registration)MCP tool registration via @mcp.tool decorator with name='docshelf_add_document', annotations, and delegation to tools.add_document.
@mcp.tool( name="docshelf_add_document", annotations={ "title": "Add a document to the shelf", "readOnlyHint": False, "destructiveHint": False, "idempotentHint": True, "openWorldHint": False, }, ) def add_document(params: t.AddDocumentInput) -> str: """Add a PDF or Markdown file to the shelf and refresh INDEX.md. * ``.pdf`` is converted to Markdown (``pymupdf4llm`` by default; pass ``quality='high'`` to use ``marker-pdf``). * Documents larger than 50 KB with multiple H2 headings are split into one file per section (turn this off with ``split=False``). * INDEX.md is regenerated automatically. The caller still owns the git commit / push step. """ try: return _serialize(t.add_document(params)) except Exception as exc: logger.exception("add_document failed") return _serialize({"status": "error", "error": str(exc), "type": type(exc).__name__}) - src/docshelf_mcp/core/shelf.py:77-84 (helper)AddResult dataclass returned by Shelf.add_document containing document_path, section_paths, was_split, and converted_from_pdf.
@dataclass
class AddResult:
    """Result record produced by :meth:`Shelf.add_document`.

    Attributes:
        document_path: Path of the Markdown document that was written.
        section_paths: Paths of the per-section files when the document was
            split; an empty list otherwise.
        was_split: Whether split section files were written for this call.
        converted_from_pdf: Whether the source was a PDF that had to be
            converted to Markdown.
    """

    # Field order is part of the positional constructor signature.
    document_path: Path
    section_paths: list[Path]
    was_split: bool
    converted_from_pdf: bool