read_pdf_text
Extract and convert PDF document text to markdown format for AI processing. This tool reads PDF content and returns clean text from all pages, simplifying document analysis for agents.
Instructions
Read a PDF document and return only the markdown text content from all pages.
This is a simpler alternative to read_pdf that returns just the text content
without the full OCR metadata, which can be easier for agents to process.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| absolute_path | Yes | Absolute path to the PDF document to read. | |
Implementation Reference
# main.py:128-146 (handler): The handler function for the 'read_pdf_text' tool,
# registered via @mcp.tool(). It invokes Lizeur to perform OCR on the PDF and
# extracts and concatenates the markdown text from all pages.
@mcp.tool()
def read_pdf_text(absolute_path: str) -> str:
    """Read a PDF document and return only the markdown text content from all pages.

    This is a simpler alternative to read_pdf that returns just the text content
    without the full OCR metadata, which can be easier for agents to process.
    """
    # NOTE(review): the docstring above looks like the text the tool publishes
    # as its instructions, so its wording is deliberately left untouched.
    #
    # Returns one of:
    #   * "Error: Failed to process document" when no OCR response is available,
    #   * "No text content found" when no page carries markdown,
    #   * otherwise every non-empty page as "--- Page N ---" followed by its
    #     markdown, with pages separated by a blank line.
    response = Lizeur().read_document(Path(absolute_path))
    if response is None:
        return "Error: Failed to process document"

    # Page numbers follow the position in the OCR response (1-based), so pages
    # without markdown are skipped but still counted.
    sections = [
        f"--- Page {i+1} ---\n{page.markdown}"
        for i, page in enumerate(response.pages)
        if getattr(page, "markdown", None)
    ]
    if not sections:
        return "No text content found"
    return "\n\n".join(sections)
# main.py:33-64 (helper): Helper method in Lizeur class that reads or retrieves
# cached OCR response for a document path, used by the read_pdf_text handler.
def read_document(self, path: Path) -> OCRResponse | None:
    """Return the OCR response for *path*, served from the cache when possible.

    The cache entry lives at ``self.cache_path / path.name`` and holds the
    JSON dump of the OCR response. An unreadable or invalid entry is removed
    and the document is OCR'd again.

    Args:
        path: Location of the document to read.

    Returns:
        The OCR response, or None when OCR processing failed.
    """
    logging.info(f"read_document: Reading document {path.name}")

    # NOTE(review): the cache key is only the file name, so two different
    # documents sharing a name in different directories map to the same
    # entry -- confirm this is acceptable for the callers.
    cached_document_path = self.cache_path / path.name
    if cached_document_path.exists():
        logging.info(f"read_document: Document {path.name} is already cached.")
        try:
            # Explicit UTF-8 so the cache does not depend on the platform's
            # locale encoding.
            cached_json = cached_document_path.read_text(encoding="utf-8")
            return OCRResponse.model_validate(json.loads(cached_json))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # OSError covers an entry that exists but cannot be read.
            logging.warning(f"Failed to load cached document {path.name}: {e}")
            # Remove corrupted cache file; a failure here must not prevent
            # falling back to a fresh OCR run.
            try:
                cached_document_path.unlink(missing_ok=True)
            except OSError as unlink_error:
                logging.warning(
                    f"Failed to remove cached document {path.name}: {unlink_error}"
                )

    # OCR the document
    ocr_response = self._ocr_document(path)
    if ocr_response is None:
        return None

    # Caching is best-effort: a failure is logged and the response is still
    # returned to the caller.
    try:
        # Serialize before opening the file so a serialization error cannot
        # leave an empty cache entry behind.
        payload = ocr_response.model_dump_json(indent=2)
        with open(cached_document_path, "w", encoding="utf-8") as f:
            f.write(payload)
        logging.info(f"Successfully cached document {path.name}")
    except Exception as e:
        logging.error(f"Failed to cache document {path.name}: {e}")
        # Drop a possibly half-written entry so the next call does not have
        # to discover and discard it.
        try:
            cached_document_path.unlink(missing_ok=True)
        except OSError as unlink_error:
            logging.warning(
                f"Failed to remove cached document {path.name}: {unlink_error}"
            )

    return ocr_response
# main.py:66-100 (helper): Private helper method in Lizeur class that performs
# the actual OCR using Mistral AI API, including file upload, processing, and
# cleanup.
def _ocr_document(self, path: Path) -> OCRResponse | None:
    """Upload *path* to Mistral, run OCR on it and return the response.

    The uploaded file is deleted again whether or not the OCR call succeeds;
    a failed deletion is only logged.

    Args:
        path: Location of the document to OCR.

    Returns:
        The OCR response, or None when the upload or the OCR call raised
        (the error is logged).
    """
    try:
        # Upload the file to MistralAI
        # NOTE(review): file_name is sent as path.stem, i.e. without the
        # extension -- confirm this is intended.
        uploaded_file = self.mistral.files.upload(
            file={
                "file_name": path.stem,
                "content": path.read_bytes(),
            },
            purpose="ocr",
        )

        try:
            # Process the uploaded file with OCR
            return self.mistral.ocr.process(
                document={
                    "type": "file",
                    "file_id": uploaded_file.id,
                },
                model="mistral-ocr-latest",
                include_image_base64=True,
            )
        finally:
            # Clean up the uploaded file even when OCR raised, so failed runs
            # do not leave uploads behind.
            try:
                # Passed by keyword: presumably the SDK declares file_id as a
                # keyword-only parameter -- verify against the installed
                # mistralai version.
                self.mistral.files.delete(file_id=uploaded_file.id)
            except Exception as e:
                logging.warning(
                    f"Failed to delete uploaded file {uploaded_file.id}: {e}"
                )

    except Exception as e:
        logging.error(f"OCR processing failed for {path}: {e}")
        return None