convert_pdf_url
Convert PDF files from URLs to Markdown format with optional OCR support for extracting text from scanned documents.
Instructions
Convert PDF URL to Markdown, supports single URL or URL list
Args:
url: PDF file URL or URL list, can be separated by spaces, commas, or newlines
enable_ocr: Whether to enable OCR (default: True)
Returns:
dict: Conversion result information
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
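| url | Yes | PDF file URL or URL list, can be separated by spaces, commas, or newlines | |
| enable_ocr | No | Whether to enable OCR | True |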
Implementation Reference
# src/pdf2md/server.py:315-393 (handler) The core handler function implementing the logic for
# the convert_pdf_url tool. It parses input URLs, submits batch jobs to MinerU API, polls for
# completion, downloads and extracts Markdown results.
async def convert_pdf_url(url: str, enable_ocr: bool = True) -> Dict[str, Any]:
    """
    Convert PDF URL to Markdown, supports single URL or URL list

    Args:
        url: PDF file URL or URL list, can be separated by spaces, commas, or newlines
        enable_ocr: Whether to enable OCR (default: True)

    Returns:
        dict: Conversion result information. On success it contains
            "success", "downloaded_files", "batch_id", "total_urls" and
            "processed_urls"; on failure "success" is False and "error"
            holds a message.
    """
    if not MINERU_API_KEY:
        return {"success": False, "error": "Missing API key, please set environment variable MINERU_API_KEY"}

    if isinstance(url, str):
        urls = parse_url_string(url)
    elif isinstance(url, (list, tuple, set)):
        # A real collection of URLs must be flattened; wrapping it as [url]
        # would submit one bogus entry whose "url" is itself a list.
        urls = list(url)
    else:
        urls = [url]

    if not urls:
        # Nothing to convert: fail fast instead of posting an empty batch.
        return {"success": False, "error": "No valid URLs provided"}

    async with httpx.AsyncClient(timeout=300.0) as client:
        try:
            files = [
                {
                    "url": url_item,
                    "is_ocr": enable_ocr,
                    "data_id": f"url_convert_{index}_{int(time.time())}",
                }
                for index, url_item in enumerate(urls, start=1)
            ]

            batch_data = {
                "enable_formula": True,
                "language": "auto",
                "layout_model": "doclayout_yolo",
                "enable_table": True,
                "files": files,
            }

            response = await client.post(
                MINERU_BATCH_API,
                headers=HEADERS,
                json=batch_data,
                timeout=300.0,
            )

            if response.status_code != 200:
                return {"success": False, "error": f"Request failed: {response.status_code}"}

            try:
                status_data = response.json()

                # The API signals success with code 0 or 200.
                if status_data.get("code") not in (0, 200):
                    error_msg = status_data.get("msg", "Unknown error")
                    return {"success": False, "error": f"API returned error: {error_msg}"}

                batch_id = status_data.get("data", {}).get("batch_id", "")
                if not batch_id:
                    return {"success": False, "error": "Failed to get batch ID"}

                # Poll until every file in the batch is finished (or polling gives up).
                task_status = await check_task_status(client, batch_id)
                if not task_status.get("success"):
                    return task_status

                downloaded_files = await download_batch_results(
                    client, task_status.get("extract_results", [])
                )

                return {
                    "success": True,
                    "downloaded_files": downloaded_files,
                    "batch_id": batch_id,
                    "total_urls": len(urls),
                    "processed_urls": len(downloaded_files),
                }

            except json.JSONDecodeError as e:
                return {"success": False, "error": f"Failed to parse JSON: {e}"}

        except Exception as e:
            # Tool boundary: report any failure to the caller as a result dict
            # rather than letting the exception escape.
            return {"success": False, "error": str(e)}
- src/pdf2md/server.py:314-314 (registration) The `@mcp.tool()` decorator registers the convert_pdf_url function as an MCP tool: `@mcp.tool()`
- src/pdf2md/server.py:316-325 (schema) Docstring providing input schema (parameters) and output description for the tool:
  """
  Convert PDF URL to Markdown, supports single URL or URL list

  Args:
      url: PDF file URL or URL list, can be separated by spaces, commas, or newlines
      enable_ocr: Whether to enable OCR (default: True)

  Returns:
      dict: Conversion result information
  """
# src/pdf2md/server.py:243-275 (helper) Helper function used by the handler to parse input URL
# strings into a list, handling quotes, spaces, commas, and newlines.
def parse_url_string(url_string):
    """
    Parse URL string separated by spaces, commas, or newlines

    Surrounding quotes are removed both from the whole string and from each
    individual URL, and empty entries (for example from "a, b" or a trailing
    comma) are dropped.

    Args:
        url_string: URL string

    Returns:
        list: List of URLs
    """
    quote_chars = ('"', "'")

    def strip_matching_quotes(text):
        # Remove one pair of identical quotes wrapping the text, if present.
        for quote in quote_chars:
            if text.startswith(quote) and text.endswith(quote):
                return text[1:-1]
        return text

    # Strip quotes wrapping the whole input, but only when they really wrap
    # it: in '"a" "b"' the first and last quote belong to different URLs.
    for quote in quote_chars:
        if url_string.startswith(quote) and url_string.endswith(quote):
            inner = url_string[1:-1]
            if quote not in inner:
                url_string = inner
            break

    # Treat commas like whitespace; split() then handles spaces, tabs and
    # newlines in one pass and never yields empty tokens.
    tokens = url_string.replace(",", " ").split()

    candidates = (strip_matching_quotes(token) for token in tokens)
    return [candidate for candidate in candidates if candidate]
# src/pdf2md/server.py:73-136 (helper) Helper function to poll the MinerU API for batch task
# completion status.
async def check_task_status(client, batch_id, max_retries=60, sleep_seconds=5):
    """
    Check batch task status

    Polls the batch results endpoint until every file in the batch is
    reported done, making at most ``max_retries`` requests. A failed poll
    (non-200 response, unparsable body, or any exception) uses exactly one
    attempt and is retried after ``sleep_seconds``.

    Args:
        client: HTTP client
        batch_id: Batch ID
        max_retries: Maximum number of retries
        sleep_seconds: Seconds between retries

    Returns:
        dict: Dictionary containing task status information, or error message if failed
    """
    for attempt in range(1, max_retries + 1):
        try:
            status_response = await client.get(
                f"{MINERU_BATCH_RESULTS_API}/{batch_id}",
                headers=HEADERS,
                timeout=60.0,
            )

            if status_response.status_code == 200:
                status_data = status_response.json()
                task_data = status_data.get("data", {})
                extract_results = task_data.get("extract_result", [])

                all_done, _ = print_task_status(extract_results)
                if all_done:
                    return {
                        "success": True,
                        "extract_results": extract_results,
                        "task_data": task_data,
                        "status_data": status_data,
                    }
        except Exception:
            # Best-effort polling: a transient network or parsing failure
            # just means "try again" on the next attempt.
            pass

        # Wait before the next poll, but not after the last attempt,
        # since nothing follows it.
        if attempt < max_retries:
            await asyncio.sleep(sleep_seconds)

    return {
        "success": False,
        "error": "Polling timeout, unable to get final results",
    }