get_pdf_text_bulk
Extract all plain text from multi-page PDFs to understand content. Automatically skips reference pages to reduce noise.
Instructions
批量提取多页 PDF 纯文本(无坐标),适合大 PDF 内容理解。
与 get_pdf_layout_text 的区别:不返回坐标,context 占用减少 ~80%。
推荐工作流:
1. 先用此工具理解全文 → 确定目标页和目标句子
2. 再用 get_pdf_layout_text 获取目标页的精确坐标
3. 最后用 create_pdf_annotation 写入标注
Args:
- item_id: Zotero PDF 附件的 itemID(数字),或 PDF 文件的绝对路径
- pages: 要提取的页码列表(0-indexed),不传则提取全文
- skip_refs: 是否自动跳过参考文献页(默认 True)
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
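| item_id | Yes | Zotero PDF attachment itemID (numeric), or the absolute path of a PDF file | |
| pages | No | 0-indexed page numbers to extract; omit to extract the whole document | None |
| skip_refs | No | Whether to skip reference pages automatically | True |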
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
| result | Yes | JSON string containing the extraction result | |
Implementation Reference
- annota/server.py:202-226 (handler) MCP tool handler for 'get_pdf_text_bulk' — registered as a FastMCP tool; it resolves the PDF path, calls pdf_tools.extract_bulk_text() to extract plain text from multiple pages, and returns the result as JSON.
# ── Tool 6: get_pdf_text_bulk ─────────────────────────────────
# NOTE: the docstring below is intentionally left untranslated. The tool's
# published "Instructions" text matches it, so it is treated as runtime text
# (the tool description) rather than as an ordinary comment.
@mcp.tool()
def get_pdf_text_bulk(
    item_id: str,
    pages: list[int] | None = None,
    skip_refs: bool = True,
) -> str:
    """批量提取多页 PDF 纯文本(无坐标),适合大 PDF 内容理解。

    与 get_pdf_layout_text 的区别:不返回坐标,context 占用减少 ~80%。

    推荐工作流:
    1. 先用此工具理解全文 → 确定目标页和目标句子
    2. 再用 get_pdf_layout_text 获取目标页的精确坐标
    3. 最后用 create_pdf_annotation 写入标注

    Args:
        item_id: Zotero PDF 附件的 itemID(数字),或 PDF 文件的绝对路径
        pages: 要提取的页码列表(0-indexed),不传则提取全文
        skip_refs: 是否自动跳过参考文献页(默认 True)
    """
    # Resolve the identifier to an on-disk PDF and extract its text. Errors
    # raised while resolving the path are not caught here.
    extraction = pdf_tools.extract_bulk_text(
        _resolve_pdf_path(item_id),
        pages=pages,
        skip_refs=skip_refs,
    )
    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    return json.dumps(extraction, ensure_ascii=False, indent=2)


# - annota/pdf_tools.py:115-178 (handler) Core implementation of
#   extract_bulk_text() — opens PDF with PyMuPDF, detects references start
#   page, extracts plain text from specified pages (or all pages before
#   references), and returns structured result without coordinates.
def extract_bulk_text(
    pdf_path: str | Path,
    pages: list[int] | None = None,
    skip_refs: bool = True,
) -> dict:
    """Extract coordinate-free plain text from many PDF pages in one call.

    Meant for the first stage of a two-stage workflow: understand the content
    here, then fetch exact coordinates only for the pages of interest. Unlike
    extract_page_text, no coordinates are returned (the original notes put
    the size reduction at roughly 80%).

    Args:
        pdf_path: Path of the PDF file.
        pages: 0-indexed page numbers to extract. Entries outside
            ``[0, total_pages)`` are dropped silently. ``None`` selects the
            whole document, stopping before the detected references page
            when ``skip_refs`` is on.
        skip_refs: Whether to detect the references section and leave it out
            when ``pages`` is ``None``. An explicit ``pages`` list is
            extracted as given.

    Returns:
        A dict of the form::

            {
                "total_pages": int,
                "extracted_pages": int,   # pages that yielded non-empty text
                "refs_start_page": int | None,
                "pages": [
                    {"page": int, "text": str, "char_count": int},
                    ...
                ],
            }
    """
    doc = fitz.open(str(pdf_path))
    try:
        page_count = len(doc)

        # With skipping disabled, the "references start" sits past the end.
        if skip_refs:
            refs_start = _detect_refs_page(doc)
        else:
            refs_start = page_count

        if pages is None:
            wanted = list(range(min(refs_start, page_count)))
        else:
            wanted = [idx for idx in pages if 0 <= idx < page_count]

        # Pages whose stripped text is empty are omitted from the result.
        extracted = [
            {"page": idx, "text": body, "char_count": len(body)}
            for idx in wanted
            if (body := doc[idx].get_text("text").strip())
        ]

        logger.info(
            "批量提取: %d/%d 页, 跳过参考文献=%s (refs_start=%s)",
            len(extracted),
            page_count,
            skip_refs,
            refs_start if skip_refs else "N/A",
        )

        # Report a references page only when one was actually detected.
        refs_found = skip_refs and refs_start < page_count
        return {
            "total_pages": page_count,
            "extracted_pages": len(extracted),
            "refs_start_page": refs_start if refs_found else None,
            "pages": extracted,
        }
    finally:
        doc.close()


# - annota/pdf_tools.py:181-203 (helper) Helper function _detect_refs_page()
#   — heuristic scan from last 10 pages to find References/Bibliography
#   headers, used by extract_bulk_text to skip reference pages.
# Lower-cased, stripped lines that count as a references-section heading.
_REFS_HEADINGS = frozenset({"references", "bibliography", "参考文献", "reference"})


def _detect_refs_page(
    doc,
    *,
    max_scan_pages: int = 10,
    max_header_lines: int = 15,
) -> int:
    """Heuristically locate the page where the references section starts.

    Walks backwards from the last page over at most ``max_scan_pages`` pages
    and returns the first page (in that backward order) whose leading lines
    contain a line that is exactly a references heading, ignoring case and
    surrounding whitespace: "references", "reference", "bibliography" or
    "参考文献".

    Args:
        doc: Opened document. It must support ``len(doc)`` and ``doc[i]``,
            and each page must provide ``get_text("text")``.
        max_scan_pages: How many trailing pages to inspect (non-negative).
        max_header_lines: How many lines from the top of each page to
            inspect (non-negative).

    Returns:
        The 0-indexed page number of the references heading, or the total
        page count when no heading is found.
    """
    total = len(doc)
    first_scanned = max(0, total - max_scan_pages)
    for page_no in range(total - 1, first_scanned - 1, -1):
        # Only the top of the page is checked.
        header_lines = doc[page_no].get_text("text").split("\n")[:max_header_lines]
        if any(line.strip().lower() in _REFS_HEADINGS for line in header_lines):
            return page_no
    return total  # nothing detected


# - annota/server.py:327-347 (helper) Helper _resolve_pdf_path() — resolves
#   item_id (numeric or file path) to a PDF Path object, used by
#   get_pdf_text_bulk and other tools.
def _resolve_pdf_path(item_id: str) -> Path:
    """Resolve ``item_id`` to the path of an existing PDF file.

    An ``item_id`` containing ``/``, ``\\`` or ``:`` is treated as a file
    path. Anything else must be an integer and is looked up as a Zotero
    attachment itemID through ``zotero_db.get_pdf_path``.

    Args:
        item_id: A file path, or a Zotero attachment itemID written as
            digits.

    Returns:
        Path of the PDF file; it is checked to exist before returning.

    Raises:
        FileNotFoundError: The given path does not exist, the Zotero lookup
            returned ``None``, or the looked-up file is missing on disk.
        ValueError: ``item_id`` is neither path-like nor an integer.
    """
    looks_like_path = "/" in item_id or "\\" in item_id or ":" in item_id
    if looks_like_path:
        p = Path(item_id)
        if not p.exists():
            raise FileNotFoundError(f"PDF 文件不存在: {item_id}")
        return p

    try:
        attachment_id = int(item_id)
    except ValueError as err:
        # Replace int()'s "invalid literal" message with one that names the
        # parameter and the accepted forms (e.g. for a bare "paper.pdf").
        raise ValueError(
            f"无效的 item_id: {item_id!r}(应为 Zotero 数字 itemID 或 PDF 文件的绝对路径)"
        ) from err

    pdf_path = zotero_db.get_pdf_path(attachment_id)
    if pdf_path is None:
        raise FileNotFoundError(
            f"在 Zotero 数据库中未找到 itemID={attachment_id} 对应的 PDF 文件"
        )
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF 文件在磁盘上不存在: {pdf_path}")
    return pdf_path