pdf-tools-mcp

MIT License

Overview InspectNew Endpoints Schema Related Servers Reviews Score

text.py•2.03 kB

import json
import logging
from pathlib import Path

import fitz

from config import DATA_DIR, uuid4_pdf_re

logger = logging.getLogger(__name__)


def _load_page_text(file_name: str, page_number: int, option: str):
    """Open a PDF from DATA_DIR and return ``page.get_text(option)`` for one page.

    Args:
        file_name: Name of the PDF file inside ``DATA_DIR``.
        page_number: 1-based page number.
        option: Output format passed straight to ``Page.get_text``.

    Returns:
        Whatever ``Page.get_text`` returns for the requested format.

    Raises:
        IndexError: If ``page_number`` is outside ``1..len(document)``.
    """
    file_path = Path(DATA_DIR, file_name)
    # The context manager closes the document even if extraction fails, so
    # repeated calls do not accumulate open file handles.
    with fitz.open(file_path) as document:
        page_total = len(document)
        # Reject non-positive page numbers explicitly: ``page_number - 1``
        # would otherwise be a zero/negative index that silently resolves to
        # a different page than the caller asked for.
        if not 1 <= page_number <= page_total:
            raise IndexError(
                f"page {page_number} not in document ({page_total} pages)"
            )
        return document[page_number - 1].get_text(option)


async def get_text_blocks(file_name: str, page_number: int = 1):
    """
    Extract text content from a specific page of a PDF file, in blocks.

    Each block is reported with its bounding box (``x0``, ``y0``, ``x1``,
    ``y1``), its text, its block number and its block type.

    Args:
        file_name: Name of the PDF file; must match ``uuid4_pdf_re``.
        page_number: 1-based page number (defaults to the first page).

    Returns:
        ``False`` if ``file_name`` does not match ``uuid4_pdf_re``; otherwise
        a dict with keys ``success``, ``file_name``, ``page_number`` and
        ``text_blocks`` (a list of per-block dicts).

    Raises:
        IndexError: If ``page_number`` is outside the document's page range.
    """
    if not uuid4_pdf_re.match(file_name):
        logger.error(
            "Input file must be in the 'data' folder and have a .pdf extension."
        )
        return False

    raw_blocks = _load_page_text(file_name, page_number, "blocks")
    # Each raw block is a positional sequence; name the fields for callers.
    result = [
        {
            "x0": block[0],
            "y0": block[1],
            "x1": block[2],
            "y1": block[3],
            "text": block[4],
            "block_no": block[5],
            "block_type": block[6],
        }
        for block in raw_blocks
    ]

    response = {
        "success": True,
        "file_name": file_name,
        "page_number": page_number,
        "text_blocks": result,
    }
    logger.info("Returning response of length: %s", len(str(result)))
    return response


async def get_text_json(file_name: str, page_number: int = 1):
    """
    Extract text content from a specific page of a PDF file, as json.

    This contains the most information.

    Args:
        file_name: Name of the PDF file; must match ``uuid4_pdf_re``.
        page_number: 1-based page number (defaults to the first page).

    Returns:
        ``False`` if ``file_name`` does not match ``uuid4_pdf_re``; otherwise
        a dict with keys ``success``, ``file_name``, ``page_number`` and
        ``text_json`` (the page's JSON text output, already parsed).

    Raises:
        IndexError: If ``page_number`` is outside the document's page range.
    """
    if not uuid4_pdf_re.match(file_name):
        logger.error(
            "Input file must be in the 'data' folder and have a .pdf extension."
        )
        return False

    # get_text("json") yields a JSON string; parse it so the response nests
    # structured data instead of an encoded string.
    result = json.loads(_load_page_text(file_name, page_number, "json"))

    response = {
        "success": True,
        "file_name": file_name,
        "page_number": page_number,
        "text_json": result,
    }
    logger.info("Returning response of length: %s", len(str(result)))
    return response

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/danielkennedy1/pdf-tools-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.