pdf4vllm

content_orderer.py•2 KiB

"""
Content ordering utility

Sort text, tables, and images in original document order
"""

from typing import Dict, List, Optional


def order_content_blocks(
    text_regions: Optional[List[Dict]],
    tables: Optional[List[Dict]],
    images: Optional[List[Dict]],
) -> List[Dict]:
    """
    Sort text regions, tables, and images by top coordinate in reading order

    Every input entry is converted to a new block dict with the keys
    'type', 'content', 'image' and 'position' (in that order). 'position'
    carries the entry's 'top' value and 'image' is always None. The input
    dicts are not modified.

    Args:
        text_regions: Text regions excluding tables
            [{'top': float, 'text': str}, ...]
        tables: Table info [{'top': float, 'markdown': str}, ...]
        images: Image info [{'top': float, 'image_data': str}, ...]
            (image_data is a base64 string and becomes the block content)

        Any of the three arguments may be None or empty; both are treated
        as "no entries of that kind".

    Returns:
        Sorted content blocks (ascending 'position'). Entries with equal
        positions keep the order text, table, image, and their input order
        within each kind.

    Raises:
        KeyError: If an entry lacks 'top' or its content key.
    """
    # (block type, input entries, key holding the block content).
    # The order of this table decides how ties on 'top' are resolved,
    # because the sort below is stable.
    sources = (
        ('text', text_regions, 'text'),
        ('table', tables, 'markdown'),
        ('image', images, 'image_data'),
    )

    all_blocks = []
    for block_type, entries, content_key in sources:
        # `or ()` makes a None input behave like an empty list.
        for entry in entries or ():
            top = entry['top']
            all_blocks.append({
                'type': block_type,
                'content': entry[content_key],
                'image': None,
                'position': top,
            })

    # Sort by top coordinate (top to bottom)
    all_blocks.sort(key=lambda block: block['position'])
    return all_blocks


def merge_adjacent_text_blocks(blocks: List[Dict]) -> List[Dict]:
    """
    Return the content blocks unchanged (no merging is performed)

    Text excluding tables is already extracted by region, so no additional
    merging is needed; the function is a pass-through.

    Args:
        blocks: content blocks

    Returns:
        The same list object that was passed in, or a new empty list when
        `blocks` is empty or None.
    """
    if not blocks:
        return []

    return blocks

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PyJudge/pdf4vllm-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

content_orderer.py•2 KiB