# ocr.py (35.7 kB)
"""
OCR implementation using Apple's Vision framework directly.
"""
import io
import logging
import objc
from pathlib import Path
from typing import List, Tuple, Union, Optional, Any
from PIL import Image
import glob
import Vision
from Foundation import NSMakeRange, NSData
# Register Vision framework metadata for proper function calling
def _void_block(*argument_types):
    """Build PyObjC metadata describing a block that returns void.

    Args:
        argument_types: Objective-C type encodings of the block's arguments,
            in order. In the registration below the first entry is always
            b"^v" (presumably the block literal itself - TODO confirm against
            the PyObjC metadata documentation).

    Returns:
        A dict of the form {"callable": {"retval": ..., "arguments": ...}}
        with the arguments keyed by their positional index.
    """
    return {
        "callable": {
            "retval": {"type": b"v"},
            "arguments": {
                position: {"type": encoding}
                for position, encoding in enumerate(argument_types)
            },
        }
    }


# Tell PyObjC the signatures of the two block arguments of
# -[VKCImageAnalyzer processRequest:progressHandler:completionHandler:].
# Argument indices 3 and 4 are the ones carrying callable metadata
# (presumably progressHandler and completionHandler, since PyObjC counts the
# implicit self/_cmd arguments first - verify against the PyObjC docs).
objc.registerMetaDataForSelector(
    b"VKCImageAnalyzer",
    b"processRequest:progressHandler:completionHandler:",
    {
        "arguments": {
            3: _void_block(b"^v", b"d"),
            4: _void_block(b"^v", b"@", b"@"),
        }
    },
)
class VisionOCR:
    """OCR implementation using Vision framework directly.

    The class wraps ``VNRecognizeTextRequest`` and adds line grouping,
    text-flow based rotation correction and convenience entry points for
    PIL images, image files, PDF files and whole directories.
    """

    # Lower-case file suffixes accepted by process() and process_directory().
    _IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg')
    _SUPPORTED_SUFFIXES = ('.pdf',) + _IMAGE_SUFFIXES

    def __init__(self, language_preference: Optional[List[str]] = None):
        """Initialize the OCR engine.

        Args:
            language_preference: Optional list of language codes to use for OCR.
                Default is ["zh-Hans", "zh-Hant", "en-US"]. An empty list is
                treated like None and also selects the default.
        """
        self.language_preference = language_preference or ["zh-Hans", "zh-Hant", "en-US"]
        self.logger = logging.getLogger(__name__)

    def pil_to_data(self, pil_image: "Image.Image") -> bytes:
        """Convert PIL image to bytes data for Vision framework.

        Args:
            pil_image: PIL Image object

        Returns:
            PNG-encoded bytes of the image
        """
        buffer = io.BytesIO()
        pil_image.save(buffer, format="PNG")
        return buffer.getvalue()

    def recognize_text(self, img: Union["Image.Image", str, Path], recognition_level: str = "accurate",
                       return_raw_observations: bool = False) -> List[Union[Tuple[str, float, List[float]], Any]]:
        """Recognize text in an image using the Vision framework.

        Args:
            img: PIL Image object, or file path (str or Path).
            recognition_level: "accurate" selects the accurate recognizer;
                any other value selects the fast one.
            return_raw_observations: If True, returns raw
                VNRecognizedTextObservation objects. Otherwise, returns
                tuples of (text, confidence, bbox).

        Returns:
            List of recognition results. bbox is [x, y, width, height] in
            Vision's normalized coordinates (origin at bottom-left). An empty
            list is returned when the image cannot be loaded or the request
            fails; the failure is logged.

        Raises:
            TypeError: If img is neither a PIL Image nor a path.
        """
        # Build the request handler from raw image bytes for either input kind.
        if isinstance(img, (str, Path)):
            try:
                img_path = str(Path(img).resolve())
                # Use NSData to read the file directly
                img_data = NSData.dataWithContentsOfFile_(img_path)
                if img_data is None:
                    self.logger.error(f"Failed to read image file: {img_path}")
                    return []
                handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(img_data, None)
            except Exception as e:
                self.logger.error(f"Error loading image from path: {e}")
                return []
        elif isinstance(img, Image.Image):
            try:
                if img.mode == 'RGBA':
                    img = img.convert('RGB')  # Drop the alpha channel before encoding
                png_bytes = self.pil_to_data(img)
                img_data = NSData.dataWithBytes_length_(png_bytes, len(png_bytes))
                handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(img_data, None)
            except Exception as e:
                self.logger.error(f"Error converting PIL image: {e}")
                return []
        else:
            raise TypeError("Input must be a PIL Image, file path (str or Path)")

        request = Vision.VNRecognizeTextRequest.alloc().init()
        if recognition_level == "accurate":
            request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        else:
            request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelFast)
        request.setUsesLanguageCorrection_(False)  # Often faster without correction if only layout is needed
        # Set preferred languages if available
        if hasattr(request, 'setRecognitionLanguages_'):
            request.setRecognitionLanguages_(self.language_preference)

        success, error = handler.performRequests_error_([request], None)
        if not success or error:
            if error:
                error_msg = f"Error performing text recognition request: {error.localizedDescription()}"
            else:
                error_msg = "Unknown error during text recognition."
            self.logger.error(error_msg)
            return []

        vision_results = request.results()
        if vision_results is None:
            return []

        results = []
        for observation in vision_results:
            if not isinstance(observation, Vision.VNRecognizedTextObservation):
                continue
            if return_raw_observations:
                results.append(observation)
                continue
            candidates = observation.topCandidates_(1)
            # An observation without any candidate carries no text; skip it
            # instead of failing on the [0] lookup.
            if candidates is None or len(candidates) == 0:
                continue
            top_candidate = candidates[0]
            # Vision coordinates are normalized [0, 1] with origin at bottom-left.
            # bbox format: [x, y, width, height]
            bbox_rect = observation.boundingBox()
            bbox = [bbox_rect.origin.x, bbox_rect.origin.y, bbox_rect.size.width, bbox_rect.size.height]
            results.append((top_candidate.string(), top_candidate.confidence(), bbox))
        return results

    def organize_text_by_lines(self, ocr_results: List[Tuple[str, float, List[float]]],
                               detail: bool = True) -> List[Tuple[Any, ...]]:
        """Organize OCR results by lines based on vertical position.

        Elements are grouped into a line when their vertical centre lies
        within a tolerance of the centre of the element that started the
        line. Lines are emitted in order of decreasing y, which is top to
        bottom for bottom-left-origin bounding boxes as produced by
        recognize_text; elements inside a line are ordered left to right.

        Args:
            ocr_results: List of OCR results, each is (text, confidence, [x, y, w, h])
            detail: Whether to include bounding box details in output

        Returns:
            A list with one tuple per line: (text, avg_confidence) or, when
            detail is True, (text, avg_confidence, (x_min, y_min, x_max, y_max)).
        """
        if not ocr_results:
            return []

        def center_y(item):
            return item[2][1] + item[2][3] / 2

        # Walk the elements in order of increasing vertical centre.
        y_sorted_results = sorted(ocr_results, key=center_y)
        first = y_sorted_results[0]
        # Base tolerance: 70% of the height of the first (lowest-centre) element.
        base_tolerance = first[2][3] * 0.7
        line_y_center = center_y(first)

        lines = []
        current_line = [first]
        for result in y_sorted_results[1:]:
            result_y_center = center_y(result)
            # Taller elements widen the tolerance, capped at 0.05 so large
            # text cannot swallow neighbouring lines.
            tolerance = min(max(base_tolerance, result[2][3] * 0.7), 0.05)
            if abs(result_y_center - line_y_center) < tolerance:
                current_line.append(result)
            else:
                lines.append(current_line)
                current_line = [result]
                line_y_center = result_y_center
        lines.append(current_line)

        # Left-to-right inside each line, then lines by decreasing average y.
        for line in lines:
            line.sort(key=lambda r: r[2][0])
        lines.sort(key=lambda line: sum(r[2][1] for r in line) / len(line), reverse=True)

        output = []
        for line in lines:
            line_text = " ".join(r[0] for r in line)
            avg_confidence = sum(r[1] for r in line) / len(line)
            if detail:
                # Bounding box of the whole line in (x1, y1, x2, y2) form.
                x_min = min(r[2][0] for r in line)
                y_min = min(r[2][1] for r in line)
                x_max = max(r[2][0] + r[2][2] for r in line)
                y_max = max(r[2][1] + r[2][3] for r in line)
                output.append((line_text, avg_confidence, (x_min, y_min, x_max, y_max)))
            else:
                output.append((line_text, avg_confidence))
        return output

    def _format_lines(self, ocr_results, detail: bool) -> Any:
        """Group OCR results into lines and return them as a list or plain text.

        Returns the organize_text_by_lines() list when detail is True,
        otherwise the line texts joined with newlines.
        """
        text_by_lines = self.organize_text_by_lines(ocr_results, detail=detail)
        if detail:
            return text_by_lines
        return "\n".join(line[0] for line in text_by_lines)

    @staticmethod
    def _classify_text_flow(delta_x: float, delta_y: float, threshold: float = 0.01) -> Optional[str]:
        """Map a first-to-last character displacement to an orientation label.

        Coordinates are Vision's (origin bottom-left, y pointing up).

        Returns:
            'upright' (left-to-right), 'rotated_180' (right-to-left),
            'rotated_90' (bottom-to-top), 'rotated_270' (top-to-bottom), or
            None when neither axis dominates by more than threshold.
        """
        if abs(delta_x) > abs(delta_y) + threshold:  # Primarily horizontal movement
            if delta_x > threshold:
                return 'upright'
            if delta_x < -threshold:
                return 'rotated_180'
        elif abs(delta_y) > abs(delta_x) + threshold:  # Primarily vertical movement
            if delta_y > threshold:
                return 'rotated_90'
            if delta_y < -threshold:
                return 'rotated_270'
        return None

    def _char_midpoint(self, candidate, index: int, label: str) -> Optional[Tuple[float, float]]:
        """Return the centre of the bounding box of the character at index.

        Logs a warning and returns None when Vision cannot provide the box.
        label ("first", "middle" or "last") is only used in that message.
        """
        # NOTE(review): index is derived from Python's len(text); presumably
        # NSRange counts UTF-16 units, so the two can differ for characters
        # outside the BMP - confirm if such text matters.
        box_observation, error = candidate.boundingBoxForRange_error_(NSMakeRange(index, 1), None)
        if error or box_observation is None:
            self.logger.warning(f"Could not get bounding box for {label} char. Error: {error}")
            return None
        box = box_observation.boundingBox()
        return (box.origin.x + box.size.width / 2.0, box.origin.y + box.size.height / 2.0)

    def _analyze_text_flow_orientation(self, img: "Image.Image") -> str:
        """Estimate image orientation from the direction text runs in.

        Strategy:
            1. Identify the longest text blocks (most characters).
            2. If the longest text is < 3 chars, return 'indeterminate'.
            3. For up to two longest blocks, locate the first, middle and
               last character; the displacement from first to last gives one
               orientation vote per block.
            4. Return the unanimous vote, or the most common one.

        Args:
            img: PIL Image object.

        Returns:
            Orientation string: 'upright', 'rotated_90', 'rotated_180',
            'rotated_270', or 'indeterminate'. Any unexpected error is logged
            and reported as 'indeterminate'.
        """
        self.logger.info("Analyzing text flow for orientation using longest text blocks...")
        try:
            observations = self.recognize_text(img, recognition_level="accurate", return_raw_observations=True)
            if not observations:
                self.logger.info("No text observations found for text flow analysis.")
                return 'indeterminate'

            text_blocks = []
            for obs in observations:
                candidates = obs.topCandidates_(1)
                # Skip empty observations rather than abandoning the whole analysis.
                if candidates is None or len(candidates) == 0:
                    continue
                top_candidate = candidates[0]
                text_blocks.append((top_candidate.string(), top_candidate))
            if not text_blocks:
                self.logger.info("No text observations found for text flow analysis.")
                return 'indeterminate'

            # Sort by text length, longest first
            text_blocks.sort(key=lambda block: len(block[0]), reverse=True)
            longest_len = len(text_blocks[0][0])
            if longest_len < 3:
                self.logger.info(f"Longest text block has only {longest_len} characters. Insufficient for orientation detection.")
                return 'indeterminate'

            selected_blocks = text_blocks[:2]
            self.logger.info(f"Using {len(selected_blocks)} longest text blocks for orientation analysis.")

            orientation_votes = []
            for text, candidate in selected_blocks:
                text_len = len(text)
                self.logger.debug(f"Analyzing text block: '{text[:10]}...' ({text_len} chars)")
                if text_len < 2:
                    # First and last character coincide: no direction to measure.
                    continue
                try:
                    # The middle box is not used for the direction itself; as
                    # before, a block only votes when all three lookups succeed.
                    midpoints = []
                    for label, index in (("first", 0), ("middle", text_len // 2), ("last", text_len - 1)):
                        midpoint = self._char_midpoint(candidate, index, label)
                        if midpoint is None:
                            break
                        midpoints.append(midpoint)
                    if len(midpoints) < 3:
                        continue
                    first_mid, _, last_mid = midpoints
                    deltaX = last_mid[0] - first_mid[0]
                    deltaY = last_mid[1] - first_mid[1]
                    self.logger.debug(f"Text flow: DeltaX: {deltaX:.4f}, DeltaY: {deltaY:.4f}")
                    vote = self._classify_text_flow(deltaX, deltaY)
                    if vote is not None:
                        orientation_votes.append(vote)
                except Exception as char_error:
                    self.logger.warning(f"Error processing character boxes: {char_error}")
                    continue

            if not orientation_votes:
                self.logger.info("No valid orientation votes collected.")
                return 'indeterminate'
            if all(vote == orientation_votes[0] for vote in orientation_votes):
                self.logger.info(f"All text blocks indicate {orientation_votes[0]} orientation.")
                return orientation_votes[0]
            # Votes disagree: use the most common one (first seen wins a tie).
            from collections import Counter
            most_common = Counter(orientation_votes).most_common(1)[0][0]
            self.logger.info(f"Text blocks have mixed orientations. Most common: {most_common}")
            return most_common
        except Exception as e:
            self.logger.error(f"Error during text flow orientation analysis: {e}")
            return 'indeterminate'

    def correct_image_rotation(self, img: "Image.Image") -> "Image.Image":
        """Detect and correct image rotation.

        Text flow analysis is tried first. When it is indeterminate, the
        image is OCR'd at 0/90/180/270 degrees with the fast recognizer and a
        rotation is kept only if it yields more than two additional text
        elements compared with the best count so far.

        Args:
            img: PIL Image object

        Returns:
            PIL Image with corrected orientation, or the original image if no
            rotation is needed, none is detected, or an error occurs.
        """
        # Orientation label -> (angle for Image.rotate, which turns
        # counter-clockwise, log message).
        corrections = {
            'rotated_90': (270, "Text flow indicates 90-degree clockwise rotation. Correcting..."),
            'rotated_180': (180, "Text flow indicates 180-degree rotation. Correcting..."),
            'rotated_270': (90, "Text flow indicates 270-degree clockwise rotation. Correcting..."),
        }
        try:
            self.logger.info("Starting image rotation correction process...")
            text_flow_orientation = self._analyze_text_flow_orientation(img)
            self.logger.info(f"Text flow analysis result: {text_flow_orientation}")
            if text_flow_orientation == 'upright':
                self.logger.info("Text flow indicates image is upright. No rotation needed.")
                return img
            if text_flow_orientation in corrections:
                angle, message = corrections[text_flow_orientation]
                self.logger.info(message)
                return img.rotate(angle, expand=True)

            # Orientation could not be read from the text flow: trial and error.
            self.logger.info("Text flow analysis indeterminate. Using rotation trial-and-error...")
            initial_text_count = len(self.recognize_text(img, recognition_level="fast"))
            self.logger.info(f"Initial text count: {initial_text_count}")

            best_img = img
            max_text_count = initial_text_count
            for angle in [90, 180, 270]:
                self.logger.debug(f"Trying rotation: {angle} degrees")
                rotated_img = img.rotate(angle, expand=True)
                try:
                    text_count = len(self.recognize_text(rotated_img, recognition_level="fast"))
                    self.logger.debug(f" Found {text_count} text elements at {angle} degrees.")
                    if text_count > max_text_count + 2:  # Require a significant improvement
                        max_text_count = text_count
                        best_img = rotated_img
                        self.logger.info(f"Found better orientation at {angle} degrees with {text_count} elements.")
                except Exception as rot_err:
                    self.logger.warning(f"Error during OCR on rotated image ({angle} deg): {rot_err}")

            if best_img is not img:
                self.logger.info("Applying rotation based on trial-and-error result.")
                return best_img
            self.logger.info("Trial-and-error did not find a better orientation.")
            self.logger.info("No rotation correction applied.")
            return img
        except Exception as e:
            self.logger.error(f"Error during image rotation correction: {e}")
            return img  # Return original image on error

    def _analyze_text_direction(self, ocr_results):
        """Classify the dominant layout direction of the OCR elements.

        Every pair of elements is compared by the distance between their
        centres: a pair counts as horizontal when the x distance is larger
        than the y distance, otherwise as vertical.

        Args:
            ocr_results: OCR results as (text, confidence, [x, y, w, h]) tuples.

        Returns:
            "horizontal" when horizontal pairs are the strict majority,
            "vertical" otherwise, or None for fewer than two elements.
        """
        if not ocr_results or len(ocr_results) < 2:
            return None

        from itertools import combinations

        horizontal_relations = 0
        vertical_relations = 0
        for elem1, elem2 in combinations(ocr_results, 2):
            dx = abs((elem1[2][0] + elem1[2][2] / 2) - (elem2[2][0] + elem2[2][2] / 2))
            dy = abs((elem1[2][1] + elem1[2][3] / 2) - (elem2[2][1] + elem2[2][3] / 2))
            if dx > dy:
                horizontal_relations += 1
            else:
                vertical_relations += 1
        return "horizontal" if horizontal_relations > vertical_relations else "vertical"

    def process(self, input_data: Union[str, Path, "Image.Image"], detail: bool = False) -> Tuple[str, Any]:
        """Unified entry point for a PIL image, an image file or a PDF file.

        Args:
            input_data: File path (str or Path) or PIL Image object.
            detail: Whether to include position information in the result.

        Returns:
            Tuple of (source, content). source is "image" for a PIL Image,
            otherwise the file path. content is structured line data when
            detail is True and plain text otherwise. Failures (missing file,
            unsupported type, OCR errors) are logged and reported as
            (source, "Error: ...") instead of raising.
        """
        try:
            if isinstance(input_data, Image.Image):
                self.logger.info("处理PIL图像对象")
                img = self.correct_image_rotation(input_data)
                ocr_results = self.recognize_text(img)
                return "image", self._format_lines(ocr_results, detail)
            elif isinstance(input_data, (str, Path)):
                path = Path(input_data)
                abs_path = str(path.absolute())
                self.logger.info(f"处理文件: {abs_path}")
                if not path.exists():
                    raise ValueError(f"文件不存在: {abs_path}")
                # Dispatch on the lower-cased suffix.
                suffix = path.suffix.lower()
                if suffix == '.pdf':
                    return self.process_pdf(path, detail=detail)
                elif suffix in self._IMAGE_SUFFIXES:
                    return self.process_image(path, detail=detail)
                else:
                    raise ValueError(f"不支持的文件类型: {suffix}。支持的类型: PDF, PNG, JPG, JPEG")
            else:
                raise ValueError("输入必须是文件路径(str或Path)或PIL Image对象")
        except Exception as e:
            self.logger.error(f"处理过程中出错: {e}")
            if isinstance(input_data, (str, Path)):
                return str(input_data), f"Error: {str(e)}"
            else:
                return "image", f"Error: {str(e)}"

    def process_image(self, image_path: Union[str, Path], detail: bool = False) -> Tuple[str, Any]:
        """Process a single image file with OCR.

        Args:
            image_path: Path to the image file
            detail: Whether to include position information in OCR results

        Returns:
            Tuple of (absolute_path, text_content).
            If detail=True, text_content is a list of tuples with text,
            confidence and position; if detail=False it is a string with one
            line of text per row. On error the tuple is
            (str(image_path), "Error: ...").
        """
        try:
            abs_path = str(Path(image_path).absolute())
            # Keep the file open only while pixel data may still be read;
            # the context manager closes the handle afterwards.
            with Image.open(abs_path) as opened:
                img = self.correct_image_rotation(opened)
                self.logger.info(f"执行OCR识别: {abs_path}")
                ocr_results = self.recognize_text(img)

            if not ocr_results:
                self.logger.warning(f"未检测到文本: {abs_path}")
                return abs_path, ([] if detail else "")

            # The direction is currently only reported; vertical text is
            # still organized with the standard line grouping.
            text_direction = self._analyze_text_direction(ocr_results)
            self.logger.info(f"文本主要方向: {text_direction or '未确定'}")
            if text_direction == "vertical":
                self.logger.info("检测到垂直排列文本,使用垂直文本排序")

            return abs_path, self._format_lines(ocr_results, detail)
        except Exception as e:
            self.logger.error(f"处理图像时出错 {image_path}: {e}")
            return str(image_path), f"Error: {str(e)}"

    def process_pdf(self, pdf_path: Union[str, Path],
                    detail: bool = False) -> Tuple[str, Any]:
        """Process a PDF file by converting it to images and then performing OCR.

        Args:
            pdf_path: Path to the PDF file
            detail: Whether to include position information in OCR results

        Returns:
            Tuple of (absolute_path, text_content). With detail=True,
            text_content is a list of ("Page N", lines) tuples; otherwise it
            is one string with a "Page N" header per page and pages separated
            by blank lines. On error the tuple is (str(pdf_path), "Error: ...").
        """
        try:
            from pdf2image import convert_from_path

            abs_path = str(Path(pdf_path).absolute())
            self.logger.info(f"处理PDF文件: {abs_path}")
            self.logger.info("将PDF转换为图像...")
            images = convert_from_path(abs_path)
            self.logger.info(f"PDF包含 {len(images)} 页")

            all_page_results = []
            for i, image in enumerate(images, 1):
                self.logger.info(f"处理第 {i} 页...")
                image = self.correct_image_rotation(image)
                ocr_results = self.recognize_text(image)
                if not ocr_results:
                    self.logger.warning(f"第 {i} 页未检测到文本")
                    if detail:
                        all_page_results.append((f"Page {i}", []))
                    else:
                        all_page_results.append(f"Page {i}\n[No text detected]")
                    continue

                # The direction is currently only reported; vertical text is
                # still organized with the standard line grouping.
                text_direction = self._analyze_text_direction(ocr_results)
                self.logger.info(f"第 {i} 页文本主要方向: {text_direction or '未确定'}")
                if text_direction == "vertical":
                    self.logger.info(f"第 {i} 页检测到垂直排列文本")

                page_content = self._format_lines(ocr_results, detail)
                if detail:
                    all_page_results.append((f"Page {i}", page_content))
                else:
                    all_page_results.append(f"Page {i}\n{page_content}")

            if detail:
                return abs_path, all_page_results
            # Join all pages with double newlines
            return abs_path, "\n\n".join(all_page_results)
        except Exception as e:
            self.logger.error(f"处理PDF时出错 {pdf_path}: {e}")
            return str(pdf_path), f"Error: {str(e)}"

    def process_directory(self, directory_path: Union[str, Path],
                          detail: bool = False) -> List[Tuple[str, Any]]:
        """Process all supported files in a directory with OCR.

        Only regular files directly inside the directory are considered.
        Suffixes are matched case-insensitively (".pdf", ".PDF" and ".Pdf"
        all qualify), mirroring the check in process(). Files are processed
        in sorted path order.

        Args:
            directory_path: Path to the directory
            detail: Whether to include position information in OCR results

        Returns:
            List of tuples, each containing file path and OCR results

        Raises:
            ValueError: If directory_path does not exist or is not a directory.
        """
        path = Path(directory_path)
        if not path.exists() or not path.is_dir():
            raise ValueError(f"Directory not found: {directory_path}")

        files_to_process = sorted(
            entry for entry in path.iterdir()
            if entry.is_file() and entry.suffix.lower() in self._SUPPORTED_SUFFIXES
        )
        return [self.process(file_path, detail=detail) for file_path in files_to_process]
if __name__ == "__main__":
    # Manual smoke test: OCR every PDF/image in a directory and print the
    # recognized text together with the time taken per file.
    #
    # Usage: python ocr.py [DIRECTORY]
    # Without an argument the original default directory below is used.
    import sys
    import time

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    default_directory = "/Users/xinyu3/Documents/9.0-self/expense/Mar-2025/hotel"
    test_directory = sys.argv[1] if len(sys.argv) > 1 else default_directory

    # Collect PDFs first, then images, matching lower- and upper-case extensions.
    candidates = []
    for ext in ["pdf", "PDF", "jpg", "jpeg", "png", "JPG", "JPEG", "PNG"]:
        candidates.extend(glob.glob(f"{test_directory}/*.{ext}"))
    # dict.fromkeys removes duplicates while keeping the discovery order.
    test_files = list(dict.fromkeys(candidates))
    print(f"找到 {len(test_files)} 个待测试文件")

    ocr = VisionOCR()

    for test_file in test_files:
        if not Path(test_file).exists():
            print(f"测试文件不存在: {test_file}")
            continue
        print(f"\n============ 处理文件: {test_file} ============")
        start_time = time.time()
        # Use the unified process() entry point for every file type.
        _, result = ocr.process(test_file, detail=False)
        elapsed_time = time.time() - start_time
        print(f"处理时间: {elapsed_time:.2f} 秒")

        if isinstance(result, list):
            # Only reached when process() is called with detail=True.
            if result and isinstance(result[0], tuple) and len(result[0]) == 2:
                # PDF result: a list of (page label, lines) pairs.
                print(f"PDF文件,包含 {len(result)} 页")
                for page_num, page_lines in result:
                    print(f"\n{page_num},包含 {len(page_lines)} 行文本")
                    for text, _confidence, _bbox in page_lines:
                        print(f"{text}")
            else:
                # Image result: a flat list of (text, confidence, bbox) lines.
                print(f"图像文件,包含 {len(result)} 行文本")
                for text, _confidence, _bbox in result:
                    print(f"{text}")
        else:
            # Plain-text result.
            print(f"\n结果:\n{result}")