# ocr.py • 3.19 kB
# standard libraries
from datetime import datetime
import os
import math
from pathlib import Path
from typing import List
from PIL import Image
from sys import argv
# third-party libraries
import cv2
import pytesseract
from pdf2image import convert_from_path
import numpy as np
from mcp.server.fastmcp import FastMCP
def enhance_image(image: np.ndarray) -> np.ndarray:
    """
    Prepare an image for OCR: grayscale conversion, binary thresholding, denoising.

    Args:
        image (numpy.ndarray): Colour image. It is converted with
            cv2.COLOR_BGR2GRAY, so OpenCV's BGR channel order is assumed.

    Returns:
        numpy.ndarray: The thresholded and denoised single-channel image.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # cv2.threshold returns (retval, image); only the image is used.
    # THRESH_BINARY: pixels above 150 become 255, all others become 0.
    _, binary = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY)

    # Positional arguments after dst=None are, in order:
    # h=30, templateWindowSize=7, searchWindowSize=21.
    cleaned = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)

    print(f"enhancing image size: {cleaned.shape}")
    return cleaned
def extract_text(image: np.ndarray) -> str:
    """
    Run Tesseract OCR on an image and return the recognised text.

    Args:
        image (numpy.ndarray): The image to read text from.

    Returns:
        str: The text Tesseract reports for the image.
    """
    # --oem 3: default OCR engine mode.
    # --psm 6: treat the page as a single uniform block of text.
    return pytesseract.image_to_string(image, config=r'--oem 3 --psm 6')
def process_images(images: List["Image.Image"], extract_texts: list) -> list:
    """
    OCR each image in order and append its text to ``extract_texts``.

    Args:
        images (List[PIL.Image.Image]): PIL images to process, e.g. the pages
            returned by pdf2image.
        extract_texts (list): Accumulator list; one string per image is
            appended to it in place.

    Returns:
        list: The same ``extract_texts`` object that was passed in, now
        extended with one entry per processed image.
    """
    # The element type is a forward reference ("Image.Image") because `Image`
    # in this file is the PIL *module*; subscripting List with a module raises
    # TypeError at definition time on Python <= 3.10.
    for image in images:
        # Normalise the mode so grayscale/palette/alpha pages also yield an
        # H x W x 3 array instead of failing in the colour conversion below.
        rgb_pixels = np.array(image.convert("RGB"))
        # PIL stores channels as RGB, while enhance_image converts with
        # COLOR_BGR2GRAY. Swap to BGR so red and blue get the correct
        # luminance weights before thresholding.
        bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
        enhanced = enhance_image(bgr_pixels)
        extract_texts.append(extract_text(enhanced))
    return extract_texts
def get_scanned_document_text(file_path: str, poppler_path: str) -> dict:
    """
    Rasterise a scanned PDF and OCR its pages one after another.

    Args:
        file_path (str): Path of the PDF document to read.
        poppler_path (str): Location of the Poppler binaries, forwarded to
            pdf2image's convert_from_path.

    Returns:
        dict: On success, a dictionary containing:
            - content (str): The per-page texts joined by a '---' separator line.
            - processing_time (str): The elapsed time, as a string.
        If any exception is raised while processing, a plain string of the
        form "Error processing document: <message>" is returned instead.
    """
    try:
        started_at = datetime.now()
        print(f"Processing file: {file_path}")

        # Render every page of the PDF to an image at 150 DPI.
        pages = convert_from_path(file_path, dpi=150, poppler_path=poppler_path)
        print(f"Number of images extracted: {len(pages)}")

        # Pages are processed sequentially; a fresh list collects the texts.
        page_texts = process_images(pages, [])
        document_text = '\n---\n'.join(page_texts)

        elapsed = datetime.now() - started_at
    except Exception as exc:
        # NOTE(review): failures are reported as a string rather than a dict
        # or an exception; callers must check the result type.
        return f"Error processing document: {exc}"

    return {
        'content': document_text,
        'processing_time': str(elapsed),
    }