Skip to main content
Glama
by Baronco
ocr.py•3.19 kB
# standard libraries
from datetime import datetime
import os
import math
from pathlib import Path
from typing import List, Optional, Union
from sys import argv

# third-party libraries
from PIL import Image
import cv2
import pytesseract
from pdf2image import convert_from_path
import numpy as np
from mcp.server.fastmcp import FastMCP


def enhance_image(image: np.ndarray) -> np.ndarray:
    """
    Enhance the image for better OCR results by converting to grayscale,
    applying thresholding, and denoising.

    Args:
        image (numpy.ndarray): The input image to enhance. Either a
            multi-channel colour array or an already-grayscale 2-D array.

    Returns:
        numpy.ndarray: The enhanced image ready for OCR processing.
    """
    if image.ndim == 2:
        # Already single-channel: a colour conversion would raise, so skip it.
        gray = image
    else:
        # NOTE(review): arrays built from PIL images are normally RGB-ordered,
        # while this conversion assumes BGR (red/blue weights are swapped).
        # Confirm whether cv2.COLOR_RGB2GRAY was intended before changing it.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply thresholding: pixels above 150 become 255, the rest become 0.
    # cv2.threshold returns (retval, image); only the image is needed.
    thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)[1]

    # Denoise (positional args: dst=None, h=30, templateWindowSize=7,
    # searchWindowSize=21).
    denoised = cv2.fastNlMeansDenoising(thresh, None, 30, 7, 21)
    print(f"enhancing image size: {denoised.shape}")
    return denoised


def extract_text(image: np.ndarray) -> str:
    """
    Extract text from an image using Tesseract OCR with custom configuration.

    Args:
        image (numpy.ndarray): The input image from which to extract text.

    Returns:
        str: The extracted text from the image.
    """
    # --oem 3: default OCR engine mode; --psm 6: treat the page as a single
    # uniform block of text.
    custom_config = r'--oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=custom_config)


def process_images(
    images: List[Image.Image],
    extract_texts: Optional[list] = None,
) -> list:
    """
    Process a list of images to extract text using OCR.

    Each image is converted to a NumPy array, enhanced with
    ``enhance_image`` and passed to ``extract_text``.

    Args:
        images (List[Image.Image]): A list of PIL Image objects to process.
        extract_texts (list, optional): A list to which the text of each
            image is appended, in order. A new list is created when omitted.

    Returns:
        list: The list holding the extracted texts. When ``extract_texts``
        was supplied, it is that same list, modified in place.
    """
    if extract_texts is None:
        extract_texts = []

    for image in images:
        np_image = np.array(image)
        enhanced_image_output = enhance_image(np_image)
        extract_texts.append(extract_text(enhanced_image_output))

    return extract_texts


def get_scanned_document_text(file_path: str, poppler_path: str) -> Union[dict, str]:
    """
    Load and extract text from a scanned document using Tesseract OCR.

    The PDF is rendered to one image per page at 150 DPI, each page is
    processed sequentially, and the page texts are joined with a
    ``'\\n---\\n'`` separator.

    Args:
        file_path (str): The path to the scanned document file (PDF).
        poppler_path (str): The path to the Poppler binaries for PDF processing.

    Returns:
        dict: On success, a dictionary containing:
            - content (str): The extracted text content from the document.
            - processing_time (str): The time taken to process the document.
        str: On failure, a message of the form
            ``"Error processing document: <exception>"``. Exceptions are
            not propagated to the caller.
    """
    try:
        start_time = datetime.now()
        print(f"Processing file: {file_path}")

        images = convert_from_path(file_path, dpi=150, poppler_path=poppler_path)
        print(f"Number of images extracted: {len(images)}")

        extract_texts = process_images(images)

        # results
        final_text = '\n---\n'.join(extract_texts)
        end_time = datetime.now()

        return {
            'content': final_text,
            'processing_time': str(end_time - start_time),
        }
    except Exception as e:
        # Boundary handler: the failure is reported as a plain string rather
        # than raised, so callers must check the type of the result.
        return f"Error processing document: {e}"

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Baronco/Local-Docs-MCP-Tool'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.