KnowledgeMCP

Overview Schema Related Servers Score Discussions

KnowledgeMCP
src
services

ocr_service.py•7.38 KiB

""" OCR service using Tesseract with smart detection and PDF processing. """ import asyncio import tempfile from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Optional try: import pytesseract from PIL import Image TESSERACT_AVAILABLE = True except ImportError: TESSERACT_AVAILABLE = False try: from pdf2image import convert_from_path PDF2IMAGE_AVAILABLE = True except ImportError: PDF2IMAGE_AVAILABLE = False from src.utils.logging_config import get_logger logger = get_logger(__name__) class OCRService: """Service for OCR processing using Tesseract with smart detection.""" def __init__( self, language: str = "eng", force_ocr: bool = False, max_workers: int = 2 ): """ Initialize OCR service. Args: language: OCR language code (default: "eng") force_ocr: Always use OCR regardless of text quality max_workers: Number of worker threads for OCR processing """ self.language = language self.force_ocr = force_ocr self.executor = ThreadPoolExecutor(max_workers=max_workers) if not TESSERACT_AVAILABLE: logger.warning("Tesseract OCR not available - install pytesseract and tesseract-ocr") if not PDF2IMAGE_AVAILABLE: logger.warning("pdf2image not available - install pdf2image for PDF OCR support") async def is_ocr_needed(self, extracted_text: str) -> bool: """ Determine if OCR is needed based on extracted text quality. 
Args: extracted_text: Text extracted from document Returns: True if OCR is recommended """ if self.force_ocr: logger.info("Force OCR enabled - OCR will be used") return True text_length = len(extracted_text.strip()) # Too little text suggests scanned document if text_length < 100: logger.info(f"Text too short ({text_length} chars) - OCR recommended") return True # Check for gibberish (high ratio of non-alphanumeric characters) alphanumeric = sum(c.isalnum() or c.isspace() for c in extracted_text) ratio = alphanumeric / len(extracted_text) if extracted_text else 0 if ratio < 0.7: # Less than 70% readable characters logger.info(f"Low alphanumeric ratio ({ratio:.2%}) - OCR recommended") return True logger.info("Text quality sufficient - OCR not needed") return False async def extract_text_from_image( self, image_path: Path, language: Optional[str] = None, ) -> tuple[str, float]: """ Extract text from image using OCR. Args: image_path: Path to image file language: OCR language code (uses instance default if None) Returns: Tuple of (extracted_text, confidence_score) """ if not TESSERACT_AVAILABLE: raise RuntimeError("Tesseract OCR not available") lang = language or self.language try: # Run OCR in thread pool to avoid blocking loop = asyncio.get_event_loop() text, confidence = await loop.run_in_executor( self.executor, self._extract_text_sync, image_path, lang ) logger.info( f"OCR extracted {len(text)} characters from {image_path.name} " f"(confidence: {confidence:.2f})" ) return text, confidence except Exception as e: logger.error(f"OCR failed for {image_path}: {e}") raise def _extract_text_sync(self, image_path: Path, language: str) -> tuple[str, float]: """Synchronous OCR extraction (runs in thread pool).""" # Load and preprocess image image = Image.open(image_path) # Convert to grayscale for better OCR if image.mode != "L": image = image.convert("L") # Extract text text = pytesseract.image_to_string(image, lang=language) # Get confidence score data = 
pytesseract.image_to_data( image, lang=language, output_type=pytesseract.Output.DICT ) confidences = [int(conf) for conf in data["conf"] if conf != "-1"] avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0 confidence_score = avg_confidence / 100.0 # Normalize to 0-1 return text, confidence_score async def process_pdf_with_ocr( self, pdf_path: Path, language: Optional[str] = None, ) -> tuple[str, float]: """ Extract text from PDF using OCR by converting pages to images. Args: pdf_path: Path to PDF file language: OCR language code (uses instance default if None) Returns: Tuple of (combined_text, average_confidence) """ if not TESSERACT_AVAILABLE: raise RuntimeError("Tesseract OCR not available") if not PDF2IMAGE_AVAILABLE: raise RuntimeError("pdf2image not available - install pdf2image for PDF OCR") lang = language or self.language temp_dir = None try: # Create temporary directory for images temp_dir = tempfile.TemporaryDirectory() temp_path = Path(temp_dir.name) logger.info(f"Converting PDF pages to images: {pdf_path.name}") # Convert PDF pages to images loop = asyncio.get_event_loop() images = await loop.run_in_executor( self.executor, convert_from_path, str(pdf_path), 300 # DPI ) logger.info(f"Processing {len(images)} pages with OCR") # Process each page all_text = [] all_confidences = [] for i, image in enumerate(images, 1): # Save image temporarily image_path = temp_path / f"page_{i}.png" image.save(image_path, "PNG") # Extract text from page text, confidence = await self.extract_text_from_image(image_path, lang) all_text.append(text) all_confidences.append(confidence) logger.debug(f"Page {i}/{len(images)}: {len(text)} chars, confidence {confidence:.2f}") # Combine results combined_text = "\n\n".join(all_text) avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0 logger.info( f"OCR completed: {len(combined_text)} total characters, " f"average confidence: {avg_confidence:.2f}" ) return combined_text, 
avg_confidence except Exception as e: logger.error(f"PDF OCR failed for {pdf_path}: {e}") raise finally: # Clean up temporary files if temp_dir: try: temp_dir.cleanup() logger.debug("Temporary files cleaned up") except Exception as e: logger.warning(f"Failed to cleanup temp files: {e}") def __del__(self): """Clean up executor on deletion.""" if hasattr(self, 'executor'): self.executor.shutdown(wait=False)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/maxzrff/KnowledgeMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

ocr_service.py•7.38 KiB