RivalSearchMCP

ocr.py•1.88 KiB

#!/usr/bin/env python3
"""
OCR processing component for RivalSearchMCP.
Handles OCR processing of images from web content.
"""

import asyncio
from io import BytesIO
from typing import List, cast
from urllib.parse import urljoin, urlparse

import httpx
from bs4 import BeautifulSoup, Tag
from PIL import Image
from pytesseract import image_to_string

from src.logging.logger import logger


async def process_images_ocr(soup: BeautifulSoup, base_url: str) -> List[str]:
    """
    Process images in HTML content using OCR to extract text.

    Every ``<img src=...>`` in ``soup`` is resolved against ``base_url``,
    downloaded concurrently, and passed through Tesseract. Failures for an
    individual image are logged at debug level and never abort the batch.

    Args:
        soup: BeautifulSoup object containing HTML
        base_url: Base URL for resolving relative image URLs

    Returns:
        List of non-empty, whitespace-stripped text strings extracted from
        the images, in document order.
    """
    image_urls: List[str] = []
    for img in soup.find_all("img"):
        src = cast(Tag, img).get("src", "")
        if not isinstance(src, str) or not src.strip():
            continue
        # urljoin applies standard relative-reference resolution: it handles
        # page-relative paths, root-relative paths, protocol-relative
        # ("//host/x.png") sources and query strings, and leaves absolute
        # URLs untouched.
        img_url = urljoin(base_url, src.strip())
        # Only HTTP(S) resources can be fetched; skip data:, javascript:,
        # and similar sources instead of issuing a request that cannot work.
        if urlparse(img_url).scheme in ("http", "https"):
            image_urls.append(img_url)

    if not image_urls:
        return []

    def _ocr_bytes(data: bytes) -> str:
        """Decode raw image bytes and run OCR on them (blocking)."""
        with Image.open(BytesIO(data)) as image:
            return image_to_string(image)

    async def ocr_img(client: httpx.AsyncClient, img_url: str) -> str:
        """Process a single image with OCR."""
        try:
            resp = await client.get(img_url, timeout=10)
            # Fail fast on 4xx/5xx instead of handing an error page to PIL.
            resp.raise_for_status()
            # image_to_string blocks; run it in the default executor so the
            # remaining downloads keep making progress on the event loop.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, _ocr_bytes, resp.content)
        except Exception as e:
            # Best-effort: one bad image must not fail the whole page.
            logger.debug(f"OCR failed for {img_url}: {e}")
            return ""

    # One shared client lets all image requests reuse pooled connections.
    async with httpx.AsyncClient() as client:
        tasks = [ocr_img(client, url) for url in image_urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # Filter out exceptions and empty results
    valid_results: List[str] = []
    for result in results:
        if isinstance(result, BaseException):
            logger.debug(f"OCR task failed: {result}")
        elif isinstance(result, str) and result.strip():
            valid_results.append(result.strip())

    return valid_results

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/DamionR/RivalSearchMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

ocr.py•1.88 KiB