ReadPDFx - OCR PDF MCP Server

install_indonesian_complete.py•8.35 KiB

#!/usr/bin/env python3
"""
Enhanced Indonesian OCR Installer

Comprehensive installer for Indonesian + English OCR support
with eng+ind combination.

The script locates a Windows Tesseract installation, downloads the
Indonesian (``ind``) and Malay (``msa``) trained-data files into its
``tessdata`` directory, verifies them with ``tesseract --list-langs`` and
writes a short Markdown usage guide next to this file.
"""

import os
import shutil
import subprocess
import tempfile
from pathlib import Path

try:
    import requests
except ImportError:
    # Keep the module importable so the installer can report the missing
    # dependency as a normal download failure instead of a traceback.
    requests = None

# (connect, read) timeout in seconds so a stalled connection cannot hang
# the installer forever.
DOWNLOAD_TIMEOUT = (10, 60)


def get_tesseract_path():
    """Find the Tesseract installation directory.

    Checks the usual Windows install locations first, then falls back to
    the directory of a ``tesseract`` executable on PATH when that directory
    also contains a ``tessdata`` folder.

    Returns:
        Path of the installation directory, or None when nothing was found.
    """
    common_paths = [
        r"C:\Program Files\Tesseract-OCR",
        r"C:\Program Files (x86)\Tesseract-OCR",
    ]
    username = os.getenv('USERNAME')
    if username:
        # Skip the per-user location when USERNAME is unset; formatting
        # None into the path would only produce a bogus candidate.
        common_paths.append(
            r"C:\Users\{}\AppData\Local\Tesseract-OCR".format(username)
        )
    common_paths.append(r"C:\tools\tesseract")

    for path in common_paths:
        if os.path.exists(path):
            return Path(path)

    # Custom install location: accept it only if it looks like a full
    # installation (executable directory that also holds tessdata).
    executable = shutil.which('tesseract')
    if executable:
        install_dir = Path(executable).resolve().parent
        if (install_dir / "tessdata").is_dir():
            return install_dir

    return None


def _remove_temp_file(path):
    """Delete a leftover temporary download, reporting unexpected errors."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass  # Already moved into place or never created.
    except OSError as e:
        print(f"⚠️ Could not remove temporary file {path}: {e}")


def download_language_pack(url, filename, tessdata_dir):
    """Download a single language pack with progress output.

    The data is streamed into a temporary file and moved into
    ``tessdata_dir`` only after the download finished, so an interrupted
    transfer never leaves a truncated ``.traineddata`` file behind.

    Args:
        url: Download URL of the trained-data file.
        filename: File name to create inside ``tessdata_dir``.
        tessdata_dir: Target directory (``Path`` or string).

    Returns:
        True when the file already exists or was downloaded successfully,
        False on any failure (the error is printed, not raised).
    """
    target_file = Path(tessdata_dir) / filename
    if target_file.exists():
        print(f"✅ {filename} already exists")
        return True

    if requests is None:
        print(f"❌ Failed to download {filename}: "
              "the 'requests' package is not installed")
        return False

    print(f"⬇️ Downloading {filename}...")

    temp_path = None
    try:
        # Create temp file first
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_path = temp_file.name
            with requests.get(url, stream=True,
                              timeout=DOWNLOAD_TIMEOUT) as response:
                response.raise_for_status()

                total_size = int(response.headers.get('content-length', 0))
                downloaded = 0

                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        temp_file.write(chunk)
                        downloaded += len(chunk)

                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            print(
                                f" Progress: {percent:.1f}% "
                                f"({downloaded}/{total_size} bytes)",
                                end='\r',
                            )

        print()  # New line after progress

        # Only the empty case is rejected: content-length may describe a
        # compressed body, so it is not a reliable size to compare against.
        if downloaded == 0:
            raise IOError("server returned an empty file")

        # Move temp file to final location. This happens after the `with`
        # block because Windows cannot move a file that is still open.
        shutil.move(temp_path, str(target_file))

        print(f"✅ {filename} downloaded successfully")
        return True

    except Exception as e:
        # Boundary handler: any network or filesystem error is reported
        # and turned into a False result for the caller.
        print(f"❌ Failed to download {filename}: {e}")
        if temp_path is not None:
            _remove_temp_file(temp_path)
        return False


def install_indonesian_ocr_complete():
    """Complete Indonesian OCR installation for eng+ind usage.

    Returns:
        True when at least one language pack is installed (or already
        present), False when Tesseract/tessdata is missing or every
        download failed.
    """
    print("🇮🇩 Enhanced Indonesian OCR Installer")
    print("=" * 50)
    print("Installing support for lang='eng+ind' combination")
    print()

    # Find Tesseract
    tesseract_path = get_tesseract_path()
    if not tesseract_path:
        print("❌ Tesseract installation not found!")
        print("Install Tesseract first: "
              "https://github.com/UB-Mannheim/tesseract/wiki")
        return False

    tessdata_dir = tesseract_path / "tessdata"
    if not tessdata_dir.exists():
        print(f"❌ Tessdata directory not found: {tessdata_dir}")
        return False

    print(f"📁 Tesseract found: {tesseract_path}")
    print(f"📁 Installing to: {tessdata_dir}")
    print()

    # Essential language packs for Indonesian + English
    language_packs = {
        'ind.traineddata': {
            'url': 'https://github.com/tesseract-ocr/tessdata/raw/main/ind.traineddata',
            'description': 'Indonesian (Bahasa Indonesia) - REQUIRED',
            'priority': 1,
        },
        'msa.traineddata': {
            'url': 'https://github.com/tesseract-ocr/tessdata/raw/main/msa.traineddata',
            'description': 'Malay (similar to Indonesian) - RECOMMENDED',
            'priority': 2,
        },
    }

    success_count = 0
    total_packs = len(language_packs)

    print("📦 Downloading language packs...")
    print()

    for filename, pack_info in language_packs.items():
        print(f"[{pack_info['priority']}/{total_packs}] "
              f"{pack_info['description']}")

        if download_language_pack(pack_info['url'], filename, tessdata_dir):
            success_count += 1
        print()

    print(f"📊 Installation Summary: "
          f"{success_count}/{total_packs} packs installed")

    return success_count > 0


def parse_tesseract_languages(output):
    """Extract language codes from ``tesseract --list-langs`` output.

    Args:
        output: Raw text printed by ``tesseract --list-langs``.

    Returns:
        List of stripped, non-empty lines, excluding the
        ``List of available languages ...`` header line.
    """
    return [
        line.strip()
        for line in output.split('\n')
        if line.strip() and not line.startswith('List of')
    ]


def _find_tesseract_executable():
    """Return the tesseract command to run, preferring the one on PATH."""
    on_path = shutil.which('tesseract')
    if on_path:
        return on_path

    # The installer may have found Tesseract in a directory that is not on
    # PATH; use that copy so verification still works.
    install_dir = get_tesseract_path()
    if install_dir:
        for name in ('tesseract.exe', 'tesseract'):
            candidate = install_dir / name
            if candidate.exists():
                return str(candidate)

    return 'tesseract'


def test_eng_ind_combination():
    """Test eng+ind language combination.

    Runs ``tesseract --list-langs`` and checks that ``ind`` is listed.

    Returns:
        True when Indonesian is available, False otherwise (including when
        tesseract cannot be run or exits with an error).
    """
    print("🧪 Testing eng+ind Language Combination")
    print("-" * 40)

    try:
        # Check available languages
        result = subprocess.run(
            [_find_tesseract_executable(), '--list-langs'],
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError) as e:
        print(f"❌ Testing failed: {e}")
        return False

    if result.returncode != 0:
        details = result.stderr.strip() or f"exit code {result.returncode}"
        print(f"❌ Testing failed: {details}")
        return False

    languages = parse_tesseract_languages(result.stdout)

    print("📋 Available languages:")
    for lang in languages:
        status = "✅" if lang in ['eng', 'ind', 'msa'] else "📝"
        print(f" {status} {lang}")

    # Check Indonesian specifically. Match whole entries: a substring test
    # on the raw output also matches the header path (e.g. "C:\Windows").
    if 'ind' in languages:
        print("\n🎉 Indonesian Language Pack: ✅ INSTALLED")
        print("🔧 Ready for lang='eng+ind' usage!")

        # Show usage examples
        print("\n📖 Usage Examples:")
        print(" Python: pytesseract.image_to_string(image, lang='eng+ind')")
        print(" CLI: tesseract input.pdf output.txt -l eng+ind")
        print(" MCP Tools: process_pdf_smart(pdf_path, language='eng+ind')")

        return True

    print("\n❌ Indonesian not found in language list")
    return False


def create_usage_guide():
    """Create usage guide for eng+ind combination.

    Writes ``INDONESIAN_OCR_GUIDE.md`` into the directory of this script.

    Returns:
        True when the guide was written, False when writing failed.
    """
    guide_content = """# Indonesian OCR Usage Guide

## Language Combination: eng+ind

### Optimal Settings
- **Mixed Documents**: `lang='eng+ind'` (English + Indonesian) - RECOMMENDED
- **Fallback**: `lang='eng'` (English only)

### MCP Tools Usage
```python
# Smart PDF processing with Indonesian support
result = await process_pdf_smart(pdf_path, language='eng+ind')

# OCR with Indonesian language
ocr_result = await perform_ocr(file_path, language='eng+ind')

# Batch processing with Indonesian
batch_result = await batch_process_pdfs(directory, language='eng+ind')
```

### Python Direct Usage
```python
import pytesseract
from PIL import Image

# Load image
image = Image.open('indonesian_document.png')

# OCR with Indonesian + English
text = pytesseract.image_to_string(image, lang='eng+ind')
```

### Command Line Usage
```bash
# OCR PDF with Indonesian support
tesseract input.pdf output.txt -l eng+ind

# OCR image with Indonesian
tesseract document.png result.txt -l eng+ind
```

## Language Codes
- `ind`: Indonesian (Bahasa Indonesia)
- `eng`: English
- `msa`: Malay (Bahasa Malaysia)
- `eng+ind`: English + Indonesian (RECOMMENDED)

## Installation Verification
```bash
tesseract --list-langs
# Should show: eng, ind, msa, osd
```
"""

    guide_path = Path(__file__).parent / "INDONESIAN_OCR_GUIDE.md"
    try:
        with open(guide_path, 'w', encoding='utf-8') as f:
            f.write(guide_content)
        print(f"📖 Usage guide created: {guide_path}")
        return True
    except OSError as e:
        print(f"❌ Failed to create guide: {e}")
        return False


def main():
    """Main installation process: install, verify, then write the guide."""
    print("Starting Indonesian OCR installation...")
    print()

    # Install language packs
    if install_indonesian_ocr_complete():
        print("✅ Language packs installation completed")
        print()

        # Test installation
        if test_eng_ind_combination():
            print("✅ eng+ind combination working!")
            print()

            # Create usage guide
            create_usage_guide()

            print("\n🎉 Installation Complete!")
            print("Indonesian OCR support ready for lang='eng+ind' usage")
        else:
            print("⚠️ Installation completed but testing failed")
            print("Check language packs manually")
    else:
        print("❌ Installation failed")
        print("\nManual installation steps:")
        print("1. Download: "
              "https://github.com/tesseract-ocr/tessdata/raw/main/ind.traineddata")
        print("2. Copy to: C:\\Program Files\\Tesseract-OCR\\tessdata\\")
        print("3. Verify: tesseract --list-langs")


if __name__ == "__main__":
    main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/irev/mcp-readpdfx'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

install_indonesian_complete.py•8.35 KiB