#!/usr/bin/env python3
"""
Test script for PDF OCR functionality
"""
import asyncio
import sys
import time
from pathlib import Path
# Add project to path so that `nanonets_mcp` can be imported by the tests below.
# NOTE(review): this is a hard-coded absolute path, so it only resolves on a
# machine with a checkout at exactly this location — confirm whether it should
# be derived from this file's own location instead.
sys.path.append('/home/arne/src/nanonets_mcp')
def _extract_supported_formats(formats_result):
    """Pull the ``supported_formats`` value out of a tool-call result.

    The result may be a plain dict, or a (possibly nested) non-empty
    list/tuple whose first leaf item is inspected: an item with a ``text``
    attribute is parsed as JSON, any other item with a ``__dict__`` is read
    through its instance attributes.

    NOTE(review): the nested unwrapping is meant to cover results shaped like
    ``(content_blocks, structured_dict)`` — presumably what newer MCP SDK
    versions return; confirm against the installed SDK.

    Returns:
        The ``supported_formats`` entry (``[]`` when the key is absent),
        ``["Unable to parse"]`` when the text is not valid JSON, or
        ``["Unknown"]`` when no dict could be obtained from the result.
    """
    import json

    payload = formats_result
    # Descend to the first leaf item; an empty sequence is left as-is and
    # falls through to the "Unknown" answer below.
    while isinstance(payload, (list, tuple)) and payload:
        payload = payload[0]

    if hasattr(payload, 'text'):
        # Text-bearing content object: the text is expected to hold JSON.
        try:
            payload = json.loads(payload.text)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string ``text`` attribute.
            payload = {"supported_formats": ["Unable to parse"]}
    elif hasattr(payload, '__dict__'):
        payload = vars(payload)

    if isinstance(payload, dict):
        return payload.get('supported_formats', [])
    return ["Unknown"]


async def test_pdf_support():
    """Check that the server exposes working PDF OCR support.

    Verifies, in order, that ``PDF_SUPPORT`` is truthy, that the
    ``ocr_pdf_to_markdown`` tool is registered, and that the
    ``get_supported_formats`` tool can be called. Progress is printed along
    the way.

    Returns:
        True when every check passes; False when PDF support is missing, the
        OCR tool is not registered, or any step raises (including a failed
        import of ``nanonets_mcp.server``).
    """
    try:
        from nanonets_mcp.server import mcp, PDF_SUPPORT

        print("🧪 Testing PDF OCR functionality")
        print("=" * 50)

        # Test 1: Check PDF support
        print(f"PDF Support Available: {PDF_SUPPORT}")
        if not PDF_SUPPORT:
            print("❌ PDF dependencies not installed")
            print("Install with: pip install pdf2image PyMuPDF")
            return False

        # Test 2: List available tools
        tools = await mcp.list_tools()
        tool_names = [tool.name for tool in tools]
        print(f"Available tools: {tool_names}")
        pdf_tool_available = 'ocr_pdf_to_markdown' in tool_names
        print(f"PDF OCR tool available: {pdf_tool_available}")
        if not pdf_tool_available:
            # Without the tool the feature cannot work, so this must not be
            # reported as a pass.
            print("❌ 'ocr_pdf_to_markdown' tool is not registered")
            return False

        # Test 3: Get supported formats
        formats_result = await mcp.call_tool('get_supported_formats', {})
        supported_formats = _extract_supported_formats(formats_result)
        print(f"Supported formats: {supported_formats}")

        # Test 4: Create a simple test PDF (if we had one)
        # For now, just verify the tool can be called
        print("\n✅ PDF OCR functionality is properly configured!")
        return True
    except Exception as e:
        # Top-level boundary of this check: report the failure and the
        # traceback instead of crashing the whole test run.
        print(f"❌ Error testing PDF functionality: {e}")
        import traceback
        traceback.print_exc()
        return False
async def test_with_sample_pdf():
    """Run the PDF OCR tool on the first PDF found in the tests directory.

    Returns:
        True when no PDF is present (nothing to exercise) or when the tool
        call completes and a preview has been printed; False if any step
        raises.
    """
    try:
        from nanonets_mcp.server import mcp

        # Gather candidate sample files from the tests directory.
        candidates = list(Path('/home/arne/src/nanonets_mcp/tests').glob('*.pdf'))
        if not candidates:
            print("ℹ️ No PDF files found in tests directory for testing")
            print(" To test with a real PDF, add a PDF file to the tests/ directory")
            return True

        sample = candidates[0]
        print(f"\n📄 Found PDF file: {sample}")
        print("🤖 Starting PDF OCR processing...")

        started = time.time()
        # The file's path string is what gets passed as 'pdf_data'; the file
        # contents are not read here.
        result = await mcp.call_tool(
            'ocr_pdf_to_markdown',
            {'pdf_data': str(sample)},
        )
        elapsed = time.time() - started
        print(f"✅ PDF OCR completed in {elapsed:.2f} seconds")

        # Reduce the result to a plain string for the preview.
        if hasattr(result, '__iter__') and len(result) >= 1:
            first_item = result[0]
            text = first_item.text if hasattr(first_item, 'text') else str(first_item)
        else:
            text = str(result)

        # Show at most the first 300 characters, marking truncation.
        if len(text) > 300:
            preview = text[:300] + "..."
        else:
            preview = text

        divider = "=" * 50
        print("📄 PDF OCR Result (first 300 chars):")
        print(divider)
        print(preview)
        print(divider)
        return True
    except Exception as e:
        print(f"❌ Error testing with sample PDF: {e}")
        import traceback
        traceback.print_exc()
        return False
async def main():
    """Run the PDF test suite and print a summary.

    The sample-PDF test only runs when the basic support check passes;
    otherwise it is reported as skipped rather than as a pass.

    Returns:
        True when the basic check and the sample-PDF test both passed,
        False otherwise.
    """
    print("🧪 Nanonets MCP Server PDF Test Suite")
    print("=" * 50)

    # Test 1: Basic PDF support
    basic_ok = await test_pdf_support()

    # Test 2: Sample PDF processing (if available).
    # None means "not run" so the summary does not claim a pass for a test
    # that never executed.
    sample_ok = await test_with_sample_pdf() if basic_ok else None

    print("\n📊 Test Results:")
    print("=" * 50)
    print(f"PDF Support: {'✅ PASS' if basic_ok else '❌ FAIL'}")
    if sample_ok is None:
        print("Sample PDF Test: ⏭️ SKIPPED")
    else:
        print(f"Sample PDF Test: {'✅ PASS' if sample_ok else '❌ FAIL'}")

    all_ok = bool(basic_ok and sample_ok)
    if all_ok:
        print("\n🎉 ALL TESTS PASSED! PDF OCR functionality is ready!")
    else:
        print("\n⚠️ Some tests failed. Check the logs above.")
    return all_ok


if __name__ == "__main__":
    # Propagate the outcome through the exit status so callers (shell, CI)
    # can detect a failed run.
    sys.exit(0 if asyncio.run(main()) else 1)