#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = ["browser-use", "mistralai"]
# ///
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import asyncio
import logging
from mistralai import Mistral # type: ignore
from pydantic import BaseModel, Field
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
from browser_use.browser import BrowserSession
from browser_use.llm import ChatOpenAI
if not os.getenv('OPENAI_API_KEY'):
raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
if not os.getenv('MISTRAL_API_KEY'):
raise ValueError('MISTRAL_API_KEY is not set. Please add it to your environment variables.')
logger = logging.getLogger(__name__)
controller = Controller()
class PdfExtractParams(BaseModel):
url: str = Field(description='URL to a PDF document')
@controller.registry.action(
'Extract PDF Text',
param_model=PdfExtractParams,
)
def extract_mistral_ocr(params: PdfExtractParams, browser: BrowserSession) -> ActionResult:
"""
Process a PDF URL using Mistral OCR API and return the OCR response.
Args:
url: URL to a PDF document
Returns:
OCR response object from Mistral API
"""
api_key = os.getenv('MISTRAL_API_KEY')
client = Mistral(api_key=api_key)
response = client.ocr.process(
model='mistral-ocr-latest',
document={
'type': 'document_url',
'document_url': params.url,
},
include_image_base64=False,
)
markdown = '\n\n'.join(f'### Page {i + 1}\n{response.pages[i].markdown}' for i in range(len(response.pages)))
return ActionResult(
extracted_content=markdown,
include_in_memory=False, ## PDF content can be very large, so we don't include it in memory
)
async def main():
agent = Agent(
task="""
Objective: Navigate to the following URL, extract its contents using the Extract PDF Text action, and explain its historical significance.
URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf
""",
llm=ChatOpenAI(model='gpt-4.1'),
controller=controller,
)
result = await agent.run()
logger.info(result)
if __name__ == '__main__':
asyncio.run(main())