omniparser-autogui-mcp

by NON906
Verified
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Run a batch of natural-language GUI tasks through an MCP-backed agent.

The script loads the MCP servers described in
``langchain_settings/mcp_config.json``, wraps their tools in an agent graph
driven by a Gemini chat model, and feeds each prompt to the agent in turn.
"""

import sys
import asyncio
from contextlib import redirect_stdout

from dotenv import load_dotenv
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.messages import SystemMessage
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory

from mcp_autogui.langchain.mcp_manager import McpManager
from mcp_autogui.langchain.agent_graph import create_agent_graph

load_dotenv()

# System prompt used when the caller passes an empty ``system_prompt``.
# NOTE(review): the text, including the line break right after "2. ", is
# reproduced exactly as found; confirm whether the prompt was meant to put
# every numbered note on its own line.
_DEFAULT_SYSTEM_PROMPT = (
    "You are using a Windows device. "
    "You are able to use a mouse and keyboard to interact with the computer "
    "based on the given task and screenshot. "
    "You may be given some history plan and actions, this is the response "
    "from the previous loop. "
    "You should carefully consider your plan base on the task, screenshot, "
    "and history actions. "
    "IMPORTANT NOTES: "
    "1. You should only give a single action at a time. "
    "2. \n"
    "You should give an analysis to the current screen, and reflect on what "
    "has been done by looking at the history, then describe your "
    "step-by-step thoughts on how to achieve the task. "
    "3. The tasks involve buying multiple products or navigating through "
    "multiple pages. "
    "You should break it into subgoals and complete each subgoal one by one "
    "in the order of the instructions. "
    "4. avoid choosing the same action/elements multiple times in a row, "
    "if it happens, reflect to yourself, what may have gone wrong, and "
    "predict a different action. "
    "5. If you are prompted with login information page or captcha page, "
    "or you think it need user's permission to do the next action, finish "
    "actions and ask the user for a response. "
    "6. Exit if you have achieved your goal."
)


def create_agent(tools):
    """Build the agent graph that drives the given tools with a Gemini model.

    Args:
        tools: Tool collection handed straight to ``create_agent_graph``.

    Returns:
        The object returned by ``create_agent_graph`` (created with
        ``debug=True``).
    """
    # Alternative backend kept for reference:
    # llm = ChatOpenAI(model="gpt-4o-mini")

    # Every listed harm category is set to BLOCK_NONE.
    harm_block = HarmBlockThreshold.BLOCK_NONE
    safety_settings = {
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: harm_block,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: harm_block,
        HarmCategory.HARM_CATEGORY_HARASSMENT: harm_block,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: harm_block,
    }
    llm = ChatGoogleGenerativeAI(
        model='gemini-2.0-flash-exp',
        safety_settings=safety_settings,
    )
    agent = create_agent_graph(llm, tools, debug=True)
    return agent


async def batch_main(prompts: list[str], system_prompt: str = ''):
    """Run each prompt through a freshly seeded conversation and collect replies.

    Everything printed while this coroutine runs (the prompts, the replies and
    any output of the called libraries) is redirected to ``sys.stderr``.

    Args:
        prompts: User instructions, processed one at a time in order. Each
            prompt gets its own chat history, so no context carries over
            between prompts.
        system_prompt: System message placed at the start of every
            conversation. An empty string selects the built-in default.

    Returns:
        A list with one entry per prompt: the ``content`` of the last message
        in the agent's response.

    Raises:
        Whatever ``McpManager.load``, ``create_agent`` or ``agent.ainvoke``
        raise; the MCP servers are still stopped once loading has succeeded.
    """
    # Resolve the default once; it does not depend on the individual prompt.
    if system_prompt == '':
        system_prompt = _DEFAULT_SYSTEM_PROMPT

    with redirect_stdout(sys.stderr):
        ret = []
        mcp_manager = McpManager()
        await mcp_manager.load('langchain_settings/mcp_config.json')
        try:
            agent = create_agent(mcp_manager.get_tools())
            for prompt in prompts:
                chat_history = ChatMessageHistory()
                chat_history.add_message(SystemMessage(content=system_prompt))
                chat_history.add_user_message(prompt)
                print(prompt)
                response = await agent.ainvoke(
                    {"messages": chat_history.messages},
                    {"recursion_limit": 100},
                )
                ret.append(response["messages"][-1].content)
                print(response["messages"][-1].content)
        finally:
            # Stop the servers even when building or running the agent fails,
            # so a failed run does not leave them behind.
            mcp_manager.stop_servers()
        return ret


if __name__ == "__main__":
    prompts = [
        # "Check the screen, then type 'MCPサーバー' (MCP server) into the
        # browser and search for it."
        '''画面を確認し、ブラウザから「MCPサーバー」と入力して検索してください''',
    ]
    asyncio.run(batch_main(prompts))