Skip to main content
Glama

WeChat Article Reader MCP Server

by whbfxy
browser_client.py (13.6 kB)
"""
Browser client module.

Provides Playwright-based browser automation (and a small httpx-based HTTP
client) used to fetch the content of WeChat Official Account articles.
"""

import asyncio
import os
import random
import time
from typing import Optional, Dict, Any, List
from contextlib import asynccontextmanager

from playwright.async_api import async_playwright, Browser, BrowserContext, Page
import httpx

from ..models import BrowserConfig, RequestConfig
from .errors import BrowserError, NetworkError


# Pool of User-Agent strings shared by both clients; one is picked at random
# when the configuration does not supply an explicit value.
_DEFAULT_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0",
)


class BrowserClient:
    """Browser client built on top of Playwright (Chromium)."""

    def __init__(self, browser_config: BrowserConfig, request_config: RequestConfig):
        """
        Initialise the browser client. Nothing is launched until ``start()``.

        Args:
            browser_config: Browser configuration (headless flag, proxy,
                viewport size, user agent, timeout).
            request_config: Request configuration (retry count and delay).
        """
        self.browser_config = browser_config
        self.request_config = request_config
        self._playwright = None
        self._browser: Optional[Browser] = None
        self._context: Optional[BrowserContext] = None

        # Candidate User-Agent strings used when none is configured.
        self._user_agents: List[str] = list(_DEFAULT_USER_AGENTS)

    async def __aenter__(self):
        """Async context manager entry: start the browser and return self."""
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: stop the browser."""
        await self.stop()

    async def start(self):
        """
        Start Playwright, launch Chromium and create a browser context.

        Raises:
            BrowserError: If any step of the start-up fails. Resources that
                were already started are released before the error is raised.
        """
        try:
            self._playwright = await async_playwright().start()

            # Browser launch options.
            launch_options = {
                "headless": self.browser_config.headless,
            }

            # Route traffic through the proxy when one is configured.
            if self.browser_config.proxy:
                launch_options["proxy"] = {"server": self.browser_config.proxy}

            self._browser = await self._playwright.chromium.launch(**launch_options)

            # Browser context options.
            context_options = {
                "viewport": {
                    "width": self.browser_config.viewport_width,
                    "height": self.browser_config.viewport_height,
                },
                "user_agent": self.browser_config.user_agent or random.choice(self._user_agents),
                "ignore_https_errors": True,
            }
            self._context = await self._browser.new_context(**context_options)
        except Exception as e:
            # __aexit__ is not invoked when __aenter__ raises, so a partially
            # started driver/browser would otherwise be leaked. Clean-up errors
            # are deliberately dropped here: the start-up failure is the one
            # worth reporting.
            await self._release_resources()
            raise BrowserError(f"启动浏览器失败: {e}") from e

    async def stop(self):
        """
        Close the context, the browser and the Playwright driver.

        Every resource is attempted even if an earlier one fails to close.

        Raises:
            BrowserError: If at least one resource failed to close; the first
                failure is reported and chained as the cause.
        """
        errors = await self._release_resources()
        if errors:
            raise BrowserError(f"停止浏览器失败: {errors[0]}") from errors[0]

    async def _release_resources(self) -> List[Exception]:
        """Close context, browser and driver in that order; return any errors."""
        errors: List[Exception] = []
        teardown_plan = (
            ("_context", "close"),
            ("_browser", "close"),
            ("_playwright", "stop"),
        )
        for attr_name, closer_name in teardown_plan:
            resource = getattr(self, attr_name)
            if resource is None:
                continue
            # Drop the reference first so a repeated stop() does not try to
            # close the same object twice.
            setattr(self, attr_name, None)
            try:
                await getattr(resource, closer_name)()
            except Exception as e:
                errors.append(e)
        return errors

    async def fetch_page(
        self,
        url: str,
        wait_for_selector: Optional[str] = None,
        wait_time: Optional[int] = None,
    ) -> str:
        """
        Load a page and return its HTML, retrying on failure.

        Args:
            url: Page URL.
            wait_for_selector: Optional selector to wait for after navigation.
            wait_time: Timeout in seconds; falls back to
                ``browser_config.timeout`` when omitted (or falsy).

        Returns:
            The page HTML content.

        Raises:
            BrowserError: If the browser has not been started, or the final
                attempt failed with a non-network error.
            NetworkError: If the final attempt failed with a timeout or a
                network-level error.
        """
        if not self._context:
            raise BrowserError("浏览器未启动,请先调用start()方法")

        wait_time = wait_time or self.browser_config.timeout
        timeout_ms = wait_time * 1000

        # Always make at least one attempt; a non-positive max_retries would
        # otherwise skip the loop and silently return None.
        attempts = max(1, self.request_config.max_retries)

        last_error: Optional[Exception] = None
        for attempt in range(attempts):
            if attempt:
                # Linear back-off: retry_delay, 2 * retry_delay, ...
                await asyncio.sleep(self.request_config.retry_delay * attempt)
            try:
                return await self._load_page_content(url, wait_for_selector, timeout_ms)
            except Exception as e:
                last_error = e

        # All attempts failed: translate the last error into a domain error.
        raise self._to_fetch_error(last_error) from last_error

    async def _load_page_content(
        self,
        url: str,
        wait_for_selector: Optional[str],
        timeout_ms: float,
    ) -> str:
        """Open a fresh page, navigate to ``url`` and return its HTML."""
        page: Page = await self._context.new_page()
        try:
            page.set_default_timeout(timeout_ms)
            await page.goto(url, wait_until="networkidle")

            # Wait for the requested element, if any, before reading the DOM.
            if wait_for_selector:
                await page.wait_for_selector(wait_for_selector, timeout=timeout_ms)

            return await page.content()
        finally:
            # The page is closed whether or not navigation succeeded.
            await page.close()

    @staticmethod
    def _to_fetch_error(error: Exception) -> Exception:
        """Map a page-load failure to NetworkError or BrowserError by message."""
        message = str(error)
        lowered = message.lower()
        if "timeout" in lowered:
            return NetworkError(f"请求超时: {message}")
        if "net::" in message or "network" in lowered:
            return NetworkError(f"网络错误: {message}")
        return BrowserError(f"获取页面内容失败: {message}")

    async def fetch_wechat_article(self, url: str) -> str:
        """
        Fetch the HTML of a WeChat Official Account article.

        Args:
            url: Article URL.

        Returns:
            The article page HTML.
        """
        # Wait for the ".rich_media_content" element before reading the page.
        return await self.fetch_page(
            url,
            wait_for_selector=".rich_media_content",
            wait_time=self.browser_config.timeout,
        )

    async def get_page_screenshot(self, url: str, output_path: Optional[str] = None) -> Optional[bytes]:
        """
        Take a full-page screenshot of ``url``.

        Args:
            url: Page URL.
            output_path: File path to save the screenshot to. When ``None``
                the image bytes are returned instead.

        Returns:
            The screenshot bytes when ``output_path`` is ``None``; otherwise
            ``None`` (the image is written to ``output_path``).

        Raises:
            BrowserError: If the browser has not been started.
        """
        if not self._context:
            raise BrowserError("浏览器未启动,请先调用start()方法")

        page = await self._context.new_page()
        try:
            await page.goto(url, wait_until="networkidle")

            # Wait for the article content element before capturing.
            await page.wait_for_selector(
                ".rich_media_content",
                timeout=self.browser_config.timeout * 1000,
            )

            if output_path:
                await page.screenshot(path=output_path, full_page=True)
                return None
            return await page.screenshot(full_page=True)
        finally:
            await page.close()


class HttpClient:
    """HTTP client built on top of httpx."""

    def __init__(self, request_config: RequestConfig):
        """
        Initialise the HTTP client. No connection is opened until ``start()``.

        Args:
            request_config: Request configuration (timeout, retry count and
                retry delay).
        """
        self.request_config = request_config
        self._client = None

        # Candidate User-Agent strings; one is chosen at random in start().
        self._user_agents: List[str] = list(_DEFAULT_USER_AGENTS)

    async def __aenter__(self):
        """Async context manager entry: start the client and return self."""
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: stop the client."""
        await self.stop()

    async def start(self):
        """
        Create the underlying ``httpx.AsyncClient``.

        Raises:
            NetworkError: If the client cannot be created.
        """
        try:
            headers = {
                "User-Agent": random.choice(self._user_agents),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }

            timeout = httpx.Timeout(self.request_config.timeout)

            self._client = httpx.AsyncClient(
                headers=headers,
                timeout=timeout,
                follow_redirects=True,
            )
        except Exception as e:
            raise NetworkError(f"启动HTTP客户端失败: {e}") from e

    async def stop(self):
        """Close the underlying httpx client, if one is open."""
        client, self._client = self._client, None
        if client:
            await client.aclose()

    async def get(self, url: str, **kwargs) -> httpx.Response:
        """
        Send a GET request, retrying on failure.

        Args:
            url: Request URL.
            **kwargs: Extra arguments forwarded to ``httpx.AsyncClient.get``.

        Returns:
            The HTTP response (its status has already been checked with
            ``raise_for_status()``).

        Raises:
            NetworkError: If the client has not been started, or every
                attempt failed.
        """
        if not self._client:
            raise NetworkError("HTTP客户端未启动,请先调用start()方法")

        # Always make at least one attempt; a non-positive max_retries would
        # otherwise skip the loop and silently return None.
        attempts = max(1, self.request_config.max_retries)

        last_error: Optional[Exception] = None
        for attempt in range(attempts):
            if attempt:
                # Linear back-off: retry_delay, 2 * retry_delay, ...
                await asyncio.sleep(self.request_config.retry_delay * attempt)
            try:
                response = await self._client.get(url, **kwargs)
                response.raise_for_status()
                return response
            except Exception as e:
                last_error = e

        # All attempts failed: translate the last error into a NetworkError.
        raise self._to_network_error(last_error) from last_error

    @staticmethod
    def _to_network_error(error: Exception) -> NetworkError:
        """Wrap an httpx failure in a NetworkError with a category prefix."""
        # TimeoutException is tested first, matching the original ordering.
        if isinstance(error, httpx.TimeoutException):
            return NetworkError(f"请求超时: {error}")
        if isinstance(error, httpx.NetworkError):
            return NetworkError(f"网络错误: {error}")
        if isinstance(error, httpx.HTTPStatusError):
            return NetworkError(f"HTTP错误: {error.response.status_code} - {error}")
        return NetworkError(f"请求失败: {error}")


@asynccontextmanager
async def get_browser_client(browser_config: BrowserConfig, request_config: RequestConfig):
    """
    Async context manager yielding a started ``BrowserClient``.

    Args:
        browser_config: Browser configuration.
        request_config: Request configuration.

    Yields:
        A started BrowserClient instance; it is stopped on exit.
    """
    async with BrowserClient(browser_config, request_config) as client:
        yield client


@asynccontextmanager
async def get_http_client(request_config: RequestConfig):
    """
    Async context manager yielding a started ``HttpClient``.

    Args:
        request_config: Request configuration.

    Yields:
        A started HttpClient instance; it is stopped on exit.
    """
    async with HttpClient(request_config) as client:
        yield client


async def fetch_wechat_article(url: str, browser_config: BrowserConfig, request_config: RequestConfig) -> str:
    """
    Convenience function: fetch the HTML of a WeChat Official Account article.

    Starts a browser for this single call and stops it afterwards.

    Args:
        url: Article URL.
        browser_config: Browser configuration.
        request_config: Request configuration.

    Returns:
        The article page HTML.
    """
    async with get_browser_client(browser_config, request_config) as client:
        return await client.fetch_wechat_article(url)

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/whbfxy/MCP101Demo'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.