# browser_client.py • 13.6 kB
"""
浏览器客户端模块
提供基于Playwright的浏览器自动化功能,用于获取微信公众号文章内容。
"""
import asyncio
import os
import random
import time
from typing import Optional, Dict, Any, List
from contextlib import asynccontextmanager
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
import httpx
from ..models import BrowserConfig, RequestConfig
from .errors import BrowserError, NetworkError
class BrowserClient:
    """Browser client built on Playwright.

    Owns one Playwright driver, one Chromium browser and one browser context.
    Use it as an async context manager, or call ``start()`` / ``stop()``
    explicitly. Every page-level operation opens a fresh page in the shared
    context and closes it when done.
    """

    def __init__(self, browser_config: BrowserConfig, request_config: RequestConfig):
        """Store the configuration; no browser is launched here.

        Args:
            browser_config: Browser settings. This class reads ``headless``,
                ``proxy``, ``viewport_width``, ``viewport_height``,
                ``user_agent`` and ``timeout`` from it.
            request_config: Request settings. This class reads
                ``max_retries`` and ``retry_delay`` from it.
        """
        self.browser_config = browser_config
        self.request_config = request_config
        self._playwright = None
        self._browser: Optional[Browser] = None
        self._context: Optional[BrowserContext] = None
        # Predefined User-Agent pool; start() picks one at random when the
        # browser config does not supply its own.
        self._user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]

    async def __aenter__(self):
        """Start the browser and return this client."""
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Stop the browser when leaving the ``async with`` block."""
        await self.stop()

    async def start(self):
        """Launch Playwright, a Chromium browser and a browser context.

        If any step fails, everything that was already started is torn down
        again before the error is reported, so a failed start does not leave
        a driver or browser process behind.

        Raises:
            BrowserError: If any startup step fails.
        """
        try:
            self._playwright = await async_playwright().start()

            launch_options = {
                "headless": self.browser_config.headless,
            }
            # Route traffic through the proxy only when one is configured.
            if self.browser_config.proxy:
                launch_options["proxy"] = {"server": self.browser_config.proxy}

            self._browser = await self._playwright.chromium.launch(**launch_options)

            context_options = {
                "viewport": {
                    "width": self.browser_config.viewport_width,
                    "height": self.browser_config.viewport_height,
                },
                # An explicitly configured User-Agent wins over the random pool.
                "user_agent": self.browser_config.user_agent or random.choice(self._user_agents),
                "ignore_https_errors": True,
            }
            self._context = await self._browser.new_context(**context_options)
        except Exception as e:
            # Release whatever was started before the failure. A secondary
            # cleanup error is deliberately dropped: the startup failure is
            # the one the caller needs to see.
            try:
                await self.stop()
            except Exception:
                pass
            raise BrowserError(f"启动浏览器失败: {str(e)}") from e

    async def stop(self):
        """Close the context, the browser and the Playwright driver.

        Each resource is closed independently, so a failure while closing one
        does not prevent the remaining ones from being released. Calling this
        when nothing is running is a no-op.

        Raises:
            BrowserError: If closing any resource failed; the first failure
                is reported and chained as the cause.
        """
        first_error = None
        # Tear down in reverse order of creation.
        teardown_steps = (
            ("_context", "close"),
            ("_browser", "close"),
            ("_playwright", "stop"),
        )
        for attr_name, method_name in teardown_steps:
            resource = getattr(self, attr_name)
            if not resource:
                continue
            # Drop the reference first so a handle whose close failed is not
            # reused or closed a second time.
            setattr(self, attr_name, None)
            try:
                await getattr(resource, method_name)()
            except Exception as e:
                if first_error is None:
                    first_error = e
        if first_error is not None:
            raise BrowserError(f"停止浏览器失败: {str(first_error)}") from first_error

    async def fetch_page(self, url: str, wait_for_selector: str = None,
                         wait_time: int = None) -> str:
        """Load ``url`` in a fresh page and return its HTML.

        The request is attempted ``request_config.max_retries`` times, and
        always at least once. Between attempts the client sleeps
        ``retry_delay * attempt_number`` seconds.

        Args:
            url: Page URL.
            wait_for_selector: Optional selector that must appear before the
                HTML is captured.
            wait_time: Timeout in seconds; falls back to
                ``browser_config.timeout`` when not given (or falsy).

        Returns:
            The page HTML.

        Raises:
            BrowserError: If the browser is not started, or the final attempt
                fails for a reason that is not classified as a network error.
            NetworkError: If the final attempt fails and its message mentions
                a timeout, ``net::`` or "network".
        """
        if not self._context:
            raise BrowserError("浏览器未启动,请先调用start()方法")

        wait_time = wait_time or self.browser_config.timeout
        timeout_ms = wait_time * 1000
        # A max_retries of 0 (or less) must still produce one attempt;
        # otherwise the loop would be skipped and None returned silently.
        attempts = max(1, self.request_config.max_retries)

        for attempt in range(attempts):
            try:
                page = await self._context.new_page()
                try:
                    page.set_default_timeout(timeout_ms)
                    await page.goto(url, wait_until="networkidle")
                    if wait_for_selector:
                        await page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
                    return await page.content()
                finally:
                    await page.close()
            except Exception as e:
                if attempt < attempts - 1:
                    # Linear back-off before the next attempt.
                    await asyncio.sleep(self.request_config.retry_delay * (attempt + 1))
                    continue

                # Final attempt failed: classify by message text and keep the
                # original exception as the cause.
                message = str(e)
                lowered = message.lower()
                if "timeout" in lowered:
                    raise NetworkError(f"请求超时: {message}") from e
                if "net::" in message or "network" in lowered:
                    raise NetworkError(f"网络错误: {message}") from e
                raise BrowserError(f"获取页面内容失败: {message}") from e

    async def fetch_wechat_article(self, url: str) -> str:
        """Fetch a WeChat official-account article.

        Delegates to ``fetch_page`` and waits for the ``.rich_media_content``
        element, using ``browser_config.timeout`` as the timeout.

        Args:
            url: Article URL.

        Returns:
            The article page HTML.
        """
        return await self.fetch_page(
            url,
            wait_for_selector=".rich_media_content",
            wait_time=self.browser_config.timeout,
        )

    async def get_page_screenshot(self, url: str, output_path: str = None) -> Optional[bytes]:
        """Take a full-page screenshot of ``url``.

        The page is loaded, then ``.rich_media_content`` is awaited before
        the screenshot is taken. ``browser_config.timeout`` (seconds) bounds
        the page operations, matching ``fetch_page``.

        Args:
            url: Page URL.
            output_path: File path to write the screenshot to. When omitted
                the image bytes are returned instead.

        Returns:
            The screenshot bytes, or ``None`` when ``output_path`` is given.

        Raises:
            BrowserError: If the browser is not started.
        """
        if not self._context:
            raise BrowserError("浏览器未启动,请先调用start()方法")

        timeout_ms = self.browser_config.timeout * 1000
        page = await self._context.new_page()
        try:
            # Apply the configured timeout to navigation too, not only to the
            # selector wait.
            page.set_default_timeout(timeout_ms)
            await page.goto(url, wait_until="networkidle")
            await page.wait_for_selector(".rich_media_content", timeout=timeout_ms)
            if output_path:
                await page.screenshot(path=output_path, full_page=True)
                return None
            return await page.screenshot(full_page=True)
        finally:
            await page.close()
class HttpClient:
    """HTTP client built on ``httpx.AsyncClient``.

    Use it as an async context manager, or call ``start()`` / ``stop()``
    explicitly. ``get()`` adds retry handling on top of the underlying client.
    """

    def __init__(self, request_config: RequestConfig):
        """Store the configuration; no connection pool is created here.

        Args:
            request_config: Request settings. This class reads ``timeout``,
                ``max_retries`` and ``retry_delay`` from it.
        """
        self.request_config = request_config
        self._client = None
        # Predefined User-Agent pool; start() picks one at random.
        self._user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]

    async def __aenter__(self):
        """Start the HTTP client and return this object."""
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Stop the HTTP client when leaving the ``async with`` block."""
        await self.stop()

    async def start(self):
        """Create the underlying ``httpx.AsyncClient``.

        The client sends browser-like default headers with a randomly chosen
        User-Agent, follows redirects, and uses ``request_config.timeout``.

        Raises:
            NetworkError: If the client cannot be created.
        """
        try:
            headers = {
                "User-Agent": random.choice(self._user_agents),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                # NOTE(review): this advertises Brotli ("br"); httpx appears to
                # decode it only when a brotli package is installed — confirm
                # that dependency, otherwise bodies may come back undecoded.
                "Accept-Encoding": "gzip, deflate, br",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }
            timeout = httpx.Timeout(self.request_config.timeout)
            self._client = httpx.AsyncClient(
                headers=headers,
                timeout=timeout,
                follow_redirects=True,
            )
        except Exception as e:
            raise NetworkError(f"启动HTTP客户端失败: {str(e)}") from e

    async def stop(self):
        """Close the underlying client; a no-op when it is not running."""
        # Detach first so the reference is cleared even if aclose() raises.
        client, self._client = self._client, None
        if client:
            await client.aclose()

    async def get(self, url: str, **kwargs) -> httpx.Response:
        """Send a GET request with retries.

        The request is attempted ``request_config.max_retries`` times, and
        always at least once. Any exception, including a non-success HTTP
        status raised by ``raise_for_status()``, triggers the next attempt
        after sleeping ``retry_delay * attempt_number`` seconds.

        Args:
            url: Request URL.
            **kwargs: Extra arguments forwarded to ``httpx.AsyncClient.get``.

        Returns:
            The successful HTTP response.

        Raises:
            NetworkError: If the client is not started, or the final attempt
                fails. The original exception is chained as the cause.
        """
        if not self._client:
            raise NetworkError("HTTP客户端未启动,请先调用start()方法")

        # A max_retries of 0 (or less) must still produce one attempt;
        # otherwise the loop would be skipped and None returned silently.
        attempts = max(1, self.request_config.max_retries)

        for attempt in range(attempts):
            try:
                response = await self._client.get(url, **kwargs)
                response.raise_for_status()
                return response
            except Exception as e:
                if attempt < attempts - 1:
                    # Linear back-off before the next attempt.
                    await asyncio.sleep(self.request_config.retry_delay * (attempt + 1))
                    continue

                # Final attempt failed: map onto NetworkError with a message
                # that reflects the failure category.
                if isinstance(e, httpx.TimeoutException):
                    raise NetworkError(f"请求超时: {str(e)}") from e
                if isinstance(e, httpx.NetworkError):
                    raise NetworkError(f"网络错误: {str(e)}") from e
                if isinstance(e, httpx.HTTPStatusError):
                    raise NetworkError(f"HTTP错误: {e.response.status_code} - {str(e)}") from e
                raise NetworkError(f"请求失败: {str(e)}") from e
@asynccontextmanager
async def get_browser_client(browser_config: BrowserConfig, request_config: RequestConfig):
    """Async context manager that yields a started ``BrowserClient``.

    Args:
        browser_config: Browser settings passed to ``BrowserClient``.
        request_config: Request settings passed to ``BrowserClient``.

    Yields:
        The started ``BrowserClient`` instance; it is stopped on exit.
    """
    browser_client = BrowserClient(browser_config, request_config)
    try:
        # start() sits inside the try so that stop() is also invoked when
        # startup itself raises.
        await browser_client.start()
        yield browser_client
    finally:
        await browser_client.stop()
@asynccontextmanager
async def get_http_client(request_config: RequestConfig):
    """Async context manager that yields a started ``HttpClient``.

    Args:
        request_config: Request settings passed to ``HttpClient``.

    Yields:
        The started ``HttpClient`` instance; it is stopped on exit.
    """
    http_client = HttpClient(request_config)
    try:
        # start() sits inside the try so that stop() is also invoked when
        # startup itself raises.
        await http_client.start()
        yield http_client
    finally:
        await http_client.stop()
async def fetch_wechat_article(url: str, browser_config: BrowserConfig,
                               request_config: RequestConfig) -> str:
    """Convenience wrapper: fetch one WeChat official-account article.

    Opens a browser client via ``get_browser_client`` for this single call,
    fetches the article through it, and lets the context manager stop the
    client afterwards.

    Args:
        url: Article URL.
        browser_config: Browser settings.
        request_config: Request settings.

    Returns:
        The article page HTML.
    """
    async with get_browser_client(browser_config, request_config) as browser_client:
        article_html = await browser_client.fetch_wechat_article(url)
    return article_html