"""
文章内容读取工具
通过 Jina AI Reader API 将 URL 转换为 LLM 友好的 Markdown 格式。
支持单篇和批量读取,内置速率限制和并发控制。
"""
import time
from typing import Dict, List
import requests
from ..utils.errors import MCPError, InvalidParameterError
# Jina Reader configuration
JINA_READER_BASE = "https://r.jina.ai"
DEFAULT_TIMEOUT = 30 # default per-request timeout, in seconds
MAX_BATCH_SIZE = 5 # maximum number of articles processed by one batch call
BATCH_INTERVAL = 5.0 # minimum gap between consecutive requests, in seconds
class ArticleReaderTools:
    """Fetch article content as Markdown through the Jina AI Reader API.

    Requests issued by one instance are spaced at least ``BATCH_INTERVAL``
    seconds apart. The public methods report failures through a
    ``{"success": False, "error": {...}}`` dictionary rather than raising
    ordinary exceptions.
    """

    def __init__(self, project_root: str = None, jina_api_key: str = None):
        """Initialize the reader.

        Args:
            project_root: Project root directory; stored on the instance.
            jina_api_key: Optional Jina API key. When set, it is sent as a
                bearer token with every request.
        """
        self.project_root = project_root
        self.jina_api_key = jina_api_key
        # Monotonic timestamp of the most recent request. Negative infinity
        # means "no request yet", so the first call is never delayed (the
        # reference point of the monotonic clock is undefined, so 0.0 would
        # not be a safe sentinel).
        self._last_request_time = float("-inf")

    def _build_headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every Reader request."""
        headers = {
            "Accept": "text/markdown",
            "X-Return-Format": "markdown",
            "X-No-Cache": "true",
        }
        if self.jina_api_key:
            headers["Authorization"] = f"Bearer {self.jina_api_key}"
        return headers

    def _throttle(self):
        """Sleep as needed so requests are >= ``BATCH_INTERVAL`` seconds apart."""
        # A monotonic clock is used so that wall-clock adjustments (NTP sync,
        # manual changes) can neither skip the wait nor stretch it.
        elapsed = time.monotonic() - self._last_request_time
        if elapsed < BATCH_INTERVAL:
            time.sleep(BATCH_INTERVAL - elapsed)
        self._last_request_time = time.monotonic()

    def read_article(
        self,
        url: str,
        timeout: int = DEFAULT_TIMEOUT
    ) -> Dict:
        """Read a single article and return its content as Markdown.

        Args:
            url: Article link; must be a string starting with ``http://`` or
                ``https://``.
            timeout: Request timeout in seconds (defaults to
                ``DEFAULT_TIMEOUT``).

        Returns:
            On success ``{"success": True, "data": {...}}`` with the keys
            ``url``, ``content``, ``format`` and ``content_length``.
            On failure ``{"success": False, "error": {...}}``; the error
            ``code`` is ``RATE_LIMITED``, ``FETCH_FAILED``, ``TIMEOUT``,
            ``REQUEST_ERROR``, or whatever ``MCPError.to_dict()`` yields for
            an invalid parameter.
        """
        try:
            # Reject non-string input here so it is reported as an invalid
            # parameter instead of surfacing as a generic AttributeError.
            is_http_url = isinstance(url, str) and url.startswith(
                ("http://", "https://")
            )
            if not is_http_url:
                raise InvalidParameterError(
                    f"无效的 URL: {url}",
                    suggestion="URL 必须以 http:// 或 https:// 开头"
                )

            # Validation happens before throttling so bad input fails fast.
            self._throttle()
            response = requests.get(
                f"{JINA_READER_BASE}/{url}",
                headers=self._build_headers(),
                timeout=timeout
            )

            if response.status_code == 200:
                # Bind the body once; it is needed for both content and length.
                content = response.text
                return {
                    "success": True,
                    "data": {
                        "url": url,
                        "content": content,
                        "format": "markdown",
                        "content_length": len(content)
                    }
                }

            if response.status_code == 429:
                return {
                    "success": False,
                    "error": {
                        "code": "RATE_LIMITED",
                        "message": "Jina Reader 速率限制,请稍后重试",
                        # Included for consistency with the other error
                        # payloads, which all identify the failing URL.
                        "url": url,
                        "suggestion": "免费限制: 100 RPM / 2 并发,可配置 API Key 提升限额"
                    }
                }

            return {
                "success": False,
                "error": {
                    "code": "FETCH_FAILED",
                    "message": f"HTTP {response.status_code}: {response.reason}",
                    "url": url
                }
            }
        except requests.Timeout:
            return {
                "success": False,
                "error": {
                    "code": "TIMEOUT",
                    "message": f"请求超时({timeout}秒)",
                    "url": url,
                    "suggestion": "可尝试增加 timeout 参数"
                }
            }
        except MCPError as e:
            return {"success": False, "error": e.to_dict()}
        except Exception as e:
            # Boundary handler: any other failure (connection errors, etc.)
            # is converted to an error payload rather than propagated.
            return {
                "success": False,
                "error": {
                    "code": "REQUEST_ERROR",
                    "message": str(e),
                    "url": url
                }
            }

    def read_articles_batch(
        self,
        urls: List[str],
        timeout: int = DEFAULT_TIMEOUT
    ) -> Dict:
        """Read several articles sequentially, one throttled request each.

        Only the first ``MAX_BATCH_SIZE`` URLs are processed; the rest are
        counted as skipped. A failure of one article does not stop the batch.

        Args:
            urls: List of article links. A single string is accepted and
                treated as a one-element list.
            timeout: Per-article request timeout in seconds.

        Returns:
            ``{"success": True, "summary": {...}, "articles": [...],
            "note": ...}`` where each entry of ``articles`` carries
            ``index`` (1-based), ``url``, ``success``, ``data`` and
            ``error``. If the input is empty or an unexpected error occurs,
            ``{"success": False, "error": {...}}`` is returned instead.
        """
        try:
            if not urls:
                raise InvalidParameterError(
                    "URL 列表不能为空",
                    suggestion="请提供至少一个 URL"
                )
            # Slicing a bare string would iterate it character by character;
            # wrap it so it is handled as one URL.
            if isinstance(urls, str):
                urls = [urls]

            # Cap the batch at MAX_BATCH_SIZE articles.
            actual_urls = urls[:MAX_BATCH_SIZE]
            skipped = len(urls) - len(actual_urls)

            results = []
            for position, url in enumerate(actual_urls, start=1):
                result = self.read_article(url=url, timeout=timeout)
                results.append({
                    "index": position,
                    "url": url,
                    "success": result["success"],
                    "data": result.get("data"),
                    "error": result.get("error")
                })

            succeeded = sum(1 for item in results if item["success"])
            failed = len(results) - succeeded

            if skipped > 0:
                note = f"已跳过 {skipped} 篇(单次上限 {MAX_BATCH_SIZE} 篇)"
            else:
                note = None

            return {
                "success": True,
                "summary": {
                    "description": "批量文章读取结果",
                    "requested": len(urls),
                    "processed": len(actual_urls),
                    "succeeded": succeeded,
                    "failed": failed,
                    "skipped": skipped,
                    "interval_seconds": BATCH_INTERVAL,
                },
                "articles": results,
                "note": note
            }
        except MCPError as e:
            return {"success": False, "error": e.to_dict()}
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "BATCH_ERROR",
                    "message": str(e)
                }
            }