web_extract
Extract structured information from any URL — title, description, and a body summary — and optionally retrieve the page's links.
Instructions
从网页 URL 提取结构化信息:标题、描述、正文摘要、所有链接。
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | 目标 URL | |
| extract_links | No | 是否提取页面链接(默认 false) | |
| timeout | No | 请求超时秒数 | 15 |
Implementation Reference
- The core handler function `_web_extract` that executes the web_extract tool logic: fetches a URL, extracts structured info (title, description, body summary, optional links) using httpx and BeautifulSoup.
async def _web_extract(args: dict) -> list[types.TextContent]: url = args["url"] extract_links = bool(args.get("extract_links", False)) timeout = int(args.get("timeout", 15)) try: import httpx from bs4 import BeautifulSoup async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client: resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"}) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") title = soup.title.string.strip() if soup.title and soup.title.string else "(无标题)" desc_tag = soup.find("meta", attrs={"name": "description"}) _content = desc_tag.get("content") if desc_tag else "" _content = _content[0] if isinstance(_content, list) else _content description = _content.strip() if _content else "(无描述)" # 正文摘要(取前 500 字) for tag in soup(["script", "style", "nav", "footer"]): tag.decompose() body_text = soup.get_text(separator=" ", strip=True)[:500] lines = [ f"🌐 **{title}**", f"URL: {url}", f"描述: {description}", f"\n**正文摘要:**\n{body_text}...", ] if extract_links: links = [] for a in soup.find_all("a", href=True)[:20]: href = str(a["href"]) text = a.get_text(strip=True) if href.startswith("http") and text: links.append(f"- [{text}]({href})") if links: lines.append(f"\n**页面链接(前 {len(links)} 条):**") lines.extend(links) return [types.TextContent(type="text", text="\n".join(lines))] except ImportError: return [types.TextContent(type="text", text="❌ 需要安装依赖: pip install httpx beautifulsoup4")] except Exception as e: return [types.TextContent(type="text", text=f"❌ 提取失败: {e}")] - Tool registration definition for 'web_extract' in the WEB_TOOLS list, including inputSchema (url required, optional extract_links and timeout).
types.Tool( name="web_extract", description="从网页 URL 提取结构化信息:标题、描述、正文摘要、所有链接。", inputSchema={ "type": "object", "properties": { "url": {"type": "string", "description": "目标 URL"}, "extract_links": { "type": "boolean", "description": "是否提取页面链接(默认 false)", "default": False, }, "timeout": {"type": "integer", "default": 15}, }, "required": ["url"], }, ), - src/onion_mcp_server/tools/web.py:78-92 (registration)The `handle_web` dispatch function that routes 'web_extract' calls to `_web_extract`.
async def handle_web(name: str, arguments: dict) -> list[types.TextContent]: if name == "web_fetch": return await _web_fetch(arguments) elif name == "web_search": return await _web_search(arguments) elif name == "web_extract": return await _web_extract(arguments) raise ValueError(f"未知 web 工具: {name}") async def _web_fetch(args: dict) -> list[types.TextContent]: url = args["url"] max_len = int(args.get("max_len", 5000)) timeout = int(args.get("timeout", 15)) try: - src/onion_mcp_server/server.py:58-59 (registration)Server-level registration: maps each WEB_TOOLS tool name (including 'web_extract') to the handle_web dispatcher via _HANDLERS dict.
for _t in WEB_TOOLS: _HANDLERS[_t.name] = handle_web - Re-exports WEB_TOOLS and handle_web from the tools package.
from onion_mcp_server.tools.ai import AI_TOOLS, handle_ai from onion_mcp_server.tools.code import CODE_TOOLS, handle_code from onion_mcp_server.tools.text import TEXT_TOOLS, handle_text from onion_mcp_server.tools.data import DATA_TOOLS, handle_data from onion_mcp_server.tools.web import WEB_TOOLS, handle_web from onion_mcp_server.tools.system import SYSTEM_TOOLS, handle_system __all__ = [ "AI_TOOLS", "handle_ai", "CODE_TOOLS", "handle_code", "TEXT_TOOLS", "handle_text", "DATA_TOOLS", "handle_data", "WEB_TOOLS", "handle_web", "SYSTEM_TOOLS", "handle_system", ]