get_url_content_direct
Fetch webpage content and metadata directly from any URL via an HTTP request, returning cleaned text suitable for analysis or processing.
Instructions
Get webpage content directly using HTTP request
Args:
url (str): The URL to fetch content from
Returns:
str: The webpage content and metadata
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The URL to fetch content from | (none) |
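The JSON form of this schema is generated by the framework from the handler's type hints and docstring. A rough reconstruction, written as a Python dict because the exact output varies by MCP SDK version (the field shapes here are an assumption, not taken from mcp2brave.py):

```python
# Approximate input schema as a Python dict; the exact JSON the MCP
# framework emits depends on its version (assumption).
input_schema = {
    "type": "object",
    "properties": {
        "url": {
            "type": "string",
            "description": "The URL to fetch content from",
        },
    },
    "required": ["url"],
}
```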
Implementation Reference
- mcp2brave.py:428-438 (handler): The MCP tool handler for get_url_content_direct, decorated with @mcp.tool(). It defines the input schema via type hints and the docstring, and delegates the core logic to the internal _get_url_content_direct helper.

```python
@mcp.tool()
def get_url_content_direct(url: str) -> str:
    """Get webpage content directly using HTTP request

    Args:
        url (str): The URL to fetch content from

    Returns:
        str: The webpage content and metadata
    """
    return _get_url_content_direct(url)
```
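Because registration happens in the decorator, the wrapper stays thin: the MCP-facing contract (signature plus docstring) lives in the handler, while the fetching logic remains testable on its own. A minimal serving sketch, assuming mcp2brave.py builds its server with the official MCP Python SDK's FastMCP class (the server name and stdio entry point are assumptions):

```python
# Minimal serving sketch; assumes the official MCP Python SDK.
# The server name "mcp2brave" is an assumption, not confirmed by the source.
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("mcp2brave")

@mcp.tool()
def get_url_content_direct(url: str) -> str:
    """Get webpage content directly using HTTP request"""
    return _get_url_content_direct(url)  # helper shown below

if __name__ == "__main__":
    mcp.run()  # serves registered tools over stdio by default
```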
- mcp2brave.py:282-355 (helper): The core implementation of the tool logic: fetches the URL via requests, parses the HTML with BeautifulSoup, extracts the main text content, cleans it (drops lines of 30 characters or fewer, truncates to 1000 characters), prepends metadata, and returns the formatted string.

```python
def _get_url_content_direct(url: str) -> str:
    """Internal function to get content directly using requests"""
    try:
        logger.debug(f"Directly fetching content from URL: {url}")
        response = requests.get(url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        response.raise_for_status()

        # Try to detect the encoding
        if 'charset' in response.headers.get('content-type', '').lower():
            response.encoding = response.apparent_encoding

        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            # (note: BeautifulSoup treats '.advertisement' here as a literal
            # tag name, not a CSS class selector, so it never matches)
            for element in soup(['script', 'style', 'header', 'footer', 'nav',
                                 'aside', 'iframe', 'ad', '.advertisement']):
                element.decompose()

            # Try to find the main content area
            main_content = None
            possible_content_elements = [
                soup.find('article'),
                soup.find('main'),
                soup.find(class_='content'),
                soup.find(id='content'),
                soup.find(class_='post-content'),
                soup.find(class_='article-content'),
                soup.find(class_='entry-content'),
                soup.find(class_='main-content'),
                soup.select_one('div[class*="content"]'),  # any class containing "content"
            ]

            for element in possible_content_elements:
                if element:
                    main_content = element
                    break

            if not main_content:
                main_content = soup

            text = main_content.get_text(separator='\n')

            # Keep only substantive lines (longer than 30 characters)
            lines = []
            for line in text.split('\n'):
                line = line.strip()
                if line and len(line) > 30:
                    lines.append(line)

            cleaned_text = ' '.join(lines)

            # Truncate to 1000 characters, preferring a sentence boundary
            if len(cleaned_text) > 1000:
                end_pos = cleaned_text.rfind('. ', 0, 1000)
                if end_pos > 0:
                    cleaned_text = cleaned_text[:end_pos + 1]
                else:
                    cleaned_text = cleaned_text[:1000]

            metadata = f"URL: {url}\n"
            metadata += f"Content Length: {len(response.text)} characters\n"
            metadata += f"Content Type: {response.headers.get('content-type', 'Unknown')}\n"
            metadata += "---\n\n"

            return f"{metadata}{cleaned_text}"
        except Exception as e:
            logger.error(f"Error extracting text from HTML: {str(e)}")
            return f"Error extracting text: {str(e)}"
    except Exception as e:
        logger.error(f"Error fetching URL content directly: {str(e)}")
        return f"Error getting content: {str(e)}"
```
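Because the helper is ordinary Python, its output format can be checked without running an MCP server. A quick sketch, assuming mcp2brave.py and its module-level dependencies (requests, beautifulsoup4, the logger) are importable in the current environment; example.com is just a placeholder:

```python
# Smoke test of the helper's output shape (sketch; assumes mcp2brave.py
# is importable as a module, which is an assumption about the project layout).
from mcp2brave import _get_url_content_direct

result = _get_url_content_direct("https://example.com")
print(result)
# Expected shape:
# URL: https://example.com
# Content Length: <N> characters
# Content Type: text/html; charset=UTF-8
# ---
#
# <cleaned body text, truncated near 1000 characters at a sentence boundary>
```

Note that the truncation prefers a sentence boundary: rfind('. ', 0, 1000) finds the last full stop before the 1000-character cap and the code falls back to a hard cut only if none exists.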