nexus_read
Extracts clean content from a URL, using focus modes to return either general article text or code-focused documentation (headers, code blocks, and tables).
Instructions
Reads a URL with intelligent parsing logic.
Args:
- url: The URL to visit.
- focus:
  - 'general' = Returns clean article text (Exa style).
  - 'code' = Returns only headers, code blocks, and tables (Ref style).
  - 'auto' = Detects if it's a doc site and switches to 'code' mode.

Returns:
- Parsed and cleaned content from the URL.
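As a quick orientation, here is a minimal sketch of calling the tool through a FastMCP client. The server script name and the `fastmcp` Client usage are assumptions based on the implementation reference below; adapt them to your deployment.

```python
# Minimal sketch: invoking nexus_read over MCP.
# ASSUMPTION: the server lives in nexus_server.py and the `fastmcp`
# package's Client is available; adjust to your actual setup.
import asyncio
from fastmcp import Client

async def main() -> None:
    async with Client("nexus_server.py") as client:
        # 'auto' (the default) switches to 'code' mode for doc-like URLs.
        result = await client.call_tool(
            "nexus_read",
            {"url": "https://docs.python.org/3/library/asyncio.html", "focus": "auto"},
        )
        print(result)

asyncio.run(main())
```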
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The URL to visit. | |
| focus | No | Parsing mode: 'auto', 'general', or 'code'. | auto |
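For concreteness, a valid arguments object for this schema might look like the following; the URL is purely illustrative.

```python
# Illustrative arguments for nexus_read (values are examples only).
args = {
    "url": "https://example.com/article",  # required; must start with http:// or https://
    "focus": "general",  # optional; one of "auto", "general", or "code" (default: "auto")
}
```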
Implementation Reference
- nexus_server.py:97-232 (handler): The complete implementation of the nexus_read tool handler, including registration via the @mcp.tool() decorator. It fetches the given URL, parses the HTML with BeautifulSoup, and extracts content based on the focus parameter: 'general' returns the full text, 'code' returns only headers, code blocks, and tables, and 'auto' detects the page type. The module-level names it references (logger, REQUEST_TIMEOUT, MAX_CONTENT_LENGTH, MIN_CODE_ELEMENTS_THRESHOLD) are defined elsewhere in nexus_server.py; a configuration sketch appears at the end of this section.

````python
@mcp.tool()
async def nexus_read(url: str, focus: str = "auto") -> str:
    """
    Reads a URL with intelligent parsing logic.

    Args:
        url: The URL to visit.
        focus: 'general' = Returns clean article text (Exa style).
               'code' = Returns only headers, code blocks, and tables (Ref style).
               'auto' = Detects if it's a doc site and switches to 'code' mode.

    Returns:
        Parsed and cleaned content from the URL.
    """
    logger.info(f"Read requested - URL: '{url}', Focus: {focus}")

    # Validate inputs
    if not url or not url.strip():
        error_msg = "URL cannot be empty"
        logger.error(error_msg)
        return f"Error: {error_msg}"

    if focus not in ["auto", "general", "code"]:
        error_msg = f"Invalid focus '{focus}'. Must be 'auto', 'general', or 'code'"
        logger.error(error_msg)
        return f"Error: {error_msg}"

    url = url.strip()

    # Validate URL format
    if not url.startswith(("http://", "https://")):
        error_msg = "URL must start with http:// or https://"
        logger.error(error_msg)
        return f"Error: {error_msg}"

    # Auto-detection logic
    original_focus = focus
    if focus == "auto":
        technical_indicators = ["docs", "api", "reference", "github", "guide", "documentation"]
        if any(ind in url.lower() for ind in technical_indicators):
            focus = "code"
            logger.debug(f"Auto-detected technical site, switching to code focus")
        else:
            focus = "general"
            logger.debug(f"Auto-detected general site, using general focus")

    async with httpx.AsyncClient(
        follow_redirects=True,
        headers={"User-Agent": "NexusMCP/1.0"},
        timeout=REQUEST_TIMEOUT
    ) as client:
        try:
            response = await client.get(url)
            response.raise_for_status()
            logger.debug(f"Successfully fetched URL: {url} (Status: {response.status_code})")

            soup = bs4.BeautifulSoup(response.text, 'html.parser')

            # Pre-cleaning (Remove junk common to all modes)
            for trash in soup(["script", "style", "nav", "footer", "iframe", "svg", "noscript"]):
                trash.decompose()

            output = []
            output.append(f"=== SOURCE: {url} ===")
            output.append(f"=== MODE: {focus.upper()} ===\n")

            if focus == "code":
                # REF MODE (High Signal / Low Noise)
                relevant_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'pre', 'code', 'table'])

                for tag in relevant_tags:
                    if tag.name in ['h1', 'h2', 'h3', 'h4']:
                        header_text = tag.get_text(strip=True)
                        if header_text:
                            output.append(f"\n## {header_text}")
                    elif tag.name == 'pre':
                        code_text = tag.get_text()
                        if code_text.strip():
                            output.append(f"```\n{code_text}\n```")
                    elif tag.name == 'code' and tag.parent.name != 'pre':
                        code_text = tag.get_text(strip=True)
                        if code_text:
                            output.append(f"`{code_text}`")
                    elif tag.name == 'table':
                        # Enhanced table extraction
                        try:
                            rows = tag.find_all('tr')
                            if rows:
                                output.append("\n[Table]")
                                for row in rows[:10]:  # Limit to first 10 rows
                                    cells = row.find_all(['td', 'th'])
                                    cell_texts = [cell.get_text(strip=True) for cell in cells]
                                    if cell_texts:
                                        output.append(" | ".join(cell_texts))
                        except Exception as table_error:
                            logger.warning(f"Table parsing failed: {table_error}")
                            output.append("\n[Table - parsing failed]")

                if len(output) < MIN_CODE_ELEMENTS_THRESHOLD:
                    fallback_msg = (
                        f"Code-focused extraction found minimal content ({len(output)} elements). "
                        "The page may not contain structured documentation. "
                        "Try focus='general' for better results."
                    )
                    logger.warning(f"Insufficient code elements found at {url}")
                    return fallback_msg
            else:
                # GENERAL MODE (Full Context)
                text = soup.get_text(separator='\n')
                lines = [line.strip() for line in text.split('\n') if line.strip()]
                output.append("\n".join(lines))

            result = "\n".join(output)[:MAX_CONTENT_LENGTH]
            logger.info(f"Read successful - Extracted {len(result)} characters from {url}")

            if len("\n".join(output)) > MAX_CONTENT_LENGTH:
                result += f"\n\n[Content truncated at {MAX_CONTENT_LENGTH} characters]"
                logger.debug(f"Content truncated for {url}")

            return result

        except httpx.TimeoutException:
            error_msg = f"Request timed out after {REQUEST_TIMEOUT}s"
            logger.error(f"Timeout reading {url}")
            return f"Error: {error_msg}"
        except httpx.HTTPStatusError as e:
            error_msg = f"HTTP error {e.response.status_code}: {e.response.reason_phrase}"
            logger.error(f"HTTP error reading {url}: {error_msg}")
            return f"Error: {error_msg}"
        except httpx.RequestError as e:
            error_msg = f"Network error: {str(e)}"
            logger.error(f"Network error reading {url}: {error_msg}")
            return f"Error: {error_msg}"
        except Exception as e:
            error_msg = f"Unexpected error reading URL: {str(e)}"
            logger.exception(f"Unexpected error reading {url}")
            return f"Error: {error_msg}"
````
- nexus_server.py:99-111 (schema): The docstring providing the input/output schema description for the nexus_read tool, used by FastMCP for tool schema generation.

```python
"""
Reads a URL with intelligent parsing logic.

Args:
    url: The URL to visit.
    focus: 'general' = Returns clean article text (Exa style).
           'code' = Returns only headers, code blocks, and tables (Ref style).
           'auto' = Detects if it's a doc site and switches to 'code' mode.

Returns:
    Parsed and cleaned content from the URL.
"""
```
- nexus_server.py:97-97 (registration): The @mcp.tool() decorator that registers the nexus_read function as an MCP tool in the FastMCP server.

```python
@mcp.tool()
```
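The handler references several module-level names whose definitions fall outside the quoted ranges. The following is a minimal configuration sketch, not the actual nexus_server.py source: the import paths and concrete values are assumptions; only the names logger, REQUEST_TIMEOUT, MAX_CONTENT_LENGTH, and MIN_CODE_ELEMENTS_THRESHOLD come from the handler code above.

```python
# Hypothetical module-level setup for nexus_server.py.
# ASSUMPTION: the FastMCP import path and all concrete values below are
# guesses; only the names are taken from the handler above.
import logging

import bs4    # BeautifulSoup, used for HTML parsing
import httpx  # async HTTP client
from fastmcp import FastMCP

logger = logging.getLogger("nexus")

REQUEST_TIMEOUT = 15.0           # seconds; passed to httpx.AsyncClient
MAX_CONTENT_LENGTH = 20_000      # characters; results beyond this are truncated
MIN_CODE_ELEMENTS_THRESHOLD = 5  # below this, 'code' mode returns a fallback message

mcp = FastMCP("Nexus")

# ... nexus_read and other tools are defined here via @mcp.tool() ...

if __name__ == "__main__":
    mcp.run()  # stdio transport by default
```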