add_from_url
Fetch content from any URL, convert it to markdown, and add it to your knowledge base for search and retrieval.
Instructions
Fetch content from a URL and add it to the knowledge base.
Fetches the page, strips HTML, converts to markdown, and indexes.
Args:
url: URL to fetch content from
category: Document category (default: general)
title: Optional title for the document (auto-detected if not provided)
Returns:
JSON string with indexing results

Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | URL to fetch content from | |
| category | No | Document category | general |
| title | No | Optional title for the document (auto-detected if not provided) | |
Implementation Reference
- mcp_server/server.py:1503-1528 (handler) — The MCP tool registration and handler for add_from_url.
@mcp.tool() def add_from_url(url: str, category: str = "general", title: str = None) -> str: """ Fetch content from a URL and add it to the knowledge base. Fetches the page, strips HTML, converts to markdown, and indexes. Args: url: URL to fetch content from category: Document category (default: general) title: Optional title for the document (auto-detected if not provided) Returns: JSON string with indexing results """ if not url or not url.strip(): return json.dumps({"status": "error", "message": "URL cannot be empty"}) orchestrator = get_orchestrator() result = orchestrator.add_from_url(url.strip(), category, title) if "error" in result: return json.dumps({"status": "error", "message": result["error"]}) return json.dumps({"status": "success", **result}, indent=2) - mcp_server/server.py:1069-1103 (handler)The logic implementation of add_from_url inside the KnowledgeOrchestrator class.
def add_from_url(self, url: str, category: str, title: str = None) -> Dict[str, Any]:
    """Fetch URL content, convert to markdown, and add to knowledge base.

    Args:
        url: http(s) URL to fetch.
        category: Document category; used as the destination subdirectory.
        title: Optional document title. When omitted, taken from the page's
            <title> tag, falling back to the last URL path segment.

    Returns:
        Result dict from add_document_from_content on success, or a
        ``{"error": ...}`` dict on validation/fetch failure.
    """
    import requests
    from bs4 import BeautifulSoup

    # Validate URL scheme (only http/https allowed)
    if not url.startswith(("http://", "https://")):
        return {"error": "Only http:// and https:// URLs are supported"}

    try:
        response = requests.get(url, timeout=30, headers={
            "User-Agent": "Mozilla/5.0 (knowledge-rag-ingester)"
        })
        response.raise_for_status()
    except Exception as e:
        # Best-effort fetch: report the failure instead of raising.
        return {"error": f"Failed to fetch URL: {e}"}

    soup = BeautifulSoup(response.text, "html.parser")
    # Remove non-content chrome before extracting text.
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    if not title:
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else url.split("/")[-1]

    text = soup.get_text(separator="\n", strip=True)
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    clean_text = f"# {title}\n\nSource: {url}\n\n" + "\n\n".join(lines)

    # Build a filesystem-safe slug for the destination filename.
    safe_title = re.sub(r'[^\w\s-]', '', title).strip().replace(' ', '-').lower()[:60]
    # Guard: a title with no safe characters would otherwise yield ".md".
    filename = f"{safe_title}.md" if safe_title else "untitled.md"
    # BUG FIX: the original wrote a literal "(unknown)" path segment and
    # never used the computed `filename`; store under the real filename.
    filepath = f"{category}/{filename}"
    return self.add_document_from_content(clean_text, filepath, category)