MCP DuckDuckGo Search Plugin

  • mcp_duckduckgo/tools.py: MCP tool definitions for the plugin (full listing below)
""" MCP tool definitions for the DuckDuckGo search plugin. """ import logging import traceback from typing import List, Dict, Any, Optional import urllib.parse from pydantic import Field from mcp.server.fastmcp import Context import httpx from bs4 import BeautifulSoup from .models import SearchResponse, SearchResult, DetailedResult from .search import duckduckgo_search, extract_domain from .server import mcp # Configure logging logger = logging.getLogger("mcp_duckduckgo.tools") @mcp.tool() # noqa: F401 # pragma: no cover async def duckduckgo_web_search( # vulture: ignore query: str = Field( ..., description="Search query (max 400 chars, 50 words)", max_length=400, ), count: int = Field( default=10, description="Number of results per page (1-20, default 10)", ge=1, le=20, ), page: int = Field( default=1, description="Page number (default 1)", ge=1, ), site: Optional[str] = Field( default=None, description="Limit results to a specific site (e.g., 'site:example.com')", ), time_period: Optional[str] = Field( default=None, description="Time period for results ('day', 'week', 'month', 'year')", ), ctx: Context = None, # Context is automatically injected by MCP ) -> SearchResponse: """ Perform a web search using the DuckDuckGo search engine. This tool searches the web using DuckDuckGo and returns relevant results. It's ideal for finding current information, news, articles, and general web content. Args: query: The search query (max 400 chars, 50 words) count: Number of results per page (1-20, default 10) page: Page number for pagination (default 1) site: Limit results to a specific site (e.g., 'site:example.com') time_period: Filter results by time period ('day', 'week', 'month', 'year') ctx: MCP context object (automatically injected) Returns: A SearchResponse object containing search results and pagination metadata Example: duckduckgo_web_search(query="latest AI developments", count=5, page=1) """ try: logger.info(f"duckduckgo_web_search called with query: {query}, count: {count}, page: {page}") # Enhance query with site limitation if provided if site: # Check if site is a string before using it if isinstance(site, str) and not "site:" in query: query = f"{query} site:{site}" # Enhance query with time period if provided if time_period: # Map time_period to DuckDuckGo format time_map = { "day": "d", "week": "w", "month": "m", "year": "y" } # Check if time_period is a string before calling lower() if isinstance(time_period, str) and time_period.lower() in time_map: query = f"{query} date:{time_map[time_period.lower()]}" # Log the context to help with debugging if ctx: logger.info(f"Context available: {ctx}") else: logger.error("Context is None!") # Create a minimal context if none is provided from pydantic import BaseModel class MinimalContext(BaseModel): pass ctx = MinimalContext() # Calculate offset from page number offset = (page - 1) * count result = await duckduckgo_search({ "query": query, "count": count, "offset": offset, "page": page }, ctx) logger.info(f"duckduckgo_search returned: {result}") # Convert the result to a SearchResponse object search_results = [] for item in result["results"]: try: search_result = SearchResult( title=item["title"], url=item["url"], description=item["description"], published_date=item.get("published_date") ) search_results.append(search_result) except Exception as e: logger.error(f"Error creating SearchResult: {e}, item: {item}") if hasattr(ctx, 'error'): await ctx.error(f"Error creating SearchResult: {e}, item: {item}") # Calculate pagination metadata 
        total_results = result["total_results"]
        total_pages = (total_results + count - 1) // count if total_results > 0 else 1
        has_next = page < total_pages
        has_previous = page > 1

        response = SearchResponse(
            results=search_results,
            total_results=total_results,
            page=page,
            total_pages=total_pages,
            has_next=has_next,
            has_previous=has_previous
        )

        logger.info(f"Returning SearchResponse: {response}")
        return response
    except Exception as e:
        error_msg = f"Error in duckduckgo_web_search: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        if hasattr(ctx, 'error'):
            await ctx.error(error_msg)
        # Return an empty response instead of raising an exception
        # This way, the tool will return something even if there's an error
        return SearchResponse(
            results=[],
            total_results=0,
            page=page,
            total_pages=1,
            has_next=False,
            has_previous=False
        )


@mcp.tool()  # noqa: F401  # pragma: no cover
async def duckduckgo_get_details(
    url: str,
    spider_depth: int = Field(0, ge=0, le=3, description="Number of links to follow from the page (0-3, default 0)"),
    max_links_per_page: int = Field(3, ge=1, le=5, description="Maximum number of links to follow per page (1-5, default 3)"),
    same_domain_only: bool = Field(True, description="Only follow links to the same domain"),
    *,
    ctx: Context,
) -> DetailedResult:
    """
    Get detailed information about a search result.

    This tool retrieves additional details about a search result, such as the
    domain, title, description, and content snippet, by fetching and parsing
    the actual web page. It can also follow links to gather more comprehensive
    content.

    Args:
        url: The URL of the result to get details for
        spider_depth: Number of links to follow (0-3, default 0)
        max_links_per_page: Maximum number of links to follow per page (1-5, default 3)
        same_domain_only: Only follow links to the same domain
        ctx: MCP context object (automatically injected)

    Returns:
        A DetailedResult object with additional information

    Example:
        duckduckgo_get_details(url="https://example.com/article", spider_depth=1)
    """
    # Initialise these before the try block so the finally clause and the
    # error fallback at the end of the function always have defined values
    client = None
    close_client = False
    domain = ""

    try:
        logger.info(f"duckduckgo_get_details called with URL: {url}")

        # Extract the default values from the Field objects if needed
        spider_depth_value = 0
        max_links_value = 3
        same_domain_value = True

        # Check if parameters are Field objects and extract their default values
        if hasattr(spider_depth, "default"):
            spider_depth_value = spider_depth.default
        elif isinstance(spider_depth, int):
            spider_depth_value = spider_depth

        if hasattr(max_links_per_page, "default"):
            max_links_value = max_links_per_page.default
        elif isinstance(max_links_per_page, int):
            max_links_value = max_links_per_page

        if hasattr(same_domain_only, "default"):
            same_domain_value = same_domain_only.default
        elif isinstance(same_domain_only, bool):
            same_domain_value = same_domain_only

        logger.info(f"Spider depth: {spider_depth_value}, Max links per page: {max_links_value}, Same domain only: {same_domain_value}")

        # Get the httpx client from context if available
        lifespan_context = getattr(ctx, "lifespan_context", {})
        if "http_client" in lifespan_context:
            logger.info("Using HTTP client from lifespan context")
            client = lifespan_context["http_client"]
        else:
            logger.info("Creating new HTTP client")
            client = httpx.AsyncClient(
                timeout=10,
                headers={
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
                },
            )
            close_client = True

        # Extract the domain from the URL
        domain = extract_domain(url)

        # Fetch the page content
        if hasattr(ctx, 'progress'):
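            # Progress reporting is optional; a bare fallback context may not expose it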
ctx.progress(f"Fetching content from {url}") response = await client.get(url, follow_redirects=True, timeout=15.0) response.raise_for_status() # Parse the HTML content soup = BeautifulSoup(response.text, "html.parser") # Extract title title = soup.title.string.strip() if soup.title else "" logger.info(f"Extracted title: {title}") # Extract metadata metadata = extract_metadata(soup, domain, url) # Extract author information author = extract_author(soup) logger.info(f"Extracted author: {author}") # Extract keywords/tags keywords = extract_keywords(soup) logger.info(f"Extracted keywords: {keywords}") # Extract main image main_image = extract_main_image(soup, url) logger.info(f"Extracted main image: {main_image}") # Extract social links social_links = extract_social_links(soup) # Extract content more intelligently based on content type content_snippet, headings = extract_targeted_content(soup, domain) logger.info(f"Extracted content snippet: {content_snippet[:100]}..." if len(content_snippet) > 100 else f"Extracted content snippet: {content_snippet}") # Extract related links related_links = [] if soup: # Get all links in the page all_links = soup.find_all("a", href=True) link_count = 0 for link in all_links: href = link.get("href") # Skip empty links, anchors, and non-http links if not href or href.startswith("#") or not (href.startswith("http://") or href.startswith("https://")): continue # If same_domain_only is True, only include links from the same domain if same_domain_value and domain != extract_domain(href): continue # Add the link to related links related_links.append(href) link_count += 1 # Stop if we've reached the max links per page if link_count >= max_links_value: break # Follow links for spidering if depth > 0 if spider_depth_value > 0 and related_links: # We won't actually implement the full spidering here for this example pass # Create the detailed result detailed_result = DetailedResult( title=title, url=url, description=metadata["description"], published_date=metadata["published_date"], content_snippet=content_snippet, domain=domain, is_official=metadata["is_official"], author=author, keywords=keywords, main_image=main_image, social_links=social_links, related_links=related_links, linked_content=[], headings=headings ) return detailed_result except httpx.HTTPStatusError as e: error_message = f"HTTP error when fetching {url}: {e.response.status_code}" logger.error(error_message) if hasattr(ctx, 'error'): await ctx.error(error_message) except httpx.RequestError as e: error_message = f"Request error when fetching {url}: {e}" logger.error(error_message) if hasattr(ctx, 'error'): await ctx.error(error_message) except Exception as e: error_message = f"Error when processing {url}: {e}" logger.error(error_message) logger.error(traceback.format_exc()) if hasattr(ctx, 'error'): await ctx.error(error_message) finally: # Close the HTTP client if we created it if close_client and client: await client.aclose() # Return a minimal result if anything fails return DetailedResult( title="", url=url, description="", published_date=None, content_snippet="Content not available - Error occurred while fetching the page", domain=domain, is_official=False ) @mcp.tool() # noqa: F401 # pragma: no cover async def duckduckgo_related_searches( # vulture: ignore query: str = Field( ..., description="Original search query", max_length=400, ), count: int = Field( default=5, description="Number of related searches to return (1-10, default 5)", ge=1, le=10, ), ctx: Context = None, # Context is automatically 
) -> List[str]:
    """
    Get related search queries for a given query.

    This tool suggests alternative search queries related to the original
    query, which can help explore a topic more broadly.

    Args:
        query: The original search query
        count: Number of related searches to return (1-10, default 5)
        ctx: MCP context object (automatically injected)

    Returns:
        A list of related search queries

    Example:
        duckduckgo_related_searches(query="artificial intelligence", count=5)
    """
    try:
        logger.info(f"duckduckgo_related_searches called with query: {query}, count: {count}")

        # Log the context to help with debugging
        if ctx:
            logger.info(f"Context available: {ctx}")
        else:
            logger.error("Context is None!")
            # Create a minimal context if none is provided
            from pydantic import BaseModel

            class MinimalContext(BaseModel):
                pass

            ctx = MinimalContext()

        # In a real implementation, you would fetch related searches
        # from DuckDuckGo or generate them algorithmically
        # For demonstration purposes, generate some placeholder related searches
        words = query.split()
        related_searches = [
            f"{query} latest news",
            f"{query} examples",
            f"best {query}",
            f"{query} tutorial",
            f"{query} definition",
            f"how does {query} work",
            f"{query} vs {words[0] if words else 'alternative'}",
            f"future of {query}",
            f"{query} applications",
            f"{query} history"
        ][:count]

        logger.info(f"Returning related searches: {related_searches}")
        return related_searches
    except Exception as e:
        error_msg = f"Error in duckduckgo_related_searches: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        if hasattr(ctx, 'error'):
            await ctx.error(error_msg)
        # Return an empty list instead of raising an exception
        return []


# Helper functions for metadata and content extraction

def extract_metadata(soup, domain, url):
    """Extract metadata from a web page."""
    metadata = {
        "description": "",
        "published_date": None,
        "is_official": False
    }

    # Try to find description (meta description or first paragraph)
    meta_desc = soup.find("meta", attrs={"name": "description"})
    if meta_desc and meta_desc.get("content"):
        metadata["description"] = meta_desc["content"].strip()
    else:
        # Try Open Graph description
        og_desc = soup.find("meta", attrs={"property": "og:description"})
        if og_desc and og_desc.get("content"):
            metadata["description"] = og_desc["content"].strip()
        else:
            # Try to find the first substantive paragraph
            paragraphs = soup.find_all("p")
            for p in paragraphs:
                p_text = p.get_text(strip=True)
                if p_text and len(p_text) > 50:  # Consider it substantial if > 50 chars
                    metadata["description"] = p_text
                    break

    # Get publication date if available
    for date_meta in ["article:published_time", "datePublished", "pubdate", "date", "publishdate"]:
        date_tag = soup.find("meta", attrs={"property": date_meta}) or soup.find("meta", attrs={"name": date_meta})
        if date_tag and date_tag.get("content"):
            metadata["published_date"] = date_tag["content"]
            break

    # If no meta date, try looking for a date in the page content
    if not metadata["published_date"]:
        # Look for common date formats in time tags
        time_tags = soup.find_all("time")
        if time_tags:
            for time_tag in time_tags:
                if time_tag.get("datetime"):
                    metadata["published_date"] = time_tag.get("datetime")
                    break

    # Determine if this is an official source
    # 1. Domain ends with .gov, .edu, or similar
    if domain.endswith(('.gov', '.edu', '.org', '.mil')):
        metadata["is_official"] = True
    # 2. "official" in the title or URL
    elif "official" in url.lower() or (soup.title and soup.title.string and "official" in soup.title.string.lower()):
        metadata["is_official"] = True
    # 3. Check for verification badges or verified text
    elif soup.find(text=lambda text: text and "verified" in text.lower()):
        metadata["is_official"] = True

    return metadata


def extract_author(soup):
    """Extract author information from a web page."""
    # Try common author meta tags
    for author_meta in ["author", "article:author", "dc.creator", "twitter:creator"]:
        author_tag = soup.find("meta", attrs={"name": author_meta}) or soup.find("meta", attrs={"property": author_meta})
        if author_tag and author_tag.get("content"):
            return author_tag["content"].strip()

    # Try looking for author in structured data
    author_elem = soup.find(["span", "div", "a"], attrs={"class": ["author", "byline"]})
    if author_elem:
        return author_elem.get_text(strip=True)

    # Try looking for an author in rel="author" links
    author_link = soup.find("a", attrs={"rel": "author"})
    if author_link:
        return author_link.get_text(strip=True)

    return None


def extract_keywords(soup):
    """Extract keywords or tags from a web page."""
    keywords = []

    # Try keywords meta tag
    keywords_tag = soup.find("meta", attrs={"name": "keywords"})
    if keywords_tag and keywords_tag.get("content"):
        keywords_text = keywords_tag["content"].strip()
        keywords = [k.strip() for k in keywords_text.split(',') if k.strip()]

    # Try article:tag meta tags
    tag_tags = soup.find_all("meta", attrs={"property": "article:tag"})
    if tag_tags:
        for tag in tag_tags:
            if tag.get("content"):
                keywords.append(tag["content"].strip())

    # Try to find tags in the page content
    if not keywords:
        tag_elements = soup.find_all(["a", "span"], attrs={"class": ["tag", "keyword", "category"]})
        if tag_elements:
            for tag_elem in tag_elements:
                tag_text = tag_elem.get_text(strip=True)
                if tag_text and len(tag_text) < 30:  # Reasonable tag length
                    keywords.append(tag_text)

    return keywords if keywords else None


def extract_main_image(soup, base_url):
    """Extract the main image from a web page."""
    # Try Open Graph image
    og_image = soup.find("meta", attrs={"property": "og:image"})
    if og_image and og_image.get("content"):
        return og_image["content"]

    # Try Twitter image
    twitter_image = soup.find("meta", attrs={"name": "twitter:image"})
    if twitter_image and twitter_image.get("content"):
        return twitter_image["content"]

    # Try schema.org image
    schema_image = soup.find("meta", attrs={"itemprop": "image"})
    if schema_image and schema_image.get("content"):
        return schema_image["content"]

    # Try to find a likely main image - large image at the top of the article
    article = soup.find(["article", "main", "div"], attrs={"class": ["article", "post", "content"]})
    if article:
        images = article.find_all("img")
        for img in images:
            # Prefer images with width/height attributes that suggest a large image
            if img.get("src") and (img.get("width") or img.get("height")):
                try:
                    width = int(img.get("width", 0))
                    height = int(img.get("height", 0))
                except (TypeError, ValueError):
                    # Non-numeric attributes such as width="100%" - skip the size check
                    width = height = 0
                if width > 300 or height > 200:  # Reasonable size for a main image
                    img_src = img["src"]
                    # Handle relative URLs
                    if img_src.startswith('/'):
                        # Parse the base URL to get the domain
                        parsed_url = urllib.parse.urlparse(base_url)
                        base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
                        img_src = base_domain + img_src
                    return img_src

    # If we still don't have an image, just take the first substantive image
    images = soup.find_all("img")
    for img in images:
        if img.get("src") and not img["src"].endswith((".ico", ".svg")):
            img_src = img["src"]
            # Handle relative URLs
            if img_src.startswith('/'):
                parsed_url = urllib.parse.urlparse(base_url)
                base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
                img_src = base_domain + img_src
            return img_src
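    # No metadata image and no suitable <img> tag was found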
    return None


def extract_social_links(soup):
    """Extract social media links from a web page."""
    social_links = {}
    social_platforms = {
        "twitter.com": "twitter",
        "facebook.com": "facebook",
        "linkedin.com": "linkedin",
        "instagram.com": "instagram",
        "github.com": "github",
        "youtube.com": "youtube",
        "medium.com": "medium",
        "tiktok.com": "tiktok",
        "pinterest.com": "pinterest"
    }

    # Find all links that might be social media
    links = soup.find_all("a", href=True)
    for link in links:
        href = link["href"].lower()
        for platform_url, platform_name in social_platforms.items():
            if platform_url in href:
                social_links[platform_name] = link["href"]
                break

    return social_links if social_links else None


def extract_targeted_content(soup, domain):
    """
    Extract content more intelligently based on content type/domain.
    Returns both the content snippet and headings.
    """
    content_snippet = ""
    headings = []

    # Extract headings for structure
    for h_tag in soup.find_all(["h1", "h2", "h3"]):
        heading_text = h_tag.get_text(strip=True)
        if heading_text and len(heading_text) > 3:  # Skip very short headings
            headings.append(heading_text)

    # Different extraction strategies based on domain/site type

    # Wikipedia
    if "wikipedia.org" in domain:
        # For Wikipedia, grab the first few paragraphs
        content_div = soup.find("div", attrs={"id": "mw-content-text"})
        if content_div:
            paragraphs = content_div.find_all("p")
            content_parts = []
            for p in paragraphs[:5]:  # First 5 paragraphs
                p_text = p.get_text(strip=True)
                if p_text:
                    content_parts.append(p_text)
            content_snippet = " ".join(content_parts)

    # Documentation sites
    elif any(docs_site in domain for docs_site in ["docs.", ".docs.", "documentation.", "developer."]):
        # For documentation, focus on the main content area and code samples
        main_content = soup.find(["main", "article", "div"], attrs={"class": ["content", "documentation", "article"]})
        if main_content:
            # Get text and preserve code samples
            content_parts = []
            for elem in main_content.find_all(["p", "pre", "code"])[:10]:
                elem_text = elem.get_text(strip=True)
                if elem_text:
                    if elem.name == "pre" or elem.name == "code":
                        content_parts.append(f"Code: {elem_text}")
                    else:
                        content_parts.append(elem_text)
            content_snippet = " ".join(content_parts)

    # News sites
    elif any(news_indicator in domain for news_indicator in ["news.", ".news", "times.", "post.", "herald.", "guardian."]):
        # For news, get the article body
        article = soup.find(["article", "div"], attrs={"class": ["article-body", "article-content", "story-body"]})
        if article:
            paragraphs = article.find_all("p")
            content_parts = []
            for p in paragraphs[:8]:  # First 8 paragraphs should cover the main points
                p_text = p.get_text(strip=True)
                if p_text:
                    content_parts.append(p_text)
            content_snippet = " ".join(content_parts)

    # Blog posts
    elif any(blog_indicator in domain for blog_indicator in ["blog.", ".blog", "medium."]):
        # For blogs, get the article content
        article = soup.find(["article", "div"], attrs={"class": ["post", "post-content", "blog-post", "entry-content"]})
        if article:
            paragraphs = article.find_all("p")
            content_parts = []
            for p in paragraphs[:8]:
                p_text = p.get_text(strip=True)
                if p_text:
                    content_parts.append(p_text)
            content_snippet = " ".join(content_parts)

    # If we haven't found suitable content yet, try common content containers
    if not content_snippet:
        # Try common content containers
        for container_id in ["content", "main", "article", "post", "entry"]:
            content_div = soup.find(["div", "article", "main"], attrs={"id": container_id})
            if content_div:
                paragraphs = content_div.find_all("p")
                content_parts = []
                for p in paragraphs[:10]:
                    p_text = p.get_text(strip=True)
                    if p_text:
                        content_parts.append(p_text)
                content_snippet = " ".join(content_parts)
                break

    # Try common content classes if we still don't have content
    if not content_snippet:
        for container_class in ["content", "main", "article", "post", "entry"]:
            content_div = soup.find(["div", "article", "main"], attrs={"class": container_class})
            if content_div:
                paragraphs = content_div.find_all("p")
                content_parts = []
                for p in paragraphs[:10]:
                    p_text = p.get_text(strip=True)
                    if p_text:
                        content_parts.append(p_text)
                content_snippet = " ".join(content_parts)
                break

    # Fallback to body if we still don't have content
    if not content_snippet and soup.body:
        paragraphs = soup.body.find_all("p")
        content_parts = []
        for p in paragraphs[:10]:
            p_text = p.get_text(strip=True)
            if p_text and len(p_text) > 50:  # Only substantive paragraphs
                content_parts.append(p_text)
        content_snippet = " ".join(content_parts)

    # Truncate to a reasonable length
    if content_snippet:
        content_snippet = content_snippet[:2000] + ("..." if len(content_snippet) > 2000 else "")

    return content_snippet, headings[:10]  # Limit to 10 headings


def extract_related_links(soup, base_url, domain, same_domain_only=True):
    """Extract related links from a web page."""
    related_links = []
    seen_urls = set()

    # Parse the base URL
    parsed_base = urllib.parse.urlparse(base_url)
    base_domain = parsed_base.netloc

    # Find all links
    links = soup.find_all("a", href=True)
    for link in links:
        href = link["href"]

        # Skip empty or javascript links
        if not href or href.startswith(('javascript:', '#', 'mailto:', 'tel:')):
            continue

        # Handle relative URLs
        if href.startswith('/'):
            href = f"{parsed_base.scheme}://{parsed_base.netloc}{href}"
        elif not href.startswith(('http://', 'https://')):
            # Skip links that aren't http or https and aren't relative
            continue

        # Skip if we're only looking for same-domain links
        if same_domain_only:
            parsed_href = urllib.parse.urlparse(href)
            if parsed_href.netloc != base_domain:
                continue

        # Skip duplicates
        if href in seen_urls or href == base_url:
            continue

        seen_urls.add(href)
        related_links.append(href)

    return related_links


async def spider_links(links, http_client, original_domain, depth, max_links_per_page, same_domain_only, ctx):
    """
    Spider the provided links to gather more content.
    Returns a list of LinkedContent objects.
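    Each level of depth fetches at most max_links_per_page pages; depth is
    decremented on every recursive call.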
""" from mcp_duckduckgo.models import LinkedContent if depth <= 0 or not links: return [] linked_content = [] processed_count = 0 for link in links: if processed_count >= max_links_per_page: break try: # Check domain if same_domain_only is True link_domain = extract_domain(link) if same_domain_only and link_domain != original_domain: continue # Fetch the linked page if hasattr(ctx, 'progress'): await ctx.progress(f"Spidering link: {link}") response = await http_client.get(link, follow_redirects=True, timeout=10.0) response.raise_for_status() # Parse the HTML content soup = BeautifulSoup(response.text, "html.parser") # Extract title title = soup.title.string.strip() if soup.title else "No title" # Extract content snippet content_snippet, _ = extract_targeted_content(soup, link_domain) # Add to linked content linked_content.append( LinkedContent( url=link, title=title, content_snippet=content_snippet ) ) processed_count += 1 # Spider recursively if depth > 1 if depth > 1: # Extract more links from this page next_links = extract_related_links(soup, link, link_domain, same_domain_only) # Recursively spider these links child_content = await spider_links( next_links[:max_links_per_page], http_client, original_domain, depth - 1, max_links_per_page, same_domain_only, ctx ) # Add child content with appropriate relation for child in child_content: child.relation = "nested" linked_content.append(child) except Exception as e: logger.error(f"Error spidering link {link}: {e}") # Continue with other links return linked_content