"""Search service implementation for SearXNG integration."""

import asyncio
import json
import logging
from typing import Any
import aiohttp
from fastmcp import Context
from src.config import get_settings
from src.core import MCPToolError
from src.utils.type_guards import is_valid_url
from .crawling import process_urls_for_mcp

logger = logging.getLogger(__name__)
settings = get_settings()
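
# Settings consumed here (attribute names from usage below; environment
# variable spellings depend on src.config -- SEARXNG_URL is the one named
# in the error message below, the rest are assumed analogous):
#   settings.searxng_url             -- base URL of the SearXNG instance
#   settings.searxng_default_engines -- optional comma-separated engine list
#   settings.searxng_user_agent      -- User-Agent header for search requests
#   settings.searxng_timeout         -- total request timeout in seconds

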
async def search_and_process(
ctx: Context,
query: str,
return_raw_markdown: bool = False,
num_results: int = 6,
batch_size: int = 20,
) -> str:
"""
Perform search using SearXNG and process results.
Args:
ctx: FastMCP context
query: Search query
return_raw_markdown: Return raw markdown instead of storing
num_results: Number of results to return
batch_size: Batch size for processing
Returns:
JSON string with results
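
    Example (illustrative; assumes a reachable SearXNG instance and a
    FastMCP-provided ctx):
        result_json = await search_and_process(ctx, "fastmcp documentation")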
"""
    if not settings.searxng_url:
        msg = "SearXNG URL not configured. Please set SEARXNG_URL in your environment."
        raise MCPToolError(msg)
try:
# Perform SearXNG search
search_results = await _search_searxng(query, num_results)
if not search_results:
return json.dumps(
{
"success": False,
"message": "No search results found",
"results": [],
},
)
# Extract URLs from search results
urls = [result["url"] for result in search_results]
# Process URLs with process_urls_for_mcp
crawl_result = await process_urls_for_mcp(
ctx=ctx,
urls=urls,
batch_size=batch_size,
return_raw_markdown=return_raw_markdown,
)
# Parse crawl result
crawl_data = json.loads(crawl_result)
# Combine search metadata with crawl results
combined_results = []
for i, search_result in enumerate(search_results):
combined_result = {
"title": search_result["title"],
"url": search_result["url"],
"snippet": search_result.get("snippet", ""),
}
            # Pair search and crawl results by index; this assumes
            # process_urls_for_mcp returns results in input-URL order.
            if crawl_data.get("success") and i < len(crawl_data.get("results", [])):
crawl_info = crawl_data["results"][i]
if return_raw_markdown:
combined_result["markdown"] = crawl_info.get("markdown", "")
else:
combined_result["stored"] = crawl_info.get("success", False)
combined_result["chunks"] = crawl_info.get("chunks_stored", 0)
combined_results.append(combined_result)
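        # Illustrative shape of the returned JSON (values are examples):
        # {
        #   "success": true,
        #   "query": "example query",
        #   "total_results": 1,
        #   "results": [{"title": "...", "url": "https://...",
        #                "snippet": "...", "stored": true, "chunks": 4}]
        # }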
return json.dumps(
{
"success": True,
"query": query,
"total_results": len(combined_results),
"results": combined_results,
},
)
    except Exception as e:
        logger.exception("Error in search_and_process")
        msg = f"Search processing failed: {e!s}"
        raise MCPToolError(msg) from e


async def _search_searxng(query: str, num_results: int) -> list[dict[str, Any]]:
"""
Search using SearXNG instance.
Args:
query: Search query
num_results: Number of results to return
Returns:
List of search results
"""
# Use type guard for better type narrowing
if not is_valid_url(settings.searxng_url):
logger.error("SearXNG URL is not configured or invalid")
return []
# mypy now knows searxng_url is str
searxng_url = settings.searxng_url.rstrip("/")
search_url = f"{searxng_url}/search"
params: dict[str, str | int] = {
"q": query,
"format": "json",
"categories": "general",
"engines": settings.searxng_default_engines or "",
"safesearch": "1",
"limit": num_results,
}
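    # X-Forwarded-For / X-Real-IP present the request as local; some SearXNG
    # bot-detection setups rate-limit or reject non-local clients. Adjust for
    # your deployment (assumption based on common SearXNG configurations).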
headers = {
"User-Agent": settings.searxng_user_agent,
"Accept": "application/json",
"X-Forwarded-For": "127.0.0.1",
"X-Real-IP": "127.0.0.1",
}
try:
async with (
aiohttp.ClientSession() as session,
session.get(
search_url,
params=params,
headers=headers,
timeout=aiohttp.ClientTimeout(total=settings.searxng_timeout),
) as response,
):
if response.status != 200:
logger.error("SearXNG returned status %s", response.status)
return []
data = await response.json()
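            # Typical SearXNG JSON payload, trimmed for illustration; field
            # availability varies by engine:
            # {
            #   "query": "...",
            #   "results": [
            #     {"url": "https://...", "title": "...", "content": "...",
            #      "engine": "duckduckgo", "score": 1.0}
            #   ]
            # }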
# Parse JSON results
results = []
for item in data.get("results", [])[:num_results]:
result = {
"title": item.get("title", ""),
"url": item.get("url", ""),
"snippet": item.get("content", "") or item.get("snippet", ""),
}
if result.get("url"):
results.append(result)
return results
    except asyncio.TimeoutError:
        logger.error("SearXNG request timed out after %ss", settings.searxng_timeout)
        return []
    except aiohttp.ClientError as e:
        # Covers connection and HTTP-level failures raised by aiohttp; the
        # custom FetchError/SearchError types are never raised in this block.
        logger.error("Failed to fetch from SearXNG: %s", e)
        return []
    except Exception:
        logger.exception("Unexpected error in SearXNG search")
return []