# mcp-server-ollama-deep-researcher (Cam10001110101) - src/assistant
import os
import sys
from typing import Dict, Any

import requests
from langsmith import traceable
from tavily import TavilyClient


def deduplicate_and_format_sources(search_response, max_tokens_per_source, include_raw_content=False):
    """Format one or more search API responses into a single deduplicated list of sources.

    Takes either a single search response or a list of responses and limits each
    source's raw_content to approximately max_tokens_per_source tokens.

    Args:
        search_response: Either:
            - A dict with a 'results' key containing a list of search results
            - A list of dicts, each containing search results
        max_tokens_per_source (int): Approximate token budget for each source's raw content
        include_raw_content (bool): Whether to include the raw_content from Tavily in the formatted string

    Returns:
        str: Formatted string with deduplicated sources
    """
    # Convert input to a flat list of results
    if isinstance(search_response, dict):
        sources_list = search_response['results']
    elif isinstance(search_response, list):
        sources_list = []
        for response in search_response:
            if isinstance(response, dict) and 'results' in response:
                sources_list.extend(response['results'])
            else:
                sources_list.extend(response)
    else:
        raise ValueError("Input must be either a dict with 'results' or a list of search results")

    # Deduplicate by URL, keeping the first occurrence
    unique_sources = {}
    for source in sources_list:
        if source['url'] not in unique_sources:
            unique_sources[source['url']] = source

    # Format output
    formatted_text = "Sources:\n\n"
    for i, source in enumerate(unique_sources.values(), 1):
        formatted_text += f"Source {source['title']}:\n===\n"
        formatted_text += f"URL: {source['url']}\n===\n"
        formatted_text += f"Most relevant content from source: {source['content']}\n===\n"
        if include_raw_content:
            # Rough estimate of 4 characters per token
            char_limit = max_tokens_per_source * 4
            # Handle missing or None raw_content
            raw_content = source.get('raw_content', '')
            if raw_content is None:
                raw_content = ''
                print(f"Warning: No raw_content found for source {source['url']}", file=sys.stderr)
            if len(raw_content) > char_limit:
                raw_content = raw_content[:char_limit] + "... [truncated]"
            formatted_text += f"Full source content limited to {max_tokens_per_source} tokens: {raw_content}\n\n"

    return formatted_text.strip()


def format_sources(search_results):
    """Format search results into a bullet-point list of sources.

    Args:
        search_results (dict): Tavily search response containing results

    Returns:
        str: Formatted string with sources and their URLs
    """
    return '\n'.join(
        f"* {source['title']} : {source['url']}"
        for source in search_results['results']
    )
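
# Illustrative usage of the two formatters above (an added sketch, not part of the
# original module). The response shape below is an assumption based on the Tavily-style
# results these helpers expect; a real response would come from tavily_search() or
# perplexity_search():
#
#     example_response = {
#         "results": [
#             {"title": "Example Page", "url": "https://example.com",
#              "content": "Short summary.", "raw_content": "Full page text..."},
#         ]
#     }
#     print(format_sources(example_response))
#     print(deduplicate_and_format_sources(example_response, max_tokens_per_source=500,
#                                          include_raw_content=True))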


@traceable
def tavily_search(query, include_raw_content=True, max_results=3):
    """Search the web using the Tavily API.

    Args:
        query (str): The search query to execute
        include_raw_content (bool): Whether to include the raw_content from Tavily in the results
        max_results (int): Maximum number of results to return

    Returns:
        dict: Search response containing:
            - results (list): List of search result dictionaries, each containing:
                - title (str): Title of the search result
                - url (str): URL of the search result
                - content (str): Snippet/summary of the content
                - raw_content (str): Full content of the page, if available
    """
    # Get the API key from the environment
    api_key = os.environ.get('TAVILY_API_KEY')
    if not api_key:
        raise ValueError("TAVILY_API_KEY environment variable is required")

    # Strip whitespace but keep the tvly- prefix intact
    api_key = api_key.strip()

    # Initialize the Tavily client with the full API key
    tavily_client = TavilyClient(api_key=api_key)
    return tavily_client.search(query,
                                max_results=max_results,
                                include_raw_content=include_raw_content)
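
# Typical chaining (an illustrative sketch, not part of the original module): the raw
# Tavily response can be passed straight to the formatters defined above, e.g.
#
#     response = tavily_search("model context protocol", max_results=3)
#     summary = deduplicate_and_format_sources(response, max_tokens_per_source=1000,
#                                              include_raw_content=True)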


@traceable
def perplexity_search(query: str, perplexity_search_loop_count: int) -> Dict[str, Any]:
    """Search the web using the Perplexity API.

    Args:
        query (str): The search query to execute
        perplexity_search_loop_count (int): The loop step for the Perplexity search (starts at 0)

    Returns:
        dict: Search response containing:
            - results (list): List of search result dictionaries, each containing:
                - title (str): Title of the search result
                - url (str): URL of the search result
                - content (str): Snippet/summary of the content
                - raw_content (str): Full content of the page, if available
    """
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {os.getenv('PERPLEXITY_API_KEY')}"
    }

    payload = {
        "model": "sonar-pro",
        "messages": [
            {
                "role": "system",
                "content": "Search the web and provide factual information with sources."
            },
            {
                "role": "user",
                "content": query
            }
        ]
    }

    response = requests.post(
        "https://api.perplexity.ai/chat/completions",
        headers=headers,
        json=payload
    )
    response.raise_for_status()  # Raise an exception for bad status codes

    # Parse the response
    data = response.json()
    content = data["choices"][0]["message"]["content"]

    # Perplexity returns a list of citations for a single search result
    citations = data.get("citations", ["https://perplexity.ai"])

    # Return the first citation with the full content; the rest are references only
    results = [{
        "title": f"Perplexity Search {perplexity_search_loop_count + 1}, Source 1",
        "url": citations[0],
        "content": content,
        "raw_content": content
    }]

    # Add additional citations without duplicating the content
    for i, citation in enumerate(citations[1:], start=2):
        results.append({
            "title": f"Perplexity Search {perplexity_search_loop_count + 1}, Source {i}",
            "url": citation,
            "content": "See above for full content",
            "raw_content": None
        })

    return {"results": results}