"""Wikipedia API client for the MCP server."""
import re
from typing import List
from urllib.parse import quote
import httpx
from bs4 import BeautifulSoup
from .models import (
SearchResult,
WikipediaSearchResult,
WikipediaArticle,
WikipediaSummary,
RelatedArticle,
RelatedArticles,
)
class WikipediaClient:
"""Async client for Wikipedia API interactions."""
def __init__(self):
self.session = httpx.AsyncClient(
timeout=30.0,
follow_redirects=True,
headers={
"User-Agent": "WikipediaMCPServer/0.1.0 (https://github.com/user/wikipedia-mcp-server)"
}
)
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.session.aclose()
def _get_api_url(self, language: str) -> str:
"""Get the API base URL for a given language."""
return f"https://{language}.wikipedia.org/w/rest.php/v1"
def _get_page_url(self, language: str, title: str) -> str:
"""Get the full Wikipedia page URL."""
encoded_title = quote(title.replace(" ", "_"))
return f"https://{language}.wikipedia.org/wiki/{encoded_title}"
def _clean_html(self, html_content: str) -> str:
"""Clean HTML content and extract plain text."""
soup = BeautifulSoup(html_content, 'html.parser')
# Remove unwanted elements
for element in soup(['script', 'style', 'sup', 'table']):
element.decompose()
# Get text and clean up whitespace
text = soup.get_text()
text = re.sub(r'\n+', '\n', text)
text = re.sub(r' +', ' ', text)
return text.strip()
def _extract_sections(self, html_content: str) -> List[str]:
"""Extract section headings from HTML content."""
soup = BeautifulSoup(html_content, 'html.parser')
sections = []
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
if heading.text.strip():
sections.append(heading.text.strip())
return sections
def _extract_key_facts(self, content: str, limit: int = 5) -> List[str]:
"""Extract key facts from article content."""
        # Simple fact extraction: take the first sentence of each of the first few paragraphs
paragraphs = content.split('\n\n')
facts = []
for paragraph in paragraphs[:3]: # First 3 paragraphs
if len(paragraph.strip()) > 50: # Skip very short paragraphs
# Get first sentence
sentences = paragraph.split('. ')
if sentences:
fact = sentences[0].strip()
if fact and len(fact) > 20:
facts.append(fact + ('.' if not fact.endswith('.') else ''))
return facts[:limit]
async def search_wikipedia(
self,
query: str,
limit: int = 5,
language: str = "en"
) -> WikipediaSearchResult:
"""Search Wikipedia for articles matching the query."""
try:
api_url = self._get_api_url(language)
url = f"{api_url}/search/page"
response = await self.session.get(url, params={"q": query, "limit": limit})
response.raise_for_status()
data = response.json()
results = []
            for page in data.get("pages", []):
                page_title = page.get("title", "")
                # The excerpt is an HTML fragment with matches wrapped in
                # <span class="searchmatch">; strip all markup for a plain snippet.
                snippet = re.sub(r"<[^>]+>", "", page.get("excerpt") or "")
                result = SearchResult(
                    title=page_title,
                    snippet=snippet,
                    url=self._get_page_url(language, page_title),
                    page_id=page.get("id", 0)
                )
                results.append(result)
return WikipediaSearchResult(
results=results,
query=query,
total_found=len(results)
)
except Exception as e:
raise Exception(f"Failed to search Wikipedia: {str(e)}")
async def get_article(
self,
title: str,
language: str = "en"
) -> WikipediaArticle:
"""Retrieve the full content of a Wikipedia article."""
try:
api_url = self._get_api_url(language)
            # Canonical page titles use underscores in place of spaces.
            encoded_title = quote(title.replace(" ", "_"))
            # Fetch the bare page object for metadata (id, title, latest revision)
            summary_url = f"{api_url}/page/{encoded_title}"
summary_response = await self.session.get(summary_url)
summary_response.raise_for_status()
summary_data = summary_response.json()
# Get HTML content
html_url = f"{api_url}/page/{encoded_title}/html"
html_response = await self.session.get(html_url)
html_response.raise_for_status()
html_content = html_response.text
clean_content = self._clean_html(html_content)
sections = self._extract_sections(html_content)
return WikipediaArticle(
title=summary_data.get("title", title),
content=clean_content,
url=self._get_page_url(language, title),
last_modified=summary_data.get("latest", {}).get("timestamp", ""),
page_id=summary_data.get("id", 0),
word_count=len(clean_content.split()),
sections=sections
)
except Exception as e:
raise Exception(f"Failed to get article '{title}': {str(e)}")
async def get_summary(
self,
title: str,
language: str = "en"
) -> WikipediaSummary:
"""Get a concise summary of a Wikipedia article."""
try:
            # Use the Action API (TextExtracts) here; it returns the intro as plain text directly
api_base = f"https://{language}.wikipedia.org/w/api.php"
            params = {
                "action": "query",
                "format": "json",
                "titles": title,
                "redirects": 1,  # resolve redirects to the target article
                "prop": "extracts",
                "exintro": True,  # only the lead section
                "explaintext": True,  # plain text rather than HTML
                "exsectionformat": "plain"
            }
response = await self.session.get(api_base, params=params)
response.raise_for_status()
data = response.json()
pages = data.get("query", {}).get("pages", {})
if not pages:
raise Exception(f"No article found for '{title}'")
page_data = next(iter(pages.values()))
if "missing" in page_data:
raise Exception(f"Article '{title}' not found")
summary_text = page_data.get("extract", "")
page_id = page_data.get("pageid", 0)
# Extract key facts from the summary
key_facts = self._extract_key_facts(summary_text, limit=3)
return WikipediaSummary(
title=page_data.get("title", title),
summary=summary_text,
url=self._get_page_url(language, title),
key_facts=key_facts,
page_id=page_id
)
except Exception as e:
raise Exception(f"Failed to get summary for '{title}': {str(e)}")
async def find_related(
self,
title: str,
limit: int = 5,
language: str = "en"
) -> RelatedArticles:
"""Find articles related to the given article."""
try:
# Use a combination of links and categories to find related articles
api_base = f"https://{language}.wikipedia.org/w/api.php"
# First, get some links from the page
            params = {
                "action": "query",
                "format": "json",
                "titles": title,
                "prop": "links|categories",
                "pllimit": limit,
                "plnamespace": 0,  # Main namespace only
                "cllimit": 3,  # Get a few categories
                "clshow": "!hidden"  # Skip hidden maintenance categories
            }
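            # Note: prop=links is not ranked by relevance (links come back in title
            # order), so treating outgoing links as "related" is a rough heuristic.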
response = await self.session.get(api_base, params=params)
response.raise_for_status()
data = response.json()
pages = data.get("query", {}).get("pages", {})
related_articles = []
if not pages:
return RelatedArticles(source_title=title, related=[], total_found=0)
page_data = next(iter(pages.values()))
# Get related articles from links
links = page_data.get("links", [])
categories = page_data.get("categories", [])
# Process links first
for link in links[:limit]:
link_title = link.get("title", "")
if link_title and len(related_articles) < limit:
related_article = RelatedArticle(
title=link_title,
snippet=f"Article linked from {title}",
url=self._get_page_url(language, link_title),
page_id=0, # We don't have page ID from links API
relation_type="linked_from"
)
related_articles.append(related_article)
# If we still need more, try to get articles from the same categories
if len(related_articles) < limit and categories:
category_title = categories[0].get("title", "").replace("Category:", "")
if category_title:
cat_params = {
"action": "query",
"format": "json",
"list": "categorymembers",
"cmtitle": f"Category:{category_title}",
"cmnamespace": 0,
"cmlimit": limit - len(related_articles)
}
cat_response = await self.session.get(api_base, params=cat_params)
if cat_response.status_code == 200:
cat_data = cat_response.json()
cat_members = cat_data.get("query", {}).get("categorymembers", [])
for member in cat_members:
member_title = member.get("title", "")
if member_title != title and len(related_articles) < limit:
related_article = RelatedArticle(
title=member_title,
snippet=f"Article in category: {category_title}",
url=self._get_page_url(language, member_title),
page_id=member.get("pageid", 0),
relation_type="category"
)
related_articles.append(related_article)
return RelatedArticles(
source_title=title,
related=related_articles,
total_found=len(related_articles)
)
except Exception as e:
raise Exception(f"Failed to find related articles for '{title}': {str(e)}")