"""
Wikipedia API client implementation.
"""
import logging
import wikipediaapi
import requests
from typing import Any, Dict, List, Optional, Tuple
import functools
from wikipedia_mcp import __version__
logger = logging.getLogger(__name__)
class WikipediaClient:
"""Client for interacting with the Wikipedia API."""
# Language variant mappings - maps variant codes to their base language
LANGUAGE_VARIANTS = {
'zh-hans': 'zh', # Simplified Chinese
'zh-hant': 'zh', # Traditional Chinese
'zh-tw': 'zh', # Traditional Chinese (Taiwan)
'zh-hk': 'zh', # Traditional Chinese (Hong Kong)
'zh-mo': 'zh', # Traditional Chinese (Macau)
'zh-cn': 'zh', # Simplified Chinese (China)
'zh-sg': 'zh', # Simplified Chinese (Singapore)
'zh-my': 'zh', # Simplified Chinese (Malaysia)
        # Serbian variants
        'sr-latn': 'sr',  # Serbian Latin
        'sr-cyrl': 'sr',  # Serbian Cyrillic
        # Norwegian variants
        'no': 'nb',  # Norwegian Bokmål (default)
        # Kurdish variants
        'ku-latn': 'ku',  # Kurdish Latin
        'ku-arab': 'ku',  # Kurdish Arabic
        # Add more language variants as needed.
    }
# Country/locale to language code mappings
COUNTRY_TO_LANGUAGE = {
# English-speaking countries
'US': 'en', 'USA': 'en', 'United States': 'en',
'UK': 'en', 'GB': 'en', 'United Kingdom': 'en',
'CA': 'en', 'Canada': 'en',
'AU': 'en', 'Australia': 'en',
'NZ': 'en', 'New Zealand': 'en',
'IE': 'en', 'Ireland': 'en',
'ZA': 'en', 'South Africa': 'en',
# Chinese-speaking countries/regions
'CN': 'zh-hans', 'China': 'zh-hans',
'TW': 'zh-tw', 'Taiwan': 'zh-tw',
'HK': 'zh-hk', 'Hong Kong': 'zh-hk',
'MO': 'zh-mo', 'Macau': 'zh-mo',
'SG': 'zh-sg', 'Singapore': 'zh-sg',
'MY': 'zh-my', 'Malaysia': 'zh-my',
# Major European countries
'DE': 'de', 'Germany': 'de',
'FR': 'fr', 'France': 'fr',
'ES': 'es', 'Spain': 'es',
'IT': 'it', 'Italy': 'it',
'PT': 'pt', 'Portugal': 'pt',
'NL': 'nl', 'Netherlands': 'nl',
'PL': 'pl', 'Poland': 'pl',
'RU': 'ru', 'Russia': 'ru',
'UA': 'uk', 'Ukraine': 'uk',
'TR': 'tr', 'Turkey': 'tr',
'GR': 'el', 'Greece': 'el',
'SE': 'sv', 'Sweden': 'sv',
'NO': 'no', 'Norway': 'no',
'DK': 'da', 'Denmark': 'da',
'FI': 'fi', 'Finland': 'fi',
'IS': 'is', 'Iceland': 'is',
'CZ': 'cs', 'Czech Republic': 'cs',
'SK': 'sk', 'Slovakia': 'sk',
'HU': 'hu', 'Hungary': 'hu',
'RO': 'ro', 'Romania': 'ro',
'BG': 'bg', 'Bulgaria': 'bg',
'HR': 'hr', 'Croatia': 'hr',
'SI': 'sl', 'Slovenia': 'sl',
'RS': 'sr', 'Serbia': 'sr',
'BA': 'bs', 'Bosnia and Herzegovina': 'bs',
'MK': 'mk', 'Macedonia': 'mk',
'AL': 'sq', 'Albania': 'sq',
'MT': 'mt', 'Malta': 'mt',
# Asian countries
'JP': 'ja', 'Japan': 'ja',
'KR': 'ko', 'South Korea': 'ko',
'IN': 'hi', 'India': 'hi',
'TH': 'th', 'Thailand': 'th',
'VN': 'vi', 'Vietnam': 'vi',
'ID': 'id', 'Indonesia': 'id',
'PH': 'tl', 'Philippines': 'tl',
'BD': 'bn', 'Bangladesh': 'bn',
'PK': 'ur', 'Pakistan': 'ur',
'LK': 'si', 'Sri Lanka': 'si',
'MM': 'my', 'Myanmar': 'my',
'KH': 'km', 'Cambodia': 'km',
'LA': 'lo', 'Laos': 'lo',
'MN': 'mn', 'Mongolia': 'mn',
'KZ': 'kk', 'Kazakhstan': 'kk',
'UZ': 'uz', 'Uzbekistan': 'uz',
'AF': 'fa', 'Afghanistan': 'fa',
# Middle Eastern countries
'IR': 'fa', 'Iran': 'fa',
'SA': 'ar', 'Saudi Arabia': 'ar',
'AE': 'ar', 'UAE': 'ar',
'EG': 'ar', 'Egypt': 'ar',
'IQ': 'ar', 'Iraq': 'ar',
'SY': 'ar', 'Syria': 'ar',
'JO': 'ar', 'Jordan': 'ar',
'LB': 'ar', 'Lebanon': 'ar',
'IL': 'he', 'Israel': 'he',
# African countries
'MA': 'ar', 'Morocco': 'ar',
'DZ': 'ar', 'Algeria': 'ar',
'TN': 'ar', 'Tunisia': 'ar',
'LY': 'ar', 'Libya': 'ar',
'SD': 'ar', 'Sudan': 'ar',
'ET': 'am', 'Ethiopia': 'am',
'KE': 'sw', 'Kenya': 'sw',
'TZ': 'sw', 'Tanzania': 'sw',
'NG': 'ha', 'Nigeria': 'ha',
'GH': 'en', 'Ghana': 'en',
# Latin American countries
'MX': 'es', 'Mexico': 'es',
'AR': 'es', 'Argentina': 'es',
'CO': 'es', 'Colombia': 'es',
'VE': 'es', 'Venezuela': 'es',
'PE': 'es', 'Peru': 'es',
'CL': 'es', 'Chile': 'es',
'EC': 'es', 'Ecuador': 'es',
'BO': 'es', 'Bolivia': 'es',
'PY': 'es', 'Paraguay': 'es',
'UY': 'es', 'Uruguay': 'es',
'CR': 'es', 'Costa Rica': 'es',
'PA': 'es', 'Panama': 'es',
'GT': 'es', 'Guatemala': 'es',
'HN': 'es', 'Honduras': 'es',
'SV': 'es', 'El Salvador': 'es',
'NI': 'es', 'Nicaragua': 'es',
'CU': 'es', 'Cuba': 'es',
'DO': 'es', 'Dominican Republic': 'es',
'BR': 'pt', 'Brazil': 'pt',
# Additional countries
'BY': 'be', 'Belarus': 'be',
'EE': 'et', 'Estonia': 'et',
'LV': 'lv', 'Latvia': 'lv',
'LT': 'lt', 'Lithuania': 'lt',
'GE': 'ka', 'Georgia': 'ka',
'AM': 'hy', 'Armenia': 'hy',
'AZ': 'az', 'Azerbaijan': 'az',
}
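    # Illustrative resolution chain: country 'TW' maps to language 'zh-tw' via
    # COUNTRY_TO_LANGUAGE, which _parse_language_variant splits into base 'zh'
    # plus variant 'zh-tw', so requests target zh.wikipedia.org with variant=zh-tw.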
def __init__(self, language: str = "en", country: Optional[str] = None, enable_cache: bool = False, access_token: Optional[str] = None):
"""Initialize the Wikipedia client.
Args:
language: The language code for Wikipedia (default: "en" for English).
Supports language variants like 'zh-hans', 'zh-tw', etc.
            country: The country/locale code (e.g., 'US', 'CN', 'TW').
                If provided, it overrides the language parameter.
enable_cache: Whether to enable caching for API calls (default: False).
access_token: Personal Access Token for Wikipedia API authentication (optional).
Used to increase rate limits and avoid 403 errors.
"""
# Resolve country to language if country is provided
if country:
resolved_language = self._resolve_country_to_language(country)
self.original_input = country
self.input_type = "country"
self.resolved_language = resolved_language
# Maintain backward compatibility
self.original_language = resolved_language
else:
self.original_input = language
self.input_type = "language"
self.resolved_language = language
# Maintain backward compatibility
self.original_language = language
self.enable_cache = enable_cache
self.access_token = access_token
self.user_agent = f"WikipediaMCPServer/{__version__} (https://github.com/rudra-ravi/wikipedia-mcp)"
# Parse language and variant
self.base_language, self.language_variant = self._parse_language_variant(self.resolved_language)
# Use base language for API and library initialization
self.wiki = wikipediaapi.Wikipedia(
user_agent=self.user_agent,
language=self.base_language,
extract_format=wikipediaapi.ExtractFormat.WIKI
)
self.api_url = f"https://{self.base_language}.wikipedia.org/w/api.php"
if self.enable_cache:
            for method_name in (
                'search', 'get_article', 'get_summary', 'get_sections',
                'get_links', 'get_related_topics', 'summarize_for_query',
                'summarize_section', 'extract_facts', 'get_coordinates',
            ):
                cached_method = functools.lru_cache(maxsize=128)(getattr(self, method_name))
                setattr(self, method_name, cached_method)
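            # Each instance gets its own caches; calling cache_clear() on a wrapped
            # method (e.g. self.search.cache_clear()) resets it, per standard
            # functools.lru_cache behavior.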
def _resolve_country_to_language(self, country: str) -> str:
"""Resolve country/locale code to language code.
Args:
country: The country/locale code (e.g., 'US', 'CN', 'Taiwan').
Returns:
The corresponding language code.
Raises:
ValueError: If the country code is not supported.
"""
        # Normalize the country input (try uppercase, title case, then the original form)
        country_upper = country.upper().strip()
        country_title = country.title().strip()
# Try exact matches first
if country_upper in self.COUNTRY_TO_LANGUAGE:
return self.COUNTRY_TO_LANGUAGE[country_upper]
# Try title case
if country_title in self.COUNTRY_TO_LANGUAGE:
return self.COUNTRY_TO_LANGUAGE[country_title]
# Try original case
if country in self.COUNTRY_TO_LANGUAGE:
return self.COUNTRY_TO_LANGUAGE[country]
# Provide helpful error message with suggestions
available_countries = list(self.COUNTRY_TO_LANGUAGE.keys())
# Get first 10 country codes for suggestions
country_codes = [c for c in available_countries if len(c) <= 3][:10]
raise ValueError(
f"Unsupported country/locale: '{country}'. "
f"Supported country codes include: {', '.join(country_codes)}. "
f"Use --language parameter for direct language codes instead."
)
    def _parse_language_variant(self, language: str) -> Tuple[str, Optional[str]]:
"""Parse language code and extract base language and variant.
Args:
language: The language code, possibly with variant (e.g., 'zh-hans', 'zh-tw').
Returns:
A tuple of (base_language, variant) where variant is None if not a variant.
"""
if language in self.LANGUAGE_VARIANTS:
base_language = self.LANGUAGE_VARIANTS[language]
return base_language, language
else:
return language, None
def _get_request_headers(self) -> Dict[str, str]:
"""Get request headers for API calls, including authentication if available.
Returns:
Dictionary of headers to use for requests.
"""
headers = {'User-Agent': self.user_agent}
if self.access_token:
headers['Authorization'] = f'Bearer {self.access_token}'
return headers
def _add_variant_to_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
"""Add language variant parameter to API request parameters if needed.
Args:
params: The API request parameters.
Returns:
Updated parameters with variant if applicable.
"""
if self.language_variant:
params = params.copy()
params['variant'] = self.language_variant
return params
def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
"""Search Wikipedia for articles matching a query.
Args:
query: The search query.
limit: Maximum number of results to return.
Returns:
A list of search results.
"""
params = {
'action': 'query',
'format': 'json',
'list': 'search',
'utf8': 1,
'srsearch': query,
'srlimit': limit
}
# Add variant parameter if needed
params = self._add_variant_to_params(params)
try:
            response = requests.get(self.api_url, headers=self._get_request_headers(), params=params, timeout=10)
response.raise_for_status()
data = response.json()
results = []
for item in data.get('query', {}).get('search', []):
results.append({
'title': item.get('title', ''),
'snippet': item.get('snippet', ''),
'pageid': item.get('pageid', 0),
'wordcount': item.get('wordcount', 0),
'timestamp': item.get('timestamp', '')
})
return results
except Exception as e:
logger.error(f"Error searching Wikipedia: {e}")
return []
def get_article(self, title: str) -> Dict[str, Any]:
"""Get the full content of a Wikipedia article.
Args:
title: The title of the Wikipedia article.
Returns:
A dictionary containing the article information.
"""
try:
page = self.wiki.page(title)
if not page.exists():
return {
'title': title,
'exists': False,
'error': 'Page does not exist'
}
# Get sections
sections = self._extract_sections(page.sections)
            # Get categories
            categories = list(page.categories.keys())
            # Get links
            links = list(page.links.keys())
return {
'title': page.title,
'pageid': page.pageid,
'summary': page.summary,
'text': page.text,
'url': page.fullurl,
'sections': sections,
'categories': categories,
'links': links[:100], # Limit to 100 links to avoid too much data
'exists': True
}
except Exception as e:
logger.error(f"Error getting Wikipedia article: {e}")
return {
'title': title,
'exists': False,
'error': str(e)
}
def get_summary(self, title: str) -> str:
"""Get a summary of a Wikipedia article.
Args:
title: The title of the Wikipedia article.
Returns:
The article summary.
"""
try:
page = self.wiki.page(title)
if not page.exists():
return f"No Wikipedia article found for '{title}'."
return page.summary
except Exception as e:
logger.error(f"Error getting Wikipedia summary: {e}")
return f"Error retrieving summary for '{title}': {str(e)}"
def get_sections(self, title: str) -> List[Dict[str, Any]]:
"""Get the sections of a Wikipedia article.
Args:
title: The title of the Wikipedia article.
Returns:
A list of sections.
"""
try:
page = self.wiki.page(title)
if not page.exists():
return []
return self._extract_sections(page.sections)
except Exception as e:
logger.error(f"Error getting Wikipedia sections: {e}")
return []
def get_links(self, title: str) -> List[str]:
"""Get the links in a Wikipedia article.
Args:
title: The title of the Wikipedia article.
Returns:
A list of links.
"""
try:
page = self.wiki.page(title)
if not page.exists():
return []
            return list(page.links.keys())
except Exception as e:
logger.error(f"Error getting Wikipedia links: {e}")
return []
def get_related_topics(self, title: str, limit: int = 10) -> List[Dict[str, Any]]:
"""Get topics related to a Wikipedia article based on links and categories.
Args:
title: The title of the Wikipedia article.
limit: Maximum number of related topics to return.
Returns:
A list of related topics.
"""
try:
page = self.wiki.page(title)
if not page.exists():
return []
# Get links from the page
links = list(page.links.keys())
# Get categories
categories = list(page.categories.keys())
# Combine and limit
related = []
# Add links first
for link in links[:limit]:
link_page = self.wiki.page(link)
if link_page.exists():
related.append({
'title': link,
'summary': link_page.summary[:200] + '...' if len(link_page.summary) > 200 else link_page.summary,
'url': link_page.fullurl,
'type': 'link'
})
if len(related) >= limit:
break
# Add categories if we still have room
remaining = limit - len(related)
if remaining > 0:
for category in categories[:remaining]:
# Remove "Category:" prefix if present
clean_category = category.replace("Category:", "")
related.append({
'title': clean_category,
'type': 'category'
})
return related
except Exception as e:
logger.error(f"Error getting related topics: {e}")
return []
def _extract_sections(self, sections, level=0) -> List[Dict[str, Any]]:
"""Extract sections recursively.
Args:
sections: The sections to extract.
level: The current section level.
Returns:
A list of sections.
"""
result = []
for section in sections:
section_data = {
'title': section.title,
'level': level,
'text': section.text,
'sections': self._extract_sections(section.sections, level + 1)
}
result.append(section_data)
return result
def summarize_for_query(self, title: str, query: str, max_length: int = 250) -> str:
"""
Get a summary of a Wikipedia article tailored to a specific query.
This is a simplified implementation that returns a snippet around the query.
Args:
title: The title of the Wikipedia article.
query: The query to focus the summary on.
max_length: The maximum length of the summary.
Returns:
A query-focused summary.
"""
try:
page = self.wiki.page(title)
if not page.exists():
return f"No Wikipedia article found for '{title}'."
text_content = page.text
query_lower = query.lower()
text_lower = text_content.lower()
start_index = text_lower.find(query_lower)
if start_index == -1:
# If query not found, return the beginning of the summary or article text
summary_part = page.summary[:max_length]
if not summary_part:
summary_part = text_content[:max_length]
return summary_part + "..." if len(summary_part) >= max_length else summary_part
# Try to get context around the query
context_start = max(0, start_index - (max_length // 2))
context_end = min(len(text_content), start_index + len(query) + (max_length // 2))
snippet = text_content[context_start:context_end]
if len(snippet) > max_length:
snippet = snippet[:max_length]
return snippet + "..." if len(snippet) >= max_length or context_end < len(text_content) else snippet
except Exception as e:
logger.error(f"Error generating query-focused summary for '{title}': {e}")
return f"Error generating query-focused summary for '{title}': {str(e)}"
def summarize_section(self, title: str, section_title: str, max_length: int = 150) -> str:
"""
Get a summary of a specific section of a Wikipedia article.
Args:
title: The title of the Wikipedia article.
section_title: The title of the section to summarize.
max_length: The maximum length of the summary.
Returns:
A summary of the specified section.
"""
try:
page = self.wiki.page(title)
if not page.exists():
return f"No Wikipedia article found for '{title}'."
target_section = None
# Helper function to find the section
def find_section_recursive(sections_list, target_title):
for sec in sections_list:
if sec.title.lower() == target_title.lower():
return sec
# Check subsections
found_in_subsection = find_section_recursive(sec.sections, target_title)
if found_in_subsection:
return found_in_subsection
return None
target_section = find_section_recursive(page.sections, section_title)
if not target_section or not target_section.text:
return f"Section '{section_title}' not found or is empty in article '{title}'."
summary = target_section.text[:max_length]
return summary + "..." if len(target_section.text) > max_length else summary
except Exception as e:
logger.error(f"Error summarizing section '{section_title}' for article '{title}': {e}")
return f"Error summarizing section '{section_title}': {str(e)}"
def extract_facts(self, title: str, topic_within_article: Optional[str] = None, count: int = 5) -> List[str]:
"""
Extract key facts from a Wikipedia article.
This is a simplified implementation returning the first few sentences of the summary
or a relevant section if topic_within_article is provided.
Args:
title: The title of the Wikipedia article.
topic_within_article: Optional topic/section to focus fact extraction.
count: The number of facts to extract.
Returns:
A list of key facts (strings).
"""
try:
page = self.wiki.page(title)
if not page.exists():
return [f"No Wikipedia article found for '{title}'."]
text_to_process = ""
if topic_within_article:
# Try to find the section text
def find_section_text_recursive(sections_list, target_title):
for sec in sections_list:
if sec.title.lower() == target_title.lower():
return sec.text
found_in_subsection = find_section_text_recursive(sec.sections, target_title)
if found_in_subsection:
return found_in_subsection
return None
section_text = find_section_text_recursive(page.sections, topic_within_article)
if section_text:
text_to_process = section_text
else:
# Fallback to summary if specific topic section not found
text_to_process = page.summary
else:
text_to_process = page.summary
if not text_to_process:
return ["No content found to extract facts from."]
# Basic sentence splitting (can be improved with NLP libraries like nltk or spacy)
sentences = [s.strip() for s in text_to_process.split('.') if s.strip()]
facts = []
for sentence in sentences[:count]:
if sentence: # Ensure not an empty string after strip
facts.append(sentence + ".") # Add back the period
return facts if facts else ["Could not extract facts from the provided text."]
except Exception as e:
logger.error(f"Error extracting key facts for '{title}': {e}")
return [f"Error extracting key facts for '{title}': {str(e)}"]
def get_coordinates(self, title: str) -> Dict[str, Any]:
"""Get the coordinates of a Wikipedia article.
Args:
title: The title of the Wikipedia article.
Returns:
A dictionary containing the coordinates information.
"""
params = {
'action': 'query',
'format': 'json',
'prop': 'coordinates',
'titles': title
}
# Add variant parameter if needed
params = self._add_variant_to_params(params)
try:
            response = requests.get(self.api_url, headers=self._get_request_headers(), params=params, timeout=10)
response.raise_for_status()
data = response.json()
pages = data.get('query', {}).get('pages', {})
if not pages:
return {
'title': title,
'coordinates': None,
'exists': False,
'error': 'No page found'
}
# Get the first (and typically only) page
page_data = next(iter(pages.values()))
            # Missing pages carry no positive pageid (the API keys them negatively and omits 'pageid')
if page_data.get('pageid', -1) < 0:
return {
'title': title,
'coordinates': None,
'exists': False,
'error': 'Page does not exist'
}
coordinates = page_data.get('coordinates', [])
if not coordinates:
return {
'title': page_data.get('title', title),
'pageid': page_data.get('pageid'),
'coordinates': None,
'exists': True,
'error': None,
'message': 'No coordinates available for this article'
}
# Process coordinates - typically there's one primary coordinate
processed_coordinates = []
for coord in coordinates:
processed_coordinates.append({
'latitude': coord.get('lat'),
'longitude': coord.get('lon'),
'primary': coord.get('primary', False),
'globe': coord.get('globe', 'earth'),
'type': coord.get('type', ''),
'name': coord.get('name', ''),
'region': coord.get('region', ''),
'country': coord.get('country', '')
})
return {
'title': page_data.get('title', title),
'pageid': page_data.get('pageid'),
'coordinates': processed_coordinates,
'exists': True,
'error': None
}
except Exception as e:
logger.error(f"Error getting coordinates for Wikipedia article: {e}")
return {
'title': title,
'coordinates': None,
'exists': False,
'error': str(e)
}
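

# Minimal usage sketch (illustrative, not part of the client API): exercises a
# few methods against live Wikipedia, so it assumes network access; the article
# title is only an example.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    client = WikipediaClient(language="en", enable_cache=True)
    for result in client.search("Alan Turing", limit=3):
        print(f"{result['pageid']}: {result['title']}")
    print(client.get_summary("Alan Turing")[:300])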