Wikipedia MCP Server

wikipedia_client.py (32.6 kB)
""" Wikipedia API client implementation. """ import logging import wikipediaapi import requests from typing import Dict, List, Optional, Any import functools from wikipedia_mcp import __version__ logger = logging.getLogger(__name__) class WikipediaClient: """Client for interacting with the Wikipedia API.""" # Language variant mappings - maps variant codes to their base language LANGUAGE_VARIANTS = { "zh-hans": "zh", # Simplified Chinese "zh-hant": "zh", # Traditional Chinese "zh-tw": "zh", # Traditional Chinese (Taiwan) "zh-hk": "zh", # Traditional Chinese (Hong Kong) "zh-mo": "zh", # Traditional Chinese (Macau) "zh-cn": "zh", # Simplified Chinese (China) "zh-sg": "zh", # Simplified Chinese (Singapore) "zh-my": "zh", # Simplified Chinese (Malaysia) # Add more language variants as needed # Serbian variants "sr-latn": "sr", # Serbian Latin "sr-cyrl": "sr", # Serbian Cyrillic # Norwegian variants "no": "nb", # Norwegian Bokmål (default) # Kurdish variants "ku-latn": "ku", # Kurdish Latin "ku-arab": "ku", # Kurdish Arabic } # Country/locale to language code mappings COUNTRY_TO_LANGUAGE = { # English-speaking countries "US": "en", "USA": "en", "United States": "en", "UK": "en", "GB": "en", "United Kingdom": "en", "CA": "en", "Canada": "en", "AU": "en", "Australia": "en", "NZ": "en", "New Zealand": "en", "IE": "en", "Ireland": "en", "ZA": "en", "South Africa": "en", # Chinese-speaking countries/regions "CN": "zh-hans", "China": "zh-hans", "TW": "zh-tw", "Taiwan": "zh-tw", "HK": "zh-hk", "Hong Kong": "zh-hk", "MO": "zh-mo", "Macau": "zh-mo", "SG": "zh-sg", "Singapore": "zh-sg", "MY": "zh-my", "Malaysia": "zh-my", # Major European countries "DE": "de", "Germany": "de", "FR": "fr", "France": "fr", "ES": "es", "Spain": "es", "IT": "it", "Italy": "it", "PT": "pt", "Portugal": "pt", "NL": "nl", "Netherlands": "nl", "PL": "pl", "Poland": "pl", "RU": "ru", "Russia": "ru", "UA": "uk", "Ukraine": "uk", "TR": "tr", "Turkey": "tr", "GR": "el", "Greece": "el", "SE": "sv", "Sweden": "sv", "NO": "no", "Norway": "no", "DK": "da", "Denmark": "da", "FI": "fi", "Finland": "fi", "IS": "is", "Iceland": "is", "CZ": "cs", "Czech Republic": "cs", "SK": "sk", "Slovakia": "sk", "HU": "hu", "Hungary": "hu", "RO": "ro", "Romania": "ro", "BG": "bg", "Bulgaria": "bg", "HR": "hr", "Croatia": "hr", "SI": "sl", "Slovenia": "sl", "RS": "sr", "Serbia": "sr", "BA": "bs", "Bosnia and Herzegovina": "bs", "MK": "mk", "Macedonia": "mk", "AL": "sq", "Albania": "sq", "MT": "mt", "Malta": "mt", # Asian countries "JP": "ja", "Japan": "ja", "KR": "ko", "South Korea": "ko", "IN": "hi", "India": "hi", "TH": "th", "Thailand": "th", "VN": "vi", "Vietnam": "vi", "ID": "id", "Indonesia": "id", "PH": "tl", "Philippines": "tl", "BD": "bn", "Bangladesh": "bn", "PK": "ur", "Pakistan": "ur", "LK": "si", "Sri Lanka": "si", "MM": "my", "Myanmar": "my", "KH": "km", "Cambodia": "km", "LA": "lo", "Laos": "lo", "MN": "mn", "Mongolia": "mn", "KZ": "kk", "Kazakhstan": "kk", "UZ": "uz", "Uzbekistan": "uz", "AF": "fa", "Afghanistan": "fa", # Middle Eastern countries "IR": "fa", "Iran": "fa", "SA": "ar", "Saudi Arabia": "ar", "AE": "ar", "UAE": "ar", "EG": "ar", "Egypt": "ar", "IQ": "ar", "Iraq": "ar", "SY": "ar", "Syria": "ar", "JO": "ar", "Jordan": "ar", "LB": "ar", "Lebanon": "ar", "IL": "he", "Israel": "he", # African countries "MA": "ar", "Morocco": "ar", "DZ": "ar", "Algeria": "ar", "TN": "ar", "Tunisia": "ar", "LY": "ar", "Libya": "ar", "SD": "ar", "Sudan": "ar", "ET": "am", "Ethiopia": "am", "KE": "sw", "Kenya": "sw", "TZ": "sw", "Tanzania": "sw", "NG": 
"ha", "Nigeria": "ha", "GH": "en", "Ghana": "en", # Latin American countries "MX": "es", "Mexico": "es", "AR": "es", "Argentina": "es", "CO": "es", "Colombia": "es", "VE": "es", "Venezuela": "es", "PE": "es", "Peru": "es", "CL": "es", "Chile": "es", "EC": "es", "Ecuador": "es", "BO": "es", "Bolivia": "es", "PY": "es", "Paraguay": "es", "UY": "es", "Uruguay": "es", "CR": "es", "Costa Rica": "es", "PA": "es", "Panama": "es", "GT": "es", "Guatemala": "es", "HN": "es", "Honduras": "es", "SV": "es", "El Salvador": "es", "NI": "es", "Nicaragua": "es", "CU": "es", "Cuba": "es", "DO": "es", "Dominican Republic": "es", "BR": "pt", "Brazil": "pt", # Additional countries "BY": "be", "Belarus": "be", "EE": "et", "Estonia": "et", "LV": "lv", "Latvia": "lv", "LT": "lt", "Lithuania": "lt", "GE": "ka", "Georgia": "ka", "AM": "hy", "Armenia": "hy", "AZ": "az", "Azerbaijan": "az", } def __init__( self, language: str = "en", country: Optional[str] = None, enable_cache: bool = False, access_token: Optional[str] = None, ): """Initialize the Wikipedia client. Args: language: The language code for Wikipedia (default: "en" for English). Supports language variants like 'zh-hans', 'zh-tw', etc. country: The country/locale code (e.g., 'US', 'CN', 'TW'). If provided, overrides language parameter. enable_cache: Whether to enable caching for API calls (default: False). access_token: Personal Access Token for Wikipedia API authentication (optional). Used to increase rate limits and avoid 403 errors. """ # Resolve country to language if country is provided if country: resolved_language = self._resolve_country_to_language(country) self.original_input = country self.input_type = "country" self.resolved_language = resolved_language # Maintain backward compatibility self.original_language = resolved_language else: self.original_input = language self.input_type = "language" self.resolved_language = language # Maintain backward compatibility self.original_language = language self.enable_cache = enable_cache self.access_token = access_token self.user_agent = f"WikipediaMCPServer/{__version__} (https://github.com/rudra-ravi/wikipedia-mcp)" # Parse language and variant self.base_language, self.language_variant = self._parse_language_variant(self.resolved_language) # Use base language for API and library initialization self.wiki = wikipediaapi.Wikipedia( user_agent=self.user_agent, language=self.base_language, extract_format=wikipediaapi.ExtractFormat.WIKI, ) self.api_url = f"https://{self.base_language}.wikipedia.org/w/api.php" if self.enable_cache: self.search = functools.lru_cache(maxsize=128)(self.search) self.get_article = functools.lru_cache(maxsize=128)(self.get_article) self.get_summary = functools.lru_cache(maxsize=128)(self.get_summary) self.get_sections = functools.lru_cache(maxsize=128)(self.get_sections) self.get_links = functools.lru_cache(maxsize=128)(self.get_links) self.get_related_topics = functools.lru_cache(maxsize=128)(self.get_related_topics) self.summarize_for_query = functools.lru_cache(maxsize=128)(self.summarize_for_query) self.summarize_section = functools.lru_cache(maxsize=128)(self.summarize_section) self.extract_facts = functools.lru_cache(maxsize=128)(self.extract_facts) self.get_coordinates = functools.lru_cache(maxsize=128)(self.get_coordinates) def _resolve_country_to_language(self, country: str) -> str: """Resolve country/locale code to language code. Args: country: The country/locale code (e.g., 'US', 'CN', 'Taiwan'). Returns: The corresponding language code. 
Raises: ValueError: If the country code is not supported. """ # Normalize country code (upper case, handle common variations) country_upper = country.upper().strip() country_title = country.title().strip() # Try exact matches first if country_upper in self.COUNTRY_TO_LANGUAGE: return self.COUNTRY_TO_LANGUAGE[country_upper] # Try title case if country_title in self.COUNTRY_TO_LANGUAGE: return self.COUNTRY_TO_LANGUAGE[country_title] # Try original case if country in self.COUNTRY_TO_LANGUAGE: return self.COUNTRY_TO_LANGUAGE[country] # Provide helpful error message with suggestions available_countries = list(self.COUNTRY_TO_LANGUAGE.keys()) # Get first 10 country codes for suggestions country_codes = [c for c in available_countries if len(c) <= 3][:10] raise ValueError( f"Unsupported country/locale: '{country}'. " f"Supported country codes include: {', '.join(country_codes)}. " f"Use --language parameter for direct language codes instead." ) def _parse_language_variant(self, language: str) -> tuple[str, Optional[str]]: """Parse language code and extract base language and variant. Args: language: The language code, possibly with variant (e.g., 'zh-hans', 'zh-tw'). Returns: A tuple of (base_language, variant) where variant is None if not a variant. """ if language in self.LANGUAGE_VARIANTS: base_language = self.LANGUAGE_VARIANTS[language] return base_language, language else: return language, None def _get_request_headers(self) -> Dict[str, str]: """Get request headers for API calls, including authentication if available. Returns: Dictionary of headers to use for requests. """ headers = {"User-Agent": self.user_agent} if self.access_token: headers["Authorization"] = f"Bearer {self.access_token}" return headers def _add_variant_to_params(self, params: Dict[str, Any]) -> Dict[str, Any]: """Add language variant parameter to API request parameters if needed. Args: params: The API request parameters. Returns: Updated parameters with variant if applicable. 
""" if self.language_variant: params = params.copy() params["variant"] = self.language_variant return params def test_connectivity(self) -> Dict[str, Any]: """Test connectivity to the Wikipedia API and return diagnostics.""" test_url = f"https://{self.base_language}.wikipedia.org/w/api.php" test_params = { "action": "query", "format": "json", "meta": "siteinfo", "siprop": "general", } try: logger.info(f"Testing connectivity to {test_url}") response = requests.get( test_url, headers=self._get_request_headers(), params=test_params, timeout=10, ) response.raise_for_status() data = response.json() site_info = data.get("query", {}).get("general", {}) return { "status": "success", "url": test_url, "language": self.base_language, "site_name": site_info.get("sitename", "Unknown"), "server": site_info.get("server", "Unknown"), "response_time_ms": response.elapsed.total_seconds() * 1000, } except Exception as exc: # pragma: no cover - safeguarded logger.error("Connectivity test failed: %s", exc) return { "status": "failed", "url": test_url, "language": self.base_language, "error": str(exc), "error_type": type(exc).__name__, } def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]: """Search Wikipedia for articles matching a query with validation and diagnostics.""" if not query or not query.strip(): logger.warning("Empty search query provided") return [] trimmed_query = query.strip() if len(trimmed_query) > 300: logger.warning( "Search query too long (%d chars), truncating to 300", len(trimmed_query), ) trimmed_query = trimmed_query[:300] if limit <= 0: logger.warning("Invalid limit %d provided, using default 10", limit) limit = 10 elif limit > 500: logger.warning("Limit %d exceeds maximum, capping at 500", limit) limit = 500 params = { "action": "query", "format": "json", "list": "search", "utf8": 1, "srsearch": trimmed_query, "srlimit": limit, } params = self._add_variant_to_params(params) try: logger.debug("Making search request to %s with params %s", self.api_url, params) response = requests.get( self.api_url, headers=self._get_request_headers(), params=params, timeout=30, ) response.raise_for_status() data = response.json() if "error" in data: error_info = data["error"] logger.error( "Wikipedia API error: %s - %s", error_info.get("code", "unknown"), error_info.get("info", "No details"), ) return [] if "warnings" in data: for warning_type, warning_body in data["warnings"].items(): logger.warning("Wikipedia API warning (%s): %s", warning_type, warning_body) query_data = data.get("query", {}) search_results = query_data.get("search", []) logger.info( "Search for '%s' returned %d results", trimmed_query, len(search_results), ) results: List[Dict[str, Any]] = [] for item in search_results: title = item.get("title") if not title: logger.warning("Search result missing title: %s", item) continue results.append( { "title": title, "snippet": item.get("snippet", ""), "pageid": item.get("pageid", 0), "wordcount": item.get("wordcount", 0), "timestamp": item.get("timestamp", ""), } ) return results except requests.exceptions.Timeout as exc: logger.error("Search request timed out for query '%s': %s", trimmed_query, exc) return [] except requests.exceptions.ConnectionError as exc: logger.error("Connection error when searching for '%s': %s", trimmed_query, exc) return [] except requests.exceptions.HTTPError as exc: logger.error("HTTP error when searching for '%s': %s", trimmed_query, exc) return [] except requests.exceptions.RequestException as exc: logger.error("Request error when searching for 
'%s': %s", trimmed_query, exc) return [] except ValueError as exc: logger.error("JSON decode error when searching for '%s': %s", trimmed_query, exc) return [] except Exception as exc: # pragma: no cover - unexpected safeguard logger.error("Unexpected error searching Wikipedia for '%s': %s", trimmed_query, exc) return [] def get_article(self, title: str) -> Dict[str, Any]: """Get the full content of a Wikipedia article. Args: title: The title of the Wikipedia article. Returns: A dictionary containing the article information. """ try: page = self.wiki.page(title) if not page.exists(): return {"title": title, "exists": False, "error": "Page does not exist"} # Get sections sections = self._extract_sections(page.sections) # Get categories categories = [cat for cat in page.categories.keys()] # Get links links = [link for link in page.links.keys()] return { "title": page.title, "pageid": page.pageid, "summary": page.summary, "text": page.text, "url": page.fullurl, "sections": sections, "categories": categories, "links": links[:100], # Limit to 100 links to avoid too much data "exists": True, } except Exception as e: logger.error(f"Error getting Wikipedia article: {e}") return {"title": title, "exists": False, "error": str(e)} def get_summary(self, title: str) -> str: """Get a summary of a Wikipedia article. Args: title: The title of the Wikipedia article. Returns: The article summary. """ try: page = self.wiki.page(title) if not page.exists(): return f"No Wikipedia article found for '{title}'." return page.summary except Exception as e: logger.error(f"Error getting Wikipedia summary: {e}") return f"Error retrieving summary for '{title}': {str(e)}" def get_sections(self, title: str) -> List[Dict[str, Any]]: """Get the sections of a Wikipedia article. Args: title: The title of the Wikipedia article. Returns: A list of sections. """ try: page = self.wiki.page(title) if not page.exists(): return [] return self._extract_sections(page.sections) except Exception as e: logger.error(f"Error getting Wikipedia sections: {e}") return [] def get_links(self, title: str) -> List[str]: """Get the links in a Wikipedia article. Args: title: The title of the Wikipedia article. Returns: A list of links. """ try: page = self.wiki.page(title) if not page.exists(): return [] return [link for link in page.links.keys()] except Exception as e: logger.error(f"Error getting Wikipedia links: {e}") return [] def get_related_topics(self, title: str, limit: int = 10) -> List[Dict[str, Any]]: """Get topics related to a Wikipedia article based on links and categories. Args: title: The title of the Wikipedia article. limit: Maximum number of related topics to return. Returns: A list of related topics. """ try: page = self.wiki.page(title) if not page.exists(): return [] # Get links from the page links = list(page.links.keys()) # Get categories categories = list(page.categories.keys()) # Combine and limit related = [] # Add links first for link in links[:limit]: link_page = self.wiki.page(link) if link_page.exists(): related.append( { "title": link, "summary": ( link_page.summary[:200] + "..." 
if len(link_page.summary) > 200 else link_page.summary ), "url": link_page.fullurl, "type": "link", } ) if len(related) >= limit: break # Add categories if we still have room remaining = limit - len(related) if remaining > 0: for category in categories[:remaining]: # Remove "Category:" prefix if present clean_category = category.replace("Category:", "") related.append({"title": clean_category, "type": "category"}) return related except Exception as e: logger.error(f"Error getting related topics: {e}") return [] def _extract_sections(self, sections, level=0) -> List[Dict[str, Any]]: """Extract sections recursively. Args: sections: The sections to extract. level: The current section level. Returns: A list of sections. """ result = [] for section in sections: section_data = { "title": section.title, "level": level, "text": section.text, "sections": self._extract_sections(section.sections, level + 1), } result.append(section_data) return result def summarize_for_query(self, title: str, query: str, max_length: int = 250) -> str: """ Get a summary of a Wikipedia article tailored to a specific query. This is a simplified implementation that returns a snippet around the query. Args: title: The title of the Wikipedia article. query: The query to focus the summary on. max_length: The maximum length of the summary. Returns: A query-focused summary. """ try: page = self.wiki.page(title) if not page.exists(): return f"No Wikipedia article found for '{title}'." text_content = page.text query_lower = query.lower() text_lower = text_content.lower() start_index = text_lower.find(query_lower) if start_index == -1: # If query not found, return the beginning of the summary or article text summary_part = page.summary[:max_length] if not summary_part: summary_part = text_content[:max_length] return summary_part + "..." if len(summary_part) >= max_length else summary_part # Try to get context around the query context_start = max(0, start_index - (max_length // 2)) context_end = min(len(text_content), start_index + len(query) + (max_length // 2)) snippet = text_content[context_start:context_end] if len(snippet) > max_length: snippet = snippet[:max_length] return snippet + "..." if len(snippet) >= max_length or context_end < len(text_content) else snippet except Exception as e: logger.error(f"Error generating query-focused summary for '{title}': {e}") return f"Error generating query-focused summary for '{title}': {str(e)}" def summarize_section(self, title: str, section_title: str, max_length: int = 150) -> str: """ Get a summary of a specific section of a Wikipedia article. Args: title: The title of the Wikipedia article. section_title: The title of the section to summarize. max_length: The maximum length of the summary. Returns: A summary of the specified section. """ try: page = self.wiki.page(title) if not page.exists(): return f"No Wikipedia article found for '{title}'." target_section = None # Helper function to find the section def find_section_recursive(sections_list, target_title): for sec in sections_list: if sec.title.lower() == target_title.lower(): return sec # Check subsections found_in_subsection = find_section_recursive(sec.sections, target_title) if found_in_subsection: return found_in_subsection return None target_section = find_section_recursive(page.sections, section_title) if not target_section or not target_section.text: return f"Section '{section_title}' not found or is empty in article '{title}'." summary = target_section.text[:max_length] return summary + "..." 
if len(target_section.text) > max_length else summary except Exception as e: logger.error(f"Error summarizing section '{section_title}' for article '{title}': {e}") return f"Error summarizing section '{section_title}': {str(e)}" def extract_facts(self, title: str, topic_within_article: Optional[str] = None, count: int = 5) -> List[str]: """ Extract key facts from a Wikipedia article. This is a simplified implementation returning the first few sentences of the summary or a relevant section if topic_within_article is provided. Args: title: The title of the Wikipedia article. topic_within_article: Optional topic/section to focus fact extraction. count: The number of facts to extract. Returns: A list of key facts (strings). """ try: page = self.wiki.page(title) if not page.exists(): return [f"No Wikipedia article found for '{title}'."] text_to_process = "" if topic_within_article: # Try to find the section text def find_section_text_recursive(sections_list, target_title): for sec in sections_list: if sec.title.lower() == target_title.lower(): return sec.text found_in_subsection = find_section_text_recursive(sec.sections, target_title) if found_in_subsection: return found_in_subsection return None section_text = find_section_text_recursive(page.sections, topic_within_article) if section_text: text_to_process = section_text else: # Fallback to summary if specific topic section not found text_to_process = page.summary else: text_to_process = page.summary if not text_to_process: return ["No content found to extract facts from."] # Basic sentence splitting (can be improved with NLP libraries like nltk or spacy) sentences = [s.strip() for s in text_to_process.split(".") if s.strip()] facts = [] for sentence in sentences[:count]: if sentence: # Ensure not an empty string after strip facts.append(sentence + ".") # Add back the period return facts if facts else ["Could not extract facts from the provided text."] except Exception as e: logger.error(f"Error extracting key facts for '{title}': {e}") return [f"Error extracting key facts for '{title}': {str(e)}"] def get_coordinates(self, title: str) -> Dict[str, Any]: """Get the coordinates of a Wikipedia article. Args: title: The title of the Wikipedia article. Returns: A dictionary containing the coordinates information. 
""" params = { "action": "query", "format": "json", "prop": "coordinates", "titles": title, } # Add variant parameter if needed params = self._add_variant_to_params(params) try: response = requests.get(self.api_url, headers=self._get_request_headers(), params=params) response.raise_for_status() data = response.json() pages = data.get("query", {}).get("pages", {}) if not pages: return { "title": title, "coordinates": None, "exists": False, "error": "No page found", } # Get the first (and typically only) page page_data = next(iter(pages.values())) # Check if page exists (pageid > 0 means page exists) if page_data.get("pageid", -1) < 0: return { "title": title, "coordinates": None, "exists": False, "error": "Page does not exist", } coordinates = page_data.get("coordinates", []) if not coordinates: return { "title": page_data.get("title", title), "pageid": page_data.get("pageid"), "coordinates": None, "exists": True, "error": None, "message": "No coordinates available for this article", } # Process coordinates - typically there's one primary coordinate processed_coordinates = [] for coord in coordinates: processed_coordinates.append( { "latitude": coord.get("lat"), "longitude": coord.get("lon"), "primary": coord.get("primary", False), "globe": coord.get("globe", "earth"), "type": coord.get("type", ""), "name": coord.get("name", ""), "region": coord.get("region", ""), "country": coord.get("country", ""), } ) return { "title": page_data.get("title", title), "pageid": page_data.get("pageid"), "coordinates": processed_coordinates, "exists": True, "error": None, } except Exception as e: logger.error(f"Error getting coordinates for Wikipedia article: {e}") return { "title": title, "coordinates": None, "exists": False, "error": str(e), }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Rudra-ravi/wikipedia-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.