"""
Wikipedia API client for OpenEdu MCP Server.
This module provides a comprehensive client for interacting with the Wikipedia/Wikimedia API,
including article search, content retrieval, featured articles, and educational content analysis
with proper error handling and rate limiting.
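
Example (a hypothetical usage sketch; assumes this module is importable as
`wikipedia` and that a Config instance is available):

    import asyncio
    from wikipedia import WikipediaClient

    async def main(config):
        client = WikipediaClient(config)
        try:
            results = await client.search_wikipedia("photosynthesis", limit=3)
            for r in results:
                print(r["title"], r["url"])
        finally:
            await client.close()

    # asyncio.run(main(config))  # construct `config` from your application's Config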
"""
import asyncio
import logging
import re
from typing import Dict, Any, List, Optional, Union
from urllib.parse import quote, urljoin
import aiohttp
from datetime import datetime, date
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import Config
from exceptions import APIError, ValidationError
logger = logging.getLogger(__name__)
class WikipediaClient:
"""Client for Wikipedia API with educational focus."""
def __init__(self, config: Config):
"""
Initialize the Wikipedia client.
Args:
config: Application configuration
"""
self.config = config
self.base_url = config.apis.wikipedia.base_url
        # Default (English) action API endpoint; language-specific endpoints
        # are constructed per request wherever a `lang` parameter is accepted
        self.action_api_url = "https://en.wikipedia.org/w/api.php"
self.timeout = config.apis.wikipedia.timeout
self.retry_attempts = config.apis.wikipedia.retry_attempts
self.backoff_factor = config.apis.wikipedia.backoff_factor
# User agent for respectful API usage
self.headers = {
'User-Agent': f'{config.server.name}/{config.server.version} (Educational MCP Server; https://github.com/openedu-mcp)'
}
# Session will be created when needed
self._session: Optional[aiohttp.ClientSession] = None
async def _get_session(self) -> aiohttp.ClientSession:
"""Get or create HTTP session."""
if self._session is None or self._session.closed:
timeout = aiohttp.ClientTimeout(total=self.timeout)
self._session = aiohttp.ClientSession(
headers=self.headers,
timeout=timeout
)
return self._session
async def close(self):
"""Close the HTTP session."""
if self._session and not self._session.closed:
await self._session.close()
async def _make_request(
self,
url: str,
params: Optional[Dict[str, Any]] = None,
retry_count: int = 0,
use_action_api: bool = False
) -> Dict[str, Any]:
"""
Make HTTP request with retry logic and error handling.
Args:
url: Full URL or endpoint
params: Query parameters
retry_count: Current retry attempt
use_action_api: Whether to use the action API base URL
Returns:
JSON response data
Raises:
APIError: If request fails after all retries
"""
if not url.startswith('http'):
base = self.action_api_url if use_action_api else self.base_url
url = urljoin(base, url)
session = await self._get_session()
try:
async with session.get(url, params=params) as response:
if response.status == 200:
data = await response.json()
return data
elif response.status == 429: # Rate limited
if retry_count < self.retry_attempts:
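                        # Exponential backoff: wait backoff_factor ** retry_count seconds
                        # (e.g. 1s, 2s, 4s for a factor of 2)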
wait_time = self.backoff_factor ** retry_count
logger.warning(f"Rate limited, waiting {wait_time}s before retry")
await asyncio.sleep(wait_time)
return await self._make_request(url, params, retry_count + 1, use_action_api)
else:
raise APIError(f"Rate limited after {self.retry_attempts} retries", "wikipedia")
elif response.status == 404:
return {} # Return empty dict for not found
else:
error_text = await response.text()
raise APIError(f"HTTP {response.status}: {error_text}", "wikipedia")
except aiohttp.ClientError as e:
if retry_count < self.retry_attempts:
wait_time = self.backoff_factor ** retry_count
logger.warning(f"Request failed, retrying in {wait_time}s: {e}")
await asyncio.sleep(wait_time)
return await self._make_request(url, params, retry_count + 1, use_action_api)
else:
raise APIError(f"Request failed after {self.retry_attempts} retries: {e}", "wikipedia")
        except APIError:
            # Re-raise APIErrors from the handlers above without re-wrapping them
            raise
        except Exception as e:
raise APIError(f"Unexpected error: {e}", "wikipedia")
def _validate_search_params(self, query: str, limit: int, lang: str) -> None:
"""
Validate search parameters.
Args:
query: Search query
limit: Result limit
lang: Language code
Raises:
ValidationError: If parameters are invalid
"""
if not query or not query.strip():
raise ValidationError("Search query cannot be empty")
if limit < 1 or limit > 50:
raise ValidationError("Limit must be between 1 and 50")
        # Wikipedia language codes are usually 2-3 letters, but longer codes
        # such as 'simple' and hyphenated variants such as 'zh-yue' also exist
        if not re.match(r'^[a-z]{2,12}(-[a-z0-9]+)*$', lang):
            raise ValidationError("Language code must be a lowercase Wikipedia language code (e.g. 'en', 'simple', 'zh-yue')")
def _validate_title(self, title: str) -> str:
"""
Validate and normalize article title.
Args:
title: Article title
Returns:
Normalized title
Raises:
ValidationError: If title is invalid
"""
if not title or not title.strip():
raise ValidationError("Article title cannot be empty")
# Basic title normalization
normalized = title.strip()
# Replace spaces with underscores for URL compatibility
normalized = normalized.replace(' ', '_')
return normalized
async def search_wikipedia(
self,
query: str,
lang: str = 'en',
limit: int = 5
) -> List[Dict[str, Any]]:
"""
Search Wikipedia articles.
Args:
query: Search query
lang: Language code (default: 'en')
limit: Maximum number of results (1-50)
Returns:
List of article search results
Raises:
ValidationError: If parameters are invalid
APIError: If API request fails
"""
self._validate_search_params(query, limit, lang)
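        # MediaWiki action API "list=search"; srprop requests snippet, size,
        # wordcount, and timestamp metadata for each hit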
params = {
'action': 'query',
'format': 'json',
'list': 'search',
'srsearch': query,
'srlimit': limit,
'srprop': 'snippet|titlesnippet|size|wordcount|timestamp',
'utf8': 1
}
try:
            # Query the language-specific action API so `lang` is honoured
            response = await self._make_request(
                f"https://{lang}.wikipedia.org/w/api.php", params
            )
if 'query' not in response or 'search' not in response['query']:
return []
results = []
            for item in response['query']['search']:
                result = {
                    'title': item['title'],
                    'snippet': item.get('snippet', ''),
                    'size': item.get('size', 0),
                    'wordcount': item.get('wordcount', 0),
                    'timestamp': item.get('timestamp', ''),
                    'url': f"https://{lang}.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
                    'pageid': item.get('pageid')
                }
                # Enrich each hit with a summary; this issues one extra request
                # per result, so larger limits increase total latency
                try:
                    summary = await self.get_article_summary(item['title'], lang)
                    result['summary'] = summary.get('extract', '') if summary else ''
                except Exception as e:
                    logger.warning(f"Failed to get summary for {item['title']}: {e}")
                results.append(result)
logger.info(f"Found {len(results)} articles for query: {query}")
return results
except Exception as e:
logger.error(f"Error searching Wikipedia: {e}")
raise
async def get_article_summary(
self,
title: str,
lang: str = 'en'
) -> Dict[str, Any]:
"""
Get article summary/extract.
Args:
title: Article title
lang: Language code (default: 'en')
Returns:
Article summary data
Raises:
ValidationError: If title is invalid
APIError: If API request fails
"""
normalized_title = self._validate_title(title)
try:
            # Try the REST API first for the summary; quote() with safe=''
            # percent-encodes '/' and other reserved characters in the title path segment
            rest_url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{quote(normalized_title, safe='')}"
response = await self._make_request(rest_url)
if response:
return response
# Fallback to action API
params = {
'action': 'query',
'format': 'json',
            'titles': normalized_title,
'prop': 'extracts|info|categories',
            'exintro': 1,
            'explaintext': 1,
'exsectionformat': 'plain',
'inprop': 'url',
'utf8': 1
}
            response = await self._make_request(
                f"https://{lang}.wikipedia.org/w/api.php", params
            )
if 'query' not in response or 'pages' not in response['query']:
return {}
pages = response['query']['pages']
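            # `pages` maps pageid -> page data; a single-title query yields one entry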
page_data = next(iter(pages.values()))
if 'missing' in page_data:
return {}
return {
'title': page_data.get('title', title),
'extract': page_data.get('extract', ''),
'pageid': page_data.get('pageid'),
'fullurl': page_data.get('fullurl', ''),
'categories': page_data.get('categories', [])
}
except Exception as e:
logger.error(f"Error getting article summary for {title}: {e}")
raise
async def get_article_content(
self,
title: str,
lang: str = 'en'
) -> Dict[str, Any]:
"""
Get full article content.
Args:
title: Article title
lang: Language code (default: 'en')
Returns:
Full article content data
Raises:
ValidationError: If title is invalid
APIError: If API request fails
"""
normalized_title = self._validate_title(title)
try:
# Get full extract using action API
params = {
'action': 'query',
'format': 'json',
            'titles': normalized_title,
'prop': 'extracts|info|categories|links|images',
            'explaintext': 1,
'exsectionformat': 'plain',
'inprop': 'url',
'pllimit': 50, # Limit links
'imlimit': 10, # Limit images
'utf8': 1
}
            response = await self._make_request(
                f"https://{lang}.wikipedia.org/w/api.php", params
            )
if 'query' not in response or 'pages' not in response['query']:
return {}
pages = response['query']['pages']
page_data = next(iter(pages.values()))
if 'missing' in page_data:
return {}
# Extract links and images
links = []
if 'links' in page_data:
links = [link['title'] for link in page_data['links']]
images = []
if 'images' in page_data:
images = [img['title'] for img in page_data['images']]
categories = []
if 'categories' in page_data:
categories = [cat['title'].replace('Category:', '') for cat in page_data['categories']]
return {
'title': page_data.get('title', title),
'extract': page_data.get('extract', ''),
'pageid': page_data.get('pageid'),
'fullurl': page_data.get('fullurl', ''),
'categories': categories,
'links': links[:20], # Limit to first 20 links
'images': images,
'wordcount': len(page_data.get('extract', '').split()) if page_data.get('extract') else 0
}
except Exception as e:
logger.error(f"Error getting article content for {title}: {e}")
raise
async def get_daily_featured(
self,
date_param: Optional[Union[str, date]] = None,
lang: str = 'en'
) -> Dict[str, Any]:
"""
Get featured article of the day.
Args:
date_param: Date (YYYY/MM/DD format or date object), defaults to today
lang: Language code (default: 'en')
Returns:
Featured article data
        Raises:
            ValidationError: If the date string is not in YYYY/MM/DD format
            APIError: If API request fails
"""
if date_param is None:
date_param = date.today()
if isinstance(date_param, date):
date_str = date_param.strftime('%Y/%m/%d')
else:
# Validate date string format
try:
datetime.strptime(date_param, '%Y/%m/%d')
date_str = date_param
except ValueError:
raise ValidationError("Date must be in YYYY/MM/DD format")
try:
# Use REST API for featured content
url = f"https://{lang}.wikipedia.org/api/rest_v1/feed/featured/{date_str}"
response = await self._make_request(url)
if not response:
return {}
# Extract the featured article
if 'tfa' in response: # Today's Featured Article
tfa = response['tfa']
return {
'title': tfa.get('title', ''),
'extract': tfa.get('extract', ''),
'description': tfa.get('description', ''),
'content_urls': tfa.get('content_urls', {}),
'thumbnail': tfa.get('thumbnail', {}),
'date': date_str,
'type': 'featured_article'
}
return {}
except Exception as e:
logger.error(f"Error getting featured article for {date_str}: {e}")
raise
async def get_article_images(
self,
title: str,
lang: str = 'en'
) -> List[Dict[str, Any]]:
"""
Get article images.
Args:
title: Article title
lang: Language code (default: 'en')
Returns:
List of image data
Raises:
ValidationError: If title is invalid
APIError: If API request fails
"""
normalized_title = self._validate_title(title)
try:
# Get images using action API
params = {
'action': 'query',
'format': 'json',
            'titles': normalized_title,
'prop': 'images',
'imlimit': 10,
'utf8': 1
}
            response = await self._make_request(
                f"https://{lang}.wikipedia.org/w/api.php", params
            )
if 'query' not in response or 'pages' not in response['query']:
return []
pages = response['query']['pages']
page_data = next(iter(pages.values()))
if 'missing' in page_data or 'images' not in page_data:
return []
images = []
for img in page_data['images']:
img_title = img['title']
# Get image info
try:
img_params = {
'action': 'query',
'format': 'json',
'titles': img_title,
'prop': 'imageinfo',
'iiprop': 'url|size|mime',
'utf8': 1
}
                    img_response = await self._make_request(
                        f"https://{lang}.wikipedia.org/w/api.php", img_params
                    )
if 'query' in img_response and 'pages' in img_response['query']:
img_pages = img_response['query']['pages']
img_data = next(iter(img_pages.values()))
if 'imageinfo' in img_data and img_data['imageinfo']:
img_info = img_data['imageinfo'][0]
images.append({
'title': img_title,
'url': img_info.get('url', ''),
'width': img_info.get('width', 0),
'height': img_info.get('height', 0),
'mime': img_info.get('mime', '')
})
except Exception as e:
logger.warning(f"Failed to get info for image {img_title}: {e}")
continue
return images
except Exception as e:
logger.error(f"Error getting images for {title}: {e}")
raise
async def health_check(self) -> Dict[str, Any]:
"""
Perform health check on Wikipedia API.
Returns:
Health status information
"""
try:
start_time = datetime.now()
# Simple search to test API availability
await self.search_wikipedia('test', limit=1)
end_time = datetime.now()
response_time = (end_time - start_time).total_seconds()
return {
'status': 'healthy',
'response_time_seconds': response_time,
'timestamp': end_time.isoformat(),
'api_url': self.base_url
}
except Exception as e:
return {
'status': 'unhealthy',
'error': str(e),
'timestamp': datetime.now().isoformat(),
'api_url': self.base_url
}