Wikipedia MCP Server

by imajumd1
wikipedia_client.py (11.6 kB)
"""Wikipedia API client for the MCP server.""" import asyncio import re from typing import Dict, List, Optional from urllib.parse import quote import httpx from bs4 import BeautifulSoup from .models import ( SearchResult, WikipediaSearchResult, WikipediaArticle, WikipediaSummary, RelatedArticle, RelatedArticles, ) class WikipediaClient: """Async client for Wikipedia API interactions.""" def __init__(self): self.session = httpx.AsyncClient( timeout=30.0, follow_redirects=True, headers={ "User-Agent": "WikipediaMCPServer/0.1.0 (https://github.com/user/wikipedia-mcp-server)" } ) async def __aenter__(self): return self async def __aexit__(self, exc_type, exc_val, exc_tb): await self.session.aclose() def _get_api_url(self, language: str) -> str: """Get the API base URL for a given language.""" return f"https://{language}.wikipedia.org/w/rest.php/v1" def _get_page_url(self, language: str, title: str) -> str: """Get the full Wikipedia page URL.""" encoded_title = quote(title.replace(" ", "_")) return f"https://{language}.wikipedia.org/wiki/{encoded_title}" def _clean_html(self, html_content: str) -> str: """Clean HTML content and extract plain text.""" soup = BeautifulSoup(html_content, 'html.parser') # Remove unwanted elements for element in soup(['script', 'style', 'sup', 'table']): element.decompose() # Get text and clean up whitespace text = soup.get_text() text = re.sub(r'\n+', '\n', text) text = re.sub(r' +', ' ', text) return text.strip() def _extract_sections(self, html_content: str) -> List[str]: """Extract section headings from HTML content.""" soup = BeautifulSoup(html_content, 'html.parser') sections = [] for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): if heading.text.strip(): sections.append(heading.text.strip()) return sections def _extract_key_facts(self, content: str, limit: int = 5) -> List[str]: """Extract key facts from article content.""" # Simple fact extraction - get first few sentences of each paragraph paragraphs = content.split('\n\n') facts = [] for paragraph in paragraphs[:3]: # First 3 paragraphs if len(paragraph.strip()) > 50: # Skip very short paragraphs # Get first sentence sentences = paragraph.split('. ') if sentences: fact = sentences[0].strip() if fact and len(fact) > 20: facts.append(fact + ('.' 
if not fact.endswith('.') else '')) return facts[:limit] async def search_wikipedia( self, query: str, limit: int = 5, language: str = "en" ) -> WikipediaSearchResult: """Search Wikipedia for articles matching the query.""" try: api_url = self._get_api_url(language) url = f"{api_url}/search/page" response = await self.session.get(url, params={"q": query, "limit": limit}) response.raise_for_status() data = response.json() results = [] for page in data.get("pages", []): result = SearchResult( title=page.get("title", ""), snippet=page.get("excerpt", "").replace('<span class="searchmatch">', '').replace('</span>', ''), url=self._get_page_url(language, page.get("title", "")), page_id=page.get("id", 0) ) results.append(result) return WikipediaSearchResult( results=results, query=query, total_found=len(results) ) except Exception as e: raise Exception(f"Failed to search Wikipedia: {str(e)}") async def get_article( self, title: str, language: str = "en" ) -> WikipediaArticle: """Retrieve the full content of a Wikipedia article.""" try: api_url = self._get_api_url(language) encoded_title = quote(title) # Get page summary for metadata summary_url = f"{api_url}/page/{encoded_title}" summary_response = await self.session.get(summary_url) summary_response.raise_for_status() summary_data = summary_response.json() # Get HTML content html_url = f"{api_url}/page/{encoded_title}/html" html_response = await self.session.get(html_url) html_response.raise_for_status() html_content = html_response.text clean_content = self._clean_html(html_content) sections = self._extract_sections(html_content) return WikipediaArticle( title=summary_data.get("title", title), content=clean_content, url=self._get_page_url(language, title), last_modified=summary_data.get("latest", {}).get("timestamp", ""), page_id=summary_data.get("id", 0), word_count=len(clean_content.split()), sections=sections ) except Exception as e: raise Exception(f"Failed to get article '{title}': {str(e)}") async def get_summary( self, title: str, language: str = "en" ) -> WikipediaSummary: """Get a concise summary of a Wikipedia article.""" try: # Use the old API for summary since it's simpler api_base = f"https://{language}.wikipedia.org/w/api.php" params = { "action": "query", "format": "json", "titles": title, "prop": "extracts", "exintro": True, "explaintext": True, "exsectionformat": "plain" } response = await self.session.get(api_base, params=params) response.raise_for_status() data = response.json() pages = data.get("query", {}).get("pages", {}) if not pages: raise Exception(f"No article found for '{title}'") page_data = next(iter(pages.values())) if "missing" in page_data: raise Exception(f"Article '{title}' not found") summary_text = page_data.get("extract", "") page_id = page_data.get("pageid", 0) # Extract key facts from the summary key_facts = self._extract_key_facts(summary_text, limit=3) return WikipediaSummary( title=page_data.get("title", title), summary=summary_text, url=self._get_page_url(language, title), key_facts=key_facts, page_id=page_id ) except Exception as e: raise Exception(f"Failed to get summary for '{title}': {str(e)}") async def find_related( self, title: str, limit: int = 5, language: str = "en" ) -> RelatedArticles: """Find articles related to the given article.""" try: # Use a combination of links and categories to find related articles api_base = f"https://{language}.wikipedia.org/w/api.php" # First, get some links from the page params = { "action": "query", "format": "json", "titles": title, "prop": "links|categories", 
"pllimit": limit, "plnamespace": 0, # Main namespace only "cllimit": 3 # Get a few categories } response = await self.session.get(api_base, params=params) response.raise_for_status() data = response.json() pages = data.get("query", {}).get("pages", {}) related_articles = [] if not pages: return RelatedArticles(source_title=title, related=[], total_found=0) page_data = next(iter(pages.values())) # Get related articles from links links = page_data.get("links", []) categories = page_data.get("categories", []) # Process links first for link in links[:limit]: link_title = link.get("title", "") if link_title and len(related_articles) < limit: related_article = RelatedArticle( title=link_title, snippet=f"Article linked from {title}", url=self._get_page_url(language, link_title), page_id=0, # We don't have page ID from links API relation_type="linked_from" ) related_articles.append(related_article) # If we still need more, try to get articles from the same categories if len(related_articles) < limit and categories: category_title = categories[0].get("title", "").replace("Category:", "") if category_title: cat_params = { "action": "query", "format": "json", "list": "categorymembers", "cmtitle": f"Category:{category_title}", "cmnamespace": 0, "cmlimit": limit - len(related_articles) } cat_response = await self.session.get(api_base, params=cat_params) if cat_response.status_code == 200: cat_data = cat_response.json() cat_members = cat_data.get("query", {}).get("categorymembers", []) for member in cat_members: member_title = member.get("title", "") if member_title != title and len(related_articles) < limit: related_article = RelatedArticle( title=member_title, snippet=f"Article in category: {category_title}", url=self._get_page_url(language, member_title), page_id=member.get("pageid", 0), relation_type="category" ) related_articles.append(related_article) return RelatedArticles( source_title=title, related=related_articles, total_found=len(related_articles) ) except Exception as e: raise Exception(f"Failed to find related articles for '{title}': {str(e)}")
