librarian

librarian
wiki

api.py

api.py•13.4 KiB

"""
Wikipedia API Module

This module provides common methods for interacting with Wikipedia using the
MediaWiki API. It includes functions for searching, retrieving page content,
getting summaries, and more.
"""

from typing import Any
from urllib.parse import quote_plus

import httpx

from logging_config import get_logger

logger = get_logger(__name__)

# Failures that every public method converts into its "nothing found" value.
# httpx.HTTPError is the common base of httpx.RequestError (transport
# problems) and httpx.HTTPStatusError (raised by raise_for_status() on
# 4xx/5xx responses); ValueError covers a body that is not valid JSON.
_FETCH_ERRORS = (httpx.HTTPError, ValueError)


class WikipediaAPI:
    """A class to interact with Wikipedia's MediaWiki API."""

    def __init__(self, language: str = "en"):
        """
        Initialize the Wikipedia API client.

        Args:
            language (str): Language code for Wikipedia (e.g., 'en', 'es', 'fr')
        """
        self.language = language
        self.base_url = f"https://{language}.wikipedia.org/api/rest_v1"
        self.api_url = f"https://{language}.wikipedia.org/w/api.php"

        # Initialize HTTP client
        self.client = httpx.Client()
        self.client.headers.update({
            'User-Agent': 'Librarian/1.0 (https://github.com/user/librarian)'
        })

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - close the client."""
        self.close()

    def close(self):
        """Explicitly close the HTTP client."""
        self.client.close()

    def _get_json(self, url: str, params: dict[str, Any] | None = None) -> Any:
        """
        Perform a GET request and return the decoded JSON body.

        Args:
            url (str): Absolute URL to request
            params (dict | None): Query-string parameters, if any

        Returns:
            The decoded JSON document

        Raises:
            httpx.HTTPError: On transport failures or a 4xx/5xx status
            ValueError: If the response body is not valid JSON
        """
        response = self.client.get(url, params=params)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _existing_pages(data: dict[str, Any]) -> list[dict[str, Any]]:
        """
        Extract the entries of ``query.pages`` that refer to existing pages.

        The API reports titles it could not resolve under negative page ids
        ('-1', '-2', ...), so those entries are skipped.

        Args:
            data (dict): Decoded response of an ``action=query`` request

        Returns:
            list[dict]: Page records in response order (possibly empty)
        """
        pages = data.get('query', {}).get('pages', {})
        return [
            page_data
            for page_id, page_data in pages.items()
            if not page_id.startswith('-')
        ]

    def search(self, query: str, limit: int = 10) -> list[dict[str, Any]]:
        """
        Search for Wikipedia articles.

        Args:
            query (str): Search query
            limit (int): Maximum number of results to return

        Returns:
            list[dict]: List of search results with title and snippet;
                empty if the request fails
        """
        params = {
            'action': 'query',
            'format': 'json',
            'list': 'search',
            'srsearch': query,
            'srlimit': limit,
            'srprop': 'snippet|titlesnippet|size|wordcount|timestamp'
        }

        try:
            data = self._get_json(self.api_url, params)
        except _FETCH_ERRORS as e:
            logger.error(f"Error searching Wikipedia: {e}")
            return []

        return data.get('query', {}).get('search', [])

    def get_page_summary(self, title: str) -> dict[str, Any] | None:
        """
        Get a summary of a Wikipedia page.

        Args:
            title (str): Page title

        Returns:
            dict | None: Page summary including extract and basic info, or
                None if the request fails (including a 404 for an unknown
                title)
        """
        # Spaces become underscores first, so quote_plus never emits '+';
        # it is used because it also escapes '/' inside the path segment.
        encoded_title = quote_plus(title.replace(' ', '_'))
        url = f"{self.base_url}/page/summary/{encoded_title}"

        try:
            return self._get_json(url)
        except _FETCH_ERRORS as e:
            logger.error(f"Error getting page summary for '{title}': {e}")
            return None

    def get_page_content(self, title: str) -> str | None:
        """
        Get the full content of a Wikipedia page.

        Args:
            title (str): Page title

        Returns:
            str | None: Page content in wikitext format, or None if the page
                does not exist, has no revisions, or the request fails
        """
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'revisions',
            'rvprop': 'content',
            'rvslots': 'main'
        }

        try:
            data = self._get_json(self.api_url, params)
        except _FETCH_ERRORS as e:
            logger.error(f"Error getting page content for '{title}': {e}")
            return None

        for page_data in self._existing_pages(data):
            revisions = page_data.get('revisions', [])
            if revisions:
                return revisions[0]['slots']['main']['*']
        return None

    def get_page_sections(self, title: str) -> list[dict[str, Any]]:
        """
        Get the table of contents (section structure) of a Wikipedia page.

        Args:
            title (str): Page title

        Returns:
            list[dict]: List of sections with index, title, level, anchor
                and number; empty if the page cannot be parsed or the
                request fails
        """
        params = {
            'action': 'parse',
            'format': 'json',
            'page': title,
            'prop': 'sections'
        }

        try:
            data = self._get_json(self.api_url, params)
        except _FETCH_ERRORS as e:
            logger.error(f"Error getting page sections for '{title}': {e}")
            return []

        if 'parse' not in data or 'sections' not in data['parse']:
            return []

        # Format sections for easier use
        return [
            {
                'index': section.get('index', ''),
                'title': section.get('line', ''),
                'level': int(section.get('level', 1)),
                'anchor': section.get('anchor', ''),
                'number': section.get('number', '')
            }
            for section in data['parse']['sections']
        ]

    def get_page_sections_content(
        self, title: str, section_indices: list[str]
    ) -> dict[str, str | None]:
        """
        Get the content of specific sections from a Wikipedia page.

        One request is issued per section index.

        Args:
            title (str): Page title
            section_indices (list[str]): List of section indices to retrieve

        Returns:
            dict[str, str | None]: Dictionary mapping section indices to
                their wikitext, or to None when a section could not be
                retrieved
        """
        result: dict[str, str | None] = {}

        for section_index in section_indices:
            params = {
                'action': 'parse',
                'format': 'json',
                'page': title,
                'section': section_index,
                'prop': 'wikitext'
            }

            try:
                data = self._get_json(self.api_url, params)
            except _FETCH_ERRORS as e:
                logger.error(f"Error getting section {section_index} for '{title}': {e}")
                result[section_index] = None
                continue

            if 'parse' in data and 'wikitext' in data['parse']:
                result[section_index] = data['parse']['wikitext']['*']
            else:
                result[section_index] = None

        return result

    def get_page_sections_content_by_title(
        self, title: str, section_titles: list[str]
    ) -> dict[str, str | None]:
        """
        Get the content of specific sections from a Wikipedia page by section titles.

        Titles are matched case-insensitively; when several sections share a
        title, the last one on the page is used.

        Args:
            title (str): Page title
            section_titles (list[str]): List of section titles to retrieve

        Returns:
            dict[str, str | None]: Dictionary mapping the requested section
                titles that were found to their content (None when that
                section could not be retrieved)
        """
        # First get all sections to find the indices
        sections = self.get_page_sections(title)
        if not sections:
            return {}

        # Lower-case each requested title once instead of once per section.
        wanted = [(target, target.lower()) for target in section_titles]

        # Create a mapping of section titles to indices
        title_to_index: dict[str, str] = {}
        for section in sections:
            section_title = section['title'].strip().lower()
            for target, target_lower in wanted:
                if section_title == target_lower:
                    title_to_index[target] = section['index']

        # Get content for found sections
        if not title_to_index:
            return {}

        # De-duplicate so a section requested under two spellings is
        # fetched only once.
        indices = list(dict.fromkeys(title_to_index.values()))
        content_by_index = self.get_page_sections_content(title, indices)

        # Map back to section titles
        return {
            section_title: content_by_index.get(section_index)
            for section_title, section_index in title_to_index.items()
        }

    def get_page_categories(self, title: str) -> list[str]:
        """
        Get categories for a Wikipedia page.

        Args:
            title (str): Page title

        Returns:
            list[str]: List of category names without the 'Category:'
                prefix; empty if the page does not exist or the request
                fails
        """
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'categories',
            'cllimit': 'max'
        }

        try:
            data = self._get_json(self.api_url, params)
        except _FETCH_ERRORS as e:
            logger.error(f"Error getting categories for '{title}': {e}")
            return []

        for page_data in self._existing_pages(data):
            categories = page_data.get('categories', [])
            return [cat['title'].replace('Category:', '') for cat in categories]
        return []

    def get_page_links(self, title: str, limit: int = 100) -> list[str]:
        """
        Get links from a Wikipedia page.

        Args:
            title (str): Page title
            limit (int): Maximum number of links to return

        Returns:
            list[str]: List of linked page titles; empty if the page does
                not exist or the request fails
        """
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'links',
            'pllimit': limit,
            'plnamespace': 0  # Main namespace only
        }

        try:
            data = self._get_json(self.api_url, params)
        except _FETCH_ERRORS as e:
            logger.error(f"Error getting links for '{title}': {e}")
            return []

        for page_data in self._existing_pages(data):
            links = page_data.get('links', [])
            return [link['title'] for link in links]
        return []

    def get_page_images(self, title: str) -> list[dict[str, str]]:
        """
        Get images from a Wikipedia page.

        Args:
            title (str): Page title

        Returns:
            list[dict]: List of image information (one ``{'title': ...}``
                dict per image); empty if the page does not exist or the
                request fails
        """
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'images'
        }

        try:
            data = self._get_json(self.api_url, params)
        except _FETCH_ERRORS as e:
            logger.error(f"Error getting images for '{title}': {e}")
            return []

        for page_data in self._existing_pages(data):
            images = page_data.get('images', [])
            return [{'title': img['title']} for img in images]
        return []

    def page_exists(self, title: str) -> bool:
        """
        Check if a Wikipedia page exists.

        Args:
            title (str): Page title

        Returns:
            bool: True if page exists, False otherwise (including when the
                response contains no page entry at all or the request fails)
        """
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title
        }

        try:
            data = self._get_json(self.api_url, params)
        except _FETCH_ERRORS as e:
            logger.error(f"Error checking if page exists '{title}': {e}")
            return False

        # An empty 'pages' mapping must not count as "exists", so require a
        # positive hit rather than the mere absence of '-1'.
        return bool(self._existing_pages(data))

    def get_page_info(self, title: str) -> dict[str, Any] | None:
        """
        Get basic information about a Wikipedia page.

        Args:
            title (str): Page title

        Returns:
            dict | None: Page information including length, last modified,
                etc., or None if the page does not exist or the request
                fails
        """
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'info',
            'inprop': 'url|displaytitle|length|touched'
        }

        try:
            data = self._get_json(self.api_url, params)
        except _FETCH_ERRORS as e:
            logger.error(f"Error getting page info for '{title}': {e}")
            return None

        for page_data in self._existing_pages(data):
            return page_data
        return None


if __name__ == "__main__":
    pass

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mlziade/librarian'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

api.py•13.4 KiB