wiki.py (7.35 kB)
# SPDX-License-Identifier: GPL-3.0-only OR MIT
"""
Arch Wiki interface module.

Provides search and page retrieval via MediaWiki API with BeautifulSoup fallback.
"""
import logging
from typing import List, Dict, Any, Optional

import httpx
from bs4 import BeautifulSoup
from markdownify import markdownify as md

from .utils import create_error_response

logger = logging.getLogger(__name__)

# Arch Wiki API endpoint
WIKI_API_URL = "https://wiki.archlinux.org/api.php"
WIKI_BASE_URL = "https://wiki.archlinux.org"

# HTTP client settings
DEFAULT_TIMEOUT = 10.0


async def search_wiki(query: str, limit: int = 10) -> Dict[str, Any]:
    """
    Search the Arch Wiki using MediaWiki API.

    Uses the opensearch action which returns suggestions.

    Args:
        query: Search term
        limit: Maximum number of results (default: 10)

    Returns:
        Dict containing search results with titles, snippets, and URLs
    """
    logger.info(f"Searching Arch Wiki for: {query}")

    params = {
        "action": "opensearch",
        "search": query,
        "limit": limit,
        "namespace": "0",  # Main namespace only
        "format": "json"
    }

    try:
        async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
            response = await client.get(WIKI_API_URL, params=params)
            response.raise_for_status()
            data = response.json()

            # OpenSearch returns: [query, [titles], [descriptions], [urls]]
            if len(data) >= 4:
                titles = data[1]
                descriptions = data[2]
                urls = data[3]

                results = [
                    {
                        "title": title,
                        "snippet": desc,
                        "url": url
                    }
                    for title, desc, url in zip(titles, descriptions, urls)
                ]

                logger.info(f"Found {len(results)} results for '{query}'")
                return {
                    "query": query,
                    "count": len(results),
                    "results": results
                }
            else:
                return {
                    "query": query,
                    "count": 0,
                    "results": []
                }

    except httpx.TimeoutException:
        logger.error(f"Wiki search timed out for query: {query}")
        return create_error_response(
            "TimeoutError",
            f"Arch Wiki search timed out for query: {query}",
            "The Wiki server did not respond in time. Try again later."
        )
    except httpx.HTTPStatusError as e:
        logger.error(f"Wiki search HTTP error: {e}")
        return create_error_response(
            "HTTPError",
            f"Wiki search failed with status {e.response.status_code}",
            str(e)
        )
    except Exception as e:
        logger.error(f"Wiki search failed: {e}")
        return create_error_response(
            "SearchError",
            f"Failed to search Arch Wiki: {str(e)}"
        )


async def get_wiki_page(title: str, as_markdown: bool = True) -> str:
    """
    Fetch a Wiki page using MediaWiki API, with scraping fallback.

    Args:
        title: Page title (e.g., "Installation_guide")
        as_markdown: Convert HTML to Markdown (default: True)

    Returns:
        Page content as Markdown or HTML string
    """
    logger.info(f"Fetching Wiki page: {title}")

    # Try MediaWiki API first
    content = await _fetch_via_api(title)

    # Fallback to scraping if API fails
    if content is None:
        logger.warning(f"API fetch failed for {title}, falling back to scraping")
        content = await _fetch_via_scraping(title)

    if content is None:
        error_msg = f"Page '{title}' not found or could not be retrieved"
        logger.error(error_msg)
        raise ValueError(error_msg)

    # Convert to Markdown if requested
    if as_markdown and content:
        try:
            content = md(content, heading_style="ATX", strip=['script', 'style'])
        except Exception as e:
            logger.warning(f"Markdown conversion failed: {e}, returning HTML")

    return content


async def _fetch_via_api(title: str) -> Optional[str]:
    """
    Fetch page content via MediaWiki API.

    Args:
        title: Page title

    Returns:
        HTML content or None if failed
    """
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text",
        "disableeditsection": "1",
        "disabletoc": "1"
    }

    try:
        async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
            response = await client.get(WIKI_API_URL, params=params)
            response.raise_for_status()
            data = response.json()

            # Check for errors in response
            if "error" in data:
                logger.warning(f"API error: {data['error'].get('info', 'Unknown error')}")
                return None

            # Extract HTML content
            if "parse" in data and "text" in data["parse"]:
                html_content = data["parse"]["text"]["*"]
                logger.info(f"Successfully fetched {title} via API")
                return html_content

            return None

    except Exception as e:
        logger.warning(f"API fetch failed for {title}: {e}")
        return None


async def _fetch_via_scraping(title: str) -> Optional[str]:
    """
    Fetch page content via direct HTTP scraping (fallback).

    Args:
        title: Page title

    Returns:
        HTML content or None if failed
    """
    # Construct URL
    url = f"{WIKI_BASE_URL}/title/{title}"

    try:
        async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.text, 'lxml')

            # Find main content div
            content_div = soup.find('div', {'id': 'bodyContent'})

            if content_div:
                # Remove unnecessary elements
                for element in content_div.find_all(['script', 'style', 'nav']):
                    element.decompose()

                logger.info(f"Successfully scraped {title}")
                return str(content_div)

            return None

    except httpx.HTTPStatusError as e:
        if e.response.status_code == 404:
            logger.error(f"Page not found: {title}")
        else:
            logger.error(f"HTTP error scraping {title}: {e}")
        return None
    except Exception as e:
        logger.error(f"Scraping failed for {title}: {e}")
        return None


async def get_wiki_page_as_text(title: str) -> str:
    """
    Convenience wrapper to get Wiki page as clean Markdown text.

    Args:
        title: Page title

    Returns:
        Markdown content

    Raises:
        ValueError: If page cannot be retrieved
    """
    return await get_wiki_page(title, as_markdown=True)
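
A minimal usage sketch of the module's public functions (illustrative only, not part of wiki.py; the import path arch_mcp.wiki is an assumption about the package layout):

# example_usage.py -- illustrative sketch, not part of wiki.py
import asyncio

from arch_mcp.wiki import search_wiki, get_wiki_page  # assumed import path


async def main() -> None:
    # Search for pages about pacman, then fetch one page as Markdown.
    search = await search_wiki("pacman", limit=5)
    for result in search.get("results", []):
        print(result["title"], "->", result["url"])

    # get_wiki_page raises ValueError if both the API and the scraping
    # fallback fail, so callers may want to handle that case.
    try:
        page = await get_wiki_page("Pacman", as_markdown=True)
        print(page[:500])
    except ValueError as exc:
        print(f"Could not retrieve page: {exc}")


if __name__ == "__main__":
    asyncio.run(main())

Note that search failures are reported via the create_error_response dict rather than an exception, while page retrieval raises ValueError, so the two calls are handled differently above.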
