scraper.py
#!/usr/bin/env python3
"""
Daraja API Documentation Scraper

This script scrapes comprehensive documentation from Safaricom's Daraja API
portal, including all API endpoints, images, and metadata. The scraped data
is formatted for consumption by the MCP server.

Features:
- Automated authentication with session persistence
- Complete documentation scraping for 22+ APIs
- Image downloading with local referencing
- Markdown conversion with clean formatting
- Structured data index generation

Usage:
    python scraper.py

Requirements:
- Python 3.8+
- Playwright browser automation
- BeautifulSoup for HTML parsing
- Markdownify for HTML to Markdown conversion

Author: Daraja Documentation Team
License: MIT
"""

import asyncio
import os
import re
import json
import base64
from urllib.parse import urljoin, urlparse

from playwright.async_api import async_playwright
from markdownify import markdownify as md
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
BASE_URL = "https://developer.safaricom.co.ke"
OUTPUT_DIR = "daraja_docs_v3"
DOCS_DIR = os.path.join(OUTPUT_DIR, "docs")
IMG_DIR = os.path.join(OUTPUT_DIR, "images")
AUTH_FILE = "auth.json"

# exact links provided
URLS = [
    "https://developer.safaricom.co.ke/apis/Authorization",
    "https://developer.safaricom.co.ke/apis/DynamicQRCode",
    "https://developer.safaricom.co.ke/apis/CustomerToBusiness",
    "https://developer.safaricom.co.ke/apis/CustomerToBusinessRegisterURL",
    "https://developer.safaricom.co.ke/apis/MpesaExpressSimulate",
    "https://developer.safaricom.co.ke/apis/MpesaExpressQuery",
    "https://developer.safaricom.co.ke/apis/BusinessToCustomer",
    "https://developer.safaricom.co.ke/apis/TransactionStatus",
    "https://developer.safaricom.co.ke/apis/AccountBalance",
    "https://developer.safaricom.co.ke/apis/Reversal",
    "https://developer.safaricom.co.ke/apis/TaxRemittance",
    "https://developer.safaricom.co.ke/apis/BusinessPayBill",
    "https://developer.safaricom.co.ke/apis/BusinessBuyGoods",
    "https://developer.safaricom.co.ke/apis/BillManager",
    "https://developer.safaricom.co.ke/apis/B2BExpressCheckout",
    "https://developer.safaricom.co.ke/apis/PullTransaction",
    "https://developer.safaricom.co.ke/apis/BusinessToPochi",
    "https://developer.safaricom.co.ke/apis/Swap",
    "https://developer.safaricom.co.ke/apis/IMSI",
    "https://developer.safaricom.co.ke/apis/B2CAccountTopUp",
    "https://developer.safaricom.co.ke/apis/MpesaRatiba",
    "https://developer.safaricom.co.ke/apis/IotSimManagement",
]


async def download_image(page, img_url, local_filename):
    """
    Downloads an image using the browser context to share cookies and authentication.

    Args:
        page: Playwright page object with authentication context
        img_url: URL of the image to download
        local_filename: Local path where the image should be saved

    Returns:
        bool: True if the download succeeded, False otherwise
    """
    try:
        # If it's a base64 string, save directly
        if img_url.startswith("data:image"):
            header, encoded = img_url.split(",", 1)
            data = base64.b64decode(encoded)
            with open(local_filename, "wb") as f:
                f.write(data)
            return True

        # Use Playwright's API request context to fetch with current cookies
        response = await page.request.get(img_url)
        if response.status == 200:
            data = await response.body()
            with open(local_filename, "wb") as f:
                f.write(data)
            return True
    except Exception as e:
        print(f" Warning: Failed to download image {img_url}: {e}")
    return False


async def run():
    """
    Main scraper function that handles authentication, scraping, and data processing.

    This function:
    1. Sets up output directories
    2. Handles browser authentication with session persistence
    3. Scrapes all configured Daraja API endpoints
    4. Downloads and processes images with local referencing
    5. Converts HTML to markdown format
    6. Generates a structured index for MCP server consumption
    """
    # Setup directories
    os.makedirs(DOCS_DIR, exist_ok=True)
    os.makedirs(IMG_DIR, exist_ok=True)

    async with async_playwright() as p:
        # Browser Setup - Keep visible for login debugging
        browser = await p.chromium.launch(headless=False)

        # Load existing authentication or create new context
        if os.path.exists(AUTH_FILE) and os.path.getsize(AUTH_FILE) > 0:
            try:
                # Verify the auth file contains valid JSON
                with open(AUTH_FILE, 'r') as f:
                    json.load(f)
                print("Loading saved authentication session...")
                context = await browser.new_context(storage_state=AUTH_FILE)
            except (json.JSONDecodeError, Exception) as e:
                print(f"Saved session is corrupted ({e}). Starting fresh...")
                context = await browser.new_context()
        else:
            print("No session found. Creating new context.")
            context = await browser.new_context()

        page = await context.new_page()

        # Check authentication status with first URL
        print("Checking login status...")
        await page.goto(URLS[0], timeout=60000)
        await asyncio.sleep(3)  # Wait for UI to settle

        # Prompt for manual login if needed
        print("\nACTION REQUIRED: Check the browser window.")
        print("If you are not logged in, please log in now.")
        print("Press ENTER here once you can see the API documentation on screen.")
        input()

        # Save session for future runs
        await context.storage_state(path=AUTH_FILE)
        print("Session saved for future use.")

        index_data = []
        print(f"Starting scrape of {len(URLS)} API endpoints...")

        for url in URLS:
            api_name = url.split("/")[-1]
            print(f"\nProcessing: {api_name}...")

            try:
                await page.goto(url, timeout=60000)
                await page.wait_for_load_state("networkidle")
                await page.wait_for_timeout(2000)  # Grace period for rendering

                # Extract HTML content from main container
                content_html = await page.evaluate("""() => {
                    const main = document.querySelector('main')
                        || document.querySelector('.api-details-container')
                        || document.body;
                    return main.innerHTML;
                }""")

                # Parse HTML with BeautifulSoup for image processing
                soup = BeautifulSoup(content_html, "html.parser")

                # Process and download images
                images = soup.find_all("img")
                for i, img in enumerate(images):
                    src = img.get("src")
                    if not src:
                        continue

                    # Determine file extension
                    ext = "png"  # Default fallback
                    if ".svg" in src:
                        ext = "svg"
                    elif ".jpg" in src or ".jpeg" in src:
                        ext = "jpg"

                    img_filename = f"{api_name}_img_{i}.{ext}"
                    local_path = os.path.join(IMG_DIR, img_filename)

                    # Resolve full URL for downloading
                    full_img_url = urljoin(url, src)

                    # Download image and update reference
                    success = await download_image(page, full_img_url, local_path)
                    if success:
                        # Update HTML to point to local relative path for Markdown
                        img['src'] = f"../images/{img_filename}"
                    else:
                        img['src'] = full_img_url  # Fallback to remote URL

                # Convert to Markdown format
                markdown_text = md(str(soup), heading_style="ATX", code_language="json")

                # Clean up excessive newlines
                markdown_text = re.sub(r'\n\s*\n', '\n\n', markdown_text)

                # Save Markdown file with header
                md_filename = os.path.join(DOCS_DIR, f"{api_name}.md")
                header = f"# {api_name}\n**Source:** {url}\n\n---\n\n"
                with open(md_filename, "w", encoding="utf-8") as f:
                    f.write(header + markdown_text)
                print(f" Saved documentation: {api_name}.md")

                # Add to index for MCP server
                index_data.append({
                    "name": api_name,
                    "url": url,
                    "local_path": f"docs/{api_name}.md",
                    "description": f"Documentation for {api_name}"
                })
            except Exception as e:
                print(f" Error processing {api_name}: {e}")

        # Save index file for MCP server consumption
        with open(os.path.join(OUTPUT_DIR, "data_index.json"), "w") as f:
            json.dump(index_data, f, indent=2)

        print(f"\nScraping completed successfully.")
        print(f"Documentation stored in: {OUTPUT_DIR}/")
        print(f"Total APIs processed: {len(index_data)}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(run())

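On the consuming side, here is a minimal sketch of how the MCP server (or any other reader) might load this output. The load_index and read_doc helpers are hypothetical names and are not part of scraper.py; only the file layout (data_index.json plus docs/<ApiName>.md under daraja_docs_v3/) comes from the code above.

# consume_index.py (hypothetical example, not part of this repository)
import json
import os

OUTPUT_DIR = "daraja_docs_v3"  # must match the scraper's OUTPUT_DIR


def load_index(output_dir=OUTPUT_DIR):
    """Load the scraped API list; each entry has name, url, local_path, description."""
    with open(os.path.join(output_dir, "data_index.json"), "r", encoding="utf-8") as f:
        return json.load(f)


def read_doc(entry, output_dir=OUTPUT_DIR):
    """Return the Markdown text for one index entry, e.g. the Authorization API."""
    with open(os.path.join(output_dir, entry["local_path"]), "r", encoding="utf-8") as f:
        return f.read()


if __name__ == "__main__":
    for entry in load_index():
        print(f"{entry['name']}: {entry['url']} -> {entry['local_path']}")

Keeping the index as a flat JSON list means a consumer can enumerate and expose each API's documentation without re-reading every Markdown file up front.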