
1mcpserver

scrape.py (5.91 kB)
import requests
import re
import os
import sqlite3
from typing import List
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

load_dotenv()
GH_TOKEN = os.getenv("GITHUB_TOKEN")

# Constants
HEADER = {
    "Authorization": f"Bearer {GH_TOKEN}",  # GitHub also accepts the "token" scheme for PATs
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    )
}
DB_PATH = 'db/server_list.db'
TXT_PATH = 'db/mcp_servers.txt'
INDEX_DIR = "db/faiss_index"


# Scraping functions
def clean_text(text: str) -> str:
    """
    Remove HTML tags, bold markdown, newlines, and non-ASCII characters
    (including emoji) from the given text.
    """
    clean = re.sub(r"<[^>]+>", "", text)
    clean = clean.replace("**", "").replace("__", "").replace("\n", "")
    clean = re.sub(r"[^\x00-\x7F]+", "", clean)
    return clean


def get_source1() -> List[str]:
    repo_url = (
        "https://raw.githubusercontent.com/punkpeye/awesome-mcp-servers/refs/heads/main/README.md"
    )
    response = requests.get(repo_url, headers=HEADER)
    section = response.text.split("## Server Implementations", 1)[1]
    section = section.split("## Frameworks", 1)[0]
    lines = [clean_text(ln) for ln in section.splitlines() if ln.startswith("- ")]
    return lines


def get_source2() -> List[str]:
    repo_url = "https://raw.githubusercontent.com/metorial/mcp-containers/refs/heads/main/README.md"
    response = requests.get(repo_url, headers=HEADER)
    text = response.text
    text = re.sub(r'<img[^>]*>', '', text)
    text = re.sub(r'\*\*', '', text)
    section = text.split("## Featured Servers", 1)[1]
    section = section.split("# License", 1)[0].replace("## Available Servers", '')
    lines = [clean_text(ln) for ln in section.split("\n\n") if ln.strip().startswith("- ")]
    return lines


def get_source3() -> List[str]:
    repo_url = "https://raw.githubusercontent.com/wong2/awesome-mcp-servers/refs/heads/main/README.md"
    response = requests.get(repo_url, headers=HEADER)
    section = response.text.split("## Official Servers", 1)[1]
    section = section.split("## Clients", 1)[0].replace("## Community Servers", '')
    lines = [clean_text(ln) for ln in section.splitlines() if ln.strip().startswith("- ")]
    return lines


def get_all_sources() -> List[str]:
    s1 = get_source1()
    s2 = get_source2()
    s3 = get_source3()
    return s1 + s2 + s3


# Database functions
def create_db_and_table(db_path):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS servers (
            name TEXT PRIMARY KEY,
            description TEXT,
            url TEXT
        )
    ''')
    conn.commit()
    conn.close()


def read_servers_from_txt(txt_path):
    servers = []
    if not os.path.exists(txt_path):
        return servers
    with open(txt_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # Expect entries of the form "- [name](url) - description"
            match = re.match(r'- \[([^\]]+)\]\(([^)]+)\)\s*-\s*(.+)', line)
            if not match:
                continue
            name, url, description = match.groups()
            servers.append((name, description, url))
    return servers


def update_db(db_path, servers):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    for name, description, url in servers:
        c.execute('SELECT 1 FROM servers WHERE url = ?', (url,))
        if c.fetchone() is None:
            try:
                response = requests.get(url, headers=HEADER, timeout=10)
                if response.status_code == 200:
                    try:
                        c.execute('''
                            INSERT INTO servers (name, description, url)
                            VALUES (?, ?, ?)
                        ''', (name, description, url))
                        print(f"Added: {name}, {url} to {DB_PATH}")
                    except sqlite3.IntegrityError:
                        print(f"Name conflict: {name}, {url}")
                elif response.status_code == 404:
                    # Remove without logging
                    c.execute('DELETE FROM servers WHERE url = ?', (url,))
                elif response.status_code == 403:
                    print(f"Access denied for {url}: HTTP 403 Forbidden, skipping.")
                else:
                    print(f"Skipping {url}: HTTP {response.status_code}")
            except Exception as e:
                print(f"Error accessing {url}: {e}")
    conn.commit()
    conn.close()


def generate_embeddings(db_path):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('SELECT name, description, url FROM servers')
    rows = c.fetchall()
    conn.close()
    docs = [
        Document(page_content=desc, metadata={"name": name, "url": url})
        for name, desc, url in rows
    ]
    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_documents(docs, embeddings)
    vector_store.save_local(INDEX_DIR)
    return vector_store


# Main workflow
if __name__ == '__main__':
    # 1. Scrape and write to text file
    sources = get_all_sources()
    with open(TXT_PATH, 'w', encoding='utf-8') as f:
        f.write("\n".join(sources))
    print(f"Scraped {len(sources)} server entries to {TXT_PATH}")

    # 2. Initialize DB
    create_db_and_table(DB_PATH)

    # 3. Clean up unreachable entries from the previous database --> moved to maintain.py

    # 4. Read scraped entries and update DB
    servers = read_servers_from_txt(TXT_PATH)
    if servers:
        update_db(DB_PATH, servers)
    else:
        print(f"No valid servers found in {TXT_PATH}")

    # 5. Generate and save embeddings
    generate_embeddings(DB_PATH)
    print("Finished scraping, DB update, and embedding generation.")
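The script ends by saving a FAISS index of server descriptions to db/faiss_index. As a rough sketch of how that index could be consumed downstream (this companion script is hypothetical and not part of the repo; it assumes the same INDEX_DIR and OpenAI embeddings, and note that recent langchain_community releases require the allow_dangerous_deserialization flag because the index is pickled on disk):

# query_index.py - hypothetical companion script, not part of scrape.py
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

INDEX_DIR = "db/faiss_index"  # same path scrape.py saves to

embeddings = OpenAIEmbeddings()
# Required by recent langchain_community versions when loading a local pickle.
store = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)

# Find the server descriptions most similar to an arbitrary query string.
for doc in store.similarity_search("web scraping MCP server", k=3):
    print(doc.metadata.get("name"), doc.metadata.get("url"))
    print("   ", doc.page_content)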

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/particlefuture/1mcpserver'
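The same endpoint can be called from Python. A minimal sketch using requests; the response body is printed as raw JSON since its exact schema is not shown here:

import requests

# Example call to the MCP directory API for this server.
url = "https://glama.ai/api/mcp/v1/servers/particlefuture/1mcpserver"
response = requests.get(url, timeout=10)
response.raise_for_status()
print(response.json())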

If you have feedback or need assistance with the MCP directory API, please join our Discord server.