VideoDB Director

Official
by video-db
crawl_coda_tree.py (4.04 kB)
import argparse
import json
import sys

import requests
from bs4 import BeautifulSoup

# Constants
DEFAULT_URL = "https://docs.videodb.io"
HTML_PARSER = "html.parser"
DEFAULT_SELECTOR = "data-coda-ui-id"
DEFAULT_SELECTOR_VALUE = (
    "page-list"  # Default attribute value to locate the parent element
)


def find_a_tags_with_depth(parent_tag, depth=0):
    """
    Recursively find all <a> tags within a parent tag and track their depth level.
    """
    results = []
    for child in parent_tag.find_all(recursive=False):  # Iterate over direct children
        if child.name == "a":
            results.append((child, depth))  # Store <a> tag with its depth
        results.extend(find_a_tags_with_depth(child, depth + 1))  # Recurse deeper
    return results


def list_to_nested_json(data):
    """
    Convert a list of tuples (element, depth) into a nested JSON-like structure,
    where items with the smallest depth are at the top level and items of the
    same depth become siblings.

    Parameters:
        data (list of tuple): Each tuple is (element, depth)

    Returns:
        list: A list of nested dictionaries representing the JSON structure.
    """
    result = []
    stack = []

    for element, depth in data:
        node = {
            "element": element.get_text(strip=True),
            "href": element.get("href"),
            "children": [],
        }

        # Adjust the stack to match the current depth
        while stack and stack[-1][1] >= depth:
            stack.pop()

        if stack:
            parent_node, _ = stack[-1]
            parent_node["children"].append(node)
        else:
            result.append(node)

        stack.append((node, depth))

    return result


def fetch_and_parse(url):
    """
    Fetch the webpage content from the given URL and parse it with BeautifulSoup.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.text, HTML_PARSER)
    else:
        raise Exception(
            f"Failed to fetch the webpage. Status code: {response.status_code}"
        )


def scrape_and_save(
    output_file,
    url=DEFAULT_URL,
    selector=DEFAULT_SELECTOR,
    selector_value=DEFAULT_SELECTOR_VALUE,
):
    """
    Scrape the webpage, convert <a> tags into a nested JSON structure,
    and save it to a file.

    Parameters:
        output_file (str): Path to the output JSON file.
        url (str): URL of the docs page to scrape.
        selector (str): HTML attribute name to locate the parent element.
        selector_value (str): Value for the attribute selector.
    """
    soup = fetch_and_parse(url)
    parent_tag = soup.find(attrs={selector: selector_value})
    if not parent_tag:
        raise Exception(f"Element with {selector}='{selector_value}' not found.")

    a_tags_with_levels = find_a_tags_with_depth(parent_tag)
    nested_json = list_to_nested_json(a_tags_with_levels)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(nested_json, f, indent=4)


def main():
    parser = argparse.ArgumentParser(
        description=(
            "Scrape a webpage, convert <a> tags into a nested JSON structure, "
            "and save it to a file."
        )
    )
    parser.add_argument("output", help="Path to the output JSON file")
    parser.add_argument(
        "--url",
        default=DEFAULT_URL,
        help="URL of the docs page (default: %(default)s)",
    )
    parser.add_argument(
        "--selector",
        default=DEFAULT_SELECTOR,
        help="Attribute selector to locate the parent element (default: %(default)s)",
    )
    parser.add_argument(
        "--selector-value",
        default=DEFAULT_SELECTOR_VALUE,
        help="Value for the attribute selector (default: %(default)s)",
    )
    args = parser.parse_args()

    try:
        scrape_and_save(
            args.output,
            url=args.url,
            selector=args.selector,
            selector_value=args.selector_value,
        )
    except Exception as e:
        sys.exit(str(e))


if __name__ == "__main__":
    main()
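To see the traversal and nesting logic in action, here is a minimal sketch. It assumes crawl_coda_tree.py is importable from the working directory; the HTML fragment is invented for illustration and is not the actual markup of docs.videodb.io.

# Minimal sketch, assuming crawl_coda_tree.py sits in the current directory.
# The HTML fragment below is a made-up example of a nested page list.
import json

from bs4 import BeautifulSoup

from crawl_coda_tree import find_a_tags_with_depth, list_to_nested_json

html = """
<div data-coda-ui-id="page-list">
  <a href="/intro">Introduction</a>
  <div>
    <a href="/quickstart">Quickstart</a>
    <div>
      <a href="/quickstart/upload">Upload a video</a>
    </div>
  </div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
parent = soup.find(attrs={"data-coda-ui-id": "page-list"})

# Each entry is (<a> tag, depth of that tag below the parent element):
# here (Introduction, 0), (Quickstart, 1), (Upload a video, 2).
pairs = find_a_tags_with_depth(parent)

# The stack-based conversion turns those depths into parent/child nesting,
# so "Upload a video" ends up under "Quickstart".
print(json.dumps(list_to_nested_json(pairs), indent=2))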

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/video-db/agent-toolkit'
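The same request can be made from Python; a minimal sketch using requests (the endpoint URL comes from the curl example above, and since the response schema is not documented here, the JSON is simply pretty-printed):

import json

import requests

url = "https://glama.ai/api/mcp/v1/servers/video-db/agent-toolkit"
response = requests.get(url, timeout=10)
response.raise_for_status()  # Fail loudly on any non-2xx status
print(json.dumps(response.json(), indent=2))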

If you have feedback or need assistance with the MCP directory API, please join our Discord server.