"""
crawl_coda_tree.py

Scrape a webpage, convert <a> tags into a nested JSON structure, and save
it to a file.
"""
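
# Example invocation (the output filename here is illustrative):
#   python crawl_coda_tree.py coda_tree.json
#   python crawl_coda_tree.py coda_tree.json --url https://docs.videodb.io --selector-value page-list
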
import argparse
import json
import sys

import requests
from bs4 import BeautifulSoup

# Constants
DEFAULT_URL = "https://docs.videodb.io"
HTML_PARSER = "html.parser"
DEFAULT_SELECTOR = "data-coda-ui-id"
DEFAULT_SELECTOR_VALUE = "page-list"  # Default attribute value to locate the parent element


def find_a_tags_with_depth(parent_tag, depth=0):
    """
    Recursively find all <a> tags within a parent tag and track their depth level.
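
    Example (doctest; a minimal, made-up HTML snippet, not the real page):
        >>> soup = BeautifulSoup('<a href="/a">A</a><div><a href="/b">B</a></div>', "html.parser")
        >>> [(tag.get_text(), depth) for tag, depth in find_a_tags_with_depth(soup)]
        [('A', 0), ('B', 1)]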
    """
    results = []
    for child in parent_tag.find_all(recursive=False):  # Iterate over direct children
        if child.name == "a":
            results.append((child, depth))  # Store <a> tag with its depth
        results.extend(find_a_tags_with_depth(child, depth + 1))  # Recurse deeper
    return results


def list_to_nested_json(data):
    """
    Convert a list of tuples (element, depth) into a nested JSON-like structure,
    where items with the smallest depth are at the top level and items of the same
    depth become siblings.
    Parameters:
        data (list of tuple): Each tuple is (element, depth)
    Returns:
        list: A list of nested dictionaries representing the JSON structure.
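
    Example (doctest; the same hypothetical snippet as in find_a_tags_with_depth):
        >>> soup = BeautifulSoup('<a href="/a">A</a><div><a href="/b">B</a></div>', "html.parser")
        >>> list_to_nested_json(find_a_tags_with_depth(soup))
        [{'element': 'A', 'href': '/a', 'children': [{'element': 'B', 'href': '/b', 'children': []}]}]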
    """
    result = []
    stack = []
    for element, depth in data:
        node = {
            "element": element.get_text(strip=True),
            "href": element.get("href"),
            "children": [],
        }
        # Pop entries at the same depth or deeper: the remaining top of the
        # stack (if any) is this node's nearest ancestor.
        while stack and stack[-1][1] >= depth:
            stack.pop()
        if stack:
            parent_node, _ = stack[-1]
            parent_node["children"].append(node)
        else:
            result.append(node)
        stack.append((node, depth))
    return result


def fetch_and_parse(url):
    """
    Fetch the webpage content from the given URL and parse it with BeautifulSoup.
    """
    response = requests.get(url, timeout=30)  # Avoid hanging on a stalled connection
    if response.status_code == 200:
        return BeautifulSoup(response.text, HTML_PARSER)
    else:
        raise Exception(
            f"Failed to fetch the webpage. Status code: {response.status_code}"
        )


def scrape_and_save(
    output_file,
    url=DEFAULT_URL,
    selector=DEFAULT_SELECTOR,
    selector_value=DEFAULT_SELECTOR_VALUE,
):
    """
    Scrape the webpage, convert <a> tags into a nested JSON structure, and save it to a file.
    Parameters:
        output_file (str): Path to the output JSON file.
        url (str): URL of the docs page to scrape.
        selector (str): HTML attribute name to locate the parent element.
        selector_value (str): Value for the attribute selector.
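
    Example (not a doctest, since it fetches the live page; the output
    filename is illustrative):
        scrape_and_save("coda_tree.json", url="https://docs.videodb.io")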
    """
    soup = fetch_and_parse(url)
    parent_tag = soup.find(attrs={selector: selector_value})
    if not parent_tag:
        raise Exception(f"Element with {selector}='{selector_value}' not found.")
    a_tags_with_levels = find_a_tags_with_depth(parent_tag)
    nested_json = list_to_nested_json(a_tags_with_levels)
    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII page titles readable in the UTF-8 file
        json.dump(nested_json, f, indent=4, ensure_ascii=False)


def main():
    parser = argparse.ArgumentParser(
        description="Scrape a webpage, convert <a> tags into a nested JSON structure, and save it to a file."
    )
    parser.add_argument("output", help="Path to the output JSON file")
    parser.add_argument(
        "--url", default=DEFAULT_URL, help="URL of the docs page (default: %(default)s)"
    )
    parser.add_argument(
        "--selector",
        default=DEFAULT_SELECTOR,
        help="Attribute selector to locate the parent element (default: %(default)s)",
    )
    parser.add_argument(
        "--selector-value",
        default=DEFAULT_SELECTOR_VALUE,
        help="Value for the attribute selector (default: %(default)s)",
    )
    args = parser.parse_args()
    try:
        scrape_and_save(
            args.output,
            url=args.url,
            selector=args.selector,
            selector_value=args.selector_value,
        )
    except Exception as e:
        sys.exit(str(e))


if __name__ == "__main__":
    main()
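
# The doctest examples in the docstrings above can be checked (assuming
# requests and bs4 are installed) with:
#   python -m doctest crawl_coda_tree.py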