"""
Scrape Discover Lambda
Processes URLs from the discovery queue, extracts links, and adds
new discovered URLs back to the queue (recursive discovery).
Input event (SQS triggered):
{
"Records": [{
"body": "{\"job_id\": \"uuid\", \"url\": \"https://...\", \"depth\": 0}"
}]
}
Output:
{
"processed": 1,
"discovered": 5,
"skipped": 2
}
"""

import json
import logging
import os
import time
from datetime import UTC, datetime

import boto3
from botocore.exceptions import ClientError

from ragstack_common.appsync import publish_scrape_update
from ragstack_common.scraper import ScrapePage, ScrapeStatus, UrlStatus
from ragstack_common.scraper.discovery import (
    extract_links,
    filter_discovered_urls,
    normalize_url,
)
from ragstack_common.scraper.fetcher import HttpFetcher
from ragstack_common.scraper.models import ScrapeConfig
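
# Lambda's Python runtime attaches a default handler to the root logger,
# so configuring the level is sufficient here.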
logger = logging.getLogger()
logger.setLevel(os.environ.get("LOG_LEVEL", "INFO"))


def lambda_handler(event, context):
    """
    Main Lambda handler - processes discovery queue messages.
    """
    # Get environment variables
    jobs_table = os.environ.get("SCRAPE_JOBS_TABLE")
    urls_table = os.environ.get("SCRAPE_URLS_TABLE")
    discovery_queue_url = os.environ.get("SCRAPE_DISCOVERY_QUEUE_URL")
    processing_queue_url = os.environ.get("SCRAPE_PROCESSING_QUEUE_URL")
    request_delay_ms = int(os.environ.get("REQUEST_DELAY_MS", "500"))

    if not jobs_table:
        raise ValueError("SCRAPE_JOBS_TABLE environment variable required")
    if not urls_table:
        raise ValueError("SCRAPE_URLS_TABLE environment variable required")
    if not discovery_queue_url:
        raise ValueError("SCRAPE_DISCOVERY_QUEUE_URL environment variable required")
    if not processing_queue_url:
        raise ValueError("SCRAPE_PROCESSING_QUEUE_URL environment variable required")

    dynamodb = boto3.resource("dynamodb")
    jobs_tbl = dynamodb.Table(jobs_table)
    urls_tbl = dynamodb.Table(urls_table)
    sqs = boto3.client("sqs")

    processed = 0
    discovered = 0
    skipped = 0
    batch_item_failures = []
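    # Per-record failures are reported back to SQS so only the failed
    # messages are retried; this assumes the event source mapping enables
    # ReportBatchItemFailures (partial batch responses).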

    # Process SQS records
    for record in event.get("Records", []):
        try:
            message = json.loads(record["body"])
            job_id = message["job_id"]
            url = message["url"]
            depth = message.get("depth", 0)

            logger.info(f"Processing discovery: job={job_id}, url={url}, depth={depth}")

            # Check if job is still active
            job_response = jobs_tbl.get_item(Key={"job_id": job_id})
            job_item = job_response.get("Item")
            if not job_item:
                logger.warning(f"Job not found: {job_id}")
                continue

            job_status = job_item.get("status")
            if job_status in [
                ScrapeStatus.CANCELLED.value,
                ScrapeStatus.FAILED.value,
                ScrapeStatus.COMPLETED.value,
            ]:
                logger.info(f"Job {job_id} is {job_status}, skipping")
                skipped += 1
                continue

            # Normalize URL for deduplication
            normalized_url = normalize_url(url)

            # Skip URLs this job has already visited
            existing = urls_tbl.get_item(Key={"job_id": job_id, "url": normalized_url})
            if existing.get("Item"):
                logger.info(f"URL already visited: {normalized_url}")
                skipped += 1
                continue

            # Create page record (status PENDING until it is processed)
            page = ScrapePage(
                job_id=job_id,
                url=normalized_url,
                status=UrlStatus.PENDING,
                depth=depth,
            )
            page_data = page.to_dict()
            urls_tbl.put_item(Item=page_data)
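            # NOTE: the get_item/put_item pair above is check-then-act, so two
            # concurrent consumers could both pass the check. A conditional
            # write would make the dedup atomic, e.g. (a sketch, not the
            # current behavior):
            #     urls_tbl.put_item(
            #         Item=page_data,
            #         ConditionExpression="attribute_not_exists(job_id)",
            #     )
            # catching ClientError with code ConditionalCheckFailedException
            # and treating it as "already visited".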

            # Get job config
            config_data = job_item.get("config", {})
            config = ScrapeConfig.from_dict(config_data)
            base_url = job_item.get("base_url", url)

            # Fetch the page to extract links
            fetcher = HttpFetcher(
                delay_ms=request_delay_ms,
                cookies=config.cookies,
                headers=config.headers,
            )
            result = fetcher.fetch(normalized_url)
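            # The fetch result is expected to expose error, content, and
            # is_html (an assumption about HttpFetcher's contract in
            # ragstack_common.scraper.fetcher).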

            if result.error:
                logger.warning(f"Fetch failed during discovery: {normalized_url} - {result.error}")
                # Mark the URL as failed and skip it; the record is not
                # re-queued, so this URL will not be processed further
                urls_tbl.update_item(
                    Key={"job_id": job_id, "url": normalized_url},
                    UpdateExpression="SET #status = :status, #error = :err",
                    ExpressionAttributeNames={"#status": "status", "#error": "error"},
                    ExpressionAttributeValues={
                        ":status": UrlStatus.FAILED.value,
                        ":err": result.error,
                    },
                )
                # Update job failed count
                jobs_tbl.update_item(
                    Key={"job_id": job_id},
                    UpdateExpression="SET failed_count = failed_count + :inc, updated_at = :ts",
                    ExpressionAttributeValues={
                        ":inc": 1,
                        ":ts": datetime.now(UTC).isoformat(),
                    },
                )
                continue

            # Send URL to processing queue
            sqs.send_message(
                QueueUrl=processing_queue_url,
                MessageBody=json.dumps(
                    {
                        "job_id": job_id,
                        "url": normalized_url,
                        "depth": depth,
                    }
                ),
            )

            # Update job total URLs count
            jobs_tbl.update_item(
                Key={"job_id": job_id},
                UpdateExpression="SET total_urls = total_urls + :inc, updated_at = :ts",
                ExpressionAttributeValues={
                    ":inc": 1,
                    ":ts": datetime.now(UTC).isoformat(),
                },
            )
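            # The arithmetic in the update expression is atomic per item, but
            # it raises a ValidationException if total_urls is missing; this
            # assumes the counter was initialized when the job was created.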

            # Publish discovery progress update to subscribers
            graphql_endpoint = os.environ.get("GRAPHQL_ENDPOINT")
            current_total = int(job_item.get("total_urls", 0)) + 1
            publish_scrape_update(
                graphql_endpoint=graphql_endpoint,
                job_id=job_id,
                base_url=job_item.get("base_url", url),
                title=job_item.get("title") or job_item.get("base_url", url),
                status=job_item.get("status", ScrapeStatus.DISCOVERING.value),
                total_urls=current_total,
                processed_count=int(job_item.get("processed_count", 0)),
                failed_count=int(job_item.get("failed_count", 0)),
            )
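            # current_total is derived from the job_item read at the top of
            # the loop, so the published count can lag behind concurrent
            # invocations; it is a progress hint, not an exact figure.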

            # Extract and filter links if within depth limit
            max_depth = config.max_depth
            max_pages = config.max_pages
            if depth < max_depth and result.is_html:
                links = extract_links(result.content, normalized_url)

                # Track URLs we've already seen (including current URL).
                # DynamoDB provides cross-invocation dedup; this handles
                # within-batch duplicates.
                visited = {normalized_url}
                filtered = filter_discovered_urls(
                    urls=links,
                    base_url=base_url,
                    config=config,
                    visited=visited,
                )

                logger.info(f"Discovered {len(filtered)} new URLs from {normalized_url}")

                # Check max pages limit
                job_refresh = jobs_tbl.get_item(Key={"job_id": job_id})
                total_discovered = int(job_refresh.get("Item", {}).get("total_urls", 0))
                remaining = max_pages - total_discovered
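                # remaining is a soft cap: concurrent discovery workers can
                # each read the same total_urls before either enqueues, so the
                # job may slightly overshoot max_pages.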

                # Queue new URLs for discovery
                urls_to_queue = filtered[:remaining] if remaining > 0 else []
                for link in urls_to_queue:
                    sqs.send_message(
                        QueueUrl=discovery_queue_url,
                        MessageBody=json.dumps(
                            {
                                "job_id": job_id,
                                "url": link,
                                "depth": depth + 1,
                            }
                        ),
                    )
                    discovered += 1

                if remaining <= 0:
                    logger.info(f"Max pages limit ({max_pages}) reached for job {job_id}")

            processed += 1

            # Respect the rate limit between pages; this pause is in addition
            # to any delay HttpFetcher applies internally via delay_ms
            time.sleep(request_delay_ms / 1000.0)

        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code", "")
            logger.error(f"AWS error processing record: {error_code} - {e}")
            batch_item_failures.append({"itemIdentifier": record["messageId"]})
        except Exception as e:
            logger.error(f"Error processing record: {e}", exc_info=True)
            batch_item_failures.append({"itemIdentifier": record["messageId"]})

    return {
        "processed": processed,
        "discovered": discovered,
        "skipped": skipped,
        "batchItemFailures": batch_item_failures,
    }