# AWS SAM template: the Transform below is required for the AWS::Serverless::* resource types used in this file.
AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31
# The stack name is interpolated into S3 bucket names (e.g. '${AWS::StackName}-data-${AWS::AccountId}'),
# and S3 bucket names must be lowercase - hence the warning in the description.
Description: RAGStack - Serverless document processing with AI chat. IMPORTANT - Stack name must be lowercase (e.g., my-docs).
Metadata:
  # Console presentation only: controls how parameters are grouped and labelled
  # in the CloudFormation "create stack" form. It does not affect any resource.
  AWS::CloudFormation::Interface:
    ParameterGroups:
      - Label:
          default: Required Settings
        Parameters:
          - AdminEmail
      - Label:
          default: Build Options
        Parameters:
          - BuildDashboard
          - BuildWebComponent
      - Label:
          default: Advanced Settings (Optional)
        Parameters:
          - OcrBackend
          - BedrockOcrModelId
          - CaptionModelId
          - UISourceBucket
          - UISourceKey
          - WebComponentSourceKey
          # DemoMode is declared under Parameters; listing it here keeps it out
          # of the console's unlabelled trailing "Other parameters" section.
          - DemoMode
    ParameterLabels:
      AdminEmail:
        default: Admin Email
      BuildDashboard:
        default: Build Dashboard UI
      BuildWebComponent:
        default: Build Chat Widget
      OcrBackend:
        default: OCR Backend
      BedrockOcrModelId:
        default: Bedrock OCR Model
      CaptionModelId:
        default: Image Caption Model
      UISourceBucket:
        default: UI Source Bucket
      UISourceKey:
        default: UI Source Key
      WebComponentSourceKey:
        default: Web Component Source Key
      DemoMode:
        default: Demo Mode
Parameters:
  # --- OCR / model selection -------------------------------------------------
  OcrBackend:
    Type: String
    Default: textract
    AllowedValues:
      - textract
      - bedrock
    Description: OCR backend to use (textract or bedrock)
  BedrockOcrModelId:
    Type: String
    Default: meta.llama3-2-90b-instruct-v1:0
    Description: Bedrock model ID for OCR (if backend=bedrock)
    AllowedValues:
      - meta.llama3-2-90b-instruct-v1:0
      - meta.llama3-2-11b-instruct-v1:0
      - us.anthropic.claude-sonnet-4-20250514-v1:0
      - us.anthropic.claude-haiku-4-5-20251001-v1:0
    ConstraintDescription: Must be a valid Bedrock vision-capable model ID
  # Unlike BedrockOcrModelId, this parameter has no AllowedValues list, so any
  # string is accepted at deploy time.
  CaptionModelId:
    Type: String
    Default: us.anthropic.claude-haiku-4-5-20251001-v1:0
    Description: Bedrock model ID for image caption generation

  # --- Location of the pre-packaged UI sources read by the CodeBuild projects -
  UISourceBucket:
    Type: String
    Description: S3 bucket containing UI source code zip
    Default: 'ragstack-quicklaunch-public-631094035453'
  UISourceKey:
    Type: String
    Description: S3 key for UI source code zip
    Default: 'source/ui.zip'
  WebComponentSourceKey:
    Type: String
    Description: S3 key for web component source code zip
    Default: 'source/ragstack-chat.zip'

  # --- Required ----------------------------------------------------------------
  AdminEmail:
    Type: String
    Description: Admin email for Cognito user and CloudWatch/budget alerts
    # The final domain label only has a lower bound on its length. The earlier
    # {2,6} cap rejected valid addresses under longer TLDs (for example
    # ".technology" or ".engineering").
    AllowedPattern: '^[\w.+-]+@([\w-]+\.)+[\w-]{2,}$'
    ConstraintDescription: Must be a valid email address

  # --- Feature toggles ---------------------------------------------------------
  # These are strings, not booleans: CloudFormation has no boolean parameter
  # type, and the Conditions section compares them against the string 'true'.
  BuildDashboard:
    Type: String
    Default: 'true'
    AllowedValues:
      - 'true'
      - 'false'
    Description: Build and deploy the React admin dashboard UI
  BuildWebComponent:
    Type: String
    Default: 'true'
    AllowedValues:
      - 'true'
      - 'false'
    Description: Build and deploy the embeddable chat web component
  DemoMode:
    Type: String
    Default: 'false'
    AllowedValues:
      - 'true'
      - 'false'
    Description: Enable demo mode with rate limits (5 uploads/day, 30 chats/day) and disabled features (reindex, reprocess, delete)
Globals:
  # Baseline settings merged into every AWS::Serverless::Function in this
  # template. Functions that need more time or memory set their own values.
  Function:
    Runtime: python3.13
    MemorySize: 256  # MB
    Timeout: 30  # seconds
    Environment:
      Variables:
        LOG_LEVEL: INFO
Conditions:
  # True when the BuildDashboard parameter is the string 'true'.
  BuildUI:
    Fn::Equals:
      - !Ref BuildDashboard
      - 'true'
  # True when the BuildWebComponent parameter is the string 'true'.
  BuildWC:
    Fn::Equals:
      - !Ref BuildWebComponent
      - 'true'
  # True when at least one of the two front ends is being built.
  BuildAnyUI:
    Fn::Or:
      - Condition: BuildUI
      - Condition: BuildWC
Resources:
# =========================================================================
# S3 Buckets
# =========================================================================
# Unified data bucket with prefix-based organization:
# input/ - User uploads (EventBridge triggers processing)
# content/ - All KB content (documents, images, scraped pages)
# working/ - Temporary files (7-day TTL)
DataBucket:
  # Main data bucket. Encrypted with SSE-S3, fully blocked from public access,
  # versioned, and configured to publish its object events to EventBridge.
  Type: AWS::S3::Bucket
  Properties:
    BucketName: !Sub '${AWS::StackName}-data-${AWS::AccountId}'
    BucketEncryption:
      ServerSideEncryptionConfiguration:
        - ServerSideEncryptionByDefault:
            SSEAlgorithm: AES256
    PublicAccessBlockConfiguration:
      BlockPublicAcls: true
      BlockPublicPolicy: true
      IgnorePublicAcls: true
      RestrictPublicBuckets: true
    VersioningConfiguration:
      Status: Enabled
    LifecycleConfiguration:
      Rules:
        - Id: CleanupIncompleteUploads
          Status: Enabled
          AbortIncompleteMultipartUpload:
            DaysAfterInitiation: 7
        - Id: DeleteWorkingFiles
          Status: Enabled
          Prefix: working/
          ExpirationInDays: 7
          # Versioning is enabled on this bucket, so ExpirationInDays alone only
          # places a delete marker on the current version. Expiring noncurrent
          # versions as well is what actually frees the temporary data.
          NoncurrentVersionExpirationInDays: 7
    NotificationConfiguration:
      EventBridgeConfiguration:
        EventBridgeEnabled: true
    CorsConfiguration:
      CorsRules:
        # Only the dashboard's CloudFront origin may call the bucket from a browser.
        - AllowedOrigins:
            - !Sub 'https://${CloudFrontDistribution.DomainName}'
          AllowedMethods:
            - PUT
            - POST
            - GET
            - HEAD
          AllowedHeaders:
            - '*'
          ExposedHeaders:
            - ETag
            - x-amz-meta-auto-process
            - x-amz-meta-user-caption
          MaxAge: 3000  # seconds a browser may cache the preflight response
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: CostCenter
        Value: Engineering
VectorBucket:
  # Private, versioned, SSE-S3 encrypted bucket with EventBridge notifications on.
  Type: AWS::S3::Bucket
  Properties:
    BucketName: !Sub "${AWS::StackName}-vectors-${AWS::AccountId}"
    VersioningConfiguration:
      Status: Enabled
    BucketEncryption:
      ServerSideEncryptionConfiguration:
        - ServerSideEncryptionByDefault:
            SSEAlgorithm: AES256
    PublicAccessBlockConfiguration:
      BlockPublicAcls: true
      IgnorePublicAcls: true
      BlockPublicPolicy: true
      RestrictPublicBuckets: true
    NotificationConfiguration:
      EventBridgeConfiguration:
        EventBridgeEnabled: true
    LifecycleConfiguration:
      Rules:
        # Despite its Id, this rule only aborts multipart uploads left
        # incomplete for 7 days; it does not expire any stored objects.
        - Id: CleanupOldVectors
          AbortIncompleteMultipartUpload:
            DaysAfterInitiation: 7
          Status: Enabled
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: CostCenter
        Value: Engineering
# =========================================================================
# CodeBuild Resources for UI Deployment
# =========================================================================
UICodeBuildServiceRole:
  # Service role assumed by CodeBuild for the dashboard build. Only created when
  # the BuildUI condition holds.
  Type: AWS::IAM::Role
  Condition: BuildUI
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service: !Sub 'codebuild.${AWS::URLSuffix}'
          Action: sts:AssumeRole
    Policies:
      - PolicyName: CodeBuildUIPolicy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            # Read the packaged UI source from the source bucket.
            - Effect: Allow
              Action:
                - s3:GetObject
                - s3:GetObjectVersion
              Resource:
                - !Sub 'arn:${AWS::Partition}:s3:::${UISourceBucket}/*'
            - Effect: Allow
              Action:
                - s3:ListBucket
              Resource:
                - !Sub 'arn:${AWS::Partition}:s3:::${UISourceBucket}'
            # Publish the built site. DeleteObject is included because the
            # project's buildspec runs `aws s3 sync ... --delete`.
            - Effect: Allow
              Action:
                - s3:ListBucket
                - s3:PutObject
                - s3:DeleteObject
              Resource:
                - !GetAtt UIBucket.Arn
                - !Sub '${UIBucket.Arn}/*'
            # Invalidate the dashboard distribution after a deploy.
            - Effect: Allow
              Action:
                - cloudfront:CreateInvalidation
              Resource: !Sub 'arn:${AWS::Partition}:cloudfront::${AWS::AccountId}:distribution/${CloudFrontDistribution}'
            # Read this stack's own outputs.
            - Effect: Allow
              Action:
                - cloudformation:DescribeStacks
              Resource: !Ref AWS::StackId
            # NOTE(review): the inline buildspec of UICodeBuildProject does not
            # call EventBridge; confirm these actions are still required.
            - Effect: Allow
              Action:
                - events:PutRule
                - events:PutTargets
                - events:RemoveTargets
                - events:DeleteRule
              Resource: !Sub 'arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/*'
            # CloudWatch Logs, limited to this stack's CodeBuild log groups
            # (/aws/codebuild/<stack>-*), matching WebComponentBuildRole. This
            # was previously Resource '*'.
            - Effect: Allow
              Action:
                - logs:CreateLogGroup
                - logs:CreateLogStream
                - logs:PutLogEvents
              Resource:
                - !Sub 'arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/codebuild/${AWS::StackName}-*'
                - !Sub 'arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/codebuild/${AWS::StackName}-*:*'
# CodeBuild project that builds the dashboard from a source zip in S3, syncs the
# build output to the UI bucket, and invalidates the CloudFront cache.
# Only created when the BuildUI condition holds.
UICodeBuildProject:
Type: AWS::CodeBuild::Project
Condition: BuildUI
# The !GetAtt on ServiceRole below already implies this dependency; the explicit DependsOn is redundant but harmless.
DependsOn: UICodeBuildServiceRole
Properties:
Name: !Sub '${AWS::StackName}-webui-build'
Description: !Sub 'Web UI build for ${AWS::StackName}'
ServiceRole: !GetAtt UICodeBuildServiceRole.Arn
EncryptionKey: alias/aws/s3
# Nothing is handed back to CodeBuild as an artifact; the buildspec deploys with the AWS CLI instead.
Artifacts:
Type: NO_ARTIFACTS
Source:
Type: S3
Location: !Sub '${UISourceBucket}/${UISourceKey}'
# Inline buildspec: install deps in ui/, run the build, `s3 sync dist/ --delete` to UI_BUCKET, then invalidate "/*".
BuildSpec: |
version: 0.2
phases:
install:
runtime-versions:
nodejs: 24
pre_build:
commands:
- echo "Installing dependencies..."
- cd ui
- npm install
build:
commands:
- echo "Building React application..."
- npm run build
post_build:
commands:
- echo "Deploying to S3..."
- aws s3 sync dist/ s3://${UI_BUCKET}/ --delete
- echo "Invalidating CloudFront cache..."
- aws cloudfront create-invalidation --distribution-id ${CLOUDFRONT_DIST_ID} --paths "/*"
- echo "============================================================"
- echo "DEPLOYMENT COMPLETE - Dashboard UI https://${CLOUDFRONT_DOMAIN}"
- echo "============================================================"
Environment:
# UI_BUCKET, CLOUDFRONT_DIST_ID and CLOUDFRONT_DOMAIN are used by the post_build commands above.
# The VITE_* values are presumably read by the front-end build (`npm run build`) - confirm against the UI source.
Type: LINUX_CONTAINER
ComputeType: BUILD_GENERAL1_SMALL
Image: aws/codebuild/standard:7.0
EnvironmentVariables:
- Name: VITE_AWS_REGION
Value: !Ref AWS::Region
- Name: VITE_USER_POOL_ID
Value: !Ref UserPool
- Name: VITE_USER_POOL_CLIENT_ID
Value: !Ref UserPoolClient
- Name: VITE_IDENTITY_POOL_ID
Value: !Ref IdentityPool
- Name: VITE_GRAPHQL_URL
Value: !GetAtt GraphQLApi.GraphQLUrl
- Name: VITE_DATA_BUCKET
Value: !Ref DataBucket
- Name: UI_BUCKET
Value: !Ref UIBucket
- Name: CLOUDFRONT_DIST_ID
Value: !Ref CloudFrontDistribution
- Name: CLOUDFRONT_DOMAIN
Value: !GetAtt CloudFrontDistribution.DomainName
TimeoutInMinutes: 30
# =========================================================================
# UI S3 Bucket
# =========================================================================
UIBucket:
  # Origin bucket for the dashboard's static files, served through
  # CloudFrontDistribution via an Origin Access Identity.
  Type: AWS::S3::Bucket
  Properties:
    BucketName: !Sub '${AWS::StackName}-ui-${AWS::AccountId}'
    BucketEncryption:
      ServerSideEncryptionConfiguration:
        - ServerSideEncryptionByDefault:
            SSEAlgorithm: AES256
    # All four Block Public Access settings are on. UIBucketPolicy grants
    # s3:GetObject only to the OAI's canonical user, which S3 does not treat as
    # a public policy, so CloudFront access is unaffected. Leaving
    # BlockPublicPolicy/RestrictPublicBuckets off only allowed a genuinely
    # public policy to be attached later.
    PublicAccessBlockConfiguration:
      BlockPublicAcls: true
      BlockPublicPolicy: true
      IgnorePublicAcls: true
      RestrictPublicBuckets: true
    # Kept for backward compatibility. CloudFront reads this bucket through its
    # regional REST endpoint (UIBucket.RegionalDomainName), not the website
    # endpoint; SPA fallback is handled by the distribution's CustomErrorResponses.
    WebsiteConfiguration:
      IndexDocument: index.html
      ErrorDocument: index.html
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: CostCenter
        Value: Engineering
# =========================================================================
# CloudFront Distribution for UI
# =========================================================================
CloudFrontOriginAccessIdentity:
  # Identity CloudFront presents to S3 when fetching dashboard files; its
  # canonical user ID is the principal in UIBucketPolicy.
  Type: AWS::CloudFront::CloudFrontOriginAccessIdentity
  Properties:
    CloudFrontOriginAccessIdentityConfig:
      Comment: !Sub "OAI for ${AWS::StackName} UI"
UIBucketPolicy:
  # Grants read-only object access on the UI bucket to the dashboard's OAI.
  Type: AWS::S3::BucketPolicy
  Properties:
    Bucket: !Ref UIBucket
    PolicyDocument:
      Statement:
        - Effect: Allow
          Action: s3:GetObject
          Resource: !Sub "${UIBucket.Arn}/*"
          Principal:
            CanonicalUser: !GetAtt CloudFrontOriginAccessIdentity.S3CanonicalUserId
CloudFrontDistribution:
  # HTTPS front door for the dashboard: one S3 origin reached through the OAI,
  # with 403/404 rewritten to index.html so client-side routes resolve.
  Type: AWS::CloudFront::Distribution
  Properties:
    DistributionConfig:
      Comment: !Sub "${AWS::StackName} UI Distribution"
      Enabled: true
      HttpVersion: http2
      # PriceClass_100: North America and Europe edge locations only.
      PriceClass: PriceClass_100
      DefaultRootObject: index.html
      Origins:
        - Id: S3Origin
          DomainName: !GetAtt UIBucket.RegionalDomainName
          S3OriginConfig:
            OriginAccessIdentity: !Sub "origin-access-identity/cloudfront/${CloudFrontOriginAccessIdentity}"
      DefaultCacheBehavior:
        TargetOriginId: S3Origin
        ViewerProtocolPolicy: redirect-to-https
        Compress: true
        AllowedMethods: [GET, HEAD, OPTIONS]
        CachedMethods: [GET, HEAD]
        # Nothing from the viewer request is forwarded to S3.
        ForwardedValues:
          QueryString: false
          Cookies:
            Forward: none
      # SPA routing: serve index.html with HTTP 200 whenever S3 answers 403 or
      # 404; that substitution is cached for 300 seconds.
      CustomErrorResponses:
        - ErrorCode: 403
          ResponsePagePath: /index.html
          ResponseCode: 200
          ErrorCachingMinTTL: 300
        - ErrorCode: 404
          ResponsePagePath: /index.html
          ResponseCode: 200
          ErrorCachingMinTTL: 300
      ViewerCertificate:
        CloudFrontDefaultCertificate: true
        # For a custom domain, switch to an ACM certificate:
        # AcmCertificateArn: !Ref CertificateArn
        # SslSupportMethod: sni-only
        # MinimumProtocolVersion: TLSv1.2_2021
# =========================================================================
# Web Component CDN Infrastructure
# =========================================================================
WebComponentAssetsBucket:
  # Origin bucket for the embeddable chat widget, served through
  # WebComponentDistribution via an Origin Access Identity.
  Type: AWS::S3::Bucket
  Properties:
    BucketName: !Sub '${AWS::StackName}-wc-assets-${AWS::AccountId}'
    # Declared explicitly for consistency with the other buckets in this
    # template. SSE-S3 (AES256) is also what S3 applies by default.
    BucketEncryption:
      ServerSideEncryptionConfiguration:
        - ServerSideEncryptionByDefault:
            SSEAlgorithm: AES256
    # All four Block Public Access settings are on. WebComponentBucketPolicy
    # grants s3:GetObject only to the OAI's canonical user, which S3 does not
    # treat as a public policy, so the CDN keeps working.
    PublicAccessBlockConfiguration:
      BlockPublicAcls: true
      BlockPublicPolicy: true
      IgnorePublicAcls: true
      RestrictPublicBuckets: true
    VersioningConfiguration:
      Status: Enabled
    LifecycleConfiguration:
      Rules:
        # Superseded builds are kept for 30 days, then removed.
        - Id: DeleteOldVersions
          Status: Enabled
          NoncurrentVersionExpirationInDays: 30
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: CostCenter
        Value: Engineering
WebComponentOriginAccessIdentity:
  # Identity the web component CDN presents to S3; its canonical user ID is
  # the principal in WebComponentBucketPolicy.
  Type: AWS::CloudFront::CloudFrontOriginAccessIdentity
  Properties:
    CloudFrontOriginAccessIdentityConfig:
      Comment: !Sub "OAI for ${AWS::StackName} web component CDN"
WebComponentBucketPolicy:
  # Grants read-only object access on the assets bucket to the web component OAI.
  Type: AWS::S3::BucketPolicy
  Properties:
    Bucket: !Ref WebComponentAssetsBucket
    PolicyDocument:
      Statement:
        - Effect: Allow
          Action: s3:GetObject
          Resource: !Sub "${WebComponentAssetsBucket.Arn}/*"
          Principal:
            CanonicalUser: !GetAtt WebComponentOriginAccessIdentity.S3CanonicalUserId
WebComponentCORSPolicy:
  # Response headers policy attached to the web component CDN: any origin may
  # issue GET/HEAD/OPTIONS without credentials. OriginOverride makes CloudFront's
  # CORS headers replace any sent by the origin.
  Type: AWS::CloudFront::ResponseHeadersPolicy
  Properties:
    ResponseHeadersPolicyConfig:
      Name: !Sub "${AWS::StackName}-wc-cors"
      Comment: CORS policy for web component CDN
      CorsConfig:
        OriginOverride: true
        AccessControlAllowCredentials: false
        AccessControlAllowOrigins:
          Items: ['*']
        AccessControlAllowHeaders:
          Items: ['*']
        AccessControlAllowMethods:
          Items: [GET, HEAD, OPTIONS]
WebComponentDistribution:
  # CDN for the chat widget bundle: S3 origin via OAI, the managed
  # CachingOptimized cache policy, and the permissive CORS headers policy.
  Type: AWS::CloudFront::Distribution
  Properties:
    DistributionConfig:
      Comment: !Sub "CDN for ${AWS::StackName} web component"
      Enabled: true
      HttpVersion: http2
      PriceClass: PriceClass_100
      DefaultRootObject: ragstack-chat.js
      Origins:
        - Id: WebComponentS3Origin
          DomainName: !GetAtt WebComponentAssetsBucket.RegionalDomainName
          S3OriginConfig:
            OriginAccessIdentity: !Sub "origin-access-identity/cloudfront/${WebComponentOriginAccessIdentity}"
      DefaultCacheBehavior:
        TargetOriginId: WebComponentS3Origin
        ViewerProtocolPolicy: redirect-to-https
        Compress: true
        AllowedMethods: [GET, HEAD, OPTIONS]
        CachedMethods: [GET, HEAD]
        # ID of the AWS managed "CachingOptimized" cache policy.
        CachePolicyId: 658327ea-f89d-4fab-a63d-7e88639e58f6
        ResponseHeadersPolicyId: !Ref WebComponentCORSPolicy
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: CostCenter
        Value: Engineering
WebComponentBuildRole:
  # Service role assumed by CodeBuild for the web component build.
  # NOTE(review): unlike UICodeBuildServiceRole, this role carries no Condition,
  # so it is created even when BuildWebComponent is 'false' - confirm intended.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: "2012-10-17"
      Statement:
        - Effect: Allow
          Action: sts:AssumeRole
          Principal:
            Service: codebuild.amazonaws.com
    Policies:
      - PolicyName: WebComponentBuildPolicy
        PolicyDocument:
          Version: "2012-10-17"
          Statement:
            # CloudWatch Logs, limited to /aws/codebuild/<stack>-* log groups
            # and the streams inside them.
            - Effect: Allow
              Action:
                - logs:CreateLogGroup
                - logs:CreateLogStream
                - logs:PutLogEvents
              Resource:
                - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/codebuild/${AWS::StackName}-*"
                - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/codebuild/${AWS::StackName}-*:*"
            # Upload and read build output in the assets bucket.
            - Effect: Allow
              Action:
                - s3:PutObject
                - s3:GetObject
                - s3:ListBucket
              Resource:
                - !GetAtt WebComponentAssetsBucket.Arn
                - !Sub "${WebComponentAssetsBucket.Arn}/*"
            # Invalidate the web component CDN after a deploy.
            - Effect: Allow
              Action:
                - cloudfront:CreateInvalidation
              Resource: !Sub "arn:${AWS::Partition}:cloudfront::${AWS::AccountId}:distribution/${WebComponentDistribution}"
            # Download the packaged source zip.
            - Effect: Allow
              Action:
                - s3:GetObject
                - s3:GetObjectVersion
              Resource:
                - !Sub "arn:${AWS::Partition}:s3:::${UISourceBucket}/*"
            - Effect: Allow
              Action:
                - s3:ListBucket
              Resource:
                - !Sub "arn:${AWS::Partition}:s3:::${UISourceBucket}"
            # Single-item reads from the configuration table (theme settings).
            - Effect: Allow
              Action:
                - dynamodb:GetItem
              Resource:
                - !GetAtt ConfigurationTable.Arn
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
# CodeBuild project that builds the chat web component from a source zip in S3,
# uploads the bundles plus a generated config.json to the assets bucket, and
# invalidates those three paths on the CDN.
# NOTE(review): unlike UICodeBuildProject this resource has no Condition, so it exists even when BuildWebComponent is 'false' - confirm intended.
WebComponentBuildProject:
Type: AWS::CodeBuild::Project
# The !GetAtt on ServiceRole below already implies this dependency; the explicit DependsOn is redundant but harmless.
DependsOn: WebComponentBuildRole
Properties:
Name: !Sub '${AWS::StackName}-wc-build'
Description: Build and deploy web component to CDN
ServiceRole: !GetAtt WebComponentBuildRole.Arn
EncryptionKey: alias/aws/s3
# Nothing is handed back to CodeBuild as an artifact; the buildspec deploys with the AWS CLI instead.
Artifacts:
Type: NO_ARTIFACTS
Source:
Type: S3
Location: !Sub '${UISourceBucket}/${WebComponentSourceKey}'
# Inline buildspec: `npm ci` in src/ragstack-chat, `npm run build:wc` with SAM_GRAPHQL_ENDPOINT set,
# upload wc.js / wc.esm.js (1-year cache) and config.json (5-minute cache), then invalidate those paths.
BuildSpec: |
version: 0.2
phases:
install:
runtime-versions:
nodejs: 24
commands:
- echo "Installing dependencies..."
- cd src/ragstack-chat
- npm ci
pre_build:
commands:
- echo "Setting up build environment..."
- echo "✓ SAM API endpoint for chat queries - $SAM_API_ENDPOINT"
build:
commands:
- echo "Building web component..."
- SAM_GRAPHQL_ENDPOINT="$SAM_API_ENDPOINT" npm run build:wc
- ls -lh dist/
post_build:
commands:
- echo "Deploying to S3..."
- aws s3 cp dist/wc.js s3://${ASSET_BUCKET}/ragstack-chat.js --content-type application/javascript --cache-control "public, max-age=31536000"
- aws s3 cp dist/wc.esm.js s3://${ASSET_BUCKET}/ragstack-chat.esm.js --content-type application/javascript --cache-control "public, max-age=31536000"
- echo "Generating config.json..."
- echo "{\"apiEndpoint\":\"${SAM_API_ENDPOINT}\",\"identityPoolId\":\"${IDENTITY_POOL_ID}\",\"region\":\"${AWS_REGION}\"}" > dist/config.json
- aws s3 cp dist/config.json s3://${ASSET_BUCKET}/config.json --content-type application/json --cache-control "public, max-age=300"
- echo "Invalidating CloudFront cache..."
- aws cloudfront create-invalidation --distribution-id ${DISTRIBUTION_ID} --paths "/ragstack-chat.js" "/ragstack-chat.esm.js" "/config.json"
- echo "Deployment complete!"
Environment:
# ASSET_BUCKET, DISTRIBUTION_ID, SAM_API_ENDPOINT, IDENTITY_POOL_ID and AWS_REGION are referenced by the buildspec commands above.
# ARTIFACT_BUCKET and CONFIG_TABLE are not referenced there; presumably the npm build script reads them - verify.
Type: LINUX_CONTAINER
ComputeType: BUILD_GENERAL1_SMALL
Image: aws/codebuild/standard:7.0
EnvironmentVariables:
- Name: ASSET_BUCKET
Value: !Ref WebComponentAssetsBucket
- Name: DISTRIBUTION_ID
Value: !Ref WebComponentDistribution
- Name: ARTIFACT_BUCKET
Value: !Ref UISourceBucket
- Name: CONFIG_TABLE
Value: !Ref ConfigurationTable
- Name: SAM_API_ENDPOINT
Value: !GetAtt GraphQLApi.GraphQLUrl
- Name: IDENTITY_POOL_ID
Value: !Ref IdentityPool
- Name: AWS_REGION
Value: !Ref AWS::Region
TimeoutInMinutes: 15
Tags:
- Key: Project
Value: !Ref AWS::StackName
# =========================================================================
# DynamoDB Tables
# =========================================================================
TrackingTable:
  # On-demand table keyed by document_id. Streams both old and new item images;
  # point-in-time recovery and server-side encryption are enabled.
  Type: AWS::DynamoDB::Table
  Properties:
    TableName: !Sub "${AWS::StackName}-tracking"
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: document_id
        KeyType: HASH
    AttributeDefinitions:
      - AttributeName: document_id
        AttributeType: S
    StreamSpecification:
      StreamViewType: NEW_AND_OLD_IMAGES
    SSESpecification:
      SSEEnabled: true
    PointInTimeRecoverySpecification:
      PointInTimeRecoveryEnabled: true
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: CostCenter
        Value: Engineering
MeteringTable:
  # On-demand table keyed by (document_id, timestamp). Items are expired by
  # DynamoDB TTL using the `ttl` attribute.
  Type: AWS::DynamoDB::Table
  Properties:
    TableName: !Sub "${AWS::StackName}-metering"
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: document_id
        KeyType: HASH
      - AttributeName: timestamp
        KeyType: RANGE
    AttributeDefinitions:
      - AttributeName: document_id
        AttributeType: S
      - AttributeName: timestamp
        AttributeType: S
    TimeToLiveSpecification:
      AttributeName: ttl
      Enabled: true
    SSESpecification:
      SSEEnabled: true
    PointInTimeRecoverySpecification:
      PointInTimeRecoveryEnabled: true
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: CostCenter
        Value: Engineering
##########################################################################
# Configuration Table - Runtime configurable parameters
##########################################################################
ConfigurationTable:
  # On-demand table holding runtime configuration, keyed by the string
  # attribute `Configuration`. Streams both old and new item images.
  Type: AWS::DynamoDB::Table
  Properties:
    TableName: !Sub "${AWS::StackName}-config"
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: Configuration
        KeyType: HASH
    AttributeDefinitions:
      - AttributeName: Configuration
        AttributeType: S
    StreamSpecification:
      StreamViewType: NEW_AND_OLD_IMAGES
    SSESpecification:
      SSEEnabled: true
    PointInTimeRecoverySpecification:
      PointInTimeRecoveryEnabled: true
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: Purpose
        Value: Runtime Configuration Storage
##########################################################################
# Conversation History Table - Stores multi-turn chat context
##########################################################################
ConversationHistoryTable:
  # On-demand table for chat turns: partition key conversationId (string),
  # sort key turnNumber (number). Items are expired via the `ttl` attribute.
  Type: AWS::DynamoDB::Table
  Properties:
    TableName: !Sub "${AWS::StackName}-conversations"
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: conversationId
        KeyType: HASH
      - AttributeName: turnNumber
        KeyType: RANGE
    AttributeDefinitions:
      - AttributeName: conversationId
        AttributeType: S
      - AttributeName: turnNumber
        AttributeType: N
    TimeToLiveSpecification:
      Enabled: true
      AttributeName: ttl
    SSESpecification:
      SSEEnabled: true
    PointInTimeRecoverySpecification:
      PointInTimeRecoveryEnabled: true
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: Purpose
        Value: Conversation History Storage
##########################################################################
# Metadata Key Library Table - Tracks discovered metadata fields
##########################################################################
MetadataKeyLibraryTable:
  # On-demand table with one item per metadata key, keyed by key_name.
  Type: AWS::DynamoDB::Table
  Properties:
    TableName: !Sub "${AWS::StackName}-metadata-keys"
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: key_name
        KeyType: HASH
    AttributeDefinitions:
      - AttributeName: key_name
        AttributeType: S
    SSESpecification:
      SSEEnabled: true
    PointInTimeRecoverySpecification:
      PointInTimeRecoveryEnabled: true
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: Purpose
        Value: Metadata Key Library
##########################################################################
# Scrape Jobs Table - Tracks web scraping job state
##########################################################################
ScrapeJobsTable:
  # On-demand table keyed by job_id. The BaseUrlIndex GSI allows lookups by
  # base_url, ordered by created_at, with all attributes projected.
  Type: AWS::DynamoDB::Table
  Properties:
    TableName: !Sub "${AWS::StackName}-scrape-jobs"
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: job_id
        KeyType: HASH
    # base_url and created_at are declared only because the GSI keys on them.
    AttributeDefinitions:
      - AttributeName: job_id
        AttributeType: S
      - AttributeName: base_url
        AttributeType: S
      - AttributeName: created_at
        AttributeType: S
    GlobalSecondaryIndexes:
      - IndexName: BaseUrlIndex
        Projection:
          ProjectionType: ALL
        KeySchema:
          - AttributeName: base_url
            KeyType: HASH
          - AttributeName: created_at
            KeyType: RANGE
    SSESpecification:
      SSEEnabled: true
    PointInTimeRecoverySpecification:
      PointInTimeRecoveryEnabled: true
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: Purpose
        Value: Web Scraping Job Tracking
##########################################################################
# Scrape URLs Table - Tracks individual URLs within scrape jobs
##########################################################################
ScrapeUrlsTable:
  # On-demand table keyed by (job_id, url). The UrlHashIndex GSI allows lookups
  # by url_hash, with all attributes projected.
  Type: AWS::DynamoDB::Table
  Properties:
    TableName: !Sub "${AWS::StackName}-scrape-urls"
    BillingMode: PAY_PER_REQUEST
    KeySchema:
      - AttributeName: job_id
        KeyType: HASH
      - AttributeName: url
        KeyType: RANGE
    # url_hash is declared only because the GSI keys on it.
    AttributeDefinitions:
      - AttributeName: job_id
        AttributeType: S
      - AttributeName: url
        AttributeType: S
      - AttributeName: url_hash
        AttributeType: S
    GlobalSecondaryIndexes:
      - IndexName: UrlHashIndex
        Projection:
          ProjectionType: ALL
        KeySchema:
          - AttributeName: url_hash
            KeyType: HASH
    SSESpecification:
      SSEEnabled: true
    PointInTimeRecoverySpecification:
      PointInTimeRecoveryEnabled: true
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
      - Key: Purpose
        Value: Web Scraping URL Tracking
# =========================================================================
# Lambda Layers
# =========================================================================
RagstackCommonLayer:
  # Lambda layer built from lib/ and attached to most functions in this template.
  Type: AWS::Serverless::LayerVersion
  # Resource-level metadata read by `sam build` to choose the layer's build method.
  Metadata:
    BuildMethod: python3.13
  Properties:
    LayerName: !Sub "${AWS::StackName}-Common"
    Description: Shared utilities for Lambda functions
    ContentUri: lib/
    CompatibleRuntimes:
      - python3.13
# =========================================================================
# Lambda Functions
# =========================================================================
ProcessDocumentFunction:
  # OCR / text-extraction worker. May call Textract, invoke Bedrock models and
  # call the AppSync GraphQL API; failed async invocations go to ProcessingDLQ.
  Type: AWS::Serverless::Function
  Properties:
    Description: Process document - OCR and text extraction
    FunctionName: !Sub "${AWS::StackName}-process"
    Handler: index.lambda_handler
    CodeUri: src/lambda/process_document/
    Runtime: python3.13
    MemorySize: 3008  # MB - OCR workload (Pillow, PDF parsing)
    Timeout: 900  # seconds - 15 minutes for large documents
    # Reserved concurrency is intentionally not set, which allows multi-stack
    # deployments.
    Layers:
      - !Ref RagstackCommonLayer
    DeadLetterQueue:
      Type: SQS
      TargetArn: !GetAtt ProcessingDLQ.Arn
    Environment:
      Variables:
        LOG_LEVEL: INFO
        DATA_BUCKET: !Ref DataBucket
        TRACKING_TABLE: !Ref TrackingTable
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
    Policies:
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBCrudPolicy:
          TableName: !Ref MeteringTable
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
      - Statement:
          - Effect: Allow
            Action:
              - textract:DetectDocumentText
              - textract:AnalyzeDocument
              - textract:StartDocumentTextDetection
              - textract:GetDocumentTextDetection
            Resource: '*'
          - Effect: Allow
            Action: bedrock:InvokeModel
            # Region is wildcarded because inference profiles can route a
            # request to any region.
            Resource:
              - !Sub "arn:${AWS::Partition}:bedrock:*::foundation-model/*"
              - !Sub "arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*"
          - Effect: Allow
            Action: appsync:GraphQL
            Resource: !Sub "arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*"
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
DetectFileTypeFunction:
  # Inspects an uploaded file so later steps can route it: read-only access to
  # the data bucket, read/write access to the tracking table.
  Type: AWS::Serverless::Function
  Properties:
    Description: Detect file type and get page info for OCR routing
    FunctionName: !Sub "${AWS::StackName}-detect-file-type"
    Handler: index.lambda_handler
    CodeUri: src/lambda/detect_file_type/
    Runtime: python3.13
    MemorySize: 512  # MB - PDF parsing with PyMuPDF
    Timeout: 120  # seconds - PDF download plus page counting
    Layers:
      - !Ref RagstackCommonLayer
    Environment:
      Variables:
        LOG_LEVEL: INFO
        TRACKING_TABLE: !Ref TrackingTable
    Policies:
      - S3ReadPolicy:
          BucketName: !Ref DataBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
ProcessTextFunction:
  # Handles text-based formats that need no OCR. Reads and writes the data
  # bucket and tracking table, and may call the AppSync GraphQL API.
  Type: AWS::Serverless::Function
  Properties:
    Description: Process text-based files (HTML, TXT, CSV, JSON, XML, EML, EPUB, DOCX, XLSX)
    FunctionName: !Sub "${AWS::StackName}-process-text"
    Handler: index.lambda_handler
    CodeUri: src/lambda/process_text/
    Runtime: python3.13
    MemorySize: 1024  # MB
    Timeout: 300  # seconds
    Layers:
      - !Ref RagstackCommonLayer
    Environment:
      Variables:
        LOG_LEVEL: INFO
        DATA_BUCKET: !Ref DataBucket
        TRACKING_TABLE: !Ref TrackingTable
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
    Policies:
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - Statement:
          - Effect: Allow
            Action: appsync:GraphQL
            Resource: !Sub "arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*"
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
ProcessMediaFunction:
  # Transcribes audio/video with Amazon Transcribe and is allowed to invoke
  # IngestMediaFunction. Runs at the 15-minute Lambda maximum.
  Type: AWS::Serverless::Function
  Properties:
    Description: Process video/audio files through AWS Transcribe
    FunctionName: !Sub "${AWS::StackName}-process-media"
    Handler: index.lambda_handler
    CodeUri: src/lambda/process_media/
    Runtime: python3.13
    MemorySize: 512  # MB
    Timeout: 900  # seconds - Lambda max, to cover transcription polling
    Layers:
      - !Ref RagstackCommonLayer
    Environment:
      Variables:
        LOG_LEVEL: INFO
        VECTOR_BUCKET: !Ref VectorBucket
        TRACKING_TABLE: !Ref TrackingTable
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
        INGEST_MEDIA_FUNCTION_ARN: !GetAtt IngestMediaFunction.Arn
    Policies:
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - S3CrudPolicy:
          BucketName: !Ref VectorBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
      # Call the AppSync GraphQL API.
      - Statement:
          - Effect: Allow
            Action: appsync:GraphQL
            Resource: !Sub "arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*"
      # Manage transcription jobs.
      - Statement:
          - Effect: Allow
            Action:
              - transcribe:StartTranscriptionJob
              - transcribe:GetTranscriptionJob
              - transcribe:DeleteTranscriptionJob
            Resource: '*'
      # Invoke the media ingestion function.
      - Statement:
          - Effect: Allow
            Action: lambda:InvokeFunction
            Resource: !GetAtt IngestMediaFunction.Arn
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
CombinePagesFunction:
  # Merges the partial text files produced by batch processing and is allowed
  # to invoke IngestToKBFunction.
  Type: AWS::Serverless::Function
  Properties:
    Description: Combine partial text files from batch processing
    FunctionName: !Sub "${AWS::StackName}-combinepages"
    Handler: index.lambda_handler
    CodeUri: src/lambda/combine_pages/
    Runtime: python3.13
    MemorySize: 1024  # MB - string concatenation of large documents
    Timeout: 300  # seconds - 5 minutes for large doc concatenation
    Layers:
      - !Ref RagstackCommonLayer
    Environment:
      Variables:
        LOG_LEVEL: INFO
        TRACKING_TABLE: !Ref TrackingTable
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
        INGEST_TO_KB_FUNCTION_ARN: !GetAtt IngestToKBFunction.Arn
    Policies:
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      # Call the AppSync GraphQL API.
      - Statement:
          - Effect: Allow
            Action: appsync:GraphQL
            Resource: !Sub "arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*"
      # Invoke the knowledge-base ingestion function.
      - Statement:
          - Effect: Allow
            Action: lambda:InvokeFunction
            Resource: !GetAtt IngestToKBFunction.Arn
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
QueueProcessorFunction:
  # Consumes DocumentProcessingQueue one message at a time and starts an
  # execution of ProcessingStateMachine for each message.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-queue-processor'
    CodeUri: src/lambda/queue_processor/
    Handler: index.lambda_handler
    Description: Process SQS messages to start Step Functions executions
    Runtime: python3.13
    Timeout: 30
    MemorySize: 128
    ReservedConcurrentExecutions: 3  # Limit concurrent large doc processing
    Environment:
      Variables:
        LOG_LEVEL: INFO
        STATE_MACHINE_ARN: !GetAtt ProcessingStateMachine.Arn
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
    Policies:
      - StepFunctionsExecutionPolicy:
          StateMachineName: !GetAtt ProcessingStateMachine.Name
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
    Events:
      SQSTrigger:
        Type: SQS
        Properties:
          Queue: !GetAtt DocumentProcessingQueue.Arn
          BatchSize: 1
          # Lets the handler report individual failed messages instead of
          # failing the whole batch.
          FunctionResponseTypes:
            - ReportBatchItemFailures
          # Cap the SQS poller at the function's reserved concurrency. Without
          # this the poller scales past 3 concurrent invokes, the extra invokes
          # are throttled, and the messages are redelivered (counting towards
          # any redrive limit on the queue).
          ScalingConfig:
            MaximumConcurrency: 3
EnqueueBatchesFunction:
  # Splits work into batches and sends them to BatchProcessingQueue. Also
  # reads/writes the tracking table and may call the AppSync GraphQL API.
  Type: AWS::Serverless::Function
  Properties:
    Description: Queue individual batches to SQS for rate-limited processing
    FunctionName: !Sub "${AWS::StackName}-enqueue-batches"
    Handler: index.lambda_handler
    CodeUri: src/lambda/enqueue_batches/
    Runtime: python3.13
    MemorySize: 256  # MB
    Timeout: 60  # seconds
    Layers:
      - !Ref RagstackCommonLayer
    Environment:
      Variables:
        LOG_LEVEL: INFO
        TRACKING_TABLE: !Ref TrackingTable
        # !Ref on an SQS queue resolves to the queue URL.
        BATCH_QUEUE_URL: !Ref BatchProcessingQueue
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - SQSSendMessagePolicy:
          QueueName: !GetAtt BatchProcessingQueue.QueueName
      - Statement:
          - Effect: Allow
            Action: appsync:GraphQL
            Resource: !Sub "arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*"
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
BatchProcessorFunction:
  # Consumes BatchProcessingQueue one message at a time and runs OCR on that
  # batch (Textract or Bedrock). Allowed to invoke CombinePagesFunction.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-batch-processor'
    CodeUri: src/lambda/batch_processor/
    Handler: index.lambda_handler
    Description: Process individual 10-page batches with global rate limiting
    Runtime: python3.13
    Timeout: 900  # 15 minutes per batch
    MemorySize: 3008  # Same as ProcessDocument for OCR workload
    ReservedConcurrentExecutions: 10  # GLOBAL LIMIT - controls Bedrock API rate
    Layers:
      - !Ref RagstackCommonLayer
    Environment:
      Variables:
        LOG_LEVEL: INFO
        TRACKING_TABLE: !Ref TrackingTable
        DATA_BUCKET: !Ref DataBucket
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        COMBINE_PAGES_FUNCTION_ARN: !GetAtt CombinePagesFunction.Arn
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
    Policies:
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
      - Statement:
          - Effect: Allow
            Action: lambda:InvokeFunction
            Resource: !GetAtt CombinePagesFunction.Arn
          - Effect: Allow
            Action:
              - bedrock:InvokeModel
            Resource:
              # Wildcard regions needed: inference profiles route to any region
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
          - Effect: Allow
            Action:
              - textract:DetectDocumentText
              - textract:AnalyzeDocument
            Resource: '*'
    Events:
      SQSTrigger:
        Type: SQS
        Properties:
          # NOTE(review): the queue is defined outside this view; its visibility
          # timeout must be at least this function's 900 s timeout - confirm.
          Queue: !GetAtt BatchProcessingQueue.Arn
          BatchSize: 1
          # Lets the handler report individual failed messages instead of
          # failing the whole batch.
          FunctionResponseTypes:
            - ReportBatchItemFailures
          # Cap the SQS poller at the function's reserved concurrency. Reserved
          # concurrency alone makes Lambda throttle the poller's extra invokes;
          # those messages return to the queue and count towards any redrive
          # limit, so valid batches can be dead-lettered under load.
          ScalingConfig:
            MaximumConcurrency: 10
IngestToKBFunction:
  # Pushes processed documents into the Bedrock Knowledge Base. Also updates the
  # tracking and metadata-key tables and may call AppSync and Bedrock models.
  Type: AWS::Serverless::Function
  Properties:
    Description: Ingest documents directly into Knowledge Base
    FunctionName: !Sub "${AWS::StackName}-ingest"
    Handler: index.lambda_handler
    CodeUri: src/lambda/ingest_to_kb/
    Runtime: python3.13
    MemorySize: 128  # MB - minimal, just API calls to Bedrock
    Timeout: 300  # seconds - 5 minutes for KB sync
    # Reserved concurrency is intentionally not set, which allows multi-stack
    # deployments.
    Layers:
      - !Ref RagstackCommonLayer
    Environment:
      Variables:
        LOG_LEVEL: INFO
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        # Unified content data source.
        DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
        TRACKING_TABLE: !Ref TrackingTable
        METADATA_KEY_LIBRARY_TABLE: !Ref MetadataKeyLibraryTable
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBCrudPolicy:
          TableName: !Ref MetadataKeyLibraryTable
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - Statement:
          # Knowledge-base level operations, for any knowledge base in this
          # account and region.
          - Effect: Allow
            Action:
              - bedrock:IngestKnowledgeBaseDocuments
              - bedrock:GetKnowledgeBaseDocuments
              - bedrock:StartIngestionJob
              - bedrock:GetKnowledgeBase
              - bedrock:GetDataSource
              - bedrock:ListDataSources
            Resource:
              - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*"
          # Ingestion-job operations against a data-source style ARN.
          # bedrock:StartIngestionJob is also granted by the statement above.
          - Effect: Allow
            Action:
              - bedrock:StartIngestionJob
              - bedrock:GetIngestionJob
            Resource:
              - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*/data-source/*"
          - Effect: Allow
            Action: appsync:GraphQL
            Resource: !Sub "arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*"
          # Model invocation in any region, for foundation models and this
          # account's inference profiles.
          - Effect: Allow
            Action: bedrock:InvokeModel
            Resource:
              - !Sub "arn:${AWS::Partition}:bedrock:*::foundation-model/*"
              - !Sub "arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*"
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
# IngestMedia Lambda - handles dual embedding ingestion for video/audio
IngestMediaFunction:
  # Media ingestion Lambda. Its policies cover the data and vector buckets,
  # the tracking / metadata-key / configuration tables, the sync-request
  # queue, Bedrock Knowledge Base ingestion APIs, and the AppSync API.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-ingest-media'
    Description: Ingest media content with dual embeddings (transcript + visual)
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/ingest_media/
    MemorySize: 512  # moderate footprint for embedding processing
    Timeout: 600  # seconds; 10 minutes for embedding generation
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        # Knowledge Base wiring
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
        # Tables
        TRACKING_TABLE: !Ref TrackingTable
        METADATA_KEY_LIBRARY_TABLE: !Ref MetadataKeyLibraryTable
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        # Messaging / API endpoints
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
        SYNC_REQUEST_QUEUE_URL: !Ref SyncRequestQueue
    Policies:
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - S3CrudPolicy:
          BucketName: !Ref VectorBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBCrudPolicy:
          TableName: !Ref MetadataKeyLibraryTable
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
      - SQSSendMessagePolicy:
          QueueName: !GetAtt SyncRequestQueue.QueueName
      - Statement:
          # Knowledge Base document ingestion and lookups
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
            Action:
              - bedrock:IngestKnowledgeBaseDocuments
              - bedrock:GetKnowledgeBaseDocuments
              - bedrock:StartIngestionJob
              - bedrock:GetIngestionJob
              - bedrock:GetKnowledgeBase
              - bedrock:GetDataSource
              - bedrock:ListDataSources
          # Ingestion-job actions on data-source-style ARNs
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*/data-source/*'
            Action:
              - bedrock:StartIngestionJob
              - bedrock:GetIngestionJob
          # Calls into the stack's GraphQL API
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
            Action: appsync:GraphQL
          # Model invocation in any region (foundation models and profiles)
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
            Action: bedrock:InvokeModel
ProcessImageFunction:
  # Image-processing Lambda. The EventBridge image rules in this template
  # target it, and it is allowed to ingest documents into the Knowledge Base.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-process-image'
    Description: Process uploaded images and ingest to Knowledge Base
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/process_image/
    MemorySize: 512  # moderate: image handling plus Bedrock calls
    Timeout: 300  # seconds; 5 minutes for image processing
    # Reserved concurrency intentionally unset - allows multi-stack deployments
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        # Data source shared by all content types
        DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
        DATA_BUCKET: !Ref DataBucket
        TRACKING_TABLE: !Ref TrackingTable
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        METADATA_KEY_LIBRARY_TABLE: !Ref MetadataKeyLibraryTable
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
        SYNC_REQUEST_QUEUE_URL: !Ref SyncRequestQueue
    Policies:
      # Read and write on the data bucket are granted as two separate
      # SAM policy templates.
      - S3ReadPolicy:
          BucketName: !Ref DataBucket
      - S3WritePolicy:
          BucketName: !Ref DataBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBCrudPolicy:
          TableName: !Ref MetadataKeyLibraryTable
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
      - SQSSendMessagePolicy:
          QueueName: !GetAtt SyncRequestQueue.QueueName
      - Statement:
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
            Action:
              - bedrock:IngestKnowledgeBaseDocuments
              - bedrock:GetKnowledgeBase
              - bedrock:GetDataSource
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
            Action: appsync:GraphQL
          - Effect: Allow
            Resource:
              # Region is wildcarded because inference profiles may route
              # requests to any region.
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
            Action:
              - bedrock:InvokeModel
# Metadata analyzer - samples KB vectors, analyzes metadata fields, generates filter examples
MetadataAnalyzerFunction:
  # Lambda that analyzes Knowledge Base content to discover metadata fields.
  # It can query the KB, invoke Bedrock models, and write to the metadata-key
  # and configuration tables.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-metadata-analyzer'
    Description: Analyze Knowledge Base vectors to discover metadata fields and generate filter examples
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/metadata_analyzer/
    MemorySize: 512
    Timeout: 300  # seconds; analysis may run for up to 5 minutes
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        # Data source shared by all content types
        DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
        DATA_BUCKET: !Ref DataBucket
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        METADATA_KEY_LIBRARY_TABLE: !Ref MetadataKeyLibraryTable
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref MetadataKeyLibraryTable
      - DynamoDBCrudPolicy:
          TableName: !Ref ConfigurationTable
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - Statement:
          # Knowledge Base retrieval
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
            Action:
              - bedrock:Retrieve
              - bedrock:RetrieveAndGenerate
          # Model invocation (region-wildcarded ARNs)
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
            Action:
              - bedrock:InvokeModel
# =========================================================================
# KB Reindex Lambda
# =========================================================================
ReindexKBFunction:
  # Lambda behind the reindex workflow. Per its description it creates a new
  # Knowledge Base, re-ingests documents, and deletes the old one. It may also
  # read and update the configuration of the stack's KB-dependent Lambdas,
  # which are listed by name at the end of the policy.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-reindex-kb'
    CodeUri: src/lambda/reindex_kb/
    Handler: index.lambda_handler
    Description: Reindex Knowledge Base - creates new KB, re-ingests documents, deletes old KB
    Layers:
      - !Ref RagstackCommonLayer
    Runtime: python3.13
    Timeout: 900  # 15 minutes for long-running batch operations
    MemorySize: 512
    Environment:
      Variables:
        LOG_LEVEL: INFO
        TRACKING_TABLE: !Ref TrackingTable
        DATA_BUCKET: !Ref DataBucket
        VECTOR_BUCKET: !Ref VectorBucket
        STACK_NAME: !Ref AWS::StackName
        KB_ROLE_ARN: !GetAtt KnowledgeBaseRole.Arn
        EMBEDDING_MODEL_ARN: !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}::foundation-model/amazon.nova-2-multimodal-embeddings-v1:0'
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        METADATA_KEY_LIBRARY_TABLE: !Ref MetadataKeyLibraryTable
        SCRAPE_JOBS_TABLE: !Ref ScrapeJobsTable
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBCrudPolicy:
          TableName: !Ref MetadataKeyLibraryTable
      - DynamoDBCrudPolicy:
          TableName: !Ref ConfigurationTable
      - DynamoDBReadPolicy:
          TableName: !Ref ScrapeJobsTable
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - S3CrudPolicy:
          BucketName: !Ref VectorBucket
      - Statement:
          # Actions that are authorized against a knowledge-base ARN.
          - Effect: Allow
            Action:
              - bedrock:DeleteKnowledgeBase
              - bedrock:GetKnowledgeBase
              - bedrock:CreateDataSource
              - bedrock:DeleteDataSource
              - bedrock:GetDataSource
              - bedrock:ListDataSources
              - bedrock:IngestKnowledgeBaseDocuments
              - bedrock:StartIngestionJob
              - bedrock:GetIngestionJob
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
          # CreateKnowledgeBase and ListKnowledgeBases have no resource type,
          # so they only take effect with Resource '*'. ListKnowledgeBases was
          # previously scoped to knowledge-base/* and CreateKnowledgeBase was
          # listed in both statements.
          - Effect: Allow
            Action:
              - bedrock:CreateKnowledgeBase
              - bedrock:ListKnowledgeBases
            Resource: '*'
          - Effect: Allow
            Action: appsync:GraphQL
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
          - Effect: Allow
            Action:
              - bedrock:InvokeModel
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
          # Pass the Knowledge Base service role (needed when creating a KB).
          - Effect: Allow
            Action:
              - iam:PassRole
            Resource: !GetAtt KnowledgeBaseRole.Arn
          - Effect: Allow
            Action:
              - s3vectors:CreateVectorBucket
              - s3vectors:GetVectorBucket
              - s3vectors:CreateIndex
              - s3vectors:DeleteIndex
              - s3vectors:GetIndex
              - s3vectors:ListVectors
            Resource:
              - !Sub 'arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucket}'
              - !Sub 'arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucket}/*'
          # Target functions are referenced by name, not !GetAtt. The list
          # includes this function itself.
          - Effect: Allow
            Action:
              - lambda:GetFunctionConfiguration
              - lambda:UpdateFunctionConfiguration
            Resource:
              - !Sub 'arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-query'
              - !Sub 'arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-search'
              - !Sub 'arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-ingest'
              - !Sub 'arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-ingest-media'
              - !Sub 'arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-reindex-kb'
              - !Sub 'arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-process-image'
              - !Sub 'arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-metadata-analyzer'
              - !Sub 'arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-process-zip'
ProcessZipFunction:
  # Lambda targeted by ZipUploadRule. Per its description it processes ZIP
  # archives of images with an optional captions manifest. It can invoke
  # Bedrock models and ingest into the Knowledge Base.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-process-zip'
    CodeUri: src/lambda/process_zip/
    Handler: index.lambda_handler
    Description: Process ZIP archives containing images with optional captions manifest
    Layers:
      - !Ref RagstackCommonLayer
    Runtime: python3.13
    Timeout: 600  # 10 minutes for ZIP processing (potentially many images)
    MemorySize: 1024  # Higher memory for ZIP extraction in memory
    # No reserved concurrency - allows multi-stack deployments
    Environment:
      Variables:
        LOG_LEVEL: INFO
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId  # Unified content data source
        TRACKING_TABLE: !Ref TrackingTable
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
        DATA_BUCKET: !Ref DataBucket
        CAPTION_MODEL_ID: !Ref CaptionModelId
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
    Policies:
      - S3ReadPolicy:
          BucketName: !Ref DataBucket
      - S3WritePolicy:
          BucketName: !Ref DataBucket
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - Statement:
          - Effect: Allow
            Action:
              - bedrock:InvokeModel
            Resource:
              # Wildcard regions needed: inference profiles route to any region
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
          - Effect: Allow
            Action:
              - bedrock:IngestKnowledgeBaseDocuments
              - bedrock:StartIngestionJob
              # Ingestion-job actions are authorized against the knowledge-base
              # ARN, so GetIngestionJob must be granted here to take effect.
              - bedrock:GetIngestionJob
              - bedrock:GetKnowledgeBase
              - bedrock:GetDataSource
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
          # Retained for backward compatibility with the original policy.
          - Effect: Allow
            Action:
              - bedrock:StartIngestionJob
              - bedrock:GetIngestionJob
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*/data-source/*'
          - Effect: Allow
            Action: appsync:GraphQL
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
QueryKBFunction:
  # Chat/query Lambda for the Bedrock Knowledge Base. It has CRUD access to
  # the configuration and conversation-history tables and read-only access to
  # the tracking table, metadata-key table, and data bucket.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-query'
    Description: Query Bedrock Knowledge Base
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/query_kb/
    MemorySize: 1769  # 1 full vCPU for CPU-bound chat response generation
    Timeout: 60  # seconds; 1 minute budget for chat responses
    # Reserved concurrency intentionally unset - allows multi-stack deployments
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        DEMO_MODE: !Ref DemoMode
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        # Data source shared by all content types
        DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        CONVERSATION_TABLE_NAME: !Ref ConversationHistoryTable
        TRACKING_TABLE: !Ref TrackingTable
        METADATA_KEY_LIBRARY_TABLE: !Ref MetadataKeyLibraryTable
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref ConfigurationTable
      - DynamoDBCrudPolicy:
          TableName: !Ref ConversationHistoryTable
      - DynamoDBReadPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBReadPolicy:
          TableName: !Ref MetadataKeyLibraryTable
      - S3ReadPolicy:
          BucketName: !Ref DataBucket
      - Statement:
          # Knowledge Base retrieval and generation
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
            Action:
              - bedrock:Retrieve
              - bedrock:RetrieveAndGenerate
          - Effect: Allow
            Resource:
              # Region is wildcarded because inference profiles may route
              # requests to any region.
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
            Action: bedrock:InvokeModel
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
            Action: bedrock:GetInferenceProfile
          # Explicit DescribeTable grant on the metadata-key table
          - Effect: Allow
            Resource: !GetAtt MetadataKeyLibraryTable.Arn
            Action: dynamodb:DescribeTable
SearchKBFunction:
  # Raw vector-search Lambda. All permissions are hand-written read-only
  # statements: KB Retrieve, model invocation, DynamoDB reads, S3 GetObject.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-search'
    Description: Search Bedrock Knowledge Base (raw vector search)
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/search_kb/
    MemorySize: 128  # minimal: the function only makes Bedrock API calls
    Timeout: 30  # seconds; vector search is fast
    # Reserved concurrency intentionally unset - allows multi-stack deployments
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        # Data source shared by all content types
        DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
        DATA_BUCKET: !Ref DataBucket
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        TRACKING_TABLE: !Ref TrackingTable
        METADATA_KEY_LIBRARY_TABLE: !Ref MetadataKeyLibraryTable
    Policies:
      - Statement:
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
            Action:
              - bedrock:Retrieve
      - Statement:
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
            Action:
              - bedrock:InvokeModel
      # Read-only DynamoDB access across the three tables
      - Statement:
          - Effect: Allow
            Resource:
              - !GetAtt ConfigurationTable.Arn
              - !GetAtt TrackingTable.Arn
              - !GetAtt MetadataKeyLibraryTable.Arn
            Action:
              - dynamodb:GetItem
              - dynamodb:Scan
              - dynamodb:DescribeTable
      # GetObject on the data bucket, used for generating presigned URLs
      - Statement:
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:s3:::${DataBucket}/*'
            Action:
              - s3:GetObject
##########################################################################
# Configuration Resolver Lambda
##########################################################################
ConfigurationResolverFunction:
  # GraphQL resolver Lambda for configuration management. It reads and writes
  # the configuration table, scans/queries the tracking table and its
  # StatusIndex, and may start executions of the processing state machine.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-config'
    Description: GraphQL resolver for configuration management
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/configuration_resolver/
    MemorySize: 128  # minimal: DynamoDB calls only
    Timeout: 30  # seconds; simple DynamoDB operations
    # Reserved concurrency intentionally unset - allows multi-stack deployments
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        DEMO_MODE: !Ref DemoMode
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        TRACKING_TABLE: !Ref TrackingTable
        STATE_MACHINE_ARN: !Ref ProcessingStateMachine
    Policies:
      # Item-level read/write on the configuration table
      - Statement:
          - Effect: Allow
            Resource: !GetAtt ConfigurationTable.Arn
            Action:
              - dynamodb:GetItem
              - dynamodb:PutItem
              - dynamodb:UpdateItem
              - dynamodb:Query
      # Scan/Query on the tracking table and its StatusIndex
      - Statement:
          - Effect: Allow
            Resource:
              - !GetAtt TrackingTable.Arn
              - !Sub '${TrackingTable.Arn}/index/StatusIndex'
            Action:
              - dynamodb:Scan
              - dynamodb:Query
      # Start executions of the processing state machine
      - Statement:
          - Effect: Allow
            Resource: !Ref ProcessingStateMachine
            Action:
              - states:StartExecution
##########################################################################
# API Key Resolver Lambda
##########################################################################
ApiKeyResolverFunction:
  # GraphQL resolver Lambda for AppSync API key management. It receives the
  # API id through APPSYNC_API_ID and may list, create, and delete API keys.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-apikey'
    CodeUri: src/lambda/api_key_resolver/
    Handler: index.lambda_handler
    Description: GraphQL resolver for API key management
    Runtime: python3.13
    Timeout: 30  # Simple AppSync API calls
    MemorySize: 128  # Minimal - just API calls
    # No reserved concurrency - allows multi-stack deployments
    Environment:
      Variables:
        LOG_LEVEL: INFO
        APPSYNC_API_ID: !GetAtt GraphQLApi.ApiId
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
    Policies:
      - Statement:
          - Effect: Allow
            Action:
              - appsync:ListApiKeys
              - appsync:CreateApiKey
              - appsync:DeleteApiKey
            # Use the partition pseudo-parameter, as every other ARN in this
            # template does, so the policy also resolves outside the standard
            # 'aws' partition (e.g. aws-us-gov, aws-cn).
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:*'
# =========================================================================
# Scrape Workflow Lambda Functions
# =========================================================================
ScrapeStartFunction:
  # Entry point for a web-scraping job. It can write to the scrape-jobs and
  # tracking tables, send to the discovery queue, start the scrape state
  # machine, call AppSync, and invoke Bedrock models.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-scrape-start'
    Description: Initiate web scraping job
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/scrape_start/
    MemorySize: 256  # metadata extraction needs more than the minimum
    Timeout: 60  # seconds; includes metadata extraction from the seed URL
    # Reserved concurrency intentionally unset - allows multi-stack deployments
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        SCRAPE_JOBS_TABLE: !Ref ScrapeJobsTable
        SCRAPE_DISCOVERY_QUEUE_URL: !Ref ScrapeDiscoveryQueue
        SCRAPE_STATE_MACHINE_ARN: !Ref ScrapeStateMachine
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        TRACKING_TABLE: !Ref TrackingTable
        DATA_BUCKET: !Ref DataBucket
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref ScrapeJobsTable
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
      - SQSSendMessagePolicy:
          QueueName: !GetAtt ScrapeDiscoveryQueue.QueueName
      # Start executions of the scrape workflow
      - Statement:
          - Effect: Allow
            Resource: !Ref ScrapeStateMachine
            Action:
              - states:StartExecution
      # Calls into the stack's GraphQL API
      - Statement:
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
            Action: appsync:GraphQL
      # Model invocation (region-wildcarded ARNs)
      - Statement:
          - Effect: Allow
            Resource:
              - !Sub 'arn:${AWS::Partition}:bedrock:*::foundation-model/*'
              - !Sub 'arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*'
            Action: bedrock:InvokeModel
ScrapeDiscoverFunction:
  # URL-discovery worker. It is triggered by ScrapeDiscoveryQueue one message
  # at a time and can send to both the discovery and processing queues.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-scrape-discover'
    Description: Discover URLs during web scraping
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/scrape_discover/
    MemorySize: 256  # moderate: HTTP requests plus HTML parsing
    Timeout: 300  # seconds; URL discovery can take time
    # Reserved concurrency intentionally unset - allows multi-stack deployments
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        SCRAPE_JOBS_TABLE: !Ref ScrapeJobsTable
        SCRAPE_URLS_TABLE: !Ref ScrapeUrlsTable
        SCRAPE_DISCOVERY_QUEUE_URL: !Ref ScrapeDiscoveryQueue
        SCRAPE_PROCESSING_QUEUE_URL: !Ref ScrapeProcessingQueue
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
    Events:
      SQSTrigger:
        Type: SQS
        Properties:
          BatchSize: 1
          Queue: !GetAtt ScrapeDiscoveryQueue.Arn
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref ScrapeJobsTable
      - DynamoDBCrudPolicy:
          TableName: !Ref ScrapeUrlsTable
      # Send permission on both scrape queues, including the queue that
      # triggers this function
      - SQSSendMessagePolicy:
          QueueName: !GetAtt ScrapeDiscoveryQueue.QueueName
      - SQSSendMessagePolicy:
          QueueName: !GetAtt ScrapeProcessingQueue.QueueName
      - Statement:
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
            Action: appsync:GraphQL
ScrapeProcessFunction:
  # Page-processing worker. It is triggered by ScrapeProcessingQueue one
  # message at a time and has CRUD access to the data bucket.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-scrape-process'
    Description: Process scraped pages and save to S3
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/scrape_process/
    MemorySize: 512  # moderate: HTML processing plus S3 upload
    Timeout: 300  # seconds; 5 minutes per page (reduced from 15)
    # Reserved concurrency intentionally unset - allows multi-stack deployments
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        SCRAPE_JOBS_TABLE: !Ref ScrapeJobsTable
        SCRAPE_URLS_TABLE: !Ref ScrapeUrlsTable
        DATA_BUCKET: !Ref DataBucket
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
    Events:
      SQSTrigger:
        Type: SQS
        Properties:
          BatchSize: 1
          Queue: !GetAtt ScrapeProcessingQueue.Arn
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref ScrapeJobsTable
      - DynamoDBCrudPolicy:
          TableName: !Ref ScrapeUrlsTable
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      - Statement:
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
            Action: appsync:GraphQL
ScrapeStatusFunction:
  # Status-check Lambda invoked by the scrape state machine (it is the only
  # function that state machine's role may invoke). It may read queue
  # attributes from and send messages to both scrape queues.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-scrape-status'
    Description: Return scrape job status for Step Functions polling
    Runtime: python3.13
    Handler: index.lambda_handler
    CodeUri: src/lambda/scrape_status/
    MemorySize: 128  # minimal: DynamoDB/SQS reads only
    Timeout: 10  # seconds; quick status check
    # Reserved concurrency intentionally unset - allows multi-stack deployments
    Layers:
      - !Ref RagstackCommonLayer
    Tags:
      CostCenter: Engineering
      Project: !Ref AWS::StackName
    Environment:
      Variables:
        LOG_LEVEL: INFO
        SCRAPE_JOBS_TABLE: !Ref ScrapeJobsTable
        SCRAPE_URLS_TABLE: !Ref ScrapeUrlsTable
        TRACKING_TABLE: !Ref TrackingTable
        SCRAPE_DISCOVERY_QUEUE_URL: !Ref ScrapeDiscoveryQueue
        SCRAPE_PROCESSING_QUEUE_URL: !Ref ScrapeProcessingQueue
        GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref ScrapeJobsTable
      - DynamoDBCrudPolicy:
          TableName: !Ref ScrapeUrlsTable
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      # Queue attribute reads and message sends on both scrape queues
      - Statement:
          - Effect: Allow
            Resource:
              - !GetAtt ScrapeDiscoveryQueue.Arn
              - !GetAtt ScrapeProcessingQueue.Arn
            Action:
              - sqs:GetQueueAttributes
              - sqs:SendMessage
      - Statement:
          - Effect: Allow
            Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
            Action: appsync:GraphQL
# =========================================================================
# Step Functions State Machine
# =========================================================================
StateMachineExecutionRole:
  # Execution role for ProcessingStateMachine. Step Functions assumes it to
  # invoke the pipeline Lambdas and to manage CloudWatch Logs delivery.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Action: sts:AssumeRole
          Principal:
            Service: states.amazonaws.com
    Policies:
      # Lambdas the pipeline is allowed to call
      - PolicyName: InvokeLambdas
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource:
                - !GetAtt ProcessDocumentFunction.Arn
                - !GetAtt IngestToKBFunction.Arn
                - !GetAtt IngestMediaFunction.Arn
                - !GetAtt EnqueueBatchesFunction.Arn
                - !GetAtt DetectFileTypeFunction.Arn
                - !GetAtt ProcessTextFunction.Arn
                - !GetAtt ProcessMediaFunction.Arn
              Action:
                - lambda:InvokeFunction
      # Log-delivery management for state machine logging; granted on '*'
      - PolicyName: CloudWatchLogs
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource: '*'
              Action:
                - logs:CreateLogDelivery
                - logs:GetLogDelivery
                - logs:UpdateLogDelivery
                - logs:DeleteLogDelivery
                - logs:ListLogDeliveries
                - logs:PutLogEvents
                - logs:PutResourcePolicy
                - logs:DescribeResourcePolicies
                - logs:DescribeLogGroups
ProcessingStateMachine:
  # Document-processing workflow. The ASL definition is loaded from
  # pipeline.asl.json, with the Lambda ARNs below substituted into it.
  Type: AWS::Serverless::StateMachine
  Properties:
    Name: !Sub '${AWS::StackName}-ProcessingPipeline'
    Role: !GetAtt StateMachineExecutionRole.Arn
    DefinitionUri: src/statemachine/pipeline.asl.json
    DefinitionSubstitutions:
      DetectFileTypeFunctionArn: !GetAtt DetectFileTypeFunction.Arn
      ProcessDocumentFunctionArn: !GetAtt ProcessDocumentFunction.Arn
      ProcessTextFunctionArn: !GetAtt ProcessTextFunction.Arn
      ProcessMediaFunctionArn: !GetAtt ProcessMediaFunction.Arn
      EnqueueBatchesFunctionArn: !GetAtt EnqueueBatchesFunction.Arn
      IngestToKBFunctionArn: !GetAtt IngestToKBFunction.Arn
      IngestMediaFunctionArn: !GetAtt IngestMediaFunction.Arn
    # Log every event, including execution input/output, to the log group
    Logging:
      Level: ALL
      IncludeExecutionData: true
      Destinations:
        - CloudWatchLogsLogGroup:
            LogGroupArn: !GetAtt StateMachineLogGroup.Arn
StateMachineLogGroup:
  # Log destination for ProcessingStateMachine; events are kept for 30 days.
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: 30
    LogGroupName: !Sub '/aws/vendedlogs/states/${AWS::StackName}-Pipeline'
# =========================================================================
# Scrape Workflow State Machine
# =========================================================================
ScrapeStateMachineExecutionRole:
  # Execution role for ScrapeStateMachine. It allows invoking the status
  # Lambda, updating items in the scrape-jobs table, sending to the
  # sync-request queue, and managing CloudWatch Logs delivery.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Action: sts:AssumeRole
          Principal:
            Service: states.amazonaws.com
    Policies:
      - PolicyName: InvokeLambdas
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource:
                - !GetAtt ScrapeStatusFunction.Arn
              Action:
                - lambda:InvokeFunction
      # UpdateItem on the scrape-jobs table, granted to the workflow itself
      - PolicyName: DynamoDBAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource:
                - !GetAtt ScrapeJobsTable.Arn
              Action:
                - dynamodb:UpdateItem
      # SendMessage on the sync-request queue, granted to the workflow itself
      - PolicyName: SQSAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource:
                - !GetAtt SyncRequestQueue.Arn
              Action:
                - sqs:SendMessage
      # Log-delivery management for state machine logging; granted on '*'
      - PolicyName: CloudWatchLogs
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource: '*'
              Action:
                - logs:CreateLogDelivery
                - logs:GetLogDelivery
                - logs:UpdateLogDelivery
                - logs:DeleteLogDelivery
                - logs:ListLogDeliveries
                - logs:PutLogEvents
                - logs:PutResourcePolicy
                - logs:DescribeResourcePolicies
                - logs:DescribeLogGroups
ScrapeStateMachine:
  # Web-scrape workflow. The ASL definition is loaded from scrape.asl.json,
  # with the status Lambda ARN, jobs table, and sync queue substituted in.
  Type: AWS::Serverless::StateMachine
  Properties:
    Name: !Sub '${AWS::StackName}-ScrapeWorkflow'
    Role: !GetAtt ScrapeStateMachineExecutionRole.Arn
    DefinitionUri: src/statemachine/scrape.asl.json
    DefinitionSubstitutions:
      ScrapeStatusFunctionArn: !GetAtt ScrapeStatusFunction.Arn
      ScrapeJobsTable: !Ref ScrapeJobsTable
      SyncRequestQueueUrl: !Ref SyncRequestQueue
    # Log every event, including execution input/output, to the log group
    Logging:
      Level: ALL
      IncludeExecutionData: true
      Destinations:
        - CloudWatchLogsLogGroup:
            LogGroupArn: !GetAtt ScrapeStateMachineLogGroup.Arn
ScrapeStateMachineLogGroup:
  # Log destination for ScrapeStateMachine; events are kept for 30 days.
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: 30
    LogGroupName: !Sub '/aws/vendedlogs/states/${AWS::StackName}-ScrapeWorkflow'
# =========================================================================
# KB Reindex State Machine
# =========================================================================
ReindexStateMachineExecutionRole:
  # Execution role for ReindexStateMachine. It allows invoking the reindex
  # Lambda and managing CloudWatch Logs delivery.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Action: sts:AssumeRole
          Principal:
            Service: states.amazonaws.com
    Policies:
      - PolicyName: InvokeLambdas
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource:
                - !GetAtt ReindexKBFunction.Arn
              Action:
                - lambda:InvokeFunction
      # Log-delivery management for state machine logging; granted on '*'
      - PolicyName: CloudWatchLogs
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource: '*'
              Action:
                - logs:CreateLogDelivery
                - logs:GetLogDelivery
                - logs:UpdateLogDelivery
                - logs:DeleteLogDelivery
                - logs:ListLogDeliveries
                - logs:PutLogEvents
                - logs:PutResourcePolicy
                - logs:DescribeResourcePolicies
                - logs:DescribeLogGroups
ReindexStateMachine:
  # Knowledge Base reindex workflow. The ASL definition is loaded from
  # reindex.asl.json, with the reindex Lambda ARN substituted in.
  Type: AWS::Serverless::StateMachine
  Properties:
    Name: !Sub '${AWS::StackName}-ReindexWorkflow'
    Role: !GetAtt ReindexStateMachineExecutionRole.Arn
    DefinitionUri: src/statemachine/reindex.asl.json
    DefinitionSubstitutions:
      ReindexKBFunctionArn: !GetAtt ReindexKBFunction.Arn
    # Log every event, including execution input/output, to the log group
    Logging:
      Level: ALL
      IncludeExecutionData: true
      Destinations:
        - CloudWatchLogsLogGroup:
            LogGroupArn: !GetAtt ReindexStateMachineLogGroup.Arn
ReindexStateMachineLogGroup:
  # Log destination for ReindexStateMachine; events are kept for 30 days.
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: 30
    LogGroupName: !Sub '/aws/vendedlogs/states/${AWS::StackName}-ReindexWorkflow'
# =========================================================================
# EventBridge Rule for S3 Upload Trigger
# =========================================================================
S3UploadRule:
  # Forwards S3 "Object Created" events for keys under input/ in the data
  # bucket to DocumentProcessingQueue, reshaped into a processing request.
  Type: AWS::Events::Rule
  Properties:
    Name: !Sub '${AWS::StackName}-S3UploadTrigger'
    Description: Trigger processing pipeline on S3 upload to input/ prefix
    State: ENABLED
    EventPattern:
      source:
        - aws.s3
      detail-type:
        - Object Created
      detail:
        bucket:
          name:
            - !Ref DataBucket
        object:
          key:
            # Keep a single matcher here: entries in this array are OR'd
            - prefix: 'input/'
    Targets:
      - Id: SendToProcessingQueue
        Arn: !GetAtt DocumentProcessingQueue.Arn
        RetryPolicy:
          MaximumRetryAttempts: 2
        InputTransformer:
          # <bucket> and <key> are extracted from the S3 event detail
          InputPathsMap:
            bucket: $.detail.bucket.name
            key: $.detail.object.key
          InputTemplate: |
            {
              "document_id": "<key>",
              "input_s3_uri": "s3://<bucket>/<key>",
              "output_s3_prefix": "s3://<bucket>/content/<key>/"
            }
# =========================================================================
# EventBridge Rule for Image Upload Trigger
# =========================================================================
ImageUploadRule:
  # Invokes ProcessImageFunction when an object whose key ends in
  # /metadata.json is created in the data bucket.
  # NOTE(review): the pattern has no prefix filter, so a matching key anywhere
  # in the bucket triggers the rule, not only under content/ as the
  # description suggests - confirm this is intended.
  Type: AWS::Events::Rule
  Properties:
    Name: !Sub '${AWS::StackName}-ImageUploadTrigger'
    Description: Trigger image processing on S3 upload to content/ prefix (metadata.json)
    State: ENABLED
    EventPattern:
      source:
        - aws.s3
      detail-type:
        - Object Created
      detail:
        bucket:
          name:
            - !Ref DataBucket
        object:
          key:
            - suffix: '/metadata.json'
    Targets:
      - Id: TriggerImageProcessing
        Arn: !GetAtt ProcessImageFunction.Arn
        RetryPolicy:
          MaximumRetryAttempts: 2
        InputTransformer:
          # <bucket> and <key> are extracted from the S3 event detail
          InputPathsMap:
            bucket: $.detail.bucket.name
            key: $.detail.object.key
          InputTemplate: |
            {
              "image_id": "<key>",
              "input_s3_uri": "s3://<bucket>/<key>"
            }
ImageUploadRulePermission:
  # Allows EventBridge to invoke ProcessImageFunction, but only on behalf of
  # ImageUploadRule (restricted by SourceArn).
  Type: AWS::Lambda::Permission
  Properties:
    Action: lambda:InvokeFunction
    FunctionName: !Ref ProcessImageFunction
    Principal: events.amazonaws.com
    SourceArn: !GetAtt ImageUploadRule.Arn
# EventBridge Rule for Auto-Process Image Uploads (API/MCP)
# Triggers on every object created under the content/ prefix
# The pattern has no suffix filter, so non-image keys (e.g. caption.txt or metadata.json) also match
# The Lambda checks if auto_process=true in DynamoDB before processing
ImageAutoProcessRule:
  # Invokes ProcessImageFunction for every object created under content/ in
  # the data bucket, tagging the payload with trigger_type "auto_process".
  # Delivery is not retried (MaximumRetryAttempts: 0).
  # NOTE(review): keys under content/ that end in /metadata.json also match
  # ImageUploadRule, so the function is invoked by both rules for those keys -
  # confirm the handler tolerates this.
  Type: AWS::Events::Rule
  Properties:
    Name: !Sub '${AWS::StackName}-ImageAutoProcess'
    Description: Trigger image processing on direct image upload (for API/MCP with autoProcess=true)
    State: ENABLED
    EventPattern:
      source:
        - aws.s3
      detail-type:
        - Object Created
      detail:
        bucket:
          name:
            - !Ref DataBucket
        object:
          key:
            # Prefix match only; per the original note, the Lambda skips
            # media artifacts itself
            - prefix: 'content/'
    Targets:
      - Id: TriggerImageAutoProcess
        Arn: !GetAtt ProcessImageFunction.Arn
        RetryPolicy:
          MaximumRetryAttempts: 0
        InputTransformer:
          # <bucket> and <key> are extracted from the S3 event detail
          InputPathsMap:
            bucket: $.detail.bucket.name
            key: $.detail.object.key
          InputTemplate: |
            {
              "image_id": "<key>",
              "input_s3_uri": "s3://<bucket>/<key>",
              "trigger_type": "auto_process"
            }
ImageAutoProcessRulePermission:
  # Resource-based permission: EventBridge may invoke ProcessImageFunction
  # when the call originates from ImageAutoProcessRule.
  Type: AWS::Lambda::Permission
  Properties:
    Principal: events.amazonaws.com
    Action: lambda:InvokeFunction
    FunctionName: !Ref ProcessImageFunction
    SourceArn: !GetAtt ImageAutoProcessRule.Arn
# EventBridge rule to trigger ZIP processing.
# Invokes ProcessZipFunction with {"bucket", "key"} when a .zip object is
# created under uploads/ in the data bucket.
ZipUploadRule:
  Type: AWS::Events::Rule
  Properties:
    Name: !Sub '${AWS::StackName}-zip-upload'
    Description: Trigger ZIP processing on S3 upload to uploads/ prefix (.zip files)
    EventPattern:
      source:
        - aws.s3
      detail-type:
        - Object Created
      detail:
        bucket:
          name:
            - !Ref DataBucket
        object:
          key:
            # Matchers listed in one array are OR'd, so a separate prefix and
            # suffix entry would fire for ANY key under uploads/ and for ANY
            # .zip in the bucket. A single wildcard matcher requires both.
            - wildcard: 'uploads/*.zip'
    State: ENABLED
    Targets:
      - Arn: !GetAtt ProcessZipFunction.Arn
        Id: TriggerZipProcessing
        InputTransformer:
          InputPathsMap:
            bucket: $.detail.bucket.name
            key: $.detail.object.key
          InputTemplate: |
            {
            "bucket": "<bucket>",
            "key": "<key>"
            }
        # EventBridge re-attempts a failed delivery at most twice.
        RetryPolicy:
          MaximumRetryAttempts: 2
ZipUploadRulePermission:
  # Resource-based permission: EventBridge may invoke ProcessZipFunction
  # when the call originates from ZipUploadRule.
  Type: AWS::Lambda::Permission
  Properties:
    Principal: events.amazonaws.com
    Action: lambda:InvokeFunction
    FunctionName: !Ref ProcessZipFunction
    SourceArn: !GetAtt ZipUploadRule.Arn
# =========================================================================
# Visual Embeddings Pipeline - EventBridge Rules
# =========================================================================
# Rule 1: Trigger ProcessMediaFunction when .mp4 files land in content/
# This enables direct upload to content/ without Step Functions for media.
# No InputTransformer is configured, so the target receives the S3 event as-is.
MediaContentUploadRule:
  Type: AWS::Events::Rule
  Properties:
    Name: !Sub '${AWS::StackName}-MediaContentTrigger'
    Description: Trigger media processing when video/audio uploaded to content/
    EventPattern:
      source:
        - aws.s3
      detail-type:
        - Object Created
      detail:
        bucket:
          name:
            - !Ref DataBucket
        object:
          key:
            # Matchers listed in one array are OR'd; separate prefix/suffix
            # entries would match every object in content/ (and every .mp4
            # anywhere). One wildcard matcher requires both conditions.
            - wildcard: 'content/*.mp4'
    State: ENABLED
    Targets:
      - Arn: !GetAtt ProcessMediaFunction.Arn
        Id: TriggerProcessMedia
# Rule for .webm video files created under content/.
# No InputTransformer is configured, so the target receives the S3 event as-is.
MediaContentWebmRule:
  Type: AWS::Events::Rule
  Properties:
    Name: !Sub '${AWS::StackName}-MediaContentWebm'
    Description: Trigger media processing for webm files
    EventPattern:
      source:
        - aws.s3
      detail-type:
        - Object Created
      detail:
        bucket:
          name:
            - !Ref DataBucket
        object:
          key:
            # Matchers listed in one array are OR'd; separate prefix/suffix
            # entries would match every object in content/ (and every .webm
            # anywhere). One wildcard matcher requires both conditions.
            - wildcard: 'content/*.webm'
    State: ENABLED
    Targets:
      - Arn: !GetAtt ProcessMediaFunction.Arn
        Id: TriggerProcessMedia
# Rule for .mp3 audio files created under content/.
# No InputTransformer is configured, so the target receives the S3 event as-is.
MediaContentMp3Rule:
  Type: AWS::Events::Rule
  Properties:
    Name: !Sub '${AWS::StackName}-MediaContentMp3'
    Description: Trigger media processing for mp3 files
    EventPattern:
      source:
        - aws.s3
      detail-type:
        - Object Created
      detail:
        bucket:
          name:
            - !Ref DataBucket
        object:
          key:
            # Matchers listed in one array are OR'd; separate prefix/suffix
            # entries would match every object in content/ (and every .mp3
            # anywhere). One wildcard matcher requires both conditions.
            - wildcard: 'content/*.mp3'
    State: ENABLED
    Targets:
      - Arn: !GetAtt ProcessMediaFunction.Arn
        Id: TriggerProcessMedia
# Rule for .wav audio files created under content/.
# No InputTransformer is configured, so the target receives the S3 event as-is.
MediaContentWavRule:
  Type: AWS::Events::Rule
  Properties:
    Name: !Sub '${AWS::StackName}-MediaContentWav'
    Description: Trigger media processing for wav files
    EventPattern:
      source:
        - aws.s3
      detail-type:
        - Object Created
      detail:
        bucket:
          name:
            - !Ref DataBucket
        object:
          key:
            # Matchers listed in one array are OR'd; separate prefix/suffix
            # entries would match every object in content/ (and every .wav
            # anywhere). One wildcard matcher requires both conditions.
            - wildcard: 'content/*.wav'
    State: ENABLED
    Targets:
      - Arn: !GetAtt ProcessMediaFunction.Arn
        Id: TriggerProcessMedia
# One Lambda permission per media rule: each lets EventBridge invoke
# ProcessMediaFunction, scoped to the ARN of a single rule.
ProcessMediaFunctionEventBridgePermissionMp4:
  Type: AWS::Lambda::Permission
  Properties:
    Principal: events.amazonaws.com
    Action: lambda:InvokeFunction
    FunctionName: !Ref ProcessMediaFunction
    SourceArn: !GetAtt MediaContentUploadRule.Arn
ProcessMediaFunctionEventBridgePermissionWebm:
  Type: AWS::Lambda::Permission
  Properties:
    Principal: events.amazonaws.com
    Action: lambda:InvokeFunction
    FunctionName: !Ref ProcessMediaFunction
    SourceArn: !GetAtt MediaContentWebmRule.Arn
ProcessMediaFunctionEventBridgePermissionMp3:
  Type: AWS::Lambda::Permission
  Properties:
    Principal: events.amazonaws.com
    Action: lambda:InvokeFunction
    FunctionName: !Ref ProcessMediaFunction
    SourceArn: !GetAtt MediaContentMp3Rule.Arn
ProcessMediaFunctionEventBridgePermissionWav:
  Type: AWS::Lambda::Permission
  Properties:
    Principal: events.amazonaws.com
    Action: lambda:InvokeFunction
    FunctionName: !Ref ProcessMediaFunction
    SourceArn: !GetAtt MediaContentWavRule.Arn
# Note: Visual embedding ingestion for videos is now handled by IngestMedia
# which calls StartIngestionJob after transcript ingestion completes.
# Images are handled by ProcessImageFunction which calls StartIngestionJob directly.
# The VisualContentUploadRule has been removed as part of the simplified media pipeline.
# =========================================================================
# Bedrock Knowledge Base
# =========================================================================
KnowledgeBaseRole:
  # Service role assumed by Amazon Bedrock for the knowledge base. Grants
  # access to the data bucket, the vector bucket / S3 Vectors index, and the
  # Nova multimodal embeddings model.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service: bedrock.amazonaws.com
          Action: sts:AssumeRole
          # Only knowledge bases in this account and region may assume the role.
          Condition:
            StringEquals:
              'aws:SourceAccount': !Ref AWS::AccountId
            ArnLike:
              'aws:SourceArn': !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
    Policies:
      - PolicyName: S3DataSourceAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            # Bucket-level permissions for data source and multimodal storage
            - Effect: Allow
              Action:
                - s3:ListBucket
                - s3:GetBucketLocation
              Resource:
                - !Sub '${DataBucket.Arn}'
            # Object-level permissions (read, write, delete for multimodal storage)
            - Effect: Allow
              Action:
                - s3:GetObject
                - s3:PutObject
                - s3:DeleteObject
              Resource:
                - !Sub '${DataBucket.Arn}/*'
      - PolicyName: S3VectorsAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
                - s3:ListBucket
              Resource:
                - !Sub '${VectorBucket.Arn}'
            - Effect: Allow
              Action:
                - s3:GetObject
                - s3:PutObject
              Resource:
                - !Sub '${VectorBucket.Arn}/*'
            - Effect: Allow
              Action:
                # s3vectors:GetIndex is the index-read action listed for the
                # S3 Vectors service; it is added alongside the pre-existing
                # entries, which are kept untouched for backward compatibility.
                # NOTE(review): DescribeIndex / ReadVectors / WriteVectors do
                # not appear to be S3 Vectors actions - confirm and prune.
                - s3vectors:GetIndex
                - s3vectors:DescribeIndex
                - s3vectors:ReadVectors
                - s3vectors:WriteVectors
                - s3vectors:PutVectors
                - s3vectors:QueryVectors
                - s3vectors:GetVectors
                - s3vectors:DeleteVectors
              Resource:
                - !Sub 'arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucket}'
                - !Sub 'arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucket}/*'
      - PolicyName: BedrockModelAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            # Nova Multimodal Embeddings - sync invocation
            - Effect: Allow
              Action:
                - bedrock:InvokeModel
              Resource:
                - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}::foundation-model/amazon.nova-2-multimodal-embeddings-v1:0'
            # Nova Multimodal Embeddings - async invocation for images/video/audio
            - Effect: Allow
              Action:
                - bedrock:InvokeModel
                - bedrock:GetAsyncInvoke
              Resource:
                - !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:async-invoke/*'
KnowledgeBaseCustomResourceFunction:
  # Lambda backing the Custom::KnowledgeBase resource. Its policy covers
  # Bedrock knowledge base / data source management, S3 Vectors bucket and
  # index management, passing KnowledgeBaseRole, and one SSM parameter that
  # holds the knowledge base id.
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub "${AWS::StackName}-kb-init"
    Description: Custom resource for Knowledge Base creation with S3 Vectors
    CodeUri: src/lambda/kb_custom_resource/
    Handler: index.lambda_handler
    Runtime: python3.13
    Layers:
      - !Ref RagstackCommonLayer
    MemorySize: 128  # Minimal - just API calls to Bedrock
    Timeout: 300  # KB creation can take time
    # No reserved concurrency - allows multi-stack deployments
    Environment:
      Variables:
        LOG_LEVEL: INFO
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
    Policies:
      - Statement:
          # Knowledge base and data source lifecycle.
          # NOTE(review): the bedrock-agent:* entries mirror the bedrock:*
          # ones; confirm whether that prefix is needed at all.
          - Effect: Allow
            Action:
              - bedrock:CreateKnowledgeBase
              - bedrock:DeleteKnowledgeBase
              - bedrock:GetKnowledgeBase
              - bedrock:UpdateKnowledgeBase
              - bedrock:ListKnowledgeBases
              - bedrock:CreateDataSource
              - bedrock:DeleteDataSource
              - bedrock:GetDataSource
              - bedrock:ListDataSources
              - bedrock-agent:CreateKnowledgeBase
              - bedrock-agent:DeleteKnowledgeBase
              - bedrock-agent:GetKnowledgeBase
              - bedrock-agent:UpdateKnowledgeBase
              - bedrock-agent:ListKnowledgeBases
              - bedrock-agent:CreateDataSource
              - bedrock-agent:DeleteDataSource
              - bedrock-agent:GetDataSource
              - bedrock-agent:ListDataSources
            Resource: "*"
          # Vector bucket and index management, limited to this stack's bucket.
          # NOTE(review): verify DescribeIndex / ListIndices against the
          # S3 Vectors action list (GetIndex / ListIndexes) - confirm.
          - Effect: Allow
            Action:
              - s3vectors:CreateVectorBucket
              - s3vectors:GetVectorBucket
              - s3vectors:CreateIndex
              - s3vectors:DeleteIndex
              - s3vectors:DescribeIndex
              - s3vectors:ListIndices
            Resource:
              - !Sub "arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucket}"
              - !Sub "arn:${AWS::Partition}:s3vectors:${AWS::Region}:${AWS::AccountId}:bucket/${VectorBucket}/*"
          # The function hands KnowledgeBaseRole to Bedrock.
          - Effect: Allow
            Action:
              - iam:PassRole
            Resource: !GetAtt KnowledgeBaseRole.Arn
          # Single SSM parameter: /<stack>/KnowledgeBaseId
          - Effect: Allow
            Action:
              - ssm:PutParameter
              - ssm:DeleteParameter
              - ssm:GetParameter
            Resource: !Sub "arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/${AWS::StackName}/KnowledgeBaseId"
KnowledgeBase:
  # Custom resource handled by KnowledgeBaseCustomResourceFunction. Other
  # resources read its KnowledgeBaseId and DataSourceId attributes.
  Type: Custom::KnowledgeBase
  Properties:
    ServiceToken: !GetAtt KnowledgeBaseCustomResourceFunction.Arn
    ProjectName: !Ref AWS::StackName
    Region: !Ref AWS::Region
    KnowledgeBaseName: !Sub "${AWS::StackName}-kb"
    IndexName: !Sub "${AWS::StackName}-index"
    RoleArn: !GetAtt KnowledgeBaseRole.Arn
    DataBucket: !Ref DataBucket
    VectorBucket: !Ref VectorBucket
    # Nova Multimodal Embeddings - supports text, images, video, audio
    # Currently only available in us-east-1. When available elsewhere, just deploy there.
    EmbedModelArn: !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}::foundation-model/amazon.nova-2-multimodal-embeddings-v1:0"
    # Version property forces custom resource update to return new attributes
    Version: "1"  # Reset for clean deployment
# Initial sync to establish KB tracking baseline.
# Lambda backing Custom::InitialSync; its only permission is starting
# ingestion jobs on knowledge bases / data sources in this account and region.
InitialSyncFunction:
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub "${AWS::StackName}-initial-sync"
    Description: Triggers initial KB sync on stack creation
    CodeUri: src/lambda/initial_sync/
    Handler: index.lambda_handler
    Runtime: python3.13
    MemorySize: 128
    Timeout: 60
    Policies:
      - Statement:
          - Effect: Allow
            Action:
              - bedrock:StartIngestionJob
            Resource:
              - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*"
              - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*/data-source/*"
InitialSync:
  # Custom resource that passes the knowledge base and data source ids
  # produced by KnowledgeBase to InitialSyncFunction.
  Type: Custom::InitialSync
  DependsOn:
    - KnowledgeBase
  Properties:
    ServiceToken: !GetAtt InitialSyncFunction.Arn
    DataSourceId: !GetAtt KnowledgeBase.DataSourceId
    KnowledgeBaseId: !GetAtt KnowledgeBase.KnowledgeBaseId
# =========================================================================
# Cognito Authentication
# =========================================================================
UserPool:
  # Cognito user pool: email is the username, self sign-up is disabled
  # (admin-created users only), and TOTP MFA is optional.
  Type: AWS::Cognito::UserPool
  Properties:
    UserPoolName: !Sub "${AWS::StackName}-Users"
    UsernameAttributes:
      - email
    AutoVerifiedAttributes:
      - email
    Schema:
      - Name: email
        Required: true
        Mutable: false
    AdminCreateUserConfig:
      AllowAdminCreateUserOnly: true
    Policies:
      PasswordPolicy:
        MinimumLength: 8
        RequireLowercase: true
        RequireUppercase: true
        RequireNumbers: true
        RequireSymbols: true
    MfaConfiguration: OPTIONAL
    EnabledMfas:
      - SOFTWARE_TOKEN_MFA
    AccountRecoverySetting:
      RecoveryMechanisms:
        - Name: verified_email
          Priority: 1
UserPoolClient:
  # App client without a secret; only SRP sign-in and refresh-token flows
  # are enabled.
  Type: AWS::Cognito::UserPoolClient
  Properties:
    UserPoolId: !Ref UserPool
    ClientName: !Sub "${AWS::StackName}-WebClient"
    GenerateSecret: false
    PreventUserExistenceErrors: ENABLED
    ExplicitAuthFlows:
      - ALLOW_USER_SRP_AUTH
      - ALLOW_REFRESH_TOKEN_AUTH
    # Token lifetimes; the unit for each value is given in TokenValidityUnits.
    AccessTokenValidity: 60
    IdTokenValidity: 60
    RefreshTokenValidity: 30
    TokenValidityUnits:
      AccessToken: minutes
      IdToken: minutes
      RefreshToken: days
IdentityPool:
  # Identity pool federating the user pool client; guest identities are
  # allowed as well.
  Type: AWS::Cognito::IdentityPool
  Properties:
    IdentityPoolName: !Sub "${AWS::StackName}Identity"
    CognitoIdentityProviders:
      - ProviderName: !GetAtt UserPool.ProviderName
        ClientId: !Ref UserPoolClient
    AllowUnauthenticatedIdentities: true  # Enable guest access for chat widget
IdentityPoolRoleAttachment:
  # Binds the identity pool to the IAM roles used for signed-in and guest
  # identities.
  Type: AWS::Cognito::IdentityPoolRoleAttachment
  Properties:
    IdentityPoolId: !Ref IdentityPool
    Roles:
      unauthenticated: !GetAtt UnauthenticatedRole.Arn
      authenticated: !GetAtt AuthenticatedRole.Arn
UnauthenticatedRole:
  # IAM role for guest (unauthenticated) identities of IdentityPool. It only
  # grants appsync:GraphQL.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: "2012-10-17"
      Statement:
        - Effect: Allow
          Action: sts:AssumeRoleWithWebIdentity
          Principal:
            Federated: cognito-identity.amazonaws.com
          # Token must be issued for this identity pool and carry the
          # "unauthenticated" amr value.
          Condition:
            StringEquals:
              "cognito-identity.amazonaws.com:aud": !Ref IdentityPool
            "ForAnyValue:StringLike":
              "cognito-identity.amazonaws.com:amr": unauthenticated
    Policies:
      - PolicyName: UnauthenticatedAccess
        PolicyDocument:
          Version: "2012-10-17"
          Statement:
            # Allow access to both SAM AppSync API and Amplify AppSync API
            # NOTE(review): the second resource covers every field of every
            # AppSync API in this account and region - confirm this breadth
            # is intended for guests.
            - Effect: Allow
              Action:
                - appsync:GraphQL
              Resource:
                - !Sub "${GraphQLApi.Arn}/*"
                - !Sub "arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/*/types/*/fields/*"
AuthenticatedRole:
  # IAM role for signed-in identities of IdentityPool: object read/write on
  # the data bucket plus GraphQL access to this stack's API.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: "2012-10-17"
      Statement:
        - Effect: Allow
          Action: sts:AssumeRoleWithWebIdentity
          Principal:
            Federated: cognito-identity.amazonaws.com
          # Token must be issued for this identity pool and carry the
          # "authenticated" amr value.
          Condition:
            StringEquals:
              "cognito-identity.amazonaws.com:aud": !Ref IdentityPool
            "ForAnyValue:StringLike":
              "cognito-identity.amazonaws.com:amr": authenticated
    Policies:
      - PolicyName: AuthenticatedAccess
        PolicyDocument:
          Version: "2012-10-17"
          Statement:
            # Applies to every object key in the data bucket.
            - Effect: Allow
              Action:
                - s3:PutObject
                - s3:GetObject
              Resource:
                - !Sub "${DataBucket.Arn}/*"
            - Effect: Allow
              Action:
                - appsync:GraphQL
              Resource:
                - !Sub "${GraphQLApi.Arn}/*"
# =========================================================================
# AppSync GraphQL API
# =========================================================================
GraphQLApi:
  # AppSync API. Cognito user pools is the primary auth mode; IAM and API key
  # are accepted as additional modes. Field-level logging is at ERROR.
  Type: AWS::AppSync::GraphQLApi
  Properties:
    Name: !Sub "${AWS::StackName}-API"
    AuthenticationType: AMAZON_COGNITO_USER_POOLS
    UserPoolConfig:
      AwsRegion: !Ref AWS::Region
      UserPoolId: !Ref UserPool
      DefaultAction: ALLOW
    AdditionalAuthenticationProviders:
      - AuthenticationType: AWS_IAM
      - AuthenticationType: API_KEY
    LogConfig:
      FieldLogLevel: ERROR
      CloudWatchLogsRoleArn: !GetAtt AppSyncLogsRole.Arn
GraphQLSchema:
  # Schema for GraphQLApi, read from a local file path in the project
  # source tree.
  Type: AWS::AppSync::GraphQLSchema
  Properties:
    DefinitionS3Location: ./src/api/schema.graphql
    ApiId: !GetAtt GraphQLApi.ApiId
# API Key for public access (theme config)
GraphQLApiKey:
  Type: AWS::AppSync::ApiKey
  Properties:
    Description: Public API key for theme configuration
    ApiId: !GetAtt GraphQLApi.ApiId
    # Expiry as a Unix epoch timestamp in seconds.
    # NOTE(review): this is a fixed point in time, so it presumably has to be
    # refreshed before it passes - confirm how it is maintained.
    Expires: 1795791181
AppSyncLogsRole:
  # Role AppSync assumes to push API logs to CloudWatch Logs (referenced by
  # GraphQLApi LogConfig).
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service: appsync.amazonaws.com
          Action: sts:AssumeRole
    ManagedPolicyArns:
      # Partition comes from the pseudo parameter, like every other ARN in
      # this template, so the reference also resolves outside the "aws"
      # partition.
      - !Sub 'arn:${AWS::Partition}:iam::aws:policy/service-role/AWSAppSyncPushToCloudWatchLogs'
# AppSync resolver Lambda - main CRUD operations.
# Environment variables hand the function the tables, bucket, state machines
# and helper Lambdas it works with; the Policies list grants matching access.
AppSyncResolverFunction:
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub "${AWS::StackName}-appsync"
    CodeUri: src/lambda/appsync_resolvers/
    Handler: index.lambda_handler
    Runtime: python3.13
    Layers:
      - !Ref RagstackCommonLayer
    MemorySize: 512  # Moderate - image caption generation uses Bedrock
    Timeout: 60  # 1 minute for complex operations
    # No reserved concurrency - allows multi-stack deployments
    Environment:
      Variables:
        TRACKING_TABLE: !Ref TrackingTable
        DATA_BUCKET: !Ref DataBucket
        STATE_MACHINE_ARN: !GetAtt ProcessingStateMachine.Arn
        # Scrape workflow
        SCRAPE_JOBS_TABLE: !Ref ScrapeJobsTable
        SCRAPE_URLS_TABLE: !Ref ScrapeUrlsTable
        SCRAPE_START_FUNCTION_ARN: !GetAtt ScrapeStartFunction.Arn
        # Caption generation model selection lives in the configuration table
        CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
        # Key library used by manual metadata extraction mode
        METADATA_KEY_LIBRARY_TABLE: !Ref MetadataKeyLibraryTable
        # Analyzer Lambda for KB metadata analysis
        METADATA_ANALYZER_FUNCTION_ARN: !GetAtt MetadataAnalyzerFunction.Arn
        # State machine for KB reindex operations
        REINDEX_STATE_MACHINE_ARN: !GetAtt ReindexStateMachine.Arn
        # Image processing Lambda used by submitImage
        PROCESS_IMAGE_FUNCTION_ARN: !GetAtt ProcessImageFunction.Arn
        # Ingestion Lambda used for single document reindexing
        INGEST_TO_KB_FUNCTION_ARN: !GetAtt IngestToKBFunction.Arn
        # Demo mode: rate limiting and feature restrictions
        DEMO_MODE: !Ref DemoMode
    Tags:
      Project: !Ref AWS::StackName
      CostCenter: Engineering
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref TrackingTable
      - S3CrudPolicy:
          BucketName: !Ref DataBucket
      # Start executions of the processing and reindex state machines
      - Statement:
          - Effect: Allow
            Action:
              - states:StartExecution
            Resource:
              - !GetAtt ProcessingStateMachine.Arn
              - !GetAtt ReindexStateMachine.Arn
      # Scrape tables: read/write on jobs, read-only on URLs
      - DynamoDBCrudPolicy:
          TableName: !Ref ScrapeJobsTable
      - DynamoDBReadPolicy:
          TableName: !Ref ScrapeUrlsTable
      # Metadata key library: read-only
      - DynamoDBReadPolicy:
          TableName: !Ref MetadataKeyLibraryTable
      # Helper Lambdas this function may invoke
      - Statement:
          - Effect: Allow
            Action:
              - lambda:InvokeFunction
            Resource: !GetAtt ScrapeStartFunction.Arn
      - Statement:
          - Effect: Allow
            Action:
              - lambda:InvokeFunction
            Resource: !GetAtt MetadataAnalyzerFunction.Arn
      - Statement:
          - Effect: Allow
            Action:
              - lambda:InvokeFunction
            Resource: !GetAtt ProcessImageFunction.Arn
      - Statement:
          - Effect: Allow
            Action:
              - lambda:InvokeFunction
            Resource: !GetAtt IngestToKBFunction.Arn
      # Stop executions of the <stack>-ScrapeWorkflow state machine
      - Statement:
          - Effect: Allow
            Action:
              - states:StopExecution
            Resource: !Sub "arn:${AWS::Partition}:states:${AWS::Region}:${AWS::AccountId}:execution:${AWS::StackName}-ScrapeWorkflow:*"
      # Configuration table read for caption model selection
      - DynamoDBReadPolicy:
          TableName: !Ref ConfigurationTable
      # Bedrock InvokeModel for AI caption generation
      - Statement:
          - Effect: Allow
            Action:
              - bedrock:InvokeModel
            Resource:
              # Wildcard regions needed: inference profiles route to any region
              - !Sub "arn:${AWS::Partition}:bedrock:*::foundation-model/*"
              - !Sub "arn:${AWS::Partition}:bedrock:*:${AWS::AccountId}:inference-profile/*"
      # Bedrock Agent for document deletion and scraped content reindex
      # DeleteKnowledgeBaseDocuments requires StartIngestionJob permission internally
      # IngestKnowledgeBaseDocuments for direct scraped content reindex (no LLM extraction)
      - Statement:
          - Effect: Allow
            Action:
              - bedrock:DeleteKnowledgeBaseDocuments
              - bedrock:IngestKnowledgeBaseDocuments
              - bedrock:StartIngestionJob
              - bedrock:GetIngestionJob
            Resource:
              - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*"
              - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*/data-source/*"
StartUICodeBuild:
  # Lambda behind the Custom::CodeBuildRun resources. Created only when the
  # BuildAnyUI condition holds.
  Type: AWS::Serverless::Function
  Condition: BuildAnyUI
  Properties:
    FunctionName: !Sub "${AWS::StackName}-ui-builder"
    Description: Custom resource to trigger UI CodeBuild project
    CodeUri: src/lambda/start_codebuild/
    Handler: index.lambda_handler
    Runtime: python3.13
    Layers:
      - !Ref RagstackCommonLayer
    MemorySize: 128  # Minimal - just triggers CodeBuild
    Timeout: 300  # 5 minutes (waits for CodeBuild)
    # No reserved concurrency - allows multi-stack deployments
    Environment:
      Variables:
        LOG_LEVEL: INFO
    Policies:
      # Start and poll builds; each project ARN is listed only when its
      # build condition is enabled.
      - Statement:
          - Effect: Allow
            Action:
              - codebuild:StartBuild
              - codebuild:BatchGetBuilds
            Resource:
              - !If [BuildUI, !GetAtt UICodeBuildProject.Arn, !Ref 'AWS::NoValue']
              - !If [BuildWC, !GetAtt WebComponentBuildProject.Arn, !Ref 'AWS::NoValue']
      # Manage EventBridge rules and their targets in this account/region
      - Statement:
          - Effect: Allow
            Action:
              - events:PutRule
              - events:DeleteRule
              - events:PutTargets
              - events:RemoveTargets
            Resource: !Sub "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/*"
      # Add/remove invoke permissions on this function itself
      - Statement:
          - Effect: Allow
            Action:
              - lambda:AddPermission
              - lambda:RemovePermission
            Resource: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:${AWS::StackName}-ui-builder"
      # CloudWatch Logs access under /aws/lambda/
      - Statement:
          - Effect: Allow
            Action:
              - logs:CreateLogGroup
              - logs:CreateLogStream
              - logs:PutLogEvents
            Resource: !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*"
CodeBuildRun:
  # Custom resource handled by StartUICodeBuild for the dashboard UI project
  # (only when BuildUI holds). The source override is "<bucket>/<key>".
  Type: Custom::CodeBuildRun
  Condition: BuildUI
  Properties:
    ServiceToken: !GetAtt StartUICodeBuild.Arn
    SourceLocationOverride: !Sub "${UISourceBucket}/${UISourceKey}"
    BuildProjectName: !Ref UICodeBuildProject
# Trigger web component build.
# Custom resource handled by StartUICodeBuild for the web component project
# (only when BuildWC holds). The source override is "<bucket>/<key>".
WCCodeBuildRun:
  Type: Custom::CodeBuildRun
  Condition: BuildWC
  Properties:
    ServiceToken: !GetAtt StartUICodeBuild.Arn
    SourceLocationOverride: !Sub "${UISourceBucket}/${WebComponentSourceKey}"
    BuildProjectName: !Ref WebComponentBuildProject
# Data source for resolvers: routes AppSync requests to
# AppSyncResolverFunction using AppSyncLambdaRole.
AppSyncLambdaDataSource:
  Type: AWS::AppSync::DataSource
  Properties:
    Name: LambdaDataSource
    Type: AWS_LAMBDA
    ApiId: !GetAtt GraphQLApi.ApiId
    LambdaConfig:
      LambdaFunctionArn: !GetAtt AppSyncResolverFunction.Arn
    ServiceRoleArn: !GetAtt AppSyncLambdaRole.Arn
AppSyncLambdaRole:
  # Service role shared by the Lambda-backed AppSync data sources. It may
  # invoke exactly the five functions listed below.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: "2012-10-17"
      Statement:
        - Effect: Allow
          Action: sts:AssumeRole
          Principal:
            Service: appsync.amazonaws.com
    Policies:
      - PolicyName: InvokeLambda
        PolicyDocument:
          Version: "2012-10-17"
          Statement:
            - Effect: Allow
              Action:
                - lambda:InvokeFunction
              Resource:
                - !GetAtt AppSyncResolverFunction.Arn
                - !GetAtt QueryKBFunction.Arn
                - !GetAtt SearchKBFunction.Arn
                - !GetAtt ConfigurationResolverFunction.Arn
                - !GetAtt ApiKeyResolverFunction.Arn
# Data source for KB queries (QueryKBFunction)
KBQueryDataSource:
  Type: AWS::AppSync::DataSource
  Properties:
    Name: KBQueryDataSource
    Type: AWS_LAMBDA
    ApiId: !GetAtt GraphQLApi.ApiId
    LambdaConfig:
      LambdaFunctionArn: !GetAtt QueryKBFunction.Arn
    ServiceRoleArn: !GetAtt AppSyncLambdaRole.Arn
# Data source for KB search (SearchKBFunction)
KBSearchDataSource:
  Type: AWS::AppSync::DataSource
  Properties:
    Name: KBSearchDataSource
    Type: AWS_LAMBDA
    ApiId: !GetAtt GraphQLApi.ApiId
    LambdaConfig:
      LambdaFunctionArn: !GetAtt SearchKBFunction.Arn
    ServiceRoleArn: !GetAtt AppSyncLambdaRole.Arn
# Data source for Configuration Resolver (ConfigurationResolverFunction)
ConfigurationResolverDataSource:
  Type: AWS::AppSync::DataSource
  Properties:
    Name: ConfigurationResolverDataSource
    Type: AWS_LAMBDA
    ApiId: !GetAtt GraphQLApi.ApiId
    LambdaConfig:
      LambdaFunctionArn: !GetAtt ConfigurationResolverFunction.Arn
    ServiceRoleArn: !GetAtt AppSyncLambdaRole.Arn
# Data source for API Key Resolver (ApiKeyResolverFunction)
ApiKeyResolverDataSource:
  Type: AWS::AppSync::DataSource
  Properties:
    Name: ApiKeyResolverDataSource
    Type: AWS_LAMBDA
    ApiId: !GetAtt GraphQLApi.ApiId
    LambdaConfig:
      LambdaFunctionArn: !GetAtt ApiKeyResolverFunction.Arn
    ServiceRoleArn: !GetAtt AppSyncLambdaRole.Arn
# NONE data source for subscription mutations (passthrough): no backend is
# attached, resolvers on it work purely through their mapping templates.
NoneDataSource:
  Type: AWS::AppSync::DataSource
  Properties:
    Name: NoneDataSource
    Type: NONE
    ApiId: !GetAtt GraphQLApi.ApiId
# Resolvers
# Each resolver binds one schema field to a data source. No mapping templates
# are set on these, and every resolver waits for GraphQLSchema.
GetDocumentResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: getDocument
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
ListDocumentsResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: listDocuments
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
GetMetadataStatsResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: getMetadataStats
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
GetFilterExamplesResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: getFilterExamples
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
GetKeyLibraryResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: getKeyLibrary
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
CheckKeySimilarityResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: checkKeySimilarity
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
CreateUploadUrlResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: createUploadUrl
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
ProcessDocumentResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: processDocument
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
# Knowledge base query and search use their own data sources.
QueryKBResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: queryKnowledgeBase
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt KBQueryDataSource.Name
SearchKBResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: searchKnowledgeBase
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt KBSearchDataSource.Name
# Configuration Resolvers (both served by ConfigurationResolverDataSource)
GetConfigurationResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: getConfiguration
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt ConfigurationResolverDataSource.Name
UpdateConfigurationResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: updateConfiguration
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt ConfigurationResolverDataSource.Name
# API Key Resolvers (both served by ApiKeyResolverDataSource)
GetApiKeyResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: getApiKey
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt ApiKeyResolverDataSource.Name
RegenerateApiKeyResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: regenerateApiKey
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt ApiKeyResolverDataSource.Name
# =========================================================================
# Scrape Resolvers
# =========================================================================
GetScrapeJobResolver:
  # Scrape fields are all bound to AppSyncLambdaDataSource.
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: getScrapeJob
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
ListScrapeJobsResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: listScrapeJobs
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
CheckScrapeUrlResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: checkScrapeUrl
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
StartScrapeResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: startScrape
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
CancelScrapeResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: cancelScrape
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
# Image resolvers (all bound to AppSyncLambdaDataSource)
CreateImageUploadUrlResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: createImageUploadUrl
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
GenerateCaptionResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: generateCaption
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
SubmitImageResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: submitImage
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
GetImageResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: getImage
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
ListImagesResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Query
    FieldName: listImages
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
DeleteImageResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: deleteImage
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
DeleteDocumentsResolver:
  # Document management mutations; all bound to AppSyncLambdaDataSource.
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: deleteDocuments
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
ReprocessDocumentResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: reprocessDocument
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
ReindexDocumentResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: reindexDocument
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
CreateZipUploadUrlResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: createZipUploadUrl
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
AnalyzeMetadataResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: analyzeMetadata
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
StartReindexResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: startReindex
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt AppSyncLambdaDataSource.Name
# Subscription publish resolvers (NONE data source - just pass through for subscriptions)
# The request template forwards the mutation arguments as the payload and the
# response template returns the result unchanged.
PublishReindexUpdateResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: publishReindexUpdate
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt NoneDataSource.Name
    RequestMappingTemplate: |
      {
      "version": "2018-05-29",
      "payload": $util.toJson($context.arguments)
      }
    ResponseMappingTemplate: |
      $util.toJson($context.result)
PublishDocumentUpdateResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: publishDocumentUpdate
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt NoneDataSource.Name
    RequestMappingTemplate: |
      {
      "version": "2018-05-29",
      "payload": $util.toJson($context.arguments)
      }
    ResponseMappingTemplate: |
      $util.toJson($context.result)
PublishScrapeUpdateResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: publishScrapeUpdate
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt NoneDataSource.Name
    RequestMappingTemplate: |
      {
      "version": "2018-05-29",
      "payload": $util.toJson($context.arguments)
      }
    ResponseMappingTemplate: |
      $util.toJson($context.result)
PublishImageUpdateResolver:
  Type: AWS::AppSync::Resolver
  DependsOn: GraphQLSchema
  Properties:
    TypeName: Mutation
    FieldName: publishImageUpdate
    ApiId: !GetAtt GraphQLApi.ApiId
    DataSourceName: !GetAtt NoneDataSource.Name
    RequestMappingTemplate: |
      {
      "version": "2018-05-29",
      "payload": $util.toJson($context.arguments)
      }
    ResponseMappingTemplate: |
      $util.toJson($context.result)
# =========================================================================
# Dead Letter Queue for Error Handling
# =========================================================================
ProcessingDLQ:
  # Dead-letter target named in DocumentProcessingQueue's redrive policy.
  Type: AWS::SQS::Queue
  Properties:
    QueueName: !Sub "${AWS::StackName}-Processing-DLQ"
    SqsManagedSseEnabled: true
    VisibilityTimeout: 300
    MessageRetentionPeriod: 1209600  # 14 days
# =========================================================================
# Document Processing Queue (SQS → Step Functions)
# =========================================================================
DocumentProcessingQueue:
  # Work queue for document processing. A message received three times
  # without being deleted is moved to ProcessingDLQ.
  Type: AWS::SQS::Queue
  Properties:
    QueueName: !Sub "${AWS::StackName}-doc-processing"
    SqsManagedSseEnabled: true
    MessageRetentionPeriod: 86400  # 1 day
    VisibilityTimeout: 1800  # 30 min - longer than Step Functions execution
    RedrivePolicy:
      maxReceiveCount: 3
      deadLetterTargetArn: !GetAtt ProcessingDLQ.Arn
    Tags:
      - Key: Project
        Value: !Ref AWS::StackName
# Resource policy on DocumentProcessingQueue: lets the EventBridge service
# principal call sqs:SendMessage, but only when the request originates from
# the S3UploadRule rule (aws:SourceArn condition).
# NOTE(review): the PolicyDocument has no Version element - consider adding
# '2012-10-17' explicitly.
DocumentProcessingQueuePolicy:
Type: AWS::SQS::QueuePolicy
Properties:
Queues:
- !Ref DocumentProcessingQueue
PolicyDocument:
Statement:
- Effect: Allow
Principal:
Service: events.amazonaws.com
Action: sqs:SendMessage
Resource: !GetAtt DocumentProcessingQueue.Arn
Condition:
ArnEquals:
aws:SourceArn: !GetAtt S3UploadRule.Arn
# =========================================================================
# Batch Processing Queue (individual 10-page batches with global concurrency)
# =========================================================================
# Dead-letter queue targeted by BatchProcessingQueue's RedrivePolicy.
# NOTE(review): no CloudWatch alarm in this section watches this queue;
# DLQMessagesAlarm only covers ProcessingDLQ.
BatchProcessingDLQ:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-batch-processing-dlq'
MessageRetentionPeriod: 1209600 # 14 days
SqsManagedSseEnabled: true
Tags:
- Key: Project
Value: !Ref AWS::StackName
# Standard queue for per-batch processing work. Messages whose receive count
# exceeds maxReceiveCount (3) are moved to BatchProcessingDLQ.
BatchProcessingQueue:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-batch-processing'
VisibilityTimeout: 960 # 16 min (Lambda 900s timeout + buffer)
MessageRetentionPeriod: 86400 # 1 day
SqsManagedSseEnabled: true
RedrivePolicy:
deadLetterTargetArn: !GetAtt BatchProcessingDLQ.Arn
maxReceiveCount: 3
Tags:
- Key: Project
Value: !Ref AWS::StackName
# =========================================================================
# Scrape Workflow SQS Queues
# =========================================================================
# Dead-letter queue targeted by ScrapeDiscoveryQueue's RedrivePolicy.
ScrapeDiscoveryDLQ:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-scrape-discovery-dlq'
MessageRetentionPeriod: 1209600 # 14 days
SqsManagedSseEnabled: true
Tags:
- Key: Project
Value: !Ref AWS::StackName
# Standard queue for scrape URL discovery work. Messages whose receive count
# exceeds maxReceiveCount (3) are moved to ScrapeDiscoveryDLQ.
ScrapeDiscoveryQueue:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-scrape-discovery'
VisibilityTimeout: 300 # 5 minutes - matches Lambda timeout
MessageRetentionPeriod: 86400 # 1 day
SqsManagedSseEnabled: true
RedrivePolicy:
deadLetterTargetArn: !GetAtt ScrapeDiscoveryDLQ.Arn
maxReceiveCount: 3
Tags:
- Key: Project
Value: !Ref AWS::StackName
# Dead-letter queue targeted by ScrapeProcessingQueue's RedrivePolicy.
ScrapeProcessingDLQ:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-scrape-processing-dlq'
MessageRetentionPeriod: 1209600 # 14 days
SqsManagedSseEnabled: true
Tags:
- Key: Project
Value: !Ref AWS::StackName
# Standard queue for scrape page processing work. Messages whose receive count
# exceeds maxReceiveCount (3) are moved to ScrapeProcessingDLQ.
ScrapeProcessingQueue:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-scrape-processing'
VisibilityTimeout: 900 # 15 minutes - matches Lambda timeout for Playwright
MessageRetentionPeriod: 86400 # 1 day
SqsManagedSseEnabled: true
RedrivePolicy:
deadLetterTargetArn: !GetAtt ScrapeProcessingDLQ.Arn
maxReceiveCount: 3
Tags:
- Key: Project
Value: !Ref AWS::StackName
# =========================================================================
# Sync Coordinator Queue (FIFO) - Serializes KB ingestion sync requests
# =========================================================================
# FIFO dead-letter queue targeted by SyncRequestQueue's RedrivePolicy.
# It is FIFO (and its name ends in .fifo) because the source queue is FIFO.
SyncRequestDLQ:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-sync-request-dlq.fifo'
FifoQueue: true
MessageRetentionPeriod: 1209600 # 14 days
SqsManagedSseEnabled: true
Tags:
- Key: Project
Value: !Ref AWS::StackName
# FIFO queue consumed by SyncCoordinatorFunction (its SQSEvent source).
# Messages whose receive count exceeds maxReceiveCount (3) are moved to
# SyncRequestDLQ.
# NOTE(review): VisibilityTimeout (600) equals SyncCoordinatorFunction's
# Timeout (600), leaving no buffer; other queues in this template add one
# (e.g. BatchProcessingQueue uses 960 for a 900 s Lambda). Confirm that a
# message cannot become visible again while the coordinator is still running.
SyncRequestQueue:
Type: AWS::SQS::Queue
Properties:
QueueName: !Sub '${AWS::StackName}-sync-request.fifo'
FifoQueue: true
ContentBasedDeduplication: true # Dedup identical messages within 5-min window
VisibilityTimeout: 600 # 10 min - enough for sync coordinator to complete
MessageRetentionPeriod: 86400 # 1 day
SqsManagedSseEnabled: true
RedrivePolicy:
deadLetterTargetArn: !GetAtt SyncRequestDLQ.Arn
maxReceiveCount: 3
Tags:
- Key: Project
Value: !Ref AWS::StackName
# Sync Coordinator Lambda - processes sync requests from FIFO queue
SyncCoordinatorFunction:
# Python 3.13 Lambda triggered by SyncRequestQueue, one message per invocation
# (BatchSize 1), with reserved concurrency of 1 so that only one invocation
# runs at a time.
Type: AWS::Serverless::Function
Properties:
FunctionName: !Sub '${AWS::StackName}-sync-coordinator'
CodeUri: src/lambda/sync_coordinator/
Handler: index.lambda_handler
Description: Coordinates KB sync requests - waits for running syncs, then starts new one
Runtime: python3.13
Timeout: 600 # 10 min - allows waiting for long syncs
MemorySize: 256
ReservedConcurrentExecutions: 1 # CRITICAL: prevents race conditions
Layers:
- !Ref RagstackCommonLayer
Environment:
Variables:
LOG_LEVEL: INFO
KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
TRACKING_TABLE: !Ref TrackingTable
CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
Tags:
Project: !Ref AWS::StackName
CostCenter: Engineering
Events:
SQSEvent:
Type: SQS
Properties:
Queue: !GetAtt SyncRequestQueue.Arn
BatchSize: 1 # Process one message at a time
# Permissions: read/write on the tracking table, read-only on the configuration
# table, and the three Bedrock ingestion-job actions.
Policies:
- DynamoDBCrudPolicy:
TableName: !Ref TrackingTable
- DynamoDBReadPolicy:
TableName: !Ref ConfigurationTable
- Statement:
# Scoped to every knowledge base in this account/region rather than the one
# referenced in KNOWLEDGE_BASE_ID - presumably because the active knowledge
# base can change at runtime; confirm before narrowing.
- Effect: Allow
Action:
- bedrock:StartIngestionJob
- bedrock:ListIngestionJobs
- bedrock:GetIngestionJob
Resource:
- !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
# NOTE(review): this statement grants the same three actions as the one above;
# its Resource pattern also appears to be matched by the 'knowledge-base/*'
# wildcard, which would make it redundant - verify and consider removing.
- Effect: Allow
Action:
- bedrock:StartIngestionJob
- bedrock:GetIngestionJob
- bedrock:ListIngestionJobs
Resource:
- !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*/data-source/*'
# Sync Status Checker Lambda - verifies SYNC_QUEUED documents are indexed
SyncStatusCheckerFunction:
# Python 3.13 Lambda invoked every minute by an always-enabled schedule.
# It receives the knowledge base / data source IDs, both table names and the
# GraphQL endpoint through environment variables.
Type: AWS::Serverless::Function
Properties:
FunctionName: !Sub '${AWS::StackName}-sync-status-checker'
CodeUri: src/lambda/sync_status_checker/
Handler: index.lambda_handler
Description: Checks KB status for SYNC_QUEUED documents and updates tracking table
Runtime: python3.13
Timeout: 120 # 2 minutes - enough to check batch of documents
MemorySize: 256
Layers:
- !Ref RagstackCommonLayer
Environment:
Variables:
LOG_LEVEL: INFO
KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
TRACKING_TABLE: !Ref TrackingTable
GRAPHQL_ENDPOINT: !GetAtt GraphQLApi.GraphQLUrl
CONFIGURATION_TABLE_NAME: !Ref ConfigurationTable
Tags:
Project: !Ref AWS::StackName
CostCenter: Engineering
Events:
ScheduleEvent:
Type: Schedule
Properties:
# NOTE(review): Timeout (120 s) is longer than the 1-minute schedule and no
# reserved concurrency is set, so invocations could overlap - confirm the
# handler tolerates that.
Schedule: rate(1 minute)
Description: Check status of documents waiting for KB sync
Enabled: true
# Permissions: read/write on the tracking table, read-only on the configuration
# table, Bedrock document-status lookups on any knowledge base in this
# account/region, and GraphQL calls against this stack's AppSync API.
Policies:
- DynamoDBCrudPolicy:
TableName: !Ref TrackingTable
- DynamoDBReadPolicy:
TableName: !Ref ConfigurationTable
- Statement:
- Effect: Allow
Action:
- bedrock:GetKnowledgeBaseDocuments
Resource:
- !Sub 'arn:${AWS::Partition}:bedrock:${AWS::Region}:${AWS::AccountId}:knowledge-base/*'
- Effect: Allow
Action: appsync:GraphQL
Resource: !Sub 'arn:${AWS::Partition}:appsync:${AWS::Region}:${AWS::AccountId}:apis/${GraphQLApi.ApiId}/*'
# =========================================================================
# SNS Topic for Alarm Notifications
# =========================================================================
# Customer managed KMS key for AlarmTopic.
# The AWS managed key (alias/aws/sns) cannot be used here: its key policy is
# not editable and does not allow the CloudWatch service principal to use the
# key, so alarm notifications to a topic encrypted with it are never delivered.
AlarmTopicKey:
  Type: AWS::KMS::Key
  Properties:
    Description: !Sub 'Encrypts the ${AWS::StackName} alarm notification topic'
    EnableKeyRotation: true
    KeyPolicy:
      Version: '2012-10-17'
      Statement:
        # Keeps the key manageable through IAM policies in this account.
        - Sid: AllowAccountAdministration
          Effect: Allow
          Principal:
            AWS: !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:root'
          Action: 'kms:*'
          Resource: '*'
        # Lets CloudWatch alarms publish to the encrypted topic.
        - Sid: AllowCloudWatchAlarmsToPublish
          Effect: Allow
          Principal:
            Service: cloudwatch.amazonaws.com
          Action:
            - kms:Decrypt
            - kms:GenerateDataKey*
          Resource: '*'
# SNS topic that receives ALARM/OK notifications from the CloudWatch alarms in
# this template and e-mails them to AdminEmail (the address must confirm the
# subscription before delivery starts).
# NOTE(review): any other publisher to this topic needs kms:GenerateDataKey*
# and kms:Decrypt on AlarmTopicKey in its IAM policy.
AlarmTopic:
  Type: AWS::SNS::Topic
  Properties:
    TopicName: !Sub '${AWS::StackName}-Alarms'
    DisplayName: !Sub '${AWS::StackName} CloudWatch Alarms'
    KmsMasterKeyId: !Ref AlarmTopicKey
    Subscription:
      - Endpoint: !Ref AdminEmail
        Protocol: email
# =========================================================================
# CloudWatch Dashboard
# =========================================================================
# CloudWatch dashboard with six widgets: Lambda invocations, errors and
# duration for ProcessDocument and QueryKB, Step Functions execution outcomes,
# dead-letter queue depth, and tracking-table capacity consumption.
#
# Every metric row names its dimension explicitly. Without dimensions the
# Lambda rows show account-wide totals (so the per-function labels are wrong),
# and the States/SQS/DynamoDB rows match no published metric and stay empty.
# In a metrics array "." repeats the value at the same position in the previous
# row, and "..." repeats all preceding values except the ones that follow it.
MonitoringDashboard:
  Type: AWS::CloudWatch::Dashboard
  Properties:
    DashboardName: !Sub '${AWS::StackName}-Monitor'
    DashboardBody: !Sub |
      {
        "widgets": [
          {
            "type": "metric",
            "x": 0,
            "y": 0,
            "width": 12,
            "height": 6,
            "properties": {
              "metrics": [
                ["AWS/Lambda", "Invocations", "FunctionName", "${ProcessDocumentFunction}", {"stat": "Sum", "label": "ProcessDocument", "color": "#1f77b4"}],
                ["...", "${QueryKBFunction}", {"stat": "Sum", "label": "QueryKB", "color": "#2ca02c"}]
              ],
              "period": 300,
              "stat": "Sum",
              "region": "${AWS::Region}",
              "title": "Lambda Invocations",
              "yAxis": {
                "left": {
                  "min": 0
                }
              }
            }
          },
          {
            "type": "metric",
            "x": 12,
            "y": 0,
            "width": 12,
            "height": 6,
            "properties": {
              "metrics": [
                ["AWS/Lambda", "Errors", "FunctionName", "${ProcessDocumentFunction}", {"stat": "Sum", "label": "ProcessDocument Errors", "color": "#d62728"}],
                ["...", "${QueryKBFunction}", {"stat": "Sum", "label": "QueryKB Errors", "color": "#e377c2"}]
              ],
              "period": 300,
              "stat": "Sum",
              "region": "${AWS::Region}",
              "title": "Lambda Errors",
              "yAxis": {
                "left": {
                  "min": 0
                }
              }
            }
          },
          {
            "type": "metric",
            "x": 0,
            "y": 6,
            "width": 12,
            "height": 6,
            "properties": {
              "metrics": [
                ["AWS/States", "ExecutionsFailed", "StateMachineArn", "${ProcessingStateMachine}", {"stat": "Sum", "color": "#d62728"}],
                [".", "ExecutionsSucceeded", ".", ".", {"stat": "Sum", "color": "#2ca02c"}],
                [".", "ExecutionsTimedOut", ".", ".", {"stat": "Sum", "color": "#ff7f0e"}]
              ],
              "period": 300,
              "stat": "Sum",
              "region": "${AWS::Region}",
              "title": "Step Functions Executions"
            }
          },
          {
            "type": "metric",
            "x": 12,
            "y": 6,
            "width": 12,
            "height": 6,
            "properties": {
              "metrics": [
                ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", "${ProcessingDLQ.QueueName}", {"label": "DLQ Messages", "color": "#d62728"}]
              ],
              "period": 300,
              "stat": "Average",
              "region": "${AWS::Region}",
              "title": "Dead Letter Queue",
              "yAxis": {
                "left": {
                  "min": 0
                }
              }
            }
          },
          {
            "type": "metric",
            "x": 0,
            "y": 12,
            "width": 12,
            "height": 6,
            "properties": {
              "metrics": [
                ["AWS/DynamoDB", "ConsumedReadCapacityUnits", "TableName", "${TrackingTable}", {"stat": "Sum", "color": "#1f77b4"}],
                [".", "ConsumedWriteCapacityUnits", ".", ".", {"stat": "Sum", "color": "#ff7f0e"}]
              ],
              "period": 300,
              "stat": "Sum",
              "region": "${AWS::Region}",
              "title": "DynamoDB Capacity (Tracking Table)"
            }
          },
          {
            "type": "metric",
            "x": 12,
            "y": 12,
            "width": 12,
            "height": 6,
            "properties": {
              "metrics": [
                ["AWS/Lambda", "Duration", "FunctionName", "${ProcessDocumentFunction}", {"stat": "Average", "label": "ProcessDocument", "color": "#1f77b4"}],
                ["...", "${QueryKBFunction}", {"stat": "Average", "label": "QueryKB", "color": "#2ca02c"}]
              ],
              "period": 300,
              "stat": "Average",
              "region": "${AWS::Region}",
              "title": "Lambda Duration (ms)",
              "yAxis": {
                "left": {
                  "min": 0
                }
              }
            }
          }
        ]
      }
# =========================================================================
# CloudWatch Alarms
# =========================================================================
# Goes to ALARM when ProcessDocumentFunction reports more than 5 errors (Sum)
# within a single 300-second period. Periods without data count as healthy.
# Both the ALARM and the OK transition are published to AlarmTopic.
ProcessDocumentErrorAlarm:
Type: AWS::CloudWatch::Alarm
Properties:
AlarmName: !Sub '${AWS::StackName}-ProcessDocument-Errors'
AlarmDescription: Alert when ProcessDocument Lambda has errors
MetricName: Errors
Namespace: AWS/Lambda
Statistic: Sum
Period: 300
EvaluationPeriods: 1
Threshold: 5
ComparisonOperator: GreaterThanThreshold
Dimensions:
- Name: FunctionName
Value: !Ref ProcessDocumentFunction
TreatMissingData: notBreaching
AlarmActions:
- !Ref AlarmTopic
OKActions:
- !Ref AlarmTopic
# Goes to ALARM as soon as ProcessingDLQ holds any visible message, i.e. when
# the 300-second average of ApproximateNumberOfMessagesVisible is above zero.
# Periods without data count as healthy. Both the ALARM and the OK transition
# are published to AlarmTopic.
DLQMessagesAlarm:
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: !Sub '${AWS::StackName}-DLQ-Messages'
    AlarmDescription: Alert when messages appear in DLQ
    MetricName: ApproximateNumberOfMessagesVisible
    Namespace: AWS/SQS
    Statistic: Average
    Period: 300
    EvaluationPeriods: 1
    # Strictly greater than 0: with the previous threshold of 1 a single
    # dead-lettered message (value exactly 1) never triggered the alarm.
    Threshold: 0
    ComparisonOperator: GreaterThanThreshold
    Dimensions:
      - Name: QueueName
        Value: !GetAtt ProcessingDLQ.QueueName
    TreatMissingData: notBreaching
    AlarmActions:
      - !Ref AlarmTopic
    OKActions:
      - !Ref AlarmTopic
# Goes to ALARM when ProcessingStateMachine records more than 3 failed
# executions (Sum) within a single 300-second period. Periods without data
# count as healthy. Both the ALARM and the OK transition are published to
# AlarmTopic.
StepFunctionsFailureAlarm:
Type: AWS::CloudWatch::Alarm
Properties:
AlarmName: !Sub '${AWS::StackName}-StepFunctions-Failures'
AlarmDescription: Alert when Step Functions executions fail
MetricName: ExecutionsFailed
Namespace: AWS/States
Statistic: Sum
Period: 300
EvaluationPeriods: 1
Threshold: 3
ComparisonOperator: GreaterThanThreshold
Dimensions:
- Name: StateMachineArn
Value: !Ref ProcessingStateMachine
TreatMissingData: notBreaching
AlarmActions:
- !Ref AlarmTopic
OKActions:
- !Ref AlarmTopic
# Goes to ALARM when ProcessDocumentFunction is throttled more than 10 times
# (Sum) within a single 300-second period. Periods without data count as
# healthy. Both the ALARM and the OK transition are published to AlarmTopic.
ProcessDocumentThrottleAlarm:
Type: AWS::CloudWatch::Alarm
Properties:
AlarmName: !Sub '${AWS::StackName}-ProcessDocument-Throttles'
AlarmDescription: Alert when ProcessDocument Lambda is throttled
MetricName: Throttles
Namespace: AWS/Lambda
Statistic: Sum
Period: 300
EvaluationPeriods: 1
Threshold: 10
ComparisonOperator: GreaterThanThreshold
Dimensions:
- Name: FunctionName
Value: !Ref ProcessDocumentFunction
TreatMissingData: notBreaching
AlarmActions:
- !Ref AlarmTopic
OKActions:
- !Ref AlarmTopic
# =========================================================================
# Budget Sync Function - Creates and syncs budget via Lambda
# (Avoids CloudFormation "same name different internalId" conflicts)
# =========================================================================
# Python 3.13 Lambda with two invokers visible in this template: the
# ConfigurationTable stream (filtered to the item whose Configuration key is
# "Custom") and the BudgetInitTrigger custom resource.
BudgetSyncFunction:
Type: AWS::Serverless::Function
Properties:
FunctionName: !Sub '${AWS::StackName}-budget-sync'
CodeUri: src/lambda/budget_sync/
Handler: index.lambda_handler
Description: Sync budget configuration changes to AWS Budgets
Runtime: python3.13
Timeout: 30
MemorySize: 128
Environment:
Variables:
LOG_LEVEL: INFO
BUDGET_NAME: !Sub '${AWS::StackName}-Monthly-Budget'
ADMIN_EMAIL: !Ref AdminEmail
PROJECT_NAME: !Ref AWS::StackName
Events:
ConfigStream:
Type: DynamoDB
Properties:
Stream: !GetAtt ConfigurationTable.StreamArn
# TRIM_HORIZON: start from the oldest record still available in the stream.
StartingPosition: TRIM_HORIZON
BatchSize: 1
# NOTE(review): no MaximumRetryAttempts / failure destination is set, so a
# record the handler keeps failing on would be retried by the stream's default
# policy - confirm this is acceptable.
FilterCriteria:
Filters:
- Pattern: '{"dynamodb":{"Keys":{"Configuration":{"S":["Custom"]}}}}'
Tags:
Project: !Ref AWS::StackName
CostCenter: Engineering
Policies:
- Statement:
# NOTE(review): verify these names against the IAM action list for AWS
# Budgets; budgets:ViewBudget and budgets:ModifyBudget are the documented
# read/write actions, and the others may not be recognised. Resource is '*';
# consider scoping it to the budget named in BUDGET_NAME.
- Effect: Allow
Action:
- budgets:CreateBudget
- budgets:ModifyBudget
- budgets:DescribeBudget
- budgets:ViewBudget
- budgets:UpdateBudget
Resource: '*'
# Custom resource to trigger budget creation on initial deployment
BudgetInitTrigger:
# Custom resource served by BudgetSyncFunction. Its Records property has the
# shape of a DynamoDB stream INSERT for the "Custom" configuration item, with
# a budget threshold of 100 and alerts enabled - presumably so the function
# can reuse its stream-record handling; confirm in src/lambda/budget_sync/.
Type: Custom::BudgetInit
Properties:
ServiceToken: !GetAtt BudgetSyncFunction.Arn
ServiceTimeout: 120 # 2 minutes instead of default 1 hour
# Include a version to trigger on updates if needed
Version: '1'
# Pass config to trigger budget creation
Records:
- eventName: INSERT
dynamodb:
Keys:
Configuration:
S: Custom
NewImage:
Configuration:
S: Custom
budget_alert_threshold:
N: '100'
budget_alert_enabled:
BOOL: true
# =========================================================================
# Configuration Seeder - Seeds Schema and Default config on deployment
# =========================================================================
# Lambda behind the ConfigurationSeeder custom resource. On Create/Update it
# writes two items to ConfigurationTable: the UI schema (key "Schema") and the
# default values derived from that schema (key "Default"). Other request types
# (Delete) write nothing and simply report SUCCESS.
ConfigurationSeederFunction:
  Type: AWS::Serverless::Function
  Properties:
    FunctionName: !Sub '${AWS::StackName}-config-seeder'
    Runtime: python3.13
    Handler: index.lambda_handler
    Timeout: 60
    MemorySize: 128
    Environment:
      Variables:
        TABLE_NAME: !Ref ConfigurationTable
        WC_CDN_URL: !Sub 'https://${WebComponentDistribution.DomainName}/ragstack-chat.js'
        KNOWLEDGE_BASE_ID: !GetAtt KnowledgeBase.KnowledgeBaseId
        DATA_SOURCE_ID: !GetAtt KnowledgeBase.DataSourceId
    Policies:
      - DynamoDBCrudPolicy:
          TableName: !Ref ConfigurationTable
    InlineCode: |
      import json
      import os
      import boto3
      from urllib.request import Request, urlopen

      dynamodb = boto3.resource('dynamodb')
      table = dynamodb.Table(os.environ['TABLE_NAME'])

      # Seconds to wait for the CloudFormation response endpoint; without a
      # timeout a stalled connection would hold the invocation until the
      # Lambda timeout.
      RESPONSE_TIMEOUT_SECONDS = 10

      # Settings schema stored under the "Schema" item. 'order' is a display
      # position, 'dependsOn' names the field/value that makes a setting
      # relevant, and 'readOnly' marks values that are set by the deployment.
      # NOTE(review): 'order' 3 is used by both chat_system_prompt and
      # chat_primary_model, and 'order' 14 by both image_caption_prompt and
      # budget_alert_threshold. Left unchanged here because renumbering alters
      # the stored schema; fix together with a SchemaVersion bump.
      SCHEMA = {
          'type': 'object',
          'required': ['ocr_backend'],
          'properties': {
              'ocr_backend': {
                  'type': 'string',
                  'order': 1,
                  'description': 'OCR backend for document processing',
                  'enum': ['textract', 'bedrock'],
                  'default': 'textract'
              },
              'bedrock_ocr_model_id': {
                  'type': 'string',
                  'order': 2,
                  'description': 'Bedrock model for OCR (only if backend is bedrock)',
                  'enum': [
                      'us.anthropic.claude-haiku-4-5-20251001-v1:0',
                      'us.anthropic.claude-sonnet-4-20250514-v1:0',
                      'meta.llama3-2-90b-instruct-v1:0',
                      'meta.llama3-2-11b-instruct-v1:0'
                  ],
                  'default': 'us.anthropic.claude-haiku-4-5-20251001-v1:0',
                  'dependsOn': {'field': 'ocr_backend', 'value': 'bedrock'}
              },
              'chat_system_prompt': {
                  'type': 'string',
                  'order': 3,
                  'description': 'System prompt for chat responses',
                  'default': 'You are a helpful assistant that answers questions based on information from a knowledge base. Always base your answers on the provided knowledge base information. If the provided information doesn\'t contain the answer, clearly state that and provide what relevant information you can. Be concise but thorough.'
              },
              'chat_primary_model': {
                  'type': 'string',
                  'order': 3,
                  'description': 'Primary chat model',
                  'enum': [
                      'us.anthropic.claude-sonnet-4-20250514-v1:0',
                      'us.anthropic.claude-haiku-4-5-20251001-v1:0',
                      'us.amazon.nova-pro-v1:0',
                      'us.amazon.nova-lite-v1:0'
                  ],
                  'default': 'us.anthropic.claude-haiku-4-5-20251001-v1:0'
              },
              'chat_fallback_model': {
                  'type': 'string',
                  'order': 4,
                  'description': 'Fallback model when quotas exceeded',
                  'enum': [
                      'us.anthropic.claude-haiku-4-5-20251001-v1:0',
                      'us.amazon.nova-micro-v1:0',
                      'us.amazon.nova-lite-v1:0'
                  ],
                  'default': 'us.amazon.nova-lite-v1:0'
              },
              'chat_global_quota_daily': {
                  'type': 'number',
                  'order': 5,
                  'description': 'Max messages per day (all users)',
                  'default': 10000
              },
              'chat_per_user_quota_daily': {
                  'type': 'number',
                  'order': 6,
                  'description': 'Max messages per user per day',
                  'default': 100
              },
              'chat_cdn_url': {
                  'type': 'string',
                  'order': 7,
                  'description': 'Web component CDN URL',
                  'readOnly': True
              },
              'chat_allow_document_access': {
                  'type': 'boolean',
                  'order': 8,
                  'description': 'Allow document downloads via presigned URLs',
                  'default': False
              },
              'public_access_chat': {
                  'type': 'boolean',
                  'order': 9,
                  'description': 'Allow unauthenticated chat',
                  'default': True
              },
              'public_access_search': {
                  'type': 'boolean',
                  'order': 10,
                  'description': 'Allow unauthenticated search',
                  'default': True
              },
              'public_access_upload': {
                  'type': 'boolean',
                  'order': 11,
                  'description': 'Allow unauthenticated uploads',
                  'default': False
              },
              'public_access_image_upload': {
                  'type': 'boolean',
                  'order': 12,
                  'description': 'Allow unauthenticated image uploads',
                  'default': False
              },
              'image_caption_prompt': {
                  'type': 'string',
                  'order': 14,
                  'description': 'System prompt for image caption generation',
                  'default': 'You are an image captioning assistant. Generate concise, descriptive captions that are suitable for use as search keywords. Focus on the main subject, setting, and any notable visual elements. Keep captions under 200 characters.'
              },
              'public_access_scrape': {
                  'type': 'boolean',
                  'order': 13,
                  'description': 'Allow unauthenticated web scraping',
                  'default': False
              },
              'budget_alert_threshold': {
                  'type': 'number',
                  'order': 14,
                  'description': 'Monthly budget alert threshold (USD)',
                  'default': 100
              },
              'budget_alert_enabled': {
                  'type': 'boolean',
                  'order': 15,
                  'description': 'Enable budget alerts',
                  'default': True
              },
              'demo_mode_enabled': {
                  'type': 'boolean',
                  'order': 50,
                  'description': 'Demo mode active (read-only, set via deployment)',
                  'readOnly': True
              },
              'demo_upload_quota_daily': {
                  'type': 'number',
                  'order': 51,
                  'description': 'Max uploads per user per day in demo mode',
                  'default': 5,
                  'dependsOn': { 'field': 'demo_mode_enabled', 'value': True }
              },
              'demo_chat_quota_daily': {
                  'type': 'number',
                  'order': 52,
                  'description': 'Max chat messages per user per day in demo mode',
                  'default': 30,
                  'dependsOn': { 'field': 'demo_mode_enabled', 'value': True }
              },
              'filter_generation_enabled': {
                  'type': 'boolean',
                  'order': 16,
                  'description': 'Enable LLM-based metadata filter generation for queries',
                  'default': True
              },
              'filter_generation_model': {
                  'type': 'string',
                  'order': 17,
                  'description': 'Model for filter generation',
                  'enum': [
                      'us.anthropic.claude-haiku-4-5-20251001-v1:0',
                      'us.anthropic.claude-3-5-haiku-20241022-v1:0',
                      'us.amazon.nova-lite-v1:0'
                  ],
                  'default': 'us.anthropic.claude-haiku-4-5-20251001-v1:0'
              },
              'multislice_enabled': {
                  'type': 'boolean',
                  'order': 18,
                  'description': 'Enable multi-slice retrieval (parallel filtered/unfiltered queries)',
                  'default': True
              },
              'multislice_count': {
                  'type': 'number',
                  'order': 19,
                  'description': 'Number of parallel retrieval slices (2-4)',
                  'default': 2
              },
              'multislice_timeout_ms': {
                  'type': 'number',
                  'order': 20,
                  'description': 'Timeout per slice in milliseconds',
                  'default': 5000
              },
              'metadata_filter_examples': {
                  'type': 'array',
                  'order': 21,
                  'description': 'Filter examples for few-shot learning (JSON array)',
                  'default': []
              },
              'metadata_filter_examples_disabled': {
                  'type': 'array',
                  'order': 211,
                  'description': 'Names of disabled filter examples',
                  'default': []
              },
              'metadata_filter_examples_updated_at': {
                  'type': 'string',
                  'order': 212,
                  'description': 'Timestamp when filter examples were last generated',
                  'default': ''
              },
              'metadata_extraction_enabled': {
                  'type': 'boolean',
                  'order': 22,
                  'description': 'Enable LLM metadata extraction during ingestion',
                  'default': True
              },
              'metadata_extraction_model': {
                  'type': 'string',
                  'order': 23,
                  'description': 'Model for metadata extraction',
                  'enum': [
                      'us.anthropic.claude-haiku-4-5-20251001-v1:0',
                      'us.anthropic.claude-3-5-haiku-20241022-v1:0',
                      'us.amazon.nova-micro-v1:0',
                      'us.amazon.nova-lite-v1:0'
                  ],
                  'default': 'us.anthropic.claude-haiku-4-5-20251001-v1:0'
              },
              'metadata_max_keys': {
                  'type': 'number',
                  'order': 24,
                  'description': 'Maximum metadata fields to extract per document',
                  'default': 8
              },
              'metadata_extraction_mode': {
                  'type': 'string',
                  'order': 25,
                  'description': 'Extraction mode: auto (LLM decides) or manual (admin specifies)',
                  'enum': ['auto', 'manual'],
                  'default': 'auto'
              },
              'metadata_manual_keys': {
                  'type': 'array',
                  'order': 26,
                  'description': 'Keys to extract in manual mode',
                  'default': [],
                  'dependsOn': {'field': 'metadata_extraction_mode', 'value': 'manual'}
              },
              'knowledge_base_id': {
                  'type': 'string',
                  'order': 100,
                  'description': 'Active Bedrock Knowledge Base ID (updated by reindex)',
                  'readOnly': True
              },
              'data_source_id': {
                  'type': 'string',
                  'order': 101,
                  'description': 'Active Data Source ID (updated by reindex)',
                  'readOnly': True
              }
          }
      }


      def lambda_handler(event, context):
          """Handle a CloudFormation custom-resource request.

          Create/Update seed the "Schema" and "Default" items; any other
          request type writes nothing. Exactly one response is sent to
          CloudFormation: SUCCESS, or FAILED with the error text as reason.
          """
          # ResponseURL is a presigned URL that accepts the resource's
          # response, so it is kept out of the logs.
          loggable = {k: v for k, v in event.items() if k != 'ResponseURL'}
          print(f"Event: {json.dumps(loggable)}")
          request_type = event.get('RequestType', '')
          status, reason = 'SUCCESS', ''
          try:
              if request_type in ['Create', 'Update']:
                  # Build defaults from schema
                  defaults = {'Configuration': 'Default'}
                  for key, prop in SCHEMA['properties'].items():
                      if 'default' in prop:
                          defaults[key] = prop['default']
                  # Deployment-provided values have no schema default; they
                  # come from the function's environment.
                  defaults['chat_cdn_url'] = os.environ.get('WC_CDN_URL', '')
                  defaults['knowledge_base_id'] = os.environ.get('KNOWLEDGE_BASE_ID', '')
                  defaults['data_source_id'] = os.environ.get('DATA_SOURCE_ID', '')
                  # Seed Schema (stored as dict, not JSON string)
                  table.put_item(Item={
                      'Configuration': 'Schema',
                      'Schema': SCHEMA
                  })
                  # Seed Default (values at top level, not nested)
                  table.put_item(Item=defaults)
                  print("Configuration seeded successfully")
          except Exception as e:
              print(f"Error: {e}")
              status, reason = 'FAILED', str(e)
          # Sent outside the try block so that a failure to deliver SUCCESS is
          # not followed by a contradictory FAILED response.
          send_response(event, status, {}, reason)


      def send_response(event, status, data, reason=''):
          """PUT the custom-resource result to the event's ResponseURL.

          Raises whatever urlopen raises if the response cannot be delivered
          within RESPONSE_TIMEOUT_SECONDS.
          """
          body = {
              'Status': status,
              'PhysicalResourceId': 'config-seeder',
              'StackId': event['StackId'],
              'RequestId': event['RequestId'],
              'LogicalResourceId': event['LogicalResourceId'],
              'Reason': reason or 'See CloudWatch logs',
              'Data': data
          }
          request = Request(
              event['ResponseURL'],
              data=json.dumps(body).encode('utf-8'),
              # Sent with an empty Content-Type on purpose, so urllib does not
              # add its default form-encoded header to the presigned PUT.
              headers={'Content-Type': ''},
              method='PUT'
          )
          # Context manager closes the HTTP response instead of leaking it.
          with urlopen(request, timeout=RESPONSE_TIMEOUT_SECONDS) as response:
              print(f"CloudFormation response status: {response.status}")
# Custom resource that invokes ConfigurationSeederFunction on stack create and
# whenever one of its properties changes.
ConfigurationSeeder:
  Type: Custom::ConfigurationSeeder
  Properties:
    ServiceToken: !GetAtt ConfigurationSeederFunction.Arn
    # Fail after 2 minutes instead of the default 1 hour if the function never
    # responds (same setting as BudgetInitTrigger and AdminUser). The function's
    # own timeout is 60 seconds.
    ServiceTimeout: 120
    # Increment when schema changes to force CloudFormation to re-seed config
    SchemaVersion: "5"  # v5: added demo_mode_enabled, demo_upload_quota_daily, demo_chat_quota_daily
# =========================================================================
# Admin User Provisioner - Creates admin user idempotently
# =========================================================================
# Python 3.13 Lambda that serves the AdminUser custom resource. Its only IAM
# permissions are AdminGetUser and AdminCreateUser on this stack's UserPool.
AdminUserProvisionerFunction:
Type: AWS::Serverless::Function
Properties:
FunctionName: !Sub '${AWS::StackName}-admin-provisioner'
CodeUri: src/lambda/admin_user_provisioner/
Handler: index.lambda_handler
Description: Creates admin user in Cognito idempotently (skips if exists)
Runtime: python3.13
Timeout: 30
MemorySize: 128
Tags:
Project: !Ref AWS::StackName
CostCenter: Engineering
Policies:
- Statement:
- Effect: Allow
Action:
- cognito-idp:AdminGetUser
- cognito-idp:AdminCreateUser
Resource: !GetAtt UserPool.Arn
# Custom resource that calls AdminUserProvisionerFunction with the user pool ID
# and the AdminEmail parameter. Changing either property re-invokes the
# function on the next stack update.
AdminUser:
Type: Custom::AdminUser
Properties:
ServiceToken: !GetAtt AdminUserProvisionerFunction.Arn
ServiceTimeout: 120 # 2 minutes - user creation is fast
UserPoolId: !Ref UserPool
Email: !Ref AdminEmail
Outputs:
# S3 Buckets
DataBucketName:
Description: S3 bucket for data (input/, content/, working/ prefixes)
Value: !Ref DataBucket
Export:
Name: !Sub '${AWS::StackName}-DataBucket'
VectorBucketName:
Description: S3 bucket for embeddings and vectors
Value: !Ref VectorBucket
Export:
Name: !Sub '${AWS::StackName}-VectorBucket'
UIBucketName:
Description: S3 bucket for WebUI hosting
Value: !Ref UIBucket
Export:
Name: !Sub '${AWS::StackName}-UIBucket'
ArtifactBucketName:
Condition: BuildUI
Description: S3 bucket for deployment artifacts (UI source, web component source)
Value: !Ref UISourceBucket
Export:
Name: !Sub '${AWS::StackName}-ArtifactBucket'
# DynamoDB Tables
TrackingTableName:
Description: DynamoDB table for document tracking
Value: !Ref TrackingTable
Export:
Name: !Sub '${AWS::StackName}-TrackingTable'
MeteringTableName:
Description: DynamoDB table for usage metering
Value: !Ref MeteringTable
Export:
Name: !Sub '${AWS::StackName}-MeteringTable'
ConfigurationTableName:
Description: Configuration DynamoDB Table Name
Value: !Ref ConfigurationTable
Export:
Name: !Sub '${AWS::StackName}-ConfigurationTable'
ConfigurationTableArn:
Description: Configuration DynamoDB Table ARN
Value: !GetAtt ConfigurationTable.Arn
Export:
Name: !Sub '${AWS::StackName}-ConfigurationTableArn'
ConversationHistoryTableName:
Description: DynamoDB table for conversation history
Value: !Ref ConversationHistoryTable
Export:
Name: !Sub '${AWS::StackName}-ConversationHistoryTable'
ConversationHistoryTableArn:
Description: DynamoDB table ARN for conversation history
Value: !GetAtt ConversationHistoryTable.Arn
Export:
Name: !Sub '${AWS::StackName}-ConversationHistoryTableArn'
# Scrape DynamoDB Tables
ScrapeJobsTableName:
Description: DynamoDB table for scrape job tracking
Value: !Ref ScrapeJobsTable
Export:
Name: !Sub '${AWS::StackName}-ScrapeJobsTable'
ScrapeJobsTableArn:
Description: DynamoDB table ARN for scrape job tracking
Value: !GetAtt ScrapeJobsTable.Arn
Export:
Name: !Sub '${AWS::StackName}-ScrapeJobsTableArn'
ScrapeUrlsTableName:
Description: DynamoDB table for scrape URL tracking
Value: !Ref ScrapeUrlsTable
Export:
Name: !Sub '${AWS::StackName}-ScrapeUrlsTable'
ScrapeUrlsTableArn:
Description: DynamoDB table ARN for scrape URL tracking
Value: !GetAtt ScrapeUrlsTable.Arn
Export:
Name: !Sub '${AWS::StackName}-ScrapeUrlsTableArn'
# Scrape SQS Queues
ScrapeDiscoveryQueueUrl:
Description: SQS queue URL for scrape URL discovery
Value: !Ref ScrapeDiscoveryQueue
Export:
Name: !Sub '${AWS::StackName}-ScrapeDiscoveryQueueUrl'
ScrapeDiscoveryQueueArn:
Description: SQS queue ARN for scrape URL discovery
Value: !GetAtt ScrapeDiscoveryQueue.Arn
Export:
Name: !Sub '${AWS::StackName}-ScrapeDiscoveryQueueArn'
ScrapeProcessingQueueUrl:
Description: SQS queue URL for scrape page processing
Value: !Ref ScrapeProcessingQueue
Export:
Name: !Sub '${AWS::StackName}-ScrapeProcessingQueueUrl'
ScrapeProcessingQueueArn:
Description: SQS queue ARN for scrape page processing
Value: !GetAtt ScrapeProcessingQueue.Arn
Export:
Name: !Sub '${AWS::StackName}-ScrapeProcessingQueueArn'
# Lambda Functions
ProcessDocumentFunctionArn:
Description: Process document Lambda ARN
Value: !GetAtt ProcessDocumentFunction.Arn
Export:
Name: !Sub '${AWS::StackName}-ProcessDocumentFunction'
QueryKBFunctionArn:
Description: Query Knowledge Base Lambda ARN
Value: !GetAtt QueryKBFunction.Arn
Export:
Name: !Sub '${AWS::StackName}-QueryKBFunction'
SearchKBFunctionArn:
Description: Search Knowledge Base Lambda ARN
Value: !GetAtt SearchKBFunction.Arn
Export:
Name: !Sub '${AWS::StackName}-SearchKBFunction'
# Step Functions
StateMachineArn:
Description: Step Functions state machine ARN
Value: !GetAtt ProcessingStateMachine.Arn
Export:
Name: !Sub '${AWS::StackName}-StateMachine'
ScrapeStateMachineArn:
Description: Scrape workflow Step Functions state machine ARN
Value: !GetAtt ScrapeStateMachine.Arn
Export:
Name: !Sub '${AWS::StackName}-ScrapeStateMachine'
# Scrape Lambda Functions
ScrapeStartFunctionArn:
Description: Scrape start Lambda ARN
Value: !GetAtt ScrapeStartFunction.Arn
Export:
Name: !Sub '${AWS::StackName}-ScrapeStartFunction'
ScrapeStatusFunctionArn:
Description: Scrape status Lambda ARN
Value: !GetAtt ScrapeStatusFunction.Arn
Export:
Name: !Sub '${AWS::StackName}-ScrapeStatusFunction'
# Knowledge Base
KnowledgeBaseId:
Description: Bedrock Knowledge Base ID
Value: !GetAtt KnowledgeBase.KnowledgeBaseId
Export:
Name: !Sub '${AWS::StackName}-KnowledgeBaseId'
KnowledgeBaseArn:
Description: Bedrock Knowledge Base ARN
Value: !GetAtt KnowledgeBase.KnowledgeBaseArn
Export:
Name: !Sub '${AWS::StackName}-KnowledgeBaseArn'
DataSourceId:
Description: Bedrock Knowledge Base Data Source ID (text documents - backwards compatible)
Value: !GetAtt KnowledgeBase.DataSourceId
Export:
Name: !Sub '${AWS::StackName}-DataSourceId'
# API & Authentication
GraphQLApiUrl:
Description: GraphQL API URL
Value: !GetAtt GraphQLApi.GraphQLUrl
Export:
Name: !Sub '${AWS::StackName}-GraphQLApiUrl'
GraphQLApiId:
Description: GraphQL API ID
Value: !GetAtt GraphQLApi.ApiId
Export:
Name: !Sub '${AWS::StackName}-GraphQLApiId'
GraphQLApiKey:
Description: GraphQL API Key for public theme config access
Value: !GetAtt GraphQLApiKey.ApiKey
Export:
Name: !Sub '${AWS::StackName}-GraphQLApiKey'
UserPoolId:
Description: Cognito User Pool ID
Value: !Ref UserPool
Export:
Name: !Sub '${AWS::StackName}-UserPoolId'
UserPoolClientId:
Description: Cognito User Pool Client ID
Value: !Ref UserPoolClient
Export:
Name: !Sub '${AWS::StackName}-UserPoolClientId'
IdentityPoolId:
Description: Cognito Identity Pool ID
Value: !Ref IdentityPool
Export:
Name: !Sub '${AWS::StackName}-IdentityPoolId'
# Configuration
Region:
Description: AWS Region
Value: !Ref AWS::Region
StackName:
Description: CloudFormation Stack Name
Value: !Ref AWS::StackName
# CloudFront
CloudFrontDomain:
Description: CloudFront distribution domain
Value: !GetAtt CloudFrontDistribution.DomainName
CloudFrontDistributionId:
Description: CloudFront distribution ID
Value: !Ref CloudFrontDistribution
UIUrl:
Description: UI URL (HTTPS via CloudFront)
Value: !Sub 'https://${CloudFrontDistribution.DomainName}'
# Web Component CDN
WebComponentCDNUrl:
Description: CDN URL for embeddable chat web component
Value: !Sub 'https://${WebComponentDistribution.DomainName}/ragstack-chat.js'
Export:
Name: !Sub '${AWS::StackName}-WebComponentCDNUrl'
WebComponentDistributionId:
Description: CloudFront distribution ID for web component
Value: !Ref WebComponentDistribution
Export:
Name: !Sub '${AWS::StackName}-WebComponentDistributionId'
WebComponentBuildProjectName:
Description: CodeBuild project name for web component deployment
Value: !Ref WebComponentBuildProject
Export:
Name: !Sub '${AWS::StackName}-WebComponentBuildProject'
WebComponentAssetsBucketName:
Description: S3 bucket for web component assets
Value: !Ref WebComponentAssetsBucket
Export:
Name: !Sub '${AWS::StackName}-WebComponentAssetsBucket'