# Cypress end-to-end test workflow.
#
# It can be started two ways, and both take the same pair of inputs:
#   environment - which deployment to test against
#   test_dir    - directory (relative to app/web) whose sub-directories are run
name: Cypress E2E Tests
on:
  # Reusable-workflow entry point: the calling workflow supplies both inputs.
  workflow_call:
    inputs:
      environment:
        description: 'where to test'
        required: true
        type: string
      test_dir:
        description: 'which test directory to execute'
        required: true
        type: string
  # Manual entry point: the environment is restricted to a fixed list and both
  # inputs come with defaults.
  workflow_dispatch:
    inputs:
      environment:
        description: 'where to test'
        required: true
        type: choice
        options: [tools, production, perf, ec2-node]
        default: 'tools'
      test_dir:
        description: 'which test directory to execute'
        required: true
        type: string
        default: 'cypress/e2e'
jobs:
# Builds the matrix consumed by the Cypress job: a compact JSON array holding
# the name of every immediate sub-directory of app/web/<test_dir>.
define-test-matrix:
  runs-on: ubuntu-latest
  outputs:
    tests: ${{ steps.tests.outputs.tests }}
  steps:
    - uses: actions/checkout@v4
    - id: tests
      working-directory: app/web/${{ inputs.test_dir }}
      run: |
        # Read `find` output line by line inside a single jq call. The shell
        # never word-splits or glob-expands the names, so directories with
        # spaces or wildcard characters survive intact. Sorting makes the
        # matrix order stable between runs.
        tests=$(
          find . -mindepth 1 -maxdepth 1 -type d -printf '%f\n' \
            | LC_ALL=C sort \
            | jq -R -s -c 'split("\n") | map(select(length > 0))'
        )
        echo "$tests"
        # An empty matrix makes the dependent job fail with an opaque error;
        # stop here with a message that names the directory instead.
        if [ "$tests" = "[]" ]; then
          echo "::error::No test directories found in $PWD"
          exit 1
        fi
        echo "tests=$tests" >> "$GITHUB_OUTPUT"
# Provisions an EC2 node for the run (only when environment == 'ec2-node'),
# publishes its IP as the `remote-ip` job output, and blocks until both
# Bedrock (port 3020) and the web app (port 8080) answer over SSH tunnels.
launch-ec2-node:
  environment: ${{ inputs.environment }}
  runs-on: ubuntu-latest
  if: ${{ inputs.environment == 'ec2-node' }}
  outputs:
    remote-ip: ${{ steps.get-ip.outputs.remote_ip }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Delete any lingering nodes
      working-directory: .ci/
      # Credentials are handed over as environment variables rather than being
      # pasted into the script text, so special characters in the values are
      # never interpreted by the shell.
      env:
        SI_API_TOKEN: ${{ secrets.SI_API_TOKEN }}
        SI_WORKSPACE_ID: ${{ vars.MANAGEMENT_WORKSPACE_ID }}
      run: python3 ./delete-stacks.py
    - name: Deploy EC2 node
      working-directory: .ci/
      env:
        SI_API_TOKEN: ${{ secrets.SI_API_TOKEN }}
        SI_WORKSPACE_ID: ${{ vars.MANAGEMENT_WORKSPACE_ID }}
      run: python3 ./deploy-stack.py
    - name: Upload deployment error (if any)
      if: failure()
      uses: actions/upload-artifact@v4
      with:
        name: deployment-error
        path: .ci/error
        if-no-files-found: ignore
        retention-days: 1
    - name: Save IP
      id: get-ip
      working-directory: .ci/
      run: |
        # Pull the IPv4 address out of the ./ip file.
        # presumably written by deploy-stack.py; confirm against that script
        remote_ip=$(grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' "./ip")
        echo "Remote IP set to ${remote_ip}"
        # Job output for downstream jobs, env var for the next step of this job.
        echo "remote_ip=$remote_ip" >> "$GITHUB_OUTPUT"
        echo "remote_ip=$remote_ip" >> "$GITHUB_ENV"
    - name: Validate Service's are healthy
      working-directory: .ci/
      env:
        SSH_KEY: ${{ secrets.SSH_KEY }}
      run: |
        echo "$SSH_KEY" > ssh-key.pem
        chmod 600 ssh-key.pem
        echo "Tunneling EC2 node @ $remote_ip"

        # open_tunnel PORT
        # Forwards localhost:PORT to PORT on the EC2 node in the background,
        # retrying up to 5 times. Returns 0 once the local end accepts
        # connections, 1 if every attempt failed.
        open_tunnel() {
          local port="$1"
          local max_attempts=5
          local attempt ssh_pid
          for attempt in $(seq 1 "$max_attempts"); do
            echo "Attempting to establish SSH tunnel for port $port (attempt $attempt/$max_attempts)..."
            # Kill any stale forwarder for this port. The pattern follows the
            # argument order of the ssh command below (-L ... before arch@IP);
            # a pattern with the port after the host can never match.
            pkill -f "ssh .*-L $port:localhost:$port arch@$remote_ip" || true
            sleep 2
            nohup ssh -o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ServerAliveInterval=60 -i ssh-key.pem -L "$port:localhost:$port" "arch@$remote_ip" -N &
            ssh_pid=$!
            # Give SSH time to establish the connection
            sleep 5
            if nc -z localhost "$port"; then
              echo "✅ SSH tunnel for port $port established successfully"
              return 0
            fi
            echo "⚠️ SSH tunnel attempt $attempt failed, retrying..."
            kill "$ssh_pid" 2>/dev/null || true
            sleep 3
          done
          echo "❌ Failed to establish SSH tunnel for port $port after $max_attempts attempts"
          return 1
        }

        # wait_for_http URL LABEL
        # Polls URL every 10s, up to 180 times (about 30 minutes). Returns 0 on
        # the first successful HTTP response, 1 if it never answered.
        wait_for_http() {
          local url="$1" label="$2" i
          for i in {1..180}; do
            if curl --fail --silent --max-time 2 "$url"; then
              return 0
            fi
            echo "⏳ Attempt $i/180: $label not responding yet. Retrying in 10s..."
            sleep 10
          done
          return 1
        }

        # --- Bedrock (EC2 localhost:3020) ---
        open_tunnel 3020 || exit 1
        echo "Waiting for Bedrock to be ready..."
        # Readiness is decided by the HTTP probe only. The local end of an
        # `ssh -L` forward accepts TCP connections whenever ssh is running, so
        # probing the port with nc would report success even if Bedrock never
        # came up, and the prepare call below would be skipped silently.
        if ! wait_for_http "http://localhost:3020/" "Bedrock"; then
          echo "❌ Timed out waiting for bedrock service on port 3020"
          exit 1
        fi
        echo "✅ Bedrock service is up and returned a valid response, preparing db"
        # NOTE(review): 'Content;' makes curl send an empty "Content" header and
        # there is no --fail, so an HTTP error from /prepare does not fail the
        # step. Both are kept as-is; confirm whether they are intentional.
        curl --location 'http://localhost:3020/prepare' \
          --header 'Content;' \
          --header 'Content-Type: application/json' \
          --data '{
            "recording_id": "W=01JYPR32SD5RKR3AMG298J7263-CS=01JZ3W5XX6QHQZ6PYSBHK4SB3K (39 components)",
            "parameters": {},
            "executionParameters": {}
          }'

        # --- Web App (EC2 localhost:8080) ---
        open_tunnel 8080 || exit 1
        echo "Waiting up to 30 minutes for remote web app to be ready..."
        if ! wait_for_http "http://localhost:8080/health" "Service"; then
          echo "❌ Timed out waiting for web app health endpoint to respond"
          echo "📋 Checking cloud-init logs for debugging..."
          ssh -o StrictHostKeyChecking=no -o ConnectTimeout=30 -i ssh-key.pem "arch@$remote_ip" "tail -50 /var/log/cloud-init-output.log" || echo "⚠️ Could not retrieve cloud-init logs"
          exit 1
        fi
        echo "✅ Remote service is up and returned a valid response!"
# Runs one Cypress job per directory produced by define-test-matrix. For the
# 'ec2-node' environment it first opens an SSH tunnel to the node's web app
# (port 8080) and waits for its /health endpoint.
cypress-tests:
  environment: ${{ inputs.environment }}
  runs-on: ubuntu-latest
  needs: [define-test-matrix, launch-ec2-node]
  # always() keeps this job eligible when launch-ec2-node was skipped; the rest
  # of the expression requires the matrix job to have succeeded and, for
  # ec2-node runs, the node launch as well.
  if: always() && (needs.define-test-matrix.result == 'success') && (inputs.environment != 'ec2-node' || needs.launch-ec2-node.result == 'success')
  strategy:
    fail-fast: true
    matrix:
      tests: ${{ fromJSON(needs.define-test-matrix.outputs.tests) }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Setup Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '18.18.2'
    - name: Setup pnpm
      uses: pnpm/action-setup@v4
    - name: Install Deps
      working-directory: app/web
      run: |
        pnpm i
        npx cypress install
    - name: install uuid
      # apt-get is the script-stable front end; plain `apt` warns that its CLI
      # is not meant for scripts.
      run: |
        sudo apt-get update
        sudo apt-get install -y uuid
    - name: Setup SSH tunnel if ec2-node for web access
      if: ${{ inputs.environment == 'ec2-node' }}
      working-directory: .ci/
      env:
        SSH_KEY: ${{ secrets.SSH_KEY }}
        REMOTE_IP: ${{ needs.launch-ec2-node.outputs.remote-ip }}
      run: |
        echo "$SSH_KEY" > ssh-key.pem
        chmod 600 ssh-key.pem
        remote_ip="$REMOTE_IP"
        echo "Tunneling EC2 node @ $remote_ip"

        # open_tunnel PORT
        # Forwards localhost:PORT to PORT on the EC2 node in the background,
        # retrying up to 5 times. Returns 0 once the local end accepts
        # connections, 1 if every attempt failed.
        open_tunnel() {
          local port="$1"
          local max_attempts=5
          local attempt ssh_pid
          for attempt in $(seq 1 "$max_attempts"); do
            echo "Attempting to establish SSH tunnel for port $port (attempt $attempt/$max_attempts)..."
            # Kill any stale forwarder for this port. The pattern follows the
            # argument order of the ssh command below (-L ... before arch@IP);
            # a pattern with the port after the host can never match.
            pkill -f "ssh .*-L $port:localhost:$port arch@$remote_ip" || true
            sleep 2
            nohup ssh -o StrictHostKeyChecking=no -o ConnectTimeout=30 -o ServerAliveInterval=60 -i ssh-key.pem -L "$port:localhost:$port" "arch@$remote_ip" -N &
            ssh_pid=$!
            # Give SSH time to establish the connection
            sleep 5
            if nc -z localhost "$port"; then
              echo "✅ SSH tunnel for port $port established successfully"
              return 0
            fi
            echo "⚠️ SSH tunnel attempt $attempt failed, retrying..."
            kill "$ssh_pid" 2>/dev/null || true
            sleep 3
          done
          echo "❌ Failed to establish SSH tunnel for port $port after $max_attempts attempts"
          return 1
        }

        open_tunnel 8080 || exit 1

        # Wait for the tunnelled web app (EC2 localhost:8080): 180 probes, 10s
        # apart, i.e. about 30 minutes.
        echo "Waiting up to 30 minutes for remote web app to be ready..."
        web_app_ready=false
        for i in {1..180}; do
          if curl --fail --silent --max-time 2 http://localhost:8080/health; then
            echo "✅ Remote service is up and returned a valid response!"
            web_app_ready=true
            break
          fi
          echo "⏳ Attempt $i/180: Service not responding yet. Retrying in 10s..."
          sleep 10
        done
        if [ "$web_app_ready" != true ]; then
          echo "❌ Timed out waiting for web app health endpoint to respond"
          echo "📋 Checking cloud-init logs for debugging..."
          ssh -o StrictHostKeyChecking=no -o ConnectTimeout=30 -i ssh-key.pem "arch@$remote_ip" "tail -50 /var/log/cloud-init-output.log" || echo "⚠️ Could not retrieve cloud-init logs"
          exit 1
        fi
    - name: Run Cypress Tests
      working-directory: app/web
      # Secrets, variables and user-supplied inputs reach the script through
      # the environment, never by being spliced into the script text. Values
      # containing quotes, `$` or backticks are therefore passed verbatim
      # instead of being interpreted by the shell.
      env:
        VITE_AUTH0_USERNAME: ${{ secrets.VITE_AUTH0_USERNAME }}
        VITE_AUTH0_PASSWORD: ${{ secrets.VITE_AUTH0_PASSWORD }}
        VITE_SI_CYPRESS_MULTIPLIER: ${{ vars.VITE_SI_CYPRESS_MULTIPLIER }}
        VITE_SI_WORKSPACE_URL: ${{ vars.VITE_SI_WORKSPACE_URL }}
        VITE_HOST_URL: ${{ vars.VITE_SI_WORKSPACE_URL }}
        VITE_SI_WORKSPACE_ID: ${{ vars.VITE_SI_WORKSPACE_ID }}
        VITE_AUTH_API_URL: 'https://auth-api.systeminit.com'
        VITE_AUTH_PORTAL_URL: 'https://auth.systeminit.com'
        SPEC_GLOB: ${{ inputs.test_dir }}/${{ matrix.tests }}/**
      run: |
        VITE_UUID="$(uuid)"
        export VITE_UUID
        # Up to three attempts; the step fails only if all of them fail.
        max_retries=3
        attempt=1
        while [ "$attempt" -le "$max_retries" ]; do
          exit_code=0
          npx cypress run --spec "$SPEC_GLOB" || exit_code=$?
          if [ "$exit_code" -eq 0 ]; then
            echo "Cypress Test task succeeded!"
            exit 0
          fi
          echo "Attempt $attempt/$max_retries failed with exit code $exit_code! Retrying..."
          attempt=$((attempt + 1))
        done
        echo "All $max_retries attempts failed."
        exit 1
    - name: 'Upload Cypress Recordings to Github'
      uses: actions/upload-artifact@v4
      if: failure()
      with:
        name: cypress-recordings-run-${{ matrix.tests }}
        path: app/web/cypress/videos/**/*.mp4
        retention-days: 5
    - name: Check Test Results
      if: failure()
      run: exit 1
# Tears the EC2 node(s) down after the Cypress jobs. always() makes this run
# whether the tests passed, failed or were skipped; the environment check
# limits it to ec2-node runs.
cleanup:
  name: Cleanup EC2 Nodes
  runs-on: ubuntu-latest
  needs: cypress-tests
  environment: ${{ inputs.environment }}
  if: inputs.environment == 'ec2-node' && always()
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Delete EC2 nodes
      working-directory: .ci/
      # Credentials are passed as environment variables rather than pasted into
      # the script text, so special characters in the values are never
      # interpreted by the shell.
      env:
        SI_API_TOKEN: ${{ secrets.SI_API_TOKEN }}
        SI_WORKSPACE_ID: ${{ vars.MANAGEMENT_WORKSPACE_ID }}
      run: python3 ./delete-stacks.py
# Notification job. Runs when a job it depends on has failed. It downloads the
# run's artifacts, works out whether the failure was an EC2 deployment error or
# failing Cypress specs, and notifies FireHydrant and/or Slack.
#
# Every dynamic value (secrets, inputs, step outputs) is handed to the scripts
# through `env:` and every JSON payload is built with jq. Error text or names
# containing quotes, `$`, backticks or backslashes therefore cannot break the
# command line or the JSON.
on-failure:
  runs-on: ubuntu-latest
  needs: [cypress-tests, launch-ec2-node]
  environment: ${{ inputs.environment }}
  # failure() is itself a status-check function, so `&& always()` adds nothing.
  if: failure()
  steps:
    - name: Download all artifacts
      uses: actions/download-artifact@v4
      with:
        path: artifacts
    - name: Check for deployment error
      id: deployment-error
      if: ${{ inputs.environment == 'ec2-node' && needs.launch-ec2-node.result == 'failure' }}
      run: |
        error_file=./artifacts/deployment-error/error
        if [ -f "$error_file" ]; then
          error_message=$(cat "$error_file")
          # A random delimiter cannot collide with a line of the error text,
          # which a fixed "EOF" marker could.
          delimiter="EOF_$(od -An -N16 -tx1 /dev/urandom | tr -d ' \n')"
          {
            echo "deployment_error<<$delimiter"
            printf '%s\n' "$error_message"
            echo "$delimiter"
            echo "has_deployment_error=true"
          } >> "$GITHUB_OUTPUT"
        else
          echo "has_deployment_error=false" >> "$GITHUB_OUTPUT"
        fi
    - name: Check for failed Cypress tests
      id: failed-tests
      run: |
        failed_tests=""
        has_test_failures=false
        # A cypress-recordings-run-<name> artifact is only uploaded when that
        # matrix entry failed, so each such directory names one failed test.
        for artifact_dir in artifacts/cypress-recordings-run-*; do
          [ -d "$artifact_dir" ] || continue
          test_name=$(basename "$artifact_dir")
          test_name="${test_name#cypress-recordings-run-}"
          # Comma-separated list: the separator is added only from the second
          # entry on.
          failed_tests="${failed_tests:+$failed_tests, }$test_name"
          has_test_failures=true
        done
        {
          echo "failed_tests=$failed_tests"
          echo "has_test_failures=$has_test_failures"
        } >> "$GITHUB_OUTPUT"
        echo "Failed tests: $failed_tests"
    - name: Page FireHydrant for recorded failures on main
      env:
        FIREHYDRANT_WEBHOOK_URL: ${{ secrets.FIREHYDRANT_WEBHOOK_URL }}
        TARGET_ENV: ${{ inputs.environment }}
      run: |
        # Look for a recording at any depth: the upload step collects videos
        # with a ** glob, so they may sit below the artifact's top level.
        recording=$(find artifacts -type f -name '*.mp4' -print -quit 2>/dev/null || true)
        if [ -z "$recording" ]; then
          exit 0
        fi
        echo "Artifact detected for failed test: $recording"
        # GITHUB_REF_NAME is read from the environment; a branch name is never
        # spliced into the script.
        if [ "$GITHUB_REF_NAME" != "main" ]; then
          exit 0
        fi
        run_url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID"
        jq -n -c --arg target "$TARGET_ENV" --arg url "$run_url" '{
          summary: "E2E \($target) Tests Fail",
          body: "E2E Tests have failed for \($target).",
          links: [{href: $url, text: "E2E Test Run \($target)"}],
          tags: ["service:github"]
        }' | curl --location "$FIREHYDRANT_WEBHOOK_URL" \
          --header "Content-Type: application/json" \
          --data @-
    - name: Send Slack notification with deployment error
      if: ${{ inputs.environment == 'ec2-node' && steps.deployment-error.outputs.has_deployment_error == 'true' }}
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        DEPLOYMENT_ERROR: ${{ steps.deployment-error.outputs.deployment_error }}
      run: |
        run_url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID"
        # Flatten the error to one line for the code block; jq does all JSON
        # escaping.
        flat_error=$(printf '%s' "$DEPLOYMENT_ERROR" | tr '\n' ' ')
        jq -n -c --arg url "$run_url" --arg err "$flat_error" \
          '{text: ":si: Failed EC2 Deployment for E2E Test: <\($url)|:test_tube: Link>\n```\($err)```"}' \
          | curl -X POST \
              --header 'Content-type: application/json' \
              --data @- \
              "$SLACK_WEBHOOK_URL"
    - name: Send regular Slack notification
      # Skipped only when an ec2-node run failed during node launch; that case
      # is covered by the deployment-error notification above.
      if: ${{ inputs.environment != 'ec2-node' || needs.launch-ec2-node.result != 'failure' }}
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        TARGET_ENV: ${{ inputs.environment }}
        FAILED_TESTS: ${{ steps.failed-tests.outputs.failed_tests }}
      run: |
        run_url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID"
        fence='```'
        text=":si: Failed Cypress E2E Test for $TARGET_ENV: <$run_url|:test_tube: Link>"
        # Append the list of failed tests as a code block when one is known.
        if [ -n "$FAILED_TESTS" ]; then
          text="$text"$'\n'"${fence}Failed tests: ${FAILED_TESTS}${fence}"
        fi
        jq -n -c --arg text "$text" '{text: $text}' \
          | curl -X POST \
              --header 'Content-type: application/json' \
              --data @- \
              "$SLACK_WEBHOOK_URL"