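# Flow - Random Stress Chaos
#
# NOTE: the lines originally found here ("Skip to content",
# "Flow - Random Stress Chaos #347") were navigation text from a web capture.
# They are kept only as this comment because they are not part of the
# workflow definition.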
# Chaos flow: triggers a stress scenario through the scenario service, drives
# load against the target service with locust, validates the stress via
# tests/workflow_validation/validate_stress_metrics.py, and finally exports
# the job result with utils/github_action_monitoring/export_to_newrelic.py.
name: Flow - Random Stress Chaos

on:
  schedule:
    # Run every 2 hours
    - cron: '0 */2 * * *'
  workflow_dispatch:
    inputs:
      # NOTE(review): this input is currently ignored. The "Select stress
      # scenario" step hard-codes relibank-high-cpu-stress; the logic that
      # honoured this choice is disabled (see the comments in that step).
      scenario:
        description: 'Stress scenario to run'
        required: false
        type: choice
        default: 'random'
        options:
          - random
          - relibank-cpu-stress-test
          - relibank-high-cpu-stress
          - relibank-memory-stress-test
          - relibank-high-memory-stress
          - relibank-combined-stress-test

# Least privilege: the only use of GITHUB_TOKEN in this workflow is the
# repository checkout.
permissions:
  contents: read

jobs:
  trigger-stress:
    runs-on: ubuntu-latest
    environment: events
    # The load test runs for 20 minutes plus up to two 2-minute waits. Cap the
    # job so a hung request or process cannot occupy a runner for the 6-hour
    # default timeout.
    timeout-minutes: 45
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install requests locust

      - name: Select stress scenario
        # The id stays `random` because later steps read
        # steps.random.outputs.*.
        id: random
        # Disabled selection logic, kept for reference. It is written as YAML
        # comments (outside the run script) because GitHub Actions expands
        # `${{ }}` expressions inside a run script even on lines that are
        # shell comments.
        #
        #   # Use user-selected scenario if provided, otherwise random
        #   if [ "${{ github.event.inputs.scenario }}" == "random" ] || [ -z "${{ github.event.inputs.scenario }}" ]; then
        #     # Array of stress scenarios
        #     scenarios=("relibank-cpu-stress-test" "relibank-high-cpu-stress" "relibank-memory-stress-test" "relibank-high-memory-stress" "relibank-combined-stress-test")
        #     # Select random scenario
        #     RANDOM_INDEX=$((RANDOM % ${#scenarios[@]}))
        #     SCENARIO=${scenarios[$RANDOM_INDEX]}
        #     echo "Randomly selected stress scenario: $SCENARIO"
        #   else
        #     SCENARIO="${{ github.event.inputs.scenario }}"
        #     echo "User selected stress scenario: $SCENARIO"
        #   fi
        #   echo "scenario=$SCENARIO" >> $GITHUB_OUTPUT
        #   # Determine target service and locust file
        #   if [[ "$SCENARIO" == *"cpu"* ]] || [[ "$SCENARIO" == *"combined"* ]]; then
        #     echo "target_service=transaction-service" >> $GITHUB_OUTPUT
        #     echo "locust_file=transaction_service_load.py" >> $GITHUB_OUTPUT
        #   else
        #     echo "target_service=bill-pay-service" >> $GITHUB_OUTPUT
        #     echo "locust_file=bill_pay_service_load.py" >> $GITHUB_OUTPUT
        #   fi
        run: |
          # Always use high-cpu-stress on transaction service
          SCENARIO="relibank-high-cpu-stress"
          echo "Using stress scenario: $SCENARIO"
          {
            echo "scenario=$SCENARIO"
            echo "target_service=transaction-service"
            echo "locust_file=transaction_service_load.py"
          } >> "$GITHUB_OUTPUT"

      - name: Reset rate limit
        run: |
          echo "Resetting rate limit to ensure scenario can be triggered..."
          # Best effort: a failed reset must not fail the job.
          curl -X POST "${{ vars.SCENARIO_SERVICE_URL }}/api/chaos-rate-limit-reset" || echo "Rate limit reset failed (may not be available)"

      - name: Trigger stress scenario
        id: trigger
        run: |
          echo "Triggering stress scenario: ${{ steps.random.outputs.scenario }}"
          echo "Target service: ${{ steps.random.outputs.target_service }}"
          # -sS hides the progress meter but still prints transport errors, so
          # an unreachable service is reported instead of exiting silently.
          if ! RESPONSE=$(curl -sS -X POST "${{ vars.SCENARIO_SERVICE_URL }}/api/trigger_stress/${{ steps.random.outputs.scenario }}"); then
            echo "Failed to trigger stress scenario: could not reach the scenario service"
            exit 1
          fi
          echo "$RESPONSE"
          # A non-JSON body (for example an HTML error page) makes jq fail;
          # treat that as "no status" so the failure branch below still runs
          # and prints its message.
          STATUS=$(echo "$RESPONSE" | jq -r '.status' 2>/dev/null || true)
          if [ "$STATUS" != "success" ]; then
            echo "Failed to trigger stress scenario"
            echo "$RESPONSE" | jq . || true
            exit 1
          fi
          echo "✓ Stress scenario triggered successfully"

      - name: Run load test and validate stress during experiment
        env:
          NEW_RELIC_API_KEY: ${{ secrets.NR_USER_API_KEY }}
          NEW_RELIC_ACCOUNT_ID: ${{ vars.NR_ACCOUNT_ID }}
        run: |
          echo "Starting load test on ${{ steps.random.outputs.target_service }}..."
          # Run locust in headless mode for 20 minutes
          # 3 users, spawn rate of 3 users/second (start all immediately)
          # The backgrounded list runs in its own subshell, so the `cd` does
          # not affect this script. `exec` replaces that subshell with locust,
          # which makes $! the PID of locust itself and lets the `kill` below
          # actually stop the load generator.
          cd demo_flows/stress_loadgen && \
            exec locust -f "${{ steps.random.outputs.locust_file }}" \
              --host="${{ vars.BASE_URL }}" \
              --users=3 \
              --spawn-rate=3 \
              --run-time=20m \
              --headless \
              --only-summary &
          LOCUST_PID=$!
          echo "Locust running with PID: $LOCUST_PID"

          # Wait for stress to ramp up
          echo "Waiting 2 minutes for stress to ramp up..."
          sleep 120

          # Validate stress metrics with one retry
          echo "Validating stress experiment in New Relic..."
          cd "$GITHUB_WORKSPACE"
          if python tests/workflow_validation/validate_stress_metrics.py "${{ steps.random.outputs.scenario }}" 2; then
            echo "✓ Stress validated successfully"
          else
            echo "⚠ Stress not detected on first check, waiting 2 minutes and retrying..."
            sleep 120
            if python tests/workflow_validation/validate_stress_metrics.py "${{ steps.random.outputs.scenario }}" 2; then
              echo "✓ Stress validated successfully on retry"
            else
              echo "❌ Stress was not detected after 2 attempts"
              # Kill locust and exit
              kill "$LOCUST_PID" 2>/dev/null || true
              exit 1
            fi
          fi

          # Wait for locust to complete. A non-zero locust exit status is
          # deliberately not treated as a job failure here.
          echo "Waiting for load test to complete..."
          wait "$LOCUST_PID" || echo "Locust completed"
          echo "✓ Load test completed"
          echo "✓ Stress experiment validated successfully"

      - name: Summary
        if: always()
        run: |
          {
            echo "## Stress Chaos Test Summary"
            echo ""
            echo "**Scenario**: ${{ steps.random.outputs.scenario }}"
            echo "**Status**: ${{ job.status }}"
            echo ""
            echo "Check New Relic dashboards for detailed metrics during the stress period."
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Execute New Relic Log Export Script
        if: always()
        # Set the environment variables required by export_to_newrelic.py
        env:
          # --- GitHub Action Context Variables ---
          JOB_STATUS: ${{ job.status }}
          JOB_NAME: ${{ github.job }}
          RUN_ID: ${{ github.run_id }}
          REPOSITORY: ${{ github.repository }}
          SERVER_URL: ${{ github.server_url }}
          WORKFLOW: ${{ github.workflow }}
          # --- New Relic Configuration (Set these as repository secrets) ---
          NR_LICENSE_KEY: ${{ secrets.NR_LICENSE_KEY_ALERTS }}
          NR_ACCOUNT_ID: ${{ vars.NR_ACCOUNT_ID_ALERTS }}
          # NOTE(review): this job defines no strategy.matrix, so
          # matrix.environment expands to an empty string. Confirm what value
          # export_to_newrelic.py expects here.
          ENVIRONMENT: ${{ matrix.environment }}
          # Base URL for the New Relic API endpoint (US region)
          NR_ENDPOINT_BASE: 'https://log-api.newrelic.com/log/v1'
        run: |
          cd utils/github_action_monitoring && python export_to_newrelic.py