# Flow - Random Stress Chaos #348
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Flow - Random Stress Chaos

on:
  # Scheduled trigger: fires at minute 0 of every second hour.
  schedule:
    - cron: "0 */2 * * *"

  # Manual trigger: exposes a drop-down for choosing the stress scenario.
  workflow_dispatch:
    inputs:
      scenario:
        type: choice
        description: "Stress scenario to run"
        required: false
        default: random
        options:
          - random
          - relibank-cpu-stress-test
          - relibank-high-cpu-stress
          - relibank-memory-stress-test
          - relibank-high-memory-stress
          - relibank-combined-stress-test
jobs:
  # Single job: select a stress scenario, trigger it through the scenario
  # service, drive load with Locust while checking the stress in New Relic,
  # then write a run summary and export the job result to New Relic.
  trigger-stress:
    runs-on: ubuntu-latest
    environment: events
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install requests locust

      # Publishes three step outputs (scenario, target_service, locust_file)
      # consumed by the later steps. The scenario is currently pinned; the
      # original random / user-selected logic is kept below, commented out.
      # Note: Actions still expands expression placeholders that appear inside
      # those commented-out shell lines before the script is run.
      - name: Select stress scenario
        id: random
        run: |
          # Always use high-cpu-stress on transaction service
          SCENARIO="relibank-high-cpu-stress"
          echo "Using stress scenario: $SCENARIO"
          echo "scenario=$SCENARIO" >> "$GITHUB_OUTPUT"
          echo "target_service=transaction-service" >> "$GITHUB_OUTPUT"
          echo "locust_file=transaction_service_load.py" >> "$GITHUB_OUTPUT"
          # # Use user-selected scenario if provided, otherwise random
          # if [ "${{ github.event.inputs.scenario }}" == "random" ] || [ -z "${{ github.event.inputs.scenario }}" ]; then
          #   # Array of stress scenarios
          #   scenarios=("relibank-cpu-stress-test" "relibank-high-cpu-stress" "relibank-memory-stress-test" "relibank-high-memory-stress" "relibank-combined-stress-test")
          #   # Select random scenario
          #   RANDOM_INDEX=$((RANDOM % ${#scenarios[@]}))
          #   SCENARIO=${scenarios[$RANDOM_INDEX]}
          #   echo "Randomly selected stress scenario: $SCENARIO"
          # else
          #   SCENARIO="${{ github.event.inputs.scenario }}"
          #   echo "User selected stress scenario: $SCENARIO"
          # fi
          # echo "scenario=$SCENARIO" >> $GITHUB_OUTPUT
          # # Determine target service and locust file
          # if [[ "$SCENARIO" == *"cpu"* ]] || [[ "$SCENARIO" == *"combined"* ]]; then
          #   echo "target_service=transaction-service" >> $GITHUB_OUTPUT
          #   echo "locust_file=transaction_service_load.py" >> $GITHUB_OUTPUT
          # else
          #   echo "target_service=bill-pay-service" >> $GITHUB_OUTPUT
          #   echo "locust_file=bill_pay_service_load.py" >> $GITHUB_OUTPUT
          # fi

      # Best-effort: a failed reset is reported but does not fail the job.
      - name: Reset rate limit
        run: |
          echo "Resetting rate limit to ensure scenario can be triggered..."
          curl -X POST "${{ vars.SCENARIO_SERVICE_URL }}/api/chaos-rate-limit-reset" || echo "Rate limit reset failed (may not be available)"

      # Fails the job unless the scenario service answers with
      # a JSON body whose .status field is "success".
      - name: Trigger stress scenario
        id: trigger
        run: |
          echo "Triggering stress scenario: ${{ steps.random.outputs.scenario }}"
          echo "Target service: ${{ steps.random.outputs.target_service }}"
          RESPONSE=$(curl -s -X POST "${{ vars.SCENARIO_SERVICE_URL }}/api/trigger_stress/${{ steps.random.outputs.scenario }}")
          echo "$RESPONSE"
          STATUS=$(echo "$RESPONSE" | jq -r '.status')
          if [ "$STATUS" != "success" ]; then
            echo "Failed to trigger stress scenario"
            echo "$RESPONSE" | jq .
            exit 1
          fi
          echo "✓ Stress scenario triggered successfully"

      - name: Run load test and validate stress during experiment
        env:
          NEW_RELIC_API_KEY: ${{ secrets.NR_USER_API_KEY }}
          NEW_RELIC_ACCOUNT_ID: ${{ vars.NR_ACCOUNT_ID }}
        run: |
          echo "Starting load test on ${{ steps.random.outputs.target_service }}..."
          # Run locust in headless mode for 20 minutes
          # 3 users, spawn rate of 3 users/second (start all immediately)
          #
          # Change directory in the foreground and background only the locust
          # command itself, so that $! is the locust process (not a wrapper
          # subshell) and a missing directory fails the step immediately.
          cd demo_flows/stress_loadgen
          locust -f "${{ steps.random.outputs.locust_file }}" \
            --host="${{ vars.BASE_URL }}" \
            --users=3 \
            --spawn-rate=3 \
            --run-time=20m \
            --headless \
            --only-summary &
          LOCUST_PID=$!
          echo "Locust running with PID: $LOCUST_PID"
          # Return to the repository root; the validation script path below is
          # relative to it. Locust keeps the working directory it started in.
          cd "$GITHUB_WORKSPACE"
          # Wait for stress to ramp up
          echo "Waiting 2 minutes for stress to ramp up..."
          sleep 120
          # Validate stress metrics with one retry
          echo "Validating stress experiment in New Relic..."
          if python tests/workflow_validation/validate_stress_metrics.py "${{ steps.random.outputs.scenario }}" 2; then
            echo "✓ Stress validated successfully"
          else
            echo "⚠ Stress not detected on first check, waiting 2 minutes and retrying..."
            sleep 120
            if python tests/workflow_validation/validate_stress_metrics.py "${{ steps.random.outputs.scenario }}" 2; then
              echo "✓ Stress validated successfully on retry"
            else
              echo "❌ Stress was not detected after 2 attempts"
              # Kill locust and exit
              kill "$LOCUST_PID" 2>/dev/null || true
              exit 1
            fi
          fi
          # Wait for locust to complete; its exit status is deliberately not
          # allowed to fail the step.
          echo "Waiting for load test to complete..."
          wait "$LOCUST_PID" || echo "Locust completed"
          echo "✓ Load test completed"
          echo "✓ Stress experiment validated successfully"

      # Runs even when earlier steps failed so the outcome is always recorded.
      - name: Summary
        if: always()
        run: |
          echo "## Stress Chaos Test Summary" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "**Scenario**: ${{ steps.random.outputs.scenario }}" >> "$GITHUB_STEP_SUMMARY"
          echo "**Status**: ${{ job.status }}" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "Check New Relic dashboards for detailed metrics during the stress period." >> "$GITHUB_STEP_SUMMARY"

      - name: Execute New Relic Log Export Script
        if: always()
        # Set the environment variables required by export_to_newrelic.py
        env:
          # --- GitHub Action Context Variables ---
          JOB_STATUS: ${{ job.status }}
          JOB_NAME: ${{ github.job }}
          RUN_ID: ${{ github.run_id }}
          REPOSITORY: ${{ github.repository }}
          SERVER_URL: ${{ github.server_url }}
          WORKFLOW: ${{ github.workflow }}
          # --- New Relic Configuration (set these as secrets / variables) ---
          NR_LICENSE_KEY: ${{ secrets.NR_LICENSE_KEY_ALERTS }}
          NR_ACCOUNT_ID: ${{ vars.NR_ACCOUNT_ID_ALERTS }}
          # This job defines no strategy matrix, so a matrix lookup here always
          # rendered an empty value. Report the environment this job runs in.
          # NOTE(review): confirm export_to_newrelic.py expects this name.
          ENVIRONMENT: events
          # Base URL for the New Relic API endpoint (US region)
          NR_ENDPOINT_BASE: "https://log-api.newrelic.com/log/v1"
        run: |
          cd utils/github_action_monitoring && python export_to_newrelic.py