Flow - Random Stress Chaos #348

Workflow file for this run

.github/workflows/flow-stress-chaos.yml at a3815dc

	# Scheduled chaos workflow: triggers a stress scenario through the scenario
	# service, drives load against the target service with Locust, validates in
	# New Relic that the stress is visible, then exports the job result to
	# New Relic Logs.
	name: Flow - Random Stress Chaos

	on:
	  schedule:
	    # Run every 2 hours (at minute 0)
	    - cron: '0 */2 * * *'
	  workflow_dispatch:
	    inputs:
	      # NOTE(review): the "Select stress scenario" step currently hard-codes
	      # the scenario, so this input is not read by any active code.
	      scenario:
	        description: 'Stress scenario to run'
	        required: false
	        type: choice
	        default: 'random'
	        options:
	          - random
	          - relibank-cpu-stress-test
	          - relibank-high-cpu-stress
	          - relibank-memory-stress-test
	          - relibank-high-memory-stress
	          - relibank-combined-stress-test

	jobs:
	  trigger-stress:
	    runs-on: ubuntu-latest
	    environment: events
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v4

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.11'

	      - name: Install dependencies
	        run: |
	          pip install requests locust

	      # Publishes three step outputs consumed by later steps:
	      # scenario, target_service and locust_file.
	      - name: Select stress scenario
	        id: random
	        run: |
	          # Always use high-cpu-stress on transaction service
	          SCENARIO="relibank-high-cpu-stress"
	          echo "Using stress scenario: $SCENARIO"

	          echo "scenario=$SCENARIO" >> "$GITHUB_OUTPUT"
	          echo "target_service=transaction-service" >> "$GITHUB_OUTPUT"
	          echo "locust_file=transaction_service_load.py" >> "$GITHUB_OUTPUT"

	          # Disabled selection logic, kept for reference.
	          # NOTE(review): if re-enabled, the service selection below compares
	          # $SCENARIO against the literal values "cpu" / "combined", which none
	          # of the scenario names in the array equal, so the else branch would
	          # always be taken.

	          # # Use user-selected scenario if provided, otherwise random
	          # if [ "${{ github.event.inputs.scenario }}" == "random" ] || [ -z "${{ github.event.inputs.scenario }}" ]; then
	          #   # Array of stress scenarios
	          #   scenarios=("relibank-cpu-stress-test" "relibank-high-cpu-stress" "relibank-memory-stress-test" "relibank-high-memory-stress" "relibank-combined-stress-test")

	          #   # Select random scenario
	          #   RANDOM_INDEX=$((RANDOM % ${#scenarios[@]}))
	          #   SCENARIO=${scenarios[$RANDOM_INDEX]}
	          #   echo "Randomly selected stress scenario: $SCENARIO"
	          # else
	          #   SCENARIO="${{ github.event.inputs.scenario }}"
	          #   echo "User selected stress scenario: $SCENARIO"
	          # fi

	          # echo "scenario=$SCENARIO" >> $GITHUB_OUTPUT

	          # # Determine target service and locust file
	          # if [[ "$SCENARIO" == "cpu" ]] || [[ "$SCENARIO" == "combined" ]]; then
	          #   echo "target_service=transaction-service" >> $GITHUB_OUTPUT
	          #   echo "locust_file=transaction_service_load.py" >> $GITHUB_OUTPUT
	          # else
	          #   echo "target_service=bill-pay-service" >> $GITHUB_OUTPUT
	          #   echo "locust_file=bill_pay_service_load.py" >> $GITHUB_OUTPUT
	          # fi

	      # Best effort: a failed curl invocation is reported but does not fail
	      # the step.
	      - name: Reset rate limit
	        run: |
	          echo "Resetting rate limit to ensure scenario can be triggered..."
	          curl -X POST "${{ vars.SCENARIO_SERVICE_URL }}/api/chaos-rate-limit-reset" || echo "Rate limit reset failed (may not be available)"

	      # Fails the job unless the scenario service answers with
	      # {"status": "success", ...}.
	      - name: Trigger stress scenario
	        id: trigger
	        run: |
	          echo "Triggering stress scenario: ${{ steps.random.outputs.scenario }}"
	          echo "Target service: ${{ steps.random.outputs.target_service }}"

	          RESPONSE=$(curl -s -X POST "${{ vars.SCENARIO_SERVICE_URL }}/api/trigger_stress/${{ steps.random.outputs.scenario }}")
	          echo "$RESPONSE"

	          # The default shell runs with -e, so a jq parse error on a non-JSON
	          # body would end the step here without any diagnostics. Tolerate it
	          # and let the status check below report the failure instead.
	          STATUS=$(echo "$RESPONSE" | jq -r '.status' 2>/dev/null || true)

	          if [ "$STATUS" != "success" ]; then
	            echo "Failed to trigger stress scenario"
	            # Pretty-print when the body is JSON; it was already echoed raw above
	            echo "$RESPONSE" | jq . || true
	            exit 1
	          fi

	          echo "✓ Stress scenario triggered successfully"

	      - name: Run load test and validate stress during experiment
	        env:
	          NEW_RELIC_API_KEY: ${{ secrets.NR_USER_API_KEY }}
	          NEW_RELIC_ACCOUNT_ID: ${{ vars.NR_ACCOUNT_ID }}
	        run: |
	          echo "Starting load test on ${{ steps.random.outputs.target_service }}..."

	          # Run locust in headless mode for 20 minutes
	          # 3 users, spawn rate of 3 users/second (start all immediately)
	          # The whole "cd && locust" list runs in the background, so the
	          # directory change does not affect the rest of this script.
	          cd demo_flows/stress_loadgen && \
	            locust -f ${{ steps.random.outputs.locust_file }} \
	              --host=${{ vars.BASE_URL }} \
	              --users=3 \
	              --spawn-rate=3 \
	              --run-time=20m \
	              --headless \
	              --only-summary &

	          LOCUST_PID=$!
	          echo "Locust running with PID: $LOCUST_PID"

	          # Wait for stress to ramp up
	          echo "Waiting 2 minutes for stress to ramp up..."
	          sleep 120

	          # Validate stress metrics with one retry
	          echo "Validating stress experiment in New Relic..."
	          cd "${{ github.workspace }}"

	          if python tests/workflow_validation/validate_stress_metrics.py "${{ steps.random.outputs.scenario }}" 2; then
	            echo "✓ Stress validated successfully"
	          else
	            echo "⚠ Stress not detected on first check, waiting 2 minutes and retrying..."
	            sleep 120

	            if python tests/workflow_validation/validate_stress_metrics.py "${{ steps.random.outputs.scenario }}" 2; then
	              echo "✓ Stress validated successfully on retry"
	            else
	              echo "❌ Stress was not detected after 2 attempts"
	              # Kill locust and exit
	              kill $LOCUST_PID 2>/dev/null || true
	              exit 1
	            fi
	          fi

	          # Wait for locust to complete; a non-zero locust exit status is
	          # deliberately not treated as a step failure.
	          echo "Waiting for load test to complete..."
	          wait $LOCUST_PID || echo "Locust completed"

	          echo "✓ Load test completed"
	          echo "✓ Stress experiment validated successfully"

	      # Runs even when earlier steps failed so the summary is always written.
	      - name: Summary
	        if: always()
	        run: |
	          echo "## Stress Chaos Test Summary" >> "$GITHUB_STEP_SUMMARY"
	          echo "" >> "$GITHUB_STEP_SUMMARY"
	          echo "Scenario: ${{ steps.random.outputs.scenario }}" >> "$GITHUB_STEP_SUMMARY"
	          echo "Status: ${{ job.status }}" >> "$GITHUB_STEP_SUMMARY"
	          echo "" >> "$GITHUB_STEP_SUMMARY"
	          echo "Check New Relic dashboards for detailed metrics during the stress period." >> "$GITHUB_STEP_SUMMARY"

	      - name: Execute New Relic Log Export Script
	        if: always()

	        # Set the environment variables required by export_to_newrelic.py
	        env:
	          # --- GitHub Action Context Variables ---
	          JOB_STATUS: ${{ job.status }}
	          JOB_NAME: ${{ github.job }}
	          RUN_ID: ${{ github.run_id }}
	          REPOSITORY: ${{ github.repository }}
	          SERVER_URL: ${{ github.server_url }}
	          WORKFLOW: ${{ github.workflow }}

	          # --- New Relic Configuration (Set these as repository secrets) ---
	          NR_LICENSE_KEY: ${{ secrets.NR_LICENSE_KEY_ALERTS }}
	          NR_ACCOUNT_ID: ${{ vars.NR_ACCOUNT_ID_ALERTS }}
	          # NOTE(review): this job defines no strategy.matrix, so this
	          # expression presumably expands to an empty string - confirm the
	          # intended value (the job's deployment environment is "events").
	          ENVIRONMENT: ${{ matrix.environment }}

	          # Base URL for the New Relic API endpoint (US region)
	          NR_ENDPOINT_BASE: "https://log-api.newrelic.com/log/v1"

	        run: |
	          cd utils/github_action_monitoring && python export_to_newrelic.py

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Flow - Random Stress Chaos #348

Workflow file

Flow - Random Stress Chaos #348

Uh oh!

Workflow file for this run