Skip to content

Rollback Production Deployment #1

Rollback Production Deployment

Rollback Production Deployment #1

# Manually dispatched workflow that rolls production services back, either via
# `docker service rollback` or by rebuilding an image from a given commit.
name: Rollback Production Deployment

on:
  workflow_dispatch:
    inputs:
      # Comma-separated service list; checked by the "Validate services input"
      # step before anything touches production.
      services:
        description: 'Services to rollback (comma-separated: api,worker,beat,docker,redis or "all")'
        required: false
        default: 'all'
        type: string
      # Optional commit SHA. When empty, the rollback step uses
      # `docker service rollback` instead of rebuilding an image.
      rollback_to_commit:
        description: 'Specific commit SHA to rollback to (optional, leave blank for automatic rollback)'
        required: false
        default: ''
        type: string
      # Free-text justification; echoed in the rollback log and included in
      # the Rollbar deploy comment.
      reason:
        description: 'Reason for rollback'
        required: true
        type: string

env:
  # Registry host and image name used to tag and push the rebuilt image.
  REGISTRY: ${{ secrets.DOCKER_REGISTRY }}
  IMAGE_NAME: trendsearth-api
  # NOTE(review): COMPOSE_FILE is not referenced by any step in this workflow
  # as far as this file shows - confirm whether it is still needed.
  COMPOSE_FILE: docker-compose.prod.yml
jobs:
  # Single job: all steps run in order on one hosted runner and are bound to
  # the `production` environment.
  rollback-production:
    name: Rollback Production Services
    runs-on: ubuntu-latest
    environment: production
    steps:
# Load AWS credentials into the job so later steps can call the AWS CLI.
# The region falls back to us-east-1 when the AWS_REGION secret is empty.
- name: Configure AWS credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
    aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    aws-region: ${{ secrets.AWS_REGION || 'us-east-1' }}
# Temporarily allow SSH from this runner's public IP in the production
# security group. The IP is published as the `runner-ip` step output so the
# cleanup step can revoke exactly the same rule.
- name: Get runner IP and update security group
  id: security-group
  env:
    SECURITY_GROUP_ID: ${{ secrets.PROD_SECURITY_GROUP_ID }}
  run: |
    # Get the public IP of the GitHub Actions runner. --fail keeps an HTTP
    # error page from being used as the address; whitespace is stripped so
    # the value is safe to embed in a CIDR.
    RUNNER_IP=$(curl -fsS --max-time 15 https://checkip.amazonaws.com | tr -d '[:space:]')
    if ! [[ "$RUNNER_IP" =~ ^[0-9]{1,3}(\.[0-9]{1,3}){3}$ ]]; then
      echo "::error::Could not determine the runner's public IPv4 address (got: '$RUNNER_IP')"
      exit 1
    fi
    echo "Runner IP: $RUNNER_IP"
    # Publish the IP before touching the security group so the cleanup step
    # can always revoke the rule, even if a later command fails.
    echo "runner-ip=$RUNNER_IP" >> "$GITHUB_OUTPUT"

    # Add SSH access rule for the runner IP.
    # NOTE(review): the rule opens port 22, while the SSH steps connect on
    # PROD_SSH_PORT (default 22). If that secret is set to another port this
    # rule does not cover it - confirm. The cleanup step revokes port 22 too,
    # so both must be changed together.
    echo "Adding SSH access for runner IP $RUNNER_IP to security group $SECURITY_GROUP_ID"
    if aws ec2 authorize-security-group-ingress \
      --group-id "$SECURITY_GROUP_ID" \
      --protocol tcp \
      --port 22 \
      --cidr "${RUNNER_IP}/32"; then
      echo "Security group rule added successfully"
    else
      # Deliberately non-fatal: the call also fails when an identical rule
      # already exists, in which case SSH will still work.
      echo "::warning::Could not add SSH rule for $RUNNER_IP (it may already exist); continuing"
    fi
# Expose the SSH port as the `port` step output, defaulting to 22 when the
# PROD_SSH_PORT secret is empty.
- name: Set SSH port variable
  id: ssh-port
  run: |
    RAW_PORT="${{ secrets.PROD_SSH_PORT || 22 }}"
    # Strip every whitespace character (including newlines) from the value
    CLEAN_PORT="$(printf '%s\n' "$RAW_PORT" | tr -d '[:space:]')"
    echo "port=$CLEAN_PORT" >> $GITHUB_OUTPUT
# Turn the `services` input into a space-separated list of known service
# names, published as the `services-to-rollback` step output.
# Fails the job when an unknown service is requested or nothing is selected.
- name: Validate services input
  id: validate-services
  env:
    # Passed through the environment instead of being expanded into the
    # script text, so the input cannot inject shell commands.
    REQUESTED_SERVICES: ${{ github.event.inputs.services }}
  run: |
    VALID_SERVICES="api worker beat docker redis"
    echo "Requested services: $REQUESTED_SERVICES"

    if [ "$REQUESTED_SERVICES" = "all" ]; then
      SERVICES_TO_ROLLBACK="$VALID_SERVICES"
    else
      # Validate each service in the comma-separated list
      SERVICES_TO_ROLLBACK=""
      IFS=',' read -ra SERVICE_ARRAY <<< "$REQUESTED_SERVICES"
      for service in "${SERVICE_ARRAY[@]}"; do
        service=$(printf '%s' "$service" | tr -d '[:space:]')

        # Ignore empty entries produced by stray commas ("api,,worker")
        if [ -z "$service" ]; then
          continue
        fi

        # Whole-word comparison against the allow-list. A substring or regex
        # match would accept values such as "a", "e" or ".*".
        case " $VALID_SERVICES " in
          *" $service "*) ;;
          *)
            echo "❌ Invalid service: $service"
            echo "Valid services are: $VALID_SERVICES"
            exit 1
            ;;
        esac

        # Keep each service once so it is never rolled back twice in one run
        case " $SERVICES_TO_ROLLBACK " in
          *" $service "*) ;;
          *) SERVICES_TO_ROLLBACK="${SERVICES_TO_ROLLBACK:+$SERVICES_TO_ROLLBACK }$service" ;;
        esac
      done
    fi

    # An empty selection would make the rollback a silent no-op
    if [ -z "$SERVICES_TO_ROLLBACK" ]; then
      echo "❌ No services selected for rollback"
      echo "Valid services are: $VALID_SERVICES"
      exit 1
    fi

    echo "Services to rollback: $SERVICES_TO_ROLLBACK"
    echo "services-to-rollback=$SERVICES_TO_ROLLBACK" >> "$GITHUB_OUTPUT"
# Decide how to roll back and publish the decision as step outputs:
#   rollback-method = "commit"    (plus commit-sha) when a SHA was supplied
#   rollback-method = "automatic" when the input was left blank
- name: Validate and prepare rollback target
  id: rollback-target
  env:
    # Passed through the environment instead of being expanded into the
    # script text: expansion happens before the format check below runs, so
    # an inline value could inject shell commands.
    REQUESTED_COMMIT: ${{ github.event.inputs.rollback_to_commit }}
  run: |
    # Drop whitespace that tends to come along when a SHA is pasted
    ROLLBACK_TO_COMMIT=$(printf '%s' "$REQUESTED_COMMIT" | tr -d '[:space:]')
    echo "Rollback to commit: $ROLLBACK_TO_COMMIT"

    # Determine the rollback method
    if [ -n "$ROLLBACK_TO_COMMIT" ]; then
      # Validate commit SHA format (at least 7 hexadecimal characters)
      if ! [[ "$ROLLBACK_TO_COMMIT" =~ ^[a-fA-F0-9]{7,}$ ]]; then
        echo "❌ Error: Invalid commit SHA format: $ROLLBACK_TO_COMMIT"
        echo "Commit SHA should be at least 7 hexadecimal characters (0-9, a-f)"
        exit 1
      fi
      echo "✅ Will rollback to commit SHA: $ROLLBACK_TO_COMMIT"
      echo "rollback-method=commit" >> "$GITHUB_OUTPUT"
      echo "commit-sha=$ROLLBACK_TO_COMMIT" >> "$GITHUB_OUTPUT"
    else
      echo "✅ Using automatic service rollback (no specific target specified)"
      echo "rollback-method=automatic" >> "$GITHUB_OUTPUT"
    fi
# Run the rollback on the production host over SSH.
#   commit mode:    reset the checkout to the requested commit, build and push
#                   an image tagged rollback-<short sha> (and latest), then
#                   point each selected service at that image.
#   automatic mode: `docker service rollback` for each selected service.
# User-controlled values reach the remote script through `envs` as
# environment variables rather than being expanded into the script text, so a
# free-text reason cannot inject commands on the production host.
- name: Perform production rollback
  uses: appleboy/ssh-action@v1.0.0
  env:
    REASON: ${{ github.event.inputs.reason }}
    ACTOR: ${{ github.actor }}
    SERVICES_TO_ROLLBACK: ${{ steps.validate-services.outputs.services-to-rollback }}
    ROLLBACK_METHOD: ${{ steps.rollback-target.outputs.rollback-method }}
    COMMIT_SHA: ${{ steps.rollback-target.outputs.commit-sha }}
  with:
    host: ${{ secrets.PROD_HOST }}
    username: ${{ secrets.PROD_USERNAME }}
    key: ${{ secrets.PROD_SSH_KEY }}
    port: ${{ steps.ssh-port.outputs.port }}
    envs: REASON,ACTOR,SERVICES_TO_ROLLBACK,ROLLBACK_METHOD,COMMIT_SHA
    script: |
      set -e
      echo "🔄 Starting production rollback..."
      echo "Reason: $REASON"
      echo "Requested by: $ACTOR"
      echo "Services to rollback: $SERVICES_TO_ROLLBACK"
      echo "Rollback method: $ROLLBACK_METHOD"
      if [ "$ROLLBACK_METHOD" = "commit" ]; then
        echo "Target commit SHA: $COMMIT_SHA"
      fi

      REGISTRY="${{ env.REGISTRY }}"
      IMAGE_NAME="${{ env.IMAGE_NAME }}"

      # Navigate to application directory (quoted so a path with spaces works)
      cd "${{ secrets.PROD_APP_PATH || '/opt/trends-earth-api' }}"

      # Check current service status
      echo "📊 Current service status before rollback:"
      docker service ls --filter "name=trends-earth-prod"

      # Handle commit SHA rollback by rebuilding from the specified commit
      if [ "$ROLLBACK_METHOD" = "commit" ]; then
        echo "🔄 Rolling back to commit SHA: $COMMIT_SHA"

        # Fetch latest changes and checkout the specific commit
        echo "📥 Fetching latest changes from repository..."
        git fetch origin
        echo "📋 Current branch and commit:"
        echo " Branch: $(git branch --show-current)"
        echo " Current commit: $(git rev-parse HEAD)"

        # Verify the SHA resolves to a commit object (not a tag or blob)
        if git rev-parse --verify --quiet "${COMMIT_SHA}^{commit}" >/dev/null 2>&1; then
          echo "✅ Commit $COMMIT_SHA found in repository"
        else
          echo "❌ Commit $COMMIT_SHA not found in repository"
          echo "🔍 Recent commits:"
          git log --oneline -10
          exit 1
        fi

        # Checkout the specific commit
        echo "🔄 Checking out commit $COMMIT_SHA..."
        git reset --hard "$COMMIT_SHA"
        echo "✅ Successfully checked out commit: $(git rev-parse HEAD)"
        echo "📋 Commit details:"
        git log --oneline -1

        # Configure insecure registry on server if needed
        echo "📋 Configuring insecure registry on server..."
        sudo mkdir -p /etc/docker
        if [ ! -f /etc/docker/daemon.json ] || ! grep -q "insecure-registries" /etc/docker/daemon.json; then
          # The file is replaced wholesale, so keep a copy of any existing
          # configuration to make lost settings recoverable.
          if [ -f /etc/docker/daemon.json ]; then
            sudo cp -p /etc/docker/daemon.json /etc/docker/daemon.json.bak
          fi
          echo "{\"insecure-registries\":[\"$REGISTRY\"]}" | sudo tee /etc/docker/daemon.json > /dev/null
          sudo systemctl restart docker
          sleep 10
        fi

        # Build new image from the rolled-back commit
        echo "🔨 Building Docker image from commit $COMMIT_SHA..."
        SHORT_SHA="${COMMIT_SHA:0:7}"
        ROLLBACK_TAG="rollback-$SHORT_SHA"
        PRIMARY_TAG="$REGISTRY/$IMAGE_NAME:$ROLLBACK_TAG"
        LATEST_TAG="$REGISTRY/$IMAGE_NAME:latest"

        # Build with the rollback tag
        echo "Building with tag: $PRIMARY_TAG"
        docker build -t "$PRIMARY_TAG" .
        # Tag with latest as well
        docker tag "$PRIMARY_TAG" "$LATEST_TAG"

        # Push the images to local registry
        echo "🚀 Pushing rollback images to local registry..."
        for tag in "$PRIMARY_TAG" "$LATEST_TAG"; do
          echo "Pushing: $tag"
          docker push "$tag"
        done

        # Set the image tag for service updates
        FINAL_IMAGE_TAG="$ROLLBACK_TAG"
      fi

      # Perform rollback for each service. Services whose rollback command
      # fails are collected so the final message does not claim success.
      FAILED_SERVICES=""
      for service in $SERVICES_TO_ROLLBACK; do
        SERVICE_NAME="trends-earth-prod_$service"
        echo "🔄 Rolling back service: $SERVICE_NAME"

        # Check if service exists (exact, literal name match)
        if ! docker service ls --format "{{.Name}}" | grep -Fxq "$SERVICE_NAME"; then
          echo "⚠️ Service $SERVICE_NAME not found, skipping..."
          continue
        fi

        if [ "$ROLLBACK_METHOD" = "commit" ]; then
          # Rollback to specific image (built from commit SHA).
          # NOTE(review): every selected service, including docker and redis
          # when "all" is requested, is pointed at the API image - confirm
          # that all of them really run this image.
          FULL_IMAGE="$REGISTRY/$IMAGE_NAME:$FINAL_IMAGE_TAG"
          echo "📦 Rolling back $SERVICE_NAME to image: $FULL_IMAGE"
          # For commit rollback, we just built the image so it should exist
          if docker service update --image "$FULL_IMAGE" "$SERVICE_NAME"; then
            echo "✅ Successfully updated $SERVICE_NAME to image $FULL_IMAGE"
          else
            echo "❌ Failed to update $SERVICE_NAME to image $FULL_IMAGE"
            exit 1
          fi
        else
          # Use Docker Swarm's built-in rollback functionality
          echo "⏪ Rolling back $SERVICE_NAME to previous version"
          # A rollback needs a previous spec. The json helper renders a
          # missing one as the literal "null" (a plain template prints
          # "<nil>", which a comparison with "null" never matches).
          PREVIOUS_SPEC=$(docker service inspect "$SERVICE_NAME" --format='{{json .PreviousSpec}}' 2>/dev/null || echo "null")
          if [ "$PREVIOUS_SPEC" = "null" ] || [ -z "$PREVIOUS_SPEC" ]; then
            echo "⚠️ No update history found for $SERVICE_NAME, cannot perform automatic rollback"
            echo "🔍 Current service details:"
            docker service inspect "$SERVICE_NAME" --format='{{.Spec.TaskTemplate.ContainerSpec.Image}}'
            continue
          fi
          if docker service rollback "$SERVICE_NAME"; then
            echo "✅ Successfully rolled back $SERVICE_NAME"
          else
            echo "❌ Failed to rollback $SERVICE_NAME"
            # Continue with other services rather than failing completely,
            # but remember the failure for the final summary
            FAILED_SERVICES="$FAILED_SERVICES $SERVICE_NAME"
            continue
          fi
        fi

        # Wait for service to stabilize
        echo "⏳ Waiting for $SERVICE_NAME to stabilize..."
        sleep 10
      done

      # Wait for all services to be ready after rollback
      echo "⏳ Waiting for all services to be ready after rollback..."
      max_wait=120
      wait_time=0
      while [ "$wait_time" -lt "$max_wait" ]; do
        # Count services whose replica column is neither 1/1 nor 0/1.
        # 0/1 is accepted because the migrate service runs once and exits.
        # No header line is printed, so nothing but services is counted.
        # NOTE(review): a crashed single-replica service also shows 0/1 and
        # is not detected here; the later health-check step covers the API.
        pending_services=$(docker service ls --filter "name=trends-earth-prod" --format "{{.Name}} {{.Replicas}}" | awk '$2 != "1/1" && $2 != "0/1" { n++ } END { print n + 0 }')
        if [ "$pending_services" -eq 0 ]; then
          echo "✅ All services are running after rollback"
          break
        fi
        echo "⏳ Waiting for services to be ready... ($wait_time/$max_wait seconds)"
        docker service ls --filter "name=trends-earth-prod"
        sleep 10
        wait_time=$((wait_time + 10))
      done
      if [ "$wait_time" -ge "$max_wait" ]; then
        echo "⚠️ Some services may not be fully ready after $max_wait seconds"
      fi

      # Final service status
      echo "📊 Final service status after rollback:"
      docker service ls --filter "name=trends-earth-prod"

      if [ -n "$FAILED_SERVICES" ]; then
        echo "⚠️ Production rollback finished, but these services failed to roll back:$FAILED_SERVICES"
      else
        echo "🎉 Production rollback completed!"
      fi
# Poll the API health endpoint on the production host (up to 20 attempts,
# 15 s apart). Fails the job, after printing service diagnostics, when the
# endpoint never answers successfully.
- name: Verify rollback health
  uses: appleboy/ssh-action@v1.0.0
  with:
    host: ${{ secrets.PROD_HOST }}
    username: ${{ secrets.PROD_USERNAME }}
    key: ${{ secrets.PROD_SSH_KEY }}
    port: ${{ steps.ssh-port.outputs.port }}
    script: |
      set -e
      echo "🏥 Performing health check after rollback..."

      # Navigate to application directory
      cd "${{ secrets.PROD_APP_PATH || '/opt/trends-earth-api' }}"

      max_attempts=20
      attempt=1
      while [ "$attempt" -le "$max_attempts" ]; do
        echo "⏳ Health check attempt $attempt/$max_attempts..."

        # First check if port is listening (informational only)
        if ! nc -z 127.0.0.1 3001 2>/dev/null; then
          echo "⚠️ Port 3001 is not listening yet"
        else
          echo "✅ Port 3001 is listening"
        fi

        # Perform health check request. curl runs as the `if` condition on
        # purpose: as a bare assignment, a failing curl would trip `set -e`
        # and abort the script on the first failed attempt, so the loop
        # would never retry.
        if health_response=$(curl -f -s --max-time 30 -w "HTTP_CODE:%{http_code}" http://127.0.0.1:3001/api-health 2>&1); then
          echo "✅ Health check passed after rollback"
          echo "Response: $health_response"
          break
        else
          curl_exit_code=$?
          echo "⏳ Health check failed with exit code $curl_exit_code"
          echo "Response: $health_response"
        fi

        sleep 15
        attempt=$((attempt + 1))
      done

      # attempt only exceeds max_attempts when no probe ever succeeded
      if [ "$attempt" -gt "$max_attempts" ]; then
        echo "❌ Health check failed after rollback - $max_attempts attempts"
        echo "🔍 Debugging information:"
        echo " - All running containers:"
        docker ps --filter "name=trends-earth-prod"
        echo " - API service details:"
        docker service inspect trends-earth-prod_api --format="{{json .Spec.TaskTemplate.ContainerSpec}}" || echo "Service not found"
        echo " - Recent API service logs:"
        docker service logs --tail 50 trends-earth-prod_api || echo "No logs available"
        exit 1
      fi

      echo "🎉 Rollback health verification completed successfully!"
# Smoke-test the rolled-back API from the production host: the health
# endpoint, the docs endpoint, and one API call that must answer 200/401/403
# (with swagger.json as a connectivity fallback).
- name: Run basic integration tests after rollback
  uses: appleboy/ssh-action@v1.0.0
  with:
    host: ${{ secrets.PROD_HOST }}
    username: ${{ secrets.PROD_USERNAME }}
    key: ${{ secrets.PROD_SSH_KEY }}
    port: ${{ steps.ssh-port.outputs.port }}
    script: |
      set -e
      echo "🧪 Running basic integration tests after rollback..."

      # Navigate to application directory
      cd "${{ secrets.PROD_APP_PATH || '/opt/trends-earth-api' }}"

      # Test API health endpoint
      echo "⏳ Testing health endpoint..."
      if curl -f -s http://127.0.0.1:3001/api-health >/dev/null 2>&1; then
        echo "✅ Health endpoint working"
      else
        echo "❌ Health endpoint failed"
        exit 1
      fi

      # Test API documentation endpoint
      echo "⏳ Testing API documentation endpoint..."
      if curl -f -s http://127.0.0.1:3001/api/docs/ >/dev/null 2>&1; then
        echo "✅ API documentation endpoint working"
      else
        echo "❌ API documentation endpoint failed"
        exit 1
      fi

      # Test basic API functionality.
      # `|| true` keeps `set -e` from aborting on a transport error, so the
      # status check and the fallback below still run and report the failure.
      echo "⏳ Testing basic API functionality..."
      api_response=$(curl -s -w "HTTP_CODE:%{http_code}" http://127.0.0.1:3001/api/v1/user/me 2>&1) || true
      echo "🔍 API response: $api_response"
      if echo "$api_response" | grep -q "HTTP_CODE:200\|HTTP_CODE:401\|HTTP_CODE:403"; then
        echo "✅ API responding correctly after rollback"
      else
        echo "❌ API not responding correctly: $api_response"
        echo "🔍 Testing alternative endpoint for basic connectivity..."
        swagger_response=$(curl -s -w "HTTP_CODE:%{http_code}" http://127.0.0.1:3001/swagger.json 2>&1) || true
        echo "🔍 Swagger response: $swagger_response"
        if echo "$swagger_response" | grep -q "HTTP_CODE:200"; then
          echo "✅ API is responding (swagger endpoint working)"
        else
          echo "❌ API completely unresponsive after rollback"
          exit 1
        fi
      fi

      echo "✅ All basic integration tests passed after rollback"
# Record the rollback as a deploy in Rollbar. Runs only when every earlier
# step succeeded. All dynamic values reach the script through environment
# variables and the JSON body is built with jq, so quotes or shell
# metacharacters in the free-text reason cannot break the payload or inject
# commands.
- name: Notify Rollbar of rollback
  if: success()
  env:
    ROLLBAR_ACCESS_TOKEN: ${{ secrets.ROLLBAR_SERVER_ACCESS_TOKEN }}
    REVISION: ${{ github.sha }}
    LOCAL_USERNAME: ${{ github.actor }}
    ROLLBACK_METHOD: ${{ steps.rollback-target.outputs.rollback-method }}
    COMMIT_SHA: ${{ steps.rollback-target.outputs.commit-sha }}
    REASON: ${{ github.event.inputs.reason }}
    SERVICES: ${{ github.event.inputs.services }}
  run: |
    echo "🔔 Notifying Rollbar of rollback..."

    if [ -z "$ROLLBAR_ACCESS_TOKEN" ]; then
      echo "::warning::ROLLBAR_SERVER_ACCESS_TOKEN is not set; skipping Rollbar notification"
      exit 0
    fi

    ENVIRONMENT="production"

    # Build comment based on rollback method
    if [ "$ROLLBACK_METHOD" = "commit" ]; then
      COMMENT="ROLLBACK TO COMMIT: $COMMIT_SHA - Reason: $REASON - Services: $SERVICES - Triggered by: $LOCAL_USERNAME"
    else
      COMMENT="ROLLBACK (automatic): $REASON - Services: $SERVICES - Triggered by: $LOCAL_USERNAME"
    fi

    # jq escapes every value, so the body is valid JSON whatever the reason
    # text contains.
    # NOTE(review): revision is the SHA the workflow ran from, not the commit
    # rolled back to - confirm that is what Rollbar should record.
    PAYLOAD=$(jq -n \
      --arg access_token "$ROLLBAR_ACCESS_TOKEN" \
      --arg environment "$ENVIRONMENT" \
      --arg revision "$REVISION" \
      --arg local_username "$LOCAL_USERNAME" \
      --arg comment "$COMMENT" \
      '{
        access_token: $access_token,
        environment: $environment,
        revision: $revision,
        local_username: $local_username,
        comment: $comment
      }')

    # --fail turns an HTTP error response into a non-zero exit, so the
    # success message below is printed only when Rollbar accepted the call.
    if ! curl --fail --silent --show-error -X POST 'https://api.rollbar.com/api/1/deploy/' \
      -H 'Content-Type: application/json' \
      -d "$PAYLOAD"; then
      echo "::error::Rollbar rejected the rollback notification or could not be reached"
      exit 1
    fi

    echo "✅ Rollbar rollback notification sent successfully"
# Always runs, even after a failure: revoke the temporary SSH rule that was
# added for this runner's IP.
- name: Cleanup security group access
  if: always()
  env:
    SECURITY_GROUP_ID: ${{ secrets.PROD_SECURITY_GROUP_ID }}
    RUNNER_IP: ${{ steps.security-group.outputs.runner-ip }}
  run: |
    # Remove the SSH access rule for the runner IP
    if [ -n "$RUNNER_IP" ]; then
      echo "Removing SSH access for runner IP $RUNNER_IP from security group $SECURITY_GROUP_ID"
      if aws ec2 revoke-security-group-ingress \
        --group-id "$SECURITY_GROUP_ID" \
        --protocol tcp \
        --port 22 \
        --cidr "${RUNNER_IP}/32"; then
        echo "Security group rule removed successfully"
      else
        # Kept non-fatal so cleanup never masks the real job result, but a
        # rule left behind keeps SSH open to this IP, so surface it clearly.
        echo "::warning::Could not remove SSH rule for $RUNNER_IP from the security group; check that no stale rule remains"
      fi
    else
      echo "No runner IP found to remove from security group"
    fi