Add automatic retry of failed workflows to improve E2E stability (#51627)

cipolleschi · facebook-github-bot · commit 4b43f09e6434 · 2025-05-28T06:50:28.000-07:00
Summary: These changes are inspired by https://stackoverflow.com/a/78314483 and they should help with the stability of E2E tests on main. Most of the time, those tests fail because of flakiness in the E2E infrastructure on GHA. Usually, rerunning the tests manually makes the workflow pass. This pair of jobs automatically reruns the workflow up to 3 times in case one of the E2E tests fails. ## Changelog: [Internal] - improve CI by rerunning the workflow if the E2E tests fail Reviewed By: cortinico Differential Revision: D75449445
diff --git a/.github/workflows/retry-workflow.yml b/.github/workflows/retry-workflow.yml
@@ -0,0 +1,26 @@
+name: Retry workflow
+# Based on https://stackoverflow.com/a/78314483
+# Waits for the given workflow run to complete, then re-runs only its failed jobs.
+
+on:
+    workflow_dispatch:
+        inputs:
+            run_id:
+                description: ID of the workflow run whose failed jobs should be re-run
+                required: true
+                type: string
+jobs:
+    rerun:
+        runs-on: ubuntu-latest
+        steps:
+            - name: rerun ${{ inputs.run_id }}
+              env:
+                  GH_REPO: ${{ github.repository }}
+                  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+                  # Hand the input to the script through the environment instead of
+                  # expanding it inside the shell source, so it cannot inject commands.
+                  RUN_ID: ${{ inputs.run_id }}
+              run: |
+                  # Block until the target run has finished (output discarded), then rerun its failed jobs.
+                  gh run watch "$RUN_ID" > /dev/null 2>&1
+                  gh run rerun "$RUN_ID" --failed
diff --git a/.github/workflows/test-all.yml b/.github/workflows/test-all.yml
@@ -187,7 +187,7 @@ jobs:
           flavor: ${{ matrix.flavor }}
 
   test_e2e_ios_rntester:
-    if: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
+    # if: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
     runs-on: macos-14-large
     needs:
       [test_ios_rntester]
@@ -221,7 +221,7 @@ jobs:
           flavor: ${{ matrix.flavor }}
 
   test_e2e_ios_templateapp:
-    if: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
+    # if: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
     runs-on: macos-14-large
     needs: [build_npm_package, prebuild_apple_dependencies]
     env:
@@ -312,7 +312,7 @@ jobs:
           architecture: ${{ matrix.architecture }}
 
   test_e2e_android_templateapp:
-    if: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
+    # if: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
     runs-on: 4-core-ubuntu
     needs: build_npm_package
     continue-on-error: true
@@ -439,11 +439,11 @@ jobs:
         uses: ./.github/actions/build-android
         with:
           release-type: ${{ needs.set_release_type.outputs.RELEASE_TYPE }}
-          run-e2e-tests: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
+          run-e2e-tests: true # ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
           gradle-cache-encryption-key: ${{ secrets.GRADLE_CACHE_ENCRYPTION_KEY }}
 
   test_e2e_android_rntester:
-    if: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
+    # if: ${{ github.ref == 'refs/heads/main' || contains(github.ref,  'stable') || inputs.run-e2e-tests }}
     runs-on: 4-core-ubuntu
     needs: [build_android]
     strategy:
@@ -632,3 +632,26 @@ jobs:
         uses: ./.github/actions/lint
         with:
           github-token: ${{ env.GH_TOKEN }}
+
+  # This job should help with the E2E flakiness.
+  # In case the E2E tests fail, it launches a new retry-workflow workflow, passing the current run_id as input.
+  # The retry-workflow reruns only the failed jobs of the current test-all workflow using
+  # ```
+  # gh run rerun ${{ inputs.run_id }} --failed
+  # ```
+  # From https://stackoverflow.com/a/78314483 it seems that adding the extra workflow
+  # rather than calling this command directly should improve the stability of this solution.
+  # This is exactly the same as rerunning failed tests from the GH UI, but automated.
+  rerun-failed-jobs:
+    runs-on: ubuntu-latest
+    needs: [test_e2e_ios_rntester, test_e2e_android_rntester, test_e2e_ios_templateapp, test_e2e_android_templateapp]
+    # Keep the whole condition inside a single ${{ }}: mixing bare expression text with an
+    # embedded ${{ }} is not evaluated as one boolean expression. Retries stop after the 3rd attempt.
+    if: ${{ failure() && (github.ref == 'refs/heads/main' || contains(github.ref, 'stable') || inputs.run-e2e-tests) && fromJSON(github.run_attempt) < 3 }}
+    steps:
+      - name: Rerun failed jobs in the current workflow
+        env:
+          # This job has no checkout step, so gh must be told which repository to target.
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+        run: gh workflow run ./.github/workflows/retry-workflow.yml -F run_id=${{ github.run_id }}