From a0699f787f1205f7f7689f55ae664fd20110a0f0 Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 27 Aug 2024 11:55:52 -0700 Subject: [PATCH 01/10] Run stress tests --- .github/workflows/stress-test.yml | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/stress-test.yml diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml new file mode 100644 index 000000000000..26c24d2729d7 --- /dev/null +++ b/.github/workflows/stress-test.yml @@ -0,0 +1,39 @@ +# Stress Tests with multiple runners +name: Stress Test +# Run on pull_request that is labeled as "optional_ci_tpu" or workflow dispatch +on: + pull_request: + branches: + - main + types: [labeled, synchronize] + workflow_dispatch: +# Cancel any previous iterations if a new commit is pushed. +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true +jobs: + cloud-stress-test: + name: "Stress Test" + strategy: + fail-fast: false # don't cancel all jobs on failure + matrix: + instances: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + runners: ["arc-linux-x86-n2-64", "arc-linux-x86-n2-128", "arc-linux-arm64-t2a-48", "arc-linux-x86-g2-96-l4-8gpu", "arc-linux-x86-ct5lp-224-8tpu"] + env: + ENABLE_PJRT_COMPATIBILITY: 1 + runs-on: ${{ matrix.runners }} + container: + image: ${{ contains(matrix.runners, "t2a") && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/build-arm64:jax-latest-multi-python') || ('index.docker.io/tensorflow/build@sha256:7fb38f0319bda36393cad7f40670aa22352b44421bb906f5cf34d543acd8e1d2') }} + timeout-minutes: 15 + defaults: + run: + shell: bash -ex {0} + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # ratchet:actions/checkout@v4 + - name: Install JAX test requirements + run: | + pip install -U -r build/test-requirements.txt + - name: DEBUG HALT + run: | + echo "Halting" + sleep 5m \ No newline at end of file From 1fce65a37a560da9cbf21bfdb0e951f7cee8ec47 Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 27 Aug 2024 11:56:26 -0700 Subject: [PATCH 02/10] Run stress tests --- .github/workflows/stress-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml index 26c24d2729d7..69ca308b21a7 100644 --- a/.github/workflows/stress-test.yml +++ b/.github/workflows/stress-test.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false # don't cancel all jobs on failure matrix: - instances: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + instances: [1, 2, 3] runners: ["arc-linux-x86-n2-64", "arc-linux-x86-n2-128", "arc-linux-arm64-t2a-48", "arc-linux-x86-g2-96-l4-8gpu", "arc-linux-x86-ct5lp-224-8tpu"] env: ENABLE_PJRT_COMPATIBILITY: 1 From dace222be469f343b8ce209f7e49c8a573642ff0 Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 27 Aug 2024 12:13:20 -0700 Subject: [PATCH 03/10] Add logging --- .github/workflows/stress-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml index 69ca308b21a7..6dd609c26d4e 100644 --- a/.github/workflows/stress-test.yml +++ b/.github/workflows/stress-test.yml @@ -13,7 +13,7 @@ concurrency: cancel-in-progress: true jobs: cloud-stress-test: - name: "Stress Test" + name: "Stress Test ${{ matrix.runners }} - ${{ matrix.instances }}" strategy: fail-fast: false # don't cancel all jobs on failure matrix: From d9e4c476ecdb8ff27f653b2fa3a7abcfb41f8a8b Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 27 Aug 2024 12:27:06 -0700 Subject: [PATCH 04/10] Make stress test run --- .github/workflows/stress-test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml index 6dd609c26d4e..12eacb659133 100644 --- a/.github/workflows/stress-test.yml +++ b/.github/workflows/stress-test.yml @@ -5,8 +5,6 @@ on: pull_request: branches: - main - types: [labeled, synchronize] - workflow_dispatch: # Cancel any previous iterations if a new commit is pushed. concurrency: group: ${{ github.workflow }}-${{ github.ref }} From eb6df847aae8fd8591cf97eb2d37200ec07a52be Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 27 Aug 2024 12:28:53 -0700 Subject: [PATCH 05/10] Fix stress test --- .github/workflows/stress-test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml index 12eacb659133..6dd609c26d4e 100644 --- a/.github/workflows/stress-test.yml +++ b/.github/workflows/stress-test.yml @@ -5,6 +5,8 @@ on: pull_request: branches: - main + types: [labeled, synchronize] + workflow_dispatch: # Cancel any previous iterations if a new commit is pushed. concurrency: group: ${{ github.workflow }}-${{ github.ref }} From 1b7ab9002a8afe9b3f9576f477a16431e663f8cc Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 27 Aug 2024 13:11:05 -0700 Subject: [PATCH 06/10] Fix stress test to run at 11pm every night --- .github/workflows/stress-test.yml | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml index 6dd609c26d4e..32ebcebd65b8 100644 --- a/.github/workflows/stress-test.yml +++ b/.github/workflows/stress-test.yml @@ -1,30 +1,26 @@ -# Stress Tests with multiple runners -name: Stress Test -# Run on pull_request that is labeled as "optional_ci_tpu" or workflow dispatch +# Nightly Stress Test for self-hosted runners +name: Self-hosted Runners Nightly Stress Test on: - pull_request: - branches: - - main - types: [labeled, synchronize] - workflow_dispatch: + schedule: + # Triggers at 11pm every night. + - cron: '0 23 * * *' # Cancel any previous iterations if a new commit is pushed. concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: - cloud-stress-test: + nightly-stress-test: name: "Stress Test ${{ matrix.runners }} - ${{ matrix.instances }}" strategy: fail-fast: false # don't cancel all jobs on failure matrix: - instances: [1, 2, 3] + instances: [1, 2, 3, 4, 5] runners: ["arc-linux-x86-n2-64", "arc-linux-x86-n2-128", "arc-linux-arm64-t2a-48", "arc-linux-x86-g2-96-l4-8gpu", "arc-linux-x86-ct5lp-224-8tpu"] - env: - ENABLE_PJRT_COMPATIBILITY: 1 + # TODO: Needs final runs-on value runs-on: ${{ matrix.runners }} container: - image: ${{ contains(matrix.runners, "t2a") && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/build-arm64:jax-latest-multi-python') || ('index.docker.io/tensorflow/build@sha256:7fb38f0319bda36393cad7f40670aa22352b44421bb906f5cf34d543acd8e1d2') }} - timeout-minutes: 15 + image: ${{ (contains(matrix.runners, 't2a') && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/build-arm64:jax-latest-multi-python') || 'index.docker.io/tensorflow/build@sha256:7fb38f0319bda36393cad7f40670aa22352b44421bb906f5cf34d543acd8e1d2' }} + timeout-minutes: 10 defaults: run: shell: bash -ex {0} From c0cd0c55b62574c1a6938bc6d79d2e06bf8ddaba Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 27 Aug 2024 13:13:20 -0700 Subject: [PATCH 07/10] Fix formatting --- .github/workflows/stress-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml index 32ebcebd65b8..75bc7937bf95 100644 --- a/.github/workflows/stress-test.yml +++ b/.github/workflows/stress-test.yml @@ -32,4 +32,4 @@ jobs: - name: DEBUG HALT run: | echo "Halting" - sleep 5m \ No newline at end of file + sleep 5m From 766a39c4e0a4bcdea9315fc1d89da7847dd086ae Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Wed, 28 Aug 2024 12:45:51 -0700 Subject: [PATCH 08/10] Address comments --- .github/workflows/stress-test.yml | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml index 75bc7937bf95..8bf2ca39884c 100644 --- a/.github/workflows/stress-test.yml +++ b/.github/workflows/stress-test.yml @@ -1,16 +1,17 @@ -# Nightly Stress Test for self-hosted runners -name: Self-hosted Runners Nightly Stress Test +# Nightly Scale Test for self-hosted runners +name: Self-hosted Runners Nightly Scale Test on: schedule: - # Triggers at 11pm every night. - - cron: '0 23 * * *' + # Triggers at 6AM UTC, which is 11PM PST. + - cron: '0 06 * * *' + workflow_dispatch: # Cancel any previous iterations if a new commit is pushed. concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: - nightly-stress-test: - name: "Stress Test ${{ matrix.runners }} - ${{ matrix.instances }}" + nightly-scale-test: + name: "Scale Test ${{ matrix.runners }} - ${{ matrix.instances }}" strategy: fail-fast: false # don't cancel all jobs on failure matrix: @@ -26,9 +27,6 @@ jobs: shell: bash -ex {0} steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # ratchet:actions/checkout@v4 - - name: Install JAX test requirements - run: | - pip install -U -r build/test-requirements.txt - name: DEBUG HALT run: | echo "Halting" From eb58999480925dbecfdbf4bef464a4188eaf55c8 Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Fri, 30 Aug 2024 09:46:39 -0700 Subject: [PATCH 09/10] Update name for runners --- .github/workflows/stress-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml index 8bf2ca39884c..f86c33214775 100644 --- a/.github/workflows/stress-test.yml +++ b/.github/workflows/stress-test.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false # don't cancel all jobs on failure matrix: instances: [1, 2, 3, 4, 5] - runners: ["arc-linux-x86-n2-64", "arc-linux-x86-n2-128", "arc-linux-arm64-t2a-48", "arc-linux-x86-g2-96-l4-8gpu", "arc-linux-x86-ct5lp-224-8tpu"] + runners: ["linux-x86-n2-64", "linux-x86-n2-128", "linux-arm64-t2a-48", "linux-x86-g2-96-l4-8gpu", "linux-x86-ct5lp-224-8tpu"] # TODO: Needs final runs-on value runs-on: ${{ matrix.runners }} container: From 5132a5ee64ea6443e817a650a1a560578c974719 Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Fri, 30 Aug 2024 11:16:22 -0700 Subject: [PATCH 10/10] Fix wait-for-connection-test workflow --- .github/workflows/wait-for-connection-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wait-for-connection-test.yaml b/.github/workflows/wait-for-connection-test.yaml index 6770d7d543fe..cf746a73ad69 100644 --- a/.github/workflows/wait-for-connection-test.yaml +++ b/.github/workflows/wait-for-connection-test.yaml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - runner: ["arc-linux-x86-n2-64","arc-linux-arm64-t2a-48"] + runner: ["linux-x86-n2-64","linux-arm64-t2a-48"] instances: ["1"] runs-on: ${{ matrix.runner }} timeout-minutes: 60