Skip to content

Commit 82f7d10

Browse files
authored
Nvidia-runners S3 cache implementation (#2053)
1 parent 5ce0ae2 commit 82f7d10

3 files changed

Lines changed: 141 additions & 12 deletions

File tree

.github/actions/build-container/action.yml

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,18 @@ inputs:
5454
description: "URL of the Bazel remote cache to use for building the image"
5555
required: true
5656
default: ""
57+
# Extra BuildKit entitlements. The value is passed through unchanged to the
# `allow` option of both docker/build-push-action steps in this action.
docker-build-allow:
  description: "Additional BuildKit entitlements to allow during docker build"
  required: false
  default: ""
61+
# Network mode for the build. This action uses the value in three places: as
# the suffix of the buildx builder name, as the builder's `network=` driver
# option, and as the `network` option of both docker/build-push-action steps.
docker-build-network:
  description: "Networking mode for RUN instructions during docker build"
  required: false
  default: ""
65+
# When non-empty, the final build is given a named build context of the form
# `<name>=docker-image://<upload image>@<mealkit digest>`, and `load` is
# turned off for the mealkit build (it is only enabled when this is empty).
registry-build-context-name:
  description: "Optional named build context to source from a runner registry for the final image handoff"
  required: false
  default: ""
5769

5870
outputs:
5971
DOCKER_TAG_MEALKIT:
@@ -72,6 +84,7 @@ runs:
7284
run: |
7385
echo 'UPLD_IMAGE=ghcr.io/nvidia/jax-toolbox-internal' >> $GITHUB_ENV
7486
echo "BADGE_FILENAME_FULL=${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json" >> $GITHUB_ENV
87+
echo "BUILDX_BUILDER_NAME=jax-toolbox-${{ inputs.CONTAINER_NAME }}-${{ inputs.ARCHITECTURE }}-${{ inputs.docker-build-network }}" >> $GITHUB_ENV
7588
7689
- name: Setup SSH
7790
id: setup-ssh
@@ -90,9 +103,14 @@ runs:
90103
- name: Set up Docker Buildx
91104
uses: docker/setup-buildx-action@v4
92105
with:
106+
name: ${{ env.BUILDX_BUILDER_NAME }}
93107
driver-opts: |
94-
image=moby/buildkit:v0.12.1
95-
version: v0.30.1
108+
image=moby/buildkit:v0.13.2
109+
network=${{ inputs.docker-build-network }}
110+
keep-state: true
111+
# TODO: check whether we need the oci.worker specification
112+
113+
96114

97115
- name: Download nsys-jax version.py
98116
uses: actions/download-artifact@v8
@@ -124,7 +142,9 @@ runs:
124142
id: mealkit-build
125143
uses: docker/build-push-action@v7
126144
with:
145+
builder: ${{ env.BUILDX_BUILDER_NAME }}
127146
context: ${{ inputs.DOCKER_CONTEXT }}
147+
load: ${{inputs.registry-build-context-name == ''}}
128148
push: true
129149
file: ${{ inputs.DOCKERFILE }}
130150
platforms: linux/${{ inputs.ARCHITECTURE }}
@@ -134,6 +154,8 @@ runs:
134154
ssh: default
135155
secret-files: |
136156
"SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}"
157+
allow: ${{ inputs.docker-build-allow }}
158+
network: ${{ inputs.docker-build-network }}
137159
build-args: |
138160
BASE_IMAGE=${{ inputs.BASE_IMAGE }}
139161
BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}
@@ -157,7 +179,9 @@ runs:
157179
id: final-build
158180
uses: docker/build-push-action@v7
159181
with:
182+
builder: ${{ env.BUILDX_BUILDER_NAME }}
160183
context: ${{ inputs.DOCKER_CONTEXT }}
184+
build-contexts: ${{ inputs.registry-build-context-name != '' && format('{0}=docker-image://{1}@{2}', inputs.registry-build-context-name, env.UPLD_IMAGE, steps.mealkit-build.outputs.digest) || '' }}
161185
push: true
162186
file: ${{ inputs.DOCKERFILE }}
163187
platforms: linux/${{ inputs.ARCHITECTURE }}
@@ -167,6 +191,8 @@ runs:
167191
ssh: default
168192
secret-files: |
169193
"SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}"
194+
allow: ${{ inputs.docker-build-allow }}
195+
network: ${{ inputs.docker-build-network }}
170196
build-args: |
171197
BASE_IMAGE=${{ inputs.BASE_IMAGE }}
172198
BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}

.github/workflows/_ci.yaml

Lines changed: 112 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ concurrency:
4343
permissions:
4444
contents: read # to fetch code
4545
actions: write # to cancel previous workflows
46+
id-token: write # to assume the AWS role for bazel-remote cache
4647
packages: write # to upload container
4748

4849
jobs:
@@ -65,10 +66,90 @@ jobs:
6566

6667
build-jax:
6768
needs: build-base
68-
runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
69+
runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu32m' || 'linux-arm64-cpu32m' }}
6970
steps:
7071
- name: Checkout repository
7172
uses: actions/checkout@v6
73+
# Assume the S3 cache role; the workflow grants `id-token: write` for this.
# The next step expects AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and
# AWS_SESSION_TOKEN to be present in the environment afterwards.
- name: Configure AWS credentials via OIDC
  uses: aws-actions/configure-aws-credentials@v4
  with:
    role-to-assume: ${{ secrets.S3_CACHE_ROLE }}
    aws-region: ${{ secrets.AWS_REGION }}
78+
# Start a bazel-remote container on the runner, backed by the S3 bucket and
# published on 127.0.0.1:${BAZEL_REMOTE_PORT}. The AWS credentials exported by
# the previous step are handed to it through a read-only credentials file.
- name: Start bazel-remote sidecar
  env:
    BAZEL_REMOTE_BUCKET: ${{ secrets.BAZEL_REMOTE_BUCKET }}
    BAZEL_REMOTE_AWS_DIR: /tmp/bazel-remote-aws
    BAZEL_REMOTE_CONTAINER: bazel-remote-cache-${{ inputs.ARCHITECTURE }}
    BAZEL_REMOTE_DISK_DIR: /tmp/bazel-remote-cache
    BAZEL_REMOTE_DIAGNOSTICS_DIR: /tmp/bazel-remote-diagnostics
    BAZEL_REMOTE_PORT: 9090
    BAZEL_REMOTE_PREFIX: bazel-remote/${{ inputs.ARCHITECTURE }}
    BAZEL_REMOTE_REGION: ${{ secrets.AWS_REGION }}
  run: |
    # Tracing is enabled only after the credential presence checks: under
    # `set -x` the expanded "${VAR:?...}" words would be echoed to the job log.
    set -euo pipefail
    : "${AWS_ACCESS_KEY_ID:?missing AWS_ACCESS_KEY_ID}"
    : "${AWS_SECRET_ACCESS_KEY:?missing AWS_SECRET_ACCESS_KEY}"
    : "${AWS_SESSION_TOKEN:?missing AWS_SESSION_TOKEN}"
    set -x

    BAZEL_REMOTE_AWS_CREDENTIALS="${BAZEL_REMOTE_AWS_DIR}/credentials"
    mkdir -p "${BAZEL_REMOTE_DIAGNOSTICS_DIR}" "${BAZEL_REMOTE_DISK_DIR}" "${BAZEL_REMOTE_AWS_DIR}"
    BAZEL_REMOTE_UID="$(id -u)"
    BAZEL_REMOTE_GID="$(id -g)"

    # Write the credentials file with tracing off and an owner-only umask so
    # the secret values are neither logged nor readable by other users.
    set +x
    umask 077
    {
      printf '[default]\n'
      printf 'aws_access_key_id=%s\n' "${AWS_ACCESS_KEY_ID}"
      printf 'aws_secret_access_key=%s\n' "${AWS_SECRET_ACCESS_KEY}"
      printf 'aws_session_token=%s\n' "${AWS_SESSION_TOKEN}"
    } > "${BAZEL_REMOTE_AWS_CREDENTIALS}"
    set -x

    # Remove any container left over under the same name before starting.
    docker rm -f "${BAZEL_REMOTE_CONTAINER}" >/dev/null 2>&1 || true

    # The container runs with the current user's uid:gid (presumably so it can
    # read the owner-only credentials file and write to the mounted cache
    # directory; confirm). The cache listens on 8080 inside the container and
    # is published on the loopback interface only.
    # NOTE(review): the image has no tag or digest, so whatever is currently
    # published under the default tag receives the AWS credentials. Consider
    # pinning it.
    docker run -d \
      --name "${BAZEL_REMOTE_CONTAINER}" \
      --user "${BAZEL_REMOTE_UID}:${BAZEL_REMOTE_GID}" \
      --publish "127.0.0.1:${BAZEL_REMOTE_PORT}:8080" \
      --volume "${BAZEL_REMOTE_DISK_DIR}:/data" \
      --volume "${BAZEL_REMOTE_AWS_CREDENTIALS}:/aws-config/credentials:ro" \
      buchgr/bazel-remote-cache \
      --dir /data \
      --max_size 50 \
      --http_address=0.0.0.0:8080 \
      --s3.auth_method=aws_credentials_file \
      --s3.aws_profile=default \
      --s3.aws_shared_credentials_file=/aws-config/credentials \
      --s3.bucket="${BAZEL_REMOTE_BUCKET}" \
      --s3.endpoint="s3.${BAZEL_REMOTE_REGION}.amazonaws.com" \
      --s3.prefix="${BAZEL_REMOTE_PREFIX}" \
      --s3.region="${BAZEL_REMOTE_REGION}" \
      --s3.update_timestamps
125+
# Wait for the sidecar's /status endpoint to answer, then probe the CAS
# endpoint once. Both responses are saved in the diagnostics directory.
- name: Verify bazel-remote sidecar
  env:
    BAZEL_REMOTE_CONTAINER: bazel-remote-cache-${{ inputs.ARCHITECTURE }}
    BAZEL_REMOTE_DIAGNOSTICS_DIR: /tmp/bazel-remote-diagnostics
    BAZEL_REMOTE_PORT: 9090
  run: |
    set -euxo pipefail

    base_url="http://127.0.0.1:${BAZEL_REMOTE_PORT}"

    # Print the container list and the sidecar's logs, then fail the step.
    abort_with_logs() {
      docker ps -a
      docker logs "${BAZEL_REMOTE_CONTAINER}" || true
      exit 1
    }

    # Poll /status up to 30 times, one second apart.
    is_ready=0
    for _ in $(seq 1 30); do
      if curl -sSf "${base_url}/status" > "${BAZEL_REMOTE_DIAGNOSTICS_DIR}/status-before.json"; then
        is_ready=1
        break
      fi
      # Give up early once the container is no longer running; an inspect
      # failure is treated the same as "not running".
      running_state="$(docker inspect -f '{{.State.Running}}' "${BAZEL_REMOTE_CONTAINER}" 2>/dev/null || echo false)"
      [[ "${running_state}" == "true" ]] || abort_with_logs
      sleep 1
    done
    [[ "${is_ready}" == "1" ]] || abort_with_logs

    # HEAD request for the CAS entry keyed by the SHA-256 of empty input; with
    # `-f` any HTTP error status fails the step.
    curl -sSIf "${base_url}/cas/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" \
      > "${BAZEL_REMOTE_DIAGNOSTICS_DIR}/empty-cas-head.txt"
72153
- name: Build JAX container
73154
id: build-jax
74155
uses: ./.github/actions/build-container
@@ -84,19 +165,44 @@ jobs:
84165
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
85166
ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
86167
github-token: ${{ secrets.GITHUB_TOKEN }}
87-
bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
168+
bazel-remote-cache-url: http://127.0.0.1:9090
169+
docker-build-allow: network.host
170+
docker-build-network: host
171+
registry-build-context-name: mealkit
88172
EXTRA_BUILD_ARGS: |
89173
URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }}
90174
URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }}
91175
URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }}
92176
URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
177+
# Runs even when earlier steps failed. Saves the sidecar's /status response
# and its container log; both commands are best-effort and never fail the step.
- name: Collect bazel-remote diagnostics
  if: always()
  env:
    BAZEL_REMOTE_CONTAINER: bazel-remote-cache-${{ inputs.ARCHITECTURE }}
    BAZEL_REMOTE_DIAGNOSTICS_DIR: /tmp/bazel-remote-diagnostics
    BAZEL_REMOTE_PORT: 9090
  run: |
    set -euxo pipefail
    # Recreate the directory in case the step that normally creates it never ran.
    mkdir -p "${BAZEL_REMOTE_DIAGNOSTICS_DIR}"
    curl -sSf "http://127.0.0.1:${BAZEL_REMOTE_PORT}/status" \
      > "${BAZEL_REMOTE_DIAGNOSTICS_DIR}/status-after.json" || true
    docker logs "${BAZEL_REMOTE_CONTAINER}" \
      > "${BAZEL_REMOTE_DIAGNOSTICS_DIR}/container.log" 2>&1 || true
188+
# Publish the diagnostics directory as a per-architecture artifact, whether
# or not the build succeeded.
- name: Upload bazel-remote diagnostics
  if: always()
  uses: actions/upload-artifact@v6
  with:
    name: bazel-remote-diagnostics-${{ inputs.ARCHITECTURE }}
    path: /tmp/bazel-remote-diagnostics
194+
# Always remove the sidecar container and delete the directory holding the
# AWS credentials file, regardless of how the job ended.
- name: Stop bazel-remote sidecar
  if: always()
  env:
    # Passed through the environment, like the sibling sidecar steps, instead
    # of interpolating the expression into the script text.
    BAZEL_REMOTE_AWS_DIR: /tmp/bazel-remote-aws
    BAZEL_REMOTE_CONTAINER: bazel-remote-cache-${{ inputs.ARCHITECTURE }}
  run: |
    # Best-effort: the container may not exist if an earlier step failed.
    docker rm -f "${BAZEL_REMOTE_CONTAINER}" || true
    rm -rf "${BAZEL_REMOTE_AWS_DIR}"
93199
outputs:
94200
DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }}
95201
DOCKER_TAG_FINAL: ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }}
96202

97203
build-equinox:
98204
needs: build-jax
99-
runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
205+
runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
100206
outputs:
101207
DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }}
102208
DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }}
@@ -118,13 +224,12 @@ jobs:
118224
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
119225
ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
120226
github-token: ${{ secrets.GITHUB_TOKEN }}
121-
bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
122227
EXTRA_BUILD_ARGS: |
123228
URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
124229
125230
build-maxtext:
126231
needs: build-jax
127-
runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
232+
runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
128233
outputs:
129234
DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}
130235
DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }}
@@ -146,13 +251,12 @@ jobs:
146251
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
147252
ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
148253
github-token: ${{ secrets.GITHUB_TOKEN }}
149-
bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
150254
EXTRA_BUILD_ARGS: |
151255
URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
152256
153257
build-torchax:
154258
needs: build-jax
155-
runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
259+
runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
156260
outputs:
157261
DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }}
158262
DOCKER_TAG_FINAL: ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }}
@@ -174,13 +278,12 @@ jobs:
174278
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
175279
ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
176280
github-token: ${{ secrets.GITHUB_TOKEN }}
177-
bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
178281
EXTRA_BUILD_ARGS: |
179282
URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }}
180283
181284
build-axlearn:
182285
needs: build-jax
183-
runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
286+
runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
184287
outputs:
185288
DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
186289
DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
@@ -202,7 +305,6 @@ jobs:
202305
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
203306
ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
204307
github-token: ${{ secrets.GITHUB_TOKEN }}
205-
bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
206308
EXTRA_BUILD_ARGS: |
207309
URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
208310

.github/workflows/ci.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ concurrency:
6666
permissions:
6767
contents: write # to fetch code and push branch
6868
actions: write # to cancel previous workflows
69+
id-token: write # to assume the AWS role for bazel-remote cache
6970
packages: write # to upload container
7071
pull-requests: write # to make pull request for manifest bump
7172

0 commit comments

Comments
 (0)