@@ -43,6 +43,7 @@ concurrency:
# Token scopes requested by this workflow.
permissions:
  # to fetch code
  contents: read
  # to cancel previous workflows
  actions: write
  # to assume the AWS role for bazel-remote cache
  id-token: write
  # to upload container
  packages: write
4748
4849jobs :
@@ -65,10 +66,90 @@ jobs:
6566
6667 build-jax :
6768 needs : build-base
68- runs-on : [self-hosted, " ${{ inputs.ARCHITECTURE }}", "large"]
69+ runs-on : ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu32m' || 'linux-arm64-cpu32m' }}
6970 steps :
7071 - name : Checkout repository
7172 uses : actions/checkout@v6
# Obtain temporary AWS credentials for the S3-backed bazel-remote cache.
# The following step reads them from the AWS_* environment variables.
# NOTE(review): no session duration is set here, so the action's default
# applies; confirm it outlasts the longest expected build, because the sidecar
# is handed a static copy of these credentials and cannot refresh them.
- name: Configure AWS credentials via OIDC
  uses: aws-actions/configure-aws-credentials@v4
  with:
    role-to-assume: ${{ secrets.S3_CACHE_ROLE }}
    aws-region: ${{ secrets.AWS_REGION }}
# Launch a local bazel-remote container that fronts the S3 bucket. It listens
# on 127.0.0.1:${BAZEL_REMOTE_PORT} on the runner and keeps a disk cache under
# ${BAZEL_REMOTE_DISK_DIR}.
- name: Start bazel-remote sidecar
  env:
    BAZEL_REMOTE_BUCKET: ${{ secrets.BAZEL_REMOTE_BUCKET }}
    BAZEL_REMOTE_AWS_DIR: /tmp/bazel-remote-aws
    BAZEL_REMOTE_CONTAINER: bazel-remote-cache-${{ inputs.ARCHITECTURE }}
    BAZEL_REMOTE_DISK_DIR: /tmp/bazel-remote-cache
    BAZEL_REMOTE_DIAGNOSTICS_DIR: /tmp/bazel-remote-diagnostics
    # Single place to pin the sidecar image. It currently resolves to the
    # default tag; prefer an explicit tag or digest for reproducible builds.
    BAZEL_REMOTE_IMAGE: buchgr/bazel-remote-cache
    BAZEL_REMOTE_PORT: "9090"
    BAZEL_REMOTE_PREFIX: bazel-remote/${{ inputs.ARCHITECTURE }}
    BAZEL_REMOTE_REGION: ${{ secrets.AWS_REGION }}
  run: |
    set -euxo pipefail
    # Fail fast with a clear message if the OIDC step exported no credentials.
    : "${AWS_ACCESS_KEY_ID:?missing AWS_ACCESS_KEY_ID}"
    : "${AWS_SECRET_ACCESS_KEY:?missing AWS_SECRET_ACCESS_KEY}"
    : "${AWS_SESSION_TOKEN:?missing AWS_SESSION_TOKEN}"
    BAZEL_REMOTE_AWS_CREDENTIALS="${BAZEL_REMOTE_AWS_DIR}/credentials"
    mkdir -p "${BAZEL_REMOTE_DIAGNOSTICS_DIR}" "${BAZEL_REMOTE_DISK_DIR}" "${BAZEL_REMOTE_AWS_DIR}"
    # The directory is created before the restrictive umask below (and may
    # survive from an earlier run), so lock it down explicitly.
    chmod 700 "${BAZEL_REMOTE_AWS_DIR}"
    # Run the container as the runner user so it can read the 0600 credentials
    # file and write to the bind-mounted disk cache.
    BAZEL_REMOTE_UID="$(id -u)"
    BAZEL_REMOTE_GID="$(id -g)"
    # Disable command tracing while the secrets are written out.
    set +x
    umask 077
    {
      printf '[default]\n'
      printf 'aws_access_key_id=%s\n' "${AWS_ACCESS_KEY_ID}"
      printf 'aws_secret_access_key=%s\n' "${AWS_SECRET_ACCESS_KEY}"
      printf 'aws_session_token=%s\n' "${AWS_SESSION_TOKEN}"
    } > "${BAZEL_REMOTE_AWS_CREDENTIALS}"
    set -x
    # Remove any container left behind under the same name.
    docker rm -f "${BAZEL_REMOTE_CONTAINER}" >/dev/null 2>&1 || true
    docker run -d \
      --name "${BAZEL_REMOTE_CONTAINER}" \
      --user "${BAZEL_REMOTE_UID}:${BAZEL_REMOTE_GID}" \
      --publish "127.0.0.1:${BAZEL_REMOTE_PORT}:8080" \
      --volume "${BAZEL_REMOTE_DISK_DIR}:/data" \
      --volume "${BAZEL_REMOTE_AWS_CREDENTIALS}:/aws-config/credentials:ro" \
      "${BAZEL_REMOTE_IMAGE}" \
      --dir /data \
      --max_size 50 \
      --http_address=0.0.0.0:8080 \
      --s3.auth_method=aws_credentials_file \
      --s3.aws_profile=default \
      --s3.aws_shared_credentials_file=/aws-config/credentials \
      --s3.bucket="${BAZEL_REMOTE_BUCKET}" \
      --s3.endpoint="s3.${BAZEL_REMOTE_REGION}.amazonaws.com" \
      --s3.prefix="${BAZEL_REMOTE_PREFIX}" \
      --s3.region="${BAZEL_REMOTE_REGION}" \
      --s3.update_timestamps
# Wait (up to 30 attempts, one second apart) for the sidecar's /status endpoint
# to answer, then probe the CAS endpoint. On failure, print the container
# state and logs and fail the job.
- name: Verify bazel-remote sidecar
  env:
    BAZEL_REMOTE_CONTAINER: bazel-remote-cache-${{ inputs.ARCHITECTURE }}
    BAZEL_REMOTE_DIAGNOSTICS_DIR: /tmp/bazel-remote-diagnostics
    BAZEL_REMOTE_PORT: "9090"
  run: |
    set -euxo pipefail
    base_url="http://127.0.0.1:${BAZEL_REMOTE_PORT}"

    # Print what is needed to debug a sidecar that never came up, then abort.
    dump_container_state_and_fail() {
      docker ps -a
      docker logs "${BAZEL_REMOTE_CONTAINER}" || true
      exit 1
    }

    ready=0
    for _ in $(seq 1 30); do
      # --max-time keeps a stalled connection from blocking the loop, so the
      # attempt count above remains a real upper bound on the wait.
      if curl -sSf --max-time 5 "${base_url}/status" > "${BAZEL_REMOTE_DIAGNOSTICS_DIR}/status-before.json"; then
        ready=1
        break
      fi
      # Stop retrying as soon as the container has exited.
      container_running="$(docker inspect -f '{{.State.Running}}' "${BAZEL_REMOTE_CONTAINER}" 2>/dev/null || echo false)"
      if [[ "${container_running}" != "true" ]]; then
        dump_container_state_and_fail
      fi
      sleep 1
    done
    if [[ "${ready}" != "1" ]]; then
      dump_container_state_and_fail
    fi
    # HEAD request for the CAS entry keyed by the SHA-256 of empty input; a
    # non-2xx response fails the step via `curl -f`.
    curl -sSIf --max-time 10 "${base_url}/cas/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" \
      > "${BAZEL_REMOTE_DIAGNOSTICS_DIR}/empty-cas-head.txt"
72153 - name : Build JAX container
73154 id : build-jax
74155 uses : ./.github/actions/build-container
@@ -84,19 +165,44 @@ jobs:
84165 ssh-private-key : ${{ secrets.SSH_PRIVATE_KEY }}
85166 ssh-known-hosts : ${{ vars.SSH_KNOWN_HOSTS }}
86167 github-token : ${{ secrets.GITHUB_TOKEN }}
87- bazel-remote-cache-url : ${{ vars.BAZEL_REMOTE_CACHE_URL }}
168+ bazel-remote-cache-url : http://127.0.0.1:9090
169+ docker-build-allow : network.host
170+ docker-build-network : host
171+ registry-build-context-name : mealkit
88172 EXTRA_BUILD_ARGS : |
89173 URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }}
90174 URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }}
91175 URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }}
92176 URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
# Snapshot the sidecar's status endpoint and container log after the build.
# Runs regardless of earlier failures, and each capture is best-effort.
- name: Collect bazel-remote diagnostics
  if: always()
  env:
    BAZEL_REMOTE_CONTAINER: bazel-remote-cache-${{ inputs.ARCHITECTURE }}
    BAZEL_REMOTE_DIAGNOSTICS_DIR: /tmp/bazel-remote-diagnostics
    BAZEL_REMOTE_PORT: "9090"
  run: |
    set -euxo pipefail
    diag_dir="${BAZEL_REMOTE_DIAGNOSTICS_DIR}"
    # The directory may not exist if the start step never ran.
    mkdir -p "${diag_dir}"
    # `|| true`: a dead or missing sidecar must not mask the real build result.
    curl -sSf "http://127.0.0.1:${BAZEL_REMOTE_PORT}/status" > "${diag_dir}/status-after.json" || true
    docker logs "${BAZEL_REMOTE_CONTAINER}" > "${diag_dir}/container.log" 2>&1 || true
# Publish everything under the diagnostics directory as a per-architecture
# artifact. Runs regardless of earlier failures.
- name: Upload bazel-remote diagnostics
  if: always()
  uses: actions/upload-artifact@v6
  with:
    name: bazel-remote-diagnostics-${{ inputs.ARCHITECTURE }}
    path: /tmp/bazel-remote-diagnostics
# Tear down the sidecar and delete the on-disk copy of the AWS credentials.
# Runs regardless of earlier failures so the credentials never outlive the job.
- name: Stop bazel-remote sidecar
  if: always()
  env:
    # Passed through the environment rather than interpolated into the script,
    # so the workflow input is never parsed as shell code.
    BAZEL_REMOTE_AWS_DIR: /tmp/bazel-remote-aws
    BAZEL_REMOTE_CONTAINER: bazel-remote-cache-${{ inputs.ARCHITECTURE }}
  run: |
    # Best-effort: the container may already be gone or never have started.
    docker rm -f "${BAZEL_REMOTE_CONTAINER}" || true
    rm -rf "${BAZEL_REMOTE_AWS_DIR}"
93199 outputs :
94200 DOCKER_TAG_MEALKIT : ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }}
95201 DOCKER_TAG_FINAL : ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }}
96202
97203 build-equinox :
98204 needs : build-jax
99- runs-on : [self-hosted, " ${{ inputs.ARCHITECTURE }}", "small"]
205+ runs-on : ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
100206 outputs :
101207 DOCKER_TAG_MEALKIT : ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }}
102208 DOCKER_TAG_FINAL : ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }}
@@ -118,13 +224,12 @@ jobs:
118224 ssh-private-key : ${{ secrets.SSH_PRIVATE_KEY }}
119225 ssh-known-hosts : ${{ vars.SSH_KNOWN_HOSTS }}
120226 github-token : ${{ secrets.GITHUB_TOKEN }}
121- bazel-remote-cache-url : ${{ vars.BAZEL_REMOTE_CACHE_URL }}
122227 EXTRA_BUILD_ARGS : |
123228 URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
124229
125230 build-maxtext :
126231 needs : build-jax
127- runs-on : [self-hosted, " ${{ inputs.ARCHITECTURE }}", "small"]
232+ runs-on : ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
128233 outputs :
129234 DOCKER_TAG_MEALKIT : ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}
130235 DOCKER_TAG_FINAL : ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }}
@@ -146,13 +251,12 @@ jobs:
146251 ssh-private-key : ${{ secrets.SSH_PRIVATE_KEY }}
147252 ssh-known-hosts : ${{ vars.SSH_KNOWN_HOSTS }}
148253 github-token : ${{ secrets.GITHUB_TOKEN }}
149- bazel-remote-cache-url : ${{ vars.BAZEL_REMOTE_CACHE_URL }}
150254 EXTRA_BUILD_ARGS : |
151255 URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
152256
153257 build-torchax :
154258 needs : build-jax
155- runs-on : [self-hosted, " ${{ inputs.ARCHITECTURE }}", "small"]
259+ runs-on : ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
156260 outputs :
157261 DOCKER_TAG_MEALKIT : ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }}
158262 DOCKER_TAG_FINAL : ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }}
@@ -174,13 +278,12 @@ jobs:
174278 ssh-private-key : ${{ secrets.SSH_PRIVATE_KEY }}
175279 ssh-known-hosts : ${{ vars.SSH_KNOWN_HOSTS }}
176280 github-token : ${{ secrets.GITHUB_TOKEN }}
177- bazel-remote-cache-url : ${{ vars.BAZEL_REMOTE_CACHE_URL }}
178281 EXTRA_BUILD_ARGS : |
179282 URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }}
180283
181284 build-axlearn :
182285 needs : build-jax
183- runs-on : [self-hosted, " ${{ inputs.ARCHITECTURE }}", "large"]
286+ runs-on : ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
184287 outputs :
185288 DOCKER_TAG_MEALKIT : ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
186289 DOCKER_TAG_FINAL : ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
@@ -202,7 +305,6 @@ jobs:
202305 ssh-private-key : ${{ secrets.SSH_PRIVATE_KEY }}
203306 ssh-known-hosts : ${{ vars.SSH_KNOWN_HOSTS }}
204307 github-token : ${{ secrets.GITHUB_TOKEN }}
205- bazel-remote-cache-url : ${{ vars.BAZEL_REMOTE_CACHE_URL }}
206308 EXTRA_BUILD_ARGS : |
207309 URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
208310
0 commit comments