# Skip to content
#
# Test GitHub app
#
# Test GitHub app #7
#
# Workflow file for this run

name: Multi-process run using K8s
# Trigger on pushes to main and on pull requests targeting main, but only
# when this workflow, the K8s CI manifests, or the listed distributed /
# clusters sources are touched. Both events share the same path filter.
on:
  push:
    branches:
      - main
    paths:
      - '.github/workflows/k8s.yaml'
      - 'ci/k8s/**'
      - 'jax/distributed.py'
      - 'jax/_src/distributed.py'
      - 'jax/_src/clusters/**'
  pull_request:
    branches:
      - main
    paths:
      - '.github/workflows/k8s.yaml'
      - 'ci/k8s/**'
      - 'jax/distributed.py'
      - 'jax/_src/distributed.py'
      - 'jax/_src/clusters/**'
# Empty map: no token permissions are granted at the workflow level.
permissions: {}
# Runs are grouped by workflow name plus the PR head branch (falling back to
# the ref when head_ref is empty); a newer run in the same group cancels the
# one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: true
# Default shell for every `run` step that does not set its own: bash with
# errexit (-e), xtrace (-x) and pipefail.
defaults:
  run:
    shell: bash -ex -o pipefail {0}
jobs:
  # Starts a Minikube cluster, builds a local image containing the checked-out
  # jax tree, submits the test job manifest for the controller selected by the
  # matrix, polls until that job finishes, and finally dumps every pod's logs.
  distributed-initialize:
    runs-on: ubuntu-22.04
    strategy:
      # Let the other controller variant finish even if one of them fails.
      fail-fast: false
      matrix:
        controller: [jobset, indexed-job]
    steps:
      - name: Checkout
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # ratchet:actions/checkout@v4
        with:
          # Checked out into ./jax so the Dockerfile below can ADD it.
          path: jax
          persist-credentials: false
      - name: Start Minikube cluster
        # NOTE(review): the ratchet tag in the trailing comment reads
        # "[email protected]", which looks like e-mail obfuscation of
        # "setup-minikube@<version>". The pinned SHA is intact; restore the
        # tag text from the upstream file.
        uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052  # ratchet:medyagh/[email protected]
      - name: Install K8s Jobset
        # Only the jobset variant needs the JobSet CRD and controller.
        if: matrix.controller == 'jobset'
        run: |
          kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml
          kubectl wait --for=condition=established crd/jobsets.jobset.x-k8s.io --timeout=60s
          kubectl rollout status -n jobset-system deploy/jobset-controller-manager --timeout=120s
      - name: Build image
        # The image is built inside Minikube so the cluster can use
        # local/jax:latest without pulling from a registry.
        run: |
          cat > Dockerfile <<EOF
          FROM ubuntu:22.04
          ADD jax /opt/jax
          RUN apt-get update && apt-get install -y python-is-python3 python3-pip
          RUN pip install -e /opt/jax[k8s]
          EOF
          minikube image build -t local/jax:latest .
      - name: Create service account for K8s job introspection
        run: kubectl apply -f jax/examples/k8s/svc-acct.yaml
      - name: Submit test job
        run: kubectl apply -f jax/ci/k8s/${{ matrix.controller }}.yaml
      - name: Check job status
        # Bound the polling loop: pods that never schedule would otherwise
        # keep the runner busy until the job-level timeout.
        timeout-minutes: 15
        # Overrides the default shell to drop -x (presumably so xtrace does
        # not clutter the polling output).
        shell: bash -e -o pipefail {0}
        run: |
          # yq filters: the job(s) whose name starts with "jaxjob", and the
          # terminal "Failed" condition of such a job (if present).
          job_filter='.items[] | select(.metadata.name | test("^jaxjob"))'
          failed_filter='.status.conditions // [] | .[] | select(.type == "Failed" and .status == "True") | .type'
          while true; do
            timestamp=$(date +"%Y-%m-%d %H:%M:%S")
            echo "[$timestamp] Checking job status..."
            # One snapshot per iteration so every field read below describes
            # the same state of the cluster.
            jobs_yaml=$(kubectl get jobs -o yaml)
            completion_time=$(yq "$job_filter | .status.completionTime" <<<"$jobs_yaml")
            failed=$(yq "$job_filter | $failed_filter" <<<"$jobs_yaml")
            if [ -n "$failed" ]; then
              # completionTime is only set on success, so a failed job has to
              # be recognised through its Failed condition.
              echo "[$timestamp] Job has failed!"
              exit 1
            elif [ -z "$completion_time" ]; then
              # Empty output means no matching job exists yet (e.g. the
              # controller has not created it); this is not a completion.
              echo "[$timestamp] No job matching ^jaxjob exists yet."
              echo "[$timestamp] Waiting for 3 seconds before checking again..."
              sleep 3
            elif grep -qx 'null' <<<"$completion_time"; then
              # At least one matching job has no completionTime yet.
              echo "[$timestamp] Job is still running. Current pod status:"
              kubectl get pods --no-headers
              echo "[$timestamp] Waiting for 3 seconds before checking again..."
              sleep 3
            else
              succeeded=$(yq "$job_filter | .status.succeeded" <<<"$jobs_yaml")
              parallelism=$(yq "$job_filter | .spec.parallelism" <<<"$jobs_yaml")
              # Success requires as many succeeded pods as the job's parallelism.
              if [ "$succeeded" == "$parallelism" ]; then
                echo "[$timestamp] Job has completed successfully!"
                exit 0
              else
                echo "[$timestamp] Job has failed!"
                exit 1
              fi
            fi
          done
      - name: Examine individual pod outputs
        # Runs after failures too (anything except cancellation) so the pod
        # logs are available for debugging.
        if: ${{ !cancelled() }}
        run: |
          set +x
          kubectl get pods --no-headers | awk '{print $1}' | while read -r pod; do
            echo "========================================"
            echo "Pod $pod output:"
            echo "----------------------------------------"
            # Best effort: a pod whose logs cannot be fetched must not stop
            # the loop (errexit) before the remaining pods are printed.
            kubectl logs "$pod" || echo "(could not fetch logs for pod $pod)"
            echo "========================================"
          done