Test GitHub app #7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow display name.
name: Multi-process run using K8s

# Triggers: pushes to main and pull requests targeting main, in both cases
# only when one of the listed paths is touched. The two path lists are kept
# identical on purpose.
on:
  push:
    branches: [main]
    paths:
      - ".github/workflows/k8s.yaml"
      - "ci/k8s/**"
      - "jax/distributed.py"
      - "jax/_src/distributed.py"
      - "jax/_src/clusters/**"
  pull_request:
    branches: [main]
    paths:
      - ".github/workflows/k8s.yaml"
      - "ci/k8s/**"
      - "jax/distributed.py"
      - "jax/_src/distributed.py"
      - "jax/_src/clusters/**"
# Empty permissions map: no token scopes are requested at the workflow level.
permissions: {}

# One concurrency group per workflow and PR head ref (falling back to the
# full ref when head_ref is empty, i.e. on push). A newer run in the same
# group cancels the one still in progress.
concurrency:
  cancel-in-progress: true
  group: "${{ github.workflow }}-${{ github.head_ref || github.ref }}"

# Default shell for every `run` step that does not set its own:
# bash with errexit (-e), command tracing (-x) and pipefail.
defaults:
  run:
    shell: "bash -ex -o pipefail {0}"
jobs:
  # Builds a JAX image inside a Minikube cluster, submits the test job for
  # each controller flavour in the matrix, waits for it to finish and then
  # dumps the logs of every pod.
  distributed-initialize:
    runs-on: ubuntu-22.04
    strategy:
      # Let the other controller flavour keep running if one of them fails.
      fail-fast: false
      matrix:
        controller: [jobset, indexed-job]
    steps:
      # The repository is checked out into ./jax so that it can be added to
      # the image build context below.
      - name: Checkout
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # ratchet:actions/checkout@v4
        with:
          path: jax
          persist-credentials: false

      # NOTE(review): the version tag in the ratchet comment below was garbled
      # in the copy under review ("[email protected]"); v0.0.19 is presumed.
      # Confirm it against the pinned SHA.
      - name: Start Minikube cluster
        uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052  # ratchet:medyagh/setup-minikube@v0.0.19

      # Only the jobset flavour needs the JobSet CRD and controller. Wait for
      # both to be ready before anything is submitted.
      - name: Install K8s Jobset
        if: matrix.controller == 'jobset'
        run: |
          kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml
          kubectl wait --for=condition=established crd/jobsets.jobset.x-k8s.io --timeout=60s
          kubectl rollout status -n jobset-system deploy/jobset-controller-manager --timeout=120s

      # The image is built by Minikube itself, so the cluster can use
      # local/jax:latest without a registry.
      - name: Build image
        run: |
          cat > Dockerfile <<EOF
          FROM ubuntu:22.04
          ADD jax /opt/jax
          RUN apt-get update && apt-get install -y python-is-python3 python3-pip
          RUN pip install -e /opt/jax[k8s]
          EOF
          minikube image build -t local/jax:latest .

      - name: Create service account for K8s job introspection
        run: kubectl apply -f jax/examples/k8s/svc-acct.yaml

      - name: Submit test job
        run: kubectl apply -f jax/ci/k8s/${{ matrix.controller }}.yaml

      # Polls every Job whose name starts with "jaxjob" until it finishes.
      #
      # The verdict is taken from the Job conditions rather than from
      # .status.completionTime, because Kubernetes only sets completionTime
      # on success: a failed Job would otherwise be polled forever.
      # An empty Job list is treated as "not created yet" (child Jobs may be
      # created asynchronously by a controller) instead of being compared as
      # empty strings, which used to be reported as success.
      # The loop gives up after JOB_TIMEOUT_SECONDS.
      - name: Check job status
        # No -x here: the polling loop would flood the log with trace output.
        shell: bash -e -o pipefail {0}
        env:
          JOB_TIMEOUT_SECONDS: "600"
        run: |
          timeout_seconds="${JOB_TIMEOUT_SECONDS:-600}"
          deadline=$(( SECONDS + timeout_seconds ))

          # Prints the status of condition type $2 on Job $1
          # ("True", "False", or nothing when the condition is absent).
          job_condition() {
            kubectl get "$1" -o jsonpath="{.status.conditions[?(@.type==\"$2\")].status}"
          }

          while true; do
            timestamp=$(date +"%Y-%m-%d %H:%M:%S")
            echo "[$timestamp] Checking job status..."

            # `-o name` prints "<resource>/<name>". grep exits 1 when nothing
            # matches, which must not abort the step under -e/pipefail.
            jobs=$(kubectl get jobs -o name | grep -E '^[^/]+/jaxjob' || true)

            if [ -z "$jobs" ]; then
              echo "[$timestamp] No jaxjob Job has been created yet."
            else
              unfinished=0
              for job in $jobs; do
                if [ "$(job_condition "$job" Failed)" == "True" ]; then
                  echo "[$timestamp] Job has failed!"
                  exit 1
                fi
                if [ "$(job_condition "$job" Complete)" != "True" ]; then
                  unfinished=$(( unfinished + 1 ))
                  continue
                fi
                # Same success criterion as before: every parallel pod of a
                # completed Job must have succeeded.
                succeeded=$(kubectl get "$job" -o jsonpath='{.status.succeeded}')
                parallelism=$(kubectl get "$job" -o jsonpath='{.spec.parallelism}')
                if [ "$succeeded" != "$parallelism" ]; then
                  echo "[$timestamp] Job has failed!"
                  exit 1
                fi
              done
              if [ "$unfinished" -eq 0 ]; then
                echo "[$timestamp] Job has completed successfully!"
                exit 0
              fi
              echo "[$timestamp] Job is still running. Current pod status:"
              kubectl get pods --no-headers
            fi

            if [ "$SECONDS" -ge "$deadline" ]; then
              echo "[$timestamp] Timed out after ${timeout_seconds}s waiting for the job to finish!"
              kubectl get pods --no-headers || true
              exit 1
            fi
            echo "[$timestamp] Waiting for 3 seconds before checking again..."
            sleep 3
          done

      # Diagnostic dump; runs unless the workflow run was cancelled, so the
      # logs are available after a failed or timed-out status check as well.
      - name: Examine individual pod outputs
        if: ${{ !cancelled() }}
        run: |
          set +x
          kubectl get pods --no-headers | awk '{print $1}' | while read -r pod; do
            echo "========================================"
            echo "Pod $pod output:"
            echo "----------------------------------------"
            # Best effort: a pod without retrievable logs must not stop the
            # dump of the remaining pods (the default shell runs with -e).
            kubectl logs "$pod" || echo "(could not fetch logs for pod $pod)"
            echo "========================================"
          done