Skip to content

feat(docs): proposal for adding TTLSecondsAfterFinished and ActiveDeadlineSeconds fields to TrainJob CRD #2764

feat(docs): proposal for adding TTLSecondsAfterFinished and ActiveDeadlineSeconds fields to TrainJob CRD

feat(docs): proposal for adding TTLSecondsAfterFinished and ActiveDeadlineSeconds fields to TrainJob CRD #2764

Workflow file for this run

# Display name of this workflow.
name: GPU E2E Test

# Run on pull request events of the listed activity types.
on:
  pull_request:
    types: [opened, reopened, synchronize, labeled]
# Grant read-only access to repository contents and pull requests.
permissions:
  contents: read
  pull-requests: read
jobs:
  # Single job: check out the PR head, set up Go and Python, invoke the
  # GPU-cluster make target, execute two example notebooks through the
  # notebook make target, and upload whatever landed in artifacts/.
  gpu-e2e-test:
    name: GPU E2E Test
    # Pick a runner by label from the "GPUs" runner group.
    runs-on:
      labels: oracle-vm-gpu-a10-1
      group: GPUs
    env:
      # GOPATH is placed inside the workspace; the repository is checked out
      # beneath it (see the checkout step's `path`).
      GOPATH: ${{ github.workspace }}/go
    defaults:
      run:
        # Every `run` step starts in the checked-out repository directory.
        working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
    strategy:
      # Do not cancel other matrix entries when one of them fails.
      fail-fast: false
      matrix:
        # Quoted so the version is always treated as a string.
        kubernetes-version: ["1.33.1"]
    steps:
      - name: Check out code
        uses: actions/checkout@v6
        with:
          # Check out the pull request's head commit.
          ref: ${{ github.event.pull_request.head.sha }}
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer

      - name: Setup Go
        uses: actions/setup-go@v6
        with:
          # Take the Go version from the repository's go.mod.
          go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          # Quoted: an unquoted version is parsed as a float (3.10 -> 3.1).
          python-version: "3.11"

      - name: Install dependencies
        run: |
          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
          pip install git+https://github.com/kubeflow/sdk.git@main

      - name: Setup GPU cluster with nvkind
        run: |
          make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}

      - name: Run e2e test on GPU cluster
        # Output notebooks are written under artifacts/notebooks, prefixed
        # with the Kubernetes version, so the upload step can collect them.
        run: |
          mkdir -p artifacts/notebooks
          make test-e2e-notebook \
            NOTEBOOK_INPUT=./examples/torchtune/qwen2_5/qwen2.5-1.5B-with-alpaca.ipynb \
            NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_qwen2_5_with_alpaca-trainjob-yaml.ipynb \
            PAPERMILL_TIMEOUT=1800
          make test-e2e-notebook \
            NOTEBOOK_INPUT=./examples/jax/image-classification/mnist.ipynb \
            NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_jax_mnist.ipynb \
            PAPERMILL_PARAMS="-p num_cpu 8 -p num_gpu 1 -p num_nodes 1" \
            PAPERMILL_TIMEOUT=1800

      - name: Upload Artifacts to GitHub
        # Run regardless of the outcome of earlier steps.
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: ${{ matrix.kubernetes-version }}
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
          retention-days: 1