Skip to content

feat(ray): staged Ray scheduler-executor support (RayJobClient + RayExecutor + get_executor branch) #5684

feat(ray): staged Ray scheduler-executor support (RayJobClient + RayExecutor + get_executor branch)

feat(ray): staged Ray scheduler-executor support (RayJobClient + RayExecutor + get_executor branch) #5684

Workflow file for this run

# Workflow-level settings: display name, triggers, token permissions and
# run concurrency.
name: Integration tests

on:
  # Pull requests targeting main. `labeled` is listed so that applying a
  # label to an existing PR also produces a run.
  pull_request:
    branches: ["main"]
    types: [opened, synchronize, reopened, labeled]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# Limit the workflow's GITHUB_TOKEN to read-only access to repository contents.
permissions:
  contents: read

# One active run per pull request (or per ref when the event has no pull
# request, e.g. a manual dispatch). A newer run in the same group cancels
# the run that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  # Builds the nemo-skills Docker image and runs tests/gpu-tests/run_qwen.sh
  # on the runner labelled `self-hosted-nemo-gpus-1`.
  gpu-tests-qwen:
    runs-on: self-hosted-nemo-gpus-1
    # Run when the workflow is started manually, or when the 'run GPU tests'
    # label is applied to a pull request. `github.event.label` is only set on
    # `labeled` events, so other pull_request activity skips this job.
    if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'run GPU tests' }}
    steps:
      # Prune Docker images and build cache older than 168h (7 days).
      - name: Cleanup old Docker images and build cache
        run: |
          docker system prune --all --filter "until=168h" --force
          docker builder prune --all --filter "until=168h" --force

      # Best-effort removal of HF cache directories not modified for more
      # than 7 days; errors are discarded and the command always succeeds.
      # NOTE(review): presumably run inside a container because the cache
      # files are not writable by the runner user - confirm.
      - name: Cleanup old HF cache
        run: |
          docker run --rm -v /mnt/datadrive:/mnt/datadrive alpine \
            sh -c 'find /mnt/datadrive/nemo-skills-test-data/hf-cache/datasets -maxdepth 2 -mindepth 2 -type d -mtime +7 -exec rm -rf {} + 2>/dev/null;
                   find /mnt/datadrive/nemo-skills-test-data/hf-cache/hub -maxdepth 1 -mindepth 1 -type d -mtime +7 -exec rm -rf {} + 2>/dev/null;
                   true'

      # Check the repository out into a directory named after the run id;
      # the later steps `cd` into that directory.
      - uses: actions/checkout@v6
        with:
          path: ${{ github.run_id }}

      - name: Set up Python 3.10
        uses: actions/setup-python@v6
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: "3.10"

      # Install the package and test requirements, then prepare the datasets
      # listed in the `ns prepare_data` call.
      - name: Install dependencies
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          cd ${{ github.run_id }}
          python -m pip install --upgrade pip uv
          uv pip uninstall --system nemo-skills nemo_run || true
          # Use `uv pip` so [tool.uv].override-dependencies in pyproject.toml is honored
          # (relaxes leptonai's httpx==0.27.2 pin so litellm 1.83.x can be installed).
          uv pip install --system -e .
          uv pip install --system -r requirements/common-tests.txt
          ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24

      - name: Build Docker image
        run: |
          cd ${{ github.run_id }}
          docker build -t nemo-skills-image -f dockerfiles/Dockerfile.nemo-skills .

      # Run the Qwen GPU test script while a background loop prints a
      # heartbeat line every 60 seconds. The step exits with the test
      # script's exit code.
      - name: Run GPU tests
        timeout-minutes: 240
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          cd ${{ github.run_id }}
          nvidia-smi
          set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
          # Run heartbeat in background, capture its PID, and ensure cleanup
          (while true; do sleep 60; echo "[HEARTBEAT] $(date '+%Y-%m-%d %H:%M:%S') - still running..."; done) &
          HEARTBEAT_PID=$!
          # Run tests and capture exit code
          EXIT_CODE=0
          ./tests/gpu-tests/run_qwen.sh || EXIT_CODE=$?
          # Kill heartbeat and exit with test result
          kill $HEARTBEAT_PID 2>/dev/null || true
          exit $EXIT_CODE

      # Always runs, even when an earlier step failed or was cancelled.
      - name: Cleanup
        if: always()
        run: |
          # Remove test artifacts through the image. Record a failure (for
          # example when the image is unavailable because an earlier step
          # failed) instead of aborting, so containers are still stopped below.
          CLEANUP_STATUS=0
          docker run --rm -v /tmp:/tmp -v /home:/home nemo-skills-image bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' || CLEANUP_STATUS=$?
          # Stop every container on the runner; `xargs -r` skips the call
          # when there are none.
          docker ps -a -q | xargs -r docker stop
          exit $CLEANUP_STATUS