Skip to content

Add vLLM to dependency list since it's OSS-ed. #37

Add vLLM to dependency list since it's OSS-ed.

Add vLLM to dependency list since it's OSS-ed. #37

Workflow file for this run

# Copyright 2025 Google LLC

Check failure on line 1 in .github/workflows/tpu-tests.yml

View workflow run for this annotation

GitHub Actions / .github/workflows/tpu-tests.yml

Invalid workflow file

(Line: 172, Col: 1): Unexpected value 'run-dev'
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: TPU Tests

on:
  workflow_call:
    secrets:
      HF_TOKEN:
        required: true
        description: 'HuggingFace token for model downloads'
      # Declared so callers can pass them through. In a reusable
      # (workflow_call) workflow, secrets that are not declared here and not
      # forwarded with `secrets: inherit` evaluate to empty strings — the
      # "Run tunix cli tests" step references both of these.
      KAGGLE_USERNAME:
        required: false
        description: 'Kaggle username for model downloads'
      KAGGLE_KEY:
        required: false
        description: 'Kaggle API key for model downloads'

concurrency:
  # Dedup pull requests (canceling previous runs of the same workflow for same PR), and scheduled runs but nothing else
  group: ${{ github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) || github.event_name == 'schedule' && format('{0}-schedule', github.workflow) || github.run_id }}
  cancel-in-progress: true

env:
  # NOTE(review): workflow-level env is not shell-expanded, so consumers of
  # $HF_HOME receive the literal "~/.cache/huggingface"; huggingface_hub
  # appears to expand `~` itself — TODO confirm, or use an absolute path.
  HF_HOME: ~/.cache/huggingface
  HF_HUB_ENABLE_HF_TRANSFER: "1"
jobs:
  # Production-stack job: runs the full tunix test suite on a v5e-8 TPU
  # runner inside the JAX stable-stack container.
  run-prod:
    runs-on: [linux-x86-ct5lp-224-8tpu]
    environment: testing
    container:
      image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:jax0.7.1_rev1
      options: --privileged
    env:
      CLOUD_TPU_ACCELERATOR: v5e-8
      JAX_PLATFORMS: tpu
    steps:
      # Cache Hugging Face hub downloads across runs, keyed on the
      # dependency manifests.
      - name: Cache HF hub
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
          restore-keys: |
            hf-${{ runner.os }}-
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Install tunix dependencies
        run: |
          pip install -e .[prod]
          pip install pytest pytest-xdist
      # Fail fast if the container does not actually see TPU devices.
      - name: Verify TPU availability
        run: |
          python -c "
          import jax
          print(f'JAX version: {jax.__version__}')
          print(f'JAX devices: {jax.devices()}')
          # Check if we have TPU devices specifically
          devices = jax.devices()
          has_tpu = len(devices) > 0 and all(device.platform == 'tpu' for device in devices)
          print(f'TPU available: {has_tpu}')
          if not has_tpu:
              print('ERROR: No TPU devices found! Expected TPU devices but got:', [device.platform for device in devices])
              exit(1)
          else:
              print(f'SUCCESS: Found {len(devices)} TPU device(s)')
          "
      - name: Run tunix model tests
        run: |
          python -m pytest tests/models/ -v --tb=short -m "not cpu_only and not gpu_only"
      - name: Run tunix cli tests
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
          KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
        run: |
          # Config tests that passed
          python -m pytest tests/cli/ -v --tb=short \
            --ignore=tests/cli/utils/model_test.py
      - name: Run tunix generation tests (PASSED only)
        run: |
          # tokenizer_adapter_test requires access to gated repo
          python -m pytest tests/generate/ -v --tb=short \
            --ignore=tests/generate/vllm_sampler_test.py \
            --ignore=tests/generate/tokenizer_adapter_test.py
      - name: Run tunix SFT tests
        run: |
          python -m pytest tests/sft/ -v --tb=short
      - name: Run tunix SFT integration tests
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          ./tests/sft/sft_tpu_smoke_test.sh
      - name: Run tunix distillation tests
        run: |
          python -m pytest tests/distillation/ -v --tb=short
      - name: Run tunix RL tests
        run: |
          # RL common tests that passed
          # b/448133814: test_grpo_with_lora_model fails
          python -m pytest tests/rl/ -v --tb=short -k "not test_grpo_with_lora_model" --ignore=tests/rl/experimental/agentic
      - name: GRPO Integration Test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Download GSM8K dataset
          mkdir -p /tmp/grpo_test/rl/grpo/data
          python3 -c "
          from datasets import load_dataset
          import json
          # Download and save GSM8K train split
          dataset = load_dataset('openai/gsm8k', 'main', split='train')
          train_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
          with open('/tmp/grpo_test/rl/grpo/data/gsm8k_train.json', 'w') as f:
              json.dump(train_data, f)
          # Download and save GSM8K test split
          dataset = load_dataset('openai/gsm8k', 'main', split='test')
          test_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
          with open('/tmp/grpo_test/rl/grpo/data/gsm8k_test.json', 'w') as f:
              json.dump(test_data, f)
          print('GSM8K dataset downloaded successfully')
          "
          # Run GRPO demo script with minimal configuration
          python3 scripts/grpo_demo_llama3_qwen2.py \
            --root-dir=/tmp/grpo_test \
            --model-version=Qwen/Qwen2.5-0.5B-Instruct \
            --num-batches=8 \
            --num-test-batches=4 \
            --rollout-engine=vanilla
      # Catch-all so tests in new folders are not silently skipped; pytest
      # exit code 5 means "no tests collected", which is expected here.
      - name: Run tunix tests not covered by the above categories
        run: |
          # This category is to catch tests added but not covered by CI yet. Whenever you add new folders under tests/, please add a new category above and skip those tests here.
          python -m pytest tests/ -v --tb=short --ignore=tests/models/ --ignore=tests/cli/ --ignore=tests/generate/ --ignore=tests/sft/ --ignore=tests/distillation/ --ignore=tests/rl/ || code=$?
          if [ "${code:-0}" = "5" ]; then
            echo "No tests collected (expected)."
            exit 0
          else
            exit "${code:-0}"
          fi
run-dev:
runs-on: [linux-x86-ct5lp-224-8tpu] # your existing k8s-based runner labels
environment: testing
# IMPORTANT: remove the 'container:' block here; we want the job to run
# directly on the runner pod so it can call kubectl.
steps:
- name: Cache HF hub (runner-side only; pod won’t use this cache)
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
restore-keys: |
hf-${{ runner.os }}-
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Create test pod
run: |
set -eux
cat <<'YAML' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: vllm-tests
labels:
app: vllm-tests
spec:
restartPolicy: Never
containers:
- name: test
# Use your exact image tag/digest here:
image: vllm/vllm-tpu:785d8b6410c3f1dc138947ea861a194b061f0293
command: ["bash","-lc","sleep infinity"]
# Add privileged if your cluster policy allows and you truly need it.
securityContext:
privileged: true
YAML
- name: Wait for pod Ready
run: kubectl wait --for=condition=Ready pod/vllm-tests --timeout=300s
- name: Copy workspace into pod
run: |
set -eux
kubectl exec vllm-tests -- mkdir -p /workspace
# Copy your repo into the container
kubectl cp "${GITHUB_WORKSPACE}/." vllm-tests:/workspace
- name: Run tests inside the pod
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
set -eux
# Everything below runs inside the container
kubectl exec -i vllm-tests -- bash -lc '
set -eux
export HF_TOKEN="${HF_TOKEN:-}"
cd /workspace
python3 -V || true
# Base tooling
pip install --upgrade pip setuptools wheel
# Install Tunix (from PR checkout)
# Your original steps included removing TPU/torch stacks before installing.
pip uninstall -y torch torch-xla libtpu jax jaxlib || true
if [ -f constraints_dev.txt ]; then
pip install -c constraints_dev.txt -e .[dev]
else
pip install -e .[dev]
fi
# Install tpu-inference and test deps
pip uninstall -y torch torch-xla libtpu jax jaxlib || true
pip install tpu-inference==v0.11.1
pip install pytest pytest-xdist
# Run the specific vLLM tests
pytest tests/generate/vllm_sampler_test.py -v --tb=short
pytest tests/generate/vllm_driver_test.py -v --tb=short
'
- name: Pod logs (on failure)
if: failure()
run: kubectl logs vllm-tests --all-containers=true || true
- name: Cleanup pod
if: always()
run: kubectl delete pod vllm-tests --ignore-not-found