Skip to content

Add vLLM to dependency list since it's OSS-ed. #37

Add vLLM to dependency list since it's OSS-ed.

Add vLLM to dependency list since it's OSS-ed. #37

Workflow file for this run

# Copyright 2025 Google LLC

Check failure on line 1 in .github/workflows/tpu-tests.yml

View workflow run for this annotation

GitHub Actions / .github/workflows/tpu-tests.yml

Invalid workflow file

(Line: 172, Col: 1): Unexpected value 'run-dev'
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: TPU Tests

on:
  workflow_call:
    secrets:
      HF_TOKEN:
        required: true
        description: 'HuggingFace token for model downloads'
      # Declared so callers can pass them through. In a reusable
      # (workflow_call) workflow, secrets that are not declared here and not
      # forwarded with `secrets: inherit` evaluate to empty strings — the
      # "Run tunix cli tests" step references both of these.
      KAGGLE_USERNAME:
        required: false
        description: 'Kaggle username for model downloads'
      KAGGLE_KEY:
        required: false
        description: 'Kaggle API key for model downloads'

concurrency:
  # Dedup pull requests (canceling previous runs of the same workflow for same PR), and scheduled runs but nothing else
  group: ${{ github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) || github.event_name == 'schedule' && format('{0}-schedule', github.workflow) || github.run_id }}
  cancel-in-progress: true

env:
  # NOTE(review): workflow-level env is not shell-expanded, so consumers of
  # $HF_HOME receive the literal "~/.cache/huggingface"; huggingface_hub
  # appears to expand `~` itself — TODO confirm, or use an absolute path.
  HF_HOME: ~/.cache/huggingface
  HF_HUB_ENABLE_HF_TRANSFER: "1"
jobs:
  # Production-stack job: runs the full tunix test suite on a v5e-8 TPU
  # runner inside the JAX stable-stack container.
  run-prod:
    runs-on: [linux-x86-ct5lp-224-8tpu]
    environment: testing
    container:
      image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:jax0.7.1_rev1
      options: --privileged
    env:
      CLOUD_TPU_ACCELERATOR: v5e-8
      JAX_PLATFORMS: tpu
    steps:
      # Cache Hugging Face hub downloads across runs, keyed on the
      # dependency manifests.
      - name: Cache HF hub
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
          restore-keys: |
            hf-${{ runner.os }}-
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Install tunix dependencies
        run: |
          pip install -e .[prod]
          pip install pytest pytest-xdist
      # Fail fast if the container does not actually see TPU devices.
      - name: Verify TPU availability
        run: |
          python -c "
          import jax
          print(f'JAX version: {jax.__version__}')
          print(f'JAX devices: {jax.devices()}')
          # Check if we have TPU devices specifically
          devices = jax.devices()
          has_tpu = len(devices) > 0 and all(device.platform == 'tpu' for device in devices)
          print(f'TPU available: {has_tpu}')
          if not has_tpu:
              print('ERROR: No TPU devices found! Expected TPU devices but got:', [device.platform for device in devices])
              exit(1)
          else:
              print(f'SUCCESS: Found {len(devices)} TPU device(s)')
          "
      - name: Run tunix model tests
        run: |
          python -m pytest tests/models/ -v --tb=short -m "not cpu_only and not gpu_only"
      - name: Run tunix cli tests
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
          KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
        run: |
          # Config tests that passed
          python -m pytest tests/cli/ -v --tb=short \
            --ignore=tests/cli/utils/model_test.py
      - name: Run tunix generation tests (PASSED only)
        run: |
          # tokenizer_adapter_test requires access to gated repo
          python -m pytest tests/generate/ -v --tb=short \
            --ignore=tests/generate/vllm_sampler_test.py \
            --ignore=tests/generate/tokenizer_adapter_test.py
      - name: Run tunix SFT tests
        run: |
          python -m pytest tests/sft/ -v --tb=short
      - name: Run tunix SFT integration tests
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          ./tests/sft/sft_tpu_smoke_test.sh
      - name: Run tunix distillation tests
        run: |
          python -m pytest tests/distillation/ -v --tb=short
      - name: Run tunix RL tests
        run: |
          # RL common tests that passed
          # b/448133814: test_grpo_with_lora_model fails
          python -m pytest tests/rl/ -v --tb=short -k "not test_grpo_with_lora_model" --ignore=tests/rl/experimental/agentic
      - name: GRPO Integration Test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Download GSM8K dataset
          mkdir -p /tmp/grpo_test/rl/grpo/data
          python3 -c "
          from datasets import load_dataset
          import json
          # Download and save GSM8K train split
          dataset = load_dataset('openai/gsm8k', 'main', split='train')
          train_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
          with open('/tmp/grpo_test/rl/grpo/data/gsm8k_train.json', 'w') as f:
              json.dump(train_data, f)
          # Download and save GSM8K test split
          dataset = load_dataset('openai/gsm8k', 'main', split='test')
          test_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
          with open('/tmp/grpo_test/rl/grpo/data/gsm8k_test.json', 'w') as f:
              json.dump(test_data, f)
          print('GSM8K dataset downloaded successfully')
          "
          # Run GRPO demo script with minimal configuration
          python3 scripts/grpo_demo_llama3_qwen2.py \
            --root-dir=/tmp/grpo_test \
            --model-version=Qwen/Qwen2.5-0.5B-Instruct \
            --num-batches=8 \
            --num-test-batches=4 \
            --rollout-engine=vanilla
      # Catch-all so tests in new folders are not silently skipped; pytest
      # exit code 5 means "no tests collected", which is expected here.
      - name: Run tunix tests not covered by the above categories
        run: |
          # This category is to catch tests added but not covered by CI yet. Whenever you add new folders under tests/, please add a new category above and skip those tests here.
          python -m pytest tests/ -v --tb=short --ignore=tests/models/ --ignore=tests/cli/ --ignore=tests/generate/ --ignore=tests/sft/ --ignore=tests/distillation/ --ignore=tests/rl/ || code=$?
          if [ "${code:-0}" = "5" ]; then
            echo "No tests collected (expected)."
            exit 0
          else
            exit "${code:-0}"
          fi
run-dev:
runs-on: [linux-x86-ct5lp-224-8tpu] # your existing k8s-based runner labels
environment: testing
# IMPORTANT: remove the 'container:' block here; we want the job to run
# directly on the runner pod so it can call kubectl.
steps:
- name: Cache HF hub (runner-side only; pod won’t use this cache)
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
restore-keys: |
hf-${{ runner.os }}-
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Create test pod
run: |
set -eux
cat <<'YAML' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: vllm-tests
labels:
app: vllm-tests
spec:
restartPolicy: Never
containers:
- name: test
# Use your exact image tag/digest here:
image: vllm/vllm-tpu:785d8b6410c3f1dc138947ea861a194b061f0293
command: ["bash","-lc","sleep infinity"]
# Add privileged if your cluster policy allows and you truly need it.
securityContext:
privileged: true
YAML
- name: Wait for pod Ready
run: kubectl wait --for=condition=Ready pod/vllm-tests --timeout=300s
- name: Copy workspace into pod
run: |
set -eux
kubectl exec vllm-tests -- mkdir -p /workspace
# Copy your repo into the container
kubectl cp "${GITHUB_WORKSPACE}/." vllm-tests:/workspace
- name: Run tests inside the pod
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
set -eux
# Everything below runs inside the container
kubectl exec -i vllm-tests -- bash -lc '
set -eux
export HF_TOKEN="${HF_TOKEN:-}"
cd /workspace
python3 -V || true
# Base tooling
pip install --upgrade pip setuptools wheel
# Install Tunix (from PR checkout)
# Your original steps included removing TPU/torch stacks before installing.
pip uninstall -y torch torch-xla libtpu jax jaxlib || true
if [ -f constraints_dev.txt ]; then
pip install -c constraints_dev.txt -e .[dev]
else
pip install -e .[dev]
fi
# Install tpu-inference and test deps
pip uninstall -y torch torch-xla libtpu jax jaxlib || true
pip install tpu-inference==v0.11.1
pip install pytest pytest-xdist
# Run the specific vLLM tests
pytest tests/generate/vllm_sampler_test.py -v --tb=short
pytest tests/generate/vllm_driver_test.py -v --tb=short
'
- name: Pod logs (on failure)
if: failure()
run: kubectl logs vllm-tests --all-containers=true || true
- name: Cleanup pod
if: always()
run: kubectl delete pod vllm-tests --ignore-not-found