# Copyright 2025 Google LLC

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
# Workflow name shown in the GitHub Actions UI.
name: TPU Tests
on:
  # Reusable workflow: invoked from other workflows via `uses:`. Callers may
  # forward the secrets declared here (all optional; `required` defaults to
  # false).
  workflow_call:
    secrets:
      HF_TOKEN:
        description: 'HuggingFace token for model downloads'
      KAGGLE_USERNAME:
        description: 'Kaggle Username'
      KAGGLE_KEY:
        description: 'Kaggle API Key'
concurrency:
  # Dedup pull requests (canceling previous runs of the same workflow for same PR), and scheduled runs but nothing else
  group: ${{ github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) || github.event_name == 'schedule' && format('{0}-schedule', github.workflow) || github.run_id }}
  cancel-in-progress: true
env:
  # Shared Hugging Face cache location; also used by the actions/cache step.
  HF_HOME: ~/.cache/huggingface
  # Quoted so the value stays a string for consumers that expect "1".
  HF_HUB_ENABLE_HF_TRANSFER: "1"
jobs:
  # The `run_prod` job below is intentionally disabled (commented out); it ran
  # the full test matrix on a JAX stable-stack TPU image. Kept for reference.
  # run_prod:
  #   runs-on: [linux-x86-ct5lp-224-8tpu]
  #   environment: testing
  #   container:
  #     image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
  #     options: --privileged
  #   env:
  #     CLOUD_TPU_ACCELERATOR: v5e-8
  #     JAX_PLATFORMS: tpu
  #   steps:
  #     # Cache Hugging Face hub
  #     - name: Cache HF hub
  #       uses: actions/cache@v4
  #       with:
  #         path: ~/.cache/huggingface
  #         key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
  #         restore-keys: |
  #           hf-${{ runner.os }}-
  #     - name: Checkout code
  #       uses: actions/checkout@v4
  #       with:
  #         fetch-depth: 0
  #     - name: Install tunix dependencies
  #       run: |
  #         pip install --upgrade pip
  #         pip install -e .[prod] --force-reinstall
  #         pip install pytest pytest-xdist
  #     - name: Verify TPU availability
  #       run: |
  #         python -c "
  #         import jax
  #         print(f'JAX version: {jax.__version__}')
  #         print(f'JAX devices: {jax.devices()}')
  #         # Check if we have TPU devices specifically
  #         devices = jax.devices()
  #         has_tpu = len(devices) > 0 and all(device.platform == 'tpu' for device in devices)
  #         print(f'TPU available: {has_tpu}')
  #         if not has_tpu:
  #           print('ERROR: No TPU devices found! Expected TPU devices but got:', [device.platform for device in devices])
  #           exit(1)
  #         else:
  #           print(f'SUCCESS: Found {len(devices)} TPU device(s)')
  #         "
  #     - name: Run tunix model tests
  #       run: |
  #         python -m pytest tests/models/ -v --tb=short -m "not cpu_only and not gpu_only"
  #     - name: Run tunix generation tests (PASSED only)
  #       run: |
  #         # tokenizer_adapter_test requires access to gated repo
  #         python -m pytest tests/generate/ -v --tb=short \
  #           --ignore=tests/generate/vllm_sampler_test.py \
  #           --ignore=tests/generate/vllm_driver_test.py \
  #           --ignore=tests/generate/tokenizer_adapter_test.py \
  #           --ignore=tests/generate/sglang_jax_sampler_test.py
  #     - name: Run tunix SFT tests
  #       run: |
  #         python -m pytest tests/sft/ -v --tb=short
  #     - name: Run tunix distillation tests
  #       run: |
  #         python -m pytest tests/distillation/ -v --tb=short
  #     - name: Run tunix RL tests
  #       run: |
  #         # RL common tests that passed
  #         # b/448133814: test_grpo_with_lora_model fails
  #         python -m pytest tests/rl/ -v --tb=short -k "not test_grpo_with_lora_model" --ignore=tests/rl/experimental/agentic
  #     - name: Run tunix tests not covered by the above categories
  #       run: |
  #         # This category is to catch tests added but not covered by CI yet. Whenever you add new folders under tests/, please add a new category above and skip those tests here.
  #         python -m pytest tests/ -v --tb=short --ignore=tests/perf/ --ignore=tests/model_alignment/ --ignore=tests/models/ --ignore=tests/cli/ --ignore=tests/generate/ --ignore=tests/sft/ --ignore=tests/distillation/ --ignore=tests/rl/ --ignore=tests/smoke_tests/ || code=$?
  #         if [ "${code:-0}" = "5" ]; then
  #           echo "No tests collected (expected)."
  #           exit 0
  #         else
  #           exit "${code:-0}"
  #         fi
  #
run_dev:
if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
runs-on: [linux-x86-ct5lp-224-8tpu]
environment: testing
container:
image: vllm/vllm-tpu:nightly-a9f13e53dc3511599e873225f0e5adbda07f3993
options: --privileged
env:
CLOUD_TPU_ACCELERATOR: v5e-8
JAX_PLATFORMS: tpu,cpu
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
steps:
# Cache Hugging Face hub
- name: Cache HF hub
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
restore-keys: |
hf-${{ runner.os }}-
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Tunix , tpu-inference and dependencies
run: |
echo "Current directory:"
pwd
pip install --upgrade pip setuptools wheel
# Install Tunix with dev dependencies without overwriting the vLLM dependencies.
pip install -e .[dev]
pip install transformers==4.57.1 --force-reinstall # Issue: https://github.com/google/tunix/pull/795
# - name: GRPO Integration Test
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# # Download GSM8K dataset
# mkdir -p /tmp/grpo_test/rl/grpo/data
# python3 -c "
# from datasets import load_dataset
# import json
# # Download and save GSM8K train split
# dataset = load_dataset('openai/gsm8k', 'main', split='train')
# train_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
# with open('/tmp/grpo_test/rl/grpo/data/gsm8k_train.json', 'w') as f:
# json.dump(train_data, f)
# # Download and save GSM8K test split
# dataset = load_dataset('openai/gsm8k', 'main', split='test')
# test_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
# with open('/tmp/grpo_test/rl/grpo/data/gsm8k_test.json', 'w') as f:
# json.dump(test_data, f)
# print('GSM8K dataset downloaded successfully')
# "
# # TODO(lancewang): Re-enable this test once the segfault is fixed.
# # Run GRPO demo script with minimal configuration
# # python3 scripts/grpo_demo_llama3_qwen2.py \
# # --root-dir=/tmp/grpo_test \
# # --model-version=Qwen/Qwen2.5-0.5B-Instruct \
# # --num-batches=1 \
# # --num-test-batches=1 \
# # --rollout-engine=vanilla
# - name: Run vllm tests
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# unset JAX_PLATFORMS
# pytest tests/generate/vllm_driver_test.py -v --tb=short
# pytest tests/generate/vllm_sampler_test.py --collect-only -q --no-header --no-summary --disable-warnings | grep '::' > test_collections.txt
# while read -r test; do
# pytest -s "$test" -v --tb=short
# done < test_collections.txt
# - name: Run install sglang-jax && test
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# ## because sglang-jax has codes like jax.local_devices('cpu')
# # TODO(lancewang): Re-enable this test once the bug is fixed.
# unset JAX_PLATFORMS
# pip list | egrep 'jax|flax|libtpu'
# cd ..
# git clone https://github.com/sgl-project/sglang-jax.git && cd sglang-jax/python && pip install -e . --force-reinstall && cd ../..
# pip list | egrep 'jax|flax|libtpu'
# # Install bookworm, vllm container defaults to bullseye causes segfault for sglang-jax.
# cat >/etc/apt/sources.list <<'EOF'
# deb http://deb.debian.org/debian bookworm main contrib non-free
# deb http://deb.debian.org/debian bookworm-updates main contrib non-free
# deb http://security.debian.org/debian-security bookworm-security main contrib non-free
# EOF
# apt-get update; apt-get install -y less
# cd tunix && python tests/generate/sglang_jax_sampler_test.py
# - name: Run tunix SFT integration tests
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# # Reinstall Tunix with prod dependencies
# pip install -e .[prod] --force-reinstall
# # Loading tfds requires tensorflow.
# pip install tensorflow
# export JAX_PLATFORMS=tpu,cpu
# ./tests/sft/sft_tpu_smoke_test.sh
- name: Run Smoke tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
run: |
echo "Running Smoke tests..."
# Debugging: Check if env vars are set (don't print values)
if [ -n "$KAGGLE_USERNAME" ]; then echo "KAGGLE_USERNAME is set"; else echo "KAGGLE_USERNAME is NOT set"; fi
if [ -n "$KAGGLE_KEY" ]; then echo "KAGGLE_KEY is set"; else echo "KAGGLE_KEY is NOT set"; fi
echo HF_TOKEN: ${HF_TOKEN}
echo KAGGLE_USERNAME: ${KAGGLE_USERNAME}
echo KAGGLE_KEY: ${KAGGLE_KEY}
python -m pytest tests/smoke_tests/model_creation_test.py -v --tb=short
- name: Run tunix cli tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
run: |
# Config tests that passed
python -m pytest tests/cli/ -v --tb=short \
--ignore=tests/cli/utils/model_test.py
- name: Run model alignment tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
python -m pip install torch
JAX_PLATFORMS=cpu python -m pytest tests/model_alignment/ -v --tb=short
unset JAX_PLATFORMS