# Copyright 2025 Google LLC

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
# Workflow name shown in the GitHub Actions UI.
name: TPU Tests
on:
  # Reusable workflow: invoked from other workflows via `uses:`. Callers may
  # forward the secrets declared here (all optional; `required` defaults to
  # false).
  workflow_call:
    secrets:
      HF_TOKEN:
        description: 'HuggingFace token for model downloads'
      KAGGLE_USERNAME:
        description: 'Kaggle Username'
      KAGGLE_KEY:
        description: 'Kaggle API Key'
concurrency:
  # Dedup pull requests (canceling previous runs of the same workflow for same PR), and scheduled runs but nothing else
  group: ${{ github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) || github.event_name == 'schedule' && format('{0}-schedule', github.workflow) || github.run_id }}
  cancel-in-progress: true
env:
  # Shared Hugging Face cache location; also used by the actions/cache step.
  HF_HOME: ~/.cache/huggingface
  # Quoted so the value stays a string for consumers that expect "1".
  HF_HUB_ENABLE_HF_TRANSFER: "1"
jobs:
  # The `run_prod` job below is intentionally disabled (commented out); it ran
  # the full test matrix on a JAX stable-stack TPU image. Kept for reference.
  # run_prod:
  #   runs-on: [linux-x86-ct5lp-224-8tpu]
  #   environment: testing
  #   container:
  #     image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
  #     options: --privileged
  #   env:
  #     CLOUD_TPU_ACCELERATOR: v5e-8
  #     JAX_PLATFORMS: tpu
  #   steps:
  #     # Cache Hugging Face hub
  #     - name: Cache HF hub
  #       uses: actions/cache@v4
  #       with:
  #         path: ~/.cache/huggingface
  #         key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
  #         restore-keys: |
  #           hf-${{ runner.os }}-
  #     - name: Checkout code
  #       uses: actions/checkout@v4
  #       with:
  #         fetch-depth: 0
  #     - name: Install tunix dependencies
  #       run: |
  #         pip install --upgrade pip
  #         pip install -e .[prod] --force-reinstall
  #         pip install pytest pytest-xdist
  #     - name: Verify TPU availability
  #       run: |
  #         python -c "
  #         import jax
  #         print(f'JAX version: {jax.__version__}')
  #         print(f'JAX devices: {jax.devices()}')
  #         # Check if we have TPU devices specifically
  #         devices = jax.devices()
  #         has_tpu = len(devices) > 0 and all(device.platform == 'tpu' for device in devices)
  #         print(f'TPU available: {has_tpu}')
  #         if not has_tpu:
  #           print('ERROR: No TPU devices found! Expected TPU devices but got:', [device.platform for device in devices])
  #           exit(1)
  #         else:
  #           print(f'SUCCESS: Found {len(devices)} TPU device(s)')
  #         "
  #     - name: Run tunix model tests
  #       run: |
  #         python -m pytest tests/models/ -v --tb=short -m "not cpu_only and not gpu_only"
  #     - name: Run tunix generation tests (PASSED only)
  #       run: |
  #         # tokenizer_adapter_test requires access to gated repo
  #         python -m pytest tests/generate/ -v --tb=short \
  #           --ignore=tests/generate/vllm_sampler_test.py \
  #           --ignore=tests/generate/vllm_driver_test.py \
  #           --ignore=tests/generate/tokenizer_adapter_test.py \
  #           --ignore=tests/generate/sglang_jax_sampler_test.py
  #     - name: Run tunix SFT tests
  #       run: |
  #         python -m pytest tests/sft/ -v --tb=short
  #     - name: Run tunix distillation tests
  #       run: |
  #         python -m pytest tests/distillation/ -v --tb=short
  #     - name: Run tunix RL tests
  #       run: |
  #         # RL common tests that passed
  #         # b/448133814: test_grpo_with_lora_model fails
  #         python -m pytest tests/rl/ -v --tb=short -k "not test_grpo_with_lora_model" --ignore=tests/rl/experimental/agentic
  #     - name: Run tunix tests not covered by the above categories
  #       run: |
  #         # This category is to catch tests added but not covered by CI yet. Whenever you add new folders under tests/, please add a new category above and skip those tests here.
  #         python -m pytest tests/ -v --tb=short --ignore=tests/perf/ --ignore=tests/model_alignment/ --ignore=tests/models/ --ignore=tests/cli/ --ignore=tests/generate/ --ignore=tests/sft/ --ignore=tests/distillation/ --ignore=tests/rl/ --ignore=tests/smoke_tests/ || code=$?
  #         if [ "${code:-0}" = "5" ]; then
  #           echo "No tests collected (expected)."
  #           exit 0
  #         else
  #           exit "${code:-0}"
  #         fi
  #
run_dev:
if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
runs-on: [linux-x86-ct5lp-224-8tpu]
environment: testing
container:
image: vllm/vllm-tpu:nightly-a9f13e53dc3511599e873225f0e5adbda07f3993
options: --privileged
env:
CLOUD_TPU_ACCELERATOR: v5e-8
JAX_PLATFORMS: tpu,cpu
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
steps:
# Cache Hugging Face hub
- name: Cache HF hub
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
restore-keys: |
hf-${{ runner.os }}-
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Tunix , tpu-inference and dependencies
run: |
echo "Current directory:"
pwd
pip install --upgrade pip setuptools wheel
# Install Tunix with dev dependencies without overwriting the vLLM dependencies.
pip install -e .[dev]
pip install transformers==4.57.1 --force-reinstall # Issue: https://github.com/google/tunix/pull/795
# - name: GRPO Integration Test
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# # Download GSM8K dataset
# mkdir -p /tmp/grpo_test/rl/grpo/data
# python3 -c "
# from datasets import load_dataset
# import json
# # Download and save GSM8K train split
# dataset = load_dataset('openai/gsm8k', 'main', split='train')
# train_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
# with open('/tmp/grpo_test/rl/grpo/data/gsm8k_train.json', 'w') as f:
# json.dump(train_data, f)
# # Download and save GSM8K test split
# dataset = load_dataset('openai/gsm8k', 'main', split='test')
# test_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
# with open('/tmp/grpo_test/rl/grpo/data/gsm8k_test.json', 'w') as f:
# json.dump(test_data, f)
# print('GSM8K dataset downloaded successfully')
# "
# # TODO(lancewang): Re-enable this test once the segfault is fixed.
# # Run GRPO demo script with minimal configuration
# # python3 scripts/grpo_demo_llama3_qwen2.py \
# # --root-dir=/tmp/grpo_test \
# # --model-version=Qwen/Qwen2.5-0.5B-Instruct \
# # --num-batches=1 \
# # --num-test-batches=1 \
# # --rollout-engine=vanilla
# - name: Run vllm tests
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# unset JAX_PLATFORMS
# pytest tests/generate/vllm_driver_test.py -v --tb=short
# pytest tests/generate/vllm_sampler_test.py --collect-only -q --no-header --no-summary --disable-warnings | grep '::' > test_collections.txt
# while read -r test; do
# pytest -s "$test" -v --tb=short
# done < test_collections.txt
# - name: Run install sglang-jax && test
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# ## because sglang-jax has codes like jax.local_devices('cpu')
# # TODO(lancewang): Re-enable this test once the bug is fixed.
# unset JAX_PLATFORMS
# pip list | egrep 'jax|flax|libtpu'
# cd ..
# git clone https://github.com/sgl-project/sglang-jax.git && cd sglang-jax/python && pip install -e . --force-reinstall && cd ../..
# pip list | egrep 'jax|flax|libtpu'
# # Install bookworm, vllm container defaults to bullseye causes segfault for sglang-jax.
# cat >/etc/apt/sources.list <<'EOF'
# deb http://deb.debian.org/debian bookworm main contrib non-free
# deb http://deb.debian.org/debian bookworm-updates main contrib non-free
# deb http://security.debian.org/debian-security bookworm-security main contrib non-free
# EOF
# apt-get update; apt-get install -y less
# cd tunix && python tests/generate/sglang_jax_sampler_test.py
# - name: Run tunix SFT integration tests
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# # Reinstall Tunix with prod dependencies
# pip install -e .[prod] --force-reinstall
# # Loading tfds requires tensorflow.
# pip install tensorflow
# export JAX_PLATFORMS=tpu,cpu
# ./tests/sft/sft_tpu_smoke_test.sh
- name: Run Smoke tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
run: |
echo "Running Smoke tests..."
# Debugging: Check if env vars are set (don't print values)
if [ -n "$KAGGLE_USERNAME" ]; then echo "KAGGLE_USERNAME is set"; else echo "KAGGLE_USERNAME is NOT set"; fi
if [ -n "$KAGGLE_KEY" ]; then echo "KAGGLE_KEY is set"; else echo "KAGGLE_KEY is NOT set"; fi
echo HF_TOKEN: ${HF_TOKEN}
echo KAGGLE_USERNAME: ${KAGGLE_USERNAME}
echo KAGGLE_KEY: ${KAGGLE_KEY}
python -m pytest tests/smoke_tests/model_creation_test.py -v --tb=short
- name: Run tunix cli tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
run: |
# Config tests that passed
python -m pytest tests/cli/ -v --tb=short \
--ignore=tests/cli/utils/model_test.py
- name: Run model alignment tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
python -m pip install torch
JAX_PLATFORMS=cpu python -m pytest tests/model_alignment/ -v --tb=short
unset JAX_PLATFORMS