# NOTE: captured from the GitHub Actions "Workflow file for this run" view (run title: "fix", "fix #39").

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Reusable workflow (triggered via `workflow_call`) that installs the Python dependencies and runs the TPU test job inside a container.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: TPU Tests

# Reusable workflow: it only runs when another workflow calls it, and the
# caller must pass the HF_TOKEN secret.
on:
  workflow_call:
    secrets:
      HF_TOKEN:
        required: true
        description: 'HuggingFace token for model downloads'

concurrency:
  # Dedup pull requests (canceling previous runs of the same workflow for same PR), and scheduled runs but nothing else
  # NOTE(review): in a called workflow the `github` context belongs to the
  # caller, so this group name is derived from the caller's workflow name and
  # event. If the caller defines the same group, the runs can cancel each
  # other - confirm against the calling workflow.
  group: ${{ github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) || github.event_name == 'schedule' && format('{0}-schedule', github.workflow) || github.run_id }}
  cancel-in-progress: true

# Workflow-level environment shared by every job and step.
env:
  # The value is passed to processes literally; `~` is presumably expanded by
  # the Hugging Face libraries - TODO confirm.
  HF_HOME: ~/.cache/huggingface
  HF_HUB_ENABLE_HF_TRANSFER: "1"

jobs:
# run_prod:
# runs-on: [linux-x86-ct5lp-224-8tpu]
# environment: testing
# container:
# image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:jax0.7.1_rev1
# options: --privileged
# env:
# CLOUD_TPU_ACCELERATOR: v5e-8
# JAX_PLATFORMS: tpu
# steps:
#
# # Cache Hugging Face hub
# - name: Cache HF hub
# uses: actions/cache@v4
# with:
# path: ~/.cache/huggingface
# key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
# restore-keys: |
# hf-${{ runner.os }}-
#
# - name: Checkout code
# uses: actions/checkout@v4
# with:
# fetch-depth: 0
#
# - name: Install tunix dependencies
# run: |
# pip install -e .[prod]
# pip install pytest pytest-xdist
#
# - name: Verify TPU availability
# run: |
# python -c "
# import jax
# print(f'JAX version: {jax.__version__}')
# print(f'JAX devices: {jax.devices()}')
#
# # Check if we have TPU devices specifically
# devices = jax.devices()
# has_tpu = len(devices) > 0 and all(device.platform == 'tpu' for device in devices)
# print(f'TPU available: {has_tpu}')
#
# if not has_tpu:
# print('ERROR: No TPU devices found! Expected TPU devices but got:', [device.platform for device in devices])
# exit(1)
# else:
# print(f'SUCCESS: Found {len(devices)} TPU device(s)')
# "
#
# - name: Run tunix model tests
# run: |
# python -m pytest tests/models/ -v --tb=short -m "not cpu_only and not gpu_only"
#
# - name: Run tunix cli tests
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
# KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
# run: |
# # Config tests that passed
# python -m pytest tests/cli/ -v --tb=short \
# --ignore=tests/cli/utils/model_test.py
#
# - name: Run tunix generation tests (PASSED only)
# run: |
# # tokenizer_adapter_test requires access to gated repo
# python -m pytest tests/generate/ -v --tb=short \
# --ignore=tests/generate/vllm_sampler_test.py \
# --ignore=tests/generate/vllm_driver_test.py \
# --ignore=tests/generate/tokenizer_adapter_test.py \
# --ignore=tests/generate/sglang_jax_sampler_test.py
#
# - name: Run tunix SFT tests
# run: |
# python -m pytest tests/sft/ -v --tb=short
#
# - name: Run tunix SFT integration tests
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
# ./tests/sft/sft_tpu_smoke_test.sh
#
# - name: Run tunix distillation tests
# run: |
# python -m pytest tests/distillation/ -v --tb=short
#
# - name: Run tunix RL tests
# run: |
# # RL common tests that passed
# # b/448133814: test_grpo_with_lora_model fails
# python -m pytest tests/rl/ -v --tb=short -k "not test_grpo_with_lora_model" --ignore=tests/rl/experimental/agentic
#
# - name: GRPO Integration Test
# env:
# HF_TOKEN: ${{ secrets.HF_TOKEN }}
# run: |
#
# # Download GSM8K dataset
# mkdir -p /tmp/grpo_test/rl/grpo/data
# python3 -c "
# from datasets import load_dataset
# import json
#
# # Download and save GSM8K train split
# dataset = load_dataset('openai/gsm8k', 'main', split='train')
# train_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
# with open('/tmp/grpo_test/rl/grpo/data/gsm8k_train.json', 'w') as f:
# json.dump(train_data, f)
#
# # Download and save GSM8K test split
# dataset = load_dataset('openai/gsm8k', 'main', split='test')
# test_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
# with open('/tmp/grpo_test/rl/grpo/data/gsm8k_test.json', 'w') as f:
# json.dump(test_data, f)
#
# print('GSM8K dataset downloaded successfully')
# "
#
# # Run GRPO demo script with minimal configuration
# python3 scripts/grpo_demo_llama3_qwen2.py \
# --root-dir=/tmp/grpo_test \
# --model-version=Qwen/Qwen2.5-0.5B-Instruct \
# --num-batches=8 \
# --num-test-batches=4 \
# --rollout-engine=vanilla
#
# - name: Run tunix tests not covered by the above categories
# run: |
# # This category is to catch tests added but not covered by CI yet. Whenever you add new folders under tests/, please add a new category above and skip those tests here.
# python -m pytest tests/ -v --tb=short --ignore=tests/models/ --ignore=tests/cli/ --ignore=tests/generate/ --ignore=tests/sft/ --ignore=tests/distillation/ --ignore=tests/rl/ || code=$?
# if [ "${code:-0}" = "5" ]; then
# echo "No tests collected (expected)."
# exit 0
# else
# exit "${code:-0}"
# fi
# Dev job: installs Tunix and sglang-jax inside the vllm-tpu container and runs the sglang-jax sampler test.
  run_dev:
    runs-on: [linux-x86-ct5lp-224-8tpu]
    environment: testing
    container:
      image: vllm/vllm-tpu:v0.11.1
      options: --privileged
    # NOTE(review): placed at job level; the flattened source does not show
    # whether this was `container.env` instead - confirm.
    env:
      CLOUD_TPU_ACCELERATOR: v5e-8
      JAX_PLATFORMS: tpu
    steps:
      # Checkout must come before the cache step: hashFiles() in the cache key
      # hashes files in the workspace, which is only populated after checkout.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # Cache Hugging Face hub
      - name: Cache HF hub
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
          restore-keys: |
            hf-${{ runner.os }}-

      - name: Setup Tunix , tpu-inference and dependencies
        run: |
          echo "Current directory:"
          pwd
          pip install --upgrade pip setuptools wheel
          # Install Tunix
          # Remove the listed preinstalled packages first so that the Tunix
          # install below resolves its own versions.
          pip uninstall torch torch-xla libtpu jax jaxlib -y
          # Quoted so the shell never treats the brackets as a glob pattern.
          pip install -e ".[dev]"
          # Install tpu-inference
          # pip uninstall torch libtpu jax jaxlib -y
          # pip install tpu-inference==v0.11.1 --force-reinstall
          pip install pytest pytest-xdist

      # - name: Run tests
      #   env:
      #     HF_TOKEN: ${{ secrets.HF_TOKEN }}
      #   run: |
      #     pytest tests/generate/vllm_driver_test.py -v --tb=short
      #     pytest tests/generate/vllm_sampler_test.py --collect-only -q --no-header --no-summary --disable-warnings | grep '::' > test_collections.txt
      #     while read -r test; do
      #       pytest "$test" -v --tb=short
      #     done < test_collections.txt

      - name: Run install sglang-jax && test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # Quoted: environment values are strings, not YAML booleans.
          DEBUG: "true"
        run: |
          # Clone into the per-job temp directory instead of the parent of the
          # workspace, so the step does not depend on the checkout directory
          # being named "tunix" or on leftovers from an earlier run.
          git clone https://github.com/sgl-project/sglang-jax.git "${RUNNER_TEMP}/sglang-jax"
          pip install -e "${RUNNER_TEMP}/sglang-jax/python"
          # Runs from the default working directory (the checked-out repo).
          python -m pytest tests/generate/sglang_jax_sampler_test.py -v --tb=long