[1/2] feat/add_fault_torlance #1437

Workflow file for this run

.github/workflows/pr-test.yml at 66108e0

	################################################################################
	# This file is auto-generated from the .j2 file via generate_github_workflows.py. Do not edit manually.
	################################################################################

	name: PR Test

	# Triggers:
	#   - pull requests targeting main, when commits are pushed to the PR
	#     (synchronize) or a label is added (labeled);
	#   - manual dispatch, with an optional boolean `infinite_run` input.
	on:
	  # Do not run CI on push to reduce CI time
	  # push:
	  #   branches: [main]
	  pull_request:
	    branches: [main]
	    types: [synchronize, labeled]
	  workflow_dispatch:
	    inputs:
	      infinite_run:
	        description: 'Run training infinitely'
	        required: false
	        type: boolean
	        default: false

	# One active run per workflow and pull request (falling back to the ref when
	# the event has no pull request); a newer run cancels the one in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
	  cancel-in-progress: true

	jobs:

	# Short end-to-end tests.
	# Runs on manual dispatch, or on a pull request carrying the `run-ci-short` label.
	e2e-test-short:
	  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-short'))
	  runs-on: self-hosted
	  container:
	    image: slimerl/slime:latest
	    # Docker options: expose all GPUs, share the host IPC namespace, set the
	    # proxy variables inside the container, and bind-mount the host CI
	    # directories (cache, models, datasets).
	    options: >
	      --gpus all
	      --ipc=host
	      --shm-size=16g
	      --ulimit memlock=-1
	      --ulimit stack=67108864
	      --memory=0
	      --memory-swap=0
	      -e http_proxy=$http_proxy
	      -e https_proxy=$https_proxy
	      -e HTTP_PROXY=$HTTP_PROXY
	      -e HTTPS_PROXY=$HTTPS_PROXY
	      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
	      -v /mnt/nvme0n1/slime_ci/models:/root/models
	      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
	  strategy:
	    # Let every matrix entry finish even if another one fails.
	    fail-fast: false
	    matrix:
	      # One entry per test run: GPUs to lock and the test file (plus any CLI args).
	      info:
	        - {num_gpus: 4, test_file: 'test_qwen2.5_0.5B_gsm8k_async_short.py'}
	        - {num_gpus: 4, test_file: 'test_qwen2.5_0.5B_gsm8k_short.py'}
	        - {num_gpus: 8, test_file: 'test_qwen2.5_0.5B_sglang_config.py'}
	        - {num_gpus: 8, test_file: 'test_qwen2.5_0.5B_sglang_config_distributed.py'}
	  defaults:
	    run:
	      working-directory: ${{ github.workspace }}
	  env:
	    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
	    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
	    # Only a manual dispatch can enable the infinite run; otherwise 'false'.
	    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
	    # Per-entry toggles; the `|| '...'` default applies when the matrix entry omits the key.
	    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
	    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
	    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}

	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@v6

	    - name: Install
	      shell: bash
	      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

	    # test_file is expanded unquoted on purpose: some entries carry extra CLI arguments.
	    - name: Execute
	      shell: bash
	      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}

	# FSDP end-to-end tests.
	# Runs on manual dispatch, or on a pull request carrying the `run-ci-fsdp` label.
	e2e-test-fsdp:
	  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-fsdp'))
	  runs-on: self-hosted
	  container:
	    image: slimerl/slime:latest
	    # Docker options: expose all GPUs, share the host IPC namespace, set the
	    # proxy variables inside the container, and bind-mount the host CI
	    # directories (cache, models, datasets).
	    options: >
	      --gpus all
	      --ipc=host
	      --shm-size=16g
	      --ulimit memlock=-1
	      --ulimit stack=67108864
	      --memory=0
	      --memory-swap=0
	      -e http_proxy=$http_proxy
	      -e https_proxy=$https_proxy
	      -e HTTP_PROXY=$HTTP_PROXY
	      -e HTTPS_PROXY=$HTTPS_PROXY
	      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
	      -v /mnt/nvme0n1/slime_ci/models:/root/models
	      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
	  strategy:
	    # Let every matrix entry finish even if another one fails.
	    fail-fast: false
	    matrix:
	      # One entry per test run: GPUs to lock and the test file (plus any CLI args).
	      info:
	        - {num_gpus: 4, test_file: 'test_qwen3_4B_fsdp_true_on_policy.py --colocated'}
	        - {num_gpus: 8, test_file: 'test_qwen3_vl_4B_fsdp.py'}
	        - {num_gpus: 4, test_file: 'test_qwen3_0.6B_megatron_fsdp_align.py'}
	  defaults:
	    run:
	      working-directory: ${{ github.workspace }}
	  env:
	    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
	    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
	    # Only a manual dispatch can enable the infinite run; otherwise 'false'.
	    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
	    # Per-entry toggles; the `|| '...'` default applies when the matrix entry omits the key.
	    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
	    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
	    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}

	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@v6

	    - name: Install
	      shell: bash
	      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

	    # test_file is expanded unquoted on purpose: some entries carry extra CLI arguments.
	    - name: Execute
	      shell: bash
	      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}

	# Megatron end-to-end tests.
	# Runs on manual dispatch, or on a pull request carrying the `run-ci-megatron` label.
	e2e-test-megatron:
	  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-megatron'))
	  runs-on: self-hosted
	  container:
	    image: slimerl/slime:latest
	    # Docker options: expose all GPUs, share the host IPC namespace, set the
	    # proxy variables inside the container, and bind-mount the host CI
	    # directories (cache, models, datasets).
	    options: >
	      --gpus all
	      --ipc=host
	      --shm-size=16g
	      --ulimit memlock=-1
	      --ulimit stack=67108864
	      --memory=0
	      --memory-swap=0
	      -e http_proxy=$http_proxy
	      -e https_proxy=$https_proxy
	      -e HTTP_PROXY=$HTTP_PROXY
	      -e HTTPS_PROXY=$HTTPS_PROXY
	      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
	      -v /mnt/nvme0n1/slime_ci/models:/root/models
	      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
	  strategy:
	    # Let every matrix entry finish even if another one fails.
	    fail-fast: false
	    matrix:
	      # One entry per test run. Optional keys (use_deepep, use_fp8_rollout,
	      # enable_eval) feed the SLIME_TEST_* variables in `env` below.
	      info:
	        - {num_gpus: 8, test_file: 'test_quick_start_glm4_9B.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_30B_A3B.py', use_deepep: '1', use_fp8_rollout: '1'}
	        - {enable_eval: '0', num_gpus: 8, test_file: 'test_qwen3_30B_A3B_r3.py', use_deepep: '1', use_fp8_rollout: '1'}
	        - {enable_eval: '0', num_gpus: 8, test_file: 'test_qwen3_30B_A3B_r3.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_4B_ppo.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_4B_ppo_train_critic_only.py'}
	        - {num_gpus: 8, test_file: 'test_moonlight_16B_A3B.py'}
	        - {enable_eval: '0', num_gpus: 8, test_file: 'test_moonlight_16B_A3B_r3.py'}
	        - {num_gpus: 8, test_file: 'test_mimo_7B_mtp_only_grad.py'}
	        - {num_gpus: 8, test_file: 'test_qwen2.5_0.5B_debug_rollout_then_train.py'}
	        - {num_gpus: 8, test_file: 'test_qwen2.5_0.5B_opd_sglang.py'}
	  defaults:
	    run:
	      working-directory: ${{ github.workspace }}
	  env:
	    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
	    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
	    # Only a manual dispatch can enable the infinite run; otherwise 'false'.
	    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
	    # Per-entry toggles; the `|| '...'` default applies when the matrix entry omits the key.
	    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
	    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
	    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}

	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@v6

	    - name: Install
	      shell: bash
	      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

	    - name: Execute
	      shell: bash
	      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}

	# Precision end-to-end tests.
	# Runs on manual dispatch, or on a pull request carrying the `run-ci-precision` label.
	e2e-test-precision:
	  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-precision'))
	  runs-on: self-hosted
	  container:
	    image: slimerl/slime:latest
	    # Docker options: expose all GPUs, share the host IPC namespace, set the
	    # proxy variables inside the container, and bind-mount the host CI
	    # directories (cache, models, datasets).
	    options: >
	      --gpus all
	      --ipc=host
	      --shm-size=16g
	      --ulimit memlock=-1
	      --ulimit stack=67108864
	      --memory=0
	      --memory-swap=0
	      -e http_proxy=$http_proxy
	      -e https_proxy=$https_proxy
	      -e HTTP_PROXY=$HTTP_PROXY
	      -e HTTPS_PROXY=$HTTPS_PROXY
	      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
	      -v /mnt/nvme0n1/slime_ci/models:/root/models
	      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
	  strategy:
	    # Let every matrix entry finish even if another one fails.
	    fail-fast: false
	    matrix:
	      # One entry per test run: GPUs to lock and the test file.
	      info:
	        - {num_gpus: 8, test_file: 'test_qwen3_0.6B_parallel_check.py'}
	        - {num_gpus: 4, test_file: 'test_qwen3_0.6B_megatron_fsdp_align.py'}
	  defaults:
	    run:
	      working-directory: ${{ github.workspace }}
	  env:
	    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
	    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
	    # Only a manual dispatch can enable the infinite run; otherwise 'false'.
	    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
	    # Per-entry toggles; the `|| '...'` default applies when the matrix entry omits the key.
	    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
	    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
	    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}

	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@v6

	    - name: Install
	      shell: bash
	      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

	    - name: Execute
	      shell: bash
	      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}

	# Checkpoint end-to-end tests.
	# Runs on manual dispatch, or on a pull request carrying the `run-ci-ckpt` label.
	e2e-test-ckpt:
	  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-ckpt'))
	  runs-on: self-hosted
	  container:
	    image: slimerl/slime:latest
	    # Docker options: expose all GPUs, share the host IPC namespace, set the
	    # proxy variables inside the container, and bind-mount the host CI
	    # directories (cache, models, datasets).
	    options: >
	      --gpus all
	      --ipc=host
	      --shm-size=16g
	      --ulimit memlock=-1
	      --ulimit stack=67108864
	      --memory=0
	      --memory-swap=0
	      -e http_proxy=$http_proxy
	      -e https_proxy=$https_proxy
	      -e HTTP_PROXY=$HTTP_PROXY
	      -e HTTPS_PROXY=$HTTPS_PROXY
	      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
	      -v /mnt/nvme0n1/slime_ci/models:/root/models
	      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
	  strategy:
	    # Let every matrix entry finish even if another one fails.
	    fail-fast: false
	    matrix:
	      # The same test file is run twice: without and with `--async-save`.
	      info:
	        - {num_gpus: 8, test_file: 'test_qwen3_4B_ckpt.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_4B_ckpt.py --async-save'}
	  defaults:
	    run:
	      working-directory: ${{ github.workspace }}
	  env:
	    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
	    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
	    # Only a manual dispatch can enable the infinite run; otherwise 'false'.
	    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
	    # Per-entry toggles; the `|| '...'` default applies when the matrix entry omits the key.
	    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
	    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
	    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}

	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@v6

	    - name: Install
	      shell: bash
	      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

	    # test_file is expanded unquoted on purpose: one entry carries an extra CLI argument.
	    - name: Execute
	      shell: bash
	      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}

	# Runs a broad set of test files inside the `slimerl/slime-test:latest` image.
	# Runs on manual dispatch, or on a pull request carrying the `run-ci-image` label.
	e2e-test-image:
	  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image'))
	  runs-on: self-hosted
	  container:
	    image: slimerl/slime-test:latest
	    # Docker options: expose all GPUs, share the host IPC namespace, set the
	    # proxy variables inside the container, and bind-mount the host CI
	    # directories (cache, models, datasets).
	    options: >
	      --gpus all
	      --ipc=host
	      --shm-size=16g
	      --ulimit memlock=-1
	      --ulimit stack=67108864
	      --memory=0
	      --memory-swap=0
	      -e http_proxy=$http_proxy
	      -e https_proxy=$https_proxy
	      -e HTTP_PROXY=$HTTP_PROXY
	      -e HTTPS_PROXY=$HTTPS_PROXY
	      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
	      -v /mnt/nvme0n1/slime_ci/models:/root/models
	      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
	  strategy:
	    # Let every matrix entry finish even if another one fails.
	    fail-fast: false
	    matrix:
	      # One entry per test run: GPUs to lock and the test file (plus any CLI args).
	      info:
	        - {num_gpus: 4, test_file: 'test_qwen2.5_0.5B_gsm8k_async_short.py'}
	        - {num_gpus: 4, test_file: 'test_qwen2.5_0.5B_gsm8k_short.py'}
	        - {num_gpus: 2, test_file: 'test_qwen3_4B_fsdp_true_on_policy.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_vl_4B_fsdp.py'}
	        - {num_gpus: 8, test_file: 'test_quick_start_glm4_9B.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_30B_A3B.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_4B_ppo.py'}
	        - {num_gpus: 8, test_file: 'test_moonlight_16B_A3B.py'}
	        - {num_gpus: 8, test_file: 'test_mimo_7B_mtp_only_grad.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_0.6B_parallel_check.py'}
	        - {num_gpus: 4, test_file: 'test_qwen3_0.6B_megatron_fsdp_align.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_4B_ckpt.py'}
	        - {num_gpus: 8, test_file: 'test_qwen3_4B_ckpt.py --async-save'}
	        - {num_gpus: 8, test_file: 'test_qwen2.5_0.5B_debug_rollout_then_train.py'}
	        - {num_gpus: 8, test_file: 'test_qwen2.5_0.5B_opd_sglang.py'}
	  defaults:
	    run:
	      working-directory: ${{ github.workspace }}
	  env:
	    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
	    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
	    # Only a manual dispatch can enable the infinite run; otherwise 'false'.
	    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
	    # Per-entry toggles; the `|| '...'` default applies when the matrix entry omits the key.
	    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
	    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
	    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}

	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@v6

	    - name: Install
	      shell: bash
	      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

	    # test_file is expanded unquoted on purpose: one entry carries an extra CLI argument.
	    - name: Execute
	      shell: bash
	      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}


	# Builds the job matrix consumed by `e2e-test-changed`: one entry per test file
	# under tests/ that was added or modified relative to origin/main.
	# Runs on manual dispatch, or on a pull request carrying the `run-ci-changed` label.
	# Outputs:
	#   matrix    - JSON object of the form {"info": [{"test_file": ..., "num_gpus": ...}, ...]}
	#   has_tests - 'true' when at least one test file changed, otherwise 'false'
	e2e-test-changed-detect:
	  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-changed'))
	  runs-on: self-hosted
	  container:
	    image: slimerl/slime:latest
	    options: >
	      --gpus all
	      --ipc=host
	      --shm-size=16g
	      --ulimit memlock=-1
	      --ulimit stack=67108864
	      --memory=0
	      --memory-swap=0
	  outputs:
	    matrix: ${{ steps.detect.outputs.matrix }}
	    has_tests: ${{ steps.detect.outputs.has_tests }}
	  steps:
	    - name: Checkout repository
	      # Same major version as the other jobs in this workflow.
	      uses: actions/checkout@v6
	      with:
	        # Full history: the three-dot diff below needs the merge base with origin/main.
	        fetch-depth: 0

	    - name: Detect changed tests
	      id: detect
	      shell: bash
	      run: |
	        # Added (A) or modified (M) test files only; a failing diff yields an empty list.
	        CHANGED=$(git diff --name-only --diff-filter=AM origin/main...HEAD -- 'tests/test_*.py' || true)
	        if [ -z "$CHANGED" ]; then
	          echo "No new or modified test files found."
	          echo "has_tests=false" >> "$GITHUB_OUTPUT"
	          echo 'matrix={"info":[]}' >> "$GITHUB_OUTPUT"
	        else
	          echo "Changed test files:"
	          echo "$CHANGED"
	          MATRIX="["
	          FIRST=true
	          for filepath in $CHANGED; do
	            filename=$(basename "$filepath")
	            # Extract NUM_GPUS from the test file, default to 8.
	            # `|| true` is required: `shell: bash` runs with `-eo pipefail`, so a
	            # grep that finds no match would otherwise abort the step before the
	            # default below is applied. Any amount of whitespace around `=` is accepted.
	            NGPU=$(grep -oP '^NUM_GPUS\s*=\s*\K\d+' "$filepath" | head -1 || true)
	            NGPU=${NGPU:-8}
	            if [ "$FIRST" = true ]; then FIRST=false; else MATRIX+=","; fi
	            MATRIX+="{\"test_file\":\"$filename\",\"num_gpus\":$NGPU}"
	          done
	          MATRIX+="]"
	          echo "has_tests=true" >> "$GITHUB_OUTPUT"
	          echo "matrix={\"info\":$MATRIX}" >> "$GITHUB_OUTPUT"
	          echo "Generated matrix: $MATRIX"
	        fi

	# Runs every test file reported by `e2e-test-changed-detect`.
	# Skipped unless that job reported has_tests == 'true'.
	e2e-test-changed:
	  needs: e2e-test-changed-detect
	  if: needs.e2e-test-changed-detect.outputs.has_tests == 'true'
	  runs-on: self-hosted
	  container:
	    image: slimerl/slime:latest
	    # Docker options: expose all GPUs, share the host IPC namespace, set the
	    # proxy variables inside the container, and bind-mount the host CI
	    # directories (cache, models, datasets).
	    options: >
	      --gpus all
	      --ipc=host
	      --shm-size=16g
	      --ulimit memlock=-1
	      --ulimit stack=67108864
	      --memory=0
	      --memory-swap=0
	      -e http_proxy=$http_proxy
	      -e https_proxy=$https_proxy
	      -e HTTP_PROXY=$HTTP_PROXY
	      -e HTTPS_PROXY=$HTTPS_PROXY
	      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
	      -v /mnt/nvme0n1/slime_ci/models:/root/models
	      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
	  strategy:
	    # Let every matrix entry finish even if another one fails.
	    fail-fast: false
	    # Matrix is the JSON produced by the detect job: {"info": [{test_file, num_gpus}, ...]}.
	    matrix: ${{ fromJson(needs.e2e-test-changed-detect.outputs.matrix) }}
	  defaults:
	    run:
	      working-directory: ${{ github.workspace }}
	  env:
	    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
	    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
	    # Only a manual dispatch can enable the infinite run; otherwise 'false'.
	    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
	    # The `|| '...'` default applies when the matrix entry omits the key.
	    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
	    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
	    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}

	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@v6

	    - name: Install
	      shell: bash
	      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

	    # The matrix values originate from file names in the pull request, so they
	    # are handed to the shell through environment variables instead of being
	    # interpolated into the script text (prevents script injection).
	    - name: Execute
	      shell: bash
	      env:
	        CI_NUM_GPUS: ${{ matrix.info.num_gpus }}
	        CI_TEST_FILE: ${{ matrix.info.test_file }}
	      run: python tests/ci/gpu_lock_exec.py --count "$CI_NUM_GPUS" -- python "tests/$CI_TEST_FILE"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[1/2] feat/add_fault_torlance #1437

Workflow file

[1/2] feat/add_fault_torlance #1437

Uh oh!

Workflow file for this run