CI/CD - Initiate CI/CD pipeline for unit tests (#37)

yzygitzh · web-flow · commit 4a292eddfb70 · 2025-06-07T23:01:05.000+08:00
This PR initiates CI/CD pipeline for unit tests, including:

* Workflow YAML to run unit tests on LTP
* Scripts to run unit tests on single-node NVIDIA H200 and AMD MI300X platforms, excluding test cases that currently fail
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -0,0 +1,54 @@
+# Runs the unit tests for the NVIDIA H200 and AMD MI300X jobs on pushes and
+# pull requests targeting dev, canary/* and release/*, and on published releases.
+name: unit-tests
+
+on:
+  push:
+    branches:
+      - dev
+      - canary/*
+      - release/*
+  pull_request:
+    branches:
+      - dev
+      - canary/*
+      - release/*
+  release:
+    types:
+      - published
+
+jobs:
+  nvidia-h200-1n8g:
+    name: nvidia-h200-1n8g
+    runs-on: unit-tests
+    timeout-minutes: 120
+    permissions:
+      contents: read
+    # fail-fast only affects matrix jobs; it is inert until a matrix is added.
+    strategy:
+      fail-fast: true
+    steps:
+      - name: Checkout target branch
+        uses: actions/checkout@v4
+      - name: Test target branch
+        # LTP_RUN_CI_SCRIPT_PATH is not defined in this workflow; presumably the
+        # runner environment provides it (TODO confirm).
+        run: |
+          bash "${LTP_RUN_CI_SCRIPT_PATH}" unit-tests-nvidia-h200-1n8g
+  amd-mi300x-1n8g:
+    name: amd-mi300x-1n8g
+    runs-on: unit-tests
+    timeout-minutes: 120
+    permissions:
+      contents: read
+    # fail-fast only affects matrix jobs; it is inert until a matrix is added.
+    strategy:
+      fail-fast: true
+    steps:
+      - name: Checkout target branch
+        uses: actions/checkout@v4
+      - name: Test target branch
+        # LTP_RUN_CI_SCRIPT_PATH is not defined in this workflow; presumably the
+        # runner environment provides it (TODO confirm).
+        run: |
+          bash "${LTP_RUN_CI_SCRIPT_PATH}" unit-tests-amd-mi300x-1n8g
diff --git a/tests/test_utils/ltp_scripts/run_unit_tests_amd_mi300x_1n8g.sh b/tests/test_utils/ltp_scripts/run_unit_tests_amd_mi300x_1n8g.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+# Unit-test driver for the single-node AMD MI300X platform. Runs
+# tests/unit_tests through torchrun (8 processes) in several passes so that
+# known-failing or hanging cases can be excluded per category.
+
+# Abort on the first command that fails.
+set -e
+
+pip install -r requirements_ci.txt
+pip install mock
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export HIP_FORCE_DEV_KERNARG=1
+export HSA_ENABLE_SDMA=1
+export HSA_NO_SCRATCH_RECLAIM=1
+export NCCL_DEBUG=WARN
+export NCCL_SOCKET_IFNAME=eth0
+export RCCL_MSCCL_ENABLE=0
+
+# Launcher options shared by every torchrun invocation below.
+TORCHRUN_ARGS=(
+  --nproc_per_node 8
+  --nnodes 1
+  --node_rank 0
+  --master_addr localhost
+  --master_port 50326
+)
+
+# Coverage options shared by every pytest invocation below.
+PYTEST_COV_ARGS=(
+  --cov-branch
+  --cov megatron
+  --cov-append
+  --no-cov-on-fail
+)
+
+# Force-kill every process whose ps line contains "python", then wait 10 s.
+# A failing kill pipeline is ignored so the script keeps going under set -e.
+clear_previous_runs() {
+    ps axu | grep '[p]ython' | awk '{print $2}' | xargs -r -n 1 kill -9 2>/dev/null || true
+    sleep 10
+}
+
+# Exclude test categories that fail to pass in the full test.
+# Some test cases fail in:
+# - data
+# - dist_checkpointing
+# - models
+# - test_checkpointing
+# - test_parallel_state
+# - transformer
+# All test cases fail in:
+# - inference/engines/test_dynamic_engine.py
+# Hangs in full test but passes in separate run:
+# - distributed/test_torch_fully_sharded_parallel.py
+# - ssm/test_mamba_hybrid_layer_allocation.py
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --ignore tests/unit_tests/data \
+  --ignore tests/unit_tests/dist_checkpointing \
+  --ignore tests/unit_tests/distributed/test_torch_fully_sharded_parallel.py \
+  --ignore tests/unit_tests/inference/engines/test_dynamic_engine.py \
+  --ignore tests/unit_tests/models \
+  --ignore tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py \
+  --ignore tests/unit_tests/test_checkpointing.py \
+  --ignore tests/unit_tests/test_parallel_state.py \
+  --ignore tests/unit_tests/transformer \
+  tests/unit_tests
+
+clear_previous_runs
+disable_pattern="not test_preprocess_data_bert"
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  -k "${disable_pattern}" \
+  tests/unit_tests/data
+
+clear_previous_runs
+disable_pattern="not test_dp_sharding and "
+disable_pattern+="not test_errors_are_reported and "
+disable_pattern+="not test_memory_usage and "
+disable_pattern+="not test_remove_sharded_tensors and "
+disable_pattern+="not test_te_grouped_linear_torch_native"
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  -k "${disable_pattern}" \
+  tests/unit_tests/dist_checkpointing
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  tests/unit_tests/distributed/test_torch_fully_sharded_parallel.py
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --deselect "tests/unit_tests/models/test_bert_model.py::TestBertModelAttentionDimensions::test_transformer_engine_version_1_7_to_1_10_rng_error" \
+  --deselect "tests/unit_tests/models/test_clip_vit_model.py::TestCLIPViTModel::test_save_load" \
+  --deselect "tests/unit_tests/models/test_llava_model.py::TestLLaVAModel::test_save_load" \
+  --deselect "tests/unit_tests/models/test_mamba_model.py::TestMambaModel::test_save_load" \
+  --deselect "tests/unit_tests/models/test_multimodal_projector.py::TestMultimodalProjector::test_save_load" \
+  --deselect "tests/unit_tests/models/test_radio_model.py::TestRADIOViTModel::test_save_load" \
+  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_forward_output_encoder_hidden_only" \
+  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_forward_with_encoder_hidden_states" \
+  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_post_process_forward" \
+  tests/unit_tests/models
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --deselect "tests/unit_tests/test_checkpointing.py::test_load_checkpoint[torch]" \
+  --deselect "tests/unit_tests/test_checkpointing.py::test_save_checkpoint[torch]" \
+  --deselect "tests/unit_tests/test_checkpointing.py::test_save_checkpoint[torch_dcp]" \
+  tests/unit_tests/test_checkpointing.py
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp3-2]" \
+  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp4-2]" \
+  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp5-2]" \
+  tests/unit_tests/test_parallel_state.py
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --deselect "tests/unit_tests/transformer/test_retro_attention.py::TestRetroAttention::test_gpu_forward" \
+  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_gpu_forward" \
+  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_fused_rope_gpu_forward" \
+  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_checkpointed_gpu_forward" \
+  --ignore "tests/unit_tests/transformer/moe/test_moe_layer_discrepancy.py" \
+  tests/unit_tests/transformer
diff --git a/tests/test_utils/ltp_scripts/run_unit_tests_nvidia_h200_1n8g.sh b/tests/test_utils/ltp_scripts/run_unit_tests_nvidia_h200_1n8g.sh
@@ -0,0 +1,143 @@
+#!/bin/bash
+# Unit-test driver for the single-node NVIDIA H200 platform. Runs
+# tests/unit_tests through torchrun (8 processes) in several passes so that
+# known-failing cases can be excluded per category.
+
+# Abort on the first command that fails.
+set -e
+
+pip install -r requirements_ci.txt
+CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1
+pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.2
+MAMBA_FORCE_BUILD=TRUE pip install git+https://github.com/state-spaces/mamba.git@v2.2.0
+# NOTE(review): presumably removes the distro blinker package so the pip
+# installs below can provide their own copy -- confirm.
+apt purge -y python3-blinker
+pip install flask flask-restful tiktoken tensorstore
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_DEBUG=WARN
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_NVLS_ENABLE=0
+
+# Launcher options shared by every torchrun invocation below.
+TORCHRUN_ARGS=(
+  --nproc_per_node 8
+  --nnodes 1
+  --node_rank 0
+  --master_addr localhost
+  --master_port 50326
+)
+
+# Coverage options shared by every pytest invocation below.
+PYTEST_COV_ARGS=(
+  --cov-branch
+  --cov megatron
+  --cov-append
+  --no-cov-on-fail
+)
+
+# Force-kill every process whose ps line contains "python", then wait 10 s.
+# A failing kill pipeline is ignored so the script keeps going under set -e.
+clear_previous_runs() {
+    ps axu | grep '[p]ython' | awk '{print $2}' | xargs -r -n 1 kill -9 2>/dev/null || true
+    sleep 10
+}
+
+# Exclude test categories that fail to pass in the full test.
+# Some test cases fail in:
+# - data
+# - dist_checkpointing
+# - models
+# - test_checkpointing
+# - test_parallel_state
+# - test_tokenizer.py
+# - transformer
+# All test cases fail in:
+# - inference/engines/test_dynamic_engine.py
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --ignore tests/unit_tests/data \
+  --ignore tests/unit_tests/dist_checkpointing \
+  --ignore tests/unit_tests/inference/engines/test_dynamic_engine.py \
+  --ignore tests/unit_tests/models \
+  --ignore tests/unit_tests/test_checkpointing.py \
+  --ignore tests/unit_tests/test_parallel_state.py \
+  --ignore tests/unit_tests/test_tokenizer.py \
+  --ignore tests/unit_tests/transformer \
+  tests/unit_tests
+
+clear_previous_runs
+disable_pattern="not test_preprocess_data_bert"
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  -k "${disable_pattern}" \
+  tests/unit_tests/data
+
+clear_previous_runs
+disable_pattern="not test_dp_sharding and "
+disable_pattern+="not test_memory_usage and "
+disable_pattern+="not test_remove_sharded_tensors"
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  -k "${disable_pattern}" \
+  tests/unit_tests/dist_checkpointing
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --deselect "tests/unit_tests/models/test_bert_model.py::TestBertModelAttentionDimensions::test_transformer_engine_version_1_7_to_1_10_rng_error" \
+  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_forward_output_encoder_hidden_only" \
+  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_forward_with_encoder_hidden_states" \
+  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_post_process_forward" \
+  tests/unit_tests/models
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --deselect "tests/unit_tests/test_checkpointing.py::test_load_checkpoint[torch]" \
+  --deselect "tests/unit_tests/test_checkpointing.py::test_save_checkpoint[torch]" \
+  --deselect "tests/unit_tests/test_checkpointing.py::test_save_checkpoint[torch_dcp]" \
+  tests/unit_tests/test_checkpointing.py
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp3-2]" \
+  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp4-2]" \
+  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp5-2]" \
+  tests/unit_tests/test_parallel_state.py
+
+clear_previous_runs
+disable_pattern="not test_gpt2_tiktok_tokenizer"
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  -k "${disable_pattern}" \
+  tests/unit_tests/test_tokenizer.py
+
+clear_previous_runs
+torchrun \
+  "${TORCHRUN_ARGS[@]}" \
+  -m pytest -vxs \
+  "${PYTEST_COV_ARGS[@]}" \
+  --deselect "tests/unit_tests/transformer/test_retro_attention.py::TestRetroAttention::test_gpu_forward" \
+  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_gpu_forward" \
+  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_fused_rope_gpu_forward" \
+  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_checkpointed_gpu_forward" \
+  tests/unit_tests/transformer