Skip to content

Commit 4a292ed

Browse files
authored
CI/CD - Initiate CI/CD pipeline for unit tests (#37)
This PR initiates a CI/CD pipeline for unit tests, including:
* A workflow YAML to run unit tests on LTP
* Scripts to run unit tests on single-node NVIDIA H200 and AMD MI300X platforms, excluding the currently failing cases
1 parent 4c5b401 commit 4a292ed

File tree

3 files changed

+323
-0
lines changed

3 files changed

+323
-0
lines changed

.github/workflows/unit-tests.yml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
# Runs the unit-test suites on pushes and pull requests targeting dev,
# canary/* and release/* branches, and whenever a release is published.
name: unit-tests

on:
  push:
    branches:
      - dev
      - canary/*
      - release/*
  pull_request:
    branches:
      - dev
      - canary/*
      - release/*
  release:
    types:
      - published

jobs:
  # Both jobs check out the repository and then hand a job identifier to the
  # script at LTP_RUN_CI_SCRIPT_PATH. That variable is not defined in this
  # workflow; presumably it is provided by the runner environment -- confirm.
  nvidia-h200-1n8g:
    name: nvidia-h200-1n8g
    runs-on: unit-tests
    timeout-minutes: 120
    permissions:
      contents: read
    strategy:
      fail-fast: true
    steps:
      - name: Checkout target branch
        uses: actions/checkout@v3
      - name: Test target branch
        run: |
          bash "${LTP_RUN_CI_SCRIPT_PATH}" unit-tests-nvidia-h200-1n8g
  amd-mi300x-1n8g:
    name: amd-mi300x-1n8g
    runs-on: unit-tests
    timeout-minutes: 120
    permissions:
      contents: read
    strategy:
      fail-fast: true
    steps:
      - name: Checkout target branch
        uses: actions/checkout@v3
      - name: Test target branch
        run: |
          bash "${LTP_RUN_CI_SCRIPT_PATH}" unit-tests-amd-mi300x-1n8g
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
# Unit-test CI script: installs the CI requirements, then runs the unit tests
# under torchrun (8 processes on a single node), one pytest invocation per
# test group, accumulating coverage for the `megatron` package.
# NOTE(review): the HIP_/HSA_/RCCL_ settings suggest this is the AMD variant
# of the script -- confirm against the file name.
set -e

pip install -r requirements_ci.txt
pip install mock

export CUDA_DEVICE_MAX_CONNECTIONS=1
export HIP_FORCE_DEV_KERNARG=1
export HSA_ENABLE_SDMA=1
export HSA_NO_SCRATCH_RECLAIM=1
export NCCL_DEBUG=WARN
export NCCL_SOCKET_IFNAME=eth0
export RCCL_MSCCL_ENABLE=0

# Launcher arguments shared by every torchrun invocation.
TORCHRUN_ARGS=(
  --nproc_per_node 8
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 50326
)

# Coverage arguments shared by every pytest invocation; --cov-append lets the
# separate invocations below add to the same coverage data.
PYTEST_COV_ARGS=(
  --cov-branch
  --cov megatron
  --cov-append
  --no-cov-on-fail
)

#######################################
# Force-kill every process whose command line contains "python", then wait
# 10 seconds before returning.
# Best effort by design: kill errors are silenced and the pipeline's status
# is ignored, so finding nothing to kill never aborts the script under
# `set -e`.
# Arguments: none
# Returns:   the exit status of `sleep`
#######################################
clear_previous_runs() {
  # pgrep -f matches against the full command line and never reports itself.
  pgrep -f python | xargs -r -n 1 kill -9 2>/dev/null || true
  sleep 10
}

#######################################
# Clear leftover processes, then run pytest under torchrun with the shared
# launcher and coverage arguments.
# Globals:   TORCHRUN_ARGS (read), PYTEST_COV_ARGS (read)
# Arguments: extra pytest options and test paths, passed through verbatim
# Returns:   torchrun's exit status (a failure aborts the script via set -e)
#######################################
run_unit_tests() {
  clear_previous_runs
  torchrun \
    "${TORCHRUN_ARGS[@]}" \
    -m pytest -vxs \
    "${PYTEST_COV_ARGS[@]}" \
    "$@"
}

# Exclude test categories that fail to pass in the full test; each of them is
# run separately further below.
# Some test cases fail in:
#   - data
#   - dist_checkpointing
#   - models
#   - test_checkpointing
#   - test_parallel_state
#   - transformer
# All test cases fail in:
#   - inference/engines/test_dynamic_engine.py
# Hangs in full test but passes in separate run:
#   - distributed/test_torch_fully_sharded_parallel.py
#   - ssm/test_mamba_hybrid_layer_allocation.py

run_unit_tests \
  --ignore tests/unit_tests/data \
  --ignore tests/unit_tests/dist_checkpointing \
  --ignore tests/unit_tests/distributed/test_torch_fully_sharded_parallel.py \
  --ignore tests/unit_tests/inference/engines/test_dynamic_engine.py \
  --ignore tests/unit_tests/models \
  --ignore tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py \
  --ignore tests/unit_tests/test_checkpointing.py \
  --ignore tests/unit_tests/test_parallel_state.py \
  --ignore tests/unit_tests/transformer \
  tests/unit_tests

disable_pattern="not test_preprocess_data_bert"
run_unit_tests \
  -k "${disable_pattern}" \
  tests/unit_tests/data

disable_pattern="not test_dp_sharding and "
disable_pattern+="not test_errors_are_reported and "
disable_pattern+="not test_memory_usage and "
disable_pattern+="not test_remove_sharded_tensors and "
disable_pattern+="not test_te_grouped_linear_torch_native"
run_unit_tests \
  -k "${disable_pattern}" \
  tests/unit_tests/dist_checkpointing

run_unit_tests \
  tests/unit_tests/distributed/test_torch_fully_sharded_parallel.py

run_unit_tests \
  --deselect "tests/unit_tests/models/test_bert_model.py::TestBertModelAttentionDimensions::test_transformer_engine_version_1_7_to_1_10_rng_error" \
  --deselect "tests/unit_tests/models/test_clip_vit_model.py::TestCLIPViTModel::test_save_load" \
  --deselect "tests/unit_tests/models/test_llava_model.py::TestLLaVAModel::test_save_load" \
  --deselect "tests/unit_tests/models/test_mamba_model.py::TestMambaModel::test_save_load" \
  --deselect "tests/unit_tests/models/test_multimodal_projector.py::TestMultimodalProjector::test_save_load" \
  --deselect "tests/unit_tests/models/test_radio_model.py::TestRADIOViTModel::test_save_load" \
  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_forward_output_encoder_hidden_only" \
  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_forward_with_encoder_hidden_states" \
  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_post_process_forward" \
  tests/unit_tests/models

run_unit_tests \
  tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py

run_unit_tests \
  --deselect "tests/unit_tests/test_checkpointing.py::test_load_checkpoint[torch]" \
  --deselect "tests/unit_tests/test_checkpointing.py::test_save_checkpoint[torch]" \
  --deselect "tests/unit_tests/test_checkpointing.py::test_save_checkpoint[torch_dcp]" \
  tests/unit_tests/test_checkpointing.py

run_unit_tests \
  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp3-2]" \
  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp4-2]" \
  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp5-2]" \
  tests/unit_tests/test_parallel_state.py

run_unit_tests \
  --deselect "tests/unit_tests/transformer/test_retro_attention.py::TestRetroAttention::test_gpu_forward" \
  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_gpu_forward" \
  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_fused_rope_gpu_forward" \
  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_checkpointed_gpu_forward" \
  --ignore "tests/unit_tests/transformer/moe/test_moe_layer_discrepancy.py" \
  tests/unit_tests/transformer
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
# Unit-test CI script: installs the CI requirements and extra packages, then
# runs the unit tests under torchrun (8 processes on a single node), one
# pytest invocation per test group, accumulating coverage for the `megatron`
# package.
# NOTE(review): the NCCL_NVLS setting and the CUDA extension builds suggest
# this is the NVIDIA variant of the script -- confirm against the file name.
set -e

pip install -r requirements_ci.txt
CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1
pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.2
MAMBA_FORCE_BUILD=TRUE pip install git+https://github.com/state-spaces/mamba.git@v2.2.0
# NOTE(review): presumably the distro-packaged blinker is purged so that pip
# can install its own copy as a dependency of flask -- confirm.
apt purge -y python3-blinker
pip install flask flask-restful tiktoken tensorstore

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_DEBUG=WARN
export NCCL_SOCKET_IFNAME=eth0
export NCCL_NVLS_ENABLE=0

# Launcher arguments shared by every torchrun invocation.
TORCHRUN_ARGS=(
  --nproc_per_node 8
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 50326
)

# Coverage arguments shared by every pytest invocation; --cov-append lets the
# separate invocations below add to the same coverage data.
PYTEST_COV_ARGS=(
  --cov-branch
  --cov megatron
  --cov-append
  --no-cov-on-fail
)

#######################################
# Force-kill every process whose command line contains "python", then wait
# 10 seconds before returning.
# Best effort by design: kill errors are silenced and the pipeline's status
# is ignored, so finding nothing to kill never aborts the script under
# `set -e`.
# Arguments: none
# Returns:   the exit status of `sleep`
#######################################
clear_previous_runs() {
  # pgrep -f matches against the full command line and never reports itself.
  pgrep -f python | xargs -r -n 1 kill -9 2>/dev/null || true
  sleep 10
}

#######################################
# Clear leftover processes, then run pytest under torchrun with the shared
# launcher and coverage arguments.
# Globals:   TORCHRUN_ARGS (read), PYTEST_COV_ARGS (read)
# Arguments: extra pytest options and test paths, passed through verbatim
# Returns:   torchrun's exit status (a failure aborts the script via set -e)
#######################################
run_unit_tests() {
  clear_previous_runs
  torchrun \
    "${TORCHRUN_ARGS[@]}" \
    -m pytest -vxs \
    "${PYTEST_COV_ARGS[@]}" \
    "$@"
}

# Exclude test categories that fail to pass in the full test; each of them
# except the dynamic engine test is run separately further below.
# Some test cases fail in:
#   - data
#   - dist_checkpointing
#   - models
#   - test_checkpointing
#   - test_parallel_state
#   - test_tokenizer.py
#   - transformer
# All test cases fail in:
#   - inference/engines/test_dynamic_engine.py

run_unit_tests \
  --ignore tests/unit_tests/data \
  --ignore tests/unit_tests/dist_checkpointing \
  --ignore tests/unit_tests/inference/engines/test_dynamic_engine.py \
  --ignore tests/unit_tests/models \
  --ignore tests/unit_tests/test_checkpointing.py \
  --ignore tests/unit_tests/test_parallel_state.py \
  --ignore tests/unit_tests/test_tokenizer.py \
  --ignore tests/unit_tests/transformer \
  tests/unit_tests

disable_pattern="not test_preprocess_data_bert"
run_unit_tests \
  -k "${disable_pattern}" \
  tests/unit_tests/data

disable_pattern="not test_dp_sharding and "
disable_pattern+="not test_memory_usage and "
disable_pattern+="not test_remove_sharded_tensors"
run_unit_tests \
  -k "${disable_pattern}" \
  tests/unit_tests/dist_checkpointing

run_unit_tests \
  --deselect "tests/unit_tests/models/test_bert_model.py::TestBertModelAttentionDimensions::test_transformer_engine_version_1_7_to_1_10_rng_error" \
  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_forward_output_encoder_hidden_only" \
  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_forward_with_encoder_hidden_states" \
  --deselect "tests/unit_tests/models/test_t5_model.py::TestT5Model::test_post_process_forward" \
  tests/unit_tests/models

run_unit_tests \
  --deselect "tests/unit_tests/test_checkpointing.py::test_load_checkpoint[torch]" \
  --deselect "tests/unit_tests/test_checkpointing.py::test_save_checkpoint[torch]" \
  --deselect "tests/unit_tests/test_checkpointing.py::test_save_checkpoint[torch_dcp]" \
  tests/unit_tests/test_checkpointing.py

run_unit_tests \
  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp3-2]" \
  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp4-2]" \
  --deselect "tests/unit_tests/test_parallel_state.py::test_different_initialize_order_unconsistency[src_tp_pp5-2]" \
  tests/unit_tests/test_parallel_state.py

disable_pattern="not test_gpt2_tiktok_tokenizer"
run_unit_tests \
  -k "${disable_pattern}" \
  tests/unit_tests/test_tokenizer.py

run_unit_tests \
  --deselect "tests/unit_tests/transformer/test_retro_attention.py::TestRetroAttention::test_gpu_forward" \
  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_gpu_forward" \
  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_fused_rope_gpu_forward" \
  --deselect "tests/unit_tests/transformer/test_attention.py::TestParallelAttention::test_checkpointed_gpu_forward" \
  tests/unit_tests/transformer

0 commit comments

Comments
 (0)