Skip to content

Commit 11cb3a1

Browse files
authored
[ci] CI fixes for NovaSky-AI#1769 (NovaSky-AI#1795)
- fix cpu ci (mock megatron imports in cpu ci)
- bump anyscale version from 0.24.79 to 0.26.103 for h100 CI to try to resolve bucket access issue with k8s
- add language_model_only flag for `test_megatron_extractor_consistency.py`
- bump timeout for megatron models test from 1000 -> 1500 to account for new qwen3.5-0.8b seq packing test
1 parent bbb0bc1 commit 11cb3a1

6 files changed

Lines changed: 10 additions & 4 deletions

File tree

.github/workflows/gpu_ci_h100.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ jobs:
5353
with:
5454
activate-environment: true
5555
- name: Install basic dependencies
56-
run: uv pip install anyscale==0.24.79 typer==0.9.0
56+
run: uv pip install anyscale==0.26.103 typer==0.9.0
5757
# Run h100 tests via anyscale staging (compute config llm-team-h100-4x:1)
5858
- name: GPU tests
5959
env:

.github/workflows/gpu_skyrl_train_megatron_models.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,5 @@ jobs:
6464
run: |
6565
COMMIT_SHA="${{ github.event.pull_request.head.sha || github.sha }}"
6666
JOB_NAME="skyrl-train-gpu-ci-megatron-models-${COMMIT_SHA:0:7}-${{ github.run_id }}"
67-
anyscale job submit -f ci/anyscale_gpu_ci_skyrl_train_megatron_models.yaml --name "$JOB_NAME" --timeout 1000
68-
anyscale job wait --cloud sky-anyscale-aws-us-east-1 --name "$JOB_NAME" --timeout 1000
67+
anyscale job submit -f ci/anyscale_gpu_ci_skyrl_train_megatron_models.yaml --name "$JOB_NAME" --timeout 1500
68+
anyscale job wait --cloud sky-anyscale-aws-us-east-1 --name "$JOB_NAME" --timeout 1500

tests/backends/skyrl_train/distributed/test_preprocess_packed_seqs_cp.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class _PackedSeqParams:
6161
_mock_modules["megatron.core.optimizer"].ChainedOptimizer = MagicMock
6262
_mock_modules["megatron.core.transformer.module"].Float16Module = MagicMock
6363
_mock_modules["megatron.core.utils"].get_attr_wrapped_model = MagicMock()
64+
_mock_modules["megatron.core.utils"].unwrap_model = MagicMock()
6465
_mock_modules["megatron.core.transformer.moe.moe_utils"].clear_aux_losses_tracker = MagicMock()
6566
_mock_modules["megatron.core.transformer.moe.moe_utils"].reduce_aux_losses_tracker_across_ranks = MagicMock()
6667
_mock_modules["megatron.core.transformer.moe.moe_utils"].get_moe_layer_wise_logging_tracker = MagicMock()

tests/backends/skyrl_train/distributed/test_preprocess_packed_seqs_multiseq.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class _PackedSeqParams:
5353
_mock_modules["megatron.core.transformer.moe.moe_utils"].get_moe_layer_wise_logging_tracker = MagicMock()
5454
_mock_modules["megatron.core.transformer.moe.moe_utils"].reduce_aux_losses_tracker_across_ranks = MagicMock()
5555
_mock_modules["megatron.core.utils"].get_attr_wrapped_model = MagicMock()
56+
_mock_modules["megatron.core.utils"].unwrap_model = MagicMock()
5657

5758

5859
@pytest.fixture(scope="module", autouse=True)

tests/backends/skyrl_train/gpu/gpu_ci/megatron/test_megatron_extractor_consistency.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
``name``).
1717
1818
Run with::
19-
uv run --isolated --extra megatron --extra dev pytest -s -vvv tests/backends/skyrl_train/gpu/gpu_ci/test_megatron_extractor_consistency.py
19+
uv run --isolated --extra megatron --extra dev pytest -s -vvv tests/backends/skyrl_train/gpu/gpu_ci/megatron/test_megatron_extractor_consistency.py
2020
2121
"""
2222

@@ -100,6 +100,9 @@ def _make_ref_cfg(model_name: str) -> SkyRLTrainConfig:
100100
cfg.trainer.ref.megatron_config.transformer_config_kwargs["mtp_num_layers"] = 0
101101
if is_moe:
102102
cfg.trainer.gradient_checkpointing_use_reentrant = True
103+
if "qwen3.5" in model_name.lower(): # use LM only path for qwen3.5
104+
cfg.trainer.ref.language_model_only = True
105+
cfg.generator.inference_engine.language_model_only = True
103106
validate_cfg(cfg)
104107
return cfg
105108

tests/train/test_packing_round_trip.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class _PackedSeqParams:
7171
_mock_modules["megatron.core.transformer.moe.moe_utils"].get_moe_layer_wise_logging_tracker = MagicMock()
7272
_mock_modules["megatron.core.transformer.moe.moe_utils"].reduce_aux_losses_tracker_across_ranks = MagicMock()
7373
_mock_modules["megatron.core.utils"].get_attr_wrapped_model = MagicMock()
74+
_mock_modules["megatron.core.utils"].unwrap_model = MagicMock()
7475

7576

7677
@pytest.fixture(scope="module", autouse=True)

0 commit comments

Comments
 (0)