THUDM
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
Lines changed: 1 addition & 48 deletions
diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2
Lines changed: 1 addition & 12 deletions
diff --git a/examples/true_on_policy/run_simple.py b/examples/true_on_policy/run_simple.py
Lines changed: 1 addition & 6 deletions
diff --git a/examples/true_on_policy_vlm/run_simple.py b/examples/true_on_policy_vlm/run_simple.py
Lines changed: 1 addition & 6 deletions
diff --git a/slime/ray/rollout.py b/slime/ray/rollout.py
Lines changed: 2 additions & 13 deletions
diff --git a/slime/utils/arguments.py b/slime/utils/arguments.py
Lines changed: 0 additions & 10 deletions
diff --git a/slime/utils/metric_utils.py b/slime/utils/metric_utils.py
Lines changed: 0 additions & 25 deletions
@@ -95,7 +95,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        info: [{"num_gpus": 4, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py --colocated"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}]
+        info: [{"num_gpus": 4, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py --colocated"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}]
     defaults:
       run:
         working-directory: ${{ github.workspace }}
@@ -260,53 +260,6 @@ jobs:
         shell: bash
         run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
 
-  e2e-test-long:
-    if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-long'))
-    runs-on: self-hosted
-    container:
-      image: slimerl/slime:latest
-      options: >
-        --gpus all
-        --ipc=host
-        --shm-size=16g
-        --ulimit memlock=-1
-        --ulimit stack=67108864
-        --memory=0
-        --memory-swap=0
-        -e http_proxy=$http_proxy
-        -e https_proxy=$https_proxy
-        -e HTTP_PROXY=$HTTP_PROXY
-        -e HTTPS_PROXY=$HTTPS_PROXY
-        -v /mnt/nvme0n1/slime_ci:/data/slime_ci
-        -v /mnt/nvme0n1/slime_ci/models:/root/models
-        -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
-    strategy:
-      fail-fast: false
-      matrix:
-        info: [{"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}]
-    defaults:
-      run:
-        working-directory: ${{ github.workspace }}
-    env:
-      GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
-      WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
-      SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
-      SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
-      SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
-      SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Install
-        shell: bash
-        run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages
-
-      - name: Execute
-        shell: bash
-        run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
-
   e2e-test-image:
     if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image'))
     runs-on: self-hosted
 
@@ -11,7 +11,6 @@
       'tests': [
         {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py --colocated', 'num_gpus': 4},
         {'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8},
-        {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2},
         {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4},
       ],
     },
@@ -43,25 +42,15 @@
         {'test_file': 'test_qwen3_4B_ckpt.py --async-save', 'num_gpus': 8},
       ],
     },
-    'e2e-test-long': {
-      'label': 'run-ci-long',
-      'tests': [
-        {'test_file': 'test_qwen2.5_0.5B_gsm8k.py', 'num_gpus': 2},
-        {'test_file': 'test_qwen2.5_0.5B_gsm8k_async.py', 'num_gpus': 2},
-        {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2},
-        {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2},
-      ],
-    },
+
     'e2e-test-image': {
       'label': 'run-ci-image',
       'image': 'slimerl/slime-test:latest',
       'tests': [
         {'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4},
         {'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4},
-        {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2},
         {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py', 'num_gpus': 2},
         {'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8},
-        {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2},
         {'test_file': 'test_quick_start_glm4_9B.py', 'num_gpus': 8},
         {'test_file': 'test_qwen3_30B_A3B.py', 'num_gpus': 8},
         {'test_file': 'test_qwen3_4B_ppo.py', 'num_gpus': 8},
 
@@ -84,12 +84,7 @@ def execute():
         "--update-weight-buffer-size 536870912 "  # 512MB
     )
 
-    ci_args = (
-        "--ci-test "
-        "--ci-disable-kl-checker "
-        "--ci-metric-checker-key eval/gsm8k "
-        "--ci-metric-checker-threshold 0.71 "  # loose threshold at 60 step
-    )
+    ci_args = "--ci-test " "--ci-disable-kl-checker "
 
     misc_args = "--actor-num-nodes 1 " f"--actor-num-gpus-per-node {NUM_GPUS} " "--colocate " "--train-backend fsdp "
 
 
@@ -82,12 +82,7 @@ def execute():
         "--attn-implementation flash_attention_3 "
     )
 
-    ci_args = (
-        "--ci-test "
-        "--ci-disable-kl-checker "
-        "--ci-metric-checker-key eval/geo3k "
-        "--ci-metric-checker-threshold 0.5 "  # loose threshold at 60 step
-    )
+    ci_args = "--ci-test " "--ci-disable-kl-checker "
 
     misc_args = "--actor-num-nodes 1 " f"--actor-num-gpus-per-node {NUM_GPUS} " "--colocate "
 
 
@@ -19,13 +19,7 @@
 from slime.utils.health_monitor import RolloutHealthMonitor
 from slime.utils.http_utils import _wrap_ipv6, find_available_port, get_host_info, init_http_client
 from slime.utils.logging_utils import configure_logger, init_tracking
-from slime.utils.metric_utils import (
-    MetricChecker,
-    compute_pass_rate,
-    compute_rollout_step,
-    compute_statistics,
-    dict_add_prefix,
-)
+from slime.utils.metric_utils import compute_pass_rate, compute_rollout_step, compute_statistics, dict_add_prefix
 from slime.utils.misc import Box, group_by, load_function
 from slime.utils.seqlen_balancing import get_seqlen_balanced_partitions
 from slime.utils.types import Sample
@@ -80,7 +74,6 @@ def __init__(self, args, pg):
         self.rollout_engine_lock = Lock.options(num_cpus=1, num_gpus=0).remote()
         self.rollout_id = -1
 
-        self._metric_checker = MetricChecker.maybe_create(args)
         self._health_monitor = None
         if not self.args.debug_train_only and self.args.use_fault_tolerance:
             self._health_monitor = RolloutHealthMonitor(self, args)
@@ -109,8 +102,6 @@ def _try_ci_fault_injection(self):
                 logger.warning(f"CI Fault Injection failed: {e}")
 
     def dispose(self):
-        if self._metric_checker is not None:
-            self._metric_checker.dispose()
         if self._health_monitor is not None:
             self._health_monitor.stop()
 
@@ -153,9 +144,7 @@ def eval(self, rollout_id):
         result = call_rollout_fn(self.eval_generate_rollout, self.args, rollout_id, self.data_source, evaluation=True)
         data = result.data
         self._save_debug_rollout_data(data, rollout_id=rollout_id, evaluation=True)
-        metrics = _log_eval_rollout_data(rollout_id, self.args, data, result.metrics)
-        if self._metric_checker is not None:
-            self._metric_checker.on_eval(metrics)
+        _log_eval_rollout_data(rollout_id, self.args, data, result.metrics)
 
     def save(self, rollout_id):
         self.data_source.save(rollout_id)
 
@@ -1379,16 +1379,6 @@ def add_ci_arguments(parser):
                 "--ci-disable-kl-checker",
                 action="store_true",
             )
-            parser.add_argument(
-                "--ci-metric-checker-key",
-                type=str,
-                default=None,
-            )
-            parser.add_argument(
-                "--ci-metric-checker-threshold",
-                type=float,
-                default=None,
-            )
             parser.add_argument(
                 "--ci-save-grad-norm",
                 type=str,
 
@@ -121,28 +121,3 @@ def compute_rollout_step(args, rollout_id):
     if args.wandb_always_use_train_step:
         return rollout_id * args.rollout_batch_size * args.n_samples_per_prompt // args.global_batch_size
     return rollout_id
-
-
-class MetricChecker:
-    @staticmethod
-    def maybe_create(args):
-        if args.ci_test and (args.ci_metric_checker_key is not None):
-            return MetricChecker(args)
-        return None
-
-    def __init__(self, args):
-        self.args = args
-        self._exists_check_success = False
-
-    def on_eval(self, metrics: dict[str, float]):
-        actual_value = metrics.get(self.args.ci_metric_checker_key)
-        assert actual_value is not None, f"{metrics=} {self.args.ci_metric_checker_key=}"
-
-        check_success = actual_value >= self.args.ci_metric_checker_threshold
-        logger.info(f"[MetricChecker] {check_success=} {actual_value=} {self.args.ci_metric_checker_threshold=}")
-
-        self._exists_check_success |= check_success
-
-    def dispose(self):
-        assert self._exists_check_success, "[MetricChecker] accuracy check failed"
-        logger.info("[MetricChecker] pass dispose check")