Skip to content

Commit fbb1ef0

Browse files
authored
Merge branch 'verl-project:main' into main
2 parents dcaacfe + e4915bd commit fbb1ef0

File tree

142 files changed

+7666
-655
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

142 files changed

+7666
-655
lines changed

.github/workflows/e2e_ppo_grpo_trainer_trtllm.yml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,32 @@ jobs:
106106
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
107107
mlp-image: "${{ env.IMAGE }}"
108108

109+
trtllm_unit_tests:
110+
needs: setup
111+
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
112+
timeout-minutes: 30 # Increase this timeout value as needed
113+
env:
114+
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
115+
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
116+
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
117+
HF_ENDPOINT: "https://hf-mirror.com"
118+
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
119+
steps:
120+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
121+
with:
122+
fetch-depth: 0
123+
- name: Install the current repository
124+
run: |
125+
pip3 install pytest-asyncio
126+
pip3 install -r requirements-test.txt
127+
pip3 install --no-deps -e .
128+
- name: Run TRTLLM unit tests
129+
run: |
130+
export TRTLLM_TEST_MODEL_PATH_ROOT="${HOME}/models"
131+
pytest -v -s \
132+
tests/workers/rollout/rollout_trtllm/test_adapter.py \
133+
tests/workers/rollout/rollout_trtllm/test_async_server.py
134+
109135
e2e_grpo_trainer_fsdp-qwen2:
110136
needs: setup
111137
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
@@ -179,7 +205,7 @@ jobs:
179205
180206
cleanup:
181207
runs-on: ubuntu-latest
182-
needs: [setup, e2e_grpo_trainer_fsdp-qwen2, e2e_grpo_trainer_megatron-qwen2]
208+
needs: [setup, trtllm_unit_tests, e2e_grpo_trainer_fsdp-qwen2, e2e_grpo_trainer_megatron-qwen2]
183209
if: always()
184210
steps:
185211
- id: destroy-runner

.github/workflows/gpu_unit_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ jobs:
113113
pip3 install --ignore-installed mlflow "numpy<2.0"
114114
- name: Run all GPU unit tests
115115
run: |
116-
pytest -s -x --ignore-glob="*on_npu.py" --ignore-glob="*test_special_*.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" --ignore-glob="*test_shared_memory*" tests/
116+
pytest -s -x --ignore-glob="*on_npu.py" --ignore-glob="*test_special_*.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" --ignore-glob="*test_shared_memory*" --ignore-glob="tests/workers/rollout/rollout_trtllm" tests/
117117
- name: Testing LinearCrossEntropyTP Correctness, Computation Time and Memory Consumption
118118
run: |
119119
LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/utils/test_special_linear_cross_entropy_tp.py

.github/workflows/npu_unit_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ jobs:
109109
- name: Run all NPU unit tests
110110
run: |
111111
export PYTHONPATH=$PYTHONPATH:/Megatron-LM
112-
pytest -s -x --ignore-glob="*test_special_*.py" --ignore-glob="*on_cpu.py" --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob="tests/special*" --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" --ignore-glob="*test_rvdz*" --ignore-glob="*test_ray_collectives*" --ignore-glob="*test_nvtx_profile*" --ignore-glob="tests/checkpoint_engine" --ignore-glob="*test_shared_memory*" tests/
112+
pytest -s -x --ignore-glob="*test_special_*.py" --ignore-glob="*on_cpu.py" --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob="tests/special*" --ignore-glob="tests/experimental" --ignore-glob="tests/workers/reward_model" --ignore-glob="*test_rvdz*" --ignore-glob="*test_ray_collectives*" --ignore-glob="*test_nvtx_profile*" --ignore-glob="tests/checkpoint_engine" --ignore-glob="*test_shared_memory*" --ignore-glob="tests/workers/rollout/rollout_trtllm" tests/
113113
- name: Testing FSDP2 actor functionality
114114
run: |
115115
torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/actor/test_special_dp_actor.py

.github/workflows/reward_model_sglang.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ jobs:
115115
- name: Running sglang agent loop with reward manager tests on 8 L20 GPUs
116116
run: |
117117
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
118-
ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_agent_loop_reward_manager.py
118+
ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
119119
- name: Running sglang agent loop with reward model colocate tests on 8 L20 GPUs
120120
run: |
121121
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY

.github/workflows/reward_model_vllm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ jobs:
115115
- name: Running vllm agent loop with reward manager tests on 8 L20 GPUs
116116
run: |
117117
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
118-
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_loop_reward_manager.py
118+
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
119119
- name: Running vllm agent loop with reward model colocate tests on 8 L20 GPUs
120120
run: |
121121
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY

.github/workflows/reward_model_vllm_ascend.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ jobs:
105105
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_reward_model_disrm.py
106106
- name: Running vllm agent loop with reward manager tests on 8 NPUs
107107
run: |
108-
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_loop_reward_manager.py
108+
ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py
109109
- name: Running vllm agent loop with reward model colocate tests on 8 NPUs
110110
run: |
111111
export HCCL_HOST_SOCKET_PORT_RANGE=auto

.github/workflows/vllm.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,13 @@ jobs:
109109
run: |
110110
pip3 install -r requirements-test.txt
111111
pip3 install --no-deps -e .
112+
pip3 install --upgrade "transformers<5.0"
112113
# - name: Download Model to Use
113114
# run: |
114-
# huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B-Instruct
115-
# huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-1.5B-Instruct
116-
# huggingface-cli download Qwen/Qwen2.5-VL-3B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct
117-
# huggingface-cli download OldKingMeister/Qwen2.5-1.5B-Instruct-YaRN --local-dir ${HOME}/models/OldKingMeister/Qwen2.5-1.5B-Instruct-YaRN
115+
# hf download Qwen/Qwen2.5-0.5B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B-Instruct
116+
# hf download Qwen/Qwen2.5-1.5B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-1.5B-Instruct
117+
# hf download Qwen/Qwen2.5-VL-3B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct
118+
# hf download OldKingMeister/Qwen2.5-1.5B-Instruct-YaRN --local-dir ${HOME}/models/OldKingMeister/Qwen2.5-1.5B-Instruct-YaRN
118119
# export HF_HUB_OFFLINE=1
119120
- name: Prepare gsm8k dataset
120121
run: |
@@ -146,6 +147,7 @@ jobs:
146147
pip3 install cupy-cuda12x pytest-asyncio
147148
pip3 install -r requirements-test.txt
148149
pip3 install --no-deps -e .
150+
pip3 install --upgrade "transformers<5.0"
149151
- name: Test vLLM ServerAdapter with Checkpoint Engine (NCCL)
150152
run: |
151153
ROLLOUT_NAME=vllm pytest -svvv tests/checkpoint_engine/test_special_server_adapter.py

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ Welcome to register your awesome project build with `verl` for other developers'
283283
- [DAPO](https://dapo-sia.github.io/): the fully open source SOTA RL algorithm that beats DeepSeek-R1-zero-32B ![GitHub Repo stars](https://img.shields.io/github/stars/volcengine/verl)
284284
- [NoisyRollout](https://github.com/NUS-TRAIL/NoisyRollout): Reinforcing Visual Reasoning with Data Augmentation ![GitHub Repo stars](https://img.shields.io/github/stars/NUS-TRAIL/NoisyRollout)
285285
- [SPEAR](https://github.com/TencentYoutuResearch/SPEAR): **Self-imitation** with **Progressive Exploration** for Agentic Reinforcement Learning (ICLR 2026) ![GitHub Repo stars](https://img.shields.io/github/stars/TencentYoutuResearch/SPEAR)
286+
- [RuleReasoner](https://github.com/bigai-nlco/RuleReasoner): **RuleReasoner:** Reinforced Rule-based Reasoning via **Domain-aware Dynamic Sampling** (ICLR 2026) ![GitHub Repo stars](https://img.shields.io/github/stars/bigai-nlco/RuleReasoner)
286287

287288
## Contribution Guide
288289

docs/advance/ppo_lora.rst

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
RL(HF) algorithms with LoRA Support
22
===========================================
33

4-
Last updated: 12/17/2025.
4+
Last updated: 02/03/2026.
55

66
We support LoRA (Low-Rank Adaptation) for reinforcement learning algorithms such as PPO, GRPO, and others.
77

@@ -42,6 +42,8 @@ FSDP Backend Usage Guide
4242
- `actor_rollout_ref.model.lora_adapter_path`: string, path to a pretrained LoRA adapter directory.
4343
If provided, loads an existing adapter instead of creating a new one. Enables multi-stage training from previously saved adapters.
4444
The directory must contain `adapter_model.safetensors` and `adapter_config.json`.
45+
- `actor_rollout_ref.model.lora.merge`: bool, whether to merge LoRA adapters into the base model weights before transferring to vLLM.
46+
If True, LoRA adapters are merged into the base model weights before transfer; if False, only the adapters are transferred to vLLM. This option is currently supported **only for engine-based rollout workers** (i.e. vLLM engine workers using the new worker implementation with ``trainer.use_legacy_worker_impl`` disabled) and is not available when using the legacy worker implementation.
4547

4648
5. Recommend options:
4749

@@ -137,6 +139,10 @@ Make sure you use Megatron-Bridge later than 0.2.0, and we recommended using `th
137139
# Path to pre-trained LoRA adapter weights (null to train from scratch)
138140
adapter_path: null
139141
142+
# Whether to fully shard LoRA adapters. Defaults to False
143+
# https://docs.vllm.ai/en/latest/api/vllm/config/lora/#vllm.config.lora.LoRAConfig.fully_sharded_loras
144+
fully_sharded_loras: bool
145+
140146
# VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen.
141147
# For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully
142148
# finetune the vision model.

docs/algo/spin.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ The following steps outline how to set up the environment and run the SPIN recip
118118
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path
119119
120120
# Download the base model (Example: Qwen2.5-3B-Instruct)
121-
huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
121+
hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
122122
```
123123

124124
4. **Configure:**

0 commit comments

Comments
 (0)