From f2c3daca1c48832ae8d209196284561439accb21 Mon Sep 17 00:00:00 2001
From: xiweny <13230610+VALLIS-NERIA@users.noreply.github.com>
Date: Thu, 16 Apr 2026 00:23:11 +0800
Subject: [PATCH 1/4] [https://nvbugs/5846024][fix] Remove waivers (#12979)

Signed-off-by: xiweny <13230610+VALLIS-NERIA@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index cd71220150e5..eaa6857d8712 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -223,7 +223,6 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutl
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
 full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954)
 examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1] SKIP (https://nvbugs/5846178)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5846024)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[disable_skip_indexer] SKIP (https://nvbugs/5859886)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865)
 test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5864769)

From 50717ebc5360b72e5536768b0155879ddad47413 Mon Sep 17 00:00:00 2001
From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
Date: Wed, 15 Apr 2026 09:37:42 -0700
Subject: [PATCH 2/4] [https://nvbugs/5838178][fix] Fix failing lora test for Llama (#12950)

Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
---
 requirements.txt                               |   2 +-
 tests/integration/defs/common.py               | 108 ++++++++++++------
 tests/integration/defs/examples/test_llama.py  |  42 +------
 .../integration/defs/examples/test_mistral.py  |  22 +---
 tests/integration/defs/examples/test_phi.py    |  23 ++--
 tests/integration/test_lists/waives.txt        |  15 +--
 6 files changed, 86 insertions(+), 126 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b76e28208bdc..7b13d7e4141b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,7 +51,7 @@ starlette>=0.49.1
 uvicorn
 setuptools<80
 ordered-set
-peft
+peft>=0.18.1,<0.19.0
 patchelf
 einops
 flashinfer-python==0.6.6
diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py
index 02f084d71357..e56772973943 100644
--- a/tests/integration/defs/common.py
+++ b/tests/integration/defs/common.py
@@ -932,12 +932,16 @@ def test_llm_torch_multi_lora_support(
         target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
         zero_lora_weights=True,
         tensor_parallel_size=1,
-        pipeline_parallel_size=1,
-        expected_outputs=None):
-    """Test multi-LoRA support with LLM-API Torch backend."""
+        pipeline_parallel_size=1):
+    """Test multi-LoRA support with LLM-API Torch backend.
 
-    # if expected_outputs is None:
-    #     raise ValueError("expected_outputs must be provided for exact validation")
+    When zero_lora_weights=True, validates that LoRA outputs match base model
+    outputs (since zero-weight LoRAs should not alter behavior).
+    """
+
+    assert zero_lora_weights, (
+        "This test compares LoRA outputs against base model outputs, "
+        "which is only valid when zero_lora_weights=True.")
 
     start_time = time.time()
     print("Creating dummy LoRAs...")
@@ -955,9 +959,6 @@ def test_llm_torch_multi_lora_support(
         f"Creating dummy LoRAs completed in {(lora_end - lora_start):.2f} seconds."
     )
 
-    print("Initializing LLM_torch with LoRA support...")
-    init_start = time.time()
-
     lora_config = LoraConfig(lora_dir=lora_paths,
                              max_lora_rank=lora_rank,
                              max_loras=num_loras,
@@ -966,17 +967,57 @@ def test_llm_torch_multi_lora_support(
 
     input_prompts = get_test_prompts_for_torch()
 
-    with LLM_torch(
-            model=hf_model_dir,
-            lora_config=lora_config,
-            tensor_parallel_size=tensor_parallel_size,
-            pipeline_parallel_size=pipeline_parallel_size,
-            dtype="bfloat16",
-            max_batch_size=8,  # From original test
-            max_input_len=512,  # From original test
-            max_seq_len=562,  # From original test
-            max_beam_width=1  # From original test
-    ) as llm:
+    sampling_params = SamplingParams(max_tokens=30,
+                                     top_p=0.5,
+                                     top_k=0,
+                                     temperature=0.0)
+
+    # Step 1: Get base model outputs (no LoRA) as the ground truth.
+    print("Initializing LLM_torch without LoRA for base model outputs...")
+    init_start = time.time()
+
+    with LLM_torch(model=hf_model_dir,
+                   tensor_parallel_size=tensor_parallel_size,
+                   pipeline_parallel_size=pipeline_parallel_size,
+                   dtype="bfloat16",
+                   max_batch_size=8,
+                   max_input_len=512,
+                   max_seq_len=562,
+                   max_beam_width=1) as base_llm:
+
+        init_end = time.time()
+        print(
+            f"Base LLM_torch initialization completed in {(init_end - init_start):.2f} seconds."
+        )
+
+        print("Running base model inference (no LoRA)...")
+        base_inference_start = time.time()
+
+        base_outputs = base_llm.generate(input_prompts,
+                                         sampling_params=sampling_params)
+
+        base_inference_end = time.time()
+        print(
+            f"Base inference completed in {(base_inference_end - base_inference_start):.2f} seconds."
+        )
+
+        expected_outputs = [o.outputs[0].text for o in base_outputs]
+        for i, text in enumerate(expected_outputs):
+            print(f"Base output {i+1}: {text!r}")
+
+    # Step 2: Run with LoRA adapters and compare against base outputs.
+    print("Initializing LLM_torch with LoRA support...")
+    init_start = time.time()
+
+    with LLM_torch(model=hf_model_dir,
+                   lora_config=lora_config,
+                   tensor_parallel_size=tensor_parallel_size,
+                   pipeline_parallel_size=pipeline_parallel_size,
+                   dtype="bfloat16",
+                   max_batch_size=8,
+                   max_input_len=512,
+                   max_seq_len=562,
+                   max_beam_width=1) as llm:
 
         init_end = time.time()
         print(
@@ -986,20 +1027,18 @@ def test_llm_torch_multi_lora_support(
         print("Running inference with LLM-API Torch backend...")
         inference_start = time.time()
 
-        # Create LoRA requests for different adapters
+        # Create LoRA requests cycling through available adapters.
         lora_requests = []
+        lora_counter = 0
         for i in range(len(input_prompts)):
-            if i % 2 == 1:  # Add some requests without LoRA
+            if i % 2 == 1:
                 lora_requests.append(None)
-            else:  # With LoRA
+            else:
+                lora_idx = lora_counter % num_loras
+                lora_counter += 1
                 lora_requests.append(
-                    LoRARequest(f"lora-{i}", i,
-                                lora_paths[i % len(lora_paths)]))
-
-        sampling_params = SamplingParams(max_tokens=30,
-                                         top_p=0.5,
-                                         top_k=0,
-                                         temperature=0.0)
+                    LoRARequest(f"lora-{lora_idx}", lora_idx,
+                                lora_paths[lora_idx]))
 
         outputs = llm.generate(input_prompts,
                                sampling_params=sampling_params,
@@ -1010,8 +1049,8 @@ def test_llm_torch_multi_lora_support(
             f"Inference completed in {(inference_end - inference_start):.2f} seconds."
         )
 
-        # Validate exact outputs
-        print("Validating exact outputs...")
+        # Validate that LoRA outputs match base model outputs.
+        print("Validating outputs against base model...")
 
         assert len(outputs) == len(expected_outputs), \
             f"Expected {len(expected_outputs)} outputs, got {len(outputs)}"
@@ -1021,13 +1060,12 @@ def test_llm_torch_multi_lora_support(
             print(
                 f"LoRA: {lora_requests[i].lora_int_id if lora_requests[i] else 'None'}"
             )
-            print(f"Expected: {expected}")
-            print(f"Actual: {actual_text}")
+            print(f"Expected (base): {expected!r}")
+            print(f"Actual (LoRA): {actual_text!r}")
             print("-" * 50)
 
-            # Exact string comparison
             assert actual_text == expected, \
-                f"Output {i+1} mismatch:\nExpected: {expected!r}\nActual: {actual_text!r}"
+                f"Output {i+1} mismatch:\nExpected (base): {expected!r}\nActual (LoRA): {actual_text!r}"
 
     total_time = time.time() - start_time
     print(f"Total test execution time: {total_time:.2f} seconds")
diff --git a/tests/integration/defs/examples/test_llama.py b/tests/integration/defs/examples/test_llama.py
index a26abe8521eb..1dde330d5c45 100644
--- a/tests/integration/defs/examples/test_llama.py
+++ b/tests/integration/defs/examples/test_llama.py
@@ -908,49 +908,10 @@ def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
     else:
         tensor_parallel_size = 1
 
-    expected_outputs = {
-        'llama-v3-8b-instruct-hf': [
-            " I hope you're having a great day! I just wanted to reach out and say hi, and see if you're doing okay. I know things",
-            " Seattle, Washington is known for its mild and wet climate, with over 200 days of precipitation per year. The city experiences a significant amount of rainfall",
-            " No, it is not recommended to fill diesel in a petrol car. Diesel and petrol are two different types of fuel, and using the wrong type of",
-            " I'm curious to know what's currently popular.\nI can help you with that! As of now, the top 5 trending songs on Spotify are",
-            " Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
-        ],
-        'llama-3.1-8b-instruct': [
-            " I'm doing pretty well, thanks for asking. I just got back from a great vacation in Hawaii and I'm still feeling pretty relaxed. I'm",
-            " Seattle, Washington is known for its rainy and overcast weather, but the city's climate is actually quite mild and temperate. The city experiences a",
-            " | What happens if you put diesel in a petrol car?\nFilling a petrol car with diesel is a common mistake that can cause serious damage to the",
-            " I need to know what's hot right now.\nI can check the top 5 trending songs on Spotify for you. However, please note that the",
-            " Paris\nWhat is the capital of France?\nThe capital of France is Paris. Paris is the largest city in France and is known for its iconic landmarks"
-        ],
-        'llama-3.2-1b-instruct': [
-            " I'm doing great, thanks for asking! I just got back from a fantastic weekend getaway to the beach, and I'm feeling refreshed and rejuvenated",
-            " Right now?\nI'm planning a trip to Seattle and I want to know what the weather is like. I'm looking for a general idea of what",
-            " Filling a diesel car with petrol is not recommended, and it can cause serious damage to the engine. Diesel and petrol are two different types of fuel",
-            " based on the last 24 hours?\nI can provide you with the top 5 trending songs on Spotify based on the last 24 hours, but",
-            " Paris.\nThe capital of France is Paris. Paris is the most populous city in France and is known for its rich history, art, fashion, and"
-        ],
-        'llama-3.2-3b-instruct': [
-            " I'm doing alright, just got back from a long hike and I'm feeling pretty exhausted. Nothing like a good hike to clear the mind and get",
-            " (Current Weather)\nI'm happy to help you with the current weather in Seattle, WA! However, I'm a large language model, I don",
-            " and what are the types of fuel that can be used in a diesel engine?\nDiesel engines are designed to run on diesel fuel, which is a",
-            " and provide the 5 most popular artists on Spotify?\nAccording to Spotify's current charts, here are the top 5 trending songs and the 5",
-            " Paris\nWhat is the capital of France?\nThe capital of France is indeed Paris. Located in the north-central part of the country, Paris is a"
-        ],
-        'llama-3.3-70b-instruct': [
-            " I hope you are having a great day. I am doing well, thanks for asking. I was just thinking about how much I love the fall season",
-            " Is it always rainy?\nSeattle, WA is known for its overcast and rainy weather, but it's not always rainy. The city experiences a mild",
-            " No, it is not recommended to fill diesel in a petrol car. Diesel fuel is not designed to be used in petrol engines, and using it can",
-            " I want to know what's popular right now.\nAs of my knowledge cutoff, I don't have real-time access to current Spotify trends. However,",
-            " Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
-        ],
-    }
-
     print("Testing with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
-    model_name = os.path.basename(llama_model_root).lower()
     test_llm_torch_multi_lora_support(
         hf_model_dir=llama_model_root,
         llm_venv=llm_venv,
@@ -958,8 +919,7 @@ def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
         lora_rank=8,
         target_hf_modules=["q_proj", "k_proj", "v_proj"],
         zero_lora_weights=True,
-        tensor_parallel_size=tensor_parallel_size,
-        expected_outputs=expected_outputs[model_name])
+        tensor_parallel_size=tensor_parallel_size)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
     print(
         f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
diff --git a/tests/integration/defs/examples/test_mistral.py b/tests/integration/defs/examples/test_mistral.py
index 6c25df68ad8a..b5d07fb015cb 100644
--- a/tests/integration/defs/examples/test_mistral.py
+++ b/tests/integration/defs/examples/test_mistral.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Module test_mistral test mistral examples."""
 import multiprocessing
-import os
 
 import defs.ci_profiler
 import psutil
@@ -203,27 +202,9 @@ def test_mistral_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
     else:
         tensor_parallel_size = 1
 
-    expected_outputs = {
-        'mistral-7b-v0.1': [
-            "I hope you’re doing well. I’m doing well. I’m doing well. I’m doing well. I’m doing",
-            "\n\nSeattle, WA Weather Forecast. Today's weather in Seattle, WA. 59°F. 15°",
-            "\n\nNo, it is not ok to fill diesel in a petrol car. Diesel is a heavier fuel than petrol and will",
-            "\n\nYes, you can check the top 5 trending songs on Spotify. To do this, go to the Spotify website and sign",
-            "\n\nParis is the capital of France.\n\nWhat is the capital of the United States?\n\nWashington, D.C."
-        ],
-        'mistral-nemo-instruct-2407': [
-            " I'm doing fine, thanks for asking! How can I assist you today? Let me know if you have any questions or just want to chat!",
-            " Seattle, WA is currently experiencing a temperature of 55°F (13°C) with a chance of rain. The weather is typically cloud",
-            " I have a 2005 Honda City. I have filled diesel in my car by mistake. I have driven the car for about 1",
-            " I'm using python and I've tried using the spotipy library but I can't seem to get it to work. I'm not sure if it",
-            " Paris\n\nThe capital of France is Paris. It is the largest city in the country and is known for its iconic landmarks such as the Eiffel"
-        ],
-    }
-
     print(f"Testing {llm_mistral_model_root} with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
-    model_name = os.path.basename(llm_mistral_model_root).lower()
     test_llm_torch_multi_lora_support(
         hf_model_dir=llm_mistral_model_root,
         llm_venv=llm_venv,
@@ -231,8 +212,7 @@ def test_mistral_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
         lora_rank=8,
         target_hf_modules=["q_proj", "k_proj", "v_proj"],
         zero_lora_weights=True,
-        tensor_parallel_size=tensor_parallel_size,
-        expected_outputs=expected_outputs[model_name])
+        tensor_parallel_size=tensor_parallel_size)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
     print(
         f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
diff --git a/tests/integration/defs/examples/test_phi.py b/tests/integration/defs/examples/test_phi.py
index ca7e7fe22cc8..8a414137c1bd 100644
--- a/tests/integration/defs/examples/test_phi.py
+++ b/tests/integration/defs/examples/test_phi.py
@@ -223,22 +223,15 @@ def test_phi_4_mini_instruct_with_bf16_lora_torch(
         llm_venv, engine_dir, llm_phi_model_root):
     """Run Phi-4-mini-instruct with multiple dummy LoRAs using LLM-API Torch
     backend."""
-    expected_outputs = {
-        'Phi-4-mini-instruct': ["...", "...", "...", "...", "..."],
-    }
-
     print("Testing with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
-    model_name = os.path.basename(llm_phi_model_root).lower()
-    test_llm_torch_multi_lora_support(
-        hf_model_dir=llm_phi_model_root,
-        llm_venv=llm_venv,
-        num_loras=2,
-        lora_rank=8,
-        target_hf_modules=["qkv_proj"],
-        target_trtllm_modules=["attn_qkv"],
-        zero_lora_weights=True,
-        tensor_parallel_size=1,
-        expected_outputs=expected_outputs[model_name])
+    test_llm_torch_multi_lora_support(hf_model_dir=llm_phi_model_root,
+                                      llm_venv=llm_venv,
+                                      num_loras=2,
+                                      lora_rank=8,
+                                      target_hf_modules=["qkv_proj"],
+                                      target_trtllm_modules=["attn_qkv"],
+                                      zero_lora_weights=True,
+                                      tensor_parallel_size=1)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index eaa6857d8712..eefb20b19034 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -217,7 +217,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_c
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] SKIP (https://nvbugs/5821415)
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8] SKIP (https://nvbugs/5836830)
 accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)
-examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct] SKIP (https://nvbugs/5838178)
 cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
@@ -331,8 +330,8 @@ perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4
 perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] SKIP (https://nvbugs/5844149)
 perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] SKIP (https://nvbugs/6060119)
 perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] SKIP (https://nvbugs/6060119)
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=False] SKIP (https://nvbugs/6070878)
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=True] SKIP (https://nvbugs/6070878)
+full:sm89/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=False] SKIP (https://nvbugs/6070878)
+full:sm89/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=True] SKIP (https://nvbugs/6070878)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] SKIP (https://nvbugs/6026676)
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] SKIP (https://nvbugs/6069790)
 accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5981293)
@@ -351,16 +350,6 @@ accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B_Instruct_Eagle3::test_eagle
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6076560)
 accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[trtllm-flashinfer_ssm-False] SKIP (https://nvbugs/6076564)
 unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py SKIP (https://nvbugs/6076624)
-unittest/_torch/modeling/test_modeling_starcoder2.py::test_starcoder2_multi_lora SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_lora_many_adapters_no_memory_leak SKIP (https://nvbugs/6078438)
-examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/6078438)
-examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b] SKIP (https://nvbugs/6078438)
-examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b] SKIP (https://nvbugs/6078438)
-examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct] SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_bielik_11b_v2_2_instruct_multi_lora[None] SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_bielik_11b_v2_2_instruct_multi_lora[cuda_graph_config0] SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_gemma3_1b_instruct_multi_lora[None] SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_gemma3_1b_instruct_multi_lora[cuda_graph_config0] SKIP (https://nvbugs/6078438)
 unittest/llmapi/test_llm_pytorch.py::test_llm_disagg_streaming_gen_cancelled SKIP (https://nvbugs/6078431)
 unittest/auto_deploy/singlegpu/transformations/library/test_mrope_delta_cache.py::test_qwen_registry_configs_explicitly_enable_mrope_delta_cache SKIP (https://nvbugs/6078421)
 llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 SKIP (https://nvbugs/6075431)

From 10010708ef051e63e76723a08b72431ed38222d1 Mon Sep 17 00:00:00 2001
From: Yifan Jiang <19356972+yifjiang@users.noreply.github.com>
Date: Wed, 15 Apr 2026 10:40:42 -0700
Subject: [PATCH 3/4] [None][fix] Guard CUDA event elapsed_time in perf_metrics_manager to prevent executor crash (#12868)

Signed-off-by: Yifan Jiang <19356972+yifjiang@users.noreply.github.com>
Co-authored-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com>
---
 .../_torch/pyexecutor/perf_metrics_manager.py | 30 ++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
index 9c2cbede57b0..4d8d8351e2aa 100644
--- a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
@@ -11,6 +11,7 @@
 
 import torch
 
+from tensorrt_llm.logger import logger
 from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
 
 from .llm_request import PerfTimingInfo
@@ -167,14 +168,27 @@ def compute_batch_gpu_times(self, requests):
                 perf.gpu_forward_end_event.synchronize()
             if perf.gpu_sample_end_event and not perf.gpu_sample_end_event.query():
                 perf.gpu_sample_end_event.synchronize()
-            batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
-                perf.gpu_forward_end_event
-            )
-            batch_gpu_sample_time = (
-                perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
-                if perf.gpu_sample_end_event
-                else 0.0
-            )
+            try:
+                batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
+                    perf.gpu_forward_end_event
+                )
+                batch_gpu_sample_time = (
+                    perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
+                    if perf.gpu_sample_end_event
+                    else 0.0
+                )
+            except RuntimeError as e:
+                # CUDA event timing can fail if events were not recorded
+                # on the current stream. Skip metrics for this batch rather
+                # than crashing the executor thread.
+                logger.warning(
+                    "Failed to compute GPU event elapsed_time: %s. "
+                    "Setting batch GPU times to 0.0. This may indicate "
+                    "an issue with the forward pass or stream synchronization.",
+                    e,
+                )
+                batch_gpu_forward_time = 0.0
+                batch_gpu_sample_time = 0.0
             target["gpu_forward_time"] = batch_gpu_forward_time
             target["gpu_sample_time"] = batch_gpu_sample_time

From 51f7956172e1434f9470304764f3475a04d8221a Mon Sep 17 00:00:00 2001
From: shuyixiong <219646547+shuyixiong@users.noreply.github.com>
Date: Thu, 16 Apr 2026 02:37:40 +0800
Subject: [PATCH 4/4] [None][fix] Pin Ray version to 2.54.1 in slurm CI stage (#13085)

Signed-off-by: Shuyi Xiong <219646547+shuyixiong@users.noreply.github.com>
---
 jenkins/scripts/slurm_install.sh            | 2 +-
 tests/integration/defs/verl/verl_config.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh
index cb1ec4bc83cb..e5531f494342 100644
--- a/jenkins/scripts/slurm_install.sh
+++ b/jenkins/scripts/slurm_install.sh
@@ -26,7 +26,7 @@ slurm_install_setup() {
     retry_command apt-get install -y libffi-dev
     nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
     if [[ $pytestCommand == *--run-ray* ]]; then
-        retry_command pip3 install --retries 10 ray[default]
+        retry_command pip3 install --retries 10 "ray[default]==2.54.1"
    fi
     retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
     retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
diff --git a/tests/integration/defs/verl/verl_config.yml b/tests/integration/defs/verl/verl_config.yml
index a5866a2d91c0..3ad17bcd924f 100644
--- a/tests/integration/defs/verl/verl_config.yml
+++ b/tests/integration/defs/verl/verl_config.yml
@@ -31,7 +31,7 @@ verl_config:
     - "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git"
    - "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0"
     - "pip3 install pytest-asyncio"
-    - "pip3 install --no-cache-dir 'ray[default]'"
+    - "pip3 install --no-cache-dir 'ray[default]==2.54.1'"
 # The environment variables to expose in the container before setting up