yingguo-trt · pull · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh
@@ -26,7 +26,7 @@ slurm_install_setup() {
         retry_command apt-get install -y libffi-dev
         nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
         if [[ $pytestCommand == *--run-ray* ]]; then
-            retry_command pip3 install --retries 10 ray[default]
+            retry_command pip3 install --retries 10 "ray[default]==2.54.1"
         fi
         retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
         retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"

diff --git a/requirements.txt b/requirements.txt
@@ -51,7 +51,7 @@ starlette>=0.49.1
 uvicorn
 setuptools<80
 ordered-set
-peft
+peft>=0.18.1,<0.19.0
 patchelf
 einops
 flashinfer-python==0.6.6

diff --git a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
@@ -11,6 +11,7 @@
 
 import torch
 
+from tensorrt_llm.logger import logger
 from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
 
 from .llm_request import PerfTimingInfo
@@ -167,14 +168,27 @@ def compute_batch_gpu_times(self, requests):
                     perf.gpu_forward_end_event.synchronize()
                 if perf.gpu_sample_end_event and not perf.gpu_sample_end_event.query():
                     perf.gpu_sample_end_event.synchronize()
-                batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
-                    perf.gpu_forward_end_event
-                )
-                batch_gpu_sample_time = (
-                    perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
-                    if perf.gpu_sample_end_event
-                    else 0.0
-                )
+                try:
+                    batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
+                        perf.gpu_forward_end_event
+                    )
+                    batch_gpu_sample_time = (
+                        perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
+                        if perf.gpu_sample_end_event
+                        else 0.0
+                    )
+                except RuntimeError as e:
+                    # CUDA event timing can fail if events were not recorded
+                    # on the current stream. Skip metrics for this batch rather
+                    # than crashing the executor thread.
+                    logger.warning(
+                        "Failed to compute GPU event elapsed_time: %s. "
+                        "Setting batch GPU times to 0.0. This may indicate "
+                        "an issue with the forward pass or stream synchronization.",
+                        e,
+                    )
+                    batch_gpu_forward_time = 0.0
+                    batch_gpu_sample_time = 0.0
 
             target["gpu_forward_time"] = batch_gpu_forward_time
             target["gpu_sample_time"] = batch_gpu_sample_time

diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py
@@ -932,12 +932,16 @@ def test_llm_torch_multi_lora_support(
         target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
         zero_lora_weights=True,
         tensor_parallel_size=1,
-        pipeline_parallel_size=1,
-        expected_outputs=None):
-    """Test multi-LoRA support with LLM-API Torch backend."""
+        pipeline_parallel_size=1):
+    """Test multi-LoRA support with LLM-API Torch backend.
 
-    # if expected_outputs is None:
-    #     raise ValueError("expected_outputs must be provided for exact validation")
+    When zero_lora_weights=True, validates that LoRA outputs match base model
+    outputs (since zero-weight LoRAs should not alter behavior).
+    """
+
+    assert zero_lora_weights, (
+        "This test compares LoRA outputs against base model outputs, "
+        "which is only valid when zero_lora_weights=True.")
 
     start_time = time.time()
     print("Creating dummy LoRAs...")
@@ -955,9 +959,6 @@ def test_llm_torch_multi_lora_support(
         f"Creating dummy LoRAs completed in {(lora_end - lora_start):.2f} seconds."
     )
 
-    print("Initializing LLM_torch with LoRA support...")
-    init_start = time.time()
-
     lora_config = LoraConfig(lora_dir=lora_paths,
                              max_lora_rank=lora_rank,
                              max_loras=num_loras,
@@ -966,17 +967,57 @@ def test_llm_torch_multi_lora_support(
 
     input_prompts = get_test_prompts_for_torch()
 
-    with LLM_torch(
-            model=hf_model_dir,
-            lora_config=lora_config,
-            tensor_parallel_size=tensor_parallel_size,
-            pipeline_parallel_size=pipeline_parallel_size,
-            dtype="bfloat16",
-            max_batch_size=8,  # From original test
-            max_input_len=512,  # From original test
-            max_seq_len=562,  # From original test
-            max_beam_width=1  # From original test
-    ) as llm:
+    sampling_params = SamplingParams(max_tokens=30,
+                                     top_p=0.5,
+                                     top_k=0,
+                                     temperature=0.0)
+
+    # Step 1: Get base model outputs (no LoRA) as the ground truth.
+    print("Initializing LLM_torch without LoRA for base model outputs...")
+    init_start = time.time()
+
+    with LLM_torch(model=hf_model_dir,
+                   tensor_parallel_size=tensor_parallel_size,
+                   pipeline_parallel_size=pipeline_parallel_size,
+                   dtype="bfloat16",
+                   max_batch_size=8,
+                   max_input_len=512,
+                   max_seq_len=562,
+                   max_beam_width=1) as base_llm:
+
+        init_end = time.time()
+        print(
+            f"Base LLM_torch initialization completed in {(init_end - init_start):.2f} seconds."
+        )
+
+        print("Running base model inference (no LoRA)...")
+        base_inference_start = time.time()
+
+        base_outputs = base_llm.generate(input_prompts,
+                                         sampling_params=sampling_params)
+
+        base_inference_end = time.time()
+        print(
+            f"Base inference completed in {(base_inference_end - base_inference_start):.2f} seconds."
+        )
+
+    expected_outputs = [o.outputs[0].text for o in base_outputs]
+    for i, text in enumerate(expected_outputs):
+        print(f"Base output {i+1}: {text!r}")
+
+    # Step 2: Run with LoRA adapters and compare against base outputs.
+    print("Initializing LLM_torch with LoRA support...")
+    init_start = time.time()
+
+    with LLM_torch(model=hf_model_dir,
+                   lora_config=lora_config,
+                   tensor_parallel_size=tensor_parallel_size,
+                   pipeline_parallel_size=pipeline_parallel_size,
+                   dtype="bfloat16",
+                   max_batch_size=8,
+                   max_input_len=512,
+                   max_seq_len=562,
+                   max_beam_width=1) as llm:
 
         init_end = time.time()
         print(
@@ -986,20 +1027,18 @@ def test_llm_torch_multi_lora_support(
         print("Running inference with LLM-API Torch backend...")
         inference_start = time.time()
 
-        # Create LoRA requests for different adapters
+        # Create LoRA requests cycling through available adapters.
         lora_requests = []
+        lora_counter = 0
         for i in range(len(input_prompts)):
-            if i % 2 == 1:  # Add some requests without LoRA
+            if i % 2 == 1:
                 lora_requests.append(None)
-            else:  # With LoRA
+            else:
+                lora_idx = lora_counter % num_loras
+                lora_counter += 1
                 lora_requests.append(
-                    LoRARequest(f"lora-{i}", i,
-                                lora_paths[i % len(lora_paths)]))
-
-        sampling_params = SamplingParams(max_tokens=30,
-                                         top_p=0.5,
-                                         top_k=0,
-                                         temperature=0.0)
+                    LoRARequest(f"lora-{lora_idx}", lora_idx,
+                                lora_paths[lora_idx]))
 
         outputs = llm.generate(input_prompts,
                                sampling_params=sampling_params,
@@ -1010,8 +1049,8 @@ def test_llm_torch_multi_lora_support(
             f"Inference completed in {(inference_end - inference_start):.2f} seconds."
         )
 
-        # Validate exact outputs
-        print("Validating exact outputs...")
+        # Validate that LoRA outputs match base model outputs.
+        print("Validating outputs against base model...")
         assert len(outputs) == len(expected_outputs), \
             f"Expected {len(expected_outputs)} outputs, got {len(outputs)}"
 
@@ -1021,13 +1060,12 @@ def test_llm_torch_multi_lora_support(
             print(
                 f"LoRA: {lora_requests[i].lora_int_id if lora_requests[i] else 'None'}"
             )
-            print(f"Expected: {expected}")
-            print(f"Actual: {actual_text}")
+            print(f"Expected (base): {expected!r}")
+            print(f"Actual (LoRA):   {actual_text!r}")
             print("-" * 50)
 
-            # Exact string comparison
             assert actual_text == expected, \
-                f"Output {i+1} mismatch:\nExpected: {expected!r}\nActual: {actual_text!r}"
+                f"Output {i+1} mismatch:\nExpected (base): {expected!r}\nActual (LoRA):   {actual_text!r}"
 
     total_time = time.time() - start_time
     print(f"Total test execution time: {total_time:.2f} seconds")

diff --git a/tests/integration/defs/examples/test_llama.py b/tests/integration/defs/examples/test_llama.py
@@ -908,58 +908,18 @@ def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
     else:
         tensor_parallel_size = 1
 
-    expected_outputs = {
-        'llama-v3-8b-instruct-hf': [
-            " I hope you're having a great day! I just wanted to reach out and say hi, and see if you're doing okay. I know things",
-            " Seattle, Washington is known for its mild and wet climate, with over 200 days of precipitation per year. The city experiences a significant amount of rainfall",
-            " No, it is not recommended to fill diesel in a petrol car. Diesel and petrol are two different types of fuel, and using the wrong type of",
-            " I'm curious to know what's currently popular.\nI can help you with that! As of now, the top 5 trending songs on Spotify are",
-            " Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
-        ],
-        'llama-3.1-8b-instruct': [
-            " I'm doing pretty well, thanks for asking. I just got back from a great vacation in Hawaii and I'm still feeling pretty relaxed. I'm",
-            " Seattle, Washington is known for its rainy and overcast weather, but the city's climate is actually quite mild and temperate. The city experiences a",
-            " | What happens if you put diesel in a petrol car?\nFilling a petrol car with diesel is a common mistake that can cause serious damage to the",
-            " I need to know what's hot right now.\nI can check the top 5 trending songs on Spotify for you. However, please note that the",
-            " Paris\nWhat is the capital of France?\nThe capital of France is Paris. Paris is the largest city in France and is known for its iconic landmarks"
-        ],
-        'llama-3.2-1b-instruct': [
-            " I'm doing great, thanks for asking! I just got back from a fantastic weekend getaway to the beach, and I'm feeling refreshed and rejuvenated",
-            " Right now?\nI'm planning a trip to Seattle and I want to know what the weather is like. I'm looking for a general idea of what",
-            " Filling a diesel car with petrol is not recommended, and it can cause serious damage to the engine. Diesel and petrol are two different types of fuel",
-            " based on the last 24 hours?\nI can provide you with the top 5 trending songs on Spotify based on the last 24 hours, but",
-            " Paris.\nThe capital of France is Paris. Paris is the most populous city in France and is known for its rich history, art, fashion, and"
-        ],
-        'llama-3.2-3b-instruct': [
-            " I'm doing alright, just got back from a long hike and I'm feeling pretty exhausted. Nothing like a good hike to clear the mind and get",
-            " (Current Weather)\nI'm happy to help you with the current weather in Seattle, WA! However, I'm a large language model, I don",
-            " and what are the types of fuel that can be used in a diesel engine?\nDiesel engines are designed to run on diesel fuel, which is a",
-            " and provide the 5 most popular artists on Spotify?\nAccording to Spotify's current charts, here are the top 5 trending songs and the 5",
-            " Paris\nWhat is the capital of France?\nThe capital of France is indeed Paris. Located in the north-central part of the country, Paris is a"
-        ],
-        'llama-3.3-70b-instruct': [
-            " I hope you are having a great day. I am doing well, thanks for asking. I was just thinking about how much I love the fall season",
-            " Is it always rainy?\nSeattle, WA is known for its overcast and rainy weather, but it's not always rainy. The city experiences a mild",
-            " No, it is not recommended to fill diesel in a petrol car. Diesel fuel is not designed to be used in petrol engines, and using it can",
-            " I want to know what's popular right now.\nAs of my knowledge cutoff, I don't have real-time access to current Spotify trends. However,",
-            " Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
-        ],
-    }
-
     print("Testing with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
 
-    model_name = os.path.basename(llama_model_root).lower()
     test_llm_torch_multi_lora_support(
         hf_model_dir=llama_model_root,
         llm_venv=llm_venv,
         num_loras=2,
         lora_rank=8,
         target_hf_modules=["q_proj", "k_proj", "v_proj"],
         zero_lora_weights=True,
-        tensor_parallel_size=tensor_parallel_size,
-        expected_outputs=expected_outputs[model_name])
+        tensor_parallel_size=tensor_parallel_size)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
     print(
         f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"

diff --git a/tests/integration/defs/examples/test_mistral.py b/tests/integration/defs/examples/test_mistral.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Module test_mistral test mistral examples."""
 import multiprocessing
-import os
 
 import defs.ci_profiler
 import psutil
@@ -203,36 +202,17 @@ def test_mistral_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
     else:
         tensor_parallel_size = 1
 
-    expected_outputs = {
-        'mistral-7b-v0.1': [
-            "I hope you’re doing well. I’m doing well. I’m doing well. I’m doing well. I’m doing",
-            "\n\nSeattle, WA Weather Forecast. Today's weather in Seattle, WA. 59°F. 15°",
-            "\n\nNo, it is not ok to fill diesel in a petrol car. Diesel is a heavier fuel than petrol and will",
-            "\n\nYes, you can check the top 5 trending songs on Spotify. To do this, go to the Spotify website and sign",
-            "\n\nParis is the capital of France.\n\nWhat is the capital of the United States?\n\nWashington, D.C."
-        ],
-        'mistral-nemo-instruct-2407': [
-            " I'm doing fine, thanks for asking! How can I assist you today? Let me know if you have any questions or just want to chat!",
-            " Seattle, WA is currently experiencing a temperature of 55°F (13°C) with a chance of rain. The weather is typically cloud",
-            " I have a 2005 Honda City. I have filled diesel in my car by mistake. I have driven the car for about 1",
-            " I'm using python and I've tried using the spotipy library but I can't seem to get it to work. I'm not sure if it",
-            " Paris\n\nThe capital of France is Paris. It is the largest city in the country and is known for its iconic landmarks such as the Eiffel"
-        ],
-    }
-
     print(f"Testing {llm_mistral_model_root} with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
-    model_name = os.path.basename(llm_mistral_model_root).lower()
     test_llm_torch_multi_lora_support(
         hf_model_dir=llm_mistral_model_root,
         llm_venv=llm_venv,
         num_loras=2,
         lora_rank=8,
         target_hf_modules=["q_proj", "k_proj", "v_proj"],
         zero_lora_weights=True,
-        tensor_parallel_size=tensor_parallel_size,
-        expected_outputs=expected_outputs[model_name])
+        tensor_parallel_size=tensor_parallel_size)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
     print(
         f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"

diff --git a/tests/integration/defs/examples/test_phi.py b/tests/integration/defs/examples/test_phi.py
@@ -223,22 +223,15 @@ def test_phi_4_mini_instruct_with_bf16_lora_torch(
         llm_venv, engine_dir, llm_phi_model_root):
     """Run Phi-4-mini-instruct with multiple dummy LoRAs using LLM-API Torch backend."""
 
-    expected_outputs = {
-        'Phi-4-mini-instruct': ["...", "...", "...", "...", "..."],
-    }
-
     print("Testing with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
-    model_name = os.path.basename(llm_phi_model_root).lower()
-    test_llm_torch_multi_lora_support(
-        hf_model_dir=llm_phi_model_root,
-        llm_venv=llm_venv,
-        num_loras=2,
-        lora_rank=8,
-        target_hf_modules=["qkv_proj"],
-        target_trtllm_modules=["attn_qkv"],
-        zero_lora_weights=True,
-        tensor_parallel_size=1,
-        expected_outputs=expected_outputs[model_name])
+    test_llm_torch_multi_lora_support(hf_model_dir=llm_phi_model_root,
+                                      llm_venv=llm_venv,
+                                      num_loras=2,
+                                      lora_rank=8,
+                                      target_hf_modules=["qkv_proj"],
+                                      target_trtllm_modules=["attn_qkv"],
+                                      zero_lora_weights=True,
+                                      tensor_parallel_size=1)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
diff --git a/tests/integration/defs/verl/verl_config.yml b/tests/integration/defs/verl/verl_config.yml
@@ -31,7 +31,7 @@ verl_config:
     - "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git"
     - "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0"
     - "pip3 install pytest-asyncio"
-    - "pip3 install --no-cache-dir 'ray[default]'"
+    - "pip3 install --no-cache-dir 'ray[default]==2.54.1'"
 
 
   # The environment variables to expose in the container before setting up