From f2c3daca1c48832ae8d209196284561439accb21 Mon Sep 17 00:00:00 2001
From: xiweny <13230610+VALLIS-NERIA@users.noreply.github.com>
Date: Thu, 16 Apr 2026 00:23:11 +0800
Subject: [PATCH 1/4] [https://nvbugs/5846024][fix] Remove waivers (#12979)

Signed-off-by: xiweny <13230610+VALLIS-NERIA@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index cd71220150e5..eaa6857d8712 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -223,7 +223,6 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutl
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
 full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954)
 examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1] SKIP (https://nvbugs/5846178)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5846024)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[disable_skip_indexer] SKIP (https://nvbugs/5859886)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865)
 test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5864769)

From 50717ebc5360b72e5536768b0155879ddad47413 Mon Sep 17 00:00:00 2001
From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
Date: Wed, 15 Apr 2026 09:37:42 -0700
Subject: [PATCH 2/4] [https://nvbugs/5838178][fix] Fix failing lora test for Llama (#12950)

Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
---
 requirements.txt                               |   2 +-
 tests/integration/defs/common.py               | 108 ++++++++++++------
 tests/integration/defs/examples/test_llama.py  |  42 +------
 .../integration/defs/examples/test_mistral.py  |  22 +---
 tests/integration/defs/examples/test_phi.py    |  23 ++--
 tests/integration/test_lists/waives.txt        |  15 +--
 6 files changed, 86 insertions(+), 126 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b76e28208bdc..7b13d7e4141b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,7 +51,7 @@ starlette>=0.49.1
 uvicorn
 setuptools<80
 ordered-set
-peft
+peft>=0.18.1,<0.19.0
 patchelf
 einops
 flashinfer-python==0.6.6
diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py
index 02f084d71357..e56772973943 100644
--- a/tests/integration/defs/common.py
+++ b/tests/integration/defs/common.py
@@ -932,12 +932,16 @@ def test_llm_torch_multi_lora_support(
         target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
         zero_lora_weights=True,
         tensor_parallel_size=1,
-        pipeline_parallel_size=1,
-        expected_outputs=None):
-    """Test multi-LoRA support with LLM-API Torch backend."""
+        pipeline_parallel_size=1):
+    """Test multi-LoRA support with LLM-API Torch backend.
 
-    # if expected_outputs is None:
-    #     raise ValueError("expected_outputs must be provided for exact validation")
+    When zero_lora_weights=True, validates that LoRA outputs match base model
+    outputs (since zero-weight LoRAs should not alter behavior).
+    """
+
+    assert zero_lora_weights, (
+        "This test compares LoRA outputs against base model outputs, "
+        "which is only valid when zero_lora_weights=True.")
 
     start_time = time.time()
     print("Creating dummy LoRAs...")
@@ -955,9 +959,6 @@ def test_llm_torch_multi_lora_support(
         f"Creating dummy LoRAs completed in {(lora_end - lora_start):.2f} seconds."
     )
 
-    print("Initializing LLM_torch with LoRA support...")
-    init_start = time.time()
-
     lora_config = LoraConfig(lora_dir=lora_paths,
                              max_lora_rank=lora_rank,
                              max_loras=num_loras,
@@ -966,17 +967,57 @@ def test_llm_torch_multi_lora_support(
 
     input_prompts = get_test_prompts_for_torch()
 
-    with LLM_torch(
-            model=hf_model_dir,
-            lora_config=lora_config,
-            tensor_parallel_size=tensor_parallel_size,
-            pipeline_parallel_size=pipeline_parallel_size,
-            dtype="bfloat16",
-            max_batch_size=8,  # From original test
-            max_input_len=512,  # From original test
-            max_seq_len=562,  # From original test
-            max_beam_width=1  # From original test
-    ) as llm:
+    sampling_params = SamplingParams(max_tokens=30,
+                                     top_p=0.5,
+                                     top_k=0,
+                                     temperature=0.0)
+
+    # Step 1: Get base model outputs (no LoRA) as the ground truth.
+    print("Initializing LLM_torch without LoRA for base model outputs...")
+    init_start = time.time()
+
+    with LLM_torch(model=hf_model_dir,
+                   tensor_parallel_size=tensor_parallel_size,
+                   pipeline_parallel_size=pipeline_parallel_size,
+                   dtype="bfloat16",
+                   max_batch_size=8,
+                   max_input_len=512,
+                   max_seq_len=562,
+                   max_beam_width=1) as base_llm:
+
+        init_end = time.time()
+        print(
+            f"Base LLM_torch initialization completed in {(init_end - init_start):.2f} seconds."
+        )
+
+        print("Running base model inference (no LoRA)...")
+        base_inference_start = time.time()
+
+        base_outputs = base_llm.generate(input_prompts,
+                                         sampling_params=sampling_params)
+
+        base_inference_end = time.time()
+        print(
+            f"Base inference completed in {(base_inference_end - base_inference_start):.2f} seconds."
+        )
+
+        expected_outputs = [o.outputs[0].text for o in base_outputs]
+        for i, text in enumerate(expected_outputs):
+            print(f"Base output {i+1}: {text!r}")
+
+    # Step 2: Run with LoRA adapters and compare against base outputs.
+    print("Initializing LLM_torch with LoRA support...")
+    init_start = time.time()
+
+    with LLM_torch(model=hf_model_dir,
+                   lora_config=lora_config,
+                   tensor_parallel_size=tensor_parallel_size,
+                   pipeline_parallel_size=pipeline_parallel_size,
+                   dtype="bfloat16",
+                   max_batch_size=8,
+                   max_input_len=512,
+                   max_seq_len=562,
+                   max_beam_width=1) as llm:
 
         init_end = time.time()
         print(
@@ -986,20 +1027,18 @@ def test_llm_torch_multi_lora_support(
         print("Running inference with LLM-API Torch backend...")
         inference_start = time.time()
 
-        # Create LoRA requests for different adapters
+        # Create LoRA requests cycling through available adapters.
         lora_requests = []
+        lora_counter = 0
         for i in range(len(input_prompts)):
-            if i % 2 == 1:  # Add some requests without LoRA
+            if i % 2 == 1:
                 lora_requests.append(None)
-            else:  # With LoRA
+            else:
+                lora_idx = lora_counter % num_loras
+                lora_counter += 1
                 lora_requests.append(
-                    LoRARequest(f"lora-{i}", i,
-                                lora_paths[i % len(lora_paths)]))
-
-        sampling_params = SamplingParams(max_tokens=30,
-                                         top_p=0.5,
-                                         top_k=0,
-                                         temperature=0.0)
+                    LoRARequest(f"lora-{lora_idx}", lora_idx,
+                                lora_paths[lora_idx]))
 
         outputs = llm.generate(input_prompts,
                                sampling_params=sampling_params,
@@ -1010,8 +1049,8 @@ def test_llm_torch_multi_lora_support(
             f"Inference completed in {(inference_end - inference_start):.2f} seconds."
         )
 
-        # Validate exact outputs
-        print("Validating exact outputs...")
+        # Validate that LoRA outputs match base model outputs.
+        print("Validating outputs against base model...")
 
         assert len(outputs) == len(expected_outputs), \
             f"Expected {len(expected_outputs)} outputs, got {len(outputs)}"
@@ -1021,13 +1060,12 @@ def test_llm_torch_multi_lora_support(
             print(
                 f"LoRA: {lora_requests[i].lora_int_id if lora_requests[i] else 'None'}"
             )
-            print(f"Expected: {expected}")
-            print(f"Actual: {actual_text}")
+            print(f"Expected (base): {expected!r}")
+            print(f"Actual (LoRA): {actual_text!r}")
             print("-" * 50)
 
-            # Exact string comparison
             assert actual_text == expected, \
-                f"Output {i+1} mismatch:\nExpected: {expected!r}\nActual: {actual_text!r}"
+                f"Output {i+1} mismatch:\nExpected (base): {expected!r}\nActual (LoRA): {actual_text!r}"
 
     total_time = time.time() - start_time
     print(f"Total test execution time: {total_time:.2f} seconds")
diff --git a/tests/integration/defs/examples/test_llama.py b/tests/integration/defs/examples/test_llama.py
index a26abe8521eb..1dde330d5c45 100644
--- a/tests/integration/defs/examples/test_llama.py
+++ b/tests/integration/defs/examples/test_llama.py
@@ -908,49 +908,10 @@ def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
     else:
         tensor_parallel_size = 1
 
-    expected_outputs = {
-        'llama-v3-8b-instruct-hf': [
-            " I hope you're having a great day! I just wanted to reach out and say hi, and see if you're doing okay. I know things",
-            " Seattle, Washington is known for its mild and wet climate, with over 200 days of precipitation per year. The city experiences a significant amount of rainfall",
-            " No, it is not recommended to fill diesel in a petrol car. Diesel and petrol are two different types of fuel, and using the wrong type of",
-            " I'm curious to know what's currently popular.\nI can help you with that! As of now, the top 5 trending songs on Spotify are",
-            " Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
-        ],
-        'llama-3.1-8b-instruct': [
-            " I'm doing pretty well, thanks for asking. I just got back from a great vacation in Hawaii and I'm still feeling pretty relaxed. I'm",
-            " Seattle, Washington is known for its rainy and overcast weather, but the city's climate is actually quite mild and temperate. The city experiences a",
-            " | What happens if you put diesel in a petrol car?\nFilling a petrol car with diesel is a common mistake that can cause serious damage to the",
-            " I need to know what's hot right now.\nI can check the top 5 trending songs on Spotify for you. However, please note that the",
-            " Paris\nWhat is the capital of France?\nThe capital of France is Paris. Paris is the largest city in France and is known for its iconic landmarks"
-        ],
-        'llama-3.2-1b-instruct': [
-            " I'm doing great, thanks for asking! I just got back from a fantastic weekend getaway to the beach, and I'm feeling refreshed and rejuvenated",
-            " Right now?\nI'm planning a trip to Seattle and I want to know what the weather is like. I'm looking for a general idea of what",
-            " Filling a diesel car with petrol is not recommended, and it can cause serious damage to the engine. Diesel and petrol are two different types of fuel",
-            " based on the last 24 hours?\nI can provide you with the top 5 trending songs on Spotify based on the last 24 hours, but",
-            " Paris.\nThe capital of France is Paris. Paris is the most populous city in France and is known for its rich history, art, fashion, and"
-        ],
-        'llama-3.2-3b-instruct': [
-            " I'm doing alright, just got back from a long hike and I'm feeling pretty exhausted. Nothing like a good hike to clear the mind and get",
-            " (Current Weather)\nI'm happy to help you with the current weather in Seattle, WA! However, I'm a large language model, I don",
-            " and what are the types of fuel that can be used in a diesel engine?\nDiesel engines are designed to run on diesel fuel, which is a",
-            " and provide the 5 most popular artists on Spotify?\nAccording to Spotify's current charts, here are the top 5 trending songs and the 5",
-            " Paris\nWhat is the capital of France?\nThe capital of France is indeed Paris. Located in the north-central part of the country, Paris is a"
-        ],
-        'llama-3.3-70b-instruct': [
-            " I hope you are having a great day. I am doing well, thanks for asking. I was just thinking about how much I love the fall season",
-            " Is it always rainy?\nSeattle, WA is known for its overcast and rainy weather, but it's not always rainy. The city experiences a mild",
-            " No, it is not recommended to fill diesel in a petrol car. Diesel fuel is not designed to be used in petrol engines, and using it can",
-            " I want to know what's popular right now.\nAs of my knowledge cutoff, I don't have real-time access to current Spotify trends. However,",
-            " Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
-        ],
-    }
-
     print("Testing with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
-    model_name = os.path.basename(llama_model_root).lower()
     test_llm_torch_multi_lora_support(
         hf_model_dir=llama_model_root,
         llm_venv=llm_venv,
@@ -958,8 +919,7 @@ def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
         lora_rank=8,
         target_hf_modules=["q_proj", "k_proj", "v_proj"],
         zero_lora_weights=True,
-        tensor_parallel_size=tensor_parallel_size,
-        expected_outputs=expected_outputs[model_name])
+        tensor_parallel_size=tensor_parallel_size)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
     print(
         f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
diff --git a/tests/integration/defs/examples/test_mistral.py b/tests/integration/defs/examples/test_mistral.py
index 6c25df68ad8a..b5d07fb015cb 100644
--- a/tests/integration/defs/examples/test_mistral.py
+++ b/tests/integration/defs/examples/test_mistral.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Module test_mistral test mistral examples."""
 import multiprocessing
-import os
 
 import defs.ci_profiler
 import psutil
@@ -203,27 +202,9 @@ def test_mistral_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
     else:
         tensor_parallel_size = 1
 
-    expected_outputs = {
-        'mistral-7b-v0.1': [
-            "I hope you’re doing well. I’m doing well. I’m doing well. I’m doing well. I’m doing",
-            "\n\nSeattle, WA Weather Forecast. Today's weather in Seattle, WA. 59°F. 15°",
-            "\n\nNo, it is not ok to fill diesel in a petrol car. Diesel is a heavier fuel than petrol and will",
-            "\n\nYes, you can check the top 5 trending songs on Spotify. To do this, go to the Spotify website and sign",
-            "\n\nParis is the capital of France.\n\nWhat is the capital of the United States?\n\nWashington, D.C."
-        ],
-        'mistral-nemo-instruct-2407': [
-            " I'm doing fine, thanks for asking! How can I assist you today? Let me know if you have any questions or just want to chat!",
-            " Seattle, WA is currently experiencing a temperature of 55°F (13°C) with a chance of rain. The weather is typically cloud",
-            " I have a 2005 Honda City. I have filled diesel in my car by mistake. I have driven the car for about 1",
-            " I'm using python and I've tried using the spotipy library but I can't seem to get it to work. I'm not sure if it",
-            " Paris\n\nThe capital of France is Paris. It is the largest city in the country and is known for its iconic landmarks such as the Eiffel"
-        ],
-    }
-
     print(f"Testing {llm_mistral_model_root} with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
-    model_name = os.path.basename(llm_mistral_model_root).lower()
     test_llm_torch_multi_lora_support(
         hf_model_dir=llm_mistral_model_root,
         llm_venv=llm_venv,
@@ -231,8 +212,7 @@ def test_mistral_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
         lora_rank=8,
         target_hf_modules=["q_proj", "k_proj", "v_proj"],
         zero_lora_weights=True,
-        tensor_parallel_size=tensor_parallel_size,
-        expected_outputs=expected_outputs[model_name])
+        tensor_parallel_size=tensor_parallel_size)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
     print(
         f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
diff --git a/tests/integration/defs/examples/test_phi.py b/tests/integration/defs/examples/test_phi.py
index ca7e7fe22cc8..8a414137c1bd 100644
--- a/tests/integration/defs/examples/test_phi.py
+++ b/tests/integration/defs/examples/test_phi.py
@@ -223,22 +223,15 @@ def test_phi_4_mini_instruct_with_bf16_lora_torch(
         llm_venv, engine_dir, llm_phi_model_root):
     """Run Phi-4-mini-instruct with multiple dummy LoRAs using LLM-API Torch
     backend."""
-    expected_outputs = {
-        'Phi-4-mini-instruct': ["...", "...", "...", "...", "..."],
-    }
-
     print("Testing with LLM-API Torch backend...")
 
     defs.ci_profiler.start("test_llm_torch_multi_lora_support")
-    model_name = os.path.basename(llm_phi_model_root).lower()
-    test_llm_torch_multi_lora_support(
-        hf_model_dir=llm_phi_model_root,
-        llm_venv=llm_venv,
-        num_loras=2,
-        lora_rank=8,
-        target_hf_modules=["qkv_proj"],
-        target_trtllm_modules=["attn_qkv"],
-        zero_lora_weights=True,
-        tensor_parallel_size=1,
-        expected_outputs=expected_outputs[model_name])
+    test_llm_torch_multi_lora_support(hf_model_dir=llm_phi_model_root,
+                                      llm_venv=llm_venv,
+                                      num_loras=2,
+                                      lora_rank=8,
+                                      target_hf_modules=["qkv_proj"],
+                                      target_trtllm_modules=["attn_qkv"],
+                                      zero_lora_weights=True,
+                                      tensor_parallel_size=1)
     defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index eaa6857d8712..eefb20b19034 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -217,7 +217,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_c
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] SKIP (https://nvbugs/5821415)
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8] SKIP (https://nvbugs/5836830)
 accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)
-examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct] SKIP (https://nvbugs/5838178)
 cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
@@ -331,8 +330,8 @@ perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4
 perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] SKIP (https://nvbugs/5844149)
 perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] SKIP (https://nvbugs/6060119)
 perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] SKIP (https://nvbugs/6060119)
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=False] SKIP (https://nvbugs/6070878)
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=True] SKIP (https://nvbugs/6070878)
+full:sm89/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=False] SKIP (https://nvbugs/6070878)
+full:sm89/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=True] SKIP (https://nvbugs/6070878)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] SKIP (https://nvbugs/6026676)
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] SKIP (https://nvbugs/6069790)
 accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5981293)
@@ -351,16 +350,6 @@ accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B_Instruct_Eagle3::test_eagle
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6076560)
 accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[trtllm-flashinfer_ssm-False] SKIP (https://nvbugs/6076564)
 unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py SKIP (https://nvbugs/6076624)
-unittest/_torch/modeling/test_modeling_starcoder2.py::test_starcoder2_multi_lora SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_lora_many_adapters_no_memory_leak SKIP (https://nvbugs/6078438)
-examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/6078438)
-examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b] SKIP (https://nvbugs/6078438)
-examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b] SKIP (https://nvbugs/6078438)
-examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct] SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_bielik_11b_v2_2_instruct_multi_lora[None] SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_bielik_11b_v2_2_instruct_multi_lora[cuda_graph_config0] SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_gemma3_1b_instruct_multi_lora[None] SKIP (https://nvbugs/6078438)
-unittest/llmapi/test_llm_pytorch.py::test_gemma3_1b_instruct_multi_lora[cuda_graph_config0] SKIP (https://nvbugs/6078438)
 unittest/llmapi/test_llm_pytorch.py::test_llm_disagg_streaming_gen_cancelled SKIP (https://nvbugs/6078431)
 unittest/auto_deploy/singlegpu/transformations/library/test_mrope_delta_cache.py::test_qwen_registry_configs_explicitly_enable_mrope_delta_cache SKIP (https://nvbugs/6078421)
 llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 SKIP (https://nvbugs/6075431)

From 10010708ef051e63e76723a08b72431ed38222d1 Mon Sep 17 00:00:00 2001
From: Yifan Jiang <19356972+yifjiang@users.noreply.github.com>
Date: Wed, 15 Apr 2026 10:40:42 -0700
Subject: [PATCH 3/4] [None][fix] Guard CUDA event elapsed_time in perf_metrics_manager to prevent executor crash (#12868)

Signed-off-by: Yifan Jiang <19356972+yifjiang@users.noreply.github.com>
Co-authored-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com>
---
 .../_torch/pyexecutor/perf_metrics_manager.py | 30 ++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
index 9c2cbede57b0..4d8d8351e2aa 100644
--- a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
@@ -11,6 +11,7 @@
 
 import torch
 
+from tensorrt_llm.logger import logger
 from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
 
 from .llm_request import PerfTimingInfo
@@ -167,14 +168,27 @@ def compute_batch_gpu_times(self, requests):
                 perf.gpu_forward_end_event.synchronize()
             if perf.gpu_sample_end_event and not perf.gpu_sample_end_event.query():
                 perf.gpu_sample_end_event.synchronize()
-            batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
-                perf.gpu_forward_end_event
-            )
-            batch_gpu_sample_time = (
-                perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
-                if perf.gpu_sample_end_event
-                else 0.0
-            )
+            try:
+                batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
+                    perf.gpu_forward_end_event
+                )
+                batch_gpu_sample_time = (
+                    perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
+                    if perf.gpu_sample_end_event
+                    else 0.0
+                )
+            except RuntimeError as e:
+                # CUDA event timing can fail if events were not recorded
+                # on the current stream. Skip metrics for this batch rather
+                # than crashing the executor thread.
+                logger.warning(
+                    "Failed to compute GPU event elapsed_time: %s. "
+                    "Setting batch GPU times to 0.0. This may indicate "
+                    "an issue with the forward pass or stream synchronization.",
+                    e,
+                )
+                batch_gpu_forward_time = 0.0
+                batch_gpu_sample_time = 0.0
             target["gpu_forward_time"] = batch_gpu_forward_time
             target["gpu_sample_time"] = batch_gpu_sample_time

From 51f7956172e1434f9470304764f3475a04d8221a Mon Sep 17 00:00:00 2001
From: shuyixiong <219646547+shuyixiong@users.noreply.github.com>
Date: Thu, 16 Apr 2026 02:37:40 +0800
Subject: [PATCH 4/4] [None][fix] Pin Ray version to 2.54.1 in slurm CI stage (#13085)

Signed-off-by: Shuyi Xiong <219646547+shuyixiong@users.noreply.github.com>
---
 jenkins/scripts/slurm_install.sh            | 2 +-
 tests/integration/defs/verl/verl_config.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh
index cb1ec4bc83cb..e5531f494342 100644
--- a/jenkins/scripts/slurm_install.sh
+++ b/jenkins/scripts/slurm_install.sh
@@ -26,7 +26,7 @@ slurm_install_setup() {
     retry_command apt-get install -y libffi-dev
     nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
     if [[ $pytestCommand == *--run-ray* ]]; then
-        retry_command pip3 install --retries 10 ray[default]
+        retry_command pip3 install --retries 10 "ray[default]==2.54.1"
    fi
     retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
     retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
diff --git a/tests/integration/defs/verl/verl_config.yml b/tests/integration/defs/verl/verl_config.yml
index a5866a2d91c0..3ad17bcd924f 100644
--- a/tests/integration/defs/verl/verl_config.yml
+++ b/tests/integration/defs/verl/verl_config.yml
@@ -31,7 +31,7 @@ verl_config:
     - "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git"
    - "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0"
     - "pip3 install pytest-asyncio"
-    - "pip3 install --no-cache-dir 'ray[default]'"
+    - "pip3 install --no-cache-dir 'ray[default]==2.54.1'"
 # The environment variables to expose in the container before setting up