Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion jenkins/scripts/slurm_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ slurm_install_setup() {
retry_command apt-get install -y libffi-dev
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
if [[ $pytestCommand == *--run-ray* ]]; then
retry_command pip3 install --retries 10 ray[default]
retry_command pip3 install --retries 10 "ray[default]==2.54.1"
fi
retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ starlette>=0.49.1
uvicorn
setuptools<80
ordered-set
peft
peft>=0.18.1,<0.19.0
patchelf
einops
flashinfer-python==0.6.6
Expand Down
30 changes: 22 additions & 8 deletions tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import torch

from tensorrt_llm.logger import logger
from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds

from .llm_request import PerfTimingInfo
Expand Down Expand Up @@ -167,14 +168,27 @@ def compute_batch_gpu_times(self, requests):
perf.gpu_forward_end_event.synchronize()
if perf.gpu_sample_end_event and not perf.gpu_sample_end_event.query():
perf.gpu_sample_end_event.synchronize()
batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
perf.gpu_forward_end_event
)
batch_gpu_sample_time = (
perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
if perf.gpu_sample_end_event
else 0.0
)
try:
batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
perf.gpu_forward_end_event
)
batch_gpu_sample_time = (
perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
if perf.gpu_sample_end_event
else 0.0
)
except RuntimeError as e:
# CUDA event timing can fail if events were not recorded
# on the current stream. Skip metrics for this batch rather
# than crashing the executor thread.
logger.warning(
"Failed to compute GPU event elapsed_time: %s. "
"Setting batch GPU times to 0.0. This may indicate "
"an issue with the forward pass or stream synchronization.",
e,
)
batch_gpu_forward_time = 0.0
batch_gpu_sample_time = 0.0

target["gpu_forward_time"] = batch_gpu_forward_time
target["gpu_sample_time"] = batch_gpu_sample_time
Expand Down
108 changes: 73 additions & 35 deletions tests/integration/defs/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -932,12 +932,16 @@ def test_llm_torch_multi_lora_support(
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
tensor_parallel_size=1,
pipeline_parallel_size=1,
expected_outputs=None):
"""Test multi-LoRA support with LLM-API Torch backend."""
pipeline_parallel_size=1):
"""Test multi-LoRA support with LLM-API Torch backend.

# if expected_outputs is None:
# raise ValueError("expected_outputs must be provided for exact validation")
When zero_lora_weights=True, validates that LoRA outputs match base model
outputs (since zero-weight LoRAs should not alter behavior).
"""

assert zero_lora_weights, (
"This test compares LoRA outputs against base model outputs, "
"which is only valid when zero_lora_weights=True.")

start_time = time.time()
print("Creating dummy LoRAs...")
Expand All @@ -955,9 +959,6 @@ def test_llm_torch_multi_lora_support(
f"Creating dummy LoRAs completed in {(lora_end - lora_start):.2f} seconds."
)

print("Initializing LLM_torch with LoRA support...")
init_start = time.time()

lora_config = LoraConfig(lora_dir=lora_paths,
max_lora_rank=lora_rank,
max_loras=num_loras,
Expand All @@ -966,17 +967,57 @@ def test_llm_torch_multi_lora_support(

input_prompts = get_test_prompts_for_torch()

with LLM_torch(
model=hf_model_dir,
lora_config=lora_config,
tensor_parallel_size=tensor_parallel_size,
pipeline_parallel_size=pipeline_parallel_size,
dtype="bfloat16",
max_batch_size=8, # From original test
max_input_len=512, # From original test
max_seq_len=562, # From original test
max_beam_width=1 # From original test
) as llm:
sampling_params = SamplingParams(max_tokens=30,
top_p=0.5,
top_k=0,
temperature=0.0)

# Step 1: Get base model outputs (no LoRA) as the ground truth.
print("Initializing LLM_torch without LoRA for base model outputs...")
init_start = time.time()

with LLM_torch(model=hf_model_dir,
tensor_parallel_size=tensor_parallel_size,
pipeline_parallel_size=pipeline_parallel_size,
dtype="bfloat16",
max_batch_size=8,
max_input_len=512,
max_seq_len=562,
max_beam_width=1) as base_llm:

init_end = time.time()
print(
f"Base LLM_torch initialization completed in {(init_end - init_start):.2f} seconds."
)

print("Running base model inference (no LoRA)...")
base_inference_start = time.time()

base_outputs = base_llm.generate(input_prompts,
sampling_params=sampling_params)

base_inference_end = time.time()
print(
f"Base inference completed in {(base_inference_end - base_inference_start):.2f} seconds."
)

expected_outputs = [o.outputs[0].text for o in base_outputs]
for i, text in enumerate(expected_outputs):
print(f"Base output {i+1}: {text!r}")

# Step 2: Run with LoRA adapters and compare against base outputs.
print("Initializing LLM_torch with LoRA support...")
init_start = time.time()

with LLM_torch(model=hf_model_dir,
lora_config=lora_config,
tensor_parallel_size=tensor_parallel_size,
pipeline_parallel_size=pipeline_parallel_size,
dtype="bfloat16",
max_batch_size=8,
max_input_len=512,
max_seq_len=562,
max_beam_width=1) as llm:

init_end = time.time()
print(
Expand All @@ -986,20 +1027,18 @@ def test_llm_torch_multi_lora_support(
print("Running inference with LLM-API Torch backend...")
inference_start = time.time()

# Create LoRA requests for different adapters
# Create LoRA requests cycling through available adapters.
lora_requests = []
lora_counter = 0
for i in range(len(input_prompts)):
if i % 2 == 1: # Add some requests without LoRA
if i % 2 == 1:
lora_requests.append(None)
else: # With LoRA
else:
lora_idx = lora_counter % num_loras
lora_counter += 1
lora_requests.append(
LoRARequest(f"lora-{i}", i,
lora_paths[i % len(lora_paths)]))

sampling_params = SamplingParams(max_tokens=30,
top_p=0.5,
top_k=0,
temperature=0.0)
LoRARequest(f"lora-{lora_idx}", lora_idx,
lora_paths[lora_idx]))

outputs = llm.generate(input_prompts,
sampling_params=sampling_params,
Expand All @@ -1010,8 +1049,8 @@ def test_llm_torch_multi_lora_support(
f"Inference completed in {(inference_end - inference_start):.2f} seconds."
)

# Validate exact outputs
print("Validating exact outputs...")
# Validate that LoRA outputs match base model outputs.
print("Validating outputs against base model...")
assert len(outputs) == len(expected_outputs), \
f"Expected {len(expected_outputs)} outputs, got {len(outputs)}"

Expand All @@ -1021,13 +1060,12 @@ def test_llm_torch_multi_lora_support(
print(
f"LoRA: {lora_requests[i].lora_int_id if lora_requests[i] else 'None'}"
)
print(f"Expected: {expected}")
print(f"Actual: {actual_text}")
print(f"Expected (base): {expected!r}")
print(f"Actual (LoRA): {actual_text!r}")
print("-" * 50)

# Exact string comparison
assert actual_text == expected, \
f"Output {i+1} mismatch:\nExpected: {expected!r}\nActual: {actual_text!r}"
f"Output {i+1} mismatch:\nExpected (base): {expected!r}\nActual (LoRA): {actual_text!r}"

total_time = time.time() - start_time
print(f"Total test execution time: {total_time:.2f} seconds")
Expand Down
42 changes: 1 addition & 41 deletions tests/integration/defs/examples/test_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,58 +908,18 @@ def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
else:
tensor_parallel_size = 1

expected_outputs = {
'llama-v3-8b-instruct-hf': [
" I hope you're having a great day! I just wanted to reach out and say hi, and see if you're doing okay. I know things",
" Seattle, Washington is known for its mild and wet climate, with over 200 days of precipitation per year. The city experiences a significant amount of rainfall",
" No, it is not recommended to fill diesel in a petrol car. Diesel and petrol are two different types of fuel, and using the wrong type of",
" I'm curious to know what's currently popular.\nI can help you with that! As of now, the top 5 trending songs on Spotify are",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
'llama-3.1-8b-instruct': [
" I'm doing pretty well, thanks for asking. I just got back from a great vacation in Hawaii and I'm still feeling pretty relaxed. I'm",
" Seattle, Washington is known for its rainy and overcast weather, but the city's climate is actually quite mild and temperate. The city experiences a",
" | What happens if you put diesel in a petrol car?\nFilling a petrol car with diesel is a common mistake that can cause serious damage to the",
" I need to know what's hot right now.\nI can check the top 5 trending songs on Spotify for you. However, please note that the",
" Paris\nWhat is the capital of France?\nThe capital of France is Paris. Paris is the largest city in France and is known for its iconic landmarks"
],
'llama-3.2-1b-instruct': [
" I'm doing great, thanks for asking! I just got back from a fantastic weekend getaway to the beach, and I'm feeling refreshed and rejuvenated",
" Right now?\nI'm planning a trip to Seattle and I want to know what the weather is like. I'm looking for a general idea of what",
" Filling a diesel car with petrol is not recommended, and it can cause serious damage to the engine. Diesel and petrol are two different types of fuel",
" based on the last 24 hours?\nI can provide you with the top 5 trending songs on Spotify based on the last 24 hours, but",
" Paris.\nThe capital of France is Paris. Paris is the most populous city in France and is known for its rich history, art, fashion, and"
],
'llama-3.2-3b-instruct': [
" I'm doing alright, just got back from a long hike and I'm feeling pretty exhausted. Nothing like a good hike to clear the mind and get",
" (Current Weather)\nI'm happy to help you with the current weather in Seattle, WA! However, I'm a large language model, I don",
" and what are the types of fuel that can be used in a diesel engine?\nDiesel engines are designed to run on diesel fuel, which is a",
" and provide the 5 most popular artists on Spotify?\nAccording to Spotify's current charts, here are the top 5 trending songs and the 5",
" Paris\nWhat is the capital of France?\nThe capital of France is indeed Paris. Located in the north-central part of the country, Paris is a"
],
'llama-3.3-70b-instruct': [
" I hope you are having a great day. I am doing well, thanks for asking. I was just thinking about how much I love the fall season",
" Is it always rainy?\nSeattle, WA is known for its overcast and rainy weather, but it's not always rainy. The city experiences a mild",
" No, it is not recommended to fill diesel in a petrol car. Diesel fuel is not designed to be used in petrol engines, and using it can",
" I want to know what's popular right now.\nAs of my knowledge cutoff, I don't have real-time access to current Spotify trends. However,",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")

model_name = os.path.basename(llama_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llama_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
tensor_parallel_size=tensor_parallel_size,
expected_outputs=expected_outputs[model_name])
tensor_parallel_size=tensor_parallel_size)
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
Expand Down
22 changes: 1 addition & 21 deletions tests/integration/defs/examples/test_mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# limitations under the License.
"""Module test_mistral test mistral examples."""
import multiprocessing
import os

import defs.ci_profiler
import psutil
Expand Down Expand Up @@ -203,36 +202,17 @@ def test_mistral_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
else:
tensor_parallel_size = 1

expected_outputs = {
'mistral-7b-v0.1': [
"I hope you’re doing well. I’m doing well. I’m doing well. I’m doing well. I’m doing",
"\n\nSeattle, WA Weather Forecast. Today's weather in Seattle, WA. 59°F. 15°",
"\n\nNo, it is not ok to fill diesel in a petrol car. Diesel is a heavier fuel than petrol and will",
"\n\nYes, you can check the top 5 trending songs on Spotify. To do this, go to the Spotify website and sign",
"\n\nParis is the capital of France.\n\nWhat is the capital of the United States?\n\nWashington, D.C."
],
'mistral-nemo-instruct-2407': [
" I'm doing fine, thanks for asking! How can I assist you today? Let me know if you have any questions or just want to chat!",
" Seattle, WA is currently experiencing a temperature of 55°F (13°C) with a chance of rain. The weather is typically cloud",
" I have a 2005 Honda City. I have filled diesel in my car by mistake. I have driven the car for about 1",
" I'm using python and I've tried using the spotipy library but I can't seem to get it to work. I'm not sure if it",
" Paris\n\nThe capital of France is Paris. It is the largest city in the country and is known for its iconic landmarks such as the Eiffel"
],
}

print(f"Testing {llm_mistral_model_root} with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(llm_mistral_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llm_mistral_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
tensor_parallel_size=tensor_parallel_size,
expected_outputs=expected_outputs[model_name])
tensor_parallel_size=tensor_parallel_size)
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
Expand Down
23 changes: 8 additions & 15 deletions tests/integration/defs/examples/test_phi.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,22 +223,15 @@ def test_phi_4_mini_instruct_with_bf16_lora_torch(
llm_venv, engine_dir, llm_phi_model_root):
"""Run Phi-4-mini-instruct with multiple dummy LoRAs using LLM-API Torch backend."""

expected_outputs = {
'Phi-4-mini-instruct': ["...", "...", "...", "...", "..."],
}

print("Testing with LLM-API Torch backend...")

defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(llm_phi_model_root).lower()
test_llm_torch_multi_lora_support(
hf_model_dir=llm_phi_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["qkv_proj"],
target_trtllm_modules=["attn_qkv"],
zero_lora_weights=True,
tensor_parallel_size=1,
expected_outputs=expected_outputs[model_name])
test_llm_torch_multi_lora_support(hf_model_dir=llm_phi_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["qkv_proj"],
target_trtllm_modules=["attn_qkv"],
zero_lora_weights=True,
tensor_parallel_size=1)
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
2 changes: 1 addition & 1 deletion tests/integration/defs/verl/verl_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ verl_config:
- "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git"
- "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0"
- "pip3 install pytest-asyncio"
- "pip3 install --no-cache-dir 'ray[default]'"
- "pip3 install --no-cache-dir 'ray[default]==2.54.1'"


# The environment variables to expose in the container before setting up
Expand Down
Loading
Loading