49 changes: 42 additions & 7 deletions jenkins/L0_Test.groovy
@@ -100,7 +100,7 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]

// GPU types that don't support dynamic driver flashing
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]

// ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
@@ -672,7 +672,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,

if (cluster.host.contains("dlcluster")) {
dockerArgs += " " + sh(script: 'echo " -e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"', returnStdout: true).trim()
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
if (fileExists('/dev/gdrdrv')) {
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
}
}
}

@@ -1562,7 +1564,7 @@ EOF_TIMEOUT_XML

def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMode = false)
{
def targetCould = "kubernetes-cpu"
def targetCloud = "kubernetes-cpu"
def selectors = """
nvidia.com/node_type: builder
kubernetes.io/arch: ${arch}
@@ -1571,6 +1573,8 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
def nodeLabelPrefix = ""
def jobName = getShortenedJobName(env.JOB_NAME)
def buildID = env.BUILD_ID
def tolerations = ""
def extraDeviceEnv = ""

def archSuffix = arch == "arm64" ? "arm" : "amd"
def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"
@@ -1653,14 +1657,40 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
def gpuType = KubernetesManager.selectGPU(type)
nodeLabelPrefix = type

targetCould = "kubernetes"
targetCloud = "kubernetes"
// DGX Spark requires a special setting for accessing the device.
// It has 128GB of unified memory per spec. Use half of the memory on the CPU side.
if (type == "gb10x") {
targetCloud = "nvks-sparks-cloud"
memorySize = "64Gi"
tolerations = """
tolerations:
- key: "node_for_blossom_trt"
operator: "Exists"
effect: "NoSchedule"
"""
extraDeviceEnv = """
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,utility"
"""
}

// The following GPU types don't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
selectors = """
if (type == "gb10x") {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
nvidia.com/gpu.machine: NVIDIA_DGX_Spark
nvidia.com/tenant: blossom_trt"""
} else {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
nvidia.com/gpu_type: ${gpuType}"""
}
} else if (perfMode && !hasMultipleGPUs) {
// Use single GPU machine with "tensorrt/test_type: perf" for stable perf testing.
// H100 / A100 single GPU machine has this unique label in TensorRT Blossom pool.
@@ -1744,7 +1774,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
}

def podConfig = [
cloud: targetCould,
cloud: targetCloud,
namespace: "sw-tensorrt",
label: nodeLabel,
yaml: """
@@ -1771,6 +1801,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
valueFrom:
fieldRef:
fieldPath: spec.nodeName
${extraDeviceEnv}
- name: jnlp
image: ${jnlpImage}
args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
@@ -1790,6 +1821,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
medium: Memory
${llmModelVolume}
${pvcVolume}
${tolerations}
""".stripIndent(),
]

@@ -3202,16 +3234,19 @@ def launchTestJobs(pipeline, testFilter)
parallelJobs += parallelSlurmJobs

// Try to match what is being tested on x86 H100_PCIe.
// The total machine time is scaled proportionally to the number of GPUs of each type.
// SBSA machines from the Blossom machine pool
SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
// DGX Spark is also known as the GB10 Grace Blackwell Superchip.
"GB10-PyTorch-1": ["gb10x", "l0_gb10", 1, 1],
]
fullSet += SBSATestConfigs.keySet()

SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
// Perf sanity post merge test
"GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
// Disable GB300 stages due to nodes will be offline temporarily.
172 changes: 156 additions & 16 deletions tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -37,6 +37,7 @@
from tqdm import tqdm
from transformers import PretrainedConfig

import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils
from tensorrt_llm._ipc_utils import can_access_peer
from tensorrt_llm._utils import get_sm_version
from tensorrt_llm.functional import PositionEmbeddingType
@@ -142,6 +143,44 @@ def __init__(self, model, is_draft_model: bool = False):

def load_weights(self, weights: Dict, skip_modules: List[str] = []):

def requantize_weight_with_new_scale(weight, weight_scale, old_scale_2,
new_scale_2, device):
"""
Dequantize FP4 weights and requantize with a new scale.

Args:
weight: 2-D packed FP4 (e2m1x2) quantized weight tensor (two values per byte)
weight_scale: FP8 per-block scaling factors
old_scale_2: original global scale (amax/(448*6))
new_scale_2: new global scale (amax/(448*6))
device: target device for computation

Returns:
(requantized_weight, new_weight_scale)
"""
# Remember original dtype of weight_scale
original_scale_dtype = weight_scale.dtype
original_scale_shape = weight_scale.shape

# Dequantize
dequant_shape = (weight.shape[0], weight.shape[1] * 2)
weight_dequant = torch.ops.tensorrt_llm.e2m1_and_ufp8sf_scale_to_float_v2(
weight.contiguous(),
weight_scale.flatten().view(
fp4_utils.float4_sf_dtype).contiguous(), old_scale_2, 16, 1,
True).to(dtype=torch.bfloat16).reshape(dequant_shape)

# Requantize using the new_scale_2
weight_requant, weight_scale_requant = torch.ops.trtllm.fp4_quantize(
weight_dequant.to(device),
1.0 / new_scale_2.to(device),
16, # scaling_vector_size
False)

# Ensure the returned scale has the same dtype as the input scale
return weight_requant.cpu(), weight_scale_requant.reshape(
original_scale_shape).view(original_scale_dtype).cpu()

def rename_moe_weight(weights: Dict, rename_rules: Dict):
result = {}
for key, value in weights.items():
@@ -355,27 +394,128 @@ def split_kv_b_proj(kv_b_proj: torch.Tensor,
).view(*attn_module.v_b_proj_dequant.shape).to(
attn_module.v_b_proj_dequant.dtype))
elif names[-1] == "kv_a_proj_with_mqa":
fused_a = weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight"][:]
if not is_lite:
q_a_proj = weights[
f"{'.'.join(names[:-1])}.q_a_proj.weight"][:]
fused_a = torch.cat([q_a_proj, fused_a], dim=0)

if f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight_scale_inv" in weights:
fused_a_scale = weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight_scale_inv"]
nvfp4_fused_a = self.model_config.get_quant_config(
).layer_quant_mode.has_nvfp4() and weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight"].dtype == fp4_utils.float4_e2m1x2 and weights[
f"{'.'.join(names[:-1])}.q_a_proj.weight"].dtype == fp4_utils.float4_e2m1x2
if nvfp4_fused_a:
########### input_scale
kv_a_proj_with_mqa_input_scale = weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.input_scale"]
if not is_lite:
q_a_proj_input_scale = weights[
f"{'.'.join(names[:-1])}.q_a_proj.input_scale"]
assert kv_a_proj_with_mqa_input_scale == q_a_proj_input_scale, "kv_a_proj_with_mqa.input_scale and q_a_proj.input_scale should be the same"
# modelopt ckpt stores amax/(448*6), convert to (448*6)/amax
shared_input_scale = kv_a_proj_with_mqa_input_scale
module.input_scale.data.copy_(1.0 / shared_input_scale)
E2M1_MAX = 6.0
module.inv_input_scale.data.copy_(module.input_scale /
E2M1_MAX)
########### weight_scale_2
need_requant_kv_a_proj_with_mqa = False
need_requant_q_a_proj = False
kv_a_proj_with_mqa_scale_2 = weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight_scale_2"]
shared_weight_scale_2 = kv_a_proj_with_mqa_scale_2
if not is_lite:
q_a_proj_scale_2 = weights[
f"{'.'.join(names[:-1])}.q_a_proj.weight_scale_2"]
if kv_a_proj_with_mqa_scale_2 < q_a_proj_scale_2:
shared_weight_scale_2 = q_a_proj_scale_2
need_requant_kv_a_proj_with_mqa = True
elif q_a_proj_scale_2 < kv_a_proj_with_mqa_scale_2:
need_requant_q_a_proj = True

########### alpha
alpha = shared_input_scale.float(
) * shared_weight_scale_2.float()
module.alpha.data.copy_(alpha)
module.scalar_alpha = alpha.item()

########### weights
kv_a_proj_with_mqa = weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight"][:]

if not is_lite:
q_a_proj = weights[
f"{'.'.join(names[:-1])}.q_a_proj.weight"][:]

########### weight_scale
kv_a_proj_with_mqa_scale = weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight_scale"][:]
kv_a_proj_with_mqa_scale = torch.ops.trtllm.block_scale_interleave(
kv_a_proj_with_mqa_scale.view(
fp4_utils.float4_sf_dtype))
if not is_lite:
q_a_proj_scale = weights[
f"{'.'.join(names[:-1])}.q_a_proj.weight_scale_inv"][:]
fused_a_scale = torch.cat(
[q_a_proj_scale, fused_a_scale], dim=0)
f"{'.'.join(names[:-1])}.q_a_proj.weight_scale"][:]
q_a_proj_scale = torch.ops.trtllm.block_scale_interleave(
q_a_proj_scale.view(fp4_utils.float4_sf_dtype))

########### requantize
if need_requant_kv_a_proj_with_mqa:
# requant kv_a_proj_with_mqa
kv_a_proj_with_mqa, kv_a_proj_with_mqa_scale = requantize_weight_with_new_scale(
kv_a_proj_with_mqa,
kv_a_proj_with_mqa_scale,
kv_a_proj_with_mqa_scale_2,
shared_weight_scale_2,
device=module.weight.device,
)
if need_requant_q_a_proj:
# requant q_a_proj
q_a_proj, q_a_proj_scale = requantize_weight_with_new_scale(
q_a_proj,
q_a_proj_scale,
q_a_proj_scale_2,
shared_weight_scale_2,
device=module.weight.device)

########### fuse and load weights
if not is_lite:
fused_a = torch.cat([q_a_proj, kv_a_proj_with_mqa],
dim=0)
else:
fused_a = kv_a_proj_with_mqa

# For DeepseekV32: kv_a_proj_with_mqa is oversized
# to include indexer k weights, which is filled in post_load_weights.
module.weight.data[0:fused_a.shape[0]].copy_(fused_a)

########### fuse weight_scale
if not is_lite:
fused_a_scale = torch.cat(
[q_a_proj_scale, kv_a_proj_with_mqa_scale],
dim=0)
else:
fused_a_scale = kv_a_proj_with_mqa_scale
# For DeepseekV32: kv_a_proj_with_mqa is oversized
# to include indexer k weights, which is filled in post_load_weights.
module.weight_scale.data[0:fused_a_scale.
shape[0]].copy_(fused_a_scale)
# For DeepseekV32: kv_a_proj_with_mqa is oversized
# to include indexer k weights, which is filled in post_load_weights.
module.weight.data[0:fused_a.shape[0]].copy_(fused_a)
else:
fused_a = weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight"][:]
if not is_lite:
q_a_proj = weights[
f"{'.'.join(names[:-1])}.q_a_proj.weight"][:]
fused_a = torch.cat([q_a_proj, fused_a], dim=0)

if f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight_scale_inv" in weights:
fused_a_scale = weights[
f"{'.'.join(names[:-1])}.kv_a_proj_with_mqa.weight_scale_inv"]
if not is_lite:
q_a_proj_scale = weights[
f"{'.'.join(names[:-1])}.q_a_proj.weight_scale_inv"][:]
fused_a_scale = torch.cat(
[q_a_proj_scale, fused_a_scale], dim=0)

module.weight_scale.data[
0:fused_a_scale.shape[0]].copy_(fused_a_scale)
# For DeepseekV32: kv_a_proj_with_mqa is oversized
# to include indexer k weights, which is filled in post_load_weights.
module.weight.data[0:fused_a.shape[0]].copy_(fused_a)
elif names[-1] in params_map:
module_weights = []
for new_name in params_map[names[-1]]:
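The fused kv_a_proj_with_mqa path above relies on NVFP4's two-level scaling: the checkpoint stores packed e2m1 values, per-16-element FP8 block scales, and a global weight_scale_2 = amax / (448 * 6). Because q_a_proj and kv_a_proj_with_mqa are concatenated into a single GEMM, both halves must share one weight_scale_2; the larger of the two is kept so the half with the bigger amax does not clip, and the other half is round-tripped through requantize_weight_with_new_scale. The sketch below illustrates that scaling algebra in plain PyTorch under those assumptions; it emulates the block scaling in floating point, omits real e2m1 rounding and the TRT-LLM custom ops (e2m1_and_ufp8sf_scale_to_float_v2, fp4_quantize, block_scale_interleave), and all names in it are illustrative rather than part of the TRT-LLM API.

import torch

E2M1_MAX = 6.0    # largest FP4 (e2m1) magnitude
FP8_MAX = 448.0   # largest FP8 (e4m3) magnitude
BLOCK = 16        # scaling vector size used by the checkpoint


def fake_nvfp4_quantize(w: torch.Tensor, scale_2: torch.Tensor):
    """Emulate w ~= q * block_scale * scale_2 for a [rows, cols] tensor (no e2m1 rounding)."""
    blocks = w.reshape(w.shape[0], -1, BLOCK)
    # Per-block scale chosen so the block amax maps to E2M1_MAX, expressed relative to scale_2.
    block_scale = blocks.abs().amax(dim=-1, keepdim=True) / (E2M1_MAX * scale_2)
    q = blocks / (block_scale * scale_2).clamp_min(1e-12)
    return q.clamp(-E2M1_MAX, E2M1_MAX), block_scale


def fake_nvfp4_dequantize(q, block_scale, scale_2):
    return (q * block_scale * scale_2).reshape(q.shape[0], -1)


torch.manual_seed(0)
q_a = torch.randn(8, 64)           # stands in for q_a_proj.weight
kv_a = 0.1 * torch.randn(8, 64)    # stands in for kv_a_proj_with_mqa.weight (smaller amax)

# modelopt-style global scales: weight_scale_2 = amax / (448 * 6)
s2_q = q_a.abs().max() / (FP8_MAX * E2M1_MAX)
s2_kv = kv_a.abs().max() / (FP8_MAX * E2M1_MAX)

# The fused GEMM carries a single weight_scale_2; keeping the larger one avoids clipping
# the half with the bigger amax, so the other half is dequantized and requantized against it.
shared_s2 = torch.maximum(s2_q, s2_kv)
q_kv, bs_kv = fake_nvfp4_quantize(kv_a, s2_kv)                    # original checkpoint quantization
kv_dequant = fake_nvfp4_dequantize(q_kv, bs_kv, s2_kv)            # step 1 of the helper: dequantize
q_kv_new, bs_kv_new = fake_nvfp4_quantize(kv_dequant, shared_s2)  # step 2: requantize on the shared scale

err = (fake_nvfp4_dequantize(q_kv_new, bs_kv_new, shared_s2) - kv_a).abs().max()
print(f"round-trip error onto the shared scale (rounding omitted): {err.item():.2e}")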
@@ -3,7 +3,7 @@
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig, SamplingParams

from ..conftest import llm_models_root, skip_pre_blackwell
from ..conftest import llm_models_root, skip_post_blackwell, skip_pre_blackwell, skip_pre_hopper
from .accuracy_core import MMMU, LlmapiAccuracyTestHarness


@@ -216,6 +216,8 @@ def test_auto_dtype(self):
task.evaluate(llm, sampling_params=self.sampling_params)


@skip_pre_hopper
@skip_post_blackwell
class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "google/gemma-3-27b-it"
MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-27b-it/"