Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 26 additions & 52 deletions security_scanning/examples/models/contrib/hyperclovax/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ requires-python = ">=3.10,<3.13"
dependencies = [
"decord (>=0.6.0,<0.7.0)",
"timm (>=1.0.25,<2.0.0)",
"av (>=16.1.0,<17.0.0)"
"av (>=17.0.0,<18.0.0)"
]


Expand Down
6 changes: 3 additions & 3 deletions security_scanning/examples/serve/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions security_scanning/metadata.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"commit_hash": "9a9dc3c678c3c42e4e9dbe15e6d4843cbf7bba1d",
"timestamp": "2026-03-14T02:47:34Z"
"commit_hash": "267396cba9b1699a9a162852f69a193e6d7bc153",
"timestamp": "2026-03-15T02:47:36Z"
}
12 changes: 8 additions & 4 deletions tensorrt_llm/_torch/modules/fused_moe/quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,6 +908,7 @@ def _maybe_padding_weights(tensor: torch.Tensor, row_alignment: int,

class DeepSeekFP8BlockScalesFusedMoEMethod(FusedMoEMethodBase):
eplb_support_status = EplbSupportStatus.NOT_VERIFIED
FP8_QUANT_BLOCK_SIZE = 128

def create_weights(self, module: torch.nn.Module):
weight_dtype = torch.float8_e4m3fn
Expand All @@ -926,16 +927,18 @@ def create_weights(self, module: torch.nn.Module):
cell_div = lambda x, y: (x + y - 1) // y
w3_w1_weight_scaling_factor = nn.Parameter(torch.empty(
(module.expert_size_per_partition,
cell_div(module.intermediate_size_per_partition, 128) * 2,
cell_div(w3_w1_weight_shape[2], 128)),
cell_div(module.intermediate_size_per_partition,
self.FP8_QUANT_BLOCK_SIZE) * 2,
cell_div(w3_w1_weight_shape[2], self.FP8_QUANT_BLOCK_SIZE)),
dtype=torch.float32),
requires_grad=False)
module.register_parameter("w3_w1_weight_scaling_factor",
w3_w1_weight_scaling_factor)

w2_weight_scaling_factor = nn.Parameter(torch.empty(
(module.expert_size_per_partition, cell_div(
w2_weight_shape[1], 128), cell_div(w2_weight_shape[2], 128)),
(module.expert_size_per_partition,
cell_div(w2_weight_shape[1], self.FP8_QUANT_BLOCK_SIZE),
cell_div(w2_weight_shape[2], self.FP8_QUANT_BLOCK_SIZE)),
dtype=torch.float32),
requires_grad=False)
module.register_parameter("w2_weight_scaling_factor",
Expand Down Expand Up @@ -986,6 +989,7 @@ def load_expert_all_weight_scale_fp8_block_scale(
f"{expert_id}.w2.weight_scale_inv"] if f"{expert_id}.w2.weight_scale_inv" in weights else None
dst_w3_weight_scale, dst_w1_weight_scale = dst_w3_w1_weight_scale[
local_slot_id].chunk(2, dim=0)
assert module.intermediate_size_per_partition % self.FP8_QUANT_BLOCK_SIZE == 0, "For DeepSeekFP8BlockScalesFusedMoEMethod, intermediate_size_per_partition should be divisible by FP8_QUANT_BLOCK_SIZE."
if w1_scale is not None:
w1_scale_shard = load_weight_shard(
w1_scale,
Expand Down
2 changes: 1 addition & 1 deletion tests/unittest/_torch/modules/moe/quantize_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,7 @@ def check_accuracy(self, output, ref_output):
# Relaxed percent from 0.98 to 0.97 to account for NVFP4 quantization
# error accumulation with certain routing methods (e.g. Llama4Renormalize).
# Max observed mismatch in non-skipped cases is ~2.7% < 3%.
check_accuracy(output, ref_output, rtol=1e-2, atol=0.15, percent=0.97)
check_accuracy(output, ref_output, rtol=0.1, atol=0.15, percent=0.97)


class NVFP4QuantizeUtil(BaseQuantizeUtil):
Expand Down
Loading