xinhe-nv · pull · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026
diff --git a/security_scanning/examples/models/contrib/hyperclovax/poetry.lock b/security_scanning/examples/models/contrib/hyperclovax/poetry.lock
diff --git a/security_scanning/examples/models/contrib/hyperclovax/pyproject.toml b/security_scanning/examples/models/contrib/hyperclovax/pyproject.toml
@@ -9,7 +9,7 @@ requires-python = ">=3.10,<3.13"
 dependencies = [
     "decord (>=0.6.0,<0.7.0)",
     "timm (>=1.0.25,<2.0.0)",
-    "av (>=16.1.0,<17.0.0)"
+    "av (>=17.0.0,<18.0.0)"
 ]
 
 

diff --git a/security_scanning/examples/serve/poetry.lock b/security_scanning/examples/serve/poetry.lock
diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json
@@ -1,4 +1,4 @@
 {
-  "commit_hash": "9a9dc3c678c3c42e4e9dbe15e6d4843cbf7bba1d",
-  "timestamp": "2026-03-14T02:47:34Z"
+  "commit_hash": "267396cba9b1699a9a162852f69a193e6d7bc153",
+  "timestamp": "2026-03-15T02:47:36Z"
 }
diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
@@ -908,6 +908,7 @@ def _maybe_padding_weights(tensor: torch.Tensor, row_alignment: int,
 
 class DeepSeekFP8BlockScalesFusedMoEMethod(FusedMoEMethodBase):
     eplb_support_status = EplbSupportStatus.NOT_VERIFIED
+    FP8_QUANT_BLOCK_SIZE = 128
 
     def create_weights(self, module: torch.nn.Module):
         weight_dtype = torch.float8_e4m3fn
@@ -926,16 +927,18 @@ def create_weights(self, module: torch.nn.Module):
         cell_div = lambda x, y: (x + y - 1) // y
         w3_w1_weight_scaling_factor = nn.Parameter(torch.empty(
             (module.expert_size_per_partition,
-             cell_div(module.intermediate_size_per_partition, 128) * 2,
-             cell_div(w3_w1_weight_shape[2], 128)),
+             cell_div(module.intermediate_size_per_partition,
+                      self.FP8_QUANT_BLOCK_SIZE) * 2,
+             cell_div(w3_w1_weight_shape[2], self.FP8_QUANT_BLOCK_SIZE)),
             dtype=torch.float32),
                                                    requires_grad=False)
         module.register_parameter("w3_w1_weight_scaling_factor",
                                   w3_w1_weight_scaling_factor)
 
         w2_weight_scaling_factor = nn.Parameter(torch.empty(
-            (module.expert_size_per_partition, cell_div(
-                w2_weight_shape[1], 128), cell_div(w2_weight_shape[2], 128)),
+            (module.expert_size_per_partition,
+             cell_div(w2_weight_shape[1], self.FP8_QUANT_BLOCK_SIZE),
+             cell_div(w2_weight_shape[2], self.FP8_QUANT_BLOCK_SIZE)),
             dtype=torch.float32),
                                                 requires_grad=False)
         module.register_parameter("w2_weight_scaling_factor",
@@ -986,6 +989,7 @@ def load_expert_all_weight_scale_fp8_block_scale(
                     f"{expert_id}.w2.weight_scale_inv"] if f"{expert_id}.w2.weight_scale_inv" in weights else None
                 dst_w3_weight_scale, dst_w1_weight_scale = dst_w3_w1_weight_scale[
                     local_slot_id].chunk(2, dim=0)
+                assert module.intermediate_size_per_partition % self.FP8_QUANT_BLOCK_SIZE == 0, "For DeepSeekFP8BlockScalesFusedMoEMethod, intermediate_size_per_partition should be divisible by FP8_QUANT_BLOCK_SIZE."
                 if w1_scale is not None:
                     w1_scale_shard = load_weight_shard(
                         w1_scale,

diff --git a/tests/unittest/_torch/modules/moe/quantize_utils.py b/tests/unittest/_torch/modules/moe/quantize_utils.py
@@ -593,7 +593,7 @@ def check_accuracy(self, output, ref_output):
             # Relaxed percent from 0.98 to 0.97 to account for NVFP4 quantization
             # error accumulation with certain routing methods (e.g. Llama4Renormalize).
             # Max observed mismatch in non-skipped cases is ~2.7% < 3%.
-            check_accuracy(output, ref_output, rtol=1e-2, atol=0.15, percent=0.97)
+            check_accuracy(output, ref_output, rtol=0.1, atol=0.15, percent=0.97)
 
 
 class NVFP4QuantizeUtil(BaseQuantizeUtil):