Skip to content

Commit 4041de8

Browse files
kylesayrs and JiantaoXu
authored and committed
[QeRL] Compose online quantization with quantized reloading (vllm-project#38032)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent b4981f8 commit 4041de8

File tree

10 files changed

+184
-260
lines changed

10 files changed

+184
-260
lines changed

tests/model_executor/model_loader/test_reload.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -148,3 +148,60 @@ def test_reload_weights(base_model, mul_model, add_model, tp_size, vllm_runner):
148148
mul_perp = llm.generate_prompt_perplexity(["3 4 = 12"], mask=["3 4 ="])[0]
149149
add_perp = llm.generate_prompt_perplexity(["3 4 = 7"], mask=["3 4 ="])[0]
150150
assert add_perp < mul_perp
151+
152+
153+
@pytest.mark.parametrize("tp_size", [2])
154+
@pytest.mark.parametrize(
155+
"base_model,mul_model,add_model,quantization",
156+
[
157+
(
158+
"Qwen/Qwen3-0.6B",
159+
"inference-optimization/Qwen3-0.6B-debug-multiply",
160+
"inference-optimization/Qwen3-0.6B-debug-add",
161+
"fp8",
162+
),
163+
(
164+
"inference-optimization/DeepSeek-V3-debug-empty",
165+
"inference-optimization/DeepSeek-V3-debug-multiply",
166+
"inference-optimization/DeepSeek-V3-debug-add",
167+
"fp8",
168+
),
169+
(
170+
"Qwen/Qwen3-0.6B",
171+
"inference-optimization/Qwen3-0.6B-debug-multiply",
172+
"inference-optimization/Qwen3-0.6B-debug-add",
173+
"mxfp8",
174+
),
175+
# ( TODO: support mxfp4 & mla
176+
# "inference-optimization/DeepSeek-V3-debug-empty",
177+
# "inference-optimization/DeepSeek-V3-debug-multiply",
178+
# "inference-optimization/DeepSeek-V3-debug-add",
179+
# "mxfp8",
180+
# ),
181+
],
182+
)
183+
def test_online_quantize_reload(
184+
base_model, mul_model, add_model, quantization, tp_size, vllm_runner
185+
):
186+
if cuda_device_count_stateless() < tp_size:
187+
pytest.skip(reason="Not enough CUDA devices")
188+
189+
if quantization == "fp8" and not current_platform.supports_fp8():
190+
pytest.skip(reason="Requires FP8 support")
191+
192+
with vllm_runner(
193+
model_name=base_model,
194+
quantization=quantization,
195+
tensor_parallel_size=tp_size,
196+
enable_expert_parallel=(tp_size > 1 and "DeepSeek" in base_model),
197+
enable_prefix_caching=False,
198+
) as llm:
199+
llm.collective_rpc("reload_weights", kwargs={"weights_path": mul_model})
200+
mul_perp = llm.generate_prompt_perplexity(["3 4 = 12"], mask=["3 4 ="])[0]
201+
add_perp = llm.generate_prompt_perplexity(["3 4 = 7"], mask=["3 4 ="])[0]
202+
assert mul_perp < add_perp
203+
204+
llm.collective_rpc("reload_weights", kwargs={"weights_path": add_model})
205+
mul_perp = llm.generate_prompt_perplexity(["3 4 = 12"], mask=["3 4 ="])[0]
206+
add_perp = llm.generate_prompt_perplexity(["3 4 = 7"], mask=["3 4 ="])[0]
207+
assert add_perp < mul_perp

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 27 additions & 202 deletions
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,9 @@
7373
cutlass_fp8_supported,
7474
normalize_e4m3fn_to_e4m3fnuz,
7575
)
76-
from vllm.model_executor.model_loader.weight_utils import initialize_single_dummy_weight
76+
from vllm.model_executor.model_loader.reload.layerwise import (
77+
initialize_online_processing,
78+
)
7779
from vllm.model_executor.parameter import (
7880
BlockQuantScaleParameter,
7981
ModelWeightParameter,
@@ -496,8 +498,8 @@ def apply(
496498

497499

498500
class Fp8OnlineLinearMethod(Fp8LinearMethod):
499-
"""Online version of Fp8LinearMethod, loads the fp16/bf16 checkpoint
500-
and quantized the weights during loading."""
501+
"""Online version of Fp8LinearMethod which loads a full precision checkpoint
502+
and quantizes weights during loading."""
501503

502504
uses_meta_device: bool = True
503505

@@ -519,84 +521,25 @@ def create_weights(
519521
layer.orig_dtype = params_dtype
520522
layer.weight_block_size = None
521523

522-
# WEIGHT
523-
def patched_weight_loader(param, loaded_weight, *args, **kwargs):
524-
# track how many elements we have updated
525-
if not hasattr(layer, "_loaded_numel"):
526-
layer._loaded_numel = 0
527-
528-
# when the first `loaded_weight` is about to be
529-
# loaded to `param`, materialize `param` just-in-time
530-
weight = ModelWeightParameter(
531-
data=torch.empty_like(layer.weight, device=layer._load_device),
532-
input_dim=1,
533-
output_dim=0,
534-
weight_loader=patched_weight_loader,
535-
)
536-
_copy_missing_attrs(layer.weight, weight)
537-
layer.register_parameter("weight", weight)
538-
del layer._load_device
539-
540-
# refresh the reference to `param` to reflect just-in-time
541-
# materialization
542-
param = layer.weight
543-
544-
# load the current weight chunk
545-
copy_numel_counter = CopyNumelCounter()
546-
with copy_numel_counter:
547-
res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc]
548-
layer._loaded_numel += copy_numel_counter.copied_numel
549-
550-
# if we have loaded all of the elements, call
551-
# process_weights_after_loading
552-
target_loaded_numel = layer.weight.numel()
553-
if layer._loaded_numel == target_loaded_numel:
554-
self.process_weights_after_loading(layer)
555-
556-
# Prevent the usual `process_weights_after_loading` call from doing
557-
# anything
558-
layer._already_called_process_weights_after_loading = True
559-
560-
# Note that we keep `layer._loaded_numel` around just in case
561-
# there is logic added to vllm in the future which calls a
562-
# weight loader twice - we do not want to re-initialize in
563-
# that case.
564-
565-
return res
566-
567524
weight = ModelWeightParameter(
568525
data=torch.empty(
569526
output_size_per_partition,
570527
input_size_per_partition,
571-
# materialized just-in-time in `patched_weight_loader`
572-
device="meta",
528+
device="meta", # materialized and processed during loading
573529
dtype=params_dtype,
574530
),
575531
input_dim=1,
576532
output_dim=0,
577-
weight_loader=patched_weight_loader,
533+
weight_loader=weight_loader,
578534
)
579-
# stash the correct device for `patched_weight_loader`
580-
layer._load_device = torch.get_default_device()
581535
layer.register_parameter("weight", weight)
582536

537+
initialize_online_processing(layer)
538+
583539
def process_weights_after_loading(self, layer: Module) -> None:
584540
if getattr(layer, "_already_called_process_weights_after_loading", False):
585541
return
586542

587-
# deferred initialization of randomly initialized weights for the
588-
# `--load_format dummy` feature
589-
if layer.weight.device == torch.device("meta"):
590-
weight = ModelWeightParameter(
591-
data=torch.empty_like(layer.weight, device=layer._load_device),
592-
input_dim=1,
593-
output_dim=0,
594-
weight_loader=layer.weight.weight_loader,
595-
)
596-
_copy_missing_attrs(layer.weight, weight)
597-
layer.register_parameter("weight", weight)
598-
initialize_single_dummy_weight(layer.weight)
599-
600543
# TODO(future): support block_quant in online quant path
601544
assert not self.block_quant
602545

@@ -845,9 +788,6 @@ def _setup_kernel(
845788
)
846789

847790
def process_weights_after_loading(self, layer: Module) -> None:
848-
if getattr(layer, "_already_called_process_weights_after_loading", False):
849-
return
850-
851791
# Allow for accessing weights and scales in standard way.
852792
w13 = layer.w13_weight
853793
w2 = layer.w2_weight
@@ -892,9 +832,6 @@ def process_weights_after_loading(self, layer: Module) -> None:
892832
layer, w13, w2, w13_scale, w2_scale, w13_input_scale, w2_input_scale
893833
)
894834

895-
# Prevent duplicate processing (e.g., during weight reload)
896-
layer._already_called_process_weights_after_loading = True
897-
898835
def maybe_make_prepare_finalize(
899836
self,
900837
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
@@ -1013,86 +950,12 @@ def create_weights(
1013950
layer.orig_dtype = params_dtype
1014951
layer.weight_block_size = None
1015952

1016-
# We are doing online quantization, patch the weight loaded
1017-
# to call `process_weights_after_loading` in a streaming fashion
1018-
# as soon as the last weight chunk is loaded.
1019-
weight_loader = extra_weight_attrs["weight_loader"]
1020-
# create a new holder to prevent modifying behavior of any other
1021-
# objects which might depend on the old one
1022-
new_extra_weight_attrs = extra_weight_attrs
1023-
1024-
def patched_weight_loader(param, loaded_weight, *args, **kwargs):
1025-
# add a counter to track how many elements we have updated
1026-
if not hasattr(layer, "_loaded_numel"):
1027-
layer._loaded_numel = 0
1028-
1029-
# save the ids of original w13 and w2 so that we can
1030-
# distinguish which one `param` should map to further
1031-
# down in this file
1032-
layer._w13_weight_orig_id = id(layer.w13_weight)
1033-
layer._w2_weight_orig_id = id(layer.w2_weight)
1034-
1035-
# when the first `loaded_weight` is about to be
1036-
# loaded to `param`, materialize `param` just-in-time
1037-
1038-
w13_weight = torch.nn.Parameter(
1039-
torch.empty_like(layer.w13_weight, device=layer._load_device),
1040-
requires_grad=False,
1041-
)
1042-
set_weight_attrs(w13_weight, extra_weight_attrs)
1043-
_copy_missing_attrs(layer.w13_weight, w13_weight)
1044-
layer.register_parameter("w13_weight", w13_weight)
1045-
1046-
w2_weight = torch.nn.Parameter(
1047-
torch.empty_like(layer.w2_weight, device=layer._load_device),
1048-
requires_grad=False,
1049-
)
1050-
set_weight_attrs(w2_weight, extra_weight_attrs)
1051-
_copy_missing_attrs(layer.w2_weight, w2_weight)
1052-
layer.register_parameter("w2_weight", w2_weight)
1053-
del layer._load_device
1054-
1055-
# refresh the reference to `param` to reflect just-in-time
1056-
# materialization
1057-
if id(param) == layer._w13_weight_orig_id:
1058-
param = layer.w13_weight
1059-
elif id(param) == layer._w2_weight_orig_id:
1060-
param = layer.w2_weight
1061-
1062-
# load the current weight chunk
1063-
copy_numel_counter = CopyNumelCounter()
1064-
with copy_numel_counter:
1065-
res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc]
1066-
layer._loaded_numel += copy_numel_counter.copied_numel
1067-
1068-
# if we have loaded all of the elements, call
1069-
# process_weights_after_loading
1070-
target_loaded_numel = layer.w13_weight.numel() + layer.w2_weight.numel()
1071-
if layer._loaded_numel == target_loaded_numel:
1072-
self.process_weights_after_loading(layer)
1073-
1074-
# Prevent the usual `process_weights_after_loading` call
1075-
# from doing anything
1076-
layer._already_called_process_weights_after_loading = True
1077-
1078-
# Note that we keep `layer._loaded_numel`,
1079-
# `layer._w13_weight_orig_id` and `layer._w2_weight_orig_id`
1080-
# around because if EP is on, weight loaders for non-local
1081-
# experts will run but not actually copy any elements, and we
1082-
# need to not re-initialize in that case.
1083-
1084-
return res
1085-
1086-
new_extra_weight_attrs["weight_loader"] = patched_weight_loader
1087-
extra_weight_attrs = new_extra_weight_attrs
1088-
1089953
# WEIGHTS
1090954
w13_weight = torch.nn.Parameter(
1091955
torch.empty(
1092956
num_experts,
1093957
2 * intermediate_size_per_partition,
1094958
hidden_size,
1095-
# materialized just-in-time in `patched_weight_loader`
1096959
device="meta",
1097960
dtype=params_dtype,
1098961
),
@@ -1106,91 +969,53 @@ def patched_weight_loader(param, loaded_weight, *args, **kwargs):
1106969
num_experts,
1107970
hidden_size,
1108971
intermediate_size_per_partition,
1109-
# materialized just-in-time in `patched_weight_loader`
1110-
device="meta",
972+
device="meta", # materialized and processed during loading
1111973
dtype=params_dtype,
1112974
),
1113975
requires_grad=False,
1114976
)
1115977
layer.register_parameter("w2_weight", w2_weight)
1116978
set_weight_attrs(w2_weight, extra_weight_attrs)
1117-
# stash the correct device for `patched_weight_loader`
1118-
layer._load_device = torch.get_default_device()
1119979

1120980
# BIASES (for models like GPT-OSS that have biased MoE)
1121981
if self.moe.has_bias:
1122-
# Use the original weight_loader (not patched) for biases
1123-
orig_extra_weight_attrs = dict(extra_weight_attrs)
1124-
orig_extra_weight_attrs["weight_loader"] = weight_loader
1125982
w13_bias = torch.nn.Parameter(
1126983
torch.zeros(
1127984
num_experts,
1128985
2 * intermediate_size_per_partition,
986+
device="meta", # materialized and processed during loading
1129987
dtype=layer.orig_dtype,
1130988
),
1131989
requires_grad=False,
1132990
)
1133991
layer.register_parameter("w13_bias", w13_bias)
1134-
set_weight_attrs(w13_bias, orig_extra_weight_attrs)
992+
set_weight_attrs(w13_bias, extra_weight_attrs)
993+
1135994
w2_bias = torch.nn.Parameter(
1136-
torch.zeros(num_experts, hidden_size, dtype=layer.orig_dtype),
995+
torch.zeros(
996+
num_experts,
997+
hidden_size,
998+
device="meta", # materialized and processed during loading
999+
dtype=layer.orig_dtype,
1000+
),
11371001
requires_grad=False,
11381002
)
11391003
layer.register_parameter("w2_bias", w2_bias)
1140-
set_weight_attrs(w2_bias, orig_extra_weight_attrs)
1141-
1142-
# WEIGHT_SCALES
1143-
# Allocate 2 scales for w1 and w3 respectively.
1144-
# They will be combined to a single scale after weight loading.
1145-
w13_weight_scale = torch.nn.Parameter(
1146-
torch.ones(num_experts, dtype=torch.float32), requires_grad=False
1147-
)
1148-
w2_weight_scale = torch.nn.Parameter(
1149-
torch.ones(num_experts, dtype=torch.float32), requires_grad=False
1150-
)
1151-
layer.register_parameter("w13_weight_scale", w13_weight_scale)
1152-
layer.register_parameter("w2_weight_scale", w2_weight_scale)
1153-
set_weight_attrs(w13_weight_scale, extra_weight_attrs)
1154-
set_weight_attrs(w2_weight_scale, extra_weight_attrs)
1004+
set_weight_attrs(w2_bias, extra_weight_attrs)
11551005

1156-
layer.w13_input_scale = None
1157-
layer.w2_input_scale = None
1006+
initialize_online_processing(layer)
11581007

11591008
def process_weights_after_loading(self, layer: Module) -> None:
11601009
if getattr(layer, "_already_called_process_weights_after_loading", False):
11611010
return
11621011

1163-
# deferred initialization of randomly initialized weights for the
1164-
# `--load_format dummy` feature
1165-
if layer.w13_weight.device == torch.device("meta"):
1166-
w13_weight = torch.nn.Parameter(
1167-
torch.empty_like(layer.w13_weight, device=layer._load_device),
1168-
requires_grad=False,
1169-
)
1170-
set_weight_attrs(
1171-
w13_weight, {"weight_loader": layer.w13_weight.weight_loader}
1172-
)
1173-
_copy_missing_attrs(layer.w13_weight, w13_weight)
1174-
layer.register_parameter("w13_weight", w13_weight)
1175-
initialize_single_dummy_weight(layer.w13_weight)
1176-
if layer.w2_weight.device == torch.device("meta"):
1177-
w2_weight = torch.nn.Parameter(
1178-
torch.empty_like(layer.w2_weight, device=layer._load_device),
1179-
requires_grad=False,
1180-
)
1181-
set_weight_attrs(
1182-
w2_weight, {"weight_loader": layer.w2_weight.weight_loader}
1183-
)
1184-
_copy_missing_attrs(layer.w2_weight, w2_weight)
1185-
layer.register_parameter("w2_weight", w2_weight)
1186-
initialize_single_dummy_weight(layer.w2_weight)
1187-
1188-
# If checkpoint is fp16, quantize in place.
11891012
fp8_dtype = current_platform.fp8_dtype()
11901013
w13 = torch.empty_like(layer.w13_weight, dtype=fp8_dtype)
11911014
w2 = torch.empty_like(layer.w2_weight, dtype=fp8_dtype)
1192-
w13_scale = layer.w13_weight_scale
1193-
w2_scale = layer.w2_weight_scale
1015+
w13_scale = torch.ones(layer.num_experts, dtype=torch.float32)
1016+
w2_scale = torch.ones(layer.num_experts, dtype=torch.float32)
1017+
layer.w13_input_scale = None
1018+
layer.w2_input_scale = None
11941019

11951020
for expert in range(layer.local_num_experts):
11961021
w13[expert, :, :], w13_scale[expert] = ops.scaled_fp8_quant(
@@ -1207,8 +1032,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
12071032
w2,
12081033
w13_scale,
12091034
w2_scale,
1210-
layer.w13_input_scale,
1211-
layer.w2_input_scale,
1035+
w13_input_scale=layer.w13_input_scale,
1036+
w2_input_scale=layer.w2_input_scale,
12121037
)
12131038

12141039
# Prevent duplicate processing (e.g., during weight reload)

0 commit comments

Comments
 (0)