_replace_linears_for_quant_paths runs, projection paths point at StreamingQLinear modules.

pythongiant · pythongiant · commit 52b9334a0170 · 2026-05-16T20:09:27.000+05:30
diff --git a/src/kvboost/streaming/awq_loader.py b/src/kvboost/streaming/awq_loader.py
@@ -374,6 +374,7 @@ def materialize_into_module(
         hf_model: "torch.nn.Module",
         *,
         only_resident: bool = True,
+        skip_quant_projections: bool = True,
     ) -> None:
         """Write resident tensors into the matching submodules of ``hf_model``.
 
@@ -385,12 +386,31 @@ def materialize_into_module(
         This is the bridge that lets us use ``accelerate.init_empty_weights``
         for the skeleton and then selectively materialize only the layers that
         should live in VRAM.
+
+        ``skip_quant_projections`` (default True) skips ``*.qweight``,
+        ``*.scales``, ``*.qzeros`` tensors. When the streaming pipeline
+        replaces projection modules with :class:`StreamingQLinear` (which
+        has no ``qweight`` parameter slot), naively assigning those tensors
+        via ``setattr`` creates **orphaned duplicate** allocations — they
+        sit on the new module as bare attributes while the real binding
+        happens later via :meth:`bind_streaming_qlinears` into the
+        ``_qweight`` / ``_scales`` / ``_qzeros`` slots. Skipping them here
+        avoids that double-allocation (~2 GiB for a 32B model with 8
+        resident layers).
         """
         assert self.index is not None
 
+        def _is_quant_proj(name: str) -> bool:
+            # Packed AWQ projection tensors are recognised purely by the
+            # suffix of their name; ``str.endswith`` accepts a tuple, so a
+            # single call covers all three.
+            packed_suffixes = (".qweight", ".scales", ".qzeros")
+            return name.endswith(packed_suffixes)
+
         wanted = [
             spec for spec in self.index.tensors.values()
             if (spec.is_resident if only_resident else True)
+            and not (skip_quant_projections and _is_quant_proj(spec.name))
         ]
 
         by_shard: dict[Path, list[TensorSpec]] = {}
diff --git a/tests/streaming/test_mps_path.py b/tests/streaming/test_mps_path.py
@@ -224,6 +224,61 @@ def test_bound_streaming_qlinear_runs_forward(tmp_path):
     assert out.shape == (2, 16)
 
 
+def test_materialize_skips_quant_projections_by_default(tmp_path):
+    """Regression: default materialize leaves packed AWQ tensors alone.
+
+    ``*.qweight`` / ``*.scales`` / ``*.qzeros`` are bound separately by
+    ``bind_streaming_qlinears`` into the StreamingQLinear ``_qweight`` etc.
+    slots. If ``materialize_into_module`` also assigned them via ``setattr``
+    there would be a second on-device copy that nothing reads (~2 GiB of
+    wasted VRAM on a 32B model).
+    """
+    loader = _make_loader(tmp_path, num_layers=2)
+    model = _FakeHfModel(num_layers=2)
+
+    # Mirror the streaming path: projections are swapped for StreamingQLinear
+    # before materialize is called.
+    swapped = _replace_linears_for_quant_paths(
+        model, loader=loader, group_size=8, prefer="torch",
+    )
+
+    # No ``skip_quant_projections`` argument, so the default (True) applies.
+    loader.materialize_into_module(model, only_resident=False)
+
+    bare_names = ("qweight", "scales", "qzeros")
+    for layer_idx, per_layer in swapped.items():
+        for sub_path, qlin in per_layer.items():
+            # The bound slot is only filled by ``bind_streaming_qlinears``.
+            assert qlin._qweight is None, (
+                f"layer {layer_idx} {sub_path}: _qweight set without bind"
+            )
+            # A bare attribute is tolerated only when absent or None.
+            for attr in bare_names:
+                leaked = getattr(qlin, attr, None)
+                assert leaked is None, (
+                    f"layer {layer_idx} {sub_path}: bare .{attr} attr leaked"
+                )
+
+
+def test_materialize_with_skip_disabled_does_assign_projections(tmp_path):
+    """Opting out of the skip writes projection tensors via ``setattr``.
+
+    Intended for targets that really own ``qweight`` buffers, e.g.
+    autoawq's WQLinear_GEMM.
+    """
+    loader = _make_loader(tmp_path, num_layers=1)
+    # Unreplaced skeleton: q_proj stays a plain nn.Linear with no qweight slot.
+    model = _FakeHfModel(num_layers=1)
+
+    loader.materialize_into_module(
+        model, only_resident=False, skip_quant_projections=False,
+    )
+
+    # With the skip off, qweight is expected to land as a bare attribute.
+    q_proj = model.model.layers[0].self_attn.q_proj
+    assert hasattr(q_proj, "qweight")
+
+
 def test_iter_decoder_layers_on_fake_skeleton():
     model = _FakeHfModel(num_layers=4)
     pairs = _iter_decoder_layers(model)