@@ -224,6 +224,61 @@ def test_bound_streaming_qlinear_runs_forward(tmp_path):
224224 assert out .shape == (2 , 16 )
225225
226226
def test_materialize_skips_quant_projections_by_default(tmp_path):
    """Regression: ``materialize_into_module`` must not write the packed
    AWQ tensors (``*.qweight``/``*.scales``/``*.qzeros``) onto submodules.

    Those tensors are bound separately via ``bind_streaming_qlinears`` into
    the StreamingQLinear's ``_qweight`` etc. slots. If materialize also
    assigns them via ``setattr``, we get a second on-device copy that
    nothing reads — pure VRAM waste (~2 GiB on a 32B model).
    """
    loader = _make_loader(tmp_path, num_layers=2)
    model = _FakeHfModel(num_layers=2)

    # Replace projections with StreamingQLinear (this is what the streaming
    # path does before calling materialize).
    replacements = _replace_linears_for_quant_paths(
        model, loader=loader, group_size=8, prefer="torch",
    )
    # Every check below lives inside loops over ``replacements``; without
    # this guard an empty mapping would let the regression test pass
    # without verifying anything.
    assert any(replacements.values()), (
        "no projections were replaced; nothing to verify"
    )

    # Materialize with the default (skip_quant_projections=True).
    loader.materialize_into_module(model, only_resident=False)

    # Verify the StreamingQLinears were NOT polluted with bare-attribute
    # qweight/scales/qzeros tensors. Their packed slots should still be
    # empty (only ``bind_streaming_qlinears`` fills those).
    for layer_idx, layer_repl in replacements.items():
        for sub_path, qlin in layer_repl.items():
            # _qweight is the bound slot; should be None pre-bind.
            assert qlin._qweight is None, (
                f"layer {layer_idx} {sub_path} : _qweight set without bind"
            )
            # And NO bare-attribute pollution either: the attribute must be
            # absent or explicitly None.
            for attr in ("qweight", "scales", "qzeros"):
                assert getattr(qlin, attr, None) is None, (
                    f"layer {layer_idx} {sub_path} : bare .{attr} attr leaked"
                )
261+
262+
def test_materialize_with_skip_disabled_does_assign_projections(tmp_path):
    """Check the ``skip_quant_projections=False`` opt-out path.

    Some callers really do want materialize to write projection tensors via
    setattr (e.g. when the target is autoawq's WQLinear_GEMM, which has real
    qweight buffers), so disabling the skip must still assign them.
    """
    loader = _make_loader(tmp_path, num_layers=1)
    # Deliberately leave the model un-replaced: q_proj stays a plain
    # nn.Linear with no qweight slot, so any ``qweight`` found afterwards
    # can only have been set as a bare attribute by materialize.
    model = _FakeHfModel(num_layers=1)

    loader.materialize_into_module(
        model,
        only_resident=False,
        skip_quant_projections=False,
    )

    # The first layer's query projection must now carry ``qweight``.
    first_layer = model.model.layers[0]
    q_proj = first_layer.self_attn.q_proj
    assert hasattr(q_proj, "qweight")
280+
281+
227282def test_iter_decoder_layers_on_fake_skeleton ():
228283 model = _FakeHfModel (num_layers = 4 )
229284 pairs = _iter_decoder_layers (model )
0 commit comments