diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py index 72fc993f18..458770ab2b 100644 --- a/test/dtypes/test_affine_quantized.py +++ b/test/dtypes/test_affine_quantized.py @@ -169,7 +169,7 @@ def apply_uint6_weight_only_quant(linear): deregister_aqt_quantized_linear_dispatch(dispatch_condition) - @skip_if_rocm("ROCm enablement in progress") + @skip_if_rocm("hipSPARSELt reports available but fails at runtime on this machine") @unittest.skipIf(len(GPU_DEVICES) == 0, "Need GPU available") def test_print_quantized_module(self): for device in self.GPU_DEVICES: @@ -254,7 +254,6 @@ class TestAffineQuantizedBasic(TestCase): @common_utils.parametrize("device", COMMON_DEVICES) @common_utils.parametrize("dtype", COMMON_DTYPES) - @skip_if_rocm("ROCm enablement in progress") def test_flatten_unflatten(self, device, dtype): if device == "cuda" and dtype == torch.bfloat16 and is_fbcode(): raise unittest.SkipTest("TODO: Failing for cuda + bfloat16 in fbcode") diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py index f03fec1780..b838dd21ba 100644 --- a/test/integration/test_integration.py +++ b/test/integration/test_integration.py @@ -281,7 +281,6 @@ def test_per_token_linear_cpu(self): self._test_per_token_linear_impl("cpu", dtype) @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") - @skip_if_rocm("ROCm enablement in progress") def test_per_token_linear_cuda(self): device = get_current_accelerator_device() for dtype in (torch.float32, torch.float16, torch.bfloat16): @@ -643,7 +642,9 @@ def test_gemlite_layout(self, device, dtype): ) @parameterized.expand(COMMON_DEVICE_DTYPE) - @skip_if_rocm("ROCm enablement in progress") + @skip_if_rocm( + "_weight_int4pack_mm qScaleAndZeros shape mismatch on small N (16, 8)" + ) @skip_if_xpu("XPU enablement in progress") def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype): if dtype != torch.bfloat16: diff --git 
a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index 6eb05929da..2d3e63f878 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -57,9 +57,10 @@ XNNPACKQuantizer, get_symmetric_quantization_config, ) -from torchao.testing.utils import skip_if_rocm, skip_if_xpu +from torchao.testing.utils import skip_if_xpu from torchao.utils import ( get_current_accelerator_device, + is_ROCM, is_sm_at_least_89, is_sm_at_least_90, unwrap_tensor_subclass, @@ -351,22 +352,28 @@ def reset_memory(): ], ) @skip_if_xpu("XPU enablement in progress") - @skip_if_rocm("ROCm enablement in progress") def test_workflow_e2e_numerics(self, config): """ Simple test of e2e Int4WeightOnlyConfig workflow, comparing numerics to a bfloat16 baseline. """ - if ( - isinstance( - config, - Float8DynamicActivationFloat8WeightConfig, - ) + # Raise unittest.SkipTest to skip: returning unittest.skip(...) only builds a + # decorator, so the case is not actually skipped. + if isinstance(config, GemliteUIntXWeightOnlyConfig) and not has_gemlite: + raise unittest.SkipTest("gemlite not available") + if is_ROCM(): + if isinstance(config, Float8DynamicActivationFloat8WeightConfig): + # Default PerTensor granularity on 128x128 linear triggers a + # false positive in _is_128_128_scaled (block_size matches shape). + # PerRow works; this is an upstream issue not specific to ROCm. + raise unittest.SkipTest( + "Float8DynActFloat8Weight default PerTensor hits _is_128_128_scaled collision at 128x128" + ) + elif ( + isinstance(config, Float8DynamicActivationFloat8WeightConfig) and not is_sm_at_least_89() ): - return unittest.skip("requires CUDA capability 8.9 or greater") - elif isinstance(config, GemliteUIntXWeightOnlyConfig) and not has_gemlite: - return unittest.skip("gemlite not available") + raise unittest.SkipTest("requires CUDA capability 8.9 or greater") dtype = torch.bfloat16 if isinstance(config, GemliteUIntXWeightOnlyConfig):