diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py
index 72fc993f18..458770ab2b 100644
--- a/test/dtypes/test_affine_quantized.py
+++ b/test/dtypes/test_affine_quantized.py
@@ -169,7 +169,7 @@ def apply_uint6_weight_only_quant(linear):
 
         deregister_aqt_quantized_linear_dispatch(dispatch_condition)
 
-    @skip_if_rocm("ROCm enablement in progress")
+    @skip_if_rocm("hipSPARSELt reports available but fails at runtime on this machine")
     @unittest.skipIf(len(GPU_DEVICES) == 0, "Need GPU available")
     def test_print_quantized_module(self):
         for device in self.GPU_DEVICES:
@@ -254,7 +254,6 @@ class TestAffineQuantizedBasic(TestCase):
 
     @common_utils.parametrize("device", COMMON_DEVICES)
     @common_utils.parametrize("dtype", COMMON_DTYPES)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_flatten_unflatten(self, device, dtype):
         if device == "cuda" and dtype == torch.bfloat16 and is_fbcode():
             raise unittest.SkipTest("TODO: Failing for cuda + bfloat16 in fbcode")
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index f03fec1780..b838dd21ba 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -281,7 +281,6 @@ def test_per_token_linear_cpu(self):
             self._test_per_token_linear_impl("cpu", dtype)
 
     @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_per_token_linear_cuda(self):
         device = get_current_accelerator_device()
         for dtype in (torch.float32, torch.float16, torch.bfloat16):
@@ -643,7 +642,9 @@ def test_gemlite_layout(self, device, dtype):
         )
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
+    @skip_if_rocm(
+        "_weight_int4pack_mm qScaleAndZeros shape mismatch on small N (16, 8)"
+    )
     @skip_if_xpu("XPU enablement in progress")
     def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
         if dtype != torch.bfloat16:
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index 6eb05929da..2d3e63f878 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -57,9 +57,10 @@
     XNNPACKQuantizer,
     get_symmetric_quantization_config,
 )
-from torchao.testing.utils import skip_if_rocm, skip_if_xpu
+from torchao.testing.utils import skip_if_xpu
 from torchao.utils import (
     get_current_accelerator_device,
+    is_ROCM,
     is_sm_at_least_89,
     is_sm_at_least_90,
     unwrap_tensor_subclass,
@@ -351,22 +352,26 @@ def reset_memory():
         ],
     )
     @skip_if_xpu("XPU enablement in progress")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_workflow_e2e_numerics(self, config):
         """
         Simple test of e2e Int4WeightOnlyConfig workflow, comparing numerics
         to a bfloat16 baseline.
         """
-        if (
-            isinstance(
-                config,
-                Float8DynamicActivationFloat8WeightConfig,
-            )
+        if isinstance(config, GemliteUIntXWeightOnlyConfig) and not has_gemlite:
+            raise unittest.SkipTest("gemlite not available")
+        if is_ROCM():
+            if isinstance(config, Float8DynamicActivationFloat8WeightConfig):
+                # Default PerTensor granularity on 128x128 linear triggers a
+                # false positive in _is_128_128_scaled (block_size matches shape).
+                # PerRow works; this is an upstream issue not specific to ROCm.
+                raise unittest.SkipTest(
+                    "Float8DynActFloat8Weight default PerTensor hits _is_128_128_scaled collision at 128x128"
+                )
+        elif (
+            isinstance(config, Float8DynamicActivationFloat8WeightConfig)
             and not is_sm_at_least_89()
         ):
-            return unittest.skip("requires CUDA capability 8.9 or greater")
-        elif isinstance(config, GemliteUIntXWeightOnlyConfig) and not has_gemlite:
-            return unittest.skip("gemlite not available")
+            raise unittest.SkipTest("requires CUDA capability 8.9 or greater")
 
         dtype = torch.bfloat16
         if isinstance(config, GemliteUIntXWeightOnlyConfig):