Lightning-AI · Devesh-Maheshwari · May 26, 2026 · May 26, 2026 · May 27, 2026 · May 27, 2026
@@ -88,12 +88,39 @@ class Throughput:
         world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
         window_size: Number of batches to use for a rolling average.
         separator: Key separator to use when creating per-device and global metrics.
+        using_sparse_model: User intent for whether the model exploits structured sparsity.
+            ``True`` scales ``available_flops`` by ``sparse_cuda_acceleration_factor`` to reflect the higher
+            theoretical peak achievable with structured sparsity (e.g. NVIDIA's 2:4 sparse Tensor Cores on
+            Ampere and later). ``False`` keeps the dense peak. ``None`` (the default) also keeps the dense peak
+            and, if ``available_flops`` is set, emits a warning; pass ``True`` or ``False`` to silence it.
+        sparse_cuda_acceleration_factor: Multiplier (must be ``>= 1.0``) applied to ``available_flops`` when
+            ``using_sparse_model`` is ``True``. Defaults to ``2.0``, the 2x speedup of NVIDIA 2:4 structured sparsity.
 
     """
 
     def __init__(
-        self, available_flops: Optional[float] = None, world_size: int = 1, window_size: int = 100, separator: str = "/"
+        self,
+        available_flops: Optional[float] = None,
+        world_size: int = 1,
+        window_size: int = 100,
+        separator: str = "/",
+        using_sparse_model: Optional[bool] = None,
+        sparse_cuda_acceleration_factor: float = 2.0,
     ) -> None:
+        # For sparse models, hardware can achieve a higher theoretical peak (e.g. NVIDIA's 2:4 structured
+        # sparsity doubles tensor-core throughput). Scaling the peak here keeps MFU = measured / peak
+        # consistent with the sparse ceiling, so users get a meaningful utilization figure.
+        # If `available_flops is None` the peak is unknown, so MFU is skipped in `compute()` regardless
+        # (see the `if self.available_flops:` guard), and we leave it as None rather than erroring.
+        assert sparse_cuda_acceleration_factor >= 1.0, "sparse acceleration factor cannot reduce peak FLOPs"
+        if using_sparse_model is None and available_flops is not None:
+            # the user didn't tell us their intent; default to dense and warn so MFU isn't silently ambiguous
+            rank_zero_warn(
+                "MFU assumes dense model FLOPs (no sparsity acceleration)."
+                " Set 'using_sparse_model=True' for mfu to use sparse flops."
+            )
+        if using_sparse_model and available_flops is not None:
+            available_flops = available_flops * sparse_cuda_acceleration_factor
         self.available_flops = available_flops
         self.separator = separator
         assert world_size > 0
@@ -306,37 +333,37 @@ def measure_flops(
     # Hopper
     # source: https://nvdam.widen.net/s/nb5zzzsjdf/hpc-datasheet-sc23-h200-datasheet-3002446
     "h200 sxm1": {
-        torch.float64: 3.4e13,
-        torch.float32: 6.7e13,
-        "tfloat32": 9.9e14,
-        torch.bfloat16: 2.0e15,
-        torch.float16: 2.0e15,
-        torch.int8: 4.0e15,
+        torch.float64: 34e12,
+        torch.float32: 67e12,
+        "tfloat32": 494.5e12,
+        torch.bfloat16: 989.5e12,
+        torch.float16: 989.5e12,
+        torch.int8: 1979e12,
     },
     "h200 nvl1": {
         torch.float64: 3.0e13,
         torch.float32: 6.0e13,
-        "tfloat32": 8.4e14,
-        torch.bfloat16: 1.7e15,
-        torch.float16: 1.7e15,
-        torch.int8: 3.3e15,
+        "tfloat32": 4.2e14,
+        torch.bfloat16: 8.4e14,
+        torch.float16: 8.4e14,
+        torch.int8: 1.68e15,
     },
-    # source: https://resources.nvidia.com/en-us-tensor-core
+    # source: https://resources.nvidia.com/en-us-gpu-resources/h100-datasheet-24306
     "h100 nvl": {
-        torch.float64: 67e12,
-        torch.float32: 133.8e12,
-        "tfloat32": 989.4e12,
-        torch.bfloat16: 1978.8e12,
-        torch.float16: 1978.8e12,
-        torch.int8: 3957.8e12,
+        torch.float64: 30e12,
+        torch.float32: 60e12,
+        "tfloat32": 417.5e12,
+        torch.bfloat16: 835.5e12,
+        torch.float16: 835.5e12,
+        torch.int8: 1670.5e12,
     },
     "h100 sxm": {
-        torch.float64: 33.5e12,
-        torch.float32: 66.9e12,
-        "tfloat32": 494.7e12,
-        torch.bfloat16: 989.4e12,
-        torch.float16: 989.4e12,
-        torch.int8: 1978.9e12,
+        torch.float64: 34e12,
+        torch.float32: 67e12,
+        "tfloat32": 494.5e12,
+        torch.bfloat16: 989.5e12,
+        torch.float16: 989.5e12,
+        torch.int8: 1979e12,
     },
     "h100 pcie": {
         torch.float64: 25.6e12,
@@ -407,11 +434,12 @@ def measure_flops(
         torch.int8: 309.7e12,
         "int4": 619.3e12,
     },
+    # source: https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
     "a40": {
         torch.float32: 37.4e12,
         "tfloat32": 74.8e12,
-        torch.bfloat16: 37.4e12,
-        torch.float16: 37.4e12,
+        torch.bfloat16: 149.7e12,
+        torch.float16: 149.7e12,
         torch.int8: 299.3e12,
         "int4": 598.7e12,
     },
@@ -448,6 +476,7 @@ def measure_flops(
         torch.int8: 272.8e12,
         "int4": 546.6e12,
     },
+    # source: https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
     "rtx 3080": {
         torch.float32: 29.8e12,
         "tfloat32": 29.8e12,

@@ -1,3 +1,4 @@
+import warnings
 from unittest import mock
 from unittest.mock import Mock, call
 
@@ -183,6 +184,60 @@ def test_throughput():
         throughput.update(time=0, batches=2, samples=2, lengths=1)
 
 
+def test_throughput_sparse_model_scaling():
+    """``using_sparse_model`` scales ``available_flops`` by the acceleration factor."""
+    # explicit False — available_flops untouched (also avoids the unset-flag warning)
+    assert Throughput(available_flops=100.0, using_sparse_model=False).available_flops == 100.0
+
+    # sparse flag on with default 2.0 factor
+    assert Throughput(available_flops=100.0, using_sparse_model=True).available_flops == 200.0
+
+    # sparse flag on with custom factor
+    throughput = Throughput(available_flops=100.0, using_sparse_model=True, sparse_cuda_acceleration_factor=4.0)
+    assert throughput.available_flops == 400.0
+
+    # sparse flag on with unknown peak — stays None, no error (MFU is skipped in compute())
+    assert Throughput(available_flops=None, using_sparse_model=True).available_flops is None
+
+    # factor below 1.0 is physically meaningless (sparsity can never lower the peak)
+    with pytest.raises(AssertionError, match="sparse acceleration factor cannot reduce"):
+        Throughput(available_flops=100.0, using_sparse_model=True, sparse_cuda_acceleration_factor=0.5)
+
+
+def test_throughput_sparse_model_warning():
+    """When ``using_sparse_model`` is unset and a peak is known, warn so MFU is not silently ambiguous."""
+    # warning fires only when the peak is known and the user didn't specify intent
+    with pytest.warns(UserWarning, match="MFU assumes dense model FLOPs"):
+        throughput = Throughput(available_flops=100.0)
+    assert throughput.available_flops == 100.0  # dense default, no scaling applied
+
+    # explicit choice (True or False) silences the warning
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        Throughput(available_flops=100.0, using_sparse_model=False)
+        Throughput(available_flops=100.0, using_sparse_model=True)
+
+    # no peak known → MFU is never computed, so no warning is needed
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        Throughput(available_flops=None)
+
+
+def test_throughput_sparse_model_mfu():
+    """MFU denominator reflects the sparse-scaled peak, so MFU halves when the peak doubles."""
+    # baseline dense: mfu = flops_per_sec / available_flops = 10 / 50 = 0.2
+    throughput = Throughput(available_flops=50, window_size=2, using_sparse_model=False)
+    throughput.update(time=1, batches=1, samples=2, flops=10)
+    throughput.update(time=2, batches=2, samples=4, flops=10)
+    assert throughput.compute()["device/mfu"] == 0.2
+
+    # sparse: peak doubles to 100, so mfu = 10 / 100 = 0.1
+    throughput = Throughput(available_flops=50, window_size=2, using_sparse_model=True)
+    throughput.update(time=1, batches=1, samples=2, flops=10)
+    throughput.update(time=2, batches=2, samples=4, flops=10)
+    assert throughput.compute()["device/mfu"] == 0.1
+
+
 def mock_train_loop(monitor):
     # simulate lit-gpt style loop
     total_lengths = 0
@@ -318,6 +373,17 @@ def test_throughput_monitor_world_size():
     ]
 
 
+def test_throughput_monitor_sparse_model():
+    """``using_sparse_model`` and ``sparse_cuda_acceleration_factor`` propagate through ThroughputMonitor."""
+    fabric_mock = Mock()
+    fabric_mock.world_size = 1
+    fabric_mock.strategy.precision = Precision()
+    with mock.patch("lightning.fabric.utilities.throughput.get_available_flops", return_value=100):
+        monitor = ThroughputMonitor(fabric_mock, using_sparse_model=True, sparse_cuda_acceleration_factor=2.0)
+    # 100 (from hardware lookup) scaled by 2.0 (sparse factor) → 200
+    assert monitor.available_flops == 200
+
+
 def test_monotonic_window():
     w = _MonotonicWindow(maxlen=3)
     assert w == []