diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
index 324cf25cd9527..f300a87e93031 100644
--- a/src/lightning/fabric/CHANGELOG.md
+++ b/src/lightning/fabric/CHANGELOG.md
@@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
--
+- Added `using_sparse_model` and `sparse_cuda_acceleration_factor` parameters to `Throughput` so MFU defaults to the dense peak and opts into the sparse peak explicitly ([#21743](https://github.com/Lightning-AI/pytorch-lightning/pull/21743))
 
 ### Changed
 
@@ -20,7 +20,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed inconsistent peak FLOPs for NVIDIA H100/H200 GPUs (a mix of dense and sparse figures) by always using dense FLOPs; sparse FLOPs now require an explicit opt-in via `using_sparse_model=True` ([#21743](https://github.com/Lightning-AI/pytorch-lightning/pull/21743))
 
 ---
 
diff --git a/src/lightning/fabric/utilities/throughput.py b/src/lightning/fabric/utilities/throughput.py
index 6bc329fa1c3be..ed5ed6a3fb51a 100644
--- a/src/lightning/fabric/utilities/throughput.py
+++ b/src/lightning/fabric/utilities/throughput.py
@@ -88,12 +88,30 @@ class Throughput:
         world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
         window_size: Number of batches to use for a rolling average.
         separator: Key separator to use when creating per-device and global metrics.
+        using_sparse_model: Whether the model uses structured sparsity. If ``True``, scales ``available_flops`` by
+            ``sparse_cuda_acceleration_factor``. ``None`` (default) assumes dense and warns if a peak is given.
+        sparse_cuda_acceleration_factor: Multiplier applied to ``available_flops`` when ``using_sparse_model`` is
+            ``True``. Defaults to ``2.0`` for NVIDIA 2:4 structured sparsity. Must be ``>= 1.0``.
 
     """
 
     def __init__(
-        self, available_flops: Optional[float] = None, world_size: int = 1, window_size: int = 100, separator: str = "/"
+        self,
+        available_flops: Optional[float] = None,
+        world_size: int = 1,
+        window_size: int = 100,
+        separator: str = "/",
+        using_sparse_model: Optional[bool] = None,
+        sparse_cuda_acceleration_factor: float = 2.0,
     ) -> None:
+        assert sparse_cuda_acceleration_factor >= 1.0, "sparse acceleration factor cannot reduce peak FLOPs"
+        if using_sparse_model is None and available_flops is not None:
+            rank_zero_warn(
+                "MFU assumes dense model FLOPs (no sparsity acceleration)."
+                " Set 'using_sparse_model=True' for MFU to use sparse FLOPs."
+            )
+        if using_sparse_model and available_flops is not None:
+            available_flops = available_flops * sparse_cuda_acceleration_factor
         self.available_flops = available_flops
         self.separator = separator
         assert world_size > 0
@@ -308,223 +326,225 @@ def measure_flops(
     "h200 sxm1": {
         torch.float64: 3.4e13,
         torch.float32: 6.7e13,
-        "tfloat32": 9.9e14,
-        torch.bfloat16: 2.0e15,
-        torch.float16: 2.0e15,
-        torch.int8: 4.0e15,
+        "tfloat32": 4.945e14,
+        torch.bfloat16: 9.895e14,
+        torch.float16: 9.895e14,
+        torch.int8: 1.979e15,
     },
     "h200 nvl1": {
         torch.float64: 3.0e13,
         torch.float32: 6.0e13,
-        "tfloat32": 8.4e14,
-        torch.bfloat16: 1.7e15,
-        torch.float16: 1.7e15,
-        torch.int8: 3.3e15,
+        "tfloat32": 4.2e14,
+        torch.bfloat16: 8.4e14,
+        torch.float16: 8.4e14,
+        torch.int8: 1.68e15,
     },
-    # source: https://resources.nvidia.com/en-us-tensor-core
+    # source: https://resources.nvidia.com/en-us-gpu-resources/h100-datasheet-24306
     "h100 nvl": {
-        torch.float64: 67e12,
-        torch.float32: 133.8e12,
-        "tfloat32": 989.4e12,
-        torch.bfloat16: 1978.8e12,
-        torch.float16: 1978.8e12,
-        torch.int8: 3957.8e12,
+        torch.float64: 3.0e13,
+        torch.float32: 6.0e13,
+        "tfloat32": 4.175e14,
+        torch.bfloat16: 8.355e14,
+        torch.float16: 8.355e14,
+        torch.int8: 1.6705e15,
     },
     "h100 sxm": {
-        torch.float64: 33.5e12,
-        torch.float32: 66.9e12,
-        "tfloat32": 494.7e12,
-        torch.bfloat16: 989.4e12,
-        torch.float16: 989.4e12,
-        torch.int8: 1978.9e12,
+        torch.float64: 3.4e13,
+        torch.float32: 6.7e13,
+        "tfloat32": 4.945e14,
+        torch.bfloat16: 9.895e14,
+        torch.float16: 9.895e14,
+        torch.int8: 1.979e15,
     },
     "h100 pcie": {
-        torch.float64: 25.6e12,
-        torch.float32: 51.2e12,
-        "tfloat32": 378e12,
-        torch.bfloat16: 756e12,
-        torch.float16: 756e12,
-        torch.int8: 1513e12,
+        torch.float64: 2.56e13,
+        torch.float32: 5.12e13,
+        "tfloat32": 3.78e14,
+        torch.bfloat16: 7.56e14,
+        torch.float16: 7.56e14,
+        torch.int8: 1.513e15,
     },
     # Ada
     # source: https://images.nvidia.com/aem-dam/Solutions/Data-Center/l4/nvidia-ada-gpu-architecture-whitepaper-v2.1.pdf
     "rtx 4090": {
-        torch.float32: 82.6e12,
-        "tfloat32": 82.6e12,
-        torch.bfloat16: 82.6e12,
-        torch.float16: 82.6e12,
-        torch.int8: 660.6e12,
-        "int4": 1321.2e12,
+        torch.float32: 8.26e13,
+        "tfloat32": 8.26e13,
+        torch.bfloat16: 8.26e13,
+        torch.float16: 8.26e13,
+        torch.int8: 6.606e14,
+        "int4": 1.3212e15,
     },
     "rtx 4080": {
-        torch.float32: 48.7e12,
-        "tfloat32": 48.7e12,
-        torch.bfloat16: 48.7e12,
-        torch.float16: 48.7e12,
-        torch.int8: 389.9e12,
-        "int4": 779.8e12,
+        torch.float32: 4.87e13,
+        "tfloat32": 4.87e13,
+        torch.bfloat16: 4.87e13,
+        torch.float16: 4.87e13,
+        torch.int8: 3.899e14,
+        "int4": 7.798e14,
     },
     "rtx 4080 super": {
-        torch.float32: 52.2e12,
-        "tfloat32": 52.2e12,
-        torch.bfloat16: 52.2e12,
-        torch.float16: 52.2e12,
-        torch.int8: 417.6e12,
-        "int4": 835.2e12,
+        torch.float32: 5.22e13,
+        "tfloat32": 5.22e13,
+        torch.bfloat16: 5.22e13,
+        torch.float16: 5.22e13,
+        torch.int8: 4.176e14,
+        "int4": 8.352e14,
     },
     "l4": {
-        torch.float32: 30.3e12,
-        "tfloat32": 60e12,
-        torch.bfloat16: 121e12,
-        torch.float16: 121e12,
-        torch.int8: 242e12,
-        "int4": 484e12,
+        torch.float32: 3.03e13,
+        "tfloat32": 6.0e13,
+        torch.bfloat16: 1.21e14,
+        torch.float16: 1.21e14,
+        torch.int8: 2.42e14,
+        "int4": 4.84e14,
     },
     "l40": {
-        torch.float32: 90.5e12,
-        "tfloat32": 90.5e12,
-        torch.bfloat16: 181e12,
-        torch.float16: 181e12,
-        torch.int8: 362e12,
-        "int4": 724e12,
+        torch.float32: 9.05e13,
+        "tfloat32": 9.05e13,
+        torch.bfloat16: 1.81e14,
+        torch.float16: 1.81e14,
+        torch.int8: 3.62e14,
+        "int4": 7.24e14,
     },
     # Ampere
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
     # sxm and pcie have same flop counts
     "a100": {
         torch.float64: 9.7e12,
-        torch.float32: 19.5e12,
-        "tfloat32": 156e12,
-        torch.bfloat16: 312e12,
-        torch.float16: 312e12,
-        torch.int8: 624e12,
+        torch.float32: 1.95e13,
+        "tfloat32": 1.56e14,
+        torch.bfloat16: 3.12e14,
+        torch.float16: 3.12e14,
+        torch.int8: 6.24e14,
     },
     "a6000": {
-        torch.float32: 38.7e12,
-        "tfloat32": 77.4e12,
-        torch.bfloat16: 38.7e12,
-        torch.float16: 38.7e12,
-        torch.int8: 309.7e12,
-        "int4": 619.3e12,
+        torch.float32: 3.87e13,
+        "tfloat32": 7.74e13,
+        torch.bfloat16: 3.87e13,
+        torch.float16: 3.87e13,
+        torch.int8: 3.097e14,
+        "int4": 6.193e14,
     },
+    # source: https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
     "a40": {
-        torch.float32: 37.4e12,
-        "tfloat32": 74.8e12,
-        torch.bfloat16: 37.4e12,
-        torch.float16: 37.4e12,
-        torch.int8: 299.3e12,
-        "int4": 598.7e12,
+        torch.float32: 3.74e13,
+        "tfloat32": 7.48e13,
+        torch.bfloat16: 1.497e14,
+        torch.float16: 1.497e14,
+        torch.int8: 2.993e14,
+        "int4": 5.987e14,
     },
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf
     "a10g": {
-        torch.float32: 31.2e12,
-        "tfloat32": 62.5e12,
-        torch.bfloat16: 125e12,
-        torch.float16: 125e12,
-        torch.int8: 250e12,
-        "int4": 500e12,
+        torch.float32: 3.12e13,
+        "tfloat32": 6.25e13,
+        torch.bfloat16: 1.25e14,
+        torch.float16: 1.25e14,
+        torch.int8: 2.5e14,
+        "int4": 5.0e14,
     },
     "rtx 3090 ti": {
-        torch.float32: 40e12,
-        "tfloat32": 40e12,
-        torch.bfloat16: 40e12,
-        torch.float16: 40e12,
-        torch.int8: 320e12,
-        "int4": 640e12,
+        torch.float32: 4.0e13,
+        "tfloat32": 4.0e13,
+        torch.bfloat16: 4.0e13,
+        torch.float16: 4.0e13,
+        torch.int8: 3.2e14,
+        "int4": 6.4e14,
     },
     "rtx 3090": {
-        torch.float32: 35.6e12,
-        "tfloat32": 35.6e12,
-        torch.bfloat16: 35.6e12,
-        torch.float16: 35.6e12,
-        torch.int8: 284e12,
-        "int4": 568e12,
+        torch.float32: 3.56e13,
+        "tfloat32": 3.56e13,
+        torch.bfloat16: 3.56e13,
+        torch.float16: 3.56e13,
+        torch.int8: 2.84e14,
+        "int4": 5.68e14,
     },
     "rtx 3080 ti": {
-        torch.float32: 34.1e12,
-        "tfloat32": 34.1e12,
-        torch.bfloat16: 34.1e12,
-        torch.float16: 34.1e12,
-        torch.int8: 272.8e12,
-        "int4": 546.6e12,
+        torch.float32: 3.41e13,
+        "tfloat32": 3.41e13,
+        torch.bfloat16: 3.41e13,
+        torch.float16: 3.41e13,
+        torch.int8: 2.728e14,
+        "int4": 5.466e14,
     },
+    # source: https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
     "rtx 3080": {
-        torch.float32: 29.8e12,
-        "tfloat32": 29.8e12,
-        torch.bfloat16: 29.8e12,
-        torch.float16: 29.8e12,
-        torch.int8: 238e12,
-        "int4": 476e12,
+        torch.float32: 2.98e13,
+        "tfloat32": 2.98e13,
+        torch.bfloat16: 2.98e13,
+        torch.float16: 2.98e13,
+        torch.int8: 2.38e14,
+        "int4": 4.76e14,
     },
     "rtx 3070": {
-        torch.float32: 20.3e12,
-        "tfloat32": 20.3e12,
-        torch.bfloat16: 20.3e12,
-        torch.float16: 20.3e12,
-        torch.int8: 162.6e12,
-        "int4": 325.2e12,
+        torch.float32: 2.03e13,
+        "tfloat32": 2.03e13,
+        torch.bfloat16: 2.03e13,
+        torch.float16: 2.03e13,
+        torch.int8: 1.626e14,
+        "int4": 3.252e14,
     },
     # Turing
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf
     # sxm and pcie have same flop counts
     "t4": {
         torch.float32: 8.1e12,
-        torch.float16: 65e12,
-        torch.int8: 130e12,
-        "int4": 260e12,
+        torch.float16: 6.5e13,
+        torch.int8: 1.3e14,
+        "int4": 2.6e14,
     },
     # https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf
     "quadro rtx 5000": {
-        torch.float32: 11.2e12,
-        torch.float16: 89.2e12,
+        torch.float32: 1.12e13,
+        torch.float16: 8.92e13,
     },
     "rtx 2080 super": {
-        torch.float32: 11.2e12,
-        torch.float16: 22.3e12,
-        torch.int8: 178.4e12,
-        "int4": 356.8e12,
+        torch.float32: 1.12e13,
+        torch.float16: 2.23e13,
+        torch.int8: 1.784e14,
+        "int4": 3.568e14,
     },
     "rtx 2080 ti": {
-        torch.float32: 14.2e12,
-        torch.float16: 28.5e12,
-        torch.int8: 227.7e12,
-        "int4": 455.4e12,
+        torch.float32: 1.42e13,
+        torch.float16: 2.85e13,
+        torch.int8: 2.277e14,
+        "int4": 4.554e14,
     },
     "rtx 2080": {
-        torch.float32: 10.6e12,
-        torch.float16: 21.2e12,
-        torch.int8: 169.6e12,
-        "int4": 339.1e12,
+        torch.float32: 1.06e13,
+        torch.float16: 2.12e13,
+        torch.int8: 1.696e14,
+        "int4": 3.391e14,
     },
     # https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
     "rtx 2070 super": {
         torch.float32: 9.1e12,
-        torch.float16: 18.1e12,
-        torch.int8: 145e12,
-        "int4": 290e12,
+        torch.float16: 1.81e13,
+        torch.int8: 1.45e14,
+        "int4": 2.9e14,
     },
     "titan rtx": {
-        torch.float32: 16.3e12,
-        torch.float16: 32.6e12,
-        torch.int8: 261e12,
-        "int4": 522e12,
+        torch.float32: 1.63e13,
+        torch.float16: 3.26e13,
+        torch.int8: 2.61e14,
+        "int4": 5.22e14,
     },
     # Volta
     # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf
     "v100 sxm": {
         torch.float64: 7.8e12,
-        torch.float32: 15.7e12,
-        torch.float16: 125e12,
+        torch.float32: 1.57e13,
+        torch.float16: 1.25e14,
     },
     "v100 pcie": {
-        torch.float64: 7e12,
-        torch.float32: 14e12,
-        torch.float16: 112e12,
+        torch.float64: 7.0e12,
+        torch.float32: 1.4e13,
+        torch.float16: 1.12e14,
     },
     "v100s pcie": {
         torch.float64: 8.2e12,
-        torch.float32: 16.4e12,
-        torch.float16: 130e12,
+        torch.float32: 1.64e13,
+        torch.float16: 1.3e14,
     },
 }
 
diff --git a/tests/tests_fabric/utilities/test_throughput.py b/tests/tests_fabric/utilities/test_throughput.py
index a175fa97fd444..a493e18a0306e 100644
--- a/tests/tests_fabric/utilities/test_throughput.py
+++ b/tests/tests_fabric/utilities/test_throughput.py
@@ -1,3 +1,4 @@
+import warnings
 from unittest import mock
 from unittest.mock import Mock, call
 
@@ -183,6 +184,60 @@ def test_throughput():
         throughput.update(time=0, batches=2, samples=2, lengths=1)
 
 
+def test_throughput_sparse_model_scaling():
+    """``using_sparse_model`` scales ``available_flops`` by the acceleration factor."""
+    # explicit False — available_flops untouched (also avoids the unset-flag warning)
+    assert Throughput(available_flops=100.0, using_sparse_model=False).available_flops == 100.0
+
+    # sparse flag on with default 2.0 factor
+    assert Throughput(available_flops=100.0, using_sparse_model=True).available_flops == 200.0
+
+    # sparse flag on with custom factor
+    throughput = Throughput(available_flops=100.0, using_sparse_model=True, sparse_cuda_acceleration_factor=4.0)
+    assert throughput.available_flops == 400.0
+
+    # sparse flag on with unknown peak — stays None, no error (MFU is skipped in compute())
+    assert Throughput(available_flops=None, using_sparse_model=True).available_flops is None
+
+    # factor below 1.0 is physically meaningless (sparsity can never lower the peak)
+    with pytest.raises(AssertionError, match="sparse acceleration factor cannot reduce"):
+        Throughput(available_flops=100.0, using_sparse_model=True, sparse_cuda_acceleration_factor=0.5)
+
+
+def test_throughput_sparse_model_warning():
+    """When ``using_sparse_model`` is unset and a peak is known, warn so MFU is not silently ambiguous."""
+    # warning fires only when the peak is known and the user didn't specify intent
+    with pytest.warns(UserWarning, match="MFU assumes dense model FLOPs"):
+        throughput = Throughput(available_flops=100.0)
+    assert throughput.available_flops == 100.0  # dense default, no scaling applied
+
+    # explicit choice (True or False) silences the warning
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        Throughput(available_flops=100.0, using_sparse_model=False)
+        Throughput(available_flops=100.0, using_sparse_model=True)
+
+    # no peak known → MFU is never computed, so no warning is needed
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        Throughput(available_flops=None)
+
+
+def test_throughput_sparse_model_mfu():
+    """MFU denominator reflects the sparse-scaled peak, so MFU halves when the peak doubles."""
+    # baseline dense: mfu = flops_per_sec / available_flops = 10 / 50 = 0.2
+    throughput = Throughput(available_flops=50, window_size=2, using_sparse_model=False)
+    throughput.update(time=1, batches=1, samples=2, flops=10)
+    throughput.update(time=2, batches=2, samples=4, flops=10)
+    assert throughput.compute()["device/mfu"] == 0.2
+
+    # sparse: peak doubles to 100, so mfu = 10 / 100 = 0.1
+    throughput = Throughput(available_flops=50, window_size=2, using_sparse_model=True)
+    throughput.update(time=1, batches=1, samples=2, flops=10)
+    throughput.update(time=2, batches=2, samples=4, flops=10)
+    assert throughput.compute()["device/mfu"] == 0.1
+
+
 def mock_train_loop(monitor):
     # simulate lit-gpt style loop
     total_lengths = 0
@@ -318,6 +373,17 @@ def test_throughput_monitor_world_size():
     ]
 
 
+def test_throughput_monitor_sparse_model():
+    """``using_sparse_model`` and ``sparse_cuda_acceleration_factor`` propagate through ThroughputMonitor."""
+    fabric_mock = Mock()
+    fabric_mock.world_size = 1
+    fabric_mock.strategy.precision = Precision()
+    with mock.patch("lightning.fabric.utilities.throughput.get_available_flops", return_value=100):
+        monitor = ThroughputMonitor(fabric_mock, using_sparse_model=True, sparse_cuda_acceleration_factor=2.0)
+    # 100 (from hardware lookup) scaled by 2.0 (sparse factor) → 200
+    assert monitor.available_flops == 200
+
+
 def test_monotonic_window():
     w = _MonotonicWindow(maxlen=3)
     assert w == []