Lightning-AI · Devesh-Maheshwari · May 26, 2026 · May 26, 2026 · May 27, 2026 · May 27, 2026
@@ -88,12 +88,30 @@ class Throughput:
         world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
         window_size: Number of batches to use for a rolling average.
         separator: Key separator to use when creating per-device and global metrics.
+        using_sparse_model: Whether the model uses structured sparsity. If ``True``, scales ``available_flops`` by
+            ``sparse_cuda_acceleration_factor``. ``None`` (default) assumes dense; warns if ``available_flops`` is set.
+        sparse_cuda_acceleration_factor: Multiplier applied to ``available_flops`` when ``using_sparse_model`` is
+            ``True``. Must be at least ``1.0``. Defaults to ``2.0`` for NVIDIA 2:4 structured sparsity.
 
     """
 
     def __init__(
-        self, available_flops: Optional[float] = None, world_size: int = 1, window_size: int = 100, separator: str = "/"
+        self,
+        available_flops: Optional[float] = None,
+        world_size: int = 1,
+        window_size: int = 100,
+        separator: str = "/",
+        using_sparse_model: Optional[bool] = None,
+        sparse_cuda_acceleration_factor: float = 2.0,
     ) -> None:
+        assert sparse_cuda_acceleration_factor >= 1.0, "sparse acceleration factor cannot reduce peak FLOPs"
+        if using_sparse_model is None and available_flops is not None:
+            rank_zero_warn(
+                "MFU assumes dense model FLOPs (no sparsity acceleration)."
+                " Set 'using_sparse_model=True' for mfu to use sparse flops."
+            )
+        if using_sparse_model and available_flops is not None:
+            available_flops = available_flops * sparse_cuda_acceleration_factor
         self.available_flops = available_flops
         self.separator = separator
         assert world_size > 0
@@ -308,223 +326,225 @@ def measure_flops(
     "h200 sxm1": {
         torch.float64: 3.4e13,
         torch.float32: 6.7e13,
-        "tfloat32": 9.9e14,
-        torch.bfloat16: 2.0e15,
-        torch.float16: 2.0e15,
-        torch.int8: 4.0e15,
+        "tfloat32": 4.945e14,
+        torch.bfloat16: 9.895e14,
+        torch.float16: 9.895e14,
+        torch.int8: 1.979e15,
     },
     "h200 nvl1": {
         torch.float64: 3.0e13,
         torch.float32: 6.0e13,
-        "tfloat32": 8.4e14,
-        torch.bfloat16: 1.7e15,
-        torch.float16: 1.7e15,
-        torch.int8: 3.3e15,
+        "tfloat32": 4.2e14,
+        torch.bfloat16: 8.4e14,
+        torch.float16: 8.4e14,
+        torch.int8: 1.68e15,
     },
-    # source: https://resources.nvidia.com/en-us-tensor-core
+    # source: https://resources.nvidia.com/en-us-gpu-resources/h100-datasheet-24306
     "h100 nvl": {
-        torch.float64: 67e12,
-        torch.float32: 133.8e12,
-        "tfloat32": 989.4e12,
-        torch.bfloat16: 1978.8e12,
-        torch.float16: 1978.8e12,
-        torch.int8: 3957.8e12,
+        torch.float64: 3.0e13,
+        torch.float32: 6.0e13,
+        "tfloat32": 4.175e14,
+        torch.bfloat16: 8.355e14,
+        torch.float16: 8.355e14,
+        torch.int8: 1.6705e15,
     },
     "h100 sxm": {
-        torch.float64: 33.5e12,
-        torch.float32: 66.9e12,
-        "tfloat32": 494.7e12,
-        torch.bfloat16: 989.4e12,
-        torch.float16: 989.4e12,
-        torch.int8: 1978.9e12,
+        torch.float64: 3.4e13,
+        torch.float32: 6.7e13,
+        "tfloat32": 4.945e14,
+        torch.bfloat16: 9.895e14,
+        torch.float16: 9.895e14,
+        torch.int8: 1.979e15,
     },
     "h100 pcie": {
-        torch.float64: 25.6e12,
-        torch.float32: 51.2e12,
-        "tfloat32": 378e12,
-        torch.bfloat16: 756e12,
-        torch.float16: 756e12,
-        torch.int8: 1513e12,
+        torch.float64: 2.56e13,
+        torch.float32: 5.12e13,
+        "tfloat32": 3.78e14,
+        torch.bfloat16: 7.56e14,
+        torch.float16: 7.56e14,
+        torch.int8: 1.513e15,
     },
     # Ada
     # source: https://images.nvidia.com/aem-dam/Solutions/Data-Center/l4/nvidia-ada-gpu-architecture-whitepaper-v2.1.pdf
     "rtx 4090": {
-        torch.float32: 82.6e12,
-        "tfloat32": 82.6e12,
-        torch.bfloat16: 82.6e12,
-        torch.float16: 82.6e12,
-        torch.int8: 660.6e12,
-        "int4": 1321.2e12,
+        torch.float32: 8.26e13,
+        "tfloat32": 8.26e13,
+        torch.bfloat16: 8.26e13,
+        torch.float16: 8.26e13,
+        torch.int8: 6.606e14,
+        "int4": 1.3212e15,
     },
     "rtx 4080": {
-        torch.float32: 48.7e12,
-        "tfloat32": 48.7e12,
-        torch.bfloat16: 48.7e12,
-        torch.float16: 48.7e12,
-        torch.int8: 389.9e12,
-        "int4": 779.8e12,
+        torch.float32: 4.87e13,
+        "tfloat32": 4.87e13,
+        torch.bfloat16: 4.87e13,
+        torch.float16: 4.87e13,
+        torch.int8: 3.899e14,
+        "int4": 7.798e14,
     },
     "rtx 4080 super": {
-        torch.float32: 52.2e12,
-        "tfloat32": 52.2e12,
-        torch.bfloat16: 52.2e12,
-        torch.float16: 52.2e12,
-        torch.int8: 417.6e12,
-        "int4": 835.2e12,
+        torch.float32: 5.22e13,
+        "tfloat32": 5.22e13,
+        torch.bfloat16: 5.22e13,
+        torch.float16: 5.22e13,
+        torch.int8: 4.176e14,
+        "int4": 8.352e14,
     },
     "l4": {
-        torch.float32: 30.3e12,
-        "tfloat32": 60e12,
-        torch.bfloat16: 121e12,
-        torch.float16: 121e12,
-        torch.int8: 242e12,
-        "int4": 484e12,
+        torch.float32: 3.03e13,
+        "tfloat32": 6.0e13,
+        torch.bfloat16: 1.21e14,
+        torch.float16: 1.21e14,
+        torch.int8: 2.42e14,
+        "int4": 4.84e14,
     },
     "l40": {
-        torch.float32: 90.5e12,
-        "tfloat32": 90.5e12,
-        torch.bfloat16: 181e12,
-        torch.float16: 181e12,
-        torch.int8: 362e12,
-        "int4": 724e12,
+        torch.float32: 9.05e13,
+        "tfloat32": 9.05e13,
+        torch.bfloat16: 1.81e14,
+        torch.float16: 1.81e14,
+        torch.int8: 3.62e14,
+        "int4": 7.24e14,
     },
     # Ampere
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
     # sxm and pcie have same flop counts
     "a100": {
         torch.float64: 9.7e12,
-        torch.float32: 19.5e12,
-        "tfloat32": 156e12,
-        torch.bfloat16: 312e12,
-        torch.float16: 312e12,
-        torch.int8: 624e12,
+        torch.float32: 1.95e13,
+        "tfloat32": 1.56e14,
+        torch.bfloat16: 3.12e14,
+        torch.float16: 3.12e14,
+        torch.int8: 6.24e14,
     },
     "a6000": {
-        torch.float32: 38.7e12,
-        "tfloat32": 77.4e12,
-        torch.bfloat16: 38.7e12,
-        torch.float16: 38.7e12,
-        torch.int8: 309.7e12,
-        "int4": 619.3e12,
+        torch.float32: 3.87e13,
+        "tfloat32": 7.74e13,
+        torch.bfloat16: 3.87e13,
+        torch.float16: 3.87e13,
+        torch.int8: 3.097e14,
+        "int4": 6.193e14,
     },
+    # source: https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
     "a40": {
-        torch.float32: 37.4e12,
-        "tfloat32": 74.8e12,
-        torch.bfloat16: 37.4e12,
-        torch.float16: 37.4e12,
-        torch.int8: 299.3e12,
-        "int4": 598.7e12,
+        torch.float32: 3.74e13,
+        "tfloat32": 7.48e13,
+        torch.bfloat16: 1.497e14,
+        torch.float16: 1.497e14,
+        torch.int8: 2.993e14,
+        "int4": 5.987e14,
     },
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf
     "a10g": {
-        torch.float32: 31.2e12,
-        "tfloat32": 62.5e12,
-        torch.bfloat16: 125e12,
-        torch.float16: 125e12,
-        torch.int8: 250e12,
-        "int4": 500e12,
+        torch.float32: 3.12e13,
+        "tfloat32": 6.25e13,
+        torch.bfloat16: 1.25e14,
+        torch.float16: 1.25e14,
+        torch.int8: 2.5e14,
+        "int4": 5.0e14,
     },
     "rtx 3090 ti": {
-        torch.float32: 40e12,
-        "tfloat32": 40e12,
-        torch.bfloat16: 40e12,
-        torch.float16: 40e12,
-        torch.int8: 320e12,
-        "int4": 640e12,
+        torch.float32: 4.0e13,
+        "tfloat32": 4.0e13,
+        torch.bfloat16: 4.0e13,
+        torch.float16: 4.0e13,
+        torch.int8: 3.2e14,
+        "int4": 6.4e14,
     },
     "rtx 3090": {
-        torch.float32: 35.6e12,
-        "tfloat32": 35.6e12,
-        torch.bfloat16: 35.6e12,
-        torch.float16: 35.6e12,
-        torch.int8: 284e12,
-        "int4": 568e12,
+        torch.float32: 3.56e13,
+        "tfloat32": 3.56e13,
+        torch.bfloat16: 3.56e13,
+        torch.float16: 3.56e13,
+        torch.int8: 2.84e14,
+        "int4": 5.68e14,
     },
     "rtx 3080 ti": {
-        torch.float32: 34.1e12,
-        "tfloat32": 34.1e12,
-        torch.bfloat16: 34.1e12,
-        torch.float16: 34.1e12,
-        torch.int8: 272.8e12,
-        "int4": 546.6e12,
+        torch.float32: 3.41e13,
+        "tfloat32": 3.41e13,
+        torch.bfloat16: 3.41e13,
+        torch.float16: 3.41e13,
+        torch.int8: 2.728e14,
+        "int4": 5.466e14,
     },
+    # source: https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
     "rtx 3080": {
-        torch.float32: 29.8e12,
-        "tfloat32": 29.8e12,
-        torch.bfloat16: 29.8e12,
-        torch.float16: 29.8e12,
-        torch.int8: 238e12,
-        "int4": 476e12,
+        torch.float32: 2.98e13,
+        "tfloat32": 2.98e13,
+        torch.bfloat16: 2.98e13,
+        torch.float16: 2.98e13,
+        torch.int8: 2.38e14,
+        "int4": 4.76e14,
     },
     "rtx 3070": {
-        torch.float32: 20.3e12,
-        "tfloat32": 20.3e12,
-        torch.bfloat16: 20.3e12,
-        torch.float16: 20.3e12,
-        torch.int8: 162.6e12,
-        "int4": 325.2e12,
+        torch.float32: 2.03e13,
+        "tfloat32": 2.03e13,
+        torch.bfloat16: 2.03e13,
+        torch.float16: 2.03e13,
+        torch.int8: 1.626e14,
+        "int4": 3.252e14,
     },
     # Turing
     # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf
     # sxm and pcie have same flop counts
     "t4": {
         torch.float32: 8.1e12,
-        torch.float16: 65e12,
-        torch.int8: 130e12,
-        "int4": 260e12,
+        torch.float16: 6.5e13,
+        torch.int8: 1.3e14,
+        "int4": 2.6e14,
     },
     # https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf
     "quadro rtx 5000": {
-        torch.float32: 11.2e12,
-        torch.float16: 89.2e12,
+        torch.float32: 1.12e13,
+        torch.float16: 8.92e13,
     },
     "rtx 2080 super": {
-        torch.float32: 11.2e12,
-        torch.float16: 22.3e12,
-        torch.int8: 178.4e12,
-        "int4": 356.8e12,
+        torch.float32: 1.12e13,
+        torch.float16: 2.23e13,
+        torch.int8: 1.784e14,
+        "int4": 3.568e14,
     },
     "rtx 2080 ti": {
-        torch.float32: 14.2e12,
-        torch.float16: 28.5e12,
-        torch.int8: 227.7e12,
-        "int4": 455.4e12,
+        torch.float32: 1.42e13,
+        torch.float16: 2.85e13,
+        torch.int8: 2.277e14,
+        "int4": 4.554e14,
     },
     "rtx 2080": {
-        torch.float32: 10.6e12,
-        torch.float16: 21.2e12,
-        torch.int8: 169.6e12,
-        "int4": 339.1e12,
+        torch.float32: 1.06e13,
+        torch.float16: 2.12e13,
+        torch.int8: 1.696e14,
+        "int4": 3.391e14,
     },
     # https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
     "rtx 2070 super": {
         torch.float32: 9.1e12,
-        torch.float16: 18.1e12,
-        torch.int8: 145e12,
-        "int4": 290e12,
+        torch.float16: 1.81e13,
+        torch.int8: 1.45e14,
+        "int4": 2.9e14,
     },
     "titan rtx": {
-        torch.float32: 16.3e12,
-        torch.float16: 32.6e12,
-        torch.int8: 261e12,
-        "int4": 522e12,
+        torch.float32: 1.63e13,
+        torch.float16: 3.26e13,
+        torch.int8: 2.61e14,
+        "int4": 5.22e14,
     },
     # Volta
     # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf
     "v100 sxm": {
         torch.float64: 7.8e12,
-        torch.float32: 15.7e12,
-        torch.float16: 125e12,
+        torch.float32: 1.57e13,
+        torch.float16: 1.25e14,
     },
     "v100 pcie": {
-        torch.float64: 7e12,
-        torch.float32: 14e12,
-        torch.float16: 112e12,
+        torch.float64: 7.0e12,
+        torch.float32: 1.4e13,
+        torch.float16: 1.12e14,
     },
     "v100s pcie": {
         torch.float64: 8.2e12,
-        torch.float32: 16.4e12,
-        torch.float16: 130e12,
+        torch.float32: 1.64e13,
+        torch.float16: 1.3e14,
     },
 }