diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 324cf25cd9527..f300a87e93031 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- Added `using_sparse_model` and `sparse_cuda_acceleration_factor` parameters to `Throughput` so MFU defaults to the dense peak and opts into the sparse peak explicitly ([#21743](https://github.com/Lightning-AI/pytorch-lightning/pull/21743)) ### Changed @@ -20,7 +20,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- +- Fixed inconsistent FLOPs reporting on NVIDIA H100/H200 GPUs by defaulting to dense FLOPs, with sparse FLOPs now requiring an explicit opt-in ([#21743](https://github.com/Lightning-AI/pytorch-lightning/pull/21743)) --- diff --git a/src/lightning/fabric/utilities/throughput.py b/src/lightning/fabric/utilities/throughput.py index 6bc329fa1c3be..ed5ed6a3fb51a 100644 --- a/src/lightning/fabric/utilities/throughput.py +++ b/src/lightning/fabric/utilities/throughput.py @@ -88,12 +88,30 @@ class Throughput: world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1. window_size: Number of batches to use for a rolling average. separator: Key separator to use when creating per-device and global metrics. + using_sparse_model: Whether the model uses structured sparsity. If ``True``, scales ``available_flops`` by + ``sparse_cuda_acceleration_factor``. ``None`` (default) assumes dense and warns once if ``available_flops`` is set. + sparse_cuda_acceleration_factor: Multiplier applied to ``available_flops`` when ``using_sparse_model`` is + ``True``. Defaults to ``2.0`` for NVIDIA 2:4 structured sparsity. 
""" def __init__( - self, available_flops: Optional[float] = None, world_size: int = 1, window_size: int = 100, separator: str = "/" + self, + available_flops: Optional[float] = None, + world_size: int = 1, + window_size: int = 100, + separator: str = "/", + using_sparse_model: Optional[bool] = None, + sparse_cuda_acceleration_factor: float = 2.0, ) -> None: + assert sparse_cuda_acceleration_factor >= 1.0, "sparse acceleration factor cannot reduce peak FLOPs" + if using_sparse_model is None and available_flops is not None: + rank_zero_warn( + "MFU assumes dense model FLOPs (no sparsity acceleration)." + " Set 'using_sparse_model=True' for MFU to use sparse FLOPs." + ) + if using_sparse_model and available_flops is not None: + available_flops = available_flops * sparse_cuda_acceleration_factor self.available_flops = available_flops self.separator = separator assert world_size > 0 @@ -308,223 +326,225 @@ def measure_flops( "h200 sxm1": { torch.float64: 3.4e13, torch.float32: 6.7e13, - "tfloat32": 9.9e14, - torch.bfloat16: 2.0e15, - torch.float16: 2.0e15, - torch.int8: 4.0e15, + "tfloat32": 4.945e14, + torch.bfloat16: 9.895e14, + torch.float16: 9.895e14, + torch.int8: 1.979e15, }, "h200 nvl1": { torch.float64: 3.0e13, torch.float32: 6.0e13, - "tfloat32": 8.4e14, - torch.bfloat16: 1.7e15, - torch.float16: 1.7e15, - torch.int8: 3.3e15, + "tfloat32": 4.2e14, + torch.bfloat16: 8.4e14, + torch.float16: 8.4e14, + torch.int8: 1.68e15, }, - # source: https://resources.nvidia.com/en-us-tensor-core + # source: https://resources.nvidia.com/en-us-gpu-resources/h100-datasheet-24306 "h100 nvl": { - torch.float64: 67e12, - torch.float32: 133.8e12, - "tfloat32": 989.4e12, - torch.bfloat16: 1978.8e12, - torch.float16: 1978.8e12, - torch.int8: 3957.8e12, + torch.float64: 3.0e13, + torch.float32: 6.0e13, + "tfloat32": 4.175e14, + torch.bfloat16: 8.355e14, + torch.float16: 8.355e14, + torch.int8: 1.6705e15, }, "h100 sxm": { - torch.float64: 33.5e12, - torch.float32: 66.9e12, - 
"tfloat32": 494.7e12, - torch.bfloat16: 989.4e12, - torch.float16: 989.4e12, - torch.int8: 1978.9e12, + torch.float64: 3.4e13, + torch.float32: 6.7e13, + "tfloat32": 4.945e14, + torch.bfloat16: 9.895e14, + torch.float16: 9.895e14, + torch.int8: 1.979e15, }, "h100 pcie": { - torch.float64: 25.6e12, - torch.float32: 51.2e12, - "tfloat32": 378e12, - torch.bfloat16: 756e12, - torch.float16: 756e12, - torch.int8: 1513e12, + torch.float64: 2.56e13, + torch.float32: 5.12e13, + "tfloat32": 3.78e14, + torch.bfloat16: 7.56e14, + torch.float16: 7.56e14, + torch.int8: 1.513e15, }, # Ada # source: https://images.nvidia.com/aem-dam/Solutions/Data-Center/l4/nvidia-ada-gpu-architecture-whitepaper-v2.1.pdf "rtx 4090": { - torch.float32: 82.6e12, - "tfloat32": 82.6e12, - torch.bfloat16: 82.6e12, - torch.float16: 82.6e12, - torch.int8: 660.6e12, - "int4": 1321.2e12, + torch.float32: 8.26e13, + "tfloat32": 8.26e13, + torch.bfloat16: 8.26e13, + torch.float16: 8.26e13, + torch.int8: 6.606e14, + "int4": 1.3212e15, }, "rtx 4080": { - torch.float32: 48.7e12, - "tfloat32": 48.7e12, - torch.bfloat16: 48.7e12, - torch.float16: 48.7e12, - torch.int8: 389.9e12, - "int4": 779.8e12, + torch.float32: 4.87e13, + "tfloat32": 4.87e13, + torch.bfloat16: 4.87e13, + torch.float16: 4.87e13, + torch.int8: 3.899e14, + "int4": 7.798e14, }, "rtx 4080 super": { - torch.float32: 52.2e12, - "tfloat32": 52.2e12, - torch.bfloat16: 52.2e12, - torch.float16: 52.2e12, - torch.int8: 417.6e12, - "int4": 835.2e12, + torch.float32: 5.22e13, + "tfloat32": 5.22e13, + torch.bfloat16: 5.22e13, + torch.float16: 5.22e13, + torch.int8: 4.176e14, + "int4": 8.352e14, }, "l4": { - torch.float32: 30.3e12, - "tfloat32": 60e12, - torch.bfloat16: 121e12, - torch.float16: 121e12, - torch.int8: 242e12, - "int4": 484e12, + torch.float32: 3.03e13, + "tfloat32": 6.0e13, + torch.bfloat16: 1.21e14, + torch.float16: 1.21e14, + torch.int8: 2.42e14, + "int4": 4.84e14, }, "l40": { - torch.float32: 90.5e12, - "tfloat32": 90.5e12, - 
torch.bfloat16: 181e12, - torch.float16: 181e12, - torch.int8: 362e12, - "int4": 724e12, + torch.float32: 9.05e13, + "tfloat32": 9.05e13, + torch.bfloat16: 1.81e14, + torch.float16: 1.81e14, + torch.int8: 3.62e14, + "int4": 7.24e14, }, # Ampere # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf # sxm and pcie have same flop counts "a100": { torch.float64: 9.7e12, - torch.float32: 19.5e12, - "tfloat32": 156e12, - torch.bfloat16: 312e12, - torch.float16: 312e12, - torch.int8: 624e12, + torch.float32: 1.95e13, + "tfloat32": 1.56e14, + torch.bfloat16: 3.12e14, + torch.float16: 3.12e14, + torch.int8: 6.24e14, }, "a6000": { - torch.float32: 38.7e12, - "tfloat32": 77.4e12, - torch.bfloat16: 38.7e12, - torch.float16: 38.7e12, - torch.int8: 309.7e12, - "int4": 619.3e12, + torch.float32: 3.87e13, + "tfloat32": 7.74e13, + torch.bfloat16: 3.87e13, + torch.float16: 3.87e13, + torch.int8: 3.097e14, + "int4": 6.193e14, }, + # source: https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf "a40": { - torch.float32: 37.4e12, - "tfloat32": 74.8e12, - torch.bfloat16: 37.4e12, - torch.float16: 37.4e12, - torch.int8: 299.3e12, - "int4": 598.7e12, + torch.float32: 3.74e13, + "tfloat32": 7.48e13, + torch.bfloat16: 1.497e14, + torch.float16: 1.497e14, + torch.int8: 2.993e14, + "int4": 5.987e14, }, # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf "a10g": { - torch.float32: 31.2e12, - "tfloat32": 62.5e12, - torch.bfloat16: 125e12, - torch.float16: 125e12, - torch.int8: 250e12, - "int4": 500e12, + torch.float32: 3.12e13, + "tfloat32": 6.25e13, + torch.bfloat16: 1.25e14, + torch.float16: 1.25e14, + torch.int8: 2.5e14, + "int4": 5.0e14, }, "rtx 3090 ti": { - torch.float32: 40e12, - "tfloat32": 40e12, - torch.bfloat16: 40e12, - torch.float16: 40e12, - torch.int8: 320e12, - "int4": 640e12, + torch.float32: 4.0e13, + "tfloat32": 4.0e13, + 
torch.bfloat16: 4.0e13, + torch.float16: 4.0e13, + torch.int8: 3.2e14, + "int4": 6.4e14, }, "rtx 3090": { - torch.float32: 35.6e12, - "tfloat32": 35.6e12, - torch.bfloat16: 35.6e12, - torch.float16: 35.6e12, - torch.int8: 284e12, - "int4": 568e12, + torch.float32: 3.56e13, + "tfloat32": 3.56e13, + torch.bfloat16: 3.56e13, + torch.float16: 3.56e13, + torch.int8: 2.84e14, + "int4": 5.68e14, }, "rtx 3080 ti": { - torch.float32: 34.1e12, - "tfloat32": 34.1e12, - torch.bfloat16: 34.1e12, - torch.float16: 34.1e12, - torch.int8: 272.8e12, - "int4": 546.6e12, + torch.float32: 3.41e13, + "tfloat32": 3.41e13, + torch.bfloat16: 3.41e13, + torch.float16: 3.41e13, + torch.int8: 2.728e14, + "int4": 5.466e14, }, + # source: https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf "rtx 3080": { - torch.float32: 29.8e12, - "tfloat32": 29.8e12, - torch.bfloat16: 29.8e12, - torch.float16: 29.8e12, - torch.int8: 238e12, - "int4": 476e12, + torch.float32: 2.98e13, + "tfloat32": 2.98e13, + torch.bfloat16: 2.98e13, + torch.float16: 2.98e13, + torch.int8: 2.38e14, + "int4": 4.76e14, }, "rtx 3070": { - torch.float32: 20.3e12, - "tfloat32": 20.3e12, - torch.bfloat16: 20.3e12, - torch.float16: 20.3e12, - torch.int8: 162.6e12, - "int4": 325.2e12, + torch.float32: 2.03e13, + "tfloat32": 2.03e13, + torch.bfloat16: 2.03e13, + torch.float16: 2.03e13, + torch.int8: 1.626e14, + "int4": 3.252e14, }, # Turing # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf # sxm and pcie have same flop counts "t4": { torch.float32: 8.1e12, - torch.float16: 65e12, - torch.int8: 130e12, - "int4": 260e12, + torch.float16: 6.5e13, + torch.int8: 1.3e14, + "int4": 2.6e14, }, # https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf "quadro rtx 5000": { - torch.float32: 11.2e12, - torch.float16: 89.2e12, + torch.float32: 
1.12e13, + torch.float16: 8.92e13, }, "rtx 2080 super": { - torch.float32: 11.2e12, - torch.float16: 22.3e12, - torch.int8: 178.4e12, - "int4": 356.8e12, + torch.float32: 1.12e13, + torch.float16: 2.23e13, + torch.int8: 1.784e14, + "int4": 3.568e14, }, "rtx 2080 ti": { - torch.float32: 14.2e12, - torch.float16: 28.5e12, - torch.int8: 227.7e12, - "int4": 455.4e12, + torch.float32: 1.42e13, + torch.float16: 2.85e13, + torch.int8: 2.277e14, + "int4": 4.554e14, }, "rtx 2080": { - torch.float32: 10.6e12, - torch.float16: 21.2e12, - torch.int8: 169.6e12, - "int4": 339.1e12, + torch.float32: 1.06e13, + torch.float16: 2.12e13, + torch.int8: 1.696e14, + "int4": 3.391e14, }, # https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf "rtx 2070 super": { torch.float32: 9.1e12, - torch.float16: 18.1e12, - torch.int8: 145e12, - "int4": 290e12, + torch.float16: 1.81e13, + torch.int8: 1.45e14, + "int4": 2.9e14, }, "titan rtx": { - torch.float32: 16.3e12, - torch.float16: 32.6e12, - torch.int8: 261e12, - "int4": 522e12, + torch.float32: 1.63e13, + torch.float16: 3.26e13, + torch.int8: 2.61e14, + "int4": 5.22e14, }, # Volta # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf "v100 sxm": { torch.float64: 7.8e12, - torch.float32: 15.7e12, - torch.float16: 125e12, + torch.float32: 1.57e13, + torch.float16: 1.25e14, }, "v100 pcie": { - torch.float64: 7e12, - torch.float32: 14e12, - torch.float16: 112e12, + torch.float64: 7.0e12, + torch.float32: 1.4e13, + torch.float16: 1.12e14, }, "v100s pcie": { torch.float64: 8.2e12, - torch.float32: 16.4e12, - torch.float16: 130e12, + torch.float32: 1.64e13, + torch.float16: 1.3e14, }, } diff --git a/tests/tests_fabric/utilities/test_throughput.py b/tests/tests_fabric/utilities/test_throughput.py index a175fa97fd444..a493e18a0306e 100644 --- a/tests/tests_fabric/utilities/test_throughput.py +++ b/tests/tests_fabric/utilities/test_throughput.py @@ -1,3 +1,4 
@@ +import warnings from unittest import mock from unittest.mock import Mock, call @@ -183,6 +184,60 @@ def test_throughput(): throughput.update(time=0, batches=2, samples=2, lengths=1) +def test_throughput_sparse_model_scaling(): + """``using_sparse_model`` scales ``available_flops`` by the acceleration factor.""" + # explicit False — available_flops untouched (also avoids the unset-flag warning) + assert Throughput(available_flops=100.0, using_sparse_model=False).available_flops == 100.0 + + # sparse flag on with default 2.0 factor + assert Throughput(available_flops=100.0, using_sparse_model=True).available_flops == 200.0 + + # sparse flag on with custom factor + throughput = Throughput(available_flops=100.0, using_sparse_model=True, sparse_cuda_acceleration_factor=4.0) + assert throughput.available_flops == 400.0 + + # sparse flag on with unknown peak — stays None, no error (MFU is skipped in compute()) + assert Throughput(available_flops=None, using_sparse_model=True).available_flops is None + + # factor below 1.0 is physically meaningless (sparsity can never lower the peak) + with pytest.raises(AssertionError, match="sparse acceleration factor cannot reduce"): + Throughput(available_flops=100.0, using_sparse_model=True, sparse_cuda_acceleration_factor=0.5) + + +def test_throughput_sparse_model_warning(): + """When ``using_sparse_model`` is unset and a peak is known, warn so MFU is not silently ambiguous.""" + # warning fires only when the peak is known and the user didn't specify intent + with pytest.warns(UserWarning, match="MFU assumes dense model FLOPs"): + throughput = Throughput(available_flops=100.0) + assert throughput.available_flops == 100.0 # dense default, no scaling applied + + # explicit choice (True or False) silences the warning + with warnings.catch_warnings(): + warnings.simplefilter("error") + Throughput(available_flops=100.0, using_sparse_model=False) + Throughput(available_flops=100.0, using_sparse_model=True) + + # no peak known → MFU is 
never computed, so no warning is needed + with warnings.catch_warnings(): + warnings.simplefilter("error") + Throughput(available_flops=None) + + +def test_throughput_sparse_model_mfu(): + """MFU denominator reflects the sparse-scaled peak, so MFU halves when the peak doubles.""" + # baseline dense: mfu = flops_per_sec / available_flops = 10 / 50 = 0.2 + throughput = Throughput(available_flops=50, window_size=2, using_sparse_model=False) + throughput.update(time=1, batches=1, samples=2, flops=10) + throughput.update(time=2, batches=2, samples=4, flops=10) + assert throughput.compute()["device/mfu"] == 0.2 + + # sparse: peak doubles to 100, so mfu = 10 / 100 = 0.1 + throughput = Throughput(available_flops=50, window_size=2, using_sparse_model=True) + throughput.update(time=1, batches=1, samples=2, flops=10) + throughput.update(time=2, batches=2, samples=4, flops=10) + assert throughput.compute()["device/mfu"] == 0.1 + + def mock_train_loop(monitor): # simulate lit-gpt style loop total_lengths = 0 @@ -318,6 +373,17 @@ def test_throughput_monitor_world_size(): ] +def test_throughput_monitor_sparse_model(): + """``using_sparse_model`` and ``sparse_cuda_acceleration_factor`` propagate through ThroughputMonitor.""" + fabric_mock = Mock() + fabric_mock.world_size = 1 + fabric_mock.strategy.precision = Precision() + with mock.patch("lightning.fabric.utilities.throughput.get_available_flops", return_value=100): + monitor = ThroughputMonitor(fabric_mock, using_sparse_model=True, sparse_cuda_acceleration_factor=2.0) + # 100 (from hardware lookup) scaled by 2.0 (sparse factor) → 200 + assert monitor.available_flops == 200 + + def test_monotonic_window(): w = _MonotonicWindow(maxlen=3) assert w == []