Skip to content
81 changes: 55 additions & 26 deletions src/lightning/fabric/utilities/throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,39 @@ class Throughput:
world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
window_size: Number of batches to use for a rolling average.
separator: Key separator to use when creating per-device and global metrics.
using_sparse_model: User intent for whether the model exploits structured sparsity.
Comment thread
Devesh-Maheshwari marked this conversation as resolved.
Outdated
``True`` scales ``available_flops`` by ``sparse_cuda_acceleration_factor`` to reflect the higher
theoretical peak achievable with structured sparsity (e.g. NVIDIA's 2:4 sparse Tensor Cores on
Ampere and later). ``False`` keeps the dense peak. ``None`` (the default) keeps the dense peak
and emits a one-time warning so users explicitly choose; set this flag to silence the warning.
sparse_cuda_acceleration_factor: Multiplier applied to ``available_flops`` when ``using_sparse_model``
is ``True``. Defaults to ``2.0``, matching the 2x speedup of NVIDIA 2:4 structured sparsity.
Comment thread
Devesh-Maheshwari marked this conversation as resolved.
Outdated

"""

def __init__(
self, available_flops: Optional[float] = None, world_size: int = 1, window_size: int = 100, separator: str = "/"
self,
available_flops: Optional[float] = None,
world_size: int = 1,
window_size: int = 100,
separator: str = "/",
using_sparse_model: Optional[bool] = None,
sparse_cuda_acceleration_factor: float = 2.0,
) -> None:
# For sparse models, hardware can achieve a higher theoretical peak (e.g. NVIDIA's 2:4 structured
# sparsity doubles tensor-core throughput). Scaling the peak here keeps MFU = measured / peak
# consistent with the sparse ceiling, so users get a meaningful utilization figure.
# If `available_flops is None` the peak is unknown, so MFU is skipped in `compute()` regardless
# (see the `if self.available_flops:` guard), and we leave it as None rather than erroring.
Comment thread
Devesh-Maheshwari marked this conversation as resolved.
Outdated
assert sparse_cuda_acceleration_factor >= 1.0, "sparse acceleration factor cannot reduce peak FLOPs"
if using_sparse_model is None and available_flops is not None:
# the user didn't tell us their intent; default to dense and warn so MFU isn't silently ambiguous
Comment thread
Devesh-Maheshwari marked this conversation as resolved.
Outdated
rank_zero_warn(
"MFU assumes dense model FLOPs (no sparsity acceleration)."
" Set 'using_sparse_model=True' for mfu to use sparse flops."
)
Comment thread
deependujha marked this conversation as resolved.
if using_sparse_model and available_flops is not None:
available_flops = available_flops * sparse_cuda_acceleration_factor
self.available_flops = available_flops
self.separator = separator
assert world_size > 0
Expand Down Expand Up @@ -306,37 +333,37 @@ def measure_flops(
# Hopper
# source: https://nvdam.widen.net/s/nb5zzzsjdf/hpc-datasheet-sc23-h200-datasheet-3002446
"h200 sxm1": {
torch.float64: 3.4e13,
torch.float32: 6.7e13,
"tfloat32": 9.9e14,
torch.bfloat16: 2.0e15,
torch.float16: 2.0e15,
torch.int8: 4.0e15,
torch.float64: 34e12,
Comment thread
deependujha marked this conversation as resolved.
Outdated
torch.float32: 67e12,
"tfloat32": 494.5e12,
torch.bfloat16: 989.5e12,
torch.float16: 989.5e12,
torch.int8: 1979e12,
},
"h200 nvl1": {
torch.float64: 3.0e13,
torch.float32: 6.0e13,
"tfloat32": 8.4e14,
torch.bfloat16: 1.7e15,
torch.float16: 1.7e15,
torch.int8: 3.3e15,
"tfloat32": 4.2e14,
torch.bfloat16: 8.4e14,
torch.float16: 8.4e14,
torch.int8: 1.68e15,
},
# source: https://resources.nvidia.com/en-us-tensor-core
# source: https://resources.nvidia.com/en-us-gpu-resources/h100-datasheet-24306
"h100 nvl": {
torch.float64: 67e12,
torch.float32: 133.8e12,
"tfloat32": 989.4e12,
torch.bfloat16: 1978.8e12,
torch.float16: 1978.8e12,
torch.int8: 3957.8e12,
torch.float64: 30e12,
torch.float32: 60e12,
"tfloat32": 417.5e12,
torch.bfloat16: 835.5e12,
torch.float16: 835.5e12,
torch.int8: 1670.5e12,
},
"h100 sxm": {
torch.float64: 33.5e12,
torch.float32: 66.9e12,
"tfloat32": 494.7e12,
torch.bfloat16: 989.4e12,
torch.float16: 989.4e12,
torch.int8: 1978.9e12,
torch.float64: 34e12,
torch.float32: 67e12,
"tfloat32": 494.5e12,
torch.bfloat16: 989.5e12,
torch.float16: 989.5e12,
torch.int8: 1979e12,
},
"h100 pcie": {
torch.float64: 25.6e12,
Expand Down Expand Up @@ -407,11 +434,12 @@ def measure_flops(
torch.int8: 309.7e12,
"int4": 619.3e12,
},
# source: https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
"a40": {
torch.float32: 37.4e12,
"tfloat32": 74.8e12,
torch.bfloat16: 37.4e12,
torch.float16: 37.4e12,
torch.bfloat16: 149.7e12,
torch.float16: 149.7e12,
torch.int8: 299.3e12,
"int4": 598.7e12,
},
Expand Down Expand Up @@ -448,6 +476,7 @@ def measure_flops(
torch.int8: 272.8e12,
"int4": 546.6e12,
},
# source: https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
"rtx 3080": {
torch.float32: 29.8e12,
"tfloat32": 29.8e12,
Expand Down
66 changes: 66 additions & 0 deletions tests/tests_fabric/utilities/test_throughput.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from unittest import mock
from unittest.mock import Mock, call

Expand Down Expand Up @@ -183,6 +184,60 @@ def test_throughput():
throughput.update(time=0, batches=2, samples=2, lengths=1)


def test_throughput_sparse_model_scaling():
    """Check how ``using_sparse_model`` and the acceleration factor transform ``available_flops``."""
    # each entry: (constructor kwargs, expected ``available_flops`` on the instance)
    cases = [
        # explicit False leaves the peak untouched (and does not trigger the unset-flag warning)
        ({"available_flops": 100.0, "using_sparse_model": False}, 100.0),
        # sparse enabled with the default factor of 2.0
        ({"available_flops": 100.0, "using_sparse_model": True}, 200.0),
        # sparse enabled with a user-supplied factor
        ({"available_flops": 100.0, "using_sparse_model": True, "sparse_cuda_acceleration_factor": 4.0}, 400.0),
    ]
    for kwargs, expected_peak in cases:
        assert Throughput(**kwargs).available_flops == expected_peak

    # an unknown peak stays ``None`` even when sparse is requested; no error is raised
    assert Throughput(available_flops=None, using_sparse_model=True).available_flops is None

    # a factor under 1.0 would lower the peak, which the constructor rejects
    with pytest.raises(AssertionError, match="sparse acceleration factor cannot reduce"):
        Throughput(available_flops=100.0, using_sparse_model=True, sparse_cuda_acceleration_factor=0.5)


def test_throughput_sparse_model_warning():
    """A known peak with ``using_sparse_model`` left unset warns; every other combination stays silent."""
    # peak known + intent unspecified -> warning is emitted and the dense peak is kept unscaled
    with pytest.warns(UserWarning, match="MFU assumes dense model FLOPs"):
        dense_default = Throughput(available_flops=100.0)
    assert dense_default.available_flops == 100.0

    # escalate warnings to errors so any unexpected warning fails the test
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        # stating the intent explicitly (either way) suppresses the warning
        for explicit_choice in (False, True):
            Throughput(available_flops=100.0, using_sparse_model=explicit_choice)
        # without a known peak no MFU is computed, so there is nothing to warn about
        Throughput(available_flops=None)


def test_throughput_sparse_model_mfu():
    """MFU is divided by the sparse-scaled peak, so it halves when the peak doubles.

    The MFU values are results of floating-point division, so they are compared with ``pytest.approx``
    rather than exact equality; this keeps the test independent of how ``compute()`` orders its arithmetic.

    """

    def _measure_mfu(using_sparse_model):
        # identical two-step update sequence for both configurations; only the sparse flag differs
        throughput = Throughput(available_flops=50, window_size=2, using_sparse_model=using_sparse_model)
        throughput.update(time=1, batches=1, samples=2, flops=10)
        throughput.update(time=2, batches=2, samples=4, flops=10)
        return throughput.compute()["device/mfu"]

    # baseline dense: mfu = flops_per_sec / available_flops = 10 / 50 = 0.2
    assert _measure_mfu(False) == pytest.approx(0.2)

    # sparse: peak doubles to 100, so mfu = 10 / 100 = 0.1
    assert _measure_mfu(True) == pytest.approx(0.1)


def mock_train_loop(monitor):
# simulate lit-gpt style loop
total_lengths = 0
Expand Down Expand Up @@ -318,6 +373,17 @@ def test_throughput_monitor_world_size():
]


def test_throughput_monitor_sparse_model():
    """The sparse flag and acceleration factor given to ``ThroughputMonitor`` scale its ``available_flops``."""
    fabric = Mock(world_size=1)
    fabric.strategy.precision = Precision()

    # replace the hardware lookup so the dense peak is a fixed 100
    flops_lookup = "lightning.fabric.utilities.throughput.get_available_flops"
    with mock.patch(flops_lookup, return_value=100):
        monitor = ThroughputMonitor(fabric, using_sparse_model=True, sparse_cuda_acceleration_factor=2.0)

    # patched peak of 100 multiplied by the sparse factor of 2.0
    assert monitor.available_flops == 200


def test_monotonic_window():
w = _MonotonicWindow(maxlen=3)
assert w == []
Expand Down
Loading