Skip to content
304 changes: 162 additions & 142 deletions src/lightning/fabric/utilities/throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,30 @@ class Throughput:
world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
window_size: Number of batches to use for a rolling average.
separator: Key separator to use when creating per-device and global metrics.
using_sparse_model: Whether the model uses structured sparsity. If ``True``, scales ``available_flops`` by
``sparse_cuda_acceleration_factor``. ``False`` assumes a dense model. ``None`` (default) also assumes a
dense model and emits a warning when ``available_flops`` is set.
sparse_cuda_acceleration_factor: Multiplier applied to ``available_flops`` when ``using_sparse_model`` is
``True``. Must be at least ``1.0``. Defaults to ``2.0`` for NVIDIA 2:4 structured sparsity.

"""

def __init__(
self, available_flops: Optional[float] = None, world_size: int = 1, window_size: int = 100, separator: str = "/"
self,
available_flops: Optional[float] = None,
world_size: int = 1,
window_size: int = 100,
separator: str = "/",
using_sparse_model: Optional[bool] = None,
sparse_cuda_acceleration_factor: float = 2.0,
) -> None:
assert sparse_cuda_acceleration_factor >= 1.0, "sparse acceleration factor cannot reduce peak FLOPs"
if using_sparse_model is None and available_flops is not None:
rank_zero_warn(
"MFU assumes dense model FLOPs (no sparsity acceleration)."
" Set 'using_sparse_model=True' for mfu to use sparse flops."
)
Comment thread
deependujha marked this conversation as resolved.
if using_sparse_model and available_flops is not None:
available_flops = available_flops * sparse_cuda_acceleration_factor
self.available_flops = available_flops
self.separator = separator
assert world_size > 0
Expand Down Expand Up @@ -308,223 +326,225 @@ def measure_flops(
"h200 sxm1": {
torch.float64: 3.4e13,
torch.float32: 6.7e13,
"tfloat32": 9.9e14,
torch.bfloat16: 2.0e15,
torch.float16: 2.0e15,
torch.int8: 4.0e15,
"tfloat32": 4.945e14,
torch.bfloat16: 9.895e14,
torch.float16: 9.895e14,
torch.int8: 1.979e15,
},
"h200 nvl1": {
torch.float64: 3.0e13,
torch.float32: 6.0e13,
"tfloat32": 8.4e14,
torch.bfloat16: 1.7e15,
torch.float16: 1.7e15,
torch.int8: 3.3e15,
"tfloat32": 4.2e14,
torch.bfloat16: 8.4e14,
torch.float16: 8.4e14,
torch.int8: 1.68e15,
},
# source: https://resources.nvidia.com/en-us-tensor-core
# source: https://resources.nvidia.com/en-us-gpu-resources/h100-datasheet-24306
"h100 nvl": {
torch.float64: 67e12,
torch.float32: 133.8e12,
"tfloat32": 989.4e12,
torch.bfloat16: 1978.8e12,
torch.float16: 1978.8e12,
torch.int8: 3957.8e12,
torch.float64: 3.0e13,
torch.float32: 6.0e13,
"tfloat32": 4.175e14,
torch.bfloat16: 8.355e14,
torch.float16: 8.355e14,
torch.int8: 1.6705e15,
},
"h100 sxm": {
torch.float64: 33.5e12,
torch.float32: 66.9e12,
"tfloat32": 494.7e12,
torch.bfloat16: 989.4e12,
torch.float16: 989.4e12,
torch.int8: 1978.9e12,
torch.float64: 3.4e13,
torch.float32: 6.7e13,
"tfloat32": 4.945e14,
torch.bfloat16: 9.895e14,
torch.float16: 9.895e14,
torch.int8: 1.979e15,
},
"h100 pcie": {
torch.float64: 25.6e12,
torch.float32: 51.2e12,
"tfloat32": 378e12,
torch.bfloat16: 756e12,
torch.float16: 756e12,
torch.int8: 1513e12,
torch.float64: 2.56e13,
torch.float32: 5.12e13,
"tfloat32": 3.78e14,
torch.bfloat16: 7.56e14,
torch.float16: 7.56e14,
torch.int8: 1.513e15,
},
# Ada
# source: https://images.nvidia.com/aem-dam/Solutions/Data-Center/l4/nvidia-ada-gpu-architecture-whitepaper-v2.1.pdf
"rtx 4090": {
torch.float32: 82.6e12,
"tfloat32": 82.6e12,
torch.bfloat16: 82.6e12,
torch.float16: 82.6e12,
torch.int8: 660.6e12,
"int4": 1321.2e12,
torch.float32: 8.26e13,
"tfloat32": 8.26e13,
torch.bfloat16: 8.26e13,
torch.float16: 8.26e13,
torch.int8: 6.606e14,
"int4": 1.3212e15,
},
"rtx 4080": {
torch.float32: 48.7e12,
"tfloat32": 48.7e12,
torch.bfloat16: 48.7e12,
torch.float16: 48.7e12,
torch.int8: 389.9e12,
"int4": 779.8e12,
torch.float32: 4.87e13,
"tfloat32": 4.87e13,
torch.bfloat16: 4.87e13,
torch.float16: 4.87e13,
torch.int8: 3.899e14,
"int4": 7.798e14,
},
"rtx 4080 super": {
torch.float32: 52.2e12,
"tfloat32": 52.2e12,
torch.bfloat16: 52.2e12,
torch.float16: 52.2e12,
torch.int8: 417.6e12,
"int4": 835.2e12,
torch.float32: 5.22e13,
"tfloat32": 5.22e13,
torch.bfloat16: 5.22e13,
torch.float16: 5.22e13,
torch.int8: 4.176e14,
"int4": 8.352e14,
},
"l4": {
torch.float32: 30.3e12,
"tfloat32": 60e12,
torch.bfloat16: 121e12,
torch.float16: 121e12,
torch.int8: 242e12,
"int4": 484e12,
torch.float32: 3.03e13,
"tfloat32": 6.0e13,
torch.bfloat16: 1.21e14,
torch.float16: 1.21e14,
torch.int8: 2.42e14,
"int4": 4.84e14,
},
"l40": {
torch.float32: 90.5e12,
"tfloat32": 90.5e12,
torch.bfloat16: 181e12,
torch.float16: 181e12,
torch.int8: 362e12,
"int4": 724e12,
torch.float32: 9.05e13,
"tfloat32": 9.05e13,
torch.bfloat16: 1.81e14,
torch.float16: 1.81e14,
torch.int8: 3.62e14,
"int4": 7.24e14,
},
# Ampere
# source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
# sxm and pcie have same flop counts
"a100": {
torch.float64: 9.7e12,
torch.float32: 19.5e12,
"tfloat32": 156e12,
torch.bfloat16: 312e12,
torch.float16: 312e12,
torch.int8: 624e12,
torch.float32: 1.95e13,
"tfloat32": 1.56e14,
torch.bfloat16: 3.12e14,
torch.float16: 3.12e14,
torch.int8: 6.24e14,
},
"a6000": {
torch.float32: 38.7e12,
"tfloat32": 77.4e12,
torch.bfloat16: 38.7e12,
torch.float16: 38.7e12,
torch.int8: 309.7e12,
"int4": 619.3e12,
torch.float32: 3.87e13,
"tfloat32": 7.74e13,
torch.bfloat16: 3.87e13,
torch.float16: 3.87e13,
torch.int8: 3.097e14,
"int4": 6.193e14,
},
# source: https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
"a40": {
torch.float32: 37.4e12,
"tfloat32": 74.8e12,
torch.bfloat16: 37.4e12,
torch.float16: 37.4e12,
torch.int8: 299.3e12,
"int4": 598.7e12,
torch.float32: 3.74e13,
"tfloat32": 7.48e13,
torch.bfloat16: 1.497e14,
torch.float16: 1.497e14,
torch.int8: 2.993e14,
"int4": 5.987e14,
},
# source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf
"a10g": {
torch.float32: 31.2e12,
"tfloat32": 62.5e12,
torch.bfloat16: 125e12,
torch.float16: 125e12,
torch.int8: 250e12,
"int4": 500e12,
torch.float32: 3.12e13,
"tfloat32": 6.25e13,
torch.bfloat16: 1.25e14,
torch.float16: 1.25e14,
torch.int8: 2.5e14,
"int4": 5.0e14,
},
"rtx 3090 ti": {
torch.float32: 40e12,
"tfloat32": 40e12,
torch.bfloat16: 40e12,
torch.float16: 40e12,
torch.int8: 320e12,
"int4": 640e12,
torch.float32: 4.0e13,
"tfloat32": 4.0e13,
torch.bfloat16: 4.0e13,
torch.float16: 4.0e13,
torch.int8: 3.2e14,
"int4": 6.4e14,
},
"rtx 3090": {
torch.float32: 35.6e12,
"tfloat32": 35.6e12,
torch.bfloat16: 35.6e12,
torch.float16: 35.6e12,
torch.int8: 284e12,
"int4": 568e12,
torch.float32: 3.56e13,
"tfloat32": 3.56e13,
torch.bfloat16: 3.56e13,
torch.float16: 3.56e13,
torch.int8: 2.84e14,
"int4": 5.68e14,
},
"rtx 3080 ti": {
torch.float32: 34.1e12,
"tfloat32": 34.1e12,
torch.bfloat16: 34.1e12,
torch.float16: 34.1e12,
torch.int8: 272.8e12,
"int4": 546.6e12,
torch.float32: 3.41e13,
"tfloat32": 3.41e13,
torch.bfloat16: 3.41e13,
torch.float16: 3.41e13,
torch.int8: 2.728e14,
"int4": 5.466e14,
},
# source: https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
"rtx 3080": {
torch.float32: 29.8e12,
"tfloat32": 29.8e12,
torch.bfloat16: 29.8e12,
torch.float16: 29.8e12,
torch.int8: 238e12,
"int4": 476e12,
torch.float32: 2.98e13,
"tfloat32": 2.98e13,
torch.bfloat16: 2.98e13,
torch.float16: 2.98e13,
torch.int8: 2.38e14,
"int4": 4.76e14,
},
"rtx 3070": {
torch.float32: 20.3e12,
"tfloat32": 20.3e12,
torch.bfloat16: 20.3e12,
torch.float16: 20.3e12,
torch.int8: 162.6e12,
"int4": 325.2e12,
torch.float32: 2.03e13,
"tfloat32": 2.03e13,
torch.bfloat16: 2.03e13,
torch.float16: 2.03e13,
torch.int8: 1.626e14,
"int4": 3.252e14,
},
# Turing
# source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf
# sxm and pcie have same flop counts
"t4": {
torch.float32: 8.1e12,
torch.float16: 65e12,
torch.int8: 130e12,
"int4": 260e12,
torch.float16: 6.5e13,
torch.int8: 1.3e14,
"int4": 2.6e14,
},
# source: https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf
"quadro rtx 5000": {
torch.float32: 11.2e12,
torch.float16: 89.2e12,
torch.float32: 1.12e13,
torch.float16: 8.92e13,
},
"rtx 2080 super": {
torch.float32: 11.2e12,
torch.float16: 22.3e12,
torch.int8: 178.4e12,
"int4": 356.8e12,
torch.float32: 1.12e13,
torch.float16: 2.23e13,
torch.int8: 1.784e14,
"int4": 3.568e14,
},
"rtx 2080 ti": {
torch.float32: 14.2e12,
torch.float16: 28.5e12,
torch.int8: 227.7e12,
"int4": 455.4e12,
torch.float32: 1.42e13,
torch.float16: 2.85e13,
torch.int8: 2.277e14,
"int4": 4.554e14,
},
"rtx 2080": {
torch.float32: 10.6e12,
torch.float16: 21.2e12,
torch.int8: 169.6e12,
"int4": 339.1e12,
torch.float32: 1.06e13,
torch.float16: 2.12e13,
torch.int8: 1.696e14,
"int4": 3.391e14,
},
# source: https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf
"rtx 2070 super": {
torch.float32: 9.1e12,
torch.float16: 18.1e12,
torch.int8: 145e12,
"int4": 290e12,
torch.float16: 1.81e13,
torch.int8: 1.45e14,
"int4": 2.9e14,
},
"titan rtx": {
torch.float32: 16.3e12,
torch.float16: 32.6e12,
torch.int8: 261e12,
"int4": 522e12,
torch.float32: 1.63e13,
torch.float16: 3.26e13,
torch.int8: 2.61e14,
"int4": 5.22e14,
},
# Volta
# source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf
"v100 sxm": {
torch.float64: 7.8e12,
torch.float32: 15.7e12,
torch.float16: 125e12,
torch.float32: 1.57e13,
torch.float16: 1.25e14,
},
"v100 pcie": {
torch.float64: 7e12,
torch.float32: 14e12,
torch.float16: 112e12,
torch.float64: 7.0e12,
torch.float32: 1.4e13,
torch.float16: 1.12e14,
},
"v100s pcie": {
torch.float64: 8.2e12,
torch.float32: 16.4e12,
torch.float16: 130e12,
torch.float32: 1.64e13,
torch.float16: 1.3e14,
},
}

Expand Down
Loading
Loading