@@ -88,13 +88,10 @@ class Throughput:
8888 world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
8989 window_size: Number of batches to use for a rolling average.
9090 separator: Key separator to use when creating per-device and global metrics.
91- using_sparse_model: User intent for whether the model uses structured sparsity.
92- ``True`` scales ``available_flops`` by ``sparse_cuda_acceleration_factor`` to reflect the higher
93- theoretical peak achievable with structured sparsity (e.g. NVIDIA's 2:4 sparse Tensor Cores on
94- Ampere and later). ``False`` keeps the dense peak. ``None`` (the default) keeps the dense peak
95- and emits a one-time warning so users explicitly choose; set this flag to silence the warning.
96- sparse_cuda_acceleration_factor: Multiplier applied to ``available_flops`` when ``using_sparse_model``
97- is ``True``. Defaults to ``2.0``, matching the 2x speedup of NVIDIA 2:4 structured sparsity.
91+ using_sparse_model: Whether the model uses structured sparsity. If ``True``, scales ``available_flops`` by
92+ ``sparse_cuda_acceleration_factor``. ``None`` (default) assumes dense and warns once when ``available_flops`` is not ``None``.
93+ sparse_cuda_acceleration_factor: Multiplier applied to ``available_flops`` when ``using_sparse_model`` is
94+ ``True``. Defaults to ``2.0`` for NVIDIA 2:4 structured sparsity.
9895
9996 """
10097
@@ -107,14 +104,8 @@ def __init__(
107104 using_sparse_model : Optional [bool ] = None ,
108105 sparse_cuda_acceleration_factor : float = 2.0 ,
109106 ) -> None :
110- # For sparse models, hardware can achieve a higher theoretical peak (e.g. NVIDIA's 2:4 structured
111- # sparsity doubles tensor-core throughput). Scaling the peak here keeps MFU = measured / peak
112- # consistent with the sparse ceiling, so users get a meaningful utilization figure.
113- # If `available_flops is None` the peak is unknown, so MFU is skipped in `compute()` regardless
114- # (see the `if self.available_flops:` guard), and we leave it as None rather than erroring.
115107 assert sparse_cuda_acceleration_factor >= 1.0 , "sparse acceleration factor cannot reduce peak FLOPs"
116108 if using_sparse_model is None and available_flops is not None :
117- # the user didn't tell us their intent; default to dense and warn so MFU isn't silently ambiguous
118109 rank_zero_warn (
119110 "MFU assumes dense model FLOPs (no sparsity acceleration)."
120111 " Set 'using_sparse_model=True' for mfu to use sparse flops."
0 commit comments