We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent b32ad27 commit bb6fb12 | Copy full SHA for bb6fb12
1 file changed
examples/distributed_training/mfu_benchmark.py
@@ -134,7 +134,8 @@ def train_step():
134
elapsed = elapsed_tensor.item()
135
136
step_seconds = elapsed / measure_steps
137
- # Linear layer training FLOPs: forward matmul + dInput + dWeight.
+ # Linear layer training FLOPs: forward matmul + dInput + dWeight (6ND rule).
138
+ # See PaLM paper, Appendix B: https://arxiv.org/abs/2204.02311
139
flops_per_rank_step = 6.0 * batch_size * hidden_size * hidden_size * layers
140
tflops_per_gpu = flops_per_rank_step / step_seconds / 1e12
141
device_name = torch.cuda.get_device_name(device)
0 commit comments