
Commit ce1f3f3

Merge branch 'main' into sbosisio/cuda-dl-base

2 parents b2b5bcd + e57ade9

File tree: 12 files changed, +32 −15 lines

.github/container/Dockerfile.jax

Lines changed: 1 addition & 1 deletion
```diff
@@ -102,7 +102,7 @@ RUN <<"EOF" bash -ex
 for component in $(ls ${BUILD_PATH_JAXLIB}); do
   echo "-e file://${BUILD_PATH_JAXLIB}/${component}" >> /opt/pip-tools.d/requirements-jax.in;
 done
-echo "-e file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/requirements-jax.in
+echo "-e file://${SRC_PATH_JAX}[k8s]" >> /opt/pip-tools.d/requirements-jax.in
 echo "numpy<2.0.0" >> /opt/pip-tools.d/requirements-jax.in
 EOF
```
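The change makes the generated `/opt/pip-tools.d/requirements-jax.in` request JAX's optional `k8s` extra alongside the editable source install. A minimal stand-alone sketch of the equivalent manual install (the `SRC_PATH_JAX` value here is an assumption for illustration, not taken from this diff):

```bash
# Hypothetical equivalent of the generated requirements entry: an editable
# install of JAX from a local checkout, including the optional "k8s" extra.
SRC_PATH_JAX=/opt/jax   # assumed path; point this at your JAX checkout
pip install -e "${SRC_PATH_JAX}[k8s]"
```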

.github/container/test-maxtext.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -233,7 +233,7 @@ fi

 export BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
---xla_gpu_graph_level=0
+--xla_gpu_enable_command_buffer=
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728
```
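Two details worth noting: the `${BASE_XLA_FLAGS:-...}` expansion means the defaults above apply only when the caller has not already set `BASE_XLA_FLAGS`, and the empty value in `--xla_gpu_enable_command_buffer=` disables command buffers, replacing the older `--xla_gpu_graph_level=0` spelling. A minimal usage sketch (the override value is an illustrative assumption):

```bash
# Override the script's default XLA flag set; with no override, the
# defaults shown in the diff above take effect instead.
export BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true"
bash .github/container/test-maxtext.sh
```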

rosetta/docs/GPU_performance.md

Lines changed: 21 additions & 0 deletions
```diff
@@ -140,3 +140,24 @@ The following flags were previously used but are no longer required.
 - --xla_gpu_enable_highest_priority_async_stream ; Turned on by default
 - --xla_gpu_enable_triton_softmax_fusion ; Deprecated, no longer used
```

All 21 added lines render as the following new section:

## Tips for Good LLM Training Performance on Blackwell (B200)

### **Support for Attention Mask Type**

MaxText uses the `padding_causal` mask type for [cuDNN Flash Attention](https://github.com/AI-Hypercomputer/maxtext/blob/6ec3368af31fff6e6d735ac9d5fb77f91fc0f784/MaxText/layers/attentions.py#L411). However, this mask type is not yet supported on Blackwell systems through TransformerEngine; using `padding_causal` therefore falls back to the `unfused_attention` backend, which may reduce performance. As a temporary workaround, use the `causal` mask type to maintain performance.
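A hypothetical sketch of the workaround as a one-line change at the linked call site (the exact variable name and quoting are assumptions, not copied from MaxText source; see the linked `attentions.py` for the real call):

```diff
-attn_mask_type = 'padding_causal'
+attn_mask_type = 'causal'
```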
### **No Need to Set `CUDA_DEVICE_MAX_CONNECTIONS=1`**

Hopper required `CUDA_DEVICE_MAX_CONNECTIONS=1` to achieve good communication-compute overlap. This is not needed on Blackwell and is in fact slower. On Blackwell systems, kernels assigned to higher-priority streams can use SM (Streaming Multiprocessor) resources without waiting for lower-priority kernels to release them, so it is best to leave `CUDA_DEVICE_MAX_CONNECTIONS` at its default value.
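In shell terms, a minimal sketch of the two regimes:

```bash
# Hopper (H100): commonly set to improve communication-compute overlap.
export CUDA_DEVICE_MAX_CONNECTIONS=1

# Blackwell (B200): leave the variable at its default; unset it if an
# H100-era launch script exported it.
unset CUDA_DEVICE_MAX_CONNECTIONS
```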
150+
151+
### **Additional XLA Flags**
152+
Enabling CUDA Graphs only for Fusions and Custom Calls reduces CPU launch latency overheads on B200, ensure that you set the following XLA flags: `--xla_gpu_enable_command_buffer=FUSION,CUSTOM_CALL`
153+
154+
This configuration improves performance on Blackwell systems by leveraging efficient command buffer execution in all the models tested on B200.
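A minimal sketch of appending the setting to whatever flags are already exported:

```bash
# Enable command buffers (CUDA Graphs) only for fusion and custom-call
# commands, as recommended above for B200.
export XLA_FLAGS="${XLA_FLAGS:-} --xla_gpu_enable_command_buffer=FUSION,CUSTOM_CALL"
```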
155+
156+
### **Better Utilizing Additional Memory in Blackwell**
157+
Blackwell (B200) GPUs have a memory capacity of 180GB, significantly more than H100 GPUs. To take full advantage of this additional memory and enhance performance:
158+
159+
- Adjust model parallelism configurations: can use less model parallelism to fit the same model in memory.
160+
- Increase batch sizes where possible: larger batch sizes can improve GeMM kernel efficiency.
161+
- Optimize activation checkpointing policies: fewer activation tensors need to be recomputed in the backward pass on B200.
162+
163+
Careful tuning of these parameters is essential when transitioning from H100 to B200 systems to fully utilize the available resources.
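As a concrete illustration, these knobs map to MaxText config overrides. The values below are placeholders showing the shape of the tuning, not recommendations from this commit:

```bash
# Hypothetical MaxText launch tuned for B200's larger memory: less tensor
# parallelism, a larger per-device batch size, and a cheaper
# rematerialization (activation checkpointing) policy. All values are
# illustrative assumptions to be tuned per model.
python3 MaxText/train.py MaxText/configs/base.yml \
    run_name=my_b200_run \
    ici_tensor_parallelism=1 \
    per_device_batch_size=4 \
    remat_policy=minimal
```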

rosetta/docs/PGLE.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -62,7 +62,7 @@ In order to get the best performance with PGLE, here is a list of all recommended
 ```
 export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
---xla_gpu_graph_level=0
+--xla_gpu_enable_command_buffer=
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
 --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
````
Lines changed: 1 addition & 1 deletion
```diff
@@ -1,2 +1,2 @@
 # These XLA flags are meant to be used with the JAX version in the imagen container
-export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=false --xla_gpu_enable_async_all_gather=false --xla_gpu_enable_async_reduce_scatter=false --xla_gpu_enable_triton_gemm=false --xla_gpu_cuda_graph_level=0 --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_async_all_reduce=false ${XLA_FLAGS}"
+export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=false --xla_gpu_enable_async_all_gather=false --xla_gpu_enable_async_reduce_scatter=false --xla_gpu_enable_triton_gemm=false --xla_gpu_cuda_graph_level=0 --xla_gpu_enable_async_all_reduce=false ${XLA_FLAGS}"
```

rosetta/rosetta/projects/maxtext/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -69,7 +69,7 @@ The [GPU Performance document](../../../docs/GPU_performance.md) provides a deta
 ```
 XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
---xla_gpu_graph_level=0
+--xla_gpu_enable_command_buffer=
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728
````

rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub

Lines changed: 1 addition & 1 deletion
```diff
@@ -54,7 +54,7 @@ export NCCL_IB_SL=1
 # Set XLA Flags
 export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
---xla_gpu_graph_level=0
+--xla_gpu_enable_command_buffer=
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728
```

rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env

Lines changed: 1 addition & 2 deletions
```diff
@@ -5,7 +5,7 @@ THRESHOLD_BYTES=1073741824
 export XLA_FLAGS="\
 --xla_gpu_enable_latency_hiding_scheduler=true \
 --xla_gpu_enable_triton_gemm=false \
---xla_gpu_graph_level=0 \
+--xla_gpu_enable_command_buffer= \
 --xla_gpu_enable_highest_priority_async_stream=true \
 --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
 --xla_gpu_all_gather_combine_threshold_bytes=$((THRESHOLD_BYTES/(NUM_NODES*NUM_GPUS))) \
@@ -14,7 +14,6 @@ export XLA_FLAGS="\
 --xla_gpu_enable_pipelined_reduce_scatter=true \
 --xla_gpu_enable_pipelined_all_reduce=true \
 --xla_gpu_enable_while_loop_double_buffering=true \
---xla_gpu_enable_triton_softmax_fusion=false \
 --xla_gpu_enable_all_gather_combine_by_dim=false \
 --xla_gpu_enable_reduce_scatter_combine_by_dim=false \
 --xla_disable_hlo_passes=rematerialization \
```
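Since the all-gather combine threshold is computed from the node and GPU counts, it can help to sanity-check the arithmetic. A worked example, assuming `NUM_NODES=1` and `NUM_GPUS=8` as the `1N8G` file name suggests (those assignments are assumptions; they are defined elsewhere in the env file, not in this diff):

```bash
# Reproduce the combine-threshold arithmetic from the flags above.
NUM_NODES=1                  # assumed from the "1N8G" file name
NUM_GPUS=8                   # assumed from the "1N8G" file name
THRESHOLD_BYTES=1073741824   # 1 GiB, as set at the top of the env file
echo $((THRESHOLD_BYTES/(NUM_NODES*NUM_GPUS)))   # 134217728 bytes = 128 MiB
```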

rosetta/rosetta/projects/pax/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -141,7 +141,7 @@ For the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_thres
 BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
 --xla_gpu_all_reduce_combine_threshold_bytes=33554432
---xla_gpu_graph_level=0" bash run_pile_multinode.sh ...
+--xla_gpu_enable_command_buffer=" bash run_pile_multinode.sh ...
 ```

 # Configs
````

rosetta/rosetta/projects/pax/xla_flags/common.env

Lines changed: 1 addition & 2 deletions
```diff
@@ -4,9 +4,8 @@ export XLA_FLAGS="\
 --xla_gpu_enable_latency_hiding_scheduler=true \
 --xla_allow_excess_precision \
 --xla_gpu_enable_highest_priority_async_stream=true \
---xla_gpu_enable_triton_softmax_fusion=false \
 --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
---xla_gpu_graph_level=0 \
+--xla_gpu_enable_command_buffer= \
 "
 export XLA_PYTHON_CLIENT_MEM_FRACTION=0.8
 unset THRESHOLD_BYTES
```
