Skip to content

Commit 2860390

Browse files
authored
removed --xla_gpu_enable_triton_gemm=false flag (#1469)
Removes --xla_gpu_enable_triton_gemm=false.
1 parent ec474b5 commit 2860390

File tree

10 files changed

+1
-10
lines changed

10 files changed

+1
-10
lines changed

.github/container/Dockerfile.jax

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ ENV BUILD_DATE=${BUILD_DATE}
8585
# The following environment variables tune performance
8686
ENV XLA_FLAGS=""
8787
ENV XLA_FLAGS="${XLA_FLAGS} --xla_gpu_enable_latency_hiding_scheduler=true"
88-
ENV XLA_FLAGS="${XLA_FLAGS} --xla_gpu_enable_triton_gemm=false"
8988
ENV NCCL_NVLS_ENABLE=0
9089

9190
COPY --from=builder ${BUILD_PATH_JAXLIB} ${BUILD_PATH_JAXLIB}

.github/container/test-maxtext.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,6 @@ if [[ "${local_arch}" == "9.0" ]]; then
228228
fi
229229

230230
export BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
231-
--xla_gpu_enable_triton_gemm=false
232231
--xla_gpu_enable_command_buffer=
233232
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
234233
--xla_gpu_all_gather_combine_threshold_bytes=1073741824

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,6 @@ The [JAX image](https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax) is emb
217217
| XLA Flags | Value | Explanation |
218218
| --------- | ----- | ----------- |
219219
| `--xla_gpu_enable_latency_hiding_scheduler` | `true` | allows XLA to move communication collectives to increase overlap with compute kernels |
220-
| `--xla_gpu_enable_triton_gemm` | `false` | use cuBLAS instead of Triton GeMM kernels |
221220

222221
| Environment Variable | Value | Explanation |
223222
| -------------------- | ----- | ----------- |

rosetta/docs/NATIVE_FP8.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ Enabling this feature is effortless. Users only need to include the option `--fd
111111
In addition to the suggested XLA flags mentioned in [this section](https://github.com/NVIDIA/JAX-Toolbox/blob/main/rosetta/rosetta/projects/pax/README.md#xla-flags), we also recommend setting these following XLA flags. The execution script should look like:
112112
```bash
113113
export XLA_FLAGS=" \
114-
--xla_gpu_enable_triton_gemm=false \
115114
--xla_gpu_enable_pipelined_all_reduce=false \
116115
--xla_gpu_enable_pipelined_all_gather=false \
117116
--xla_gpu_enable_pipelined_reduce_scatter=false \

rosetta/docs/PGLE.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ PGLE found latency for async op custom-call-start.1 and (assumed)custom-call-don
9797
In order to get the best performance with PGLE, here is a list of all recommended XLA flags:
9898
```
9999
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
100-
--xla_gpu_enable_triton_gemm=false
101100
--xla_gpu_enable_command_buffer=
102101
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
103102
--xla_gpu_all_gather_combine_threshold_bytes=1073741824
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# These XLA flags are meant to be used with the JAX version in the imagen container
2-
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=false --xla_gpu_enable_async_all_gather=false --xla_gpu_enable_async_reduce_scatter=false --xla_gpu_enable_triton_gemm=false --xla_gpu_cuda_graph_level=0 --xla_gpu_enable_async_all_reduce=false ${XLA_FLAGS}"
2+
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=false --xla_gpu_enable_async_all_gather=false --xla_gpu_enable_async_reduce_scatter=false --xla_gpu_cuda_graph_level=0 --xla_gpu_enable_async_all_reduce=false ${XLA_FLAGS}"

rosetta/rosetta/projects/maxtext/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ The [GPU Performance document](../../../docs/GPU_performance.md) provides a deta
6868

6969
```
7070
XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
71-
--xla_gpu_enable_triton_gemm=false
7271
--xla_gpu_enable_command_buffer=
7372
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
7473
--xla_gpu_all_gather_combine_threshold_bytes=1073741824

rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ export NCCL_IB_SL=1
5353

5454
# Set XLA Flags
5555
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
56-
--xla_gpu_enable_triton_gemm=false
5756
--xla_gpu_enable_command_buffer=
5857
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
5958
--xla_gpu_all_gather_combine_threshold_bytes=1073741824

rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ NUM_GPUS=8
44
THRESHOLD_BYTES=1073741824
55
export XLA_FLAGS="\
66
--xla_gpu_enable_latency_hiding_scheduler=true \
7-
--xla_gpu_enable_triton_gemm=false \
87
--xla_gpu_enable_command_buffer= \
98
--xla_gpu_enable_highest_priority_async_stream=true \
109
--xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \

rosetta/rosetta/projects/pax/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,6 @@ For the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_thres
142142

143143
```
144144
BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
145-
--xla_gpu_enable_triton_gemm=false
146145
--xla_gpu_all_reduce_combine_threshold_bytes=33554432
147146
--xla_gpu_enable_command_buffer=" bash run_pile_multinode.sh ...
148147
```

0 commit comments

Comments
 (0)