Skip to content

Commit 2860390

Browse files
authored
removed --xla_gpu_enable_triton_gemm=false flag (#1469)
Removes --xla_gpu_enable_triton_gemm=false.
1 parent ec474b5 commit 2860390

File tree

10 files changed

+1
-10
lines changed

10 files changed

+1
-10
lines changed

.github/container/Dockerfile.jax

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ ENV BUILD_DATE=${BUILD_DATE}
8585
# The following environment variables tune performance
8686
ENV XLA_FLAGS=""
8787
ENV XLA_FLAGS="${XLA_FLAGS} --xla_gpu_enable_latency_hiding_scheduler=true"
88-
ENV XLA_FLAGS="${XLA_FLAGS} --xla_gpu_enable_triton_gemm=false"
8988
ENV NCCL_NVLS_ENABLE=0
9089

9190
COPY --from=builder ${BUILD_PATH_JAXLIB} ${BUILD_PATH_JAXLIB}

.github/container/test-maxtext.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,6 @@ if [[ "${local_arch}" == "9.0" ]]; then
228228
fi
229229

230230
export BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
231-
--xla_gpu_enable_triton_gemm=false
232231
--xla_gpu_enable_command_buffer=
233232
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
234233
--xla_gpu_all_gather_combine_threshold_bytes=1073741824

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,6 @@ The [JAX image](https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax) is emb
217217
| XLA Flags | Value | Explanation |
218218
| --------- | ----- | ----------- |
219219
| `--xla_gpu_enable_latency_hiding_scheduler` | `true` | allows XLA to move communication collectives to increase overlap with compute kernels |
220-
| `--xla_gpu_enable_triton_gemm` | `false` | use cuBLAS instead of Triton GeMM kernels |
221220

222221
| Environment Variable | Value | Explanation |
223222
| -------------------- | ----- | ----------- |

rosetta/docs/NATIVE_FP8.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ Enabling this feature is effortless. Users only need to include the option `--fd
111111
In addition to the suggested XLA flags mentioned in [this section](https://github.com/NVIDIA/JAX-Toolbox/blob/main/rosetta/rosetta/projects/pax/README.md#xla-flags), we also recommend setting these following XLA flags. The execution script should look like:
112112
```bash
113113
export XLA_FLAGS=" \
114-
--xla_gpu_enable_triton_gemm=false \
115114
--xla_gpu_enable_pipelined_all_reduce=false \
116115
--xla_gpu_enable_pipelined_all_gather=false \
117116
--xla_gpu_enable_pipelined_reduce_scatter=false \

rosetta/docs/PGLE.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ PGLE found latency for async op custom-call-start.1 and (assumed)custom-call-don
9797
In order to get the best performance with PGLE, here is a list of all recommended XLA flags:
9898
```
9999
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
100-
--xla_gpu_enable_triton_gemm=false
101100
--xla_gpu_enable_command_buffer=
102101
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
103102
--xla_gpu_all_gather_combine_threshold_bytes=1073741824
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# These XLA flags are meant to be used with the JAX version in the imagen container
2-
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=false --xla_gpu_enable_async_all_gather=false --xla_gpu_enable_async_reduce_scatter=false --xla_gpu_enable_triton_gemm=false --xla_gpu_cuda_graph_level=0 --xla_gpu_enable_async_all_reduce=false ${XLA_FLAGS}"
2+
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=false --xla_gpu_enable_async_all_gather=false --xla_gpu_enable_async_reduce_scatter=false --xla_gpu_cuda_graph_level=0 --xla_gpu_enable_async_all_reduce=false ${XLA_FLAGS}"

rosetta/rosetta/projects/maxtext/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ The [GPU Performance document](../../../docs/GPU_performance.md) provides a deta
6868

6969
```
7070
XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
71-
--xla_gpu_enable_triton_gemm=false
7271
--xla_gpu_enable_command_buffer=
7372
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
7473
--xla_gpu_all_gather_combine_threshold_bytes=1073741824

rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ export NCCL_IB_SL=1
5353

5454
# Set XLA Flags
5555
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
56-
--xla_gpu_enable_triton_gemm=false
5756
--xla_gpu_enable_command_buffer=
5857
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
5958
--xla_gpu_all_gather_combine_threshold_bytes=1073741824

rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ NUM_GPUS=8
44
THRESHOLD_BYTES=1073741824
55
export XLA_FLAGS="\
66
--xla_gpu_enable_latency_hiding_scheduler=true \
7-
--xla_gpu_enable_triton_gemm=false \
87
--xla_gpu_enable_command_buffer= \
98
--xla_gpu_enable_highest_priority_async_stream=true \
109
--xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \

rosetta/rosetta/projects/pax/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,6 @@ For the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_thres
142142

143143
```
144144
BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
145-
--xla_gpu_enable_triton_gemm=false
146145
--xla_gpu_all_reduce_combine_threshold_bytes=33554432
147146
--xla_gpu_enable_command_buffer=" bash run_pile_multinode.sh ...
148147
```

0 commit comments

Comments
 (0)