Skip to content

Commit 0378298

Browse files
authored
Replace deprecated flag xla_gpu_graph_level. (#1244)
The replacement is xla_gpu_enable_command_buffer: https://github.com/openxla/xla/blob/5d92a4430f26fd73593ac92657507db21d131f13/xla/debug_options_flags.cc#L1412-L1414
1 parent 9ba6461 commit 0378298

File tree

9 files changed

+9
-9
lines changed

9 files changed

+9
-9
lines changed

.github/container/test-maxtext.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ fi
233233

234234
export BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
235235
--xla_gpu_enable_triton_gemm=false
236-
--xla_gpu_graph_level=0
236+
--xla_gpu_enable_command_buffer=
237237
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
238238
--xla_gpu_all_gather_combine_threshold_bytes=1073741824
239239
--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728

rosetta/docs/PGLE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ In order to get the best performance with PGLE, here is a list of all recommende
6262
```
6363
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
6464
--xla_gpu_enable_triton_gemm=false
65-
--xla_gpu_graph_level=0
65+
--xla_gpu_enable_command_buffer=
6666
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
6767
--xla_gpu_all_gather_combine_threshold_bytes=1073741824
6868
--xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824

rosetta/rosetta/projects/maxtext/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ The [GPU Performance document](../../../docs/GPU_performance.md) provides a deta
6969
```
7070
XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
7171
--xla_gpu_enable_triton_gemm=false
72-
--xla_gpu_graph_level=0
72+
--xla_gpu_enable_command_buffer=
7373
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
7474
--xla_gpu_all_gather_combine_threshold_bytes=1073741824
7575
--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728

rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ export NCCL_IB_SL=1
5454
# Set XLA Flags
5555
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
5656
--xla_gpu_enable_triton_gemm=false
57-
--xla_gpu_graph_level=0
57+
--xla_gpu_enable_command_buffer=
5858
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
5959
--xla_gpu_all_gather_combine_threshold_bytes=1073741824
6060
--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728

rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ THRESHOLD_BYTES=1073741824
55
export XLA_FLAGS="\
66
--xla_gpu_enable_latency_hiding_scheduler=true \
77
--xla_gpu_enable_triton_gemm=false \
8-
--xla_gpu_graph_level=0 \
8+
--xla_gpu_enable_command_buffer= \
99
--xla_gpu_enable_highest_priority_async_stream=true \
1010
--xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
1111
--xla_gpu_all_gather_combine_threshold_bytes=$((THRESHOLD_BYTES/(NUM_NODES*NUM_GPUS))) \

rosetta/rosetta/projects/pax/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ For the the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_thres
141141
BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
142142
--xla_gpu_enable_triton_gemm=false
143143
--xla_gpu_all_reduce_combine_threshold_bytes=33554432
144-
--xla_gpu_graph_level=0" bash run_pile_multinode.sh ...
144+
--xla_gpu_enable_command_buffer=" bash run_pile_multinode.sh ...
145145
```
146146

147147
# Configs

rosetta/rosetta/projects/pax/xla_flags/common.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ export XLA_FLAGS="\
66
--xla_gpu_enable_highest_priority_async_stream=true \
77
--xla_gpu_enable_triton_softmax_fusion=false \
88
--xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
9-
--xla_gpu_graph_level=0 \
9+
--xla_gpu_enable_command_buffer= \
1010
"
1111
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.8
1212
unset THRESHOLD_BYTES

rosetta/rosetta/projects/pax/xla_flags/gpt-126m.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ export XLA_FLAGS="\
66
--xla_gpu_enable_highest_priority_async_stream=true \
77
--xla_gpu_enable_triton_softmax_fusion=false \
88
--xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
9-
--xla_gpu_graph_level=0 \
9+
--xla_gpu_enable_command_buffer= \
1010
--xla_gpu_enable_cudnn_fmha=false \
1111
"
1212
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.8

rosetta/rosetta/projects/pax/xla_flags/grok-proxy.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ export XLA_FLAGS="\
88
--xla_gpu_enable_highest_priority_async_stream=true \
99
--xla_gpu_enable_triton_softmax_fusion=false \
1010
--xla_gpu_all_reduce_combine_threshold_bytes=${ALL_REDUCE_THRESHOLD_BYTES} \
11-
--xla_gpu_graph_level=0 \
11+
--xla_gpu_enable_command_buffer= \
1212
--xla_gpu_all_gather_combine_threshold_bytes=${ALL_GATHER_THRESHOLD_BYTES} \
1313
--xla_gpu_reduce_scatter_combine_threshold_bytes=${REDUCE_SCATTER_THRESHOLD_BYTES} \
1414
--xla_gpu_enable_pipelined_all_gather=true \

0 commit comments

Comments (0)