Skip to content

Commit 556aba3

Browse files
authored
disable goodput recorder in test-maxtext.sh (#1106)
The goodput recorder used to be turned off by default, and we do not use it. However, MaxText turned it on by default 2 days ago, which is breaking the CI tests. We just need to explicitly disable both goodput options (enable_goodput_recording and monitor_goodput) whenever we run the tests. --------- Signed-off-by: Md Fahim Faysal Khan <[email protected]>
1 parent ddec4bd commit 556aba3

File tree

1 file changed

+50
-14
lines changed

1 file changed

+50
-14
lines changed

.github/container/test-maxtext.sh

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -245,22 +245,58 @@ RUN_NAME="logdir" ## the RUN_NAME cannot be changed
245245
if [ -z "$DECODER_BLOCK" ]; then
246246

247247
# this part can be used to test different models out of the box (ootb)
248-
RUN_SETTINGS="MaxText/train.py MaxText/configs/base.yml run_name=${RUN_NAME} model_name=${MODEL}\
249-
steps=$STEPS per_device_batch_size=${BATCH_PER_GPU} remat_policy=${REMAT_POLICY} enable_checkpointing=false\
250-
base_output_directory=$OUTPUT dataset_path=local dataset_type=synthetic hardware=$HARDWARE\
251-
dcn_fsdp_parallelism=$dcn_FSDP ici_fsdp_parallelism=$ici_FSDP\
252-
ici_data_parallelism=$ici_DP dcn_data_parallelism=$dcn_DP\
253-
ici_tensor_parallelism=$ici_TP dcn_tensor_parallelism=1 ${ADDITIONAL_ARGS}"
254-
248+
RUN_SETTINGS="MaxText/train.py \
249+
MaxText/configs/base.yml \
250+
run_name=${RUN_NAME} \
251+
model_name=${MODEL} \
252+
steps=${STEPS} \
253+
per_device_batch_size=${BATCH_PER_GPU} \
254+
remat_policy=${REMAT_POLICY} \
255+
enable_checkpointing=false \
256+
base_output_directory=${OUTPUT} \
257+
dataset_path=local \
258+
dataset_type=synthetic \
259+
hardware=${HARDWARE} \
260+
enable_goodput_recording=false \
261+
monitor_goodput=false \
262+
dcn_fsdp_parallelism=${dcn_FSDP} \
263+
ici_fsdp_parallelism=${ici_FSDP} \
264+
ici_data_parallelism=${ici_DP} \
265+
dcn_data_parallelism=${dcn_DP} \
266+
ici_tensor_parallelism=${ici_TP} \
267+
dcn_tensor_parallelism=1 \
268+
${ADDITIONAL_ARGS}"
255269
else
256270
# this is essentially used for CI runs
257-
RUN_SETTINGS="MaxText/train.py MaxText/configs/base.yml run_name=${RUN_NAME} logits_via_embedding=true decoder_block=${DECODER_BLOCK} \
258-
steps=$STEPS per_device_batch_size=${BATCH_PER_GPU} base_emb_dim=2560 base_mlp_dim=8192 remat_policy=${REMAT_POLICY} attention=${ATTN_TYPE}\
259-
base_num_query_heads=8 base_num_kv_heads=8 base_num_decoder_layers=8 head_dim=128 enable_checkpointing=false\
260-
base_output_directory=$OUTPUT dataset_path=local dataset_type=synthetic hardware=$HARDWARE\
261-
dcn_fsdp_parallelism=$dcn_FSDP ici_fsdp_parallelism=$ici_FSDP\
262-
ici_data_parallelism=$ici_DP dcn_data_parallelism=$dcn_DP\
263-
ici_tensor_parallelism=$ici_TP dcn_tensor_parallelism=1 ${ADDITIONAL_ARGS}"
271+
RUN_SETTINGS="MaxText/train.py \
272+
MaxText/configs/base.yml \
273+
run_name=${RUN_NAME} \
274+
decoder_block=${DECODER_BLOCK} \
275+
steps=$STEPS \
276+
per_device_batch_size=${BATCH_PER_GPU} \
277+
base_emb_dim=2560 \
278+
base_mlp_dim=8192 \
279+
remat_policy=${REMAT_POLICY} \
280+
attention=${ATTN_TYPE} \
281+
base_num_query_heads=8 \
282+
base_num_kv_heads=8 \
283+
base_num_decoder_layers=8 \
284+
head_dim=128 \
285+
logits_via_embedding=true \
286+
enable_checkpointing=false \
287+
base_output_directory=${OUTPUT} \
288+
dataset_path=local \
289+
dataset_type=synthetic \
290+
hardware=${HARDWARE} \
291+
enable_goodput_recording=false \
292+
monitor_goodput=false \
293+
dcn_fsdp_parallelism=${dcn_FSDP} \
294+
ici_fsdp_parallelism=${ici_FSDP} \
295+
ici_data_parallelism=${ici_DP} \
296+
dcn_data_parallelism=${dcn_DP} \
297+
ici_tensor_parallelism=${ici_TP} \
298+
dcn_tensor_parallelism=1 \
299+
${ADDITIONAL_ARGS}"
264300
fi
265301

266302
echo "Command: python3 $RUN_SETTINGS"

0 commit comments

Comments
 (0)