Skip to content

Commit e70a017

Browse files
committed
Fix MaxText metrics/baselines
1 parent a7f2fbc commit e70a017

File tree

10 files changed

+75
-29
lines changed

10 files changed

+75
-29
lines changed

.github/workflows/_test_maxtext.yaml

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -373,24 +373,6 @@ jobs:
373373
ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}
374374
FW_NAME: ${{ inputs.FW_NAME }}
375375

376-
summary:
377-
name: test-maxtext-summary
378-
runs-on: ubuntu-22.04
379-
needs: [single-process-multi-device, maxtext-multinode]
380-
if: "!cancelled()"
381-
steps:
382-
- name: Generate TensorBoard query URL
383-
run: |
384-
(
385-
cat << EOF
386-
387-
## MaxText training
388-
389-
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
390-
391-
EOF
392-
) | tee $GITHUB_STEP_SUMMARY
393-
394376
outcome:
395377
name: test-maxtext-outcome
396378
needs: sitrep

.github/workflows/baselines/MAXTEXT/upstream/1DP1FSDP1TP1PP.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

.github/workflows/baselines/MAXTEXT/upstream/1DP1FSDP8TP1PP.json

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,33 @@
1-
{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.19917848706245422,0,0,0,0,0,0,0,0],"step_times":[0.27129199107487995,0.17545133332411447,0.1774536669254303,0.18130967020988464,0.17997999986012778,0.17623033126195273,0.17701533436775208,0.17688766618569693,0.1763359953959783],"step_time_avg":0.1879951098450908,"e2e_time_seconds":89.195,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"}
1+
{
2+
"start_step": 1,
3+
"end_step": 9,
4+
"step_interval": 1,
5+
"loss_values": [
6+
24.950599670410156,
7+
23.6308536529541,
8+
22.22606086730957,
9+
20.85379981994629,
10+
19.61219596862793,
11+
18.5745849609375,
12+
17.77853012084961,
13+
17.22124481201172,
14+
16.864944458007812
15+
],
16+
"step_times": [
17+
0.34216299653053284,
18+
0.20161199569702148,
19+
0.20115399360656738,
20+
0.19551700353622437,
21+
0.19348999857902527,
22+
0.19705399870872498,
23+
0.19354699552059174,
24+
0.1978529989719391,
25+
0.19385899603366852
26+
],
27+
"step_time_avg": 0.2129165530204773,
28+
"e2e_time_seconds": 34.555,
29+
"run_urls": [
30+
"https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/14054620516/artifacts"
31+
],
32+
"date": "2025-03-25"
33+
}

.github/workflows/baselines/MAXTEXT/upstream/1DP4FSDP2TP1PP.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

.github/workflows/baselines/MAXTEXT/upstream/1DP8FSDP1TP1PP.json

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,33 @@
1-
{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.20010541379451752,3.576278402306343e-07,0,0,0,0,0,0,0],"step_times":[0.2453316698471705,0.15532933175563812,0.15473033487796783,0.15553300082683563,0.15387233098347983,0.1558946669101715,0.15286600093046823,0.1544460008541743,0.15370899935563406],"step_time_avg":0.1646347040379489,"e2e_time_seconds":184.883,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"}
1+
{
2+
"start_step": 1,
3+
"end_step": 9,
4+
"step_interval": 1,
5+
"loss_values": [
6+
24.9504337310791,
7+
23.63066291809082,
8+
22.226167678833008,
9+
20.853334426879883,
10+
19.611804962158203,
11+
18.574153900146484,
12+
17.778240203857422,
13+
17.22144317626953,
14+
16.864635467529297
15+
],
16+
"step_times": [
17+
0.31094300746917725,
18+
0.21168699860572815,
19+
0.16145099699497223,
20+
0.1525229960680008,
21+
0.153779998421669,
22+
0.15510499477386475,
23+
0.1530109941959381,
24+
0.1535159945487976,
25+
0.15335600078105927
26+
],
27+
"step_time_avg": 0.17837466465102303,
28+
"e2e_time_seconds": 32.688,
29+
"run_urls": [
30+
"https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/14054620516/artifacts"
31+
],
32+
"date": "2025-03-25"
33+
}

.github/workflows/baselines/MAXTEXT/upstream/4DP2FSDP2TP1PP.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

.github/workflows/baselines/create_baselines.sh

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,10 @@ elif [[ "$TYPE" == "rosetta-t5x" ]]; then
4646
)
4747
OUTPUT_DIR=T5X_MGMN/rosetta
4848
elif [[ "$TYPE" == "upstream-maxtext" ]]; then
49-
CONFIGS=("1DP1FSDP1TP1PP" "1DP1FSDP8TP1PP" "1DP2FSDP4TP1PP_single_process" "1DP4FSDP2TP1PP" "1DP8FSDP1TP1PP" "2DP2FSDP2TP1PP" "4DP2FSDP2TP1PP")
49+
CONFIGS=(
50+
"1DP2FSDP4TP1PP_single_process"
51+
"2DP2FSDP2TP1PP"
52+
)
5053
OUTPUT_DIR=MAXTEXT/upstream
5154
else
5255
usage
@@ -60,8 +63,10 @@ bash ${UTIL_DIR}/download_artifacts.sh ${ALL_WF_RUNS[@]}
6063
URLS=()
6164
for WORKFLOW_RUN in ${ALL_WF_RUNS[@]}; do
6265
for CFG in ${CONFIGS[@]}; do
63-
if [[ $(find . -mindepth 1 -maxdepth 2 -type d -name $CFG | wc -l) -ne 1 ]]; then
64-
echo "Expected one artifact to have a '$CFG' dir under '$PWD', but found $(find . -mindepth 1 -maxdepth 2 -type d -name $CFG)"
66+
CFG=$TYPE-$WORKFLOW_RUN-$CFG
67+
ARTS=$(find . -mindepth 1 -maxdepth 2 -type d -name $CFG)
68+
if (( $(echo ${ARTS} | wc -l) != 1 )); then
69+
echo "Expected one artifact to have a '$CFG' dir under '$PWD', but found ${ARTS}"
6570
exit 1
6671
fi
6772
done

.github/workflows/baselines/summarize_metrics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def main():
4040
if not os.path.exists(searchpath):
4141
searchpath = os.path.join(args.test_config, "summaries/train")
4242
if not os.path.exists(searchpath):
43-
searchpath = os.path.join(args.test_config, "logdir/tensorboard")
43+
searchpath = os.path.join(args.test_config, "logdir/tensorboard/logdir")
4444
assert os.path.exists(searchpath), f"Neither {args.test_config}/train nor {args.test_config}/summaries/train nor {args.test_config}/logdir/tensorboard dirs exist"
4545
event_files = glob.glob(os.path.join(searchpath, "events*"))
4646
assert len(event_files) > 0, f"{searchpath} did not contain a tensorboard events file"

0 commit comments

Comments
 (0)