Skip to content

Commit 3a20b9d

Browse files
committed
more fixups
1 parent 3452dee commit 3a20b9d

File tree

3 files changed

+18
-5
lines changed

3 files changed

+18
-5
lines changed

.github/workflows/_ci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ jobs:
556556
docker run -i --gpus all --shm-size=1g \
557557
${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
558558
bash <<"EOF" |& tee test-levanter.log
559-
pip install flake8 pytest pytest-asyncio soundfile librosa
559+
pip install flake8 pytest pytest-asyncio soundfile tensorboardx librosa
560560
PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray"
561561
EOF
562562
STATISTICS_SCRIPT: |

.github/workflows/_test_t5x_rosetta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ jobs:
221221
shell: bash -eux {0}
222222
run: |
223223
pip install 'numpy<2.0.0' pytest pytest-reportlog tensorboard
224-
for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-* ${{ inputs.FW_NAME }}-vit-${GITHUB_RUN_ID}-*; do
224+
for i in ${{ inputs.FW_NAME }}-vit-${GITHUB_RUN_ID}-*; do
225225
JOB_NAME=$(echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}')
226226
METRIC_PATH=${JOB_NAME}_metrics.json
227227
python3 .github/workflows/baselines/summarize_metrics.py $i/$JOB_NAME --perf_summary_name "timing/steps_per_second" --output_json_path $METRIC_PATH

.github/workflows/baselines/test_maxtext_metrics.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
import os
33
import json
44
import glob
5-
import sys
5+
from numpy.testing import assert_allclose
66
import test_utils
77
from statistics import mean
88

9+
LOSS_RTOL = 0.10
910
STEP_TIME_MULT = 0.95
1011
E2E_TIME_MULT = 0.95
1112
test_dir = os.path.dirname(os.path.abspath(__file__))
@@ -22,9 +23,21 @@ def test_loss(baseline_filename):
2223
event_file = os.path.join(results_dir, test_config, "logdir/tensorboard/logdir/events*")
2324
event_file = glob.glob(event_file)[0]
2425
with open(baseline_filepath, "r") as baseline_file:
25-
end_step = json.load(baseline_file)["end_step"]
26+
baseline_data = json.load(baseline_file)
27+
loss_expected_values = baseline_data["loss_values"]
28+
start_step = baseline_data["start_step"]
29+
end_step = baseline_data["end_step"]
30+
interval = baseline_data["step_interval"]
31+
loss_expected = {step: loss_expected_values[i] for i, step in enumerate(
32+
range(start_step, end_step+1, interval))}
2633
loss_actual = test_utils.read_maxtext_tb_tag(event_file, loss_summary_name)
27-
assert 0 <= loss_actual[end_step] < 1.8e-3, f"Loss at final step: {loss_actual[end_step]}, Expected 0 <= loss < 1.8e-3"
34+
assert loss_expected.keys() == loss_actual.keys(), \
35+
f"Steps at which loss was emitted for run do not match baseline. \
36+
Actual steps: {loss_actual.keys()}, Baseline steps: {loss_expected.keys()}"
37+
assert_allclose(list(loss_actual.values()), list(loss_expected.values()),
38+
rtol=LOSS_RTOL,
39+
err_msg=f"Run loss values: {loss_actual.values()}, \
40+
Baseline loss values: {loss_expected.values()}")
2841

2942

3043
@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir))

0 commit comments

Comments (0)