Skip to content

Commit 65e69a4

Browse files
Merge branch 'master' into matthew-frank/view-rcp-interpolation-also
2 parents 1b60055 + 782506c commit 65e69a4

50 files changed

Lines changed: 2504 additions & 169 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
4.1.23
1+
4.1.45

mlperf_logging/benchmark_meta.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
# TODO: Update with official values
2424
'llama31_8b': 10,
2525
'flux1': 10,
26+
'gpt_oss_20b': 10,
27+
'deepseekv3_671b': 3,
2628
},
2729

2830
'hpc' : {
@@ -155,7 +157,16 @@
155157
'llama2_70b_lora',
156158
'rgat',
157159
'llama31_405b'
158-
]
160+
],
161+
'6.0': [
162+
'llama31_8b',
163+
'dlrm_dcnv2',
164+
'flux1',
165+
'llama2_70b_lora',
166+
'llama31_405b',
167+
'gpt_oss_20b',
168+
'deepseekv3_671b'
169+
]
159170
},
160171

161172
'hpc': {

mlperf_logging/compliance_checker/README.md

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ To check a log file for compliance:
1010

1111
python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME
1212

13-
By default, 5.1.0 training edition rules are used and the default config is set to `5.1.0/common.yaml`.
13+
By default, 6.0.0 training edition rules are used and the default config is set to `6.0.0/common.yaml`.
1414
This config will check all common keys and enqueue benchmark specific config to be checked as well.
15-
Old training editions, still supported are 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
15+
Old training editions, still supported are 5.1.0, 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
1616

1717
To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0.
1818

@@ -22,23 +22,21 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
2222

2323
### Existing config files for training submissions
2424

25-
5.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file
26-
5.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
27-
5.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
28-
5.1.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions.
29-
5.1.0/closed_llama31_8b.yaml
30-
5.1.0/closed_llama31_405b.yaml
31-
5.1.0/closed_dlrm_dcnv2.yaml
32-
5.1.0/closed_rgat.yaml
33-
5.1.0/closed_llama2_70b_lora.yaml
34-
5.1.0/closed_flux1.yaml
35-
5.1.0/open_retinanet.yaml - Per-benchmark rules, open submissions.
36-
5.1.0/open_llama31_8b.yaml
37-
5.1.0/open_llama31_405b.yaml
38-
5.1.0/open_dlrm_dcnv2.yaml
39-
5.1.0/open_rgat.yaml
40-
5.1.0/open_llama2_70b_lora.yaml
41-
5.1.0/open_flux1.yaml
25+
6.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
26+
6.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
27+
6.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
28+
6.0.0/closed_llama31_8b.yaml
29+
6.0.0/closed_llama31_405b.yaml
30+
6.0.0/closed_dlrm_dcnv2.yaml
31+
6.0.0/closed_llama2_70b_lora.yaml
32+
6.0.0/closed_flux1.yaml
33+
6.0.0/closed_gpt_oss_20b.yaml
34+
6.0.0/open_llama31_8b.yaml
35+
6.0.0/open_llama31_405b.yaml
36+
6.0.0/open_dlrm_dcnv2.yaml
37+
6.0.0/open_llama2_70b_lora.yaml
38+
6.0.0/open_flux1.yaml
39+
6.0.0/open_gpt_oss_20b.yaml
4240

4341
### Existing config files for HPC submissions
4442

mlperf_logging/compliance_checker/mlp_compliance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def get_parser():
315315
parser.add_argument('--usage', type=str, default='training',
316316
choices=usage_choices(),
317317
help='what WG do the benchmarks come from')
318-
parser.add_argument('--ruleset', type=str, default='5.1.0',
318+
parser.add_argument('--ruleset', type=str, default='6.0.0',
319319
choices=rule_choices(),
320320
help='what version of rules to check the log against')
321321
parser.add_argument('--config', type=str,

mlperf_logging/compliance_checker/mlp_parser/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from .ruleset_410 import parse_file as parse_file_410
1111
from .ruleset_500 import parse_file as parse_file_500
1212
from .ruleset_510 import parse_file as parse_file_510
13+
from .ruleset_600 import parse_file as parse_file_600
1314

1415
def parse_file(filename, ruleset='0.6.0'):
1516
if ruleset == '0.6.0':
@@ -36,5 +37,7 @@ def parse_file(filename, ruleset='0.6.0'):
3637
return parse_file_500(filename)
3738
elif ruleset == '5.1.0':
3839
return parse_file_510(filename)
40+
elif ruleset == '6.0.0':
41+
return parse_file_600(filename)
3942
else:
4043
raise Exception(f'Ruleset "{ruleset}" is not supported')
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
'''
2+
Parses a text MLPerf log into a structured format.
3+
'''
4+
5+
from __future__ import print_function
6+
7+
import collections
8+
import json
9+
import re
10+
import sys
11+
from dataclasses import dataclass
12+
13+
from io import open
14+
15+
@dataclass
class LogLine:
    """One structured record parsed from an ``:::MLLOG`` line of an MLPerf log."""
    full_string: str  # the raw log line the record was parsed from
    timestamp: float  # the record's 'time_ms' field, as a float
    key: str          # the MLLOG event key (e.g. 'submission_benchmark')
    value: str        # NOTE(review): actually assigned a dict {'value': ..., 'metadata': ...} by string_to_logline — annotation looks stale; confirm
    lineno: int       # 0-based index of the line within the deduplicated input
23+
24+
TOKEN = ':::MLLOG '


def parse_line(line):
    """Decode a single raw log line.

    Returns the JSON payload following the MLLOG token as a dict, or
    None when the line does not begin with the token. May raise
    json.JSONDecodeError for a malformed payload.
    """
    if line.startswith(TOKEN):
        return json.loads(line[len(TOKEN):])
    return None
32+
33+
34+
def string_to_logline(lineno, string):
    ''' Returns a LogLine or raises a ValueError '''
    parsed = parse_line(string)
    if parsed is None:
        raise ValueError('does not match regex')

    # float() may raise ValueError itself, e.g. for "1.2.3"
    # TODO check for weird values
    timestamp = float(parsed['time_ms'])

    payload = {'value': parsed['value'], 'metadata': parsed['metadata']}

    return LogLine(
        full_string=string,
        timestamp=timestamp,
        key=parsed['key'],
        value=payload,
        lineno=lineno,
    )
55+
56+
57+
def parse_file(filename):
58+
''' Reads a file by name and returns list of loglines and list of errors'''
59+
with open(filename, encoding='latin-1') as f:
60+
return parse_generator(f)
61+
62+
63+
def strip_and_dedup(gen):
64+
lines = []
65+
for l in gen:
66+
if TOKEN not in l:
67+
continue
68+
lines.append(re.sub(".*"+TOKEN, TOKEN, l))
69+
return lines
70+
71+
72+
73+
def parse_generator(gen):
    ''' Reads a generator of lines and returns (loglines, errors)
    The list of errors are any parsing issues as a tuple (str_line, error_msg)
    '''
    parsed, failures = [], []
    # lineno indexes the filtered (token-bearing) lines, not the raw input.
    for lineno, raw in enumerate(strip_and_dedup(gen)):
        stripped = raw.strip()
        try:
            parsed.append(string_to_logline(lineno, stripped))
        except ValueError as err:
            failures.append((stripped, str(err)))
    return parsed, failures
87+
88+
89+
if __name__ == '__main__':
    # Minimal CLI for smoke-testing the parser against a single log file.
    if len(sys.argv) != 2:
        print('usage: mlp_parser.py FILENAME')
        print('       tests parsing on the file.')
        sys.exit(1)

    filename = sys.argv[1]
    lines, errors = parse_file(filename)

    print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))

    if len(errors) > 0:
        print('Lines which failed to parse:')
        for line, error in errors:
            # error is the parser's message; line is the offending raw text
            print('  Following line failed: {}'.format(error))
            print(line)
105+

mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,15 @@
2424
REQ: EXACTLY_ONE
2525

2626
- KEY:
27-
NAME: opt_learning_rate_decay_steps
27+
NAME: opt_learning_rate_warmup_steps
2828
REQ: EXACTLY_ONE
29+
POST: >
30+
s['opt_learning_rate_warmup_steps'] = math.ceil(8000 * 1152 / s['global_batch_size'] )
2931
3032
- KEY:
31-
NAME: opt_learning_rate_warmup_steps
33+
NAME: opt_learning_rate_decay_steps
3234
REQ: EXACTLY_ONE
35+
CHECK: " v['value'] == math.ceil(1_200_000 * 1152 / s['global_batch_size'] ) - s['opt_learning_rate_warmup_steps'] "
3336

3437
- KEY:
3538
NAME: opt_learning_rate_decay_schedule

mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,46 +4,84 @@
44
POST: >
55
s['global_batch_size'] = v['value']
66
7-
# TODO: Update with official compliance requirements
87
- KEY:
9-
NAME: opt_base_learning_rate
8+
NAME: max_sequence_length
9+
REQ: EXACTLY_ONE
10+
CHECK: " v['value'] == 8192 "
11+
12+
- KEY:
13+
NAME: opt_name
1014
REQ: EXACTLY_ONE
15+
CHECK: " v['value'] == 'adamw' "
1116

1217
- KEY:
13-
NAME: opt_lamb_epsilon
18+
NAME: opt_base_learning_rate
1419
REQ: EXACTLY_ONE
1520

1621
- KEY:
17-
NAME: opt_learning_rate_training_steps
22+
NAME: opt_end_learning_rate
1823
REQ: EXACTLY_ONE
1924

2025
- KEY:
2126
NAME: opt_learning_rate_warmup_steps
2227
REQ: EXACTLY_ONE
28+
POST: >
29+
s['opt_learning_rate_warmup_steps'] = v['value']
2330
2431
- KEY:
25-
NAME: num_warmup_steps
32+
NAME: opt_learning_rate_decay_steps
2633
REQ: EXACTLY_ONE
34+
CHECK: " v['value'] == 1_200_000 - s['opt_learning_rate_warmup_steps'] "
2735

2836
- KEY:
29-
NAME: start_warmup_step
37+
NAME: opt_learning_rate_decay_schedule
3038
REQ: EXACTLY_ONE
39+
CHECK: " v['value'] == 'cosine with linear warmup' "
3140

3241
- KEY:
33-
NAME: opt_lamb_beta_1
42+
NAME: opt_adamw_beta_1
3443
REQ: EXACTLY_ONE
44+
CHECK: " v['value'] == 0.9 "
3545

3646
- KEY:
37-
NAME: opt_lamb_beta_2
47+
NAME: opt_adamw_beta_2
3848
REQ: EXACTLY_ONE
49+
CHECK: " v['value'] == 0.95 "
3950

4051
- KEY:
41-
NAME: opt_lamb_weight_decay_rate
52+
NAME: opt_adamw_epsilon
4253
REQ: EXACTLY_ONE
54+
CHECK: " v['value'] == 1e-05 "
55+
56+
- KEY:
57+
NAME: opt_adamw_weight_decay
58+
REQ: EXACTLY_ONE
59+
CHECK: " v['value'] == 0.1 "
60+
61+
- KEY:
62+
NAME: opt_gradient_clip_norm
63+
REQ: EXACTLY_ONE
64+
CHECK: " v['value'] == 1.0 "
65+
66+
- KEY:
67+
NAME: gradient_accumulation_steps
68+
REQ: EXACTLY_ONE
69+
CHECK: " v['value'] > 0 "
70+
71+
- KEY:
72+
NAME: eval_samples
73+
REQ: EXACTLY_ONE
74+
CHECK: " v['value'] == 1024 "
4375

4476
- KEY:
4577
NAME: eval_accuracy
4678
REQ: AT_LEAST_ONE
4779
CHECK:
48-
- "'epoch_num' in v['metadata']"
49-
ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0"
80+
- "'samples_count' in v['metadata']"
81+
ATLEAST_ONE_CHECK: "(v['value'] <= 3.3) and v['value'] > 0.0"
82+
83+
- KEY:
84+
NAME: max_steps
85+
REQ: EXACTLY_ONE
86+
CHECK: " v['value'] == 1200000 "
87+
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
# Common compliance rules applied to every closed-division 6.0.0 submission.

# The submission must name exactly one benchmark from the 6.0.0 closed set;
# its per-benchmark rules file is then enqueued for checking.
- KEY:
    NAME: submission_benchmark
    REQ: EXACTLY_ONE
    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b', 'deepseekv3_671b'] "
    POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) "

# Gradient accumulation must be logged exactly once and be a positive count.
- KEY:
    NAME: gradient_accumulation_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] > 0 "

0 commit comments

Comments
 (0)