Skip to content

Commit f73c2c4

Browse files
authored
Merge pull request #444 from CarlosGomes98/cgomes/flux_dont_dropout
Flux update, v6.0 added
2 parents 3d91927 + 11073f7 commit f73c2c4

34 files changed

+1338
-50
lines changed

mlperf_logging/benchmark_meta.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,14 @@
155155
'llama2_70b_lora',
156156
'rgat',
157157
'llama31_405b'
158-
]
158+
],
159+
'6.0': [
160+
'llama31_8b',
161+
'dlrm_dcnv2',
162+
'flux1',
163+
'llama2_70b_lora',
164+
'llama31_405b'
165+
]
159166
},
160167

161168
'hpc': {

mlperf_logging/compliance_checker/README.md

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ To check a log file for compliance:
1010

1111
python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME
1212

13-
By default, 5.1.0 training edition rules are used and the default config is set to `5.1.0/common.yaml`.
13+
By default, 6.0.0 training edition rules are used and the default config is set to `6.0.0/common.yaml`.
1414
This config will check all common keys and enqueue benchmark specific config to be checked as well.
15-
Old training editions, still supported are 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
15+
Old training editions, still supported are 6.0.0, 5.1.0, 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
1616

1717
To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0.
1818

@@ -22,23 +22,19 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
2222

2323
### Existing config files for training submissions
2424

25-
5.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file
26-
5.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
27-
5.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
28-
5.1.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions.
29-
5.1.0/closed_llama31_8b.yaml
30-
5.1.0/closed_llama31_405b.yaml
31-
5.1.0/closed_dlrm_dcnv2.yaml
32-
5.1.0/closed_rgat.yaml
33-
5.1.0/closed_llama2_70b_lora.yaml
34-
5.1.0/closed_flux1.yaml
35-
5.1.0/open_retinanet.yaml - Per-benchmark rules, open submissions.
36-
5.1.0/open_llama31_8b.yaml
37-
5.1.0/open_llama31_405b.yaml
38-
5.1.0/open_dlrm_dcnv2.yaml
39-
5.1.0/open_rgat.yaml
40-
5.1.0/open_llama2_70b_lora.yaml
41-
5.1.0/open_flux1.yaml
25+
6.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
26+
6.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
27+
6.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
28+
6.0.0/closed_llama31_8b.yaml
29+
6.0.0/closed_llama31_405b.yaml
30+
6.0.0/closed_dlrm_dcnv2.yaml
31+
6.0.0/closed_llama2_70b_lora.yaml
32+
6.0.0/closed_flux1.yaml
33+
6.0.0/open_llama31_8b.yaml
34+
6.0.0/open_llama31_405b.yaml
35+
6.0.0/open_dlrm_dcnv2.yaml
36+
6.0.0/open_llama2_70b_lora.yaml
37+
6.0.0/open_flux1.yaml
4238

4339
### Existing config files for HPC submissions
4440

mlperf_logging/compliance_checker/mlp_compliance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def get_parser():
315315
parser.add_argument('--usage', type=str, default='training',
316316
choices=usage_choices(),
317317
help='what WG do the benchmarks come from')
318-
parser.add_argument('--ruleset', type=str, default='5.1.0',
318+
parser.add_argument('--ruleset', type=str, default='6.0.0',
319319
choices=rule_choices(),
320320
help='what version of rules to check the log against')
321321
parser.add_argument('--config', type=str,

mlperf_logging/compliance_checker/mlp_parser/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from .ruleset_410 import parse_file as parse_file_410
1111
from .ruleset_500 import parse_file as parse_file_500
1212
from .ruleset_510 import parse_file as parse_file_510
13+
from .ruleset_600 import parse_file as parse_file_600
1314

1415
def parse_file(filename, ruleset='0.6.0'):
1516
if ruleset == '0.6.0':
@@ -36,5 +37,7 @@ def parse_file(filename, ruleset='0.6.0'):
3637
return parse_file_500(filename)
3738
elif ruleset == '5.1.0':
3839
return parse_file_510(filename)
40+
elif ruleset == '6.0.0':
41+
return parse_file_600(filename)
3942
else:
4043
raise Exception(f'Ruleset "{ruleset}" is not supported')
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
'''
2+
Parses a text MLPerf log into a structured format.
3+
'''
4+
5+
from __future__ import print_function
6+
7+
import collections
8+
import json
9+
import re
10+
import sys
11+
from dataclasses import dataclass
12+
13+
from io import open
14+
15+
@dataclass
class LogLine:
    """One parsed ':::MLLOG' record from an MLPerf training log."""
    full_string: str   # the raw log line, as read (after prefix stripping)
    timestamp: float   # the record's 'time_ms' field
    key: str           # the MLLOG key, e.g. 'eval_accuracy'
    # NOTE(review): annotated str, but string_to_logline stores a dict
    # {'value': ..., 'metadata': ...} here — annotation kept for
    # compatibility with the sibling ruleset parsers; confirm before tightening.
    value: str
    lineno: int        # 0-based position within the filtered log lines
23+
24+
# Marker prefixing every structured MLPerf log record.
TOKEN = ':::MLLOG '


def parse_line(line):
    """Decode the JSON payload of an MLLOG line; return None for other lines."""
    if line.startswith(TOKEN):
        payload = line[len(TOKEN):]
        return json.loads(payload)
    return None
32+
33+
34+
def string_to_logline(lineno, string):
    """Parse one raw log line into a LogLine.

    lineno -- 0-based line number, stored on the result for error reporting
    string -- the raw log line; must start with TOKEN to be accepted
    Returns a LogLine or raises a ValueError for a non-MLLOG line.
    (json.loads / float may also raise on a malformed payload.)
    """
    m = parse_line(string)
    if m is None:
        raise ValueError('does not match regex')

    # may raise error, e.g. "1.2.3"
    # TODO check for weird values
    ts = float(m['time_ms'])

    # Construct directly instead of accumulating a positional args list.
    return LogLine(
        full_string=string,
        timestamp=ts,
        key=m['key'],
        value={'value': m['value'], 'metadata': m['metadata']},
        lineno=lineno,
    )
55+
56+
57+
def parse_file(filename):
    """Read the named log file; return (loglines, errors) as parse_generator does."""
    # latin-1 decodes any byte sequence, so odd bytes never abort parsing
    with open(filename, encoding='latin-1') as log_file:
        return parse_generator(log_file)
61+
62+
63+
def strip_and_dedup(gen):
    """Keep only lines containing TOKEN, trimming any junk before the marker.

    NOTE(review): despite the name, no deduplication is performed —
    duplicate records pass through unchanged; confirm intent with the
    sibling ruleset parsers before renaming.
    """
    # Compile once instead of re-compiling the pattern on every line.
    prefix_junk = re.compile(".*" + TOKEN)
    return [prefix_junk.sub(TOKEN, line) for line in gen if TOKEN in line]
70+
71+
72+
73+
def parse_generator(gen):
    """Read a generator of lines; return (loglines, errors).

    errors is a list of (raw_line, error_message) tuples for lines that
    carried the MLLOG marker but failed to parse.
    """
    parsed = []
    failures = []
    for idx, raw in enumerate(strip_and_dedup(gen)):
        raw = raw.strip()
        try:
            parsed.append(string_to_logline(idx, raw))
        except ValueError as err:
            failures.append((raw, str(err)))
    return parsed, failures
87+
88+
89+
if __name__ == '__main__':
    # Smoke test: parse the file named on the command line and report
    # how many lines parsed and which ones failed.
    if len(sys.argv) != 2:
        print('usage: mlp_parser.py FILENAME')
        print(' tests parsing on the file.')
        sys.exit(1)

    filename = sys.argv[1]
    lines, errors = parse_file(filename)

    print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))

    if errors:
        print('Lines which failed to parse:')
        for failed_line, message in errors:
            print(' Following line failed: {}'.format(message))
            print(failed_line)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
- KEY:
3+
NAME: submission_benchmark
4+
REQ: EXACTLY_ONE
5+
CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] "
6+
POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) "
7+
8+
- KEY:
9+
NAME: gradient_accumulation_steps
10+
REQ: EXACTLY_ONE
11+
CHECK: " v['value'] > 0 "
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
- KEY:
2+
NAME: global_batch_size
3+
REQ: EXACTLY_ONE
4+
5+
- KEY:
6+
NAME: opt_name
7+
REQ: EXACTLY_ONE
8+
CHECK: " v['value'] == 'adagrad' "
9+
10+
- KEY:
11+
NAME: opt_base_learning_rate
12+
REQ: EXACTLY_ONE
13+
14+
- KEY:
15+
NAME: opt_adagrad_learning_rate_decay
16+
REQ: EXACTLY_ONE
17+
CHECK: " v['value'] == 0 "
18+
19+
- KEY:
20+
NAME: opt_weight_decay
21+
REQ: EXACTLY_ONE
22+
CHECK: " v['value'] == 0 "
23+
24+
- KEY:
25+
NAME: opt_adagrad_initial_accumulator_value
26+
REQ: EXACTLY_ONE
27+
CHECK: " v['value'] == 0 "
28+
29+
- KEY:
30+
NAME: opt_adagrad_epsilon
31+
REQ: EXACTLY_ONE
32+
CHECK: " v['value'] == 1e-8 "
33+
34+
- KEY:
35+
NAME: opt_learning_rate_warmup_steps
36+
REQ: EXACTLY_ONE
37+
CHECK: " v['value'] == 0 "
38+
39+
- KEY:
40+
NAME: opt_learning_rate_decay_start_step
41+
REQ: EXACTLY_ONE
42+
CHECK: " v['value'] == 0 "
43+
44+
- KEY:
45+
NAME: opt_learning_rate_decay_steps
46+
REQ: EXACTLY_ONE
47+
CHECK: " v['value'] == 0 "
48+
49+
- KEY:
50+
NAME: eval_accuracy
51+
REQ: AT_LEAST_ONE
52+
CHECK:
53+
- "'epoch_num' in v['metadata']"
54+
ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0"
55+
56+
- KEY:
57+
NAME: eval_samples
58+
REQ: EXACTLY_ONE
59+
CHECK: " v['value'] == 89137319 "
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
- KEY:
2+
NAME: global_batch_size
3+
REQ: AT_LEAST_ONE
4+
CHECK: " v['value'] >= 0 "
5+
6+
- KEY:
7+
NAME: evaluation_frequency
8+
REQ: EXACTLY_ONE
9+
CHECK: " v['value'] == 262144"
10+
11+
- KEY:
12+
NAME: opt_name
13+
REQ: EXACTLY_ONE
14+
CHECK: " v['value'] == 'adamw' "
15+
16+
- KEY:
17+
NAME: opt_adamw_beta_1
18+
REQ: EXACTLY_ONE
19+
CHECK: " v['value'] == 0.9 "
20+
21+
- KEY:
22+
NAME: opt_adamw_beta_2
23+
REQ: EXACTLY_ONE
24+
CHECK: " v['value'] == 0.95 "
25+
26+
- KEY:
27+
NAME: opt_adamw_epsilon
28+
REQ: EXACTLY_ONE
29+
CHECK: " v['value'] == 1e-08 "
30+
31+
- KEY:
32+
NAME: opt_adamw_weight_decay
33+
REQ: EXACTLY_ONE
34+
CHECK: " v['value'] == 0.1 "
35+
36+
- KEY:
37+
NAME: opt_base_learning_rate
38+
REQ: EXACTLY_ONE
39+
CHECK: " v['value'] >= 0.0 "
40+
41+
- KEY:
42+
NAME: opt_learning_rate_warmup_steps
43+
REQ: EXACTLY_ONE
44+
CHECK: " v['value'] >= 0 "
45+
46+
- KEY:
47+
NAME: opt_gradient_clip_norm
48+
REQ: EXACTLY_ONE
49+
CHECK: " v['value'] == 1.0 "
50+
51+
- KEY:
52+
NAME: eval_accuracy
53+
REQ: AT_LEAST_ONE
54+
CHECK:
55+
- "'samples_count' in v['metadata']"
56+
ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0"
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
- KEY:
2+
NAME: global_batch_size
3+
REQ: EXACTLY_ONE
4+
POST: >
5+
s['global_batch_size'] = v['value']
6+
7+
- KEY:
8+
NAME: opt_base_learning_rate
9+
REQ: EXACTLY_ONE
10+
11+
12+
- KEY:
13+
NAME: opt_learning_rate_training_steps
14+
REQ: EXACTLY_ONE
15+
16+
- KEY:
17+
NAME: opt_gradient_clip_norm
18+
REQ: EXACTLY_ONE
19+
20+
- KEY:
21+
NAME: opt_adamw_weight_decay
22+
REQ: EXACTLY_ONE
23+
24+
- KEY:
25+
NAME: gradient_accumulation_steps
26+
REQ: EXACTLY_ONE
27+
28+
- KEY:
29+
NAME: lora_alpha
30+
REQ: EXACTLY_ONE
31+
32+
- KEY:
33+
NAME: lora_rank
34+
REQ: EXACTLY_ONE
35+
CHECK: " v['value'] == 16"
36+
37+
- KEY:
38+
NAME: eval_accuracy
39+
REQ: AT_LEAST_ONE
40+
CHECK:
41+
- "'samples_count' in v['metadata']"
42+
ATLEAST_ONE_CHECK: "(v['value'] <= 0.925) and v['value'] > 0.0"

0 commit comments

Comments
 (0)