Skip to content

Commit a340bb8

Browse files
committed
Added retry logic for mlperf-inference runs
1 parent c394650 commit a340bb8

3 files changed

Lines changed: 109 additions & 1 deletion

File tree

script/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# MLCommons Automation Scripts
22

3-
*Last updated: 2026-04-20 17:41:59*
3+
*Last updated: 2026-04-20 21:26:30*
44

55
This directory contains automation scripts for MLPerf benchmarks, AI/ML workflows, and development operations.
66

script/app-mlperf-inference/customize.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,72 @@ def postprocess(i):
378378
][model][scenario]['power_efficiency'] = power_efficiency
379379
state['mlc-mlperf-inference-results-last']['power_efficiency'] = power_efficiency
380380

381+
# Automatic rerun for valid mode if performance result is invalid
382+
if mode == "performance" and not valid.get('performance', False) and \
383+
env.get('MLC_MLPERF_RUN_STYLE', '') == 'valid':
384+
retry_count = int(env.get('MLC_MLPERF_VALID_RERUN_COUNT', '0'))
385+
max_retries = int(env.get('MLC_MLPERF_VALID_RERUN_MAX', '2'))
386+
if retry_count < max_retries:
387+
retry_count += 1
388+
logger.warning(f"Performance result is INVALID. Automatically rerunning with adjusted parameters (attempt {retry_count}/{max_retries})...")
389+
390+
rerun_env = copy.deepcopy(env)
391+
392+
if scenario == "Server":
393+
# Use loadgen's FindPeakPerformance mode to discover the optimal QPS
394+
logger.info("Using loadgen FindPeakPerformance mode to discover optimal Server target QPS...")
395+
extra_opts = rerun_env.get('MLC_MLPERF_LOADGEN_EXTRA_OPTIONS', '')
396+
if '--find-peak-performance' not in extra_opts:
397+
rerun_env['MLC_MLPERF_LOADGEN_EXTRA_OPTIONS'] = extra_opts + ' --find-peak-performance '
398+
elif scenario == "Offline":
399+
# For Offline, reduce target QPS to 90% of measured
400+
measured_value = float(result)
401+
adjusted_value = str(round(measured_value * 0.9, 3))
402+
logger.info(f"Adjusting target_qps from measured {result} to {adjusted_value} (90%)")
403+
rerun_env['MLC_MLPERF_LOADGEN_TARGET_QPS'] = adjusted_value
404+
rerun_env['MLC_MLPERF_LOADGEN_OFFLINE_TARGET_QPS'] = adjusted_value
405+
elif scenario.endswith("Stream"):
406+
# For Stream, increase target latency to 110% of measured
407+
measured_value = float(result)
408+
adjusted_value = str(round(measured_value * 1.1, 3))
409+
logger.info(f"Adjusting target_latency from measured {result} ms to {adjusted_value} ms (110%)")
410+
rerun_env['MLC_MLPERF_LOADGEN_TARGET_LATENCY'] = adjusted_value
411+
if scenario == "SingleStream":
412+
rerun_env['MLC_MLPERF_LOADGEN_SINGLESTREAM_TARGET_LATENCY'] = adjusted_value
413+
elif scenario == "MultiStream":
414+
rerun_env['MLC_MLPERF_LOADGEN_MULTISTREAM_TARGET_LATENCY'] = adjusted_value
415+
416+
rerun_env['MLC_MLPERF_VALID_RERUN_COUNT'] = str(retry_count)
417+
# Remove cached output path so a fresh run is triggered
418+
rerun_env.pop('MLC_MLPERF_OUTPUT_DIR', None)
419+
420+
# Re-trigger the same script via mlc.access
421+
script_tags = inp.get('tags', '')
422+
script_adr = inp.get('adr', {})
423+
424+
mlc_input = {
425+
'action': 'run',
426+
'automation': 'script',
427+
'tags': script_tags,
428+
'adr': script_adr,
429+
'env': rerun_env,
430+
'quiet': True,
431+
}
432+
logger.info(f"Re-triggering MLPerf inference run...")
433+
r = mlc.access(mlc_input)
434+
if r['return'] > 0:
435+
return r
436+
437+
# Update state with results from the rerun
438+
if 'new_state' in r:
439+
for key in ['mlc-mlperf-inference-results', 'mlc-mlperf-inference-results-last']:
440+
if key in r['new_state']:
441+
state[key] = r['new_state'][key]
442+
return {'return': 0}
443+
else:
444+
logger.warning(f"Performance result is INVALID after {max_retries} rerun attempt(s). Giving up.")
445+
446+
381447
# Record basic host info
382448
host_info = {
383449
"os_version": platform.platform(),
@@ -607,6 +673,45 @@ def postprocess(i):
607673
state['mlc-mlperf-inference-results'][state['MLC_SUT_CONFIG_NAME']
608674
][model][scenario][test] = "passed" if is_valid else "failed"
609675

676+
# Automatic rerun for compliance if check failed
677+
if not is_valid and env.get('MLC_MLPERF_RUN_STYLE', '') == 'valid':
678+
retry_count = int(env.get('MLC_MLPERF_COMPLIANCE_RERUN_COUNT', '0'))
679+
max_retries = int(env.get('MLC_MLPERF_COMPLIANCE_RERUN_MAX', '3'))
680+
if retry_count < max_retries:
681+
retry_count += 1
682+
logger.warning(f"Compliance {test} check FAILED. Automatically rerunning (attempt {retry_count}/{max_retries})...")
683+
684+
rerun_env = copy.deepcopy(env)
685+
rerun_env['MLC_MLPERF_COMPLIANCE_RERUN_COUNT'] = str(retry_count)
686+
# Remove cached output path so a fresh run is triggered
687+
rerun_env.pop('MLC_MLPERF_OUTPUT_DIR', None)
688+
689+
# Re-trigger the same script via mlc.access
690+
script_tags = inp.get('tags', '')
691+
script_adr = inp.get('adr', {})
692+
693+
mlc_input = {
694+
'action': 'run',
695+
'automation': 'script',
696+
'tags': script_tags,
697+
'adr': script_adr,
698+
'env': rerun_env,
699+
'quiet': True,
700+
}
701+
logger.info(f"Re-triggering compliance {test} run...")
702+
r = mlc.access(mlc_input)
703+
if r['return'] > 0:
704+
return r
705+
706+
# Update state with results from the rerun
707+
if 'new_state' in r:
708+
for key in ['mlc-mlperf-inference-results', 'mlc-mlperf-inference-results-last']:
709+
if key in r['new_state']:
710+
state[key] = r['new_state'][key]
711+
return {'return': 0}
712+
else:
713+
logger.warning(f"Compliance {test} FAILED after {max_retries} rerun attempt(s). Giving up.")
714+
610715
# portion of the code where the avg utilisation and system information are extracted
611716
# NOTE: The section is under development and print statements are added
612717
# for further debugging

script/app-mlperf-inference/meta.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ input_mapping:
7171
test_query_count: MLC_TEST_QUERY_COUNT
7272
tp_size: MLC_NVIDIA_TP_SIZE
7373
use_dataset_from_host: MLC_USE_DATASET_FROM_HOST
74+
valid_rerun_max: MLC_MLPERF_VALID_RERUN_MAX
7475
input_description:
7576
scenario:
7677
desc: MLPerf inference scenario
@@ -92,6 +93,8 @@ input_description:
9293
desc: Target QPS
9394
target_latency:
9495
desc: Target Latency
96+
valid_rerun_max:
97+
desc: Maximum number of automatic reruns when performance result is invalid in valid mode (default 2)
9598
max_batchsize:
9699
desc: Maximum batchsize to be used
97100
num_threads:

0 commit comments

Comments
 (0)