Added retry logic for mlperf-inference runs

amd-arsuresh · amd-arsuresh · commit a340bb820d49 · 2026-04-20T21:26:29.000+05:30
diff --git a/script/README.md b/script/README.md
@@ -1,6 +1,6 @@
 # MLCommons Automation Scripts
 
-*Last updated: 2026-04-20 17:41:59*
+*Last updated: 2026-04-20 21:26:30*
 
 This directory contains automation scripts for MLPerf benchmarks, AI/ML workflows, and development operations.
 
diff --git a/script/app-mlperf-inference/customize.py b/script/app-mlperf-inference/customize.py
@@ -378,6 +378,72 @@ def postprocess(i):
                                                   ][model][scenario]['power_efficiency'] = power_efficiency
             state['mlc-mlperf-inference-results-last']['power_efficiency'] = power_efficiency
 
+        # Automatic rerun for valid mode if performance result is invalid
+        if mode == "performance" and not valid.get('performance', False) and \
+                env.get('MLC_MLPERF_RUN_STYLE', '') == 'valid':
+            retry_count = int(env.get('MLC_MLPERF_VALID_RERUN_COUNT', '0'))
+            max_retries = int(env.get('MLC_MLPERF_VALID_RERUN_MAX', '2'))
+            if retry_count < max_retries:
+                retry_count += 1
+                logger.warning(f"Performance result is INVALID. Automatically rerunning with adjusted parameters (attempt {retry_count}/{max_retries})...")
+
+                rerun_env = copy.deepcopy(env)
+
+                if scenario == "Server":
+                    # Use loadgen's FindPeakPerformance mode to discover the optimal QPS
+                    logger.info("Using loadgen FindPeakPerformance mode to discover optimal Server target QPS...")
+                    extra_opts = rerun_env.get('MLC_MLPERF_LOADGEN_EXTRA_OPTIONS', '')
+                    if '--find-peak-performance' not in extra_opts:
+                        rerun_env['MLC_MLPERF_LOADGEN_EXTRA_OPTIONS'] = extra_opts + ' --find-peak-performance '
+                elif scenario == "Offline":
+                    # For Offline, reduce target QPS to 90% of measured
+                    measured_value = float(result)
+                    adjusted_value = str(round(measured_value * 0.9, 3))
+                    logger.info(f"Adjusting target_qps from measured {result} to {adjusted_value} (90%)")
+                    rerun_env['MLC_MLPERF_LOADGEN_TARGET_QPS'] = adjusted_value
+                    rerun_env['MLC_MLPERF_LOADGEN_OFFLINE_TARGET_QPS'] = adjusted_value
+                elif scenario.endswith("Stream"):
+                    # For Stream, increase target latency to 110% of measured
+                    measured_value = float(result)
+                    adjusted_value = str(round(measured_value * 1.1, 3))
+                    logger.info(f"Adjusting target_latency from measured {result} ms to {adjusted_value} ms (110%)")
+                    rerun_env['MLC_MLPERF_LOADGEN_TARGET_LATENCY'] = adjusted_value
+                    if scenario == "SingleStream":
+                        rerun_env['MLC_MLPERF_LOADGEN_SINGLESTREAM_TARGET_LATENCY'] = adjusted_value
+                    elif scenario == "MultiStream":
+                        rerun_env['MLC_MLPERF_LOADGEN_MULTISTREAM_TARGET_LATENCY'] = adjusted_value
+
+                rerun_env['MLC_MLPERF_VALID_RERUN_COUNT'] = str(retry_count)
+                # Remove cached output path so a fresh run is triggered
+                rerun_env.pop('MLC_MLPERF_OUTPUT_DIR', None)
+
+                # Re-trigger the same script via mlc.access
+                script_tags = inp.get('tags', '')
+                script_adr = inp.get('adr', {})
+
+                mlc_input = {
+                    'action': 'run',
+                    'automation': 'script',
+                    'tags': script_tags,
+                    'adr': script_adr,
+                    'env': rerun_env,
+                    'quiet': True,
+                }
+                logger.info(f"Re-triggering MLPerf inference run...")
+                r = mlc.access(mlc_input)
+                if r['return'] > 0:
+                    return r
+
+                # Update state with results from the rerun
+                if 'new_state' in r:
+                    for key in ['mlc-mlperf-inference-results', 'mlc-mlperf-inference-results-last']:
+                        if key in r['new_state']:
+                            state[key] = r['new_state'][key]
+                return {'return': 0}
+            else:
+                logger.warning(f"Performance result is INVALID after {max_retries} rerun attempt(s). Giving up.")
+
+
         # Record basic host info
         host_info = {
             "os_version": platform.platform(),
@@ -607,6 +673,45 @@ def postprocess(i):
         state['mlc-mlperf-inference-results'][state['MLC_SUT_CONFIG_NAME']
                                               ][model][scenario][test] = "passed" if is_valid else "failed"
 
+        # Automatic rerun for compliance if check failed
+        if not is_valid and env.get('MLC_MLPERF_RUN_STYLE', '') == 'valid':
+            retry_count = int(env.get('MLC_MLPERF_COMPLIANCE_RERUN_COUNT', '0'))
+            max_retries = int(env.get('MLC_MLPERF_COMPLIANCE_RERUN_MAX', '3'))
+            if retry_count < max_retries:
+                retry_count += 1
+                logger.warning(f"Compliance {test} check FAILED. Automatically rerunning (attempt {retry_count}/{max_retries})...")
+
+                rerun_env = copy.deepcopy(env)
+                rerun_env['MLC_MLPERF_COMPLIANCE_RERUN_COUNT'] = str(retry_count)
+                # Remove cached output path so a fresh run is triggered
+                rerun_env.pop('MLC_MLPERF_OUTPUT_DIR', None)
+
+                # Re-trigger the same script via mlc.access
+                script_tags = inp.get('tags', '')
+                script_adr = inp.get('adr', {})
+
+                mlc_input = {
+                    'action': 'run',
+                    'automation': 'script',
+                    'tags': script_tags,
+                    'adr': script_adr,
+                    'env': rerun_env,
+                    'quiet': True,
+                }
+                logger.info(f"Re-triggering compliance {test} run...")
+                r = mlc.access(mlc_input)
+                if r['return'] > 0:
+                    return r
+
+                # Update state with results from the rerun
+                if 'new_state' in r:
+                    for key in ['mlc-mlperf-inference-results', 'mlc-mlperf-inference-results-last']:
+                        if key in r['new_state']:
+                            state[key] = r['new_state'][key]
+                return {'return': 0}
+            else:
+                logger.warning(f"Compliance {test} FAILED after {max_retries} rerun attempt(s). Giving up.")
+
     # portion of the code where the avg utilisation and system informations are extracted
     # NOTE: The section is under development and print statements are added
     # for further debugging
diff --git a/script/app-mlperf-inference/meta.yaml b/script/app-mlperf-inference/meta.yaml
@@ -71,6 +71,7 @@ input_mapping:
   test_query_count: MLC_TEST_QUERY_COUNT
   tp_size: MLC_NVIDIA_TP_SIZE
   use_dataset_from_host: MLC_USE_DATASET_FROM_HOST
+  valid_rerun_max: MLC_MLPERF_VALID_RERUN_MAX
 input_description:
   scenario:
     desc: MLPerf inference scenario
@@ -92,6 +93,8 @@ input_description:
     desc: Target QPS
   target_latency:
     desc: Target Latency
+  valid_rerun_max:
+    desc: Maximum number of automatic reruns when the performance result is invalid and the run style is valid (default 2)
   max_batchsize:
     desc: Maximum batchsize to be used
   num_threads: