@@ -378,6 +378,72 @@ def postprocess(i):
378378 ][model ][scenario ]['power_efficiency' ] = power_efficiency
379379 state ['mlc-mlperf-inference-results-last' ]['power_efficiency' ] = power_efficiency
380380
381+ # Automatic rerun for valid mode if performance result is invalid
382+ if mode == "performance" and not valid .get ('performance' , False ) and \
383+ env .get ('MLC_MLPERF_RUN_STYLE' , '' ) == 'valid' :
384+ retry_count = int (env .get ('MLC_MLPERF_VALID_RERUN_COUNT' , '0' ))
385+ max_retries = int (env .get ('MLC_MLPERF_VALID_RERUN_MAX' , '2' ))
386+ if retry_count < max_retries :
387+ retry_count += 1
388+ logger .warning (f"Performance result is INVALID. Automatically rerunning with adjusted parameters (attempt { retry_count } /{ max_retries } )..." )
389+
390+ rerun_env = copy .deepcopy (env )
391+
392+ if scenario == "Server" :
393+ # Use loadgen's FindPeakPerformance mode to discover the optimal QPS
394+ logger .info ("Using loadgen FindPeakPerformance mode to discover optimal Server target QPS..." )
395+ extra_opts = rerun_env .get ('MLC_MLPERF_LOADGEN_EXTRA_OPTIONS' , '' )
396+ if '--find-peak-performance' not in extra_opts :
397+ rerun_env ['MLC_MLPERF_LOADGEN_EXTRA_OPTIONS' ] = extra_opts + ' --find-peak-performance '
398+ elif scenario == "Offline" :
399+ # For Offline, reduce target QPS to 90% of measured
400+ measured_value = float (result )
401+ adjusted_value = str (round (measured_value * 0.9 , 3 ))
402+ logger .info (f"Adjusting target_qps from measured { result } to { adjusted_value } (90%)" )
403+ rerun_env ['MLC_MLPERF_LOADGEN_TARGET_QPS' ] = adjusted_value
404+ rerun_env ['MLC_MLPERF_LOADGEN_OFFLINE_TARGET_QPS' ] = adjusted_value
405+ elif scenario .endswith ("Stream" ):
406+ # For Stream, increase target latency to 110% of measured
407+ measured_value = float (result )
408+ adjusted_value = str (round (measured_value * 1.1 , 3 ))
409+ logger .info (f"Adjusting target_latency from measured { result } ms to { adjusted_value } ms (110%)" )
410+ rerun_env ['MLC_MLPERF_LOADGEN_TARGET_LATENCY' ] = adjusted_value
411+ if scenario == "SingleStream" :
412+ rerun_env ['MLC_MLPERF_LOADGEN_SINGLESTREAM_TARGET_LATENCY' ] = adjusted_value
413+ elif scenario == "MultiStream" :
414+ rerun_env ['MLC_MLPERF_LOADGEN_MULTISTREAM_TARGET_LATENCY' ] = adjusted_value
415+
416+ rerun_env ['MLC_MLPERF_VALID_RERUN_COUNT' ] = str (retry_count )
417+ # Remove cached output path so a fresh run is triggered
418+ rerun_env .pop ('MLC_MLPERF_OUTPUT_DIR' , None )
419+
420+ # Re-trigger the same script via mlc.access
421+ script_tags = inp .get ('tags' , '' )
422+ script_adr = inp .get ('adr' , {})
423+
424+ mlc_input = {
425+ 'action' : 'run' ,
426+ 'automation' : 'script' ,
427+ 'tags' : script_tags ,
428+ 'adr' : script_adr ,
429+ 'env' : rerun_env ,
430+ 'quiet' : True ,
431+ }
432+ logger .info (f"Re-triggering MLPerf inference run..." )
433+ r = mlc .access (mlc_input )
434+ if r ['return' ] > 0 :
435+ return r
436+
437+ # Update state with results from the rerun
438+ if 'new_state' in r :
439+ for key in ['mlc-mlperf-inference-results' , 'mlc-mlperf-inference-results-last' ]:
440+ if key in r ['new_state' ]:
441+ state [key ] = r ['new_state' ][key ]
442+ return {'return' : 0 }
443+ else :
444+ logger .warning (f"Performance result is INVALID after { max_retries } rerun attempt(s). Giving up." )
445+
446+
381447 # Record basic host info
382448 host_info = {
383449 "os_version" : platform .platform (),
@@ -607,6 +673,45 @@ def postprocess(i):
607673 state ['mlc-mlperf-inference-results' ][state ['MLC_SUT_CONFIG_NAME' ]
608674 ][model ][scenario ][test ] = "passed" if is_valid else "failed"
609675
676+ # Automatic rerun for compliance if check failed
677+ if not is_valid and env .get ('MLC_MLPERF_RUN_STYLE' , '' ) == 'valid' :
678+ retry_count = int (env .get ('MLC_MLPERF_COMPLIANCE_RERUN_COUNT' , '0' ))
679+ max_retries = int (env .get ('MLC_MLPERF_COMPLIANCE_RERUN_MAX' , '3' ))
680+ if retry_count < max_retries :
681+ retry_count += 1
682+ logger .warning (f"Compliance { test } check FAILED. Automatically rerunning (attempt { retry_count } /{ max_retries } )..." )
683+
684+ rerun_env = copy .deepcopy (env )
685+ rerun_env ['MLC_MLPERF_COMPLIANCE_RERUN_COUNT' ] = str (retry_count )
686+ # Remove cached output path so a fresh run is triggered
687+ rerun_env .pop ('MLC_MLPERF_OUTPUT_DIR' , None )
688+
689+ # Re-trigger the same script via mlc.access
690+ script_tags = inp .get ('tags' , '' )
691+ script_adr = inp .get ('adr' , {})
692+
693+ mlc_input = {
694+ 'action' : 'run' ,
695+ 'automation' : 'script' ,
696+ 'tags' : script_tags ,
697+ 'adr' : script_adr ,
698+ 'env' : rerun_env ,
699+ 'quiet' : True ,
700+ }
701+ logger .info (f"Re-triggering compliance { test } run..." )
702+ r = mlc .access (mlc_input )
703+ if r ['return' ] > 0 :
704+ return r
705+
706+ # Update state with results from the rerun
707+ if 'new_state' in r :
708+ for key in ['mlc-mlperf-inference-results' , 'mlc-mlperf-inference-results-last' ]:
709+ if key in r ['new_state' ]:
710+ state [key ] = r ['new_state' ][key ]
711+ return {'return' : 0 }
712+ else :
713+ logger .warning (f"Compliance { test } FAILED after { max_retries } rerun attempt(s). Giving up." )
714+
610715 # Portion of the code where the average utilisation and system information are extracted.
611716 # NOTE: This section is under development, and print statements are added
612717 # for further debugging.
0 commit comments