@@ -38,8 +38,8 @@ struct Args {
3838 /// Tolerate software throttling if the TFLOPS are in the acceptable range
3939 #[ clap( long, default_value = "false" ) ]
4040 tolerate_software_throttling : bool ,
41- /// TFLOPS tolerance (%) from the average
42- /// If the TFLOPS are within this range , test pass
41+ /// TFLOPS tolerance (%) compared to best GPU
42+ /// If a GPU's average TFLOPS are within `tflops_tolerance`% of the best-performing GPU, the test will pass
4343 #[ clap( long, default_value = "10" ) ]
4444 tflops_tolerance : f64 ,
4545}
@@ -378,15 +378,16 @@ fn are_gpus_healthy(
378378 tolerate_software_throttling : bool ,
379379) -> ( bool , Vec < String > ) {
380380 let mut reasons = vec ! [ ] ;
381- let mut avg_flops = 0.0 ;
382- for r in burn_results. iter ( ) {
383- avg_flops += r. flops_avg ( ) ;
384- }
385- avg_flops /= burn_results. len ( ) as f64 ;
381+ // acceptable_flops is the pass threshold: the best GPU's average flops lowered by tflops_tolerance%
382+ let acceptable_flops: f64 = burn_results
383+ . iter ( )
384+ . map ( |r| r. flops_avg ( ) )
385+ . fold ( 0. , |max, avg| {
386+ max. max ( avg * ( 100. - tflops_tolerance) / 100. )
387+ } ) ;
386388 for r in burn_results. iter ( ) {
387389 let mut low_flops = false ;
388- // if we have less than tflops_tolerance difference in average flops between GPUs
389- if ( r. flops_avg ( ) - avg_flops) . abs ( ) > tflops_tolerance / 100. * avg_flops {
390+ if r. flops_avg ( ) < acceptable_flops {
390391 reasons. push ( format ! ( "GPU {} - " , r. gpu_idx) + GPU_FLOPS_REASON ) ;
391392 low_flops = true ;
392393 }
0 commit comments