Skip to content

Commit ca39a94

Browse files
authored
tflops_tolerance is based on the fastest GPU (#1)
* update --tflops-tolerance to compare with the best gpu
1 parent 62a13ef commit ca39a94

File tree

2 files changed

+11
-10
lines changed

2 files changed

+11
-10
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ Options:
4141
--tolerate-software-throttling
4242
Tolerate software throttling if the TFLOPS are in the acceptable range
4343
--tflops-tolerance <TFLOPS_TOLERANCE>
44-
TFLOPS tolerance (%) from the average If the TFLOPS are within this range, test pass [default: 10]
44+
TFLOPS tolerance (%) compared to the best GPU. If the TFLOPS are within `tflops_tolerance`% of the best performing GPU, the test will pass [default: 10]
4545
-h, --help
4646
Print help
4747
-V, --version

src/main.rs

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ struct Args {
3838
/// Tolerate software throttling if the TFLOPS are in the acceptable range
3939
#[clap(long, default_value = "false")]
4040
tolerate_software_throttling: bool,
41-
/// TFLOPS tolerance (%) from the average
42-
/// If the TFLOPS are within this range, test pass
41+
/// TFLOPS tolerance (%) compared to best GPU
42+
/// If the TFLOPS are within `tflops_tolerance`% of the best performing GPU, test will pass
4343
#[clap(long, default_value = "10")]
4444
tflops_tolerance: f64,
4545
}
@@ -378,15 +378,16 @@ fn are_gpus_healthy(
378378
tolerate_software_throttling: bool,
379379
) -> (bool, Vec<String>) {
380380
let mut reasons = vec![];
381-
let mut avg_flops = 0.0;
382-
for r in burn_results.iter() {
383-
avg_flops += r.flops_avg();
384-
}
385-
avg_flops /= burn_results.len() as f64;
381+
// acceptable_flops is tflops_tolerance% lower than best gpu avg flops
382+
let acceptable_flops: f64 = burn_results
383+
.iter()
384+
.map(|r| r.flops_avg())
385+
.fold(0., |max, avg| {
386+
max.max(avg * (100. - tflops_tolerance) / 100.)
387+
});
386388
for r in burn_results.iter() {
387389
let mut low_flops = false;
388-
// if we have less than tflops_tolerance difference in average flops between GPUs
389-
if (r.flops_avg() - avg_flops).abs() > tflops_tolerance / 100. * avg_flops {
390+
if r.flops_avg() < acceptable_flops {
390391
reasons.push(format!("GPU {} - ", r.gpu_idx) + GPU_FLOPS_REASON);
391392
low_flops = true;
392393
}

0 commit comments

Comments
 (0)