Skip to content

Commit 7ce1f5e

Browse files
Fix peak_flops calculation for generic OpenCL GPU:
- If the reported clock is 1 MHz, use 1 GHz instead (Adreno reports this for some reason).
- Peak FLOPS is clock * # CUs * ALUs per CU. OpenCL doesn't tell us the latter, so default to 8. If the GPU is an Adreno, use 128; it could be anywhere from 32 to 256. We'd need to look at the model to decide, and I don't want to get into that level right now.
1 parent b5730ff commit 7ce1f5e

1 file changed

Lines changed: 27 additions & 5 deletions

File tree

client/gpu_opencl.cpp

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -741,15 +741,37 @@ void COPROCS::get_opencl(
741741
// TODO: is there a better way to estimate peak_flops?
742742
//
743743
prop.peak_flops = 0;
744-
if (prop.max_compute_units) {
745-
double freq = ((double)prop.max_clock_frequency) * MEGA;
746-
prop.peak_flops = ((double)prop.max_compute_units) * freq;
744+
745+
double freq;
746+
if (prop.max_clock_frequency == 1) {
747+
// Adreno reports 1MHz, which is not correct.
748+
// Actual rate could be 155 MHz to 1.5 GHz;
749+
// split the difference.
750+
freq = 1e9;
751+
} else {
752+
freq = ((double)prop.max_clock_frequency) * 1e6;
753+
}
754+
755+
// OpenCL doesn't tell us this critical parameter;
756+
// it varies between manufacturer and model.
757+
// For recent Intel GPUs it's 8; we'll use this as a default
758+
//
759+
int alus_per_compute_unit = 8;
760+
761+
// other manufacturers
762+
//
763+
if (strcasestr(prop.vendor, "QUALCOMM")) {
764+
// can be 32 to 256; most are 128
765+
alus_per_compute_unit = 128;
747766
}
767+
768+
prop.peak_flops = freq * prop.max_compute_units * alus_per_compute_unit;
748769
if (prop.peak_flops <= 0 || prop.peak_flops > GPU_MAX_PEAK_FLOPS) {
749770
char buf2[256];
750771
snprintf(buf2, sizeof(buf2),
751-
"OpenCL generic: bad peak FLOPS; Max units %u, max freq %u MHz",
752-
prop.max_compute_units, prop.max_clock_frequency
772+
"OpenCL generic: bad peak FLOPS; Max units %u, max freq %u MHz ALUs per CU %d",
773+
prop.max_compute_units, prop.max_clock_frequency,
774+
alus_per_compute_unit
753775
);
754776
gpu_warning(warnings, buf2);
755777
prop.peak_flops = GPU_DEFAULT_PEAK_FLOPS;

0 commit comments

Comments
 (0)