Skip to content

Commit c7e8987

Browse files
committed
Cosmetics
1 parent d0110bb commit c7e8987

File tree

2 files changed

+30
-26
lines changed

2 files changed

+30
-26
lines changed

README.md

+7-3
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,13 @@ Works with any GPU in Windows, Linux, macOS and Android.
77

88

99
## Measurements
10-
- compute performance (FP64, FP32, FP16, INT64, INT32, INT16, INT8)
11-
- memory bandwidth (coalesced/misaligned read/write)
12-
- PCIe bandwidth (send/receive/bidirectional)
10+
- compute performance (`FP64`, `FP32`, `FP16`, `INT64`, `INT32`, `INT16`, `INT8`)
11+
- the closest possible fraction/multiplier of `measured compute performance` divided by `reported theoretical FP32 performance` is shown in `(round brackets)`
12+
- for example, when OpenCL reports `19.492` TFLOPs/s theoretical FP32 and the benchmark measures `9.512` TFLOPs/s for FP64, the ratio `(measured FP64)/(theoretical FP32) = 9.512/19.492 = 1/2.05` is rounded to the nearest possible value of `1/2` and reported as such
13+
- these ratios for any GPU/CPU architecture can only be one of `1/64`, `1/32`, `1/24`, `1/16`, `1/12`, `1/8`, `1/4`, `1/3`, `1/2`, `2/3`, `1x`, `2x`, `4x`, `8x`, `16x`, `32x`, `64x`, and nothing in between
14+
- memory bandwidth (`coalesced`/`misaligned` `read`/`write`)
15+
- PCIe bandwidth (`send`/`receive`/`bidirectional`)
16+
- PCIe Gen is estimated based on the measured PCIe bandwidth and an assumed x16 link width
1317

1418

1519

src/opencl.hpp

+23-23
Original file line numberDiff line numberDiff line change
@@ -54,21 +54,21 @@ sudo shutdown -r now
5454
)"+string("\033[96m")+R"(.-----------------------------------------------------------------------------.
5555
| CPU Option 1: Intel CPU Runtime for OpenCL (works for both AMD/Intel CPUs) |
5656
'-----------------------------------------------------------------------------'
57-
export OCLCPUEXP_VERSION="2024.18.6.0.02_rel"
58-
export ONEAPI_TBB_VERSION="2021.13.0"
57+
export OCLV="2024.18.6.0.02_rel"
58+
export TBBV="2021.13.0"
5959
sudo apt update && sudo apt upgrade -y
6060
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev
61-
sudo mkdir -p ~/cpuruntime /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION} /etc/OpenCL/vendors /etc/ld.so.conf.d
62-
sudo wget -P ~/cpuruntime https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLCPUEXP_VERSION}.tar.gz
63-
sudo wget -P ~/cpuruntime https://github.com/oneapi-src/oneTBB/releases/download/v${ONEAPI_TBB_VERSION}/oneapi-tbb-${ONEAPI_TBB_VERSION}-lin.tgz
64-
sudo tar -zxvf ~/cpuruntime/oclcpuexp-${OCLCPUEXP_VERSION}.tar.gz -C /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}
65-
sudo tar -zxvf ~/cpuruntime/oneapi-tbb-${ONEAPI_TBB_VERSION}-lin.tgz -C /opt/intel
66-
echo /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd
67-
echo /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf
68-
sudo ln -sf /opt/intel/oneapi-tbb-${ONEAPI_TBB_VERSION}/lib/intel64/gcc4.8/libtbb.so /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64
69-
sudo ln -sf /opt/intel/oneapi-tbb-${ONEAPI_TBB_VERSION}/lib/intel64/gcc4.8/libtbbmalloc.so /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64
70-
sudo ln -sf /opt/intel/oneapi-tbb-${ONEAPI_TBB_VERSION}/lib/intel64/gcc4.8/libtbb.so.12 /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64
71-
sudo ln -sf /opt/intel/oneapi-tbb-${ONEAPI_TBB_VERSION}/lib/intel64/gcc4.8/libtbbmalloc.so.2 /opt/intel/oclcpuexp_${OCLCPUEXP_VERSION}/x64
61+
sudo mkdir -p ~/cpuruntime /opt/intel/oclcpuexp_${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d
62+
sudo wget -P ~/cpuruntime https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLV}.tar.gz
63+
sudo wget -P ~/cpuruntime https://github.com/oneapi-src/oneTBB/releases/download/v${TBBV}/oneapi-tbb-${TBBV}-lin.tgz
64+
sudo tar -zxvf ~/cpuruntime/oclcpuexp-${OCLV}.tar.gz -C /opt/intel/oclcpuexp_${OCLV}
65+
sudo tar -zxvf ~/cpuruntime/oneapi-tbb-${TBBV}-lin.tgz -C /opt/intel
66+
echo /opt/intel/oclcpuexp_${OCLV}/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd
67+
echo /opt/intel/oclcpuexp_${OCLV}/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf
68+
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbb.so /opt/intel/oclcpuexp_${OCLV}/x64
69+
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so /opt/intel/oclcpuexp_${OCLV}/x64
70+
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbb.so.12 /opt/intel/oclcpuexp_${OCLV}/x64
71+
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so.2 /opt/intel/oclcpuexp_${OCLV}/x64
7272
sudo ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf
7373
sudo rm -r ~/cpuruntime
7474

@@ -85,20 +85,20 @@ struct Device_Info {
8585
cl::Device cl_device; // OpenCL device
8686
cl::Context cl_context; // multiple devices in the same context can communicate buffers
8787
uint id = 0u; // unique device ID assigned by get_devices()
88-
string name, vendor; // device name, vendor
89-
string driver_version, opencl_c_version; // device driver version, OpenCL C version
90-
uint memory=0u; // global memory in MB
91-
uint memory_used=0u; // track global memory usage in MB
88+
string name="", vendor=""; // device name, vendor
89+
string driver_version="", opencl_c_version=""; // device driver version, OpenCL C version
90+
uint memory = 0u; // global memory in MB
91+
uint memory_used = 0u; // track global memory usage in MB
9292
uint global_cache=0u, local_cache=0u; // global cache in KB, local cache in KB
9393
uint max_global_buffer=0u, max_constant_buffer=0u; // maximum global buffer size in MB, maximum constant buffer size in KB
94-
uint compute_units=0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
95-
uint clock_frequency=0u; // in MHz
94+
uint compute_units = 0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
95+
uint clock_frequency = 0u; // in MHz
9696
bool is_cpu=false, is_gpu=false;
9797
bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
9898
bool legacy_gpu_fma_patch = false; // some old GPUs have terrible fma performance, so replace with a*b+c
9999
uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
100-
uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
101-
float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
100+
uint cores = 0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
101+
float tflops = 0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
102102
inline Device_Info(const cl::Device& cl_device, const cl::Context& cl_context, const uint id) {
103103
this->cl_device = cl_device; // see https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
104104
this->cl_context = cl_context;
@@ -565,14 +565,14 @@ class Kernel {
565565
if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor.");
566566
this->name = name;
567567
cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str());
568-
link_parameters(number_of_parameters, parameters...); // expand variadic template to link kernel parameters
568+
link_parameters(0u, parameters...); // expand variadic template to link kernel parameters
569569
set_ranges(N);
570570
cl_queue = device.get_cl_queue();
571571
}
572572
template<class... T> inline Kernel(const Device& device, const ulong N, const uint workgroup_size, const string& name, const T&... parameters) { // accepts Memory<T> objects and fundamental data type constants
573573
if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor.");
574574
cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str());
575-
link_parameters(number_of_parameters, parameters...); // expand variadic template to link kernel parameters
575+
link_parameters(0u, parameters...); // expand variadic template to link kernel parameters
576576
set_ranges(N, (ulong)workgroup_size);
577577
cl_queue = device.get_cl_queue();
578578
}

0 commit comments

Comments
 (0)