Skip to content

Commit 8c25a1f

Browse files
committed
FluidX3D v2.6 update: patched OpenCL issues of Intel Arc GPUs: now VRAM allocations >4GB are possible and correct VRAM capacity is reported
1 parent 042f51a commit 8c25a1f

File tree

3 files changed

+24
-7
lines changed

3 files changed

+24
-7
lines changed

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
5555
- improved raytracing framerate when camera is inside fluid
5656
- fixed skybox pole flickering artifacts
5757
- fixed bug where moving objects during re-voxelization would leave an erroneous trail of solid grid cells behind
58+
- v2.6 (16.04.2023)
59+
- patched OpenCL issues of Intel Arc GPUs: now VRAM allocations >4GB are possible and correct VRAM capacity is reported
5860

5961
</details>
6062

@@ -66,8 +68,10 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
6668
- streaming (part 2/2)<p align="center"><i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>)<br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>(<i>t</i>%2 ? <i>i</i> : (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1))</sub>(<i>i</i>%2 ? <i>x</i> : <i>x</i>-<i>e<sub>i</sub></i>, <i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p>
6769
- collision<p align="center"><i>&rho;</i>(<i>x</i>,<i>t</i>) = (&Sigma;<sub><i>i</i></sub> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)) + 1<br><br><i>u</i>(<i>x</i>,<i>t</i>) = <sup>1</sup>&#8725;<sub><i>&rho;</i>(<i>x</i>,<i>t</i>)</sub> &Sigma;<sub><i>i</i></sub> <i>c<sub>i</sub></i> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)<br><br><i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>) = <i>w<sub>i</sub></i> <i>&rho;</i> · (<sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)<sup>2</sup></sup>&#8725;<sub>(2<i>c</i><sup>4</sup>)</sub> - <sup>(<i>u</i><sub>°</sub><i>u</i>)</sup>&#8725;<sub>(2c<sup>2</sup>)</sub> + <sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)</sup>&#8725;<sub><i>c</i><sup>2</sup></sub>) + <i>w<sub>i</sub></i> (<i>&rho;</i>-1)<br><br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) + <i>&Omega;<sub>i</sub></i>(<i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>), <i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>), <i>&tau;</i>)</p>
6870
- streaming (part 1/2)<p align="center"><i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>)<br><i>f</i><sub>(<i>t</i>%2 ? (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1) : <i>i</i>)</sub>(<i>i</i>%2 ? <i>x</i>+<i>e<sub>i</sub></i> : <i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p>
71+
- velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
72+
- collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
6973

70-
</details>
74+
</details>
7175

7276
<!-- markdown equations don't render properly in mobile browser
7377
- streaming (part 2/2):
@@ -206,8 +210,6 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
206210
</details>
207211
- [peak performance on GPUs](#single-gpu-benchmarks) (datacenter/gaming/professional/laptop), validated with roofline model
208212
- [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) and other algebraic optimization to minimize round-off error
209-
- velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
210-
- collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
211213

212214

213215

src/info.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ void Info::print_logo() const {
6767
print("| "); print("\\ \\ / /", c); print(" |\n");
6868
print("| "); print("\\ ' /", c); print(" |\n");
6969
print("| "); print("\\ /", c); print(" |\n");
70-
print("| "); print("\\ /", c); print(" FluidX3D Version 2.5 |\n");
70+
print("| "); print("\\ /", c); print(" FluidX3D Version 2.6 |\n");
7171
print("| "); print("'", c); print(" Copyright (c) Moritz Lehmann |\n");
7272
}
7373
void Info::print_initialize() {

src/opencl.hpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ struct Device_Info {
2424
uint compute_units=0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
2525
uint clock_frequency=0u; // in MHz
2626
bool is_cpu=false, is_gpu=false;
27+
bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
2728
uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
2829
uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
2930
float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -63,6 +64,19 @@ struct Device_Info {
6364
const float arm = (float)(contains(to_lower(vendor), "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU
6465
cores = to_uint((float)compute_units*(nvidia+amd+intel+apple+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
6566
tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
67+
if(intel==8.0f) { // fix wrong global memory reporting for Intel Arc GPUs
68+
if(contains_any(name, {"A770", "0x56a0"})&&(memory==12992u)) memory = 16240u; // fix wrong (80% on Windows) memory reporting on Intel Arc A770 16GB
69+
if(contains_any(name, {"A770", "0x56a0"})&&(memory== 6476u)) memory = 8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A770 8GB
70+
if(contains_any(name, {"A750", "0x56a1"})&&(memory== 6476u)) memory = 8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A750 8GB
71+
if(contains_any(name, {"A580", "0x56a2"})&&(memory== 6476u)) memory = 8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A580 8GB
72+
if(contains_any(name, {"A380", "0x56a5"})&&(memory== 4844u)) memory = 6056u; // fix wrong (80% on Windows) memory reporting on Intel Arc A380 6GB
73+
if(contains_any(name, {"A770", "0x56a0"})&&(memory==15473u)) memory = 16288u; // fix wrong (95% on Linux) memory reporting on Intel Arc A770 16GB
74+
if(contains_any(name, {"A770", "0x56a0"})&&(memory== 7721u)) memory = 8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A770 8GB
75+
if(contains_any(name, {"A750", "0x56a1"})&&(memory== 7721u)) memory = 8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A750 8GB
76+
if(contains_any(name, {"A580", "0x56a2"})&&(memory== 7721u)) memory = 8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A580 8GB
77+
if(contains_any(name, {"A380", "0x56a5"})&&(memory== 5783u)) memory = 6088u; // fix wrong (95% on Linux) memory reporting on Intel Arc A380 6GB; match either the marketing name "A380" or the PCI device ID "0x56a5" (two separate list entries, like the sibling lines)
78+
}
79+
intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
6680
}
6781
inline Device_Info() {}; // default constructor
6882
};
@@ -161,11 +175,12 @@ class Device {
161175
const string kernel_code = enable_device_capabilities()+"\n"+opencl_c_code;
162176
cl_source.push_back({ kernel_code.c_str(), kernel_code.length() });
163177
this->cl_program = cl::Program(info.cl_context, cl_source);
178+
const string build_options = string("-cl-fast-relaxed-math")+(info.intel_gpu_above_4gb_patch ? " -cl-intel-greater-than-4GB-buffer-required" : "");
164179
#ifndef LOG
165-
int error = cl_program.build({ info.cl_device }, "-cl-fast-relaxed-math -w"); // compile OpenCL C code, disable warnings
180+
int error = cl_program.build({ info.cl_device }, (build_options+" -w").c_str()); // compile OpenCL C code, disable warnings
166181
if(error) print_warning(cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device)); // print build log
167182
#else // LOG, generate logfile for OpenCL code compilation
168-
int error = cl_program.build({ info.cl_device }, "-cl-fast-relaxed-math"); // compile OpenCL C code
183+
int error = cl_program.build({ info.cl_device }, build_options.c_str()); // compile OpenCL C code
169184
const string log = cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device);
170185
write_file("bin/kernel.log", log); // save build log
171186
if((uint)log.length()>2u) print_warning(log); // print build log
@@ -210,7 +225,7 @@ template<typename T> class Memory {
210225
device.info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage
211226
if(device.info.memory_used>device.info.memory) print_error("Device \""+device.info.name+"\" does not have enough memory. Allocating another "+to_string((uint)(capacity()/1048576ull))+" MB would use a total of "+to_string(device.info.memory_used)+" MB / "+to_string(device.info.memory)+" MB.");
212227
int error = 0;
213-
device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE, capacity(), nullptr, &error);
228+
device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE|((int)device.info.intel_gpu_above_4gb_patch<<23), capacity(), nullptr, &error); // for Intel GPUs, set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
214229
if(error==-61) print_error("Memory size is too large at "+to_string((uint)(capacity()/1048576ull))+" MB. Device \""+device.info.name+"\" accepts a maximum buffer size of "+to_string(device.info.max_global_buffer)+" MB.");
215230
else if(error) print_error("Device buffer allocation failed with error code "+to_string(error)+".");
216231
device_buffer_exists = true;

0 commit comments

Comments
 (0)