FluidX3D v2.6 update: patched OpenCL issues of Intel Arc GPUs: now VRAM allocations >4GB are possible and correct VRAM capacity is reported

ProjectPhysX · ProjectPhysX · commit 8c25a1f6624c · 2023-04-16T12:16:31.000+02:00
diff --git a/README.md b/README.md
@@ -55,6 +55,8 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - improved raytracing framerate when camera is inside fluid
   - fixed skybox pole flickering artifacts
   - fixed bug where moving objects during re-voxelization would leave an erroneous trail of solid grid cells behind
+- v2.6 (16.04.2023)
+  - patched OpenCL issues of Intel Arc GPUs: now VRAM allocations >4GB are possible and correct VRAM capacity is reported
 
 </details>
 
@@ -66,8 +68,10 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - streaming (part 2/2)<p align="center"><i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>)<br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>(<i>t</i>%2 ? <i>i</i> : (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1))</sub>(<i>i</i>%2 ? <i>x</i> : <i>x</i>-<i>e<sub>i</sub></i>, <i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p>
   - collision<p align="center"><i>&rho;</i>(<i>x</i>,<i>t</i>) = (&Sigma;<sub><i>i</i></sub> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)) + 1<br><br><i>u</i>(<i>x</i>,<i>t</i>) = <sup>1</sup>&#8725;<sub><i>&rho;</i>(<i>x</i>,<i>t</i>)</sub> &Sigma;<sub><i>i</i></sub> <i>c<sub>i</sub></i> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)<br><br><i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>) = <i>w<sub>i</sub></i> <i>&rho;</i> · (<sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)<sup>2</sup></sup>&#8725;<sub>(2<i>c</i><sup>4</sup>)</sub> - <sup>(<i>u</i><sub>°</sub><i>u</i>)</sup>&#8725;<sub>(2c<sup>2</sup>)</sub> + <sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)</sup>&#8725;<sub><i>c</i><sup>2</sup></sub>) + <i>w<sub>i</sub></i> (<i>&rho;</i>-1)<br><br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) + <i>&Omega;<sub>i</sub></i>(<i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>), <i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>), <i>&tau;</i>)</p>
   - streaming (part 1/2)<p align="center"><i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>)<br><i>f</i><sub>(<i>t</i>%2 ? (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1) : <i>i</i>)</sub>(<i>i</i>%2 ? <i>x</i>+<i>e<sub>i</sub></i> : <i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p>
+  - velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
+  - collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
 
-</details>
+  </details>
 
 <!-- markdown equations don't render properly in mobile browser
   - streaming (part 2/2):
@@ -206,8 +210,6 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
   </details>
 - [peak performance on GPUs](#single-gpu-benchmarks) (datacenter/gaming/professional/laptop), validated with roofline model
 - [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) and other algebraic optimization to minimize round-off error
-- velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
-- collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
 
 
 
diff --git a/src/info.cpp b/src/info.cpp
@@ -67,7 +67,7 @@ void Info::print_logo() const {
 	print("|                                  ");                print("\\  \\ /  /", c);                 print("                                  |\n");
 	print("|                                   ");                print("\\  '  /", c);                  print("                                   |\n");
 	print("|                                    ");                print("\\   /", c);                  print("                                    |\n");
-	print("|                                     ");                print("\\ /", c);                  print("                FluidX3D Version 2.5 |\n");
+	print("|                                     ");                print("\\ /", c);                  print("                FluidX3D Version 2.6 |\n");
 	print("|                                      ");                 print("'", c);                  print("         Copyright (c) Moritz Lehmann |\n");
 }
 void Info::print_initialize() {
diff --git a/src/opencl.hpp b/src/opencl.hpp
@@ -24,6 +24,7 @@ struct Device_Info {
 	uint compute_units=0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
 	uint clock_frequency=0u; // in MHz
 	bool is_cpu=false, is_gpu=false;
+	bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
 	uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
 	uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 	float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -63,6 +64,19 @@ struct Device_Info {
 		const float arm = (float)(contains(to_lower(vendor), "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU
 		cores = to_uint((float)compute_units*(nvidia+amd+intel+apple+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 		tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
+		if(intel==8.0f) { // fix wrong global memory reporting for Intel Arc GPUs
+			if(contains_any(name, {"A770", "0x56a0"})&&(memory==12992u)) memory = 16240u; // fix wrong (80% on Windows) memory reporting on Intel Arc A770 16GB
+			if(contains_any(name, {"A770", "0x56a0"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A770 8GB
+			if(contains_any(name, {"A750", "0x56a1"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A750 8GB
+			if(contains_any(name, {"A580", "0x56a2"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A580 8GB
+			if(contains_any(name, {"A380", "0x56a5"})&&(memory== 4844u)) memory =  6056u; // fix wrong (80% on Windows) memory reporting on Intel Arc A380 6GB
+			if(contains_any(name, {"A770", "0x56a0"})&&(memory==15473u)) memory = 16288u; // fix wrong (95% on Linux) memory reporting on Intel Arc A770 16GB
+			if(contains_any(name, {"A770", "0x56a0"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A770 8GB
+			if(contains_any(name, {"A750", "0x56a1"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A750 8GB
+			if(contains_any(name, {"A580", "0x56a2"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A580 8GB
+			if(contains_any(name, {"A380", "0x56a5"})&&(memory== 5783u)) memory =  6088u; // fix wrong (95% on Linux) memory reporting on Intel Arc A380 6GB
+		}
+		intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
 	}
 	inline Device_Info() {}; // default constructor
 };
@@ -161,11 +175,12 @@ class Device {
 		const string kernel_code = enable_device_capabilities()+"\n"+opencl_c_code;
 		cl_source.push_back({ kernel_code.c_str(), kernel_code.length() });
 		this->cl_program = cl::Program(info.cl_context, cl_source);
+		const string build_options = string("-cl-fast-relaxed-math")+(info.intel_gpu_above_4gb_patch ? " -cl-intel-greater-than-4GB-buffer-required" : "");
 #ifndef LOG
-		int error = cl_program.build({ info.cl_device }, "-cl-fast-relaxed-math -w"); // compile OpenCL C code, disable warnings
+		int error = cl_program.build({ info.cl_device }, (build_options+" -w").c_str()); // compile OpenCL C code, disable warnings
 		if(error) print_warning(cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device)); // print build log
 #else // LOG, generate logfile for OpenCL code compilation
-		int error = cl_program.build({ info.cl_device }, "-cl-fast-relaxed-math"); // compile OpenCL C code
+		int error = cl_program.build({ info.cl_device }, build_options.c_str()); // compile OpenCL C code
 		const string log = cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device);
 		write_file("bin/kernel.log", log); // save build log
 		if((uint)log.length()>2u) print_warning(log); // print build log
@@ -210,7 +225,7 @@ template<typename T> class Memory {
 			device.info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage
 			if(device.info.memory_used>device.info.memory) print_error("Device \""+device.info.name+"\" does not have enough memory. Allocating another "+to_string((uint)(capacity()/1048576ull))+" MB would use a total of "+to_string(device.info.memory_used)+" MB / "+to_string(device.info.memory)+" MB.");
 			int error = 0;
-			device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE, capacity(), nullptr, &error);
+			device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE|((int)device.info.intel_gpu_above_4gb_patch<<23), capacity(), nullptr, &error); // for Intel GPUs, set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
 			if(error==-61) print_error("Memory size is too large at "+to_string((uint)(capacity()/1048576ull))+" MB. Device \""+device.info.name+"\" accepts a maximum buffer size of "+to_string(device.info.max_global_buffer)+" MB.");
 			else if(error) print_error("Device buffer allocation failed with error code "+to_string(error)+".");
 			device_buffer_exists = true;

Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,7 @@ void Info::print_logo() const {`
`67`	`67`	`print("\| "); print("\\ \\ / /", c); print(" \|\n");`
`68`	`68`	`print("\| "); print("\\ ' /", c); print(" \|\n");`
`69`	`69`	`print("\| "); print("\\ /", c); print(" \|\n");`
`70`		`- print("\| "); print("\\ /", c); print(" FluidX3D Version 2.5 \|\n");`
	`70`	`+ print("\| "); print("\\ /", c); print(" FluidX3D Version 2.6 \|\n");`
`71`	`71`	`print("\| "); print("'", c); print(" Copyright (c) Moritz Lehmann \|\n");`
`72`	`72`	`}`
`73`	`73`	`void Info::print_initialize() {`