.vtk export now converts and writes data in chunks, to reduce memory footprint and time for large memory allocation

ProjectPhysX · ProjectPhysX · commit b93584665a6f · 2025-05-17T10:45:33.000+02:00
diff --git a/README.md b/README.md
@@ -219,6 +219,17 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - improved coloring in `VIS_FIELD`/`ray_grid_traverse_sum()`
   - updated OpenCL-Wrapper now compiles OpenCL C code with `-cl-std=CL3.0` if available
   - fixed compiling on macOS with new OpenCL headers
+- [v3.3](https://github.com/ProjectPhysX/FluidX3D/releases/tag/v3.3) (17.05.2025) [changes](https://github.com/ProjectPhysX/FluidX3D/compare/v3.2...v3.3) (faster .vtk export)
+  - `.vtk` export now converts and writes data in chunks, to reduce memory footprint and avoid the time cost of a single large memory allocation
+  - `.vtk` files now contain original file name as metadata in title
+  - `INTERACTIVE_GRAPHICS_ASCII` now renders in 2x vertical resolution but with fewer colors
+  - updated OpenCL-Wrapper: more robust dp4a detection, fixed core count reporting for RDNA4 GPUs
+  - fixed `update_moving_boundaries()` kernel not being called with flags other than `TYPE_S`
+  - fixed corrupted first frame until resizing with `INTERACTIVE_GRAPHICS_ASCII`
+  - fixed `resolution()` function for D2Q9
+  - fixed missing `<chrono>` header on some compilers
+  - fixed bug in `split_regex()`
+  - fixed compiler warning with `min_int`
 
 </details>
 
@@ -748,6 +759,8 @@ section RTX 3050M Ti
 	2341 : 0, 2341
 section RTX 3050M
 	2339 : 0, 2339
+section RTX 3050 6GB
+	1898 : 0, 1898
 section Titan RTX
 	7554 : 0, 7554
 section RTX 6000
@@ -816,6 +829,8 @@ section M60 (1 GPU)
 	1571 : 0, 1571
 section GTX 960M
 	872 : 0, 872
+section GTX 780 Ti
+	2776 : 0, 2776
 section GTX 770
 	1215 : 0, 1215
 section GTX 680 4GB
@@ -1110,6 +1125,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, ⚪ Apple, 🟡 ARM, 🟤 Glenfly
 | 🟢&nbsp;A2                                       |               4.53 |          15 |          200 |             1031 (79%) |              2051 (79%) |              1199 (46%) |
 | 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3050M&nbsp;Ti      |               7.60 |           4 |          192 |             1181 (94%) |              2341 (94%) |              2253 (90%) |
 | 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3050M              |               7.13 |           4 |          192 |             1180 (94%) |              2339 (94%) |              2016 (81%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3050&nbsp;6GB      |               6.77 |           6 |          168 |              993 (90%) |              1898 (87%) |              1879 (86%) |
 | 🟢&nbsp;Titan&nbsp;RTX                           |              16.31 |          24 |          672 |             3471 (79%) |              7456 (85%) |              7554 (87%) |
 | 🟢&nbsp;Quadro&nbsp;RTX&nbsp;6000                |              16.31 |          24 |          672 |             3307 (75%) |              6836 (78%) |              6879 (79%) |
 | 🟢&nbsp;Quadro&nbsp;RTX&nbsp;8000&nbsp;Passive   |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
@@ -1144,6 +1160,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, ⚪ Apple, 🟡 ARM, 🟤 Glenfly
 | 🟢&nbsp;Quadro&nbsp;M4000                        |               2.57 |           8 |          192 |              899 (72%) |              1519 (61%) |              1050 (42%) |
 | 🟢&nbsp;Tesla&nbsp;M60&nbsp;(1&nbsp;GPU)         |               4.82 |           8 |          160 |              853 (82%) |              1571 (76%) |              1557 (75%) |
 | 🟢&nbsp;GeForce&nbsp;GTX&nbsp;960M               |               1.51 |           4 |           80 |              442 (84%) |               872 (84%) |               627 (60%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;780&nbsp;Ti        |               5.35 |           3 |          336 |             1710 (78%) |              2776 (64%) |              1302 (30%) |
 | 🟢&nbsp;GeForce&nbsp;GTX&nbsp;770                |               3.33 |           2 |          224 |              800 (55%) |              1215 (42%) |               876 (30%) |
 | 🟢&nbsp;GeForce&nbsp;GTX&nbsp;680&nbsp;4GB       |               3.33 |           4 |          192 |              783 (62%) |              1274 (51%) |               814 (33%) |
 | 🟢&nbsp;Quadro&nbsp;K2000                        |               0.73 |           2 |           64 |              312 (75%) |               444 (53%) |               171 (21%) |
diff --git a/src/info.cpp b/src/info.cpp
@@ -42,7 +42,7 @@ void Info::print_logo() const {
 	print("|                                  ");                 print("\\  \\ /  /", c);                print("                                  |\n");
 	print("|                                   ");                 print("\\  '  /", c);                 print("                                   |\n");
 	print("|                                    ");                 print("\\   /", c);                 print("                                    |\n");
-	print("|                                     ");                 print("\\ /", c);                 print("                FluidX3D Version 3.2 |\n");
+	print("|                                     ");                 print("\\ /", c);                 print("                FluidX3D Version 3.3 |\n");
 	print("|                                      ");                 print( "'", c);                 print("     Copyright (c) Dr. Moritz Lehmann |\n");
 	print("|-----------------------------------------------------------------------------|\n");
 }
diff --git a/src/lbm.cpp b/src/lbm.cpp
@@ -1053,7 +1053,8 @@ void LBM::unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag) { // rem
 	for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue();
 }
 void LBM::write_mesh_to_vtk(const Mesh* mesh, const string& path, const bool convert_to_si_units) const { // write mesh to binary .vtk file
-	const string header_1 = "# vtk DataFile Version 3.0\nData\nBINARY\nDATASET POLYDATA\nPOINTS "+to_string(3u*mesh->triangle_number)+" float\n";
+	const string filename = default_filename(path, "mesh", ".vtk", get_t());
+	const string header_1 = "# vtk DataFile Version 3.0\nFluidX3D "+filename.substr(filename.rfind('/')+1)+"\nBINARY\nDATASET POLYDATA\nPOINTS "+to_string(3u*mesh->triangle_number)+" float\n";
 	const string header_2 = "POLYGONS "+to_string(mesh->triangle_number)+" "+to_string(4u*mesh->triangle_number)+"\n";
 	float* points = new float[9u*mesh->triangle_number];
 	int* triangles = new int[4u*mesh->triangle_number];
@@ -1074,7 +1075,6 @@ void LBM::write_mesh_to_vtk(const Mesh* mesh, const string& path, const bool con
 		triangles[4u*i+2u] = reverse_bytes(3*(int)i+1); // vertex 1
 		triangles[4u*i+3u] = reverse_bytes(3*(int)i+2); // vertex 2
 	});
-	const string filename = default_filename(path, "mesh", ".vtk", get_t());
 	create_folder(filename);
 	std::ofstream file(filename, std::ios::out|std::ios::binary);
 	file.write(header_1.c_str(), header_1.length()); // write non-binary file header
diff --git a/src/lbm.hpp b/src/lbm.hpp
@@ -299,25 +299,33 @@ class LBM {
 				if(name=="F"  ) unit_conversion_factor = (T)units.si_F  (1.0f);
 				if(name=="T"  ) unit_conversion_factor = (T)units.si_T  (1.0f);
 			}
+			const string filename = create_file_extension(path, ".vtk");
 			const float3 origin = spacing*float3(0.5f-0.5f*(float)Nx, 0.5f-0.5f*(float)Ny, 0.5f-0.5f*(float)Nz);
 			const string header =
-				"# vtk DataFile Version 3.0\nData\nBINARY\nDATASET STRUCTURED_POINTS\n"
+				"# vtk DataFile Version 3.0\nFluidX3D "+filename.substr(filename.rfind('/')+1)+"\nBINARY\nDATASET STRUCTURED_POINTS\n"
 				"DIMENSIONS "+to_string(Nx)+" "+to_string(Ny)+" "+to_string(Nz)+"\n"
 				"ORIGIN "+to_string(origin.x)+" "+to_string(origin.y)+" "+to_string(origin.z)+"\n"
 				"SPACING "+to_string(spacing)+" "+to_string(spacing)+" "+to_string(spacing)+"\n"
-				"POINT_DATA "+to_string((ulong)Nx*(ulong)Ny*(ulong)Nz)+"\nSCALARS data "+vtk_type()+" "+to_string(dimensions())+"\nLOOKUP_TABLE default\n"
+				"POINT_DATA "+to_string((ulong)Nx*(ulong)Ny*(ulong)Nz)+"\n"
+				"SCALARS data "+vtk_type()+" "+to_string(dimensions())+"\nLOOKUP_TABLE default\n"
 			;
-			T* data = new T[range()];
-			parallel_for(length(), [&](ulong i) {
-				for(uint d=0u; d<dimensions(); d++) {
-					data[i*(ulong)dimensions()+(ulong)d] = reverse_bytes((T)(unit_conversion_factor*reference(i, d))); // SoA <- AoS
-				}
-			});
-			const string filename = create_file_extension(path, ".vtk");
+			const uint chunk_size_MB = 4u*thread::hardware_concurrency(); // in MB; convert and write data in chunks, to reduce memory footprint and time for large memory allocation
+			const ulong chunk_elements = (1048576ull*(ulong)chunk_size_MB)/((ulong)dimensions()*sizeof(T));
+			const ulong chunks=length()/chunk_elements, chunk_remainder=length()%chunk_elements;
+			T* data = new T[chunk_elements*(ulong)dimensions()];
 			create_folder(filename);
 			std::ofstream file(filename, std::ios::out|std::ios::binary);
 			file.write(header.c_str(), header.length()); // write non-binary file header
-			file.write((char*)data, capacity()); // write binary data
+			for(ulong c=0u; c<chunks+1ull; c++) { // iterate over all full chunks + last chunk_remainder chunk
+				const ulong N = c<chunks ? chunk_elements : chunk_remainder;
+				if(N==0ull) break; // chunk_remainder may be 0, then skip last iteration
+				parallel_for(N, [&](ulong i) {
+					for(uint d=0u; d<dimensions(); d++) { // LBM to SI units, LittleEndian to BigEndian, AoS to SoA
+						data[i*(ulong)dimensions()+(ulong)d] = reverse_bytes((T)(unit_conversion_factor*reference(c*chunk_elements+i, d)));
+					}
+				});
+				file.write((char*)data, N*(ulong)dimensions()*sizeof(T)); // write binary data
+			}
 			file.close();
 			delete[] data;
 			info.allow_printing.lock();
diff --git a/src/main.cpp b/src/main.cpp
@@ -77,7 +77,7 @@ void main_label(const double frametime) {
 			draw_label(ox, oy+i, "Steps "          +alignr(31u, /************************************/ alignr(10u, info.lbm->get_t())+" ("+alignr(5, to_uint(1.0/info.runtime_lbm_timestep_smooth))+" Steps/s)"), c); i+=FONT_HEIGHT;
 			draw_label(ox, oy+i, "FPS "            +alignr(33u, /************************************************************/ alignr(4u, to_uint(1.0/frametime))+" ("+alignr(5u, camera.fps_limit)+" fps max)"), c);
 		}
-		draw_label(2, camera.height-1*(FONT_HEIGHT)-1, "FluidX3D v3.2 Copyright (c) Dr. Moritz Lehmann", c);
+		draw_label(2, camera.height-1*(FONT_HEIGHT)-1, "FluidX3D v3.3 Copyright (c) Dr. Moritz Lehmann", c);
 		if(!key_H) {
 			draw_label(camera.width-16*(FONT_WIDTH)-1, 2, "Press H for Help", c);
 		} else {
diff --git a/src/opencl.hpp b/src/opencl.hpp
@@ -154,12 +154,13 @@ struct Device_Info {
 			cores_per_cu = is_gpu ? (intel_16_cores_per_cu ? 16.0f : 8.0f) : 0.5f; // Intel GPUs have 16 cores/CU (PVC) or 8 cores/CU (integrated/Arc), Intel CPUs (with HT) have 1/2 core/CU
 			if(is_gpu&&!uses_ram) { // fix wrong global memory capacity reporting for Intel dGPUs
 #if defined(_WIN32)
-				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull/49ull)/1048576ull); // 98% on Windows https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/windows/wddm_memory_manager.cpp#L964
+				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull/49ull)/1048576ull); // 98% on Windows https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/windows/wddm_memory_manager.cpp#L958
 #elif defined(__linux__)
-				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*20ull/19ull)/1048576ull); // 95% on Linux   https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/linux/drm_memory_manager.cpp#L1424
+				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*20ull/19ull)/1048576ull); // 95% on Linux   https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/linux/drm_memory_manager.cpp#L1521
 #endif // Linux
 			}
 			patch_intel_gpu_above_4gb = patch_intel_gpu_above_4gb||(is_gpu&&memory>4096u); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
+			if(is_cpu) is_dp4a_capable = 0u; // native dp4a in Intel CPU Runtime for OpenCL is slower than emulated dp4a
 		} else if(vendor_id==0x10DE||vendor_id==0x13B5) { // Nvidia GPU/CPU
 			nvidia_compute_capability = 10u*(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>()+(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>();
 			const bool nvidia__32_cores_per_cu = (nvidia_compute_capability <30); // identify Fermi GPUs
diff --git a/src/resource.rc b/src/resource.rc
@@ -24,7 +24,7 @@ BEGIN
 			VALUE "LegalCopyright", "(c) Dr. Moritz Lehmann"
 			VALUE "OriginalFilename", "FluidX3D.exe"
 			VALUE "ProductName", "FluidX3D"
-			VALUE "ProductVersion", "v3.2"
+			VALUE "ProductVersion", "v3.3"
 		END
 	END
 	BLOCK "VarFileInfo"
diff --git a/src/utilities.hpp b/src/utilities.hpp
@@ -40,10 +40,10 @@ typedef uint64_t ulong;
 #define min_short ((short)-32768)
 #define max_short ((short)32767)
 #define max_ushort ((ushort)65535)
-#define min_int -2147483648
+#define min_int ((int)-2147483648)
 #define max_int 2147483647
 #define max_uint 4294967295u
-#define min_slong -9223372036854775808ll
+#define min_slong ((slong)-9223372036854775808ll)
 #define max_slong 9223372036854775807ll
 #define max_ulong 18446744073709551615ull
 #define min_float 1.401298464E-45f

Original file line number	Diff line number	Diff line change
`@@ -42,7 +42,7 @@ void Info::print_logo() const {`
`42`	`42`	`print("\| "); print("\\ \\ / /", c); print(" \|\n");`
`43`	`43`	`print("\| "); print("\\ ' /", c); print(" \|\n");`
`44`	`44`	`print("\| "); print("\\ /", c); print(" \|\n");`
`45`		`- print("\| "); print("\\ /", c); print(" FluidX3D Version 3.2 \|\n");`
	`45`	`+ print("\| "); print("\\ /", c); print(" FluidX3D Version 3.3 \|\n");`
`46`	`46`	`print("\| "); print( "'", c); print(" Copyright (c) Dr. Moritz Lehmann \|\n");`
`47`	`47`	`print("\|-----------------------------------------------------------------------------\|\n");`
`48`	`48`	`}`
Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,7 @@ void main_label(const double frametime) {`
`77`	`77`	`draw_label(ox, oy+i, "Steps " +alignr(31u, /************************************/ alignr(10u, info.lbm->get_t())+" ("+alignr(5, to_uint(1.0/info.runtime_lbm_timestep_smooth))+" Steps/s)"), c); i+=FONT_HEIGHT;`
`78`	`78`	`draw_label(ox, oy+i, "FPS " +alignr(33u, /************************************************************/ alignr(4u, to_uint(1.0/frametime))+" ("+alignr(5u, camera.fps_limit)+" fps max)"), c);`
`79`	`79`	`}`
`80`		`- draw_label(2, camera.height-1*(FONT_HEIGHT)-1, "FluidX3D v3.2 Copyright (c) Dr. Moritz Lehmann", c);`
	`80`	`+ draw_label(2, camera.height-1*(FONT_HEIGHT)-1, "FluidX3D v3.3 Copyright (c) Dr. Moritz Lehmann", c);`
`81`	`81`	`if(!key_H) {`
`82`	`82`	`draw_label(camera.width-16*(FONT_WIDTH)-1, 2, "Press H for Help", c);`
`83`	`83`	`} else {`