diff --git a/CMakeLists.txt b/CMakeLists.txt index 608957ca..e2fd29ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,8 @@ project(GPGPUTasks) set(CMAKE_CXX_STANDARD 17) -# Если вы установили CUDA SDK - можете включить поддержку CUDA +include_directories(${CMAKE_SOURCE_DIR}) + option(GPU_CUDA_SUPPORT "CUDA support." OFF) # GLSLC_BIN используйтся для компиляции Vulkan шейдеров (GLSL) @@ -64,7 +65,12 @@ set(LIBRARIES set(OPENCL_SOURCES src/kernels/cl/aplusb.cl src/kernels/cl/ray_tracing_render_brute_force.cl - src/kernels/cl/ray_tracing_render_using_lbvh.cl) + src/kernels/cl/ray_tracing_render_using_lbvh.cl + src/kernels/cl/lbvh_compute_triangle_data.cl + src/kernels/cl/lbvh_compute_bounds.cl + src/kernels/cl/lbvh_compute_morton_codes.cl + src/kernels/cl/lbvh_build_nodes.cl + src/kernels/cl/lbvh_compute_aabbs.cl) set(OPENCL_INCLUDES src/kernels/defines.h src/kernels/cl/helpers/rassert.cl src/kernels/cl/camera_helpers.cl src/kernels/cl/geometry_helpers.cl src/kernels/cl/random_helpers.cl src/kernels/shared_structs/aabb_gpu_shared.h src/kernels/shared_structs/bvh_node_gpu_shared.h src/kernels/shared_structs/camera_gpu_shared.h src/kernels/shared_structs/morton_code_gpu_shared.h src/kernels/shared_structs/struct_helpers.h) set(OPENCL_DEFINES) diff --git a/src/kernels/cl/lbvh_build_nodes.cl b/src/kernels/cl/lbvh_build_nodes.cl new file mode 100644 index 00000000..2eef773a --- /dev/null +++ b/src/kernels/cl/lbvh_build_nodes.cl @@ -0,0 +1,93 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" +#include "../shared_structs/bvh_node_gpu_shared.h" +#include "../shared_structs/morton_code_gpu_shared.h" + +static inline int clz32(uint x) { + return x == 0 ? 32 : clz(x); +} + +static inline int delta( + __global const MortonCode* codes, + int n, int i, int j) +{ + if (j < 0 || j >= n) return -1; + + MortonCode ci = codes[i]; + MortonCode cj = codes[j]; + + if (ci == cj) { + return 32 + clz32((uint)i ^ (uint)j); + } else { + return clz32(ci ^ cj); + } +} + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void lbvh_build_nodes( + __global const MortonCode* sorted_morton_codes, + __global const uint* sorted_indices, + __global BVHNodeGPU* nodes, + __global uint* leaf_triangle_indices, + uint nfaces) +{ + uint i = get_global_id(0); + int n = (int)nfaces; + + if (i >= nfaces) return; + + if (i < nfaces - 1) { + int d_left = delta(sorted_morton_codes, n, i, i-1); + int d_right = delta(sorted_morton_codes, n, i, i+1); + + int d = (d_right > d_left) ? 1 : -1; + + int delta_min = delta(sorted_morton_codes, n, i, i - d); + int lmax = 2; + while (delta(sorted_morton_codes, n, i, i + lmax * d) > delta_min) { + lmax *= 2; + } + + int l = 0; + for (int t = lmax / 2; t > 0; t /= 2) { + if (delta(sorted_morton_codes, n, i, i + (l + t) * d) > delta_min) { + l += t; + } + } + + int j = i + l * d; + int first = min((int)i, j); + int last = max((int)i, j); + + int common_prefix = delta(sorted_morton_codes, n, first, last); + int split = first; + int step = last - first; + + do { + step = (step + 1) >> 1; + int new_split = split + step; + if (new_split < last) { + int split_prefix = delta(sorted_morton_codes, n, first, new_split); + if (split_prefix > common_prefix) { + split = new_split; + } + } + } while (step > 1); + + int leaf_start = (int)nfaces - 1; + int left_child = (split == first) ? (leaf_start + split) : split; + int right_child = (split + 1 == last) ? (leaf_start + split + 1) : (split + 1); + + nodes[i].leftChildIndex = left_child; + nodes[i].rightChildIndex = right_child; + } + + int leaf_idx = (int)nfaces - 1 + i; + leaf_triangle_indices[i] = sorted_indices[i]; + nodes[leaf_idx].leftChildIndex = 0xFFFFFFFFu; + nodes[leaf_idx].rightChildIndex = 0xFFFFFFFFu; +} diff --git a/src/kernels/cl/lbvh_compute_aabbs.cl b/src/kernels/cl/lbvh_compute_aabbs.cl new file mode 100644 index 00000000..63497112 --- /dev/null +++ b/src/kernels/cl/lbvh_compute_aabbs.cl @@ -0,0 +1,68 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" +#include "../shared_structs/bvh_node_gpu_shared.h" +#include "../shared_structs/aabb_gpu_shared.h" + +static inline AABBGPU merge_aabb(AABBGPU a, AABBGPU b) { + AABBGPU result; + result.min_x = fmin(a.min_x, b.min_x); + result.min_y = fmin(a.min_y, b.min_y); + result.min_z = fmin(a.min_z, b.min_z); + result.max_x = fmax(a.max_x, b.max_x); + result.max_y = fmax(a.max_y, b.max_y); + result.max_z = fmax(a.max_z, b.max_z); + return result; +} + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void lbvh_compute_aabbs( + __global BVHNodeGPU* nodes, + __global const AABBGPU* triangle_aabbs, + __global const uint* leaf_triangle_indices, + __global volatile int* node_visit_counts, + uint nfaces) +{ + uint i = get_global_id(0); + if (i >= nfaces) return; + + int leaf_idx = (int)nfaces - 1 + i; + nodes[leaf_idx].aabb = triangle_aabbs[leaf_triangle_indices[i]]; + + int current = -1; + for (int p = 0; p < (int)nfaces - 1; p++) { + if (nodes[p].leftChildIndex == (uint)leaf_idx || nodes[p].rightChildIndex == (uint)leaf_idx) { + current = p; + break; + } + } + + while (current >= 0) { + int visit_count = atomic_inc(&node_visit_counts[current]); + + if (visit_count == 0) { + return; + } + + BVHNodeGPU node = nodes[current]; + AABBGPU left_aabb = nodes[node.leftChildIndex].aabb; + AABBGPU right_aabb = nodes[node.rightChildIndex].aabb; + nodes[current].aabb = merge_aabb(left_aabb, right_aabb); + + if (current == 0) break; + + int parent_idx = -1; + for (int p = 0; p < (int)nfaces - 1; p++) { + if (nodes[p].leftChildIndex == (uint)current || nodes[p].rightChildIndex == (uint)current) { + parent_idx = p; + break; + } + } + + current = parent_idx; + if (current < 0) break; + } +} diff --git a/src/kernels/cl/lbvh_compute_bounds.cl b/src/kernels/cl/lbvh_compute_bounds.cl new file mode 100644 index 00000000..29ad3419 --- /dev/null +++ b/src/kernels/cl/lbvh_compute_bounds.cl @@ -0,0 +1,55 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void lbvh_compute_bounds( + __global const float* triangle_centroids, + __global float* bounds_min, + __global float* bounds_max, + uint nfaces) +{ + __local float local_min[3 * 256]; + __local float local_max[3 * 256]; + + uint local_id = get_local_id(0); + uint global_id = get_global_id(0); + uint group_id = get_group_id(0); + + float3 local_min_val = (float3)(FLT_MAX, FLT_MAX, FLT_MAX); + float3 local_max_val = (float3)(-FLT_MAX, -FLT_MAX, -FLT_MAX); + + for (uint i = global_id; i < nfaces; i += get_global_size(0)) { + float3 c = (float3)(triangle_centroids[3*i], triangle_centroids[3*i+1], triangle_centroids[3*i+2]); + local_min_val = fmin(local_min_val, c); + local_max_val = fmax(local_max_val, c); + } + + local_min[local_id * 3] = local_min_val.x; + local_min[local_id * 3 + 1] = local_min_val.y; + local_min[local_id * 3 + 2] = local_min_val.z; + local_max[local_id * 3] = local_max_val.x; + local_max[local_id * 3 + 1] = local_max_val.y; + local_max[local_id * 3 + 2] = local_max_val.z; + barrier(CLK_LOCAL_MEM_FENCE); + + for (uint stride = 128; stride > 0; stride >>= 1) { + if (local_id < stride) { + for (int axis = 0; axis < 3; axis++) { + local_min[local_id * 3 + axis] = fmin(local_min[local_id * 3 + axis], local_min[(local_id + stride) * 3 + axis]); + local_max[local_id * 3 + axis] = fmax(local_max[local_id * 3 + axis], local_max[(local_id + stride) * 3 + axis]); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (local_id == 0) { + for (int axis = 0; axis < 3; axis++) { + atomic_min((volatile __global int*)&bounds_min[axis], as_int(local_min[axis])); + atomic_max((volatile __global int*)&bounds_max[axis], as_int(local_max[axis])); + } + } +} diff --git a/src/kernels/cl/lbvh_compute_morton_codes.cl b/src/kernels/cl/lbvh_compute_morton_codes.cl new file mode 100644 index 00000000..54a2d893 --- /dev/null +++ b/src/kernels/cl/lbvh_compute_morton_codes.cl @@ -0,0 +1,52 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" +#include "../shared_structs/morton_code_gpu_shared.h" + +static inline uint expandBits(uint v) { + v = (v * 0x00010001u) & 0xFF0000FFu; + v = (v * 0x00000101u) & 0x0F00F00Fu; + v = (v * 0x00000011u) & 0xC30C30C3u; + v = (v * 0x00000005u) & 0x49249249u; + return v; +} + +static inline MortonCode morton3D(float x, float y, float z) { + uint ix = min(max((int)(x * 1024.0f), 0), 1023); + uint iy = min(max((int)(y * 1024.0f), 0), 1023); + uint iz = min(max((int)(z * 1024.0f), 0), 1023); + + uint xx = expandBits(ix); + uint yy = expandBits(iy); + uint zz = expandBits(iz); + + return (xx << 2) | (yy << 1) | zz; +} + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void lbvh_compute_morton_codes( + __global const float* triangle_centroids, + __global MortonCode* morton_codes, + __global uint* sorted_indices, + float centroid_min_x, float centroid_min_y, float centroid_min_z, + float centroid_max_x, float centroid_max_y, float centroid_max_z, + uint nfaces) +{ + uint i = get_global_id(0); + if (i >= nfaces) return; + + float3 c = (float3)(triangle_centroids[3*i], triangle_centroids[3*i+1], triangle_centroids[3*i+2]); + float3 centroid_min = (float3)(centroid_min_x, centroid_min_y, centroid_min_z); + float3 centroid_max = (float3)(centroid_max_x, centroid_max_y, centroid_max_z); + + float eps = 1e-9f; + float3 d = fmax(centroid_max - centroid_min, eps); + float3 n = (c - centroid_min) / d; + n = clamp(n, 0.0f, 1.0f); + + morton_codes[i] = morton3D(n.x, n.y, n.z); + sorted_indices[i] = i; +} diff --git a/src/kernels/cl/lbvh_compute_triangle_data.cl b/src/kernels/cl/lbvh_compute_triangle_data.cl new file mode 100644 index 00000000..f5080e22 --- /dev/null +++ b/src/kernels/cl/lbvh_compute_triangle_data.cl @@ -0,0 +1,39 @@ +#ifdef __CLION_IDE__ +#include +#endif + +#include "helpers/rassert.cl" +#include "../defines.h" +#include "../shared_structs/aabb_gpu_shared.h" + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void lbvh_compute_triangle_data( + __global const float* vertices, + __global const uint* faces, + __global AABBGPU* triangle_aabbs, + __global float* triangle_centroids, + uint nfaces) +{ + uint i = get_global_id(0); + if (i >= nfaces) return; + + uint3 f = (uint3)(faces[3*i], faces[3*i+1], faces[3*i+2]); + float3 v0 = (float3)(vertices[3*f.x], vertices[3*f.x+1], vertices[3*f.x+2]); + float3 v1 = (float3)(vertices[3*f.y], vertices[3*f.y+1], vertices[3*f.y+2]); + float3 v2 = (float3)(vertices[3*f.z], vertices[3*f.z+1], vertices[3*f.z+2]); + + AABBGPU aabb; + aabb.min_x = fmin(fmin(v0.x, v1.x), v2.x); + aabb.min_y = fmin(fmin(v0.y, v1.y), v2.y); + aabb.min_z = fmin(fmin(v0.z, v1.z), v2.z); + aabb.max_x = fmax(fmax(v0.x, v1.x), v2.x); + aabb.max_y = fmax(fmax(v0.y, v1.y), v2.y); + aabb.max_z = fmax(fmax(v0.z, v1.z), v2.z); + + triangle_aabbs[i] = aabb; + + float3 centroid = (v0 + v1 + v2) / 3.0f; + triangle_centroids[3*i] = centroid.x; + triangle_centroids[3*i+1] = centroid.y; + triangle_centroids[3*i+2] = centroid.z; +} diff --git a/src/kernels/cl/ray_tracing_render_using_lbvh.cl b/src/kernels/cl/ray_tracing_render_using_lbvh.cl index 35b5ca49..475a366a 100644 --- a/src/kernels/cl/ray_tracing_render_using_lbvh.cl +++ b/src/kernels/cl/ray_tracing_render_using_lbvh.cl @@ -31,7 +31,55 @@ static inline bool bvh_closest_hit( const int rootIndex = 0; const int leafStart = (int)nfaces - 1; - // TODO implement BVH travering (with stack, don't use recursion) + int stack[64]; + int stackPtr = 0; + stack[stackPtr++] = rootIndex; + + float bestT = FLT_MAX; + int bestFace = -1; + float bestU = 0.0f; + float bestV = 0.0f; + + while (stackPtr > 0) { + int nodeIdx = stack[--stackPtr]; + + if (nodeIdx >= leafStart) { + uint triIdx = leafTriIndices[nodeIdx - leafStart]; + uint3 f = loadFace(faces, triIdx); + float3 v0 = loadVertex(vertices, f.x); + float3 v1 = loadVertex(vertices, f.y); + float3 v2 = loadVertex(vertices, f.z); + + float t, u, v; + if (intersect_ray_triangle(orig, dir, v0, v1, v2, tMin, bestT, false, &t, &u, &v)) { + bestT = t; + bestFace = (int)triIdx; + bestU = u; + bestV = v; + } + } else { + BVHNodeGPU node = nodes[nodeIdx]; + float tNear, tFar; + + if (intersect_ray_aabb(orig, dir, node.aabb, tMin, bestT, &tNear, &tFar)) { + int left = (int)node.leftChildIndex; + int right = (int)node.rightChildIndex; + + if (stackPtr + 2 <= 64) { + stack[stackPtr++] = left; + stack[stackPtr++] = right; + } + } + } + } + + if (bestFace >= 0) { + *outT = bestT; + *outFaceId = bestFace; + *outU = bestU; + *outV = bestV; + return true; + } return false; } @@ -50,7 +98,41 @@ static inline bool any_hit_from( const int rootIndex = 0; const int leafStart = (int)nfaces - 1; - // TODO implement BVH travering (with stack, don't use recursion) + int stack[64]; + int stackPtr = 0; + stack[stackPtr++] = rootIndex; + + while (stackPtr > 0) { + int nodeIdx = stack[--stackPtr]; + + if (nodeIdx >= leafStart) { + uint triIdx = leafTriIndices[nodeIdx - leafStart]; + if ((int)triIdx == ignore_face) continue; + + uint3 f = loadFace(faces, triIdx); + float3 v0 = loadVertex(vertices, f.x); + float3 v1 = loadVertex(vertices, f.y); + float3 v2 = loadVertex(vertices, f.z); + + float t, u, v; + if (intersect_ray_triangle_any(orig, dir, v0, v1, v2, false, &t, &u, &v)) { + return true; + } + } else { + BVHNodeGPU node = nodes[nodeIdx]; + float tNear, tFar; + + if (intersect_ray_aabb_any(orig, dir, node.aabb, &tNear, &tFar)) { + int left = (int)node.leftChildIndex; + int right = (int)node.rightChildIndex; + + if (stackPtr + 2 <= 64) { + stack[stackPtr++] = left; + stack[stackPtr++] = right; + } + } + } + } return false; } diff --git a/src/kernels/kernels.cpp b/src/kernels/kernels.cpp index 51a1c4b2..9f8ff8db 100644 --- a/src/kernels/kernels.cpp +++ b/src/kernels/kernels.cpp @@ -3,6 +3,11 @@ #include "cl/generated_kernels/aplusb.h" #include "cl/generated_kernels/ray_tracing_render_brute_force.h" #include "cl/generated_kernels/ray_tracing_render_using_lbvh.h" +#include "cl/generated_kernels/lbvh_compute_triangle_data.h" +#include "cl/generated_kernels/lbvh_compute_bounds.h" +#include "cl/generated_kernels/lbvh_compute_morton_codes.h" +#include "cl/generated_kernels/lbvh_build_nodes.h" +#include "cl/generated_kernels/lbvh_compute_aabbs.h" #include "vk/generated_kernels/aplusb_comp.h" #include "vk/generated_kernels/ray_tracing_render_brute_force_comp.h" @@ -55,6 +60,31 @@ const ProgramBinaries& getRTWithLBVH() { return opencl_binaries_ray_tracing_render_using_lbvh; } + +const ProgramBinaries& getLBVHComputeTriangleData() +{ + return opencl_binaries_lbvh_compute_triangle_data; +} + +const ProgramBinaries& getLBVHComputeBounds() +{ + return opencl_binaries_lbvh_compute_bounds; +} + +const ProgramBinaries& getLBVHComputeMortonCodes() +{ + return opencl_binaries_lbvh_compute_morton_codes; +} + +const ProgramBinaries& getLBVHBuildNodes() +{ + return opencl_binaries_lbvh_build_nodes; +} + +const ProgramBinaries& getLBVHComputeAABBs() +{ + return opencl_binaries_lbvh_compute_aabbs; +} } // namespace ocl namespace avk2 { diff --git a/src/kernels/kernels.h b/src/kernels/kernels.h index b3c989e6..d2da35a7 100644 --- a/src/kernels/kernels.h +++ b/src/kernels/kernels.h @@ -28,6 +28,12 @@ const ProgramBinaries& getAplusB(); const ProgramBinaries& getRTBruteForce(); const ProgramBinaries& getRTWithLBVH(); + +const ProgramBinaries& getLBVHComputeTriangleData(); +const ProgramBinaries& getLBVHComputeBounds(); +const ProgramBinaries& getLBVHComputeMortonCodes(); +const ProgramBinaries& getLBVHBuildNodes(); +const ProgramBinaries& getLBVHComputeAABBs(); } namespace avk2 { diff --git a/src/main_linear_bvh.cpp b/src/main_linear_bvh.cpp index 35676eea..5082c634 100644 --- a/src/main_linear_bvh.cpp +++ b/src/main_linear_bvh.cpp @@ -17,6 +17,7 @@ #include #include +#include // Считает сколько непустых пикселей template @@ -77,6 +78,11 @@ void run(int argc, char** argv) ocl::KernelSource ocl_rt_brute_force(ocl::getRTBruteForce()); ocl::KernelSource ocl_rt_with_lbvh(ocl::getRTWithLBVH()); + ocl::KernelSource ocl_lbvh_compute_triangle_data(ocl::getLBVHComputeTriangleData()); + ocl::KernelSource ocl_lbvh_compute_bounds(ocl::getLBVHComputeBounds()); + ocl::KernelSource ocl_lbvh_compute_morton_codes(ocl::getLBVHComputeMortonCodes()); + ocl::KernelSource ocl_lbvh_build_nodes(ocl::getLBVHBuildNodes()); + ocl::KernelSource ocl_lbvh_compute_aabbs(ocl::getLBVHComputeAABBs()); avk2::KernelSource vk_rt_brute_force(avk2::getRTBruteForce()); avk2::KernelSource vk_rt_with_lbvh(avk2::getRTWithLBVH()); @@ -229,9 +235,6 @@ void run(int argc, char** argv) for (int iter = 0; iter < niters; ++iter) { timer t; - // TODO оттрасируйте лучи на GPU используя построенный на CPU LBVH - throw std::runtime_error(CODE_IS_NOT_IMPLEMENTED); - if (context.type() == gpu::Context::TypeOpenCL) { ocl_rt_with_lbvh.exec( gpu::WorkSize(16, 16, width, height), @@ -288,16 +291,77 @@ void run(int argc, char** argv) double gpu_lbvh_time_sum = 0.0; double rt_times_with_gpu_lbvh_sum = 0.0; - // TODO постройте LBVH на GPU - // TODO оттрасируйте лучи на GPU используя построенный на GPU LBVH - bool gpu_lbvg_gpu_rt_done = false; + gpu::gpu_mem_32f triangle_centroids_gpu(nfaces * 3); + gpu::shared_device_buffer_typed triangle_aabbs_gpu(nfaces); + gpu::gpu_mem_32f centroid_min_gpu(3); + gpu::gpu_mem_32f centroid_max_gpu(3); + gpu::gpu_mem_32u morton_codes_gpu(nfaces); + gpu::gpu_mem_32u sorted_indices_gpu(nfaces); + gpu::shared_device_buffer_typed lbvh_nodes_gpu(2*nfaces - 1); + gpu::gpu_mem_32u leaf_triangle_indices_gpu(nfaces); + gpu::gpu_mem_32i node_visit_counts_gpu(nfaces - 1); + + bool gpu_lbvg_gpu_rt_done = true; if (gpu_lbvg_gpu_rt_done) { std::vector gpu_lbvh_times; for (int iter = 0; iter < niters; ++iter) { timer t; - // TODO постройте LBVH на GPU + ocl_lbvh_compute_triangle_data.exec( + gpu::WorkSize(256, ((nfaces+255)/256)*256), + vertices_gpu, faces_gpu, + triangle_aabbs_gpu.clmem(), triangle_centroids_gpu, nfaces); + + std::vector init_min = {FLT_MAX, FLT_MAX, FLT_MAX}; + std::vector init_max = {-FLT_MAX, -FLT_MAX, -FLT_MAX}; + centroid_min_gpu.writeN(init_min.data(), 3); + centroid_max_gpu.writeN(init_max.data(), 3); + + ocl_lbvh_compute_bounds.exec( + gpu::WorkSize(256, ((nfaces+255)/256)*256), + triangle_centroids_gpu, centroid_min_gpu, centroid_max_gpu, nfaces); + + std::vector cmin = centroid_min_gpu.readVector(3); + std::vector cmax = centroid_max_gpu.readVector(3); + point3f centroid_min{cmin[0], cmin[1], cmin[2]}; + point3f centroid_max{cmax[0], cmax[1], cmax[2]}; + + ocl_lbvh_compute_morton_codes.exec( + gpu::WorkSize(256, ((nfaces+255)/256)*256), + triangle_centroids_gpu, morton_codes_gpu, sorted_indices_gpu, + centroid_min.x, centroid_min.y, centroid_min.z, + centroid_max.x, centroid_max.y, centroid_max.z, nfaces); + + std::vector morton_codes_cpu = morton_codes_gpu.readVector(); + std::vector sorted_indices_cpu = sorted_indices_gpu.readVector(); + + std::vector> pairs(nfaces); + for (uint i = 0; i < nfaces; i++) { + pairs[i] = {morton_codes_cpu[i], sorted_indices_cpu[i]}; + } + + std::sort(pairs.begin(), pairs.end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + + for (uint i = 0; i < nfaces; i++) { + morton_codes_cpu[i] = pairs[i].first; + sorted_indices_cpu[i] = pairs[i].second; + } + + morton_codes_gpu.writeN(morton_codes_cpu.data(), nfaces); + sorted_indices_gpu.writeN(sorted_indices_cpu.data(), nfaces); + + ocl_lbvh_build_nodes.exec( + gpu::WorkSize(256, ((nfaces+255)/256)*256), + morton_codes_gpu, sorted_indices_gpu, + lbvh_nodes_gpu.clmem(), leaf_triangle_indices_gpu, nfaces); + + node_visit_counts_gpu.fill(0); + ocl_lbvh_compute_aabbs.exec( + gpu::WorkSize(256, ((nfaces+255)/256)*256), + lbvh_nodes_gpu.clmem(), triangle_aabbs_gpu.clmem(), + leaf_triangle_indices_gpu, node_visit_counts_gpu, nfaces); gpu_lbvh_times.push_back(t.elapsed()); } @@ -316,7 +380,12 @@ void run(int argc, char** argv) for (int iter = 0; iter < niters; ++iter) { timer t; - // TODO оттрасируйте лучи на GPU используя построенный на GPU LBVH + ocl_rt_with_lbvh.exec( + gpu::WorkSize(16, 16, width, height), + vertices_gpu, faces_gpu, + lbvh_nodes_gpu.clmem(), leaf_triangle_indices_gpu.clmem(), + framebuffer_face_id_gpu, framebuffer_ambient_occlusion_gpu, + camera_gpu.clmem(), nfaces); gpu_lbvh_rt_times.push_back(t.elapsed()); }