Skip to content

Commit 41e5f29

Browse files
committed
NEON support pointcloud
1 parent 9bba62e commit 41e5f29

File tree

4 files changed

+293
-3
lines changed

4 files changed

+293
-3
lines changed

src/proc/neon/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
target_sources(${LRS_TARGET}
44
PRIVATE
55
"${CMAKE_CURRENT_LIST_DIR}/image-neon.cpp"
6+
"${CMAKE_CURRENT_LIST_DIR}/neon-pointcloud.cpp"
67
)

src/proc/neon/neon-pointcloud.cpp

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
// License: Apache 2.0. See LICENSE file in root directory.
2+
// Copyright(c) 2017 Intel Corporation. All Rights Reserved.
3+
4+
#include <librealsense2/rs.hpp>
5+
6+
#include "neon-pointcloud.h"
7+
8+
#include <iostream>
9+
10+
#if defined(__ARM_NEON) && ! defined ANDROID
11+
#include <arm_neon.h>
12+
13+
namespace librealsense
14+
{
15+
template <rs2_distortion dist>
16+
static inline void distorte_x_y(
17+
const float32x4_t &x, const float32x4_t &y,
18+
float32x4_t *distorted_x, float32x4_t *distorted_y, const float32x4_t(&c)[5])
19+
{
20+
*distorted_x = x;
21+
*distorted_y = y;
22+
}
23+
24+
template <>
25+
inline void distorte_x_y<RS2_DISTORTION_MODIFIED_BROWN_CONRADY>(
26+
const float32x4_t &x, const float32x4_t &y,
27+
float32x4_t *distorted_x, float32x4_t *distorted_y, const float32x4_t(&c)[5])
28+
{
29+
const auto one = vdupq_n_f32(1);
30+
const auto two = vdupq_n_f32(2);
31+
32+
// r2 = x * x + y * y
33+
const auto r2 = vfmaq_f32(vmulq_f32(x, x), y, y);
34+
// f = 1 + c[0] * r2 + c[1] * r2 ^ 2 + c[4] * r2 ^ 3
35+
// = 1 + r2 * (c[0] + r2 * (c[1] + r2 * c[4]))
36+
const auto f = vfmaq_f32(one, r2, vfmaq_f32(c[0], r2, vfmaq_f32(c[1], r2, c[4])));
37+
38+
const auto x_f = vmulq_f32(x, f);
39+
const auto y_f = vmulq_f32(y, f);
40+
41+
// dx = x_f + 2 * c[2] * x_f * y_f + c[3] * (r2 + 2 * x_f * x_f)
42+
// = x_f * (1 + 2 * c[2] * y_f + c[3] * 2 * x_f) + c[3] * r2
43+
// = x_f * (1 + 2 * (c[2] * y_f + c[3] * x_f)) + c[3] * r2
44+
*distorted_x = vfmaq_f32(vmulq_f32(x_f, vfmaq_f32(one, two, vfmaq_f32(vmulq_f32(c[2], y_f), c[3], x_f))), c[3], r2);
45+
46+
// dy = y_f + 2 * c[3] * x_f * y_f + c[2] * (r2 + 2 * y_f * y_f)
47+
// = y_f * (1 + 2 * c[3] * x_f + c[2] * 2 * y_f) + c[2] * r2
48+
// = y_f * (1 + 2 * (c[3] * x_f + c[2] * y_f)) + c[2] * r2
49+
*distorted_y = vfmaq_f32(vmulq_f32(y_f, vfmaq_f32(one, two, vfmaq_f32(vmulq_f32(c[3], x_f), c[2], y_f))), c[2], r2);
50+
}
51+
52+
pointcloud_neon::pointcloud_neon() : pointcloud("Pointcloud (NEON)") {}
53+
54+
void pointcloud_neon::preprocess()
55+
{
56+
_pre_compute_map_x.resize(_depth_intrinsics->width * _depth_intrinsics->height);
57+
_pre_compute_map_y.resize(_depth_intrinsics->width * _depth_intrinsics->height);
58+
59+
for (int h = 0; h < _depth_intrinsics->height; ++h)
60+
{
61+
for (int w = 0; w < _depth_intrinsics->width; ++w)
62+
{
63+
const float pixel[] = {(float)w, (float)h};
64+
65+
float x = (pixel[0] - _depth_intrinsics->ppx) / _depth_intrinsics->fx;
66+
float y = (pixel[1] - _depth_intrinsics->ppy) / _depth_intrinsics->fy;
67+
68+
if (_depth_intrinsics->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY)
69+
{
70+
const float r2 = x * x + y * y;
71+
const float f = 1.0f + _depth_intrinsics->coeffs[0] * r2 + _depth_intrinsics->coeffs[1] * r2 * r2 + _depth_intrinsics->coeffs[4] * r2 * r2 * r2;
72+
const float ux = x * f + 2.0f * _depth_intrinsics->coeffs[2] * x * y + _depth_intrinsics->coeffs[3] * (r2 + 2.0f * x * x);
73+
const float uy = y * f + 2.0f * _depth_intrinsics->coeffs[3] * x * y + _depth_intrinsics->coeffs[2] * (r2 + 2.0f * y * y);
74+
x = ux;
75+
y = uy;
76+
}
77+
78+
_pre_compute_map_x[h * _depth_intrinsics->width + w] = x;
79+
_pre_compute_map_y[h * _depth_intrinsics->width + w] = y;
80+
}
81+
}
82+
}
83+
84+
const float3 *pointcloud_neon::depth_to_points(rs2::points output,
85+
const rs2_intrinsics &depth_intrinsics,
86+
const rs2::depth_frame &depth_frame)
87+
{
88+
auto depth_image = (const uint16_t *)depth_frame.get_data();
89+
90+
float *pre_compute_x = _pre_compute_map_x.data();
91+
float *pre_compute_y = _pre_compute_map_y.data();
92+
93+
const uint32_t size = depth_intrinsics.height * depth_intrinsics.width;
94+
95+
auto points = (float *)output.get_vertices();
96+
const auto scale = vdupq_n_f32(depth_frame.get_units());
97+
98+
for (unsigned int i = 0; i < size; i += 8)
99+
{
100+
const auto x0 = vld1q_f32(pre_compute_x + i);
101+
const auto x1 = vld1q_f32(pre_compute_x + i + 4);
102+
103+
const auto y0 = vld1q_f32(pre_compute_y + i);
104+
const auto y1 = vld1q_f32(pre_compute_y + i + 4);
105+
106+
const auto d = vld1q_u16(depth_image + i);
107+
const auto depth0 = vmulq_f32(vcvtq_f32_s32((int32x4_t)vmovl_u16(vget_low_u16(d))), scale);
108+
const auto depth1 = vmulq_f32(vcvtq_f32_s32((int32x4_t)vmovl_u16(vget_high_u16(d))), scale);
109+
110+
// calculate 3D points
111+
float32x4x3_t xyz0;
112+
xyz0.val[0] = vmulq_f32(depth0, x0);
113+
xyz0.val[1] = vmulq_f32(depth0, y0);
114+
xyz0.val[2] = depth0;
115+
vst3q_f32(&points[0], xyz0);
116+
117+
float32x4x3_t xyz1;
118+
xyz1.val[0] = vmulq_f32(depth1, x1);
119+
xyz1.val[1] = vmulq_f32(depth1, y1);
120+
xyz1.val[2] = depth1;
121+
vst3q_f32(&points[12], xyz1);
122+
123+
points += 24;
124+
}
125+
return (float3 *)output.get_vertices();
126+
}
127+
128+
template <rs2_distortion dist>
129+
void pointcloud_neon::get_texture_map_neon(float2 *texture_map,
130+
const float3 *points,
131+
const unsigned int width,
132+
const unsigned int height,
133+
const rs2_intrinsics &other_intrinsics,
134+
const rs2_extrinsics &extr,
135+
float2 *pixels_ptr)
136+
{
137+
auto point = reinterpret_cast<const float *>(points);
138+
auto res = reinterpret_cast<float *>(texture_map);
139+
auto res1 = reinterpret_cast<float *>(pixels_ptr);
140+
141+
float32x4_t r[9];
142+
float32x4_t t[3];
143+
float32x4_t c[5];
144+
for (int i = 0; i < 9; ++i)
145+
{
146+
r[i] = vdupq_n_f32(extr.rotation[i]);
147+
}
148+
for (int i = 0; i < 3; ++i)
149+
{
150+
t[i] = vdupq_n_f32(extr.translation[i]);
151+
}
152+
for (int i = 0; i < 5; ++i)
153+
{
154+
c[i] = vdupq_n_f32(other_intrinsics.coeffs[i]);
155+
}
156+
const auto fx = vdupq_n_f32(other_intrinsics.fx);
157+
const auto fy = vdupq_n_f32(other_intrinsics.fy);
158+
const auto ppx = vdupq_n_f32(other_intrinsics.ppx);
159+
const auto ppy = vdupq_n_f32(other_intrinsics.ppy);
160+
const auto w = vdupq_n_f32(float(other_intrinsics.width));
161+
const auto h = vdupq_n_f32(float(other_intrinsics.height));
162+
const auto zero = vdupq_n_f32(0.0f);
163+
164+
const uint32_t size = height * width * 3;
165+
for (uint32_t i = 0; i < size; i+=12)
166+
{
167+
// load 4 points (x,y,z)
168+
const float32x4x3_t xyz = vld3q_f32(point + i);
169+
170+
// transform to other
171+
auto p_x = vfmaq_f32(vfmaq_f32(vfmaq_f32(t[0], r[6], xyz.val[2]), r[3], xyz.val[1]), r[0], xyz.val[0]);
172+
auto p_y = vfmaq_f32(vfmaq_f32(vfmaq_f32(t[1], r[7], xyz.val[2]), r[4], xyz.val[1]), r[1], xyz.val[0]);
173+
auto p_z = vfmaq_f32(vfmaq_f32(vfmaq_f32(t[2], r[8], xyz.val[2]), r[5], xyz.val[1]), r[2], xyz.val[0]);
174+
175+
p_x = vdivq_f32(p_x, p_z);
176+
p_y = vdivq_f32(p_y, p_z);
177+
178+
distorte_x_y<dist>(p_x, p_y, &p_x, &p_y, c);
179+
180+
p_x = vfmaq_f32(ppx, p_x, fx);
181+
p_y = vfmaq_f32(ppy, p_y, fy);
182+
183+
// zero the x and y if z is zero
184+
{
185+
const uint32x4_t gt_zero = vcgtq_f32(p_z, zero);
186+
p_x = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p_x), gt_zero));
187+
p_y = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p_y), gt_zero));
188+
}
189+
190+
// texture_map
191+
{
192+
float32x4x2_t xy;
193+
xy.val[0] = p_x;
194+
xy.val[1] = p_y;
195+
vst2q_f32(res1, xy);
196+
res1 += 8;
197+
}
198+
199+
// pixels_ptr
200+
{
201+
float32x4x2_t xy;
202+
xy.val[0] = vdivq_f32(p_x, w);
203+
xy.val[1] = vdivq_f32(p_y, h);
204+
vst2q_f32(res, xy);
205+
res += 8;
206+
}
207+
}
208+
}
209+
210+
void pointcloud_neon::get_texture_map(rs2::points output,
211+
const float3 *points,
212+
const unsigned int width,
213+
const unsigned int height,
214+
const rs2_intrinsics &other_intrinsics,
215+
const rs2_extrinsics &extr,
216+
float2 *pixels_ptr)
217+
{
218+
if (other_intrinsics.model == RS2_DISTORTION_MODIFIED_BROWN_CONRADY)
219+
{
220+
get_texture_map_neon<RS2_DISTORTION_MODIFIED_BROWN_CONRADY>(
221+
(float2 *)output.get_texture_coordinates(),
222+
points,
223+
width,
224+
height,
225+
other_intrinsics,
226+
extr,
227+
pixels_ptr);
228+
}
229+
else
230+
{
231+
get_texture_map_neon<RS2_DISTORTION_NONE>(
232+
(float2 *)output.get_texture_coordinates(),
233+
points,
234+
width,
235+
height,
236+
other_intrinsics,
237+
extr,
238+
pixels_ptr);
239+
}
240+
}
241+
}
242+
#endif

src/proc/neon/neon-pointcloud.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// License: Apache 2.0. See LICENSE file in root directory.
2+
// Copyright(c) 2024 Intel Corporation. All Rights Reserved.
3+
4+
#pragma once
5+
#include "../pointcloud.h"
6+
7+
namespace librealsense
8+
{
9+
#if defined(__ARM_NEON) && ! defined ANDROID
10+
class pointcloud_neon : public pointcloud
11+
{
12+
public:
13+
pointcloud_neon();
14+
15+
void preprocess() override;
16+
const float3 * depth_to_points(
17+
rs2::points output,
18+
const rs2_intrinsics &depth_intrinsics,
19+
const rs2::depth_frame& depth_frame) override;
20+
void get_texture_map(
21+
rs2::points output,
22+
const float3* points,
23+
const unsigned int width,
24+
const unsigned int height,
25+
const rs2_intrinsics &other_intrinsics,
26+
const rs2_extrinsics& extr,
27+
float2* pixels_ptr) override;
28+
29+
private:
30+
template <rs2_distortion dist>
31+
void get_texture_map_neon(float2 * texture_map,
32+
const float3 * points,
33+
const unsigned int width,
34+
const unsigned int height,
35+
const rs2_intrinsics & other_intrinsics,
36+
const rs2_extrinsics & extr,
37+
float2 * pixels_ptr);
38+
39+
std::vector<float> _pre_compute_map_x;
40+
std::vector<float> _pre_compute_map_y;
41+
42+
void pre_compute_x_y_map();
43+
};
44+
#endif
45+
}

src/proc/pointcloud.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#ifdef __SSSE3__
2323
#include "proc/sse/sse-pointcloud.h"
2424
#endif
25+
#include "proc/neon/neon-pointcloud.h"
26+
2527

2628
namespace librealsense
2729
{
@@ -395,13 +397,13 @@ namespace librealsense
395397
{
396398
#ifdef RS2_USE_CUDA
397399
return std::make_shared<librealsense::pointcloud_cuda>();
398-
#else
399-
#ifdef __SSSE3__
400+
#elif defined(__SSSE3__)
400401
return std::make_shared<librealsense::pointcloud_sse>();
402+
#elif defined(__ARM_NEON) && ! defined ANDROID
403+
return std::make_shared<librealsense::pointcloud_neon>();
401404
#else
402405
return std::make_shared<librealsense::pointcloud>();
403406
#endif
404-
#endif
405407
}
406408

407409
bool pointcloud::run__occlusion_filter(const rs2_extrinsics& extr)

0 commit comments

Comments
 (0)