Skip to content

Commit e8106c0

Browse files
committed
Add div_approx and inverse_approx builtins for approximate PTX intrinsics (GH-1199)
Add standalone wp.div_approx() and wp.inverse_approx() builtins that use fast GPU intrinsics (div.approx.f32, rcp.approx.ftz.f64) for approximate division and matrix inverse. Only floating-point types are supported; falls back to exact arithmetic on CPU. Signed-off-by: Eric Shi <ershi@nvidia.com>
1 parent 2d78485 commit e8106c0

File tree

9 files changed

+735
-7
lines changed

9 files changed

+735
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@
3737
[GH-1168](https://github.com/NVIDIA/warp/issues/1168)).
3838
- Add quaternion and spatial transformation helpers (`wp.quat_from_euler()`, `wp.quat_to_euler()`,
3939
`wp.transform_twist()`, etc.) ([GH-1237](https://github.com/NVIDIA/warp/issues/1237)).
40+
- Add `wp.div_approx()` and `wp.inverse_approx()` built-ins for approximate PTX intrinsics
41+
(`div.approx.f32`, `rcp.approx.ftz.f64`) on GPU. Only floating-point types are supported;
42+
falls back to exact arithmetic on CPU
43+
([GH-1199](https://github.com/NVIDIA/warp/issues/1199)).
4044
- Add public API for marching cubes lookup tables as class attributes on `wp.MarchingCubes`: `CUBE_CORNER_OFFSETS`,
4145
`EDGE_TO_CORNERS`, `CASE_TO_TRI_RANGE`, and `TRI_LOCAL_INDICES`. These enable custom marching cubes implementations
4246
for advanced use cases like sparse volume extraction ([GH-1151](https://github.com/NVIDIA/warp/issues/1151)).

docs/language_reference/builtins.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ Vector Math
7575
get_diag
7676
identity
7777
inverse
78+
inverse_approx
7879
length
7980
length_sq
8081
matrix
@@ -390,6 +391,7 @@ Operators
390391
bit_or
391392
bit_xor
392393
div
394+
div_approx
393395
floordiv
394396
invert
395397
lshift

warp/__init__.pyi

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2237,6 +2237,30 @@ def inverse(a: Matrix[Float, Literal[4], Literal[4]]) -> Matrix[Float, Any, Any]
22372237
"""Compute the inverse of matrix ``a``."""
22382238
...
22392239

2240+
@over
2241+
def inverse_approx(a: Matrix[Float, Literal[2], Literal[2]]) -> Matrix[Float, Any, Any]:
2242+
"""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
2243+
2244+
Falls back to exact inverse on CPU.
2245+
"""
2246+
...
2247+
2248+
@over
2249+
def inverse_approx(a: Matrix[Float, Literal[3], Literal[3]]) -> Matrix[Float, Any, Any]:
2250+
"""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
2251+
2252+
Falls back to exact inverse on CPU.
2253+
"""
2254+
...
2255+
2256+
@over
2257+
def inverse_approx(a: Matrix[Float, Literal[4], Literal[4]]) -> Matrix[Float, Any, Any]:
2258+
"""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
2259+
2260+
Falls back to exact inverse on CPU.
2261+
"""
2262+
...
2263+
22402264
@over
22412265
def determinant(a: Matrix[Float, Literal[2], Literal[2]]) -> Float:
22422266
"""Compute the determinant of matrix ``a``."""
@@ -6665,6 +6689,66 @@ def div(a: Any, b: Tile[Any, tuple[int, ...]]) -> Tile[Any, tuple[int, ...]]:
66656689
"""
66666690
...
66676691

6692+
@over
6693+
def div_approx(a: Float, b: Float) -> Float:
6694+
"""Divide two values using approximate GPU intrinsics.
6695+
6696+
Falls back to exact division on CPU.
6697+
"""
6698+
...
6699+
6700+
@over
6701+
def div_approx(a: Vector[Float, Any], b: Float) -> Vector[Float, Any]:
6702+
"""Divide two values using approximate GPU intrinsics.
6703+
6704+
Divide a vector by a scalar. Falls back to exact division on CPU.
6705+
"""
6706+
...
6707+
6708+
@over
6709+
def div_approx(a: Float, b: Vector[Float, Any]) -> Vector[Float, Any]:
6710+
"""Divide two values using approximate GPU intrinsics.
6711+
6712+
Divide a scalar by each element of a vector. Falls back to exact division on CPU.
6713+
"""
6714+
...
6715+
6716+
@over
6717+
def div_approx(a: Matrix[Float, Any, Any], b: Float) -> Matrix[Float, Any, Any]:
6718+
"""Divide two values using approximate GPU intrinsics.
6719+
6720+
Divide a matrix by a scalar. Falls back to exact division on CPU.
6721+
"""
6722+
...
6723+
6724+
@over
6725+
def div_approx(a: Float, b: Matrix[Float, Any, Any]) -> Matrix[Float, Any, Any]:
6726+
"""Divide two values using approximate GPU intrinsics.
6727+
6728+
Divide a scalar by each element of a matrix. Falls back to exact division on CPU.
6729+
"""
6730+
...
6731+
6732+
@over
6733+
def div_approx(a: Quaternion[Float], b: Float) -> Quaternion[Float]:
6734+
"""Divide two values using approximate GPU intrinsics.
6735+
6736+
Divide a quaternion by a scalar.
6737+
6738+
The result is unnormalized. Falls back to exact division on CPU.
6739+
"""
6740+
...
6741+
6742+
@over
6743+
def div_approx(a: Float, b: Quaternion[Float]) -> Quaternion[Float]:
6744+
"""Divide two values using approximate GPU intrinsics.
6745+
6746+
Divide a scalar by a quaternion.
6747+
6748+
The result is unnormalized. Falls back to exact division on CPU.
6749+
"""
6750+
...
6751+
66686752
def floordiv(a: Scalar, b: Scalar) -> Scalar:
66696753
"""Divide two scalars using floor division."""
66706754
...

warp/_src/builtins.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,45 @@ def inverse_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, A
843843
require_original_output_arg=True,
844844
)
845845

846+
add_builtin(
847+
"inverse_approx",
848+
input_types={"a": matrix(shape=(2, 2), dtype=Float)},
849+
value_func=inverse_value_func,
850+
native_func="approx_inverse",
851+
group="Vector Math",
852+
doc="""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
853+
854+
Falls back to exact inverse on CPU.""",
855+
require_original_output_arg=True,
856+
export=False,
857+
)
858+
859+
add_builtin(
860+
"inverse_approx",
861+
input_types={"a": matrix(shape=(3, 3), dtype=Float)},
862+
value_func=inverse_value_func,
863+
native_func="approx_inverse",
864+
group="Vector Math",
865+
doc="""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
866+
867+
Falls back to exact inverse on CPU.""",
868+
require_original_output_arg=True,
869+
export=False,
870+
)
871+
872+
add_builtin(
873+
"inverse_approx",
874+
input_types={"a": matrix(shape=(4, 4), dtype=Float)},
875+
value_func=inverse_value_func,
876+
native_func="approx_inverse",
877+
group="Vector Math",
878+
doc="""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
879+
880+
Falls back to exact inverse on CPU.""",
881+
require_original_output_arg=True,
882+
export=False,
883+
)
884+
846885

847886
def determinant_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
848887
if arg_types is None:
@@ -11079,6 +11118,89 @@ def matmat_mul_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str
1107911118
group="Operators",
1108011119
)
1108111120

11121+
add_builtin(
11122+
"div_approx",
11123+
input_types={"a": Float, "b": Float},
11124+
value_func=sametypes_create_value_func(Float),
11125+
native_func="approx_div",
11126+
doc="""Divide two values using approximate GPU intrinsics.
11127+
11128+
Falls back to exact division on CPU.""",
11129+
group="Operators",
11130+
require_original_output_arg=True,
11131+
export=False,
11132+
)
11133+
add_builtin(
11134+
"div_approx",
11135+
input_types={"a": vector(length=Any, dtype=Float), "b": Float},
11136+
value_func=scalar_mul_create_value_func(vector(length=Any, dtype=Float)),
11137+
native_func="approx_div",
11138+
doc="""Divide two values using approximate GPU intrinsics.
11139+
11140+
Divide a vector by a scalar. Falls back to exact division on CPU.""",
11141+
group="Operators",
11142+
export=False,
11143+
)
11144+
add_builtin(
11145+
"div_approx",
11146+
input_types={"a": Float, "b": vector(length=Any, dtype=Float)},
11147+
value_func=scalar_mul_create_value_func(vector(length=Any, dtype=Float)),
11148+
native_func="approx_div",
11149+
doc="""Divide two values using approximate GPU intrinsics.
11150+
11151+
Divide a scalar by each element of a vector. Falls back to exact division on CPU.""",
11152+
group="Operators",
11153+
export=False,
11154+
)
11155+
add_builtin(
11156+
"div_approx",
11157+
input_types={"a": matrix(shape=(Any, Any), dtype=Float), "b": Float},
11158+
value_func=scalar_mul_create_value_func(matrix(shape=(Any, Any), dtype=Float)),
11159+
native_func="approx_div",
11160+
doc="""Divide two values using approximate GPU intrinsics.
11161+
11162+
Divide a matrix by a scalar. Falls back to exact division on CPU.""",
11163+
group="Operators",
11164+
export=False,
11165+
)
11166+
add_builtin(
11167+
"div_approx",
11168+
input_types={"a": Float, "b": matrix(shape=(Any, Any), dtype=Float)},
11169+
value_func=scalar_mul_create_value_func(matrix(shape=(Any, Any), dtype=Float)),
11170+
native_func="approx_div",
11171+
doc="""Divide two values using approximate GPU intrinsics.
11172+
11173+
Divide a scalar by each element of a matrix. Falls back to exact division on CPU.""",
11174+
group="Operators",
11175+
export=False,
11176+
)
11177+
add_builtin(
11178+
"div_approx",
11179+
input_types={"a": quaternion(dtype=Float), "b": Float},
11180+
value_func=scalar_mul_create_value_func(quaternion(dtype=Float)),
11181+
native_func="approx_div",
11182+
doc="""Divide two values using approximate GPU intrinsics.
11183+
11184+
Divide a quaternion by a scalar.
11185+
11186+
The result is unnormalized. Falls back to exact division on CPU.""",
11187+
group="Operators",
11188+
export=False,
11189+
)
11190+
add_builtin(
11191+
"div_approx",
11192+
input_types={"a": Float, "b": quaternion(dtype=Float)},
11193+
value_func=scalar_mul_create_value_func(quaternion(dtype=Float)),
11194+
native_func="approx_div",
11195+
doc="""Divide two values using approximate GPU intrinsics.
11196+
11197+
Divide a scalar by a quaternion.
11198+
11199+
The result is unnormalized. Falls back to exact division on CPU.""",
11200+
group="Operators",
11201+
export=False,
11202+
)
11203+
1108211204
add_builtin(
1108311205
"floordiv",
1108411206
input_types={"a": Scalar, "b": Scalar},

warp/native/builtin.h

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,58 @@ static_assert(sizeof(half) == 2, "Size of half / float16 type must be 2-bytes");
182182

183183
typedef half float16;
184184

185+
// Approximate division/reciprocal intrinsics
186+
#if defined(__CUDA_ARCH__)
187+
188+
inline __device__ float approx_rcp(float a)
189+
{
190+
float r;
191+
asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a));
192+
return r;
193+
}
194+
195+
inline __device__ double approx_rcp(double a)
196+
{
197+
double r;
198+
asm("rcp.approx.ftz.f64 %0, %1;" : "=d"(r) : "d"(a));
199+
return r;
200+
}
201+
202+
inline __device__ float16 approx_rcp(float16 a)
203+
{
204+
return float16(1.0f / float(a)); // No approx PTX for f16; falls back to exact fp32 reciprocal
205+
}
206+
207+
inline __device__ float approx_div(float a, float b)
208+
{
209+
float r;
210+
asm("div.approx.f32 %0, %1, %2;" : "=f"(r) : "f"(a), "f"(b));
211+
return r;
212+
}
213+
214+
inline __device__ double approx_div(double a, double b)
215+
{
216+
// No div.approx.f64 in PTX; use rcp then multiply
217+
return a * approx_rcp(b);
218+
}
219+
220+
inline __device__ float16 approx_div(float16 a, float16 b)
221+
{
222+
return float16(float(a) / float(b)); // No approx PTX for f16; falls back to exact fp32 division
223+
}
224+
225+
#else
226+
227+
// CPU fallbacks: exact division
228+
inline CUDA_CALLABLE float approx_rcp(float a) { return 1.0f / a; }
229+
inline CUDA_CALLABLE double approx_rcp(double a) { return 1.0 / a; }
230+
inline CUDA_CALLABLE float16 approx_rcp(float16 a) { return float16(1.0f / float(a)); }
231+
inline CUDA_CALLABLE float approx_div(float a, float b) { return a / b; }
232+
inline CUDA_CALLABLE double approx_div(double a, double b) { return a / b; }
233+
inline CUDA_CALLABLE float16 approx_div(float16 a, float16 b) { return float16(float(a) / float(b)); }
234+
235+
#endif
236+
185237
#if defined(__CUDA_ARCH__)
186238

187239
CUDA_CALLABLE inline half float_to_half(float x)
@@ -475,6 +527,20 @@ DECLARE_FLOAT_OPS(float16)
475527
DECLARE_FLOAT_OPS(float32)
476528
DECLARE_FLOAT_OPS(float64)
477529

530+
// Adjoint for approximate scalar division
531+
#define DECLARE_ADJ_APPROX_DIV(T) \
532+
inline CUDA_CALLABLE void adj_approx_div(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret) \
533+
{ \
534+
adj_a += approx_div(adj_ret, b); \
535+
adj_b -= approx_div(T(adj_ret * ret), b); \
536+
}
537+
538+
DECLARE_ADJ_APPROX_DIV(float16)
539+
DECLARE_ADJ_APPROX_DIV(float32)
540+
DECLARE_ADJ_APPROX_DIV(float64)
541+
542+
#undef DECLARE_ADJ_APPROX_DIV
543+
478544

479545
// basic ops for float types
480546
inline CUDA_CALLABLE float16 mod(float16 a, float16 b)

0 commit comments

Comments
 (0)