NVIDIA
diff --git a/‎CHANGELOG.md‎
Lines changed: 4 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/language_reference/builtins.rst‎
Lines changed: 2 additions & 0 deletions b/‎docs/language_reference/builtins.rst‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎warp/__init__.pyi‎
Lines changed: 84 additions & 0 deletions b/‎warp/__init__.pyi‎
Lines changed: 84 additions & 0 deletions
diff --git a/‎warp/_src/builtins.py‎
Lines changed: 122 additions & 0 deletions b/‎warp/_src/builtins.py‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎warp/native/builtin.h‎
Lines changed: 66 additions & 0 deletions b/‎warp/native/builtin.h‎
Lines changed: 66 additions & 0 deletions
@@ -37,6 +37,10 @@
   [GH-1168](https://github.com/NVIDIA/warp/issues/1168)).
 - Add quaternion and spatial transformation helpers (`wp.quat_from_euler()`, `wp.quat_to_euler()`,
   `wp.transform_twist()`, etc.) ([GH-1237](https://github.com/NVIDIA/warp/issues/1237)).
+- Add `wp.div_approx()` and `wp.inverse_approx()` built-ins backed by approximate PTX intrinsics
+  (`div.approx.f32`, `rcp.approx.ftz.f64`) on GPU. Only floating-point types are supported;
+  `float16` operands and CPU kernels fall back to exact arithmetic
+  ([GH-1199](https://github.com/NVIDIA/warp/issues/1199)).
 - Add public API for marching cubes lookup tables as class attributes on `wp.MarchingCubes`: `CUBE_CORNER_OFFSETS`,
   `EDGE_TO_CORNERS`, `CASE_TO_TRI_RANGE`, and `TRI_LOCAL_INDICES`. These enable custom marching cubes implementations
   for advanced use cases like sparse volume extraction ([GH-1151](https://github.com/NVIDIA/warp/issues/1151)).
 
@@ -75,6 +75,7 @@ Vector Math
    get_diag
    identity
    inverse
+   inverse_approx
    length
    length_sq
    matrix
@@ -390,6 +391,7 @@ Operators
    bit_or
    bit_xor
    div
+   div_approx
    floordiv
    invert
    lshift
 
@@ -2237,6 +2237,30 @@ def inverse(a: Matrix[Float, Literal[4], Literal[4]]) -> Matrix[Float, Any, Any]
     """Compute the inverse of matrix ``a``."""
     ...
 
+@over
+def inverse_approx(a: Matrix[Float, Literal[2], Literal[2]]) -> Matrix[Float, Any, Any]:
+    """Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
+
+    Falls back to exact inverse on CPU.
+    """
+    ...  # 2x2 overload; registered in warp/_src/builtins.py with native_func="approx_inverse"
+
+@over
+def inverse_approx(a: Matrix[Float, Literal[3], Literal[3]]) -> Matrix[Float, Any, Any]:
+    """Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
+
+    Falls back to exact inverse on CPU.
+    """
+    ...  # 3x3 overload; registered in warp/_src/builtins.py with native_func="approx_inverse"
+
+@over
+def inverse_approx(a: Matrix[Float, Literal[4], Literal[4]]) -> Matrix[Float, Any, Any]:
+    """Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
+
+    Falls back to exact inverse on CPU.
+    """
+    ...  # 4x4 overload; registered in warp/_src/builtins.py with native_func="approx_inverse"
+
 @over
 def determinant(a: Matrix[Float, Literal[2], Literal[2]]) -> Float:
     """Compute the determinant of matrix ``a``."""
@@ -6665,6 +6689,66 @@ def div(a: Any, b: Tile[Any, tuple[int, ...]]) -> Tile[Any, tuple[int, ...]]:
     """
     ...
 
+@over
+def div_approx(a: Float, b: Float) -> Float:
+    """Divide two values using approximate GPU intrinsics.
+
+    Falls back to exact division on CPU.
+    """
+    ...  # scalar / scalar; registered with native_func="approx_div"
+
+@over
+def div_approx(a: Vector[Float, Any], b: Float) -> Vector[Float, Any]:
+    """Divide two values using approximate GPU intrinsics.
+
+    Divide a vector by a scalar. Falls back to exact division on CPU.
+    """
+    ...  # vector / scalar; registered with native_func="approx_div"
+
+@over
+def div_approx(a: Float, b: Vector[Float, Any]) -> Vector[Float, Any]:
+    """Divide two values using approximate GPU intrinsics.
+
+    Divide a scalar by each element of a vector. Falls back to exact division on CPU.
+    """
+    ...  # scalar / vector; registered with native_func="approx_div"
+
+@over
+def div_approx(a: Matrix[Float, Any, Any], b: Float) -> Matrix[Float, Any, Any]:
+    """Divide two values using approximate GPU intrinsics.
+
+    Divide a matrix by a scalar. Falls back to exact division on CPU.
+    """
+    ...  # matrix / scalar; registered with native_func="approx_div"
+
+@over
+def div_approx(a: Float, b: Matrix[Float, Any, Any]) -> Matrix[Float, Any, Any]:
+    """Divide two values using approximate GPU intrinsics.
+
+    Divide a scalar by each element of a matrix. Falls back to exact division on CPU.
+    """
+    ...  # scalar / matrix; registered with native_func="approx_div"
+
+@over
+def div_approx(a: Quaternion[Float], b: Float) -> Quaternion[Float]:
+    """Divide two values using approximate GPU intrinsics.
+
+    Divide a quaternion by a scalar.
+
+    The result is unnormalized. Falls back to exact division on CPU.
+    """
+    ...  # quaternion / scalar; registered with native_func="approx_div"
+
+@over
+def div_approx(a: Float, b: Quaternion[Float]) -> Quaternion[Float]:
+    """Divide two values using approximate GPU intrinsics.
+
+    Divide a scalar by a quaternion.
+
+    The result is unnormalized. Falls back to exact division on CPU.
+    """
+    ...  # scalar / quaternion; registered with native_func="approx_div"
+
 def floordiv(a: Scalar, b: Scalar) -> Scalar:
     """Divide two scalars using floor division."""
     ...
 
@@ -843,6 +843,45 @@ def inverse_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, A
     require_original_output_arg=True,
 )
 
+add_builtin(
+    "inverse_approx",
+    input_types={"a": matrix(shape=(2, 2), dtype=Float)},  # 2x2 overload
+    value_func=inverse_value_func,  # value func defined earlier in this module
+    native_func="approx_inverse",  # name of the native (C++) function implementing this overload
+    group="Vector Math",
+    doc="""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
+
+    Falls back to exact inverse on CPU.""",
+    require_original_output_arg=True,  # NOTE(review): presumably the adjoint consumes the forward result - confirm
+    export=False,
+)
+
+add_builtin(
+    "inverse_approx",
+    input_types={"a": matrix(shape=(3, 3), dtype=Float)},  # 3x3 overload
+    value_func=inverse_value_func,  # same value func as the 2x2 overload
+    native_func="approx_inverse",  # same native name; C++ overloading selects the 3x3 variant (TODO confirm)
+    group="Vector Math",
+    doc="""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
+
+    Falls back to exact inverse on CPU.""",
+    require_original_output_arg=True,  # kept identical across all three registrations
+    export=False,
+)
+
+add_builtin(
+    "inverse_approx",
+    input_types={"a": matrix(shape=(4, 4), dtype=Float)},  # 4x4 overload
+    value_func=inverse_value_func,  # same value func as the 2x2 overload
+    native_func="approx_inverse",  # same native name; C++ overloading selects the 4x4 variant (TODO confirm)
+    group="Vector Math",
+    doc="""Compute the inverse of matrix ``a`` using approximate GPU intrinsics.
+
+    Falls back to exact inverse on CPU.""",
+    require_original_output_arg=True,  # kept identical across all three registrations
+    export=False,
+)
+
 
 def determinant_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]):
     if arg_types is None:
@@ -11079,6 +11118,89 @@ def matmat_mul_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str
     group="Operators",
 )
 
+add_builtin(
+    "div_approx",
+    input_types={"a": Float, "b": Float},  # scalar / scalar
+    value_func=sametypes_create_value_func(Float),
+    native_func="approx_div",  # scalar overloads are defined in warp/native/builtin.h
+    doc="""Divide two values using approximate GPU intrinsics.
+
+    Falls back to exact division on CPU.""",
+    group="Operators",
+    require_original_output_arg=True,  # adj_approx_div() in builtin.h takes the forward result `ret`
+    export=False,
+)
+add_builtin(
+    "div_approx",
+    input_types={"a": vector(length=Any, dtype=Float), "b": Float},  # vector / scalar
+    value_func=scalar_mul_create_value_func(vector(length=Any, dtype=Float)),
+    native_func="approx_div",  # same native name as the scalar overload
+    doc="""Divide two values using approximate GPU intrinsics.
+
+    Divide a vector by a scalar. Falls back to exact division on CPU.""",
+    group="Operators",
+    export=False,
+)
+add_builtin(
+    "div_approx",
+    input_types={"a": Float, "b": vector(length=Any, dtype=Float)},  # scalar / vector
+    value_func=scalar_mul_create_value_func(vector(length=Any, dtype=Float)),
+    native_func="approx_div",  # same native name as the scalar overload
+    doc="""Divide two values using approximate GPU intrinsics.
+
+    Divide a scalar by each element of a vector. Falls back to exact division on CPU.""",
+    group="Operators",
+    export=False,
+)
+add_builtin(
+    "div_approx",
+    input_types={"a": matrix(shape=(Any, Any), dtype=Float), "b": Float},  # matrix / scalar
+    value_func=scalar_mul_create_value_func(matrix(shape=(Any, Any), dtype=Float)),
+    native_func="approx_div",  # same native name as the scalar overload
+    doc="""Divide two values using approximate GPU intrinsics.
+
+    Divide a matrix by a scalar. Falls back to exact division on CPU.""",
+    group="Operators",
+    export=False,
+)
+add_builtin(
+    "div_approx",
+    input_types={"a": Float, "b": matrix(shape=(Any, Any), dtype=Float)},  # scalar / matrix
+    value_func=scalar_mul_create_value_func(matrix(shape=(Any, Any), dtype=Float)),
+    native_func="approx_div",  # same native name as the scalar overload
+    doc="""Divide two values using approximate GPU intrinsics.
+
+    Divide a scalar by each element of a matrix. Falls back to exact division on CPU.""",
+    group="Operators",
+    export=False,
+)
+add_builtin(
+    "div_approx",
+    input_types={"a": quaternion(dtype=Float), "b": Float},  # quaternion / scalar
+    value_func=scalar_mul_create_value_func(quaternion(dtype=Float)),
+    native_func="approx_div",  # same native name as the scalar overload
+    doc="""Divide two values using approximate GPU intrinsics.
+
+    Divide a quaternion by a scalar.
+
+    The result is unnormalized. Falls back to exact division on CPU.""",
+    group="Operators",
+    export=False,
+)
+add_builtin(
+    "div_approx",
+    input_types={"a": Float, "b": quaternion(dtype=Float)},  # scalar / quaternion
+    value_func=scalar_mul_create_value_func(quaternion(dtype=Float)),
+    native_func="approx_div",  # same native name as the scalar overload
+    doc="""Divide two values using approximate GPU intrinsics.
+
+    Divide a scalar by a quaternion.
+
+    The result is unnormalized. Falls back to exact division on CPU.""",
+    group="Operators",
+    export=False,
+)
+
 add_builtin(
     "floordiv",
     input_types={"a": Scalar, "b": Scalar},
 
@@ -182,6 +182,58 @@ static_assert(sizeof(half) == 2, "Size of half / float16 type must be 2-bytes");
 
 typedef half float16;
 
+// Approximate division/reciprocal helpers: PTX approx instructions in device code, exact math otherwise
+#if defined(__CUDA_ARCH__)  // device-code compilation
+
+inline __device__ float approx_rcp(float a)
+{
+    float r;
+    asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a));  // r = ~(1 / a) via a single PTX instruction
+    return r;
+}
+
+inline __device__ double approx_rcp(double a)
+{
+    double r;
+    asm("rcp.approx.ftz.f64 %0, %1;" : "=d"(r) : "d"(a));  // NOTE(review): PTX documents this as a coarse, flush-to-zero approximation - confirm accuracy suffices
+    return r;
+}
+
+inline __device__ float16 approx_rcp(float16 a)
+{
+    return float16(1.0f / float(a));  // No approx PTX for f16; falls back to exact fp32 reciprocal, then rounds to half
+}
+
+inline __device__ float approx_div(float a, float b)
+{
+    float r;
+    asm("div.approx.f32 %0, %1, %2;" : "=f"(r) : "f"(a), "f"(b));  // r = ~(a / b) via a single PTX instruction
+    return r;
+}
+
+inline __device__ double approx_div(double a, double b)
+{
+    // No div.approx.f64 in PTX; compute a * (1 / b) using the approximate f64 reciprocal above
+    return a * approx_rcp(b);
+}
+
+inline __device__ float16 approx_div(float16 a, float16 b)
+{
+    return float16(float(a) / float(b));  // No approx PTX for f16; falls back to exact fp32 division, then rounds to half
+}
+
+#else  // not compiling device code
+
+// CPU fallbacks: exact division, exposing the same overload set as the device path
+inline CUDA_CALLABLE float approx_rcp(float a) { return 1.0f / a; }
+inline CUDA_CALLABLE double approx_rcp(double a) { return 1.0 / a; }
+inline CUDA_CALLABLE float16 approx_rcp(float16 a) { return float16(1.0f / float(a)); }
+inline CUDA_CALLABLE float approx_div(float a, float b) { return a / b; }
+inline CUDA_CALLABLE double approx_div(double a, double b) { return a / b; }
+inline CUDA_CALLABLE float16 approx_div(float16 a, float16 b) { return float16(float(a) / float(b)); }
+
+#endif  // defined(__CUDA_ARCH__)
+
 #if defined(__CUDA_ARCH__)
 
 CUDA_CALLABLE inline half float_to_half(float x)
@@ -475,6 +527,20 @@ DECLARE_FLOAT_OPS(float16)
 DECLARE_FLOAT_OPS(float32)
 DECLARE_FLOAT_OPS(float64)
 
+// Adjoint of ret = approx_div(a, b): adj_a += adj_ret / b and adj_b -= adj_ret * ret / b, both via approx_div
+#define DECLARE_ADJ_APPROX_DIV(T) \
+inline CUDA_CALLABLE void adj_approx_div(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret) \
+{ \
+    adj_a += approx_div(adj_ret, b); \
+    adj_b -= approx_div(T(adj_ret * ret), b); \
+}
+
+DECLARE_ADJ_APPROX_DIV(float16)
+DECLARE_ADJ_APPROX_DIV(float32)
+DECLARE_ADJ_APPROX_DIV(float64)
+
+#undef DECLARE_ADJ_APPROX_DIV  // helper macro is only needed for the three instantiations above
+
 
 // basic ops for float types
 inline CUDA_CALLABLE float16 mod(float16 a, float16 b)