
Commit f0357dc

Fix API documentation. Improve measuring accuracy. Fix vector_math test not touching input: prevents constant folding.
Parent: 41d072c

5 files changed: +74 −46 lines

src/IROperator.cpp

Lines changed: 2 additions & 6 deletions
@@ -1380,14 +1380,10 @@ Expr fast_log(const Expr &x, ApproximationPrecision prec) {
     return Call::make(x.type(), Call::fast_log, {x, make_approximation_precision_info(prec)}, Call::PureIntrinsic);
 }
 
-Expr fast_pow(Expr x, Expr y, ApproximationPrecision prec) {
+Expr fast_pow(const Expr &x, const Expr &y, ApproximationPrecision prec) {
     if (auto i = as_const_int(y)) {
-        return raise_to_integer_power(std::move(x), *i);
+        return raise_to_integer_power(x, *i);
     }
-
-    // TODO: figure out what to do with these casts...
-    x = cast<float>(std::move(x));
-    y = cast<float>(std::move(y));
     return Call::make(x.type(), Call::fast_pow, {x, y, make_approximation_precision_info(prec)}, Call::PureIntrinsic);
 }
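
Aside (not part of this commit): the constant-integer branch kept above means a literal integral exponent never reaches the fast_pow intrinsic; it is folded into plain multiplications via raise_to_integer_power. A minimal front-end sketch of both paths, where the cast to float and the compile_jit() calls are assumptions rather than code from this commit:

    // Sketch only: illustrates the dispatch in fast_pow() above.
    #include "Halide.h"
    using namespace Halide;

    int main() {
        Var x;
        Func f, g;
        // Integer exponent: taken by the as_const_int(y) branch and lowered
        // to raise_to_integer_power, i.e. x * x * x.
        f(x) = fast_pow(cast<float>(x), 3);
        // Non-integer exponent: remains a Call::fast_pow intrinsic and is
        // approximated later (polynomial or hardware instruction).
        g(x) = fast_pow(cast<float>(x), 2.5f);
        f.compile_jit();
        g.compile_jit();
        return 0;
    }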

src/IROperator.h

Lines changed: 49 additions & 25 deletions
@@ -979,21 +979,40 @@ Expr pow(Expr x, Expr y);
 Expr erf(const Expr &x);
 
 /** Struct that allows the user to specify precision requirements for functions
- * that are approximated. These polynomials can be
- * optimized for four different metrics: Mean Squared Error, Maximum Absolute Error,
- * Maximum Units in Last Place (ULP) Error, or a 50%/50% blend of MAE and MULPE.
- *
- * Orthogonally to the optimization objective, these polynomials can vary
- * in degree. Higher degree polynomials will give more precise results.
- * Note that instead of specifying the degree, the number of terms is used instead.
- * E.g., even (i.e., symmetric) functions may be implemented using only even powers,
- * for which a number of terms of 4 would actually mean that terms
- * in [1, x^2, x^4, x^6] are used, which is degree 6.
- *
- * Additionally, if you don't care about number of terms in the polynomial
- * and you do care about the maximal absolute error the approximation may have
- * over the domain, you may specify values and the implementation
- * will decide the appropriate polynomial degree that achieves this precision.
+ * that are approximated. Several functions can be approximated using specialized
+ * hardware instructions. If no hardware instructions are available, approximations
+ * are implemented in Halide using polynomials or potentially Padé approximants.
+ * Both the hardware instructions and the in-house approximations have a certain behavior
+ * and precision. This struct allows you to specify which behavior and precision you
+ * are interested in. Halide will select an appropriate implementation that satisfies
+ * these requirements.
+ *
+ * There are two main aspects to specifying the precision:
+ *  1. The objective for which the approximation is optimized. This can be to reduce the
+ *     maximal absolute error (MAE), or to reduce the maximal error measured in
+ *     units in last place (ULP). Some applications naturally require low
+ *     absolute error, whereas others favor low relative error (for which maximal ULP
+ *     error is a good metric).
+ *  2. The minimal required precision, in either MAE or MULPE.
+ *
+ * Both of these parameters are optional:
+ *
+ * - When omitting the optimization objective (i.e., AUTO), Halide is free to pick any
+ *   implementation that satisfies the precision requirement. Sometimes, hardware instructions
+ *   have vendor-specific behavior (one vendor might optimize MAE, another might optimize
+ *   MULPE), so requiring a specific behavior might rule out the ability to use the hardware
+ *   instruction if it doesn't behave the way requested. When polynomial approximations are
+ *   selected and AUTO is requested, Halide will pick a sensible optimization objective for
+ *   each function.
+ * - When omitting the precision requirements (both \ref constraint_max_ulp_error and
+ *   \ref constraint_max_absolute_error), Halide will try to favor hardware instructions
+ *   when available, in order to favor speed. Otherwise, Halide will select a polynomial with
+ *   reasonable precision.
+ *
+ * The default-initialized ApproximationPrecision consists of AUTO behavior and default precision.
+ * In general, when only approximate values are required without hard requirements on their
+ * precision, calling any of the fast_ variants without specifying the ApproximationPrecision
+ * struct is fine, and will most likely get you the fastest implementation possible.
  */
 struct ApproximationPrecision {
     enum OptimizationObjective {
@@ -1067,45 +1086,50 @@ Expr fast_atan2(const Expr &y, const Expr &x, ApproximationPrecision = {});
 
 /** Fast approximate log for Float(32).
  * Returns nonsense for x <= 0.0f.
- * Accurate up to the last 5 bits of the mantissa.
+ * Approximations available up to max 5 ULP, mean 2 ULP error.
  * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of lg2.approx.f32 and a multiplication.
+ * See \ref ApproximationPrecision for details on specifying precision.
  */
 Expr fast_log(const Expr &x, ApproximationPrecision precision = {});
 
 /** Fast approximate exp for Float(32).
  * Returns nonsense for inputs that would overflow.
- * Typically accurate up to the last 5 bits of the mantissa.
- * Approximation
+ * Approximations available up to max 3 ULP, mean 1 ULP error.
  * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and a multiplication.
+ * See \ref ApproximationPrecision for details on specifying precision.
  */
 Expr fast_exp(const Expr &x, ApproximationPrecision precision = {});
 
 /** Fast approximate pow for Float(32).
  * Returns nonsense for x < 0.0f.
- * Accurate up to the last 5 bits of the mantissa for typical exponents.
+ * Returns 1 when x == y == 0.0.
+ * Approximations accurate up to max 53 ULP, mean 13 ULP.
  * Gets worse when approaching overflow.
  * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and lg2.approx.f32.
+ * See \ref ApproximationPrecision for details on specifying precision.
  */
-Expr fast_pow(Expr x, Expr y, ApproximationPrecision precision = {});
+Expr fast_pow(const Expr &x, const Expr &y, ApproximationPrecision precision = {});
 
 /** Fast approximate tanh for Float(32).
- * Vectorizes cleanly when using polynomials (caveat: no polynomial approximation implemented yet).
+ * Approximations accurate to 2e-7 MAE and max 2500 ULP (on average < 1 ULP) available.
+ * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and lg2.approx.f32.
+ * See \ref ApproximationPrecision for details on specifying precision.
  */
 Expr fast_tanh(const Expr &x, ApproximationPrecision precision = {});
 
 /** Fast approximate inverse for Float(32). Corresponds to the rcpps
- * instruction on x86, and the vrecpe instruction on ARM. Vectorizes
- * cleanly. Note that this can produce slightly different results
- * across different implementations of the same architecture (e.g. AMD vs Intel),
- * even when strict_float is enabled. */
+ * instruction on x86, the vrecpe instruction on ARM, and the rcp.approx.f32 instruction on CUDA.
+ * Vectorizes cleanly.
+ * Note that this can produce slightly different results across different implementations
+ * of the same architecture (e.g. AMD vs Intel), even when strict_float is enabled. */
 Expr fast_inverse(Expr x);
 
 /** Fast approximate inverse square root for Float(32). Corresponds to
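
To make the new documentation concrete, here is a small usage sketch (not part of this commit). The field referenced by \ref constraint_max_ulp_error comes from the doc comment above; the exact spelling of the objective member and the MULPE enumerator are assumptions about the struct's layout:

    // Hedged sketch: requesting precision from the fast_* intrinsics
    // documented above. Members marked "assumed" are illustrative only.
    #include "Halide.h"
    using namespace Halide;

    int main() {
        ImageParam input(Float(32), 2);
        Var x, y;

        // Default-initialized precision: AUTO objective, default precision.
        // Halide may pick a hardware instruction (e.g. lg2.approx.f32 on CUDA).
        Expr cheap = fast_log(input(x, y));

        // Ask for a ULP-optimized approximation within 5 ULP, matching the
        // "max 5 ULP" figure quoted for fast_log.
        ApproximationPrecision tight;
        tight.optimized_for = ApproximationPrecision::MULPE;  // assumed member/enumerator names
        tight.constraint_max_ulp_error = 5;                   // field referenced in the doc comment
        Expr precise = fast_log(input(x, y), tight);

        Func out;
        out(x, y) = cheap + precise;
        out.compile_jit();
        return 0;
    }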

src/runtime/ptx_dev.ll

Lines changed: 0 additions & 1 deletion
@@ -54,7 +54,6 @@ define weak_odr double @sqrt_f64(double %x) nounwind uwtable readnone alwaysinli
 declare float @__nv_frcp_rn(float) nounwind readnone
 
 define weak_odr float @fast_inverse_f32(float %x) nounwind uwtable readnone alwaysinline {
-  ; %y = tail call float @__nv_frcp_rn(float %x) nounwind readnone
   %y = call float asm "rcp.approx.f32 $0, $1;", "=f,f" (float %x)
   ret float %y
 }

test/correctness/vector_math.cpp

Lines changed: 10 additions & 10 deletions
@@ -526,8 +526,8 @@ bool test(int lanes, int seed) {
     if (type_of<A>() == Float(32)) {
         if (verbose) printf("Fast transcendentals\n");
         Buffer<float> im15, im16, im17, im18, im19, im20;
-        Expr a = input(x, y) * 0.5f;
-        Expr b = input((x + 1) % W, y) * 0.5f;
+        Expr a = input(x, y);
+        Expr b = input((x + 1) % W, y);
         {
             Func f15;
             f15(x, y) = log(a);
@@ -568,8 +568,8 @@ bool test(int lanes, int seed) {
 
         for (int y = 0; y < H; y++) {
             for (int x = 0; x < W; x++) {
-                float a = float(input(x, y)) * 0.5f;
-                float b = float(input((x + 1) % W, y)) * 0.5f;
+                float a = float(input(x, y));
+                float b = float(input((x + 1) % W, y));
                 float correct_log = logf(a);
                 float correct_exp = expf(b);
                 float correct_pow = powf(a, b / 16.0f);
@@ -626,16 +626,16 @@ bool test(int lanes, int seed) {
                            a, b / 16.0f, im17(x, y), correct_pow, correct_pow_mantissa, pow_mantissa);
                 }
                 if (std::isfinite(correct_log) && fast_log_mantissa_error > 64) {
-                    printf("fast_log(%f) = %1.10f instead of %1.10f (mantissa: %d vs %d)\n",
-                           a, im18(x, y), correct_log, correct_log_mantissa, fast_log_mantissa);
+                    printf("fast_log(%f) = %1.10f instead of %1.10f (mantissa: %d vs %d ; error %d)\n",
+                           a, im18(x, y), correct_log, correct_log_mantissa, fast_log_mantissa, fast_log_mantissa_error);
                 }
                 if (std::isfinite(correct_exp) && fast_exp_mantissa_error > 64) {
-                    printf("fast_exp(%f) = %1.10f instead of %1.10f (mantissa: %d vs %d)\n",
-                           b, im19(x, y), correct_exp, correct_exp_mantissa, fast_exp_mantissa);
+                    printf("fast_exp(%f) = %1.10f instead of %1.10f (mantissa: %d vs %d ; error %d)\n",
+                           b, im19(x, y), correct_exp, correct_exp_mantissa, fast_exp_mantissa, fast_exp_mantissa_error);
                 }
                 if (a >= 0 && std::isfinite(correct_pow) && fast_pow_mantissa_error > 128) {
-                    printf("fast_pow(%f, %f) = %1.10f instead of %1.10f (mantissa: %d vs %d)\n",
-                           a, b / 16.0f, im20(x, y), correct_pow, correct_pow_mantissa, fast_pow_mantissa);
+                    printf("fast_pow(%f, %f) = %1.10f instead of %1.10f (mantissa: %d vs %d ; error %d)\n",
+                           a, b / 16.0f, im20(x, y), correct_pow, correct_pow_mantissa, fast_pow_mantissa, fast_pow_mantissa_error);
                 }
             }
         }
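
The thresholds above (64 for fast_log/fast_exp, 128 for fast_pow) count error in trailing bits of the float32 mantissa. The helper the test uses is outside this diff; a standalone sketch of that style of measurement, comparing IEEE-754 bit patterns directly, might look like this:

    // Sketch only: not the helper from vector_math.cpp (that code is outside
    // this diff). For finite floats of the same sign, the difference of the
    // raw bit patterns equals the distance in ULPs.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    static int32_t mantissa_error(float approx, float correct) {
        int32_t a_bits, c_bits;
        std::memcpy(&a_bits, &approx, sizeof(float));
        std::memcpy(&c_bits, &correct, sizeof(float));
        return std::abs(a_bits - c_bits);
    }

    int main() {
        float x = 2.37f;
        float correct = std::log(x);
        float approx = correct * (1.0f + 3e-6f);  // stand-in for a fast_log result
        // A tolerance of 64 ULPs corresponds to roughly the last 6 mantissa bits.
        std::printf("error = %d ULPs\n", mantissa_error(approx, correct));
        return 0;
    }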

tools/polynomial_optimizer.py

Lines changed: 13 additions & 4 deletions
@@ -115,6 +115,12 @@ def optimize_approximation(loss, order):
    print("Unknown function:", args.func)
    exit(1)
 
+X_dense = np.linspace(lower, upper, 512 * 31 * 11)
+if lower >= 0.0:
+    loglow = -5.0 if lower == 0.0 else np.log(lower)
+    X_dense = np.concatenate([X_dense, np.logspace(loglow, np.log(upper), num=2048 * 17)])
+    X_dense = np.sort(X_dense)
+
 
 if X is None: X = np.linspace(lower, upper, 512 * 31)
 target = func(X)
@@ -203,16 +209,19 @@ def optimize_approximation(loss, order):
     float64_metrics = Metrics(mean_squared_error, max_abs_error, max_ulp_error)
 
     # Reevaluate with float32 precision.
-    f32_powers = np.power(X[:,None].astype(np.float32), exponents).astype(np.float32)
-    f32_y_hat = fixed_part.astype(np.float32) + np.sum((f32_powers * coeffs.astype(np.float32))[:,::-1], axis=-1).astype(np.float32)
-    f32_diff = f32_y_hat - target.astype(np.float32)
+    f32_x_dense = X_dense.astype(np.float32)
+    f32_target_dense = func(f32_x_dense).astype(np.float32)
+    f32_fixed_part_dense = func_fixed_part(f32_x_dense)
+    f32_powers = np.power(f32_x_dense[:,None], exponents).astype(np.float32)
+    f32_y_hat = f32_fixed_part_dense.astype(np.float32) + np.sum((f32_powers * coeffs.astype(np.float32))[:,::-1], axis=-1).astype(np.float32)
+    f32_diff = f32_y_hat - f32_target_dense.astype(np.float32)
     f32_abs_diff = np.abs(f32_diff)
     # MSE metric
     f32_mean_squared_error = np.mean(np.square(f32_diff))
     # MAE metric
     f32_max_abs_error = np.amax(f32_abs_diff)
     # MaxULP metric
-    f32_ulp_error = f32_diff / np.spacing(np.abs(target).astype(np.float32))
+    f32_ulp_error = f32_diff / np.spacing(np.abs(f32_target_dense).astype(np.float32))
     f32_abs_ulp_error = np.abs(f32_ulp_error)
     f32_max_ulp_error = np.amax(f32_abs_ulp_error)