@@ -979,21 +979,40 @@ Expr pow(Expr x, Expr y);
979
979
Expr erf (const Expr &x);
980
980
981
981
/* * Struct that allows the user to specify precision requirements for functions
982
- * that are approximated. These polynomials can be
983
- * optimized for four different metrics: Mean Squared Error, Maximum Absolute Error,
984
- * Maximum Units in Last Place (ULP) Error, or a 50%/50% blend of MAE and MULPE.
985
- *
986
- * Orthogonally to the optimization objective, these polynomials can vary
987
- * in degree. Higher degree polynomials will give more precise results.
988
- * Note that instead of specifying the degree, the number of terms is used instead.
989
- * E.g., even (i.e., symmetric) functions may be implemented using only even powers,
990
- * for which a number of terms of 4 would actually mean that terms
991
- * in [1, x^2, x^4, x^6] are used, which is degree 6.
992
- *
993
- * Additionally, if you don't care about number of terms in the polynomial
994
- * and you do care about the maximal absolute error the approximation may have
995
- * over the domain, you may specify values and the implementation
996
- * will decide the appropriate polynomial degree that achieves this precision.
982
+ * that are approximated. Several functions can be approximated using specialized
983
+ * hardware instructions. If no hardware instructions are available, approximations
984
+ * are implemented in Halide using polynomials or potentially Padé approximants.
985
+ * Both the hardware instructions and the in-house approximations have a certain behavior
986
+ * and precision. This struct allows you to specify which behavior and precision you
987
+ * are interested in. Halide will select an appropriate implementation that satisfies
988
+ * these requirements.
989
+ *
990
+ * There are two main aspects of specifying the precision:
991
+ * 1. The objective for which the approximation is optimized. This can be to reduce the
992
+ * maximal absolute error (MAE), or to reduce the maximal error measured in
993
+ * units in last place (ULP). Some applications tend to naturally require low
994
+ * absolute error, whereas others might favor low relative error (for which maximal ULP
995
+ * error is a good metric).
996
+ * 2. The minimal required precision in either MAE, or MULPE.
997
+ *
998
+ * Both of these parameters are optional:
999
+ *
1000
+ * - When omitting the optimization objective (i.e., AUTO), Halide is free to pick any
1001
+ * implementation that satisfies the precision requirement. Sometimes, hardware instructions
1002
+ * have vendor-specific behavior (one vendor might optimize MAE, another might optimize
1003
+ * MULPE), so requiring a specific behavior might rule out the ability to use the hardware
1004
+ * instruction if it doesn't behave the way requested. When polynomial approximations are
1005
+ * selected, and AUTO is requested, Halide will pick a sensible optimization objective for
1006
+ * each function.
1007
+ * - When omitting the precision requirements (both \ref constraint_max_ulp_error and
1008
+ * \ref constraint_max_absolute_error), Halide will try to favor hardware instructions
1009
+ * when available in order to favor speed. Otherwise, Halide will select a polynomial with
1010
+ * reasonable precision.
1011
+ *
1012
+ * The default-initialized ApproximationPrecision consists of AUTO-behavior, and default-precision.
1013
+ * In general, when only approximate values are required without hard requirements on their
1014
+ * precision, calling any of the fast_-version functions without specifying the ApproximationPrecision
1015
+ * struct is fine, and will most likely get you the fastest implementation possible.
997
1016
*/
998
1017
struct ApproximationPrecision {
999
1018
enum OptimizationObjective {
@@ -1067,45 +1086,50 @@ Expr fast_atan2(const Expr &y, const Expr &x, ApproximationPrecision = {});
1067
1086
1068
1087
/* * Fast approximate log for Float(32).
1069
1088
* Returns nonsense for x <= 0.0f.
1070
- * Accurate up to the last 5 bits of the mantissa .
1089
+ * Approximation available up to Max 5 ULP, Mean 2 ULP.
1071
1090
* Vectorizes cleanly when using polynomials.
1072
1091
* Slow on x86 if you don't have at least sse 4.1.
1073
1092
* On NVIDIA CUDA: default-precision maps to a combination of lg2.approx.f32 and a multiplication.
1093
+ * See \ref ApproximationPrecision for details on specifying precision.
1074
1094
*/
1075
1095
Expr fast_log (const Expr &x, ApproximationPrecision precision = {});
1076
1096
1077
1097
/* * Fast approximate exp for Float(32).
1078
1098
* Returns nonsense for inputs that would overflow.
1079
- * Typically accurate up to the last 5 bits of the mantissa.
1080
- * Approximation
1099
+ * Approximation available up to Max 3 ULP, Mean 1 ULP.
1081
1100
* Vectorizes cleanly when using polynomials.
1082
1101
* Slow on x86 if you don't have at least sse 4.1.
1083
1102
* On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and a multiplication.
1103
+ * See \ref ApproximationPrecision for details on specifying precision.
1084
1104
*/
1085
1105
Expr fast_exp (const Expr &x, ApproximationPrecision precision = {});
1086
1106
1087
1107
/* * Fast approximate pow for Float(32).
1088
1108
* Returns nonsense for x < 0.0f.
1089
- * Accurate up to the last 5 bits of the mantissa for typical exponents.
1109
+ * Returns 1 when x == y == 0.0.
1110
+ * Approximations accurate up to Max 53 ULPs, Mean 13 ULPs.
1090
1111
* Gets worse when approaching overflow.
1091
1112
* Vectorizes cleanly when using polynomials.
1092
1113
* Slow on x86 if you don't have at least sse 4.1.
1093
1114
* On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and lg2.approx.f32.
1115
+ * See \ref ApproximationPrecision for details on specifying precision.
1094
1116
*/
1095
- Expr fast_pow (Expr x, Expr y, ApproximationPrecision precision = {});
1117
+ Expr fast_pow (const Expr & x, const Expr & y, ApproximationPrecision precision = {});
1096
1118
1097
1119
/* * Fast approximate tanh for Float(32).
1098
- * Vectorizes cleanly when using polynomials (caveat: no polynomial approximation implemented yet).
1120
+ * Approximations accurate to 2e-7 MAE, and Max 2500 ULPs (on average < 1 ULP) available.
1121
+ * Vectorizes cleanly when using polynomials.
1099
1122
* Slow on x86 if you don't have at least sse 4.1.
1100
1123
* On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and lg2.approx.f32.
1124
+ * See \ref ApproximationPrecision for details on specifying precision.
1101
1125
*/
1102
1126
Expr fast_tanh (const Expr &x, ApproximationPrecision precision = {});
1103
1127
1104
1128
/* * Fast approximate inverse for Float(32). Corresponds to the rcpps
1105
- * instruction on x86, and the vrecpe instruction on ARM. Vectorizes
1106
- * cleanly. Note that this can produce slightly different results
1107
- * across different implementations of the same architecture (e.g. AMD vs Intel),
1108
- * even when strict_float is enabled. */
1129
+ * instruction on x86, the vrecpe instruction on ARM, and the rcp.approx.f32 instruction on CUDA.
1130
+ * Vectorizes cleanly.
1131
+ * Note that this can produce slightly different results across different implementations
1132
+ * of the same architecture (e.g. AMD vs Intel), even when strict_float is enabled. */
1109
1133
Expr fast_inverse (Expr x);
1110
1134
1111
1135
/* * Fast approximate inverse square root for Float(32). Corresponds to
0 commit comments