
Commit f0357dc

Fix API documentation. Improve measuring accuracy. Fix vector_math test not touching input: prevents constant folding.
Parent: 41d072c

5 files changed: +74 −46 lines

src/IROperator.cpp

Lines changed: 2 additions & 6 deletions
@@ -1380,14 +1380,10 @@ Expr fast_log(const Expr &x, ApproximationPrecision prec) {
     return Call::make(x.type(), Call::fast_log, {x, make_approximation_precision_info(prec)}, Call::PureIntrinsic);
 }
 
-Expr fast_pow(Expr x, Expr y, ApproximationPrecision prec) {
+Expr fast_pow(const Expr &x, const Expr &y, ApproximationPrecision prec) {
     if (auto i = as_const_int(y)) {
-        return raise_to_integer_power(std::move(x), *i);
+        return raise_to_integer_power(x, *i);
     }
-
-    // TODO: figure out what to do with these casts...
-    x = cast<float>(std::move(x));
-    y = cast<float>(std::move(y));
     return Call::make(x.type(), Call::fast_pow, {x, y, make_approximation_precision_info(prec)}, Call::PureIntrinsic);
 }
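
Aside (not part of this commit): the constant-integer branch kept above means a literal integral exponent never reaches the fast_pow intrinsic; it is folded into plain multiplications via raise_to_integer_power. A minimal front-end sketch of both paths, where the cast to float and the compile_jit() calls are assumptions rather than code from this commit:

    // Sketch only: illustrates the dispatch in fast_pow() above.
    #include "Halide.h"
    using namespace Halide;

    int main() {
        Var x;
        Func f, g;
        // Integer exponent: taken by the as_const_int(y) branch and lowered
        // to raise_to_integer_power, i.e. x * x * x.
        f(x) = fast_pow(cast<float>(x), 3);
        // Non-integer exponent: remains a Call::fast_pow intrinsic and is
        // approximated later (polynomial or hardware instruction).
        g(x) = fast_pow(cast<float>(x), 2.5f);
        f.compile_jit();
        g.compile_jit();
        return 0;
    }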

src/IROperator.h

Lines changed: 49 additions & 25 deletions
@@ -979,21 +979,40 @@ Expr pow(Expr x, Expr y);
 Expr erf(const Expr &x);
 
 /** Struct that allows the user to specify precision requirements for functions
- * that are approximated. These polynomials can be
- * optimized for four different metrics: Mean Squared Error, Maximum Absolute Error,
- * Maximum Units in Last Place (ULP) Error, or a 50%/50% blend of MAE and MULPE.
- *
- * Orthogonally to the optimization objective, these polynomials can vary
- * in degree. Higher degree polynomials will give more precise results.
- * Note that instead of specifying the degree, the number of terms is used instead.
- * E.g., even (i.e., symmetric) functions may be implemented using only even powers,
- * for which a number of terms of 4 would actually mean that terms
- * in [1, x^2, x^4, x^6] are used, which is degree 6.
- *
- * Additionally, if you don't care about number of terms in the polynomial
- * and you do care about the maximal absolute error the approximation may have
- * over the domain, you may specify values and the implementation
- * will decide the appropriate polynomial degree that achieves this precision.
+ * that are approximated. Several functions can be approximated using specialized
+ * hardware instructions. If no hardware instructions are available, approximations
+ * are implemented in Halide using polynomials or potentially Padé approximants.
+ * Both the hardware instructions and the in-house approximations have a certain behavior
+ * and precision. This struct allows you to specify which behavior and precision you
+ * are interested in. Halide will select an appropriate implementation that satisfies
+ * these requirements.
+ *
+ * There are two main aspects to specifying the precision:
+ *  1. The objective for which the approximation is optimized. This can be to reduce the
+ *     maximal absolute error (MAE), or to reduce the maximal error measured in
+ *     units in last place (ULP). Some applications naturally require low
+ *     absolute error, whereas others favor low relative error (for which maximal ULP
+ *     error is a good metric).
+ *  2. The minimal required precision, in either MAE or MULPE.
+ *
+ * Both of these parameters are optional:
+ *
+ * - When omitting the optimization objective (i.e., AUTO), Halide is free to pick any
+ *   implementation that satisfies the precision requirement. Sometimes, hardware instructions
+ *   have vendor-specific behavior (one vendor might optimize MAE, another might optimize
+ *   MULPE), so requiring a specific behavior might rule out the ability to use the hardware
+ *   instruction if it doesn't behave the way requested. When polynomial approximations are
+ *   selected and AUTO is requested, Halide will pick a sensible optimization objective for
+ *   each function.
+ * - When omitting the precision requirements (both \ref constraint_max_ulp_error and
+ *   \ref constraint_max_absolute_error), Halide will try to favor hardware instructions
+ *   when available, in order to favor speed. Otherwise, Halide will select a polynomial with
+ *   reasonable precision.
+ *
+ * The default-initialized ApproximationPrecision consists of AUTO behavior and default precision.
+ * In general, when only approximate values are required without hard requirements on their
+ * precision, calling any of the fast_ variants without specifying the ApproximationPrecision
+ * struct is fine, and will most likely get you the fastest implementation possible.
  */
 struct ApproximationPrecision {
     enum OptimizationObjective {
@@ -1067,45 +1086,50 @@ Expr fast_atan2(const Expr &y, const Expr &x, ApproximationPrecision = {});
 
 /** Fast approximate log for Float(32).
  * Returns nonsense for x <= 0.0f.
- * Accurate up to the last 5 bits of the mantissa.
+ * Approximations available up to max 5 ULP, mean 2 ULP error.
  * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of lg2.approx.f32 and a multiplication.
+ * See \ref ApproximationPrecision for details on specifying precision.
  */
 Expr fast_log(const Expr &x, ApproximationPrecision precision = {});
 
 /** Fast approximate exp for Float(32).
  * Returns nonsense for inputs that would overflow.
- * Typically accurate up to the last 5 bits of the mantissa.
- * Approximation
+ * Approximations available up to max 3 ULP, mean 1 ULP error.
  * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and a multiplication.
+ * See \ref ApproximationPrecision for details on specifying precision.
  */
 Expr fast_exp(const Expr &x, ApproximationPrecision precision = {});
 
 /** Fast approximate pow for Float(32).
  * Returns nonsense for x < 0.0f.
- * Accurate up to the last 5 bits of the mantissa for typical exponents.
+ * Returns 1 when x == y == 0.0.
+ * Approximations accurate up to max 53 ULP, mean 13 ULP.
  * Gets worse when approaching overflow.
  * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and lg2.approx.f32.
+ * See \ref ApproximationPrecision for details on specifying precision.
  */
-Expr fast_pow(Expr x, Expr y, ApproximationPrecision precision = {});
+Expr fast_pow(const Expr &x, const Expr &y, ApproximationPrecision precision = {});
 
 /** Fast approximate tanh for Float(32).
- * Vectorizes cleanly when using polynomials (caveat: no polynomial approximation implemented yet).
+ * Approximations accurate to 2e-7 MAE and max 2500 ULP (on average < 1 ULP) available.
+ * Vectorizes cleanly when using polynomials.
  * Slow on x86 if you don't have at least sse 4.1.
  * On NVIDIA CUDA: default-precision maps to a combination of ex2.approx.f32 and lg2.approx.f32.
+ * See \ref ApproximationPrecision for details on specifying precision.
  */
 Expr fast_tanh(const Expr &x, ApproximationPrecision precision = {});
 
 /** Fast approximate inverse for Float(32). Corresponds to the rcpps
- * instruction on x86, and the vrecpe instruction on ARM. Vectorizes
- * cleanly. Note that this can produce slightly different results
- * across different implementations of the same architecture (e.g. AMD vs Intel),
- * even when strict_float is enabled. */
+ * instruction on x86, the vrecpe instruction on ARM, and the rcp.approx.f32 instruction on CUDA.
+ * Vectorizes cleanly.
+ * Note that this can produce slightly different results across different implementations
+ * of the same architecture (e.g. AMD vs Intel), even when strict_float is enabled. */
 Expr fast_inverse(Expr x);
 
 /** Fast approximate inverse square root for Float(32). Corresponds to
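
To make the new documentation concrete, here is a small usage sketch (not part of this commit). The field referenced by \ref constraint_max_ulp_error comes from the doc comment above; the exact spelling of the objective member and the MULPE enumerator are assumptions about the struct's layout:

    // Hedged sketch: requesting precision from the fast_* intrinsics
    // documented above. Members marked "assumed" are illustrative only.
    #include "Halide.h"
    using namespace Halide;

    int main() {
        ImageParam input(Float(32), 2);
        Var x, y;

        // Default-initialized precision: AUTO objective, default precision.
        // Halide may pick a hardware instruction (e.g. lg2.approx.f32 on CUDA).
        Expr cheap = fast_log(input(x, y));

        // Ask for a ULP-optimized approximation within 5 ULP, matching the
        // "max 5 ULP" figure quoted for fast_log.
        ApproximationPrecision tight;
        tight.optimized_for = ApproximationPrecision::MULPE;  // assumed member/enumerator names
        tight.constraint_max_ulp_error = 5;                   // field referenced in the doc comment
        Expr precise = fast_log(input(x, y), tight);

        Func out;
        out(x, y) = cheap + precise;
        out.compile_jit();
        return 0;
    }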

src/runtime/ptx_dev.ll

Lines changed: 0 additions & 1 deletion
@@ -54,7 +54,6 @@ define weak_odr double @sqrt_f64(double %x) nounwind uwtable readnone alwaysinli
 declare float @__nv_frcp_rn(float) nounwind readnone
 
 define weak_odr float @fast_inverse_f32(float %x) nounwind uwtable readnone alwaysinline {
-  ; %y = tail call float @__nv_frcp_rn(float %x) nounwind readnone
   %y = call float asm "rcp.approx.f32 $0, $1;", "=f,f" (float %x)
   ret float %y
 }

test/correctness/vector_math.cpp

Lines changed: 10 additions & 10 deletions
@@ -526,8 +526,8 @@ bool test(int lanes, int seed) {
     if (type_of<A>() == Float(32)) {
         if (verbose) printf("Fast transcendentals\n");
         Buffer<float> im15, im16, im17, im18, im19, im20;
-        Expr a = input(x, y) * 0.5f;
-        Expr b = input((x + 1) % W, y) * 0.5f;
+        Expr a = input(x, y);
+        Expr b = input((x + 1) % W, y);
         {
             Func f15;
             f15(x, y) = log(a);
@@ -568,8 +568,8 @@ bool test(int lanes, int seed) {
 
         for (int y = 0; y < H; y++) {
             for (int x = 0; x < W; x++) {
-                float a = float(input(x, y)) * 0.5f;
-                float b = float(input((x + 1) % W, y)) * 0.5f;
+                float a = float(input(x, y));
+                float b = float(input((x + 1) % W, y));
                 float correct_log = logf(a);
                 float correct_exp = expf(b);
                 float correct_pow = powf(a, b / 16.0f);
@@ -626,16 +626,16 @@ bool test(int lanes, int seed) {
                            a, b / 16.0f, im17(x, y), correct_pow, correct_pow_mantissa, pow_mantissa);
                 }
                 if (std::isfinite(correct_log) && fast_log_mantissa_error > 64) {
-                    printf("fast_log(%f) = %1.10f instead of %1.10f (mantissa: %d vs %d)\n",
-                           a, im18(x, y), correct_log, correct_log_mantissa, fast_log_mantissa);
+                    printf("fast_log(%f) = %1.10f instead of %1.10f (mantissa: %d vs %d ; error %d)\n",
+                           a, im18(x, y), correct_log, correct_log_mantissa, fast_log_mantissa, fast_log_mantissa_error);
                 }
                 if (std::isfinite(correct_exp) && fast_exp_mantissa_error > 64) {
-                    printf("fast_exp(%f) = %1.10f instead of %1.10f (mantissa: %d vs %d)\n",
-                           b, im19(x, y), correct_exp, correct_exp_mantissa, fast_exp_mantissa);
+                    printf("fast_exp(%f) = %1.10f instead of %1.10f (mantissa: %d vs %d ; error %d)\n",
+                           b, im19(x, y), correct_exp, correct_exp_mantissa, fast_exp_mantissa, fast_exp_mantissa_error);
                 }
                 if (a >= 0 && std::isfinite(correct_pow) && fast_pow_mantissa_error > 128) {
-                    printf("fast_pow(%f, %f) = %1.10f instead of %1.10f (mantissa: %d vs %d)\n",
-                           a, b / 16.0f, im20(x, y), correct_pow, correct_pow_mantissa, fast_pow_mantissa);
+                    printf("fast_pow(%f, %f) = %1.10f instead of %1.10f (mantissa: %d vs %d ; error %d)\n",
+                           a, b / 16.0f, im20(x, y), correct_pow, correct_pow_mantissa, fast_pow_mantissa, fast_pow_mantissa_error);
                 }
             }
         }
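
The thresholds above (64 for fast_log/fast_exp, 128 for fast_pow) count error in trailing bits of the float32 mantissa. The helper the test uses is outside this diff; a standalone sketch of that style of measurement, comparing IEEE-754 bit patterns directly, might look like this:

    // Sketch only: not the helper from vector_math.cpp (that code is outside
    // this diff). For finite floats of the same sign, the difference of the
    // raw bit patterns equals the distance in ULPs.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    static int32_t mantissa_error(float approx, float correct) {
        int32_t a_bits, c_bits;
        std::memcpy(&a_bits, &approx, sizeof(float));
        std::memcpy(&c_bits, &correct, sizeof(float));
        return std::abs(a_bits - c_bits);
    }

    int main() {
        float x = 2.37f;
        float correct = std::log(x);
        float approx = correct * (1.0f + 3e-6f);  // stand-in for a fast_log result
        // A tolerance of 64 ULPs corresponds to roughly the last 6 mantissa bits.
        std::printf("error = %d ULPs\n", mantissa_error(approx, correct));
        return 0;
    }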

tools/polynomial_optimizer.py

Lines changed: 13 additions & 4 deletions
@@ -115,6 +115,12 @@ def optimize_approximation(loss, order):
    print("Unknown function:", args.func)
    exit(1)
 
+X_dense = np.linspace(lower, upper, 512 * 31 * 11)
+if lower >= 0.0:
+    loglow = -5.0 if lower == 0.0 else np.log(lower)
+    X_dense = np.concatenate([X_dense, np.logspace(loglow, np.log(upper), num=2048 * 17)])
+    X_dense = np.sort(X_dense)
+
 
 if X is None: X = np.linspace(lower, upper, 512 * 31)
 target = func(X)
@@ -203,16 +209,19 @@ def optimize_approximation(loss, order):
     float64_metrics = Metrics(mean_squared_error, max_abs_error, max_ulp_error)
 
     # Reevaluate with float32 precision.
-    f32_powers = np.power(X[:,None].astype(np.float32), exponents).astype(np.float32)
-    f32_y_hat = fixed_part.astype(np.float32) + np.sum((f32_powers * coeffs.astype(np.float32))[:,::-1], axis=-1).astype(np.float32)
-    f32_diff = f32_y_hat - target.astype(np.float32)
+    f32_x_dense = X_dense.astype(np.float32)
+    f32_target_dense = func(f32_x_dense).astype(np.float32)
+    f32_fixed_part_dense = func_fixed_part(f32_x_dense)
+    f32_powers = np.power(f32_x_dense[:,None], exponents).astype(np.float32)
+    f32_y_hat = f32_fixed_part_dense.astype(np.float32) + np.sum((f32_powers * coeffs.astype(np.float32))[:,::-1], axis=-1).astype(np.float32)
+    f32_diff = f32_y_hat - f32_target_dense.astype(np.float32)
     f32_abs_diff = np.abs(f32_diff)
     # MSE metric
     f32_mean_squared_error = np.mean(np.square(f32_diff))
     # MAE metric
     f32_max_abs_error = np.amax(f32_abs_diff)
     # MaxULP metric
-    f32_ulp_error = f32_diff / np.spacing(np.abs(target).astype(np.float32))
+    f32_ulp_error = f32_diff / np.spacing(np.abs(f32_target_dense).astype(np.float32))
     f32_abs_ulp_error = np.abs(f32_ulp_error)
     f32_max_ulp_error = np.amax(f32_abs_ulp_error)