Make use of the new strict_float intrinsics for the fast math functions.

mcourteaux · mcourteaux · commit 58bf5235c41c · 2025-06-14T12:26:39.000+02:00
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -408,7 +408,7 @@ void CodeGen_LLVM::init_codegen(const std::string &name) {
     module->addModuleFlag(llvm::Module::Warning, "halide_mabi", MDString::get(*context, mabi()));
     module->addModuleFlag(llvm::Module::Warning, "halide_use_pic", use_pic() ? 1 : 0);
     module->addModuleFlag(llvm::Module::Warning, "halide_use_large_code_model", llvm_large_code_model ? 1 : 0);
-    module->addModuleFlag(llvm::Module::Warning, "halide_per_instruction_fast_math_flags", any_strict_float);
+    module->addModuleFlag(llvm::Module::Warning, "halide_per_instruction_fast_math_flags", any_strict_float ? 1 : 0);
     if (effective_vscale != 0) {
         module->addModuleFlag(llvm::Module::Warning, "halide_effective_vscale", effective_vscale);
     }
@@ -498,6 +498,7 @@ CodeGen_LLVM::ScopedFastMath::~ScopedFastMath() {
 
 std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
     any_strict_float = input.any_strict_float();
+    debug(2) << "Module: any_strict_float = " << any_strict_float << "\n";
 
     init_codegen(input.name());
 
diff --git a/src/FastMathFunctions.cpp b/src/FastMathFunctions.cpp
@@ -97,16 +97,15 @@ Expr eval_poly_horner(const std::vector<double> &coefs, const Expr &x) {
 }
 
 inline std::pair<Expr, Expr> two_sum(const Expr &a, const Expr &b) {
-    // TODO(mcourteaux): replace with proper strict_float intrinsic ops.
-    Expr x = strict_float(a + b);
-    Expr z = strict_float(x - a);
-    Expr y = strict_float(strict_float(a - strict_float(x - z)) + strict_float(b - z));
+    Expr x = strict_add(a, b);
+    Expr z = strict_sub(x, a);
+    Expr y = strict_add(strict_sub(a, strict_sub(x, z)), strict_sub(b, z));
     return {x, y};
 }
 
 inline std::pair<Expr, Expr> two_prod(const Expr &a, const Expr &b) {
-    // TODO(mcourteaux): replace with proper strict_float intrinsic ops.
-    Expr x = strict_float(a * b);
+    Expr x = strict_mul(a, b);
+    // TODO(mcourteaux): replace with proper strict_float fma intrinsic op.
     Expr y = (a * b - x);  // No strict float, so let's hope it gets compiled as FMA.
     return {x, y};
 }
@@ -176,8 +175,7 @@ Expr fast_sin(const Expr &x_full, ApproximationPrecision precision) {
     Expr pi_over_two_minus_x = make_const(type, PI_OVER_TWO) - x;
     if (type == Float(32) && precision.optimized_for == ApproximationPrecision::MULPE) {
         auto [hi, lo] = split_float(PI_OVER_TWO);
-        // TODO(mcourteaux): replace with proper strict_float intrinsic ops.
-        pi_over_two_minus_x = strict_float(make_const(type, hi) - x) + make_const(type, lo);
+        pi_over_two_minus_x = strict_sub(make_const(type, hi), x) + make_const(type, lo);
     }
     x = select(mirror, pi_over_two_minus_x, x);
 
@@ -210,7 +208,7 @@ Expr fast_cos(const Expr &x_full, ApproximationPrecision precision) {
     if (type == Float(32) && precision.optimized_for == ApproximationPrecision::MULPE) {
         auto [hi, lo] = split_float(PI_OVER_TWO);
         // TODO(mcourteaux): replace with proper strict_float intrinsic ops.
-        pi_over_two_minus_x = strict_float(strict_float(make_const(type, hi) - x) + make_const(type, lo));
+        pi_over_two_minus_x = strict_add(strict_sub(make_const(type, hi), x), make_const(type, lo));
     }
     x = select(mirror, pi_over_two_minus_x, x);
 
@@ -238,8 +236,7 @@ Expr fast_tan(const Expr &x_full, ApproximationPrecision precision) {
     Expr x = x_full - k_real * make_const(type, PI);
     if (type == Float(32) && precision.optimized_for == ApproximationPrecision::MULPE) {
         auto [pi_hi, pi_lo] = split_float(PI);
-        // TODO(mcourteaux): replace with proper strict_float intrinsic ops.
-        x = strict_float(strict_float(x_full - k_real * make_const(type, pi_hi)) - (k_real * make_const(type, pi_lo)));
+        x = strict_sub((x_full - k_real * make_const(type, pi_hi)), (k_real * make_const(type, pi_lo)));
     }
 
     // When polynomial: x is assumed to be reduced to [-pi/2, pi/2]!
@@ -250,11 +247,11 @@ Expr fast_tan(const Expr &x_full, ApproximationPrecision precision) {
     Expr use_cotan = abs_x > make_const(type, PI / 4.0);
     Expr pi_over_two_minus_abs_x;
     if (type == Float(64)) {
+        // TODO(mcourteaux): We could do split floats here too.
         pi_over_two_minus_abs_x = make_const(type, PI_OVER_TWO) - abs_x;
     } else if (type == Float(32)) {  // We want to do this trick always, because we invert later.
         auto [hi, lo] = split_float(PI_OVER_TWO);
-        // TODO(mcourteaux): replace with proper strict_float intrinsic ops.
-        pi_over_two_minus_abs_x = strict_float(make_const(type, hi) - abs_x) + make_const(type, lo);
+        pi_over_two_minus_abs_x = strict_sub(make_const(type, hi), abs_x) + make_const(type, lo);
     }
     Expr arg = select(use_cotan, pi_over_two_minus_abs_x, abs_x);
 
diff --git a/src/IROperator.cpp b/src/IROperator.cpp
@@ -2670,6 +2670,29 @@ Expr strict_float(const Expr &e) {
     return strictify_float(e);
 }
 
+inline Expr strict_float_op(const Expr &a, const Expr &b, Call::IntrinsicOp op) {
+    user_assert(a.type() == b.type()) << "strict_float ops should be done on equal types.";
+    user_assert(a.type().is_float()) << "strict_float ops should be done on floating point types.";
+    return Call::make(a.type(), op, {a, b}, Call::CallType::PureIntrinsic);
+}
+
+#define impl_strict_op(x)                                    \
+    Expr strict_##x(const Expr &a, const Expr &b) {     \
+        return strict_float_op(a, b, Call::strict_##x); \
+    }
+
+impl_strict_op(add);
+impl_strict_op(sub);
+impl_strict_op(div);
+impl_strict_op(mul);
+impl_strict_op(max);
+impl_strict_op(min);
+impl_strict_op(eq);
+impl_strict_op(le);
+impl_strict_op(lt);
+
+#undef impl_strict_op
+
 Expr undef(Type t) {
     return Call::make(t, Call::undef,
                       std::vector<Expr>(),
diff --git a/src/IROperator.h b/src/IROperator.h
@@ -1578,6 +1578,22 @@ Expr saturating_cast(Type t, Expr e);
  * generated code. */
 Expr strict_float(const Expr &e);
 
+/**
+ * Helper functions that build the strict-float variants of the
+ * basic floating point operations (arithmetic, min/max, comparisons).
+ */
+/// @{
+Expr strict_add(const Expr &a, const Expr &b);
+Expr strict_sub(const Expr &a, const Expr &b);
+Expr strict_mul(const Expr &a, const Expr &b);
+Expr strict_div(const Expr &a, const Expr &b);
+Expr strict_max(const Expr &a, const Expr &b);
+Expr strict_min(const Expr &a, const Expr &b);
+Expr strict_eq(const Expr &a, const Expr &b);
+Expr strict_le(const Expr &a, const Expr &b);
+Expr strict_lt(const Expr &a, const Expr &b);
+/// @}
+
 /** Create an Expr that that promises another Expr is clamped but do
  * not generate code to check the assertion or modify the value. No
  * attempt is made to prove the bound at compile time. (If it is
diff --git a/src/Lower.cpp b/src/Lower.cpp
@@ -148,8 +148,8 @@ void lower_impl(const vector<Function> &output_funcs,
 
     lower_target_query_ops(env, t);
 
-    bool any_strict_float = strictify_float(env, t);
-    result_module.set_any_strict_float(any_strict_float);
+    bool has_any_strict_float = strictify_float(env, t);
+    result_module.set_any_strict_float(has_any_strict_float);
 
     // Output functions should all be computed and stored at root.
     for (const Function &f : outputs) {
@@ -333,6 +333,13 @@ void lower_impl(const vector<Function> &output_funcs,
     debug(1) << "Selecting fast math function implementations...\n";
     s = lower_fast_math_functions(s, t);
     log("Lowering after selecting fast math functions:", s);
+    if (!has_any_strict_float) {
+        has_any_strict_float = any_strict_float(s);
+        if (has_any_strict_float) {
+            debug(2) << "Detected strict_float ops after selecting fast math functions.\n";
+            result_module.set_any_strict_float(has_any_strict_float);
+        }
+    }
 
     debug(1) << "Simplifying...\n";
     s = simplify(s);
diff --git a/src/StrictifyFloat.cpp b/src/StrictifyFloat.cpp
@@ -164,5 +164,17 @@ bool strictify_float(std::map<std::string, Function> &env, const Target &t) {
     return checker.any_strict || t.has_feature(Target::StrictFloat);
 }
 
+bool any_strict_float(const Stmt &s) {
+    AnyStrictIntrinsics c;
+    s.accept(&c);
+    return c.any_strict;
+}
+
+bool any_strict_float(const Expr &e) {
+    AnyStrictIntrinsics c;
+    e.accept(&c);
+    return c.any_strict;
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/StrictifyFloat.h b/src/StrictifyFloat.h
@@ -12,6 +12,7 @@ namespace Halide {
 
 struct Target;
 struct Expr;
+struct Stmt;
 
 namespace Internal {
 
@@ -33,6 +34,12 @@ Expr unstrictify_float(const Call *op);
  * strictness). */
 bool strictify_float(std::map<std::string, Function> &env, const Target &t);
 
+/** Checks the passed Stmt for the presence of any strict_float ops. */
+bool any_strict_float(const Stmt &s);
+
+/** Checks the passed Expr for the presence of any strict_float ops. */
+bool any_strict_float(const Expr &s);
+
 }  // namespace Internal
 }  // namespace Halide
 
diff --git a/test/correctness/fast_function_approximations.cpp b/test/correctness/fast_function_approximations.cpp
@@ -111,7 +111,7 @@ constexpr RangedAccuracyTest::Validation rlx_abs_val = {1.02, 1e-7};
 constexpr RangedAccuracyTest::Validation vrlx_abs_val = {1.1, 1e-6};
 constexpr RangedAccuracyTest::Validation rsnbl_abs_val = {2.0, 1e-5};
 constexpr RangedAccuracyTest::Validation rlx_abs_val_pct(double pct) {
-    return {1.0 + 100 * pct, 1e-7};
+    return {1.0 + 0.01 * pct, 1e-7};
 }
 constexpr RangedAccuracyTest::Validation max_abs_val(double max_val) {
     return {0.0f, max_val};
@@ -171,7 +171,7 @@ struct FunctionToTest {
         [](Expr x, Expr y, Halide::ApproximationPrecision prec) { return Halide::fast_atan2(x, y, prec); },
         Halide::Internal::ApproximationTables::best_atan_approximation,
         {
-            { "precise" , {{ -10.0f, 10.0f}, {-10.0f, 10.0f}}, rlx_abs_val_pct(4), rlx_abs_val, rlx_ulp_val, rlx_ulp_val, 70, 30 },
+            { "precise" , {{ -10.0f, 10.0f}, {-10.0f, 10.0f}}, rlx_abs_val_pct(6), rlx_abs_val, rlx_ulp_val, rlx_ulp_val, 70, 30 },
         }
     },
     {
@@ -385,7 +385,7 @@ int main(int argc, char **argv) {
     Buffer<float, 1> out_ref{steps * steps};
     Buffer<float, 1> out_approx{steps * steps};
 
-    bool target_has_proper_strict_float_support = !target.has_gpu_feature();
+    bool target_has_proper_strict_float_support = !target.has_gpu_feature() || target.has_feature(Target::CUDA);
 
     double best_mae_for_backend = 0.0;
     if (target.has_feature(Halide::Target::Vulkan)) {
@@ -528,7 +528,7 @@ int main(int argc, char **argv) {
                         .vectorize(ii, 4);
                     // TODO(mcourteaux): When vector legalization lowering pass is in, increase vectorize for testing.
                 } else {
-                    approx_func.vectorize(i, 8);
+                    approx_func.vectorize(i, target.natural_vector_size<float>());
                 }
                 approx_func.realize(out_approx);
                 if (emit_asm) {