Merge branch 'main' into abadams/fix_x86_transpose

mcourteaux · mcourteaux · commit 23b79bace652 · 2026-02-01T12:43:13.000+01:00
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
@@ -2104,12 +2104,8 @@ void CodeGen_ARM::visit(const Call *op) {
         // llvm's roundeven intrinsic reliably lowers to the correct
         // instructions on aarch64, but despite having the same instruction
         // available, it doesn't seem to work for arm-32.
-        if (target.bits == 64) {
-            value = call_overloaded_intrin(op->type, "round", op->args);
-            if (value) {
-                return;
-            }
-        } else {
+        if (target.bits == 32) {
+            // So on arm-32, use the explicit lowering instead, which stays vectorizable.
             value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
             return;
         }
diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
@@ -1276,7 +1276,7 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
                 Value *packed = call_intrin_cast(
                     native2_ty,
                     INTRINSIC_128B(vdealvdd),
-                    {ab_i1, ab_i0, ConstantInt::get(i32_t, -element_bytes)});
+                    {ab_i1, ab_i0, ConstantInt::getSigned(i32_t, -element_bytes)});
                 llvm::Intrinsic::ID intrin = start == 0 ? INTRINSIC_128B(lo) : INTRINSIC_128B(hi);
                 ret_i = call_intrin_cast(native_ty, intrin, {packed});
             } else {
@@ -1589,6 +1589,8 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) {
         if (generate_vdelta(indices, reverse, switches)) {
             vector<Constant *> control_elements(switches.size());
             for (int i = 0; i < (int)switches.size(); i++) {
+                internal_assert(switches[i] >= 0 && switches[i] <= 255)
+                    << "vdelta switch value " << switches[i] << " doesn't fit in 8 bits\n";
                 control_elements[i] = ConstantInt::get(i8_t, switches[i]);
             }
             Value *control = ConstantVector::get(control_elements);
@@ -1606,7 +1608,7 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) {
 
 Value *CodeGen_Hexagon::create_vector(llvm::Type *ty, int val) {
     llvm::Type *scalar_ty = ty->getScalarType();
-    Constant *value = ConstantInt::get(scalar_ty, val);
+    Constant *value = ConstantInt::getSigned(scalar_ty, val);
     return get_splat(get_vector_num_elements(ty), value);
 }
 
@@ -1637,7 +1639,7 @@ Value *CodeGen_Hexagon::vlut(Value *lut, Value *idx, int min_index, int max_inde
         vector<Value *> indices;
         Value *replicate_val = ConstantInt::get(i8_t, replicate);
         for (int i = 0; i < replicate; i++) {
-            Value *pos = ConstantInt::get(idx16->getType(), i);
+            Value *pos = get_splat(idx16_elems, ConstantInt::get(i16_t, i));
             indices.emplace_back(call_intrin(idx16->getType(),
                                              "halide.hexagon.add_mul.vh.vh.b",
                                              {pos, idx16, replicate_val}));
@@ -1719,7 +1721,7 @@ Value *CodeGen_Hexagon::vlut(Value *lut, const vector<int> &indices) {
             min_index = std::min(min_index, i);
             max_index = std::max(max_index, i);
         }
-        llvm_indices.push_back(ConstantInt::get(i16_t, i));
+        llvm_indices.push_back(ConstantInt::getSigned(i16_t, i));
     }
 
     // We use i16 indices because we can't support LUTs with more than
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -2855,7 +2855,11 @@ void CodeGen_LLVM::visit(const Call *op) {
             value = phi;
         }
     } else if (op->is_intrinsic(Call::round)) {
-        value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
+        value = call_overloaded_intrin(op->type, "roundeven", op->args);
+        if (!value) {
+            debug(2) << "llvm.roundeven intrinsic not available (Is FP16 enabled?). Using fallback instead.\n";
+            value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
+        }
     } else if (op->is_intrinsic(Call::require)) {
         internal_assert(op->args.size() == 3);
         Expr cond = op->args[0];
diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
@@ -585,13 +585,6 @@ void CodeGen_X86::visit(const Cast *op) {
 }
 
 void CodeGen_X86::visit(const Call *op) {
-    if (op->is_intrinsic(Call::round)) {
-        value = call_overloaded_intrin(op->type, "round", op->args);
-        if (value) {
-            return;
-        }
-    }
-
     if (!op->type.is_vector()) {
         // We only have peephole optimizations for vectors beyond this point.
         CodeGen_Posix::visit(op);
diff --git a/src/LLVM_Output.cpp b/src/LLVM_Output.cpp
@@ -351,6 +351,7 @@ std::unique_ptr<llvm::Module> clone_module(const llvm::Module &module_in) {
 
 void emit_file(const llvm::Module &module_in, Internal::LLVMOStream &out,
                llvm::CodeGenFileType file_type) {
+    // Callers should invoke this through Internal::run_with_large_stack().
     debug(1) << "emit_file.Compiling to native code...\n";
 #if LLVM_VERSION >= 210
     debug(2) << "Target triple: " << module_in.getTargetTriple().str() << "\n";
@@ -443,11 +444,15 @@ std::unique_ptr<llvm::Module> compile_module_to_llvm_module(const Module &module
 }
 
 void compile_llvm_module_to_object(llvm::Module &module, Internal::LLVMOStream &out) {
-    emit_file(module, out, llvm::CodeGenFileType::ObjectFile);
+    Internal::run_with_large_stack([&]() {
+        emit_file(module, out, llvm::CodeGenFileType::ObjectFile);
+    });
 }
 
 void compile_llvm_module_to_assembly(llvm::Module &module, Internal::LLVMOStream &out) {
-    emit_file(module, out, llvm::CodeGenFileType::AssemblyFile);
+    Internal::run_with_large_stack([&]() {
+        emit_file(module, out, llvm::CodeGenFileType::AssemblyFile);
+    });
 }
 
 void compile_llvm_module_to_llvm_bitcode(llvm::Module &module, Internal::LLVMOStream &out) {
diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp
@@ -425,16 +425,15 @@ std::optional<llvm::VersionTuple> get_os_version_constraint(const llvm::Triple &
         return std::nullopt;
     }
 
+    // These version constraints track the minimum deployment targets
+    // supported by the latest Xcode version, which is currently 26.2.
+    // See the table here: https://developer.apple.com/support/xcode/
     if (triple.isMacOSX() && triple.isX86()) {
-        // At time of writing (January 2025), this is one version prior
-        // to the oldest version still supported by Apple.
-        return llvm::VersionTuple(12, 0, 0);
+        return llvm::VersionTuple(11, 0, 0);
     }
 
     if (triple.isiOS()) {
-        // At time of writing (January 2025), this is one version prior
-        // to the oldest version still supported by Apple.
-        return llvm::VersionTuple(17, 0, 0);
+        return llvm::VersionTuple(15, 0, 0);
     }
 
     llvm::VersionTuple t = triple.getMinimumSupportedOSVersion();
diff --git a/src/PythonExtensionGen.cpp b/src/PythonExtensionGen.cpp
@@ -214,7 +214,7 @@ bool unpack_buffer(PyObject *py_obj,
         return false;
     }
 
-    memset(&halide_buf, 0, sizeof(halide_buf));
+    halide_buf = {};
     needs_device_free = true;
     if (!py_buf.format) {
         halide_buf.type.code = halide_type_uint;
diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h
@@ -164,14 +164,15 @@ class SimdOpCheckTest {
                                    int vector_width,
                                    const std::vector<Argument> &arg_types,
                                    std::ostringstream &error_msg) {
-        std::string fn_name = "test_" + name;
+        std::string fn_name = "test_" + name + "_vecwidth" + std::to_string(vector_width);
         std::string file_name = output_directory + fn_name;
 
         auto ext = Internal::get_output_info(target);
         std::map<OutputFileType, std::string> outputs = {
             {OutputFileType::c_header, file_name + ext.at(OutputFileType::c_header).extension},
             {OutputFileType::object, file_name + ext.at(OutputFileType::object).extension},
             {OutputFileType::assembly, file_name + ".s"},
+            {OutputFileType::llvm_assembly, file_name + ".ll"},
         };
         error.compile_to(outputs, arg_types, fn_name, target);
 
@@ -319,7 +320,7 @@ class SimdOpCheckTest {
         e.accept(&has_inline_reduction);
 
         // Define a vectorized Halide::Func that uses the pattern.
-        Halide::Func f(name);
+        Halide::Func f(name + "_vecwidth" + std::to_string(vector_width));
         f(x, y) = e;
         f.bound(x, 0, W).vectorize(x, vector_width);
         f.compute_root();
@@ -471,15 +472,18 @@ class SimdOpCheckTest {
         // settings.
         if (!wildcard_match(filter, op)) return;
 
-        tasks.emplace_back(Task{op, name, vector_width, e});
+        tasks.emplace_back(Task{op, name, vector_width, std::move(e)});
     }
     virtual void add_tests() = 0;
     virtual int image_param_alignment() {
         return 16;
     }
 
-    virtual bool use_multiple_threads() const {
-        return true;
+    int num_worker_threads() const {
+        if (std::string t = Halide::Internal::get_env_variable("HL_OP_CHECK_THREADS"); !t.empty()) {
+            return std::atoi(t.c_str());
+        }
+        return Halide::Tools::ThreadPool<void>::num_processors_online();
     }
 
     virtual bool test_all() {
@@ -492,10 +496,7 @@ class SimdOpCheckTest {
 
         Sharder sharder;
 
-        Halide::Tools::ThreadPool<TestResult> pool(
-            use_multiple_threads() ?
-                Halide::Tools::ThreadPool<TestResult>::num_processors_online() :
-                1);
+        Halide::Tools::ThreadPool<TestResult> pool(num_worker_threads());
         std::vector<std::future<TestResult>> futures;
 
         for (size_t t = 0; t < tasks.size(); t++) {
diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp
@@ -888,7 +888,9 @@ class SimdOpCheckARM : public SimdOpCheckTest {
             // VFRINTN
             if (target.bits == 64) {
                 // LLVM doesn't want to emit vfrintn on arm-32
-                check(arm32 ? "vfrintn.f16" : "frintn", 8 * w, round(f16_1));
+                if (target.has_feature(Target::ARMFp16)) {
+                    check(arm32 ? "vfrintn.f16" : "frintn", 8 * w, round(f16_1));
+                }
                 check(arm32 ? "vfrintn.f32" : "frintn", 4 * w, round(f32_1));
                 check(arm32 ? "vfrintn.f64" : "frintn", 2 * w, round(f64_1));
             }

Original file line number	Diff line number	Diff line change
`@@ -425,16 +425,15 @@ std::optional<llvm::VersionTuple> get_os_version_constraint(const llvm::Triple &`
`425`	`425`	`return std::nullopt;`
`426`	`426`	`}`
`427`	`427`
	`428`	`+ // These version constraints track the minimum deployment targets`
	`429`	`+ // supported by the latest Xcode version, which is currently 26.2.`
	`430`	`+ // See the table here: https://developer.apple.com/support/xcode/`
`428`	`431`	`if (triple.isMacOSX() && triple.isX86()) {`
`429`		`- // At time of writing (January 2025), this is one version prior`
`430`		`- // to the oldest version still supported by Apple.`
`431`		`- return llvm::VersionTuple(12, 0, 0);`
	`432`	`+ return llvm::VersionTuple(11, 0, 0);`
`432`	`433`	`}`
`433`	`434`
`434`	`435`	`if (triple.isiOS()) {`
`435`		`- // At time of writing (January 2025), this is one version prior`
`436`		`- // to the oldest version still supported by Apple.`
`437`		`- return llvm::VersionTuple(17, 0, 0);`
	`436`	`+ return llvm::VersionTuple(15, 0, 0);`
`438`	`437`	`}`
`439`	`438`
`440`	`439`	`llvm::VersionTuple t = triple.getMinimumSupportedOSVersion();`
Original file line number	Diff line number	Diff line change
`@@ -214,7 +214,7 @@ bool unpack_buffer(PyObject *py_obj,`
`214`	`214`	`return false;`
`215`	`215`	`}`
`216`	`216`
`217`		`- memset(&halide_buf, 0, sizeof(halide_buf));`
	`217`	`+ halide_buf = {};`
`218`	`218`	`needs_device_free = true;`
`219`	`219`	`if (!py_buf.format) {`
`220`	`220`	`halide_buf.type.code = halide_type_uint;`