Skip to content

Commit 23b79ba

Browse files
committed
Merge branch 'main' into abadams/fix_x86_transpose
2 parents cdc1de2 + d3a8638 commit 23b79ba

9 files changed

Lines changed: 39 additions & 37 deletions

src/CodeGen_ARM.cpp

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2104,12 +2104,8 @@ void CodeGen_ARM::visit(const Call *op) {
21042104
// llvm's roundeven intrinsic reliably lowers to the correct
21052105
// instructions on aarch64, but despite having the same instruction
21062106
// available, it doesn't seem to work for arm-32.
2107-
if (target.bits == 64) {
2108-
value = call_overloaded_intrin(op->type, "round", op->args);
2109-
if (value) {
2110-
return;
2111-
}
2112-
} else {
2107+
if (target.bits == 32) {
2108+
// So let's call the fallback to make sure it's vectorizable.
21132109
value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
21142110
return;
21152111
}

src/CodeGen_Hexagon.cpp

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1276,7 +1276,7 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
12761276
Value *packed = call_intrin_cast(
12771277
native2_ty,
12781278
INTRINSIC_128B(vdealvdd),
1279-
{ab_i1, ab_i0, ConstantInt::get(i32_t, -element_bytes)});
1279+
{ab_i1, ab_i0, ConstantInt::getSigned(i32_t, -element_bytes)});
12801280
llvm::Intrinsic::ID intrin = start == 0 ? INTRINSIC_128B(lo) : INTRINSIC_128B(hi);
12811281
ret_i = call_intrin_cast(native_ty, intrin, {packed});
12821282
} else {
@@ -1589,6 +1589,8 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) {
15891589
if (generate_vdelta(indices, reverse, switches)) {
15901590
vector<Constant *> control_elements(switches.size());
15911591
for (int i = 0; i < (int)switches.size(); i++) {
1592+
internal_assert(switches[i] >= 0 && switches[i] <= 255)
1593+
<< "vdelta switch value " << switches[i] << " doesn't fit in 8 bits\n";
15921594
control_elements[i] = ConstantInt::get(i8_t, switches[i]);
15931595
}
15941596
Value *control = ConstantVector::get(control_elements);
@@ -1606,7 +1608,7 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) {
16061608

16071609
Value *CodeGen_Hexagon::create_vector(llvm::Type *ty, int val) {
16081610
llvm::Type *scalar_ty = ty->getScalarType();
1609-
Constant *value = ConstantInt::get(scalar_ty, val);
1611+
Constant *value = ConstantInt::getSigned(scalar_ty, val);
16101612
return get_splat(get_vector_num_elements(ty), value);
16111613
}
16121614

@@ -1637,7 +1639,7 @@ Value *CodeGen_Hexagon::vlut(Value *lut, Value *idx, int min_index, int max_inde
16371639
vector<Value *> indices;
16381640
Value *replicate_val = ConstantInt::get(i8_t, replicate);
16391641
for (int i = 0; i < replicate; i++) {
1640-
Value *pos = ConstantInt::get(idx16->getType(), i);
1642+
Value *pos = get_splat(idx16_elems, ConstantInt::get(i16_t, i));
16411643
indices.emplace_back(call_intrin(idx16->getType(),
16421644
"halide.hexagon.add_mul.vh.vh.b",
16431645
{pos, idx16, replicate_val}));
@@ -1719,7 +1721,7 @@ Value *CodeGen_Hexagon::vlut(Value *lut, const vector<int> &indices) {
17191721
min_index = std::min(min_index, i);
17201722
max_index = std::max(max_index, i);
17211723
}
1722-
llvm_indices.push_back(ConstantInt::get(i16_t, i));
1724+
llvm_indices.push_back(ConstantInt::getSigned(i16_t, i));
17231725
}
17241726

17251727
// We use i16 indices because we can't support LUTs with more than

src/CodeGen_LLVM.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2855,7 +2855,11 @@ void CodeGen_LLVM::visit(const Call *op) {
28552855
value = phi;
28562856
}
28572857
} else if (op->is_intrinsic(Call::round)) {
2858-
value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
2858+
value = call_overloaded_intrin(op->type, "roundeven", op->args);
2859+
if (!value) {
2860+
debug(2) << "llvm.roundeven intrinsic not available (Is FP16 enabled?). Using fallback instead.\n";
2861+
value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
2862+
}
28592863
} else if (op->is_intrinsic(Call::require)) {
28602864
internal_assert(op->args.size() == 3);
28612865
Expr cond = op->args[0];

src/CodeGen_X86.cpp

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -585,13 +585,6 @@ void CodeGen_X86::visit(const Cast *op) {
585585
}
586586

587587
void CodeGen_X86::visit(const Call *op) {
588-
if (op->is_intrinsic(Call::round)) {
589-
value = call_overloaded_intrin(op->type, "round", op->args);
590-
if (value) {
591-
return;
592-
}
593-
}
594-
595588
if (!op->type.is_vector()) {
596589
// We only have peephole optimizations for vectors beyond this point.
597590
CodeGen_Posix::visit(op);

src/LLVM_Output.cpp

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -351,6 +351,7 @@ std::unique_ptr<llvm::Module> clone_module(const llvm::Module &module_in) {
351351

352352
void emit_file(const llvm::Module &module_in, Internal::LLVMOStream &out,
353353
llvm::CodeGenFileType file_type) {
354+
// Make sure to run this with a large stack!
354355
debug(1) << "emit_file.Compiling to native code...\n";
355356
#if LLVM_VERSION >= 210
356357
debug(2) << "Target triple: " << module_in.getTargetTriple().str() << "\n";
@@ -443,11 +444,15 @@ std::unique_ptr<llvm::Module> compile_module_to_llvm_module(const Module &module
443444
}
444445

445446
void compile_llvm_module_to_object(llvm::Module &module, Internal::LLVMOStream &out) {
446-
emit_file(module, out, llvm::CodeGenFileType::ObjectFile);
447+
Internal::run_with_large_stack([&]() {
448+
emit_file(module, out, llvm::CodeGenFileType::ObjectFile);
449+
});
447450
}
448451

449452
void compile_llvm_module_to_assembly(llvm::Module &module, Internal::LLVMOStream &out) {
450-
emit_file(module, out, llvm::CodeGenFileType::AssemblyFile);
453+
Internal::run_with_large_stack([&]() {
454+
emit_file(module, out, llvm::CodeGenFileType::AssemblyFile);
455+
});
451456
}
452457

453458
void compile_llvm_module_to_llvm_bitcode(llvm::Module &module, Internal::LLVMOStream &out) {

src/LLVM_Runtime_Linker.cpp

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -425,16 +425,15 @@ std::optional<llvm::VersionTuple> get_os_version_constraint(const llvm::Triple &
425425
return std::nullopt;
426426
}
427427

428+
// These version constraints track the minimum deployment targets
429+
// supported by the latest Xcode version, which is currently 26.2.
430+
// See the table here: https://developer.apple.com/support/xcode/
428431
if (triple.isMacOSX() && triple.isX86()) {
429-
// At time of writing (January 2025), this is one version prior
430-
// to the oldest version still supported by Apple.
431-
return llvm::VersionTuple(12, 0, 0);
432+
return llvm::VersionTuple(11, 0, 0);
432433
}
433434

434435
if (triple.isiOS()) {
435-
// At time of writing (January 2025), this is one version prior
436-
// to the oldest version still supported by Apple.
437-
return llvm::VersionTuple(17, 0, 0);
436+
return llvm::VersionTuple(15, 0, 0);
438437
}
439438

440439
llvm::VersionTuple t = triple.getMinimumSupportedOSVersion();

src/PythonExtensionGen.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -214,7 +214,7 @@ bool unpack_buffer(PyObject *py_obj,
214214
return false;
215215
}
216216
217-
memset(&halide_buf, 0, sizeof(halide_buf));
217+
halide_buf = {};
218218
needs_device_free = true;
219219
if (!py_buf.format) {
220220
halide_buf.type.code = halide_type_uint;

test/correctness/simd_op_check.h

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -164,14 +164,15 @@ class SimdOpCheckTest {
164164
int vector_width,
165165
const std::vector<Argument> &arg_types,
166166
std::ostringstream &error_msg) {
167-
std::string fn_name = "test_" + name;
167+
std::string fn_name = "test_" + name + "_vecwidth" + std::to_string(vector_width);
168168
std::string file_name = output_directory + fn_name;
169169

170170
auto ext = Internal::get_output_info(target);
171171
std::map<OutputFileType, std::string> outputs = {
172172
{OutputFileType::c_header, file_name + ext.at(OutputFileType::c_header).extension},
173173
{OutputFileType::object, file_name + ext.at(OutputFileType::object).extension},
174174
{OutputFileType::assembly, file_name + ".s"},
175+
{OutputFileType::llvm_assembly, file_name + ".ll"},
175176
};
176177
error.compile_to(outputs, arg_types, fn_name, target);
177178

@@ -319,7 +320,7 @@ class SimdOpCheckTest {
319320
e.accept(&has_inline_reduction);
320321

321322
// Define a vectorized Halide::Func that uses the pattern.
322-
Halide::Func f(name);
323+
Halide::Func f(name + "_vecwidth" + std::to_string(vector_width));
323324
f(x, y) = e;
324325
f.bound(x, 0, W).vectorize(x, vector_width);
325326
f.compute_root();
@@ -471,15 +472,18 @@ class SimdOpCheckTest {
471472
// settings.
472473
if (!wildcard_match(filter, op)) return;
473474

474-
tasks.emplace_back(Task{op, name, vector_width, e});
475+
tasks.emplace_back(Task{op, name, vector_width, std::move(e)});
475476
}
476477
virtual void add_tests() = 0;
477478
virtual int image_param_alignment() {
478479
return 16;
479480
}
480481

481-
virtual bool use_multiple_threads() const {
482-
return true;
482+
int num_worker_threads() const {
483+
if (std::string t = Halide::Internal::get_env_variable("HL_OP_CHECK_THREADS"); !t.empty()) {
484+
return std::atoi(t.c_str());
485+
}
486+
return Halide::Tools::ThreadPool<void>::num_processors_online();
483487
}
484488

485489
virtual bool test_all() {
@@ -492,10 +496,7 @@ class SimdOpCheckTest {
492496

493497
Sharder sharder;
494498

495-
Halide::Tools::ThreadPool<TestResult> pool(
496-
use_multiple_threads() ?
497-
Halide::Tools::ThreadPool<TestResult>::num_processors_online() :
498-
1);
499+
Halide::Tools::ThreadPool<TestResult> pool(num_worker_threads());
499500
std::vector<std::future<TestResult>> futures;
500501

501502
for (size_t t = 0; t < tasks.size(); t++) {

test/correctness/simd_op_check_arm.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -888,7 +888,9 @@ class SimdOpCheckARM : public SimdOpCheckTest {
888888
// VFRINTN
889889
if (target.bits == 64) {
890890
// LLVM doesn't want to emit vfrintn on arm-32
891-
check(arm32 ? "vfrintn.f16" : "frintn", 8 * w, round(f16_1));
891+
if (target.has_feature(Target::ARMFp16)) {
892+
check(arm32 ? "vfrintn.f16" : "frintn", 8 * w, round(f16_1));
893+
}
892894
check(arm32 ? "vfrintn.f32" : "frintn", 4 * w, round(f32_1));
893895
check(arm32 ? "vfrintn.f64" : "frintn", 2 * w, round(f64_1));
894896
}

0 commit comments

Comments
 (0)