Skip to content

Commit dd187a2

Browse files
abadams, mcourteaux, claude, alexreinking
authored
Better vector interleaves (#8925)
* Specialized x86 implementation of interleave_vectors
* Update test to be more exhaustive
* Fix comment.

  The previous comment reported a time that seemed to have regressed. It was not 8.2ms on main - more like 11
* Comment fix
* clang-tidy fixes
* Make variable names more consistent
* Simplify code with helper lambda
* Comment tweaks
* Don't do half-width unpcks
* Use optimization fences in the base class too

  Before:

  Computing best tile sizes for each type
  .................................................
  bytes, tile width, tile height, bandwidth (GB/s):
  1 8 8 20.9997
  1 16 8 20.8329
  1 8 16 18.5702
  1 8 32 17.2463
  1 8 64 14.312
  2 8 16 19.2047
  2 8 8 18.8368
  2 16 8 17.0593
  2 8 32 17.0591
  2 4 8 15.7681
  4 8 8 24.9364
  4 4 16 22.9699
  4 8 16 22.5743
  4 4 32 22.255
  4 4 8 20.4468
  8 8 8 38.4094
  8 16 4 28.4167
  8 16 8 27.6184
  8 8 4 27.6062
  8 8 16 26.8693

  After:

  Computing best tile sizes for each type
  .................................................
  bytes, tile width, tile height, bandwidth (GB/s):
  1 16 32 34.1921
  1 16 16 31.8399
  1 8 16 25.575
  1 16 64 25.1665
  1 32 16 25.0061
  2 8 32 28.2635
  2 8 16 27.7648
  2 16 16 27.2126
  2 16 32 23.9034
  2 8 8 23.6345
  4 8 16 34.5303
  4 8 8 28.3653
  4 16 8 26.8521
  4 8 32 26.084
  4 16 16 24.4519
  8 8 8 33.7163
  8 8 4 29.1339
  8 4 16 26.418
  8 16 4 25.4663
  8 2 8 24.3949
* Use Catanzaro's algorithm for non-power-of-two interleaves
* Support more interleave and deinterleave patterns
* clang-tidy fix
* Handle multiple let injections at same site

  Also better algorithm for innermost containing stmt
* better simplification and better handling of composite factors
* Fix innermost_containing_node
* Fix some simd op check failures
* Fix infinite recursion issue and missed case in interleave codegen
* Adjust expectations in stage_strided_loads test
* Allow reversed suffix or not in sve test
* Don't use optimization fences on hexagon
* Fix infinite simplifier loop
* Don't hoist transposes on hexagon
* Make distinct strided load nodes in the IR distinct in memory too
* arm-32 has no vst2 for 64-bit elements
* Windows bad filename fix in simd op check
* Temporary dumping of cpu info to debug github actions issue
* dump cpuinfo in makefile testing workflow

  To help diagnose occasional illegal instruction errors
* Address review comments
* Remove duplicate function body
* Use slice of predicate
* clang-format
* SVE fixes

  Co-authored-by: Claude Code <noreply@anthropic.com>
* Move optimization_fence back
* Try to thread the needle with webassembly nonsense
* Fix msvc warning
* Skip simd_op_check_sve2 on old llvms
* Skip test on sve2 with llvm 21
* Skip block transpose performance test for sve2 on llvm 21
* Skip sub-test that triggers llvm bug
* Test should hopefully now work with llvm main
* Back out dump of cpuinfo
* Fix bad merge
* Use structured bindings in transpose_idioms.cpp
* Avoid bad simplify rule

  These rules were broken for triply-nested ramps. A better version of these rules will come in a later PR.

---------

Co-authored-by: Martijn Courteaux <courteauxmartijn@gmail.com>
Co-authored-by: Claude Code <noreply@anthropic.com>
Co-authored-by: Alex Reinking <areinking@adobe.com>
1 parent c936df9 commit dd187a2

33 files changed

Lines changed: 1838 additions & 188 deletions

apps/iir_blur/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule.
2525
$(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS)
2626

2727
$(BIN)/%/out.png: $(BIN)/%/filter
28-
$< ../images/rgba.png $(BIN)/$*/out.png
28+
$< ../images/rgb.png $(BIN)/$*/out.png
2929

3030
clean:
3131
rm -rf $(BIN)

apps/iir_blur/iir_blur_generator.cpp

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -36,19 +36,26 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule
3636
if (!skip_schedule) {
3737
if (!target.has_gpu_feature()) {
3838
// CPU schedule.
39-
// 8.2ms on an Intel i9-9960X using 16 threads
39+
// 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads
4040
// Split the transpose into tiles of rows. Parallelize over channels
41-
// and strips (Halide supports nested parallelism).
42-
Var xo, yo, t;
41+
// and strips.
42+
Var xo, yo, t, yi;
4343
transpose.compute_root()
4444
.tile(x, y, xo, yo, x, y, vec, vec * 4)
45+
.split(y, y, yi, vec)
46+
.vectorize(yi)
4547
.vectorize(x)
46-
.parallel(yo)
47-
.parallel(c);
48+
.fuse(yo, c, t)
49+
.parallel(t);
50+
51+
blur.in(transpose)
52+
.compute_at(transpose, y)
53+
.vectorize(x)
54+
.unroll(y);
4855

4956
// Run the filter on each row of tiles (which corresponds to a strip of
5057
// columns in the input).
51-
blur.compute_at(transpose, yo);
58+
blur.compute_at(transpose, t);
5259

5360
// Vectorize computations within the strips.
5461
blur.update(0)

src/CSE.cpp

Lines changed: 32 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -239,10 +239,39 @@ class CSEEveryExprInStmt : public IRMutator {
239239
}
240240
const Call *bundle = Call::as_intrinsic(dummy, {Call::bundle});
241241
internal_assert(bundle && bundle->args.size() == 2);
242-
Stmt s = Store::make(op->name, bundle->args[0], bundle->args[1],
242+
243+
Expr value = bundle->args[0], index = bundle->args[1];
244+
245+
// Figure out which ones are actually needed by the index
246+
247+
auto add_all_vars_to_set = [&](const Expr &e, std::set<std::string> &s) {
248+
visit_with(e, [&](auto *, const Variable *var) {
249+
s.insert(var->name);
250+
});
251+
};
252+
253+
std::set<string> index_lets;
254+
add_all_vars_to_set(index, index_lets);
255+
for (const auto &[var, val] : reverse_view(lets)) {
256+
if (index_lets.count(var)) {
257+
add_all_vars_to_set(val, index_lets);
258+
}
259+
}
260+
261+
vector<pair<string, Expr>> deferred;
262+
for (const auto &[var, val] : reverse_view(lets)) {
263+
if (index_lets.count(var)) {
264+
deferred.emplace_back(var, val);
265+
} else {
266+
value = Let::make(var, val, value);
267+
}
268+
}
269+
270+
Stmt s = Store::make(op->name, value, index,
243271
op->param, mutate(op->predicate), op->alignment);
244-
for (const auto &[var, value] : reverse_view(lets)) {
245-
s = LetStmt::make(var, value, s);
272+
273+
for (const auto &[var, val] : deferred) {
274+
s = LetStmt::make(var, val, s);
246275
}
247276
return s;
248277
}

src/CodeGen_ARM.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1499,10 +1499,11 @@ void CodeGen_ARM::visit(const Store *op) {
14991499
intrin_type = t;
15001500
Type elt = t.element_of();
15011501
int vec_bits = t.bits() * t.lanes();
1502-
if (elt == Float(32) || elt == Float(64) ||
1503-
is_float16_and_has_feature(elt) ||
1504-
elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
1505-
elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) {
1502+
if (t.bits() <= target.bits &&
1503+
(elt == Float(32) || elt == Float(64) ||
1504+
is_float16_and_has_feature(elt) ||
1505+
elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
1506+
elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64))) {
15061507
const int target_vector_bits = native_vector_bits();
15071508
if (vec_bits % 128 == 0) {
15081509
type_ok_for_vst = true;
@@ -1978,6 +1979,7 @@ void CodeGen_ARM::visit(const Shuffle *op) {
19781979
if (target.os != Target::IOS && target.os != Target::OSX &&
19791980
load &&
19801981
op->vectors.size() == 1 &&
1982+
op->is_slice() &&
19811983
2 <= stride && stride <= 4 &&
19821984
op->slice_begin() < stride &&
19831985
load->type.lanes() == stride * op->type.lanes()) {

src/CodeGen_Hexagon.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ class CodeGen_Hexagon : public CodeGen_CPU {
9595
llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &v) override;
9696
llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
9797
const std::vector<int> &indices) override;
98+
llvm::Value *optimization_fence(llvm::Value *v) override;
9899
using CodeGen_CPU::shuffle_vectors;
99100
///@}
100101

@@ -1301,6 +1302,12 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
13011302
return vdelta(concat_vectors({a, b}), indices);
13021303
}
13031304

1305+
Value *CodeGen_Hexagon::optimization_fence(Value *v) {
1306+
// As of llvm 21, the base class version seems to trip up LLVM's hexagon
1307+
// backend, possibly because it relies on a floating point type.
1308+
return v;
1309+
}
1310+
13041311
Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
13051312
int max_index) {
13061313
llvm::Type *lut_ty = lut->getType();
@@ -1409,10 +1416,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
14091416
return slice_vector(concat_vectors(result), 0, idx_elements);
14101417
}
14111418

1412-
bool is_power_of_two(int x) {
1413-
return (x & (x - 1)) == 0;
1414-
}
1415-
14161419
// vdelta and vrdelta are instructions that take an input vector and
14171420
// pass it through a network made up of levels. Each element x at each
14181421
// level i can either take the element from the previous level at the

0 commit comments

Comments
 (0)