Skip to content

Commit d6d20f3

Browse files
Remove align-up of vectors in dense Load/Store for SVE2
Now that the fallback mechanism excludes odd-sized vectors in SVE2, this change lets LLVM's optimizer generate better instructions than the previous predicate-based approach.
1 parent 4f2e6b3 commit d6d20f3

2 files changed

Lines changed: 4 additions & 41 deletions

File tree

src/CodeGen_ARM.cpp

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1795,24 +1795,8 @@ void CodeGen_ARM::visit(const Store *op) {
17951795
if (target_vscale() > 0) {
17961796
const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
17971797
if (stride && stride->value == 1) {
1798-
// Basically we can deal with vanilla codegen,
1799-
// but to avoid LLVM error, process with the multiple of natural_lanes
1800-
const int natural_lanes = natural_vector_size(op->value.type());
1801-
if (ramp->lanes % natural_lanes && !emit_atomic_stores) {
1802-
int aligned_lanes = align_up(ramp->lanes, natural_lanes);
1803-
// Use predicate to prevent overrun
1804-
Expr vpred;
1805-
if (is_predicated_store) {
1806-
vpred = Shuffle::make_concat({op->predicate, const_false(aligned_lanes - ramp->lanes)});
1807-
} else {
1808-
vpred = make_vector_predicate_1s_0s(ramp->lanes, aligned_lanes - ramp->lanes);
1809-
}
1810-
auto aligned_index = Ramp::make(ramp->base, stride, aligned_lanes);
1811-
Expr padding = make_zero(op->value.type().with_lanes(aligned_lanes - ramp->lanes));
1812-
Expr aligned_value = Shuffle::make_concat({op->value, padding});
1813-
codegen(Store::make(op->name, aligned_value, aligned_index, op->param, vpred, op->alignment));
1814-
return;
1815-
}
1798+
CodeGen_CPU::visit(op);
1799+
return;
18161800
} else if (op->index.type().is_vector()) {
18171801
// Scatter
18181802
Type elt = op->value.type().element_of();
@@ -1965,30 +1949,9 @@ void CodeGen_ARM::visit(const Load *op) {
19651949
}
19661950

19671951
if ((target_vscale() > 0)) {
1968-
if (stride && stride->value < 1) {
1952+
if (stride && stride->value <= 1) {
19691953
CodeGen_CPU::visit(op);
19701954
return;
1971-
} else if (stride && stride->value == 1) {
1972-
const int natural_lanes = natural_vector_size(op->type);
1973-
if (ramp->lanes % natural_lanes) {
1974-
// Load with lanes multiple of natural_lanes
1975-
int aligned_lanes = align_up(ramp->lanes, natural_lanes);
1976-
// Use predicate to prevent from overrun
1977-
Expr vpred;
1978-
if (is_predicated_load) {
1979-
vpred = Shuffle::make_concat({op->predicate, const_false(aligned_lanes - ramp->lanes)});
1980-
} else {
1981-
vpred = make_vector_predicate_1s_0s(ramp->lanes, aligned_lanes - ramp->lanes);
1982-
}
1983-
auto aligned_index = Ramp::make(ramp->base, stride, aligned_lanes);
1984-
auto aligned_type = op->type.with_lanes(aligned_lanes);
1985-
value = codegen(Load::make(aligned_type, op->name, aligned_index, op->image, op->param, vpred, op->alignment));
1986-
value = slice_vector(value, 0, ramp->lanes);
1987-
return;
1988-
} else {
1989-
CodeGen_CPU::visit(op);
1990-
return;
1991-
}
19921955
} else if (op->index.type().is_vector()) {
19931956
// General Gather Load
19941957

test/correctness/simd_op_check_sve2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -871,7 +871,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
871871
// In case of lanes shorter than native's, predicate pattern is generated by
872872
// "whilelt" intrinsic.
873873
// <vscale x 8 x i1> @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 0, i32 4)
874-
if (factor == 0.5f) {
874+
if (factor == 0.5f && bits >= 32) {
875875
string constraint("vl" + to_string(total_lanes));
876876
add("whilelt", {get_ptrue_instr_with_constraint(bits, constraint)}, total_lanes, scatter);
877877
}

0 commit comments

Comments
 (0)