-
Notifications
You must be signed in to change notification settings - Fork 16.5k
Open
Description
Consider the following code for finding the first non-zero element in a vector of 16 bytes (eg finding the first match in string parsing code):
https://godbolt.org/z/ac4q8o994
define i16 @src_16(<16 x i8> noundef %mask) {
%icmp = icmp slt <16 x i8> %mask, splat (i8 0)
%bitmask = bitcast <16 x i1> %icmp to i16
%ctz = call i16 @llvm.cttz.i16(i16 %bitmask)
ret i16 %ctz
}x86 can do this in a single movmask and ctz:
src_16: # @src_16
vpmovmskb eax, xmm0
or eax, 65536
tzcnt eax, eax
retAarch64 does not have an equivalent to movmask and so has to emulate it:
.LCPI1_0:
.byte 1 // 0x1
.byte 2 // 0x2
.byte 4 // 0x4
.byte 8 // 0x8
.byte 16 // 0x10
.byte 32 // 0x20
.byte 64 // 0x40
.byte 128 // 0x80
.byte 1 // 0x1
.byte 2 // 0x2
.byte 4 // 0x4
.byte 8 // 0x8
.byte 16 // 0x10
.byte 32 // 0x20
.byte 64 // 0x40
.byte 128 // 0x80
src_16: // @src_16
adrp x8, .LCPI1_0
cmlt v0.16b, v0.16b, #0
ldr q1, [x8, :lo12:.LCPI1_0]
and v0.16b, v0.16b, v1.16b
ext v1.16b, v0.16b, v0.16b, #8
zip1 v0.16b, v0.16b, v1.16b
addv h0, v0.8h
fmov w8, s0
orr w8, w8, #0x10000
rbit w8, w8
clz w0, w8
retBut instead of emulating a movmask, we can find the index with a more direct method, assuming we don't need the actual bitstring produced by movmask:
.LCPI2_0:
.byte 0 // 0x0
.byte 1 // 0x1
.byte 2 // 0x2
.byte 3 // 0x3
.byte 4 // 0x4
.byte 5 // 0x5
.byte 6 // 0x6
.byte 7 // 0x7
.byte 8 // 0x8
.byte 9 // 0x9
.byte 10 // 0xa
.byte 11 // 0xb
.byte 12 // 0xc
.byte 13 // 0xd
.byte 14 // 0xe
.byte 15 // 0xf
tgt_16: // @tgt_16
adrp x8, .LCPI2_0
cmge v0.16b, v0.16b, #0
mov w9, #16 // =0x10
ldr q1, [x8, :lo12:.LCPI2_0]
orr v0.16b, v0.16b, v1.16b
uminv b0, v0.16b
fmov w8, s0
tst w8, #0xf0
csel w0, w8, w9, eq
retAlive proof:
https://alive2.llvm.org/ce/z/rZuabM
define i16 @src_16(<16 x i8> noundef %mask) {
#0:
%icmp = icmp slt <16 x i8> noundef %mask, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
%bitmask = bitcast <16 x i1> %icmp to i16
%ctz = cttz i16 %bitmask, 0
ret i16 %ctz
}
=>
define i16 @tgt_16(<16 x i8> noundef %mask) {
#0:
%icmp = icmp sge <16 x i8> noundef %mask, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
%not = sext <16 x i1> %icmp to <16 x i8>
%or = or <16 x i8> %not, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }
%min = reduce_umin <16 x i8> %or
%clamp = umin i8 %min, 16
%ret = zext i8 %clamp to i16
ret i16 %ret
}
Transformation seems to be correct!Reactions are currently unavailable