Skip to content

[AArch64] Suboptimal code for getting index of first non-zero element in a vector #186295

@Kmeakin

Description

@Kmeakin

Consider the following code for finding the first non-zero element in a vector of 16 bytes (eg finding the first match in string parsing code):

https://godbolt.org/z/ac4q8o994

define i16 @src_16(<16 x i8> noundef %mask) {
    %icmp = icmp slt <16 x i8> %mask, splat (i8 0)
    %bitmask = bitcast <16 x i1> %icmp to i16
    %ctz = call i16 @llvm.cttz.i16(i16 %bitmask)
    ret i16 %ctz
}

x86 can do this in a single movmask and ctz:

src_16:                                 # @src_16
        vpmovmskb       eax, xmm0
        or      eax, 65536
        tzcnt   eax, eax
        ret

Aarch64 does not have an equivalent to movmask and so has to emulate it:

.LCPI1_0:
        .byte   1                               // 0x1
        .byte   2                               // 0x2
        .byte   4                               // 0x4
        .byte   8                               // 0x8
        .byte   16                              // 0x10
        .byte   32                              // 0x20
        .byte   64                              // 0x40
        .byte   128                             // 0x80
        .byte   1                               // 0x1
        .byte   2                               // 0x2
        .byte   4                               // 0x4
        .byte   8                               // 0x8
        .byte   16                              // 0x10
        .byte   32                              // 0x20
        .byte   64                              // 0x40
        .byte   128                             // 0x80
src_16:                                 // @src_16
        adrp    x8, .LCPI1_0
        cmlt    v0.16b, v0.16b, #0
        ldr     q1, [x8, :lo12:.LCPI1_0]
        and     v0.16b, v0.16b, v1.16b
        ext     v1.16b, v0.16b, v0.16b, #8
        zip1    v0.16b, v0.16b, v1.16b
        addv    h0, v0.8h
        fmov    w8, s0
        orr     w8, w8, #0x10000
        rbit    w8, w8
        clz     w0, w8
        ret

But instead of emulating a movmask, we can find the index with a more direct method, assuming we don't need the actual bitstring produced by movmask:

.LCPI2_0:
        .byte   0                               // 0x0
        .byte   1                               // 0x1
        .byte   2                               // 0x2
        .byte   3                               // 0x3
        .byte   4                               // 0x4
        .byte   5                               // 0x5
        .byte   6                               // 0x6
        .byte   7                               // 0x7
        .byte   8                               // 0x8
        .byte   9                               // 0x9
        .byte   10                              // 0xa
        .byte   11                              // 0xb
        .byte   12                              // 0xc
        .byte   13                              // 0xd
        .byte   14                              // 0xe
        .byte   15                              // 0xf
tgt_16:                                 // @tgt_16
        adrp    x8, .LCPI2_0
        cmge    v0.16b, v0.16b, #0
        mov     w9, #16                         // =0x10
        ldr     q1, [x8, :lo12:.LCPI2_0]
        orr     v0.16b, v0.16b, v1.16b
        uminv   b0, v0.16b
        fmov    w8, s0
        tst     w8, #0xf0
        csel    w0, w8, w9, eq
        ret

Alive proof:

https://alive2.llvm.org/ce/z/rZuabM

define i16 @src_16(<16 x i8> noundef %mask) {
#0:
  %icmp = icmp slt <16 x i8> noundef %mask, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
  %bitmask = bitcast <16 x i1> %icmp to i16
  %ctz = cttz i16 %bitmask, 0
  ret i16 %ctz
}
=>
define i16 @tgt_16(<16 x i8> noundef %mask) {
#0:
  %icmp = icmp sge <16 x i8> noundef %mask, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
  %not = sext <16 x i1> %icmp to <16 x i8>
  %or = or <16 x i8> %not, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }
  %min = reduce_umin <16 x i8> %or
  %clamp = umin i8 %min, 16
  %ret = zext i8 %clamp to i16
  ret i16 %ret
}
Transformation seems to be correct!

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions