[AArch64] Suboptimal code for getting index of first non-zero element in a vector

Consider the following code for finding the first non-zero element in a vector of 16 bytes (eg finding the first match in string parsing code):

https://godbolt.org/z/ac4q8o994
```llvm
define i16 @src_16(<16 x i8> noundef %mask) {
    %icmp = icmp slt <16 x i8> %mask, splat (i8 0)
    %bitmask = bitcast <16 x i1> %icmp to i16
    %ctz = call i16 @llvm.cttz.i16(i16 %bitmask)
    ret i16 %ctz
}
```

x86 can do this in a single `movmask` and `ctz`:
```asm
src_16:                                 # @src_16
        vpmovmskb       eax, xmm0
        or      eax, 65536
        tzcnt   eax, eax
        ret
```

Aarch64 does not have an equivalent to `movmask` and so has to emulate it:
```asm
.LCPI1_0:
        .byte   1                               // 0x1
        .byte   2                               // 0x2
        .byte   4                               // 0x4
        .byte   8                               // 0x8
        .byte   16                              // 0x10
        .byte   32                              // 0x20
        .byte   64                              // 0x40
        .byte   128                             // 0x80
        .byte   1                               // 0x1
        .byte   2                               // 0x2
        .byte   4                               // 0x4
        .byte   8                               // 0x8
        .byte   16                              // 0x10
        .byte   32                              // 0x20
        .byte   64                              // 0x40
        .byte   128                             // 0x80
src_16:                                 // @src_16
        adrp    x8, .LCPI1_0
        cmlt    v0.16b, v0.16b, #0
        ldr     q1, [x8, :lo12:.LCPI1_0]
        and     v0.16b, v0.16b, v1.16b
        ext     v1.16b, v0.16b, v0.16b, #8
        zip1    v0.16b, v0.16b, v1.16b
        addv    h0, v0.8h
        fmov    w8, s0
        orr     w8, w8, #0x10000
        rbit    w8, w8
        clz     w0, w8
        ret
```

But instead of emulating a `movmask`, we can find the index with a more direct method, assuming we don't need the actual bitstring produced by `movmask`:
```asm
.LCPI2_0:
        .byte   0                               // 0x0
        .byte   1                               // 0x1
        .byte   2                               // 0x2
        .byte   3                               // 0x3
        .byte   4                               // 0x4
        .byte   5                               // 0x5
        .byte   6                               // 0x6
        .byte   7                               // 0x7
        .byte   8                               // 0x8
        .byte   9                               // 0x9
        .byte   10                              // 0xa
        .byte   11                              // 0xb
        .byte   12                              // 0xc
        .byte   13                              // 0xd
        .byte   14                              // 0xe
        .byte   15                              // 0xf
tgt_16:                                 // @tgt_16
        adrp    x8, .LCPI2_0
        cmge    v0.16b, v0.16b, #0
        mov     w9, #16                         // =0x10
        ldr     q1, [x8, :lo12:.LCPI2_0]
        orr     v0.16b, v0.16b, v1.16b
        uminv   b0, v0.16b
        fmov    w8, s0
        tst     w8, #0xf0
        csel    w0, w8, w9, eq
        ret
```


# Alive proof:
https://alive2.llvm.org/ce/z/rZuabM
```llvm
define i16 @src_16(<16 x i8> noundef %mask) {
#0:
  %icmp = icmp slt <16 x i8> noundef %mask, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
  %bitmask = bitcast <16 x i1> %icmp to i16
  %ctz = cttz i16 %bitmask, 0
  ret i16 %ctz
}
=>
define i16 @tgt_16(<16 x i8> noundef %mask) {
#0:
  %icmp = icmp sge <16 x i8> noundef %mask, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
  %not = sext <16 x i1> %icmp to <16 x i8>
  %or = or <16 x i8> %not, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }
  %min = reduce_umin <16 x i8> %or
  %clamp = umin i8 %min, 16
  %ret = zext i8 %clamp to i16
  ret i16 %ret
}
Transformation seems to be correct!
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AArch64] Suboptimal code for getting index of first non-zero element in a vector #186295

Alive proof:

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[AArch64] Suboptimal code for getting index of first non-zero element in a vector #186295

Description

Alive proof:

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions