|
| 1 | +//go:build !purego |
| 2 | + |
| 3 | +#include "textflag.h" |
| 4 | + |
// -----------------------------------------------------------------------------
// NEON shuffle masks and shift tables for unpacking int32 values
//
// NEON uses 128-bit registers (vs AVX2's 256-bit), so we process 4 int32
// values per iteration instead of 8.
//
// TBL instruction: byte shuffle within 16 bytes using indices 0-15; any index
// outside that range produces a zero byte.
// USHL instruction: variable shift per lane (negative = right shift)
// -----------------------------------------------------------------------------
| 14 | + |
// Shuffle masks for unpacking values of bit widths 1 to 16 into int32 lanes.
//
// NEON yields 4 int32 values per 128-bit register, so every bit width owns one
// 16-byte mask stored at: offset = 16 * (bitWidth - 1)
//
// Each 4-byte group is the TBL index vector for one output lane, least
// significant byte first: the indices of the source bytes that hold the packed
// value, padded with 0x80. TBL writes zero for any index outside the table, so
// 0x80 clears the unused high bytes of the lane.
//
// For value i of width w (bit ranges below are positions in the packed stream):
//   first byte = (i * w) / 8
//   byte count = ceil(((i * w) % 8 + w) / 8)    (never more than 3 for w <= 16)
//
// NOTE(review): the kernel that reads this table is not in this file. The rows
// assume it right-shifts each lane by (i * w) % 8 and then masks to w bits;
// confirm against the caller.
//
GLOBL ·shuffleInt32x1to16bitsNEON(SB), RODATA|NOPTR, $256

// 1 bit => 32 bits (4 values from 4 bits, all inside byte 0)
DATA ·shuffleInt32x1to16bitsNEON+0+0(SB)/4, $0x80808000  // value 0: bit 0, byte 0
DATA ·shuffleInt32x1to16bitsNEON+0+4(SB)/4, $0x80808000  // value 1: bit 1, byte 0
DATA ·shuffleInt32x1to16bitsNEON+0+8(SB)/4, $0x80808000  // value 2: bit 2, byte 0
DATA ·shuffleInt32x1to16bitsNEON+0+12(SB)/4, $0x80808000 // value 3: bit 3, byte 0

// 2 bits => 32 bits (4 values from 8 bits = 1 byte)
DATA ·shuffleInt32x1to16bitsNEON+16+0(SB)/4, $0x80808000  // value 0: bits 0-1, byte 0
DATA ·shuffleInt32x1to16bitsNEON+16+4(SB)/4, $0x80808000  // value 1: bits 2-3, byte 0
DATA ·shuffleInt32x1to16bitsNEON+16+8(SB)/4, $0x80808000  // value 2: bits 4-5, byte 0
DATA ·shuffleInt32x1to16bitsNEON+16+12(SB)/4, $0x80808000 // value 3: bits 6-7, byte 0

// 3 bits => 32 bits (4 values from 12 bits = 1.5 bytes)
DATA ·shuffleInt32x1to16bitsNEON+32+0(SB)/4, $0x80808000  // value 0: bits 0-2, byte 0
DATA ·shuffleInt32x1to16bitsNEON+32+4(SB)/4, $0x80808000  // value 1: bits 3-5, byte 0
DATA ·shuffleInt32x1to16bitsNEON+32+8(SB)/4, $0x80800100  // value 2: bits 6-8, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+32+12(SB)/4, $0x80808001 // value 3: bits 9-11, byte 1

// 4 bits => 32 bits (4 values from 16 bits = 2 bytes)
DATA ·shuffleInt32x1to16bitsNEON+48+0(SB)/4, $0x80808000  // value 0: bits 0-3, byte 0
DATA ·shuffleInt32x1to16bitsNEON+48+4(SB)/4, $0x80808000  // value 1: bits 4-7, byte 0
DATA ·shuffleInt32x1to16bitsNEON+48+8(SB)/4, $0x80808001  // value 2: bits 8-11, byte 1
DATA ·shuffleInt32x1to16bitsNEON+48+12(SB)/4, $0x80808001 // value 3: bits 12-15, byte 1

// 5 bits => 32 bits (4 values from 20 bits = 2.5 bytes)
DATA ·shuffleInt32x1to16bitsNEON+64+0(SB)/4, $0x80808000  // value 0: bits 0-4, byte 0
DATA ·shuffleInt32x1to16bitsNEON+64+4(SB)/4, $0x80800100  // value 1: bits 5-9, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+64+8(SB)/4, $0x80808001  // value 2: bits 10-14, byte 1
DATA ·shuffleInt32x1to16bitsNEON+64+12(SB)/4, $0x80800201 // value 3: bits 15-19, bytes 1-2

// 6 bits => 32 bits (4 values from 24 bits = 3 bytes)
DATA ·shuffleInt32x1to16bitsNEON+80+0(SB)/4, $0x80808000  // value 0: bits 0-5, byte 0
DATA ·shuffleInt32x1to16bitsNEON+80+4(SB)/4, $0x80800100  // value 1: bits 6-11, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+80+8(SB)/4, $0x80800201  // value 2: bits 12-17, bytes 1-2
DATA ·shuffleInt32x1to16bitsNEON+80+12(SB)/4, $0x80808002 // value 3: bits 18-23, byte 2

// 7 bits => 32 bits (4 values from 28 bits = 3.5 bytes)
DATA ·shuffleInt32x1to16bitsNEON+96+0(SB)/4, $0x80808000  // value 0: bits 0-6, byte 0
DATA ·shuffleInt32x1to16bitsNEON+96+4(SB)/4, $0x80800100  // value 1: bits 7-13, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+96+8(SB)/4, $0x80800201  // value 2: bits 14-20, bytes 1-2
DATA ·shuffleInt32x1to16bitsNEON+96+12(SB)/4, $0x80800302 // value 3: bits 21-27, bytes 2-3

// 8 bits => 32 bits (4 values from 32 bits = 4 bytes)
DATA ·shuffleInt32x1to16bitsNEON+112+0(SB)/4, $0x80808000  // value 0: byte 0
DATA ·shuffleInt32x1to16bitsNEON+112+4(SB)/4, $0x80808001  // value 1: byte 1
DATA ·shuffleInt32x1to16bitsNEON+112+8(SB)/4, $0x80808002  // value 2: byte 2
DATA ·shuffleInt32x1to16bitsNEON+112+12(SB)/4, $0x80808003 // value 3: byte 3

// 9 bits => 32 bits (4 values from 36 bits = 4.5 bytes)
DATA ·shuffleInt32x1to16bitsNEON+128+0(SB)/4, $0x80800100  // value 0: bits 0-8, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+128+4(SB)/4, $0x80800201  // value 1: bits 9-17, bytes 1-2
DATA ·shuffleInt32x1to16bitsNEON+128+8(SB)/4, $0x80800302  // value 2: bits 18-26, bytes 2-3
DATA ·shuffleInt32x1to16bitsNEON+128+12(SB)/4, $0x80800403 // value 3: bits 27-35, bytes 3-4

// 10 bits => 32 bits (4 values from 40 bits = 5 bytes)
DATA ·shuffleInt32x1to16bitsNEON+144+0(SB)/4, $0x80800100  // value 0: bits 0-9, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+144+4(SB)/4, $0x80800201  // value 1: bits 10-19, bytes 1-2
DATA ·shuffleInt32x1to16bitsNEON+144+8(SB)/4, $0x80800302  // value 2: bits 20-29, bytes 2-3
DATA ·shuffleInt32x1to16bitsNEON+144+12(SB)/4, $0x80800403 // value 3: bits 30-39, bytes 3-4

// 11 bits => 32 bits (4 values from 44 bits = 5.5 bytes)
DATA ·shuffleInt32x1to16bitsNEON+160+0(SB)/4, $0x80800100  // value 0: bits 0-10, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+160+4(SB)/4, $0x80800201  // value 1: bits 11-21, bytes 1-2
DATA ·shuffleInt32x1to16bitsNEON+160+8(SB)/4, $0x80040302  // value 2: bits 22-32, bytes 2-4
DATA ·shuffleInt32x1to16bitsNEON+160+12(SB)/4, $0x80800504 // value 3: bits 33-43, bytes 4-5

// 12 bits => 32 bits (4 values from 48 bits = 6 bytes)
DATA ·shuffleInt32x1to16bitsNEON+176+0(SB)/4, $0x80800100  // value 0: bits 0-11, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+176+4(SB)/4, $0x80800201  // value 1: bits 12-23, bytes 1-2
DATA ·shuffleInt32x1to16bitsNEON+176+8(SB)/4, $0x80800403  // value 2: bits 24-35, bytes 3-4
DATA ·shuffleInt32x1to16bitsNEON+176+12(SB)/4, $0x80800504 // value 3: bits 36-47, bytes 4-5

// 13 bits => 32 bits (4 values from 52 bits = 6.5 bytes)
DATA ·shuffleInt32x1to16bitsNEON+192+0(SB)/4, $0x80800100  // value 0: bits 0-12, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+192+4(SB)/4, $0x80030201  // value 1: bits 13-25, bytes 1-3
DATA ·shuffleInt32x1to16bitsNEON+192+8(SB)/4, $0x80800403  // value 2: bits 26-38, bytes 3-4
DATA ·shuffleInt32x1to16bitsNEON+192+12(SB)/4, $0x80060504 // value 3: bits 39-51, bytes 4-6

// 14 bits => 32 bits (4 values from 56 bits = 7 bytes)
DATA ·shuffleInt32x1to16bitsNEON+208+0(SB)/4, $0x80800100  // value 0: bits 0-13, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+208+4(SB)/4, $0x80030201  // value 1: bits 14-27, bytes 1-3
DATA ·shuffleInt32x1to16bitsNEON+208+8(SB)/4, $0x80050403  // value 2: bits 28-41, bytes 3-5
DATA ·shuffleInt32x1to16bitsNEON+208+12(SB)/4, $0x80800605 // value 3: bits 42-55, bytes 5-6

// 15 bits => 32 bits (4 values from 60 bits = 7.5 bytes)
DATA ·shuffleInt32x1to16bitsNEON+224+0(SB)/4, $0x80800100  // value 0: bits 0-14, bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+224+4(SB)/4, $0x80030201  // value 1: bits 15-29, bytes 1-3
DATA ·shuffleInt32x1to16bitsNEON+224+8(SB)/4, $0x80050403  // value 2: bits 30-44, bytes 3-5
DATA ·shuffleInt32x1to16bitsNEON+224+12(SB)/4, $0x80070605 // value 3: bits 45-59, bytes 5-7

// 16 bits => 32 bits (4 values from 64 bits = 8 bytes)
DATA ·shuffleInt32x1to16bitsNEON+240+0(SB)/4, $0x80800100  // value 0: bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+240+4(SB)/4, $0x80800302  // value 1: bytes 2-3
DATA ·shuffleInt32x1to16bitsNEON+240+8(SB)/4, $0x80800504  // value 2: bytes 4-5
DATA ·shuffleInt32x1to16bitsNEON+240+12(SB)/4, $0x80800706 // value 3: bytes 6-7
| 63 | + |
// Per-lane shift amounts for the NEON USHL instruction.
//
// USHL treats each lane's shift count as signed: positive shifts left,
// negative shifts right. Right shifts are therefore stored as 32-bit two's
// complement negatives (0xFFFFFFFF = -1 ... 0xFFFFFFF9 = -7).
//
// Every bit width owns one 16-byte row of four int32 counts stored at:
// offset = 16 * (bitWidth - 1)
//
// Formula: shift[i] = -((i * bitWidth) % 8)
//
// This is the bit offset of value i inside the first byte that contains it, so
// the rows are only meaningful when the lane was loaded starting at that byte.
// NOTE(review): the kernel that reads this table is not in this file; confirm
// it loads lanes that way.
//
GLOBL ·shiftRightInt32NEON(SB), RODATA|NOPTR, $256

// 1 bit: shifts are 0, -1, -2, -3
DATA ·shiftRightInt32NEON+0+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+0+4(SB)/4, $0xFFFFFFFF  // value 1: shift right by 1
DATA ·shiftRightInt32NEON+0+8(SB)/4, $0xFFFFFFFE  // value 2: shift right by 2
DATA ·shiftRightInt32NEON+0+12(SB)/4, $0xFFFFFFFD // value 3: shift right by 3

// 2 bits: shifts are 0, -2, -4, -6
DATA ·shiftRightInt32NEON+16+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+16+4(SB)/4, $0xFFFFFFFE  // value 1: shift right by 2
DATA ·shiftRightInt32NEON+16+8(SB)/4, $0xFFFFFFFC  // value 2: shift right by 4
DATA ·shiftRightInt32NEON+16+12(SB)/4, $0xFFFFFFFA // value 3: shift right by 6

// 3 bits: shifts are 0, -3, -6, -1 (wraps at 8)
DATA ·shiftRightInt32NEON+32+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+32+4(SB)/4, $0xFFFFFFFD  // value 1: shift right by 3
DATA ·shiftRightInt32NEON+32+8(SB)/4, $0xFFFFFFFA  // value 2: shift right by 6
DATA ·shiftRightInt32NEON+32+12(SB)/4, $0xFFFFFFFF // value 3: shift right by 1

// 4 bits: shifts are 0, -4, 0, -4 (wraps at 8)
DATA ·shiftRightInt32NEON+48+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+48+4(SB)/4, $0xFFFFFFFC  // value 1: shift right by 4
DATA ·shiftRightInt32NEON+48+8(SB)/4, $0           // value 2: shift right by 0
DATA ·shiftRightInt32NEON+48+12(SB)/4, $0xFFFFFFFC // value 3: shift right by 4

// 5 bits: shifts are 0, -5, -2, -7
DATA ·shiftRightInt32NEON+64+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+64+4(SB)/4, $0xFFFFFFFB  // value 1: shift right by 5
DATA ·shiftRightInt32NEON+64+8(SB)/4, $0xFFFFFFFE  // value 2: shift right by 2
DATA ·shiftRightInt32NEON+64+12(SB)/4, $0xFFFFFFF9 // value 3: shift right by 7

// 6 bits: shifts are 0, -6, -4, -2
DATA ·shiftRightInt32NEON+80+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+80+4(SB)/4, $0xFFFFFFFA  // value 1: shift right by 6
DATA ·shiftRightInt32NEON+80+8(SB)/4, $0xFFFFFFFC  // value 2: shift right by 4
DATA ·shiftRightInt32NEON+80+12(SB)/4, $0xFFFFFFFE // value 3: shift right by 2

// 7 bits: shifts are 0, -7, -6, -5
DATA ·shiftRightInt32NEON+96+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+96+4(SB)/4, $0xFFFFFFF9  // value 1: shift right by 7
DATA ·shiftRightInt32NEON+96+8(SB)/4, $0xFFFFFFFA  // value 2: shift right by 6
DATA ·shiftRightInt32NEON+96+12(SB)/4, $0xFFFFFFFB // value 3: shift right by 5

// 8 bits: no shift needed
DATA ·shiftRightInt32NEON+112+0(SB)/4, $0  // value 0: shift right by 0
DATA ·shiftRightInt32NEON+112+4(SB)/4, $0  // value 1: shift right by 0
DATA ·shiftRightInt32NEON+112+8(SB)/4, $0  // value 2: shift right by 0
DATA ·shiftRightInt32NEON+112+12(SB)/4, $0 // value 3: shift right by 0

// 9 bits: shifts are 0, -1, -2, -3
DATA ·shiftRightInt32NEON+128+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+128+4(SB)/4, $0xFFFFFFFF  // value 1: shift right by 1
DATA ·shiftRightInt32NEON+128+8(SB)/4, $0xFFFFFFFE  // value 2: shift right by 2
DATA ·shiftRightInt32NEON+128+12(SB)/4, $0xFFFFFFFD // value 3: shift right by 3

// 10 bits: shifts are 0, -2, -4, -6
DATA ·shiftRightInt32NEON+144+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+144+4(SB)/4, $0xFFFFFFFE  // value 1: shift right by 2
DATA ·shiftRightInt32NEON+144+8(SB)/4, $0xFFFFFFFC  // value 2: shift right by 4
DATA ·shiftRightInt32NEON+144+12(SB)/4, $0xFFFFFFFA // value 3: shift right by 6

// 11 bits: shifts are 0, -3, -6, -1
DATA ·shiftRightInt32NEON+160+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+160+4(SB)/4, $0xFFFFFFFD  // value 1: shift right by 3
DATA ·shiftRightInt32NEON+160+8(SB)/4, $0xFFFFFFFA  // value 2: shift right by 6
DATA ·shiftRightInt32NEON+160+12(SB)/4, $0xFFFFFFFF // value 3: shift right by 1

// 12 bits: shifts are 0, -4, 0, -4
DATA ·shiftRightInt32NEON+176+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+176+4(SB)/4, $0xFFFFFFFC  // value 1: shift right by 4
DATA ·shiftRightInt32NEON+176+8(SB)/4, $0           // value 2: shift right by 0
DATA ·shiftRightInt32NEON+176+12(SB)/4, $0xFFFFFFFC // value 3: shift right by 4

// 13 bits: shifts are 0, -5, -2, -7
DATA ·shiftRightInt32NEON+192+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+192+4(SB)/4, $0xFFFFFFFB  // value 1: shift right by 5
DATA ·shiftRightInt32NEON+192+8(SB)/4, $0xFFFFFFFE  // value 2: shift right by 2
DATA ·shiftRightInt32NEON+192+12(SB)/4, $0xFFFFFFF9 // value 3: shift right by 7

// 14 bits: shifts are 0, -6, -4, -2
DATA ·shiftRightInt32NEON+208+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+208+4(SB)/4, $0xFFFFFFFA  // value 1: shift right by 6
DATA ·shiftRightInt32NEON+208+8(SB)/4, $0xFFFFFFFC  // value 2: shift right by 4
DATA ·shiftRightInt32NEON+208+12(SB)/4, $0xFFFFFFFE // value 3: shift right by 2

// 15 bits: shifts are 0, -7, -6, -5
DATA ·shiftRightInt32NEON+224+0(SB)/4, $0           // value 0: shift right by 0
DATA ·shiftRightInt32NEON+224+4(SB)/4, $0xFFFFFFF9  // value 1: shift right by 7
DATA ·shiftRightInt32NEON+224+8(SB)/4, $0xFFFFFFFA  // value 2: shift right by 6
DATA ·shiftRightInt32NEON+224+12(SB)/4, $0xFFFFFFFB // value 3: shift right by 5

// 16 bits: no shift needed
DATA ·shiftRightInt32NEON+240+0(SB)/4, $0  // value 0: shift right by 0
DATA ·shiftRightInt32NEON+240+4(SB)/4, $0  // value 1: shift right by 0
DATA ·shiftRightInt32NEON+240+8(SB)/4, $0  // value 2: shift right by 0
DATA ·shiftRightInt32NEON+240+12(SB)/4, $0 // value 3: shift right by 0
0 commit comments