Skip to content

Commit dc7abba

Browse files
authored
Merge pull request #2 from parquet-go/unpack-arm64
implement bit unpacking optimizations for arm64
2 parents 5d37665 + f304527 commit dc7abba

File tree

7 files changed

+940
-2
lines changed

7 files changed

+940
-2
lines changed

masks_int32_arm64.s

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
//go:build !purego
2+
3+
#include "textflag.h"
4+
5+
// -----------------------------------------------------------------------------
6+
// NEON Shuffle masks and shift tables for unpacking int32 values
7+
//
8+
// NEON uses 128-bit registers (vs AVX2's 256-bit), so we process 4 int32
9+
// values per iteration instead of 8.
10+
//
11+
// TBL instruction: byte shuffle within 16 bytes using indices 0-15
12+
// USHL instruction: variable shift per lane (negative = right shift)
13+
// -----------------------------------------------------------------------------
14+
15+
// Shuffle masks for unpacking values from bit widths 1 to 16.
16+
//
17+
// For NEON, we process 4 int32 values at a time. Each mask is 16 bytes.
18+
// The masks are indexed by: offset = 16 * (bitWidth - 1)
19+
//
20+
// Any out-of-range index (>= 16, e.g. 0x80 or 0xFF) makes TBL produce a zero byte
21+
//
22+
GLOBL ·shuffleInt32x1to16bitsNEON(SB), RODATA|NOPTR, $256

// Layout: 16 slots of 16 bytes, slot offset = 16 * (bitWidth - 1). Each slot
// holds four 32-bit lanes, and every byte of a lane is a TBL index into the
// source bytes. The low byte of each literal below is the index for the least
// significant byte of the lane (see the 16-bit entries: 0x80800100 selects
// source bytes 0 and 1). Index 0x80 is out of range for a single-register TBL,
// so that destination byte becomes zero.
//
// These masks only gather the bytes that contain each value; the per-lane
// right shift comes from the shift table, and any masking of high bits is
// left to the code that consumes the tables (not shown in this file).

// 1 bit => 32 bits (4 values from 4 bits, all inside byte 0)
DATA ·shuffleInt32x1to16bitsNEON+0+0(SB)/4, $0x80808000 // value 0: byte 0, bit 0
DATA ·shuffleInt32x1to16bitsNEON+0+4(SB)/4, $0x80808000 // value 1: byte 0, bit 1
DATA ·shuffleInt32x1to16bitsNEON+0+8(SB)/4, $0x80808000 // value 2: byte 0, bit 2
DATA ·shuffleInt32x1to16bitsNEON+0+12(SB)/4, $0x80808000 // value 3: byte 0, bit 3

// 2 bits => 32 bits (4 values from 8 bits = 1 byte)
DATA ·shuffleInt32x1to16bitsNEON+16+0(SB)/4, $0x80808000 // value 0: byte 0, bits 0-1
DATA ·shuffleInt32x1to16bitsNEON+16+4(SB)/4, $0x80808000 // value 1: byte 0, bits 2-3
DATA ·shuffleInt32x1to16bitsNEON+16+8(SB)/4, $0x80808000 // value 2: byte 0, bits 4-5
DATA ·shuffleInt32x1to16bitsNEON+16+12(SB)/4, $0x80808000 // value 3: byte 0, bits 6-7

// 3 bits => 32 bits (4 values from 12 bits = 1.5 bytes)
// Value 2 straddles the byte boundary (stream bits 6-8), so its lane needs
// both byte 0 and byte 1.
DATA ·shuffleInt32x1to16bitsNEON+32+0(SB)/4, $0x80808000 // value 0: byte 0, bits 0-2
DATA ·shuffleInt32x1to16bitsNEON+32+4(SB)/4, $0x80808000 // value 1: byte 0, bits 3-5
DATA ·shuffleInt32x1to16bitsNEON+32+8(SB)/4, $0x80800100 // value 2: bytes 0-1, bits 6-7 of byte 0 and bit 0 of byte 1
DATA ·shuffleInt32x1to16bitsNEON+32+12(SB)/4, $0x80808001 // value 3: byte 1, bits 1-3

// 4 bits => 32 bits (4 values from 16 bits = 2 bytes)
DATA ·shuffleInt32x1to16bitsNEON+48+0(SB)/4, $0x80808000 // value 0: byte 0, bits 0-3
DATA ·shuffleInt32x1to16bitsNEON+48+4(SB)/4, $0x80808000 // value 1: byte 0, bits 4-7
DATA ·shuffleInt32x1to16bitsNEON+48+8(SB)/4, $0x80808001 // value 2: byte 1, bits 0-3
DATA ·shuffleInt32x1to16bitsNEON+48+12(SB)/4, $0x80808001 // value 3: byte 1, bits 4-7

// 5-7 and 9-15 bits: not populated yet; to be filled in following the same
// pattern. Slots without DATA directives stay zero-initialized, and a zero
// index selects source byte 0 (it does NOT produce zero), so these slots must
// not be used until real masks are added.

// 8 bits => 32 bits (4 values from 32 bits = 4 bytes)
DATA ·shuffleInt32x1to16bitsNEON+112+0(SB)/4, $0x80808000 // value 0: byte 0
DATA ·shuffleInt32x1to16bitsNEON+112+4(SB)/4, $0x80808001 // value 1: byte 1
DATA ·shuffleInt32x1to16bitsNEON+112+8(SB)/4, $0x80808002 // value 2: byte 2
DATA ·shuffleInt32x1to16bitsNEON+112+12(SB)/4, $0x80808003 // value 3: byte 3

// 16 bits => 32 bits (4 values from 64 bits = 8 bytes)
DATA ·shuffleInt32x1to16bitsNEON+240+0(SB)/4, $0x80800100 // value 0: bytes 0-1
DATA ·shuffleInt32x1to16bitsNEON+240+4(SB)/4, $0x80800302 // value 1: bytes 2-3
DATA ·shuffleInt32x1to16bitsNEON+240+8(SB)/4, $0x80800504 // value 2: bytes 4-5
DATA ·shuffleInt32x1to16bitsNEON+240+12(SB)/4, $0x80800706 // value 3: bytes 6-7
63+
64+
// Shift amounts for NEON USHL instruction
65+
// USHL uses signed shift amounts: negative = right shift, positive = left shift
66+
// Each entry contains 4 int32 shift amounts for 4 values
67+
//
68+
// Formula: shift[i] = -((i * bitWidth) % 8)   (negated so USHL shifts right)
69+
//
70+
GLOBL ·shiftRightInt32NEON(SB), RODATA|NOPTR, $256

// One 16-byte slot per bit width at offset 16 * (bitWidth - 1); each slot is
// four 32-bit lanes, one shift count per unpacked value. Counts are written as
// signed decimals: a negative count is a right shift for USHL, and lane i
// holds -((i * bitWidth) % 8), i.e. the bit offset of value i inside the
// first byte that contains it.

// 1 bit: bit offsets 0, 1, 2, 3
DATA ·shiftRightInt32NEON+0+0(SB)/4, $0 // value 0: no shift
DATA ·shiftRightInt32NEON+0+4(SB)/4, $-1 // value 1: right by 1
DATA ·shiftRightInt32NEON+0+8(SB)/4, $-2 // value 2: right by 2
DATA ·shiftRightInt32NEON+0+12(SB)/4, $-3 // value 3: right by 3

// 2 bits: bit offsets 0, 2, 4, 6
DATA ·shiftRightInt32NEON+16+0(SB)/4, $0 // value 0: no shift
DATA ·shiftRightInt32NEON+16+4(SB)/4, $-2 // value 1: right by 2
DATA ·shiftRightInt32NEON+16+8(SB)/4, $-4 // value 2: right by 4
DATA ·shiftRightInt32NEON+16+12(SB)/4, $-6 // value 3: right by 6

// 3 bits: bit offsets 0, 3, 6, then 9 wraps to 1 within the next byte
DATA ·shiftRightInt32NEON+32+0(SB)/4, $0 // value 0: no shift
DATA ·shiftRightInt32NEON+32+4(SB)/4, $-3 // value 1: right by 3
DATA ·shiftRightInt32NEON+32+8(SB)/4, $-6 // value 2: right by 6
DATA ·shiftRightInt32NEON+32+12(SB)/4, $-1 // value 3: right by 1 (9 mod 8)

// 4 bits: bit offsets 0, 4, then 8 and 12 wrap to 0 and 4
DATA ·shiftRightInt32NEON+48+0(SB)/4, $0 // value 0: no shift
DATA ·shiftRightInt32NEON+48+4(SB)/4, $-4 // value 1: right by 4
DATA ·shiftRightInt32NEON+48+8(SB)/4, $0 // value 2: no shift (8 mod 8)
DATA ·shiftRightInt32NEON+48+12(SB)/4, $-4 // value 3: right by 4 (12 mod 8)

// 8 bits: every value starts on a byte boundary
DATA ·shiftRightInt32NEON+112+0(SB)/4, $0 // value 0: no shift
DATA ·shiftRightInt32NEON+112+4(SB)/4, $0 // value 1: no shift
DATA ·shiftRightInt32NEON+112+8(SB)/4, $0 // value 2: no shift
DATA ·shiftRightInt32NEON+112+12(SB)/4, $0 // value 3: no shift

// 16 bits: every value starts on a byte boundary
DATA ·shiftRightInt32NEON+240+0(SB)/4, $0 // value 0: no shift
DATA ·shiftRightInt32NEON+240+4(SB)/4, $0 // value 1: no shift
DATA ·shiftRightInt32NEON+240+8(SB)/4, $0 // value 2: no shift
DATA ·shiftRightInt32NEON+240+12(SB)/4, $0 // value 3: no shift

unpack_int32_arm64.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
//go:build !purego
2+
3+
package bitpack
4+
5+
import (
6+
"github.com/parquet-go/bitpack/unsafecast"
7+
)
8+
9+
//go:noescape
10+
func unpackInt32Default(dst []int32, src []byte, bitWidth uint)
11+
12+
//go:noescape
13+
func unpackInt32x1to16bitsNEON(dst []int32, src []byte, bitWidth uint)
14+
15+
func unpackInt32(dst []int32, src []byte, bitWidth uint) {
16+
// For ARM64, we use simplified NEON for byte-aligned cases (8, 16 bits)
17+
// and scalar for other cases
18+
switch {
19+
case bitWidth <= 16:
20+
unpackInt32x1to16bitsNEON(dst, src, bitWidth)
21+
case bitWidth == 32:
22+
copy(dst, unsafecast.Slice[int32](src))
23+
default:
24+
unpackInt32Default(dst, src, bitWidth)
25+
}
26+
}

0 commit comments

Comments
 (0)