Skip to content

Commit c17926b

Browse files
implement 64 bit unpacking optimizations for arm64
Signed-off-by: Achille Roussel <achille.roussel@gmail.com>
1 parent dc7abba commit c17926b

File tree

4 files changed

+648
-11
lines changed

4 files changed

+648
-11
lines changed

unpack_int32_arm64.go

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,13 @@ import (
1010
func unpackInt32Default(dst []int32, src []byte, bitWidth uint)
1111

1212
//go:noescape
13-
func unpackInt32x1to16bitsNEON(dst []int32, src []byte, bitWidth uint)
13+
func unpackInt32x1to16bitsARM64(dst []int32, src []byte, bitWidth uint)
1414

1515
func unpackInt32(dst []int32, src []byte, bitWidth uint) {
16-
// For ARM64, we use simplified NEON for byte-aligned cases (8, 16 bits)
17-
// and scalar for other cases
16+
// For ARM64, we use optimized scalar operations for small bit widths
1817
switch {
1918
case bitWidth <= 16:
20-
unpackInt32x1to16bitsNEON(dst, src, bitWidth)
19+
unpackInt32x1to16bitsARM64(dst, src, bitWidth)
2120
case bitWidth == 32:
2221
copy(dst, unsafecast.Slice[int32](src))
2322
default:

unpack_int32_arm64.s

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -60,12 +60,11 @@ test:
6060
BNE loop
6161
RET
6262

63-
// unpackInt32x1to16bitsNEON implements NEON-optimized unpacking for bit widths 1-16
64-
// This simplified version handles byte-aligned cases (8, 16 bits) with NEON,
65-
// and falls back to scalar for complex cases.
63+
// unpackInt32x1to16bitsARM64 implements optimized unpacking for bit widths 1-16
64+
// Uses optimized scalar ARM64 operations with batched processing
6665
//
67-
// func unpackInt32x1to16bitsNEON(dst []int32, src []byte, bitWidth uint)
68-
TEXT ·unpackInt32x1to16bitsNEON(SB), NOSPLIT, $0-56
66+
// func unpackInt32x1to16bitsARM64(dst []int32, src []byte, bitWidth uint)
67+
TEXT ·unpackInt32x1to16bitsARM64(SB), NOSPLIT, $0-56
6968
MOVD dst_base+0(FP), R0 // R0 = dst pointer
7069
MOVD dst_len+8(FP), R1 // R1 = dst length
7170
MOVD src_base+24(FP), R2 // R2 = src pointer

unpack_int64_arm64.go

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,14 @@ import (
99
//go:noescape
1010
func unpackInt64Default(dst []int64, src []byte, bitWidth uint)
1111

12+
//go:noescape
13+
func unpackInt64x1to32bitsARM64(dst []int64, src []byte, bitWidth uint)
14+
1215
func unpackInt64(dst []int64, src []byte, bitWidth uint) {
13-
// For ARM64, we'll use NEON instructions
14-
// TODO: Implement NEON optimizations - using default for now
16+
// For ARM64, use optimized scalar operations for common bit widths
1517
switch {
18+
case bitWidth <= 32:
19+
unpackInt64x1to32bitsARM64(dst, src, bitWidth)
1620
case bitWidth == 64:
1721
copy(dst, unsafecast.Slice[int64](src))
1822
default:

0 commit comments

Comments
 (0)