Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ARM's assembly code for huffman decoding #4204

Closed
wants to merge 11 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Enable arm asm code on apple
Nicoshev committed Nov 26, 2024
commit f4a1f49e49f4614c3f9e3b359e289ce7014289b4
2 changes: 1 addition & 1 deletion lib/common/portability_macros.h
Original file line number Diff line number Diff line change
@@ -139,7 +139,7 @@
/* For now only enable ARM64 assembly when ZSTD_EXPERIMENTAL_ARM64
+ * is defined. This ensures that it is only enabled for your tests.
+ */
# if defined(__aarch64__) && defined(ZSTD_EXPERIMENTAL_ARM64) && !defined(__APPLE__)
# if defined(__aarch64__)
# define ZSTD_ENABLE_ASM_ARM64 1
# else
# define ZSTD_ENABLE_ASM_ARM64 0
4 changes: 2 additions & 2 deletions lib/decompress/huf_decompress.c
Original file line number Diff line number Diff line change
@@ -923,7 +923,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize,
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
#elif ZSTD_ENABLE_ASM_ARM64
loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop
loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop;
#endif
}

@@ -1754,7 +1754,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize,
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
#elif ZSTD_ENABLE_ASM_ARM64
loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop
loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop;
#endif
}

85 changes: 44 additions & 41 deletions lib/decompress/huf_decompress_arm64.S
Original file line number Diff line number Diff line change
@@ -19,6 +19,11 @@

#if ZSTD_ENABLE_ASM_ARM64

#if !(__APPLE__)
/* on Apple platforms '%%' is used as the statement separator instead of ';' */
#define %% ;
#endif

/* Calling convention:
*
* x0 contains the first argument: HUF_DecompressAsmArgs*.
@@ -75,16 +80,16 @@ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_arm64_loop)

/* Calls X(N) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM(X) \
X(0); \
X(1); \
X(2); \
X(0) %% \
X(1) %% \
X(2) %% \
X(3)

/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
X(0, idx); \
X(1, idx); \
X(2, idx); \
X(0, idx) %% \
X(1, idx) %% \
X(2, idx) %% \
X(3, idx)

/* Define both _HUF_* & HUF_* symbols because MacOS
@@ -164,30 +169,28 @@ HUF_decompress4X1_usingDTable_internal_fast_arm64_loop:
/* Reads top 11 bits from bits[n]
* Loads dt[bits[n]] into var[n]
*/
#define GET_NEXT_DELT(n) \
lsr var##n, bits##n, #53; \
ldrh vard##n, [dtable, var##n, lsl #1];
#define GET_NEXT_DELT(n) \
lsr var##n, bits##n, #53 %% \
ldrh vard##n, [dtable, var##n, lsl #1] %%

/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
* Moves var[n] to %rax
* Moves var[n] to x21
* bits[n] <<= var[n] & 63
* op[n][idx] = %rax >> 8
* %ah is a way to access bits [8, 16) of %rax
* op[n][idx] = x21 >> 8
*/
#define DECODE_FROM_DELT(n, idx) \
lsr x21, var##n, #8; \
lsl bits##n, bits##n, var##n; \
strb w21, [op##n, ##idx];
#define DECODE_FROM_DELT(n, idx) \
lsr x21, var##n, #8 %% \
lsl bits##n, bits##n, var##n %% \
strb w21, [op##n, ##idx] %%

/* Assumes GET_NEXT_DELT has been called.
* Calls DECODE_FROM_DELT then GET_NEXT_DELT
*/
#define DECODE_AND_GET_NEXT(n, idx) \
DECODE_FROM_DELT(n, idx); \
DECODE_FROM_DELT(n, idx) %% \
GET_NEXT_DELT(n) \

/* // ctz & nbBytes is stored in bits[n]
* // nbBits is stored in x22
* ctz = CTZ[bits[n]]
* nbBits = ctz & 7
* nbBytes = ctz >> 3
@@ -198,14 +201,14 @@ HUF_decompress4X1_usingDTable_internal_fast_arm64_loop:
* bits[n] <<= nbBits
*/
#define RELOAD_BITS(n) \
rbit bits##n, bits##n; \
clz bits##n, bits##n; \
sub ip##n, ip##n, bits##n, lsr #3; \
and x22, bits##n, #7; \
add op##n, op##n, #5; \
ldr bits##n, [ip##n]; \
orr bits##n, bits##n, #1; \
lsl bits##n, bits##n, x22;
rbit bits##n, bits##n %% \
clz bits##n, bits##n %% \
sub ip##n, ip##n, bits##n, lsr #3 %% \
and x22, bits##n, #7 %% \
add op##n, op##n, #5 %% \
ldr bits##n, [ip##n] %% \
orr bits##n, bits##n, #1 %% \
lsl bits##n, bits##n, x22 %%

/* Call GET_NEXT_DELT for each stream */
FOR_EACH_STREAM(GET_NEXT_DELT)
@@ -350,23 +353,23 @@ HUF_decompress4X2_usingDTable_internal_fast_arm64_loop:
add x1, x20, x20, lsl #2
add olimit, op3, x1

#define DECODE(n, idx) \
lsr x21, bits##n, #53; \
ldr w21, [dtable, x21, lsl #2]; \
strh w21, [op##n]; \
lsr w20, w21, #16; \
add op##n, op##n, x21, lsr #24; \
and x20, x20, #255; \
lsl bits##n, bits##n, x20;
#define DECODE(n, idx) \
lsr x21, bits##n, #53 %% \
ldr w21, [dtable, x21, lsl #2] %% \
strh w21, [op##n] %% \
lsr w20, w21, #16 %% \
add op##n, op##n, x21, lsr #24 %% \
and x20, x20, #255 %% \
lsl bits##n, bits##n, x20 %%

#define RELOAD_BITS(n) \
rbit bits##n, bits##n; \
clz bits##n, bits##n; \
sub ip##n, ip##n, bits##n, lsr #3; \
and x1, bits##n, #7; \
ldr bits##n, [ip##n]; \
orr bits##n, bits##n, #1; \
lsl bits##n, bits##n, x1;
rbit bits##n, bits##n %% \
clz bits##n, bits##n %% \
sub ip##n, ip##n, bits##n, lsr #3 %% \
and x1, bits##n, #7 %% \
ldr bits##n, [ip##n] %% \
orr bits##n, bits##n, #1 %% \
lsl bits##n, bits##n, x1 %%

.p2align 6