Skip to content

Commit ee36b77

Browse files
committed
crypto/keccak: fuse xor into asm keccak permutations
1 parent a8c906d commit ee36b77

File tree

7 files changed

+129
-183
lines changed

7 files changed

+129
-183
lines changed

crypto/keccak/keccaf_arm64.s

Lines changed: 28 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -6,148 +6,18 @@
66

77
#include "textflag.h"
88

9-
// func keccakF1600(a *[200]byte)
10-
TEXT ·keccakF1600(SB), $200-8
9+
// func keccakF1600Sha3(a *[200]byte, buf *byte)
10+
// When buf != nil, XORs one rate block (17 lanes = 136 bytes) of buf into state before permuting.
11+
// When buf == nil, just permutes.
12+
TEXT ·keccakF1600Sha3(SB), $200-16
1113
MOVD a+0(FP), R0
14+
MOVD buf+8(FP), R3
1215
MOVD $round_consts<>(SB), R1
1316
MOVD $24, R2 // counter for loop
1417

15-
VLD1.P 16(R0), [V0.D1, V1.D1]
16-
VLD1.P 16(R0), [V2.D1, V3.D1]
17-
VLD1.P 16(R0), [V4.D1, V5.D1]
18-
VLD1.P 16(R0), [V6.D1, V7.D1]
19-
VLD1.P 16(R0), [V8.D1, V9.D1]
20-
VLD1.P 16(R0), [V10.D1, V11.D1]
21-
VLD1.P 16(R0), [V12.D1, V13.D1]
22-
VLD1.P 16(R0), [V14.D1, V15.D1]
23-
VLD1.P 16(R0), [V16.D1, V17.D1]
24-
VLD1.P 16(R0), [V18.D1, V19.D1]
25-
VLD1.P 16(R0), [V20.D1, V21.D1]
26-
VLD1.P 16(R0), [V22.D1, V23.D1]
27-
VLD1 (R0), [V24.D1]
28-
29-
SUB $192, R0, R0
30-
31-
loop:
32-
// theta
33-
VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
34-
VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
35-
VEOR3 V22.B16, V17.B16, V12.B16, V27.B16
36-
VEOR3 V23.B16, V18.B16, V13.B16, V28.B16
37-
VEOR3 V24.B16, V19.B16, V14.B16, V29.B16
38-
VEOR3 V25.B16, V5.B16, V0.B16, V25.B16
39-
VEOR3 V26.B16, V6.B16, V1.B16, V26.B16
40-
VEOR3 V27.B16, V7.B16, V2.B16, V27.B16
41-
VEOR3 V28.B16, V8.B16, V3.B16, V28.B16
42-
VEOR3 V29.B16, V9.B16, V4.B16, V29.B16
43-
44-
VRAX1 V27.D2, V25.D2, V30.D2
45-
VRAX1 V28.D2, V26.D2, V31.D2
46-
VRAX1 V29.D2, V27.D2, V27.D2
47-
VRAX1 V25.D2, V28.D2, V28.D2
48-
VRAX1 V26.D2, V29.D2, V29.D2
49-
50-
// theta and rho and Pi
51-
VEOR V29.B16, V0.B16, V0.B16
52-
53-
VXAR $63, V30.D2, V1.D2, V25.D2
54-
55-
VXAR $20, V30.D2, V6.D2, V1.D2
56-
VXAR $44, V28.D2, V9.D2, V6.D2
57-
VXAR $3, V31.D2, V22.D2, V9.D2
58-
VXAR $25, V28.D2, V14.D2, V22.D2
59-
VXAR $46, V29.D2, V20.D2, V14.D2
60-
61-
VXAR $2, V31.D2, V2.D2, V26.D2
62-
63-
VXAR $21, V31.D2, V12.D2, V2.D2
64-
VXAR $39, V27.D2, V13.D2, V12.D2
65-
VXAR $56, V28.D2, V19.D2, V13.D2
66-
VXAR $8, V27.D2, V23.D2, V19.D2
67-
VXAR $23, V29.D2, V15.D2, V23.D2
68-
69-
VXAR $37, V28.D2, V4.D2, V15.D2
70-
71-
VXAR $50, V28.D2, V24.D2, V28.D2
72-
VXAR $62, V30.D2, V21.D2, V24.D2
73-
VXAR $9, V27.D2, V8.D2, V8.D2
74-
VXAR $19, V30.D2, V16.D2, V4.D2
75-
VXAR $28, V29.D2, V5.D2, V16.D2
76-
77-
VXAR $36, V27.D2, V3.D2, V5.D2
18+
CBZ R3, load_state
7819

79-
VXAR $43, V27.D2, V18.D2, V27.D2
80-
VXAR $49, V31.D2, V17.D2, V3.D2
81-
VXAR $54, V30.D2, V11.D2, V30.D2
82-
VXAR $58, V31.D2, V7.D2, V31.D2
83-
VXAR $61, V29.D2, V10.D2, V29.D2
84-
85-
// chi and iota
86-
VBCAX V8.B16, V22.B16, V26.B16, V20.B16
87-
VBCAX V22.B16, V23.B16, V8.B16, V21.B16
88-
VBCAX V23.B16, V24.B16, V22.B16, V22.B16
89-
VBCAX V24.B16, V26.B16, V23.B16, V23.B16
90-
VBCAX V26.B16, V8.B16, V24.B16, V24.B16
91-
92-
VLD1R.P 8(R1), [V26.D2]
93-
94-
VBCAX V3.B16, V19.B16, V30.B16, V17.B16
95-
VBCAX V19.B16, V15.B16, V3.B16, V18.B16
96-
VBCAX V15.B16, V16.B16, V19.B16, V19.B16
97-
VBCAX V16.B16, V30.B16, V15.B16, V15.B16
98-
VBCAX V30.B16, V3.B16, V16.B16, V16.B16
99-
100-
VBCAX V31.B16, V12.B16, V25.B16, V10.B16
101-
VBCAX V12.B16, V13.B16, V31.B16, V11.B16
102-
VBCAX V13.B16, V14.B16, V12.B16, V12.B16
103-
VBCAX V14.B16, V25.B16, V13.B16, V13.B16
104-
VBCAX V25.B16, V31.B16, V14.B16, V14.B16
105-
106-
VBCAX V4.B16, V9.B16, V29.B16, V7.B16
107-
VBCAX V9.B16, V5.B16, V4.B16, V8.B16
108-
VBCAX V5.B16, V6.B16, V9.B16, V9.B16
109-
VBCAX V6.B16, V29.B16, V5.B16, V5.B16
110-
VBCAX V29.B16, V4.B16, V6.B16, V6.B16
111-
112-
VBCAX V28.B16, V0.B16, V27.B16, V3.B16
113-
VBCAX V0.B16, V1.B16, V28.B16, V4.B16
114-
115-
VBCAX V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part)
116-
117-
VBCAX V2.B16, V27.B16, V1.B16, V1.B16
118-
VBCAX V27.B16, V28.B16, V2.B16, V2.B16
119-
120-
VEOR V26.B16, V0.B16, V0.B16 // iota
121-
122-
SUB $1, R2, R2
123-
CBNZ R2, loop
124-
125-
VST1.P [V0.D1, V1.D1], 16(R0)
126-
VST1.P [V2.D1, V3.D1], 16(R0)
127-
VST1.P [V4.D1, V5.D1], 16(R0)
128-
VST1.P [V6.D1, V7.D1], 16(R0)
129-
VST1.P [V8.D1, V9.D1], 16(R0)
130-
VST1.P [V10.D1, V11.D1], 16(R0)
131-
VST1.P [V12.D1, V13.D1], 16(R0)
132-
VST1.P [V14.D1, V15.D1], 16(R0)
133-
VST1.P [V16.D1, V17.D1], 16(R0)
134-
VST1.P [V18.D1, V19.D1], 16(R0)
135-
VST1.P [V20.D1, V21.D1], 16(R0)
136-
VST1.P [V22.D1, V23.D1], 16(R0)
137-
VST1 [V24.D1], (R0)
138-
139-
RET
140-
141-
// func xorAndPermute(state *[200]byte, buf *byte)
142-
// Loads state, XORs a full rate (136 bytes = 17 lanes) of data, then runs keccakF1600.
143-
// Eliminates one state store+load cycle per block vs separate xorIn + keccakF1600.
144-
TEXT ·xorAndPermute(SB), $200-16
145-
MOVD state+0(FP), R0
146-
MOVD buf+8(FP), R3
147-
MOVD $round_consts<>(SB), R1
148-
MOVD $24, R2
149-
150-
// Load state and XOR data for lanes 0-15 (8 pairs × 16 bytes = 128 bytes)
20+
// XOR path: load state and XOR with buf (17 lanes = 136 bytes)
15121
VLD1.P 16(R0), [V0.D1, V1.D1]
15222
VLD1.P 16(R3), [V25.D1, V26.D1]
15323
VEOR V25.B16, V0.B16, V0.B16
@@ -188,7 +58,7 @@ TEXT ·xorAndPermute(SB), $200-16
18858
VEOR V25.B16, V14.B16, V14.B16
18959
VEOR V26.B16, V15.B16, V15.B16
19060

191-
// Lane 16-17: XOR only lane 16 (last data lane, 8 bytes at data offset 128)
61+
// Lane 16: last data lane (8 bytes at buf offset 128)
19262
VLD1.P 16(R0), [V16.D1, V17.D1]
19363
VLD1 (R3), [V25.D1]
19464
VEOR V25.B16, V16.B16, V16.B16
@@ -200,8 +70,26 @@ TEXT ·xorAndPermute(SB), $200-16
20070
VLD1 (R0), [V24.D1]
20171

20272
SUB $192, R0, R0
73+
B rounds
74+
75+
load_state:
76+
VLD1.P 16(R0), [V0.D1, V1.D1]
77+
VLD1.P 16(R0), [V2.D1, V3.D1]
78+
VLD1.P 16(R0), [V4.D1, V5.D1]
79+
VLD1.P 16(R0), [V6.D1, V7.D1]
80+
VLD1.P 16(R0), [V8.D1, V9.D1]
81+
VLD1.P 16(R0), [V10.D1, V11.D1]
82+
VLD1.P 16(R0), [V12.D1, V13.D1]
83+
VLD1.P 16(R0), [V14.D1, V15.D1]
84+
VLD1.P 16(R0), [V16.D1, V17.D1]
85+
VLD1.P 16(R0), [V18.D1, V19.D1]
86+
VLD1.P 16(R0), [V20.D1, V21.D1]
87+
VLD1.P 16(R0), [V22.D1, V23.D1]
88+
VLD1 (R0), [V24.D1]
89+
90+
SUB $192, R0, R0
20391

204-
loop_xp:
92+
rounds:
20593
// theta
20694
VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
20795
VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
@@ -293,7 +181,7 @@ loop_xp:
293181
VEOR V26.B16, V0.B16, V0.B16 // iota
294182

295183
SUB $1, R2, R2
296-
CBNZ R2, loop_xp
184+
CBNZ R2, rounds
297185

298186
VST1.P [V0.D1, V1.D1], 16(R0)
299187
VST1.P [V2.D1, V3.D1], 16(R0)

crypto/keccak/keccak_arm64.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,16 @@ func init() {
1515
useASM = runtime.GOOS == "darwin" || runtime.GOOS == "ios" || cpu.ARM64.HasSHA3
1616
}
1717

18+
// keccakF1600Sha3 permutes state. When buf != nil, it first XORs rate bytes
19+
// of buf into state, saving one full memory pass.
20+
//
1821
//go:noescape
19-
func keccakF1600(a *[200]byte)
22+
func keccakF1600Sha3(a *[200]byte, buf *byte)
2023

21-
//go:noescape
22-
func xorAndPermute(state *[200]byte, buf *byte)
24+
func keccakF1600(a *[200]byte) {
25+
keccakF1600Sha3(a, nil)
26+
}
27+
28+
func xorAndPermute(state *[200]byte, buf *byte) {
29+
keccakF1600Sha3(state, buf)
30+
}

crypto/keccak/keccak_asm.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
package keccak
44

55
import (
6-
"unsafe"
6+
"encoding/binary"
77

88
"golang.org/x/crypto/sha3"
99
)
@@ -59,7 +59,11 @@ func (s *sponge) Write(p []byte) (int, error) {
5959

6060
// Sum256 finalizes and returns the 32-byte Keccak-256 digest.
6161
// Does not modify the sponge state.
62+
// Panics if called after Read.
6263
func (s *sponge) Sum256() [32]byte {
64+
if s.squeezing {
65+
panic("keccak: Sum after Read")
66+
}
6367
state := s.state
6468
xorIn(&state, s.buf[:s.absorbed])
6569
state[s.absorbed] ^= 0x01
@@ -217,14 +221,13 @@ func (h *Hasher) Read(out []byte) (int, error) {
217221
return h.sponge.Read(out)
218222
}
219223

224+
// xorIn XORs data into the first len(data) bytes of state: full 8-byte words via little-endian uint64 loads, then any remaining tail bytes one at a time.
220225
func xorIn(state *[200]byte, data []byte) {
221-
stateU64 := (*[25]uint64)(unsafe.Pointer(state))
222-
n := len(data) >> 3
223-
p := unsafe.Pointer(unsafe.SliceData(data))
224-
for i := range n {
225-
stateU64[i] ^= *(*uint64)(unsafe.Add(p, uintptr(i)<<3))
226+
for i := 0; i+8 <= len(data); i += 8 {
227+
v := binary.LittleEndian.Uint64(state[i:]) ^ binary.LittleEndian.Uint64(data[i:])
228+
binary.LittleEndian.PutUint64(state[i:], v)
226229
}
227-
for i := n << 3; i < len(data); i++ {
230+
for i := len(data) &^ 7; i < len(data); i++ {
228231
state[i] ^= data[i]
229232
}
230233
}

crypto/keccak/keccak_test.go

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -246,25 +246,14 @@ func FuzzSum256(f *testing.F) {
246246
})
247247
}
248248

249-
func BenchmarkSum256_500K(b *testing.B) {
250-
data := make([]byte, 500*1024)
251-
b.SetBytes(int64(len(data)))
252-
b.ReportAllocs()
253-
for b.Loop() {
254-
Sum256(data)
255-
}
256-
}
257-
258249
// Comparison benchmarks: faster_keccak vs golang.org/x/crypto/sha3.
259250
var benchSizes = []int{32, 128, 256, 1024, 4096, 500 * 1024}
260251

261252
func benchName(size int) string {
262-
switch {
263-
case size >= 1024:
253+
if size >= 1024 {
264254
return fmt.Sprintf("%dK", size/1024)
265-
default:
266-
return fmt.Sprintf("%dB", size)
267255
}
256+
return fmt.Sprintf("%dB", size)
268257
}
269258

270259
// BenchmarkKeccak256Sum tests Sum256 with local faster_keccak implementation.

crypto/keccak/keccakf_amd64.go

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,20 @@
22

33
package keccak
44

5-
import (
6-
"unsafe"
5+
import "golang.org/x/sys/cpu"
76

8-
"golang.org/x/sys/cpu"
9-
)
10-
11-
func init() { useASM = cpu.X86.HasBMI2 && cpu.X86.HasBMI1 }
7+
func init() { useASM = cpu.X86.HasBMI1 && cpu.X86.HasBMI2 }
128

9+
// keccakF1600BMI2 permutes state. When buf != nil, it first XORs rate bytes
10+
// of buf into state, saving one full memory pass.
11+
//
1312
//go:noescape
14-
func keccakF1600BMI2(a *[200]byte)
13+
func keccakF1600BMI2(a *[200]byte, buf *byte)
1514

1615
func keccakF1600(a *[200]byte) {
17-
keccakF1600BMI2(a)
16+
keccakF1600BMI2(a, nil)
1817
}
1918

2019
func xorAndPermute(state *[200]byte, buf *byte) {
21-
xorIn(state, unsafe.Slice(buf, rate))
22-
keccakF1600(state)
20+
keccakF1600BMI2(state, buf)
2321
}

crypto/keccak/keccakf_amd64_bmi2.s

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,50 @@
44

55
#include "textflag.h"
66

7-
// func keccakF1600BMI2(a *[200]byte)
8-
TEXT ·keccakF1600BMI2(SB), NOSPLIT, $200-8
7+
// func keccakF1600BMI2(a *[200]byte, buf *byte)
8+
TEXT ·keccakF1600BMI2(SB), NOSPLIT, $200-16
99
MOVQ a+0(FP), DI
10+
MOVQ buf+8(FP), BX
11+
TESTQ BX, BX
12+
JZ rounds
13+
14+
// XOR 17 lanes (136 bytes) of buf into state; buf must point to at least 136 readable bytes.
15+
MOVQ 0(BX), AX
16+
XORQ AX, 0(DI)
17+
MOVQ 8(BX), AX
18+
XORQ AX, 8(DI)
19+
MOVQ 16(BX), AX
20+
XORQ AX, 16(DI)
21+
MOVQ 24(BX), AX
22+
XORQ AX, 24(DI)
23+
MOVQ 32(BX), AX
24+
XORQ AX, 32(DI)
25+
MOVQ 40(BX), AX
26+
XORQ AX, 40(DI)
27+
MOVQ 48(BX), AX
28+
XORQ AX, 48(DI)
29+
MOVQ 56(BX), AX
30+
XORQ AX, 56(DI)
31+
MOVQ 64(BX), AX
32+
XORQ AX, 64(DI)
33+
MOVQ 72(BX), AX
34+
XORQ AX, 72(DI)
35+
MOVQ 80(BX), AX
36+
XORQ AX, 80(DI)
37+
MOVQ 88(BX), AX
38+
XORQ AX, 88(DI)
39+
MOVQ 96(BX), AX
40+
XORQ AX, 96(DI)
41+
MOVQ 104(BX), AX
42+
XORQ AX, 104(DI)
43+
MOVQ 112(BX), AX
44+
XORQ AX, 112(DI)
45+
MOVQ 120(BX), AX
46+
XORQ AX, 120(DI)
47+
MOVQ 128(BX), AX
48+
XORQ AX, 128(DI)
49+
50+
rounds:
1051

1152
// Round 0
1253
MOVQ $0x0000000000000001, R13

0 commit comments

Comments
 (0)