Skip to content

Commit 38a0b5d

Browse files
Garrett-Bodley authored and rolandshoemaker committed
argon2: Avo port of blamka_amd64.s
This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output into
corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed in order to obtain a semantic diff of the two
files. This is accomplished via a small utility script written in awk.

Commands used to verify Avo output:

    GOROOT=$(go env GOROOT)
    ASM_PATH="argon2/blamka_amd64.s"
    REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340"

    go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
      <(git cat-file -p "$REFERENCE:$ASM_PATH") \
      > /tmp/reference.s

    go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
      "$ASM_PATH" \
      > /tmp/avo.s

    normalize(){
      awk '{
        $1=$2=$3="";
        print substr($0,4)
      }'
    }

    diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

Change-Id: I3567eb80ef80dff248225f17470122c0a4e6951e
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600315
Reviewed-by: Filippo Valsorda <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
Reviewed-by: Roland Shoemaker <[email protected]>
1 parent bf5f14f commit 38a0b5d

File tree

4 files changed

+3074
-212
lines changed

4 files changed

+3074
-212
lines changed

argon2/_asm/blamka_amd64.go

+287
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package main
6+
7+
import (
8+
. "github.com/mmcloughlin/avo/build"
9+
. "github.com/mmcloughlin/avo/operand"
10+
. "github.com/mmcloughlin/avo/reg"
11+
_ "golang.org/x/crypto/argon2"
12+
)
13+
14+
//go:generate go run . -out ../blamka_amd64.s -pkg argon2
15+
16+
func main() {
17+
Package("golang.org/x/crypto/argon2")
18+
ConstraintExpr("amd64,gc,!purego")
19+
20+
blamkaSSE4()
21+
mixBlocksSSE2()
22+
xorBlocksSSE2()
23+
Generate()
24+
}
25+
26+
func blamkaSSE4() {
27+
Implement("blamkaSSE4")
28+
Attributes(NOSPLIT)
29+
AllocLocal(0)
30+
31+
Load(Param("b"), RAX)
32+
33+
c40 := c40_DATA()
34+
c48 := c48_DATA()
35+
36+
MOVOU(c40, X10)
37+
MOVOU(c48, X11)
38+
39+
BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
40+
BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
41+
BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
42+
BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
43+
BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
44+
BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
45+
BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
46+
BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
47+
48+
BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
49+
BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
50+
BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
51+
BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
52+
BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
53+
BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
54+
BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
55+
BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
56+
RET()
57+
}
58+
59+
func mixBlocksSSE2() {
60+
Implement("mixBlocksSSE2")
61+
Attributes(NOSPLIT)
62+
AllocLocal(0)
63+
64+
Load(Param("out"), RDX)
65+
Load(Param("a"), RAX)
66+
Load(Param("b"), RBX)
67+
Load(Param("c"), RCX)
68+
MOVQ(U32(128), RDI)
69+
70+
Label("loop")
71+
MOVOU(Mem{Base: AX}.Offset(0), X0)
72+
MOVOU(Mem{Base: BX}.Offset(0), X1)
73+
MOVOU(Mem{Base: CX}.Offset(0), X2)
74+
PXOR(X1, X0)
75+
PXOR(X2, X0)
76+
MOVOU(X0, Mem{Base: DX}.Offset(0))
77+
ADDQ(Imm(16), RAX)
78+
ADDQ(Imm(16), RBX)
79+
ADDQ(Imm(16), RCX)
80+
ADDQ(Imm(16), RDX)
81+
SUBQ(Imm(2), RDI)
82+
JA(LabelRef("loop"))
83+
RET()
84+
}
85+
86+
func xorBlocksSSE2() {
87+
Implement("xorBlocksSSE2")
88+
Attributes(NOSPLIT)
89+
AllocLocal(0)
90+
91+
Load(Param("out"), RDX)
92+
Load(Param("a"), RAX)
93+
Load(Param("b"), RBX)
94+
Load(Param("c"), RCX)
95+
MOVQ(U32(128), RDI)
96+
97+
Label("loop")
98+
MOVOU(Mem{Base: AX}.Offset(0), X0)
99+
MOVOU(Mem{Base: BX}.Offset(0), X1)
100+
MOVOU(Mem{Base: CX}.Offset(0), X2)
101+
MOVOU(Mem{Base: DX}.Offset(0), X3)
102+
PXOR(X1, X0)
103+
PXOR(X2, X0)
104+
PXOR(X3, X0)
105+
MOVOU(X0, Mem{Base: DX}.Offset(0))
106+
ADDQ(Imm(16), RAX)
107+
ADDQ(Imm(16), RBX)
108+
ADDQ(Imm(16), RCX)
109+
ADDQ(Imm(16), RDX)
110+
SUBQ(Imm(2), RDI)
111+
JA(LabelRef("loop"))
112+
RET()
113+
}
114+
115+
// SHUFFLE emits the permutation applied between the two HALF_ROUND calls of a
// BLAMKA round. It mirrors the SHUFFLE macro of the hand-written assembly, so
// the instruction order must not change.
//
// Writing each register as {low, high} 64-bit halves and using the values on
// entry on the right-hand side, the emitted code produces:
//
//	v4, v5 = v5, v4
//	v6 = {hi(v7), lo(v6)}    v7 = {hi(v6), lo(v7)}
//	v2 = {hi(v2), lo(v3)}    v3 = {hi(v3), lo(v2)}
//
// t1 and t2 are clobbered. Only the high half of t2 carries meaningful data
// at any point, so t2 needs no initialisation.
func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) {
// Swap v4 and v5 through t1.
116+
MOVO(v4, t1)
117+
MOVO(v5, v4)
118+
MOVO(t1, v5)
// Keep the original v6 in t1, then rebuild v6 from hi(v7) and lo(v6).
119+
MOVO(v6, t1)
120+
PUNPCKLQDQ(v6, t2)
121+
PUNPCKHQDQ(v7, v6)
122+
PUNPCKHQDQ(t2, v6)
// Stage lo(v7) in t2, restore the original v6 into v7 and save v2 in t1;
// the PUNPCKHQDQ into v7 that follows finishes v7 = {hi(v6), lo(v7)}.
123+
PUNPCKLQDQ(v7, t2)
124+
MOVO(t1, v7)
125+
MOVO(v2, t1)
126+
PUNPCKHQDQ(t2, v7)
// Same halves exchange for the (v2, v3) pair, in the opposite direction.
127+
PUNPCKLQDQ(v3, t2)
128+
PUNPCKHQDQ(t2, v2)
129+
PUNPCKLQDQ(t1, t2)
130+
PUNPCKHQDQ(t2, v3)
131+
}
132+
133+
// SHUFFLE_INV emits the permutation that undoes SHUFFLE. It mirrors the
// SHUFFLE_INV macro of the hand-written assembly, so the instruction order
// must not change. The sequence is the same as SHUFFLE with the roles of
// (v2, v3) and (v6, v7) exchanged.
//
// Writing each register as {low, high} 64-bit halves and using the values on
// entry on the right-hand side, the emitted code produces:
//
//	v4, v5 = v5, v4
//	v2 = {hi(v3), lo(v2)}    v3 = {hi(v2), lo(v3)}
//	v6 = {hi(v6), lo(v7)}    v7 = {hi(v7), lo(v6)}
//
// t1 and t2 are clobbered. Only the high half of t2 carries meaningful data
// at any point, so t2 needs no initialisation.
func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) {
// Swap v4 and v5 through t1.
134+
MOVO(v4, t1)
135+
MOVO(v5, v4)
136+
MOVO(t1, v5)
// Keep the original v2 in t1, then rebuild v2 from hi(v3) and lo(v2).
137+
MOVO(v2, t1)
138+
PUNPCKLQDQ(v2, t2)
139+
PUNPCKHQDQ(v3, v2)
140+
PUNPCKHQDQ(t2, v2)
// Stage lo(v3) in t2, restore the original v2 into v3 and save v6 in t1;
// the PUNPCKHQDQ into v3 that follows finishes v3 = {hi(v2), lo(v3)}.
141+
PUNPCKLQDQ(v3, t2)
142+
MOVO(t1, v3)
143+
MOVO(v6, t1)
144+
PUNPCKHQDQ(t2, v3)
// Same halves exchange for the (v6, v7) pair, in the opposite direction.
145+
PUNPCKLQDQ(v7, t2)
146+
PUNPCKHQDQ(t2, v6)
147+
PUNPCKLQDQ(t1, t2)
148+
PUNPCKHQDQ(t2, v7)
149+
}
150+
151+
func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48 VecPhysical) {
152+
MOVO(v0, t0)
153+
PMULULQ(v2, t0)
154+
PADDQ(v2, v0)
155+
PADDQ(t0, v0)
156+
PADDQ(t0, v0)
157+
PXOR(v0, v6)
158+
PSHUFD(Imm(0xB1), v6, v6)
159+
MOVO(v4, t0)
160+
PMULULQ(v6, t0)
161+
PADDQ(v6, v4)
162+
PADDQ(t0, v4)
163+
PADDQ(t0, v4)
164+
PXOR(v4, v2)
165+
PSHUFB(c40, v2)
166+
MOVO(v0, t0)
167+
PMULULQ(v2, t0)
168+
PADDQ(v2, v0)
169+
PADDQ(t0, v0)
170+
PADDQ(t0, v0)
171+
PXOR(v0, v6)
172+
PSHUFB(c48, v6)
173+
MOVO(v4, t0)
174+
PMULULQ(v6, t0)
175+
PADDQ(v6, v4)
176+
PADDQ(t0, v4)
177+
PADDQ(t0, v4)
178+
PXOR(v4, v2)
179+
MOVO(v2, t0)
180+
PADDQ(v2, t0)
181+
PSRLQ(Imm(63), v2)
182+
PXOR(t0, v2)
183+
MOVO(v1, t0)
184+
PMULULQ(v3, t0)
185+
PADDQ(v3, v1)
186+
PADDQ(t0, v1)
187+
PADDQ(t0, v1)
188+
PXOR(v1, v7)
189+
PSHUFD(Imm(0xB1), v7, v7)
190+
MOVO(v5, t0)
191+
PMULULQ(v7, t0)
192+
PADDQ(v7, v5)
193+
PADDQ(t0, v5)
194+
PADDQ(t0, v5)
195+
PXOR(v5, v3)
196+
PSHUFB(c40, v3)
197+
MOVO(v1, t0)
198+
PMULULQ(v3, t0)
199+
PADDQ(v3, v1)
200+
PADDQ(t0, v1)
201+
PADDQ(t0, v1)
202+
PXOR(v1, v7)
203+
PSHUFB(c48, v7)
204+
MOVO(v5, t0)
205+
PMULULQ(v7, t0)
206+
PADDQ(v7, v5)
207+
PADDQ(t0, v5)
208+
PADDQ(t0, v5)
209+
PXOR(v5, v3)
210+
MOVO(v3, t0)
211+
PADDQ(v3, t0)
212+
PSRLQ(Imm(63), v3)
213+
PXOR(t0, v3)
214+
}
215+
216+
func LOAD_MSG_0(block GPPhysical, off int) {
217+
var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7}
218+
for i, r := range registers {
219+
MOVOU(Mem{Base: block}.Offset(8*(off+(i*2))), r)
220+
}
221+
}
222+
223+
func STORE_MSG_0(block GPPhysical, off int) {
224+
var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7}
225+
for i, r := range registers {
226+
MOVOU(r, Mem{Base: block}.Offset(8*(off+(i*2))))
227+
}
228+
}
229+
230+
func LOAD_MSG_1(block GPPhysical, off int) {
231+
var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7}
232+
for i, r := range registers {
233+
MOVOU(Mem{Base: block}.Offset(8*off+i*16*8), r)
234+
}
235+
}
236+
237+
func STORE_MSG_1(block GPPhysical, off int) {
238+
var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7}
239+
for i, r := range registers {
240+
MOVOU(r, Mem{Base: block}.Offset(8*off+i*16*8))
241+
}
242+
}
243+
244+
func BLAMKA_ROUND_0(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) {
245+
LOAD_MSG_0(block, off)
246+
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
247+
SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1)
248+
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
249+
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1)
250+
STORE_MSG_0(block, off)
251+
}
252+
253+
func BLAMKA_ROUND_1(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) {
254+
LOAD_MSG_1(block, off)
255+
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
256+
SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1)
257+
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
258+
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1)
259+
STORE_MSG_1(block, off)
260+
}
261+
262+
// ##------------------DATA SECTION-------------------##
263+
264+
// c40_DATA_ptr and c48_DATA_ptr cache the Mem operands returned by c40_DATA
// and c48_DATA. Each stays nil until its accessor has declared the
// corresponding data symbol, so every symbol is declared at most once.
var c40_DATA_ptr, c48_DATA_ptr *Mem
265+
266+
func c40_DATA() Mem {
267+
if c40_DATA_ptr != nil {
268+
return *c40_DATA_ptr
269+
}
270+
271+
c40_DATA := GLOBL("·c40", NOPTR|RODATA)
272+
c40_DATA_ptr = &c40_DATA
273+
DATA(0x00, U64(0x0201000706050403))
274+
DATA(0x08, U64(0x0a09080f0e0d0c0b))
275+
return c40_DATA
276+
}
277+
func c48_DATA() Mem {
278+
if c48_DATA_ptr != nil {
279+
return *c48_DATA_ptr
280+
}
281+
282+
c48_DATA := GLOBL("·c48", NOPTR|RODATA)
283+
c48_DATA_ptr = &c48_DATA
284+
DATA(0x00, U64(0x0100070605040302))
285+
DATA(0x08, U64(0x09080f0e0d0c0b0a))
286+
return c48_DATA
287+
}

argon2/_asm/go.mod

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
module argon2/_asm
2+
3+
go 1.23
4+
5+
require (
6+
github.com/mmcloughlin/avo v0.6.0
7+
golang.org/x/crypto v0.26.0
8+
)
9+
10+
require (
11+
golang.org/x/mod v0.20.0 // indirect
12+
golang.org/x/sync v0.8.0 // indirect
13+
golang.org/x/sys v0.24.0 // indirect
14+
golang.org/x/tools v0.24.0 // indirect
15+
)

argon2/_asm/go.sum

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
2+
github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
3+
golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
4+
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
5+
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
6+
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
7+
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
8+
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
9+
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
10+
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
11+
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
12+
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=

0 commit comments

Comments
 (0)