Skip to content

Commit bcb0f91

Browse files
Garrett-Bodley authored and rolandshoemaker committed
internal/poly1305: Port sum_amd64.s to Avo
This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output into
corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed in order to obtain a semantic diff of the two
files. This is accomplished via a small utility script written in awk.

Commands used to verify Avo output:

GOROOT=$(go env GOROOT)
ASM_PATH="internal/poly1305/sum_amd64.s"
REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340"

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  <(git cat-file -p "$REFERENCE:$ASM_PATH") \
  > /tmp/reference.s

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  "$ASM_PATH" \
  > /tmp/avo.s

normalize(){
  awk '{ $1=$2=$3=""; print substr($0,4) }'
}

diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

Change-Id: I80212c95d1b05335d7f6b73a3030b6f812f6105b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600035
Reviewed-by: Roland Shoemaker <[email protected]>
Reviewed-by: Filippo Valsorda <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent 7eace71 commit bcb0f91

File tree

4 files changed

+212
-74
lines changed

4 files changed

+212
-74
lines changed

internal/poly1305/_asm/go.mod

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
module internal/poly1305/_asm
2+
3+
go 1.23
4+
5+
require (
6+
github.com/mmcloughlin/avo v0.6.0
7+
golang.org/x/crypto v0.26.0
8+
)
9+
10+
require (
11+
golang.org/x/mod v0.20.0 // indirect
12+
golang.org/x/sync v0.8.0 // indirect
13+
golang.org/x/sys v0.24.0 // indirect
14+
golang.org/x/tools v0.24.0 // indirect
15+
)

internal/poly1305/_asm/go.sum

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
2+
github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
3+
golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
4+
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
5+
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
6+
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
7+
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
8+
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
9+
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
10+
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
11+
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
12+
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=

internal/poly1305/_asm/sum_amd64_asm.go

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package main
6+
7+
import (
8+
. "github.com/mmcloughlin/avo/build"
9+
. "github.com/mmcloughlin/avo/operand"
10+
. "github.com/mmcloughlin/avo/reg"
11+
_ "golang.org/x/crypto/sha3"
12+
)
13+
14+
//go:generate go run . -out ../sum_amd64.s -pkg poly1305
15+
16+
func main() {
17+
Package("golang.org/x/crypto/internal/poly1305")
18+
ConstraintExpr("gc,!purego")
19+
update()
20+
Generate()
21+
}
22+
23+
func update() {
24+
Implement("update")
25+
26+
Load(Param("state"), RDI)
27+
MOVQ(NewParamAddr("msg_base", 8), RSI)
28+
MOVQ(NewParamAddr("msg_len", 16), R15)
29+
30+
MOVQ(Mem{Base: DI}.Offset(0), R8) // h0
31+
MOVQ(Mem{Base: DI}.Offset(8), R9) // h1
32+
MOVQ(Mem{Base: DI}.Offset(16), R10) // h2
33+
MOVQ(Mem{Base: DI}.Offset(24), R11) // r0
34+
MOVQ(Mem{Base: DI}.Offset(32), R12) // r1
35+
36+
CMPQ(R15, Imm(16))
37+
JB(LabelRef("bytes_between_0_and_15"))
38+
39+
Label("loop")
40+
POLY1305_ADD(RSI, R8, R9, R10)
41+
42+
Label("multiply")
43+
POLY1305_MUL(R8, R9, R10, R11, R12, RBX, RCX, R13, R14)
44+
SUBQ(Imm(16), R15)
45+
CMPQ(R15, Imm(16))
46+
JAE(LabelRef("loop"))
47+
48+
Label("bytes_between_0_and_15")
49+
TESTQ(R15, R15)
50+
JZ(LabelRef("done"))
51+
MOVQ(U32(1), RBX)
52+
XORQ(RCX, RCX)
53+
XORQ(R13, R13)
54+
ADDQ(R15, RSI)
55+
56+
Label("flush_buffer")
57+
SHLQ(Imm(8), RBX, RCX)
58+
SHLQ(Imm(8), RBX)
59+
MOVB(Mem{Base: SI}.Offset(-1), R13B)
60+
XORQ(R13, RBX)
61+
DECQ(RSI)
62+
DECQ(R15)
63+
JNZ(LabelRef("flush_buffer"))
64+
65+
ADDQ(RBX, R8)
66+
ADCQ(RCX, R9)
67+
ADCQ(Imm(0), R10)
68+
MOVQ(U32(16), R15)
69+
JMP(LabelRef("multiply"))
70+
71+
Label("done")
72+
MOVQ(R8, Mem{Base: DI}.Offset(0))
73+
MOVQ(R9, Mem{Base: DI}.Offset(8))
74+
MOVQ(R10, Mem{Base: DI}.Offset(16))
75+
RET()
76+
}
77+
78+
func POLY1305_ADD(msg, h0, h1, h2 GPPhysical) {
79+
ADDQ(Mem{Base: msg}.Offset(0), h0)
80+
ADCQ(Mem{Base: msg}.Offset(8), h1)
81+
ADCQ(Imm(1), h2)
82+
LEAQ(Mem{Base: msg}.Offset(16), msg)
83+
}
84+
85+
func POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3 GPPhysical) {
86+
MOVQ(r0, RAX)
87+
MULQ(h0)
88+
MOVQ(RAX, t0)
89+
MOVQ(RDX, t1)
90+
MOVQ(r0, RAX)
91+
MULQ(h1)
92+
ADDQ(RAX, t1)
93+
ADCQ(Imm(0), RDX)
94+
MOVQ(r0, t2)
95+
IMULQ(h2, t2)
96+
ADDQ(RDX, t2)
97+
98+
MOVQ(r1, RAX)
99+
MULQ(h0)
100+
ADDQ(RAX, t1)
101+
ADCQ(Imm(0), RDX)
102+
MOVQ(RDX, h0)
103+
MOVQ(r1, t3)
104+
IMULQ(h2, t3)
105+
MOVQ(r1, RAX)
106+
MULQ(h1)
107+
ADDQ(RAX, t2)
108+
ADCQ(RDX, t3)
109+
ADDQ(h0, t2)
110+
ADCQ(Imm(0), t3)
111+
112+
MOVQ(t0, h0)
113+
MOVQ(t1, h1)
114+
MOVQ(t2, h2)
115+
ANDQ(Imm(3), h2)
116+
MOVQ(t2, t0)
117+
ANDQ(I32(-4), t0)
118+
ADDQ(t0, h0)
119+
ADCQ(t3, h1)
120+
ADCQ(Imm(0), h2)
121+
SHRQ(Imm(2), t3, t2)
122+
SHRQ(Imm(2), t3)
123+
ADDQ(t2, h0)
124+
ADCQ(t3, h1)
125+
ADCQ(Imm(0), h2)
126+
}

internal/poly1305/sum_amd64.s

+59-74
Original file line numberDiff line numberDiff line change
@@ -1,108 +1,93 @@
1-
// Copyright 2012 The Go Authors. All rights reserved.
2-
// Use of this source code is governed by a BSD-style
3-
// license that can be found in the LICENSE file.
1+
// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.
42

53
//go:build gc && !purego
64

7-
#include "textflag.h"
8-
9-
#define POLY1305_ADD(msg, h0, h1, h2) \
10-
ADDQ 0(msg), h0; \
11-
ADCQ 8(msg), h1; \
12-
ADCQ $1, h2; \
13-
LEAQ 16(msg), msg
14-
15-
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
16-
MOVQ r0, AX; \
17-
MULQ h0; \
18-
MOVQ AX, t0; \
19-
MOVQ DX, t1; \
20-
MOVQ r0, AX; \
21-
MULQ h1; \
22-
ADDQ AX, t1; \
23-
ADCQ $0, DX; \
24-
MOVQ r0, t2; \
25-
IMULQ h2, t2; \
26-
ADDQ DX, t2; \
27-
\
28-
MOVQ r1, AX; \
29-
MULQ h0; \
30-
ADDQ AX, t1; \
31-
ADCQ $0, DX; \
32-
MOVQ DX, h0; \
33-
MOVQ r1, t3; \
34-
IMULQ h2, t3; \
35-
MOVQ r1, AX; \
36-
MULQ h1; \
37-
ADDQ AX, t2; \
38-
ADCQ DX, t3; \
39-
ADDQ h0, t2; \
40-
ADCQ $0, t3; \
41-
\
42-
MOVQ t0, h0; \
43-
MOVQ t1, h1; \
44-
MOVQ t2, h2; \
45-
ANDQ $3, h2; \
46-
MOVQ t2, t0; \
47-
ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
48-
ADDQ t0, h0; \
49-
ADCQ t3, h1; \
50-
ADCQ $0, h2; \
51-
SHRQ $2, t3, t2; \
52-
SHRQ $2, t3; \
53-
ADDQ t2, h0; \
54-
ADCQ t3, h1; \
55-
ADCQ $0, h2
56-
57-
// func update(state *[7]uint64, msg []byte)
5+
// func update(state *macState, msg []byte)
586
TEXT ·update(SB), $0-32
597
MOVQ state+0(FP), DI
608
MOVQ msg_base+8(FP), SI
619
MOVQ msg_len+16(FP), R15
62-
63-
MOVQ 0(DI), R8 // h0
64-
MOVQ 8(DI), R9 // h1
65-
MOVQ 16(DI), R10 // h2
66-
MOVQ 24(DI), R11 // r0
67-
MOVQ 32(DI), R12 // r1
68-
69-
CMPQ R15, $16
10+
MOVQ (DI), R8
11+
MOVQ 8(DI), R9
12+
MOVQ 16(DI), R10
13+
MOVQ 24(DI), R11
14+
MOVQ 32(DI), R12
15+
CMPQ R15, $0x10
7016
JB bytes_between_0_and_15
7117

7218
loop:
73-
POLY1305_ADD(SI, R8, R9, R10)
19+
ADDQ (SI), R8
20+
ADCQ 8(SI), R9
21+
ADCQ $0x01, R10
22+
LEAQ 16(SI), SI
7423

7524
multiply:
76-
POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
77-
SUBQ $16, R15
78-
CMPQ R15, $16
79-
JAE loop
25+
MOVQ R11, AX
26+
MULQ R8
27+
MOVQ AX, BX
28+
MOVQ DX, CX
29+
MOVQ R11, AX
30+
MULQ R9
31+
ADDQ AX, CX
32+
ADCQ $0x00, DX
33+
MOVQ R11, R13
34+
IMULQ R10, R13
35+
ADDQ DX, R13
36+
MOVQ R12, AX
37+
MULQ R8
38+
ADDQ AX, CX
39+
ADCQ $0x00, DX
40+
MOVQ DX, R8
41+
MOVQ R12, R14
42+
IMULQ R10, R14
43+
MOVQ R12, AX
44+
MULQ R9
45+
ADDQ AX, R13
46+
ADCQ DX, R14
47+
ADDQ R8, R13
48+
ADCQ $0x00, R14
49+
MOVQ BX, R8
50+
MOVQ CX, R9
51+
MOVQ R13, R10
52+
ANDQ $0x03, R10
53+
MOVQ R13, BX
54+
ANDQ $-4, BX
55+
ADDQ BX, R8
56+
ADCQ R14, R9
57+
ADCQ $0x00, R10
58+
SHRQ $0x02, R14, R13
59+
SHRQ $0x02, R14
60+
ADDQ R13, R8
61+
ADCQ R14, R9
62+
ADCQ $0x00, R10
63+
SUBQ $0x10, R15
64+
CMPQ R15, $0x10
65+
JAE loop
8066

8167
bytes_between_0_and_15:
8268
TESTQ R15, R15
8369
JZ done
84-
MOVQ $1, BX
70+
MOVQ $0x00000001, BX
8571
XORQ CX, CX
8672
XORQ R13, R13
8773
ADDQ R15, SI
8874

8975
flush_buffer:
90-
SHLQ $8, BX, CX
91-
SHLQ $8, BX
76+
SHLQ $0x08, BX, CX
77+
SHLQ $0x08, BX
9278
MOVB -1(SI), R13
9379
XORQ R13, BX
9480
DECQ SI
9581
DECQ R15
9682
JNZ flush_buffer
97-
9883
ADDQ BX, R8
9984
ADCQ CX, R9
100-
ADCQ $0, R10
101-
MOVQ $16, R15
85+
ADCQ $0x00, R10
86+
MOVQ $0x00000010, R15
10287
JMP multiply
10388

10489
done:
105-
MOVQ R8, 0(DI)
90+
MOVQ R8, (DI)
10691
MOVQ R9, 8(DI)
10792
MOVQ R10, 16(DI)
10893
RET

0 commit comments

Comments
 (0)