|
| 1 | +// Copyright 2024 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +package main |
| 6 | + |
| 7 | +import ( |
| 8 | + . "github.com/mmcloughlin/avo/build" |
| 9 | + . "github.com/mmcloughlin/avo/operand" |
| 10 | + . "github.com/mmcloughlin/avo/reg" |
| 11 | + _ "golang.org/x/crypto/argon2" |
| 12 | +) |
| 13 | + |
| 14 | +//go:generate go run . -out ../blamka_amd64.s -pkg argon2 |
| 15 | + |
// main drives avo code generation for the argon2 package: it declares
// the target package and build constraints, emits the three assembly
// functions, and writes the generated file (the -out path comes from
// the go:generate directive above).
func main() {
	Package("golang.org/x/crypto/argon2")
	ConstraintExpr("amd64,gc,!purego")

	blamkaSSE4()
	mixBlocksSSE2()
	xorBlocksSSE2()
	// Generate must run last: it writes out everything emitted above.
	Generate()
}
| 25 | + |
| 26 | +func blamkaSSE4() { |
| 27 | + Implement("blamkaSSE4") |
| 28 | + Attributes(NOSPLIT) |
| 29 | + AllocLocal(0) |
| 30 | + |
| 31 | + Load(Param("b"), RAX) |
| 32 | + |
| 33 | + c40 := c40_DATA() |
| 34 | + c48 := c48_DATA() |
| 35 | + |
| 36 | + MOVOU(c40, X10) |
| 37 | + MOVOU(c48, X11) |
| 38 | + |
| 39 | + BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) |
| 40 | + BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11) |
| 41 | + BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11) |
| 42 | + BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11) |
| 43 | + BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11) |
| 44 | + BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11) |
| 45 | + BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11) |
| 46 | + BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11) |
| 47 | + |
| 48 | + BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) |
| 49 | + BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11) |
| 50 | + BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11) |
| 51 | + BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11) |
| 52 | + BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11) |
| 53 | + BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11) |
| 54 | + BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11) |
| 55 | + BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11) |
| 56 | + RET() |
| 57 | +} |
| 58 | + |
// mixBlocksSSE2 generates the assembly for
//
//	func mixBlocksSSE2(out, a, b, c *block)
//
// which sets out[i] = a[i] ^ b[i] ^ c[i] over a full 1024-byte block,
// 16 bytes per loop iteration.
func mixBlocksSSE2() {
	Implement("mixBlocksSSE2")
	Attributes(NOSPLIT) // leaf function; no stack frame
	AllocLocal(0)

	// Pointer arguments (presumably *block, i.e. *[128]uint64 — the
	// loop walks exactly 128 8-byte words; confirm against the argon2
	// package declarations).
	Load(Param("out"), RDX)
	Load(Param("a"), RAX)
	Load(Param("b"), RBX)
	Load(Param("c"), RCX)
	// RDI counts remaining 64-bit words: 128 words = 1024 bytes.
	MOVQ(U32(128), RDI)

	Label("loop")
	// Load 16 bytes from each input (Go asm operand order is src, dst).
	MOVOU(Mem{Base: AX}.Offset(0), X0)
	MOVOU(Mem{Base: BX}.Offset(0), X1)
	MOVOU(Mem{Base: CX}.Offset(0), X2)
	// X0 = a ^ b ^ c
	PXOR(X1, X0)
	PXOR(X2, X0)
	MOVOU(X0, Mem{Base: DX}.Offset(0))
	// Advance all four pointers past the 16 bytes just processed.
	ADDQ(Imm(16), RAX)
	ADDQ(Imm(16), RBX)
	ADDQ(Imm(16), RCX)
	ADDQ(Imm(16), RDX)
	// Two words consumed per iteration; loop while the count is > 0.
	SUBQ(Imm(2), RDI)
	JA(LabelRef("loop"))
	RET()
}
| 85 | + |
// xorBlocksSSE2 generates the assembly for
//
//	func xorBlocksSSE2(out, a, b, c *block)
//
// which XORs a ^ b ^ c into out (out[i] ^= a[i] ^ b[i] ^ c[i]) over a
// full 1024-byte block, 16 bytes per loop iteration. Identical to
// mixBlocksSSE2 except that the previous contents of out (X3) are
// folded into the result instead of being overwritten.
func xorBlocksSSE2() {
	Implement("xorBlocksSSE2")
	Attributes(NOSPLIT) // leaf function; no stack frame
	AllocLocal(0)

	// Pointer arguments (presumably *block — see mixBlocksSSE2's note).
	Load(Param("out"), RDX)
	Load(Param("a"), RAX)
	Load(Param("b"), RBX)
	Load(Param("c"), RCX)
	// RDI counts remaining 64-bit words: 128 words = 1024 bytes.
	MOVQ(U32(128), RDI)

	Label("loop")
	// Load 16 bytes from each input and from out itself.
	MOVOU(Mem{Base: AX}.Offset(0), X0)
	MOVOU(Mem{Base: BX}.Offset(0), X1)
	MOVOU(Mem{Base: CX}.Offset(0), X2)
	MOVOU(Mem{Base: DX}.Offset(0), X3)
	// X0 = a ^ b ^ c ^ out
	PXOR(X1, X0)
	PXOR(X2, X0)
	PXOR(X3, X0)
	MOVOU(X0, Mem{Base: DX}.Offset(0))
	// Advance all four pointers past the 16 bytes just processed.
	ADDQ(Imm(16), RAX)
	ADDQ(Imm(16), RBX)
	ADDQ(Imm(16), RCX)
	ADDQ(Imm(16), RDX)
	// Two words consumed per iteration; loop while the count is > 0.
	SUBQ(Imm(2), RDI)
	JA(LabelRef("loop"))
	RET()
}
| 114 | + |
// SHUFFLE emits the BLAKE2-style diagonalization performed between the
// column and diagonal half-rounds: the register pairs v2..v7 holding
// the lower three rows of the 4x4 qword state are rotated so the next
// HALF_ROUND operates on diagonals (SHUFFLE_INV undoes this).
// t1 and t2 are scratch; every bit of t2 that is later read is written
// first, so its incoming contents are irrelevant.
// Go asm operand order is (src, dst): PUNPCKLQDQ(a, b) sets b.hi = a.lo;
// PUNPCKHQDQ(a, b) sets b.lo = b.hi and b.hi = a.hi.
func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) {
	// Swap v4 and v5 through t1.
	MOVO(v4, t1)
	MOVO(v5, v4)
	MOVO(t1, v5)
	// Rotate the qword lanes of v6/v7, then v2/v3, across register
	// boundaries using unpack-high/low pairs.
	MOVO(v6, t1)
	PUNPCKLQDQ(v6, t2)
	PUNPCKHQDQ(v7, v6)
	PUNPCKHQDQ(t2, v6)
	PUNPCKLQDQ(v7, t2)
	MOVO(t1, v7)
	MOVO(v2, t1)
	PUNPCKHQDQ(t2, v7)
	PUNPCKLQDQ(v3, t2)
	PUNPCKHQDQ(t2, v2)
	PUNPCKLQDQ(t1, t2)
	PUNPCKHQDQ(t2, v3)
}
| 132 | + |
// SHUFFLE_INV emits the inverse of SHUFFLE: it moves the state rows in
// v2..v7 back from diagonal form to column form after the diagonal
// half-round. Same register conventions as SHUFFLE: t1 and t2 are
// scratch (t2 is write-before-read), and operand order is (src, dst).
func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) {
	// Swap v4 and v5 through t1 (self-inverse part of the shuffle).
	MOVO(v4, t1)
	MOVO(v5, v4)
	MOVO(t1, v5)
	// Rotate the qword lanes back: v2/v3 first, then v6/v7 — the
	// mirror image of SHUFFLE's ordering.
	MOVO(v2, t1)
	PUNPCKLQDQ(v2, t2)
	PUNPCKHQDQ(v3, v2)
	PUNPCKHQDQ(t2, v2)
	PUNPCKLQDQ(v3, t2)
	MOVO(t1, v3)
	MOVO(v6, t1)
	PUNPCKHQDQ(t2, v3)
	PUNPCKLQDQ(v7, t2)
	PUNPCKHQDQ(t2, v6)
	PUNPCKLQDQ(t1, t2)
	PUNPCKHQDQ(t2, v7)
}
| 150 | + |
// HALF_ROUND emits one half-round of the BlaMka permutation over two
// independent quadruples, (v0, v2, v4, v6) and (v1, v3, v5, v7).
// It is the BLAKE2b G function with each addition x + y replaced by
// the multiplicative fBlaMka(x, y) = x + y + 2*lo32(x)*lo32(y): the
// MOVO/PMULULQ pair computes the 32x32->64 product per lane, and the
// three PADDQ add y, product, product.
// t0 is scratch; c40 and c48 are PSHUFB masks rotating each 64-bit
// lane right by 24 and 16 bits. The remaining rotations are >>>32
// (PSHUFD 0xB1 swaps the 32-bit halves of each lane) and >>>63
// (shift-left-1-via-add combined with shift-right-63 via PXOR).
// Go asm operand order is (src, dst), so PXOR(v0, v6) means v6 ^= v0.
func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48 VecPhysical) {
	// v0 = fBlaMka(v0, v2)
	MOVO(v0, t0)
	PMULULQ(v2, t0)
	PADDQ(v2, v0)
	PADDQ(t0, v0)
	PADDQ(t0, v0)
	// v6 = (v6 ^ v0) >>> 32
	PXOR(v0, v6)
	PSHUFD(Imm(0xB1), v6, v6)
	// v4 = fBlaMka(v4, v6)
	MOVO(v4, t0)
	PMULULQ(v6, t0)
	PADDQ(v6, v4)
	PADDQ(t0, v4)
	PADDQ(t0, v4)
	// v2 = (v2 ^ v4) >>> 24
	PXOR(v4, v2)
	PSHUFB(c40, v2)
	// v0 = fBlaMka(v0, v2)
	MOVO(v0, t0)
	PMULULQ(v2, t0)
	PADDQ(v2, v0)
	PADDQ(t0, v0)
	PADDQ(t0, v0)
	// v6 = (v6 ^ v0) >>> 16
	PXOR(v0, v6)
	PSHUFB(c48, v6)
	// v4 = fBlaMka(v4, v6)
	MOVO(v4, t0)
	PMULULQ(v6, t0)
	PADDQ(v6, v4)
	PADDQ(t0, v4)
	PADDQ(t0, v4)
	// v2 = (v2 ^ v4) >>> 63: t0 = v2 << 1, v2 = v2 >> 63, then XOR.
	PXOR(v4, v2)
	MOVO(v2, t0)
	PADDQ(v2, t0)
	PSRLQ(Imm(63), v2)
	PXOR(t0, v2)
	// Second quadruple: identical sequence on (v1, v3, v5, v7).
	// v1 = fBlaMka(v1, v3)
	MOVO(v1, t0)
	PMULULQ(v3, t0)
	PADDQ(v3, v1)
	PADDQ(t0, v1)
	PADDQ(t0, v1)
	// v7 = (v7 ^ v1) >>> 32
	PXOR(v1, v7)
	PSHUFD(Imm(0xB1), v7, v7)
	// v5 = fBlaMka(v5, v7)
	MOVO(v5, t0)
	PMULULQ(v7, t0)
	PADDQ(v7, v5)
	PADDQ(t0, v5)
	PADDQ(t0, v5)
	// v3 = (v3 ^ v5) >>> 24
	PXOR(v5, v3)
	PSHUFB(c40, v3)
	// v1 = fBlaMka(v1, v3)
	MOVO(v1, t0)
	PMULULQ(v3, t0)
	PADDQ(v3, v1)
	PADDQ(t0, v1)
	PADDQ(t0, v1)
	// v7 = (v7 ^ v1) >>> 16
	PXOR(v1, v7)
	PSHUFB(c48, v7)
	// v5 = fBlaMka(v5, v7)
	MOVO(v5, t0)
	PMULULQ(v7, t0)
	PADDQ(v7, v5)
	PADDQ(t0, v5)
	PADDQ(t0, v5)
	// v3 = (v3 ^ v5) >>> 63
	PXOR(v5, v3)
	MOVO(v3, t0)
	PADDQ(v3, t0)
	PSRLQ(Imm(63), v3)
	PXOR(t0, v3)
}
| 215 | + |
| 216 | +func LOAD_MSG_0(block GPPhysical, off int) { |
| 217 | + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} |
| 218 | + for i, r := range registers { |
| 219 | + MOVOU(Mem{Base: block}.Offset(8*(off+(i*2))), r) |
| 220 | + } |
| 221 | +} |
| 222 | + |
| 223 | +func STORE_MSG_0(block GPPhysical, off int) { |
| 224 | + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} |
| 225 | + for i, r := range registers { |
| 226 | + MOVOU(r, Mem{Base: block}.Offset(8*(off+(i*2)))) |
| 227 | + } |
| 228 | +} |
| 229 | + |
| 230 | +func LOAD_MSG_1(block GPPhysical, off int) { |
| 231 | + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} |
| 232 | + for i, r := range registers { |
| 233 | + MOVOU(Mem{Base: block}.Offset(8*off+i*16*8), r) |
| 234 | + } |
| 235 | +} |
| 236 | + |
| 237 | +func STORE_MSG_1(block GPPhysical, off int) { |
| 238 | + var registers = []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7} |
| 239 | + for i, r := range registers { |
| 240 | + MOVOU(r, Mem{Base: block}.Offset(8*off+i*16*8)) |
| 241 | + } |
| 242 | +} |
| 243 | + |
// BLAMKA_ROUND_0 emits one full BlaMka round over 16 consecutive
// 64-bit words of the block at word offset off: column half-round,
// diagonalize, diagonal half-round, un-diagonalize, then store the
// state back in place. t0/t1 are scratch XMM registers; c40/c48 hold
// the byte-rotation masks for HALF_ROUND.
func BLAMKA_ROUND_0(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) {
	LOAD_MSG_0(block, off)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1)
	STORE_MSG_0(block, off)
}
| 252 | + |
// BLAMKA_ROUND_1 emits one full BlaMka round like BLAMKA_ROUND_0, but
// over 16 words gathered at a 16-word (128-byte) stride starting at
// word offset off (see LOAD_MSG_1/STORE_MSG_1). t0/t1 are scratch XMM
// registers; c40/c48 hold the byte-rotation masks for HALF_ROUND.
func BLAMKA_ROUND_1(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) {
	LOAD_MSG_1(block, off)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1)
	STORE_MSG_1(block, off)
}
| 261 | + |
| 262 | +// ##------------------DATA SECTION-------------------## |
| 263 | + |
// c40_DATA_ptr and c48_DATA_ptr memoize the GLOBL symbols so repeated
// calls to c40_DATA/c48_DATA reuse a single data section per constant
// instead of redefining it.
var c40_DATA_ptr, c48_DATA_ptr *Mem
| 265 | + |
| 266 | +func c40_DATA() Mem { |
| 267 | + if c40_DATA_ptr != nil { |
| 268 | + return *c40_DATA_ptr |
| 269 | + } |
| 270 | + |
| 271 | + c40_DATA := GLOBL("·c40", NOPTR|RODATA) |
| 272 | + c40_DATA_ptr = &c40_DATA |
| 273 | + DATA(0x00, U64(0x0201000706050403)) |
| 274 | + DATA(0x08, U64(0x0a09080f0e0d0c0b)) |
| 275 | + return c40_DATA |
| 276 | +} |
| 277 | +func c48_DATA() Mem { |
| 278 | + if c48_DATA_ptr != nil { |
| 279 | + return *c48_DATA_ptr |
| 280 | + } |
| 281 | + |
| 282 | + c48_DATA := GLOBL("·c48", NOPTR|RODATA) |
| 283 | + c48_DATA_ptr = &c48_DATA |
| 284 | + DATA(0x00, U64(0x0100070605040302)) |
| 285 | + DATA(0x08, U64(0x09080f0e0d0c0b0a)) |
| 286 | + return c48_DATA |
| 287 | +} |