Skip to content

crypto/keccak: asm version of keccakF1600 on arm64 #33752

Closed
cuiweixie wants to merge 1 commit into ethereum:master from
cuiweixie:keccak-arm64-asm
Closed

crypto/keccak: asm version of keccakF1600 on arm64 #33752
cuiweixie wants to merge 1 commit into ethereum:master from
cuiweixie:keccak-arm64-asm

Conversation

@cuiweixie
Copy link
Copy Markdown
Contributor

@cuiweixie cuiweixie commented Feb 4, 2026

benchmark result:

goos: darwin
goarch: arm64
pkg: github.com/ethereum/go-ethereum/crypto/keccak
cpu: Apple M4 Pro
                       │   old.txt   │              new.txt               │
                       │   sec/op    │   sec/op     vs base               │
PermutationFunction-14   220.9n ± 1%   200.7n ± 3%  -9.12% (p=0.000 n=10)

                       │   old.txt    │               new.txt                │
                       │     B/s      │     B/s       vs base                │
PermutationFunction-14   863.3Mi ± 1%   950.3Mi ± 2%  +10.08% (p=0.000 n=10)

                       │  old.txt   │            new.txt             │
                       │    B/op    │    B/op     vs base            │
PermutationFunction-14   0.000 ± 0%   0.000 ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal

                       │  old.txt   │            new.txt             │
                       │ allocs/op  │ allocs/op   vs base            │
PermutationFunction-14   0.000 ± 0%   0.000 ± 0%  ~ (p=1.000 n=10) ¹

gen by:

//go:build ignore

// Generator for fully unrolled ARM64 Keccak-f[1600] implementation.
// Chi step uses registers R17, R19-R22 (R18 is reserved on ARM64) instead of stack.
// Usage: go run gen_arm64_asm.go [-o <file>]  (writes keccakf_arm64.s unless -o <file> is given)

package main

import (
	"fmt"
	"os"
)

// roundConstants holds one 64-bit constant per round (24 entries). The
// generator XORs roundConstants[round] into the first chi result of group 0
// (the iota step), so the table is indexed by the absolute round number.
var roundConstants = []uint64{
	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
	0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
}

// Round patterns extracted from the Go reference implementation.
//
// GroupPattern describes how one group of five state lanes is processed
// within a round. All four arrays are indexed by position within the group.
type GroupPattern struct {
	src   [5]int // state lane index (0-24) loaded for each position; also used for the write-back
	rot   [5]int // left-rotation amount applied after XOR with D; 0 means no rotate is emitted
	bcIdx [5]int // which bc register (bc[0]-bc[4]) the rotated lane is placed in
	dst   [5]int // destination lane index; equal to src in every table entry and not read by the generator code visible here
}

// RoundPattern is the lane schedule for a single round: five groups of five
// lanes each, together covering all 25 lanes of the state.
type RoundPattern struct {
	groups [5]GroupPattern
}

// chiOutRegs lists the register numbers that receive the five chi results of
// a group before they are stored back to the state: R17, R19, R20, R21, R22.
// R18 is skipped because it is platform-reserved on ARM64.
var chiOutRegs = [5]int{17, 19, 20, 21, 22}

// roundPatterns is the repeating four-round lane schedule; the generator
// selects an entry with round%4. Within each GroupPattern the positional
// fields are, in order: src, rot, bcIdx, dst. The rot and bcIdx columns are
// the same in every round; only the lane indices (src/dst) change.
var roundPatterns = []RoundPattern{
	// Round 0
	{groups: [5]GroupPattern{
		{[5]int{0, 6, 12, 18, 24}, [5]int{0, 44, 43, 21, 14}, [5]int{0, 1, 2, 3, 4}, [5]int{0, 6, 12, 18, 24}},
		{[5]int{10, 16, 22, 3, 9}, [5]int{3, 45, 61, 28, 20}, [5]int{2, 3, 4, 0, 1}, [5]int{10, 16, 22, 3, 9}},
		{[5]int{20, 1, 7, 13, 19}, [5]int{18, 1, 6, 25, 8}, [5]int{4, 0, 1, 2, 3}, [5]int{20, 1, 7, 13, 19}},
		{[5]int{5, 11, 17, 23, 4}, [5]int{36, 10, 15, 56, 27}, [5]int{1, 2, 3, 4, 0}, [5]int{5, 11, 17, 23, 4}},
		{[5]int{15, 21, 2, 8, 14}, [5]int{41, 2, 62, 55, 39}, [5]int{3, 4, 0, 1, 2}, [5]int{15, 21, 2, 8, 14}},
	}},
	// Round 1
	{groups: [5]GroupPattern{
		{[5]int{0, 16, 7, 23, 14}, [5]int{0, 44, 43, 21, 14}, [5]int{0, 1, 2, 3, 4}, [5]int{0, 16, 7, 23, 14}},
		{[5]int{20, 11, 2, 18, 9}, [5]int{3, 45, 61, 28, 20}, [5]int{2, 3, 4, 0, 1}, [5]int{20, 11, 2, 18, 9}},
		{[5]int{15, 6, 22, 13, 4}, [5]int{18, 1, 6, 25, 8}, [5]int{4, 0, 1, 2, 3}, [5]int{15, 6, 22, 13, 4}},
		{[5]int{10, 1, 17, 8, 24}, [5]int{36, 10, 15, 56, 27}, [5]int{1, 2, 3, 4, 0}, [5]int{10, 1, 17, 8, 24}},
		{[5]int{5, 21, 12, 3, 19}, [5]int{41, 2, 62, 55, 39}, [5]int{3, 4, 0, 1, 2}, [5]int{5, 21, 12, 3, 19}},
	}},
	// Round 2
	{groups: [5]GroupPattern{
		{[5]int{0, 11, 22, 8, 19}, [5]int{0, 44, 43, 21, 14}, [5]int{0, 1, 2, 3, 4}, [5]int{0, 11, 22, 8, 19}},
		{[5]int{15, 1, 12, 23, 9}, [5]int{3, 45, 61, 28, 20}, [5]int{2, 3, 4, 0, 1}, [5]int{15, 1, 12, 23, 9}},
		{[5]int{5, 16, 2, 13, 24}, [5]int{18, 1, 6, 25, 8}, [5]int{4, 0, 1, 2, 3}, [5]int{5, 16, 2, 13, 24}},
		{[5]int{20, 6, 17, 3, 14}, [5]int{36, 10, 15, 56, 27}, [5]int{1, 2, 3, 4, 0}, [5]int{20, 6, 17, 3, 14}},
		{[5]int{10, 21, 7, 18, 4}, [5]int{41, 2, 62, 55, 39}, [5]int{3, 4, 0, 1, 2}, [5]int{10, 21, 7, 18, 4}},
	}},
	// Round 3
	{groups: [5]GroupPattern{
		{[5]int{0, 1, 2, 3, 4}, [5]int{0, 44, 43, 21, 14}, [5]int{0, 1, 2, 3, 4}, [5]int{0, 1, 2, 3, 4}},
		{[5]int{5, 6, 7, 8, 9}, [5]int{3, 45, 61, 28, 20}, [5]int{2, 3, 4, 0, 1}, [5]int{5, 6, 7, 8, 9}},
		{[5]int{10, 11, 12, 13, 14}, [5]int{18, 1, 6, 25, 8}, [5]int{4, 0, 1, 2, 3}, [5]int{10, 11, 12, 13, 14}},
		{[5]int{15, 16, 17, 18, 19}, [5]int{36, 10, 15, 56, 27}, [5]int{1, 2, 3, 4, 0}, [5]int{15, 16, 17, 18, 19}},
		{[5]int{20, 21, 22, 23, 24}, [5]int{41, 2, 62, 55, 39}, [5]int{3, 4, 0, 1, 2}, [5]int{20, 21, 22, 23, 24}},
	}},
}

// main writes the generated ARM64 assembly for keccakF1600 to a file.
//
// The output path defaults to keccakf_arm64.s in the current directory and is
// overridden by "-o <path>". A "-o" with no path after it falls back to the
// default file. If the file cannot be created, written or closed, the error is
// printed to stderr and the process exits with status 1, so a truncated
// assembly file is never reported as a successful run.
func main() {
	path := "keccakf_arm64.s"
	if len(os.Args) > 2 && os.Args[1] == "-o" {
		path = os.Args[2]
	}

	f, err := os.Create(path)
	if err != nil {
		fmt.Fprintf(os.Stderr, "create %s: %v\n", path, err)
		os.Exit(1)
	}

	// Remember the first write failure and turn later writes into no-ops;
	// the error is reported once, after generation has finished.
	var writeErr error
	emit := func(s string) {
		if writeErr == nil {
			_, writeErr = fmt.Fprint(f, s)
		}
	}
	emitf := func(format string, args ...interface{}) {
		if writeErr == nil {
			_, writeErr = fmt.Fprintf(f, format, args...)
		}
	}

	emit(`// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build gc && !purego && arm64

#include "textflag.h"

// Code generated by go run gen_arm64_asm.go. DO NOT EDIT.

// Fully unrolled keccakF1600 for ARM64
// func keccakF1600(a *[25]uint64)
TEXT ·keccakF1600(SB), NOSPLIT, $0-8
	MOVD	a+0(FP), R0

`)

	// Generate all 24 rounds
	for round := 0; round < 24; round++ {
		generateRound(round, emitf)
	}

	emit(`
	RET
`)

	// Close explicitly rather than with defer: os.Exit skips deferred calls,
	// and Close can be the point where a delayed write error surfaces.
	if closeErr := f.Close(); writeErr == nil {
		writeErr = closeErr
	}
	if writeErr != nil {
		fmt.Fprintf(os.Stderr, "write %s: %v\n", path, writeErr)
		os.Exit(1)
	}
}

// printfFunc is the fmt.Printf-shaped sink that generateRound uses to emit
// assembly text; main supplies one that writes to the output file.
type printfFunc func(string, ...interface{})

// generateRound emits the assembly for one round through printf.
//
// Register assignment in the emitted code:
//
//	R0            pointer to the 25-lane state
//	R1-R5         column parities C[0]-C[4]
//	R6-R10        D[0]-D[4]
//	R11-R15       bc[0]-bc[4] for the group being processed
//	R16           scratch used while accumulating the parities
//	R17, R19-R22  chi results (chiOutRegs; R18 is reserved on ARM64)
//	R2            reused for the round constant once the parities are dead
//
// round selects the constant from roundConstants and, modulo 4, the lane
// schedule from roundPatterns.
func generateRound(round int, printf printfFunc) {
	const (
		parityBase = 1  // C[x] is held in R(1+x)
		deltaBase  = 6  // D[x] is held in R(6+x)
		bcBase     = 11 // bc[i] is held in R(11+i)
	)

	// The lane schedule repeats every four rounds.
	schedule := roundPatterns[round%4]

	printf("\n\t// ========== ROUND %d ==========\n", round)

	// Theta, part 1: XOR the five lanes of each column into its parity register.
	printf("\t// Theta\n")
	for col := 0; col < 5; col++ {
		parity := parityBase + col
		printf("\tMOVD\t%d(R0), R%d\n", col*8, parity)
		for lane := col + 5; lane < 25; lane += 5 {
			printf("\tMOVD\t%d(R0), R16\n", lane*8)
			printf("\tEOR\tR16, R%d, R%d\n", parity, parity)
		}
	}

	// Theta, part 2: D[x] = C[x-1] ^ rotl(C[x+1], 1); ROR by 63 is rotl by 1.
	printf("\t// D values\n")
	for col := 0; col < 5; col++ {
		delta := deltaBase + col
		before := parityBase + (col+4)%5
		after := parityBase + (col+1)%5

		printf("\tROR\t$63, R%d, R%d\n", after, delta)
		printf("\tEOR\tR%d, R%d, R%d\n", before, delta, delta)
	}

	for g, grp := range schedule.groups {
		printf("\t// Group %d\n", g)

		// Load each lane, fold in D for its column, rotate, and leave the
		// result in the bc register chosen by bcIdx.
		for pos, lane := range grp.src {
			bc := bcBase + grp.bcIdx[pos]

			printf("\tMOVD\t%d(R0), R%d\n", lane*8, bc)
			printf("\tEOR\tR%d, R%d, R%d\n", deltaBase+lane%5, bc, bc)
			if left := grp.rot[pos]; left != 0 {
				// Only ROR is emitted, so a left rotate by n becomes ROR by 64-n.
				printf("\tROR\t$%d, R%d, R%d\n", (64-left)%64, bc, bc)
			}
		}

		// Chi: out[i] = bc[i] ^ (^bc[i+1] & bc[i+2]).
		for i, outReg := range chiOutRegs {
			printf("\tBIC\tR%d, R%d, R%d\n", bcBase+(i+1)%5, bcBase+(i+2)%5, outReg)
			printf("\tEOR\tR%d, R%d, R%d\n", bcBase+i, outReg, outReg)

			// Iota: only the very first result of the round gets the constant.
			if g == 0 && i == 0 {
				printf("\tMOVD\t$0x%016x, R2\n", roundConstants[round])
				printf("\tEOR\tR2, R%d, R%d\n", outReg, outReg)
			}
		}

		// Store the five chi results back over the lanes this group loaded.
		for i, outReg := range chiOutRegs {
			printf("\tMOVD\tR%d, %d(R0)\n", outReg, grp.src[i]*8)
		}
	}
}

@cuiweixie
Copy link
Copy Markdown
Contributor Author

any updates, will this pr be merged?

@lightclient
Copy link
Copy Markdown
Member

Related: #33879

Seems to offer better performance.

@lightclient
Copy link
Copy Markdown
Member

We discussed further and feel #33879 offers better performance and will pursue that PR.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants