transfer-lamports: Improve asm performance, update table (#12)

joncinque · web-flow · commit 7d0056835d5f · 2024-11-04T19:42:40.000Z
#### Problem

The assembly version of transfer-lamports does redundant register work when
aligning the end-of-account-data pointers. Also, the README tables could use
more information.

#### Summary of changes

Optimize the assembly version a bit further, and add some information
about the relative performance of different implementations.
diff --git a/README.md b/README.md
@@ -165,20 +165,25 @@ the amount given by a little-endian u64 in instruction data.
 | Rust | 459 |
 | Zig | 44 |
 | C | 104 |
-| Assembly | 31 |
+| Assembly | 30 |
 | Rust (pinocchio) | 32 |
 
 This one starts to get interesting since it requires parsing the instruction
 input. Since the assembly version knows exactly where to find everything, it can
-be hyper-optimized.
+be hyper-optimized. The pinocchio version comes very close to the assembly
+implementation!
 
 * CPI: allocates a PDA given by the seed "You pass butter" and a bump seed in
 the instruction data. This requires a call to `create_program_address` to check
 the address and `invoke_signed` to CPI to the system program.
 
-| Language | CU Usage |
-| --- | --- |
-| Rust | 3662 |
-| Zig | 2825 |
-| C | 3122 |
-| Rust (pinocchio) | 2816 |
+| Language | CU Usage | CU Usage (minus syscalls) |
+| --- | --- | --- |
+| Rust | 3662 | 1162 |
+| Zig | 2825 | 325 |
+| C | 3122 | 622 |
+| Rust (pinocchio) | 2816 | 316 |
+
+Note: `create_program_address` consumes 1500 CUs, and `invoke_signed` consumes
+1000 CUs, so we can subtract 2500 CUs from each program to see the actual cost
+of the program logic.
diff --git a/transfer-lamports/asm/main.s b/transfer-lamports/asm/main.s
@@ -12,10 +12,9 @@ entrypoint:
 	add64 r4, 8 + 8 + 32 + 32 + 8 + 8 + 10240 + 8 # calculate end of account data
 	add64 r4, r3
 	mov64 r5, r4 # check how much padding we need to add
-	and64 r5, -8 # clear low bits
+	and64 r4, -8 # clear low bits
 	jeq r5, r4, 1 # no low bits set, jump ahead
 	add64 r4, 8 # add 8 for truncation if needed
-	and64 r4, -8 # clear low bits
 
 	ldxb r5, [r4 + 0] # get second account
 	jne r5, 0xff, error # we don't allow duplicates
@@ -25,11 +24,12 @@ entrypoint:
 	add64 r7, 8 + 32 + 32 + 8 + 8 + 10240 + 8 # calculate end of account data
 	add64 r7, r6
 	mov64 r8, r7 # check how much padding we need to add
-	and64 r8, -8 # clear low bits
+	and64 r7, -8 # clear low bits
 	jeq r8, r7, 1 # no low bits set, jump ahead
 	add64 r7, 8 # add 8 for truncation if low bits are set
+
 	ldxdw r8, [r7 + 0] # get instruction data size
-	jne r8, 0x08, error # need 8 bytes of instruction data
+	jne r8, 8, error # need 8 bytes of instruction data
 	ldxdw r8, [r7 + 8] # get instruction data as little-endian u64
 
 	sub64 r2, r8 # subtract lamports