-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnootloader.S
More file actions
481 lines (395 loc) · 20.4 KB
/
nootloader.S
File metadata and controls
481 lines (395 loc) · 20.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
@ Constants.
@ NOTE: the C preprocessor pastes everything after the value into each expansion,
@ so the macros below that carry a trailing "@" comment also paste that comment
@ wherever they are used. Only use those as the last thing on an instruction line.
@
@ I/O register base, and byte offsets of individual registers from that base.
#define REG_BASE 0x4000000
#define REG_TM0CNT_L_OFFSET 0x100
#define REG_RCNT_OFFSET 0x134
#define REG_SIOCNT_OFFSET 0x128
#define REG_SIODATA32_OFFSET 0x120
#define REG_HALTCNT_OFFSET 0x301
#define REG_BIOS_IRQIF 0x3007FF8
@ Interrupt bits, as tested against (IE & IF) in _serial_isr and written to IE in _main.
#define IRQ_TIMER0 (1 << 3)
#define IRQ_SERIAL (1 << 7)
#define IRQ_KEYPAD (1 << 12)
#define IRQ_GAMEPAK (1 << 13)
@ SIOCNT bits.
#define SIO_32BIT (1 << 12)
#define SIO_IRQ_EN (1 << 14)
#define SIO_START (1 << 7)
@ Location of the user interrupt handler pointer. The _OFFSET form is relative to
@ REG_BASE and is what _main actually stores through.
#define USER_ISR_PTR 0x3007FFC
#define USER_ISR_PTR_OFFSET -4
@ Protocol words exchanged with the Master (four ASCII characters each).
#define REQ_RDYQ 0x3F594452 @ "RDY?", but flipped due to Little Endian integer encoding
#define REPLY_NOOT 0x544F4F4E @ "NOOT", ...and so on.
#define REPLY_LENQ 0x3F4E454C @ "LEN?"
#define REPLY_LENOK 0x214B4F4C @ "LOK!"
#define REPLY_CRCQ 0x3F435243 @ "CRC?"
@ BIOS address that _sio_transfer_32 branches to in order to halt, and the values
@ it can be handed in r2.
#define CUSTOM_HALT_ADDR 0x1AC
#define HALT 0x00
#define STOP 0x80
@ Destination that _download_success copies the clean-boot stub to.
#define SAFE_IWRAM_BEGIN 0x3007E00
@ Timer control bits (upper halfword of the 32-bit word at REG_TM0CNT_L_OFFSET).
#define TIMER_PRESCALE_256 0x2
#define TIMER_IRQ_EN (1 << 6)
#define TIMER_EN (1 << 7)
.syntax unified
.cpu arm7tdmi
.global _nootloader_entry
.type _nootloader_entry, %function
.arm
@ Nootloader, by nullstalgia, 2025
@ Summary:
@ A simple 2nd-stage bootloader for the GBA,
@ intended to be loaded via Multiboot/Xboo Burst Boot Backdoor (recommended),
@ or directly embedded into homebrew (less-recommended, but works in a pinch).
@ Flow:
@ Each transaction is 32 bits, with the Master and Slave exchanging their words simultaneously.
@ (SPI Mode 3, MSB First)
@ Master - PC/Other
@ Slave - GBA
@
@ Master -> "RDY?"
@ Slave -> "NOOT" @ Note: Master will typically repeat sending "RDY?" if a "NOOT" was not yet received, this could be detected similarly to the Xboo Burst Boot Backdoor and alternatively branched into, instead of downloading nootloader via the Xboo Backdoor.
@
@ Master -> length of incoming payload in bytes
@ Slave -> "LEN?" (to indicate expecting length)
@
@ - First payload word sent
@ Master -> first_word
@ Slave -> "LOK!" (to indicate length was accepted, i.e. 4 <= len <= 0x40000)
@
@ - Subsequent payload words sent
@ Master -> payload_word
@ Slave -> EWRAM address that this payload_word will be stored at (the first transfer replies "LOK!" in place of 0x2000000, so the first address the Master sees is 0x2000004)
@
@ - Final payload word sent
@ Master -> last_payload_word
@ Slave -> "CRC?" (to indicate expecting CRC as next and final word)
@
@ - CRC exchange and compare
@ Master -> crc
@ Slave -> crc
@
@ The CRC is *very* simply calculated: sum of all payload words XOR'd by payload length (u32)
@
@ If the CRC matches, the slave will boot the downloaded payload, otherwise it will wait for another "RDY?" to restart the process.
@ How nootloader boots downloaded payloads:
@ Upon verifying the crude CRC, nootloader will clear itself from IWRAM while simultaneously resetting all registers and peripherals.
@ This is achieved by copying a teeny subroutine into the (unused by nootloader) user stack area that overlaps with the final 0x200 bytes
@ in IWRAM that don't get cleared by RegisterRamReset.
@ The subroutine calls RegisterRamReset followed by SoftReset, which branches to 0x2000000 after resetting the final 0x200 bytes within IWRAM.
@ Regarding payload entry points and nootloader's (optional yet frequent) payload patching requirement:
@ When launching a binary off a cartridge, the GBA branches to (header-relative) address 0x0.
@ When launching a binary downloaded via Normal Mode Multiboot, the GBA branches to (header-relative) address 0xC0.
@ This allows developers to craft a single binary that can behave differently depending on _how_ it was loaded,
@ but poses an interesting problem for nootloader, as SoftReset branches to 0x2000000, whereas Multiboot would typically branch to 0x20000C0.
@
@ As a result, it is a _strong recommendation_ that payloads sent to nootloader have their root entry points either contain branches to
@ the same points in your code, or have the payloads _patched by the sending application_ to header-relative 0xC0 (or something custom if you're modifying my stuff~).
@
@ Otherwise, the binary may assume it's running on a cartridge and try to read extra data from the potentially-empty slot.
@
@ This patching is already performed (with safety checks) by gbasend (and the desktop crate using it, gbasend-cli), so no extra work
@ should be required from users, but it should be noted in case you use SoftReset yourself and potentially rely on that strange branching behavior,
@ or if there is a payload that performs a self-CRC check, that would be violated by the aforementioned patching. There _is_ an option to skip the
@ behavior within my crates, but if there's a payload that expects the initial branch to 0xC0 + SoftResets to 0x0 and/or self-CRCs,
@ then it would be deemed incompatible. I am unaware if any such payload exists, given the niche setups required, but I'd love to know!
@
@ To be clear, this _is_ an entirely self-imposed problem, but the idea of having a rather deterministic reset state from
@ combining RegisterRamReset and SoftReset was an appealing one. If there's no such benefit, then perhaps I'll reverse course. We'll see~
.section .text._nootloader,"ax",%progbits
#ifdef MULTIBOOT
@ Header emitted only for MULTIBOOT builds. The offsets noted below are relative to
@ _header and follow directly from the sizes of the directives (each branch is 4 bytes).
_header:
b _main @ 0x00: ROM entry point
@ 0x04: Logo Data (filled by post-link tool)
.zero 156
@ Checksum Area Start
.zero 12 @ 0xA0: Game title
.zero 4 @ 0xAC: Game code
.zero 2 @ 0xB0: Maker code
.byte 0x96 @ 0xB2: Mandatory
.zero 1 @ 0xB3: Device code
.zero 1 @ 0xB4: Device type
.zero 7 @ 0xB5: Reserved
.zero 1 @ 0xBC: Game version
@ Checksum Area End
.zero 1 @ 0xBD: Checksum (filled by post-link tool)
.zero 2 @ 0xBE: Reserved
b _main @ 0xC0: Multiboot entry point
.zero 28 @ 0xC4: bytes between the Multiboot and JOYBUS entry points, left zeroed here
b _main @ 0xE0: JOYBUS entry point
.zero 4 @ 0xE4: Optional padding to allow mGBA Multiboot detection (it explicitly denies entries that branch to just after the header)
#endif
@ Exported entry point. This is also the first word that _main copies to IWRAM
@ (the copy source starts at this label), so the copy begins with a valid branch.
_nootloader_entry:
b _main
@ Literal pool placement point for any constants pending so far.
.pool
@ Yoinked from tonc's IRQ handler example and modified
@ Summary:
@ Interrupt handler installed by _main. Acknowledges every pending-and-enabled IRQ, then:
@ - Serial IRQ: busy-waits for the SIOCNT Start/Active bit to clear, then returns.
@ - TIMER0 IRQ (timeout): blinks the idle dot or erases the progress bar, and overwrites
@   the word at [sp] so that _sio_transfer_32 ends up returning to _wait_for_start_timer0.
@ - Anything else: returns immediately.
@ If both Serial and TIMER0 are pending, only the Serial path runs (both are acknowledged).
@ Assumptions:
@ r4 == 0x4000000, REG_BASE
@ r6 == 0x4000200, REG_IE
@ r7 / r8 / r10 hold the main loop's length, color and bytes-per-pixel state
@ NOTE(review): the timer path assumes the BIOS IRQ dispatcher pushed r0-r3, r12, lr with the
@ interrupted r0 at [sp], and that the interrupted code is _sio_transfer_32 sitting in its halt
@ with the caller's lr stashed in r0 - confirm against GBATEK.
@ Clobbers:
@ r0, r1, r2 (and r12 via _colored_line); the timer path also rewrites r7 and r8
_serial_isr:
@ Read IE (low halfword) and IF (high halfword) in one load
ldr r0, [r6]
@ r0 = IE & IF: only the interrupts that are both enabled and pending
and r0, r0, r0, lsr #16
#orr r0, r0, r0, lsr #16
# @ Store last IRQ reason in r11
# mov r11, r0
@ Acknowledge IRQ in IF and in BIOS_IF
strh r0, [r6, #2]
@ 0x4000200 - 0x208 = 0x3FFFFF8. NOTE(review): presumably a mirror of REG_BIOS_IRQIF (0x3007FF8) - confirm
ldr r2, [r6, #-0x208]
orr r2, r2, r0
str r2, [r6, #-0x208]
tst r0, IRQ_SERIAL
bne _wait_start_bit
tst r0, IRQ_TIMER0
beq _serial_isr_done @ Bit not recognized, skip to end
_timer_interrupt:
adr r2, _wait_for_start_timer0
str r2, [sp] @ Replace the saved word at [sp] (see NOTE above: the stashed lr) with _wait_for_start_timer0
movs r0, r7 @ r0 = r7 (bytes to paint), and set Z if r7 is 0
@ The flags from movs stay live through all four conditional moves below.
_not_graceful:
movne r8, #0 @ r7 was non-zero: an incomplete transfer, or the dot is currently lit
movne r7, #0 @ Paint black over the old r7 bytes and mark the bar/dot as cleared
_graceful:
moveq r8, #0x1F @ r7 was zero: idle with nothing drawn
moveq r7, #1 @ Paint a single red pixel (r0 is 0) and set r7 non-zero so the next timeout clears it
b _colored_line @ Tail call: _colored_line returns through our lr
_wait_start_bit:
@ Wait for Start Bit to go low to indicate transfer finished
@ (Sometimes we can go too fast and clobber the data in REG_SIODATA32 if we don't wait?)
add r1, r4, REG_SIOCNT_OFFSET
ldr r0, [r1]
tst r0, SIO_START
bne _wait_start_bit @ Bit still set, go check again. TODO, timeout? can use cascading timer for seconds
_serial_isr_done:
bx lr @ and exit
@ Summary:
@ Copies the words between *r0 and *r2 to *r1 using CpuFastSet, and then branches into *r1.
@ Input / Output:
@ r0 - Source address (should start with a valid ARM instruction)
@ r1 - Destination address (will branch into this address after copy concludes)
@ r2 - Source End address (should be higher than r0, will copy all the words from r0 up to r2)
@ Clobbers:
@ r2 and r12 directly, plus whatever the swi changes; pc, will not return
@ Notes:
@ Both callers in this file disable IME before jumping here.
@ NOTE(review): the word count is passed as-is and is not rounded to a multiple of 8 words;
@ this relies on CpuFastSet copying in whole 32-byte units - confirm against GBATEK.
_copy_and_branch:
mov r12, r1 @ Stash r1's value in case it gets clobbered by the swi
sub r2, r2, r0 @ Subtract r0 from r2 to get length in bytes
mov r2, r2, lsr #2 @ Shift result right by 2 to divide result by 4, leaving r2 with length in words.
# orr r2, (1 << 26) @ For CpuSet (swi 0xB) only, set word width to 32 bits
swi 0xC << 16 @ Normal trigger of CpuFastSet (in ARM state the swi number is given shifted left by 16)
mov pc, r12 @ Branch into copied code at destination
@ Summary:
@ Draws a centered, 200 x 1, progress bar.
@ One pixel is written per r10 bytes of r0. The loop tests after writing, so at
@ least one pixel is always drawn, even when r0 is 0.
@ Entry points:
@ _colored_line - returns to the caller through lr
@ _colored_line_and_lr - first points lr at _wait_for_start_timer0, so the final
@ bx lr resumes the main loop there instead of returning (reached with a plain b)
@ Input / Output:
@ r0 - Bytes transferred (preserved)
@ r8 - Color (15-bit) (currently using CRC for color via r8)
@ Assumptions:
@ r10 - Contains bytes-per-pixel, must be non-zero or the loop cannot make progress
@ Clobbers:
@ r2, r12 (and lr when entered via _colored_line_and_lr)
_colored_line_and_lr:
adr lr, _wait_for_start_timer0
_colored_line:
mov r2, #0 @ r2 = bytes represented by the pixels drawn so far
ldr r12, =#0x06004B28 @ Load address in VRAM to write to,
@ here is about 1/4 down from the top,
@ and centered for 200 pixel wide progress bars (for the 240-wide screen)
_next_line_pixel:
strh r8, [r12], #2 @ Write value to VRAM and increment to next pixel address
add r2, r10 @ Add one pixel's worth of bytes to r2
cmp r2, r0 @ Check if we've gone past or equal to the written bytes
blo _next_line_pixel @ And if not, write the next pixel and loop back again
bx lr @ Return to caller
@ Summary:
@ One-time setup: sets the stacks, relocates the program to IWRAM if it is running
@ elsewhere, installs _serial_isr, configures the serial port and the display, and
@ then falls through into _wait_for_start.
@ Register conventions established here and relied on by every other routine:
@ r4 - Contains the constant 0x4000000, REG_BASE
@ r6 - Contains the constant 0x4000200, REG_IE
@ Several stores below address registers as a negative offset from r6 rather than a
@ positive offset from r4; halfword stores only encode an 8-bit immediate offset,
@ and REG_RCNT_OFFSET / REG_SIOCNT_OFFSET are larger than that.
@ TODO:
@ - reply-less(?) version for 4-player 32-bit transmission
@ - need to look into how the one-way daisy-chaining actually works.
@ - for much later, after initial public release, but would be a neat feature
@ - maybe clear vram at start in case of pre-existing data?
@ - hardware timers for timeouts and blinking idle "cursor"
@ - red progress bar upon failure
@ - handle when externally branched into from IWRAM
_main:
@ Load the base address for most registers
mov r4, #0x4000000
@ Clear Interrupt Master Enable (IME): the low halfword of r4 is zero
add r6, r4, #0x200
strh r4, [r6,#8]
@ Stack setup for all modes (supervisor, IRQ, system/user(?))
@ 0xD3 and 0xD2 keep the IRQ/FIQ disable bits (0xC0) set; 0x1F clears them.
msr cpsr_c, 0xD3
ldr sp, =#0x03007FE0 @ sp_svc
msr cpsr_c, 0xD2
ldr sp, =#0x03007FA0 @ sp_irq
msr cpsr_c, 0x1F
ldr sp, =#0x03007F00 @ sp_sys
@ TODO disable all DMAs
and r0, pc, #0xFF000000 @ Keep only the top byte of pc, i.e. which memory region we are executing from
mov r1, #0x3000000 @ Also destination address for copy_and_branch
cmp r0, r1
@ If not equal, then we're not running in IWRAM! Copy everything there and try again.
@ (adr does not touch the flags, so the cmp result is still live at the bne.)
adr r0, _nootloader_entry @ Load r0 with source address (program entry point)
adr r2, _end_of_code @ Load r2 with "final" address (end of code)
bne _copy_and_branch
@ Otherwise, begin preparing to receive serial data
@ Set our serial ISR as the Interrupt Handler
adr r0, _serial_isr
str r0, [r4, USER_ISR_PTR_OFFSET]
@ Clear TIMER0 (word store of r4: the low halfword, the counter reload, is zero)
str r4, [r4, REG_TM0CNT_L_OFFSET]
@ Enable only the Serial IO and TIMER0 interrupts in IE
mov r0, (IRQ_SERIAL | IRQ_TIMER0)
strh r0, [r6]
@ Set RCNT to 0b11, setting up the swap for Normal Mode to
@ reset the GBA communication circuitry
@ (r6 - 0xCC == REG_BASE + REG_RCNT_OFFSET)
mov r0, (0b11 << 14)
strh r0, [r6, #-0xCC]
@ Set RCNT to 0 for Normal Mode
mov r0, #0
strh r0, [r6, #-0xCC]
@ Configure SIOCNT to 32-bit mode and to enable interrupts after every transaction
@ (r6 - 0xD8 == REG_BASE + REG_SIOCNT_OFFSET)
mov r0, (SIO_32BIT | SIO_IRQ_EN)
strh r0, [r6, #-0xD8]
# mrs r0, cpsr @ Read current CPSR
# bic r0, r0, #0x80 @ Unconditionally clear IRQ Disable bit
# msr cpsr_c, r0 @ Store modified CPSR
mov r0, #0x18
swi 0x1 << 16 @ RegisterRamReset VRAM and OAM for a clean workspace for our progress bar
@ Set Interrupt Master Enable (IME)
mov r0, #1
strh r0, [r6, #8]
# @ Set screen black (clear forced blank)
# strh r4, [r4]
@ Set screen black (clear forced blank + BG Mode 3)
ldr r1, =#0x0403
strh r1, [r4]
@ Continue on to wait for handshake and payload.
@ Summary:
@ Protocol state machine: handshake, length, payload words, CRC, then boot or retry.
@ Entered by falling through from _main, and re-entered on any failure.
@ Register roles:
@ r1 - "RDY?" during the handshake, then the 0x7FFFFF cursor mask during the download
@ r7 - payload length minus 4 while receiving (0 while idle); also read and rewritten by the timer path of _serial_isr
@ r8 - running sum of received words, doubling as the progress bar color
@ r9 - EWRAM write cursor, starts at 0x2000000
@ r10 - bytes per progress bar pixel
@ Entry points:
@ _wait_for_start - full restart, also resets r7 and r10
@ _wait_for_start_timer0 - restart that keeps r7 and r10, used after a timeout or a failed CRC
@ so that the next timer interrupt can still erase what was drawn
@ NOTE(review): a length that is not a multiple of 4 leaves the receive loop one word early
@ (the cursor comparison goes straight from lower to higher without ever being equal).
@ Presumably the sender always rounds the length up to a whole word - confirm.
_wait_for_start:
mov r7, #0 @ incoming length-4
mov r10, #1 @ bytes per progress bar increase, default 1 for manual blinking control
_wait_for_start_timer0:
mov r8, #0 @ "crc" (all incoming data units added together, with length xor'd afterwards)
mov r9, #0x2000000 @ current cursor
ldr r0, =#REPLY_NOOT @ Load "NOOT" into r0 to send out pre-emptively
ldr r1, =#REQ_RDYQ @ Load "RDY?" into r1 to check against
bl _sio_transfer_32 @ Halts until transfer occurs
cmp r0, r1 @ Check if we rx'd "RDY?"
bne _wait_for_start @ If it doesn't match, try again
_wait_for_len:
ldr r0, =#REPLY_LENQ @ Load "LEN?" to indicate expecting len when transferred
bl _sio_transfer_32 @ Halts until transfer occurs
@ TODO - AND with 0x00FFFFF and check that length
@ so that 0xFF000000 can be used for bit flags
cmp r0, #0x40000 @ Compare received length with what would be the max payload size (EWRAM Size)
bhi _wait_for_start @ If too big, start over.
cmp r0, #0x4 @ Ensure there's at least 4 bytes since we'll be subbing 4 from the len temporarily
blo _wait_for_start @ If too small, start over.
sub r7, r0, #4 @ Save length into r7, minus four bytes
mov r1, #200 @ Place divisor of 200 in r1, r0 already contains full length in bytes
swi 0x6 << 16 @ Run BIOS division subroutine
add r10, r0, #4 @ Add (quotient + 4) into r10, giving us the number of bytes per progress bar pixel.
@ Adding the remainder in r1 can skew the ratio of bytes to pixels, resulting in an uncentered bar when complete, while the small constant avoids these issues.
@ And not adding anything at all can result in tiny payloads looping forever in _colored_line.
ldr r0, =#REPLY_LENOK @ Load "LOK!" to indicate length was okay for first data transfer,
@ subsequent transfers will return the memory address being written to
ldr r1, =#0x7FFFFF @ Mask that turns the EWRAM cursor into a byte count
_send_to_ewram:
bl _sio_transfer_32 @ Halts until transfer occurs
str r0, [r9], #4 @ Store received word into *r9, increment r9 by 4
add r8, r0 @ Add received word into r8 for "CRC"
and r0, r9, r1 @ Mask cursor to get written length
bl _colored_line @ Render progress bar (r0 is preserved, color is the running sum in r8)
cmp r0, r7 @ Compare with expected length
@ Neither the mov nor the ldr below touches the flags, so both branches test this one cmp.
mov r0, r9 @ Prepare for not enough sent, place next address in r0
blo _send_to_ewram @ If received less than length-4, keep receiving
ldr r0, =#REPLY_CRCQ @ Prepare for final word incoming, place "CRC?" in r0
beq _send_to_ewram @ If received exactly length-4, receive one final (data) word
_finish_download:
add r7, #4 @ Add back the 4-bytes we removed earlier
eor r8, r7, r8 @ XOR the length (to account for zero-padding bytes)
mov r0, r8 @ Store CRC in r0 to send
bl _sio_transfer_32 @ Halts until transfer occurs
cmp r0, r8 @ Compare sent CRC with received CRC
beq _download_success @ If match, prepare to launch payload.
mov r0, r7 @ Otherwise, prepare to draw red failure progress bar.
mov r8, #0x1F
b _colored_line_and_lr @ Draws the bar, then continues at _wait_for_start_timer0
_download_success:
strh r4, [r6,#8] @ Clear Interrupt Master Enable (IME)
adr r0, _clean_boot_ewram @ Load r0 with source addr (program entry point)
ldr r1, =#SAFE_IWRAM_BEGIN @ Load r1 with dest. addr (default user stack begin address)
adr r2, _clean_boot_end @ Load r2 with "final" address (_clean_boot_end)
b _copy_and_branch @ Copy the words from r0 up to r2 into r1, and branch to r1
_trap:
b . @ Loop forever, todo show with prog bar? (nothing in this file branches here)
.pool
@ For now, assume normal mode SPI32
@ Summary:
@ Prepares r0 to be sent via SIO, hardware-HALTing until an interrupt arrives, and returns the received value in r0.
@ TIMER0 is restarted on every call to act as a timeout for the transfer.
@ Input / Output:
@ r0 - 32bit word in/out
@ Assumptions:
@ r4 == 0x4000000, REG_BASE
@ r6 == 0x4000200, REG_IE
@ _serial_isr is installed and the Serial + TIMER0 interrupts are enabled
@ Clobbers:
@ r2, r12
@ Notes:
@ The caller's lr is parked in r0 while halted. On a timeout, the timer path of _serial_isr
@ redirects that parked value, so this routine then "returns" to _wait_for_start_timer0
@ instead of to its caller.
_sio_transfer_32:
@ Clear Start bit (the low byte of r4 is zero, so this clears the whole low byte of SIOCNT)
strb r4, [r4, REG_SIOCNT_OFFSET]
@ Store outgoing word in SIO
str r0, [r4, REG_SIODATA32_OFFSET]
@ Stop TIMER0, then restart it as the timeout.
@ One word store sets both halves: the reload value 0x7FFF in the low halfword and the
@ control bits in the high halfword. The counter overflows (and raises the timer IRQ)
@ after 0x10000 - 0x7FFF prescaled ticks of 256 cycles each.
@ NOTE(review): roughly half a second at the 16.78 MHz system clock - confirm.
str r4, [r4, REG_TM0CNT_L_OFFSET]
ldr r0, =(((TIMER_EN | TIMER_IRQ_EN | TIMER_PRESCALE_256) << 16) | 0x7FFF)
str r0, [r4, REG_TM0CNT_L_OFFSET]
@ Load IF VAL into IF, ensuring a valid state for halting
@ (writing the pending bits back acknowledges them, so a stale flag cannot end the halt early)
ldrh r2, [r6, #2]
strh r2, [r6, #2]
@ Set Start bit to allow transaction
mov r2, SIO_START
strb r2, [r4, REG_SIOCNT_OFFSET]
@ Stash the real lr; lr is about to be reused as the return address from the BIOS halt code
mov r0, lr
_sio_halt:
@ Halt and wait for Serial IRQ to trigger using HALTCNT, Inspired by https://github.com/extremscorner/gba-as-controller/blob/a5e207702282831a5ff3517bf9250bc5d4ba4436/source/bios.h#L186
@ I tried to just write HALT/STOP to the byte-register myself, but perhaps I don't have the privilege? Luckily, branching into BIOS is easy.
mov r2, HALT
adr lr, _sio_finish
mov pc, CUSTOM_HALT_ADDR
# tst r11, 0x80
# beq _sio_halt @ Bit wasn't set, ignore
_sio_finish:
@ Get lr back (this is the value _serial_isr replaces on a timeout)
mov lr, r0
@ Load fresh SIO data to r0
ldr r0, [r4, REG_SIODATA32_OFFSET]
@ Back to caller
bx lr
.pool
@ Summary:
@ Resets all peripherals, clears IWRAM, and branches into EWRAM
@ This stub is never run in place: _download_success copies the words between
@ _clean_boot_ewram and _clean_boot_end to SAFE_IWRAM_BEGIN and jumps to the copy.
@ Input / Output:
@ N/A
@ Assumptions:
@ r4 == 0x4000000, REG_BASE
@ Valid code exists beginning at 0x2000000
@ IME was already cleared by the caller
@ Clobbers:
@ All registers and memory segments.
@ !Important!:
@ The current impl is under 32 bytes (five instructions), which is the size of a single CpuFastSet transfer. Ideally, we keep it <= 32 bytes.
@ It must also stay position-independent, since it executes from the copy.
_clean_boot_ewram:
mov r0, #0xFE @ Set bits 1-7, intentionally leaving bit 0 cleared.
strb r0, [r4, #-6] @ Store non-zero value into 0x3007FFA, so SoftReset doesn't branch into ROM
@ Things get risky after this SWI completes, as it relies on a couple strong invariants:
@ 1. Valid ARM code exists beginning at 0x2000000
@ - Note to self: given multiboot images have their branch address at ROM+0xC0,
@ this may not be the best place to branch into, but SoftReset doesn't allow that much granularity.
@ I might add an option to the PC-side sender to swap out the top instruction with a branch to ROM+0xC0,
@ as having that be default behavior is also problematic.
@ 2. This subroutine is currently running in IWRAM within the range 0x3007E00 to 0x3007F00 (IWRAM End - 0xFF).
@ - This is also where the various CPU mode stacks + BIOS-allocated area lives.
@ - RegisterRamReset excludes that region when clearing IWRAM, but SoftReset does not.
@ - And since each SWI invocation moves words onto the user mode stack, that stack pointer
@ should be placed far enough away from where this subroutine will be placed so that it does not
@ get overwritten mid-execution.
swi 0x1 << 16 @ RegisterRamReset everything except EWRAM and IWRAM's final 0x200
mov r0, #0xFE @ Re-set r0, previous SWI may clobber it.
swi 0x0 @ SoftReset, resets r0-r12, IWRAM's final 0x200, and branches into 0x2000000
@ End marker for the stub copy (used by _download_success).
_clean_boot_end:
.pool
@ End marker for the whole-program copy to IWRAM (used by _main).
_end_of_code:
.end