Skip to content

Commit e52ce40

Browse files
committed
AARCH64 port using CMA, based on juj#261
1 parent 157115a commit e52ce40

File tree

7 files changed

+164
-6
lines changed

7 files changed

+164
-6
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@
66
*.S
77
*.symvers
88
*.order
9+
build/*

CMakeLists.txt

+15-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ set(DEFAULT_TO_SINGLE_CORE_BOARD OFF)
2424
set(DEFAULT_TO_ARMV6Z OFF)
2525
set(DEFAULT_TO_ARMV7A OFF)
2626
set(DEFAULT_TO_ARMV8A OFF)
27+
set(DEFAULT_USE_VCSM_CMA OFF)
2728

2829
# http://ozzmaker.com/check-raspberry-software-hardware-version-command-line/
2930
if (BOARD_REVISION MATCHES "(0002)|(0003)|(0004)|(0005)|(0006)|(0007)|(0008)|(0009)" OR BOARD_REVISION MATCHES "(000d)|(000e)|(000f)|(0010)|(0011)|(0012)" OR BOARD_REVISION MATCHES "(900092)|(900093)|(9000c1)")
@@ -46,7 +47,20 @@ if (SINGLE_CORE_BOARD)
4647
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSINGLE_CORE_BOARD=1")
4748
endif()
4849

49-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -mabi=aapcs-linux -mhard-float -mfloat-abi=hard -mlittle-endian -mtls-dialect=gnu2 -funsafe-math-optimizations")
50+
# AARCH64 selects the 64-bit ARM build. It switches the default of USE_VCSM_CMA to ON
# and defines TIMER_32BIT for the sources; the 32-bit build instead gets the
# ARM32-specific ABI/float flags.
option(AARCH64 "Target a Raspberry Pi with aarch64 architecture" NO)
if (NOT AARCH64)
  set(FBCP_ARCH_CXX_FLAGS "-marm -mabi=aapcs-linux -mhard-float -mfloat-abi=hard -mlittle-endian -mtls-dialect=gnu2 -funsafe-math-optimizations")
else()
  message(STATUS "Enable AARCH64 build")
  set(DEFAULT_USE_VCSM_CMA ON)
  set(FBCP_ARCH_CXX_FLAGS "-mlittle-endian -funsafe-math-optimizations -DTIMER_32BIT")
endif()
# Append the architecture-specific flags exactly once, after the branch above picked them.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FBCP_ARCH_CXX_FLAGS}")

# USE_VCSM_CMA passes -DUSE_VCSM_CMA=1 to the compiler, which enables the code guarded by
# "#ifdef USE_VCSM_CMA" in the sources. Its default follows DEFAULT_USE_VCSM_CMA, so it is
# ON for AARCH64 builds unless the user overrides it.
option(USE_VCSM_CMA "Map Memory from CPU instead of GPU" ${DEFAULT_USE_VCSM_CMA})
if (USE_VCSM_CMA)
  message(STATUS "Enabling Map Memory from CPU instead of GPU")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_VCSM_CMA=1")
endif()
5064

5165
option(ARMV6Z "Target a Raspberry Pi with ARMv6Z instruction set (Pi 1A, 1A+, 1B, 1B+, Zero, Zero W)" ${DEFAULT_TO_ARMV6Z})
5266
if (ARMV6Z)

cma.cpp

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#ifdef USE_VCSM_CMA
2+
3+
#include "config.h"
4+
#include "cma.h"
5+
#include "util.h"
6+
#include <sys/ioctl.h>
7+
#include <fcntl.h>
8+
#include <syslog.h>
9+
#include <stdio.h>
10+
#include <stdlib.h>
11+
#include <unistd.h>
12+
13+
static int cma_fd = -1;
14+
#define PAGE_SIZE 4096
15+
16+
void OpenVCSM(void) {
17+
cma_fd = open("/dev/vcsm-cma", O_RDWR|O_SYNC);
18+
if (cma_fd < 0) FATAL_ERROR("can't open /dev/vcsm-cma");
19+
}
20+
21+
void CloseVCSM(void) {
22+
if (cma_fd >= 0) {
23+
close(cma_fd);
24+
}
25+
}
26+
27+
// Capacity, in bytes, of the name buffer carried inside an allocation request.
const int NAME_LENGTH = 32;

// Argument block exchanged with the kernel by the allocation ioctl issued in AllocateCMA().
// NOTE(review): the layout presumably has to match the vcsm-cma driver's allocation
// ioctl struct field for field - confirm against the kernel header before changing it.
struct Allocate
{
  // ---- user -> kernel ----
  uint32_t size;             // Requested size of one block in bytes.
  uint32_t num;              // Number of blocks requested.
  uint32_t flags;            // Cache mode selector; AllocateCMA() passes 0 ("NO cache").
  uint32_t pad;              // Padding, left zeroed.
  char name[NAME_LENGTH];    // Label for the allocation.

  // ---- kernel -> user ----
  int32_t fd;                // Descriptor for the allocation; negative is treated as failure by AllocateCMA().
  uint32_t vcHandle;         // Handle reported by the driver.
  uint64_t dmaAddr;          // DMA address reported by the driver.
};
42+
43+
int AllocateCMA(const char* reason, size_t req, CMAInfo* res) {
44+
if (res == NULL) {
45+
return -1;
46+
}
47+
Allocate ctx;
48+
memset(&ctx, 0, sizeof(ctx));
49+
ctx.size = ALIGN_UP(req, PAGE_SIZE);
50+
ctx.flags = 0; // NO cache
51+
strncpy((char*)ctx.name, reason, NAME_LENGTH -1);
52+
ctx.num = 1;
53+
if (ioctl(cma_fd, _IOR('J', 0x5A, struct Allocate), &ctx) < 0 || ctx.fd < 0) { // allocate cmd
54+
return -1;
55+
}
56+
res->size = ctx.size;
57+
res->vcHandle = ctx.vcHandle;
58+
res->dmaAddr = ctx.dmaAddr;
59+
res->fd = ctx.fd;
60+
return 0;
61+
}
62+
63+
#endif

cma.h

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#pragma once
2+
#ifdef USE_VCSM_CMA
3+
4+
#include <memory.h>
5+
#include <inttypes.h>
6+
// Result of a successful AllocateCMA() call.
struct CMAInfo
{
  size_t size;        // Size of the allocation in bytes, as sent in the request.
  uintptr_t dmaAddr;  // DMA address reported for the allocation.
  uint32_t fd;        // Descriptor for the allocation.
                      // NOTE(review): stored unsigned here although the request struct holds it as int32_t - confirm intended.
  uint32_t vcHandle;  // Handle reported by the driver.
};
12+
13+
void OpenVCSM(void);
14+
void CloseVCSM(void);
15+
int AllocateCMA(const char* reason, size_t req, CMAInfo* res);
16+
#endif

dma.cpp

+36-3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
#include "util.h"
1515
#include "mailbox.h"
1616

17+
#ifdef USE_VCSM_CMA
18+
#include "cma.h"
19+
#endif
20+
1721
#ifdef USE_DMA_TRANSFERS
1822

1923
#define BCM2835_PERI_BASE 0x3F000000
@@ -36,6 +40,9 @@ struct GpuMemory
3640
void *virtualAddr;
3741
uintptr_t busAddress;
3842
uint32_t sizeBytes;
43+
#ifdef USE_VCSM_CMA
44+
uint32_t vcHandle;
45+
#endif
3946
};
4047

4148
#define NUM_DMA_CBS 1024
@@ -127,7 +134,33 @@ void FreeDMAChannel(int channel)
127134
#define VIRT_TO_BUS(block, x) ((uintptr_t)(x) - (uintptr_t)((block).virtualAddr) + (block).busAddress)
128135

129136
uint64_t totalGpuMemoryUsed = 0;
137+
#ifdef USE_VCSM_CMA
130138

139+
// Releases a block obtained from AllocateUncachedGpuMemory() (CMA variant): removes the
// mapping, closes the descriptor stored in allocationHandle and takes the block out of
// the totalGpuMemoryUsed accounting.
void FreeUncachedGpuMemory(GpuMemory mem)
{
  // The block may carry a failed mapping (MAP_FAILED) if mmap() did not succeed;
  // only unmap addresses that were really mapped.
  if (mem.virtualAddr != NULL && mem.virtualAddr != MAP_FAILED)
  {
    munmap(mem.virtualAddr, mem.sizeBytes);
  }
  close(mem.allocationHandle);

  // Keep the usage counter in step with the allocation side, clamping at zero so an
  // unbalanced call cannot wrap the unsigned counter.
  if (totalGpuMemoryUsed >= mem.sizeBytes)
  {
    totalGpuMemoryUsed -= mem.sizeBytes;
  }
  else
  {
    totalGpuMemoryUsed = 0;
  }
}
143+
144+
// Allocates numBytes of memory through AllocateCMA() and maps it into this process (CMA variant).
//
// numBytes: requested size in bytes; the returned block's sizeBytes is the size reported
//           back by AllocateCMA(), which may be larger.
// reason:   label passed on to AllocateCMA() and printed in the log line.
//
// Returns the block with its virtual address, bus address, descriptor (in allocationHandle)
// and driver handle filled in. Raises FATAL_ERROR if the allocation or the mapping fails.
GpuMemory AllocateUncachedGpuMemory(uint32_t numBytes, const char *reason)
{
  CMAInfo ctx;
  if (AllocateCMA(reason, numBytes, &ctx) != 0)
  {
    FATAL_ERROR("alloc cma failed");
  }

  GpuMemory mem;
  mem.sizeBytes = ctx.size;
  mem.busAddress = ctx.dmaAddr;
  mem.allocationHandle = ctx.fd;
  mem.vcHandle = ctx.vcHandle;
  mem.virtualAddr = mmap(0, mem.sizeBytes, PROT_READ | PROT_WRITE, MAP_SHARED, ctx.fd, 0);
  if (mem.virtualAddr == MAP_FAILED)
  {
    // Nothing was mapped, so only the descriptor has to be given back.
    close(ctx.fd);
    FATAL_ERROR("Failed to mmap CMA memory!");
  }

  // Count the block only once it is actually usable.
  totalGpuMemoryUsed += mem.sizeBytes;
  // uint64_t is not 'unsigned long long' on every target (e.g. 64-bit Linux), so cast to match %llu.
  printf("Allocated %u bytes of GPU memory for %s (bus address=%p). Total GPU memory used: %llu bytes\n", mem.sizeBytes, reason, (void*)mem.busAddress, (unsigned long long)totalGpuMemoryUsed);
  return mem;
}
163+
#else
131164
// Allocates the given number of bytes in GPU side memory, and returns the virtual address and physical bus address of the allocated memory block.
132165
// The virtual address holds an uncached view to the allocated memory, so writes and reads to that memory address bypass the L1 and L2 caches. Use
133166
// this kind of memory to pass data blocks over to the DMA controller to process.
@@ -154,7 +187,7 @@ void FreeUncachedGpuMemory(GpuMemory mem)
154187
Mailbox(MEM_UNLOCK_MESSAGE, mem.allocationHandle);
155188
Mailbox(MEM_FREE_MESSAGE, mem.allocationHandle);
156189
}
157-
190+
#endif
158191
volatile DMAChannelRegisterFile *GetDMAChannel(int channelNumber)
159192
{
160193
if (channelNumber < 0 || channelNumber >= BCM2835_NUM_DMA_CHANNELS)
@@ -720,8 +753,8 @@ void SPIDMATransfer(SPITask *task)
720753
while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE))
721754
{
722755
CheckSPIDMAChannelsNotStolen();
723-
if (tick() - dmaTaskStart > 5000000)
724-
FATAL_ERROR("DMA TX channel has stalled!");
756+
if (tick() - dmaTaskStart > 5000000)
757+
FATAL_ERROR("DMA TX channel has stalled!");
725758
}
726759
while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE))
727760
{

spi.cpp

+20-2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ void ChipSelectHigh();
3333
#define TOGGLE_CHIP_SELECT_LINE() ((void)0)
3434
#endif
3535

36+
#ifdef USE_VCSM_CMA
37+
#include "cma.h"
38+
#endif
39+
3640
static uint32_t writeCounter = 0;
3741

3842
#define WRITE_FIFO(word) do { \
@@ -49,7 +53,11 @@ volatile SPIRegisterFile *spi = 0;
4953

5054
// Points to the system timer register. N.B. spec sheet says this is split into low and high parts, at a 32-bit aligned (but not 64-bit aligned) address. Profiling shows
5155
// that Pi 3 Model B does allow reading this as a u64 load, and even when unaligned, it is around 30% faster to do so compared to loading in parts "lo | (hi << 32)".
56+
#ifdef TIMER_32BIT
57+
volatile systemTimer *systemTimerRegister = 0;
58+
#else
5259
volatile uint64_t *systemTimerRegister = 0;
60+
#endif
5361

5462
void DumpSPICS(uint32_t reg)
5563
{
@@ -510,13 +518,20 @@ int InitSPI()
510518
// Memory map GPIO and SPI peripherals for direct access
511519
mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
512520
if (mem_fd < 0) FATAL_ERROR("can't open /dev/mem (run as sudo)");
521+
#ifdef USE_VCSM_CMA
522+
OpenVCSM();
523+
#endif
513524
printf("bcm_host_get_peripheral_address: %p, bcm_host_get_peripheral_size: %u, bcm_host_get_sdram_address: %p\n", bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size(), bcm_host_get_sdram_address());
514525
bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
515526
if (bcm2835 == MAP_FAILED) FATAL_ERROR("mapping /dev/mem failed");
516527
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE);
517528
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835 + BCM2835_GPIO_BASE);
518-
systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
519-
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
529+
#ifdef TIMER_32BIT
530+
systemTimerRegister = (volatile TIMER_TYPE*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE);
531+
#else
532+
systemTimerRegister = (volatile TIMER_TYPE*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
533+
#endif
534+
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
520535
#endif
521536

522537
uint32_t currentBcmCoreSpeed = MailboxRet2(0x00030002/*Get Clock Rate*/, 0x4/*CORE*/);
@@ -658,6 +673,9 @@ void DeinitSPI()
658673
close(mem_fd);
659674
mem_fd = -1;
660675
}
676+
#ifdef USE_VCSM_CMA
677+
CloseVCSM();
678+
#endif
661679

662680
#ifndef KERNEL_MODULE_CLIENT
663681

tick.h

+13
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,21 @@
55
#include <unistd.h>
66

77
// Initialized in spi.cpp along with the rest of the BCM2835 peripheral:
8+
#ifdef TIMER_32BIT
9+
// Register block of the system timer as accessed through systemTimerRegister when
// TIMER_32BIT is defined: every field is read as an individual 32-bit word.
struct __attribute__((packed, aligned(4))) systemTimer {
  volatile uint32_t cs;   // First word of the block (control/status).
  volatile uint32_t clo;  // Low 32 bits of the running counter.
  volatile uint32_t chi;  // High 32 bits of the running counter.
  volatile uint32_t c[4]; // Remaining four words of the block.
};
#define TIMER_TYPE systemTimer
extern volatile systemTimer* systemTimerRegister;

// Assembles a consistent 64-bit counter value out of two 32-bit register reads.
// The counter keeps running between the reads, so the low word can wrap (and the high word
// increment) in between; reading high, low, high again and retrying when the high word
// changed guarantees that both halves belong to the same instant.
static inline uint64_t ReadSystemTimer64(volatile systemTimer *timer)
{
  uint32_t hi;
  uint32_t lo;
  do
  {
    hi = timer->chi;
    lo = timer->clo;
  } while (timer->chi != hi);
  return ((uint64_t)hi << 32) | (uint64_t)lo;
}

// Current value of the system timer counter as a 64-bit number.
#define tick() ReadSystemTimer64(systemTimerRegister)
18+
#else
19+
#define TIMER_TYPE uint64_t
820
extern volatile uint64_t *systemTimerRegister;
921
#define tick() (*systemTimerRegister)
22+
#endif
1023

1124
#endif
1225

0 commit comments

Comments
 (0)