Skip to content

Add DWC2 cache maintenance routines for STM32 #2963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 18 commits into
base: master
Choose a base branch
from

Conversation

HiFiPhile
Copy link
Collaborator

@HiFiPhile HiFiPhile commented Jan 25, 2025

Describe the PR

  • Support DMA + DCache ON
  • Refactored buffer alignment macro to take into account cache line size

Now #define CFG_TUD_DWC2_DMA_ENABLE 1 is enough.

It's preferable to declare a non-cached region with the MPU instead of relying on cache invalidate+clean; benchmarks on STM32H7S3 and i.MX RT1170 show that frequent cache invalidate+clean really hurts performance.

** Need rebase after #2960

Benchmark code
extern uint32_t SystemCoreClock;
/*
 * Route ITM trace output to the SWO pin.
 *
 * The register names below are taken from the annotations that accompanied
 * the raw addresses. The sequence is: unlock the trace funnel and the SWO
 * block, program the SWO output divisor, select the pin protocol, and enable
 * the ITM input of the trace funnel.
 */
void SWD_Init(void)
{
  volatile uint32_t *const swtf_lar  = (volatile uint32_t *)0x5C004FB0; /* SWTF_LAR  */
  volatile uint32_t *const swo_lar   = (volatile uint32_t *)0x5C003FB0; /* SWO_LAR   */
  volatile uint32_t *const swo_codr  = (volatile uint32_t *)0x5C003010; /* SWO_CODR  */
  volatile uint32_t *const swo_sppr  = (volatile uint32_t *)0x5C0030F0; /* SWO_SPPR  */
  volatile uint32_t *const swtf_ctrl = (volatile uint32_t *)0x5C004000; /* SWFT_CTRL */

  /* Write the unlock key to both lock-access registers */
  *swtf_lar = 0xC5ACCE55;
  *swo_lar  = 0xC5ACCE55;

  /* The divisor currently programmed is stashed in blink_interval_ms, which is
     declared outside this function.
     NOTE(review): this looks like a debugging leftover - confirm it is wanted. */
  blink_interval_ms = *swo_codr;

  /* SWO output divisor = (CPU frequency / SWO speed) - 1, with a 12 MHz SWO speed */
  *swo_codr = ((SystemCoreClock / 12000000) - 1);

  /* Selected pin protocol */
  *swo_sppr = 0x00000002;

  /* Enable the ITM input of the SWO trace funnel */
  *swtf_ctrl |= 0x00000001;
}

static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit);
/**
  * @brief Program the MPU regions used by the benchmark.
  *
  * Regions are written in ascending order starting at MPU_REGION_NUMBER0:
  *   1. a 4 GB no-access background region,
  *   2. an optional non-cacheable region whose bounds come from toolchain
  *      specific linker symbols (skipped when its size is 0),
  *   3. a cacheable, executable region covering the execution area.
  * Every remaining region up to __MPU_REGIONCOUNT is disabled, then the MPU is
  * re-enabled with MPU_PRIVILEGED_DEFAULT.
  *
  * A single MPU_InitStruct is reused for all regions, so any field that is not
  * reassigned before a HAL_MPU_ConfigRegion() call keeps the value of the
  * previously configured region. Statement order therefore matters.
  * @retval None
  */
static void MPU_Config(void)
{
  MPU_Region_InitTypeDef MPU_InitStruct = {0};
  uint32_t index = MPU_REGION_NUMBER0;
  uint32_t address;
  uint32_t size;

  /* Disable the MPU */
  HAL_MPU_Disable();

  /* Initialize the background region */
  MPU_InitStruct.Enable = MPU_REGION_ENABLE;
  MPU_InitStruct.Number = index;
  MPU_InitStruct.BaseAddress = 0x0;
  MPU_InitStruct.Size = MPU_REGION_SIZE_4GB;
  /* 0x87 sets sub-region-disable bits 0, 1, 2 and 7.
     NOTE(review): presumably those slices of the 4 GB range are meant to keep
     the default memory-map attributes - confirm against the device memory map. */
  MPU_InitStruct.SubRegionDisable = 0x87;
  MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL0;
  MPU_InitStruct.AccessPermission = MPU_REGION_NO_ACCESS;
  MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
  MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
  MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
  MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
  HAL_MPU_ConfigRegion(&MPU_InitStruct);
  index++;

  /* Initialize the non cacheable region */
#if defined ( __ICCARM__ )
  /* get the region attribute from the icf file */
  extern uint32_t NONCACHEABLEBUFFER_start;
  extern uint32_t NONCACHEABLEBUFFER_size;

  /* Only the symbols' addresses are used: the address of the "size" symbol is the size value */
  address = (uint32_t)&NONCACHEABLEBUFFER_start;
  size = (uint32_t)&NONCACHEABLEBUFFER_size;

#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Base;
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Length;
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;

  /* Total size is the sum of the Length and ZI$$Length symbol values */
  address = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Base;
  size  = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Length + (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;
#elif defined ( __GNUC__ )
  extern int __NONCACHEABLEBUFFER_BEGIN;
  extern int __NONCACHEABLEBUFFER_END;

  /* Size is the distance between the begin and end symbols */
  address = (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
  size  = (uint32_t)&__NONCACHEABLEBUFFER_END - (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
#else
#error "Compiler toolchain is unsupported"
#endif

  /* An empty non-cacheable section consumes no MPU region: index is not advanced */
  if (size != 0)
  {
    /* Configure the MPU attributes as Normal Non Cacheable */
    MPU_InitStruct.Enable = MPU_REGION_ENABLE;
    MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
    MPU_InitStruct.IsBufferable = MPU_ACCESS_NOT_BUFFERABLE;
    MPU_InitStruct.IsCacheable = MPU_ACCESS_NOT_CACHEABLE;
    MPU_InitStruct.IsShareable = MPU_ACCESS_NOT_SHAREABLE;
    MPU_InitStruct.Number = index;
    MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
    MPU_InitStruct.SubRegionDisable = 0x00;
    MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
    /* Fills in BaseAddress and Size from the linker-provided range */
    MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
    HAL_MPU_ConfigRegion(&MPU_InitStruct);
    index++;
  }

  /* Initialize the region corresponding to the execution area
     (external or internal flash or external or internal RAM
     depending on scatter file definition) */
#if defined ( __ICCARM__ )
  extern uint32_t __ICFEDIT_region_ROM_start__;
  extern uint32_t __ICFEDIT_region_ROM_end__;
  address = (uint32_t)&__ICFEDIT_region_ROM_start__;
  /* +1 because the end symbol is treated as the last byte of the range */
  size = (uint32_t)&__ICFEDIT_region_ROM_end__ - (uint32_t)&__ICFEDIT_region_ROM_start__ + 1;
#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
  extern uint32_t Image$$ER_ROM$$Base;
  extern uint32_t Image$$ER_ROM$$Limit;
  address = (uint32_t)&Image$$ER_ROM$$Base;
  size    = (uint32_t)&Image$$ER_ROM$$Limit-(uint32_t)&Image$$ER_ROM$$Base;
#elif defined ( __GNUC__ )
  extern uint32_t __FLASH_BEGIN;
  extern uint32_t __FLASH_SIZE;
  address = (uint32_t)&__FLASH_BEGIN;
  /* The address of __FLASH_SIZE is used as the size value */
  size  = (uint32_t)&__FLASH_SIZE;
#else
#error "Compiler toolchain is unsupported"
#endif

  /* Execution area: full access, instruction fetch allowed, cacheable and bufferable.
     Unlike the non-cacheable region, this one is configured unconditionally. */
  MPU_InitStruct.Enable = MPU_REGION_ENABLE;
  MPU_InitStruct.Number = index;
  MPU_InitStruct.SubRegionDisable = 0u;
  MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
  MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
  MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_ENABLE;
  MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
  MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
  MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
  MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
  HAL_MPU_ConfigRegion(&MPU_InitStruct);
  index++;

  /* Reset unused MPU regions */
  for(; index < __MPU_REGIONCOUNT ; index++)
  {
    /* All unused regions disabled */
    /* Only Enable and Number change; the other fields still hold the execution-area settings */
    MPU_InitStruct.Enable = MPU_REGION_DISABLE;
    MPU_InitStruct.Number = index;
    HAL_MPU_ConfigRegion(&MPU_InitStruct);
  }

  /* Enable the MPU */
  HAL_MPU_Enable(MPU_PRIVILEGED_DEFAULT);
}

/**
  * @brief Compute the MPU base address and size encoding that cover a memory range.
  *
  * The MPU size field encodes a region of 2^(field + 1) bytes, and the region
  * base must be aligned to that same size. This routine picks the smallest
  * encodable, naturally aligned power-of-two block that contains every byte of
  * [Address, Address + Size - 1] and stores it in pInit->BaseAddress and
  * pInit->Size. All other fields of pInit are left untouched.
  *
  * If Address is not aligned to the rounded-up size, the resulting region is
  * larger than Size and also covers neighbouring memory. The caller's region
  * attributes then apply to that neighbouring memory as well.
  *
  * @param Address first byte of the range to cover
  * @param Size    length of the range in bytes; values below 32 (including 0)
  *                yield the minimum 32-byte region. Address + Size is assumed
  *                not to wrap past the end of the 32-bit address space.
  * @param pInit   pointer to an MPU initialization structure to update
  * @retval None
  */
static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit)
{
  /* Smallest valid encoding is 4 (32 bytes); the largest is 31 (4 GB) */
  uint32_t SizeField = 4u;

  /* Last byte of the requested range; an empty range is treated as one byte */
  const uint32_t Last = (Size != 0u) ? (Address + (Size - 1u)) : Address;

  /* Grow the block until the first and last byte fall into the same naturally
     aligned 2^(SizeField + 1) block, i.e. they agree in all address bits above it.
     Unsigned operands keep every shift well defined (shift count is at most 31). */
  while ((SizeField < 31u) && (((Address ^ Last) >> (SizeField + 1u)) != 0u))
  {
    SizeField++;
  }

  pInit->Size = (uint8_t)SizeField;

  if (SizeField < 31u)
  {
    /* Align the base down to the full region size, as the MPU requires */
    pInit->BaseAddress = Address & ~((1UL << (SizeField + 1u)) - 1u);
  }
  else
  {
    /* A 4 GB region always starts at address 0 */
    pInit->BaseAddress = 0u;
  }
}

/* Number of bytes moved by each benchmark copy; also the size of every buffer below */
#define TEST_SIZE 1024

/* All buffers are 32-byte aligned. They differ only in the linker section they
   are placed in: two in "dtcm_data", one in "noncacheable", one in the default
   data section. */
uint8_t buffer1[TEST_SIZE] __attribute__((section("dtcm_data"), aligned(32)));

uint8_t buffer2[TEST_SIZE] __attribute__((section("dtcm_data"), aligned(32)));

uint8_t buffer_ncache[TEST_SIZE] __attribute__((section("noncacheable"), aligned(32)));

uint8_t buffer_cached[TEST_SIZE] __attribute__((aligned(32)));

/*
 * Time one memcpy() of `size` bytes from `src` to `dst` with the DWT cycle counter.
 *
 * The measured interval spans the optional D-cache invalidate of the source
 * (when `invalidate` is true), the copy itself, and the optional D-cache clean
 * of the destination (when `flush` is true).
 *
 * Returns the cycle counter value read after the counter has been stopped.
 */
unsigned int test_loop(void* dst, const void* src, int size, bool flush, bool invalidate)
{
    volatile unsigned int *const cyc_count = (uint32_t *)0xE0001004; /* DWT_CYCCNT  */
    volatile unsigned int *const dwt_ctrl  = (uint32_t *)0xE0001000; /* DWT_CONTROL */
    volatile unsigned int *const demcr     = (uint32_t *)0xE000EDFC; /* SCB_DEMCR   */

    /* Set bit 24 of DEMCR (TRCENA in the Arm debug architecture), zero the
       counter, then start it through bit 0 of the DWT control register */
    *demcr |= 0x01000000;
    *cyc_count = 0;
    *dwt_ctrl = *dwt_ctrl | 1;

    if (invalidate)
    {
        SCB_InvalidateDCache_by_Addr((uint32_t*)src, size);
    }

    memcpy(dst, src, size);

    if (flush)
    {
        SCB_CleanDCache_by_Addr((uint32_t*)dst, size);
    }

    /* Stop the counter before sampling it */
    *dwt_ctrl = *dwt_ctrl & ~1;
    const unsigned int elapsed = *cyc_count;
    return elapsed;
}

/*
 * Run one benchmark scenario: print its label, time eight TEST_SIZE-byte
 * copies with test_loop(), then print the throughput derived from the cycle
 * count of the last iteration.
 *
 * label      - scenario name printed before the results
 * dst, src   - destination and source buffers handed to test_loop()
 * flush      - clean the D-cache over dst after each copy
 * invalidate - invalidate the D-cache over src before each copy
 */
static void run_memcpy_case(const char* label, void* dst, const void* src, bool flush, bool invalidate)
{
  unsigned int cycle = 0;
  unsigned long throughput = 0;

  printf("%s\r\n", label);
  for (int i = 0; i < 8; i++)
  {
    cycle = test_loop(dst, src, TEST_SIZE, flush, invalidate);
    /* cycle is unsigned, so it is printed with %u */
    printf("Loop:%d   Cycle:%u\r\n", i, cycle);
  }

  /* bytes * (cycles per second) / cycles = bytes per second. The product is
     formed in 64 bits before dividing, so it neither overflows nor loses
     precision. A zero cycle count reports 0 instead of dividing by zero. */
  if (cycle != 0)
  {
    throughput = (unsigned long)(((uint64_t)TEST_SIZE * SystemCoreClock) / cycle / 1048576u);
  }
  printf("Throughput:%lu MB/s\r\n", throughput);
}

/*
 * Benchmark entry point: configure the MPU, the board and SWO trace output,
 * fill the source pattern, then time memcpy() between DTCM, non-cacheable and
 * cacheable buffers with and without cache maintenance. Never returns.
 */
int main(void) {
  MPU_Config();
  board_init();
  SWD_Init();

  printf("\r\nmemcpy benchmark \r\n");

  /* Known pattern in the first DTCM buffer (index truncated to 8 bits) */
  for (int i = 0; i < TEST_SIZE; i++)
  {
    buffer1[i] = (uint8_t)i;
  }

  /* Copies out of DTCM */
  run_memcpy_case("DTCM - DTCM", buffer2, buffer1, false, false);
  run_memcpy_case("DTCM - NonCache", buffer_ncache, buffer1, false, false);
  run_memcpy_case("DTCM - Cache", buffer_cached, buffer1, false, false);
  run_memcpy_case("DTCM - Cache+Flush", buffer_cached, buffer1, true, false);

  /* Copies into DTCM */
  run_memcpy_case("NonCache - DTCM", buffer1, buffer_ncache, false, false);
  run_memcpy_case("Cache - DTCM", buffer1, buffer_cached, false, false);
  run_memcpy_case("Cache+Invalidate - DTCM", buffer1, buffer_cached, false, true);

  while (1)
  {
  }
}

Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
@HiFiPhile
Copy link
Collaborator Author

Looks like my HIL instance has a license issue; I think we can add the env locally.

@pstadelmann
Copy link

pstadelmann commented Jan 27, 2025

Works great for me, both for CDC and UVC. Thanks !

In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

@HiFiPhile
Copy link
Collaborator Author

Works great for me, both for CDC and UVC. Thanks !

In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

Thanks for your test.
It's a little strange that you need to define __CORTEX_M; normally stm32h7xx.h includes the device header, e.g. stm32h747xx.h, which includes core_cm7.h, and __CORTEX_M is defined inside. Did I miss something?

@hathach
Copy link
Owner

hathach commented Jan 28, 2025

Thanks @HiFiPhile for the great PR as usual. Though I am off for TET (Lunar New Year) and won't be able to review this for 2 weeks. Happy New Year 🎉

@pstadelmann
Copy link

Works great for me, both for CDC and UVC. Thanks !
In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

Thanks for your test. It's a little strange that you need to define __CORTEX_M; normally stm32h7xx.h includes the device header, e.g. stm32h747xx.h, which includes core_cm7.h, and __CORTEX_M is defined inside. Did I miss something?

I don't think so. I'm using a custom RTOS which relies on its own set of headers, that's why.

@HiFiPhile
Copy link
Collaborator Author

Thanks @HiFiPhile for the great PR as usual. Though I am off for TET (Lunar New Year) and won't be able to review this for 2 weeks. Happy New Year 🎉

Happy new year also 🎊

Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
{
__NONCACHEABLEBUFFER_BEGIN = .;/* create symbol for start of section */
KEEP(*(noncacheable))
__NONCACHEABLEBUFFER_END = .; /* create symbol for start of section */

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment is wrong. It should be end of section.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, it's copied from STM32CubeH7RS.

RW_NONCACHEABLE :
{
__NONCACHEABLEBUFFER_BEGIN = .;/* create symbol for start of section */
KEEP(*(noncacheable))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ST's latest default linker files for NUCLEO-H7S3L8 seem to use

Suggested change
KEEP(*(noncacheable))
KEEP(*(noncacheable_buffer))

Across the board.

By default linker files, I mean these ones (also just for reference I think STM32H7S3L8HX_ROMxspi2.ld is the preferred/recommended one for bootflash/OctoSPI setup on NUCLEO-H7S3L8):
image

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://github.com/search?q=repo%3ASTMicroelectronics%2FSTM32CubeH7RS+path%3A*.ld+noncacheable&type=code

Looks like STM32CubeH7RS is using noncacheable_buffer instead of noncacheable across the board.

Not a big deal, although keeping alignment with the section name they use probably will help with compatibility.

#elif CFG_TUSB_MCU == OPT_MCU_STM32H7RS
static mem_region_t uncached_regions[] = {
// DTCM (although USB DMA can't transfer to/from DTCM)
{.start = 0x20000000, .end = 0x2002FFFF},
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since DMA can't access DTCM on H7 and H7RS, might it be worth also defining the idiomatic ST noncacheable_buffer region in here?

It could be grabbed from symbols something like:

/* Boundary symbols of the non-cacheable section; the linker script under review
   assigns them at the start and end of that section. */
extern int __NONCACHEABLEBUFFER_BEGIN;
extern int __NONCACHEABLEBUFFER_END;

/* Only the symbols' addresses are meaningful: they give the region's start and end. */
uint32_t *start = (uint32_t*)&__NONCACHEABLEBUFFER_BEGIN;
uint32_t *end = (uint32_t*)&__NONCACHEABLEBUFFER_END;

Since noncacheable_buffer should be set as non-cacheable in the MPU configuration, I imagine the MPU probably doesn't actually do anything when you try to do cache maintenance on those sections, so it probably doesn't incur the same performance penalty that cache maintenance operations have when buffers are located in cacheable memory, but I guess it still costs a few cycles on every transfer for those who are using non-cacheable memory.

[Note: this comment operates on the assumption that my other comment below regarding is_cache_mem is correct and there is a typo in the region start+end check.]

Co-authored-by: Joel Murphy <[email protected]>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants