From 3df27924d00ed23ef5709786af00bfd589197fa8 Mon Sep 17 00:00:00 2001 From: "peter.marcisovsky" Date: Fri, 17 Jan 2025 21:43:16 +0100 Subject: [PATCH] feat(lvgl_port_simd): Support for esp32s2, assembly rendering - renamed assembly src files to xtensa_pie, xtensa_base - xtensa_base src files use zero-overhead loops only for esp32 - added zero length matrix check into all src files --- components/esp_lvgl_port/CMakeLists.txt | 8 +- .../simd/lv_color_blend_to_argb8888_esp32.S | 81 ---------- .../lv_color_blend_to_argb8888_xtensa_base.S | 147 ++++++++++++++++++ ...> lv_color_blend_to_argb8888_xtensa_pie.S} | 19 ++- ...=> lv_color_blend_to_rgb565_xtensa_base.S} | 108 +++++++++---- ... => lv_color_blend_to_rgb565_xtensa_pie.S} | 14 +- ...=> lv_color_blend_to_rgb888_xtensa_base.S} | 30 +++- ... => lv_color_blend_to_rgb888_xtensa_pie.S} | 11 ++ .../src/lvgl9/simd/lv_macro_memset.S | 50 ++++++ ...b565_blend_normal_to_rgb565_xtensa_base.S} | 68 +++++++- ...gb565_blend_normal_to_rgb565_xtensa_pie.S} | 9 ++ ...b888_blend_normal_to_rgb888_xtensa_base.S} | 65 +++++++- ...gb888_blend_normal_to_rgb888_xtensa_pie.S} | 11 +- .../test_apps/simd/main/CMakeLists.txt | 8 +- .../test_apps/simd/main/lv_fill_common.h | 2 +- .../simd/main/test_lv_fill_benchmark.c | 19 +-- .../simd/main/test_lv_fill_functionality.c | 86 +++++----- .../simd/main/test_lv_image_benchmark.c | 2 +- .../simd/main/test_lv_image_functionality.c | 11 +- 19 files changed, 549 insertions(+), 200 deletions(-) delete mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_base.S rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_argb8888_esp32s3.S => lv_color_blend_to_argb8888_xtensa_pie.S} (95%) rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_rgb565_esp32.S => lv_color_blend_to_rgb565_xtensa_base.S} (60%) rename 
components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_rgb565_esp32s3.S => lv_color_blend_to_rgb565_xtensa_pie.S} (97%) rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_rgb888_esp32.S => lv_color_blend_to_rgb888_xtensa_base.S} (77%) rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_rgb888_esp32s3.S => lv_color_blend_to_rgb888_xtensa_pie.S} (97%) create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memset.S rename components/esp_lvgl_port/src/lvgl9/simd/{lv_rgb565_blend_normal_to_rgb565_esp32.S => lv_rgb565_blend_normal_to_rgb565_xtensa_base.S} (83%) rename components/esp_lvgl_port/src/lvgl9/simd/{lv_rgb565_blend_normal_to_rgb565_esp32s3.S => lv_rgb565_blend_normal_to_rgb565_xtensa_pie.S} (97%) rename components/esp_lvgl_port/src/lvgl9/simd/{lv_rgb888_blend_normal_to_rgb888_esp32.S => lv_rgb888_blend_normal_to_rgb888_xtensa_base.S} (83%) rename components/esp_lvgl_port/src/lvgl9/simd/{lv_rgb888_blend_normal_to_rgb888_esp32s3.S => lv_rgb888_blend_normal_to_rgb888_xtensa_pie.S} (95%) diff --git a/components/esp_lvgl_port/CMakeLists.txt b/components/esp_lvgl_port/CMakeLists.txt index 94aac0418..c74fee001 100644 --- a/components/esp_lvgl_port/CMakeLists.txt +++ b/components/esp_lvgl_port/CMakeLists.txt @@ -76,14 +76,14 @@ if("usb_host_hid" IN_LIST build_components) list(APPEND ADD_LIBS idf::usb_host_hid) endif() -# Include SIMD assembly source code for rendering, only for (9.1.0 <= LVG_version < 9.2.0) and only for esp32 and esp32s3 +# Include SIMD assembly source code for rendering, only for (9.1.0 <= LVG_version < 9.2.0) and only for Xtensa targets (esp32, esp32s2, esp32s3) if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0")) - if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3) + if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3 OR CONFIG_IDF_TARGET_ESP32S2) message(VERBOSE "Compiling SIMD") if(CONFIG_IDF_TARGET_ESP32S3) - file(GLOB_RECURSE ASM_SRCS 
${PORT_PATH}/simd/*_esp32s3.S) # Select only esp32s3 related files + file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_xtensa_pie.S) # Select Xtensa PIE, for esp32s3 target else() - file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32.S) # Select only esp32 related files + file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_xtensa_base.S) # Select Xtensa Base for esp32, esp32s2 targets endif() # Explicitly add all assembly macro files diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S deleted file mode 100644 index 7d0606757..000000000 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S +++ /dev/null @@ -1,81 +0,0 @@ -/* - * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD - * - * SPDX-License-Identifier: Apache-2.0 - */ - -// This is LVGL ARGB8888 simple fill for ESP32 processor - - .section .text - .align 4 - .global lv_color_blend_to_argb8888_esp - .type lv_color_blend_to_argb8888_esp,@function - -// The function implements the following C code: -// void lv_color_blend_to_argb8888(_lv_draw_sw_blend_fill_dsc_t * dsc); - -// Input params -// -// dsc - a2 - -// typedef struct { -// uint32_t opa; l32i 0 -// void * dst_buf; l32i 4 -// uint32_t dst_w; l32i 8 -// uint32_t dst_h; l32i 12 -// uint32_t dst_stride; l32i 16 -// const void * src_buf; l32i 20 -// uint32_t src_stride; l32i 24 -// const lv_opa_t * mask_buf; l32i 28 -// uint32_t mask_stride; l32i 32 -// } asm_dsc_t; - -lv_color_blend_to_argb8888_esp: - - entry a1, 32 - - l32i.n a3, a2, 4 // a3 - dest_buff - l32i.n a4, a2, 8 // a4 - dest_w in uint32_t - l32i.n a5, a2, 12 // a5 - dest_h in uint32_t - l32i.n a6, a2, 16 // a6 - dest_stride in bytes - l32i.n a7, a2, 20 // a7 - src_buff (color) - l32i.n a8, a7, 0 // a8 - color as value - slli a11, a4, 2 // a11 - dest_w_bytes = sizeof(uint32_t) * dest_w - - movi a7, 0xff000000 // oppactiy mask - or a10, a7, a8 // apply 
oppacity - - srli a9, a4, 2 // a9 - loop_len = dest_w / 4 - sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes - - .outer_loop: - - // Run main loop which sets 16 bytes in one loop run - loopnez a9, ._main_loop - s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3 - s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3 - s32i.n a10, a3, 8 // save 32 bits from a10 to dest_buff a3 - s32i.n a10, a3, 12 // save 32 bits from a10 to dest_buff a3 - addi.n a3, a3, 16 // increment dest_buff pointer by 16 bytes - ._main_loop: - - // Finish the remaining bytes out of the loop - // Check modulo 8 of the dest_w_bytes, if - then set 8 bytes - bbci a11, 3, _mod_8_check // branch if 2-nd bit of dest_w_bytes is clear - s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes - s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3, offset 0 bytes - addi.n a3, a3, 8 // increment dest_buff pointer by 8 bytes - _mod_8_check: - - // Check modulo 4 of the dest_w_bytes, if - then set 4 bytes - bbci a11, 2, _mod_4_check // branch if 2-nd bit of dest_w_bytes is clear - s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes - addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes - _mod_4_check: - - add a3, a3, a6 // dest_buff + dest_stride - addi.n a5, a5, -1 // decrease the outer loop - bnez a5, .outer_loop - - movi.n a2, 1 // return LV_RESULT_OK = 1 - retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_base.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_base.S new file mode 100644 index 000000000..d47aaee6b --- /dev/null +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_base.S @@ -0,0 +1,147 @@ +/* + * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include "lv_macro_memset.S" + +// This is LVGL ARGB8888 simple fill for ESP32, 
ESP32S2 processor + + .section .text + .align 4 + .global lv_color_blend_to_argb8888_esp + .type lv_color_blend_to_argb8888_esp,@function + +// The function implements the following C code: +// void lv_color_blend_to_argb8888(_lv_draw_sw_blend_fill_dsc_t * dsc); + +// Input params +// +// dsc - a2 + +// typedef struct { +// uint32_t opa; l32i 0 +// void * dst_buf; l32i 4 +// uint32_t dst_w; l32i 8 +// uint32_t dst_h; l32i 12 +// uint32_t dst_stride; l32i 16 +// const void * src_buf; l32i 20 +// uint32_t src_stride; l32i 24 +// const lv_opa_t * mask_buf; l32i 28 +// uint32_t mask_stride; l32i 32 +// } asm_dsc_t; + +lv_color_blend_to_argb8888_esp: + + entry a1, 32 + + l32i.n a3, a2, 4 // a3 - dest_buff + l32i.n a4, a2, 8 // a4 - dest_w in uint32_t + l32i.n a5, a2, 12 // a5 - dest_h in uint32_t + l32i.n a6, a2, 16 // a6 - dest_stride in bytes + l32i.n a7, a2, 20 // a7 - src_buff (color) + l32i.n a8, a7, 0 // a8 - color as value + slli a11, a4, 2 // a11 - dest_w_bytes = sizeof(uint32_t) * dest_w + + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero + movi a7, 0xff000000 // opacity mask + or a10, a7, a8 // apply opacity + sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes + + // Check dest_w length + bltui a4, 8, _matrix_width_check // Branch if dest_w (a4) is lower than 8 + srli a9, a4, 3 // a9 - loop_len = dest_w / 8 + +#if !XCHAL_HAVE_LOOPS + slli a14, a9, 5 // a14 = loop_len (a9) * 32 (main loop increments address pointers by 32) +#endif + + .outer_loop: + +#if XCHAL_HAVE_LOOPS + loopnez a9, ._main_loop // zero-overhead loop (not supported for esp32s2) +#else + // Init loop parameters + beqz a9, ._main_loop // Branch to the end, if a9 is 0 (no need to run the main loop) + add a15, a14, a3 // a15 = a14 + dest_buf address + .main_loop_done: +#endif + // Run main loop which sets 32 bytes (8 ARGB8888 pixels) in one loop run + s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, 
offset 0 bytes + s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3, offset 4 bytes + s32i.n a10, a3, 8 // save 32 bits from a10 to dest_buff a3, offset 8 bytes + s32i.n a10, a3, 12 // save 32 bits from a10 to dest_buff a3, offset 12 bytes + s32i.n a10, a3, 16 // save 32 bits from a10 to dest_buff a3, offset 16 bytes + s32i.n a10, a3, 20 // save 32 bits from a10 to dest_buff a3, offset 20 bytes + s32i.n a10, a3, 24 // save 32 bits from a10 to dest_buff a3, offset 24 bytes + s32i.n a10, a3, 28 // save 32 bits from a10 to dest_buff a3, offset 28 bytes + addi.n a3, a3, 32 // increment dest_buff a3 pointer by 32 bytes +#if !XCHAL_HAVE_LOOPS + blt a3, a15, .main_loop_done // Check end of the main loop, branch if dest_buf (a3) lower than a15 +#endif + ._main_loop: + + // Finish the remaining bytes out of the loop + + // Check modulo 16 of the dest_w_bytes (a11), if - then set 16 bytes (4 ARGB8888 pixels) + // src_reg a10, dest_buff a3, dest_w_bytes a11 + macro_memset_mod_16 a10, a3, a11, __LINE__ + + // Check modulo 8 of the dest_w_bytes (a11), if - then set 8 bytes (2 ARGB8888 pixels) + // src_reg a10, dest_buff a3, dest_w_bytes a11 + macro_memset_mod_8 a10, a3, a11, __LINE__ + + // Check modulo 4 of the dest_w_bytes (a11), if - then set 4 bytes (1 ARGB8888 pixel) + // src_reg a10, dest_buff a3, dest_w_bytes a11 + macro_memset_mod_4 a10, a3, a11, __LINE__ + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 // decrease the outer loop + bnez a5, .outer_loop + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return + +//********************************************************************************************************************** + + // Small matrix width, keep it simple for lengths less than 8 pixels + + _matrix_width_check: + +#if !XCHAL_HAVE_LOOPS + slli a14, a4, 2 // a14 = dest_w (a4) * 4 (main loop increments address pointers by 4) +#endif + + .outer_loop_short_matrix: + +#if XCHAL_HAVE_LOOPS + loopnez a4, ._main_loop_short_matrix
// zero-overhead loop (not supported for esp32s2) +#else + // Init loop parameters + add a15, a14, a3 // a15 = a14 + dest_buf address + ._main_loop_short_matrix_done: +#endif + // Run main loop which sets 4 bytes (one ARGB8888 pixel) in one loop run + s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3 + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes +#if !XCHAL_HAVE_LOOPS + blt a3, a15, ._main_loop_short_matrix_done // Check end of the main loop, branch if dest_buf (a3) lower than a15 +#endif + ._main_loop_short_matrix: + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 // decrease the outer loop + bnez a5, .outer_loop_short_matrix + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_pie.S similarity index 95% rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_pie.S index 10276f4f4..d9934dbfa 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_pie.S @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ @@ -42,11 +42,13 @@ lv_color_blend_to_argb8888_esp: l32i.n a8, a7, 0 // a8 - color as value slli a11, a4, 2 // a11 - dest_w_bytes = sizeof(uint32_t) * dest_w - movi a7, 0xff000000 // oppactiy mask - or a10, a7, 
a8 // apply oppacity + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero + movi a7, 0xff000000 // opacity mask + or a10, a7, a8 // apply opacity // Check for short lengths - // dest_w should be at least 8, othewise it's not worth using esp32s3 TIE + // dest_w should be at least 8, otherwise it's not worth using esp32s3 TIE bgei a4, 8, _esp32s3_implementation // Branch if dest_w is greater than or equal to 8 j .lv_color_blend_to_argb8888_esp32_body // Jump to esp32 implementation @@ -227,7 +229,7 @@ lv_color_blend_to_argb8888_esp: addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes _dest_buff_aligned_by_1byte: - // Shift q reg, allowing to set 16-byte unaligned adata + // Shift q reg, allowing to set 16-byte unaligned data wur.sar_byte a15 // apply unalignment to the SAR_BYTE ee.src.q q2, q0, q1 // shift concat. of q0 and q1 to q2 by SAR_BYTE amount @@ -323,3 +325,10 @@ lv_color_blend_to_argb8888_esp: movi.n a2, 1 // return LV_RESULT_OK = 1 retw.n // return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_base.S similarity index 60% rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_base.S index 07b5aa111..a22ba064f 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_base.S @@ -1,10 +1,13 @@ /* - * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * 
SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ -// This is LVGL RGB565 simple fill for ESP32 processor +#include +#include "lv_macro_memset.S" + +// This is LVGL RGB565 simple fill for ESP32, ESP32S2 processor .section .text .align 4 @@ -41,7 +44,10 @@ lv_color_blend_to_rgb565_esp: l32i.n a8, a7, 0 // a8 - color as value slli a11, a4, 1 // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w - // Convert color to rgb656 + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero + + // Convert color to rgb565 l8ui a15, a7, 2 // red movi.n a14, 0xf8 and a13, a15, a14 @@ -66,6 +72,9 @@ lv_color_blend_to_rgb565_esp: movi.n a8, 0x3 // a8 = 0x3, dest_buff align mask sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes + // Check dest_w length + bltui a4, 8, _matrix_width_check // Branch if dest_w (a4) is lower than 8 + // cache init // Prepare main loop length and dest_w_bytes srli a9, a4, 4 // a9 = loop_len = dest_w / 8, calculate main loop_len for original dest_w @@ -97,8 +106,16 @@ lv_color_blend_to_rgb565_esp: _dest_buff_unaligned: - // Run main loop which sets 16 bytes in one loop run - loopnez a9, ._main_loop +#if XCHAL_HAVE_LOOPS + loopnez a9, ._main_loop // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a9, ._main_loop // Branch to the end, if a9 is 0 (no need to run the main loop) + slli a15, a9, 5 // a15 = loop_len (a9) * 32 (main loop increments address pointers by 32) + add a15, a15, a3 // a15 += dest_buf address + ._main_loop_aligned_done: +#endif + // Run main loop which sets 32 bytes (16 RGB565 pixels) in one loop run s32i.n a10, a3, 0 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0 s32i.n a10, a3, 4 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4 s32i.n a10, a3, 8 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 8 @@ -108,37 +125,27 @@ 
lv_color_blend_to_rgb565_esp: s32i.n a10, a3, 24 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 24 s32i.n a10, a3, 28 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 28 addi.n a3, a3, 32 // increment dest_buff pointer by 32 +#if !XCHAL_HAVE_LOOPS + blt a3, a15, ._main_loop_aligned_done // Check end of the main loop, branch if dest_buf (a3) lower than a15 +#endif ._main_loop: // Finish the remaining bytes out of the loop - // Check modulo 8 of the dest_w_bytes, if - then set 16 bytes - bbci a11, 4, _mod_16_check // branch if 2-nd bit of dest_w_bytes is clear - s32i.n a10, a3, 0 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0 - s32i.n a10, a3, 4 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4 - s32i.n a10, a3, 8 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 8 - s32i.n a10, a3, 12 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 12 - addi.n a3, a3, 16 // increment dest_buff pointer by 16 - _mod_16_check: + // Check modulo 16 of the dest_w_bytes (a11), if - then set 16 bytes (8 RGB565 pixels) + // src_reg a10, dest_buff a3, dest_w_bytes a11 + macro_memset_mod_16 a10, a3, a11, __LINE__ - // Finish the remaining bytes out of the loop - // Check modulo 8 of the dest_w_bytes, if - then set 8 bytes - bbci a11, 3, _mod_8_check // branch if 2-nd bit of dest_w_bytes is clear - s32i.n a10, a3, 0 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0 - s32i.n a10, a3, 4 // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4 - addi.n a3, a3, 8 // increment dest_buff pointer by 8 bytes - _mod_8_check: + // Check modulo 8 of the dest_w_bytes (a11), if - then set 8 bytes (4 RGB565 pixels) + // src_reg a10, dest_buff a3, dest_w_bytes a11 + macro_memset_mod_8 a10, a3, a11, __LINE__ - // Check modulo 4 of the dest_w_bytes, if - then set 4 bytes - bbci a11, 2, _mod_4_check // branch if 2-nd bit of dest_w_bytes is clear - s32i.n a10, a3, 0 // save 32 bits from 32-bit color a10 to 
dest_buff a3, offset 0 - addi.n a3, a3, 4 // increment dest_buff pointer by 4 - _mod_4_check: + // Check modulo 4 of the dest_w_bytes (a11), if - then set 4 bytes (2 RGB565 pixels) + // src_reg a10, dest_buff a3, dest_w_bytes a11 + macro_memset_mod_4 a10, a3, a11, __LINE__ - // Check modulo 2 of the dest_w_bytes, if - then set 2 bytes - bbci a11, 1, _mod_2_check // branch if 1-st bit of dest_w_bytes is clear - s16i a12, a3, 0 // save 16 bits from 16-bit color a12 to dest_buff a3, offset 0 - addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes - _mod_2_check: + // Check modulo 2 of the dest_w_bytes (a11), if - then set 2 bytes (1 RGB565 pixel) + // src_reg a10, dest_buff a3, dest_w_bytes a11 + macro_memset_mod_2 a10, a3, a11, __LINE__ add a3, a3, a6 // dest_buff + dest_stride addi.n a5, a5, -1 // decrease the outer loop @@ -147,3 +154,44 @@ lv_color_blend_to_rgb565_esp: movi.n a2, 1 // return LV_RESULT_OK = 1 retw.n // return + +//********************************************************************************************************************** + + // Small matrix width, keep it simple for lengths less than 8 pixels + + _matrix_width_check: + +#if !XCHAL_HAVE_LOOPS + slli a14, a4, 1 // a14 = dest_w (a4) * 2 (main loop increments address pointers by 2) +#endif + + .outer_loop_short_matrix: + +#if XCHAL_HAVE_LOOPS + loopnez a4, ._main_loop_short_matrix // zero-overhead loop (not supported for esp32s2) +#else + // Init loop parameters + add a15, a14, a3 // a15 = a14 + dest_buf address + ._main_loop_short_matrix_done: +#endif + // Run main loop which sets 2 bytes (one RGB565 pixel) in one loop run + s16i a12, a3, 0 // save 16 bits from a12 to dest_buff a3 + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes +#if !XCHAL_HAVE_LOOPS + blt a3, a15, ._main_loop_short_matrix_done // Check end of the main loop, branch if dest_buf (a3) lower than a15 +#endif + ._main_loop_short_matrix: + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 //
decrease the outer loop + bnez a5, .outer_loop_short_matrix + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_pie.S similarity index 97% rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_pie.S index 3a9fe43cb..5444f1b2b 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_pie.S @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ @@ -41,7 +41,10 @@ lv_color_blend_to_rgb565_esp: l32i.n a8, a7, 0 // a8 - color as value slli a11, a4, 1 // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w - // Convert color to rgb656 + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero + + // Convert color to rgb565 l8ui a15, a7, 2 // red movi.n a14, 0xf8 and a13, a15, a14 @@ -402,3 +405,10 @@ lv_color_blend_to_rgb565_esp: movi.n a2, 1 // return LV_RESULT_OK = 1 retw.n // return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git 
a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_base.S similarity index 77% rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_base.S index 467b5348a..77fb80147 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_base.S @@ -1,10 +1,12 @@ /* - * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ -// This is LVGL RGB888 simple fill for ESP32 processor +#include + +// This is LVGL RGB888 simple fill for ESP32, ESP32S2 processor .section .text .align 4 @@ -40,6 +42,9 @@ lv_color_blend_to_rgb888_esp: l32i.n a7, a2, 20 // a7 - src_buff (color) l32i.n a8, a7, 0 // a8 - color as value + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero + // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4 slli a11, a4, 1 // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w add a11, a11, a4 // a11 - dest_w_bytes = a11 + a4 @@ -64,15 +69,27 @@ lv_color_blend_to_rgb888_esp: srli a9, a4, 2 // a9 = loop_len = dest_w / 4, calculate main loop_len for original dest_w movi.n a8, 0x3 // a8 = 0x3, remainder mask and a10, a4, a8 // a10 - remainder after division by 4 = a4 and 0x3 + movi.n a12, 12 // a12 = 12 (pointer increment in main loop for esp32s2) .outer_loop: +#if XCHAL_HAVE_LOOPS + loopnez a9, ._main_loop // zero-overhead loop (not supported for esp32s2) +#else + // Init loop parameters + beqz a9, ._main_loop // Branch to the end, if a9 is 0 (no need to run the main loop) + mul16u a11, a9, a12 // a11 = 12 (a12) * loop_len (a9) both operands must be lower than 16bit 
values (mul16u) + add a11, a11, a3 // a11 += dest_buf address + .main_loop_done: +#endif // Run main loop which sets 12 bytes (4 rgb888) in one loop run - loopnez a9, ._main_loop s32i.n a13, a3, 0 // save 32 bits from 32-bit color a13 to dest_buff a3, offset 0 s32i.n a14, a3, 4 // save 32 bits from 32-bit color a14 to dest_buff a3, offset 4 s32i.n a15, a3, 8 // save 32 bits from 32-bit color a15 to dest_buff a3, offset 8 addi.n a3, a3, 12 // increment dest_buff pointer by 12 +#if !XCHAL_HAVE_LOOPS + blt a3, a11, .main_loop_done // Check end of the main loop, branch if dest_buf (a3) lower than a11 +#endif ._main_loop: bnei a10, 0x3, _less_than_3 // branch if less than 3 values left @@ -103,3 +120,10 @@ lv_color_blend_to_rgb888_esp: movi.n a2, 1 // return LV_RESULT_OK = 1 retw.n // return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_pie.S similarity index 97% rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_pie.S index 955db4d00..e8bceb531 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_pie.S @@ -40,6 +40,9 @@ lv_color_blend_to_rgb888_esp: l32i.n a7, a2, 20 // a7 - src_buff (color) l32i.n a8, a7, 0 // a8 - color as value + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero + // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4 slli a11, a4, 1 // a11 - dest_w_bytes = 2 *
dest_w add a11, a11, a4 // a11 - dest_w_bytes = a11 + a4 @@ -344,3 +347,11 @@ unalignment_table: movi.n a2, 1 // return LV_RESULT_OK = 1 retw.n // return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return + diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memset.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memset.S new file mode 100644 index 000000000..342d8a566 --- /dev/null +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memset.S @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ + +// Memset macros for modulo checking +// After running the main loop, there is need to check remaining bytes to be set out of the main loop +// Macros work with both, aligned and unaligned (4-byte boundary) memories +// but performance is significantly lower when using unaligned memory, because of the unaligned memory access exception + +// Macro for checking modulo 16 + .macro macro_memset_mod_16 src_reg, dest_buf, condition, JUMP_TAG + // Check modulo 16 of the \condition, if - then set 16 bytes + bbci \condition, 4, ._mod_16_check_\JUMP_TAG // Branch if 4-th bit of \condition is clear + s32i.n \src_reg, \dest_buf, 0 // Save 32 bits from \src_reg to \dest_buff, offset 0 + s32i.n \src_reg, \dest_buf, 4 // Save 32 bits from \src_reg to \dest_buff, offset 4 + s32i.n \src_reg, \dest_buf, 8 // Save 32 bits from \src_reg to \dest_buff, offset 8 + s32i.n \src_reg, \dest_buf, 12 // Save 32 bits from \src_reg to \dest_buff, offset 12 + addi.n \dest_buf, \dest_buf, 16 // Increment \dest_buff pointer 16 + ._mod_16_check_\JUMP_TAG: +.endm // macro_memset_mod_16 + +// Macro for checking modulo 8 + .macro macro_memset_mod_8 src_reg, dest_buf, condition, JUMP_TAG + // 
Check modulo 8 of the \condition, if - then set 8 bytes + bbci \condition, 3, ._mod_8_check_\JUMP_TAG // Branch if 3-rd bit of \condition is clear + s32i.n \src_reg, \dest_buf, 0 // Save 32 bits from \src_reg to \dest_buff, offset 0 + s32i.n \src_reg, \dest_buf, 4 // Save 32 bits from \src_reg to \dest_buff, offset 4 + addi.n \dest_buf, \dest_buf, 8 // Increment \dest_buff pointer 8 + ._mod_8_check_\JUMP_TAG: +.endm // macro_memset_mod_8 + +// Macro for checking modulo 4 + .macro macro_memset_mod_4 src_reg, dest_buf, condition, JUMP_TAG + // Check modulo 4 of the \condition, if - then set 4 bytes + bbci \condition, 2, ._mod_4_check_\JUMP_TAG // Branch if 2-nd bit of \condition is clear + s32i.n \src_reg, \dest_buf, 0 // Save 32 bits from \src_reg to \dest_buff, offset 0 + addi.n \dest_buf, \dest_buf, 4 // Increment \dest_buff pointer 4 + ._mod_4_check_\JUMP_TAG: +.endm // macro_memset_mod_4 + +// Macro for checking modulo 2 + .macro macro_memset_mod_2 src_reg, dest_buf, condition, JUMP_TAG + // Check modulo 2 of the \condition, if - then set 2 bytes + bbci \condition, 1, ._mod_2_check_\JUMP_TAG // Branch if 1-st bit of \condition is clear + s16i \src_reg, \dest_buf, 0 // Save the low 16 bits of \src_reg to \dest_buff, offset 0 + addi.n \dest_buf, \dest_buf, 2 // Increment \dest_buff pointer 2 + ._mod_2_check_\JUMP_TAG: +.endm // macro_memset_mod_2 diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_base.S similarity index 83% rename from components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_base.S index eb08cd877..5150bda76 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_base.S @@ -1,12 +1,13 @@ /* - *
SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ +#include #include "lv_macro_memcpy.S" // Memcpy macros -// This is LVGL RGB565 image blend to RGB565 for ESP32 processor +// This is LVGL RGB565 image blend to RGB565 for ESP32, ESP32S2 processor .section .text .align 4 @@ -42,6 +43,8 @@ lv_rgb565_blend_normal_to_rgb565_esp: l32i.n a8, a2, 24 // a8 - src_stride in bytes slli a11, a4, 1 // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero // No need to convert any colors here, we are copying from rgb565 to rgb565 // Check dest_w length @@ -73,8 +76,16 @@ lv_rgb565_blend_normal_to_rgb565_esp: .outer_loop_align: +#if XCHAL_HAVE_LOOPS + loopnez a9, ._main_loop_aligned // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a9, ._main_loop_aligned // Branch to the end, if a9 is 0 (no need to run the main loop) + slli a10, a9, 4 // a10 = loop_len (a9) * 16 (main loop increments address pointers by 16) + add a10, a10, a3 // a10 += dest_buf address + ._main_loop_aligned_done: +#endif // Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run - loopnez a9, ._main_loop_aligned l32i.n a15, a7, 0 // Load 32 bits from src_buff a7 to a15, offset 0 l32i.n a14, a7, 4 // Load 32 bits from src_buff a7 to a14, offset 4 l32i.n a13, a7, 8 // Load 32 bits from src_buff a7 to a13, offset 8 @@ -85,6 +96,9 @@ lv_rgb565_blend_normal_to_rgb565_esp: s32i.n a12, a3, 12 // Save 32 bits from a15 to dest_buff a3, offset 12 addi.n a7, a7, 16 // Increment src_buff pointer a7 by 16 addi.n a3, a3, 16 // Increment dest_buff pointer a3 by 16 +#if !XCHAL_HAVE_LOOPS + blt a3, a10, ._main_loop_aligned_done // Check end of the main loop, branch if dest_buf (a3) lower than a10 +#endif ._main_loop_aligned: // 
Finish the remaining bytes out of the main loop @@ -142,14 +156,23 @@ lv_rgb565_blend_normal_to_rgb565_esp: sub a10, a11, a14 // Get the dest_w_bytes after the aligning loop srli a9, a10, 4 // Calculate main loop len (a9 = dest_w_bytes_local / 16) +#if XCHAL_HAVE_LOOPS + loopnez a14, ._dest_aligning_loop // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a14, ._dest_aligning_loop // Branch to the end, if a14 is 0 (no need to run the aligning loop) + add a14, a14, a3 // loop_len = loop_len + dest_buf (a3) + ._dest_aligning_loop_done: +#endif // Run dest_buff aligning loop byte by byte - loopnez a14, ._dest_aligning_loop l8ui a15, a7, 0 // Load 8 bits from src_buff a7 to a15, offset 0 addi.n a7, a7, 1 // Increment src_buff pointer a7 by 1 s8i a15, a3, 0 // Save 8 bits from a15 to dest_buff a3, offset 0 addi.n a3, a3, 1 // Increment dest_buff pointer a3 by 1 +#if !XCHAL_HAVE_LOOPS + blt a3, a14, ._dest_aligning_loop_done // Check end of the aligning loop, branch if dest_buf (a3) lower than a14 +#endif ._dest_aligning_loop: - // Destination is aligned, source is unaligned // For more information about this implementation, see chapter 3.3.2 Shifts and the Shift Amount Register (SAR) @@ -162,8 +185,16 @@ lv_rgb565_blend_normal_to_rgb565_esp: // First preload for the loopnez cycle l32i.n a15, a7, 0 // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 0 +#if XCHAL_HAVE_LOOPS + loopnez a9, ._main_loop_unalign // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a9, ._main_loop_unalign // Branch to the end, if a9 is 0 (no need to run the main loop) + slli a9, a9, 4 // loop_len (a9) *= 16 (main loop increments address pointers by 16) + add a9, a9, a3 // a9 += dest_buff address + ._main_loop_unalign_done: +#endif // Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run - loopnez a9, ._main_loop_unalign l32i.n a14, a7, 4 // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
l32i.n a13, a7, 8 // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8 src a15, a14, a15 // Concatenate a14 and a15 and shift by SAR_BYTE amount to a15 @@ -178,6 +209,9 @@ lv_rgb565_blend_normal_to_rgb565_esp: src a12, a15, a12 // Concatenate a15 and a12 and shift by SAR_BYTE amount to a12 s32i.n a12, a3, 12 // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 12 addi.n a3, a3, 16 // Increment dest_buff pointer a3 by 16 +#if !XCHAL_HAVE_LOOPS + blt a3, a9, ._main_loop_unalign_done // Check end of the main loop, branch if dest_buf (a3) lower than a9 +#endif ._main_loop_unalign: // Finish the remaining bytes out of the loop @@ -239,14 +273,25 @@ lv_rgb565_blend_normal_to_rgb565_esp: .outer_loop_short_matrix_length: +#if XCHAL_HAVE_LOOPS + loopnez a4, ._main_loop_short_matrix_length // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a4, ._main_loop_short_matrix_length // Branch to the end, if a4 is 0 (no need to run the main loop) + slli a10, a4, 1 // a10 = loop_len (a4) * 2 (main loop increments address pointers by 2) + add a10, a10, a3 // a10 += dest_buf address + ._main_loop_short_matrix_length_done: +#endif // Run main loop which copies 2 bytes (one RGB565 pixel) in one loop run - loopnez a4, ._main_loop_short_matrix_length l8ui a15, a7, 0 // Load 8 bits from src_buff a7 to a15, offset 0 l8ui a14, a7, 1 // Load 8 bits from src_buff a7 to a14, offset 1 s8i a15, a3, 0 // Save 8 bits from a15 to dest_buff a3, offset 0 s8i a14, a3, 1 // Save 8 bits from a14 to dest_buff a3, offset 1 - addi.n a7, a7, 2 // Increment src_buff pointer a7 by 1 + addi.n a7, a7, 2 // Increment src_buff pointer a7 by 2 addi.n a3, a3, 2 // Increment dest_buff pointer a3 by 2 +#if !XCHAL_HAVE_LOOPS + blt a3, a10, ._main_loop_short_matrix_length_done // Check end of the main loop, branch if dest_buf (a3) lower than a10 +#endif ._main_loop_short_matrix_length: // Finish remaining byte out of the main loop @@ -262,3 +307,10 @@
lv_rgb565_blend_normal_to_rgb565_esp: movi.n a2, 1 // Return LV_RESULT_OK = 1 retw.n // Return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_pie.S similarity index 97% rename from components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_pie.S index 66de392f6..29d74f24b 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_pie.S @@ -43,6 +43,8 @@ lv_rgb565_blend_normal_to_rgb565_esp: movi.n a10, 0xf // 0xf alignment mask (16-byte alignment) slli a11, a4, 1 // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero // No need to convert any colors here, we are copying from rgb565 to rgb565 // Check dest_w length @@ -370,3 +372,10 @@ lv_rgb565_blend_normal_to_rgb565_esp: movi.n a2, 1 // Return LV_RESULT_OK = 1 retw.n // Return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_base.S similarity index 83% rename from 
components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_base.S index f35175fe8..b66306685 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_base.S @@ -4,9 +4,10 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include #include "lv_macro_memcpy.S" // Memcpy macros -// This is LVGL RGB888 image blend to RGB888 for ESP32 processor +// This is LVGL RGB888 image blend to RGB888 for ESP32, ESP32S2 processor .section .text .align 4 @@ -43,6 +44,8 @@ lv_rgb888_blend_normal_to_rgb888_esp: slli a11, a4, 1 // a11 = (a4 << 1) + a4 add a11, a11, a4 // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero // No need to convert any colors here, we are copying from rgb888 to rgb888 // Check dest_w length @@ -74,8 +77,16 @@ lv_rgb888_blend_normal_to_rgb888_esp: .outer_loop_align: +#if XCHAL_HAVE_LOOPS + loopnez a9, ._main_loop_aligned // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a9, ._main_loop_aligned // Branch to the end, if a9 is 0 (no need to run the main loop) + slli a10, a9, 4 // a10 = loop_len (a9) * 16 (main loop increments address pointers by 16) + add a10, a10, a3 // a10 += dest_buf address + ._main_loop_aligned_done: +#endif // Run main loop which copies 16 bytes (5 and 1/3 of RGB888 pixels) in one loop run - loopnez a9, ._main_loop_aligned l32i.n a15, a7, 0 // Load 32 bits from src_buff a7 to a15, offset 0 l32i.n a14, a7, 4 // Load 32 bits from src_buff a7 to a14, offset 4 l32i.n a13, a7, 8 // Load 32 bits from src_buff a7 to a13, offset 8 @@ -86,6 +97,9 @@ lv_rgb888_blend_normal_to_rgb888_esp: s32i.n a12, a3, 12 // Save 32 bits from a15 to dest_buff a3, offset 12 addi.n
a7, a7, 16 // Increment src_buff pointer a7 by 16 addi.n a3, a3, 16 // Increment dest_buff pointer a3 by 16 +#if !XCHAL_HAVE_LOOPS + blt a3, a10, ._main_loop_aligned_done // Check end of the main loop, branch if dest_buf (a3) lower than a10 +#endif ._main_loop_aligned: // Finish the remaining bytes out of the main loop @@ -143,12 +157,22 @@ lv_rgb888_blend_normal_to_rgb888_esp: sub a10, a11, a14 // Get the dest_w_bytes after the aligning loop srli a9, a10, 4 // Calculate main loop len (a9 = dest_w_bytes_local / 16) +#if XCHAL_HAVE_LOOPS + loopnez a14, ._dest_aligning_loop // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a14, ._dest_aligning_loop // Branch to the end, if a14 is 0 (no need to run the aligning loop) + add a14, a14, a3 // loop_len = loop_len + dest_buf (a3) + ._dest_aligning_loop_done: +#endif // Run dest_buff aligning loop byte by byte - loopnez a14, ._dest_aligning_loop l8ui a15, a7, 0 // Load 8 bits from src_buff a7 to a15, offset 0 addi.n a7, a7, 1 // Increment src_buff pointer a7 by 1 s8i a15, a3, 0 // Save 8 bits from a15 to dest_buff a3, offset 0 addi.n a3, a3, 1 // Increment dest_buff pointer a3 by 1 +#if !XCHAL_HAVE_LOOPS + blt a3, a14, ._dest_aligning_loop_done // Check end of the aligning loop, branch if dest_buf (a3) lower than a14 +#endif ._dest_aligning_loop: // Destination is aligned, source is unaligned @@ -163,8 +187,16 @@ lv_rgb888_blend_normal_to_rgb888_esp: // First preload for the loopnez cycle l32i.n a15, a7, 0 // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 0 +#if XCHAL_HAVE_LOOPS + loopnez a9, ._main_loop_unalign // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a9, ._main_loop_unalign // Branch to the end, if a9 is 0 (no need to run the main loop) + slli a9, a9, 4 // loop_len (a9) *= 16 (main loop increments address pointers by 16) + add a9, a9, a3 // a9 += dest_buff address + ._main_loop_unalign_done: +#endif // Run main loop which copies 16
bytes (5 and 1/3 of RGB888 pixels) in one loop run - loopnez a9, ._main_loop_unalign l32i.n a14, a7, 4 // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4 l32i.n a13, a7, 8 // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8 src a15, a14, a15 // Concatenate a14 and a15 and shift by SAR_BYTE amount to a15 @@ -179,6 +211,9 @@ lv_rgb888_blend_normal_to_rgb888_esp: src a12, a15, a12 // Concatenate a15 and a12 and shift by SAR_BYTE amount to a12 s32i.n a12, a3, 12 // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 12 addi.n a3, a3, 16 // Increment dest_buff pointer a3 by 16 +#if !XCHAL_HAVE_LOOPS + blt a3, a9, ._main_loop_unalign_done // Check end of the main loop, branch if dest_buf (a3) lower than a9 +#endif ._main_loop_unalign: // Finish the remaining bytes out of the loop @@ -237,11 +272,19 @@ lv_rgb888_blend_normal_to_rgb888_esp: // Convert strides to matrix paddings sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11) sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11) + movi.n a12, 3 // a12 = 3 (pointer increment in main loop for esp32s2) .outer_loop_short_matrix_length: - +#if XCHAL_HAVE_LOOPS + loopnez a4, ._main_loop_short_matrix_length // zero-overhead loop (not supported in esp32s2) +#else + // Init loop parameters + beqz a4, ._main_loop_short_matrix_length // Branch to the end, if a4 is 0 (no need to run the main loop) + mul16u a10, a4, a12 // a10 = loop_len (a4) * 3 (main loop increments address pointers by 3) + add a10, a10, a3 // a10 += dest_buf address + ._main_loop_short_matrix_length_done: +#endif // Run main loop which copies 3 bytes (one RGB888 pixel) in one loop run - loopnez a4, ._main_loop_short_matrix_length l8ui a15, a7, 0 // Load 8 bits from src_buff a7 to a15, offset 0 l8ui a14, a7, 1 // Load 8 bits from src_buff a7 to a14, offset 1 l8ui a13, a7, 2 // Load 8 bits from src_buff a7 to a13, offset 2 @@ -250,6 +293,9 @@
lv_rgb888_blend_normal_to_rgb888_esp: s8i a13, a3, 2 // Save 8 bits from a13 to dest_buff a3, offset 2 addi.n a7, a7, 3 // Increment src_buff pointer a7 by 3 addi.n a3, a3, 3 // Increment dest_buff pointer a3 by 3 +#if !XCHAL_HAVE_LOOPS + blt a3, a10, ._main_loop_short_matrix_length_done // Check end of the main loop, branch if dest_buf (a3) lower than a10 +#endif ._main_loop_short_matrix_length: add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6) @@ -259,3 +305,10 @@ lv_rgb888_blend_normal_to_rgb888_esp: movi.n a2, 1 // Return LV_RESULT_OK = 1 retw.n // Return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_pie.S similarity index 95% rename from components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32s3.S rename to components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_pie.S index cb31100ff..12c724bf2 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32s3.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_pie.S @@ -44,10 +44,12 @@ lv_rgb888_blend_normal_to_rgb888_esp: slli a11, a4, 1 // a11 = (a4 << 1) + a4 add a11, a11, a4 // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w + beqz a4, _zero_matrix_len_check // Check if dest_w a4 is zero + beqz a5, _zero_matrix_len_check // Check if dest_h a5 is zero // No need to convert any colors here, we are copying from rgb888 to rgb888 // Check dest_w length - bltui a4, 8, _matrix_width_check // Branch if dest_w (a4) is lower than 8 + bltui a4, 8, _matrix_width_check // Branch if dest_w 
(a4) is lower than 8 //********************************************************************************************************************** @@ -219,3 +221,10 @@ lv_rgb888_blend_normal_to_rgb888_esp: movi.n a2, 1 // Return LV_RESULT_OK = 1 retw.n // Return + +//********************************************************************************************************************** + + // One of the matrix dimensions is zero, return early + _zero_matrix_len_check: + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt index 20c061ff3..7b01d3a8b 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt +++ b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt @@ -1,18 +1,18 @@ # Include SIMD assembly source code for rendering -if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3) +if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3 OR CONFIG_IDF_TARGET_ESP32S2) message(VERBOSE "Compiling SIMD") set(PORT_PATH "../../../src/lvgl9") if(CONFIG_IDF_TARGET_ESP32S3) - file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32s3.S) # Select only esp32s3 related files + file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_xtensa_pie.S) # Select Xtensa PIE, for esp32s3 target else() - file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32.S) # Select only esp32 related files + file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_xtensa_base.S) # Select Xtensa Base for esp32, esp32s2 targets endif() file(GLOB_RECURSE ASM_MACROS ${PORT_PATH}/simd/lv_macro_*.S) # Explicitly add all assembler macro files else() - message(WARNING "This test app is intended only for esp32 and esp32s3") + message(WARNING "This test app is intended only for Xtensa targets (esp32, esp32s2, esp32s3") endif() # Hard copy of LV files diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h 
b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h index 5c9a53c20..4bc339d54 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h +++ b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h @@ -30,7 +30,7 @@ typedef struct { unsigned int unalign_step; // Increment step in bytes unalignment of the test array unsigned int dest_stride_step; // Increment step in destination stride of the test array unsigned int test_combinations_count; // Count of fest combinations -} test_matrix_params_t; +} test_matrix_lv_fill_params_t; /** * @brief Functionality test case parameters diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c index 600b8eecc..bb915ffe2 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c +++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c @@ -17,9 +17,10 @@ #include "lv_draw_sw_blend_to_rgb565.h" #include "lv_draw_sw_blend_to_rgb888.h" -#define WIDTH 128 -#define HEIGHT 128 -#define STRIDE WIDTH +#define COMMON_DIM 128 // Common matrix dimension 128x128 pixels +#define WIDTH COMMON_DIM +#define HEIGHT COMMON_DIM +#define STRIDE COMMON_DIM #define UNALIGN_BYTES 1 #define BENCHMARK_CYCLES 1000 @@ -79,7 +80,7 @@ TEST_CASE("LV Fill benchmark ARGB8888", "[fill][benchmark][ARGB8888]") .height = HEIGHT, .width = WIDTH, .stride = STRIDE * sizeof(uint32_t), - .cc_height = HEIGHT - 1, + .cc_height = HEIGHT, .cc_width = WIDTH - 1, .benchmark_cycles = BENCHMARK_CYCLES, .array_align16 = (void *)dest_array_align16, @@ -87,7 +88,7 @@ TEST_CASE("LV Fill benchmark ARGB8888", "[fill][benchmark][ARGB8888]") .blend_api_func = &lv_draw_sw_blend_color_to_argb8888, }; - ESP_LOGI(TAG_LV_FILL_BENCH, "running test for ARGB8888 color format"); + ESP_LOGI(TAG_LV_FILL_BENCH, "running memset for ARGB8888 to ARGB8888 color format"); lv_fill_benchmark_init(&test_params); 
free(dest_array_align16); } @@ -104,7 +105,7 @@ TEST_CASE("LV Fill benchmark RGB565", "[fill][benchmark][RGB565]") .height = HEIGHT, .width = WIDTH, .stride = STRIDE * sizeof(uint16_t), - .cc_height = HEIGHT - 1, + .cc_height = HEIGHT, .cc_width = WIDTH - 1, .benchmark_cycles = BENCHMARK_CYCLES, .array_align16 = (void *)dest_array_align16, @@ -112,7 +113,7 @@ TEST_CASE("LV Fill benchmark RGB565", "[fill][benchmark][RGB565]") .blend_api_func = &lv_draw_sw_blend_color_to_rgb565, }; - ESP_LOGI(TAG_LV_FILL_BENCH, "running test for RGB565 color format"); + ESP_LOGI(TAG_LV_FILL_BENCH, "running memset for RGB565 to RGB565 color format"); lv_fill_benchmark_init(&test_params); free(dest_array_align16); } @@ -129,7 +130,7 @@ TEST_CASE("LV Fill benchmark RGB888", "[fill][benchmark][RGB888]") .height = HEIGHT, .width = WIDTH, .stride = STRIDE * 3, - .cc_height = HEIGHT - 1, + .cc_height = HEIGHT, .cc_width = WIDTH - 1, .benchmark_cycles = BENCHMARK_CYCLES, .array_align16 = (void *)dest_array_align16, @@ -137,7 +138,7 @@ TEST_CASE("LV Fill benchmark RGB888", "[fill][benchmark][RGB888]") .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888, }; - ESP_LOGI(TAG_LV_FILL_BENCH, "running test for RGB888 color format"); + ESP_LOGI(TAG_LV_FILL_BENCH, "running memset for RGB888 to RGB888 color format"); lv_fill_benchmark_init(&test_params); free(dest_array_align16); } diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c index 958eaae91..3256a6273 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c +++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c @@ -40,6 +40,28 @@ static lv_color_t test_color = { .red = 0x12, }; +static const test_matrix_lv_fill_params_t default_test_matrix_lv_fill = { +#if CONFIG_IDF_TARGET_ESP32S3 + .min_w = 8, + .min_h = 1, + .max_w = 40, + .max_h = 4, + .max_unalign_byte = 16, // Use 16-byte 
boundary check for Xtensa PIE + .unalign_step = 1, + .dest_stride_step = 1, +#else + .min_w = 1, + .min_h = 1, + .max_w = 32, + .max_h = 4, + .max_unalign_byte = 4, // Use 4-byte boundary check for Xtensa base + .unalign_step = 1, + .dest_stride_step = 1, +#endif + .min_unalign_byte = 0, + .test_combinations_count = 0, +}; + // ------------------------------------------------ Static function headers -------------------------------------------- /** @@ -50,7 +72,7 @@ static lv_color_t test_color = { * @param[in] test_matrix Pointer to structure defining test matrix - all the test combinations * @param[in] test_case Pointer to structure defining functionality test case */ -static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case); +static void functionality_test_matrix(test_matrix_lv_fill_params_t *test_matrix, func_test_case_params_t *test_case); /** * @brief Fill test buffers for functionality test @@ -107,19 +129,12 @@ Functionality tests // ------------------------------------------------ Test cases stages -------------------------------------------------- -TEST_CASE("Test fill functionality ARGB8888", "[fill][functionality][ARGB8888]") +TEST_CASE("LV Fill functionality ARGB8888", "[fill][functionality][ARGB8888]") { - test_matrix_params_t test_matrix = { - .min_w = 8, // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed - .min_h = 1, - .max_w = 16, - .max_h = 16, - .min_unalign_byte = 0, - .max_unalign_byte = 16, - .unalign_step = 1, - .dest_stride_step = 1, - .test_combinations_count = 0, - }; + test_matrix_lv_fill_params_t test_matrix = default_test_matrix_lv_fill; +#if (CONFIG_IDF_TARGET_ESP32S3) + test_matrix.min_w = 8; // 8 is the lower limit for the PIE asm implementation, otherwise base asm is executed +#endif func_test_case_params_t test_case = { .blend_api_func = &lv_draw_sw_blend_color_to_argb8888, @@ -127,23 +142,17 @@ TEST_CASE("Test fill functionality ARGB8888", 
"[fill][functionality][ARGB8888]") .data_type_size = sizeof(uint32_t), }; - ESP_LOGI(TAG_LV_FILL_FUNC, "running test for ARGB8888 color format"); + ESP_LOGI(TAG_LV_FILL_FUNC, "running memset for ARGB8888 to ARGB8888 color format"); + ESP_LOGI(TAG_LV_FILL_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h); functionality_test_matrix(&test_matrix, &test_case); } -TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]") +TEST_CASE("LV Fill functionality RGB565", "[fill][functionality][RGB565]") { - test_matrix_params_t test_matrix = { - .min_w = 16, // 16 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed - .min_h = 1, - .max_w = 32, - .max_h = 16, - .min_unalign_byte = 0, - .max_unalign_byte = 16, - .unalign_step = 1, - .dest_stride_step = 1, - .test_combinations_count = 0, - }; + test_matrix_lv_fill_params_t test_matrix = default_test_matrix_lv_fill; +#if (CONFIG_IDF_TARGET_ESP32S3) + test_matrix.min_w = 16; // 16 is the lower limit for the PIE asm implementation, otherwise base asm is executed +#endif func_test_case_params_t test_case = { .blend_api_func = &lv_draw_sw_blend_color_to_rgb565, @@ -151,23 +160,17 @@ TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]") .data_type_size = sizeof(uint16_t), }; - ESP_LOGI(TAG_LV_FILL_FUNC, "running test for RGB565 color format"); + ESP_LOGI(TAG_LV_FILL_FUNC, "running memset for RGB565 to RGB565 color format"); + ESP_LOGI(TAG_LV_FILL_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h); functionality_test_matrix(&test_matrix, &test_case); } -TEST_CASE("Test fill functionality RGB888", "[fill][functionality][RGB888]") +TEST_CASE("LV Fill functionality RGB888", "[fill][functionality][RGB888]") { - test_matrix_params_t test_matrix = { - .min_w = 12, // 12 is the lower limit for the esp32s3 asm implementation, 
otherwise esp32 is executed - .min_h = 1, - .max_w = 32, - .max_h = 3, - .min_unalign_byte = 0, - .max_unalign_byte = 16, - .unalign_step = 1, - .dest_stride_step = 1, - .test_combinations_count = 0, - }; + test_matrix_lv_fill_params_t test_matrix = default_test_matrix_lv_fill; +#if (CONFIG_IDF_TARGET_ESP32S3) + test_matrix.min_w = 12; // 12 is the lower limit for the PIE asm implementation, otherwise base asm is executed +#endif func_test_case_params_t test_case = { .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888, @@ -175,12 +178,13 @@ TEST_CASE("Test fill functionality RGB888", "[fill][functionality][RGB888]") .data_type_size = sizeof(uint8_t) * 3, // 24-bit data length }; - ESP_LOGI(TAG_LV_FILL_FUNC, "running test for RGB888 color format"); + ESP_LOGI(TAG_LV_FILL_FUNC, "running memset for RGB888 to RGB888 color format"); + ESP_LOGI(TAG_LV_FILL_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h); functionality_test_matrix(&test_matrix, &test_case); } // ------------------------------------------------ Static test functions ---------------------------------------------- -static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case) +static void functionality_test_matrix(test_matrix_lv_fill_params_t *test_matrix, func_test_case_params_t *test_case) { // Step destination array width for (int dest_w = test_matrix->min_w; dest_w <= test_matrix->max_w; dest_w++) { diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c index ba8d44501..c5408ae79 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c +++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c @@ -105,7 +105,7 @@ TEST_CASE("LV Image benchmark RGB565 blend to RGB565", "[image][benchmark][RGB56 .color_format = LV_COLOR_FORMAT_RGB565, }; 
- ESP_LOGI(TAG_LV_IMAGE_BENCH, "running test for RGB565 color format"); + ESP_LOGI(TAG_LV_IMAGE_BENCH, "running memcpy for RGB565 to RGB565 color format"); lv_image_benchmark_init(&test_params); free(dest_array_align16); free(src_array_align16); diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c index 5f5b06680..1ae58b84f 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c +++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c @@ -40,7 +40,7 @@ static char test_msg_buf[200]; static const test_matrix_lv_image_params_t default_test_matrix_image_blend = { #if CONFIG_IDF_TARGET_ESP32S3 - .min_w = 8, // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed + .min_w = 8, // 8 is the lower limit for the PIE asm implementation, otherwise base assembly is executed .min_h = 1, .max_w = 24, .max_h = 2, @@ -53,7 +53,7 @@ static const test_matrix_lv_image_params_t default_test_matrix_image_blend = { #else .min_w = 1, .min_h = 1, - .max_w = 16, + .max_w = 12, .max_h = 2, .src_max_unalign_byte = 4, // Use 4-byte boundary check for Xtensa base .dest_max_unalign_byte = 4, @@ -142,7 +142,8 @@ TEST_CASE("LV Image functionality RGB565 blend to RGB565", "[image][functionalit .operation_type = OPERATION_FILL, }; - ESP_LOGI(TAG_LV_IMAGE_FUNC, "running test for RGB565 color format"); + ESP_LOGI(TAG_LV_IMAGE_FUNC, "running memcpy for RGB565 to RGB565 color format"); + ESP_LOGI(TAG_LV_IMAGE_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h); functionality_test_matrix(&test_matrix, &test_case); } @@ -160,7 +161,8 @@ TEST_CASE("LV Image functionality RGB888 blend to RGB888", "[image][functionalit .operation_type = OPERATION_FILL, }; - ESP_LOGI(TAG_LV_IMAGE_FUNC, "running test for RGB888 color format"); + 
ESP_LOGI(TAG_LV_IMAGE_FUNC, "running memcpy for RGB888 to RGB888 color format"); + ESP_LOGI(TAG_LV_IMAGE_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h); functionality_test_matrix(&test_matrix, &test_case); } @@ -186,6 +188,7 @@ static void functionality_test_matrix(test_matrix_lv_image_params_t *test_matrix // Step destination array unalignment for (int dest_unalign_byte = test_matrix->dest_min_unalign_byte; dest_unalign_byte <= test_matrix->dest_max_unalign_byte; dest_unalign_byte += test_matrix->dest_unalign_step) { + //printf("dest_w = %d, dest_h = %d, src_stride = %d, dest_stride = %d, src_unalign_byte = %d, dest_unalign_byte = %d\n", dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte); // Call functionality test UPDATE_TEST_CASE(test_case, dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte); lv_image_functionality(test_case);