From 3df27924d00ed23ef5709786af00bfd589197fa8 Mon Sep 17 00:00:00 2001
From: "peter.marcisovsky" <peter.marcisovsky@espressif.com>
Date: Fri, 17 Jan 2025 21:43:16 +0100
Subject: [PATCH] feat(lvgl_port_simd): Support for esp32s2, assembly rendering

    - renamed assembly src files to xtensa_pie, xtensa_base
    - xtensa_base src files use zero-overhead loops only for esp32
    - added zero length matrix check into all src files
---
 components/esp_lvgl_port/CMakeLists.txt       |   8 +-
 .../simd/lv_color_blend_to_argb8888_esp32.S   |  81 ----------
 .../lv_color_blend_to_argb8888_xtensa_base.S  | 147 ++++++++++++++++++
 ...> lv_color_blend_to_argb8888_xtensa_pie.S} |  19 ++-
 ...=> lv_color_blend_to_rgb565_xtensa_base.S} | 108 +++++++++----
 ... => lv_color_blend_to_rgb565_xtensa_pie.S} |  14 +-
 ...=> lv_color_blend_to_rgb888_xtensa_base.S} |  30 +++-
 ... => lv_color_blend_to_rgb888_xtensa_pie.S} |  11 ++
 .../src/lvgl9/simd/lv_macro_memset.S          |  50 ++++++
 ...b565_blend_normal_to_rgb565_xtensa_base.S} |  68 +++++++-
 ...gb565_blend_normal_to_rgb565_xtensa_pie.S} |   9 ++
 ...b888_blend_normal_to_rgb888_xtensa_base.S} |  65 +++++++-
 ...gb888_blend_normal_to_rgb888_xtensa_pie.S} |  11 +-
 .../test_apps/simd/main/CMakeLists.txt        |   8 +-
 .../test_apps/simd/main/lv_fill_common.h      |   2 +-
 .../simd/main/test_lv_fill_benchmark.c        |  19 +--
 .../simd/main/test_lv_fill_functionality.c    |  86 +++++-----
 .../simd/main/test_lv_image_benchmark.c       |   2 +-
 .../simd/main/test_lv_image_functionality.c   |  11 +-
 19 files changed, 549 insertions(+), 200 deletions(-)
 delete mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_base.S
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_argb8888_esp32s3.S => lv_color_blend_to_argb8888_xtensa_pie.S} (95%)
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_rgb565_esp32.S => lv_color_blend_to_rgb565_xtensa_base.S} (60%)
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_rgb565_esp32s3.S => lv_color_blend_to_rgb565_xtensa_pie.S} (97%)
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_rgb888_esp32.S => lv_color_blend_to_rgb888_xtensa_base.S} (77%)
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_color_blend_to_rgb888_esp32s3.S => lv_color_blend_to_rgb888_xtensa_pie.S} (97%)
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memset.S
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_rgb565_blend_normal_to_rgb565_esp32.S => lv_rgb565_blend_normal_to_rgb565_xtensa_base.S} (83%)
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_rgb565_blend_normal_to_rgb565_esp32s3.S => lv_rgb565_blend_normal_to_rgb565_xtensa_pie.S} (97%)
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_rgb888_blend_normal_to_rgb888_esp32.S => lv_rgb888_blend_normal_to_rgb888_xtensa_base.S} (83%)
 rename components/esp_lvgl_port/src/lvgl9/simd/{lv_rgb888_blend_normal_to_rgb888_esp32s3.S => lv_rgb888_blend_normal_to_rgb888_xtensa_pie.S} (95%)

diff --git a/components/esp_lvgl_port/CMakeLists.txt b/components/esp_lvgl_port/CMakeLists.txt
index 94aac0418..c74fee001 100644
--- a/components/esp_lvgl_port/CMakeLists.txt
+++ b/components/esp_lvgl_port/CMakeLists.txt
@@ -76,14 +76,14 @@ if("usb_host_hid" IN_LIST build_components)
     list(APPEND ADD_LIBS idf::usb_host_hid)
 endif()
 
-# Include SIMD assembly source code for rendering, only for (9.1.0 <= LVG_version < 9.2.0) and only for esp32 and esp32s3
+# Include SIMD assembly source code for rendering, only for (9.1.0 <= LVGL version < 9.2.0) and only for Xtensa targets (esp32, esp32s2, esp32s3)
 if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
-    if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3)
+    if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3 OR CONFIG_IDF_TARGET_ESP32S2)
         message(VERBOSE "Compiling SIMD")
         if(CONFIG_IDF_TARGET_ESP32S3)
-            file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32s3.S)    # Select only esp32s3 related files
+            file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_xtensa_pie.S)       # Select Xtensa PIE, for esp32s3 target
         else()
-            file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32.S)      # Select only esp32 related files
+            file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_xtensa_base.S)      # Select Xtensa Base for esp32, esp32s2 targets
         endif()
 
         # Explicitly add all assembly macro files
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S
deleted file mode 100644
index 7d0606757..000000000
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-// This is LVGL ARGB8888 simple fill for ESP32 processor
-
-    .section .text
-    .align  4
-    .global lv_color_blend_to_argb8888_esp
-    .type   lv_color_blend_to_argb8888_esp,@function
-
-// The function implements the following C code:
-// void lv_color_blend_to_argb8888(_lv_draw_sw_blend_fill_dsc_t * dsc);
-
-// Input params
-//
-// dsc - a2
-
-// typedef struct {
-//     uint32_t opa;                l32i    0
-//     void * dst_buf;              l32i    4
-//     uint32_t dst_w;              l32i    8
-//     uint32_t dst_h;              l32i    12
-//     uint32_t dst_stride;         l32i    16
-//     const void * src_buf;        l32i    20
-//     uint32_t src_stride;         l32i    24
-//     const lv_opa_t * mask_buf;   l32i    28
-//     uint32_t mask_stride;        l32i    32
-// } asm_dsc_t;
-
-lv_color_blend_to_argb8888_esp:
-
-    entry   a1,    32
-
-    l32i.n   a3,    a2,    4                    // a3 - dest_buff
-    l32i.n   a4,    a2,    8                    // a4 - dest_w                in uint32_t
-    l32i.n   a5,    a2,    12                   // a5 - dest_h                in uint32_t
-    l32i.n   a6,    a2,    16                   // a6 - dest_stride           in bytes
-    l32i.n   a7,    a2,    20                   // a7 - src_buff (color)
-    l32i.n   a8,    a7,    0                    // a8 - color as value
-    slli     a11,   a4,    2                    // a11 - dest_w_bytes = sizeof(uint32_t) * dest_w
-
-    movi     a7,    0xff000000                  // oppactiy mask
-    or       a10,    a7,    a8                  // apply oppacity
-
-    srli    a9,    a4,   2                      // a9 - loop_len = dest_w / 4
-    sub     a6,    a6,   a11                    // dest_stride = dest_stride - dest_w_bytes
-
-    .outer_loop:
-
-        // Run main loop which sets 16 bytes in one loop run
-        loopnez a9, ._main_loop
-            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3
-            s32i.n      a10,  a3,  4                    // save 32 bits from a10 to dest_buff a3
-            s32i.n      a10,  a3,  8                    // save 32 bits from a10 to dest_buff a3
-            s32i.n      a10,  a3,  12                   // save 32 bits from a10 to dest_buff a3
-            addi.n      a3,   a3,  16                   // increment dest_buff pointer by 16 bytes
-        ._main_loop:
-
-        // Finish the remaining bytes out of the loop
-        // Check modulo 8 of the dest_w_bytes, if - then set 8 bytes
-        bbci a11, 3, _mod_8_check                       // branch if 2-nd bit of dest_w_bytes is clear
-            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
-            s32i.n      a10,  a3,  4                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
-            addi.n      a3,   a3,  8                    // increment dest_buff pointer by 8 bytes
-        _mod_8_check:
-
-        // Check modulo 4 of the dest_w_bytes, if - then set 4 bytes
-        bbci a11, 2, _mod_4_check                       // branch if 2-nd bit of dest_w_bytes is clear
-            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
-            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4 bytes
-        _mod_4_check:
-
-        add     a3,  a3,  a6                             // dest_buff + dest_stride
-        addi.n  a5,  a5,  -1                             // decrease the outer loop
-    bnez a5, .outer_loop
-
-    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
-    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_base.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_base.S
new file mode 100644
index 000000000..d47aaee6b
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_base.S
@@ -0,0 +1,147 @@
+/*
+ * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <xtensa/config/core-isa.h>
+#include "lv_macro_memset.S"
+
+// This is LVGL ARGB8888 simple fill for ESP32, ESP32S2 processor
+
+    .section .text
+    .align  4
+    .global lv_color_blend_to_argb8888_esp
+    .type   lv_color_blend_to_argb8888_esp,@function
+
+// The function implements the following C code:
+// void lv_color_blend_to_argb8888(_lv_draw_sw_blend_fill_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                l32i    0
+//     void * dst_buf;              l32i    4
+//     uint32_t dst_w;              l32i    8
+//     uint32_t dst_h;              l32i    12
+//     uint32_t dst_stride;         l32i    16
+//     const void * src_buf;        l32i    20
+//     uint32_t src_stride;         l32i    24
+//     const lv_opa_t * mask_buf;   l32i    28
+//     uint32_t mask_stride;        l32i    32
+// } asm_dsc_t;
+
+lv_color_blend_to_argb8888_esp:
+
+    entry   a1,    32
+
+    l32i.n   a3,    a2,    4                    // a3 - dest_buff
+    l32i.n   a4,    a2,    8                    // a4 - dest_w                in uint32_t
+    l32i.n   a5,    a2,    12                   // a5 - dest_h                in uint32_t
+    l32i.n   a6,    a2,    16                   // a6 - dest_stride           in bytes
+    l32i.n   a7,    a2,    20                   // a7 - src_buff (color)
+    l32i.n   a8,    a7,    0                    // a8 - color as value
+    slli     a11,   a4,    2                    // a11 - dest_w_bytes = sizeof(uint32_t) * dest_w
+
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
+    movi     a7,    0xff000000                  // opacity mask
+    or       a10,    a7,    a8                  // apply opacity
+    sub      a6,     a6,   a11                  // dest_stride = dest_stride - dest_w_bytes
+
+    // Check dest_w length
+    bltui   a4,  8,  _matrix_width_check                // Branch if dest_w (a4) is lower than 8
+    srli    a9,  a4,   3                                // a9 - loop_len = dest_w / 8
+
+#if !XCHAL_HAVE_LOOPS
+    slli    a14,  a9,  5                                // a14 = loop_len (a9) * 32 (main loop increments address pointers by 32)
+#endif
+
+    .outer_loop:
+
+#if XCHAL_HAVE_LOOPS
+        loopnez a9,  ._main_loop                        // zero-overhead loop (not supported for esp32s2)
+#else
+        // Init loop parameters
+        beqz    a9,   ._main_loop                       // Branch to the end, if a9 is 0 (no need to run the main loop)
+        add     a15,  a14,  a3                          // a15 = a14 + dest_buf address
+        .main_loop_done:                                // branch-loop start label (blt below jumps back here while a3 < a15)
+#endif
+        // Run main loop which sets 32 bytes (8 ARGB8888 pixels) in one loop run
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            s32i.n      a10,  a3,  4                    // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+            s32i.n      a10,  a3,  8                    // save 32 bits from a10 to dest_buff a3, offset 8 bytes
+            s32i.n      a10,  a3,  12                   // save 32 bits from a10 to dest_buff a3, offset 12 bytes
+            s32i.n      a10,  a3,  16                   // save 32 bits from a10 to dest_buff a3, offset 16 bytes
+            s32i.n      a10,  a3,  20                   // save 32 bits from a10 to dest_buff a3, offset 20 bytes
+            s32i.n      a10,  a3,  24                   // save 32 bits from a10 to dest_buff a3, offset 24 bytes
+            s32i.n      a10,  a3,  28                   // save 32 bits from a10 to dest_buff a3, offset 28 bytes
+            addi.n      a3,   a3,  32                   // increment dest_buff a3 pointer by 32 bytes
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,   a15,  .main_loop_done             // Check end of the main loop, branch if dest_buf (a3) lower than a15
+#endif
+        ._main_loop:                                    // end of the main loop (loopnez end label / beqz skip target)
+
+        // Finish the remaining bytes out of the loop
+
+        // Check modulo 16 of the dest_w_bytes (a11), if - then set 16 bytes (4 ARGB8888 pixels)
+        // src_reg a10, dest_buff a3, dest_w_bytes a11
+        macro_memset_mod_16 a10, a3, a11, __LINE__
+
+        // Check modulo 8 of the dest_w_bytes (a11), if - then set 8 bytes (2 ARGB8888 pixels)
+        // src_reg a10, dest_buff a3, dest_w_bytes a11
+        macro_memset_mod_8 a10, a3, a11, __LINE__
+
+        // Check modulo 4 of the dest_w_bytes (a11), if - then set 4 bytes (1 ARGB8888 pixel)
+        // src_reg a10, dest_buff a3, dest_w_bytes a11
+        macro_memset_mod_4 a10, a3, a11, __LINE__
+
+        add     a3,  a3,  a6                             // dest_buff + dest_stride
+        addi.n  a5,  a5,  -1                             // decrease the outer loop
+    bnez a5, .outer_loop
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
+
+//**********************************************************************************************************************
+
+    // Small matrix width, keep it simple for lengths less than 8 pixels
+
+    _matrix_width_check:
+
+#if !XCHAL_HAVE_LOOPS
+    slli    a14,  a4,   2                               // a14 = dest_w (a4) * 4, row length in bytes (short loop increments address pointer by 4)
+#endif
+
+    .outer_loop_short_matrix:
+
+#if XCHAL_HAVE_LOOPS
+        loopnez a4,  ._main_loop_short_matrix           // zero-overhead loop (not supported for esp32s2)
+#else
+        // Init loop parameters
+        add     a15,  a14,  a3                          // a15 = a14 + dest_buf address
+        ._main_loop_short_matrix_done:                  // branch-loop start label (blt below jumps back here while a3 < a15)
+#endif
+        // Run main loop which sets 4 bytes (one ARGB8888 pixel) in one loop run
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3
+            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4 bytes
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,   a15,  ._main_loop_short_matrix_done             // Check end of the main loop, branch if dest_buf (a3) lower than a15
+#endif
+        ._main_loop_short_matrix:                       // end of the short loop (loopnez end label)
+
+        add     a3,  a3,  a6                            // dest_buff + dest_stride
+        addi.n  a5,  a5,  -1                            // decrease the outer loop
+    bnez a5, .outer_loop_short_matrix
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                              // return LV_RESULT_OK = 1
+    retw.n                                      // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_pie.S
similarity index 95%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_pie.S
index 10276f4f4..d9934dbfa 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_xtensa_pie.S
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
  *
  * SPDX-License-Identifier: Apache-2.0
  */
@@ -42,11 +42,13 @@ lv_color_blend_to_argb8888_esp:
     l32i.n   a8,    a7,    0                    // a8 - color as value
     slli     a11,   a4,    2                    // a11 - dest_w_bytes = sizeof(uint32_t) * dest_w
 
-    movi     a7,    0xff000000                  // oppactiy mask
-    or       a10,    a7,    a8                  // apply oppacity
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
+    movi     a7,    0xff000000                  // opacity mask
+    or       a10,    a7,    a8                  // apply opacity
 
     // Check for short lengths
-    // dest_w should be at least 8, othewise it's not worth using esp32s3 TIE
+    // dest_w should be at least 8, otherwise it's not worth using esp32s3 TIE
     bgei     a4,   8,  _esp32s3_implementation          // Branch if dest_w is greater than or equal to 8
     j .lv_color_blend_to_argb8888_esp32_body            // Jump to esp32 implementation
 
@@ -227,7 +229,7 @@ lv_color_blend_to_argb8888_esp:
                 addi.n      a3,   a3,  4                // increment dest_buff pointer by 4 bytes
         _dest_buff_aligned_by_1byte:
 
-        // Shift q reg, allowing to set 16-byte unaligned adata
+        // Shift q reg, allowing to set 16-byte unaligned data
         wur.sar_byte     a15                            // apply unalignment to the SAR_BYTE
         ee.src.q   q2,  q0,  q1                         // shift concat. of q0 and q1 to q2 by SAR_BYTE amount
 
@@ -323,3 +325,10 @@ lv_color_blend_to_argb8888_esp:
 
     movi.n   a2, 1                                      // return LV_RESULT_OK = 1
     retw.n                                              // return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                              // return LV_RESULT_OK = 1
+    retw.n                                      // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_base.S
similarity index 60%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_base.S
index 07b5aa111..a22ba064f 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_base.S
@@ -1,10 +1,13 @@
 /*
- * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
  *
  * SPDX-License-Identifier: Apache-2.0
  */
 
-// This is LVGL RGB565 simple fill for ESP32 processor
+#include <xtensa/config/core-isa.h>
+#include "lv_macro_memset.S"
+
+// This is LVGL RGB565 simple fill for ESP32, ESP32S2 processor
 
     .section .text
     .align  4
@@ -41,7 +44,10 @@ lv_color_blend_to_rgb565_esp:
     l32i.n   a8,    a7,    0                    // a8 - color as value
     slli     a11,   a4,    1                    // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
 
-    // Convert color to rgb656
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
+
+    // Convert color to rgb565
     l8ui    a15,    a7,    2                    // red
     movi.n  a14,    0xf8
     and     a13,    a15,   a14
@@ -66,6 +72,9 @@ lv_color_blend_to_rgb565_esp:
     movi.n  a8,    0x3                          // a8 = 0x3, dest_buff align mask
     sub     a6,    a6,   a11                    // dest_stride = dest_stride - dest_w_bytes
 
+    // Check dest_w length
+    bltui   a4,  8,  _matrix_width_check         // Branch if dest_w (a4) is lower than 8
+
     // cache init
     // Prepare main loop length and dest_w_bytes
     srli     a9,     a4,    4                    // a9 = loop_len = dest_w / 8, calculate main loop_len for original dest_w
@@ -97,8 +106,16 @@ lv_color_blend_to_rgb565_esp:
 
         _dest_buff_unaligned:
 
-        // Run main loop which sets 16 bytes in one loop run
-        loopnez a9, ._main_loop
+#if XCHAL_HAVE_LOOPS
+        loopnez a9,   ._main_loop                       // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz    a9,   ._main_loop                       // Branch to the end, if a9 is 0 (no need to run the main loop)
+        slli    a15,  a9,   5                           // a15 = loop_len (a9) * 32 (main loop increments address pointers by 32)
+        add     a15,  a15,  a3                          // a15 += dest_buf address
+        ._main_loop_aligned_done:
+#endif
+        // Run main loop which sets 32 bytes (16 RGB565 pixels) in one loop run
             s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
             s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
             s32i.n      a10,  a3,  8                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 8
@@ -108,37 +125,27 @@ lv_color_blend_to_rgb565_esp:
             s32i.n      a10,  a3,  24                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 24
             s32i.n      a10,  a3,  28                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 28
             addi.n      a3,   a3,  32                   // increment dest_buff pointer by 32
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,  a15,  ._main_loop_aligned_done     // Check end of the main loop, branch if dest_buf (a3) lower than a15
+#endif
         ._main_loop:
 
         // Finish the remaining bytes out of the loop
-        // Check modulo 8 of the dest_w_bytes, if - then set 16 bytes
-        bbci a11, 4, _mod_16_check                      // branch if 2-nd bit of dest_w_bytes is clear
-            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
-            s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
-            s32i.n      a10,  a3,  8                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 8
-            s32i.n      a10,  a3,  12                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 12
-            addi.n      a3,   a3,  16                   // increment dest_buff pointer by 16
-        _mod_16_check:
+        // Check modulo 16 of the dest_w_bytes (a11), if - then set 16 bytes (8 RGB565 pixels)
+        // src_reg a10, dest_buff a3, dest_w_bytes a11
+        macro_memset_mod_16 a10, a3, a11, __LINE__
 
-        // Finish the remaining bytes out of the loop
-        // Check modulo 8 of the dest_w_bytes, if - then set 8 bytes
-        bbci a11, 3, _mod_8_check                       // branch if 2-nd bit of dest_w_bytes is clear
-            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
-            s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
-            addi.n      a3,   a3,  8                    // increment dest_buff pointer by 8 bytes
-        _mod_8_check:
+        // Check modulo 8 of the dest_w_bytes (a11), if - then set 8 bytes (4 RGB565 pixels)
+        // src_reg a10, dest_buff a3, dest_w_bytes a11
+        macro_memset_mod_8 a10, a3, a11, __LINE__
 
-        // Check modulo 4 of the dest_w_bytes, if - then set 4 bytes
-        bbci a11, 2, _mod_4_check                       // branch if 2-nd bit of dest_w_bytes is clear
-            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
-            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4
-        _mod_4_check:
+        // Check modulo 4 of the dest_w_bytes (a11), if - then set 4 bytes (2 RGB565 pixels)
+        // src_reg a10, dest_buff a3, dest_w_bytes a11
+        macro_memset_mod_4 a10, a3, a11, __LINE__
 
-        // Check modulo 2 of the dest_w_bytes, if - then set 2 bytes
-        bbci a11, 1, _mod_2_check                       // branch if 1-st bit of dest_w_bytes is clear
-            s16i        a12,  a3,  0                    // save 16 bits from 16-bit color a12 to dest_buff a3, offset 0
-            addi.n      a3,   a3,  2                    // increment dest_buff pointer by 2 bytes
-        _mod_2_check:
+        // Check modulo 2 of the dest_w_bytes (a11), if - then set 2 bytes (1 RGB565 pixel)
+        // src_reg a10, dest_buff a3, dest_w_bytes a11
+        macro_memset_mod_2 a10, a3, a11, __LINE__
 
         add     a3,  a3,  a6                            // dest_buff + dest_stride
         addi.n  a5,  a5,  -1                            // decrease the outer loop
@@ -147,3 +154,44 @@ lv_color_blend_to_rgb565_esp:
 
     movi.n   a2, 1                                      // return LV_RESULT_OK = 1
     retw.n                                              // return
+
+//**********************************************************************************************************************
+
+    // Small matrix width, keep it simple for lengths less than 8 pixels
+
+    _matrix_width_check:
+
+#if !XCHAL_HAVE_LOOPS
+    slli    a14,  a4,   1                               // a14 = dest_w (a4) * 2, row length in bytes (short loop increments address pointer by 2)
+#endif
+
+    .outer_loop_short_matrix:
+
+#if XCHAL_HAVE_LOOPS
+        loopnez a4,  ._main_loop_short_matrix           // zero-overhead loop (not supported for esp32s2)
+#else
+        // Init loop parameters
+        add     a15,  a14,  a3                          // a15 = a14 + dest_buf address
+        ._main_loop_short_matrix_done:                  // branch-loop start label (blt below jumps back here while a3 < a15)
+#endif
+        // Run main loop which sets 2 bytes (one RGB565 pixel) in one loop run
+            s16i        a12,  a3,  0                    // save 16 bits from a12 to dest_buff a3
+            addi.n      a3,   a3,  2                    // increment dest_buff pointer by 2 bytes
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,   a15,  ._main_loop_short_matrix_done             // Check end of the main loop, branch if dest_buf (a3) lower than a15
+#endif
+        ._main_loop_short_matrix:                       // end of the short loop (loopnez end label)
+
+        add     a3,  a3,  a6                            // dest_buff + dest_stride
+        addi.n  a5,  a5,  -1                            // decrease the outer loop
+    bnez a5, .outer_loop_short_matrix
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_pie.S
similarity index 97%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_pie.S
index 3a9fe43cb..5444f1b2b 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_xtensa_pie.S
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
  *
  * SPDX-License-Identifier: Apache-2.0
  */
@@ -41,7 +41,10 @@ lv_color_blend_to_rgb565_esp:
     l32i.n   a8,    a7,    0                    // a8 - color as value
     slli     a11,   a4,    1                    // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
 
-    // Convert color to rgb656
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
+
+    // Convert color to rgb565
     l8ui    a15,    a7,    2                    // red
     movi.n  a14,    0xf8
     and     a13,    a15,   a14
@@ -402,3 +405,10 @@ lv_color_blend_to_rgb565_esp:
 
     movi.n   a2, 1                                      // return LV_RESULT_OK = 1
     retw.n                                              // return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_base.S
similarity index 77%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_base.S
index 467b5348a..77fb80147 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_base.S
@@ -1,10 +1,12 @@
 /*
- * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
  *
  * SPDX-License-Identifier: Apache-2.0
  */
 
-// This is LVGL RGB888 simple fill for ESP32 processor
+#include <xtensa/config/core-isa.h>
+
+// This is LVGL RGB888 simple fill for ESP32, ESP32S2 processor
 
     .section .text
     .align  4
@@ -40,6 +42,9 @@ lv_color_blend_to_rgb888_esp:
     l32i.n   a7,    a2,    20                   // a7 - src_buff (color)
     l32i.n   a8,    a7,    0                    // a8 - color as value
 
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
+
     // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4
     slli     a11,   a4,    1                    // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
     add      a11,   a11,   a4                   // a11 - dest_w_bytes = a11 + a4
@@ -64,15 +69,27 @@ lv_color_blend_to_rgb888_esp:
     srli     a9,     a4,    2                    // a9 = loop_len = dest_w / 4, calculate main loop_len for original dest_w
     movi.n   a8,     0x3                         // a8 = 0x3, remainder mask
     and      a10,    a4,    a8                   // a10 - remainder after division by 4 = a4 and 0x3
+    movi.n   a12,    12                          // a12 = 12 (pointer increment in main loop for esp32s2)
 
     .outer_loop:
 
+#if XCHAL_HAVE_LOOPS
+        loopnez a9,  ._main_loop                        // zero-overhead loop (not supported for esp32s2)
+#else
+        // Init loop parameters
+        beqz    a9,   ._main_loop                       // Branch to the end, if a9 is 0 (no need to run the main loop)
+        mul16u  a11,  a9,   a12                         // a11 = 12 (a12) * loop_len (a9) both operands must be lower than 16bit values (mul16u)
+        add     a11,  a11,  a3                          // a11 += dest_buf address
+        .main_loop_done:
+#endif
         // Run main loop which sets 12 bytes (4 rgb888) in one loop run
-        loopnez a9, ._main_loop
             s32i.n      a13,  a3,  0                    // save 32 bits from 32-bit color a13 to dest_buff a3, offset 0
             s32i.n      a14,  a3,  4                    // save 32 bits from 32-bit color a14 to dest_buff a3, offset 4
             s32i.n      a15,  a3,  8                    // save 32 bits from 32-bit color a15 to dest_buff a3, offset 8
             addi.n      a3,   a3,  12                   // increment dest_buff pointer by 12
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,   a11,  .main_loop_done             // Check end of the main loop, branch if dest_buf (a3) lower than a11
+#endif
         ._main_loop:
 
         bnei   a10,  0x3,  _less_than_3                 // branch if less than 3 values left
@@ -103,3 +120,10 @@ lv_color_blend_to_rgb888_esp:
 
     movi.n   a2, 1                                      // return LV_RESULT_OK = 1
     retw.n                                              // return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_pie.S
similarity index 97%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_pie.S
index 955db4d00..e8bceb531 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_xtensa_pie.S
@@ -40,6 +40,9 @@ lv_color_blend_to_rgb888_esp:
     l32i.n   a7,    a2,    20                   // a7 - src_buff (color)
     l32i.n   a8,    a7,    0                    // a8 - color as value
 
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
+
     // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4
     slli     a11,   a4,    1                    // a11 - dest_w_bytes = 2 * dest_w
     add      a11,   a11,   a4                   // a11 - dest_w_bytes = a11 + a4
@@ -344,3 +347,11 @@ unalignment_table:
 
     movi.n   a2, 1                                      // return LV_RESULT_OK = 1
     retw.n                                              // return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
+
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memset.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memset.S
new file mode 100644
index 000000000..342d8a566
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memset.S
@@ -0,0 +1,50 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Memset macros for modulo checking
+// After running the main loop, there is need to check remaining bytes to be set out of the main loop
+// Macros work with both, aligned and unaligned (4-byte boundary) memories
+// but performance is significantly lower when using unaligned memory, because of the unaligned memory access exception
+
+// Macro for checking modulo 16: stores \src_reg four times (16 bytes) when bit 4 of \condition is set
+ .macro macro_memset_mod_16 src_reg, dest_buf, condition, JUMP_TAG
+    // If bit 4 (value 16) of \condition is set, store 16 bytes; JUMP_TAG is appended to the skip label
+    bbci \condition, 4, ._mod_16_check_\JUMP_TAG     // Skip the stores if bit 4 of \condition is clear
+        s32i.n      \src_reg,   \dest_buf,  0        // Save 32 bits from \src_reg to \dest_buf, offset 0
+        s32i.n      \src_reg,   \dest_buf,  4        // Save 32 bits from \src_reg to \dest_buf, offset 4
+        s32i.n      \src_reg,   \dest_buf,  8        // Save 32 bits from \src_reg to \dest_buf, offset 8
+        s32i.n      \src_reg,   \dest_buf,  12       // Save 32 bits from \src_reg to \dest_buf, offset 12
+        addi.n      \dest_buf,  \dest_buf,  16       // Increment \dest_buf pointer by 16
+    ._mod_16_check_\JUMP_TAG:
+.endm // macro_memset_mod_16
+
+// Macro for checking modulo 8: stores \src_reg twice (8 bytes) when bit 3 of \condition is set
+ .macro macro_memset_mod_8 src_reg, dest_buf, condition, JUMP_TAG
+    // If bit 3 (value 8) of \condition is set, store 8 bytes; JUMP_TAG is appended to the skip label
+    bbci \condition, 3, ._mod_8_check_\JUMP_TAG      // Skip the stores if bit 3 of \condition is clear
+        s32i.n      \src_reg,   \dest_buf,  0        // Save 32 bits from \src_reg to \dest_buf, offset 0
+        s32i.n      \src_reg,   \dest_buf,  4        // Save 32 bits from \src_reg to \dest_buf, offset 4
+        addi.n      \dest_buf,  \dest_buf,  8        // Increment \dest_buf pointer by 8
+    ._mod_8_check_\JUMP_TAG:
+.endm // macro_memset_mod_8
+
+// Macro for checking modulo 4: stores \src_reg once (4 bytes) when bit 2 of \condition is set
+ .macro macro_memset_mod_4 src_reg, dest_buf, condition, JUMP_TAG
+    // If bit 2 (value 4) of \condition is set, store 4 bytes; JUMP_TAG is appended to the skip label
+    bbci \condition, 2, ._mod_4_check_\JUMP_TAG      // Skip the store if bit 2 of \condition is clear
+        s32i.n      \src_reg,   \dest_buf,  0        // Save 32 bits from \src_reg to \dest_buf, offset 0
+        addi.n      \dest_buf,  \dest_buf,  4        // Increment \dest_buf pointer by 4
+    ._mod_4_check_\JUMP_TAG:
+.endm // macro_memset_mod_4
+
+// Macro for checking modulo 2: stores 16 bits of \src_reg (2 bytes) when bit 1 of \condition is set
+ .macro macro_memset_mod_2 src_reg, dest_buf, condition, JUMP_TAG
+    // If bit 1 (value 2) of \condition is set, store 2 bytes; JUMP_TAG is appended to the skip label
+    bbci \condition, 1, ._mod_2_check_\JUMP_TAG      // Skip the store if bit 1 of \condition is clear
+        s16i        \src_reg,   \dest_buf,  0        // Save 16 bits from \src_reg to \dest_buf, offset 0
+        addi.n      \dest_buf,  \dest_buf,  2        // Increment \dest_buf pointer by 2
+    ._mod_2_check_\JUMP_TAG:
+.endm // macro_memset_mod_2
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_base.S
similarity index 83%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_base.S
index eb08cd877..5150bda76 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_base.S
@@ -1,12 +1,13 @@
 /*
- * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2024-2025 Espressif Systems (Shanghai) CO LTD
  *
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <xtensa/config/core-isa.h>
 #include "lv_macro_memcpy.S"        // Memcpy macros
 
-// This is LVGL RGB565 image blend to RGB565 for ESP32 processor
+// This is LVGL RGB565 image blend to RGB565 for ESP32, ESP32S2 processor
 
     .section .text
     .align  4
@@ -42,6 +43,8 @@ lv_rgb565_blend_normal_to_rgb565_esp:
     l32i.n   a8,    a2,    24                   // a8 - src_stride            in bytes
     slli     a11,   a4,    1                    // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
 
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
     // No need to convert any colors here, we are copying from rgb565 to rgb565
 
     // Check dest_w length
@@ -73,8 +76,16 @@ lv_rgb565_blend_normal_to_rgb565_esp:
 
     .outer_loop_align:
 
+#if XCHAL_HAVE_LOOPS
+        loopnez a9,   ._main_loop_aligned               // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz    a9,   ._main_loop_aligned               // Branch to the end, if a9 is 0 (no need to run the main loop)
+        slli    a10,  a9,   4                           // a10 = loop_len (a9) * 16 (main loop increments address pointers by 16)
+        add     a10,  a10,  a3                          // a10 += dest_buf address
+        ._main_loop_aligned_done:
+#endif
         // Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run
-        loopnez a9, ._main_loop_aligned
             l32i.n      a15,  a7,  0                    // Load 32 bits from src_buff a7 to a15, offset 0
             l32i.n      a14,  a7,  4                    // Load 32 bits from src_buff a7 to a14, offset 4
             l32i.n      a13,  a7,  8                    // Load 32 bits from src_buff a7 to a13, offset 8
@@ -85,6 +96,9 @@ lv_rgb565_blend_normal_to_rgb565_esp:
             s32i.n      a12,  a3,  12                   // Save 32 bits from a15 to dest_buff a3, offset 12
             addi.n      a7,   a7,  16                   // Increment src_buff pointer a7 by 16
             addi.n      a3,   a3,  16                   // Increment dest_buff pointer a3 by 16
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,  a10,  ._main_loop_aligned_done     // Check end of the main loop, branch if dest_buf (a3) lower than a10
+#endif
         ._main_loop_aligned:
 
         // Finish the remaining bytes out of the main loop
@@ -142,14 +156,23 @@ lv_rgb565_blend_normal_to_rgb565_esp:
         sub         a10,  a11,  a14                     // Get the dest_w_bytes after the aligning loop
         srli        a9,   a10,  4                       // Calculate main loop len (a9 = dest_w_bytes_local / 16)
 
+#if XCHAL_HAVE_LOOPS
+        loopnez  a14,  ._dest_aligning_loop             // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz     a14,  ._dest_aligning_loop             // Branch to the end, if a14 is 0 (no need to run the main loop)
+        add      a14,  a14,  a3                         // loop_len = loop_len + dest_buf (a3)
+        ._dest_aligning_loop_done:
+#endif
         // Run dest_buff aligning loop byte by byte
-        loopnez a14, ._dest_aligning_loop
             l8ui        a15,  a7,  0                    // Load 8 bits from src_buff a7 to a15, offset 0
             addi.n      a7,   a7,  1                    // Increment src_buff pointer a7 by 1
             s8i         a15,  a3,  0                    // Save 8 bits from a15 to dest_buff a3, offset 0
             addi.n      a3,   a3,  1                    // Increment dest_buff pointer a3 by 1
+#if     !XCHAL_HAVE_LOOPS
+        blt      a3,  a14,  ._dest_aligning_loop_done   // Check end of the main loop, branch if dest_buf (a3) lower than a14
+#endif
         ._dest_aligning_loop:
-
         // Destination is aligned, source is unaligned
 
         // For more information about this implementation, see chapter 3.3.2 Shifts and the Shift Amount Register (SAR)
@@ -162,8 +185,16 @@ lv_rgb565_blend_normal_to_rgb565_esp:
         // First preload for the loopnez cycle
         l32i.n      a15,  a7,  0                        // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 0
 
+#if XCHAL_HAVE_LOOPS
+        loopnez  a9,  ._main_loop_unalign               // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz     a9,  ._main_loop_unalign               // Branch to the end, if a9 is 0 (no need to run the main loop)
+        slli     a9,  a9,  4                            // loop_len (a9) *= 16 (main loop increments address pointers by 16)
+        add      a9,  a9,  a3                           // a9 += dest_buff address
+        ._main_loop_unalign_done:
+#endif
         // Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run
-        loopnez a9, ._main_loop_unalign
             l32i.n      a14,  a7,   4                   // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
             l32i.n      a13,  a7,   8                   // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8
             src         a15,  a14,  a15                 // Concatenate a14 and a15 and shift by SAR_BYTE amount to a15
@@ -178,6 +209,9 @@ lv_rgb565_blend_normal_to_rgb565_esp:
             src         a12,  a15,  a12                 // Concatenate a15 and a12 and shift by SAR_BYTE amount to a12
             s32i.n      a12,  a3,   12                  // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 12
             addi.n      a3,   a3,   16                  // Increment dest_buff pointer a3 by 16
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,  a9,  ._main_loop_unalign_done      // Check end of the main loop, branch if dest_buf (a3) lower than a9
+#endif
         ._main_loop_unalign:
 
         // Finish the remaining bytes out of the loop
@@ -239,14 +273,25 @@ lv_rgb565_blend_normal_to_rgb565_esp:
 
     .outer_loop_short_matrix_length:
 
+#if XCHAL_HAVE_LOOPS
+        loopnez  a4,   ._main_loop_short_matrix_length  // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz     a4,   ._main_loop_short_matrix_length  // Branch to the end, if a4 is 0 (no need to run the main loop)
+        slli     a10,  a4,  1                           // a10 = loop_len (a4) * 2 (main loop increments address pointers by 2)
+        add      a10,  a10, a3                          // a10 += dest_buf address
+        ._main_loop_short_matrix_length_done:
+#endif
         // Run main loop which copies 2 bytes (one RGB565 pixel) in one loop run
-        loopnez a4, ._main_loop_short_matrix_length
             l8ui        a15,  a7,  0                    // Load 8 bits from src_buff a7 to a15, offset 0
             l8ui        a14,  a7,  1                    // Load 8 bits from src_buff a7 to a14, offset 1
             s8i         a15,  a3,  0                    // Save 8 bits from a15 to dest_buff a3, offset 0
             s8i         a14,  a3,  1                    // Save 8 bits from a14 to dest_buff a3, offset 1
-            addi.n      a7,   a7,  2                    // Increment src_buff pointer a7 by 1
+            addi.n      a7,   a7,  2                    // Increment src_buff pointer a7 by 2
             addi.n      a3,   a3,  2                    // Increment dest_buff pointer a3 by 2
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,  a10,  ._main_loop_short_matrix_length_done // Check end of the main loop, branch if dest_buf (a3) lower than a10
+#endif
         ._main_loop_short_matrix_length:
 
         // Finish remaining byte out of the main loop
@@ -262,3 +307,10 @@ lv_rgb565_blend_normal_to_rgb565_esp:
 
     movi.n   a2, 1                                      // Return LV_RESULT_OK = 1
     retw.n                                              // Return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_pie.S
similarity index 97%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_pie.S
index 66de392f6..29d74f24b 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_xtensa_pie.S
@@ -43,6 +43,8 @@ lv_rgb565_blend_normal_to_rgb565_esp:
     movi.n   a10,   0xf                         // 0xf alignment mask (16-byte alignment)
     slli     a11,   a4,    1                    // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
 
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
     // No need to convert any colors here, we are copying from rgb565 to rgb565
 
     // Check dest_w length
@@ -370,3 +372,10 @@ lv_rgb565_blend_normal_to_rgb565_esp:
 
     movi.n   a2, 1                                          // Return LV_RESULT_OK = 1
     retw.n                                                  // Return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_base.S
similarity index 83%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_base.S
index f35175fe8..b66306685 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_base.S
@@ -4,9 +4,10 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <xtensa/config/core-isa.h>
 #include "lv_macro_memcpy.S"        // Memcpy macros
 
-// This is LVGL RGB888 image blend to RGB888 for ESP32 processor
+// This is LVGL RGB888 image blend to RGB888 for ESP32, ESP32S2 processor
 
     .section .text
     .align  4
@@ -43,6 +44,8 @@ lv_rgb888_blend_normal_to_rgb888_esp:
     slli     a11,   a4,    1                    // a11 = (a4 << 1) + a4
     add      a11,   a11,   a4                   // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w
 
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
     // No need to convert any colors here, we are copying from rgb888 to rgb888
 
     // Check dest_w length
@@ -74,8 +77,16 @@ lv_rgb888_blend_normal_to_rgb888_esp:
 
     .outer_loop_align:
 
+#if XCHAL_HAVE_LOOPS
+        loopnez a9,   ._main_loop_aligned               // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz    a9,   ._main_loop_aligned               // Branch to the end, if a9 is 0 (no need to run the main loop)
+        slli    a10,  a9,   4                           // a10 = loop_len (a9) * 16 (main loop increments address pointers by 16)
+        add     a10,  a10,  a3                          // a10 += dest_buf address
+        ._main_loop_aligned_done:
+#endif
         // Run main loop which copies 16 bytes (5 and 1/3 of RGB888 pixels) in one loop run
-        loopnez a9, ._main_loop_aligned
             l32i.n      a15,  a7,  0                    // Load 32 bits from src_buff a7 to a15, offset 0
             l32i.n      a14,  a7,  4                    // Load 32 bits from src_buff a7 to a14, offset 4
             l32i.n      a13,  a7,  8                    // Load 32 bits from src_buff a7 to a13, offset 8
@@ -86,6 +97,9 @@ lv_rgb888_blend_normal_to_rgb888_esp:
             s32i.n      a12,  a3,  12                   // Save 32 bits from a15 to dest_buff a3, offset 12
             addi.n      a7,   a7,  16                   // Increment src_buff pointer a7 by 16
             addi.n      a3,   a3,  16                   // Increment dest_buff pointer a3 by 16
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,  a10,  ._main_loop_aligned_done     // Check end of the main loop, branch if dest_buf (a3) lower than a10
+#endif
         ._main_loop_aligned:
 
         // Finish the remaining bytes out of the main loop
@@ -143,12 +157,22 @@ lv_rgb888_blend_normal_to_rgb888_esp:
         sub         a10,  a11,  a14                     // Get the dest_w_bytes after the aligning loop
         srli        a9,   a10,  4                       // Calculate main loop len (a9 = dest_w_bytes_local / 16)
 
+#if XCHAL_HAVE_LOOPS
+        loopnez  a14,  ._dest_aligning_loop             // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz     a14,  ._dest_aligning_loop             // Branch to the end, if a14 is 0 (no need to run the main loop)
+        add      a14,  a14,  a3                         // loop_len = loop_len + dest_buf (a3)
+        ._dest_aligning_loop_done:
+#endif
         // Run dest_buff aligning loop byte by byte
-        loopnez a14, ._dest_aligning_loop
             l8ui        a15,  a7,  0                    // Load 8 bits from src_buff a7 to a15, offset 0
             addi.n      a7,   a7,  1                    // Increment src_buff pointer a7 by 1
             s8i         a15,  a3,  0                    // Save 8 bits from a15 to dest_buff a3, offset 0
             addi.n      a3,   a3,  1                    // Increment dest_buff pointer a3 by 1
+#if     !XCHAL_HAVE_LOOPS
+        blt      a3,  a14,  ._dest_aligning_loop_done   // Check end of the main loop, branch if dest_buf (a3) lower than a14
+#endif
         ._dest_aligning_loop:
 
         // Destination is aligned, source is unaligned
@@ -163,8 +187,16 @@ lv_rgb888_blend_normal_to_rgb888_esp:
         // First preload for the loopnez cycle
         l32i.n      a15,  a7,  0                        // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 0
 
+#if XCHAL_HAVE_LOOPS
+        loopnez  a9,  ._main_loop_unalign               // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz     a9,  ._main_loop_unalign               // Branch to the end, if a9 is 0 (no need to run the main loop)
+        slli     a9,  a9,  4                            // loop_len (a9) *= 16 (main loop increments address pointers by 16)
+        add      a9,  a9,  a3                           // a9 += dest_buff address
+        ._main_loop_unalign_done:
+#endif
         // Run main loop which copies 16 bytes (5 and 1/3 of RGB888 pixels) in one loop run
-        loopnez a9, ._main_loop_unalign
             l32i.n      a14,  a7,   4                   // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
             l32i.n      a13,  a7,   8                   // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8
             src         a15,  a14,  a15                 // Concatenate a14 and a15 and shift by SAR_BYTE amount to a15
@@ -179,6 +211,9 @@ lv_rgb888_blend_normal_to_rgb888_esp:
             src         a12,  a15,  a12                 // Concatenate a15 and a12 and shift by SAR_BYTE amount to a12
             s32i.n      a12,  a3,   12                  // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 12
             addi.n      a3,   a3,   16                  // Increment dest_buff pointer a3 by 16
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,  a9,  ._main_loop_unalign_done      // Check end of the main loop, branch if dest_buf (a3) lower than a9
+#endif
         ._main_loop_unalign:
 
         // Finish the remaining bytes out of the loop
@@ -237,11 +272,19 @@ lv_rgb888_blend_normal_to_rgb888_esp:
     // Convert strides to matrix paddings
     sub     a6,  a6,  a11                               // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
     sub     a8,  a8,  a11                               // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+    movi.n  a12, 3                                      // a12 = 3 (pointer increment in main loop for esp32s2)
 
     .outer_loop_short_matrix_length:
-
+#if XCHAL_HAVE_LOOPS
+        loopnez  a4,   ._main_loop_short_matrix_length  // zero-overhead loop (not supported in esp32s2)
+#else
+        // Init loop parameters
+        beqz     a4,   ._main_loop_short_matrix_length  // Branch to the end, if a4 is 0 (no need to run the main loop)
+        mul16u   a10,  a4,  a12                         // a10 = loop_len (a4) * 3 (main loop increments address pointers by 3)
+        add      a10,  a10, a3                          // a10 += dest_buf address
+        ._main_loop_short_matrix_length_done:
+#endif
         // Run main loop which copies 3 bytes (one RGB888 pixel) in one loop run
-        loopnez a4, ._main_loop_short_matrix_length
             l8ui        a15,  a7,  0                    // Load 8 bits from src_buff a7 to a15, offset 0
             l8ui        a14,  a7,  1                    // Load 8 bits from src_buff a7 to a14, offset 1
             l8ui        a13,  a7,  2                    // Load 8 bits from src_buff a7 to a13, offset 2
@@ -250,6 +293,9 @@ lv_rgb888_blend_normal_to_rgb888_esp:
             s8i         a13,  a3,  2                    // Save 8 bits from a13 to dest_buff a3, offset 2
             addi.n      a7,   a7,  3                    // Increment src_buff pointer a7 by 3
             addi.n      a3,   a3,  3                    // Increment dest_buff pointer a3 by 3
+#if !XCHAL_HAVE_LOOPS
+        blt     a3,  a10,  ._main_loop_short_matrix_length_done // Check end of the main loop, branch if dest_buf (a3) lower than a10
+#endif
         ._main_loop_short_matrix_length:
 
         add     a3,  a3,  a6                            // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
@@ -259,3 +305,10 @@ lv_rgb888_blend_normal_to_rgb888_esp:
 
     movi.n   a2, 1                                      // Return LV_RESULT_OK = 1
     retw.n                                              // Return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_pie.S
similarity index 95%
rename from components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32s3.S
rename to components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_pie.S
index cb31100ff..12c724bf2 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_xtensa_pie.S
@@ -44,10 +44,12 @@ lv_rgb888_blend_normal_to_rgb888_esp:
     slli     a11,   a4,    1                    // a11 = (a4 << 1) + a4
     add      a11,   a11,   a4                   // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w
 
+    beqz     a4,   _zero_matrix_len_check       // Check if dest_w a4 is zero
+    beqz     a5,   _zero_matrix_len_check       // Check if dest_h a5 is zero
     // No need to convert any colors here, we are copying from rgb888 to rgb888
 
     // Check dest_w length
-    bltui   a4,  8,  _matrix_width_check                    // Branch if dest_w (a4) is lower than 8
+    bltui   a4,  8,  _matrix_width_check        // Branch if dest_w (a4) is lower than 8
 
 //**********************************************************************************************************************
 
@@ -219,3 +221,10 @@ lv_rgb888_blend_normal_to_rgb888_esp:
 
     movi.n   a2, 1                                          // Return LV_RESULT_OK = 1
     retw.n                                                  // Return
+
+//**********************************************************************************************************************
+
+    // One of the matrix dimensions is zero, return early
+    _zero_matrix_len_check:
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
index 20c061ff3..7b01d3a8b 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
+++ b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
@@ -1,18 +1,18 @@
 # Include SIMD assembly source code for rendering
-if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3)
+if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3 OR CONFIG_IDF_TARGET_ESP32S2)
     message(VERBOSE "Compiling SIMD")
     set(PORT_PATH "../../../src/lvgl9")
 
     if(CONFIG_IDF_TARGET_ESP32S3)
-        file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32s3.S)    # Select only esp32s3 related files
+        file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_xtensa_pie.S)       # Select Xtensa PIE, for esp32s3 target
     else()
-        file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32.S)      # Select only esp32 related files
+        file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_xtensa_base.S)      # Select Xtensa Base for esp32, esp32s2 targets
     endif()
 
     file(GLOB_RECURSE ASM_MACROS ${PORT_PATH}/simd/lv_macro_*.S)        # Explicitly add all assembler macro files
 
 else()
-    message(WARNING "This test app is intended only for esp32 and esp32s3")
+    message(WARNING "This test app is intended only for Xtensa targets (esp32, esp32s2, esp32s3)")
 endif()
 
 # Hard copy of LV files
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
index 5c9a53c20..4bc339d54 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
@@ -30,7 +30,7 @@ typedef struct {
     unsigned int unalign_step;              // Increment step in bytes unalignment of the test array
     unsigned int dest_stride_step;          // Increment step in destination stride of the test array
     unsigned int test_combinations_count;   // Count of fest combinations
-} test_matrix_params_t;
+} test_matrix_lv_fill_params_t;
 
 /**
  * @brief Functionality test case parameters
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c
index 600b8eecc..bb915ffe2 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c
@@ -17,9 +17,10 @@
 #include "lv_draw_sw_blend_to_rgb565.h"
 #include "lv_draw_sw_blend_to_rgb888.h"
 
-#define WIDTH 128
-#define HEIGHT 128
-#define STRIDE WIDTH
+#define COMMON_DIM 128      // Common matrix dimension 128x128 pixels
+#define WIDTH COMMON_DIM
+#define HEIGHT COMMON_DIM
+#define STRIDE COMMON_DIM
 #define UNALIGN_BYTES 1
 #define BENCHMARK_CYCLES 1000
 
@@ -79,7 +80,7 @@ TEST_CASE("LV Fill benchmark ARGB8888", "[fill][benchmark][ARGB8888]")
         .height = HEIGHT,
         .width = WIDTH,
         .stride = STRIDE * sizeof(uint32_t),
-        .cc_height = HEIGHT - 1,
+        .cc_height = HEIGHT,
         .cc_width = WIDTH - 1,
         .benchmark_cycles = BENCHMARK_CYCLES,
         .array_align16 = (void *)dest_array_align16,
@@ -87,7 +88,7 @@ TEST_CASE("LV Fill benchmark ARGB8888", "[fill][benchmark][ARGB8888]")
         .blend_api_func = &lv_draw_sw_blend_color_to_argb8888,
     };
 
-    ESP_LOGI(TAG_LV_FILL_BENCH, "running test for ARGB8888 color format");
+    ESP_LOGI(TAG_LV_FILL_BENCH, "running memset for ARGB8888 to ARGB8888 color format");
     lv_fill_benchmark_init(&test_params);
     free(dest_array_align16);
 }
@@ -104,7 +105,7 @@ TEST_CASE("LV Fill benchmark RGB565", "[fill][benchmark][RGB565]")
         .height = HEIGHT,
         .width = WIDTH,
         .stride = STRIDE * sizeof(uint16_t),
-        .cc_height = HEIGHT - 1,
+        .cc_height = HEIGHT,
         .cc_width = WIDTH - 1,
         .benchmark_cycles = BENCHMARK_CYCLES,
         .array_align16 = (void *)dest_array_align16,
@@ -112,7 +113,7 @@ TEST_CASE("LV Fill benchmark RGB565", "[fill][benchmark][RGB565]")
         .blend_api_func = &lv_draw_sw_blend_color_to_rgb565,
     };
 
-    ESP_LOGI(TAG_LV_FILL_BENCH, "running test for RGB565 color format");
+    ESP_LOGI(TAG_LV_FILL_BENCH, "running memset for RGB565 to RGB565 color format");
     lv_fill_benchmark_init(&test_params);
     free(dest_array_align16);
 }
@@ -129,7 +130,7 @@ TEST_CASE("LV Fill benchmark RGB888", "[fill][benchmark][RGB888]")
         .height = HEIGHT,
         .width = WIDTH,
         .stride = STRIDE * 3,
-        .cc_height = HEIGHT - 1,
+        .cc_height = HEIGHT,
         .cc_width = WIDTH - 1,
         .benchmark_cycles = BENCHMARK_CYCLES,
         .array_align16 = (void *)dest_array_align16,
@@ -137,7 +138,7 @@ TEST_CASE("LV Fill benchmark RGB888", "[fill][benchmark][RGB888]")
         .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888,
     };
 
-    ESP_LOGI(TAG_LV_FILL_BENCH, "running test for RGB888 color format");
+    ESP_LOGI(TAG_LV_FILL_BENCH, "running memset for RGB888 to RGB888 color format");
     lv_fill_benchmark_init(&test_params);
     free(dest_array_align16);
 }
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
index 958eaae91..3256a6273 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
@@ -40,6 +40,28 @@ static lv_color_t test_color = {
     .red = 0x12,
 };
 
+static const test_matrix_lv_fill_params_t default_test_matrix_lv_fill = {
+#if CONFIG_IDF_TARGET_ESP32S3
+    .min_w = 8,
+    .min_h = 1,
+    .max_w = 40,
+    .max_h = 4,
+    .max_unalign_byte = 16,       // Use 16-byte boundary check for Xtensa PIE
+    .unalign_step = 1,
+    .dest_stride_step = 1,
+#else
+    .min_w = 1,
+    .min_h = 1,
+    .max_w = 32,
+    .max_h = 4,
+    .max_unalign_byte = 4,       // Use 4-byte boundary check for Xtensa base
+    .unalign_step = 1,
+    .dest_stride_step = 1,
+#endif
+    .min_unalign_byte = 0,
+    .test_combinations_count = 0,
+};
+
 // ------------------------------------------------ Static function headers --------------------------------------------
 
 /**
@@ -50,7 +72,7 @@ static lv_color_t test_color = {
  * @param[in] test_matrix Pointer to structure defining test matrix - all the test combinations
  * @param[in] test_case Pointer to structure defining functionality test case
  */
-static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case);
+static void functionality_test_matrix(test_matrix_lv_fill_params_t *test_matrix, func_test_case_params_t *test_case);
 
 /**
  * @brief Fill test buffers for functionality test
@@ -107,19 +129,12 @@ Functionality tests
 
 // ------------------------------------------------ Test cases stages --------------------------------------------------
 
-TEST_CASE("Test fill functionality ARGB8888", "[fill][functionality][ARGB8888]")
+TEST_CASE("LV Fill functionality ARGB8888", "[fill][functionality][ARGB8888]")
 {
-    test_matrix_params_t test_matrix = {
-        .min_w = 8,             // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
-        .min_h = 1,
-        .max_w = 16,
-        .max_h = 16,
-        .min_unalign_byte = 0,
-        .max_unalign_byte = 16,
-        .unalign_step = 1,
-        .dest_stride_step = 1,
-        .test_combinations_count = 0,
-    };
+    test_matrix_lv_fill_params_t test_matrix = default_test_matrix_lv_fill;
+#if (CONFIG_IDF_TARGET_ESP32S3)
+    test_matrix.min_w = 8;  // 8 is the lower limit for the PIE asm implementation, otherwise base asm is executed
+#endif
 
     func_test_case_params_t test_case = {
         .blend_api_func = &lv_draw_sw_blend_color_to_argb8888,
@@ -127,23 +142,17 @@ TEST_CASE("Test fill functionality ARGB8888", "[fill][functionality][ARGB8888]")
         .data_type_size = sizeof(uint32_t),
     };
 
-    ESP_LOGI(TAG_LV_FILL_FUNC, "running test for ARGB8888 color format");
+    ESP_LOGI(TAG_LV_FILL_FUNC, "running memset for ARGB8888 to ARGB8888 color format");
+    ESP_LOGI(TAG_LV_FILL_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h);
     functionality_test_matrix(&test_matrix, &test_case);
 }
 
-TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]")
+TEST_CASE("LV Fill functionality RGB565", "[fill][functionality][RGB565]")
 {
-    test_matrix_params_t test_matrix = {
-        .min_w = 16,            // 16 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
-        .min_h = 1,
-        .max_w = 32,
-        .max_h = 16,
-        .min_unalign_byte = 0,
-        .max_unalign_byte = 16,
-        .unalign_step = 1,
-        .dest_stride_step = 1,
-        .test_combinations_count = 0,
-    };
+    test_matrix_lv_fill_params_t test_matrix = default_test_matrix_lv_fill;
+#if (CONFIG_IDF_TARGET_ESP32S3)
+    test_matrix.min_w = 16;  // 16 is the lower limit for the PIE asm implementation, otherwise base asm is executed
+#endif
 
     func_test_case_params_t test_case = {
         .blend_api_func = &lv_draw_sw_blend_color_to_rgb565,
@@ -151,23 +160,17 @@ TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]")
         .data_type_size = sizeof(uint16_t),
     };
 
-    ESP_LOGI(TAG_LV_FILL_FUNC, "running test for RGB565 color format");
+    ESP_LOGI(TAG_LV_FILL_FUNC, "running memset for RGB565 to RGB565 color format");
+    ESP_LOGI(TAG_LV_FILL_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h);
     functionality_test_matrix(&test_matrix, &test_case);
 }
 
-TEST_CASE("Test fill functionality RGB888", "[fill][functionality][RGB888]")
+TEST_CASE("LV Fill functionality RGB888", "[fill][functionality][RGB888]")
 {
-    test_matrix_params_t test_matrix = {
-        .min_w = 12,             // 12 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
-        .min_h = 1,
-        .max_w = 32,
-        .max_h = 3,
-        .min_unalign_byte = 0,
-        .max_unalign_byte = 16,
-        .unalign_step = 1,
-        .dest_stride_step = 1,
-        .test_combinations_count = 0,
-    };
+    test_matrix_lv_fill_params_t test_matrix = default_test_matrix_lv_fill;
+#if (CONFIG_IDF_TARGET_ESP32S3)
+    test_matrix.min_w = 12;  // 12 is the lower limit for the PIE asm implementation, otherwise base asm is executed
+#endif
 
     func_test_case_params_t test_case = {
         .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888,
@@ -175,12 +178,13 @@ TEST_CASE("Test fill functionality RGB888", "[fill][functionality][RGB888]")
         .data_type_size = sizeof(uint8_t) * 3,   // 24-bit data length
     };
 
-    ESP_LOGI(TAG_LV_FILL_FUNC, "running test for RGB888 color format");
+    ESP_LOGI(TAG_LV_FILL_FUNC, "running memset for RGB888 to RGB888 color format");
+    ESP_LOGI(TAG_LV_FILL_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h);
     functionality_test_matrix(&test_matrix, &test_case);
 }
 // ------------------------------------------------ Static test functions ----------------------------------------------
 
-static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case)
+static void functionality_test_matrix(test_matrix_lv_fill_params_t *test_matrix, func_test_case_params_t *test_case)
 {
     // Step destination array width
     for (int dest_w = test_matrix->min_w; dest_w <= test_matrix->max_w; dest_w++) {
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
index ba8d44501..c5408ae79 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
@@ -105,7 +105,7 @@ TEST_CASE("LV Image benchmark RGB565 blend to RGB565", "[image][benchmark][RGB56
         .color_format = LV_COLOR_FORMAT_RGB565,
     };
 
-    ESP_LOGI(TAG_LV_IMAGE_BENCH, "running test for RGB565 color format");
+    ESP_LOGI(TAG_LV_IMAGE_BENCH, "running memcpy for RGB565 to RGB565 color format");
     lv_image_benchmark_init(&test_params);
     free(dest_array_align16);
     free(src_array_align16);
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c
index 5f5b06680..1ae58b84f 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c
@@ -40,7 +40,7 @@ static char test_msg_buf[200];
 
 static const test_matrix_lv_image_params_t default_test_matrix_image_blend = {
 #if CONFIG_IDF_TARGET_ESP32S3
-    .min_w = 8,                   // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
+    .min_w = 8,                   // 8 is the lower limit for the PIE asm implementation, otherwise base assembly is executed
     .min_h = 1,
     .max_w = 24,
     .max_h = 2,
@@ -53,7 +53,7 @@ static const test_matrix_lv_image_params_t default_test_matrix_image_blend = {
 #else
     .min_w = 1,
     .min_h = 1,
-    .max_w = 16,
+    .max_w = 12,
     .max_h = 2,
     .src_max_unalign_byte = 4,    // Use 4-byte boundary  check for Xtensa base
     .dest_max_unalign_byte = 4,
@@ -142,7 +142,8 @@ TEST_CASE("LV Image functionality RGB565 blend to RGB565", "[image][functionalit
         .operation_type = OPERATION_FILL,
     };
 
-    ESP_LOGI(TAG_LV_IMAGE_FUNC, "running test for RGB565 color format");
+    ESP_LOGI(TAG_LV_IMAGE_FUNC, "running memcpy for RGB565 to RGB565 color format");
+    ESP_LOGI(TAG_LV_IMAGE_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h);
     functionality_test_matrix(&test_matrix, &test_case);
 }
 
@@ -160,7 +161,8 @@ TEST_CASE("LV Image functionality RGB888 blend to RGB888", "[image][functionalit
         .operation_type = OPERATION_FILL,
     };
 
-    ESP_LOGI(TAG_LV_IMAGE_FUNC, "running test for RGB888 color format");
+    ESP_LOGI(TAG_LV_IMAGE_FUNC, "running memcpy for RGB888 to RGB888 color format");
+    ESP_LOGI(TAG_LV_IMAGE_FUNC, "test matrices dimensions: %dx%d to %dx%d", test_matrix.min_w, test_matrix.min_h, test_matrix.max_w, test_matrix.max_h);
     functionality_test_matrix(&test_matrix, &test_case);
 }
 
@@ -186,6 +188,7 @@ static void functionality_test_matrix(test_matrix_lv_image_params_t *test_matrix
                         // Step destination array unalignment
                         for (int dest_unalign_byte = test_matrix->dest_min_unalign_byte; dest_unalign_byte <= test_matrix->dest_max_unalign_byte; dest_unalign_byte += test_matrix->dest_unalign_step) {
 
+                            //printf("dest_w = %d, dest_h = %d, src_stride = %d, dest_stride = %d, src_unalign_byte = %d, dest_unalign_byte = %d\n", dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte);
                             // Call functionality test
                             UPDATE_TEST_CASE(test_case, dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte);
                             lv_image_functionality(test_case);