@@ -71,6 +71,10 @@ constexpr uint16_t RGB_MASK = RGB_UPPER_MASK | RGB_LOWER_MASK; // 0x003F
7171// Bit clear masks: AND a word with one of these to clear the named bit and keep all others
7272constexpr uint16_t OE_CLEAR_MASK = ~(1 << OE_BIT);
7373
74+ // Single-bit masks for BCM bit planes: BCM_BIT_MASKS[i] == (1 << i) for i in 0..11 only (table lookup replaces a shift per iteration)
75+ static constexpr uint16_t BCM_BIT_MASKS[12 ] = {0x0001 , 0x0002 , 0x0004 , 0x0008 , 0x0010 , 0x0020 ,
76+ 0x0040 , 0x0080 , 0x0100 , 0x0200 , 0x0400 , 0x0800 };
77+
7478// ESP32 I2S TX FIFO position adjustment
7579// In 16-bit parallel mode with tx_fifo_mod=1, the FIFO outputs 16-bit words in swapped pairs.
7680// The FIFO reads 32-bit words from memory and outputs them as two 16-bit chunks in reversed order.
@@ -902,61 +906,80 @@ HUB75_IRAM void I2sDma::draw_pixels(uint16_t x, uint16_t y, uint16_t w, uint16_t
902906 h = rotated_height - y;
903907 }
904908
905- // Process each pixel based on format
909+ // Pre-compute pixel stride for pointer arithmetic (avoids multiply per pixel)
910+ const size_t pixel_stride = (format == Hub75PixelFormat::RGB888) ? 3
911+ : (format == Hub75PixelFormat::RGB565) ? 2
912+ : /* RGB888_32 */ 4 ;
913+
914+ // Check if we can use identity fast path (no coordinate transforms needed)
915+ const bool identity_transform = (rotation_ == Hub75Rotation::ROTATE_0) && !needs_layout_remap_ && !needs_scan_remap_;
916+
917+ // Pre-compute bit plane stride (bytes between bit planes)
918+ const size_t bit_plane_stride = dma_width_ * 2 ;
919+
920+ // Process each pixel
921+ const uint8_t *pixel_ptr = buffer;
906922 for (uint16_t dy = 0 ; dy < h; dy++) {
907923 for (uint16_t dx = 0 ; dx < w; dx++) {
908924 uint16_t px = x + dx;
909925 uint16_t py = y + dy;
926+ uint16_t row;
927+ bool is_lower;
928+
929+ // Fast path: identity transform (no rotation, standard layout, standard scan)
930+ if (identity_transform) {
931+ // Simple row/half calculation without modulo (subtraction is cheaper)
932+ if (py < num_rows_) {
933+ row = py;
934+ is_lower = false ;
935+ } else {
936+ row = py - num_rows_;
937+ is_lower = true ;
938+ }
939+ px = fifo_adjust_x (px);
940+ } else {
941+ // Full coordinate transformation pipeline
942+ auto transformed = transform_coordinate (px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
943+ scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
944+ virtual_width_, virtual_height_, dma_width_, num_rows_);
945+ px = fifo_adjust_x (transformed.x );
946+ row = transformed.row ;
947+ is_lower = transformed.is_lower ;
948+ }
910949
911- // Coordinate transformation pipeline (rotation + layout + scan remapping)
912- auto transformed = transform_coordinate (px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
913- scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
914- virtual_width_, virtual_height_, dma_width_, num_rows_);
915- px = fifo_adjust_x (transformed.x );
916- const uint16_t row = transformed.row ;
917- const bool is_lower = transformed.is_lower ;
918-
919- const size_t pixel_idx = (dy * w) + dx;
920- uint8_t r8 = 0 , g8 = 0 , b8 = 0 ;
921-
922- // Extract RGB888 from pixel format
923- extract_rgb888_from_format (buffer, pixel_idx, format, color_order, big_endian, r8, g8, b8);
950+ // Extract RGB888 from pixel format (always_inline will inline the switch)
951+ uint8_t r8, g8, b8;
952+ extract_rgb888_from_format (pixel_ptr, 0 , format, color_order, big_endian, r8, g8, b8);
953+ pixel_ptr += pixel_stride;
924954
925955 // Apply LUT correction
926956 const uint16_t r_corrected = lut_[r8];
927957 const uint16_t g_corrected = lut_[g8];
928958 const uint16_t b_corrected = lut_[b8];
929959
930- // Update all bit planes for this pixel
960+ // Pre-compute bit patterns for all bit planes (eliminates 24 branches in bit loop)
961+ uint16_t upper_patterns[HUB75_BIT_DEPTH];
962+ uint16_t lower_patterns[HUB75_BIT_DEPTH];
931963 for (int bit = 0 ; bit < bit_depth_; bit++) {
932- uint16_t *buf = (uint16_t *) (target_buffers[row].data + (bit * dma_width_ * 2 ));
933-
934- const uint16_t mask = (1 << bit);
935- uint16_t word = buf[px]; // Read existing word (preserves control bits)
964+ const uint16_t mask = BCM_BIT_MASKS[bit];
965+ upper_patterns[bit] = ((r_corrected & mask) ? (1 << R1_BIT) : 0 ) | ((g_corrected & mask) ? (1 << G1_BIT) : 0 ) |
966+ ((b_corrected & mask) ? (1 << B1_BIT) : 0 );
967+ lower_patterns[bit] = ((r_corrected & mask) ? (1 << R2_BIT) : 0 ) | ((g_corrected & mask) ? (1 << G2_BIT) : 0 ) |
968+ ((b_corrected & mask) ? (1 << B2_BIT) : 0 );
969+ }
936970
937- // Clear and update RGB bits for appropriate half
938- // IMPORTANT: Only modify RGB bits (0-5), preserve control bits (6-12)
939- if (is_lower) {
940- // Lower half: R2, G2, B2
941- word &= ~RGB_LOWER_MASK;
942- if (r_corrected & mask)
943- word |= (1 << R2_BIT);
944- if (g_corrected & mask)
945- word |= (1 << G2_BIT);
946- if (b_corrected & mask)
947- word |= (1 << B2_BIT);
948- } else {
949- // Upper half: R1, G1, B1
950- word &= ~RGB_UPPER_MASK;
951- if (r_corrected & mask)
952- word |= (1 << R1_BIT);
953- if (g_corrected & mask)
954- word |= (1 << G1_BIT);
955- if (b_corrected & mask)
956- word |= (1 << B1_BIT);
971+ // Update all bit planes using pre-computed patterns (is_lower hoisted outside loop)
972+ uint8_t *base_ptr = target_buffers[row].data ;
973+ if (is_lower) {
974+ for (int bit = 0 ; bit < bit_depth_; bit++) {
975+ uint16_t *buf = (uint16_t *) (base_ptr + (bit * bit_plane_stride));
976+ buf[px] = (buf[px] & ~RGB_LOWER_MASK) | lower_patterns[bit];
977+ }
978+ } else {
979+ for (int bit = 0 ; bit < bit_depth_; bit++) {
980+ uint16_t *buf = (uint16_t *) (base_ptr + (bit * bit_plane_stride));
981+ buf[px] = (buf[px] & ~RGB_UPPER_MASK) | upper_patterns[bit];
957982 }
958-
959- buf[px] = word;
960983 }
961984 }
962985 }
0 commit comments