@@ -877,66 +877,73 @@ HUB75_IRAM void ParlioDma::draw_pixels(uint16_t x, uint16_t y, uint16_t w, uint1
877877 h = rotated_height - y;
878878 }
879879
880- // Process each pixel based on format
880+ // Pre-compute pixel stride for pointer arithmetic (avoids multiply per pixel)
881+ const size_t pixel_stride = (format == Hub75PixelFormat::RGB888) ? 3
882+ : (format == Hub75PixelFormat::RGB565) ? 2
883+ : /* RGB888_32 */ 4 ;
884+
885+ // Check if we can use identity fast path (no coordinate transforms needed)
886+ const bool identity_transform = (rotation_ == Hub75Rotation::ROTATE_0) && !needs_layout_remap_ && !needs_scan_remap_;
887+
888+ // Process each pixel
889+ const uint8_t *pixel_ptr = buffer;
881890 for (uint16_t dy = 0 ; dy < h; dy++) {
882891 for (uint16_t dx = 0 ; dx < w; dx++) {
883892 uint16_t px = x + dx;
884893 uint16_t py = y + dy;
885- const size_t pixel_idx = (dy * w) + dx;
886-
887- // Coordinate transformation pipeline (rotation + layout + scan remapping)
888- auto transformed = transform_coordinate (px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
889- scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
890- virtual_width_, virtual_height_, dma_width_, num_rows_);
891- px = transformed.x ;
892- const uint16_t row = transformed.row ;
893- const bool is_lower = transformed.is_lower ;
894+ uint16_t row;
895+ bool is_lower;
896+
897+ // Fast path: identity transform (no rotation, standard layout, standard scan)
898+ if (identity_transform) {
899+ // Simple row/half calculation without modulo (subtraction is cheaper)
900+ if (py < num_rows_) {
901+ row = py;
902+ is_lower = false ;
903+ } else {
904+ row = py - num_rows_;
905+ is_lower = true ;
906+ }
907+ } else {
908+ // Full coordinate transformation pipeline
909+ auto transformed = transform_coordinate (px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
910+ scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
911+ virtual_width_, virtual_height_, dma_width_, num_rows_);
912+ px = transformed.x ;
913+ row = transformed.row ;
914+ is_lower = transformed.is_lower ;
915+ }
894916
917+ // Extract RGB888 from pixel format (always_inline will inline the switch)
895918 uint8_t r8 = 0 , g8 = 0 , b8 = 0 ;
896-
897- // Extract RGB888 from pixel format
898- extract_rgb888_from_format (buffer, pixel_idx, format, color_order, big_endian, r8, g8, b8);
919+ extract_rgb888_from_format (pixel_ptr, 0 , format, color_order, big_endian, r8, g8, b8);
920+ pixel_ptr += pixel_stride;
899921
900922 // Apply LUT correction
901923 const uint16_t r_corrected = lut_[r8];
902924 const uint16_t g_corrected = lut_[g8];
903925 const uint16_t b_corrected = lut_[b8];
904926
905- // Update all bit planes for this pixel
927+ // Pre-compute base index for this row's bit planes
928+ const int row_base_idx = row * bit_depth_;
929+
930+ // Branchless bit-plane update using shift+and
906931 // PARLIO bit layout: [CLK_GATE(15)|ADDR(14-11)|--|LAT(9)|OE(8)|--|--|R2(4)|R1(5)|G2(2)|G1(3)|B2(0)|B1(1)]
907- // Based on pin mapping in configure_parlio:
908- // data_pins[0] = B2, [1] = B1, [2] = G2, [3] = G1, [4] = R2, [5] = R1
909932 for (int bit = 0 ; bit < bit_depth_; bit++) {
910- int idx = (row * bit_depth_) + bit;
911- BitPlaneBuffer &bp = target_buffers[idx];
912- uint16_t *buf = bp.data ;
933+ BitPlaneBuffer &bp = target_buffers[row_base_idx + bit];
913934
914- const uint16_t mask = (1 << bit);
915- uint16_t word = buf[px]; // Read existing word (preserves control bits)
935+ // Extract single bits (0 or 1) without branches using shift+and
936+ const uint16_t r_bit = (r_corrected >> bit) & 1 ;
937+ const uint16_t g_bit = (g_corrected >> bit) & 1 ;
938+ const uint16_t b_bit = (b_corrected >> bit) & 1 ;
916939
917- // Clear and update RGB bits for appropriate half
918- // IMPORTANT: Only modify RGB bits (0-5), preserve control bits (8-15)
940+ uint16_t word = bp.data [px];
919941 if (is_lower) {
920- // Lower half: R2, G2, B2
921- word &= ~RGB_LOWER_MASK;
922- if (r_corrected & mask)
923- word |= (1 << R2_BIT);
924- if (g_corrected & mask)
925- word |= (1 << G2_BIT);
926- if (b_corrected & mask)
927- word |= (1 << B2_BIT);
942+ word = (word & ~RGB_LOWER_MASK) | (r_bit << R2_BIT) | (g_bit << G2_BIT) | (b_bit << B2_BIT);
928943 } else {
929- // Upper half: R1, G1, B1
930- word &= ~RGB_UPPER_MASK;
931- if (r_corrected & mask)
932- word |= (1 << R1_BIT);
933- if (g_corrected & mask)
934- word |= (1 << G1_BIT);
935- if (b_corrected & mask)
936- word |= (1 << B1_BIT);
944+ word = (word & ~RGB_UPPER_MASK) | (r_bit << R1_BIT) | (g_bit << G1_BIT) | (b_bit << B1_BIT);
937945 }
938-
939- buf[px] = word;
946+ bp.data [px] = word;
940947 }
941948 }
942949 }
@@ -1019,24 +1026,40 @@ HUB75_IRAM void ParlioDma::fill(uint16_t x, uint16_t y, uint16_t w, uint16_t h,
10191026 ((b_corrected & mask) ? (1 << B2_BIT) : 0 );
10201027 }
10211028
1022- // Fill loop - coordinate transforms still needed per-pixel
1029+ // Check if we can use identity fast path (no coordinate transforms needed)
1030+ const bool identity_transform = (rotation_ == Hub75Rotation::ROTATE_0) && !needs_layout_remap_ && !needs_scan_remap_;
1031+
1032+ // Fill loop
10231033 for (uint16_t dy = 0 ; dy < h; dy++) {
10241034 for (uint16_t dx = 0 ; dx < w; dx++) {
10251035 uint16_t px = x + dx;
10261036 uint16_t py = y + dy;
1027-
1028- // Coordinate transformation pipeline (rotation + layout + scan remapping)
1029- auto transformed = transform_coordinate (px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
1030- scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
1031- virtual_width_, virtual_height_, dma_width_, num_rows_);
1032- px = transformed.x ;
1033- const uint16_t row = transformed.row ;
1034- const bool is_lower = transformed.is_lower ;
1037+ uint16_t row;
1038+ bool is_lower;
1039+
1040+ // Fast path: identity transform (no rotation, standard layout, standard scan)
1041+ if (identity_transform) {
1042+ if (py < num_rows_) {
1043+ row = py;
1044+ is_lower = false ;
1045+ } else {
1046+ row = py - num_rows_;
1047+ is_lower = true ;
1048+ }
1049+ } else {
1050+ // Full coordinate transformation pipeline
1051+ auto transformed = transform_coordinate (px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
1052+ scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
1053+ virtual_width_, virtual_height_, dma_width_, num_rows_);
1054+ px = transformed.x ;
1055+ row = transformed.row ;
1056+ is_lower = transformed.is_lower ;
1057+ }
10351058
10361059 // Update all bit planes using pre-computed patterns
1060+ const int row_base_idx = row * bit_depth_;
10371061 for (int bit = 0 ; bit < bit_depth_; bit++) {
1038- int idx = (row * bit_depth_) + bit;
1039- BitPlaneBuffer &bp = target_buffers[idx];
1062+ BitPlaneBuffer &bp = target_buffers[row_base_idx + bit];
10401063 uint16_t word = bp.data [px]; // Read existing word (preserves control bits)
10411064
10421065 if (is_lower) {
0 commit comments