Skip to content

Commit d6c8520

Browse files
Apply same performance optimizations to PARLIO (ESP32-P4/C6)
Port the I2S/GDMA optimizations to the PARLIO platform: (1) branchless bit-plane update using shift+and; (2) identity-transform fast path (skips the full transform pipeline); (3) pixel pointer stride (avoids a multiply per pixel); (4) pre-computed row_base_idx (avoids a multiply per bit plane). Note: PARLIO uses a different buffer structure (a BitPlaneBuffer array indexed by row*bit_depth+bit), but the same optimization principles apply.
1 parent 210b025 commit d6c8520

File tree

1 file changed

+75
-52
lines changed

1 file changed

+75
-52
lines changed

components/hub75/src/platforms/parlio/parlio_dma.cpp

Lines changed: 75 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -877,66 +877,73 @@ HUB75_IRAM void ParlioDma::draw_pixels(uint16_t x, uint16_t y, uint16_t w, uint1
877877
h = rotated_height - y;
878878
}
879879

880-
// Process each pixel based on format
880+
// Pre-compute pixel stride for pointer arithmetic (avoids multiply per pixel)
881+
const size_t pixel_stride = (format == Hub75PixelFormat::RGB888) ? 3
882+
: (format == Hub75PixelFormat::RGB565) ? 2
883+
: /* RGB888_32 */ 4;
884+
885+
// Check if we can use identity fast path (no coordinate transforms needed)
886+
const bool identity_transform = (rotation_ == Hub75Rotation::ROTATE_0) && !needs_layout_remap_ && !needs_scan_remap_;
887+
888+
// Process each pixel
889+
const uint8_t *pixel_ptr = buffer;
881890
for (uint16_t dy = 0; dy < h; dy++) {
882891
for (uint16_t dx = 0; dx < w; dx++) {
883892
uint16_t px = x + dx;
884893
uint16_t py = y + dy;
885-
const size_t pixel_idx = (dy * w) + dx;
886-
887-
// Coordinate transformation pipeline (rotation + layout + scan remapping)
888-
auto transformed = transform_coordinate(px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
889-
scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
890-
virtual_width_, virtual_height_, dma_width_, num_rows_);
891-
px = transformed.x;
892-
const uint16_t row = transformed.row;
893-
const bool is_lower = transformed.is_lower;
894+
uint16_t row;
895+
bool is_lower;
896+
897+
// Fast path: identity transform (no rotation, standard layout, standard scan)
898+
if (identity_transform) {
899+
// Simple row/half calculation without modulo (subtraction is cheaper)
900+
if (py < num_rows_) {
901+
row = py;
902+
is_lower = false;
903+
} else {
904+
row = py - num_rows_;
905+
is_lower = true;
906+
}
907+
} else {
908+
// Full coordinate transformation pipeline
909+
auto transformed = transform_coordinate(px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
910+
scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
911+
virtual_width_, virtual_height_, dma_width_, num_rows_);
912+
px = transformed.x;
913+
row = transformed.row;
914+
is_lower = transformed.is_lower;
915+
}
894916

917+
// Extract RGB888 from pixel format (always_inline will inline the switch)
895918
uint8_t r8 = 0, g8 = 0, b8 = 0;
896-
897-
// Extract RGB888 from pixel format
898-
extract_rgb888_from_format(buffer, pixel_idx, format, color_order, big_endian, r8, g8, b8);
919+
extract_rgb888_from_format(pixel_ptr, 0, format, color_order, big_endian, r8, g8, b8);
920+
pixel_ptr += pixel_stride;
899921

900922
// Apply LUT correction
901923
const uint16_t r_corrected = lut_[r8];
902924
const uint16_t g_corrected = lut_[g8];
903925
const uint16_t b_corrected = lut_[b8];
904926

905-
// Update all bit planes for this pixel
927+
// Pre-compute base index for this row's bit planes
928+
const int row_base_idx = row * bit_depth_;
929+
930+
// Branchless bit-plane update using shift+and
906931
// PARLIO bit layout (MSB to LSB): [CLK_GATE(15)|ADDR(14-11)|--|LAT(9)|OE(8)|--|--|R1(5)|R2(4)|G1(3)|G2(2)|B1(1)|B2(0)]
907-
// Based on pin mapping in configure_parlio:
908-
// data_pins[0] = B2, [1] = B1, [2] = G2, [3] = G1, [4] = R2, [5] = R1
909932
for (int bit = 0; bit < bit_depth_; bit++) {
910-
int idx = (row * bit_depth_) + bit;
911-
BitPlaneBuffer &bp = target_buffers[idx];
912-
uint16_t *buf = bp.data;
933+
BitPlaneBuffer &bp = target_buffers[row_base_idx + bit];
913934

914-
const uint16_t mask = (1 << bit);
915-
uint16_t word = buf[px]; // Read existing word (preserves control bits)
935+
// Extract single bits (0 or 1) without branches using shift+and
936+
const uint16_t r_bit = (r_corrected >> bit) & 1;
937+
const uint16_t g_bit = (g_corrected >> bit) & 1;
938+
const uint16_t b_bit = (b_corrected >> bit) & 1;
916939

917-
// Clear and update RGB bits for appropriate half
918-
// IMPORTANT: Only modify RGB bits (0-5), preserve control bits (8-15)
940+
uint16_t word = bp.data[px];
919941
if (is_lower) {
920-
// Lower half: R2, G2, B2
921-
word &= ~RGB_LOWER_MASK;
922-
if (r_corrected & mask)
923-
word |= (1 << R2_BIT);
924-
if (g_corrected & mask)
925-
word |= (1 << G2_BIT);
926-
if (b_corrected & mask)
927-
word |= (1 << B2_BIT);
942+
word = (word & ~RGB_LOWER_MASK) | (r_bit << R2_BIT) | (g_bit << G2_BIT) | (b_bit << B2_BIT);
928943
} else {
929-
// Upper half: R1, G1, B1
930-
word &= ~RGB_UPPER_MASK;
931-
if (r_corrected & mask)
932-
word |= (1 << R1_BIT);
933-
if (g_corrected & mask)
934-
word |= (1 << G1_BIT);
935-
if (b_corrected & mask)
936-
word |= (1 << B1_BIT);
944+
word = (word & ~RGB_UPPER_MASK) | (r_bit << R1_BIT) | (g_bit << G1_BIT) | (b_bit << B1_BIT);
937945
}
938-
939-
buf[px] = word;
946+
bp.data[px] = word;
940947
}
941948
}
942949
}
@@ -1019,24 +1026,40 @@ HUB75_IRAM void ParlioDma::fill(uint16_t x, uint16_t y, uint16_t w, uint16_t h,
10191026
((b_corrected & mask) ? (1 << B2_BIT) : 0);
10201027
}
10211028

1022-
// Fill loop - coordinate transforms still needed per-pixel
1029+
// Check if we can use identity fast path (no coordinate transforms needed)
1030+
const bool identity_transform = (rotation_ == Hub75Rotation::ROTATE_0) && !needs_layout_remap_ && !needs_scan_remap_;
1031+
1032+
// Fill loop
10231033
for (uint16_t dy = 0; dy < h; dy++) {
10241034
for (uint16_t dx = 0; dx < w; dx++) {
10251035
uint16_t px = x + dx;
10261036
uint16_t py = y + dy;
1027-
1028-
// Coordinate transformation pipeline (rotation + layout + scan remapping)
1029-
auto transformed = transform_coordinate(px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
1030-
scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
1031-
virtual_width_, virtual_height_, dma_width_, num_rows_);
1032-
px = transformed.x;
1033-
const uint16_t row = transformed.row;
1034-
const bool is_lower = transformed.is_lower;
1037+
uint16_t row;
1038+
bool is_lower;
1039+
1040+
// Fast path: identity transform (no rotation, standard layout, standard scan)
1041+
if (identity_transform) {
1042+
if (py < num_rows_) {
1043+
row = py;
1044+
is_lower = false;
1045+
} else {
1046+
row = py - num_rows_;
1047+
is_lower = true;
1048+
}
1049+
} else {
1050+
// Full coordinate transformation pipeline
1051+
auto transformed = transform_coordinate(px, py, rotation_, needs_layout_remap_, needs_scan_remap_, layout_,
1052+
scan_wiring_, panel_width_, panel_height_, layout_rows_, layout_cols_,
1053+
virtual_width_, virtual_height_, dma_width_, num_rows_);
1054+
px = transformed.x;
1055+
row = transformed.row;
1056+
is_lower = transformed.is_lower;
1057+
}
10351058

10361059
// Update all bit planes using pre-computed patterns
1060+
const int row_base_idx = row * bit_depth_;
10371061
for (int bit = 0; bit < bit_depth_; bit++) {
1038-
int idx = (row * bit_depth_) + bit;
1039-
BitPlaneBuffer &bp = target_buffers[idx];
1062+
BitPlaneBuffer &bp = target_buffers[row_base_idx + bit];
10401063
uint16_t word = bp.data[px]; // Read existing word (preserves control bits)
10411064

10421065
if (is_lower) {

0 commit comments

Comments
 (0)