Skip to content

Commit bc4ce0a

Browse files
JPEG: Optimise DCT further
Larger test case goes from ~620 ms to ~370 ms
1 parent 5eeee49 commit bc4ce0a

File tree

1 file changed

+94
-53
lines changed

1 file changed

+94
-53
lines changed

MCGalaxy/util/Imaging/JpegDecoder.cs

Lines changed: 94 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ public override SimpleBitmap Decode(byte[] src) {
4343
SetBuffer(src);
4444
SimpleBitmap bmp = new SimpleBitmap();
4545

46-
ComputeIDCTFactors();
4746
ReadMarkers(src, bmp);
4847
return bmp;
4948
}
@@ -316,11 +315,6 @@ void DecodeMCUs(byte[] src, SimpleBitmap bmp) {
316315
float* output = stackalloc float[BLOCK_SIZE];
317316

318317
YCbCr[] colors = new YCbCr[mcu_w * mcu_h];
319-
for (int i = 0; i < colors.Length; i++)
320-
{
321-
colors[i].Cr = 128;
322-
colors[i].Cb = 128;
323-
}
324318

325319
for (int mcuY = 0; mcuY < mcus_y; mcuY++)
326320
for (int mcuX = 0; mcuX < mcus_x; mcuX++)
@@ -379,13 +373,19 @@ void DecodeMCUs(byte[] src, SimpleBitmap bmp) {
379373
int globalY = baseY + YY;
380374

381375
if (globalX < bmp.Width && globalY < bmp.Height) {
382-
float y = colors[j].Y;
376+
// Following standard code assumes JPEG DCT output is level shifted
377+
// float r = 1.40200f * (cr - 128) + y;
378+
// float g = -0.34414f * (cb - 128) - 0.71414f * (cr - 128) + y;
379+
// float b = 1.77200f * (cb - 128) + y;
380+
// Use a slightly different algorithm to avoid level shifting
381+
382+
float y = colors[j].Y + 128.0f;
383383
float cr = colors[j].Cr;
384384
float cb = colors[j].Cb;
385385

386-
float r = 1.40200f * (cr - 128) + y;
387-
float g = -0.34414f * (cb - 128) - 0.71414f * (cr - 128) + y;
388-
float b = 1.77200f * (cb - 128) + y;
386+
float r = 1.40200f * cr + y;
387+
float g = -0.34414f * cb - 0.71414f * cr + y;
388+
float b = 1.77200f * cb + y;
389389

390390
Pixel p = new Pixel(ByteClamp(r), ByteClamp(g), ByteClamp(b), 255);
391391
bmp.pixels[globalY * bmp.Width + globalX] = p;
@@ -437,52 +437,93 @@ void DecodeBlock(JpegComponent comp, byte[] src, int[] block) {
437437
}
438438
} while (idx < 64);
439439
}
440-
441-
float[] idct_factors;
442-
void ComputeIDCTFactors() {
443-
float[] factors = new float[64];
444-
445-
for (int xy = 0; xy < 8; xy++)
446-
{
447-
for (int uv = 0; uv < 8; uv++)
448-
{
449-
float cuv = uv == 0 ? 0.70710678f : 1.0f;
450-
float cosuv = (float)Math.Cos((2 * xy + 1) * uv * Math.PI / 16.0);
451-
452-
factors[(xy * 8) + uv] = cuv * cosuv;
453-
}
454-
}
455-
idct_factors = factors;
456-
}
457-
458-
void IDCT(int[] block, float* output) {
459-
float[] factors = idct_factors;
460-
float* tmp = stackalloc float[BLOCK_SAMPLES * BLOCK_SAMPLES];
461-
462-
for (int col = 0; col < BLOCK_SAMPLES; col++)
440+
441+
const float A1 = 0.98078528040f; // cos(1pi/16)
442+
const float A2 = 0.92387953251f; // cos(2pi/16)
443+
const float A3 = 0.83146961230f; // cos(3pi/16)
444+
const float A4 = 0.70710678118f; // cos(4pi/16)
445+
const float A5 = 0.55557023302f; // cos(5pi/16)
446+
const float A6 = 0.38268343236f; // cos(6pi/16)
447+
const float A7 = 0.19509032201f; // cos(7pi/16)
448+
449+
const float C1 = 0.98078528040f / 4.0f; // cos(1pi/16)
450+
const float C2 = 0.92387953251f / 4.0f; // cos(2pi/16)
451+
const float C3 = 0.83146961230f / 4.0f; // cos(3pi/16)
452+
const float C4 = 0.70710678118f / 4.0f; // cos(4pi/16)
453+
const float C5 = 0.55557023302f / 4.0f; // cos(5pi/16)
454+
const float C6 = 0.38268343236f / 4.0f; // cos(6pi/16)
455+
const float C7 = 0.19509032201f / 4.0f; // cos(7pi/16)
456+
457+
const int DCT_SIZE = 8;
458+
static void IDCT(int[] block, float* output) {
459+
float* tmp = stackalloc float[DCT_SIZE * DCT_SIZE];
460+
461+
for (int col = 0; col < DCT_SIZE; col++)
463462
{
464-
for (int y = 0; y < BLOCK_SAMPLES; y++)
465-
{
466-
float sum = 0.0f;
467-
for (int v = 0; v < BLOCK_SAMPLES; v++)
468-
{
469-
sum += block[v * 8 + col] * factors[(y * 8) + v];
470-
}
471-
tmp[y * 8 + col] = sum;
472-
}
463+
float B0 = block[0 * DCT_SIZE + col], B1 = block[1 * DCT_SIZE + col];
464+
float B2 = block[2 * DCT_SIZE + col], B3 = block[3 * DCT_SIZE + col];
465+
float B4 = block[4 * DCT_SIZE + col], B5 = block[5 * DCT_SIZE + col];
466+
float B6 = block[6 * DCT_SIZE + col], B7 = block[7 * DCT_SIZE + col];
467+
468+
/* Phase 1 */ float a4 = A4 * B0;
469+
/* Phase 5 */ float e4 = A4 * B4;
470+
/* Phase 3 */ float c2 = A2 * B2, c6 = A6 * B2;
471+
/* Phase 7 */ float g2 = A2 * B6, g6 = A6 * B6;
472+
/* Phase 2 */ float b1 = A1 * B1, b3 = A3 * B1, b5 = A5 * B1, b7 = A7 * B1;
473+
/* Phase 4 */ float d1 = A1 * B3, d3 = A3 * B3, d5 = A5 * B3, d7 = A7 * B3;
474+
/* Phase 6 */ float f1 = A1 * B5, f3 = A3 * B5, f5 = A5 * B5, f7 = A7 * B5;
475+
/* Phase 8 */ float h1 = A1 * B7, h3 = A3 * B7, h5 = A5 * B7, h7 = A7 * B7;
476+
477+
/* Phase 1+5 */ float w1 = a4 + e4, w2 = a4 - e4;
478+
/* Phase 3+7 */ float x1 = c2 + g6, x2 = c6 - g2;
479+
/* Phase 2+6 */ float y1 = b1 + d3, y2 = b3 - d7, y3 = b5 - d1, y4 = b7 - d5;
480+
/* Phase 4+8 */ float z1 = f5 + h7, z2 = f1 + h5, z3 = f7 + h3, z4 = f3 - h1;
481+
482+
/* Phase 1+3+5+7 */ float u1 = w1 + x1, u2 = w2 + x2, u3 = w2 - x2, u4 = w1 - x1;
483+
/* Phase 2+6+4+8 */ float v1 = y1 + z1, v2 = y2 - z2, v3 = y3 + z3, v4 = y4 + z4;
484+
485+
tmp[0 * DCT_SIZE + col] = u1 + v1;
486+
tmp[1 * DCT_SIZE + col] = u2 + v2;
487+
tmp[2 * DCT_SIZE + col] = u3 + v3;
488+
tmp[3 * DCT_SIZE + col] = u4 + v4;
489+
tmp[4 * DCT_SIZE + col] = u4 - v4;
490+
tmp[5 * DCT_SIZE + col] = u3 - v3;
491+
tmp[6 * DCT_SIZE + col] = u2 - v2;
492+
tmp[7 * DCT_SIZE + col] = u1 - v1;
473493
}
474494

475-
for (int row = 0; row < BLOCK_SAMPLES; row++)
495+
for (int row = 0; row < DCT_SIZE; row++)
476496
{
477-
for (int x = 0; x < BLOCK_SAMPLES; x++)
478-
{
479-
float sum = 0.0f;
480-
for (int u = 0; u < BLOCK_SAMPLES; u++)
481-
{
482-
sum += tmp[row * 8 + u] * factors[(x * 8) + u];
483-
}
484-
output[row * 8 + x] = (sum / 4.0f) + 128.0f; // undo level shift at end
485-
}
497+
float B0 = tmp[row * DCT_SIZE + 0], B1 = tmp[row * DCT_SIZE + 1];
498+
float B2 = tmp[row * DCT_SIZE + 2], B3 = tmp[row * DCT_SIZE + 3];
499+
float B4 = tmp[row * DCT_SIZE + 4], B5 = tmp[row * DCT_SIZE + 5];
500+
float B6 = tmp[row * DCT_SIZE + 6], B7 = tmp[row * DCT_SIZE + 7];
501+
502+
/* Phase 1 */ float a4 = C4 * B0;
503+
/* Phase 5 */ float e4 = C4 * B4;
504+
/* Phase 3 */ float c2 = C2 * B2, c6 = C6 * B2;
505+
/* Phase 7 */ float g2 = C2 * B6, g6 = C6 * B6;
506+
/* Phase 2 */ float b1 = C1 * B1, b3 = C3 * B1, b5 = C5 * B1, b7 = C7 * B1;
507+
/* Phase 4 */ float d1 = C1 * B3, d3 = C3 * B3, d5 = C5 * B3, d7 = C7 * B3;
508+
/* Phase 6 */ float f1 = C1 * B5, f3 = C3 * B5, f5 = C5 * B5, f7 = C7 * B5;
509+
/* Phase 8 */ float h1 = C1 * B7, h3 = C3 * B7, h5 = C5 * B7, h7 = C7 * B7;
510+
511+
/* Phase 1+5 */ float w1 = a4 + e4, w2 = a4 - e4;
512+
/* Phase 3+7 */ float x1 = c2 + g6, x2 = c6 - g2;
513+
/* Phase 2+6 */ float y1 = b1 + d3, y2 = b3 - d7, y3 = b5 - d1, y4 = b7 - d5;
514+
/* Phase 4+8 */ float z1 = f5 + h7, z2 = f1 + h5, z3 = f7 + h3, z4 = f3 - h1;
515+
516+
/* Phase 1+3+5+7 */ float u1 = w1 + x1, u2 = w2 + x2, u3 = w2 - x2, u4 = w1 - x1;
517+
/* Phase 2+6+4+8 */ float v1 = y1 + z1, v2 = y2 - z2, v3 = y3 + z3, v4 = y4 + z4;
518+
519+
output[row * DCT_SIZE + 0] = u1 + v1;
520+
output[row * DCT_SIZE + 1] = u2 + v2;
521+
output[row * DCT_SIZE + 2] = u3 + v3;
522+
output[row * DCT_SIZE + 3] = u4 + v4;
523+
output[row * DCT_SIZE + 4] = u4 - v4;
524+
output[row * DCT_SIZE + 5] = u3 - v3;
525+
output[row * DCT_SIZE + 6] = u2 - v2;
526+
output[row * DCT_SIZE + 7] = u1 - v1;
486527
}
487528
}
488529

@@ -548,7 +589,7 @@ byte ReadHuffman(HuffmanTable table, byte[] src) {
548589
return (byte)packed;
549590
}
550591

551-
ConsumeBits(HUFF_FAST_BITS);
592+
ConsumeBits(HUFF_FAST_BITS);
552593
for (int i = HUFF_FAST_BITS; i < HUFF_MAX_BITS; i++)
553594
{
554595
codeword <<= 1;

0 commit comments

Comments
 (0)