Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 107 additions & 86 deletions filter/sharpbilinear_flexible.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@

#define CLAMP_U8(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))

// ---- Gamma tables (unchanged behavior) --------------------------------------
// Smoothstep as in GLSL: cubic easing
static inline float smoothstep01(float x)
{
x = std::clamp(x, 0.0f, 1.0f);
return x * x * (3.0f - 2.0f * x);
}

static uint8_t gamma_r_encode[32];
static uint8_t gamma_g_encode[64];
Expand All @@ -29,8 +34,6 @@ static void init_gamma_tables()
gamma_decode[i] = uint8_t(CLAMP_U8(int(std::pow(i / 255.0f, inv_gamma) * 255.0f + 0.5f), 0, 255));
}

// ---- RGB565 helpers ---------------------------------------------------------

static inline uint16_t build_rgb565_fast(int r, int g, int b)
{
return ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
Expand All @@ -39,28 +42,46 @@ static inline uint16_t build_rgb565_fast(int r, int g, int b)
static inline void unpack_rgb565_gamma(const uint8_t* src, int pitch, int x, int y, int& r, int& g, int& b)
{
const uint8_t* pixel = src + y * pitch + x * 2;
const uint16_t color = uint16_t(pixel[0]) | (uint16_t(pixel[1]) << 8);
uint16_t color = pixel[0] | (pixel[1] << 8);

const int r5 = (color >> 11) & 0x1F;
const int g6 = (color >> 5) & 0x3F;
const int b5 = color & 0x1F;
int r5 = (color >> 11) & 0x1F;
int g6 = (color >> 5) & 0x3F;
int b5 = color & 0x1F;

r = gamma_r_encode[r5];
g = gamma_g_encode[g6];
b = gamma_r_encode[b5]; // reuse red gamma table for blue
}

// ---- Fixed-point smoothstep weights at 4× sample locations ------------------
// smoothstep(0,1,x) at x in {0, 1/4, 1/2, 3/4} = {0, 5/32, 1/2, 27/32}
// Scale by 256 for 8.8 fixed point.
static constexpr uint16_t W[4] = { 0, 40, 128, 216 }; // w = smoothstep
static constexpr uint16_t IW[4] = { 256, 216, 128, 40 }; // 256 - w
// --------- NEW: safe floor div/mod by 4 (handles negatives correctly) ---------
// We need this because centered mapping uses (x-2)/(y-2), which can be negative
// for the first two output pixels. Right-shift of negative is implementation-defined,
// so do it explicitly and keep remainder in [0..3].
static inline void floor_divmod_4(int v, int& q, int& r)
{
if (v >= 0)
{
q = v >> 2;
r = v & 3;
}
else
{
// floor division: q = floor(v/4)
int a = -v;
q = -((a + 3) >> 2);
r = v - (q << 2); // remainder in [0..3]
// Safety: due to any oddities, clamp remainder to 0..3
if (r < 0) r = 0;
if (r > 3) r = 3;
}
}
// ---------------------------------------------------------------------------

void ApplySharpBilinear4x(uint8_t* __restrict dst, int dst_pitch,
const uint8_t* __restrict src,
extern "C"
void ApplySharpBilinear4x(uint8_t* __restrict dst, int dst_pitch, const uint8_t* __restrict src,
int src_width, int src_height, int src_pitch)
{
const int dst_width = src_width << 2; // *4
const int dst_width = src_width << 2; // *4
const int dst_height = src_height << 2; // *4

static bool gamma_ready = false;
Expand All @@ -70,87 +91,87 @@ void ApplySharpBilinear4x(uint8_t* __restrict dst, int dst_pitch,
gamma_ready = true;
}

// Iterate over source texels; each emits a 4×4 block in the destination.
for (int sy = 0; sy < src_height; ++sy)
// Gain > 1.0 = sharper (you can keep tweaking this one constant)
constexpr float gain = 1.1f;

// --------- CHANGED: centered fractional positions for 4× ---------
// Pixel-center mapping is:
// src_x = (x + 0.5)/4 - 0.5
// which corresponds to fractional phases:
// {0.125, 0.375, 0.625, 0.875}
const float fracLUT[4] = { 0.125f, 0.375f, 0.625f, 0.875f };
// ----------------------------------------------------------------

// Precompute 1D weights for x/y phases
float WX[4], IWX[4];
float WY[4], IWY[4];
for (int i = 0; i < 4; ++i)
{
// Clamp source rows to avoid reading past the bottom edge.
const int sy0 = (sy < src_height - 1) ? sy : (src_height - 2);
const int sy1 = sy0 + 1;
float xf = std::clamp(0.5f + (fracLUT[i] - 0.5f) * gain, 0.0f, 1.0f);
float w = smoothstep01(xf);
WX[i] = w; IWX[i] = 1.0f - w;
WY[i] = w; IWY[i] = 1.0f - w;
}

// Precompute destination row base once per source row.
const int dy_base = sy << 2; // sy * 4
for (int y = 0; y < dst_height; ++y)
{
// --------- CHANGED: centered phase mapping via (y - 2) ---------
// This implements src_y = (y + 0.5)/4 - 0.5 using integer ops.
int sy, dy;
floor_divmod_4(y - 2, sy, dy);
// Clamp for safe +1 sampling
sy = std::clamp(sy, 0, src_height - 2);
// ----------------------------------------------------------------

for (int sx = 0; sx < src_width; ++sx)
const float wy = WY[dy];
const float iwy = IWY[dy];

uint8_t* __restrict dst_row = dst + y * dst_pitch;

for (int x = 0; x < dst_width; ++x)
{
// Clamp source cols to avoid reading past the right edge.
const int sx0 = (sx < src_width - 1) ? sx : (src_width - 2);
const int sx1 = sx0 + 1;
// --------- CHANGED: centered phase mapping via (x - 2) ---------
int sx, dx;
floor_divmod_4(x - 2, sx, dx);
sx = std::clamp(sx, 0, src_width - 2);
// ----------------------------------------------------------------

const float wx = WX[dx];
const float iwx = IWX[dx];

// Unpack the 2×2 neighborhood exactly once per 4×4 block.
int r00, g00, b00;
int r10, g10, b10;
int r01, g01, b01;
int r11, g11, b11;

unpack_rgb565_gamma(src, src_pitch, sx0, sy0, r00, g00, b00);
unpack_rgb565_gamma(src, src_pitch, sx1, sy0, r10, g10, b10);
unpack_rgb565_gamma(src, src_pitch, sx0, sy1, r01, g01, b01);
unpack_rgb565_gamma(src, src_pitch, sx1, sy1, r11, g11, b11);

// Emit the 4×4 destination block using separable bilinear in 8.8 fixed-point.
const int dx_base = sx << 2; // sx * 4

// For each of the 4 subcolumns (dx), do horizontal mixes top/bottom once,
// then vertical mix for each of the 4 subrows (dy).
int rtop[4], gtop[4], btop[4];
int rbot[4], gbot[4], bbot[4];

for (int dx = 0; dx < 4; ++dx)
{
const uint16_t wx = W[dx];
const uint16_t iwx = IW[dx];

// Top row horizontal blend
rtop[dx] = (r00 * iwx + r10 * wx + 128) >> 8;
gtop[dx] = (g00 * iwx + g10 * wx + 128) >> 8;
btop[dx] = (b00 * iwx + b10 * wx + 128) >> 8;

// Bottom row horizontal blend
rbot[dx] = (r01 * iwx + r11 * wx + 128) >> 8;
gbot[dx] = (g01 * iwx + g11 * wx + 128) >> 8;
bbot[dx] = (b01 * iwx + b11 * wx + 128) >> 8;
}

for (int dy = 0; dy < 4; ++dy)
{
const uint16_t wy = W[dy];
const uint16_t iwy = IW[dy];

// Destination row pointer for this subrow
const int y = dy_base + dy;
uint8_t* __restrict dst_row = dst + y * dst_pitch;

for (int dx = 0; dx < 4; ++dx)
{
const int x = dx_base + dx;

// Final vertical blend
int r = (rtop[dx] * iwy + rbot[dx] * wy + 128) >> 8;
int g = (gtop[dx] * iwy + gbot[dx] * wy + 128) >> 8;
int b = (btop[dx] * iwy + bbot[dx] * wy + 128) >> 8;

// Gamma decode back to display space and pack
const uint16_t out = build_rgb565_fast(
gamma_decode[CLAMP_U8(r, 0, 255)],
gamma_decode[CLAMP_U8(g, 0, 255)],
gamma_decode[CLAMP_U8(b, 0, 255)]
);

uint8_t* __restrict dst_px = dst_row + (x << 1); // x*2
dst_px[0] = uint8_t(out & 0xFF);
dst_px[1] = uint8_t((out >> 8) & 0xFF);
}
}
unpack_rgb565_gamma(src, src_pitch, sx, sy, r00, g00, b00);
unpack_rgb565_gamma(src, src_pitch, sx + 1, sy, r10, g10, b10);
unpack_rgb565_gamma(src, src_pitch, sx, sy + 1, r01, g01, b01);
unpack_rgb565_gamma(src, src_pitch, sx + 1, sy + 1, r11, g11, b11);

// Horizontal blends
float r_h = r00 * iwx + r10 * wx;
float g_h = g00 * iwx + g10 * wx;
float b_h = b00 * iwx + b10 * wx;

float r_v = r01 * iwx + r11 * wx;
float g_v = g01 * iwx + g11 * wx;
float b_v = b01 * iwx + b11 * wx;

// Vertical blend
float rf = r_h * iwy + r_v * wy;
float gf = g_h * iwy + g_v * wy;
float bf = b_h * iwy + b_v * wy;

uint16_t out = build_rgb565_fast(
gamma_decode[CLAMP_U8(int(rf + 0.5f), 0, 255)],
gamma_decode[CLAMP_U8(int(gf + 0.5f), 0, 255)],
gamma_decode[CLAMP_U8(int(bf + 0.5f), 0, 255)]
);

uint8_t* __restrict dst_px = dst_row + (x << 1);
dst_px[0] = out & 0xFF;
dst_px[1] = (out >> 8) & 0xFF;
}
}
}
Expand Down