filter: optimize framerate shaper filter (#6656)

Run the metric on downscaled buffers only for high res.

Co-authored-by: Logaprakash Ramajayam <logaprakash.ramajayam@multicorewareinc.com>
This commit is contained in:
dashsantosh-mcw 2025-07-28 21:57:35 +05:30 committed by GitHub
parent 45d86b1edd
commit 71eeb9a609
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 193 additions and 114 deletions

View File

@ -12,29 +12,21 @@
#if defined (__aarch64__) && !defined(__APPLE__) #if defined (__aarch64__) && !defined(__APPLE__)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
struct hb_motion_metric_private_s struct hb_motion_metric_private_s
{ {
unsigned *gamma_lut; unsigned *gamma_lut;
int depth; int depth;
int bps; int bps;
int max_value; int max_value;
};
static int hb_motion_metric_init(hb_motion_metric_object_t *metric, uint8_t *approx_buf_a;
hb_filter_init_t *init); uint8_t *approx_buf_b;
static float hb_motion_metric_work(hb_motion_metric_object_t *metric, float (*motion_metric)(hb_motion_metric_private_t *pv,
hb_buffer_t *buf_a, int width, int height,
hb_buffer_t *buf_b); int stride_a, int stride_b,
const uint8_t *buf_a, const uint8_t *buf_b);
static void hb_motion_metric_close(hb_motion_metric_object_t *metric);
hb_motion_metric_object_t hb_motion_metric =
{
.name = "Motion metric",
.init = hb_motion_metric_init,
.work = hb_motion_metric_work,
.close = hb_motion_metric_close,
}; };
// Create gamma lookup table. // Create gamma lookup table.
@ -49,71 +41,102 @@ static void build_gamma_lut(hb_motion_metric_private_t *pv)
} }
} }
#define APPROX(a, b, c, d) (((((uint32_t)a + b + 1) >> 1) + (((uint32_t)c + d + 1) >> 1) + 1) >> 1)
#define APPROX_FRAME_DATA(nbits) \
static void approximate_frame_data##_##nbits(const uint##nbits##_t *source, uint##nbits##_t *dest, \
int source_stride, int dest_stride, int width, int height) \
{ \
int stride2 = source_stride * 2; \
int stride3 = source_stride * 3; \
int jj4; \
int top_left, top_right, bottom_left, bottom_right; \
for (int ii = 0; ii < height; ii++) \
{ \
for (int jj = 0; jj < width; jj++) \
{ \
jj4 = jj * 4; \
top_left = APPROX(source[jj4], source[jj4 + source_stride], \
source[jj4 + 1], source[jj4 + source_stride + 1]); \
top_right = APPROX(source[jj4 + 2], source[jj4 + source_stride + 2], \
source[jj4 + 3], source[jj4 + source_stride + 3]); \
bottom_left = APPROX(source[jj4 + stride2], source[jj4 + stride3], \
source[jj4 + stride2 + 1], source[jj4 + stride3 + 1]); \
bottom_right = APPROX(source[jj4 + stride2 + 2], source[jj4 + stride3 + 2], \
source[jj4 + stride2 + 3], source[jj4 + stride3 + 3]); \
dest[jj] = APPROX(top_left, top_right, bottom_left, bottom_right); \
} \
source += source_stride * 4; \
dest += dest_stride; \
} \
} \
APPROX_FRAME_DATA(8)
APPROX_FRAME_DATA(16)
// Compute the sum of squared errors for a 16x16 block // Compute the sum of squared errors for a 16x16 block
// Gamma adjusts pixel values so that less visible differences // Gamma adjusts pixel values so that less visible differences
// count less. // count less.
#if defined (__aarch64__) && !defined(__APPLE__) #if defined (__aarch64__) && !defined(__APPLE__)
static
float motion_metric_neon_8(hb_motion_metric_private_t *pv,
hb_buffer_t *a, hb_buffer_t *b)
{
int bw = a->f.width / 16;
int bh = a->f.height / 16;
int stride_a = a->plane[0].stride / pv->bps;
int stride_b = b->plane[0].stride / pv->bps;
const uint8_t *pa = (const uint8_t *)a->plane[0].data;
const uint8_t *pb = (const uint8_t *)b->plane[0].data;
uint64_t sum = 0;
for (int y = 0; y < bh; y++)
{
for (int x = 0; x < bw; x++)
{
const uint8_t *ra = pa + y * 16 * stride_a + x * 16;
const uint8_t *rb = pb + y * 16 * stride_b + x * 16;
for (int yy = 0; yy < 16; yy++) #define DEF_MOTION_METRIC(nbits) \
{ static float motion_metric##_##nbits(hb_motion_metric_private_t *pv, \
uint32_t arrga[16]; int width, int height, \
uint32_t arrgb[16]; int stride_a, int stride_b, \
const uint8_t *a, const uint8_t *b) \
{ \
int bw, bh; \
uint##nbits##_t *buf_a, *buf_b; \
\
buf_a = (uint##nbits##_t *)a; \
buf_b = (uint##nbits##_t *)b; \
bw = width / 16; \
bh = height / 16; \
\
uint64_t sum = 0; \
for (int y = 0; y < bh; y++) \
{ \
for (int x = 0; x < bw; x++) \
{ \
const uint##nbits##_t *ra = buf_a + y * 16 * stride_a + x * 16; \
const uint##nbits##_t *rb = buf_b + y * 16 * stride_b + x * 16; \
for (int yy = 0; yy < 16; yy++) \
{ \
uint32_t arrga[16]; \
uint32_t arrgb[16]; \
for (int xx = 0; xx < 16; xx++) \
{ \
arrga[xx] = pv->gamma_lut[ra[xx]]; \
arrgb[xx] = pv->gamma_lut[rb[xx]]; \
} \
uint32x4_t vga0 = vld1q_u32(arrga); \
uint32x4_t vga1 = vld1q_u32(arrga + 4); \
uint32x4_t vga2 = vld1q_u32(arrga + 8); \
uint32x4_t vga3 = vld1q_u32(arrga + 12); \
uint32x4_t vgb0 = vld1q_u32(arrgb); \
uint32x4_t vgb1 = vld1q_u32(arrgb + 4); \
uint32x4_t vgb2 = vld1q_u32(arrgb + 8); \
uint32x4_t vgb3 = vld1q_u32(arrgb + 12); \
uint32x4_t vdf0 = vsubq_u32(vga0, vgb0); \
uint32x4_t vdf1 = vsubq_u32(vga1, vgb1); \
uint32x4_t vdf2 = vsubq_u32(vga2, vgb2); \
uint32x4_t vdf3 = vsubq_u32(vga3, vgb3); \
uint32x4_t vsq0 = vmulq_u32(vdf0, vdf0); \
uint32x4_t vsq1 = vmulq_u32(vdf1, vdf1); \
uint32x4_t vsq2 = vmulq_u32(vdf2, vdf2); \
uint32x4_t vsq3 = vmulq_u32(vdf3, vdf3); \
sum += vaddvq_u32(vsq0); \
sum += vaddvq_u32(vsq1); \
sum += vaddvq_u32(vsq2); \
sum += vaddvq_u32(vsq3); \
ra += stride_a; \
rb += stride_b; \
} \
} \
} \
return (float)sum / (width * height); \
} \
for (int xx = 0; xx < 16; xx++) #else
{
arrga[xx] = pv->gamma_lut[ra[xx]];
arrgb[xx] = pv->gamma_lut[rb[xx]];
}
uint32x4_t vga0 = vld1q_u32(arrga);
uint32x4_t vga1 = vld1q_u32(arrga + 4);
uint32x4_t vga2 = vld1q_u32(arrga + 8);
uint32x4_t vga3 = vld1q_u32(arrga + 12);
uint32x4_t vgb0 = vld1q_u32(arrgb);
uint32x4_t vgb1 = vld1q_u32(arrgb + 4);
uint32x4_t vgb2 = vld1q_u32(arrgb + 8);
uint32x4_t vgb3 = vld1q_u32(arrgb + 12);
uint32x4_t vdf0 = vsubq_u32(vga0, vgb0);
uint32x4_t vdf1 = vsubq_u32(vga1, vgb1);
uint32x4_t vdf2 = vsubq_u32(vga2, vgb2);
uint32x4_t vdf3 = vsubq_u32(vga3, vgb3);
uint32x4_t vsq0 = vmulq_u32(vdf0, vdf0);
uint32x4_t vsq1 = vmulq_u32(vdf1, vdf1);
uint32x4_t vsq2 = vmulq_u32(vdf2, vdf2);
uint32x4_t vsq3 = vmulq_u32(vdf3, vdf3);
sum += vaddvq_u32(vsq0);
sum += vaddvq_u32(vsq1);
sum += vaddvq_u32(vsq2);
sum += vaddvq_u32(vsq3);
ra += stride_a;
rb += stride_b;
}
}
}
return (float)sum / (a->f.width * a->f.height);
}
#endif
#define DEF_SSE_BLOCK16(nbits) \ #define DEF_SSE_BLOCK16(nbits) \
static inline unsigned sse_block16##_##nbits(unsigned *gamma_lut, \ static inline unsigned sse_block16##_##nbits(unsigned *gamma_lut, \
@ -134,45 +157,73 @@ static inline unsigned sse_block16##_##nbits(unsigned *gamma_lut,
return sum; \ return sum; \
} \ } \
#if !(defined (__aarch64__) && !defined(__APPLE__))
DEF_SSE_BLOCK16(8) DEF_SSE_BLOCK16(8)
#endif
DEF_SSE_BLOCK16(16) DEF_SSE_BLOCK16(16)
// Sum of squared errors. Computes and sums the SSEs for all // Sum of squared errors. Computes and sums the SSEs for all
// 16x16 blocks in the images. Only checks the Y component. // 16x16 blocks in the images. Only checks the Y component.
#define DEF_MOTION_METRIC(nbits) \ #define DEF_MOTION_METRIC(nbits) \
static float motion_metric##_##nbits(hb_motion_metric_private_t *pv, \ static float motion_metric##_##nbits(hb_motion_metric_private_t *pv, \
hb_buffer_t *a, hb_buffer_t *b) \ int width, int height, \
int stride_a, int stride_b, \
const uint8_t *a, const uint8_t *b) \
{ \ { \
int bw = a->f.width / 16; \
int bh = a->f.height / 16; \
int stride_a = a->plane[0].stride / pv->bps; \
int stride_b = b->plane[0].stride / pv->bps; \
const uint##nbits##_t *pa = (const uint##nbits##_t *)a->plane[0].data; \
const uint##nbits##_t *pb = (const uint##nbits##_t *)b->plane[0].data; \
uint64_t sum = 0; \
\ \
int bw, bh; \
uint##nbits##_t *buf_a, *buf_b; \
\
buf_a = (uint##nbits##_t *)a; \
buf_b = (uint##nbits##_t *)b; \
bw = width / 16; \
bh = height / 16; \
\
uint64_t sum = 0; \
for (int y = 0; y < bh; y++) \ for (int y = 0; y < bh; y++) \
{ \ { \
for (int x = 0; x < bw; x++) \ for (int x = 0; x < bw; x++) \
{ \ { \
sum += sse_block16##_##nbits(pv->gamma_lut, \ sum += sse_block16##_##nbits(pv->gamma_lut, \
pa + y * 16 * stride_a + x * 16, \ buf_a + y * 16 * stride_a + x * 16, \
pb + y * 16 * stride_b + x * 16, \ buf_b + y * 16 * stride_b + x * 16, \
stride_a, stride_b); \ stride_a, stride_b); \
} \ } \
} \ } \
return (float)sum / (a->f.width * a->f.height); \ return (float)sum / (width * height); \
} \ } \
#if !(defined (__aarch64__) && !defined(__APPLE__))
DEF_MOTION_METRIC(8)
#endif #endif
DEF_MOTION_METRIC(8)
DEF_MOTION_METRIC(16) DEF_MOTION_METRIC(16)
#define DEF_MOTION_METRIC_FAST(nbits) \
static float motion_metric_fast##_##nbits(hb_motion_metric_private_t *pv, \
int width, int height, \
int stride_a, int stride_b, \
const uint8_t *a, const uint8_t *b) \
{ \
uint##nbits##_t *buf_a, *buf_b; \
int stride_buf_a, stride_buf_b; \
width /= 4; \
height /= 4; \
stride_buf_a = width; \
stride_buf_b = width; \
buf_a = (uint##nbits##_t *)pv->approx_buf_a; \
buf_b = (uint##nbits##_t *)pv->approx_buf_b; \
\
approximate_frame_data##_##nbits((const uint##nbits##_t *)a, buf_a, \
stride_a / pv->bps, stride_buf_a, width, height); \
approximate_frame_data##_##nbits((const uint##nbits##_t *)b, buf_b, \
stride_b / pv->bps, stride_buf_b, width, height); \
\
return motion_metric##_##nbits(pv, width, height, \
stride_buf_a, stride_buf_b, \
(const uint8_t *)buf_a, (const uint8_t *)buf_b); \
} \
DEF_MOTION_METRIC_FAST(8)
DEF_MOTION_METRIC_FAST(16)
static int hb_motion_metric_init(hb_motion_metric_object_t *metric, static int hb_motion_metric_init(hb_motion_metric_object_t *metric,
hb_filter_init_t *init) hb_filter_init_t *init)
{ {
@ -197,6 +248,31 @@ static int hb_motion_metric_init(hb_motion_metric_object_t *metric,
} }
build_gamma_lut(pv); build_gamma_lut(pv);
int fast = 0;
if (init->geometry.width >= 1920 || init->geometry.height >= 1080)
{
fast = 1;
int approx_height = init->geometry.height / 4;
int approx_width = init->geometry.width / 4;
int size = approx_height * approx_width * sizeof(uint8_t) * pv->bps;
pv->approx_buf_a = malloc(size);
pv->approx_buf_b = malloc(size);
if (pv->approx_buf_a == NULL || pv->approx_buf_b == NULL)
{
hb_error("motion_metric: malloc failed");
return -1;
}
}
switch (pv->depth)
{
case 8:
pv->motion_metric = fast ? motion_metric_fast_8 : motion_metric_8;
break;
default:
pv->motion_metric = fast ? motion_metric_fast_16 : motion_metric_16;
}
return 0; return 0;
} }
@ -206,17 +282,10 @@ static float hb_motion_metric_work(hb_motion_metric_object_t *metric,
{ {
hb_motion_metric_private_t *pv = metric->private_data; hb_motion_metric_private_t *pv = metric->private_data;
switch (pv->depth) return pv->motion_metric(metric->private_data,
{ buf_a->f.width, buf_a->f.height,
case 8: buf_a->plane[0].stride, buf_b->plane[0].stride,
#if defined (__aarch64__) && !defined(__APPLE__) buf_a->plane[0].data, buf_b->plane[0].data);
return motion_metric_neon_8(metric->private_data, buf_a, buf_b);
#else
return motion_metric_8(metric->private_data, buf_a, buf_b);
#endif
default:
return motion_metric_16(metric->private_data, buf_a, buf_b);
}
} }
static void hb_motion_metric_close(hb_motion_metric_object_t *metric) static void hb_motion_metric_close(hb_motion_metric_object_t *metric)
@ -229,5 +298,15 @@ static void hb_motion_metric_close(hb_motion_metric_object_t *metric)
} }
free(pv->gamma_lut); free(pv->gamma_lut);
free(pv->approx_buf_a);
free(pv->approx_buf_b);
free(pv); free(pv);
} }
hb_motion_metric_object_t hb_motion_metric =
{
.name = "Motion metric",
.init = hb_motion_metric_init,
.work = hb_motion_metric_work,
.close = hb_motion_metric_close,
};