filter: optimize framerate shaper filter (#6656)

Run the metric on downscaled buffers for high-resolution input only (width >= 1920 or height >= 1080).

Co-authored-by: Logaprakash Ramajayam <logaprakash.ramajayam@multicorewareinc.com>
2025-07-28 21:57:35 +05:30 · 2025-07-28 21:57:35 +05:30 · 71eeb9a609
parent 45d86b1edd
commit 71eeb9a609
1 changed files with 193 additions and 114 deletions
--- a/libhb/motion_metric.c
+++ b/libhb/motion_metric.c
@ -12,29 +12,21 @@
 #if defined (__aarch64__) && !defined(__APPLE__)
    #include <arm_neon.h>
 #endif
 // Private state of the motion metric.
 struct hb_motion_metric_private_s
 {
    // Lookup table indexed by raw sample value; filled in by build_gamma_lut().
    unsigned *gamma_lut;
    // Selects the implementation: 8 picks the 8-bit metric, anything else the 16-bit one.
    int       depth;
    // Plane strides are divided by this and buffer sizes multiplied by it;
    // presumably bytes per sample -- TODO confirm.
    int       bps;
    // NOTE(review): not read in the visible code; presumably the largest
    // representable sample value -- confirm.
    int       max_value;
 };
-static int hb_motion_metric_init(hb_motion_metric_object_t *metric,
+    uint8_t *approx_buf_a;
-                                 hb_filter_init_t *init);
+    uint8_t *approx_buf_b;
-static float hb_motion_metric_work(hb_motion_metric_object_t *metric,
+    float (*motion_metric)(hb_motion_metric_private_t *pv,
-                                   hb_buffer_t *buf_a,
+                           int width, int height,
-                                   hb_buffer_t *buf_b);
+                           int stride_a, int stride_b,
-
+                           const uint8_t *buf_a, const uint8_t *buf_b);
 static void hb_motion_metric_close(hb_motion_metric_object_t *metric);
 // Motion metric object: publishes the init/work/close entry points of this
 // file under the name "Motion metric".
 hb_motion_metric_object_t hb_motion_metric =
 {
    .name  = "Motion metric",
    .init  = hb_motion_metric_init,
    .work  = hb_motion_metric_work,
    .close = hb_motion_metric_close,
 };
 // Create gamma lookup table.
@ -49,71 +41,102 @@ static void build_gamma_lut(hb_motion_metric_private_t *pv)
    }
 }
 // Rounded average of four samples, computed as the rounded average of two
 // rounded pair averages: avg(avg(a, b), avg(c, d)).
 // Every argument is parenthesized so that expression arguments such as
 // `x << 2` are evaluated as a unit instead of being re-associated by the
 // surrounding operators.
 #define APPROX(a, b, c, d) \
    (((((uint32_t)(a) + (b) + 1) >> 1) + (((uint32_t)(c) + (d) + 1) >> 1) + 1) >> 1)
 // Generates approximate_frame_data_<nbits>(), a 4:1 box downscaler.
 //
 // Each output sample dest[jj] of output row ii is the APPROX average of the
 // 4x4 block of source samples whose top-left corner is at column 4 * jj of
 // source row 4 * ii.
 //
 //   source        - input samples; rows 0 .. 4 * height - 1 and columns
 //                   0 .. 4 * width - 1 are read
 //   dest          - output samples; width x height samples are written
 //   source_stride - distance between input rows, in samples (not bytes)
 //   dest_stride   - distance between output rows, in samples (not bytes)
 //   width, height - dimensions of the OUTPUT image
 //
 // The closing brace deliberately has no line-continuation backslash: with one,
 // the line that follows the macro would be spliced into the macro body and
 // the instantiations below would not be expanded.
 #define APPROX_FRAME_DATA(nbits)                                                                        \
 static void approximate_frame_data##_##nbits(const uint##nbits##_t *source, uint##nbits##_t *dest,      \
                                              int source_stride, int dest_stride, int width, int height) \
 {                                                                                                       \
    int stride2 = source_stride * 2;                                                                    \
    int stride3 = source_stride * 3;                                                                    \
    int jj4;                                                                                            \
    int top_left, top_right, bottom_left, bottom_right;                                                 \
    for (int ii = 0; ii < height; ii++)                                                                 \
    {                                                                                                   \
        for (int jj = 0; jj < width; jj++)                                                              \
        {                                                                                               \
            /* First source column of the 4x4 block feeding dest[jj]. */                                \
            jj4 = jj * 4;                                                                               \
            /* Average each 2x2 quadrant of the block ... */                                            \
            top_left     = APPROX(source[jj4], source[jj4 + source_stride],                             \
                                  source[jj4 + 1], source[jj4 + source_stride + 1]);                    \
            top_right    = APPROX(source[jj4 + 2], source[jj4 + source_stride + 2],                     \
                                  source[jj4 + 3], source[jj4 + source_stride + 3]);                    \
            bottom_left  = APPROX(source[jj4 + stride2], source[jj4 + stride3],                         \
                                  source[jj4 + stride2 + 1], source[jj4 + stride3 + 1]);                \
            bottom_right = APPROX(source[jj4 + stride2 + 2], source[jj4 + stride3 + 2],                 \
                                  source[jj4 + stride2 + 3], source[jj4 + stride3 + 3]);                \
            /* ... then average the four quadrant results. */                                           \
            dest[jj]     = APPROX(top_left, top_right, bottom_left, bottom_right);                      \
        }                                                                                               \
        /* One output row consumes four input rows. */                                                  \
        source += source_stride * 4;                                                                    \
        dest += dest_stride;                                                                            \
    }                                                                                                   \
 }
 APPROX_FRAME_DATA(8)
 APPROX_FRAME_DATA(16)
 // Compute the sum of squared errors for a 16x16 block
 // Gamma adjusts pixel values so that less visible differences
 // count less.
 #if defined (__aarch64__) && !defined(__APPLE__)
 static
 float motion_metric_neon_8(hb_motion_metric_private_t *pv,
                                     hb_buffer_t *a, hb_buffer_t *b)
 {
    int bw = a->f.width / 16;
    int bh = a->f.height / 16;
    int stride_a = a->plane[0].stride / pv->bps;
    int stride_b = b->plane[0].stride / pv->bps;
    const uint8_t *pa = (const uint8_t *)a->plane[0].data;
    const uint8_t *pb = (const uint8_t *)b->plane[0].data;
    uint64_t sum = 0;
    for (int y = 0; y < bh; y++)
    {
        for (int x = 0; x < bw; x++)
        {
            const uint8_t *ra = pa + y * 16 * stride_a + x * 16;
            const uint8_t *rb = pb + y * 16 * stride_b + x * 16;
-            for (int yy = 0; yy < 16; yy++)
+#define DEF_MOTION_METRIC(nbits)                                                           \
-            {
+static float motion_metric##_##nbits(hb_motion_metric_private_t *pv,                       \
-                uint32_t arrga[16];
+                                     int width, int height,                                \
-                uint32_t arrgb[16];
+                                     int stride_a, int stride_b,                           \
                                     const uint8_t *a, const uint8_t *b)                   \
 {                                                                                          \
    int bw, bh;                                                                            \
    uint##nbits##_t *buf_a, *buf_b;                                                        \
                                                                                           \
    buf_a     = (uint##nbits##_t *)a;                                                      \
    buf_b     = (uint##nbits##_t *)b;                                                      \
    bw        = width / 16;                                                                \
    bh        = height / 16;                                                               \
                                                                                           \
    uint64_t sum = 0;                                                                      \
    for (int y = 0; y < bh; y++)                                                           \
    {                                                                                      \
        for (int x = 0; x < bw; x++)                                                       \
        {                                                                                  \
            const uint##nbits##_t *ra = buf_a + y * 16 * stride_a + x * 16;                \
            const uint##nbits##_t *rb = buf_b + y * 16 * stride_b + x * 16;                \
            for (int yy = 0; yy < 16; yy++)                                                \
            {                                                                              \
                uint32_t arrga[16];                                                        \
                uint32_t arrgb[16];                                                        \
                for (int xx = 0; xx < 16; xx++)                                            \
                {                                                                          \
                    arrga[xx] = pv->gamma_lut[ra[xx]];                                     \
                    arrgb[xx] = pv->gamma_lut[rb[xx]];                                     \
                }                                                                          \
                uint32x4_t vga0 = vld1q_u32(arrga);                                        \
                uint32x4_t vga1 = vld1q_u32(arrga + 4);                                    \
                uint32x4_t vga2 = vld1q_u32(arrga + 8);                                    \
                uint32x4_t vga3 = vld1q_u32(arrga + 12);                                   \
                uint32x4_t vgb0 = vld1q_u32(arrgb);                                        \
                uint32x4_t vgb1 = vld1q_u32(arrgb + 4);                                    \
                uint32x4_t vgb2 = vld1q_u32(arrgb + 8);                                    \
                uint32x4_t vgb3 = vld1q_u32(arrgb + 12);                                   \
                uint32x4_t vdf0 = vsubq_u32(vga0, vgb0);                                   \
                uint32x4_t vdf1 = vsubq_u32(vga1, vgb1);                                   \
                uint32x4_t vdf2 = vsubq_u32(vga2, vgb2);                                   \
                uint32x4_t vdf3 = vsubq_u32(vga3, vgb3);                                   \
                uint32x4_t vsq0 = vmulq_u32(vdf0, vdf0);                                   \
                uint32x4_t vsq1 = vmulq_u32(vdf1, vdf1);                                   \
                uint32x4_t vsq2 = vmulq_u32(vdf2, vdf2);                                   \
                uint32x4_t vsq3 = vmulq_u32(vdf3, vdf3);                                   \
                sum += vaddvq_u32(vsq0);                                                   \
                sum += vaddvq_u32(vsq1);                                                   \
                sum += vaddvq_u32(vsq2);                                                   \
                sum += vaddvq_u32(vsq3);                                                   \
                ra += stride_a;                                                            \
                rb += stride_b;                                                            \
            }                                                                              \
        }                                                                                  \
    }                                                                                      \
    return (float)sum / (width * height);                                                  \
 }                                                                                          \
-                for (int xx = 0; xx < 16; xx++)
+#else
                {
                    arrga[xx] = pv->gamma_lut[ra[xx]];
                    arrgb[xx] = pv->gamma_lut[rb[xx]];
                }
                uint32x4_t vga0 = vld1q_u32(arrga);
                uint32x4_t vga1 = vld1q_u32(arrga + 4);
                uint32x4_t vga2 = vld1q_u32(arrga + 8);
                uint32x4_t vga3 = vld1q_u32(arrga + 12);
                uint32x4_t vgb0 = vld1q_u32(arrgb);
                uint32x4_t vgb1 = vld1q_u32(arrgb + 4);
                uint32x4_t vgb2 = vld1q_u32(arrgb + 8);
                uint32x4_t vgb3 = vld1q_u32(arrgb + 12);
                uint32x4_t vdf0 = vsubq_u32(vga0, vgb0);
                uint32x4_t vdf1 = vsubq_u32(vga1, vgb1);
                uint32x4_t vdf2 = vsubq_u32(vga2, vgb2);
                uint32x4_t vdf3 = vsubq_u32(vga3, vgb3);
                uint32x4_t vsq0 = vmulq_u32(vdf0, vdf0);
                uint32x4_t vsq1 = vmulq_u32(vdf1, vdf1);
                uint32x4_t vsq2 = vmulq_u32(vdf2, vdf2);
                uint32x4_t vsq3 = vmulq_u32(vdf3, vdf3);
                sum += vaddvq_u32(vsq0);
                sum += vaddvq_u32(vsq1);
                sum += vaddvq_u32(vsq2);
                sum += vaddvq_u32(vsq3);
                ra += stride_a;
                rb += stride_b;
            }
        }
    }
    return (float)sum / (a->f.width * a->f.height);
 }
 #endif
 #define DEF_SSE_BLOCK16(nbits)                                                         \
 static inline unsigned sse_block16##_##nbits(unsigned *gamma_lut,                      \
@ -134,45 +157,73 @@ static inline unsigned sse_block16##_##nbits(unsigned *gamma_lut,
    return sum;                                                                        \
 }                                                                                      \
 #if !(defined (__aarch64__) && !defined(__APPLE__))
 DEF_SSE_BLOCK16(8)
 #endif
 DEF_SSE_BLOCK16(16)
 // Sum of squared errors.  Computes and sums the SSEs for all
 // 16x16 blocks in the images.  Only checks the Y component.
-#define DEF_MOTION_METRIC(nbits)                                             \
+#define DEF_MOTION_METRIC(nbits)                                                            \
-static float motion_metric##_##nbits(hb_motion_metric_private_t *pv,         \
+static float motion_metric##_##nbits(hb_motion_metric_private_t *pv,                        \
-                                     hb_buffer_t *a, hb_buffer_t *b)         \
+                                     int width, int height,                                 \
-{                                                                            \
+                                     int stride_a, int stride_b,                            \
-    int bw = a->f.width / 16;                                                \
+                                     const uint8_t *a, const uint8_t *b)                    \
-    int bh = a->f.height / 16;                                               \
+{                                                                                           \
-    int stride_a = a->plane[0].stride / pv->bps;                             \
+                                                                                            \
-    int stride_b = b->plane[0].stride / pv->bps;                             \
+    int bw, bh;                                                                             \
-    const uint##nbits##_t *pa = (const uint##nbits##_t *)a->plane[0].data;   \
+    uint##nbits##_t *buf_a, *buf_b;                                                         \
-    const uint##nbits##_t *pb = (const uint##nbits##_t *)b->plane[0].data;   \
+                                                                                            \
-    uint64_t sum = 0;                                                        \
+    buf_a     = (uint##nbits##_t *)a;                                                       \
-                                                                             \
+    buf_b     = (uint##nbits##_t *)b;                                                       \
-    for (int y = 0; y < bh; y++)                                             \
+    bw        = width / 16;                                                                 \
-    {                                                                        \
+    bh        = height / 16;                                                                \
-        for (int x = 0; x < bw; x++)                                         \
+                                                                                            \
-        {                                                                    \
+    uint64_t sum = 0;                                                                       \
-            sum += sse_block16##_##nbits(pv->gamma_lut,                      \
+    for (int y = 0; y < bh; y++)                                                            \
-                                         pa + y * 16 * stride_a + x * 16,    \
+    {                                                                                       \
-                                         pb + y * 16 * stride_b + x * 16,    \
+        for (int x = 0; x < bw; x++)                                                        \
-                                         stride_a, stride_b);                \
+        {                                                                                   \
-        }                                                                    \
+            sum += sse_block16##_##nbits(pv->gamma_lut,                                     \
-    }                                                                        \
+                        buf_a + y * 16 * stride_a + x * 16,                                 \
-    return (float)sum / (a->f.width * a->f.height);                          \
+                        buf_b + y * 16 * stride_b + x * 16,                                 \
-}                                                                            \
+                        stride_a, stride_b);                                                \
        }                                                                                   \
    }                                                                                       \
    return (float)sum / (width * height);                                                   \
 }                                                                                           \
 #if !(defined (__aarch64__) && !defined(__APPLE__))
 DEF_MOTION_METRIC(8)
 #endif
 DEF_MOTION_METRIC(8)
 DEF_MOTION_METRIC(16)
 // Generates motion_metric_fast_<nbits>(): the motion metric evaluated on 4:1
 // downscaled copies of both frames instead of on the full-size planes.
 //
 //   pv                 - private state; approx_buf_a / approx_buf_b receive the
 //                        downscaled frames and must each hold at least
 //                        (width / 4) * (height / 4) samples
 //   width, height      - dimensions of the full-size frames
 //   stride_a, stride_b - row strides of a and b; they are divided by pv->bps
 //                        before being used as sample strides (presumably they
 //                        arrive in bytes -- confirm against the caller)
 //   a, b               - the two frames to compare
 //
 // Returns whatever motion_metric_<nbits>() returns for the downscaled frames.
 //
 // The closing brace deliberately has no line-continuation backslash: with one,
 // the line that follows the macro would be spliced into the macro body and
 // the instantiations below would not be expanded.
 #define DEF_MOTION_METRIC_FAST(nbits)                                                       \
 static float motion_metric_fast##_##nbits(hb_motion_metric_private_t *pv,                   \
                                     int width, int height,                                 \
                                     int stride_a, int stride_b,                            \
                                     const uint8_t *a, const uint8_t *b)                    \
 {                                                                                           \
    uint##nbits##_t *buf_a, *buf_b;                                                         \
    int stride_buf_a, stride_buf_b;                                                         \
    /* From here on width/height describe the downscaled frames. */                         \
    width  /= 4;                                                                            \
    height /= 4;                                                                            \
    /* The scratch buffers are tightly packed: one row is exactly width samples. */         \
    stride_buf_a = width;                                                                   \
    stride_buf_b = width;                                                                   \
    buf_a = (uint##nbits##_t *)pv->approx_buf_a;                                            \
    buf_b = (uint##nbits##_t *)pv->approx_buf_b;                                            \
                                                                                            \
    approximate_frame_data##_##nbits((const uint##nbits##_t *)a, buf_a,                     \
                                     stride_a / pv->bps, stride_buf_a, width, height);      \
    approximate_frame_data##_##nbits((const uint##nbits##_t *)b, buf_b,                     \
                                     stride_b / pv->bps, stride_buf_b, width, height);      \
                                                                                            \
    return motion_metric##_##nbits(pv, width, height,                                       \
                                   stride_buf_a, stride_buf_b,                              \
                                   (const uint8_t *)buf_a, (const uint8_t *)buf_b);         \
 }
 DEF_MOTION_METRIC_FAST(8)
 DEF_MOTION_METRIC_FAST(16)
 static int hb_motion_metric_init(hb_motion_metric_object_t *metric,
                                 hb_filter_init_t *init)
 {
@ -197,6 +248,31 @@ static int hb_motion_metric_init(hb_motion_metric_object_t *metric,
    }
    build_gamma_lut(pv);
    int fast = 0;
    if (init->geometry.width >= 1920 || init->geometry.height >= 1080)
    {
        fast = 1;
        int approx_height = init->geometry.height / 4;
        int approx_width  = init->geometry.width  / 4;
        int size = approx_height * approx_width * sizeof(uint8_t) * pv->bps;
        pv->approx_buf_a  = malloc(size);
        pv->approx_buf_b  = malloc(size);
        if (pv->approx_buf_a == NULL || pv->approx_buf_b == NULL)
        {
            hb_error("motion_metric: malloc failed");
            return -1;
        }
    }
    switch (pv->depth)
    {
        case 8:
            pv->motion_metric = fast ? motion_metric_fast_8 : motion_metric_8;
            break;
        default:
            pv->motion_metric = fast ? motion_metric_fast_16 : motion_metric_16;
    }
    return 0;
 }
@ -206,17 +282,10 @@ static float hb_motion_metric_work(hb_motion_metric_object_t *metric,
 {
    hb_motion_metric_private_t *pv = metric->private_data;
-    switch (pv->depth)
+    return pv->motion_metric(metric->private_data,
-    {
+                             buf_a->f.width, buf_a->f.height,
-        case 8:
+                             buf_a->plane[0].stride, buf_b->plane[0].stride,
-#if defined (__aarch64__) && !defined(__APPLE__)
+                             buf_a->plane[0].data, buf_b->plane[0].data);
            return motion_metric_neon_8(metric->private_data, buf_a, buf_b);
 #else
            return motion_metric_8(metric->private_data, buf_a, buf_b);
 #endif
        default:
            return motion_metric_16(metric->private_data, buf_a, buf_b);
    }
 }
 static void hb_motion_metric_close(hb_motion_metric_object_t *metric)
@ -229,5 +298,15 @@ static void hb_motion_metric_close(hb_motion_metric_object_t *metric)
    }
    free(pv->gamma_lut);
    free(pv->approx_buf_a);
    free(pv->approx_buf_b);
    free(pv);
 }
 // Motion metric object: publishes the init/work/close entry points of this
 // file under the name "Motion metric".
 hb_motion_metric_object_t hb_motion_metric =
 {
    .name  = "Motion metric",
    .init  = hb_motion_metric_init,
    .work  = hb_motion_metric_work,
    .close = hb_motion_metric_close,
 };