filter: optimize framerate shaper filter (#6656)

Run the metric on downscaled buffers only for high res. Co-authored-by: Logaprakash Ramajayam <logaprakash.ramajayam@multicorewareinc.com>
2025-07-28 21:57:35 +05:30 · 2025-07-28 21:57:35 +05:30 · 71eeb9a609
parent 45d86b1edd
commit 71eeb9a609
1 changed files with 193 additions and 114 deletions
--- a/libhb/motion_metric.c
+++ b/libhb/motion_metric.c
@ -12,29 +12,21 @@
 #if defined (__aarch64__) && !defined(__APPLE__)
    #include <arm_neon.h>
 #endif
+
 struct hb_motion_metric_private_s
 {
    unsigned *gamma_lut;
    int       depth;
    int       bps;
    int       max_value;
-};

-static int hb_motion_metric_init(hb_motion_metric_object_t *metric,
-                                 hb_filter_init_t *init);
+    // Scratch buffers that receive the downscaled copies of the two input
+    // planes in the motion_metric_fast_<nbits> variants.  They are only
+    // allocated by hb_motion_metric_init when the configured geometry is at
+    // least 1920 wide or 1080 high.
+    uint8_t *approx_buf_a;
+    uint8_t *approx_buf_b;

-static float hb_motion_metric_work(hb_motion_metric_object_t *metric,
-                                   hb_buffer_t *buf_a,
-                                   hb_buffer_t *buf_b);
-
-static void hb_motion_metric_close(hb_motion_metric_object_t *metric);
-
-hb_motion_metric_object_t hb_motion_metric =
-{
-    .name  = "Motion metric",
-    .init  = hb_motion_metric_init,
-    .work  = hb_motion_metric_work,
-    .close = hb_motion_metric_close,
+    // Metric implementation selected once in hb_motion_metric_init (by bit
+    // depth and by whether the downscaled path is used) and invoked from
+    // hb_motion_metric_work with the plane-0 data of both buffers.
+    float (*motion_metric)(hb_motion_metric_private_t *pv,
+                           int width, int height,
+                           int stride_a, int stride_b,
+                           const uint8_t *buf_a, const uint8_t *buf_b);
 };

 // Create gamma lookup table.
@ -49,71 +41,102 @@ static void build_gamma_lut(hb_motion_metric_private_t *pv)
    }
 }

+// Rounded mean of four samples: the rounded average of (a, b) and the rounded
+// average of (c, d) are themselves averaged with rounding.  The first operand
+// of each pair is widened to uint32_t before the addition.  Arguments are
+// pasted into the expansion without parentheses.
+#define APPROX(a, b, c, d) (((((uint32_t)a + b + 1) >> 1) + (((uint32_t)c + d + 1) >> 1) + 1) >> 1)
+// Generates approximate_frame_data_<nbits>(), which shrinks a plane by a factor
+// of 4 in each direction.  Every destination sample is derived from a 4x4 block
+// of source samples: each 2x2 quadrant is reduced with APPROX, then the four
+// quadrant values are reduced with APPROX again.
+//   source, dest    - sample pointers of type uint<nbits>_t
+//   source_stride,
+//   dest_stride     - row pitches counted in samples (they are added to the
+//                     typed pointers), not in bytes
+//   width, height   - dimensions of the destination; the function reads
+//                     4 * width columns and 4 * height rows of source
+#define APPROX_FRAME_DATA(nbits)                                                                        \
+static void approximate_frame_data##_##nbits(const uint##nbits##_t *source, uint##nbits##_t *dest,      \
+                                             int source_stride, int dest_stride, int width, int height) \
+{                                                                                                       \
+    int stride2 = source_stride * 2;                                                                    \
+    int stride3 = source_stride * 3;                                                                    \
+    int jj4;                                                                                            \
+    int top_left, top_right, bottom_left, bottom_right;                                                 \
+    for (int ii = 0; ii < height; ii++)                                                                 \
+    {                                                                                                   \
+        for (int jj = 0; jj < width; jj++)                                                              \
+        {                                                                                               \
+            jj4 = jj * 4;                                                                               \
+            top_left     = APPROX(source[jj4], source[jj4 + source_stride],                             \
+                                  source[jj4 + 1], source[jj4 + source_stride + 1]);                    \
+            top_right    = APPROX(source[jj4 + 2], source[jj4 + source_stride + 2],                     \
+                                  source[jj4 + 3], source[jj4 + source_stride + 3]);                    \
+            bottom_left  = APPROX(source[jj4 + stride2], source[jj4 + stride3],                         \
+                                  source[jj4 + stride2 + 1], source[jj4 + stride3 + 1]);                \
+            bottom_right = APPROX(source[jj4 + stride2 + 2], source[jj4 + stride3 + 2],                 \
+                                  source[jj4 + stride2 + 3], source[jj4 + stride3 + 3]);                \
+            dest[jj]     = APPROX(top_left, top_right, bottom_left, bottom_right);                      \
+        }                                                                                               \
+        source += source_stride * 4;                                                                    \
+        dest += dest_stride;                                                                            \
+    }                                                                                                   \
+}                                                                                                       \
+
+APPROX_FRAME_DATA(8)
+APPROX_FRAME_DATA(16)
+
 // Compute the sum of squared errors for a 16x16 block
 // Gamma adjusts pixel values so that less visible differences
 // count less.
 #if defined (__aarch64__) && !defined(__APPLE__)
-static
-float motion_metric_neon_8(hb_motion_metric_private_t *pv,
-                                     hb_buffer_t *a, hb_buffer_t *b)
-{
-    int bw = a->f.width / 16;
-    int bh = a->f.height / 16;
-    int stride_a = a->plane[0].stride / pv->bps;
-    int stride_b = b->plane[0].stride / pv->bps;
-    const uint8_t *pa = (const uint8_t *)a->plane[0].data;
-    const uint8_t *pb = (const uint8_t *)b->plane[0].data;
-    uint64_t sum = 0;
-    for (int y = 0; y < bh; y++)
-    {
-        for (int x = 0; x < bw; x++)
-        {
-            const uint8_t *ra = pa + y * 16 * stride_a + x * 16;
-            const uint8_t *rb = pb + y * 16 * stride_b + x * 16;

-            for (int yy = 0; yy < 16; yy++)
-            {
-                uint32_t arrga[16];
-                uint32_t arrgb[16];
+// NEON build (AArch64, non-Apple): generates motion_metric_<nbits>().
+// The frame is walked in 16x16 blocks (width / 16 by height / 16 of them; any
+// remainder columns/rows are not visited).  For each block row, 16 samples from
+// each frame are mapped through pv->gamma_lut into uint32_t arrays, loaded as
+// four 4-lane vectors, subtracted, squared and added into a 64-bit sum.
+// stride_a / stride_b are added to uint<nbits>_t pointers, i.e. they are used
+// as sample counts rather than byte counts.
+// Returns the sum divided by width * height (the full frame area, including
+// samples that were not visited).
+// NOTE(review): hb_motion_metric_work passes plane[0].stride unchanged; if that
+// value is a byte pitch the 16-bit variant would step too far -- confirm.
+#define DEF_MOTION_METRIC(nbits)                                                           \
+static float motion_metric##_##nbits(hb_motion_metric_private_t *pv,                       \
+                                     int width, int height,                                \
+                                     int stride_a, int stride_b,                           \
+                                     const uint8_t *a, const uint8_t *b)                   \
+{                                                                                          \
+    int bw, bh;                                                                            \
+    uint##nbits##_t *buf_a, *buf_b;                                                        \
+                                                                                           \
+    buf_a     = (uint##nbits##_t *)a;                                                      \
+    buf_b     = (uint##nbits##_t *)b;                                                      \
+    bw        = width / 16;                                                                \
+    bh        = height / 16;                                                               \
+                                                                                           \
+    uint64_t sum = 0;                                                                      \
+    for (int y = 0; y < bh; y++)                                                           \
+    {                                                                                      \
+        for (int x = 0; x < bw; x++)                                                       \
+        {                                                                                  \
+            const uint##nbits##_t *ra = buf_a + y * 16 * stride_a + x * 16;                \
+            const uint##nbits##_t *rb = buf_b + y * 16 * stride_b + x * 16;                \
+            for (int yy = 0; yy < 16; yy++)                                                \
+            {                                                                              \
+                uint32_t arrga[16];                                                        \
+                uint32_t arrgb[16];                                                        \
+                for (int xx = 0; xx < 16; xx++)                                            \
+                {                                                                          \
+                    arrga[xx] = pv->gamma_lut[ra[xx]];                                     \
+                    arrgb[xx] = pv->gamma_lut[rb[xx]];                                     \
+                }                                                                          \
+                uint32x4_t vga0 = vld1q_u32(arrga);                                        \
+                uint32x4_t vga1 = vld1q_u32(arrga + 4);                                    \
+                uint32x4_t vga2 = vld1q_u32(arrga + 8);                                    \
+                uint32x4_t vga3 = vld1q_u32(arrga + 12);                                   \
+                uint32x4_t vgb0 = vld1q_u32(arrgb);                                        \
+                uint32x4_t vgb1 = vld1q_u32(arrgb + 4);                                    \
+                uint32x4_t vgb2 = vld1q_u32(arrgb + 8);                                    \
+                uint32x4_t vgb3 = vld1q_u32(arrgb + 12);                                   \
+                uint32x4_t vdf0 = vsubq_u32(vga0, vgb0);                                   \
+                uint32x4_t vdf1 = vsubq_u32(vga1, vgb1);                                   \
+                uint32x4_t vdf2 = vsubq_u32(vga2, vgb2);                                   \
+                uint32x4_t vdf3 = vsubq_u32(vga3, vgb3);                                   \
+                uint32x4_t vsq0 = vmulq_u32(vdf0, vdf0);                                   \
+                uint32x4_t vsq1 = vmulq_u32(vdf1, vdf1);                                   \
+                uint32x4_t vsq2 = vmulq_u32(vdf2, vdf2);                                   \
+                uint32x4_t vsq3 = vmulq_u32(vdf3, vdf3);                                   \
+                sum += vaddvq_u32(vsq0);                                                   \
+                sum += vaddvq_u32(vsq1);                                                   \
+                sum += vaddvq_u32(vsq2);                                                   \
+                sum += vaddvq_u32(vsq3);                                                   \
+                ra += stride_a;                                                            \
+                rb += stride_b;                                                            \
+            }                                                                              \
+        }                                                                                  \
+    }                                                                                      \
+    return (float)sum / (width * height);                                                  \
+}                                                                                          \

-                for (int xx = 0; xx < 16; xx++)
-                {
-                    arrga[xx] = pv->gamma_lut[ra[xx]];
-                    arrgb[xx] = pv->gamma_lut[rb[xx]];
-                }
-
-                uint32x4_t vga0 = vld1q_u32(arrga);
-                uint32x4_t vga1 = vld1q_u32(arrga + 4);
-                uint32x4_t vga2 = vld1q_u32(arrga + 8);
-                uint32x4_t vga3 = vld1q_u32(arrga + 12);
-
-                uint32x4_t vgb0 = vld1q_u32(arrgb);
-                uint32x4_t vgb1 = vld1q_u32(arrgb + 4);
-                uint32x4_t vgb2 = vld1q_u32(arrgb + 8);
-                uint32x4_t vgb3 = vld1q_u32(arrgb + 12);
-                uint32x4_t vdf0 = vsubq_u32(vga0, vgb0);
-                uint32x4_t vdf1 = vsubq_u32(vga1, vgb1);
-                uint32x4_t vdf2 = vsubq_u32(vga2, vgb2);
-                uint32x4_t vdf3 = vsubq_u32(vga3, vgb3);
-
-                uint32x4_t vsq0 = vmulq_u32(vdf0, vdf0);
-                uint32x4_t vsq1 = vmulq_u32(vdf1, vdf1);
-                uint32x4_t vsq2 = vmulq_u32(vdf2, vdf2);
-                uint32x4_t vsq3 = vmulq_u32(vdf3, vdf3);
-
-                sum += vaddvq_u32(vsq0);
-                sum += vaddvq_u32(vsq1);
-                sum += vaddvq_u32(vsq2);
-                sum += vaddvq_u32(vsq3);
-
-                ra += stride_a;
-                rb += stride_b;
-            }
-        }
-    }
-    return (float)sum / (a->f.width * a->f.height);
-}
-#endif
+#else

 #define DEF_SSE_BLOCK16(nbits)                                                         \
 static inline unsigned sse_block16##_##nbits(unsigned *gamma_lut,                      \
@ -134,45 +157,73 @@ static inline unsigned sse_block16##_##nbits(unsigned *gamma_lut,
    return sum;                                                                        \
 }                                                                                      \

-#if !(defined (__aarch64__) && !defined(__APPLE__))
 DEF_SSE_BLOCK16(8)
-#endif
-
 DEF_SSE_BLOCK16(16)

 // Sum of squared errors.  Computes and sums the SSEs for all
 // 16x16 blocks in the images.  Only checks the Y component.
+// Portable build: generates motion_metric_<nbits>().
+//   width, height      - frame size in samples; only the width / 16 by
+//                        height / 16 whole blocks are visited
+//   stride_a, stride_b - row pitches, added to uint<nbits>_t pointers and
+//                        therefore used as sample counts, not byte counts
+//   a, b               - plane data, reinterpreted as uint<nbits>_t
+// Returns the summed block SSE divided by width * height.
+// NOTE(review): hb_motion_metric_work passes plane[0].stride unchanged, whereas
+// the removed code divided it by pv->bps first; if the stride is a byte pitch
+// the 16-bit variant would step too far -- confirm.
 #define DEF_MOTION_METRIC(nbits)                                                            \
 static float motion_metric##_##nbits(hb_motion_metric_private_t *pv,                        \
-                                     hb_buffer_t *a, hb_buffer_t *b)         \
+                                     int width, int height,                                 \
+                                     int stride_a, int stride_b,                            \
+                                     const uint8_t *a, const uint8_t *b)                    \
 {                                                                                           \
-    int bw = a->f.width / 16;                                                \
-    int bh = a->f.height / 16;                                               \
-    int stride_a = a->plane[0].stride / pv->bps;                             \
-    int stride_b = b->plane[0].stride / pv->bps;                             \
-    const uint##nbits##_t *pa = (const uint##nbits##_t *)a->plane[0].data;   \
-    const uint##nbits##_t *pb = (const uint##nbits##_t *)b->plane[0].data;   \
-    uint64_t sum = 0;                                                        \
                                                                                            \
+    int bw, bh;                                                                             \
+    uint##nbits##_t *buf_a, *buf_b;                                                         \
+                                                                                            \
+    buf_a     = (uint##nbits##_t *)a;                                                       \
+    buf_b     = (uint##nbits##_t *)b;                                                       \
+    bw        = width / 16;                                                                 \
+    bh        = height / 16;                                                                \
+                                                                                            \
+    uint64_t sum = 0;                                                                       \
    for (int y = 0; y < bh; y++)                                                            \
    {                                                                                       \
        for (int x = 0; x < bw; x++)                                                        \
        {                                                                                   \
            sum += sse_block16##_##nbits(pv->gamma_lut,                                     \
-                                         pa + y * 16 * stride_a + x * 16,    \
-                                         pb + y * 16 * stride_b + x * 16,    \
+                        buf_a + y * 16 * stride_a + x * 16,                                 \
+                        buf_b + y * 16 * stride_b + x * 16,                                 \
                        stride_a, stride_b);                                                \
        }                                                                                   \
    }                                                                                       \
-    return (float)sum / (a->f.width * a->f.height);                          \
+    return (float)sum / (width * height);                                                   \
 }                                                                                           \

-#if !(defined (__aarch64__) && !defined(__APPLE__))
-DEF_MOTION_METRIC(8)
 #endif

+DEF_MOTION_METRIC(8)
 DEF_MOTION_METRIC(16)

+// Generates motion_metric_fast_<nbits>(): the reduced-resolution variant.
+// Both planes are shrunk to width / 4 by height / 4 with
+// approximate_frame_data_<nbits>() into pv->approx_buf_a / pv->approx_buf_b,
+// and motion_metric_<nbits>() is then run on those copies.
+// The incoming stride_a / stride_b are divided by pv->bps before being used as
+// sample pitches; the scratch buffers are tightly packed (pitch == width / 4).
+// NOTE(review): the scratch buffers are sized in hb_motion_metric_init from
+// init->geometry, while width/height here come from the caller -- presumably
+// they always match; confirm.
+#define DEF_MOTION_METRIC_FAST(nbits)                                                       \
+static float motion_metric_fast##_##nbits(hb_motion_metric_private_t *pv,                   \
+                                     int width, int height,                                 \
+                                     int stride_a, int stride_b,                            \
+                                     const uint8_t *a, const uint8_t *b)                    \
+{                                                                                           \
+    uint##nbits##_t *buf_a, *buf_b;                                                         \
+    int stride_buf_a, stride_buf_b;                                                         \
+    width  /= 4;                                                                            \
+    height /= 4;                                                                            \
+    stride_buf_a = width;                                                                   \
+    stride_buf_b = width;                                                                   \
+    buf_a = (uint##nbits##_t *)pv->approx_buf_a;                                            \
+    buf_b = (uint##nbits##_t *)pv->approx_buf_b;                                            \
+                                                                                            \
+    approximate_frame_data##_##nbits((const uint##nbits##_t *)a, buf_a,                     \
+                                     stride_a / pv->bps, stride_buf_a, width, height);      \
+    approximate_frame_data##_##nbits((const uint##nbits##_t *)b, buf_b,                     \
+                                     stride_b / pv->bps, stride_buf_b, width, height);      \
+                                                                                            \
+    return motion_metric##_##nbits(pv, width, height,                                       \
+                                   stride_buf_a, stride_buf_b,                              \
+                                   (const uint8_t *)buf_a, (const uint8_t *)buf_b);         \
+}                                                                                           \
+
+DEF_MOTION_METRIC_FAST(8)
+DEF_MOTION_METRIC_FAST(16)
+
 static int hb_motion_metric_init(hb_motion_metric_object_t *metric,
                                 hb_filter_init_t *init)
 {
@ -197,6 +248,31 @@ static int hb_motion_metric_init(hb_motion_metric_object_t *metric,
    }
    build_gamma_lut(pv);

+    int fast = 0;
+    if (init->geometry.width >= 1920 || init->geometry.height >= 1080)
+    {
+        fast = 1;
+        int approx_height = init->geometry.height / 4;
+        int approx_width  = init->geometry.width  / 4;
+        int size = approx_height * approx_width * sizeof(uint8_t) * pv->bps;
+        pv->approx_buf_a  = malloc(size);
+        pv->approx_buf_b  = malloc(size);
+        if (pv->approx_buf_a == NULL || pv->approx_buf_b == NULL)
+        {
+            hb_error("motion_metric: malloc failed");
+            return -1;
+        }
+    }
+
+    switch (pv->depth)
+    {
+        case 8:
+            pv->motion_metric = fast ? motion_metric_fast_8 : motion_metric_8;
+            break;
+        default:
+            pv->motion_metric = fast ? motion_metric_fast_16 : motion_metric_16;
+    }
+
    return 0;
 }

@ -206,17 +282,10 @@ static float hb_motion_metric_work(hb_motion_metric_object_t *metric,
 {
    hb_motion_metric_private_t *pv = metric->private_data;

-    switch (pv->depth)
-    {
-        case 8:
-#if defined (__aarch64__) && !defined(__APPLE__)
-            return motion_metric_neon_8(metric->private_data, buf_a, buf_b);
-#else
-            return motion_metric_8(metric->private_data, buf_a, buf_b);
-#endif
-        default:
-            return motion_metric_16(metric->private_data, buf_a, buf_b);
-    }
+    return pv->motion_metric(metric->private_data,
+                             buf_a->f.width, buf_a->f.height,
+                             buf_a->plane[0].stride, buf_b->plane[0].stride,
+                             buf_a->plane[0].data, buf_b->plane[0].data);
 }

 static void hb_motion_metric_close(hb_motion_metric_object_t *metric)
@ -229,5 +298,15 @@ static void hb_motion_metric_close(hb_motion_metric_object_t *metric)
    }

    free(pv->gamma_lut);
+    free(pv->approx_buf_a);
+    free(pv->approx_buf_b);
    free(pv);
 }
+
+// Non-static motion metric object: a display name plus the init / work / close
+// entry points.  Placed after those functions so no forward declarations are
+// needed.
+hb_motion_metric_object_t hb_motion_metric =
+{
+    .name  = "Motion metric",
+    .init  = hb_motion_metric_init,
+    .work  = hb_motion_metric_work,
+    .close = hb_motion_metric_close,
+};