PR #22381 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22381
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22381.patch

RPi4:
put_chroma_h_10_2x2_c:                                  63.4 ( 1.00x)
put_chroma_h_10_4x4_c:                                 151.4 ( 1.00x)
put_chroma_h_10_8x8_c:                                 555.1 ( 1.00x)
put_chroma_h_10_8x8_neon:                              113.9 ( 4.88x)
put_chroma_h_10_16x16_c:                              1068.5 ( 1.00x)
put_chroma_h_10_16x16_neon:                            439.4 ( 2.43x)
put_chroma_h_10_32x32_c:                              3432.6 ( 1.00x)
put_chroma_h_10_32x32_neon:                           1878.3 ( 1.83x)
put_chroma_h_10_64x64_c:                             12872.2 ( 1.00x)
put_chroma_h_10_64x64_neon:                           7868.2 ( 1.64x)
put_chroma_h_10_128x128_c:                           45612.2 ( 1.00x)
put_chroma_h_10_128x128_neon:                        28742.1 ( 1.59x)
put_chroma_h_12_2x2_c:                                  63.7 ( 1.00x)
put_chroma_h_12_4x4_c:                                 151.5 ( 1.00x)
put_chroma_h_12_8x8_c:                                 555.2 ( 1.00x)
put_chroma_h_12_8x8_neon:                              114.2 ( 4.86x)
put_chroma_h_12_16x16_c:                              1068.1 ( 1.00x)
put_chroma_h_12_16x16_neon:                            438.8 ( 2.43x)
put_chroma_h_12_32x32_c:                              3419.7 ( 1.00x)
put_chroma_h_12_32x32_neon:                           1878.7 ( 1.82x)
put_chroma_h_12_64x64_c:                             12862.2 ( 1.00x)
put_chroma_h_12_64x64_neon:                           7868.2 ( 1.63x)
put_chroma_h_12_128x128_c:                           45613.5 ( 1.00x)
put_chroma_h_12_128x128_neon:                        28743.3 ( 1.59x)

Apple M4:
put_chroma_h_10_2x2_c:                                   2.5 ( 1.00x)
put_chroma_h_10_4x4_c:                                   6.5 ( 1.00x)
put_chroma_h_10_8x8_c:                                  17.8 ( 1.00x)
put_chroma_h_10_8x8_neon:                                6.8 ( 2.60x)
put_chroma_h_10_16x16_c:                                53.3 ( 1.00x)
put_chroma_h_10_16x16_neon:                             30.4 ( 1.75x)
put_chroma_h_10_32x32_c:                               181.8 ( 1.00x)
put_chroma_h_10_32x32_neon:                            116.2 ( 1.56x)
put_chroma_h_10_64x64_c:                               684.2 ( 1.00x)
put_chroma_h_10_64x64_neon:                            470.3 ( 1.45x)
put_chroma_h_10_128x128_c:                            2567.6 ( 1.00x)
put_chroma_h_10_128x128_neon:                         1879.3 ( 1.37x)
put_chroma_h_12_2x2_c:                                   1.9 ( 1.00x)
put_chroma_h_12_4x4_c:                                   7.0 ( 1.00x)
put_chroma_h_12_8x8_c:                                  16.8 ( 1.00x)
put_chroma_h_12_8x8_neon:                                7.9 ( 2.12x)
put_chroma_h_12_16x16_c:                                55.0 ( 1.00x)
put_chroma_h_12_16x16_neon:                             29.0 ( 1.90x)
put_chroma_h_12_32x32_c:                               182.5 ( 1.00x)
put_chroma_h_12_32x32_neon:                            116.9 ( 1.56x)
put_chroma_h_12_64x64_c:                               666.8 ( 1.00x)
put_chroma_h_12_64x64_neon:                            474.5 ( 1.41x)
put_chroma_h_12_128x128_c:                            2588.1 ( 1.00x)
put_chroma_h_12_128x128_neon:                         1912.2 ( 1.35x)


>From b4082cfd4149ed6c8bf41cb28c6aef44447f2599 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <[email protected]>
Date: Wed, 4 Mar 2026 18:40:21 +0000
Subject: [PATCH] aarch64/vvc: Optimisations of put_chroma_h() functions for
 10/12-bit

RPi4:
put_chroma_h_10_2x2_c:                                  63.4 ( 1.00x)
put_chroma_h_10_4x4_c:                                 151.4 ( 1.00x)
put_chroma_h_10_8x8_c:                                 555.1 ( 1.00x)
put_chroma_h_10_8x8_neon:                              113.9 ( 4.88x)
put_chroma_h_10_16x16_c:                              1068.5 ( 1.00x)
put_chroma_h_10_16x16_neon:                            439.4 ( 2.43x)
put_chroma_h_10_32x32_c:                              3432.6 ( 1.00x)
put_chroma_h_10_32x32_neon:                           1878.3 ( 1.83x)
put_chroma_h_10_64x64_c:                             12872.2 ( 1.00x)
put_chroma_h_10_64x64_neon:                           7868.2 ( 1.64x)
put_chroma_h_10_128x128_c:                           45612.2 ( 1.00x)
put_chroma_h_10_128x128_neon:                        28742.1 ( 1.59x)
put_chroma_h_12_2x2_c:                                  63.7 ( 1.00x)
put_chroma_h_12_4x4_c:                                 151.5 ( 1.00x)
put_chroma_h_12_8x8_c:                                 555.2 ( 1.00x)
put_chroma_h_12_8x8_neon:                              114.2 ( 4.86x)
put_chroma_h_12_16x16_c:                              1068.1 ( 1.00x)
put_chroma_h_12_16x16_neon:                            438.8 ( 2.43x)
put_chroma_h_12_32x32_c:                              3419.7 ( 1.00x)
put_chroma_h_12_32x32_neon:                           1878.7 ( 1.82x)
put_chroma_h_12_64x64_c:                             12862.2 ( 1.00x)
put_chroma_h_12_64x64_neon:                           7868.2 ( 1.63x)
put_chroma_h_12_128x128_c:                           45613.5 ( 1.00x)
put_chroma_h_12_128x128_neon:                        28743.3 ( 1.59x)

Apple M4:
put_chroma_h_10_2x2_c:                                   2.5 ( 1.00x)
put_chroma_h_10_4x4_c:                                   6.5 ( 1.00x)
put_chroma_h_10_8x8_c:                                  17.8 ( 1.00x)
put_chroma_h_10_8x8_neon:                                6.8 ( 2.60x)
put_chroma_h_10_16x16_c:                                53.3 ( 1.00x)
put_chroma_h_10_16x16_neon:                             30.4 ( 1.75x)
put_chroma_h_10_32x32_c:                               181.8 ( 1.00x)
put_chroma_h_10_32x32_neon:                            116.2 ( 1.56x)
put_chroma_h_10_64x64_c:                               684.2 ( 1.00x)
put_chroma_h_10_64x64_neon:                            470.3 ( 1.45x)
put_chroma_h_10_128x128_c:                            2567.6 ( 1.00x)
put_chroma_h_10_128x128_neon:                         1879.3 ( 1.37x)
put_chroma_h_12_2x2_c:                                   1.9 ( 1.00x)
put_chroma_h_12_4x4_c:                                   7.0 ( 1.00x)
put_chroma_h_12_8x8_c:                                  16.8 ( 1.00x)
put_chroma_h_12_8x8_neon:                                7.9 ( 2.12x)
put_chroma_h_12_16x16_c:                                55.0 ( 1.00x)
put_chroma_h_12_16x16_neon:                             29.0 ( 1.90x)
put_chroma_h_12_32x32_c:                               182.5 ( 1.00x)
put_chroma_h_12_32x32_neon:                            116.9 ( 1.56x)
put_chroma_h_12_64x64_c:                               666.8 ( 1.00x)
put_chroma_h_12_64x64_neon:                            474.5 ( 1.41x)
put_chroma_h_12_128x128_c:                            2588.1 ( 1.00x)
put_chroma_h_12_128x128_neon:                         1912.2 ( 1.35x)
---
 libavcodec/aarch64/vvc/dsp_init.c |  27 ++++++
 libavcodec/aarch64/vvc/inter.S    | 131 ++++++++++++++++++++++++++++++
 2 files changed, 158 insertions(+)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c 
b/libavcodec/aarch64/vvc/dsp_init.c
index bc2677945e..d09dd36a43 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -43,6 +43,19 @@ void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t 
*_src, const ptrdif
 void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
                                    const int height, const int8_t *hf, const 
int8_t *vf, const int width);
 
+void ff_vvc_put_chroma_h8_10_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const 
int8_t *vf, const int width);
+void ff_vvc_put_chroma_h16_10_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const 
int8_t *vf, const int width);
+void ff_vvc_put_chroma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const 
int8_t *vf, const int width);
+void ff_vvc_put_chroma_h8_12_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const 
int8_t *vf, const int width);
+void ff_vvc_put_chroma_h16_12_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const 
int8_t *vf, const int width);
+void ff_vvc_put_chroma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const 
int8_t *vf, const int width);
+
 void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
                                 const int height, const int8_t *hf, const 
int8_t *vf, const int width);
 void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
@@ -274,6 +287,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const 
int bd)
         c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
+
+        c->inter.put[1][2][0][1] = ff_vvc_put_chroma_h8_10_neon;
+        c->inter.put[1][3][0][1] = ff_vvc_put_chroma_h16_10_neon;
+        c->inter.put[1][4][0][1] =
+        c->inter.put[1][5][0][1] =
+        c->inter.put[1][6][0][1] = ff_vvc_put_chroma_h_x16_10_neon;
+
         c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_10_neon;
         c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_10_neon;
         c->inter.put[0][4][0][1] =
@@ -297,6 +317,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const 
int bd)
         c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
+
+        c->inter.put[1][2][0][1] = ff_vvc_put_chroma_h8_12_neon;
+        c->inter.put[1][3][0][1] = ff_vvc_put_chroma_h16_12_neon;
+        c->inter.put[1][4][0][1] =
+        c->inter.put[1][5][0][1] =
+        c->inter.put[1][6][0][1] = ff_vvc_put_chroma_h_x16_12_neon;
+
         c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_12_neon;
         c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_12_neon;
         c->inter.put[0][4][0][1] =
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 887e456a66..752405e79b 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -1833,6 +1833,137 @@ function ff_vvc_put_luma_h_x16_12_neon, export=1
         put_luma_h_x16_xx_neon 4
 endfunc
 
+.macro put_chroma_h_x8_horizontal_filter shift
+        // 4-tap horizontal filter producing 8 outputs; the four taps are expected in v0.h[0..3]
+        // input window: 8 samples in v20.8h plus the low 6 bytes of v21, which are shifted into v1.8h,v2.8h,v3.8h
+        // v24.4h & v25.4h are output vectors to store
+        ext             v1.16b, v20.16b, v21.16b, #2    // window advanced by 1 sample (2 bytes)
+        ext             v2.16b, v20.16b, v21.16b, #4    // window advanced by 2 samples
+        ext             v3.16b, v20.16b, v21.16b, #6    // window advanced by 3 samples
+        smull           v24.4s, v20.4h, v0.h[0]         // tap 0, outputs 0-3, widened to 32 bits
+        smull2          v25.4s, v20.8h, v0.h[0]         // tap 0, outputs 4-7
+        smlal           v24.4s, v1.4h, v0.h[1]          // accumulate tap 1, outputs 0-3
+        smlal2          v25.4s, v1.8h, v0.h[1]          // accumulate tap 1, outputs 4-7
+        smlal           v24.4s, v2.4h, v0.h[2]          // accumulate tap 2, outputs 0-3
+        smlal2          v25.4s, v2.8h, v0.h[2]          // accumulate tap 2, outputs 4-7
+        smlal           v24.4s, v3.4h, v0.h[3]          // accumulate tap 3, outputs 0-3
+        smlal2          v25.4s, v3.8h, v0.h[3]          // accumulate tap 3, outputs 4-7
+        sqshrn          v24.4h, v24.4s, #(\shift)       // shift right, narrow to 16 bits with signed saturation
+        sqshrn          v25.4h, v25.4s, #(\shift)       // same for outputs 4-7
+.endm
+
+.macro put_chroma_h8_xx_neon shift
+        // dst         .req x0
+        // _src        .req x1
+        // _src_stride .req x2
+        // height      .req x3
+        // hf          .req x4
+        // vf          .req x5
+        // width       .req x6
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // x9 = dst advance per row, in bytes
+        ldr             s0, [x4]                        // load the 4 filter tap bytes from hf
+        sub             x1, x1, #2                      // start one 16-bit sample before _src
+        sub             x2, x2, #16                     // the first load of each row already advances x1 by 16
+        sxtl            v0.8h, v0.8b                    // sign-extend the taps to 16 bits
+1:
+        ld1             {v20.8h}, [x1], #16             // 8 samples
+        ld1             {v21.4h}, [x1], x2              // 4 more samples, then move x1 to the next row
+        put_chroma_h_x8_horizontal_filter \shift
+        subs            w3, w3, #1                      // one row done
+        st1             {v24.4h, v25.4h}, [x0], x9      // store 8 results (16 bytes), move dst to the next row
+        b.gt            1b                              // loop while rows remain
+        ret
+.endm
+
+.macro put_chroma_h16_xx_neon shift
+        // dst         .req x0
+        // _src        .req x1
+        // _src_stride .req x2
+        // height      .req x3
+        // hf          .req x4
+        // vf          .req x5
+        // width       .req x6
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // dst advance per row, in bytes
+        ldr             s0, [x4]                        // load the 4 filter tap bytes from hf
+        sub             x9, x9, #16                     // the first store of each row already advances dst by 16
+        sub             x1, x1, #2                      // start one 16-bit sample before _src
+        sub             x2, x2, #32                     // the first load of each row already advances x1 by 32
+        sxtl            v0.8h, v0.8b                    // sign-extend the taps to 16 bits
+1:
+        ld1             {v20.8h, v21.8h}, [x1], #32     // 16 samples
+        ld1             {v22.4h}, [x1], x2              // 4 more samples, then move x1 to the next row
+        put_chroma_h_x8_horizontal_filter \shift        // outputs 0-7 from v20/v21
+        mov             v20.16b, v21.16b                // slide the input window by 8 samples
+        mov             v21.16b, v22.16b
+        st1             {v24.4h, v25.4h}, [x0], #16     // store outputs 0-7
+        put_chroma_h_x8_horizontal_filter \shift        // outputs 8-15 from the slid window
+        subs            w3, w3, #1                      // one row done
+        st1             {v24.4h, v25.4h}, [x0], x9      // store outputs 8-15, move dst to the next row
+        b.gt            1b                              // loop while rows remain
+        ret
+.endm
+
+.macro put_chroma_h_x16_xx_neon shift
+        // dst         .req x0
+        // _src        .req x1
+        // _src_stride .req x2
+        // height      .req x3
+        // hf          .req x4
+        // vf          .req x5
+        // width       .req x6
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // dst advance per row, in bytes
+        ldr             s0, [x4]                        // load the 4 filter tap bytes from hf
+        sub             x9, x9, w6, uxtw #1             // minus width * 2 bytes stored by the inner loop (assumes width % 16 == 0)
+        sub             x2, x2, w6, uxtw #1             // minus width * 2 bytes loaded by the inner loop (assumes width % 16 == 0)
+        sxtl            v0.8h, v0.8b                    // sign-extend the taps to 16 bits
+        sub             x1, x1, #2                      // start one 16-bit sample before _src
+        sub             x2, x2, #16                     // minus the 16 bytes loaded before the inner loop
+1:
+        ld1             {v20.8h}, [x1], #16             // prime the window with the first 8 samples of the row
+        mov             w8, w6                          // w8 = outputs left in this row
+2:
+        ld1             {v21.8h, v22.8h}, [x1], #32     // next 16 samples
+        put_chroma_h_x8_horizontal_filter \shift        // 8 outputs from v20/v21
+        mov             v20.16b, v21.16b                // slide the input window by 8 samples
+        mov             v21.16b, v22.16b
+        st1             {v24.4h, v25.4h}, [x0], #16     // store the first 8 outputs of this iteration
+        put_chroma_h_x8_horizontal_filter \shift        // next 8 outputs from the slid window
+        mov             v20.16b, v21.16b                // carry the last 8 loaded samples into the next iteration
+        subs            w8, w8, #16                     // 16 outputs per iteration
+        st1             {v24.4h, v25.4h}, [x0], #16     // store the second 8 outputs
+        b.gt            2b                              // loop while outputs remain in this row
+        subs            w3, w3, #1                      // one row done
+        add             x0, x0, x9                      // move dst to the start of the next row
+        add             x1, x1, x2                      // move src to the start of the next row
+        b.gt            1b                              // loop while rows remain
+        ret
+.endm
+
+
+function ff_vvc_put_chroma_h8_10_neon, export=1
+        put_chroma_h8_xx_neon 2                 // 8-wide body; filter sums are shifted right by 2
+endfunc
+
+function ff_vvc_put_chroma_h8_12_neon, export=1
+        put_chroma_h8_xx_neon 4                 // 8-wide body; filter sums are shifted right by 4
+endfunc
+
+function ff_vvc_put_chroma_h16_10_neon, export=1
+        put_chroma_h16_xx_neon 2                // 16-wide body; filter sums are shifted right by 2
+endfunc
+
+function ff_vvc_put_chroma_h16_12_neon, export=1
+        put_chroma_h16_xx_neon 4                // 16-wide body; filter sums are shifted right by 4
+endfunc
+
+function ff_vvc_put_chroma_h_x16_10_neon, export=1
+        put_chroma_h_x16_xx_neon 2              // width-looping body; filter sums are shifted right by 2
+endfunc
+
+function ff_vvc_put_chroma_h_x16_12_neon, export=1
+        put_chroma_h_x16_xx_neon 4              // width-looping body; filter sums are shifted right by 4
+endfunc
+
 .macro put_luma_v4_xx_neon shift
         mov             x9, #(VVC_MAX_PB_SIZE * 2)
         sub             x1, x1, x2, lsl #1
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to