PR #21661 opened by Jun Zhao (mypopydev)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21661
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21661.patch
Together, these commits finalize the ARM64 NEON optimization for HEVC qpel_uni_w motion compensation by adding support for widths 6, 12, 24, 32, and 48. The first commit implements vertical filtering using loop unrolling and decomposition, while the second adds bidirectional (HV) filtering with helper functions and tail-call optimizations. >From fdc8f3d530380078f3b68de62a5ba8f45c3e28f9 Mon Sep 17 00:00:00 2001 From: Jun Zhao <[email protected]> Date: Tue, 3 Feb 2026 11:05:58 +0800 Subject: [PATCH 1/2] lavc/hevc: add aarch64 NEON for qpel uni-weighted vertical filter Add NEON-optimized implementations for HEVC QPEL uni-weighted vertical interpolation (put_hevc_qpel_uni_w_v) at 8-bit depth. These functions perform weighted uni-directional prediction with vertical QPEL filtering: - 8-tap vertical QPEL filter - Weighted prediction: (filter_result * wx + offset) >> shift Previously only sizes 4, 8, 16, 64 were optimized. This patch adds optimized implementations for all remaining sizes: 6, 12, 24, 32, 48. Performance results on Apple M4: ./tests/checkasm/checkasm --test=hevc_pel --bench put_hevc_qpel_uni_w_v6_8_neon: 3.40x put_hevc_qpel_uni_w_v12_8_neon: 3.24x put_hevc_qpel_uni_w_v24_8_neon: 3.06x put_hevc_qpel_uni_w_v32_8_neon: 2.66x put_hevc_qpel_uni_w_v48_8_neon: 2.67x Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/h26x/dsp.h | 3 +- libavcodec/aarch64/h26x/qpel_neon.S | 522 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +- 3 files changed, 525 insertions(+), 2 deletions(-) diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h index 6c91004301..fb82b114c4 100644 --- a/libavcodec/aarch64/h26x/dsp.h +++ b/libavcodec/aarch64/h26x/dsp.h @@ -84,6 +84,7 @@ NEON8_FNPROTO_PARTIAL_6(qpel_bi, (uint8_t *_dst, ptrdiff_t _dststride, const uin void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \ void ff_hevc_put_hevc_##fn##64_8_neon##ext args + NEON8_FNPROTO(pel_pixels, (int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width),); @@ -143,7 +144,7 @@ NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width),); -NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride, +NEON8_FNPROTO(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width),); diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S index 7901fedaf3..7f995f3a33 100644 --- a/libavcodec/aarch64/h26x/qpel_neon.S +++ b/libavcodec/aarch64/h26x/qpel_neon.S @@ -2220,6 +2220,91 @@ function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1 ret endfunc +// Store 6 bytes: 4 bytes + 2 bytes, then advance dst pointer by dststride +.macro QPEL_UNI_W_V_6 + smull v24.4s, v26.4h, v30.4h + smull2 v25.4s, v26.8h, v30.8h + sqrshl v24.4s, v24.4s, v31.4s + sqrshl v25.4s, v25.4s, v31.4s + sqadd v24.4s, v24.4s, v29.4s + sqadd v25.4s, v25.4s, v29.4s + sqxtn v24.4h, v24.4s + sqxtn2 v24.8h, v25.4s + sqxtun v24.8b, v24.8h + st1 {v24.s}[0], [x0], #4 + st1 {v24.h}[2], [x0] + sub x0, x0, #4 + add x0, x0, x1 +.endm + +function ff_hevc_put_hevc_qpel_uni_w_v6_8_neon, export=1 + QPEL_UNI_W_V_HEADER + ldr d16, [x2] + ldr d17, [x2, x3] + add x2, x2, x3, lsl #1 + ldr d18, [x2] + ldr d19, [x2, x3] + add x2, x2, x3, lsl #1 + ldr d20, [x2] + ldr d21, [x2, x3] + add x2, x2, x3, lsl #1 + ldr d22, [x2] + +1: ldr d23, [x2, x3] + add x2, x2, 
x3, lsl #1 + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_UNI_W_V_6 + subs w4, w4, #1 + b.eq 2f + + ldr d16, [x2] + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_UNI_W_V_6 + subs w4, w4, #1 + b.eq 2f + + ldr d17, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_UNI_W_V_6 + subs w4, w4, #1 + b.eq 2f + + ldr d18, [x2] + QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_UNI_W_V_6 + subs w4, w4, #1 + b.eq 2f + + ldr d19, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_UNI_W_V_6 + subs w4, w4, #1 + b.eq 2f + + ldr d20, [x2] + QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_UNI_W_V_6 + subs w4, w4, #1 + b.eq 2f + + ldr d21, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_UNI_W_V_6 + subs w4, w4, #1 + b.eq 2f + + ldr d22, [x2] + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_UNI_W_V_6 + subs w4, w4, #1 + b.ne 1b +2: + ret +endfunc + .macro QPEL_UNI_W_V_16 smull v24.4s, v26.4h, v30.4h smull2 v25.4s, v26.8h, v30.8h @@ -2318,6 +2403,104 @@ function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1 ret endfunc +// Store 12 bytes: 8 bytes + 4 bytes, then advance dst pointer by dststride +.macro QPEL_UNI_W_V_12 + smull v24.4s, v26.4h, v30.4h + smull2 v25.4s, v26.8h, v30.8h + smull v26.4s, v27.4h, v30.4h + sqrshl v24.4s, v24.4s, v31.4s + sqrshl v25.4s, v25.4s, v31.4s + sqrshl v26.4s, v26.4s, v31.4s + sqadd v24.4s, v24.4s, v29.4s + sqadd v25.4s, v25.4s, v29.4s + sqadd v26.4s, v26.4s, v29.4s + sqxtn v24.4h, v24.4s + sqxtn2 v24.8h, v25.4s + sqxtn v26.4h, v26.4s + sqxtun v24.8b, v24.8h + sqxtun v26.8b, v26.8h + st1 {v24.d}[0], [x0], #8 + st1 {v26.s}[0], [x0] + sub x0, x0, #8 + add x0, x0, x1 +.endm + +function ff_hevc_put_hevc_qpel_uni_w_v12_8_neon, export=1 + QPEL_UNI_W_V_HEADER + ldr q16, [x2] + ldr q17, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q18, [x2] + ldr q19, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q20, [x2] + ldr q21, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q22, [x2] + +1: ldr q23, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_B2 v27, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_UNI_W_V_12 + subs w4, w4, #1 + b.eq 2f + + ldr q16, [x2] + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_B2 v27, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_UNI_W_V_12 + subs w4, w4, #1 + b.eq 2f + + ldr q17, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_B2 v27, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_UNI_W_V_12 + subs w4, w4, #1 + b.eq 2f + + ldr q18, [x2] + QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_B2 v27, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_UNI_W_V_12 + subs w4, w4, #1 + b.eq 2f + + ldr q19, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_B2 v27, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_UNI_W_V_12 + subs w4, w4, #1 + b.eq 2f + + ldr q20, [x2] + QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_B2 v27, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_UNI_W_V_12 + subs w4, w4, #1 + b.eq 2f + + ldr q21, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_B2 v27, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_UNI_W_V_12 + subs w4, w4, #1 + b.eq 2f + + 
ldr q22, [x2] + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_B2 v27, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_UNI_W_V_12 + subs w4, w4, #1 + b.ne 1b +2: + ret +endfunc + function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1 QPEL_UNI_W_V_HEADER ldur w13, [sp, #16] @@ -2408,6 +2591,345 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1 ret endfunc +// Store 24 bytes: process as 16 + 8 in a loop +function ff_hevc_put_hevc_qpel_uni_w_v24_8_neon, export=1 + QPEL_UNI_W_V_HEADER + mov w13, #24 // width + mov x14, x0 + mov x15, x2 + mov w11, w4 + +3: + cmp w13, #16 + b.le 4f + // Process 16 bytes + ldr q16, [x2] + ldr q17, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q18, [x2] + ldr q19, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q20, [x2] + ldr q21, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q22, [x2] + +1: ldr q23, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_B2 v27, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q16, [x2] + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_B2 v27, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q17, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_B2 v27, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q18, [x2] + QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_B2 v27, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q19, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_B2 v27, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q20, [x2] + QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_B2 v27, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q21, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_B2 v27, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q22, [x2] + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_B2 v27, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.ne 1b +2: + subs w13, w13, #16 + add x14, x14, #16 + add x15, x15, #16 + mov x0, x14 + mov x2, x15 + mov w4, w11 + b.hi 3b + ret + +4: // Process remaining 8 bytes + ldr d16, [x2] + ldr d17, [x2, x3] + add x2, x2, x3, lsl #1 + ldr d18, [x2] + ldr d19, [x2, x3] + add x2, x2, x3, lsl #1 + ldr d20, [x2] + ldr d21, [x2, x3] + add x2, x2, x3, lsl #1 + ldr d22, [x2] + +5: ldr d23, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_UNI_W_V_8 + subs w4, w4, #1 + b.eq 6f + + ldr d16, [x2] + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_UNI_W_V_8 + subs w4, w4, #1 + b.eq 6f + + ldr d17, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_UNI_W_V_8 + subs w4, w4, #1 + b.eq 6f + + ldr d18, [x2] + QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_UNI_W_V_8 + subs w4, w4, #1 + b.eq 6f + + ldr d19, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_UNI_W_V_8 + subs w4, w4, #1 + b.eq 6f + + ldr d20, [x2] + QPEL_FILTER_B v26, v21, v22, v23, 
v16, v17, v18, v19, v20 + QPEL_UNI_W_V_8 + subs w4, w4, #1 + b.eq 6f + + ldr d21, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_UNI_W_V_8 + subs w4, w4, #1 + b.eq 6f + + ldr d22, [x2] + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_UNI_W_V_8 + subs w4, w4, #1 + b.ne 5b +6: + ret +endfunc + +// v32: process as two 16-byte columns +function ff_hevc_put_hevc_qpel_uni_w_v32_8_neon, export=1 + QPEL_UNI_W_V_HEADER + mov w13, #32 // width + mov x14, x0 + mov x15, x2 + mov w11, w4 + +3: + ldr q16, [x2] + ldr q17, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q18, [x2] + ldr q19, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q20, [x2] + ldr q21, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q22, [x2] + +1: ldr q23, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_B2 v27, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q16, [x2] + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_B2 v27, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q17, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_B2 v27, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q18, [x2] + QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_B2 v27, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q19, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_B2 v27, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q20, [x2] + QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_B2 v27, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q21, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_B2 v27, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q22, [x2] + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_B2 v27, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.ne 1b +2: + subs w13, w13, #16 + add x14, x14, #16 + add x15, x15, #16 + mov x0, x14 + mov x2, x15 + mov w4, w11 + b.hi 3b + ret +endfunc + +// v48: process as three 16-byte columns +function ff_hevc_put_hevc_qpel_uni_w_v48_8_neon, export=1 + QPEL_UNI_W_V_HEADER + mov w13, #48 // width + mov x14, x0 + mov x15, x2 + mov w11, w4 + +3: + ldr q16, [x2] + ldr q17, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q18, [x2] + ldr q19, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q20, [x2] + ldr q21, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q22, [x2] + +1: ldr q23, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_B2 v27, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q16, [x2] + QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_B2 v27, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q17, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_B2 v27, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q18, [x2] + QPEL_FILTER_B v26, v19, v20, v21, 
v22, v23, v16, v17, v18 + QPEL_FILTER_B2 v27, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q19, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_B2 v27, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q20, [x2] + QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_B2 v27, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q21, [x2, x3] + add x2, x2, x3, lsl #1 + QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_B2 v27, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.eq 2f + + ldr q22, [x2] + QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_B2 v27, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_UNI_W_V_16 + subs w4, w4, #1 + b.ne 1b +2: + subs w13, w13, #16 + add x14, x14, #16 + add x15, x15, #16 + mov x0, x14 + mov x2, x15 + mov w4, w11 + b.hi 3b + ret +endfunc + function hevc_put_hevc_qpel_uni_hv4_8_end_neon mov x9, #(HEVC_MAX_PB_SIZE * 2) load_qpel_filterh x6, x5 diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 8ff7f632af..30560bafb9 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -274,7 +274,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,); NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,); NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,); - NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,); + NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,); NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel, 0, 1, epel_h,); NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h,); -- 2.52.0 >From 78477437dc773129b62ed56f4851df6850ee0346 Mon Sep 17 00:00:00 2001 From: Jun Zhao <[email protected]> Date: Tue, 3 Feb 2026 13:30:25 +0800 Subject: [PATCH 2/2] lavc/hevc: add aarch64 NEON for qpel uni-weighted HV filter Add NEON-optimized implementations for HEVC QPEL uni-directional weighted HV interpolation (put_hevc_qpel_uni_w_hv) at 8-bit depth, for block widths 6, 12, 24, and 48. These functions perform horizontal then vertical 8-tap QPEL filtering with weighting (wx, ox, denom) and output to uint8_t. Previously only widths 4, 8, 16, 32, 64 were implemented; this completes coverage for all standard HEVC block widths. 
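Both commit messages come down to the same per-pixel operation: an 8-tap QPEL filter followed by uni-directional weighting, (filter_result * wx + offset) >> shift, plus the offset ox. As a reading aid, a rough scalar model of the vertical case at 8-bit depth follows; the helper name, the tap indexing and the shift/offset derivation (shift = denom + 6, offset = 1 << (shift - 1)) are paraphrased from the HEVC weighted-prediction formula rather than copied from FFmpeg's C template, so treat this as a sketch of what the NEON routines compute, not as the reference implementation.

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

void qpel_uni_w_v_scalar(uint8_t *dst, ptrdiff_t dststride,
                         const uint8_t *src, ptrdiff_t srcstride,
                         int height, int width,
                         const int8_t filter[8], /* 8-tap vertical QPEL filter */
                         int denom, int wx, int ox)
{
    const int shift  = denom + 6;        /* denom + 14 - bit_depth, at 8 bit */
    const int offset = 1 << (shift - 1);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int acc = 0;
            for (int t = 0; t < 8; t++)  /* taps cover rows y-3 .. y+4 */
                acc += filter[t] * src[x + (ptrdiff_t)(t - 3) * srcstride];
            /* weighted prediction: (filter_result * wx + offset) >> shift, then + ox */
            dst[x] = clip_u8(((acc * wx + offset) >> shift) + ox);
        }
        src += srcstride;
        dst += dststride;
    }
}

The HV variants differ only in that the input to this weighting step is an intermediate buffer produced by a horizontal 8-tap pass (shifted down by 6 before weighting), which is why the hvXX end routines below read their rows from the stack rather than from the source picture.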
Performance results on Apple M4: ./tests/checkasm/checkasm --test=hevc_pel --bench put_hevc_qpel_uni_w_hv6_8_neon: 3.11x put_hevc_qpel_uni_w_hv12_8_neon: 3.19x put_hevc_qpel_uni_w_hv24_8_neon: 2.26x put_hevc_qpel_uni_w_hv48_8_neon: 1.57x Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/h26x/dsp.h | 4 +- libavcodec/aarch64/h26x/qpel_neon.S | 414 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +- 3 files changed, 418 insertions(+), 4 deletions(-) diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h index fb82b114c4..1583d39c99 100644 --- a/libavcodec/aarch64/h26x/dsp.h +++ b/libavcodec/aarch64/h26x/dsp.h @@ -223,12 +223,12 @@ NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width), _i8mm); -NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, +NEON8_FNPROTO(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width),); -NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, +NEON8_FNPROTO(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width), _i8mm); diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S index 7f995f3a33..03ab42ba3b 100644 --- a/libavcodec/aarch64/h26x/qpel_neon.S +++ b/libavcodec/aarch64/h26x/qpel_neon.S @@ -552,6 +552,144 @@ function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1 ret mx endfunc +// h24: process 24 pixels per row +// Strategy: Process 3 blocks of 8 pixels (0-7, 8-15, 16-23) inline. +function ff_hevc_put_hevc_\type\()_h24_8_neon, export=1 + load_filter mx + sxtw height, heightw +.ifc \type, qpel_bi + ldrh w8, [sp] // width + mov x16, #(HEVC_MAX_PB_SIZE << 2) // src2bstridel + lsl x17, height, #7 // src2b reset + add x15, x4, #(HEVC_MAX_PB_SIZE << 1) // src2b +.endif + sub src, src, #3 + mov mx, x30 +.ifc \type, qpel + mov dststride, #(HEVC_MAX_PB_SIZE << 1) + lsl x13, srcstride, #1 // srcstridel + mov x14, #((HEVC_MAX_PB_SIZE << 2) - 24) +.else + lsl x14, dststride, #1 // dststridel + lsl x13, srcstride, #1 // srcstridel + sub x14, x14, #16 +.endif + add x10, dst, dststride // dstb + add x12, src, srcstride // srcb +0: + cmp heightw, #2 + b.lt 2f + // Load 32 bytes for row1 and row2 (need 24+7=31) + ld1 {v16.8b-v19.8b}, [src], x13 + ld1 {v20.8b-v23.8b}, [x12], x13 + + // Extend all loaded data to 16-bit + // Row 1 (v16-v19 -> v24-v27, v28-v31) + uxtl v24.8h, v16.8b + uxtl2 v25.8h, v16.16b + uxtl v26.8h, v17.8b + uxtl2 v27.8h, v17.16b + uxtl v28.8h, v18.8b + uxtl2 v29.8h, v18.16b + uxtl v30.8h, v19.8b + uxtl2 v31.8h, v19.16b + + // Row 2 (v20-v23 -> v16-v19, v20-v23) + // Note: Reusing low registers v16-v23 for Row 2 extended data + // We need to save Row 1 results temporarily if we overwrite, + // or process Row 1 fully then Row 2. + // Since we have plenty of regs, let's keep Row 1 in v24-v31 + // and put Row 2 in v16-v23. 
+ uxtl v16.8h, v20.8b + uxtl2 v17.8h, v20.16b + uxtl v18.8h, v21.8b + uxtl2 v19.8h, v21.16b + uxtl v20.8h, v22.8b + uxtl2 v21.8h, v22.16b + uxtl v22.8h, v23.8b + uxtl2 v23.8h, v23.16b + + // Filter Row 1 Block 1 (0-7) using v24, v25 + mul v0.8h, v24.8h, v0.h[0] +.irpc i, 1234567 + ext v4.16b, v24.16b, v25.16b, #(2*\i) + mla v0.8h, v4.8h, v0.h[\i] +.endr + // Filter Row 1 Block 2 (8-15) using v26, v27 + mul v1.8h, v26.8h, v0.h[0] +.irpc i, 1234567 + ext v4.16b, v26.16b, v27.16b, #(2*\i) + mla v1.8h, v4.8h, v0.h[\i] +.endr + // Filter Row 1 Block 3 (16-23) using v28, v29 + mul v2.8h, v28.8h, v0.h[0] +.irpc i, 1234567 + ext v4.16b, v28.16b, v29.16b, #(2*\i) + mla v2.8h, v4.8h, v0.h[\i] +.endr + + // Filter Row 2 Block 1 (0-7) using v16, v17 + mul v3.8h, v16.8h, v0.h[0] +.irpc i, 1234567 + ext v4.16b, v16.16b, v17.16b, #(2*\i) + mla v3.8h, v4.8h, v0.h[\i] +.endr + // Filter Row 2 Block 2 (8-15) using v18, v19 + mul v4.8h, v18.8h, v0.h[0] // reuse v4 as dest, filter src in v18/v19 +.irpc i, 1234567 + ext v5.16b, v18.16b, v19.16b, #(2*\i) + mla v4.8h, v5.8h, v0.h[\i] +.endr + // Filter Row 2 Block 3 (16-23) using v20, v21 + mul v5.8h, v20.8h, v0.h[0] +.irpc i, 1234567 + ext v6.16b, v20.16b, v21.16b, #(2*\i) + mla v5.8h, v6.8h, v0.h[\i] +.endr + + subs heightw, heightw, #2 +.ifc \type, qpel + // Store results + st1 {v0.8h, v1.8h, v2.8h}, [dst], x14 + st1 {v3.8h, v4.8h, v5.8h}, [x10], x14 +.else +.ifc \type, qpel_bi + // Load src2 and add + ld1 {v6.8h, v7.8h, v8.8h}, [x4], x16 + sqadd v0.8h, v0.8h, v6.8h + sqadd v1.8h, v1.8h, v7.8h + sqadd v2.8h, v2.8h, v8.8h + ld1 {v6.8h, v7.8h, v8.8h}, [x15], x16 + sqadd v3.8h, v3.8h, v6.8h + sqadd v4.8h, v4.8h, v7.8h + sqadd v5.8h, v5.8h, v8.8h + sub x4, x4, #48 + sub x15, x15, #48 + + sqrshrun v0.8b, v0.8h, #7 + sqrshrun v1.8b, v1.8h, #7 + sqrshrun v2.8b, v2.8h, #7 + sqrshrun v3.8b, v3.8h, #7 + sqrshrun v4.8b, v4.8h, #7 + sqrshrun v5.8b, v5.8h, #7 + st1 {v0.8b, v1.8b, v2.8b}, [dst], x14 + st1 {v3.8b, v4.8b, v5.8b}, [x10], x14 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v1.8h, #6 + sqrshrun v2.8b, v2.8h, #6 + sqrshrun v3.8b, v3.8h, #6 + sqrshrun v4.8b, v4.8h, #6 + sqrshrun v5.8b, v5.8h, #6 + st1 {v0.8b, v1.8b, v2.8b}, [dst], x14 + st1 {v3.8b, v4.8b, v5.8b}, [x10], x14 +.endif +.endif + b.gt 0b +2: + ret mx +endfunc + .ifnc \type, qpel_bi function ff_vvc_put_\type\()_h16_8_neon, export=1 vvc_load_filter mx @@ -5176,7 +5314,14 @@ DISABLE_I8MM .if \width >= 32 mov w6, #\width bl X(ff_hevc_put_hevc_qpel_h32_8_neon) +.elseif \width == 24 +#if HAVE_I8MM + bl X(ff_hevc_put_hevc_qpel_h24_8_neon_i8mm) +#else + bl X(ff_hevc_put_hevc_qpel_h24_8_neon) +#endif .else + mov w6, #\width bl X(ff_hevc_put_hevc_qpel_h\width\()_8_\suffix) .endif .else @@ -5307,6 +5452,107 @@ function hevc_put_hevc_qpel_uni_w_hv4_8_end_neon ret endfunc +// hv6: process 6 pixels (4 + 2), use 8-element loads/stores +.macro QPEL_UNI_W_HV_6 + sshr v26.4s, v26.4s, #6 + sshr v27.4s, v27.4s, #6 + mul v24.4s, v26.4s, v28.4s + mul v25.4s, v27.4s, v28.4s + sqrshl v24.4s, v24.4s, v30.4s + sqrshl v25.4s, v25.4s, v30.4s + sqadd v24.4s, v24.4s, v29.4s + sqadd v25.4s, v25.4s, v29.4s + sqxtn v24.4h, v24.4s + sqxtn2 v24.8h, v25.4s + sqxtun v24.8b, v24.8h + st1 {v24.s}[0], [x20], #4 + st1 {v24.h}[2], [x20] + sub x20, x20, #4 + add x20, x20, x21 +.endm + +function hevc_put_hevc_qpel_uni_w_hv6_8_end_neon + ldr q16, [sp] + ldr q17, [sp, x10] + add sp, sp, x10, lsl #1 + ldr q18, [sp] + ldr q19, [sp, x10] + add sp, sp, x10, lsl #1 + ldr q20, [sp] + ldr q21, [sp, x10] + add sp, sp, x10, lsl #1 + ldr q22, [sp] + 
add sp, sp, x10 +1: + ldr q23, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_H2 v27, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_UNI_W_HV_6 + subs w22, w22, #1 + b.eq 2f + + ldr q16, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_H2 v27, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_UNI_W_HV_6 + subs w22, w22, #1 + b.eq 2f + + ldr q17, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_H2 v27, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_UNI_W_HV_6 + subs w22, w22, #1 + b.eq 2f + + ldr q18, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_H2 v27, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_UNI_W_HV_6 + subs w22, w22, #1 + b.eq 2f + + ldr q19, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_H2 v27, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_UNI_W_HV_6 + subs w22, w22, #1 + b.eq 2f + + ldr q20, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_H2 v27, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_UNI_W_HV_6 + subs w22, w22, #1 + b.eq 2f + + ldr q21, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_H2 v27, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_UNI_W_HV_6 + subs w22, w22, #1 + b.eq 2f + + ldr q22, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_H2 v27, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_UNI_W_HV_6 + subs w22, w22, #1 + b.hi 1b + +2: + QPEL_UNI_W_HV_END + ret +endfunc + .macro QPEL_UNI_W_HV_8 sshr v26.4s, v26.4s, #6 sshr v27.4s, v27.4s, #6 @@ -5404,6 +5650,144 @@ function hevc_put_hevc_qpel_uni_w_hv8_8_end_neon ret endfunc +// hv12: process 12 pixels = 8 + 4 +// Input: v16-v23 hold first 8 elements per row (q registers) +// v1-v7,v31 hold elements 8-11 per row (d registers, only .4h used) +// Output: 12 bytes to [x20], advance by x21 +.macro QPEL_UNI_W_HV_12 + sshr v24.4s, v24.4s, #6 + sshr v25.4s, v25.4s, #6 + sshr v26.4s, v26.4s, #6 + mul v24.4s, v24.4s, v28.4s + mul v25.4s, v25.4s, v28.4s + mul v26.4s, v26.4s, v28.4s + sqrshl v24.4s, v24.4s, v30.4s + sqrshl v25.4s, v25.4s, v30.4s + sqrshl v26.4s, v26.4s, v30.4s + sqadd v24.4s, v24.4s, v29.4s + sqadd v25.4s, v25.4s, v29.4s + sqadd v26.4s, v26.4s, v29.4s + sqxtn v24.4h, v24.4s + sqxtn2 v24.8h, v25.4s + sqxtn v26.4h, v26.4s + sqxtun v24.8b, v24.8h + sqxtun v26.8b, v26.8h + st1 {v24.d}[0], [x20], #8 + st1 {v26.s}[0], [x20] + sub x20, x20, #8 + add x20, x20, x21 +.endm + +function hevc_put_hevc_qpel_uni_w_hv12_8_end_neon + // Load first 7 rows of 12 elements each + // Each row: q16-q22 (first 8 elements) + d1-d7 (elements 8-11) + ldr q16, [sp] + ldr d1, [sp, #16] + add sp, sp, x10 + ldr q17, [sp] + ldr d2, [sp, #16] + add sp, sp, x10 + ldr q18, [sp] + ldr d3, [sp, #16] + add sp, sp, x10 + ldr q19, [sp] + ldr d4, [sp, #16] + add sp, sp, x10 + ldr q20, [sp] + ldr d5, [sp, #16] + add sp, sp, x10 + ldr q21, [sp] + ldr d6, [sp, #16] + add sp, sp, x10 + ldr q22, [sp] + ldr d7, [sp, #16] + add sp, sp, x10 +1: + ldr q23, [sp] + ldr d31, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31 + QPEL_UNI_W_HV_12 + subs w22, w22, #1 + b.eq 2f + + ldr q16, [sp] + ldr d1, [sp, #16] + add sp, sp, x10 + 
QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1 + QPEL_UNI_W_HV_12 + subs w22, w22, #1 + b.eq 2f + + ldr q17, [sp] + ldr d2, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2 + QPEL_UNI_W_HV_12 + subs w22, w22, #1 + b.eq 2f + + ldr q18, [sp] + ldr d3, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3 + QPEL_UNI_W_HV_12 + subs w22, w22, #1 + b.eq 2f + + ldr q19, [sp] + ldr d4, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4 + QPEL_UNI_W_HV_12 + subs w22, w22, #1 + b.eq 2f + + ldr q20, [sp] + ldr d5, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5 + QPEL_UNI_W_HV_12 + subs w22, w22, #1 + b.eq 2f + + ldr q21, [sp] + ldr d6, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6 + QPEL_UNI_W_HV_12 + subs w22, w22, #1 + b.eq 2f + + ldr q22, [sp] + ldr d7, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7 + QPEL_UNI_W_HV_12 + subs w22, w22, #1 + b.hi 1b + +2: + QPEL_UNI_W_HV_END + ret +endfunc + .macro QPEL_UNI_W_HV_16 sshr v24.4s, v24.4s, #6 sshr v25.4s, v25.4s, #6 @@ -5536,11 +5920,21 @@ function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon add x11, x14, #32 add x20, x13, #16 mov w22, w12 + cmp w27, #8 + b.eq .Lqpel_uni_w_hv24_tail mov x14, x11 mov x13, x20 b.hi 3b QPEL_UNI_W_HV_END ret + +// hv24 tail: process remaining 8 columns (16-23) via tail-call to hv8 +// sp is set to the start of tail data. +// This reuses the stack frame setup by ff_hevc_put_hevc_qpel_uni_w_hv24_8_neon. +// hv8 will restore the original lr and return to the caller. 
+.Lqpel_uni_w_hv24_tail: + mov sp, x11 + b hevc_put_hevc_qpel_uni_w_hv8_8_end_neon endfunc .macro qpel_uni_w_hv suffix @@ -5549,11 +5943,26 @@ function ff_hevc_put_hevc_qpel_uni_w_hv4_8_\suffix, export=1 b hevc_put_hevc_qpel_uni_w_hv4_8_end_neon endfunc +function ff_hevc_put_hevc_qpel_uni_w_hv6_8_\suffix, export=1 + QPEL_UNI_W_HV_HEADER 6, \suffix + b hevc_put_hevc_qpel_uni_w_hv6_8_end_neon +endfunc + function ff_hevc_put_hevc_qpel_uni_w_hv8_8_\suffix, export=1 QPEL_UNI_W_HV_HEADER 8, \suffix b hevc_put_hevc_qpel_uni_w_hv8_8_end_neon endfunc +function ff_hevc_put_hevc_qpel_uni_w_hv12_8_\suffix, export=1 + QPEL_UNI_W_HV_HEADER 12, \suffix + b hevc_put_hevc_qpel_uni_w_hv12_8_end_neon +endfunc + +function ff_hevc_put_hevc_qpel_uni_w_hv24_8_\suffix, export=1 + QPEL_UNI_W_HV_HEADER 24, \suffix + b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon +endfunc + function ff_hevc_put_hevc_qpel_uni_w_hv16_8_\suffix, export=1 QPEL_UNI_W_HV_HEADER 16, \suffix b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon @@ -5564,6 +5973,11 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_\suffix, export=1 b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon endfunc +function ff_hevc_put_hevc_qpel_uni_w_hv48_8_\suffix, export=1 + QPEL_UNI_W_HV_HEADER 48, \suffix + b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon +endfunc + function ff_hevc_put_hevc_qpel_uni_w_hv64_8_\suffix, export=1 QPEL_UNI_W_HV_HEADER 64, \suffix b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 30560bafb9..b8448c24eb 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -288,7 +288,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv,); NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv,); - NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv,); + NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv,); NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv,); if (have_i8mm(cpu_flags)) { @@ -302,7 +302,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm); NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, _i8mm); NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm); - NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm); + NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm); NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv, _i8mm); } -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
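The wider sizes in both patches reuse the narrower kernels rather than adding new ones: v24 runs the 16-wide inner loop and then an 8-wide pass, v32 and v48 run the 16-wide loop over two and three column strips (x14/x15 keep the strip base pointers and w11 the saved row count), and hv24 tail-calls the 8-wide end routine for columns 16-23. The sketch below restates that decomposition in C on top of the scalar model given earlier in this mail; qpel_uni_w_v24_sketch and the repeated prototype are illustrative names, not FFmpeg symbols.

#include <stddef.h>
#include <stdint.h>

/* Scalar model from earlier in this mail (illustrative helper, not an FFmpeg symbol). */
void qpel_uni_w_v_scalar(uint8_t *dst, ptrdiff_t dststride,
                         const uint8_t *src, ptrdiff_t srcstride,
                         int height, int width, const int8_t filter[8],
                         int denom, int wx, int ox);

/* 24 = 16 + 8: the same inner loop is run on two column strips, with dst and
 * src advanced by the strip width; 32 and 48 are two and three 16-wide strips. */
static void qpel_uni_w_v24_sketch(uint8_t *dst, ptrdiff_t dststride,
                                  const uint8_t *src, ptrdiff_t srcstride,
                                  int height, const int8_t filter[8],
                                  int denom, int wx, int ox)
{
    qpel_uni_w_v_scalar(dst,      dststride, src,      srcstride,
                        height, 16, filter, denom, wx, ox);
    qpel_uni_w_v_scalar(dst + 16, dststride, src + 16, srcstride,
                        height,  8, filter, denom, wx, ox);
}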
