This is an automated email from the git hooks/post-receive script. A commit was pushed to branch master in the ffmpeg repository.
commit 60b372c934ad404c58eb2545a0e186a474d09337 Author: Jun Zhao <[email protected]> AuthorDate: Fri Mar 6 17:56:14 2026 +0800 Commit: Jun Zhao <[email protected]> CommitDate: Mon Mar 30 14:32:10 2026 +0000 lavc/hevc: add aarch64 NEON for DC prediction Add NEON-optimized implementation for HEVC intra DC prediction at 8-bit depth, supporting all block sizes (4x4 to 32x32). DC prediction computes the average of top and left reference samples using uaddlv, with urshr for rounded division. For luma blocks smaller than 32x32, edge smoothing is applied: the first row and column are blended toward the reference using (ref[i] + 3*dc + 2) >> 2 computed entirely in the NEON domain. Fill stores use pre-computed address patterns to break dependency chains. Also adds the aarch64 initialization framework (Makefile, pred.c/pred.h hooks, hevcpred_init_aarch64.c). Speedup over C on Apple M4 (checkasm --bench): 4x4: 2.28x 8x8: 3.14x 16x16: 3.29x 32x32: 3.02x Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/hevcpred_init_aarch64.c | 74 ++++++ libavcodec/aarch64/hevcpred_neon.S | 407 +++++++++++++++++++++++++++++ libavcodec/hevc/pred.c | 3 + libavcodec/hevc/pred.h | 1 + 5 files changed, 487 insertions(+) diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 41ab0257b3..085376ecd6 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -78,6 +78,8 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \ aarch64/hevcdsp_dequant_neon.o \ aarch64/hevcdsp_idct_neon.o \ aarch64/hevcdsp_init_aarch64.o \ + aarch64/hevcpred_neon.o \ + aarch64/hevcpred_init_aarch64.o \ aarch64/h26x/epel_neon.o \ aarch64/h26x/qpel_neon.o \ aarch64/h26x/sao_neon.o diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c b/libavcodec/aarch64/hevcpred_init_aarch64.c new file mode 100644 index 0000000000..db4029a161 --- /dev/null +++ b/libavcodec/aarch64/hevcpred_init_aarch64.c @@ -0,0 +1,74 @@ +/* + * HEVC 
Intra Prediction NEON initialization + * + * Copyright (c) 2026 Jun Zhao <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/hevc/pred.h" + +// DC prediction +void ff_hevc_pred_dc_4x4_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx); +void ff_hevc_pred_dc_8x8_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx); +void ff_hevc_pred_dc_16x16_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx); +void ff_hevc_pred_dc_32x32_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx); + +static void pred_dc_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int log2_size, int c_idx) +{ + switch (log2_size) { + case 2: + ff_hevc_pred_dc_4x4_8_neon(src, top, left, stride, c_idx); + break; + case 3: + ff_hevc_pred_dc_8x8_8_neon(src, top, left, stride, c_idx); + break; + case 4: + ff_hevc_pred_dc_16x16_8_neon(src, top, left, stride, c_idx); + break; + case 5: + ff_hevc_pred_dc_32x32_8_neon(src, top, left, stride, c_idx); + 
break; + default: + av_unreachable("log2_size must be 2, 3, 4 or 5"); + } +} + +av_cold void ff_hevc_pred_init_aarch64(HEVCPredContext *hpc, int bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + + if (!have_neon(cpu_flags)) + return; + + if (bit_depth == 8) { + hpc->pred_dc = pred_dc_neon; + } +} diff --git a/libavcodec/aarch64/hevcpred_neon.S b/libavcodec/aarch64/hevcpred_neon.S new file mode 100644 index 0000000000..cd1508bb6c --- /dev/null +++ b/libavcodec/aarch64/hevcpred_neon.S @@ -0,0 +1,407 @@ +/* + * HEVC Intra Prediction NEON optimizations + * + * Copyright (c) 2026 Jun Zhao <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+/* HEVC Intra Prediction NEON functions
+ *
+ * Internal NEON function signatures — the C dispatch wrappers in
+ * hevcpred_init_aarch64.c handle log2_size-based dispatch, so these
+ * per-size functions do not take log2_size themselves:
+ *
+ * pred_dc_NxN:        void (uint8_t *src, const uint8_t *top,
+ *                           const uint8_t *left, ptrdiff_t stride, int c_idx)
+ *
+ * NOTE(review): only the pred_dc_NxN entry points are added by this patch.
+ * The signatures below describe planned planar/angular additions (modes 10
+ * and 26 would accept log2_size since they share one entry point per mode);
+ * they are listed here for reference only — confirm against the tree before
+ * relying on them:
+ *
+ * pred_planar_NxN:    void (uint8_t *src, const uint8_t *top,
+ *                           const uint8_t *left, ptrdiff_t stride)
+ * pred_angular_*_NxN: void (uint8_t *src, const uint8_t *top,
+ *                           const uint8_t *left, ptrdiff_t stride,
+ *                           int c_idx, int mode)
+ */
+
+// =============================================================================
+// DC Prediction
+// =============================================================================
+
+/*
+ * DC prediction algorithm:
+ * 1. dc = sum(top[0..size-1]) + sum(left[0..size-1]) + size
+ * 2. dc >>= (log2_size + 1)
+ * 3. Fill block with dc value
+ * 4.
If c_idx == 0 && size < 32: smooth edges + * - POS(0,0) = (left[0] + 2*dc + top[0] + 2) >> 2 + * - First row: (top[x] + 3*dc + 2) >> 2 + * - First col: (left[y] + 3*dc + 2) >> 2 +*/ + +// ----------------------------------------------------------------------------- +// pred_dc_4x4_8: DC prediction +// Arguments: +// x0: src +// x1: top +// x2: left +// x3: stride +// w4: c_idx +// ----------------------------------------------------------------------------- +function ff_hevc_pred_dc_4x4_8_neon, export=1 + // Load top[0..3] and left[0..3] + ldr s0, [x1] // top[0..3] + ldr s1, [x2] // left[0..3] + + // Sum using NEON + uaddlv h2, v0.8b // sum top (only 4 valid bytes) + uaddlv h3, v1.8b // sum left (only 4 valid bytes) + add v2.4h, v2.4h, v3.4h // total sum + + // Add rounding and shift by 3 (urshr = unsigned rounding shift right) + add x5, x0, x3, lsl #1 // row 2 address (early for str) + urshr v2.4h, v2.4h, #3 // (sum + 4) >> 3 + dup v2.8b, v2.b[0] // broadcast dc + + // Store 4 rows + str s2, [x0] + str s2, [x0, x3] + str s2, [x5] + str s2, [x5, x3] + + // Edge smoothing for luma only + cbnz w4, 2f + + // Compute 3*dc in NEON domain (16-bit) + uxtl v3.8h, v2.8b // widen dc to 16-bit + add v6.8h, v3.8h, v3.8h // 2*dc + add v3.8h, v6.8h, v3.8h // 3*dc + + // Widen top and left to 16-bit + uxtl v4.8h, v0.8b // top[0..3] widened + uxtl v5.8h, v1.8b // left[0..3] widened + + // Corner: (top[0] + left[0] + 2*dc + 2) >> 2 + // First row: (top[x] + 3*dc + 2) >> 2 + // First column: (left[y] + 3*dc + 2) >> 2 + add v7.4h, v4.4h, v5.4h // corner: top[x] + left[x] (only lane 0 matters) + add v4.8h, v4.8h, v3.8h // first row: top[x] + 3*dc + add v5.8h, v5.8h, v3.8h // first column: left[y] + 3*dc + add v7.4h, v7.4h, v6.4h // corner: + 2*dc + rshrn v4.8b, v4.8h, #2 // first row: (x + 2) >> 2 + rshrn v5.8b, v5.8h, #2 // first column: (x + 2) >> 2 + rshrn v7.8b, v7.8h, #2 // corner: (x + 2) >> 2 + + // Overwrite corner byte in row result + ins v4.b[0], v7.b[0] + + // Store 
smoothed first row + str s4, [x0] + + // Store smoothed column for y=1..3 + add x5, x0, x3 + add x6, x0, x3, lsl #1 + add x7, x5, x3, lsl #1 + st1 {v5.b}[1], [x5] + st1 {v5.b}[2], [x6] + st1 {v5.b}[3], [x7] + +2: ret +endfunc + +// ----------------------------------------------------------------------------- +// pred_dc_8x8_8: DC prediction +// Arguments: +// x0: src +// x1: top +// x2: left +// x3: stride +// w4: c_idx +// ----------------------------------------------------------------------------- +function ff_hevc_pred_dc_8x8_8_neon, export=1 + // Load top[0..7] and left[0..7] + ldr d0, [x1] // top[0..7] + ldr d1, [x2] // left[0..7] + + // Sum all pixels + uaddlv h2, v0.8b // sum top + uaddlv h3, v1.8b // sum left + add v2.4h, v2.4h, v3.4h // total sum + + // Add rounding and shift by 4 + urshr v2.4h, v2.4h, #4 // (sum + 8) >> 4 + dup v2.8b, v2.b[0] // broadcast dc + + // Check if edge smoothing needed (luma only) + cbnz w4, 2f + + // === Luma path: fill + edge smoothing combined === + + // Compute 3*dc in NEON domain (16-bit) + uxtl v3.8h, v2.8b // widen dc to 16-bit + add v6.8h, v3.8h, v3.8h // 2*dc + add v3.8h, v6.8h, v3.8h // 3*dc + + // Widen top and left to 16-bit + uxtl v4.8h, v0.8b + uxtl v5.8h, v1.8b + + // Corner: (top[0] + left[0] + 2*dc + 2) >> 2 + // Smoothed first row: (top[x] + 3*dc + 2) >> 2 + // Smoothed column: (left[y] + 3*dc + 2) >> 2 + add v7.4h, v4.4h, v5.4h // corner: top[x] + left[x] (only lane 0 matters) + add v4.8h, v4.8h, v3.8h // first row: top[x] + 3*dc + add v5.8h, v5.8h, v3.8h // column: left[y] + 3*dc + add v7.4h, v7.4h, v6.4h // corner: + 2*dc + rshrn v4.8b, v4.8h, #2 // first row: (x + 2) >> 2 + rshrn v5.8b, v5.8h, #2 // column: (x + 2) >> 2 + rshrn v7.8b, v7.8h, #2 // corner: (x + 2) >> 2 + // Overwrite corner byte + ins v4.b[0], v7.b[0] + + // Store row 0 (smoothed) + str d4, [x0] + + // Store DC fill for rows 1-7 with pre-computed addresses + add x15, x0, x3, lsl #1 + str d2, [x0, x3] + add x5, x0, x3, lsl #2 + str d2, [x15] 
+ str d2, [x15, x3] + str d2, [x5] + add x15, x5, x3, lsl #1 + str d2, [x5, x3] + str d2, [x15] + str d2, [x15, x3] + + // Scatter-store column bytes with pre-computed addresses + add x5, x0, x3 + add x6, x0, x3, lsl #1 + add x7, x5, x3, lsl #1 + add x8, x0, x3, lsl #2 + st1 {v5.b}[1], [x5] + st1 {v5.b}[2], [x6] + st1 {v5.b}[3], [x7] + st1 {v5.b}[4], [x8] + add x5, x8, x3 + add x6, x8, x3, lsl #1 + add x7, x5, x3, lsl #1 + st1 {v5.b}[5], [x5] + st1 {v5.b}[6], [x6] + st1 {v5.b}[7], [x7] + ret + +2: // === Chroma path: plain DC fill === + str d2, [x0] + add x15, x0, x3, lsl #1 + str d2, [x0, x3] + add x0, x0, x3, lsl #2 + str d2, [x15] + str d2, [x15, x3] + str d2, [x0] + add x15, x0, x3, lsl #1 + str d2, [x0, x3] + str d2, [x15] + str d2, [x15, x3] + ret +endfunc + +// ----------------------------------------------------------------------------- +// pred_dc_16x16_8: DC prediction +// Arguments: +// x0: src +// x1: top +// x2: left +// x3: stride +// w4: c_idx +// ----------------------------------------------------------------------------- +function ff_hevc_pred_dc_16x16_8_neon, export=1 + // Load top[0..15] and left[0..15] + ldr q0, [x1] // top[0..15] + ldr q1, [x2] // left[0..15] + + // Sum all pixels + uaddlv h2, v0.16b // sum top + uaddlv h3, v1.16b // sum left + add v2.4h, v2.4h, v3.4h + + // Add rounding and shift by 5 + urshr v2.4h, v2.4h, #5 // (sum + 16) >> 5 + dup v2.16b, v2.b[0] // broadcast dc + + // Check if edge smoothing needed (luma only) + cbnz w4, 2f + + // === Luma path: fill + edge smoothing combined === + + // Compute 3*dc in NEON domain (16-bit) + uxtl v3.8h, v2.8b // widen dc to 16-bit + add v6.8h, v3.8h, v3.8h // 2*dc + add v3.8h, v6.8h, v3.8h // 3*dc + + // Widen top to 16-bit + uxtl v4.8h, v0.8b + uxtl2 v5.8h, v0.16b + + // Corner: (top[0] + left[0] + 2*dc + 2) >> 2 + // Smoothed first row: (top[x] + 3*dc + 2) >> 2 + uxtl v7.8h, v1.8b // widen left[0..7] (reuse for corner lane 0) + add v16.4h, v4.4h, v7.4h // corner: top[x] + left[x] (only 
lane 0 matters) + add v4.8h, v4.8h, v3.8h // first row lo: top[x] + 3*dc + add v5.8h, v5.8h, v3.8h // first row hi: top[x] + 3*dc + add v16.4h, v16.4h, v6.4h // corner: + 2*dc + rshrn v4.8b, v4.8h, #2 // first row lo: >> 2 + rshrn2 v4.16b, v5.8h, #2 // first row hi: >> 2 (smoothed first row) + rshrn v16.8b, v16.8h, #2 // corner: >> 2 + // Overwrite corner byte + ins v4.b[0], v16.b[0] + + // Smoothed column: (left[y] + 3*dc + 2) >> 2 + uxtl v5.8h, v1.8b + uxtl2 v6.8h, v1.16b + add v5.8h, v5.8h, v3.8h + add v6.8h, v6.8h, v3.8h + rshrn v5.8b, v5.8h, #2 + rshrn2 v5.16b, v6.8h, #2 // smoothed column values + + // Store row 0 (smoothed) + str q4, [x0] + + // Store DC fill for all 15 remaining rows + add x15, x0, x3, lsl #1 + str q2, [x0, x3] // row 1 + add x5, x0, x3, lsl #2 + str q2, [x15] // row 2 + str q2, [x15, x3] // row 3 + str q2, [x5] // row 4 + add x15, x5, x3, lsl #1 + str q2, [x5, x3] // row 5 + add x5, x5, x3, lsl #2 + str q2, [x15] // row 6 + str q2, [x15, x3] // row 7 + str q2, [x5] // row 8 + add x15, x5, x3, lsl #1 + str q2, [x5, x3] // row 9 + add x5, x5, x3, lsl #2 + str q2, [x15] // row 10 + str q2, [x15, x3] // row 11 + str q2, [x5] // row 12 + add x15, x5, x3, lsl #1 + str q2, [x5, x3] // row 13 + str q2, [x15] // row 14 + str q2, [x15, x3] // row 15 + + // Now scatter-store column bytes over the DC fill + add x5, x0, x3 + add x6, x0, x3, lsl #1 + add x7, x5, x3, lsl #1 + add x8, x0, x3, lsl #2 + st1 {v5.b}[1], [x5] + st1 {v5.b}[2], [x6] + st1 {v5.b}[3], [x7] + st1 {v5.b}[4], [x8] + add x5, x8, x3 + add x6, x8, x3, lsl #1 + add x7, x5, x3, lsl #1 + add x9, x8, x3, lsl #2 + st1 {v5.b}[5], [x5] + st1 {v5.b}[6], [x6] + st1 {v5.b}[7], [x7] + st1 {v5.b}[8], [x9] + add x5, x9, x3 + add x6, x9, x3, lsl #1 + add x7, x5, x3, lsl #1 + add x8, x9, x3, lsl #2 + st1 {v5.b}[9], [x5] + st1 {v5.b}[10], [x6] + st1 {v5.b}[11], [x7] + st1 {v5.b}[12], [x8] + add x5, x8, x3 + add x6, x8, x3, lsl #1 + add x7, x5, x3, lsl #1 + st1 {v5.b}[13], [x5] + st1 {v5.b}[14], [x6] + 
st1 {v5.b}[15], [x7] + ret + +2: // === Chroma path: plain DC fill === + str q2, [x0] // row 0 + add x15, x0, x3, lsl #1 + str q2, [x0, x3] // row 1 + add x5, x0, x3, lsl #2 + str q2, [x15] // row 2 + str q2, [x15, x3] // row 3 + str q2, [x5] // row 4 + add x15, x5, x3, lsl #1 + str q2, [x5, x3] // row 5 + add x5, x5, x3, lsl #2 + str q2, [x15] // row 6 + str q2, [x15, x3] // row 7 + str q2, [x5] // row 8 + add x15, x5, x3, lsl #1 + str q2, [x5, x3] // row 9 + add x5, x5, x3, lsl #2 + str q2, [x15] // row 10 + str q2, [x15, x3] // row 11 + str q2, [x5] // row 12 + add x15, x5, x3, lsl #1 + str q2, [x5, x3] // row 13 + str q2, [x15] // row 14 + str q2, [x15, x3] // row 15 + ret +endfunc + +// ----------------------------------------------------------------------------- +// pred_dc_32x32_8: DC prediction (no edge smoothing) +// Arguments: +// x0: src +// x1: top +// x2: left +// x3: stride +// w4: c_idx +// ----------------------------------------------------------------------------- +function ff_hevc_pred_dc_32x32_8_neon, export=1 + // Load top[0..31] and left[0..31] + ldp q0, q1, [x1] // top[0..31] + ldp q2, q3, [x2] // left[0..31] + + // Sum all pixels + uaddlv h0, v0.16b + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v0.4h, v0.4h, v1.4h + add v2.4h, v2.4h, v3.4h + add v0.4h, v0.4h, v2.4h + + // Add rounding and shift by 6 (urshr = unsigned rounding shift right) + urshr v0.4h, v0.4h, #6 + dup v0.16b, v0.b[0] + mov v1.16b, v0.16b + + // Store 32 rows + mov w6, #32 +2: + subs w6, w6, #1 + stp q0, q1, [x0] + add x0, x0, x3 + b.ne 2b + + // No edge smoothing for 32x32 (size >= 32) + ret +endfunc diff --git a/libavcodec/hevc/pred.c b/libavcodec/hevc/pred.c index 8d588382fa..88306c23c4 100644 --- a/libavcodec/hevc/pred.c +++ b/libavcodec/hevc/pred.c @@ -75,6 +75,9 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) break; } +#if ARCH_AARCH64 + ff_hevc_pred_init_aarch64(hpc, bit_depth); +#endif #if ARCH_MIPS ff_hevc_pred_init_mips(hpc, 
bit_depth); #endif diff --git a/libavcodec/hevc/pred.h b/libavcodec/hevc/pred.h index 1ac8f9666b..c4bd72b1a3 100644 --- a/libavcodec/hevc/pred.h +++ b/libavcodec/hevc/pred.h @@ -44,5 +44,6 @@ typedef struct HEVCPredContext { void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); void ff_hevc_pred_init_mips(HEVCPredContext *hpc, int bit_depth); +void ff_hevc_pred_init_aarch64(HEVCPredContext *hpc, int bit_depth); #endif /* AVCODEC_HEVC_PRED_H */ _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
