PR #22349 opened by hezuoqiang URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22349 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22349.patch
This patch unrolls the eight-row loops in ff_pred8x8_hor_neon and ff_pred8x8_vert_neon and enables both functions unconditionally by dropping their ENABLE_INEFFICIENT_ASM guard. Benchmark Results (Raspberry Pi 5 - Cortex-A76): pred8x8_horizontal_8_c: 15.8 ( 1.00x) pred8x8_horizontal_8_neon: 13.1 ( 1.21x) pred8x8_vertical_8_c: 11.5 ( 1.00x) pred8x8_vertical_8_neon: 11.3 ( 1.02x) From e2ad9ed8f4f3ed1802ddb5c2e75bd53dbf0c17b8 Mon Sep 17 00:00:00 2001 From: Zuoqiang He <[email protected]> Date: Mon, 2 Mar 2026 22:53:18 +0800 Subject: [PATCH] aarch64/h264pred: Unroll loops in pred8x8 vert/hor NEON Benchmark Results (Raspberry Pi 5 - Cortex-A76): pred8x8_horizontal_8_c: 15.8 ( 1.00x) pred8x8_horizontal_8_neon: 13.1 ( 1.21x) pred8x8_vertical_8_c: 11.5 ( 1.00x) pred8x8_vertical_8_neon: 11.3 ( 1.02x) --- libavcodec/aarch64/h264pred_init.c | 2 -- libavcodec/aarch64/h264pred_neon.S | 28 +++++++++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c index 9819f85f7d..afd2836569 100644 --- a/libavcodec/aarch64/h264pred_init.c +++ b/libavcodec/aarch64/h264pred_init.c @@ -82,10 +82,8 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, { if (bit_depth == 8) { if (chroma_format_idc <= 1) { -#if ENABLE_INEFFICIENT_ASM h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon; h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon; -#endif if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon; #if ENABLE_INEFFICIENT_ASM diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S index 795d2ce540..70370735fc 100644 --- a/libavcodec/aarch64/h264pred_neon.S +++ b/libavcodec/aarch64/h264pred_neon.S @@ -173,11 +173,22 @@ endconst function ff_pred8x8_hor_neon, export=1 sub x2, x0, #1 - mov w3, #8 -1: ld1r {v0.8b}, [x2], x1 - subs w3, w3, #1 + ld1r {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0], x1 + ld1r {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0], x1 + ld1r {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0], x1 + ld1r {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0], x1 + ld1r {v0.8b}, [x2], x1 + st1 {v0.8b}, 
[x0], x1 + ld1r {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0], x1 + ld1r {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0], x1 + ld1r {v0.8b}, [x2], x1 st1 {v0.8b}, [x0], x1 - b.ne 1b ret endfunc @@ -185,11 +196,14 @@ function ff_pred8x8_vert_neon, export=1 sub x2, x0, x1 lsl x1, x1, #1 ld1 {v0.8b}, [x2], x1 - mov w3, #4 -1: subs w3, w3, #1 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x2], x1 - b.ne 1b + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x2], x1 ret endfunc -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
