PR #22349 opened by hezuoqiang
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22349
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22349.patch

Benchmark Results (Raspberry Pi 5 - Cortex-A76):
pred8x8_horizontal_8_c: 15.8 ( 1.00x)
pred8x8_horizontal_8_neon: 13.1 ( 1.21x)
pred8x8_vertical_8_c: 11.5 ( 1.00x)
pred8x8_vertical_8_neon: 11.3 ( 1.02x)


From e2ad9ed8f4f3ed1802ddb5c2e75bd53dbf0c17b8 Mon Sep 17 00:00:00 2001
From: Zuoqiang He <[email protected]>
Date: Mon, 2 Mar 2026 22:53:18 +0800
Subject: [PATCH] aarch64/h264pred: Unroll loops in pred8x8 vert/hor NEON

Benchmark Results (Raspberry Pi 5 - Cortex-A76):
pred8x8_horizontal_8_c:                                 15.8 ( 1.00x)
pred8x8_horizontal_8_neon:                              13.1 ( 1.21x)
pred8x8_vertical_8_c:                                   11.5 ( 1.00x)
pred8x8_vertical_8_neon:                                11.3 ( 1.02x)
---
 libavcodec/aarch64/h264pred_init.c |  2 --
 libavcodec/aarch64/h264pred_neon.S | 28 +++++++++++++++++++++-------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index 9819f85f7d..afd2836569 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -82,10 +82,8 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
 {
     if (bit_depth == 8) {
         if (chroma_format_idc <= 1) {
-#if ENABLE_INEFFICIENT_ASM
             h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
             h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
-#endif
             if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
                 h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
 #if ENABLE_INEFFICIENT_ASM
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index 795d2ce540..70370735fc 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -173,11 +173,22 @@ endconst
 
 function ff_pred8x8_hor_neon, export=1
         sub             x2,  x0,  #1
-        mov             w3,  #8
-1:      ld1r            {v0.8b},  [x2], x1
-        subs            w3,  w3,  #1
+        ld1r            {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        ld1r            {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        ld1r            {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        ld1r            {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        ld1r            {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        ld1r            {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        ld1r            {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        ld1r            {v0.8b},  [x2], x1
         st1             {v0.8b},  [x0], x1
-        b.ne            1b
         ret
 endfunc
 
@@ -185,11 +196,14 @@ function ff_pred8x8_vert_neon, export=1
         sub             x2,  x0,  x1
         lsl             x1,  x1,  #1
         ld1             {v0.8b},  [x2], x1
-        mov             w3,  #4
-1:      subs            w3,  w3,  #1
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x2], x1
-        b.ne            1b
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x2], x1
         ret
 endfunc
 
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to