PR #20517 opened by welder URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20517 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20517.patch
Nothing spectacular, merged a few adds and shifts into rounding shifts. From 7809ff9746abf83bc41c1f13d9e1b2f1da6b0fb9 Mon Sep 17 00:00:00 2001 From: Krzysztof Pyrkosz <[email protected]> Date: Fri, 5 Sep 2025 19:52:11 +0200 Subject: [PATCH] avcodec/aarch64/vvc: Optimize dmvr_hv_10 Before and after on A53: dmvr_hv_10_12x20_neon: 1838.2 ( 3.02x) dmvr_hv_10_20x12_neon: 1330.2 ( 1.83x) dmvr_hv_10_20x20_neon: 2148.2 ( 1.85x) dmvr_hv_12_12x20_neon: 1839.2 ( 3.02x) dmvr_hv_12_20x12_neon: 1330.6 ( 1.83x) dmvr_hv_12_20x20_neon: 2147.2 ( 1.85x) dmvr_hv_10_12x20_neon: 1755.0 ( 3.17x) dmvr_hv_10_20x12_neon: 1165.8 ( 2.09x) dmvr_hv_10_20x20_neon: 1876.1 ( 2.12x) dmvr_hv_12_12x20_neon: 1754.4 ( 3.17x) dmvr_hv_12_20x12_neon: 1167.8 ( 2.09x) dmvr_hv_12_20x20_neon: 1878.8 ( 2.12x) --- libavcodec/aarch64/vvc/inter.S | 58 ++++++++++------------------------ 1 file changed, 17 insertions(+), 41 deletions(-) diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index 01d2ff155c..79ff720cdd 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -599,18 +599,13 @@ function ff_vvc_dmvr_hv_8_neon, export=1 endfunc function ff_vvc_dmvr_hv_12_neon, export=1 - movi v29.4s, #(12 - 6) - movi v30.4s, #(1 << (12 - 7)) // offset1 + mvni v29.4s, #(12 - 6 - 1) b 0f endfunc function ff_vvc_dmvr_hv_10_neon, export=1 - movi v29.4s, #(10 - 6) - movi v30.4s, #(1 << (10 - 7)) // offset1 + mvni v29.4s, #(10 - 6 - 1) 0: - movi v31.4s, #8 // offset2 - neg v29.4s, v29.4s - sub sp, sp, #(VVC_MAX_PB_SIZE * 4) movrel x9, X(ff_vvc_inter_luma_dmvr_filters) @@ -626,7 +621,6 @@ function ff_vvc_dmvr_hv_10_neon, export=1 add x12, x9, my, lsl #1 ldrb w10, [x12] ldrb w11, [x12, #1] - sxtw x6, w6 dup v2.8h, w10 // filter_y[0] dup v3.8h, w11 // filter_y[1] @@ -635,7 +629,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1 mov w10, #0 // start filter_y or not add height, height, #1 sub dst, dst, #(VVC_MAX_PB_SIZE * 2) - sub src_stride, src_stride, x6, lsl #1 + sub src_stride, 
src_stride, w6, sxtw #1 cset w15, gt // width > 16 1: mov x12, tmp0 @@ -656,14 +650,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1 umlal v18.4s, v17.4h, v1.4h umlal2 v19.4s, v17.8h, v1.8h - add v4.4s, v4.4s, v30.4s - add v5.4s, v5.4s, v30.4s - add v18.4s, v18.4s, v30.4s - add v19.4s, v19.4s, v30.4s - ushl v4.4s, v4.4s, v29.4s - ushl v5.4s, v5.4s, v29.4s - ushl v18.4s, v18.4s, v29.4s - ushl v19.4s, v19.4s, v29.4s + urshl v4.4s, v4.4s, v29.4s + urshl v5.4s, v5.4s, v29.4s + urshl v18.4s, v18.4s, v29.4s + urshl v19.4s, v19.4s, v29.4s uqxtn v6.4h, v4.4s uqxtn2 v6.8h, v5.4s uqxtn v7.4h, v18.4s @@ -681,18 +671,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1 umlal2 v18.4s, v6.8h, v3.8h umlal v19.4s, v7.4h, v3.4h umlal2 v20.4s, v7.8h, v3.8h - add v17.4s, v17.4s, v31.4s - add v18.4s, v18.4s, v31.4s - add v19.4s, v19.4s, v31.4s - add v20.4s, v20.4s, v31.4s - ushr v17.4s, v17.4s, #4 - ushr v18.4s, v18.4s, #4 - ushr v19.4s, v19.4s, #4 - ushr v20.4s, v20.4s, #4 - uqxtn v6.4h, v17.4s - uqxtn2 v6.8h, v18.4s - uqxtn v7.4h, v19.4s - uqxtn2 v7.8h, v20.4s + uqrshrn v6.4h, v17.4s, #4 + uqrshrn2 v6.8h, v18.4s, #4 + uqrshrn v7.4h, v19.4s, #4 + uqrshrn2 v7.8h, v20.4s, #4 stp q6, q7, [x14], #32 b 3f 2: @@ -704,10 +686,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1 umlal v4.4s, v7.4h, v1.4h umlal2 v5.4s, v7.8h, v1.8h - add v4.4s, v4.4s, v30.4s - add v5.4s, v5.4s, v30.4s - ushl v4.4s, v4.4s, v29.4s - ushl v5.4s, v5.4s, v29.4s + urshl v4.4s, v4.4s, v29.4s + urshl v5.4s, v5.4s, v29.4s uqxtn v6.4h, v4.4s uqxtn2 v6.8h, v5.4s str q6, [x13], #16 @@ -719,10 +699,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1 umull2 v18.4s, v16.8h, v2.8h umlal v17.4s, v6.4h, v3.4h umlal2 v18.4s, v6.8h, v3.8h - add v17.4s, v17.4s, v31.4s - add v18.4s, v18.4s, v31.4s - ushr v17.4s, v17.4s, #4 - ushr v18.4s, v18.4s, #4 + urshr v17.4s, v17.4s, #4 + urshr v18.4s, v18.4s, #4 uqxtn v16.4h, v17.4s uqxtn2 v16.8h, v18.4s str q16, [x14], #16 @@ -731,8 +709,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1 ldr d6, [src], #8 umull 
v4.4s, v7.4h, v1.4h umlal v4.4s, v6.4h, v0.4h - add v4.4s, v4.4s, v30.4s - ushl v4.4s, v4.4s, v29.4s + urshl v4.4s, v4.4s, v29.4s uqxtn v6.4h, v4.4s str d6, [x13], #8 @@ -741,8 +718,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1 ldr d16, [x12], #8 umull v17.4s, v16.4h, v2.4h umlal v17.4s, v6.4h, v3.4h - add v17.4s, v17.4s, v31.4s - ushr v17.4s, v17.4s, #4 + urshr v17.4s, v17.4s, #4 uqxtn v16.4h, v17.4s str d16, [x14], #8 4: -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
