[libav-devel] [PATCH 04/11] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

Martin Storsjö Wed, 23 Nov 2016 05:01:24 -0800

This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:


vp9_inv_dct_dct_16x16_sub16_add_neon:   3189.0   2486.8   2509.9   1964.1
vp9_inv_dct_dct_32x32_sub32_add_neon:  18448.1  16682.0  14235.4  11993.4

By skipping individual 4x16 or 4x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon:     271.5    188.7    211.6    235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:    2079.7   1606.3   1772.1   1264.8
vp9_inv_dct_dct_16x16_sub8_add_neon:    2449.2   1834.3   2046.5   1499.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   2826.2   2109.2   2295.9   1758.2
vp9_inv_dct_dct_16x16_sub16_add_neon:   3224.1   2476.5   2533.1   1985.7
vp9_inv_dct_dct_32x32_sub1_add_neon:     752.5    457.5    863.7    554.7
vp9_inv_dct_dct_32x32_sub4_add_neon:   10689.2   8013.4   8592.9   6785.9
vp9_inv_dct_dct_32x32_sub8_add_neon:   12217.8   9068.1   9420.4   7518.3
vp9_inv_dct_dct_32x32_sub12_add_neon:  12967.3  10455.5  10223.9   8275.7
vp9_inv_dct_dct_32x32_sub16_add_neon:  14084.1  11933.7  10998.9   9012.5
vp9_inv_dct_dct_32x32_sub20_add_neon:  15171.4  13335.0  11820.6   9757.2
vp9_inv_dct_dct_32x32_sub24_add_neon:  16229.6  15185.7  12614.4  10504.9
vp9_inv_dct_dct_32x32_sub28_add_neon:  17338.1  15955.3  13445.0  11248.4
vp9_inv_dct_dct_32x32_sub32_add_neon:  18465.7  16974.6  14239.2  11999.1

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.

In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
8x8 or 16x16 subpartitions respectively.
---
This goes on top of the checkasm vp9dsp patch that adds benchmarking
of generic subpartitions in the itxfm.
---
 libavcodec/arm/vp9itxfm_neon.S | 70 ++++++++++++++++++++++++++++++++++++------
 tests/checkasm/vp9dsp.c        |  6 ++--
 2 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 01944bd..769579a 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -659,10 +659,17 @@ endfunc
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
 @ transpose into a horizontal 16x4 slice and store.
 @ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = slice offset
 @ r2 = src
-@ r3 = slice offset
+@ r3 = eob
+@ r9 = min eob
 function \txfm\()16_1d_4x16_pass1_neon
+.ifc \txfm,idct
+        @ Check if this whole input slice is zero
+        cmp             r3,  r9
+        ble             2f
+.endif
+
         mov             r12, #32
         vmov.s16        q2, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon
         transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
 
         @ Store the transposed 4x4 blocks horizontally.
-        cmp             r3,  #12
+        cmp             r1,  #12
         beq             1f
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
         vst1.16         {d\i}, [r0,:64]!
 .endr
         bx              lr
 1:
-        @ Special case: For the last input column (r3 == 12),
+        @ Special case: For the last input column (r1 == 12),
         @ which would be stored as the last row in the temp buffer,
         @ don't store the first 4x4 block, but keep it in registers
         @ for the first slice of the second pass (where it is the
@@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon
         vmov            d30, d18
         vmov            d31, d19
         bx              lr
+
+.ifc \txfm,idct
+2:
+        @ Set d28-d31 to zero, for the in-register passthrough of coefficients to pass 2
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+        @ Write zeros to the temp buffer for pass 2
+.rept 4
+        vst1.16         {q14-q15}, [r0,:128]!
+.endr
+        bx              lr
+.endif
 endfunc
 
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -781,15 +800,23 @@ endfunc
 itxfm16_1d_funcs idct
 itxfm16_1d_funcs iadst
 
+@ This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+        .short  0, 10, 38, 89
+endconst
+
 .macro itxfm_func16x16 txfm1, txfm2
 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
         cmp             r3,  #1
         beq             idct16x16_dc_add_neon
 .endif
-        push            {r4-r7,lr}
+        push            {r4-r9,lr}
 .ifnc \txfm1\()_\txfm2,idct_idct
         vpush           {q4-q7}
+        mov             r9,  #0
+.else
+        movrel          r8,  min_eob_idct_idct_16
 .endif
 
         @ Align the stack, allocate a temp buffer
@@ -810,8 +837,11 @@ A       and             r7,  sp,  #15
 
 .irp i, 0, 4, 8, 12
         add             r0,  sp,  #(\i*32)
+        mov             r1,  #\i
         add             r2,  r6,  #(\i*2)
-        mov             r3,  #\i
+.ifc \txfm1\()_\txfm2,idct_idct
+        ldrh            r9,  [r8, #(\i/2)]
+.endif
         bl              \txfm1\()16_1d_4x16_pass1_neon
 .endr
 .ifc \txfm2,idct
@@ -830,7 +860,7 @@ A       and             r7,  sp,  #15
 .ifnc \txfm1\()_\txfm2,idct_idct
         vpop            {q4-q7}
 .endif
-        pop             {r4-r7,pc}
+        pop             {r4-r9,pc}
 endfunc
 .endm
 
@@ -944,9 +974,14 @@ endfunc
 @ each output written twice), followed by a separate 16-point IDCT
 @ of the odd inputs, added/subtracted onto the outputs of the first idct16.
 @ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = min eob
 @ r2 = src
+@ r3 = eob
 function idct32_1d_4x32_pass1_neon
+        @ Check if this whole input slice is zero
+        cmp             r3,  r1
+        ble             1f
+
         movrel          r12, idct_coeffs
         vld1.16         {q0-q1}, [r12,:128]
 
@@ -1023,6 +1058,15 @@ function idct32_1d_4x32_pass1_neon
         store_rev       28, 24, 20, 16
 .purgem store_rev
         bx              lr
+
+1:
+        @ Write zeros to the temp buffer for pass 2
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+.rept 8
+        vst1.16         {q14-q15}, [r0,:128]!
+.endr
+        bx              lr
 endfunc
 .ltorg
 
@@ -1110,11 +1154,16 @@ function idct32_1d_4x32_pass2_neon
         bx              lr
 endfunc
 
+const min_eob_idct_idct_32, align=4
+        .short  0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
 function ff_vp9_idct_idct_32x32_add_neon, export=1
         cmp             r3,  #1
         beq             idct32x32_dc_add_neon
-        push            {r4-r7,lr}
+        push            {r4-r8,lr}
         vpush           {q4-q7}
+        movrel          r8,  min_eob_idct_idct_32
 
         @ Align the stack, allocate a temp buffer
 T       mov             r7,  sp
@@ -1129,6 +1178,7 @@ A       and             r7,  sp,  #15
 
 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
         add             r0,  sp,  #(\i*64)
+        ldrh            r1,  [r8, #(\i/2)]
         add             r2,  r6,  #(\i*2)
         bl              idct32_1d_4x32_pass1_neon
 .endr
@@ -1141,5 +1191,5 @@ A       and             r7,  sp,  #15
 
         add             sp,  sp,  r7
         vpop            {q4-q7}
-        pop             {r4-r7,pc}
+        pop             {r4-r8,pc}
 endfunc
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index 25f9dd1..76ce61f 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -272,8 +272,10 @@ static void check_itxfm(void)
             // skip testing sub-IDCTs for WHT or ADST since they don't
             // implement it in any of the SIMD functions. If they do,
             // consider changing this to ensure we have complete test
-            // coverage
-            for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) {
+            // coverage. Test sub=1 for dc-only, then 4, 8, etc, since
+            // the arm version can distinguish them at that level.
+            for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
+                 sub == 1 ? (sub = 4) : (sub += 4)) {
                 if (check_func(dsp.itxfm_add[tx][txtp],
                                "vp9_inv_%s_%dx%d_sub%d_add",
                                tx == 4 ? "wht_wht" : txtp_types[txtp],
-- 
2.7.4

_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 04/11] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

Reply via email to