This work is sponsored by, and copyright, Google.
Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:
vp9_inv_dct_dct_16x16_sub16_add_neon: 3189.0 2486.8 2509.9 1964.1
vp9_inv_dct_dct_32x32_sub32_add_neon: 18448.1 16682.0 14235.4 11993.4
By skipping individual 4x16 or 4x32 pixel slices in the first pass when the
eob value shows that they contain only zero coefficients, we reduce the
runtime of these functions as follows:
vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5 188.7 211.6 235.1
vp9_inv_dct_dct_16x16_sub4_add_neon: 2079.7 1606.3 1772.1 1264.8
vp9_inv_dct_dct_16x16_sub8_add_neon: 2449.2 1834.3 2046.5 1499.7
vp9_inv_dct_dct_16x16_sub12_add_neon: 2826.2 2109.2 2295.9 1758.2
vp9_inv_dct_dct_16x16_sub16_add_neon: 3224.1 2476.5 2533.1 1985.7
vp9_inv_dct_dct_32x32_sub1_add_neon: 752.5 457.5 863.7 554.7
vp9_inv_dct_dct_32x32_sub4_add_neon: 10689.2 8013.4 8592.9 6785.9
vp9_inv_dct_dct_32x32_sub8_add_neon: 12217.8 9068.1 9420.4 7518.3
vp9_inv_dct_dct_32x32_sub12_add_neon: 12967.3 10455.5 10223.9 8275.7
vp9_inv_dct_dct_32x32_sub16_add_neon: 14084.1 11933.7 10998.9 9012.5
vp9_inv_dct_dct_32x32_sub20_add_neon: 15171.4 13335.0 11820.6 9757.2
vp9_inv_dct_dct_32x32_sub24_add_neon: 16229.6 15185.7 12614.4 10504.9
vp9_inv_dct_dct_32x32_sub28_add_neon: 17338.1 15955.3 13445.0 11248.4
vp9_inv_dct_dct_32x32_sub32_add_neon: 18465.7 16974.6 14239.2 11999.1
I.e. in general there is a very minor overhead for the full subpartition
case due to the additional compare instructions, but a significant speedup
for the cases where only a small part of the input data needs processing.
In a few inspected clips of common VP9 content, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs have nonzero coefficients only in the upper left
8x8 or 16x16 subpartition, respectively.
---
This goes on top of the checkasm vp9dsp patch that adds benchmarking
of generic subpartitions in the itxfm.
---
libavcodec/arm/vp9itxfm_neon.S | 70 ++++++++++++++++++++++++++++++++++++------
tests/checkasm/vp9dsp.c | 6 ++--
2 files changed, 64 insertions(+), 12 deletions(-)
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 01944bd..769579a 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -659,10 +659,17 @@ endfunc
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@ transpose into a horizontal 16x4 slice and store.
@ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = slice offset
@ r2 = src
-@ r3 = slice offset
+@ r3 = eob
+@ r9 = min eob
function \txfm\()16_1d_4x16_pass1_neon
+.ifc \txfm,idct
+ @ Check if this whole input slice is zero
+ cmp r3, r9
+ ble 2f
+.endif
+
mov r12, #32
vmov.s16 q2, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon
transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17,
d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
@ Store the transposed 4x4 blocks horizontally.
- cmp r3, #12
+ cmp r1, #12
beq 1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
vst1.16 {d\i}, [r0,:64]!
.endr
bx lr
1:
- @ Special case: For the last input column (r3 == 12),
+ @ Special case: For the last input column (r1 == 12),
@ which would be stored as the last row in the temp buffer,
@ don't store the first 4x4 block, but keep it in registers
@ for the first slice of the second pass (where it is the
@@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon
vmov d30, d18
vmov d31, d19
bx lr
+
+.ifc \txfm,idct
+2:
+ @ Set d28-d31 to zero, for the in-register passthrough of coefficients to pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ @ Write zeros to the temp buffer for pass 2
+.rept 4
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bx lr
+.endif
endfunc
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -781,15 +800,23 @@ endfunc
itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
+@ Minimum eob for each 4-column input slice (one entry per slice); a slice is skipped as all-zero when eob <= its entry
+const min_eob_idct_idct_16, align=4
+ .short 0, 10, 38, 89
+endconst
+
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
cmp r3, #1
beq idct16x16_dc_add_neon
.endif
- push {r4-r7,lr}
+ push {r4-r9,lr}
.ifnc \txfm1\()_\txfm2,idct_idct
vpush {q4-q7}
+ mov r9, #0
+.else
+ movrel r8, min_eob_idct_idct_16
.endif
@ Align the stack, allocate a temp buffer
@@ -810,8 +837,11 @@ A and r7, sp, #15
.irp i, 0, 4, 8, 12
add r0, sp, #(\i*32)
+ mov r1, #\i
add r2, r6, #(\i*2)
- mov r3, #\i
+.ifc \txfm1\()_\txfm2,idct_idct
+ ldrh r9, [r8, #(\i/2)]
+.endif
bl \txfm1\()16_1d_4x16_pass1_neon
.endr
.ifc \txfm2,idct
@@ -830,7 +860,7 @@ A and r7, sp, #15
.ifnc \txfm1\()_\txfm2,idct_idct
vpop {q4-q7}
.endif
- pop {r4-r7,pc}
+ pop {r4-r9,pc}
endfunc
.endm
@@ -944,9 +974,14 @@ endfunc
@ each output written twice), followed by a separate 16-point IDCT
@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
@ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = min eob
@ r2 = src
+@ r3 = eob
function idct32_1d_4x32_pass1_neon
+ @ Check if this whole input slice is zero
+ cmp r3, r1
+ ble 1f
+
movrel r12, idct_coeffs
vld1.16 {q0-q1}, [r12,:128]
@@ -1023,6 +1058,15 @@ function idct32_1d_4x32_pass1_neon
store_rev 28, 24, 20, 16
.purgem store_rev
bx lr
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+.rept 8
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bx lr
endfunc
.ltorg
@@ -1110,11 +1154,16 @@ function idct32_1d_4x32_pass2_neon
bx lr
endfunc
+const min_eob_idct_idct_32, align=4
+ .short 0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp r3, #1
beq idct32x32_dc_add_neon
- push {r4-r7,lr}
+ push {r4-r8,lr}
vpush {q4-q7}
+ movrel r8, min_eob_idct_idct_32
@ Align the stack, allocate a temp buffer
T mov r7, sp
@@ -1129,6 +1178,7 @@ A and r7, sp, #15
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
add r0, sp, #(\i*64)
+ ldrh r1, [r8, #(\i/2)]
add r2, r6, #(\i*2)
bl idct32_1d_4x32_pass1_neon
.endr
@@ -1141,5 +1191,5 @@ A and r7, sp, #15
add sp, sp, r7
vpop {q4-q7}
- pop {r4-r7,pc}
+ pop {r4-r8,pc}
endfunc
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index 25f9dd1..76ce61f 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -272,8 +272,10 @@ static void check_itxfm(void)
// skip testing sub-IDCTs for WHT or ADST since they don't
// implement it in any of the SIMD functions. If they do,
// consider changing this to ensure we have complete test
- // coverage
- for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) {
+ // coverage. Test sub=1 for dc-only, then 4, 8, etc, since
+ // the arm version can distinguish them at that level.
+ for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
+ sub == 1 ? (sub = 4) : (sub += 4)) {
if (check_func(dsp.itxfm_add[tx][txtp],
"vp9_inv_%s_%dx%d_sub%d_add",
tx == 4 ? "wht_wht" : txtp_types[txtp],
--
2.7.4
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel