On Wed, 3 May 2017, Alexandra Hájková wrote:
---
libavcodec/arm/hevc_idct.S | 319 ++++++++++++++++++++++++++++++++++----
libavcodec/arm/hevcdsp_init_arm.c | 4 +
2 files changed, 297 insertions(+), 26 deletions(-)
diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4814c86..3a512b4 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -28,6 +28,10 @@ const trans, align=4
.short 89, 75, 50, 18
.short 90, 87, 80, 70
.short 57, 43, 25, 9
+ .short 90, 90, 88, 85
+ .short 82, 78, 73, 67
+ .short 61, 54, 46, 38
+ .short 31, 22, 13, 4
endconst
.macro clip10 in1, in2, c1, c2
@@ -509,7 +513,7 @@ endfunc
vsub.s32 \tmp_m, \e, \o
.endm
-.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7
+.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7, offset
tr_4x4_8 \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13,
q14, q15
vmull.s16 q12, \in1, \in0[0]
@@ -535,7 +539,7 @@ endfunc
butterfly q9, q13, q1, q6
butterfly q10, q14, q2, q5
butterfly q11, q15, q3, q4
- add r4, sp, #512
+ add r4, sp, #\offset
vst1.s32 {q0-q1}, [r4, :128]!
vst1.s32 {q2-q3}, [r4, :128]!
vst1.s32 {q4-q5}, [r4, :128]!
@@ -575,15 +579,15 @@ endfunc
vsub.s32 \in6, \in6, \in7
.endm
-.macro store16 in0, in1, in2, in3, in4, in5, in6, in7
+.macro store16 in0, in1, in2, in3, in4, in5, in6, in7, rx
vst1.s16 \in0, [r1, :64], r2
- vst1.s16 \in1, [r3, :64], r4
+ vst1.s16 \in1, [r3, :64], \rx
vst1.s16 \in2, [r1, :64], r2
- vst1.s16 \in3, [r3, :64], r4
+ vst1.s16 \in3, [r3, :64], \rx
vst1.s16 \in4, [r1, :64], r2
- vst1.s16 \in5, [r3, :64], r4
+ vst1.s16 \in5, [r3, :64], \rx
vst1.s16 \in6, [r1, :64], r2
- vst1.s16 \in7, [r3, :64], r4
+ vst1.s16 \in7, [r3, :64], \rx
.endm
.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2,
in3, in4, in5, in6, in7, shift
@@ -597,19 +601,33 @@ endfunc
vqrshrn.s32 \out7, \in7, \shift
.endm
-.macro tr_16x4 name, shift
+.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
+ add r1, sp, #\off1
+ add r3, sp, #\off2
+ mov r2, #-16
+ vst1.s32 {\in0}, [r1, :128]!
+ vst1.s32 {\in1}, [r3, :128], r2
+ vst1.s32 {\in2}, [r1, :128]!
+ vst1.s32 {\in3}, [r3, :128], r2
+ vst1.s32 {\in4}, [r1, :128]!
+ vst1.s32 {\in5}, [r3, :128], r2
+ vst1.s32 {\in6}, [r1, :128]
+ vst1.s32 {\in7}, [r3, :128]
+.endm
Ok, now this is a little better... It's still somewhat confusing; I'm not sure whether I'd prefer the parameters to be named "in0, in1, in2, in3" followed by "in4, in5, in6, in7", or to keep them as they are (I guess their current numbering matches the order in which they are used?). If you do keep it like this, at least add a comment saying that it stores in0, in2, in4, in6 ascending from off1 and in1, in3, in5, in7 descending from off2.
+
+.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
mov r1, r5
- add r3, r5, #64
- mov r2, #128
+ add r3, r5, #(\step * 64)
+ mov r2, #(\step * 128)
load16 d0, d1, d2, d3, d4, d5, d6, d7
movrel r1, trans
- tr16_8x4 d0, d1, d2, d3, d4, d5, d6, d7
+ tr16_8x4 d0, d1, d2, d3, d4, d5, d6, d7, \offset
- add r1, r5, #32
- add r3, r5, #(64 + 32)
- mov r2, #128
+ add r1, r5, #(\step * 32)
+ add r3, r5, #(\step * 3 *32)
+ mov r2, #(\step * 128)
load16 d8, d9, d2, d3, d4, d5, d6, d7
movrel r1, trans + 16
vld1.s16 {q0}, [r1, :128]
@@ -630,11 +648,12 @@ function func_tr_16x4_\name
add_member d6, d1[2], d0[3], d0[0], d0[2], d1[1], d1[3], d1[0],
d0[1], +, -, +, -, +, +, -, +
add_member d7, d1[3], d1[2], d1[1], d1[0], d0[3], d0[2], d0[1],
d0[0], +, -, +, -, +, -, +, -
- add r4, sp, #512
- vld1.s16 {q0-q1}, [r4, :128]!
- vld1.s16 {q2-q3}, [r4, :128]!
+ add r4, sp, #\offset
+ vld1.s32 {q0-q1}, [r4, :128]!
+ vld1.s32 {q2-q3}, [r4, :128]!
Please move these changes from vld1.s16 to vld1.s32 into the previous patch as well.
butterfly16 q0, q5, q1, q6, q2, q7, q3, q8
+ .if \shift > 0
scale d26, d27, d28, d29, d30, d31, d16, d17, q4, q0, q5, q1,
q6, q2, q7, q3, \shift
transpose8_4x4 d26, d28, d30, d16
transpose8_4x4 d17, d31, d29, d27
@@ -642,12 +661,16 @@ function func_tr_16x4_\name
add r3, r6, #(24 +3*32)
mov r2, #32
mov r4, #-32
- store16 d26, d27, d28, d29, d30, d31, d16, d17
-
- add r4, sp, #576
- vld1.s16 {q0-q1}, [r4, :128]!
- vld1.s16 {q2-q3}, [r4, :128]
+ store16 d26, d27, d28, d29, d30, d31, d16, d17, r4
+ .else
+ store_to_stack \offset, (\offset + 240), q4, q5, q6, q7, q3, q2, q1,
q0
+ .endif
+
+ add r4, sp, #(\offset + 64)
+ vld1.s32 {q0-q1}, [r4, :128]!
+ vld1.s32 {q2-q3}, [r4, :128]
Here you also change vld1.s16 into vld1.s32 at the same time; please make that change in the previous patch as well.
// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel