On Wed, 3 May 2017, Alexandra Hájková wrote:

---
libavcodec/arm/hevc_idct.S        | 319 ++++++++++++++++++++++++++++++++++----
libavcodec/arm/hevcdsp_init_arm.c |   4 +
2 files changed, 297 insertions(+), 26 deletions(-)


diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4814c86..3a512b4 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -28,6 +28,10 @@ const trans, align=4
        .short 89, 75, 50, 18
        .short 90, 87, 80, 70
        .short 57, 43, 25, 9
+        .short 90, 90, 88, 85
+        .short 82, 78, 73, 67
+        .short 61, 54, 46, 38
+        .short 31, 22, 13, 4
endconst

.macro clip10 in1, in2, c1, c2
@@ -509,7 +513,7 @@ endfunc
        vsub.s32        \tmp_m, \e, \o
.endm

-.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7
+.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7, offset
        tr_4x4_8        \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, 
q14, q15

        vmull.s16       q12, \in1, \in0[0]
@@ -535,7 +539,7 @@ endfunc
        butterfly       q9,  q13, q1, q6
        butterfly       q10, q14, q2, q5
        butterfly       q11, q15, q3, q4
-        add             r4,  sp,  #512
+        add             r4,  sp,  #\offset
        vst1.s32        {q0-q1}, [r4, :128]!
        vst1.s32        {q2-q3}, [r4, :128]!
        vst1.s32        {q4-q5}, [r4, :128]!
@@ -575,15 +579,15 @@ endfunc
        vsub.s32        \in6, \in6, \in7
.endm

-.macro store16 in0, in1, in2, in3, in4, in5, in6, in7
+.macro store16 in0, in1, in2, in3, in4, in5, in6, in7, rx
        vst1.s16        \in0, [r1, :64], r2
-        vst1.s16        \in1, [r3, :64], r4
+        vst1.s16        \in1, [r3, :64], \rx
        vst1.s16        \in2, [r1, :64], r2
-        vst1.s16        \in3, [r3, :64], r4
+        vst1.s16        \in3, [r3, :64], \rx
        vst1.s16        \in4, [r1, :64], r2
-        vst1.s16        \in5, [r3, :64], r4
+        vst1.s16        \in5, [r3, :64], \rx
        vst1.s16        \in6, [r1, :64], r2
-        vst1.s16        \in7, [r3, :64], r4
+        vst1.s16        \in7, [r3, :64], \rx
.endm

.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, 
in3, in4, in5, in6, in7, shift
@@ -597,19 +601,33 @@ endfunc
        vqrshrn.s32     \out7, \in7, \shift
.endm

-.macro tr_16x4 name, shift
+.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
+        add             r1, sp, #\off1
+        add             r3, sp, #\off2
+        mov             r2, #-16
+        vst1.s32        {\in0}, [r1, :128]!
+        vst1.s32        {\in1}, [r3, :128], r2
+        vst1.s32        {\in2}, [r1, :128]!
+        vst1.s32        {\in3}, [r3, :128], r2
+        vst1.s32        {\in4}, [r1, :128]!
+        vst1.s32        {\in5}, [r3, :128], r2
+        vst1.s32        {\in6}, [r1, :128]
+        vst1.s32        {\in7}, [r3, :128]
+.endm

OK, now this is a little better, but it's still confusing. I'm not sure whether I'd prefer to have the parameters named "in0, in1, in2, in3" and then "in4, in5, in6, in7", or to keep it like this (I guess the numbering roughly matches how they are used?). If you keep it like this, at least add a comment saying that it stores in0, in2, in4, in6 at ascending addresses starting from off1, and in1, in3, in5, in7 at descending addresses starting from off2.

+
+.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
        mov             r1,  r5
-        add             r3,  r5, #64
-        mov             r2,  #128
+        add             r3, r5, #(\step * 64)
+        mov             r2,  #(\step * 128)
        load16          d0, d1, d2, d3, d4, d5, d6, d7
        movrel          r1, trans

-        tr16_8x4        d0, d1, d2, d3, d4, d5, d6, d7
+        tr16_8x4        d0, d1, d2, d3, d4, d5, d6, d7, \offset

-        add             r1,  r5, #32
-        add             r3,  r5, #(64 + 32)
-        mov             r2,  #128
+        add             r1,  r5, #(\step * 32)
+        add             r3,  r5, #(\step * 3 *32)
+        mov             r2,  #(\step * 128)
        load16          d8, d9, d2, d3, d4, d5, d6, d7
        movrel          r1, trans + 16
        vld1.s16        {q0}, [r1, :128]
@@ -630,11 +648,12 @@ function func_tr_16x4_\name
        add_member      d6, d1[2], d0[3], d0[0], d0[2], d1[1], d1[3], d1[0], 
d0[1], +, -, +, -, +, +, -, +
        add_member      d7, d1[3], d1[2], d1[1], d1[0], d0[3], d0[2], d0[1], 
d0[0], +, -, +, -, +, -, +, -

-        add             r4, sp, #512
-        vld1.s16        {q0-q1}, [r4, :128]!
-        vld1.s16        {q2-q3}, [r4, :128]!
+        add             r4, sp, #\offset
+        vld1.s32        {q0-q1}, [r4, :128]!
+        vld1.s32        {q2-q3}, [r4, :128]!

Move these changes from vld1.s16 to vld1.s32 into the previous patch as well.


        butterfly16     q0, q5, q1, q6, q2, q7, q3, q8
+    .if \shift > 0
        scale           d26, d27, d28, d29, d30, d31, d16, d17, q4, q0, q5, q1, 
q6, q2, q7, q3, \shift
        transpose8_4x4  d26, d28, d30, d16
        transpose8_4x4  d17, d31, d29, d27
@@ -642,12 +661,16 @@ function func_tr_16x4_\name
        add             r3, r6, #(24 +3*32)
        mov             r2, #32
        mov             r4, #-32
-        store16         d26, d27, d28, d29, d30, d31, d16, d17
-
-        add             r4, sp, #576
-        vld1.s16        {q0-q1}, [r4, :128]!
-        vld1.s16        {q2-q3}, [r4, :128]
+        store16         d26, d27, d28, d29, d30, d31, d16, d17, r4
+    .else
+        store_to_stack  \offset, (\offset + 240), q4, q5, q6, q7, q3, q2, q1, 
q0
+    .endif
+
+        add             r4, sp, #(\offset + 64)
+        vld1.s32        {q0-q1}, [r4, :128]!
+        vld1.s32        {q2-q3}, [r4, :128]

Here you also change vld1.s16 into vld1.s32 at the same time. Please make that change in the previous patch instead.

// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to