On Wed, 2 Nov 2016, Janne Grunau wrote:

On 2016-11-02 13:47:37 +0200, Martin Storsjö wrote:

diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
new file mode 100644
index 0000000..0651ec7
--- /dev/null
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -0,0 +1,764 @@
+
+@ All public functions in this file have the following signature:
+@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+@                            const uint8_t *ref, ptrdiff_t ref_stride,
+@                            int h, int mx, int my);
+
+function ff_vp9_copy64_neon, export=1
+        ldr             r12, [sp]
+        sub             r1,  r1,  #32
+        sub             r3,  r3,  #32
+1:
+        vld1.8          {q0,  q1},  [r2]!
+        vst1.8          {q0,  q1},  [r0, :128]!
+        vld1.8          {q2,  q3},  [r2], r3
+        subs            r12, r12, #1
+        vst1.8          {q2,  q3},  [r0, :128], r1
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_vp9_avg64_neon, export=1
+        ldr             r12, [sp]
+        push            {r4}

if you use lr instead of r4 you can return with pop {pc}

Amended, although my benchmarks are quite inconclusive as to whether this actually makes it a couple of cycles slower or not. Kept anyway since it's more elegant and uses fewer instructions.

+function ff_vp9_copy16_neon, export=1
+        ldr             r12, [sp]
+        push            {r4-r5}

same here, push {r4, lr}; pop {r4,pc}

Amended

+function ff_vp9_copy4_neon, export=1
+        ldr             r12, [sp]
+1:
+        vld1.32         {d0[]},   [r2], r3
+        vld1.32         {d1[]},   [r2], r3
+        vst1.32         {d0[0]},  [r0, :32], r1
+        vld1.32         {d2[]},   [r2], r3
+        vst1.32         {d1[0]},  [r0, :32], r1
+        vld1.32         {d3[]},   [r2], r3
+        subs            r12, r12, #4
+        vst1.32         {d2[0]},  [r0, :32], r1
+        vst1.32         {d3[0]},  [r0, :32], r1
+        bne             1b
+        bx              lr
+endfunc

have you tried using ARM registers instead? I would expect them to be
faster for size 4

It turns out it isn't - before:
vp9_put4_8bpp_c:     25.5  22.2  19.2  20.5
vp9_put4_8bpp_neon:  23.7  20.9  22.5  17.7
after (with 4 times unrolling, just like now):
vp9_put4_8bpp_neon:  28.2  23.2  19.2  26.7

So the A9 got faster, as fast as the C code, while all the others ended up slower than before, and slower than the C version. Another way of tweaking it actually makes it faster on A9, but even slower on all the others. Thus I think the current one is a decent compromise.

+@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4, 8 or 16 pixels in parallel; for larger
+@ widths it will do 16 pixels at a time and loop horizontally.
+@ The actual width is passed in r5, the height in r4 and
+@ the filter coefficients in r12. idx2 is the index of the largest
+@ filter coefficient (3 or 4) and idx1 is the other one of them.
+.macro do_8tap_h type, size, idx1, idx2
+function \type\()_8tap_\size\()h_\idx1\idx2
+        sub             r2,  r2,  #3
+        add             r6,  r0,  r1
+        add             r7,  r2,  r3
+        add             r1,  r1,  r1
+        add             r3,  r3,  r3
+        @ Only size >= 16 loops horizontally and needs
+        @ reduced dst stride
+.if \size >= 16
+        sub             r1,  r1,  r5
+.endif
+        @ size >= 16 loads two qwords and increments r2,
+        @ for size 4/8 it's enough with one qword and no
+        @ postincrement
+.if \size >= 16
+        sub             r3,  r3,  r5
+        sub             r3,  r3,  #8
+.endif
+        @ Load the filter vector
+        vld1.16         {q0},  [r12,:128]
+1:
+.if \size >= 16
+        mov             r12, r5
+.endif
+        @ Load src
+.if \size >= 16
+        vld1.8          {q8},  [r2]!
+        vld1.8          {q11}, [r7]!
+        vld1.8          {d20}, [r2]!
+        vld1.8          {d26}, [r7]!
+.else
+        vld1.8          {q8},  [r2]
+        vld1.8          {q11}, [r7]
+.endif
+        vmovl.u8        q9,  d17
+        vmovl.u8        q8,  d16
+        vmovl.u8        q12, d23
+        vmovl.u8        q11, d22
+.if \size >= 16
+        vmovl.u8        q10, d20
+        vmovl.u8        q13, d26
+.endif

.if \size >= 16
 vld1.8   {d18, d19, d20}, [r2]!
 vld1.8   {d24, d25, d26}, [r7]!
.else
 vld1.8   {q9},  [r2]
 vld1.8   {q12}, [r7]
.endif
 vmovl.u8 q8,  d18
 vmovl.u8 q9,  d19
 vmovl.u8 q11, d24
 vmovl.u8 q12, d25

should be marginally faster

Oh, nice - yes, that's a bit faster

+@ Instantiate a vertical filter function for filtering a 4 pixels wide
+@ slice. The first half of the registers contain one row, while the second
+@ half of a register contains the second-next row (also stored in the first
+@ half of the register two steps ahead). The convolution does two outputs
+@ at a time; the output of q5-q12 into one, and q4-q13 into another one.
+@ The first half of first output is the first output row, the first half
+@ of the other output is the second output row. The second halves of the
+@ registers are rows 3 and 4.
+@ This only is designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type, idx1, idx2
+function \type\()_8tap_4v_\idx1\idx2
+        sub             r2,  r2,  r3, lsl #1
+        sub             r2,  r2,  r3
+        vld1.16         {q0},  [r12, :128]
+
+        vld1.32         {d2[]},   [r2], r3
+        vld1.32         {d3[]},   [r2], r3
+        vld1.32         {d4[]},   [r2], r3
+        vld1.32         {d5[]},   [r2], r3
+        vld1.32         {d6[]},   [r2], r3
+        vld1.32         {d7[]},   [r2], r3
+        vext.8          d2,  d2,  d4,  #4
+        vld1.32         {d8[]},   [r2], r3
+        vext.8          d3,  d3,  d5,  #4
+        vld1.32         {d9[]},   [r2], r3
+        vmovl.u8        q5,  d2
+        vext.8          d4,  d4,  d6,  #4
+        vld1.32         {d28[]},  [r2], r3
+        vmovl.u8        q6,  d3
+        vext.8          d5,  d5,  d7,  #4
+        vmovl.u8        q7,  d4
+        vext.8          d6,  d6,  d8,  #4
+        vld1.32         {d9[1]},  [r2], r3

it probably makes sense to continue the vld1.32 {d[]}, vext.8 pattern.
d30 and d31 should be free. It shouldn't be much slower for the height
== 4 case and should help for height == 8.

Ah, yes. Around 1 cycle slower for height == 4, and around 9 cycles faster for height == 8.

Thanks!

// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to