On Wed, 2 Nov 2016, Janne Grunau wrote:
On 2016-11-02 13:47:37 +0200, Martin Storsjö wrote:
diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
new file mode 100644
index 0000000..0651ec7
--- /dev/null
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -0,0 +1,764 @@
+
+@ All public functions in this file have the following signature:
+@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+@ const uint8_t *ref, ptrdiff_t ref_stride,
+@ int h, int mx, int my);
+
+@ Copy a 64-pixel-wide block of h rows from ref to dst.
+@ Per the vp9_mc_func signature above: r0 = dst, r1 = dst_stride,
+@ r2 = ref, r3 = ref_stride, [sp] = h. mx/my are not read here.
+function ff_vp9_copy64_neon, export=1
+ ldr r12, [sp] @ r12 = h, the row count
+ sub r1, r1, #32 @ first store below post-increments dst by 32
+ sub r3, r3, #32 @ first load below post-increments src by 32
+1:
+ vld1.8 {q0, q1}, [r2]! @ load first 32 bytes of the row
+ vst1.8 {q0, q1}, [r0, :128]!
+ vld1.8 {q2, q3}, [r2], r3 @ load last 32 bytes, advance src to next row
+ subs r12, r12, #1 @ one row done
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg64_neon, export=1
+ ldr r12, [sp]
+ push {r4}
if you use lr instead of r4 you can return with pop {pc}
Amended, although my benchmarks are quite inconclusive as to whether this
actually makes it a couple of cycles slower or not. Kept anyway since it's
more elegant and fewer instructions.
+function ff_vp9_copy16_neon, export=1
+ ldr r12, [sp]
+ push {r4-r5}
same here: push {r4, lr}; pop {r4, pc}
Amended
+@ Copy a 4-pixel-wide block of h rows from ref to dst.
+@ Per the vp9_mc_func signature above: r0 = dst, r1 = dst_stride,
+@ r2 = ref, r3 = ref_stride, [sp] = h. mx/my are not read here.
+@ The loop handles 4 rows per iteration (so h must be a multiple of 4),
+@ with loads and stores interleaved for scheduling.
+function ff_vp9_copy4_neon, export=1
+ ldr r12, [sp] @ r12 = h, the row count
+1:
+ vld1.32 {d0[]}, [r2], r3 @ load one 4-byte row, advance src
+ vld1.32 {d1[]}, [r2], r3
+ vst1.32 {d0[0]}, [r0, :32], r1 @ store one row, advance dst
+ vld1.32 {d2[]}, [r2], r3
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vld1.32 {d3[]}, [r2], r3
+ subs r12, r12, #4 @ 4 rows done this iteration
+ vst1.32 {d2[0]}, [r0, :32], r1
+ vst1.32 {d3[0]}, [r0, :32], r1
+ bne 1b
+ bx lr
+endfunc
have you tried using ARM registers instead? I would expect them to be
faster for size 4
It turns out it isn't - before:
vp9_put4_8bpp_c: 25.5 22.2 19.2 20.5
vp9_put4_8bpp_neon: 23.7 20.9 22.5 17.7
after (with 4 times unrolling, just like now):
vp9_put4_8bpp_neon: 28.2 23.2 19.2 26.7
So the A9 got faster, as fast as the C code, while all the others ended up
slower than before, and slower than the C version. Another way of tweaking
it actually makes it faster on A9, but even slower on all the others. Thus
I think the current one is a decent compromise.
+@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4, 8 or 16 pixels in parallel; for larger
+@ widths it will do 16 pixels at a time and loop horizontally.
+@ The actual width is passed in r5, the height in r4 and
+@ the filter coefficients in r12. idx2 is the index of the largest
+@ filter coefficient (3 or 4) and idx1 is the other one of them.
+.macro do_8tap_h type, size, idx1, idx2
+function \type\()_8tap_\size\()h_\idx1\idx2
+ sub r2, r2, #3
+ add r6, r0, r1
+ add r7, r2, r3
+ add r1, r1, r1
+ add r3, r3, r3
+ @ Only size >= 16 loops horizontally and needs
+ @ reduced dst stride
+.if \size >= 16
+ sub r1, r1, r5
+.endif
+ @ size >= 16 loads two qwords and increments r2,
+ @ for size 4/8 it's enough with one qword and no
+ @ postincrement
+.if \size >= 16
+ sub r3, r3, r5
+ sub r3, r3, #8
+.endif
+ @ Load the filter vector
+ vld1.16 {q0}, [r12,:128]
+1:
+.if \size >= 16
+ mov r12, r5
+.endif
+ @ Load src
+.if \size >= 16
+ vld1.8 {q8}, [r2]!
+ vld1.8 {q11}, [r7]!
+ vld1.8 {d20}, [r2]!
+ vld1.8 {d26}, [r7]!
+.else
+ vld1.8 {q8}, [r2]
+ vld1.8 {q11}, [r7]
+.endif
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+ vmovl.u8 q12, d23
+ vmovl.u8 q11, d22
+.if \size >= 16
+ vmovl.u8 q10, d20
+ vmovl.u8 q13, d26
+.endif
.if \size >= 16
vld1.8 {d18, d19, d20}, [r2]!
vld1.8 {d24, d25, d26}, [r7]!
.else
vld1.8 {q9}, [r2]
vld1.8 {q12}, [r7]
.endif
vmovl.u8 q8, d18
vmovl.u8 q9, d19
vmovl.u8 q11, d24
vmovl.u8 q12, d25
should be marginally faster
Oh, nice - yes, that's a bit faster
+@ Instantiate a vertical filter function for filtering a 4 pixels wide
+@ slice. The first half of the registers contain one row, while the second
+@ half of a register contains the second-next row (also stored in the first
+@ half of the register two steps ahead). The convolution does two outputs
+@ at a time; the output of q5-q12 into one, and q4-q13 into another one.
+@ The first half of first output is the first output row, the first half
+@ of the other output is the second output row. The second halves of the
+@ registers are rows 3 and 4.
+@ This is designed to work only for 4 or 8 output lines.
+.macro do_8tap_4v type, idx1, idx2
+function \type\()_8tap_4v_\idx1\idx2
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d6[]}, [r2], r3
+ vld1.32 {d7[]}, [r2], r3
+ vext.8 d2, d2, d4, #4
+ vld1.32 {d8[]}, [r2], r3
+ vext.8 d3, d3, d5, #4
+ vld1.32 {d9[]}, [r2], r3
+ vmovl.u8 q5, d2
+ vext.8 d4, d4, d6, #4
+ vld1.32 {d28[]}, [r2], r3
+ vmovl.u8 q6, d3
+ vext.8 d5, d5, d7, #4
+ vmovl.u8 q7, d4
+ vext.8 d6, d6, d8, #4
+ vld1.32 {d9[1]}, [r2], r3
it probably makes sense to continue the vld1.32 {d[]}, vext.8 pattern.
d30 and d31 should be free. It shouldn't be much slower for the height
== 4 case and should help for height == 8.
Ah, yes. Around 1 cycle slower for height == 4, and around 9 cycles faster
for height == 8.
Thanks!
// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel