Since aarch64 uses a different NEON (A64) syntax than aarch32 and has no support for the (older) ARM SIMD instructions, there is currently no SIMD acceleration for pixman on aarch64.
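To illustrate the syntax gap, here is a minimal sketch (example registers only, not taken from the pixman sources) of the same two-pixel load in both instruction sets:

    @ aarch32 NEON: load two 32-bit pixels, then advance the pointer by a stride register
    vld1.32   {d0}, [r0], r1

    // aarch64 A64 NEON: v-registers carry an explicit element arrangement,
    // and the post-index register follows the address operand
    ld1       {v0.2s}, [x0], x1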
We therefore need new implementations. This patch contains only the FAST_PATH code, not the bilinear optimization code; once the optimizations in this patch are finished, the bilinear-related code should follow. Compared to my previous patch, it also contains an additional optimization that omits unnecessary register moves.

Added to: https://bugs.freedesktop.org/show_bug.cgi?id=94758

Signed-off-by: Mizuki Asakura <ed6e117f at gmail.com>
---
diff -ruNp a/pixman/pixman/pixman-arma64-neon-asm.S b/pixman/pixman/pixman-arma64-neon-asm.S
--- a/pixman/pixman/pixman-arma64-neon-asm.S	2016-04-14 22:09:47.120752451 +0900
+++ b/pixman/pixman/pixman-arma64-neon-asm.S	2016-04-14 22:06:45.092222137 +0900
@@ -3132,8 +3132,7 @@ generate_composite_function_nearest_scan
 .macro bilinear_load_8888 reg1, reg2, tmp
     asr       TMP1, X, #16
     add       X, X, UX
-    lsl       TMP2, TMP1, #2
-    add       TMP1, TOP, TMP2
+    add       TMP1, TOP, TMP1, lsl #2
     ld1       {&reg1&.2s}, [TMP1], STRIDE
     ld1       {&reg2&.2s}, [TMP1]
 .endm
@@ -3141,8 +3140,7 @@ generate_composite_function_nearest_scan
 .macro bilinear_load_0565 reg1, reg2, tmp
     asr       TMP1, X, #16
     add       X, X, UX
-    lsl       TMP2, TMP1, #1
-    add       TMP1, TOP, TMP2
+    add       TMP1, TOP, TMP1, lsl #1
     ld1       {&reg2&.s}[0], [TMP1], STRIDE
     ld1       {&reg2&.s}[1], [TMP1]
     convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
diff -ruNp 160407/pixman/pixman/pixman-arma64-neon-asm.h 160408/pixman/pixman/pixman-arma64-neon-asm.h
--- a/pixman/pixman/pixman-arma64-neon-asm.h	2016-04-14 22:09:47.080752305 +0900
+++ b/pixman/pixman/pixman-arma64-neon-asm.h	2016-04-14 22:06:45.044222036 +0900
@@ -231,16 +231,14 @@
 5:  subs      VX, VX, SRC_WIDTH_FIXED
     bpl       5b
 55:
-    lsl       DUMMY, TMP1, #1
-    add       TMP1, mem_operand, DUMMY
+    add       TMP1, mem_operand, TMP1, lsl #1
     asr       TMP2, VX, #16
     adds      VX, VX, UNIT_X
     bmi       55f
 5:  subs      VX, VX, SRC_WIDTH_FIXED
     bpl       5b
 55:
-    lsl       DUMMY, TMP2, #1
-    add       TMP2, mem_operand, DUMMY
+    add       TMP2, mem_operand, TMP2, lsl #1
     ld1       {v&reg1&.h}[0], [TMP1]
     asr       TMP1, VX, #16
     adds      VX, VX, UNIT_X
@@ -248,8 +246,7 @@
 5:  subs      VX, VX, SRC_WIDTH_FIXED
     bpl       5b
 55:
-    lsl       DUMMY, TMP1, #1
-    add       TMP1, mem_operand, DUMMY
+    add       TMP1, mem_operand, TMP1, lsl #1
     ld1       {v&reg1&.h}[1], [TMP2]
     asr       TMP2, VX, #16
     adds      VX, VX, UNIT_X
@@ -257,8 +254,7 @@
 5:  subs      VX, VX, SRC_WIDTH_FIXED
     bpl       5b
 55:
-    lsl       DUMMY, TMP2, #1
-    add       TMP2, mem_operand, DUMMY
+    add       TMP2, mem_operand, TMP2, lsl #1
     ld1       {v&reg1&.h}[2], [TMP1]
     ld1       {v&reg1&.h}[3], [TMP2]
 .elseif elem_size == 32
@@ -268,16 +264,14 @@
 5:  subs      VX, VX, SRC_WIDTH_FIXED
     bpl       5b
 55:
-    lsl       DUMMY, TMP1, #2
-    add       TMP1, mem_operand, DUMMY
+    add       TMP1, mem_operand, TMP1, lsl #2
     asr       TMP2, VX, #16
     adds      VX, VX, UNIT_X
     bmi       55f
 5:  subs      VX, VX, SRC_WIDTH_FIXED
     bpl       5b
 55:
-    lsl       DUMMY, TMP2, #2
-    add       TMP2, mem_operand, DUMMY
+    add       TMP2, mem_operand, TMP2, lsl #2
     ld1       {v&reg1&.s}[0], [TMP1]
     ld1       {v&reg1&.s}[1], [TMP2]
 .else
@@ -317,8 +311,7 @@
 5:  subs      VX, VX, SRC_WIDTH_FIXED
     bpl       5b
 55:
-    lsl       DUMMY, TMP1, #1
-    add       TMP1, mem_operand, DUMMY
+    add       TMP1, mem_operand, TMP1, lsl #1
     ld1       {v&reg1&.h}[idx], [TMP1]
 .elseif elem_size == 32
     asr       DUMMY, VX, #16
@@ -328,8 +321,7 @@
 5:  subs      VX, VX, SRC_WIDTH_FIXED
     bpl       5b
 55:
-    lsl       DUMMY, TMP1, #2
-    add       TMP1, mem_operand, DUMMY
+    add       TMP1, mem_operand, TMP1, lsl #2
     ld1       {v&reg1&.s}[idx], [TMP1]
 .endif
 .endm
@@ -638,27 +630,21 @@ local skip1
  */
 .macro advance_to_next_scanline start_of_loop_label
     mov       W, ORIG_W
-    lsl       DUMMY, DST_STRIDE, #dst_bpp_shift
-    add       DST_W, DST_W, DUMMY
+    add       DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
 .if src_bpp != 0
-    lsl       DUMMY, SRC_STRIDE, #src_bpp_shift
-    add       SRC, SRC, DUMMY
+    add       SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
 .endif
 .if mask_bpp != 0
-    lsl       DUMMY, MASK_STRIDE, #mask_bpp_shift
-    add       MASK, MASK, DUMMY
+    add       MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
 .endif
 .if (dst_w_bpp != 24)
-    lsl       DUMMY, W, #dst_bpp_shift
-    sub       DST_W, DST_W, DUMMY
+    sub       DST_W, DST_W, W, lsl #dst_bpp_shift
 .endif
 .if (src_bpp != 24) && (src_bpp != 0)
-    lsl       DUMMY, W, #src_bpp_shift
-    sub       SRC, SRC, DUMMY
+    sub       SRC, SRC, W, lsl #src_bpp_shift
 .endif
 .if (mask_bpp != 24) && (mask_bpp != 0)
-    lsl       DUMMY, W, #mask_bpp_shift
-    sub       MASK, MASK, DUMMY
+    sub       MASK, MASK, W, lsl #mask_bpp_shift
 .endif
     subs      H, H, #1
     mov       DST_R, DST_W
--
2.7.4
_______________________________________________
Pixman mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/pixman
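For reference, every hunk above applies the same transformation: the explicit lsl into a scratch register is folded into the address calculation using the A64 shifted-register operand form, which saves one instruction and frees the scratch register. A minimal sketch of the pattern, using the TMP1/TMP2/TOP register aliases from the 8888 load path above:

    /* before: shift the pixel index into a scratch register, then add */
    lsl       TMP2, TMP1, #2            /* byte offset = index * 4 (a8r8g8b8) */
    add       TMP1, TOP, TMP2           /* TMP1 = TOP + byte offset */

    /* after: a single add with a shifted second source operand */
    add       TMP1, TOP, TMP1, lsl #2   /* TMP1 = TOP + (index << 2), no scratch register needed */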
