Hi, I have just switched to gcc 5.2 from 4.9.2 and the code quality does seem to have improved significantly. For example, it now seems much better at using ldp/stp and it seems to has stopped gratuitous use of the SIMD registers.
However, I still have a few whinges:-) See attached copy.c / copy.s (This is a performance critical function from OpenJDK) pd_disjoint_words: cmp x2, 8 <<< (1) sub sp, sp, #64 <<< (2) bhi .L2 cmp w2, 8 <<< (1) bls .L15 .L2: add sp, sp, 64 <<< (2) (1) If count as a 64 bit unsigned is <= 8 then it is probably still <= 8 as a 32 bit unsigned. (2) Nowhere in the function does it store anything on the stack, so why drop and restore the stack every time. Also, minor quibble in the disass, why does sub use #64 whereas add uses just '64' (appreciate this is probably binutils, not gcc). .L15: adrp x3, .L4 add x3, x3, :lo12:.L4 ldrb w2, [x3,w2,uxtw] <<< (3) adr x3, .Lrtx4 add x2, x3, w2, sxtb #2 br x2 (3) Why use a byte table, this is not some sort of embedded system. Use a word table and this becomes. .L15: adrp x3, .L4 add x3, x3, :lo12:.L4 ldr x2, [x3, x2, lsl #3] br x2 An aligned word load takes exactly the same time as a byte load and we save the faffing about calculating the address. .L10: ldp x6, x7, [x0] ldp x4, x5, [x0, 16] ldp x2, x3, [x0, 32] <<< (4) stp x2, x3, [x1, 32] <<< (4) stp x6, x7, [x1] stp x4, x5, [x1, 16] (4) Seems to be something wrong with the load scheduler here? Why not move the stp x2, x3 to the end. It does this repeatedly. Unfortunately as this function is performance critical it means I will probably end up doing it in inline assembler which is time consuming, error prone and non portable. * Whinge mode off Ed
#include <stddef.h> typedef long HeapWord; extern void _Copy_disjoint_words(HeapWord* from, HeapWord* to, size_t count); void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) { switch (count) { case 0: return; case 1: to[0] = from[0]; return; case 2: { struct unit { HeapWord a, b; } *p, *q, t; p = (struct unit *)from; q = (struct unit *)to; t = *p; *q = t; return; } case 3: { struct unit { HeapWord a, b, c; } *p, *q, t; p = (struct unit *)from; q = (struct unit *)to; t = *p; *q = t; return; } case 4: { struct unit { HeapWord a, b, c, d; } *p, *q, t; p = (struct unit *)from; q = (struct unit *)to; t = *p; *q = t; return; } case 5: { struct unit { HeapWord a, b, c, d, e; } *p, *q, t; p = (struct unit *)from; q = (struct unit *)to; t = *p; *q = t; return; } case 6: { struct unit { HeapWord a, b, c, d, e, f; } *p, *q, t; p = (struct unit *)from; q = (struct unit *)to; t = *p; *q = t; return; } case 7: { struct unit { HeapWord a, b, c, d, e, f, g; } *p, *q, t; p = (struct unit *)from; q = (struct unit *)to; t = *p; *q = t; return; } case 8: { struct unit { HeapWord a, b, c, d, e, f, g, h; } *p, *q, t; p = (struct unit *)from; q = (struct unit *)to; t = *p; *q = t; return; } default: _Copy_disjoint_words(from, to, count); } }
.cpu generic+fp+simd .file "copy.c" .text .align 2 .p2align 3,,7 .global pd_disjoint_words .type pd_disjoint_words, %function pd_disjoint_words: cmp x2, 8 sub sp, sp, #64 bhi .L2 cmp w2, 8 bls .L15 .L2: add sp, sp, 64 b _Copy_disjoint_words .p2align 3 .L15: adrp x3, .L4 add x3, x3, :lo12:.L4 ldrb w2, [x3,w2,uxtw] adr x3, .Lrtx4 add x2, x3, w2, sxtb #2 br x2 .Lrtx4: .section .rodata .align 0 .align 2 .L4: .byte (.L1 - .Lrtx4) / 4 .byte (.L5 - .Lrtx4) / 4 .byte (.L6 - .Lrtx4) / 4 .byte (.L7 - .Lrtx4) / 4 .byte (.L8 - .Lrtx4) / 4 .byte (.L9 - .Lrtx4) / 4 .byte (.L10 - .Lrtx4) / 4 .byte (.L11 - .Lrtx4) / 4 .byte (.L12 - .Lrtx4) / 4 .text .p2align 3 .L10: ldp x6, x7, [x0] ldp x4, x5, [x0, 16] ldp x2, x3, [x0, 32] stp x2, x3, [x1, 32] stp x6, x7, [x1] stp x4, x5, [x1, 16] .L1: add sp, sp, 64 ret .p2align 3 .L11: ldp x6, x7, [x0] ldp x4, x5, [x0, 16] ldp x2, x3, [x0, 32] ldr x0, [x0, 48] str x0, [x1, 48] stp x6, x7, [x1] stp x4, x5, [x1, 16] stp x2, x3, [x1, 32] add sp, sp, 64 ret .p2align 3 .L9: ldp x4, x5, [x0] ldp x2, x3, [x0, 16] ldr x0, [x0, 32] str x0, [x1, 32] stp x4, x5, [x1] stp x2, x3, [x1, 16] add sp, sp, 64 ret .p2align 3 .L8: ldp x4, x5, [x0] ldp x2, x3, [x0, 16] stp x2, x3, [x1, 16] stp x4, x5, [x1] add sp, sp, 64 ret .p2align 3 .L7: ldp x2, x3, [x0] ldr x0, [x0, 16] str x0, [x1, 16] stp x2, x3, [x1] add sp, sp, 64 ret .p2align 3 .L6: ldr q0, [x0] str q0, [x1] add sp, sp, 64 ret .p2align 3 .L5: ldr x0, [x0] str x0, [x1] add sp, sp, 64 ret .p2align 3 .L12: ldp x8, x9, [x0] ldp x6, x7, [x0, 16] ldp x4, x5, [x0, 32] ldp x2, x3, [x0, 48] stp x2, x3, [x1, 48] stp x8, x9, [x1] stp x6, x7, [x1, 16] stp x4, x5, [x1, 32] add sp, sp, 64 ret .size pd_disjoint_words, .-pd_disjoint_words .ident "GCC: (GNU) 5.2.0" .section .note.GNU-stack,"",%progbits
_______________________________________________ linaro-toolchain mailing list linaro-toolchain@lists.linaro.org https://lists.linaro.org/mailman/listinfo/linaro-toolchain