http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50339



--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> 2013-02-21 12:57:40 UTC ---

Created attachment 29517
  --> http://gcc.gnu.org/bugzilla/attachment.cgi?id=29517
gcc48-pr50339.patch



Patch that improves the 4.8-generated code to be one insn better than what 4.7
produced, by lowering ASHIFTRT in lower-subreg similarly to how it already
lowers ASHIFT and LSHIFTRT.
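To illustrate the transformation (this is not the patch itself): for a shift
count of at least BITS_PER_WORD, the lowered double-word arithmetic right shift
needs only word-sized operations, roughly as in the following sketch for 64-bit
words (the type and function names are made up for illustration):

#include <stdint.h>

/* Hypothetical representation of a TImode value as two DImode words.  */
typedef struct { uint64_t lo; int64_t hi; } ti_words;

/* a >> count for 64 <= count < 128, using only 64-bit shifts:
   the low result word is the high word shifted by count - 64,
   the high result word is just the replicated sign bit.  */
static ti_words
ashiftrt_lowered (ti_words a, unsigned count)
{
  ti_words r;
  r.lo = (uint64_t) (a.hi >> (count - 64));
  r.hi = a.hi >> 63;
  return r;
}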



On this testcase the difference between unpatched trunk and patched trunk is:

-        movq    %rdi, %r9
-        movq    %rsi, %rdi
-        movq    %rsi, %r10
-        sarq    $63, %rdi
-        movq    %rdi, %rcx
-        xorq    %r9, %rcx
-        movq    %rcx, %rax
-        movq    %r10, %rcx
-        xorq    %rdi, %rcx
-        subq    %rdi, %rax
-        movq    %rcx, %rdx
-        sbbq    %rdi, %rdx
+        movq    %rsi, %rax
+        sarq    $63, %rax
+        movq    %rax, %r9
+        xorq    %rax, %rdi
+        xorq    %r9, %rsi
+        movq    %rdi, %rax
+        movq    %rsi, %rdx
+        subq    %r9, %rax
+        sbbq    %r9, %rdx



i.e. 4 moves instead of the former 7 (no idea why the RA chooses to do this
shift on %rax (i.e. first move %rsi to %rax, then shift %rax, then move %rax
to %r9) instead of copying %rsi to %r9 and shifting %r9, which would mean one
move less).
Even smaller code would probably need a different expansion or much smarter
register allocation.
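For reference, the patched sequence computes the sign mask of the high word,
XORs both words with it and then subtracts it with borrow, i.e. a conditional
negate; a testcase of that shape would look something like the following (an
assumption for illustration only -- the actual testcase attached to the PR may
differ):

/* Hypothetical testcase of the shape that would produce the sequence
   above (absolute value of an __int128); not necessarily the PR's
   actual testcase.  */
__int128_t
absti (__int128_t a)
{
  return a < 0 ? -a : a;
}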



Anyway, also tested:

__int128_t
f1 (__int128_t a)
{
  return a >> 67;
}

__int128_t
f2 (__int128_t a)
{
  return a >> 64;
}

__int128_t
f3 (__int128_t a)
{
  return a >> 127;
}

__uint128_t
f4 (__uint128_t a)
{
  return a >> 67;
}

__uint128_t
f5 (__uint128_t a)
{
  return a >> 64;
}

__uint128_t
f6 (__uint128_t a)
{
  return a >> 127;
}



on x86_64 and the difference at -O2 is:

-        movq    %rsi, %rax
         movq    %rsi, %rdx
+        movq    %rsi, %rax
         sarq    $63, %rdx
         sarq    $3, %rax
for f1,
-        movq    %rsi, %rdx
         movq    %rsi, %rax
-        sarq    $63, %rdx
+        cqto
for f2 and
+        sarq    $63, %rsi
         movq    %rsi, %rdx
-        sarq    $63, %rdx
-        movq    %rdx, %rax
+        movq    %rsi, %rax
for f3, so either no pessimization or a small improvement.  On:

long long int
f1 (long long int a)
{
  return a >> 35;
}

long long int
f2 (long long int a)
{
  return a >> 32;
}

long long int
f3 (long long int a)
{
  return a >> 63;
}

unsigned long long int
f4 (unsigned long long int a)
{
  return a >> 35;
}

unsigned long long int
f5 (unsigned long long int a)
{
  return a >> 32;
}

unsigned long long int
f6 (unsigned long long int a)
{
  return a >> 63;
}



for -O2 -m32 the improvements are even bigger; for f1:

-        movl    8(%esp), %edx
-        movl    %edx, %eax
-        movl    %eax, %edx
-        sarl    $31, %edx
+        movl    8(%esp), %eax
+        cltd
         sarl    $3, %eax

and for f2:

-        movl    8(%esp), %edx
-        movl    %edx, %eax
-        movl    %eax, %edx
-        sarl    $31, %edx
+        movl    8(%esp), %eax
+        cltd

(no difference for f3).
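For the shifts by exactly the word size (f2/f5 in the -m32 case), the lowering
leaves the low result word equal to the original high word and fills the high
result word with sign bits, which is why a single cltd is enough; a minimal
sketch assuming 32-bit words (illustrative names only):

#include <stdint.h>

/* Hypothetical representation of a DImode value as two SImode words.  */
typedef struct { uint32_t lo; int32_t hi; } di_words;

/* a >> 32 using only 32-bit operations: the low result word is the
   original high word, the high result word is the replicated sign bit,
   which is what the single cltd above computes.  */
static di_words
ashiftrt_32 (di_words a)
{
  di_words r;
  r.lo = (uint32_t) a.hi;
  r.hi = a.hi >> 31;
  return r;
}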
