https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104600

--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Here is another example, even with a 32-bit int and a 64-bit long:
/* GCC vector extension: a vector type of two 32-bit ints, 8 bytes total. */
#define vector __attribute__((vector_size(8)))

/* Pack two ints into a 2x32-bit vector, then reinterpret the vector's
   bits as a 64-bit integer (assumes 64-bit long, per the comment above).
   Exercises the vector-to-scalar bitcast path in codegen. */
long f(int a, int b)
{
  vector int t = {a, b};
  return (long)t;
}

/* Same pack-and-bitcast as f, but the result is stored through a
   pointer instead of returned — checks the store form of the codegen. */
void f1(long *t1, int a, int b)
{
  vector int t = {a, b};
  *t1 =  (long)t;
}

/* Same as f1, but with scalar arithmetic (+1) applied after the bitcast.
   This forces the value back into a GPR, exposing the cross-register-file
   move cost discussed below. */
void f2(long *t1, int a, int b)
{
  vector int t = {a, b};
  *t1 =  ((long)t) + 1;
}

/* Pure-integer equivalent of f: a goes into the high 32 bits, b into
   the low 32 bits (matching {a, b} on a little-endian 2x32 vector).
   Used as the codegen baseline to compare against the vector version. */
long f_1(unsigned a, unsigned b)
{
  long t = (((unsigned long)a) << 32) | (unsigned long)b;
  return (long)t;
}

/* Pure-integer equivalent of f1: build the 64-bit value in GPRs and
   store it through the pointer. */
void f1_1(long *t1, unsigned a, unsigned b)
{
  long t = (((unsigned long)a) << 32) | (unsigned long)b;
  *t1 =  (long)t;
}

/* Pure-integer equivalent of f2: build the value in GPRs, add 1, and
   store.  Compare its assembly with f2's to see the cost of the
   SSE-to-GPR round trip. */
void f2_1(long *t1, unsigned a, unsigned b)
{
  long t = (((unsigned long)a) << 32) | (unsigned long)b;
  *t1 =  ((long)t) + 1;
}

----- CUT ----
For f2 and f2_1 we have:

        movd    %esi, %xmm0
        movd    %edx, %xmm1
        punpckldq       %xmm1, %xmm0
        movq    %xmm0, %rsi
        addq    $1, %rsi
        movq    %rsi, (%rdi)

vs
        salq    $32, %rsi
        movl    %edx, %edx
        orq     %rdx, %rsi
        addq    $1, %rsi
        movq    %rsi, (%rdi)

It all depends on how fast moves between the GPR and SSE register sets are
versus doing it all in the integer registers.

I think this should only be done for the 2x-size case and nothing more.

Reply via email to