http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50918

             Bug #: 50918
           Summary: Unoptimal code for vec-shift by scalar for integer
                    (byte, short, long long) operands
    Classification: Unclassified
           Product: gcc
           Version: 4.7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: ubiz...@gmail.com


The following testcase:

--cut here--
#define N 8

short a[N] = { 1,2,3,4,10,20,30,90 };
short r[N];

void
test_var (int n)
{
  int i;

  for (i = 0; i < N; i++)
    r[i] = a[i] << n;
}

void
test_cst (void)
{
  int i;

  for (i = 0; i < N; i++)
    r[i] = a[i] << 3;
}
--cut here--

compiles (with -march=corei7 -O2 -ftree-vectorize) to:

test_var:
    movdqa    a(%rip), %xmm0
    movd    %edi, %xmm2
    pmovsxwd    %xmm0, %xmm1
    psrldq    $8, %xmm0
    pmovsxwd    %xmm0, %xmm0
    pslld    %xmm2, %xmm1
    pslld    %xmm2, %xmm0
    pshufb    .LC0(%rip), %xmm1
    pshufb    .LC1(%rip), %xmm0
    por    %xmm1, %xmm0
    movdqa    %xmm0, r(%rip)
    ret

test_cst:
    movdqa    a(%rip), %xmm0
    pmovsxwd    %xmm0, %xmm1
    psrldq    $8, %xmm0
    pmovsxwd    %xmm0, %xmm0
    pslld    $3, %xmm1
    pshufb    .LC0(%rip), %xmm1
    pslld    $3, %xmm0
    pshufb    .LC1(%rip), %xmm0
    por    %xmm1, %xmm0
    movdqa    %xmm0, r(%rip)
    ret

Why not psllw?
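
For comparison, the shift can be done directly on the 16-bit lanes. A
hand-written SSE2 intrinsics sketch of what I would expect (not compiler
output; it reuses the a[]/r[] arrays from the testcase and uses unaligned
loads/stores so array alignment does not matter):

--cut here--
#include <emmintrin.h>

void
test_var_psllw (int n)
{
  /* psllw with the shift count in an xmm register.  */
  __m128i v = _mm_loadu_si128 ((const __m128i *) a);
  v = _mm_sll_epi16 (v, _mm_cvtsi32_si128 (n));
  _mm_storeu_si128 ((__m128i *) r, v);
}

void
test_cst_psllw (void)
{
  /* psllw with an immediate shift count.  */
  __m128i v = _mm_loadu_si128 ((const __m128i *) a);
  v = _mm_slli_epi16 (v, 3);
  _mm_storeu_si128 ((__m128i *) r, v);
}
--cut here--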

The .optimized dump already shows:

test_var (int n)
{
  vector(8) short int vect_var_.16;
  vector(4) int vect_var_.15;
  vector(4) int vect_var_.14;
  vector(8) short int vect_var_.13;

<bb 2>:
  vect_var_.13_23 = MEM[(short int[8] *)&a];
  vect_var_.14_24 = [vec_unpack_lo_expr] vect_var_.13_23;
  vect_var_.14_25 = [vec_unpack_hi_expr] vect_var_.13_23;
  vect_var_.15_26 = vect_var_.14_24 << n_5(D);
  vect_var_.15_27 = vect_var_.14_25 << n_5(D);
  vect_var_.16_28 = VEC_PACK_TRUNC_EXPR <vect_var_.15_26, vect_var_.15_27>;
  MEM[(short int[8] *)&r] = vect_var_.16_28;
  return;

}


test_cst ()
{
  vector(8) short int vect_var_.36;
  vector(4) int vect_var_.35;
  vector(4) int vect_var_.34;
  vector(8) short int vect_var_.33;

<bb 2>:
  vect_var_.33_22 = MEM[(short int[8] *)&a];
  vect_var_.34_23 = [vec_unpack_lo_expr] vect_var_.33_22;
  vect_var_.34_24 = [vec_unpack_hi_expr] vect_var_.33_22;
  vect_var_.35_25 = vect_var_.34_23 << 3;
  vect_var_.35_26 = vect_var_.34_24 << 3;
  vect_var_.36_27 = VEC_PACK_TRUNC_EXPR <vect_var_.35_25, vect_var_.35_26>;
  MEM[(short int[8] *)&r] = vect_var_.36_27;
  return;

}
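
FWIW, expressing the operation on a vector(8) short type directly avoids the
unpack/shift/pack dance. A minimal sketch using the generic vector extension
(mine, not from the dump; assuming a compiler that accepts shifts on generic
integer vector types, I would expect this to expand to psllw):

--cut here--
typedef short v8hi __attribute__ ((vector_size (16)));

v8hi
shl_v8hi (v8hi x, short n)
{
  /* Splat the scalar count and shift all eight 16-bit lanes at once.  */
  v8hi cnt = { n, n, n, n, n, n, n, n };
  return x << cnt;
}
--cut here--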

The same suboptimal code is generated for long long and byte (on an -mxop
target) operands, both signed and unsigned, for left as well as right shifts.
OTOH, int operands produce optimal code for both left and right shifts:

test_var:
    movdqa    a(%rip), %xmm0
    movd    %edi, %xmm1
    pslld    %xmm1, %xmm0
    movdqa    %xmm0, r(%rip)
    ret

test_cst:
    movdqa    a(%rip), %xmm0
    pslld    $3, %xmm0
    movdqa    %xmm0, r(%rip)
    ret
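
(The optimal sequence above is what the int variant of the testcase produces;
reconstruction below, assuming only the array element type changes, with
test_cst analogous using a constant shift:)

--cut here--
#define N 8

int a[N] = { 1,2,3,4,10,20,30,90 };
int r[N];

void
test_var (int n)
{
  int i;

  for (i = 0; i < N; i++)
    r[i] = a[i] << n;
}
--cut here--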
