http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50918
Bug #: 50918
Summary: Unoptimal code for vec-shift by scalar for integer (byte,
         short, long long) operands
Classification: Unclassified
Product: gcc
Version: 4.7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: ubiz...@gmail.com

Following testcase:

--cut here--
#define N 8

short a[N] = { 1,2,3,4,10,20,30,90 };
short r[N];

void
test_var (int n)
{
  int i;

  for (i = 0; i < N; i++)
    r[i] = a[i] << n;
}

void
test_cst (void)
{
  int i;

  for (i = 0; i < N; i++)
    r[i] = a[i] << 3;
}
--cut here--

compiles to (-march=corei7 -O2 -ftree-vectorize):

test_var:
        movdqa  a(%rip), %xmm0
        movd    %edi, %xmm2
        pmovsxwd        %xmm0, %xmm1
        psrldq  $8, %xmm0
        pmovsxwd        %xmm0, %xmm0
        pslld   %xmm2, %xmm1
        pslld   %xmm2, %xmm0
        pshufb  .LC0(%rip), %xmm1
        pshufb  .LC1(%rip), %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, r(%rip)
        ret

test_cst:
        movdqa  a(%rip), %xmm0
        pmovsxwd        %xmm0, %xmm1
        psrldq  $8, %xmm0
        pmovsxwd        %xmm0, %xmm0
        pslld   $3, %xmm1
        pshufb  .LC0(%rip), %xmm1
        pslld   $3, %xmm0
        pshufb  .LC1(%rip), %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, r(%rip)
        ret

Why not psllw?

The .optimized dump already shows:

test_var (int n)
{
  vector(8) short int vect_var_.16;
  vector(4) int vect_var_.15;
  vector(4) int vect_var_.14;
  vector(8) short int vect_var_.13;

<bb 2>:
  vect_var_.13_23 = MEM[(short int[8] *)&a];
  vect_var_.14_24 = [vec_unpack_lo_expr] vect_var_.13_23;
  vect_var_.14_25 = [vec_unpack_hi_expr] vect_var_.13_23;
  vect_var_.15_26 = vect_var_.14_24 << n_5(D);
  vect_var_.15_27 = vect_var_.14_25 << n_5(D);
  vect_var_.16_28 = VEC_PACK_TRUNC_EXPR <vect_var_.15_26, vect_var_.15_27>;
  MEM[(short int[8] *)&r] = vect_var_.16_28;
  return;
}

test_cst ()
{
  vector(8) short int vect_var_.36;
  vector(4) int vect_var_.35;
  vector(4) int vect_var_.34;
  vector(8) short int vect_var_.33;

<bb 2>:
  vect_var_.33_22 = MEM[(short int[8] *)&a];
  vect_var_.34_23 = [vec_unpack_lo_expr] vect_var_.33_22;
  vect_var_.34_24 = [vec_unpack_hi_expr] vect_var_.33_22;
  vect_var_.35_25 = vect_var_.34_23 << 3;
  vect_var_.35_26 = vect_var_.34_24 << 3;
  vect_var_.36_27 = VEC_PACK_TRUNC_EXPR <vect_var_.35_25, vect_var_.35_26>;
  MEM[(short int[8] *)&r] = vect_var_.36_27;
  return;
}

The same suboptimal code is generated for long long and byte (-mxop target)
operands, signed and unsigned, for both left and right shifts. OTOH, int
arguments produce optimal code for left and right shifts:

test_var:
        movdqa  a(%rip), %xmm0
        movd    %edi, %xmm1
        pslld   %xmm1, %xmm0
        movdqa  %xmm0, r(%rip)
        ret

test_cst:
        movdqa  a(%rip), %xmm0
        pslld   $3, %xmm0
        movdqa  %xmm0, r(%rip)
        ret
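For reference, a minimal sketch (not part of the report) of what the expected
single-psllw code looks like when written by hand with SSE2 intrinsics; the
function names test_var_intrin/test_cst_intrin and the aligned attribute are
illustrative assumptions, not taken from the testcase:

#include <emmintrin.h>

#define N 8

short a[N] __attribute__ ((aligned (16))) = { 1,2,3,4,10,20,30,90 };
short r[N] __attribute__ ((aligned (16)));

void
test_var_intrin (int n)
{
  __m128i v = _mm_load_si128 ((const __m128i *) a);   /* movdqa a(%rip) */
  __m128i cnt = _mm_cvtsi32_si128 (n);                 /* movd %edi, %xmm */
  v = _mm_sll_epi16 (v, cnt);                          /* psllw %xmm, %xmm */
  _mm_store_si128 ((__m128i *) r, v);                  /* movdqa to r(%rip) */
}

void
test_cst_intrin (void)
{
  __m128i v = _mm_load_si128 ((const __m128i *) a);
  v = _mm_slli_epi16 (v, 3);                           /* psllw $3, %xmm */
  _mm_store_si128 ((__m128i *) r, v);
}

This shifts all eight shorts in one instruction, with no unpack-to-int /
pack-back sequence, which is the shape of code the "Why not psllw?" question
is asking the vectorizer to produce.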