Hello Paolo! > Heh, I'm quite at a loss regarding PR22480. I don't know exactly what > to do because i386 does not support, e.g. { 2, 4 } << { 1, 2 } (which > would give {4, 16} as a result). There is indeed a back-end problem, > because ashl<mode>3 is supposed to have two operands of the same mode, > not one vector and one SI! > I have changed your proposed patterns a bit:
--cut here-- (define_predicate "vec_shift_operand" (and (ior (match_code "reg") (match_code "const_vector")) (match_test "GET_MODE_CLASS (mode) == MODE_VECTOR_INT")) { unsigned elt = GET_MODE_NUNITS (mode) - 1; HOST_WIDE_INT ref; if (GET_CODE (op) == CONST_VECTOR) { ref = INTVAL (CONST_VECTOR_ELT (op, elt)); while (--elt) if (INTVAL (CONST_VECTOR_ELT (op, elt)) != ref) return 0; } return 1; }) (define_expand "ashl<mode>3" [(set (match_operand:SSEMODE248 0 "register_operand" "") (ashift:SSEMODE248 (match_operand:SSEMODE248 1 "register_operand" "") (match_operand:SSEMODE248 2 "vec_shift_operand" "")))] "TARGET_SSE2" { if (GET_CODE (operands[2]) == CONST_VECTOR) operands[2] = CONST_VECTOR_ELT (operands[2], 0); else operands[2] = gen_lowpart (SImode, operands[2]); }) (define_insn "sse_psll<mode>3" [(set (match_operand:SSEMODE248 0 "register_operand" "=x") (ashift:SSEMODE248 (match_operand:SSEMODE248 1 "register_operand" "0") (match_operand:SI 2 "nonmemory_operand" "xi")))] "TARGET_SSE2" "psll<ssevecsize>\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") (set_attr "mode" "TI")]) --cut here-- > > This however will not fix "a << b" shifts, which right now should (my > guess) ICE with something similar to PR22480. With the above patterns, I tried to solve the a << b shifts. The proposed solution is to generate an SImode lowpart out of the SImode vector, to get only the element[0] value, in the hope that all elements are equal (this is true for the pr22480.c testcases). 
For the following testcase: void test_1 (void) { static unsigned bm[16]; int j; for (j = 0; j < 16; j++) bm[j] <<= 8; } void test_2 (int a) { static unsigned bm[16]; int j; for (j = 0; j < 16; j++) bm[j] <<= a; } I was able to generate the following code (gcc -O2 -msse2 -ftree-vectorize -fomit-frame-pointer): test_1: movl $bm.1591, %eax movl $bm.1591, %edx .p2align 4,,15 .L2: movdqa (%eax), %xmm0 addl $4, %edx pslld $8, %xmm0 movdqa %xmm0, (%eax) addl $16, %eax cmpl $bm.1591+16, %edx jne .L2 ret test_2: subl $28, %esp movl $bm.1602, %eax movl $bm.1602, %edx movd 32(%esp), %xmm0 pshufd $0, %xmm0, %xmm0 movdqa %xmm0, (%esp) .p2align 4,,15 .L9: movdqa (%eax), %xmm0 movd (%esp), %xmm1 addl $4, %edx pslld %xmm1, %xmm0 movdqa %xmm0, (%eax) addl $16, %eax cmpl $bm.1602+16, %edx jne .L9 addl $28, %esp ret A couple of (unrelated) problems can also be seen with the above code. The first one is PR target/22479 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22497, where a register is wasted in a vectorised loop; the other problem is that movd is not pushed out of the loop. In this case, gcc should be able to optimize: movd 32(%esp), %xmm0 pshufd $0, %xmm0, %xmm0 movdqa %xmm0, (%esp) ... movd (%esp), %xmm1 into movd 32(%esp), %xmm1 BTW: This change breaks gcc.dg/i386-sse-6.c, because __builtin_ia32_psllwi128 still calls ashl<mode>3 with an integer parameter. Otherwise there are no other regressions. > By the way, it is time to remove the mmx_ prefix from the MMX insns! Not yet... The emms patch depends heavily on the optimize_mode_switching functionality; however, o_m_s should be enhanced not to insert switching insns into loops. Uros.