Hello Paolo!

> Heh, I'm quite at a loss regarding PR22480.  I don't know exactly what
> to do because i386 does not support, e.g. { 2, 4 } << { 1, 2 } (which
> would give {4, 16} as a result).  There is indeed a back-end problem,
> because ashl<mode>3 is supposed to have two operands of the same mode,
> not one vector and one SI!
> 
I have changed your proposed patterns a bit:

--cut here--
;; Return true if OP is usable as the shift-count operand of the vector
;; shift expander: an integer-vector-mode register, or a CONST_VECTOR of
;; that mode whose elements are all equal.  Uniformity matters because
;; the expander picks a single element of the constant as the scalar
;; shift count.
(define_predicate "vec_shift_operand"
 (and (ior (match_code "reg")
           (match_code "const_vector"))
      (match_test "GET_MODE_CLASS (mode) == MODE_VECTOR_INT"))
{
 if (GET_CODE (op) == CONST_VECTOR)
   {
     /* Use the last element as the reference value.  */
     unsigned elt = GET_MODE_NUNITS (mode) - 1;
     HOST_WIDE_INT ref = INTVAL (CONST_VECTOR_ELT (op, elt));

     /* Compare every remaining element, down to and including element 0,
        against the reference.  Post-decrement is required here: a
        pre-decrement would stop before index 0 (the very element the
        expander extracts) and would wrap around for a one-element
        vector.  */
     while (elt--)
       if (INTVAL (CONST_VECTOR_ELT (op, elt)) != ref)
         return 0;
   }
 return 1;
})

;; Expander for a vector left shift, enabled under TARGET_SSE2.
;; Operand 2 is accepted as a vector of the same mode as operands 0 and 1
;; (see vec_shift_operand) and is then rewritten to a scalar count:
;;   - for a CONST_VECTOR, element 0 of the constant;
;;   - otherwise, the SImode lowpart of the register.
;; The register case does not check that all elements are equal; it simply
;; uses the low part and relies on the caller providing a uniform vector.
;; NOTE(review): SSEMODE248 is defined outside this snippet.  If it includes
;; a mode whose elements are narrower than SImode, the SImode lowpart spans
;; more than one element and is not the per-element count -- confirm.
(define_expand "ashl<mode>3"
 [(set (match_operand:SSEMODE248 0 "register_operand" "")
       (ashift:SSEMODE248
         (match_operand:SSEMODE248 1 "register_operand" "")
         (match_operand:SSEMODE248 2 "vec_shift_operand" "")))]
 "TARGET_SSE2"
{
  if (GET_CODE (operands[2]) == CONST_VECTOR)
    operands[2] = CONST_VECTOR_ELT (operands[2], 0);
  else
    operands[2] = gen_lowpart (SImode, operands[2]);
})

;; Instruction pattern emitting psll<ssevecsize>, enabled under TARGET_SSE2.
;; Operand 1 is tied to the output register (constraint "0"), matching the
;; two-operand form in the output template.  Operand 2 is a single SImode
;; shift count (not a vector) with constraint "xi".
(define_insn "sse_psll<mode>3"
 [(set (match_operand:SSEMODE248 0 "register_operand" "=x")
       (ashift:SSEMODE248
         (match_operand:SSEMODE248 1 "register_operand" "0")
         (match_operand:SI 2 "nonmemory_operand" "xi")))]
 "TARGET_SSE2"
 "psll<ssevecsize>\t{%2, %0|%0, %2}"
 [(set_attr "type" "sseishft")
  (set_attr "mode" "TI")])

--cut here--

> 
> This however will not fix "a << b" shifts, which right now should (my
> guess) ICE with something similar to PR22480.

With the above patterns, I tried to solve the a << b shifts. The proposed
solution is to take the SImode lowpart of the shift-count vector, so that
only the element[0] value is used, in the hope that all elements are equal
(this is true for the pr22480.c testcases).

For following testcase:

void
test_1 (void)
{
  static unsigned bm[16];
  int j;
  for (j = 0; j < 16; j++)
    bm[j] <<= 8;
}

void
test_2 (int a)
{
  static unsigned bm[16];
  int j;
  for (j = 0; j < 16; j++)
    bm[j] <<= a;
}

I was able to generate the following code (gcc -O2 -msse2 -ftree-vectorize
-fomit-frame-pointer):

test_1:
        movl    $bm.1591, %eax
        movl    $bm.1591, %edx
        .p2align 4,,15
.L2:
        movdqa  (%eax), %xmm0
        addl    $4, %edx
        pslld   $8, %xmm0
        movdqa  %xmm0, (%eax)
        addl    $16, %eax
        cmpl    $bm.1591+16, %edx
        jne     .L2
        ret

test_2:
        subl    $28, %esp
        movl    $bm.1602, %eax
        movl    $bm.1602, %edx
        movd    32(%esp), %xmm0
        pshufd  $0, %xmm0, %xmm0
        movdqa  %xmm0, (%esp)
        .p2align 4,,15
.L9:
        movdqa  (%eax), %xmm0
        movd    (%esp), %xmm1
        addl    $4, %edx
        pslld   %xmm1, %xmm0
        movdqa  %xmm0, (%eax)
        addl    $16, %eax
        cmpl    $bm.1602+16, %edx
        jne     .L9
        addl    $28, %esp
        ret

A couple of (unrelated) problems can also be seen in the above code.
The first one is PR target/22479
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22497, where a register is
wasted in a vectorised loop; the other problem is that the movd is not
moved out of the loop.

In this case, gcc should be able to optimize:

        movd    32(%esp), %xmm0
        pshufd  $0, %xmm0, %xmm0
        movdqa  %xmm0, (%esp)
        ...
        movd    (%esp), %xmm1

into
        movd    32(%esp), %xmm1

BTW: This change breaks gcc.dg/i386-sse-6.c, because
__builtin_ia32_psllwi128 still calls ashl<mode>3 with an integer
parameter. Otherwise there are no other regressions.

> By the way, it is time to remove the mmx_ prefix from the MMX insns!

Not yet... emms patch depends heavily on optimize_mode_switching
functionality, however o_m_s should be enhanced not to insert
switching insns into the loops.

Uros.

Reply via email to