On Fri, 10 Jun 2016, Richard Biener wrote:
>
> With the proposed cost change for vector construction we will end up
> vectorizing the testcase in PR68961 again (on x86_64 and likely
> on ppc64le as well after that target gets adjustments). Currently
> we can't optimize that away again noticing the direct overlap of
> argument and return registers. The obstacle is
>
> (insn 7 4 8 2 (set (reg:V2DF 93)
> (vec_concat:V2DF (reg/v:DF 91 [ a ])
> (reg/v:DF 92 [ aa ])))
> ...
> (insn 21 8 24 2 (set (reg:DI 97 [ D.1756 ])
> (subreg:DI (reg:TI 88 [ D.1756 ]) 0))
> (insn 24 21 11 2 (set (reg:DI 100 [+8 ])
> (subreg:DI (reg:TI 88 [ D.1756 ]) 8))
>
> which we eventually optimize to DFmode subregs of (reg:V2DF 93).
>
> First of all simplify_subreg doesn't handle the subregs of a vec_concat
> (easy fix below).
>
> Then combine doesn't like to simplify the multi-use (it tries some
> parallel it seems). So I went to forwprop which eventually manages
> to do this but throws away the result (reg:DF 91) or (reg:DF 92)
> because it is not a constant. Thus I allow arbitrary simplification
> results for SUBREGs of [VEC_]CONCAT operations. There doesn't seem
> to be a magic flag to tell it to restrict to the case where all
> uses can be simplified or so, nor to restrict simplifications to a REG.
> But I don't see any undesirable simplifications of (subreg
> ([vec_]concat)).
>
> For the testcase I'm not sure if I have to exclude some ABIs (mingw?).
>
> Bootstrap and regtest in progress on x86_64-unknown-linux-gnu, I'll
> install the simplify-rtx.c change if that succeeds but would like to
> have opinions on the fwprop.c change.
So the bootstrap exposes a latent issue in simplify-rtx.c in the changed
hunk via gcc.target/i386/mmx-8.c on i?86 which ends up with a
(vec_concat:V2SI (reg:SI 103)
(const_int 0 [0]))
and thus a VOIDmode 2nd operand (I'm sure this can happen for
complex integer concat as well, thus latent). I am adjusting the
simplify_subreg hunk to always pass GET_MODE_INNER (innermode)
(that hopefully exercises it a bit more than using it only
if GET_MODE (part) == VOIDmode - and the two should always
agree).
Re-bootstrap / regtest running on x86_64-unknown-linux-gnu.
Comments still welcome.
Thanks,
Richard.
2016-06-13 Richard Biener <[email protected]>
PR rtl-optimization/68961
* simplify-rtx.c (simplify_subreg): Handle VEC_CONCAT like CONCAT.
* fwprop.c (propagate_rtx): Allow SUBREGs of VEC_CONCAT and CONCAT
to simplify to a non-constant.
* gcc.target/i386/pr68961.c: New testcase.
Index: gcc/simplify-rtx.c
===================================================================
*** gcc/simplify-rtx.c (revision 237286)
--- gcc/simplify-rtx.c (working copy)
*************** simplify_subreg (machine_mode outermode,
*** 6108,6116 ****
&& GET_MODE_SIZE (outermode) <= GET_MODE_SIZE (GET_MODE (op)))
return adjust_address_nv (op, outermode, byte);
! /* Handle complex values represented as CONCAT
! of real and imaginary part. */
! if (GET_CODE (op) == CONCAT)
{
unsigned int part_size, final_offset;
rtx part, res;
--- 6108,6117 ----
&& GET_MODE_SIZE (outermode) <= GET_MODE_SIZE (GET_MODE (op)))
return adjust_address_nv (op, outermode, byte);
! /* Handle complex or vector values represented as CONCAT or VEC_CONCAT
! of two parts. */
! if (GET_CODE (op) == CONCAT
! || GET_CODE (op) == VEC_CONCAT)
{
unsigned int part_size, final_offset;
rtx part, res;
*************** simplify_subreg (machine_mode outermode,
*** 6130,6139 ****
if (final_offset + GET_MODE_SIZE (outermode) > part_size)
return NULL_RTX;
! res = simplify_subreg (outermode, part, GET_MODE (part), final_offset);
if (res)
return res;
! if (validate_subreg (outermode, GET_MODE (part), part, final_offset))
return gen_rtx_SUBREG (outermode, part, final_offset);
return NULL_RTX;
}
--- 6131,6141 ----
if (final_offset + GET_MODE_SIZE (outermode) > part_size)
return NULL_RTX;
! enum machine_mode part_mode = GET_MODE_INNER (innermode);
! res = simplify_subreg (outermode, part, part_mode, final_offset);
if (res)
return res;
! if (validate_subreg (outermode, part_mode, part, final_offset))
return gen_rtx_SUBREG (outermode, part, final_offset);
return NULL_RTX;
}
Index: gcc/fwprop.c
===================================================================
*** gcc/fwprop.c (revision 237286)
--- gcc/fwprop.c (working copy)
*************** propagate_rtx (rtx x, machine_mode mode,
*** 664,670 ****
|| (GET_CODE (new_rtx) == SUBREG
&& REG_P (SUBREG_REG (new_rtx))
&& (GET_MODE_SIZE (mode)
! <= GET_MODE_SIZE (GET_MODE (SUBREG_REG (new_rtx))))))
flags |= PR_CAN_APPEAR;
if (!varying_mem_p (new_rtx))
flags |= PR_HANDLE_MEM;
--- 664,673 ----
|| (GET_CODE (new_rtx) == SUBREG
&& REG_P (SUBREG_REG (new_rtx))
&& (GET_MODE_SIZE (mode)
! <= GET_MODE_SIZE (GET_MODE (SUBREG_REG (new_rtx)))))
! || ((GET_CODE (new_rtx) == VEC_CONCAT
! || GET_CODE (new_rtx) == CONCAT)
! && GET_CODE (x) == SUBREG))
flags |= PR_CAN_APPEAR;
if (!varying_mem_p (new_rtx))
flags |= PR_HANDLE_MEM;
Index: gcc/testsuite/gcc.target/i386/pr68961.c
===================================================================
*** gcc/testsuite/gcc.target/i386/pr68961.c (revision 0)
--- gcc/testsuite/gcc.target/i386/pr68961.c (working copy)
***************
*** 0 ****
--- 1,19 ----
+ /* { dg-do compile { target lp64 } } */
+ /* { dg-options "-O3 -fno-vect-cost-model -fdump-tree-slp2-details" } */
+
+ struct x { double d[2]; };
+
+ struct x
+ pack (double a, double aa)
+ {
+ struct x u;
+ u.d[0] = a;
+ u.d[1] = aa;
+ return u;
+ }
+
+ /* The function should be optimized to just return as arguments and
+ result exactly overlap even when previously vectorized. */
+
+ /* { dg-final { scan-tree-dump "basic block vectorized" "slp2" } } */
+ /* { dg-final { scan-assembler-not "mov" } } */