https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121218

--- Comment #10 from Richard Biener <rguenth at gcc dot gnu.org> ---
Even without -fdbg-cnt, there's exactly one SLP vectorization, at the place
pointed out.

The vector code still seems sensible at RTL expansion time:

  __BB(6,guessed_local(119292721)):
  _18 = __MEM <uint128_t, 8> ((char * {ref-all})in$16_1);
  __MEM <uint128_t, 64> ((struct Vec128 *)&in0) = _18;
  _19 = __MEM <uint128_t, 8> ((char * {ref-all})in$16_1 + _Literal (char *
{ref-all}) 16);
  _20 = __MEM <uint128_t, 8> ((char * {ref-all})in$16_1 + _Literal (char *
{ref-all}) 32);
  __MEM <uint128_t, 64> ((struct Vec128 *)&in2) = _20;
  _21 = __MEM <uint128_t, 8> ((char * {ref-all})in$16_1 + _Literal (char *
{ref-all}) 48);
  _133 = (long unsigned int) _18;
  _128 = (long unsigned int) _19;
  _127 = (long unsigned int) _20;
  _125 = (long unsigned int) _21;
  _22 = _Literal (long unsigned int [[gnu::vector_size(32)]]) {_133, _128,
_127, _125};
  _123 = __BIT_FIELD_REF <long unsigned int> (_18, 64u, 64u);
  _121 = __BIT_FIELD_REF <long unsigned int> (_19, 64u, 64u);
  _120 = __BIT_FIELD_REF <long unsigned int> (_20, 64u, 64u);
  _119 = __BIT_FIELD_REF <long unsigned int> (_21, 64u, 64u);
  _78 = _Literal (long unsigned int [[gnu::vector_size(32)]]) {_123, _121,
_120, _119};
  __MEM <long unsigned int [[gnu::vector_size(32)]], 64> ((TFromD *
restrict)actual_aligned$16_2 + _Literal (TFromD * restrict) 8) = _22;
  __MEM <long unsigned int [[gnu::vector_size(32)]], 64> ((TFromD *
restrict)actual_aligned$16_2 + _Literal (TFromD * restrict) 40) = _78;
  __MEM <uint128_t, 8> ((char * {ref-all})actual_aligned$16_2 + _Literal (char
* {ref-all}) 72) = 0u;
  buf0[0ul] = _133;
  buf2[0ul] = _127;
  buf0[1ul] = _123;
  buf2[1ul] = _120;
  _37 = __MEM <uint128_t, 8> ((char * {ref-all})&buf0);
  _38 = __MEM <uint128_t, 8> ((char * {ref-all})&buf2);
  buf0 ={v} _Literal (long unsigned int[2]) {CLOBBER(eos)};
  buf2 ={v} _Literal (long unsigned int[2]) {CLOBBER(eos)};
  __asm__ __volatile__("# FOO" :  :  : "memory");

Interestingly, -fno-tree-sra fixes the issue; the vectorization is similar,
but the verification code is affected (outlining that code also avoids the
issue, but placing memory barriers before it does not).  I have

...
    StoreInterleaved4(in0, in1, in2, in3, d, actual);
    StoreU(Zero(d), d, actual + kVectors * N);
    Vec<D> out0, out1, out2, out3;
    LoadInterleaved4(d, actual, out0, out1, out2, out3);
  __asm__ volatile ("# FOO" : : : "memory");
    AssertVecEqual(
        d, in0, out0,
        "/home/sam/bugs/highway/highway-1.2.0/hwy/tests/interleaved_test.cc",
        151);
  __asm__ volatile ("# FOO" : : : "memory");
    AssertVecEqual(
        d, in2, out2,
        "/home/sam/bugs/highway/highway-1.2.0/hwy/tests/interleaved_test.cc",
        153);

This still reproduces it, but removing either AssertVecEqual fixes it.

-fdisable-rtl-cse1 -fdisable-rtl-cse2 fixes it as well.

Reply via email to