https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120517

            Bug ID: 120517
           Summary: [16 Regression] "For datarefs with big gap, split them
                    into different groups" vs. nvptx SLP vectorizer
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Keywords: testsuite-fail
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tschwinge at gcc dot gnu.org
                CC: liuhongt at gcc dot gnu.org, rguenth at gcc dot gnu.org,
                    vries at gcc dot gnu.org
  Target Milestone: ---
            Target: nvptx

This is about the recent commit
r16-924-g1bc5b47f5b06dc4e8d2e7b622a7100b40b8e6b27 "For datarefs with big gap,
split them into different groups".

GCC '--target=nvptx-none' has (very) limited support for (SLP) vectorization. 
Tom (CCed) implemented this many years ago; I'm not very familiar with this
specifically, or with the vectorizer in general.

Looking at the code generated for the nvptx target libraries, this commit
appears to be an overall improvement: it resolves some gratuitous use of moves
through vector registers, nice.  (..., which may have been an nvptx target
instruction costing issue?)

However, it also regresses:

    PASS: gcc.target/nvptx/slp-2.c (test for excess errors)
    [-PASS:]{+FAIL:+} gcc.target/nvptx/slp-2.c scan-assembler ld.v2.u64
    [-PASS:]{+FAIL:+} gcc.target/nvptx/slp-2.c scan-assembler st.v2.u64

(So far, I've only tested 'nvptx.exp'.)

..., and in nvptx offloading configuration:

    PASS: libgomp.oacc-c/vec.c -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none  -O2  (test for excess errors)
    PASS: libgomp.oacc-c/vec.c -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none  -O2  execution test
    [-PASS:-]{+FAIL:+} libgomp.oacc-c/vec.c -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none  -O2   scan-nvptx-none-offload-tree-dump slp1 "vector\\(2\\) long long unsigned int"

Looking at 'gcc.target/nvptx/slp-2.c', where we've got:

    long long int p[1000] __attribute__((aligned(16)));
    long long int p2[1000] __attribute__((aligned(16)));

    void __attribute__((noinline, noclone))
    foo ()
    {
      long long int a, b;

      int i;
      for (i = 0; i < 1000; i += 2)
        {
          a = p[i];
          b = p[i+1];

          p2[i] = a;
          p2[i+1] = b;
        }
    }

..., which, before this commit, used to generate:

            .visible .global .align 16 .u64 p[1000];
            .visible .global .align 16 .u64 p2[1000];
    .visible .func foo
    {
            .reg.u64 %r22;
            .reg.u64 %r24;
            .reg.v2.u64 %r26;
            .reg.u64 %r27;
            .reg.pred %r28;
                    cvta.global.u64 %r24, p;
                    cvta.global.u64 %r22, p2;
                    add.u64 %r27, %r24, 8000;
    $L2:
                    ld.v2.u64       %r26, [%r24];
                    st.v2.u64       [%r22], %r26;
                    add.u64 %r24, %r24, 16;
                    add.u64 %r22, %r22, 16;
                    setp.ne.u64     %r28, %r24, %r27;
            @%r28   bra     $L2;
            ret;
    }

..., that is, load from 'p' and store into 'p2' with 'v2.u64', handling 'i'
and 'i+1' at once, 16 bytes at a time.

In 'gcc/config/nvptx/nvptx-modes.def' we have:

    VECTOR_MODE (INT, DI, 2);  /* V2DI */
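
(For illustration only, and not claiming this is exactly what the vectorizer
builds internally: using GCC's generic vector extension, the 'V2DI' copy that
used to be formed roughly corresponds to the following sketch; 'v2di' and
'foo_vectorized_sketch' are names made up here.)

    /* Sketch only: roughly what the SLP-vectorized loop body amounts to.
       'p' and 'p2' are the arrays declared in the testcase above; the
       16-byte vector type maps onto the V2DI mode, and the copy becomes
       the 'ld.v2.u64' / 'st.v2.u64' pair seen in the PTX.  */
    typedef long long v2di __attribute__ ((vector_size (16)));

    void
    foo_vectorized_sketch (void)
    {
      int i;
      for (i = 0; i < 1000; i += 2)
        *(v2di *) &p2[i] = *(v2di *) &p[i];  /* 16 bytes per load/store */
    }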

After this commit, we now generate:

    .visible .func foo
    {
            .reg.u64 %r23;
            .reg.u64 %r24;
            .reg.u64 %r25;
            .reg.u64 %r26;
            .reg.u64 %r27;
            .reg.u64 %r30;
            .reg.u64 %r33;
            .reg.pred %r34;
                    cvta.global.u64 %r24, p;
                    cvta.global.u64 %r25, p2;
                    add.u64 %r26, %r24, 8;
                    add.u64 %r27, %r25, 8;
                    add.u64 %r30, %r24, 8000;
    $L2:
                    ld.u64  %r23, [%r26];
                    ld.u64  %r33, [%r24];
                    st.u64  [%r25], %r33;
                    st.u64  [%r27], %r23;
                    add.u64 %r24, %r24, 16;
                    add.u64 %r25, %r25, 16;
                    add.u64 %r26, %r26, 16;
                    add.u64 %r27, %r27, 16;
                    setp.ne.u64     %r34, %r24, %r30;
            @%r34   bra     $L2;
            ret;
    }

..., that is, load from 'p' and store into 'p2' with 'u64', handling 'i' and
'i+1' separately, 8 bytes at a time.  I've not benchmarked it, but it certainly
looks inferior?

Per the commit, we've now got this check in
'gcc/tree-vect-data-refs.cc:vect_analyze_data_ref_accesses':

    3685                  /* For datarefs with big gap, it's better to split them into different
    3686                     groups.
    3687                     .i.e a[0], a[1], a[2], .. a[7], a[100], a[101],..., a[107]  */
    3688                  if ((unsigned HOST_WIDE_INT)(init_b - init_prev) * tree_to_uhwi (szb)
    3689                      > MAX_BITSIZE_MODE_ANY_MODE / BITS_PER_UNIT)
    3690                    break;
    (gdb) print init_b
    $3 = 8
    (gdb) print init_prev
    $4 = 0
    (gdb) call tree_to_uhwi (szb)
    $5 = 8

..., and per nvptx '[build-gcc]/gcc/insn-modes.h', we've got:

    #define BITS_PER_UNIT (8)
    #define MAX_BITSIZE_MODE_ANY_INT (16*BITS_PER_UNIT)
    #define MAX_BITSIZE_MODE_ANY_MODE (32*BITS_PER_UNIT)

..., so '(8 - 0) * 8 > (32*8) / (8)', that is, '64 > 32', is true, thus
'break;': the 'p[i]' and 'p[i+1]' accesses get split into different groups,
and the 'V2DI' accesses are no longer formed.
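
(Just to spell that arithmetic out stand-alone, with the values observed
above; this is a sketch, not the actual GCC code path:)

    /* Sketch: the condition from vect_analyze_data_ref_accesses, evaluated
       with the values seen in the debugger and the nvptx limits from
       insn-modes.h.  */
    #include <stdio.h>

    int
    main (void)
    {
      unsigned long long init_b = 8, init_prev = 0, szb = 8;
      /* MAX_BITSIZE_MODE_ANY_MODE / BITS_PER_UNIT on nvptx: (32*8) / 8 = 32.  */
      unsigned long long limit = (32 * 8) / 8;

      if ((init_b - init_prev) * szb > limit)  /* 64 > 32: true */
        printf ("split: p[i] and p[i+1] end up in different groups\n");
      return 0;
    }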
