[Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop

juzhe.zhong at rivai dot ai via Gcc-bugs Wed, 17 Jan 2024 04:38:20 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441


            Bug ID: 113441
           Summary: [14 Regression] Fail to fold the last element with
                    multiple loop
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

Hi, we found a regression between GCC-12 and GCC-14 when comparing our
downstream RVV GCC against upstream RVV GCC.

This regression happens not only with our RVV GCC but also with ARM SVE GCC.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Input data read by foo (): 4 rows of 1 * 273 * 12 * 2 = 6552 int16_t
   values each.  foo () reads every row in pairs, at indices 2 * m and
   2 * m + 1.  */
int16_t array1[4][1 * 273 * 12 * 2];

/* Output written by foo (); same dimensions as array1 and written in the
   same pairwise pattern (indices 2 * m and 2 * m + 1 of row i).  */
int16_t array2[4][1 * 273 * 12 * 2];
/* Coefficients read by foo (): row i holds 8 values that foo () uses as
   four pairs, one pair per row of array1.  */
int16_t array3[4][4 * 2];

/* Reproducer kernel.  For each of the 4 rows I of array3, walk
   m = 0 .. a * b * 12 - 1 and store into array2[I] a pair of values built
   from the pairs (array1[r][2m], array1[r][2m + 1]), r = 0..3, combined
   with the coefficient pairs (array3[I][2r], array3[I][2r + 1]).

   A is the trip count of the j loop and B the trip count of the k loop;
   the innermost l loop always runs 12 times.

   No bounds checking is done.  The largest index used is
   2 * (a * b * 12) - 1 and the second dimension of array1/array2 is 6552,
   so the caller must keep a * b <= 273.

   NOTE(review): this is a compiler test case; the exact loop nest, types
   and statement order are what is being reported on, so the code is left
   untouched (including the mail-wrapped continuation lines).  */
void
foo (uint8_t a, uint16_t b)
{
  int32_t sum[2];
  int32_t result[4][2];
  uint16_t j = 0;
  uint8_t i = 0;
  uint16_t l = 0;
  uint16_t k = 0;
  uint32_t m = 0;

  for (i = 0; i < 4; i++)
    {
      /* m restarts for every coefficient row, so each output row of array2
         is filled from index 0.  */
      m = 0;
      /* j, k and l only count iterations; all array indexing below goes
         through m (and i).  */
      for (j = 0; j < a; j++)
        {
          for (k = 0; k < b; k++)
            {
              /* Innermost loop with a compile-time constant trip count
                 of 12.  */
              for (l = 0; l < 12; l++)
                {
                  /* Each result[r] pair has the form
                       [0] = x0 * c0 - x1 * c1
                       [1] = x1 * c0 + x0 * c1
                     with (x0, x1) taken from array1[r] and (c0, c1) from
                     array3[i].  This looks like a complex multiplication
                     with even index = real part, odd index = imaginary
                     part -- presumably the intent; the source does not
                     say so.  */
                  result[0][0] = array1[0][2 * m] * array3[i][0]
                                 - array1[0][2 * m + 1] * array3[i][1];
                  result[0][1] = array1[0][2 * m + 1] * array3[i][0]
                                 + array1[0][2 * m] * array3[i][1];

                  result[1][0] = array1[1][2 * m] * array3[i][2]
                                 - array1[1][2 * m + 1] * array3[i][3];
                  result[1][1] = array1[1][2 * m + 1] * array3[i][2]
                                 + array1[1][2 * m] * array3[i][3];

                  result[2][0] = array1[2][2 * m] * array3[i][4]
                                 - array1[2][2 * m + 1] * array3[i][5];
                  result[2][1] = array1[2][2 * m + 1] * array3[i][4]
                                 + array1[2][2 * m] * array3[i][5];

                  result[3][0] = array1[3][2 * m] * array3[i][6]
                                 - array1[3][2 * m + 1] * array3[i][7];
                  result[3][1] = array1[3][2 * m + 1] * array3[i][6]
                                 + array1[3][2 * m] * array3[i][7];
                  /* Accumulate the four partial results component-wise.  */
                  sum[0]
                    = result[0][0] + result[1][0] + result[2][0] +
result[3][0];
                  sum[1]
                    = result[0][1] + result[1][1] + result[2][1] +
result[3][1];
                  /* Scale down by 2^15 and narrow to int16_t (presumably
                     Q15 fixed-point scaling -- TODO confirm).  sum[] is
                     signed, so the shift of a negative value is
                     implementation-defined in C.  */
                  array2[i][2 * m] = (int16_t) (sum[0] >> 15);
                  array2[i][2 * m + 1] = (int16_t) (sum[1] >> 15);
                  m++;
                }
            }
        }
    }
}

Here is a reference:

https://godbolt.org/z/hfqWvdf8e

Here is the analysis:

First, note that the innermost loop has 12 iterations (for (l = 0; l < 12; l++)).

GCC 14 processes 11 elements in vector code and leaves the last element to scalar code:

```
        mov     x1, 11           ---> process 11 elements
        whilelo p5.s, xzr, x1
        ...
        vector codes
        ...
        scalar codes of the last element:
        ldrsh   w8, [x0, x5, lsl 1]
        add     x6, x5, x10
        ldrsh   w14, [x0, x7, lsl 1]
        add     x1, x4, x10
        ldrsh   w7, [x0, x4, lsl 1]
        add     x12, x5, x27
        ldrsh   w2, [x0, x2, lsl 1]
        add     x5, x28, x5
        mul     w11, w24, w8
        ldrsh   w13, [x0, x6, lsl 1]
        ldrsh   w1, [x0, x1, lsl 1]
        add     x6, x4, x27
        msub    w11, w21, w7, w11
        ldrsh   w12, [x0, x12, lsl 1]
        mul     w7, w24, w7
        add     x4, x28, x4
        madd    w8, w21, w8, w7
        ldrsh   w6, [x0, x6, lsl 1]
        mul     w7, w20, w14
        add     w3, w3, 24
        msub    w7, w19, w2, w7
        mul     w2, w20, w2
        add     w7, w7, w11
        mul     w11, w18, w13
        msub    w11, w17, w1, w11
        madd    w2, w19, w14, w2
        add     w11, w7, w11
        mul     w1, w18, w1
        mul     w7, w16, w12
        add     w2, w2, w8
        msub    w7, w15, w6, w7
        madd    w1, w17, w13, w1
        mul     w6, w16, w6
        add     w11, w11, w7
        madd    w6, w15, w12, w6
        add     w1, w2, w1
        asr     w11, w11, 15
        strh    w11, [x9, x5, lsl 1]
        add     w1, w1, w6
        asr     w1, w1, 15
        strh    w1, [x9, x4, lsl 1]
        cmp     w30, w3
        bne     .L4
        ldp     w2, w7, [sp, 108]
        ldr     w3, [sp, 116]
        add     w1, w2, 1
        add     w30, w30, w7
        ldr     x8, [sp, 96]
        and     w2, w1, 65535
        cmp     w3, w1, uxth
        bne     .L6
        ldr     x3, [sp, 120]
        add     x23, x23, x22
        ldr     w5, [sp, 116]
        add     x8, x8, 16
        add     x3, x3, 1
        cmp     x3, 4
        bne     .L3

```

GCC-12 has much better codegen (it folds all 12 elements into the vector code):

```
        mov     x1, 12   ----> process 12 elements in vector.
        ptrue   p0.b, vl64
        whilelo p1.s, xzr, x1

```
Vector code:
```
       No scalar epilogue.
```

This benchmark shows a performance drop of over 70% between GCC-12 and GCC-14
for both RVV and ARM SVE.

[Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop

Reply via email to