https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441
Bug ID: 113441
Summary: [14 Regression] Fail to fold the last element with
multiple loop
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: juzhe.zhong at rivai dot ai
Target Milestone: ---
Hi, we found a regression between GCC-12 and GCC-14 while comparing our
downstream RVV GCC against upstream RVV GCC.
The regression is not specific to RVV GCC; it also occurs with ARM SVE GCC.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Input samples: 4 rows of 1 * 273 * 12 * 2 = 6552 int16_t values each.
   foo() reads every row in adjacent (2*m, 2*m + 1) pairs.  */
int16_t array1[4][1 * 273 * 12 * 2];
/* Output buffer with the same shape as array1; foo() writes row i in
   (2*m, 2*m + 1) pairs.  */
int16_t array2[4][1 * 273 * 12 * 2];
/* Coefficients: 4 rows of 8 int16_t values; foo() reads row i as four
   (even, odd) pairs, one pair per array1 row.  */
int16_t array3[4][4 * 2];
/* Reduced test case for the vectorization regression described in this
   report.  The statement and loop structure is deliberately left as the
   reporter wrote it, since it is what triggers the reported codegen.

   For each output row i (0..3) the function visits a * b * 12 positions m.
   At each position it takes the (2*m, 2*m + 1) pair from each of the four
   array1 rows, multiplies it by the matching coefficient pair in array3[i]
   (the expressions have the form of a complex multiply: even*even - odd*odd
   and odd*even + even*odd), sums the four products per component, shifts
   the sums right by 15 and stores them as int16_t into array2[i][2*m] and
   array2[i][2*m + 1].

   a: trip count of the j loop.
   b: trip count of the k loop.

   There is no bounds check: m reaches a * b * 12 - 1, so the caller must
   keep a * b <= 273 for 2*m + 1 to stay inside the 6552-element rows.  */
void
foo (uint8_t a, uint16_t b)
{
/* Per-component accumulators for the four partial products.  */
int32_t sum[2];
/* result[r][0] / result[r][1]: the two product components for array1 row r.  */
int32_t result[4][2];
uint16_t j = 0;
uint8_t i = 0;
uint16_t l = 0;
uint16_t k = 0;
/* Running pair index into array1/array2; advanced once per innermost
   iteration and reset for every output row.  */
uint32_t m = 0;
for (i = 0; i < 4; i++)
{
m = 0;
for (j = 0; j < a; j++)
{
for (k = 0; k < b; k++)
{
/* Constant trip count of 12: per the report's analysis this is the loop
   where GCC 14 vectorizes 11 iterations and leaves the last one scalar.  */
for (l = 0; l < 12; l++)
{
/* Row 0 pair combined with coefficient pair array3[i][0..1].  */
result[0][0] = array1[0][2 * m] * array3[i][0]
- array1[0][2 * m + 1] * array3[i][1];
result[0][1] = array1[0][2 * m + 1] * array3[i][0]
+ array1[0][2 * m] * array3[i][1];
/* Row 1 pair combined with coefficient pair array3[i][2..3].  */
result[1][0] = array1[1][2 * m] * array3[i][2]
- array1[1][2 * m + 1] * array3[i][3];
result[1][1] = array1[1][2 * m + 1] * array3[i][2]
+ array1[1][2 * m] * array3[i][3];
/* Row 2 pair combined with coefficient pair array3[i][4..5].  */
result[2][0] = array1[2][2 * m] * array3[i][4]
- array1[2][2 * m + 1] * array3[i][5];
result[2][1] = array1[2][2 * m + 1] * array3[i][4]
+ array1[2][2 * m] * array3[i][5];
/* Row 3 pair combined with coefficient pair array3[i][6..7].  */
result[3][0] = array1[3][2 * m] * array3[i][6]
- array1[3][2 * m + 1] * array3[i][7];
result[3][1] = array1[3][2 * m + 1] * array3[i][6]
+ array1[3][2 * m] * array3[i][7];
/* Add the four rows' contributions, separately per component.  */
sum[0]
= result[0][0] + result[1][0] + result[2][0] +
result[3][0];
sum[1]
= result[0][1] + result[1][1] + result[2][1] +
result[3][1];
/* Scale down by 2^15 and narrow to int16_t (presumably Q15 fixed-point
   rescaling -- the report does not say).  sum[] is signed, so the result of
   shifting a negative value is implementation-defined in ISO C.  */
array2[i][2 * m] = (int16_t) (sum[0] >> 15);
array2[i][2 * m + 1] = (int16_t) (sum[1] >> 15);
m++;
}
}
}
}
}
Here is a reference:
https://godbolt.org/z/hfqWvdf8e
Here is the analysis:
First, note that the innermost loop has 12 iterations (for (l = 0; l < 12; l++)).
GCC 14 processes 11 elements in vector code and leaves the last element to scalar code:
```
mov x1, 11 ---> process 11 elements
whilelo p5.s, xzr, x1
...
vector codes
...
scalar codes of the last element:
ldrsh w8, [x0, x5, lsl 1]
add x6, x5, x10
ldrsh w14, [x0, x7, lsl 1]
add x1, x4, x10
ldrsh w7, [x0, x4, lsl 1]
add x12, x5, x27
ldrsh w2, [x0, x2, lsl 1]
add x5, x28, x5
mul w11, w24, w8
ldrsh w13, [x0, x6, lsl 1]
ldrsh w1, [x0, x1, lsl 1]
add x6, x4, x27
msub w11, w21, w7, w11
ldrsh w12, [x0, x12, lsl 1]
mul w7, w24, w7
add x4, x28, x4
madd w8, w21, w8, w7
ldrsh w6, [x0, x6, lsl 1]
mul w7, w20, w14
add w3, w3, 24
msub w7, w19, w2, w7
mul w2, w20, w2
add w7, w7, w11
mul w11, w18, w13
msub w11, w17, w1, w11
madd w2, w19, w14, w2
add w11, w7, w11
mul w1, w18, w1
mul w7, w16, w12
add w2, w2, w8
msub w7, w15, w6, w7
madd w1, w17, w13, w1
mul w6, w16, w6
add w11, w11, w7
madd w6, w15, w12, w6
add w1, w2, w1
asr w11, w11, 15
strh w11, [x9, x5, lsl 1]
add w1, w1, w6
asr w1, w1, 15
strh w1, [x9, x4, lsl 1]
cmp w30, w3
bne .L4
ldp w2, w7, [sp, 108]
ldr w3, [sp, 116]
add w1, w2, 1
add w30, w30, w7
ldr x8, [sp, 96]
and w2, w1, 65535
cmp w3, w1, uxth
bne .L6
ldr x3, [sp, 120]
add x23, x23, x22
ldr w5, [sp, 116]
add x8, x8, 16
add x3, x3, 1
cmp x3, 4
bne .L3
```
GCC-12 has much better codegen (all 12 elements are handled in vector code):
```
mov x1, 12 ----> process 12 elements in vector.
ptrue p0.b, vl64
whilelo p1.s, xzr, x1
```
The rest is vector code only; there is no scalar epilogue.
This benchmark shows a performance drop of over 70% from GCC-12 to GCC-14 on
both RVV and ARM SVE.