https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
Bug ID: 98138
Summary: BB vect fails to SLP one case
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: linkw at gcc dot gnu.org
Target Milestone: ---
Test case:
/* Declared here only; no definition is part of this test case.  It receives
   the filled 4x4 array after the loop (presumably so the stores to tmp stay
   live -- confirm against the bug discussion).  */
extern void test(unsigned int t[4][4]);
/* Reproducer: fill a local 4x4 unsigned int array from two byte buffers and
   hand it to test().

   p1, p2: byte buffers; each of the 4 iterations reads p1[0..7] and p2[0..7].
   i1, i2: amounts added to p1 and p2 respectively after every iteration.  */
void foo(unsigned char *p1, int i1, unsigned char *p2, int i2)
{
unsigned int tmp[4][4];
unsigned int a0, a1, a2, a3;
for (int i = 0; i < 4; i++, p1 += i1, p2 += i2) {
/* a_k = (p1[k] - p2[k]) + ((p1[k+4] - p2[k+4]) << 16) for k = 0..3.
   The unsigned char operands are promoted to int, so each difference is
   computed in int and may be negative before the shift and before the
   conversion to unsigned int on assignment.
   NOTE(review): left-shifting a negative int is undefined in ISO C; the
   expression is kept unchanged because it is the shape being reported.  */
a0 = (p1[0] - p2[0]) + ((p1[4] - p2[4]) << 16);
a1 = (p1[1] - p2[1]) + ((p1[5] - p2[5]) << 16);
a2 = (p1[2] - p2[2]) + ((p1[6] - p2[6]) << 16);
a3 = (p1[3] - p2[3]) + ((p1[7] - p2[7]) << 16);
/* Sum and difference of the pairs (a0, a1) and (a2, a3); the unsigned
   results are converted to int.  */
int t0 = a0 + a1;
int t1 = a0 - a1;
int t2 = a2 + a3;
int t3 = a2 - a3;
/* Sum and difference of (t0, t2) and (t1, t3) fill row i.  The stores are
   written in column order 0, 2, 1, 3.  */
tmp[i][0] = t0 + t2;
tmp[i][2] = t0 - t2;
tmp[i][1] = t1 + t3;
tmp[i][3] = t1 - t3;
}
/* Pass the completed array to the external function.  */
test(tmp);
}
The expected code on ppc64le can look like:
// p1 byte 0 to byte 7
d1_0_7 = load_dword(p1)
// bytes 0 to 7 of p1 + i1, renamed as bytes 8 to 15
d1_8_15 = load_dword(p1 + i1)
d1_16_23 = load_dword(p1 + 2*i1)
d1_24_31 = load_dword(p1 + 3*i1)
V_d1_0_15 = construct_vec(d1_0_7,d1_8_15) // vector char
V_d1_16_31 = construct_vec(d1_16_23,d1_24_31)
V_d1_0_3_all = vperm(V_d1_0_15, V_d1_16_31,
{0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27})
V_d1_4_7_all = vperm(V_d1_0_15, V_d1_16_31,
{4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31})
// Do the same for p2 with i2 to get V_d2_0_3_all, V_d2_4_7_all
// Do the subtraction together (all 4x4 bytes)
V_sub1 = V_d1_0_3_all - V_d2_0_3_all
V_sub2 = V_d1_4_7_all - V_d2_4_7_all
// Unpack to get the promoted vector int
V_a0_tmp = vec_promote(V_sub2, {0 1 2 3}) // vector int {b4 b12 b20 b28}
V_a0_1 = V_a0_tmp << 16
V_a0_0 = vec_promote(V_sub1, {0 1 2 3}) // vector int {b0 b8 b16 b24}
// vector int {a0_iter0, a0_iter1, a0_iter2, a0_iter3}
V_a0 = V_a0_0 + V_a0_1
// Compute V_a1, V_a2, V_a3 similarly
// Compute t0/t1/t2/t3
// vector int {t0_iter0, t0_iter1, t0_iter2, t0_iter3}
V_t0 = V_a0 + V_a1
V_t1 = V_a0 - V_a1
V_t2 = V_a2 + V_a3
V_t3 = V_a2 - V_a3
// Compute tmps
// vector int {tmp[0][0], tmp[1][0], tmp[2][0], tmp[3][0]}
V_tmp0 = V_t0 + V_t2
V_tmp2 = V_t0 - V_t2
V_tmp1 = V_t1 + V_t3
V_tmp3 = V_t1 - V_t3
// Finally construct {tmp[0][0], tmp[0][1], tmp[0][2], tmp[0][3]} ...
// with six further permutations on V_tmp0/V_tmp1/V_tmp2/V_tmp3