https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
Bug ID: 98138
Summary: BB vect fail to SLP one case
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: linkw at gcc dot gnu.org
Target Milestone: ---

Test case:

extern void test(unsigned int t[4][4]);

void foo(unsigned char *p1, int i1, unsigned char *p2, int i2)
{
  unsigned int tmp[4][4];
  unsigned int a0, a1, a2, a3;

  for (int i = 0; i < 4; i++, p1 += i1, p2 += i2) {
    a0 = (p1[0] - p2[0]) + ((p1[4] - p2[4]) << 16);
    a1 = (p1[1] - p2[1]) + ((p1[5] - p2[5]) << 16);
    a2 = (p1[2] - p2[2]) + ((p1[6] - p2[6]) << 16);
    a3 = (p1[3] - p2[3]) + ((p1[7] - p2[7]) << 16);

    int t0 = a0 + a1;
    int t1 = a0 - a1;
    int t2 = a2 + a3;
    int t3 = a2 - a3;

    tmp[i][0] = t0 + t2;
    tmp[i][2] = t0 - t2;
    tmp[i][1] = t1 + t3;
    tmp[i][3] = t1 - t3;
  }
  test(tmp);
}

The expected code on ppc64le can look like:

// p1 byte 0 to byte 7
d1_0_7 = load_dword(p1)
// p1+i1 b0 to b7, rename it as 8 to 15
d1_8_15 = load_dword(p1 + i1)
d1_16_23 = load_dword(p1 + 2*i1)
d1_24_31 = load_dword(p1 + 3*i1)

V_d1_0_15 = construct_vec(d1_0_7,d1_8_15) // vector char
V_d1_16_31 = construct_vec(d1_16_23,d1_24_31)
V_d1_0_3_all = vperm(V_d1_0_15, V_d1_16_31, {0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27})
V_d1_4_7_all = vperm(V_d1_0_15, V_d1_16_31, {4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31})

// Do the same for p2 with i2 to get V_d2_0_3_all, V_d2_4_7_all

// Do the subtraction together (all 4x4 bytes)
V_sub1 = V_d1_0_3_all - V_d2_0_3_all
V_sub2 = V_d1_4_7_all - V_d2_4_7_all

// Unpack to get the promoted vector int
V_a0_tmp = vec_promote(V_sub2, {0 1 2 3}) // vector int {b4 b12 b20 b28}
V_a0_1 = V_a0_tmp << 16
V_a0_0 = vec_promote(V_sub1, {0 1 2 3})
// vector int {b0 b8 b16 b24}
// vector int {a0_iter0, a0_iter1, a0_iter2, a0_iter3}
V_a0 = V_a0_0 + V_a0_1

// Proceed similarly for V_a1, V_a2, V_a3

// Compute t0/t1/t2/t3
// vector int {t0_iter0, t0_iter1, t0_iter2, t0_iter3}
V_t0 = V_a0 + V_a1
V_t1 = V_a0 - V_a1
V_t2 = V_a2 + V_a3
V_t3 = V_a2 - V_a3

// Compute tmps
// vector int {tmp[0][0], tmp[1][0], tmp[2][0], tmp[3][0]}
V_tmp0 = V_t0 + V_t2
V_tmp2 = V_t0 - V_t2
V_tmp1 = V_t1 + V_t3
V_tmp3 = V_t1 - V_t3

// Finally construct the {tmp[0][0], tmp[0][1], tmp[0][2], tmp[0][3]} ...
// with six further permutations on V_tmp0/V_tmp1/V_tmp2/V_tmp3