https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101944
Bug ID: 101944 Summary: suboptimal SLP for reduced case from namd_r Product: gcc Version: 12.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: linkw at gcc dot gnu.org Target Milestone: --- For SPEC2017 bmk 508.namd_r, it's observed that it degraded by -3.73% at -O2 -ftree-slp-vectorize vs baseline -O2 on Power9 with either default cost model or very cheap cost model. By isolating functions, several functions are responsible for it. One typical case is the below reduced one: ------------- TEST CASE typedef double BigReal; extern BigReal table_four_i[8]; extern void func1(BigReal *); extern void func2(BigReal *); extern void func3(BigReal, BigReal, BigReal, BigReal); void foo(BigReal scaling, BigReal *A1, BigReal *B1, BigReal diffa) { BigReal vdwEnergy = 0; func1(&vdwEnergy); const BigReal A = scaling * *A1; const BigReal B = scaling * *B1; BigReal vdw_d = A * table_four_i[0] - B * table_four_i[2]; BigReal vdw_c = A * table_four_i[1] - B * table_four_i[3]; BigReal vdw_b = A * table_four_i[4] - B * table_four_i[6]; BigReal vdw_a = A * table_four_i[5] - B * table_four_i[7]; register BigReal vdw_val = ((diffa * vdw_d * (1 / 6.) + vdw_c * (1 / 4.)) * diffa + vdw_b * (1 / 2.)) * diffa + vdw_a; vdwEnergy -= vdw_val; func2 (&vdwEnergy); func3 (vdw_a, vdw_b, vdw_c, vdw_d); } ------------- Options: -O2 -ffast-math -ftree-slp-vectorize -mcpu=power9 Scalar version at optimized dumping: func1 (&vdwEnergy); _1 = *A1_32(D); A_34 = _1 * scaling_33(D); _2 = *B1_35(D); B_36 = _2 * scaling_33(D); _3 = table_four_i[0]; _5 = table_four_i[2]; _6 = _5 * B_36; vdw_d_37 = .FMS (_3, A_34, _6); _7 = table_four_i[1]; _9 = table_four_i[3]; _10 = _9 * B_36; vdw_c_38 = .FMS (_7, A_34, _10); _11 = table_four_i[4]; _13 = table_four_i[6]; _14 = _13 * B_36; vdw_b_39 = .FMS (_11, A_34, _14); _15 = table_four_i[5]; _17 = table_four_i[7]; _18 = _17 * B_36; vdw_a_40 = .FMS (_15, A_34, _18); _51 = diffa_41(D) * 1.666666666666666574148081281236954964697360992431640625e-1; _21 = vdw_c_38 * 2.5e-1; _22 = .FMA (vdw_d_37, _51, _21); _24 = vdw_b_39 * 5.0e-1; _25 = .FMA (_22, diffa_41(D), _24); _26 = _25 * diffa_41(D); vdwEnergy.0_27 = vdwEnergy; _49 = _18 + vdwEnergy.0_27; _52 = .FMA (_15, A_34, _26); _28 = _49 - _52; vdwEnergy = _28; func2 (&vdwEnergy); Vector version at optimized dumping: func1 (&vdwEnergy); _1 = *A1_32(D); A_34 = _1 * scaling_33(D); _49 = {A_34, A_34}; _2 = *B1_35(D); B_36 = _2 * scaling_33(D); _54 = {B_36, B_36}; vect__3.6_48 = MEM <vector(2) double> [(double *)&table_four_i]; vect__5.10_52 = MEM <vector(2) double> [(double *)&table_four_i + 16B]; vect__6.11_55 = vect__5.10_52 * _54; vect_vdw_d_37.12_56 = .FMS (vect__3.6_48, _49, vect__6.11_55); _58 = BIT_FIELD_REF <vect_vdw_d_37.12_56, 64, 64>; _57 = BIT_FIELD_REF <vect_vdw_d_37.12_56, 64, 0>; _11 = table_four_i[4]; _13 = table_four_i[6]; _14 = _13 * B_36; vdw_b_39 = .FMS (_11, A_34, _14); _15 = table_four_i[5]; _17 = table_four_i[7]; _18 = _17 * B_36; vdw_a_40 = .FMS (_15, A_34, _18); _51 = diffa_41(D) * 1.666666666666666574148081281236954964697360992431640625e-1; _59 = {_51, 2.5e-1}; vect__20.13_60 = vect_vdw_d_37.12_56 * _59; _61 = .REDUC_PLUS (vect__20.13_60); _24 = vdw_b_39 * 5.0e-1; _25 = .FMA (diffa_41(D), _61, _24); _26 = _25 * diffa_41(D); vdwEnergy.0_27 = vdwEnergy; _66 = _18 + vdwEnergy.0_27; _68 = .FMA (_15, A_34, _26); _28 = _66 - _68; vdwEnergy = _28; func2 (&vdwEnergy); reduc.c:24:34: note: Cost model analysis for part in loop 0: Vector cost: 16 Scalar cost: 17