[PATCH] libgcov: Fix gcov overlap bugs of divide by 0
Fix the long lasting issue of `gcov-tool overlap xxx yyy`, divide to 0 caused the output shows a lot of nans, another problem is the counts in file are never acculated leads to incorrect results. Signed-off-by: Xionghu Luo libgcc/ChangeLog: * libgcov-util.c (compute_one_gcov): Avoid divide to 0. (accumulate_sum_counts): New. (calculate_overlap): Call accumulate_sum_countes. --- libgcc/libgcov-util.c | 58 --- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/libgcc/libgcov-util.c b/libgcc/libgcov-util.c index d547c103cab..26a02e66567 100644 --- a/libgcc/libgcov-util.c +++ b/libgcc/libgcov-util.c @@ -1072,6 +1072,8 @@ compute_one_gcov (const struct gcov_info *gcov_info1, for (f_ix = 0; f_ix < gcov_info1->n_functions; f_ix++) { + double func_1 = 0.0; + double func_2 = 0.0; double func_cum_1 = 0.0; double func_cum_2 = 0.0; double func_val = 0.0; @@ -1096,11 +1098,15 @@ compute_one_gcov (const struct gcov_info *gcov_info1, ci_ptr2->values[c_num], sum_1, sum_2); - func_cum_1 += ci_ptr1->values[c_num] / sum_1; - func_cum_2 += ci_ptr2->values[c_num] / sum_2; + if (sum_1) + func_1 = ci_ptr1->values[c_num] / sum_1; + func_cum_1 += func_1; + if (sum_2) + func_2 = ci_ptr2->values[c_num] / sum_2; + func_cum_2 += func_2; nonzero = 1; - if (ci_ptr1->values[c_num] / sum_1 >= overlap_hot_threshold - || ci_ptr2->values[c_num] / sum_2 >= overlap_hot_threshold) + if (func_1 >= overlap_hot_threshold + || func_2 >= overlap_hot_threshold) hot = 1; } } @@ -1322,6 +1328,47 @@ matched_gcov_info (const struct gcov_info *info1, const struct gcov_info *info2) return 1; } +static int +accumuate_sum_counts (const struct gcov_info *gcov_info1, + const struct gcov_info *gcov_info2) +{ + gcc_assert (gcov_info1 || gcov_info2); + unsigned f_ix; + + if (gcov_info1) +{ + gcov_type cum_1 = 0; + for (f_ix = 0; f_ix < gcov_info1->n_functions; f_ix++) + { + const struct gcov_fn_info *gfi_ptr = gcov_info1->functions[f_ix]; + if (!gfi_ptr || gfi_ptr->key != gcov_info1) + continue; + const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs; + unsigned c_num; + for (c_num = 0; c_num < ci_ptr->num; c_num++) + cum_1 += ci_ptr->values[c_num]; + } + overlap_sum_1 = cum_1; +} + + if (gcov_info2) +{ + gcov_type cum_2 = 0; + for (f_ix = 0; f_ix < gcov_info2->n_functions; f_ix++) + { + const struct gcov_fn_info *gfi_ptr = gcov_info2->functions[f_ix]; + if (!gfi_ptr || gfi_ptr->key != gcov_info2) + continue; + const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs; + unsigned c_num; + for (c_num = 0; c_num < ci_ptr->num; c_num++) + cum_2 += ci_ptr->values[c_num]; + } + overlap_sum_2 = cum_2; +} + return 0; +} + /* Compute the overlap score of two profiles with the head of GCOV_LIST1 and GCOV_LIST1. Return a number ranging from [0.0, 1.0], with 0.0 meaning no match and 1.0 meaning a perfect match. */ @@ -1410,6 +1457,9 @@ calculate_overlap (struct gcov_info *gcov_list1, if (overlap_func_level) printf("\n processing %36s:\n", filename); + overlap_sum_1 = overlap_sum_2 = 0.0; + accumuate_sum_counts (all_infos[i].obj1, all_infos[i].obj2); + val = compute_one_gcov (all_infos[i].obj1, all_infos[i].obj2, overlap_sum_1, overlap_sum_2, &cum_1, &cum_2); -- 2.39.3
Re: [PATCH] libgcov: Fix gcov overlap bugs of divide by 0
+cc maintainers. On 2023/10/26 11:25, Xionghu Luo wrote: Fix the long lasting issue of `gcov-tool overlap xxx yyy`, divide to 0 caused the output shows a lot of nans, another problem is the counts in file are never acculated leads to incorrect results. Signed-off-by: Xionghu Luo libgcc/ChangeLog: * libgcov-util.c (compute_one_gcov): Avoid divide to 0. (accumulate_sum_counts): New. (calculate_overlap): Call accumulate_sum_countes. --- libgcc/libgcov-util.c | 58 --- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/libgcc/libgcov-util.c b/libgcc/libgcov-util.c index d547c103cab..26a02e66567 100644 --- a/libgcc/libgcov-util.c +++ b/libgcc/libgcov-util.c @@ -1072,6 +1072,8 @@ compute_one_gcov (const struct gcov_info *gcov_info1, for (f_ix = 0; f_ix < gcov_info1->n_functions; f_ix++) { + double func_1 = 0.0; + double func_2 = 0.0; double func_cum_1 = 0.0; double func_cum_2 = 0.0; double func_val = 0.0; @@ -1096,11 +1098,15 @@ compute_one_gcov (const struct gcov_info *gcov_info1, ci_ptr2->values[c_num], sum_1, sum_2); - func_cum_1 += ci_ptr1->values[c_num] / sum_1; - func_cum_2 += ci_ptr2->values[c_num] / sum_2; + if (sum_1) + func_1 = ci_ptr1->values[c_num] / sum_1; + func_cum_1 += func_1; + if (sum_2) + func_2 = ci_ptr2->values[c_num] / sum_2; + func_cum_2 += func_2; nonzero = 1; - if (ci_ptr1->values[c_num] / sum_1 >= overlap_hot_threshold - || ci_ptr2->values[c_num] / sum_2 >= overlap_hot_threshold) + if (func_1 >= overlap_hot_threshold + || func_2 >= overlap_hot_threshold) hot = 1; } } @@ -1322,6 +1328,47 @@ matched_gcov_info (const struct gcov_info *info1, const struct gcov_info *info2) return 1; } +static int +accumuate_sum_counts (const struct gcov_info *gcov_info1, + const struct gcov_info *gcov_info2) +{ + gcc_assert (gcov_info1 || gcov_info2); + unsigned f_ix; + + if (gcov_info1) +{ + gcov_type cum_1 = 0; + for (f_ix = 0; f_ix < gcov_info1->n_functions; f_ix++) + { + const struct gcov_fn_info *gfi_ptr = gcov_info1->functions[f_ix]; + if (!gfi_ptr || gfi_ptr->key != gcov_info1) + continue; + const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs; + unsigned c_num; + for (c_num = 0; c_num < ci_ptr->num; c_num++) + cum_1 += ci_ptr->values[c_num]; + } + overlap_sum_1 = cum_1; +} + + if (gcov_info2) +{ + gcov_type cum_2 = 0; + for (f_ix = 0; f_ix < gcov_info2->n_functions; f_ix++) + { + const struct gcov_fn_info *gfi_ptr = gcov_info2->functions[f_ix]; + if (!gfi_ptr || gfi_ptr->key != gcov_info2) + continue; + const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs; + unsigned c_num; + for (c_num = 0; c_num < ci_ptr->num; c_num++) + cum_2 += ci_ptr->values[c_num]; + } + overlap_sum_2 = cum_2; +} + return 0; +} + /* Compute the overlap score of two profiles with the head of GCOV_LIST1 and GCOV_LIST1. Return a number ranging from [0.0, 1.0], with 0.0 meaning no match and 1.0 meaning a perfect match. */ @@ -1410,6 +1457,9 @@ calculate_overlap (struct gcov_info *gcov_list1, if (overlap_func_level) printf("\n processing %36s:\n", filename); + overlap_sum_1 = overlap_sum_2 = 0.0; + accumuate_sum_counts (all_infos[i].obj1, all_infos[i].obj2); + val = compute_one_gcov (all_infos[i].obj1, all_infos[i].obj2, overlap_sum_1, overlap_sum_2, &cum_1, &cum_2);
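To make the failure mode concrete, here is a minimal standalone C sketch of the guard the patch introduces; it is illustrative only and not gcov-tool code (the helper name safe_ratio is made up for the example).

#include <stdio.h>

/* Illustrative helper mirroring the "if (sum_1) func_1 = ... / sum_1;"
   guard from the patch: divide only when the accumulated sum is nonzero,
   otherwise keep the ratio at 0.0 instead of producing NaN.  */
static double
safe_ratio (long long value, double sum)
{
  double ratio = 0.0;
  if (sum != 0.0)
    ratio = value / sum;
  return ratio;
}

int
main (void)
{
  printf ("%f\n", safe_ratio (0, 0.0));   /* 0.000000 instead of nan (0.0/0.0) */
  printf ("%f\n", safe_ratio (5, 10.0));  /* 0.500000 */
  return 0;
}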
[RFC] Run store-merging pass once more before pass fre/pre
Store-merging pass should run twice, the reason is pass fre/pre will do some kind of optimizations to instructions by: 1. Converting the load from address to load from function arguments (store_merging_30.c:foo1). 2. Converting the byte access to BIT_FIELD_REF(store_merging_30.c:foo2). 3. Other bitfield combinations or potential interference optimizations etc. These optimizations will break the store chain, store-merging pass fails to catch such kind of pattern so stores are not merged in middle end, then consecutive stb/sth instructions(should be merged to stw) are emitted finally. And why not directly move store-merging pass(numbered 194) just before fre1(numbered 35) is for case store_merging_14.c, 5 merges are done by store_merging1, and 4 merges are done fore store_merge2. So, keep the original store_merge as store_merge2 as store merge may be still available after other pass optimizations. Most of the 30 store_merging_N.c test case dg-final pass name would be updated from store-merging to store-merging1 once this RFC patch idea got confirmed. Any comments? Thanks. PS: Before this patch, store_merging_30.c.035t.fre1: ... foo1: Inserted _13 = (short unsigned int) counters_new_5(D); Replaced tmp.D.2912.D.2911.D.2910.D.2909.inuse with _13 in all uses of _1 = tmp.D.2912.D.2911.D.2910.D.2909.inuse; Removing dead stmt _1 = tmp.D.2912.D.2911.D.2910.D.2909.inuse; ... foo2: Inserted _17 = BIT_FIELD_REF <_1, 8, 16>; Replaced tmp.D.2926.D.2925.D.2924.D.2923.objects with _17 in all uses of _3 = tmp.D.2926.D.2925.D.2924.D.2923.objects; Removing dead stmt _3 = tmp.D.2926.D.2925.D.2924.D.2923.objects; foo1 asm: rldicl 9,4,48,48 sth 4,0(3) sth 9,2(3) blr With this patch(similar for foo2): stw r4,0(r3) blr gcc/ChangeLog 2020-02-18 Xiong Hu Luo Part of PR middle-end/71509 gimple-ssa-store-merging.c (clone): New. passes.def (pass_store_merging): New. gcc/testsuite/ChangeLog 2020-02-18 Xiong Hu Luo Part of PR middle-end/71509 testsuite/gcc.dg/store_merging_14.c: Update. testsuite/gcc.dg/store_merging_30.c: New. --- gcc/gimple-ssa-store-merging.c | 2 + gcc/passes.def | 1 + gcc/testsuite/gcc.dg/store_merging_14.c | 3 +- gcc/testsuite/gcc.dg/store_merging_30.c | 86 + 4 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/store_merging_30.c diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c index 8371323ef4a..9a5bd49fc3a 100644 --- a/gcc/gimple-ssa-store-merging.c +++ b/gcc/gimple-ssa-store-merging.c @@ -2156,6 +2156,8 @@ public: { } + opt_pass * clone () { return new pass_store_merging (m_ctxt); } + /* Pass not supported for PDP-endian, nor for insane hosts or target character sizes where native_{encode,interpret}_expr doesn't work properly. */ diff --git a/gcc/passes.def b/gcc/passes.def index 2bf2cb78fc5..e531531cb14 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -85,6 +85,7 @@ along with GCC; see the file COPYING3. If not see /* pass_build_ealias is a dummy pass that ensures that we execute TODO_rebuild_alias at this point. 
*/ NEXT_PASS (pass_build_ealias); + NEXT_PASS (pass_store_merging); NEXT_PASS (pass_fre, true /* may_iterate */); NEXT_PASS (pass_early_vrp); NEXT_PASS (pass_merge_phi); diff --git a/gcc/testsuite/gcc.dg/store_merging_14.c b/gcc/testsuite/gcc.dg/store_merging_14.c index 9310aaf3489..bd120d18ac6 100644 --- a/gcc/testsuite/gcc.dg/store_merging_14.c +++ b/gcc/testsuite/gcc.dg/store_merging_14.c @@ -214,4 +214,5 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "Merging successful" 9 "store-merging" } } */ +/* { dg-final { scan-tree-dump-times "Merging successful" 5 "store-merging1" } } */ +/* { dg-final { scan-tree-dump-times "Merging successful" 4 "store-merging2" } } */ diff --git a/gcc/testsuite/gcc.dg/store_merging_30.c b/gcc/testsuite/gcc.dg/store_merging_30.c new file mode 100644 index 000..71369c3b196 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_30.c @@ -0,0 +1,86 @@ +/* { dg-do run } */ +/* { dg-require-effective-target store_merge } */ +/* { dg-options "-O2 -fdump-tree-store-merging" } */ + +typedef unsigned int atomic_t; + +struct page +{ + union + { +unsigned long counters; +struct +{ + union + { + struct + { + unsigned inuse : 16; + unsigned objects : 15; + unsigned frozen : 1; + }; + }; +}; + }; +}; + +struct page2 +{ + union + { +unsigned counters; +struct +{ + union + { + struct + { + unsigned inuse : 16; + unsigned objects : 8; + unsigned frozen : 8; + }; + }; +}; + }; +}; + +__attribute__((noipa)) void +foo1 (struct page *page, unsigned long counters_new) +{ +struct page tmp; +tmp.c
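As a minimal standalone illustration of the pattern the RFC is about (types and field names here are made up; this is not the testcase above), consider two adjacent halfword stores that should become a single word store:

struct pair
{
  unsigned short lo;
  unsigned short hi;
};

void
set_pair (struct pair *p, unsigned short a, unsigned short b)
{
  /* Two adjacent 2-byte stores; store merging can combine them into one
     4-byte store (sth + sth -> stw on Power).  The RFC's point is that
     fre/pre may rewrite such accesses (e.g. into BIT_FIELD_REFs) before
     the existing store-merging pass runs, hiding the opportunity.  */
  p->lo = a;
  p->hi = b;
}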
[PATCH] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux(Thanks to Kewen), OK for master? Or should we revert r12-4496 to restore to the UNSPEC implementation? gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb): Emit same native RTL for BE and LE. (altivec_vmrghh): Likewise. (altivec_vmrghw): Likewise. (*altivec_vmrghsf): Adjust. (altivec_vmrglb): Likewise. (altivec_vmrglh): Likewise. (altivec_vmrglw): Likewise. (*altivec_vmrglsf): Adjust. (altivec_vmrghb_direct): Emit different ASM for BE and LE. (altivec_vmrghh_direct): Likewise. (altivec_vmrghw_direct_): Likewise. (altivec_vmrglb_direct): Likewise. (altivec_vmrglh_direct): Likewise. (altivec_vmrglw_direct_): Likewise. (vec_widen_smult_hi_v16qi): Adjust. (vec_widen_smult_lo_v16qi): Adjust. (vec_widen_umult_hi_v16qi): Adjust. (vec_widen_umult_lo_v16qi): Adjust. (vec_widen_smult_hi_v8hi): Adjust. (vec_widen_smult_lo_v8hi): Adjust. (vec_widen_umult_hi_v8hi): Adjust. (vec_widen_umult_lo_v8hi): Adjust. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Emit same native RTL for BE and LE. * config/rs6000/vsx.md (vsx_xxmrghw_): Likewise. (vsx_xxmrglw_): Likewise. gcc/testsuite/ChangeLog: PR target/106069 * gcc.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 122 gcc/config/rs6000/rs6000.cc | 36 +++--- gcc/config/rs6000/vsx.md| 16 +-- gcc/testsuite/gcc.target/powerpc/pr106069.C | 118 +++ 4 files changed, 209 insertions(+), 83 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..8d9c0109559 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,11 +1144,7 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1167,7 +1163,12 @@ (define_insn "altivec_vmrghb_direct" (const_int 6) (const_int 22) (const_int 7) (const_int 23)])))] "TARGET_ALTIVEC" - "vmrghb %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrghb %0,%1,%2"; +else + return "vmrglb %0,%2,%1"; + } [(set_attr "type" "vecperm")]) (define_expand "altivec_vmrghh" @@ -1176,11 +1177,7 @@ (define_expand "altivec_vmrghh" (use (match_operand:V8HI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghh_direct - : gen_altivec_vmrglh_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrghh_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1195,7 +1192,12 @@ (define_insn "altivec_vmrghh_direct" (const_int 2) (const_int 10) (const_int 3) (const_int 11)])))] "TARGET_ALTIVEC" - "vmrghh %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrghh %0,%1,%2"; +else + return "vmrglh
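For reference, a small intrinsics-level example of the merge-high operation whose expansion the patch changes (plain AltiVec code compiled with -maltivec, not part of the patch):

#include <altivec.h>

vector unsigned int
merge_high (vector unsigned int a, vector unsigned int b)
{
  /* Selects {a[0], b[0], a[1], b[1]} in big-endian element order, i.e. the
     [0 4 1 5] vec_select/vec_concat permute described above; the patch makes
     both BE and LE expand to that same RTL and only chooses between vmrghw
     and vmrglw when the assembly is emitted.  */
  return vec_mergeh (a, b);
}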
Re: [PATCH v2] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
On 2022/8/9 11:01, Kewen.Lin wrote: Hi Xionghu, Thanks for the fix. on 2022/8/8 11:42, Xionghu Luo wrote: The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Sorry, no -m32 for LE testing. I noticed the attachement in that PR didn't include the test case (though the changelog has it), so I re-tested it again, nothing changed. :) Linux(Thanks to Kewen), OK for master? Or should we revert r12-4496 to restore to the UNSPEC implementation? I have some concern on those changed "altivec_*_direct", IMHO the suffix "_direct" is normally to indicate the define_insn is mapped to the corresponding hw insn directly. With this change, for example, altivec_vmrghb_direct can be mapped into vmrghb or vmrglb, this looks misleading. Maybe we can add the corresponding _direct_le and _direct_be versions, both are mapped into the same insn but have different RTL patterns. Looking forward to Segher's and David's suggestions. Thanks! Do you mean same RTL patterns with different hw insn? Updated as: v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux(Thanks to Kewen), OK for master? Or should we revert r12-4496 to restore to the UNSPEC implementation? gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb): Emit same native RTL for BE and LE. (altivec_vmrghh): Likewise. (altivec_vmrghw): Likewise. (*altivec_vmrghsf): Adjust. (altivec_vmrglb): Likewise. (altivec_vmrglh): Likewise. (altivec_vmrglw): Likewise. (*altivec_vmrglsf): Adjust. (altivec_vmrghb_direct): Emit different ASM for BE and LE. (altivec_vmrghh_direct): Likewise. (altivec_vmrghw_direct_): Likewise. 
(altivec_vmrglb_direct): Likewise. (altivec_vmrglh_direct): Likewise. (altivec_vmrglw_direct_): Likewise. (vec_widen_smult_hi_v16qi): Adjust. (vec_widen_smult_lo_v16qi): Adjust. (vec_widen_umult_hi_v16qi): Adjust. (vec_widen_umult_lo_v16qi): Adjust. (vec_widen_smult_hi_v8hi): Adjust. (vec_widen_smult_lo_v8hi): Adjust. (vec_widen_umult_hi_v8hi): Adjust. (vec_widen_umult_lo_v8hi): Adjust. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Emit same native RTL for BE and LE. * config/rs6000/vsx.md (vsx_xxmrghw_): Likewise. (vsx_xxmrglw_): Likewise. gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 223 ++-- gcc/config/rs6000/rs6000.cc | 36 ++-- gcc/config/rs6000/vsx.md| 26 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 120 +++ 4 files changed, 303 insertions(+), 102 deletions(-) create mode 100644 gc
Re: [PATCH v2] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
On 2022/8/11 01:07, Segher Boessenkool wrote: On Wed, Aug 10, 2022 at 02:39:02PM +0800, Xionghu Luo wrote: On 2022/8/9 11:01, Kewen.Lin wrote: I have some concern on those changed "altivec_*_direct", IMHO the suffix "_direct" is normally to indicate the define_insn is mapped to the corresponding hw insn directly. With this change, for example, altivec_vmrghb_direct can be mapped into vmrghb or vmrglb, this looks misleading. Maybe we can add the corresponding _direct_le and _direct_be versions, both are mapped into the same insn but have different RTL patterns. Looking forward to Segher's and David's suggestions. Thanks! Do you mean same RTL patterns with different hw insn? A pattern called altivec_vmrghb_direct_le should always emit a vmrghb instruction, never a vmrglb instead. Misleading names are an expensive problem. Thanks. Then on LE platforms, if user calls altivec_vmrghw,it will be expanded to RTL (vec_select (vec_concat (R0 R1 (0 4 1 5))), and finally matched to altivec_vmrglw_direct_v4si_le with ASM "vmrglw". For BE just strict forward, seems more clear :-), OK for master? [PATCH v3] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069] v3: rename altivec_vmrghb_direct_le to altivec_vmrglb_direct_le to match the actual output ASM vmrglb. Likewise for all similar xxx_direct_le patterns. v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{64} Linux(Thanks to Kewen). gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb_direct): Remove. (altivec_vmrghb_direct_be): New pattern for BE. (altivec_vmrglb_direct_le): New pattern for LE. (altivec_vmrghh_direct): Remove. (altivec_vmrghh_direct_be): New pattern for BE. (altivec_vmrglh_direct_le): New pattern for LE. (altivec_vmrghw_direct_): Remove. (altivec_vmrghw_direct__be): New pattern for BE. (altivec_vmrglw_direct__le): New pattern for LE. (altivec_vmrglb_direct): Remove. (altivec_vmrglb_direct_be): New pattern for BE. (altivec_vmrghb_direct_le): New pattern for LE. (altivec_vmrglh_direct): Remove. (altivec_vmrglh_direct_be): New pattern for BE. (altivec_vmrghh_direct_le): New pattern for LE. (altivec_vmrglw_direct_): Remove. (altivec_vmrglw_direct__be): New pattern for BE. (altivec_vmrghw_direct__le): New pattern for LE. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Adjust. * config/rs6000/vsx.md: Likewise. gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. 
Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 223 ++-- gcc/config/rs6000/rs6000.cc | 36 ++-- gcc/config/rs6000/vsx.md| 24 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 120 +++ 4 files changed, 305 insertions(+), 98 deletions(-) create mode 100644 gcc/testsuite/g++.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..78245f470e9 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,15 +1144,17 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + rtvec v = gen_rtvec (16, GEN_INT (0), GEN_INT (16), GEN_INT (1), GEN_INT (17), + GEN_INT (2), GEN_INT (18), GEN_INT (3), GEN_INT (19),
Ping: [PATCH v4] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
Hi Segher, Ping this for stage 4... On 2023/2/10 10:59, Xionghu Luo via Gcc-patches wrote: Resend this patch... v4: Update per comments. v3: rename altivec_vmrghb_direct_le to altivec_vmrglb_direct_le to match the actual output ASM vmrglb. Likewise for all similar xxx_direct_le patterns. v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux. gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb_direct): Remove. (altivec_vmrghb_direct_be): New pattern for BE. (altivec_vmrghb_direct_le): New pattern for LE. (altivec_vmrghh_direct): Remove. (altivec_vmrghh_direct_be): New pattern for BE. (altivec_vmrghh_direct_le): New pattern for LE. (altivec_vmrghw_direct_): Remove. (altivec_vmrghw_direct__be): New pattern for BE. (altivec_vmrghw_direct__le): New pattern for LE. (altivec_vmrglb_direct): Remove. (altivec_vmrglb_direct_be): New pattern for BE. (altivec_vmrglb_direct_le): New pattern for LE. (altivec_vmrglh_direct): Remove. (altivec_vmrglh_direct_be): New pattern for BE. (altivec_vmrglh_direct_le): New pattern for LE. (altivec_vmrglw_direct_): Remove. (altivec_vmrglw_direct__be): New pattern for BE. (altivec_vmrglw_direct__le): New pattern for LE. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Adjust. * config/rs6000/vsx.md: Likewise. gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 222 ++-- gcc/config/rs6000/rs6000.cc | 24 +-- gcc/config/rs6000/vsx.md| 28 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 118 +++ 4 files changed, 307 insertions(+), 85 deletions(-) create mode 100644 gcc/testsuite/g++.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 30606b8ab21..4bfeecec224 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,15 +1144,16 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + if (BYTES_BIG_ENDIAN) +emit_insn ( + gen_altivec_vmrghb_direct_be (operands[0], operands[1], operands[2])); + else +emit_insn ( + gen_altivec_vmrglb_direct_le (operands[0], operands[2], operands[1])); DONE; }) -(define_insn "altivec_vmrghb_direct" +(define_insn "altivec_vmrghb_direct_be" [(set (match_operand:V16QI 0 "register_operand" "=v") (vec_select:V16QI (vec_concat:V32QI @@ -1166,7 +1167,25 @@ (define_insn "altivec_vmrghb_direct" (const_int 5) (const_int 21) (const_int 6) (const_int 22) (const_int 7) (const_int 23)])))] - "TARGET_ALTIVEC" + "TARGET_ALTIVEC && BYTES_BIG_ENDIAN" + "vmrghb %0,%1,%2" + [(set_attr "type" "vecperm")]) + +(define_insn "altivec_vmrghb_direct_le" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 2 "register_operand" "v") + (match_operand:V16QI 1 "register_operand" "v")) + (parallel [(const_int 8) (const_int 24) +(const
[PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
When spliting edge with self loop, the split edge should be placed just next to the edge_in->src, otherwise it may generate different position latch bbs for two consecutive self loops. For details, please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93680#c4 Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/93680 * tree-cfg.cc (split_edge_bb_loc): Return edge_in->src for self loop. gcc/testsuite/ChangeLog: PR gcov/93680 * gcc.misc-tests/gcov-pr93680.c: New test. Signed-off-by: Xionghu Luo --- gcc/testsuite/gcc.misc-tests/gcov-pr93680.c | 24 + gcc/tree-cfg.cc | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr93680.c diff --git a/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c b/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c new file mode 100644 index 000..b2bf9e626fc --- /dev/null +++ b/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c @@ -0,0 +1,24 @@ +/* { dg-options "-fprofile-arcs -ftest-coverage" } */ +/* { dg-do run { target native } } */ + +int f(int s, int n) +{ + int p = 0; + + switch (s) + { +case 0: /* count(5) */ + do { p++; } while (--n); /* count(5) */ + return p; /* count(1) */ + +case 1: /* count(5) */ + do { p++; } while (--n); /* count(5) */ + return p; /* count(1) */ + } + + return 0; +} + +int main() { f(0, 5); f(1, 5); return 0; } + +/* { dg-final { run-gcov gcov-pr93680.c } } */ diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index a9fcc7fd050..6fa1d83d366 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -3009,7 +3009,7 @@ split_edge_bb_loc (edge edge_in) if (dest_prev) { edge e = find_edge (dest_prev, dest); - if (e && !(e->flags & EDGE_COMPLEX)) + if ((e && !(e->flags & EDGE_COMPLEX)) || edge_in->src == edge_in->dest) return edge_in->src; } return dest_prev; -- 2.27.0
[PATCH 2/2] gcov: Fix incorrect gimple line LOCATION [PR97923]
For case like belowi test.c: 1:int foo(char c) 2:{ 3: return ((c >= 'A' && c <= 'Z') 4: || (c >= 'a' && c <= 'z') 5: || (c >= '0' && c <='0'));} the generated line number is incorrect for condition c>='A' of block 2: Thus correct the condition op0 location. gcno diff before and with this patch: test.gcno: 575: block 11: 1:0001(tree) test.gcno: 583:0145: 35:LINES -test.gcno: 595: block 2:`test.c':1, 5 +test.gcno: 595: block 2:`test.c':1, 3 test.gcno: 626:0145: 31:LINES test.gcno: 638: block 3:`test.c':3 test.gcno: 665:0145: 31:LINES test.gcno: 677: block 4:`test.c':4 test.gcno: 704:0145: 31:LINES test.gcno: 716: block 5:`test.c':4 test.gcno: 743:0145: 31:LINES test.gcno: 755: block 6:`test.c':5 Also save line id in line vector for gcov debug use. Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/97923 * gcov.cc (line_info::line_info): Init id. (solve_flow_graph): Fix typo. (add_line_counts): Set line->id. * gimplify.cc (shortcut_cond_r): Correct cond expr op0 location. gcc/testsuite/ChangeLog: PR gcov/97923 * gcc.misc-tests/gcov-pr97923.c: New test. Signed-off-by: Xionghu Luo --- gcc/gcov.cc | 9 ++--- gcc/gimplify.cc | 6 -- gcc/testsuite/gcc.misc-tests/gcov-pr97923.c | 13 + 3 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr97923.c diff --git a/gcc/gcov.cc b/gcc/gcov.cc index 2ec7248cc0e..77ca94c71c4 100644 --- a/gcc/gcov.cc +++ b/gcc/gcov.cc @@ -205,6 +205,8 @@ public: /* Execution count. */ gcov_type count; + unsigned id; + /* Branches from blocks that end on this line. */ vector branches; @@ -216,8 +218,8 @@ public: unsigned has_unexecuted_block : 1; }; -line_info::line_info (): count (0), branches (), blocks (), exists (false), - unexceptional (0), has_unexecuted_block (0) +line_info::line_info (): count (0), id (0), branches (), blocks (), + exists (false), unexceptional (0), has_unexecuted_block (0) { } @@ -2370,7 +2372,7 @@ solve_flow_graph (function_info *fn) /* If the graph has been correctly solved, every block will have a valid count. */ - for (unsigned i = 0; ix < fn->blocks.size (); i++) + for (unsigned i = 0; i < fn->blocks.size (); i++) if (!fn->blocks[i].count_valid) { fnotice (stderr, "%s:graph is unsolvable for '%s'\n", @@ -2730,6 +2732,7 @@ add_line_counts (coverage_info *coverage, function_info *fn) } line->count += block->count; } + line->id = ln; } has_any_line = true; diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index ade6e335da7..341a27b033e 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -3915,7 +3915,8 @@ shortcut_cond_r (tree pred, tree *true_label_p, tree *false_label_p, false_label_p = &local_label; /* Keep the original source location on the first 'if'. */ - t = shortcut_cond_r (TREE_OPERAND (pred, 0), NULL, false_label_p, locus); + tree op0 = TREE_OPERAND (pred, 0); + t = shortcut_cond_r (op0, NULL, false_label_p, EXPR_LOCATION (op0)); append_to_statement_list (t, &expr); /* Set the source location of the && on the second 'if'. */ @@ -3938,7 +3939,8 @@ shortcut_cond_r (tree pred, tree *true_label_p, tree *false_label_p, true_label_p = &local_label; /* Keep the original source location on the first 'if'. */ - t = shortcut_cond_r (TREE_OPERAND (pred, 0), true_label_p, NULL, locus); + tree op0 = TREE_OPERAND (pred, 0); + t = shortcut_cond_r (op0, true_label_p, NULL, EXPR_LOCATION (op0)); append_to_statement_list (t, &expr); /* Set the source location of the || on the second 'if'. 
*/ diff --git a/gcc/testsuite/gcc.misc-tests/gcov-pr97923.c b/gcc/testsuite/gcc.misc-tests/gcov-pr97923.c new file mode 100644 index 000..ad4f7d40817 --- /dev/null +++ b/gcc/testsuite/gcc.misc-tests/gcov-pr97923.c @@ -0,0 +1,13 @@ +/* { dg-options "-fprofile-arcs -ftest-coverage" } */ +/* { dg-do run { target native } } */ + +int foo(int c) +{ + return ((c >= 'A' && c <= 'Z') /* count(1*) */ + || (c >= 'a' && c <= 'z') /* count(1*) */ + || (c >= '0' && c <= '0')); /* count(1*) */ +} + +int main() { foo(0); } + +/* { dg-final { run-gcov gcov-pr97923-1.c } } */ -- 2.27.0
Re: [PATCH 2/2] gcov: Fix incorrect gimple line LOCATION [PR97923]
On 2023/3/2 16:16, Richard Biener wrote: On Thu, Mar 2, 2023 at 3:31 AM Xionghu Luo via Gcc-patches wrote: For case like belowi test.c: 1:int foo(char c) 2:{ 3: return ((c >= 'A' && c <= 'Z') 4: || (c >= 'a' && c <= 'z') 5: || (c >= '0' && c <='0'));} the generated line number is incorrect for condition c>='A' of block 2: Thus correct the condition op0 location. gcno diff before and with this patch: test.gcno: 575: block 11: 1:0001(tree) test.gcno: 583:0145: 35:LINES -test.gcno: 595: block 2:`test.c':1, 5 +test.gcno: 595: block 2:`test.c':1, 3 test.gcno: 626:0145: 31:LINES test.gcno: 638: block 3:`test.c':3 test.gcno: 665:0145: 31:LINES test.gcno: 677: block 4:`test.c':4 test.gcno: 704:0145: 31:LINES test.gcno: 716: block 5:`test.c':4 test.gcno: 743:0145: 31:LINES test.gcno: 755: block 6:`test.c':5 Also save line id in line vector for gcov debug use. Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/97923 * gcov.cc (line_info::line_info): Init id. (solve_flow_graph): Fix typo. (add_line_counts): Set line->id. * gimplify.cc (shortcut_cond_r): Correct cond expr op0 location. gcc/testsuite/ChangeLog: PR gcov/97923 * gcc.misc-tests/gcov-pr97923.c: New test. Signed-off-by: Xionghu Luo --- gcc/gcov.cc | 9 ++--- gcc/gimplify.cc | 6 -- gcc/testsuite/gcc.misc-tests/gcov-pr97923.c | 13 + 3 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr97923.c diff --git a/gcc/gcov.cc b/gcc/gcov.cc index 2ec7248cc0e..77ca94c71c4 100644 --- a/gcc/gcov.cc +++ b/gcc/gcov.cc @@ -205,6 +205,8 @@ public: /* Execution count. */ gcov_type count; + unsigned id; + /* Branches from blocks that end on this line. */ vector branches; @@ -216,8 +218,8 @@ public: unsigned has_unexecuted_block : 1; }; -line_info::line_info (): count (0), branches (), blocks (), exists (false), - unexceptional (0), has_unexecuted_block (0) +line_info::line_info (): count (0), id (0), branches (), blocks (), + exists (false), unexceptional (0), has_unexecuted_block (0) { } @@ -2370,7 +2372,7 @@ solve_flow_graph (function_info *fn) /* If the graph has been correctly solved, every block will have a valid count. */ - for (unsigned i = 0; ix < fn->blocks.size (); i++) + for (unsigned i = 0; i < fn->blocks.size (); i++) if (!fn->blocks[i].count_valid) { fnotice (stderr, "%s:graph is unsolvable for '%s'\n", @@ -2730,6 +2732,7 @@ add_line_counts (coverage_info *coverage, function_info *fn) } line->count += block->count; } + line->id = ln; } has_any_line = true; diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index ade6e335da7..341a27b033e 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -3915,7 +3915,8 @@ shortcut_cond_r (tree pred, tree *true_label_p, tree *false_label_p, false_label_p = &local_label; /* Keep the original source location on the first 'if'. */ - t = shortcut_cond_r (TREE_OPERAND (pred, 0), NULL, false_label_p, locus); + tree op0 = TREE_OPERAND (pred, 0); + t = shortcut_cond_r (op0, NULL, false_label_p, EXPR_LOCATION (op0)); append_to_statement_list (t, &expr); The comment now no longer is true? For the else arm we use rexpr_location, why not here as well? To quote the following lines: /* Set the source location of the && on the second 'if'. */ new_locus = rexpr_location (pred, locus); t = shortcut_cond_r (TREE_OPERAND (pred, 1), true_label_p, false_label_p, new_locus); append_to_statement_list (t, &expr); Thanks, should use rexpr_location with each operand like below. with your change the location of the outer COND_EXPR is lost? 
Can we guarantee that it's used for the first operand of a if (a && b && c)? It would be nice to expand the leading comment for such a three operand case and explain how it's supposed to work. I tested the three operand case, it will iteratively call shortcut_cond_r and also works as expected. Seems the outer COND_EXPR is useless if we do the followed conversion? if (TREE_CODE (pred) == TRUTH_ANDIF_EXPR) { location_t new_locus; /* Turn if (a && b) into if (a); else goto no; if
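For the three-operand case, the lowering the quoted comment describes can be written out by hand; the following is an illustrative C equivalent (not gimplify output), showing that each generated 'if' naturally carries the location of its own operand:

int
lowered (int a, int b, int c)
{
  int p = 0;
  /* Hand-expanded form of "if (a && b && c) p = 1;".  */
  if (a) ; else goto no;          /* location of op0 ('a'), what the patch fixes */
  if (b) ; else goto no;          /* location of the first '&&' / 'b' */
  if (c) goto yes; else goto no;  /* location of the second '&&' / 'c' */
 yes:
  p = 1;
 no:
  return p;
}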
Re: [PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/2 16:41, Richard Biener wrote: On Thu, Mar 2, 2023 at 3:31 AM Xionghu Luo via Gcc-patches wrote: When spliting edge with self loop, the split edge should be placed just next to the edge_in->src, otherwise it may generate different position latch bbs for two consecutive self loops. For details, please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93680#c4 Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/93680 * tree-cfg.cc (split_edge_bb_loc): Return edge_in->src for self loop. gcc/testsuite/ChangeLog: PR gcov/93680 * gcc.misc-tests/gcov-pr93680.c: New test. Signed-off-by: Xionghu Luo --- gcc/testsuite/gcc.misc-tests/gcov-pr93680.c | 24 + gcc/tree-cfg.cc | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr93680.c diff --git a/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c b/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c new file mode 100644 index 000..b2bf9e626fc --- /dev/null +++ b/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c @@ -0,0 +1,24 @@ +/* { dg-options "-fprofile-arcs -ftest-coverage" } */ +/* { dg-do run { target native } } */ + +int f(int s, int n) +{ + int p = 0; + + switch (s) + { +case 0: /* count(5) */ + do { p++; } while (--n); /* count(5) */ + return p; /* count(1) */ + +case 1: /* count(5) */ + do { p++; } while (--n); /* count(5) */ + return p; /* count(1) */ + } + + return 0; +} + +int main() { f(0, 5); f(1, 5); return 0; } + +/* { dg-final { run-gcov gcov-pr93680.c } } */ diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index a9fcc7fd050..6fa1d83d366 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -3009,7 +3009,7 @@ split_edge_bb_loc (edge edge_in) if (dest_prev) { edge e = find_edge (dest_prev, dest); - if (e && !(e->flags & EDGE_COMPLEX)) + if ((e && !(e->flags & EDGE_COMPLEX)) || edge_in->src == edge_in->dest) I think this should eventually apply to all backedge edge_in, correct? But of course we cannot easily test for this here. Still since this affects ordering in the {next,prev}_bb chain only but not CFG semantics I wonder how it can affect coverage? Isn't it only by chance that this block order survives? For case: 1 int f(int s, int n) 2 { 3 int p = 0; 4 int q = 0; 5 6 switch (s) 7{ 8case 0: 9 do { p++; } while (--n); 10 return p; 11 12case 1: 13 do { p++; } while (--n); 14 return p; 15} 16 17 return 0; 18 } 19 20 int main() { f(0, 5); f(1, 5);} current GCC generates: : ... :<= first loop ... goto ; [INV] else goto ; [INV] : <= first latch bb goto ; [100.00%] : ... goto ; [INV] : <= second latch bb :<= second loop ... goto ; [INV] else goto ; [INV] and are created by split_edge->split_edge_bb_loc, is located after the loop, but is located before the loop. First call of split_edge_bb_loc, the dest_prev is , and find_edge did find a edge from to , the returned afte_bb is , so latch is put after the loop but second call of split_edge_bb_loc, the dest_prev is , so find_edge return 0, and the returned after_bb is , then the created latch is put before the loop... Different latch bb position caused different gcno, while gcov has poor information and not that smart to recognize it:(, is it reasonable to keep this kind of loops same order? 
small.gcno: 648: block 2:`small.c':1, 3, 4, 6 small.gcno: 688:0145: 36:LINES small.gcno: 700: block 3:`small.c':8, 9 small.gcno: 732:0145: 32:LINES small.gcno: 744: block 5:`small.c':10 -small.gcno: 772:0145: 32:LINES -small.gcno: 784: block 6:`small.c':12 -small.gcno: 812:0145: 36:LINES -small.gcno: 824: block 7:`small.c':12, 13 +small.gcno: 772:0145: 36:LINES +small.gcno: 784: block 6:`small.c':12, 13 +small.gcno: 816:0145: 32:LINES +small.gcno: 828: block 8:`small.c':14 small.gcno: 856:0145: 32:LINES -small.gcno: 868: block 8:`small.c':14 -small.gcno: 896:0145: 32:LINES -small.gcno: 908: block 9:`small.c':17 +small.gcno: 868: block 9:`small.c':17 For the case when both edge_in->src has more than one successor and edge_in->dest has more than one predecessor there isn't any good heuristic to make printing the blocks in chain order "nice" (well, the backedge one maybe). But as said - this order shouldn't have any effect on semantics ... return edge_in->src; } return dest_prev; -- 2.27.0
Re: [PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/2 18:45, Richard Biener wrote: small.gcno: 648: block 2:`small.c':1, 3, 4, 6 small.gcno: 688:0145: 36:LINES small.gcno: 700: block 3:`small.c':8, 9 small.gcno: 732:0145: 32:LINES small.gcno: 744: block 5:`small.c':10 -small.gcno: 772:0145: 32:LINES -small.gcno: 784: block 6:`small.c':12 -small.gcno: 812:0145: 36:LINES -small.gcno: 824: block 7:`small.c':12, 13 +small.gcno: 772:0145: 36:LINES +small.gcno: 784: block 6:`small.c':12, 13 +small.gcno: 816:0145: 32:LINES +small.gcno: 828: block 8:`small.c':14 small.gcno: 856:0145: 32:LINES -small.gcno: 868: block 8:`small.c':14 -small.gcno: 896:0145: 32:LINES -small.gcno: 908: block 9:`small.c':17 +small.gcno: 868: block 9:`small.c':17 Looking at the CFG and the instrumentation shows : PROF_edge_counter_17 = __gcov0.f[0]; PROF_edge_counter_18 = PROF_edge_counter_17 + 1; __gcov0.f[0] = PROF_edge_counter_18; [t.c:3:7] p_6 = 0; [t.c:5:3] switch (s_7(D)) [INV], [t.c:7:5] case 0: [INV], [t.c:11:5] case 1: [INV]> : # n_1 = PHI # p_3 = PHI <[t.c:3:7] p_6(2), [t.c:8:15] p_12(4)> [t.c:7:5] : [t.c:8:15] p_12 = p_3 + 1; [t.c:8:28] n_13 = n_1 + -1; [t.c:8:28] if (n_13 != 0) goto ; [INV] else goto ; [INV] : PROF_edge_counter_21 = __gcov0.f[2]; PROF_edge_counter_22 = PROF_edge_counter_21 + 1; __gcov0.f[2] = PROF_edge_counter_22; [t.c:7:5] goto ; [100.00%] : PROF_edge_counter_23 = __gcov0.f[3]; PROF_edge_counter_24 = PROF_edge_counter_23 + 1; __gcov0.f[3] = PROF_edge_counter_24; [t.c:9:16] _14 = p_12; [t.c:9:16] goto ; [INV] so the reason this goes wrong is that gcov associates the "wrong" counter with the block containing the 'case' label(s), for the case 0 it should have chosen the counter from bb5 but it likely computed the count of bb3? It might be that ordering blocks differently puts the instrumentation to different blocks or it makes gcovs association chose different blocks but that means it's just luck and not fixing the actual issue? To me it looks like the correct thing to investigate is switch statement and/or case label handling. One can also see that having line number 7 is wrong to the extent that the position of the label doesn't match the number of times it executes in the source. So placement of the label is wrong here, possibly caused by CFG cleanup after CFG build (but generally labels are not used for anything once the CFG is built and coverage instrumentation is late so it might fail due to us moving labels). It might be OK to avoid moving labels for --coverage but then coverage should possibly look at edges rather than labels? 
Thanks, I investigated the Labels, it seems wrong at the beginning from .gimple to .cfg very early quite like PR90574: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90574 .gimple: int f (int s, int n) [small.c:2:1] { int D.2755; int p; [small.c:3:7] p = 0; [small.c:5:3] switch (s) , [small.c:7:5] case 0: , [small.c:11:5] case 1: > [small.c:7:5] : <= case label :<= loop label [small.c:8:13] p = p + 1; [small.c:8:26] n = n + -1; [small.c:8:26] if (n != 0) goto ; else goto ; : [small.c:9:14] D.2755 = p; [small.c:9:14] return D.2755; [small.c:11:5] : : [small.c:12:13] p = p + 1; [small.c:12:26] n = n + -1; [small.c:12:26] if (n != 0) goto ; else goto ; : [small.c:13:14] D.2755 = p; [small.c:13:14] return D.2755; : [small.c:16:10] D.2755 = 0; [small.c:16:10] return D.2755; } .cfg: int f (int s, int n) { int p; int D.2755; : [small.c:3:7] p = 0; [small.c:5:3] switch (s) [INV], [small.c:7:5] case 0: [INV], [small.c:11:5] case 1: [INV]> : [small.c:7:5] : <= case 0 [small.c:8:13 discrim 1] p = p + 1; [small.c:8:26 discrim 1] n = n + -1; [small.c:8:26 discrim 1] if (n != 0) goto ; [INV] else goto ; [INV] : [small.c:9:14] D.2755 = p; [small.c:9:14] goto ; [INV] : [small.c:11:5] : <= case 1 [small.c:12:13 discrim 1] p = p + 1; [small.c:12:26 discrim 1] n = n + -1; [small.c:12:26 discrim 1] if (n != 0) goto ; [INV] else goto ; [INV] The labels are merged into the loop unexpected, so I tried below fix for --coverage if two labels are not on same line to start new basic block: index 10ca86714f4..b788198ac31 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -2860,6 +2860,13 @@ stmt_starts_bb_p (gimple *stmt, gimple *prev_stmt) || !DECL_ARTIFICIAL (gimple_label_label (plabel))) return true; + location_t loc_prev = gimple_location (plabel); + location_t locus = gimple_location (label_stmt); + expanded_location locus_e = expand_location
Re: [PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/6 16:11, Richard Biener wrote: On Mon, Mar 6, 2023 at 8:22 AM Xionghu Luo wrote: On 2023/3/2 18:45, Richard Biener wrote: small.gcno: 648: block 2:`small.c':1, 3, 4, 6 small.gcno: 688:0145: 36:LINES small.gcno: 700: block 3:`small.c':8, 9 small.gcno: 732:0145: 32:LINES small.gcno: 744: block 5:`small.c':10 -small.gcno: 772:0145: 32:LINES -small.gcno: 784: block 6:`small.c':12 -small.gcno: 812:0145: 36:LINES -small.gcno: 824: block 7:`small.c':12, 13 +small.gcno: 772:0145: 36:LINES +small.gcno: 784: block 6:`small.c':12, 13 +small.gcno: 816:0145: 32:LINES +small.gcno: 828: block 8:`small.c':14 small.gcno: 856:0145: 32:LINES -small.gcno: 868: block 8:`small.c':14 -small.gcno: 896:0145: 32:LINES -small.gcno: 908: block 9:`small.c':17 +small.gcno: 868: block 9:`small.c':17 Looking at the CFG and the instrumentation shows : PROF_edge_counter_17 = __gcov0.f[0]; PROF_edge_counter_18 = PROF_edge_counter_17 + 1; __gcov0.f[0] = PROF_edge_counter_18; [t.c:3:7] p_6 = 0; [t.c:5:3] switch (s_7(D)) [INV], [t.c:7:5] case 0: [INV], [t.c:11:5] case 1: [INV]> : # n_1 = PHI # p_3 = PHI <[t.c:3:7] p_6(2), [t.c:8:15] p_12(4)> [t.c:7:5] : [t.c:8:15] p_12 = p_3 + 1; [t.c:8:28] n_13 = n_1 + -1; [t.c:8:28] if (n_13 != 0) goto ; [INV] else goto ; [INV] : PROF_edge_counter_21 = __gcov0.f[2]; PROF_edge_counter_22 = PROF_edge_counter_21 + 1; __gcov0.f[2] = PROF_edge_counter_22; [t.c:7:5] goto ; [100.00%] : PROF_edge_counter_23 = __gcov0.f[3]; PROF_edge_counter_24 = PROF_edge_counter_23 + 1; __gcov0.f[3] = PROF_edge_counter_24; [t.c:9:16] _14 = p_12; [t.c:9:16] goto ; [INV] so the reason this goes wrong is that gcov associates the "wrong" counter with the block containing the 'case' label(s), for the case 0 it should have chosen the counter from bb5 but it likely computed the count of bb3? It might be that ordering blocks differently puts the instrumentation to different blocks or it makes gcovs association chose different blocks but that means it's just luck and not fixing the actual issue? To me it looks like the correct thing to investigate is switch statement and/or case label handling. One can also see that having line number 7 is wrong to the extent that the position of the label doesn't match the number of times it executes in the source. So placement of the label is wrong here, possibly caused by CFG cleanup after CFG build (but generally labels are not used for anything once the CFG is built and coverage instrumentation is late so it might fail due to us moving labels). It might be OK to avoid moving labels for --coverage but then coverage should possibly look at edges rather than labels? 
Thanks, I investigated the Labels, it seems wrong at the beginning from .gimple to .cfg very early quite like PR90574: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90574 .gimple: int f (int s, int n) [small.c:2:1] { int D.2755; int p; [small.c:3:7] p = 0; [small.c:5:3] switch (s) , [small.c:7:5] case 0: , [small.c:11:5] case 1: > [small.c:7:5] : <= case label :<= loop label [small.c:8:13] p = p + 1; [small.c:8:26] n = n + -1; [small.c:8:26] if (n != 0) goto ; else goto ; : [small.c:9:14] D.2755 = p; [small.c:9:14] return D.2755; [small.c:11:5] : : [small.c:12:13] p = p + 1; [small.c:12:26] n = n + -1; [small.c:12:26] if (n != 0) goto ; else goto ; : [small.c:13:14] D.2755 = p; [small.c:13:14] return D.2755; : [small.c:16:10] D.2755 = 0; [small.c:16:10] return D.2755; } .cfg: int f (int s, int n) { int p; int D.2755; : [small.c:3:7] p = 0; [small.c:5:3] switch (s) [INV], [small.c:7:5] case 0: [INV], [small.c:11:5] case 1: [INV]> : [small.c:7:5] : <= case 0 [small.c:8:13 discrim 1] p = p + 1; [small.c:8:26 discrim 1] n = n + -1; [small.c:8:26 discrim 1] if (n != 0) goto ; [INV] else goto ; [INV] : [small.c:9:14] D.2755 = p; [small.c:9:14] goto ; [INV] : [small.c:11:5] : <= case 1 [small.c:12:13 discrim 1] p = p + 1; [small.c:12:26 discrim 1] n = n + -1; [small.c:12:26 discrim 1] if (n != 0) goto ; [INV] else goto ; [INV] The labels are merged into the loop unexpected, so I tried below fix for --coverage if two labels are not on same line to start new basic block: index 10ca86714f4..b788198ac31 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@
Re: [PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/7 16:53, Richard Biener wrote: On Tue, 7 Mar 2023, Xionghu Luo wrote: Unfortunately this change (flag_test_coverage -> !optimize ) caused hundred of gfortran cases execution failure with O0. Take gfortran.dg/index.f90 for example: .gimple: __attribute__((fn spec (". "))) void p () [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:6:9] { [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:13:28] L.1: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:14:28] L.2: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:15:28] L.3: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:16:28] L.4: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:17:28] L.5: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:18:72] L.6: } .cfg: ... Removing basic block 7 ;; basic block 7, loop depth 0 ;; pred: return; ;; succ: EXIT ;; 1 loops found ;; ;; Loop 0 ;; header 0, latch 1 ;; depth 0, outer -1 ;; nodes: 0 1 2 ;;2 succs { } __attribute__((fn spec (". "))) void p () { : } Due to the "return;" is removed in bb 7. OK, the issue is that make_edges_bb does nothing for an empty block but it should at least create a fallthru edge here. Thus, if (!last) fallthru = true; else switch (gimple_code (last)) { instead of simply returning if (!last). The alternative would be to make sure that cleanup_dead_labels preserves at least one statement in a block. Looking at the testcases I wonder if preserving all the fallthru labels is really necessary - for coverage we should have a counter ready. For the testcase we arrive with L.1: L.2: L.3: L.4: i = 1; It was: : : L.1: : L.2: : L.3: : L.4: : L.5: : L.6: return; : before the second call of cleanup_dead_labels, after it, all labels are removed, then tree_forwarder_block_p remove all forworders. Yes, it creates blocks and remove blocks immediately... where the frontend simplified things but put labels at each line. I suppose we could optimize this by re-computing TREE_USED and only splitting before labels reached by a control statement? That would cover the backedge case in the original testcase. cleanup_dead_labels does something like that already. actually in build_gimple_cfg, cleanup_dead_labels will remove all labels L.1 to L.6 first, then make_edges fail to create edges for to due to they are all EMPTY bb in make_edges_bb... 240│ /* To speed up statement iterator walks, we first purge dead labels. */ 241│ cleanup_dead_labels (); 242│ 243│ /* Group case nodes to reduce the number of edges. 244│ We do this after cleaning up dead labels because otherwise we miss 245│ a lot of obvious case merging opportunities. */ 246│ group_case_labels (); 247│ 248│ /* Create the edges of the flowgraph. */ 249│ discriminator_per_locus = new hash_table (13); 250├> make_edges (); : : : : : : : return; : Seems deadlock here as you said to set goto_locus as labels are removed before edges are created, the case could pass if I comment out the function cleanup_dead_labels(), so also not call it when !optimize? if (!!optimize) cleanup_dead_labels (); That probably makes sense. Looking at group_case_labels () that also seems to do unwanted things (to debugging and coverage), its comment says that for switch (i) { case 1: /* fallthru */ case 2: /* fallthru */ case 3: k = 0; it would replace that with case 1..3: k = 0; but that also fails to produce correct coverage, right? Likewise setting breakpoints. Yes. Should also exclude this. 
Does preserving the labels help setting a goto_locus for the fallthru edges? I don't see any code doing that, so CFG cleanup will remove the forwarders we created again. For the backedge case with switch-case-do-while, tree_forwarder_block_p returns false when iterating the statement check. The new created with only one case label instruction still owns location information in it, so CFG cleanup won't remove the forwarders. 390│ for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi)) 391│ { 392│ gimple *stmt = gsi_stmt (gsi); 393│ 394│ switch (gimple_code (stmt)) 395│ { 396│ case GIMPLE_LABEL: 397│ if (DECL_NONLOCAL (gimple_label_label (as_a (stmt 398│ return false; 399│ if (!optimize 400│ && (gimple_has_location (stmt) 401│ || LOCATION_LOCUS (locus) != UNKNOWN_LOCATION) 402│ && gimple_location (stmt) != locus) 403├>return false; 404│ break; (gdb) ps stmt : (gdb) p gimple_location (stmt) $154 = 2147483656 (gdb) pel $154 {file = 0x3e41af0 "small.c", line = 7, column = 5,
[PATCH v3] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/7 19:25, Richard Biener wrote: It would be nice to avoid creating blocks / preserving labels we'll immediately remove again. For that we do need some analysis before creating basic-blocks that determines whether a label is possibly reached by a non-falltru edge. : p = 0; switch (s) , case 0: , case 1: > : : <= prev_stmt : <= stmt p = p + 1; n = n + -1; if (n != 0) goto ; else goto ; Check if is a case label and is a goto target then return true in stmt_starts_bb_p to start a new basic block? This would avoid creating and removing blocks, but cleanup_dead_labels has all bbs setup while stmt_starts_bb_p does't yet to iterate bbs/labels to establish label_for_bb[] map? Yes. I think we'd need something more pragmatic before make_blocks (), like re-computing TREE_USED of the label decls or computing a bitmap of targeted labels (targeted by goto, switch or any other means). I'll note that doing a cleanup_dead_labels () like optimization before we create blocks will help keeping LABEL_DECL_UID and thus label_to_block_map dense. But it does look like a bit of an chicken-and-egg problem and the question is how effective the dead label removal is in practice. Tried to add function compute_target_labels(not sure whether the function name is suitable) in the front of make_blocks_1, now the fortran case doesn't create/removing blocks now, but I still have several questions: 1. I used hash_set to save the target labels instead of bitmap, as labels are tree type value instead of block index so bitmap is not good for it since we don't have LABEL_DECL_UID now? 2. Is the compute_target_labels still only for !optimize? And if we compute the target labels before create bbs, it is unnessary to guard the first cleanup_dead_labels under !optimize now, because the switch-case-do-while case already create new block for CASE_LABEL already. 3. I only added GIMPLE_SWITCH/GIMPLE_COND in compute_target_labels so far, is it needed to also handle GIMPLE_ASM/GIMPLE_TRANSACTION and even labels_eh? PS1: The v3 patch will cause one test case fail: Number of regressions in total: 1 FAIL: gcc.c-torture/compile/limits-caselabels.c -O0 (test for excess errors) due to this exausting case has labels from L0 to L11, they won't be optimized to a simple if-else expression like before... PS2: The GIMPLE_GOTO piece of code would cause some fortran cases run fail due to __builtin_unreachable trap generated in .fixup_cfg1, I didn't dig into it so just skip these label... + case GIMPLE_GOTO: +#if 0 + if (!computed_goto_p (stmt)) + { + tree dest = gimple_goto_dest (stmt); + target_labels->add (dest); + } +#endif + break; Change the #if 0 to #if 1 result in: Number of regressions in total: 8 FAIL: gcc.c-torture/compile/limits-caselabels.c -O0 (test for excess errors) FAIL: gcc.dg/analyzer/explode-2a.c (test for excess errors) FAIL: gcc.dg/analyzer/pragma-2.c (test for excess errors) FAIL: gfortran.dg/bound_2.f90 -O0 execution test FAIL: gfortran.dg/bound_7.f90 -O0 execution test FAIL: gfortran.dg/char_result_14.f90 -O0 execution test FAIL: gfortran.dg/pointer_array_1.f90 -O0 execution test FAIL: gfortran.dg/select_type_15.f03 -O0 execution test Paste the updated patch v3: v3: Add compute_target_labels and call it in the front of make_blocks_1. Start a new basic block if two labels have different location when test-coverage. Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/93680 * tree-cfg.cc (stmt_starts_bb_p): Check whether the label is in target_labels. 
(compute_target_labels): New function. (make_blocks_1): Call compute_target_labels. gcc/testsuite/ChangeLog: PR gcov/93680 * g++.dg/gcov/gcov-1.C: Correct counts. * gcc.misc-tests/gcov-4.c: Likewise. * gcc.misc-tests/gcov-pr85332.c: Likewise. * lib/gcov.exp: Also clean gcda if fail. * gcc.misc-tests/gcov-pr93680.c: New test. Signed-off-by: Xionghu Luo --- gcc/tree-cfg.cc | 68 - gcc/testsuite/g++.dg/gcov/gcov-1.C | 2 +- gcc/testsuite/gcc.dg/analyzer/paths-4.c | 8 +-- gcc/testsuite/gcc.misc-tests/gcov-pr85332.c | 2 +- gcc/testsuite/gcc.misc-tests/gcov-pr93680.c | 24 gcc/testsuite/lib/gcov.exp | 4 +- 6 files changed, 96 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr93680.c diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index a9fcc7fd050..0f8efcf4aa3 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -164,7 +164,7 @@ static edge gimple_redirect_edge_and_branch (edge, basic_block); static edge gimple_try_redirect_by_replacing_jump (edge, basic_block); /* Various helpers. */ -static inline bool stmt_s
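To make the approach easier to follow, here is a rough sketch of what compute_target_labels does: one linear walk over the statement sequence that records every label targeted by a non-fallthru construct, so stmt_starts_bb_p can simply test membership instead of creating blocks that are deleted again. This is only an illustration of the idea, not the exact hunk from the patch; the signature and the GIMPLE_ASM case are assumptions.

/* Sketch only: collect every label reachable by a non-fallthru construct
   (conditional, switch, asm goto) into TARGET_LABELS.  */
static void
compute_target_labels (gimple_seq seq, hash_set<tree> *target_labels)
{
  for (gimple_stmt_iterator gsi = gsi_start (seq); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      switch (gimple_code (stmt))
        {
        case GIMPLE_COND:
          {
            gcond *c = as_a <gcond *> (stmt);
            if (gimple_cond_true_label (c))
              target_labels->add (gimple_cond_true_label (c));
            if (gimple_cond_false_label (c))
              target_labels->add (gimple_cond_false_label (c));
            break;
          }
        case GIMPLE_SWITCH:
          {
            gswitch *s = as_a <gswitch *> (stmt);
            for (unsigned i = 0; i < gimple_switch_num_labels (s); i++)
              target_labels->add (CASE_LABEL (gimple_switch_label (s, i)));
            break;
          }
        case GIMPLE_ASM:
          {
            gasm *a = as_a <gasm *> (stmt);
            for (unsigned i = 0; i < gimple_asm_nlabels (a); i++)
              target_labels->add (TREE_VALUE (gimple_asm_label_op (a, i)));
            break;
          }
        default:
          break;
        }
    }
}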
Re: [PATCH v4] gcov: Fix "do-while" structure in case statement leads to incorrect code coverage [PR93680]
On 2023/3/9 20:02, Richard Biener wrote: On Wed, 8 Mar 2023, Xionghu Luo wrote: On 2023/3/7 19:25, Richard Biener wrote: It would be nice to avoid creating blocks / preserving labels we'll immediately remove again. For that we do need some analysis before creating basic-blocks that determines whether a label is possibly reached by a non-falltru edge. : p = 0; switch (s) , case 0: , case 1: > : : <= prev_stmt : <= stmt p = p + 1; n = n + -1; if (n != 0) goto ; else goto ; Check if is a case label and is a goto target then return true in stmt_starts_bb_p to start a new basic block? This would avoid creating and removing blocks, but cleanup_dead_labels has all bbs setup while stmt_starts_bb_p does't yet to iterate bbs/labels to establish label_for_bb[] map? Yes. I think we'd need something more pragmatic before make_blocks (), like re-computing TREE_USED of the label decls or computing a bitmap of targeted labels (targeted by goto, switch or any other means). I'll note that doing a cleanup_dead_labels () like optimization before we create blocks will help keeping LABEL_DECL_UID and thus label_to_block_map dense. But it does look like a bit of an chicken-and-egg problem and the question is how effective the dead label removal is in practice. Tried to add function compute_target_labels(not sure whether the function name is suitable) in the front of make_blocks_1, now the fortran case doesn't create/removing blocks now, but I still have several questions: 1. I used hash_set to save the target labels instead of bitmap, as labels are tree type value instead of block index so bitmap is not good for it since we don't have LABEL_DECL_UID now? We don't have LABEL_DECL_UID, we have DECL_UID though, but the choice of hash_set vs. bitmap is somewhat arbitrary here. The real cost is the extra walk over all stmts. 2. Is the compute_target_labels still only for !optimize? And if we compute the target labels before create bbs, it is unnessary to guard the first cleanup_dead_labels under !optimize now, because the switch-case-do-while case already create new block for CASE_LABEL already. OK. 3. I only added GIMPLE_SWITCH/GIMPLE_COND in compute_target_labels so far, is it needed to also handle GIMPLE_ASM/GIMPLE_TRANSACTION and even labels_eh? I'd add GIMPLE_ASM handling, the rest should be OK wrt debugging and coverage already? Added in patch v4. PS1: The v3 patch will cause one test case fail: Number of regressions in total: 1 FAIL: gcc.c-torture/compile/limits-caselabels.c -O0 (test for excess errors) due to this exausting case has labels from L0 to L11, they won't be optimized to a simple if-else expression like before... Hmm, that's somewhat unexpected. It could be fixed by not start a new block if two locus are on same line as the labels are expanded by MACRO with same location info. BTW, I found that two UNKOWN_LOCATION variable may have different value but return true in same_line_p? 2: locus1 = 2147483670 3: locus2 = 2147483652 (gdb) pel locus1 {file = 0x0, line = 0, column = 0, data = 0x76bdc300, sysp = false} (gdb) pel locus2 {file = 0x0, line = 0, column = 0, data = 0x76bdc4e0, sysp = false} (gdb) p LOCATION_LOCUS (locus1) $16 = 0 (gdb) p LOCATION_LOCUS (locus2) $17 = 0 So fix the function like this? 
@@ -1152,6 +1218,10 @@ same_line_p (location_t locus1, expanded_location *from, location_t locus2)
 {
   expanded_location to;

+  if (LOCATION_LOCUS (locus1) == UNKNOWN_LOCATION
+      && LOCATION_LOCUS (locus2) == UNKNOWN_LOCATION)
+    return false;
+
   if (locus1 == locus2)
     return true;

PS2: The GIMPLE_GOTO piece of code causes some fortran cases to fail at run time due to a __builtin_unreachable trap generated in .fixup_cfg1; I didn't dig into it, so just skip these labels...

Please investigate, we might be missing a corner case here.

Yes. Take pointer_array_1.f90 as an example: it has an UNUSED label "L.7" with locus info in it; not sure why it exists, it is already there in .original.

[pointer_array_1.f90:39:10] if (test.14 != 0) goto ; else goto ; : [pointer_array_1.f90:39:52] _gfortran_stop_numeric (3, 0); : parm.16 = {CLOBBER(eol)}; [pointer_array_1.f90:39:52] L.7: <= UNUSED label : [pointer_array_1.f90:39:52] L.3: atmp.0 = {CLOBBER(eol)}; A.1 = {CLOBBER(eol)}; atmp.5 = {CLOBBER(eol)}; A.6 = {CLOBBER(eol)}; d = {CLOBBER(eol)}; [pointer_array_1.f90:41:14] return;

stmt_starts_bb_p will return true for L.7 because the prev_stmt "parm.16 = {CLOBBER(eol)};" is not a label statement, and then it will also return true in stmt_starts_bb_p because the label_stmt and prev_stmt are NOT on the same line.

: L.9: L.8: if (test.14 != 0) goto ; else goto ; : : _gfortran_stop_numeric (3, 0); : : parm.16 = {CLOBBER(eol)}; : <= empty block L.7: : : L.3: atmp.0 = {CLOBBER(eol)}; A.1 = {CLOBBER(eo
[PATCH v4] gcov: Fix "do-while" structure in case statement leads to incorrect code coverage [PR93680]
On 2023/3/9 20:02, Richard Biener wrote: On Wed, 8 Mar 2023, Xionghu Luo wrote: On 2023/3/7 19:25, Richard Biener wrote: It would be nice to avoid creating blocks / preserving labels we'll immediately remove again. For that we do need some analysis before creating basic-blocks that determines whether a label is possibly reached by a non-falltru edge. : p = 0; switch (s) , case 0: , case 1: > : : <= prev_stmt : <= stmt p = p + 1; n = n + -1; if (n != 0) goto ; else goto ; Check if is a case label and is a goto target then return true in stmt_starts_bb_p to start a new basic block? This would avoid creating and removing blocks, but cleanup_dead_labels has all bbs setup while stmt_starts_bb_p does't yet to iterate bbs/labels to establish label_for_bb[] map? Yes. I think we'd need something more pragmatic before make_blocks (), like re-computing TREE_USED of the label decls or computing a bitmap of targeted labels (targeted by goto, switch or any other means). I'll note that doing a cleanup_dead_labels () like optimization before we create blocks will help keeping LABEL_DECL_UID and thus label_to_block_map dense. But it does look like a bit of an chicken-and-egg problem and the question is how effective the dead label removal is in practice. Tried to add function compute_target_labels(not sure whether the function name is suitable) in the front of make_blocks_1, now the fortran case doesn't create/removing blocks now, but I still have several questions: 1. I used hash_set to save the target labels instead of bitmap, as labels are tree type value instead of block index so bitmap is not good for it since we don't have LABEL_DECL_UID now? We don't have LABEL_DECL_UID, we have DECL_UID though, but the choice of hash_set vs. bitmap is somewhat arbitrary here. The real cost is the extra walk over all stmts. 2. Is the compute_target_labels still only for !optimize? And if we compute the target labels before create bbs, it is unnessary to guard the first cleanup_dead_labels under !optimize now, because the switch-case-do-while case already create new block for CASE_LABEL already. OK. 3. I only added GIMPLE_SWITCH/GIMPLE_COND in compute_target_labels so far, is it needed to also handle GIMPLE_ASM/GIMPLE_TRANSACTION and even labels_eh? I'd add GIMPLE_ASM handling, the rest should be OK wrt debugging and coverage already? PS1: The v3 patch will cause one test case fail: Number of regressions in total: 1 FAIL: gcc.c-torture/compile/limits-caselabels.c -O0 (test for excess errors) due to this exausting case has labels from L0 to L11, they won't be optimized to a simple if-else expression like before... Hmm, that's somewhat unexpected. PS2: The GIMPLE_GOTO piece of code would cause some fortran cases run fail due to __builtin_unreachable trap generated in .fixup_cfg1, I didn't dig into it so just skip these label... Please investigate, we might be missing a corner case here. I think the *previous fix* for labels “in the middle of block” is *incorrect*, it should be handled in make_edges_bb when a basic block only has Label in it, just create a fallthrough edge for it to avoid wrong cfg and unreachable trap generated? @@ -853,6 +922,12 @@ make_edges_bb (basic_block bb, struct omp_region **pcur_region, int *pomp_index) bool fallthru = false; int ret = 0; + if (!optimize && !last) +{ + make_edge (bb, bb->next_bb, EDGE_FALLTHRU); + return 0; +} + if (!last) return ret; With the fix, the attached version could pass bootstrap and regression test on x86_64-linux-gnu. 
From ec505cc7952707db805802af83dd82776a1d949f Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Tue, 28 Feb 2023 17:46:18 +0800 Subject: [PATCH v4] gcov: Fix "do-while" structure in case statement leads to incorrect code coverage [PR93680] v4: Address comments. 4.1. Handle GIMPLE_GOTO and GIMPLE_ASM. 4.2. Fix failure of limit-caselabels.c (labels on same line), pointer_array_1.f90 (unused labels) etc. v3: Add compute_target_labels and call it in the front of make_blocks_1. v2: Check whether two locus are on same line. Start a new basic block if two labels have different location when test-coverage. Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/93680 * tree-cfg.cc (stmt_starts_bb_p): Check whether the label is in target_labels. (compute_target_labels): New function. (make_blocks_1): Call compute_target_labels. (same_line_p): Return false if two locus are both UNKOWN_LOCATION. gcc/testsuite/ChangeLog: PR gcov/93680 * g++.dg/gcov/gcov-1.C: Correct counts. * gcc.misc-tests/gcov-4.c: Likewise. * gcc.misc-tests/gcov-pr85332.c: Likewise. * lib/gcov.exp: Also clean gcda if fail. * gcc.m
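For background, the shape of code PR93680 is about is roughly the following; this is a hedged sketch rather than the committed gcc.misc-tests/gcov-pr93680.c, and the comments describe the intended counts, not verified gcov output. Before the fix the case label and the do-while's branch target ended up in the same basic block, so the label was reported with the loop's iteration count.

int g;

void
f (int s, int n)
{
  switch (s)
    {
    case 0:        /* expected count: once per call with s == 0 */
      do
        g++;       /* expected count: n times */
      while (--n);
      break;
    case 1:
      g--;
      break;
    }
}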
[PATCH] rs6000: Fix vec insert ilp32 ICE and test failures [PR98799]
From: "luo...@cn.ibm.com" UNSPEC_SI_FROM_SF is not supported when TARGET_DIRECT_MOVE_64BIT is false for -m32, don't generate VIEW_CONVERT_EXPR(ARRAY_REF) for variable vector insert. Remove rs6000_expand_vector_set_var helper function, adjust the p8 and p9 definitions position and make them static. The previous commit r11-6858 missed check m32, This patch is tested pass on P7BE{m32,m64}/P8BE{m32,m64}/P8LE/P9LE with RUNTESTFLAGS="--target_board =unix'{-m32,-m64}" for BE targets. gcc/ChangeLog: 2021-01-26 Xionghu Luo David Edelsohn PR target/98799 * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Don't generate VIEW_CONVERT_EXPR for m32. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var): Delete. * config/rs6000/rs6000.c (rs6000_expand_vector_set): Remove the wrapper call rs6000_expand_vector_set_var. Call rs6000_expand_vector_set_var_p9 and rs6000_expand_vector_set_var_p8 directly. (rs6000_expand_vector_set_var): Delete. gcc/testsuite/ChangeLog: 2021-01-26 Xionghu Luo PR target/98827 * gcc.target/powerpc/fold-vec-insert-char-p8.c: Adjust ilp32. * gcc.target/powerpc/fold-vec-insert-char-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-double.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-longlong.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p9.c: Likewise. * gcc.target/powerpc/pr79251.p8.c: Likewise. * gcc.target/powerpc/pr79251.p9.c: Likewise. * gcc.target/powerpc/vsx-builtin-7.c: Likewise. --- gcc/config/rs6000/rs6000-c.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 1 - gcc/config/rs6000/rs6000.c| 236 +- .../powerpc/fold-vec-insert-char-p8.c | 14 +- .../powerpc/fold-vec-insert-char-p9.c | 6 +- .../powerpc/fold-vec-insert-double.c | 10 +- .../powerpc/fold-vec-insert-float-p8.c| 12 +- .../powerpc/fold-vec-insert-float-p9.c| 6 +- .../powerpc/fold-vec-insert-int-p8.c | 13 +- .../powerpc/fold-vec-insert-int-p9.c | 9 +- .../powerpc/fold-vec-insert-longlong.c| 8 +- .../powerpc/fold-vec-insert-short-p8.c| 10 +- .../powerpc/fold-vec-insert-short-p9.c| 13 +- gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 +- gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 16 +- .../gcc.target/powerpc/vsx-builtin-7.c| 2 +- 16 files changed, 203 insertions(+), 172 deletions(-) diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index f6ee1e61b56..656cdb39f3f 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1600,7 +1600,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - if (TARGET_P8_VECTOR) + if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) { stmt = build_array_ref (loc, stmt, arg2); stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 9a46a414743..9cca7325d0d 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -58,7 +58,6 @@ extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, rtx); -extern void rs6000_expand_vector_set_var (rtx, rtx, rtx); extern void 
rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index f5565a1a253..471bf5660bd 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6977,122 +6977,10 @@ rs6000_expand_vector_init (rtx target, rtx vals) emit_move_insn (target, mem); } -/* Set field ELT_RTX of TARGET to VAL. */ - -void -rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) -{ - machine_mode mode = GET_MODE (target); - machine_mode inner_mode = GET_MODE_INNER (mode); - rtx reg = gen_reg_rtx (mode); - rtx mask, mem, x; - int width = GET_MODE_SIZE (inner_mode); - int i; - - val = force_reg (GET_MODE (val), val); - - if (VECTOR_MEM_VSX_P (mode)) -{ - if (!CONST_INT_P (elt_rtx)) - { -
Re: [PATCH] rs6000: Fix vec insert ilp32 ICE and test failures [PR98799]
Hi, On 2021/1/27 03:00, David Edelsohn wrote: > On Tue, Jan 26, 2021 at 2:46 AM Xionghu Luo wrote: >> >> From: "luo...@cn.ibm.com" >> >> UNSPEC_SI_FROM_SF is not supported when TARGET_DIRECT_MOVE_64BIT >> is false for -m32, don't generate VIEW_CONVERT_EXPR(ARRAY_REF) for >> variable vector insert. Remove rs6000_expand_vector_set_var helper >> function, adjust the p8 and p9 definitions position and make them >> static. >> >> The previous commit r11-6858 missed check m32, This patch is tested pass >> on P7BE{m32,m64}/P8BE{m32,m64}/P8LE/P9LE with >> RUNTESTFLAGS="--target_board =unix'{-m32,-m64}" for BE targets. > > Hi, Xionghu > > Thanks for addressing these failures and the cleanups. > > This patch addresses most of the failures. > > pr79251-run.c continues to fail. The directives are not complete. > I'm not certain if your intention is to run the testcase on all > targets or only on Power7 and above. The testcase relies on vector > "long long", which only is available with -mvsx, but the testcase only > enables -maltivec. I believe that the testcase happens to pass on the > Linux platforms you tested because GCC defaulted to Power7 or Power8 > ISA and the ABI specifies VSX. The testcase probably needs to be > restricted to only run on some level of VSX enabled processor (VSX? > Power8? Power9?) and also needs some additional compiler options when > compiling the testcase instead of relying upon the default > configuration of the compiler. P8BE: gcc/testsuite/gcc/gcc.sum(it didn't run before due to no 'dg-do run'): Running target unix/-m32 Running /home/luoxhu/workspace/gcc/gcc/testsuite/gcc.target/powerpc/powerpc.exp ... PASS: gcc.target/powerpc/pr79251-run.c (test for excess errors) PASS: gcc.target/powerpc/pr79251-run.c execution test === gcc Summary for unix/-m32 === # of expected passes2 Running target unix/-m64 Running /home/luoxhu/workspace/gcc/gcc/testsuite/gcc.target/powerpc/powerpc.exp ... PASS: gcc.target/powerpc/pr79251-run.c (test for excess errors) PASS: gcc.target/powerpc/pr79251-run.c execution test === gcc Summary for unix/-m64 === # of expected passes2 How did you get the failure of pr79251-run.c, please? I tested it all passes on P7BE{m32,m64}/P8BE{m32,m64}/P8LE/P9LE of Linux. This case is just verifying the *functionality* of "u = vec_insert (254, v, k)" and compare whether u[k] is changed to 254, it must work on all platforms, no matter with the optimization or not, otherwise there is a functional error. As to "long long", add target vsx_hw and powerpc like below? (Also change the -maltive to -mvsx for pr79251.p8.c/pr79251.p9.c.) --- a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c @@ -1,4 +1,6 @@ -/* { dg-options "-O2 -maltivec" } */ +/* { dg-do run { target powerpc*-*-* } } */ +/* { dg-require-effective-target vsx_hw { target powerpc*-*-* } } */ +/* { dg-options "-O2 -mvsx" } */ Any other options necessary to limit the testcases? :) > > Also, part of the change seems to be > >> - if (TARGET_P9_VECTOR || GET_MODE_SIZE (inner_mode) == 8) >> -rs6000_expand_vector_set_var_p9 (target, val, idx); >> + if ((TARGET_P9_VECTOR && TARGET_POWERPC64) || width == 8) >> + { >> + rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); >> + return; >> + } > > Does the P9 case need TARGET_POWERPC64? This optimization seemed to > be functioning on P9 in 32 bit mode prior to this fix. It would be a > shame to unnecessarily disable this optimization in 32 bit mode. 
> Or maybe it generated a functioning sequence but didn't utilize the
> optimization.  Would you please check / clarify?

>> -  if (TARGET_P8_VECTOR)
>> +  if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT)
>>      {
>>        stmt = build_array_ref (loc, stmt, arg2);
>>        stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt,

This change in rs6000-c.c means VIEW_CONVERT_EXPR(ARRAY_REF) gimple is no longer generated for P9 32-bit, so the IFN VEC_SET won't be matched and rs6000.c:rs6000_expand_vector_set_var_p9 won't be called to produce the optimized "lvsl+xxperm+lvsr" sequence for P9 32-bit anymore. It's a pity, but without this change it ICEs on P8BE 32-bit because UNSPEC_SI_FROM_SF is not supported for -m32. If we need to support P9 32-bit, why not also support P8 32-bit, since only the float vec_insert ICEs; is there any way to move SI from SF for P8 32-bit? (I verified the -m32 optimized and non-optimized binaries for int vec_insert on P8 BE 32-bit; the performance gain is also huge, about 8x with this patch.)

rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx)
{
...
  /* mtvsrd[wz] f0,tmp_val.  */
  rtx tmp_val = gen_reg_rtx (SImode);
  if (inner_mode == E_SFmode)
    emit_insn (gen_movsi_from_sf (tmp_val, val));
  else
    tmp_val = force_reg (SImode, val);
...
}

Thanks
Xionghu
[PATCH] testsuite: Run vec_insert case on P8 and P9 with option specified
Move common functions to header file for cleanup. gcc/testsuite/ChangeLog: 2021-01-27 Xionghu Luo * gcc.target/powerpc/pr79251.p8.c: Move definition to ... * gcc.target/powerpc/pr79251.h: ...this. * gcc.target/powerpc/pr79251.p9.c: Likewise. * gcc.target/powerpc/pr79251-run.c: Rename to... * gcc.target/powerpc/pr79251-run.p8.c: ...this. * gcc.target/powerpc/pr79251-run.p9.c: New test. --- .../gcc.target/powerpc/pr79251-run.c | 30 --- .../gcc.target/powerpc/pr79251-run.p8.c | 14 + .../gcc.target/powerpc/pr79251-run.p9.c | 14 + gcc/testsuite/gcc.target/powerpc/pr79251.h| 17 +++ gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 2 -- gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 2 -- 6 files changed, 45 insertions(+), 34 deletions(-) delete mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.p8.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.p9.c diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c deleted file mode 100644 index 6afd357c7ba..000 --- a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c +++ /dev/null @@ -1,30 +0,0 @@ -/* { dg-do run } */ -/* { dg-require-effective-target vsx_hw } */ -/* { dg-options "-O2 -mvsx" } */ - -#include -#include -#include "pr79251.h" - -TEST_VEC_INSERT_ALL (test) - -#define run_test(TYPE, num) \ - { \ -vector TYPE v; \ -vector TYPE u = {0x0}; \ -for (long k = 0; k < 16 / sizeof (TYPE); k++) \ - v[k] = 0xaa; \ -for (long k = 0; k < 16 / sizeof (TYPE); k++) \ - { \ - u = test##num (v, 254, k); \ - if (u[k] != (TYPE) 254)\ - __builtin_abort (); \ - } \ - } - -int -main (void) -{ - TEST_VEC_INSERT_ALL (run_test) - return 0; -} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.p8.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.p8.c new file mode 100644 index 000..47d4d288f3c --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.p8.c @@ -0,0 +1,14 @@ +/* { dg-do run } */ +/* { dg-require-effective-target p8vector_hw } */ +/* { dg-options "-O2 -mvsx -mdejagnu-cpu=power8" } */ + +#include +#include +#include "pr79251.h" + +int +main (void) +{ + TEST_VEC_INSERT_ALL (run_test) + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.p9.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.p9.c new file mode 100644 index 000..fd56b2356f4 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.p9.c @@ -0,0 +1,14 @@ +/* { dg-do run } */ +/* { dg-require-effective-target p9vector_hw } */ +/* { dg-options "-O2 -mvsx -mdejagnu-cpu=power9" } */ + +#include +#include +#include "pr79251.h" + +int +main (void) +{ + TEST_VEC_INSERT_ALL (run_test) + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.h b/gcc/testsuite/gcc.target/powerpc/pr79251.h index addb067f9ed..2684b660966 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr79251.h +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.h @@ -17,3 +17,20 @@ T (unsigned long long, 7) \ T (float, 8) \ T (double, 9) + +TEST_VEC_INSERT_ALL (test) + +#define run_test(TYPE, num) \ + { \ +vector TYPE v; \ +vector TYPE u = {0x0}; \ +for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + v[k] = 0xaa; \ +for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + { \ + u = test##num (v, 254, k); \ + if (u[k] != (TYPE) 254)
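For readers without the tree handy, each T (TYPE, num) entry expanded by TEST_VEC_INSERT_ALL in pr79251.h defines a small vec_insert wrapper that the run_test macro above exercises. One instantiation looks roughly like the following; the wrapper number, attributes and index type are illustrative and may differ from the real header.

__attribute__ ((noinline, noclone)) vector int
test4 (vector int v, int i, unsigned int n)
{
  return vec_insert (i, v, n);
}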
[PATCH] testsuite: Update pr79251 ilp32 store regex.
BE ilp32 Linux generates extra stack stwu instructions which shouldn't be counted in, \m … \M is needed around each instruction, not just the beginning and end of the entire pattern. Pre-approved, committing. gcc/testsuite/ChangeLog: 2021-02-01 Xionghu Luo * gcc.target/powerpc/pr79251.p8.c: Update regex. * gcc.target/powerpc/pr79251.p9.c: Likewise. --- gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 2 +- gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.p8.c b/gcc/testsuite/gcc.target/powerpc/pr79251.p8.c index b0e7732a38b..178e02fc866 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr79251.p8.c +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.p8.c @@ -16,7 +16,7 @@ /* { dg-final { scan-assembler-times {\mrlwinm\M} 10 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mstxvw4x\M} 6 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mstxvd2x\M} 4 { target ilp32 } } } */ -/* { dg-final { scan-assembler-times {\mstb|sth|stw|stfs|stfd\M} 12 { target ilp32 } } } */ +/* { dg-final { scan-assembler-times {\mstb\M|\msth\M|\mstw\M|\mstfs\M|\mstfd\M} 12 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mlxvw4x\M} 6 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mlxvd2x\M} 4 { target ilp32 } } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.p9.c b/gcc/testsuite/gcc.target/powerpc/pr79251.p9.c index cedb0bf7da8..2ae2da8c72e 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr79251.p9.c +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.p9.c @@ -17,6 +17,6 @@ /* { dg-final { scan-assembler-times {\mrlwinm\M} 10 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mstxv\M} 10 { target ilp32 } } } */ -/* { dg-final { scan-assembler-times {\mstb|sth|stw|stfs|stfd\M} 12 { target ilp32 } } } */ +/* { dg-final { scan-assembler-times {\mstb\M|\msth\M|\mstw\M|\mstfs\M|\mstfd\M} 12 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mlxv\M} 10 { target ilp32 } } } */ -- 2.25.1
[PATCH] rs6000: Convert the vector element register to SImode [PR98914]
v[k] will also be expanded to IFN VEC_SET if k is long type when built with -Og. -O0 didn't exposed the issue due to v is TREE_ADDRESSABLE, -O1 and above also didn't capture it because of v[k] is not optimized to VIEW_CONVERT_EXPR(v)[k_1]. vec_insert defines the element argument type to be signed int by ELFv2 ABI, so convert it to SImode if it wasn't for Power target requirements. gcc/ChangeLog: 2021-02-03 Xionghu Luo * config/rs6000/rs6000.c (rs6000_expand_vector_set): Convert elt_rtx to SImode if it wasn't. gcc/testsuite/ChangeLog: 2021-02-03 Xionghu Luo * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 17 ++--- gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 +++ 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ec068c58aa5..9f7f8da56c6 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7000,8 +7000,6 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); rtx tmp = gen_reg_rtx (GET_MODE (idx)); @@ -7047,8 +7045,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); @@ -7144,7 +7140,7 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); rtx reg = gen_reg_rtx (mode); - rtx mask, mem, x; + rtx mask, mem, x, elt_si; int width = GET_MODE_SIZE (inner_mode); int i; @@ -7154,16 +7150,23 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { if (!CONST_INT_P (elt_rtx)) { + /* elt_rtx should be SImode from ELFv2 ABI. */ + elt_si = gen_reg_rtx (E_SImode); + if (GET_MODE (elt_rtx) != E_SImode) + convert_move (elt_si, elt_rtx, 0); + else + elt_si = elt_rtx; + /* For V2DI/V2DF, could leverage the P9 version to generate xxpermdi when elt_rtx is variable. */ if ((TARGET_P9_VECTOR && TARGET_POWERPC64) || width == 8) { - rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p9 (target, val, elt_si); return; } else if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) { - rs6000_expand_vector_set_var_p8 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p8 (target, val, elt_si); return; } } diff --git a/gcc/testsuite/gcc.target/powerpc/pr98914.c b/gcc/testsuite/gcc.target/powerpc/pr98914.c new file mode 100644 index 000..e4d78e3e6b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr98914.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-Og -mvsx" } */ + +vector int +foo (vector int v) +{ + for (long k = 0; k < 1; ++k) +v[k] = 0; + return v; +} -- 2.25.1
Ping: [PATCH] rs6000: Convert the vector element register to SImode [PR98914]
Gentle ping, thanks. On 2021/2/3 17:01, Xionghu Luo wrote: v[k] will also be expanded to IFN VEC_SET if k is long type when built with -Og. -O0 didn't exposed the issue due to v is TREE_ADDRESSABLE, -O1 and above also didn't capture it because of v[k] is not optimized to VIEW_CONVERT_EXPR(v)[k_1]. vec_insert defines the element argument type to be signed int by ELFv2 ABI, so convert it to SImode if it wasn't for Power target requirements. gcc/ChangeLog: 2021-02-03 Xionghu Luo * config/rs6000/rs6000.c (rs6000_expand_vector_set): Convert elt_rtx to SImode if it wasn't. gcc/testsuite/ChangeLog: 2021-02-03 Xionghu Luo * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 17 ++--- gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 +++ 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ec068c58aa5..9f7f8da56c6 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7000,8 +7000,6 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); rtx tmp = gen_reg_rtx (GET_MODE (idx)); @@ -7047,8 +7045,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); @@ -7144,7 +7140,7 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); rtx reg = gen_reg_rtx (mode); - rtx mask, mem, x; + rtx mask, mem, x, elt_si; int width = GET_MODE_SIZE (inner_mode); int i; @@ -7154,16 +7150,23 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { if (!CONST_INT_P (elt_rtx)) { + /* elt_rtx should be SImode from ELFv2 ABI. */ + elt_si = gen_reg_rtx (E_SImode); + if (GET_MODE (elt_rtx) != E_SImode) + convert_move (elt_si, elt_rtx, 0); + else + elt_si = elt_rtx; + /* For V2DI/V2DF, could leverage the P9 version to generate xxpermdi when elt_rtx is variable. */ if ((TARGET_P9_VECTOR && TARGET_POWERPC64) || width == 8) { - rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p9 (target, val, elt_si); return; } else if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) { - rs6000_expand_vector_set_var_p8 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p8 (target, val, elt_si); return; } } diff --git a/gcc/testsuite/gcc.target/powerpc/pr98914.c b/gcc/testsuite/gcc.target/powerpc/pr98914.c new file mode 100644 index 000..e4d78e3e6b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr98914.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-Og -mvsx" } */ + +vector int +foo (vector int v) +{ + for (long k = 0; k < 1; ++k) +v[k] = 0; + return v; +} -- Thanks, Xionghu
[PATCH v2] rs6000: Convert the vector element register to SImode [PR98914]
vec_insert defines the element argument type to be signed int by ELFv2 ABI, When expanding a vector with a variable rtx, convert the rtx type SImode. gcc/ChangeLog: 2021-02-24 Xionghu Luo PR target/98914 * config/rs6000/rs6000.c (rs6000_expand_vector_set): Convert elt_rtx to SImode. (rs6000_expand_vector_set_var_p9): Remove assert. (rs6000_expand_vector_set_var_p8): Likewise. gcc/testsuite/ChangeLog: 2021-02-24 Xionghu Luo * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 17 ++--- gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 +++ 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ec068c58aa5..9f7f8da56c6 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7000,8 +7000,6 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); rtx tmp = gen_reg_rtx (GET_MODE (idx)); @@ -7047,8 +7045,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); @@ -7144,7 +7140,7 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); rtx reg = gen_reg_rtx (mode); - rtx mask, mem, x; + rtx mask, mem, x, elt_si; int width = GET_MODE_SIZE (inner_mode); int i; @@ -7154,16 +7150,23 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { if (!CONST_INT_P (elt_rtx)) { + /* elt_rtx should be SImode from ELFv2 ABI. */ + elt_si = gen_reg_rtx (E_SImode); + if (GET_MODE (elt_rtx) != E_SImode) + convert_move (elt_si, elt_rtx, 0); + else + elt_si = elt_rtx; + /* For V2DI/V2DF, could leverage the P9 version to generate xxpermdi when elt_rtx is variable. */ if ((TARGET_P9_VECTOR && TARGET_POWERPC64) || width == 8) { - rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p9 (target, val, elt_si); return; } else if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) { - rs6000_expand_vector_set_var_p8 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p8 (target, val, elt_si); return; } } diff --git a/gcc/testsuite/gcc.target/powerpc/pr98914.c b/gcc/testsuite/gcc.target/powerpc/pr98914.c new file mode 100644 index 000..e4d78e3e6b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr98914.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-Og -mvsx" } */ + +vector int +foo (vector int v) +{ + for (long k = 0; k < 1; ++k) +v[k] = 0; + return v; +} -- 2.25.1
Re: [PATCH v2] rs6000: Convert the vector element register to SImode [PR98914]
On 2021/2/25 00:57, Segher Boessenkool wrote: > Hi! > > On Wed, Feb 24, 2021 at 09:06:24AM +0800, Xionghu Luo wrote: >> vec_insert defines the element argument type to be signed int by ELFv2 >> ABI, When expanding a vector with a variable rtx, convert the rtx type >> SImode. > > But that is true for the intrinsics, not for all other callers of > rs6000_expand_vector_init. See > <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98914#c2> as well? > > So I don't think you do this in the right place. You can convince me > with good arguments of course :-) Thanks for pointing out, it seems we should convert the type to DImode in rs6000_expand_vector_set_var_p9 and rs6000_expand_vector_set_var_p8 to support both usage? PS: for "vec_insert (i, u, n)" usage when n is long, what should the front-end do in altivec_resolve_overloaded_builtin to follow the ELFv2 rule? Currently, no warning/error message or conversion there, INTEGRAL_TYPE_P range is much larger than signed int. gcc/config/rs6000/rs6000-c.c altivec_resolve_overloaded_builtin { ... if (!INTEGRAL_TYPE_P (TREE_TYPE (arg2))) goto bad; ... } Updated the back-end patch as below. 0001-rs6000-Convert-the-vector-set-variable-idx-to-DImode.patch vec_insert defines the element argument type to be signed int by ELFv2 ABI. When expanding a vector with a variable rtx, convert the rtx type to DImode to support both intrinsic usage and other callers from rs6000_expand_vector_init produced by v[k] = val when k is long type. gcc/ChangeLog: 2021-02-25 Xionghu Luo PR target/98914 * config/rs6000/rs6000.c (rs6000_expand_vector_set_var_p9): Convert idx to DImode. (rs6000_expand_vector_set_var_p8): Likewise. gcc/testsuite/ChangeLog: 2021-02-25 Xionghu Luo PR target/98914 * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 33 +- gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 2 files changed, 30 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ec068c58aa5..48eb91132a9 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7000,11 +7000,15 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); - rtx tmp = gen_reg_rtx (GET_MODE (idx)); + machine_mode idx_mode = GET_MODE (idx); + rtx tmp = gen_reg_rtx (DImode); + if (idx_mode != DImode) +tmp = convert_modes (DImode, idx_mode, idx, 0); + else +tmp = idx; + int width = GET_MODE_SIZE (inner_mode); gcc_assert (width >= 1 && width <= 8); @@ -7012,9 +7016,7 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) int shift = exact_log2 (width); /* Generate the IDX for permute shift, width is the vector element size. idx = idx * width. */ - emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (shift))); - - tmp = convert_modes (DImode, SImode, tmp, 1); + emit_insn (gen_ashldi3 (tmp, tmp, GEN_INT (shift))); /* lvsrv1,0,idx. 
*/ rtx pcvr = gen_reg_rtx (V16QImode); @@ -7047,27 +7049,31 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); - rtx tmp = gen_reg_rtx (GET_MODE (idx)); int width = GET_MODE_SIZE (inner_mode); + machine_mode idx_mode = GET_MODE (idx); + rtx tmp = gen_reg_rtx (DImode); + if (idx_mode != DImode) +tmp = convert_modes (DImode, idx_mode, idx, 0); + else +tmp = idx; + gcc_assert (width >= 1 && width <= 4); if (!BYTES_BIG_ENDIAN) { /* idx = idx * width. */ - emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); + emit_insn (gen_muldi3 (tmp, tmp, GEN_INT (width))); /* idx = idx + 8. */ - emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (8))); + emit_insn (gen_adddi3 (tmp, tmp, GEN_INT (8))); } else { - emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); - emit_insn (gen_subsi3 (tmp, GEN_INT (24 - width), tmp)); + emit_insn (gen_muldi3 (tmp, idx, GEN_INT (width))); + emit_insn (gen_subdi3 (tmp, GEN_INT (24 - width), tmp)); } /* lxv vs33, mask. @@ -7118,7 +7124,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) emit_insn (gen_rtx_SET (val_v16qi, sub_val)); /* lvsl13,0,idx. */ - tmp = convert_modes (DImode, SImode, tmp, 1); rtx pcv = gen_reg_rtx (V16QImode); emit_insn (gen_altivec_lvsl_reg (pcv, tmp)); diff --git a/gc
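As a concrete illustration of the front-end question above about "vec_insert (i, u, n)" with a long n, the source-level usage is simply the following (a hedged example, not taken from the testsuite; the function name is made up and it needs -maltivec/-mvsx to compile):

#include <altivec.h>

vector int
insert_with_long_index (vector int u, int i, long n)
{
  /* The ELFv2 ABI documents the element number of vec_insert as signed
     int; a 'long' n reaches the builtin through the usual implicit
     conversion.  */
  return vec_insert (i, u, n);
}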
Ping: [PATCH v2] rs6000: Convert the vector element register to SImode [PR98914]
On 2021/2/25 14:33, Xionghu Luo via Gcc-patches wrote: > > > On 2021/2/25 00:57, Segher Boessenkool wrote: >> Hi! >> >> On Wed, Feb 24, 2021 at 09:06:24AM +0800, Xionghu Luo wrote: >>> vec_insert defines the element argument type to be signed int by ELFv2 >>> ABI, When expanding a vector with a variable rtx, convert the rtx type >>> SImode. >> >> But that is true for the intrinsics, not for all other callers of >> rs6000_expand_vector_init. See >> <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98914#c2> as well? >> >> So I don't think you do this in the right place. You can convince me >> with good arguments of course :-) > > Thanks for pointing out, it seems we should convert the type to DImode in > rs6000_expand_vector_set_var_p9 and rs6000_expand_vector_set_var_p8 > to support both usage? > > > PS: for "vec_insert (i, u, n)" usage when n is long, what should the front-end > do in altivec_resolve_overloaded_builtin to follow the ELFv2 rule? Currently, > no warning/error message or conversion there, INTEGRAL_TYPE_P range is much > larger > than signed int. long to int should follow implicit transformation, so no need change here. Ping the patch, thanks. BR, Xionghu > > > > Updated the back-end patch as below. > > > 0001-rs6000-Convert-the-vector-set-variable-idx-to-DImode.patch > > > vec_insert defines the element argument type to be signed int by ELFv2 > ABI. When expanding a vector with a variable rtx, convert the rtx type > to DImode to support both intrinsic usage and other callers from > rs6000_expand_vector_init produced by v[k] = val when k is long type. > > gcc/ChangeLog: > > 2021-02-25 Xionghu Luo > > PR target/98914 > * config/rs6000/rs6000.c (rs6000_expand_vector_set_var_p9): > Convert idx to DImode. > (rs6000_expand_vector_set_var_p8): Likewise. > > gcc/testsuite/ChangeLog: > > 2021-02-25 Xionghu Luo > > PR target/98914 > * gcc.target/powerpc/pr98914.c: New test. > --- > gcc/config/rs6000/rs6000.c | 33 +- > gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 > 2 files changed, 30 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c > > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c > index ec068c58aa5..48eb91132a9 100644 > --- a/gcc/config/rs6000/rs6000.c > +++ b/gcc/config/rs6000/rs6000.c > @@ -7000,11 +7000,15 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, > rtx idx) > > gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); > > - gcc_assert (GET_MODE (idx) == E_SImode); > - > machine_mode inner_mode = GET_MODE (val); > > - rtx tmp = gen_reg_rtx (GET_MODE (idx)); > + machine_mode idx_mode = GET_MODE (idx); > + rtx tmp = gen_reg_rtx (DImode); > + if (idx_mode != DImode) > +tmp = convert_modes (DImode, idx_mode, idx, 0); > + else > +tmp = idx; > + > int width = GET_MODE_SIZE (inner_mode); > > gcc_assert (width >= 1 && width <= 8); > @@ -7012,9 +7016,7 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, > rtx idx) > int shift = exact_log2 (width); > /* Generate the IDX for permute shift, width is the vector element size. >idx = idx * width. */ > - emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (shift))); > - > - tmp = convert_modes (DImode, SImode, tmp, 1); > + emit_insn (gen_ashldi3 (tmp, tmp, GEN_INT (shift))); > > /* lvsrv1,0,idx. 
*/ > rtx pcvr = gen_reg_rtx (V16QImode); > @@ -7047,27 +7049,31 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, > rtx idx) > > gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); > > - gcc_assert (GET_MODE (idx) == E_SImode); > - > machine_mode inner_mode = GET_MODE (val); > HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); > > - rtx tmp = gen_reg_rtx (GET_MODE (idx)); > int width = GET_MODE_SIZE (inner_mode); > > + machine_mode idx_mode = GET_MODE (idx); > + rtx tmp = gen_reg_rtx (DImode); > + if (idx_mode != DImode) > +tmp = convert_modes (DImode, idx_mode, idx, 0); > + else > +tmp = idx; > + > gcc_assert (width >= 1 && width <= 4); > > if (!BYTES_BIG_ENDIAN) > { > /* idx = idx * width. */ > - emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); > + emit_insn (gen_muldi3 (tmp, tmp, GEN_INT (width))); > /* idx = idx + 8. */ > - emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (8))); > + emit_insn (ge
[PATCH] Fix loop split incorrect count and probability
loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. Regression tested pass, OK for master? diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] - [local count: 315357972]: + [local count: 104068130]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [INV] +goto ; [67.00%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [count: 0]: + [local count: 70429947]: # i_23 = PHI # prephitmp_25 = PHI - [local count: 955630225]: + [local count: 640272252]: # i_15 = PHI # prephitmp_16 = PHI i_22 = inc (i_15); if (n_7(D) > i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3a09bbc39e5..8e5a7ded0f7 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -583,10 +583,10 @@ split_loop (class loop *loop1) basic_block cond_bb; class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); gcc_assert (loop2); @@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, -profile_probability::always (), -profile_probability::never (), -profile_probability::always (), -profile_probability::always (), +invar_branch->probability.invert (), +invar_branch->probability, +invar_branch->probability.invert (), +invar_branch->probability, true); if (!loop2) { -- 2.25.1
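For context, the dump above comes from a loop of roughly the following shape (a sketch in the spirit of the loop-cond-split-1.c testcase, not a verbatim copy). The condition on ga is semi-invariant: it can only flip once during the loop because ga is only rewritten on the taken path, which is what do_split_loop_on_cond keys on when it creates the two loop versions whose counts and probabilities this patch fixes.

extern int inc (int);
extern int do_something (void);
int ga;

void
test (int n)
{
  for (int i = 0; n > i; i = inc (i))
    {
      /* Semi-invariant: once ga becomes zero the branch is never taken
         again, so ga cannot become nonzero inside the loop.  */
      if (ga)
        ga = do_something ();
    }
}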
Re: [PATCH] Fix loop split incorrect count and probability
I' like to split this patch: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html to two patches: 0001-Fix-loop-split-incorrect-count-and-probability.patch 0002-Don-t-move-cold-code-out-of-loop-by-checking-bb-coun.patch since they are solving two different things, please help to review the attached series. They show obvious performance improvement on both P8 and P9 for CPU2017, and I am not sure how it will affect other platforms like X86 and AArch64, it will be grateful if someone could try it. Thanks. Xionghu From 4e1ef5b1f423484a6789750e7cc0cf2e94517f20 Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Tue, 3 Aug 2021 03:44:14 -0500 Subject: [PATCH 1/2] Fix loop split incorrect count and probability loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. Regression tested pass, OK for master? diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] - [local count: 315357972]: + [local count: 104068130]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [INV] +goto ; [67.00%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [count: 0]: + [local count: 70429947]: # i_23 = PHI # prephitmp_25 = PHI - [local count: 955630225]: + [local count: 640272252]: # i_15 = PHI # prephitmp_16 = PHI i_22 = inc (i_15); if (n_7(D) > i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 25 - 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3f6ad046623..d30782888f3 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -575,7 +575,11 @@ split_loop (class loop *loop1) stmts2); tree cond = build2 (guard_code, boolean_type_node, guard_init, border); if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); /* Now version the loop, placing loop2 after loop1 connecting them, and fix up SSA form for that. 
*/ @@ -583,10 +587,10 @@ split_loop (class loop *loop1) basic_block cond_bb; class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); gcc_assert (loop2); @@ -1486,10 +1490,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, -profile_probability::always (), -profile_probability::never (), -profile_probability::always (), -profile_probability::always (), +invar_branch->probability.invert (), +
Re: [PATCH] Fix loop split incorrect count and probability
Thanks, On 2021/8/6 19:46, Richard Biener wrote: > On Tue, 3 Aug 2021, Xionghu Luo wrote: > >> loop split condition is moved between loop1 and loop2, the split bb's >> count and probability should also be duplicated instead of (100% vs INV), >> secondly, the original loop1 and loop2 count need be propotional from the >> original loop. >> >> >> diff base/loop-cond-split-1.c.151t.lsplit >> patched/loop-cond-split-1.c.151t.lsplit: >> ... >> int prephitmp_16; >> int prephitmp_25; >> >> [local count: 118111600]: >> if (n_7(D) > 0) >> goto ; [89.00%] >> else >> goto ; [11.00%] >> >> [local count: 118111600]: >> return; >> >> [local count: 105119324]: >> pretmp_3 = ga; >> >> - [local count: 955630225]: >> + [local count: 315357973]: >> # i_13 = PHI >> # prephitmp_12 = PHI >> if (prephitmp_12 != 0) >> goto ; [33.00%] >> else >> goto ; [67.00%] >> >> - [local count: 315357972]: >> + [local count: 104068130]: >> _2 = do_something (); >> ga = _2; >> >> - [local count: 955630225]: >> + [local count: 315357973]: >> # prephitmp_5 = PHI >> i_10 = inc (i_13); >> if (n_7(D) > i_10) >> goto ; [89.00%] >> else >> goto ; [11.00%] >> >> [local count: 105119324]: >> goto ; [100.00%] >> >> - [local count: 850510901]: >> + [local count: 280668596]: >> if (prephitmp_12 != 0) >> -goto ; [100.00%] >> +goto ; [33.00%] >> else >> -goto ; [INV] >> +goto ; [67.00%] >> >> - [local count: 850510901]: >> + [local count: 280668596]: >> goto ; [100.00%] >> >> - [count: 0]: >> + [local count: 70429947]: >> # i_23 = PHI >> # prephitmp_25 = PHI >> >> - [local count: 955630225]: >> + [local count: 640272252]: >> # i_15 = PHI >> # prephitmp_16 = PHI >> i_22 = inc (i_15); >> if (n_7(D) > i_22) >> goto ; [89.00%] >> else >> goto ; [11.00%] >> >> - [local count: 850510901]: >> + [local count: 569842305]: >> goto ; [100.00%] >> >> } >> >> gcc/ChangeLog: >> >> * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. >> (do_split_loop_on_cond): Likewise. >> --- >> gcc/tree-ssa-loop-split.c | 16 >> 1 file changed, 8 insertions(+), 8 deletions(-) >> >> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >> index 3a09bbc39e5..8e5a7ded0f7 100644 >> --- a/gcc/tree-ssa-loop-split.c >> +++ b/gcc/tree-ssa-loop-split.c >> @@ -583,10 +583,10 @@ split_loop (class loop *loop1) >> basic_block cond_bb; if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); >> >> class loop *loop2 = loop_version (loop1, cond, &cond_bb, >> - profile_probability::always (), >> - profile_probability::always (), >> - profile_probability::always (), >> - profile_probability::always (), >> + true_edge->probability, >> + true_edge->probability.invert (), >> + true_edge->probability, >> + true_edge->probability.invert (), >> true); > > there is no 'true_edge' variable at this point. Sorry, missed the above hunk when split the patch. > >> gcc_assert (loop2); >> >> @@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge >> invar_branch) >> initialize_original_copy_tables (); >> >> struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, >> - profile_probability::always (), >> - profile_probability::never (), >> - profile_probability::always (), >> -
Re: [RFC] Don't move cold code out of loop by checking bb count
Hi, On 2021/8/6 20:15, Richard Biener wrote: > On Mon, Aug 2, 2021 at 7:05 AM Xiong Hu Luo wrote: >> >> There was a patch trying to avoid move cold block out of loop: >> >> https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html >> >> Richard suggested to "never hoist anything from a bb with lower execution >> frequency to a bb with higher one in LIM invariantness_dom_walker >> before_dom_children". >> >> This patch does this profile count check in both gimple LIM >> move_computations_worker and RTL loop-invariant.c find_invariants_bb, >> if the loop bb is colder than loop preheader, don't hoist it out of >> loop. >> >> Also, the profile count in loop split pass should be corrected to avoid >> lim2 and lim4 mismatch behavior, currently, the new loop preheader generated >> by loop_version is set to "[count: 0]:", then lim4 after lsplt pass will >> move statement out of loop unexpectely when lim2 didn't move it. This >> change could fix regression on 544.nab_r from -1.55% to +0.46%. >> >> SPEC2017 performance evaluation shows 1% performance improvement for >> intrate GEOMEAN and no obvious regression for others. Especially, >> 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is >> largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% >> on P8LE. >> >> Regression and bootstrap tested pass on P8LE, any comments? Thanks. > > While I'm not familiar with the RTL invariant motion pass the patch there > looks reasonable. Note that we should assess the profile quality > somehow - I'm not sure how to do that, CCed Honza for that. Thanks. > > For the GIMPLE part the patch looks quite complicated - but note it > probably has to be since LIM performs kind of a "CSE" on loads > (and stores for store-motion), so when there are multiple stmts > affected by a hoisting decision the biggest block count has to be > accounted. Likewise when there are dependent stmts involved > that might include conditional stmts (a "PHI"), but the overall > cost should be looked at. Currently, The gimple code check two situations with the patch: 1) The statement or PHI‘s BB is *colder* then preheader, don't move it out of loop; 2) The statement or PHI's BB is *hotter* then preheader, but any of it's rhs couldn't be moved out of loop, also don't move it out of loop to avoid definition not dominates use error. May be I could collect the number of instructions not hoisted with the patch on regression tests and SPEC2017 to do a estimation for "multiple stmts affected" and "overall cost" need to be considered? But it seems move_computations_worker couldn't rollback if we still want to hoist multiple stmts out during the iterations? > > Now - GIMPLE LIM "costing" is somewhat backward right now > and it isn't set up to consider those multiple involved stmts. Plus > the store-motion part does not have any cost part (but it depends > on previously decided invariant motions). > > I think the way you implemented the check will cause no hoisting > to be performed instead of, say, hoisting to a different loop level > only. Possibly shown when you consider a loop nest like > >for (;;) > if (unlikely_cond) >for (;;) > invariant; > > we want to hoist 'invariant' but only from the inner loop even if it > is invariant also in the outer loop. 
For this case, theoretically I think the master GCC will optimize it to:

   invariant;
   for (;;)
     if (unlikely_cond)
       for (;;)
         ;

'invariant' is moved out of the outer loop, but with the patch, it will get:

   for (;;)
     if (unlikely_cond)
       {
         invariant;
         for (;;)
           ;
       }

'invariant' is *cold* for the outer loop, but it is still *hot* for the inner loop, so it is only hoisted out of the inner loop, which is exactly what we want, right?

> But for example if there is
> a store motion opportunity like
>
>    for (;;)
>      {
>        if (unlikely_cond)
>          for (;;)
>            a = ...;
>        a = ...;
>      }
>
> we'd still want to perform the store motion on the outer loop.
>
> Note that store-motion already performs part of the transform
> before dependent code is moved in move_computations (that
> you patched).

Yes.  do_store_motion runs before move_computations_worker; store motion happens earlier in execute_sm, and I also added a check in execute_sm to stop cold stores from being moved out of the loop.  So for your case, I think my patch will similarly optimize it to:

   for (;;)
     {
       if (unlikely_cond)
         {
           for (;;)
             ;
           a = ...;
         }
     }
   a = ...;

Is this better?  I will construct cases to verify it.

> IIRC your main concern were the COND_EXPRs we insert
> for hoisted conditional stmts?

I'm not sure what you mean by COND_EXPRs here?

Thanks,
Xionghu

> Thanks,
> Richard.
>
>> gcc/ChangeLog:
>>
>>      * loop-invariant.c (find_invariants_bb): Check profile count
>>      before motion.
>>      (find_invar
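A compilable variant of the shape being discussed (my own sketch, not from the patch's testsuite), where the invariant computation sits under an unlikely condition and should only be hoisted to the inner loop's preheader, not above the outer loop:

	extern int a, b;

	void
	g (int n, int unlikely_cond, int *out)
	{
	  for (int i = 0; i < n; i++)
	    if (__builtin_expect (unlikely_cond, 0))
	      for (int j = 0; j < n; j++)
	        out[j] = a * b;   /* a * b is invariant in both loops */
	}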
Re: [PATCH] Fix loop split incorrect count and probability
On 2021/8/10 22:47, Richard Biener wrote: > On Mon, 9 Aug 2021, Xionghu Luo wrote: > >> Thanks, >> >> On 2021/8/6 19:46, Richard Biener wrote: >>> On Tue, 3 Aug 2021, Xionghu Luo wrote: >>> >>>> loop split condition is moved between loop1 and loop2, the split bb's >>>> count and probability should also be duplicated instead of (100% vs INV), >>>> secondly, the original loop1 and loop2 count need be propotional from the >>>> original loop. >>>> >>>> >>>> diff base/loop-cond-split-1.c.151t.lsplit >>>> patched/loop-cond-split-1.c.151t.lsplit: >>>> ... >>>> int prephitmp_16; >>>> int prephitmp_25; >>>> >>>> [local count: 118111600]: >>>> if (n_7(D) > 0) >>>>goto ; [89.00%] >>>> else >>>>goto ; [11.00%] >>>> >>>> [local count: 118111600]: >>>> return; >>>> >>>> [local count: 105119324]: >>>> pretmp_3 = ga; >>>> >>>> - [local count: 955630225]: >>>> + [local count: 315357973]: >>>> # i_13 = PHI >>>> # prephitmp_12 = PHI >>>> if (prephitmp_12 != 0) >>>>goto ; [33.00%] >>>> else >>>>goto ; [67.00%] >>>> >>>> - [local count: 315357972]: >>>> + [local count: 104068130]: >>>> _2 = do_something (); >>>> ga = _2; >>>> >>>> - [local count: 955630225]: >>>> + [local count: 315357973]: >>>> # prephitmp_5 = PHI >>>> i_10 = inc (i_13); >>>> if (n_7(D) > i_10) >>>>goto ; [89.00%] >>>> else >>>>goto ; [11.00%] >>>> >>>> [local count: 105119324]: >>>> goto ; [100.00%] >>>> >>>> - [local count: 850510901]: >>>> + [local count: 280668596]: >>>> if (prephitmp_12 != 0) >>>> -goto ; [100.00%] >>>> +goto ; [33.00%] >>>> else >>>> -goto ; [INV] >>>> +goto ; [67.00%] >>>> >>>> - [local count: 850510901]: >>>> + [local count: 280668596]: >>>> goto ; [100.00%] >>>> >>>> - [count: 0]: >>>> + [local count: 70429947]: >>>> # i_23 = PHI >>>> # prephitmp_25 = PHI >>>> >>>> - [local count: 955630225]: >>>> + [local count: 640272252]: >>>> # i_15 = PHI >>>> # prephitmp_16 = PHI >>>> i_22 = inc (i_15); >>>> if (n_7(D) > i_22) >>>>goto ; [89.00%] >>>> else >>>>goto ; [11.00%] >>>> >>>> - [local count: 850510901]: >>>> + [local count: 569842305]: >>>> goto ; [100.00%] >>>> >>>>} >>>> >>>> gcc/ChangeLog: >>>> >>>>* tree-ssa-loop-split.c (split_loop): Fix incorrect probability. >>>>(do_split_loop_on_cond): Likewise. >>>> --- >>>>gcc/tree-ssa-loop-split.c | 16 >>>>1 file changed, 8 insertions(+), 8 deletions(-) >>>> >>>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >>>> index 3a09bbc39e5..8e5a7ded0f7 100644 >>>> --- a/gcc/tree-ssa-loop-split.c >>>> +++ b/gcc/tree-ssa-loop-split.c >>>> @@ -583,10 +583,10 @@ split_loop (class loop *loop1) >>>>basic_block cond_bb; >> >> if (!initial_true) >> - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); >> + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); >> + >> +edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE >> + ? EDGE_SUCC (bbs[i], 0) >> + : EDGE_SUCC (bbs[i], 1); >> >>>> >>>>class loop *loop2 = loop_version (loop1, cond, &cond_bb, >>>> - profile_probability::always (), >>>> - profile_probability::always (), >>>> - profile_probability::always (), >>
Re: [PATCH] Fix loop split incorrect count and probability
On 2021/8/11 17:16, Richard Biener wrote: On Wed, 11 Aug 2021, Xionghu Luo wrote: On 2021/8/10 22:47, Richard Biener wrote: On Mon, 9 Aug 2021, Xionghu Luo wrote: Thanks, On 2021/8/6 19:46, Richard Biener wrote: On Tue, 3 Aug 2021, Xionghu Luo wrote: loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] - [local count: 315357972]: + [local count: 104068130]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [INV] +goto ; [67.00%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [count: 0]: + [local count: 70429947]: # i_23 = PHI # prephitmp_25 = PHI - [local count: 955630225]: + [local count: 640272252]: # i_15 = PHI # prephitmp_16 = PHI i_22 = inc (i_15); if (n_7(D) > i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3a09bbc39e5..8e5a7ded0f7 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -583,10 +583,10 @@ split_loop (class loop *loop1) basic_block cond_bb; if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); there is no 'true_edge' variable at this point. Sorry, missed the above hunk when split the patch. gcc_assert (loop2); @@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, -profile_probability::always (), -profile_probability::never (), -profile_probability::always (), -profile_probability::always (), +invar_branch->probability.invert (), +invar_branch->probability, +invar_branch->probability.invert (), +invar_branch->probability, true); if (!loop2) { The patch introduction seems to talk about do_split_loop_on_cond only. split_loop faces similar issue though it sets the two branches to 100% vs 100% and no scaling which seems also incorrect. 
Since loop versioning inserts a condition with the passed probabilities but in this case a 'boolean_true_node' condition the th
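As a quick sanity check of the scaled counts in the lsplit dump quoted earlier (a standalone sketch; the pass uses fixed-point profile_probability arithmetic, so the exact values differ by a small rounding error):

	#include <stdio.h>

	int
	main (void)
	{
	  long long orig = 955630225;            /* original loop body count */
	  printf ("%lld\n", orig * 33 / 100);    /* ~315357974: loop1 body (33%) */
	  printf ("%lld\n", orig * 67 / 100);    /* ~640272250: loop2 body (67%) */
	  return 0;
	}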
Re: [PATCH] Fix incorrect computation in fill_always_executed_in_1
Hi, On 2021/8/16 19:46, Richard Biener wrote: On Mon, 16 Aug 2021, Xiong Hu Luo wrote: It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for nested loops. inn_loop is updated to inner loop, so it need be restored when exiting from innermost loop. With this patch, the store instruction in outer loop could also be moved out of outer loop by store motion. Any comments? Thanks. gcc/ChangeLog: * tree-ssa-loop-im.c (fill_always_executed_in_1): Restore inn_loop when exiting from innermost loop. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/ssa-lim-19.c: New test. --- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++ gcc/tree-ssa-loop-im.c | 6 +- 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c new file mode 100644 index 000..097a5ee4a4b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c @@ -0,0 +1,24 @@ +/* PR/101293 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ + +struct X { int i; int j; int k;}; + +void foo(struct X *x, int n, int l) +{ + for (int j = 0; j < l; j++) +{ + for (int i = 0; i < n; ++i) + { + int *p = &x->j; + int tem = *p; + x->j += tem * i; + } + int *r = &x->k; + int tem2 = *r; + x->k += tem2 * j; +} +} + +/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } } */ + diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index b24bc64f2a7..5ca4738b20e 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; + if (inn_loop != loop + && flow_loop_nested_p (bb->loop_father, inn_loop)) + inn_loop = bb->loop_father; + The comment says /* In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it. */ inn_loop = bb->loop_father; and your change would defeat that early return, no? The issue is the search method exits too early when iterating the outer loop. For example of a nested loop, loop 1 includes 5,8,3,10,4,9 and loop2 includes 3,10. Currently, it breaks when bb is 3 as bb 3 doesn't dominate bb 9 of loop 1. But actually, both bb 5 and bb 4 are ALWAYS_EXECUTED for loop 1, so if there are store instructions in bb 4 they won't be processed by store motion again. 5< |\ | 8 \ 9 | \ | --->3--->4 || 10---| SET_ALWAYS_EXECUTED_IN is only set to bb 5 on master code now, with this patch, it will continue search when meet bb 3 until bb 4, then last is updated to bb 4, it will break until exit edge is found at bb 4 by "if (!flow_bb_inside_loop_p (loop, e->dest))". Then the followed loop code will set bb 4 as ALWAYS_EXEUCTED and all it's idoms bb 5. while (1) { SET_ALWAYS_EXECUTED_IN (last, loop); if (last == loop->header) break; last = get_immediate_dominator (CDI_DOMINATORS, last); } After further discussion with Kewen, we found that the inn_loop variable is totally useless and could be removed. if (bitmap_bit_p (contains_call, bb->index)) break; @@ -3238,7 +3242,7 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (bb->loop_father->header == bb) { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) + if (!dominated_by_p (CDI_DOMINATORS, bb->loop_father->latch, bb)) break; That's now a always false condition - a loops latch is always dominated by its header. 
The condition as written tries to verify whether the loop is always entered - mind we visit all blocks, not only those always executed. Thanks for the catch! I am afraid the piece of code should be removed since it stops search of potential ALWAYS EXECUTED bb after inner loop... In fact for your testcase the x->j ref is _not_ always executed since the inner loop is conditional on n > 0. Yes. But I want to move x->k (not x->j) out of loop 1 when l > 0 in store-motion. Attached the diff file without and with my patch to show the extra optimization. x->j is already moved out of loop 2 on master code. If change n and l to constant numbers like 100, master code could also do 2 store motions as expected. The edge from bb 5 to bb 4 doesn't exist now, so bb 4, bb 3 and bb 5 are ALWAYS EXECUTED for loop 1. struct X { int i; int j; int k;}; void foo(struct X *x, int n, int l) { for (int j = 0; j < l; j++) // loop 1 { for (int i = 0; i < n; ++i) // loop 2 { int *p = &x->j;
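As noted above, with constant trip counts the current code already performs both store motions; a minimal variant of the testcase making that concrete (my own sketch, derived from the ssa-lim-19.c testcase in the patch):

	struct X { int i; int j; int k; };

	void
	foo_const (struct X *x)
	{
	  for (int j = 0; j < 100; j++)        /* loop 1, known to iterate */
	    {
	      for (int i = 0; i < 100; ++i)    /* loop 2, known to iterate */
	        x->j += x->j * i;
	      x->k += x->k * j;                /* candidate for outer store motion */
	    }
	}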
Re: [PATCH] Fix incorrect computation in fill_always_executed_in_1
On 2021/8/17 13:17, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/8/16 19:46, Richard Biener wrote: On Mon, 16 Aug 2021, Xiong Hu Luo wrote: It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for nested loops. inn_loop is updated to inner loop, so it need be restored when exiting from innermost loop. With this patch, the store instruction in outer loop could also be moved out of outer loop by store motion. Any comments? Thanks. gcc/ChangeLog: * tree-ssa-loop-im.c (fill_always_executed_in_1): Restore inn_loop when exiting from innermost loop. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/ssa-lim-19.c: New test. --- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++ gcc/tree-ssa-loop-im.c | 6 +- 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c new file mode 100644 index 000..097a5ee4a4b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c @@ -0,0 +1,24 @@ +/* PR/101293 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ + +struct X { int i; int j; int k;}; + +void foo(struct X *x, int n, int l) +{ + for (int j = 0; j < l; j++) + { + for (int i = 0; i < n; ++i) + { + int *p = &x->j; + int tem = *p; + x->j += tem * i; + } + int *r = &x->k; + int tem2 = *r; + x->k += tem2 * j; + } +} + +/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } } */ + diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index b24bc64f2a7..5ca4738b20e 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; + if (inn_loop != loop + && flow_loop_nested_p (bb->loop_father, inn_loop)) + inn_loop = bb->loop_father; + The comment says /* In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it. */ inn_loop = bb->loop_father; and your change would defeat that early return, no? The issue is the search method exits too early when iterating the outer loop. For example of a nested loop, loop 1 includes 5,8,3,10,4,9 and loop2 includes 3,10. Currently, it breaks when bb is 3 as bb 3 doesn't dominate bb 9 of loop 1. But actually, both bb 5 and bb 4 are ALWAYS_EXECUTED for loop 1, so if there are store instructions in bb 4 they won't be processed by store motion again. 5< |\ | 8 \ 9 | \ | --->3--->4 | | 10---| Correct the graph display: 5< |\ | 8 \ 9 | \ | --->3--->4 | | ---10 SET_ALWAYS_EXECUTED_IN is only set to bb 5 on master code now, with this patch, it will continue search when meet bb 3 until bb 4, then last is updated to bb 4, it will break until exit edge is found at bb 4 by "if (!flow_bb_inside_loop_p (loop, e->dest))". Then the followed loop code will set bb 4 as ALWAYS_EXEUCTED and all it's idoms bb 5. while (1) { SET_ALWAYS_EXECUTED_IN (last, loop); if (last == loop->header) break; last = get_immediate_dominator (CDI_DOMINATORS, last); } After further discussion with Kewen, we found that the inn_loop variable is totally useless and could be removed. 
if (bitmap_bit_p (contains_call, bb->index)) break; @@ -3238,7 +3242,7 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (bb->loop_father->header == bb) { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) + if (!dominated_by_p (CDI_DOMINATORS, bb->loop_father->latch, bb)) break; That's now a always false condition - a loops latch is always dominated by its header. The condition as written tries to verify whether the loop is always entered - mind we visit all blocks, not only those always executed. Thanks for the catch! I am afraid the piece of code should be removed since it stops search of potential ALWAYS EXECUTED bb after inner loop... In fact for your testcase the x->j ref is _not_ always executed since the inner loop is conditional on n > 0. Yes. But I want to move x->k (not x->j) out of loop 1 when l > 0 in store-motion. Attached the diff file without and with my patch to show the extra optimization. x->j is already moved out of loop 2 on master code. If change n and l to constant numbers like 100, master code could also do 2 store motions as expected. The edge from bb 5 to bb 4 does
[PATCH v2] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/17 15:12, Richard Biener wrote: > On Tue, 17 Aug 2021, Xionghu Luo wrote: > >> Hi, >> >> On 2021/8/16 19:46, Richard Biener wrote: >>> On Mon, 16 Aug 2021, Xiong Hu Luo wrote: >>> >>>> It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for >>>> nested loops. inn_loop is updated to inner loop, so it need be restored >>>> when exiting from innermost loop. With this patch, the store instruction >>>> in outer loop could also be moved out of outer loop by store motion. >>>> Any comments? Thanks. >>> >>>> gcc/ChangeLog: >>>> >>>> * tree-ssa-loop-im.c (fill_always_executed_in_1): Restore >>>> inn_loop when exiting from innermost loop. >>>> >>>> gcc/testsuite/ChangeLog: >>>> >>>>* gcc.dg/tree-ssa/ssa-lim-19.c: New test. >>>> --- >>>>gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++ >>>>gcc/tree-ssa-loop-im.c | 6 +- >>>>2 files changed, 29 insertions(+), 1 deletion(-) >>>>create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>> >>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>> new file mode 100644 >>>> index 000..097a5ee4a4b >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>> @@ -0,0 +1,24 @@ >>>> +/* PR/101293 */ >>>> +/* { dg-do compile } */ >>>> +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ >>>> + >>>> +struct X { int i; int j; int k;}; >>>> + >>>> +void foo(struct X *x, int n, int l) >>>> +{ >>>> + for (int j = 0; j < l; j++) >>>> +{ >>>> + for (int i = 0; i < n; ++i) >>>> + { >>>> +int *p = &x->j; >>>> +int tem = *p; >>>> +x->j += tem * i; >>>> + } >>>> + int *r = &x->k; >>>> + int tem2 = *r; >>>> + x->k += tem2 * j; >>>> +} >>>> +} >>>> + >>>> +/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } } >>>> */ >>>> + >>>> diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c >>>> index b24bc64f2a7..5ca4738b20e 100644 >>>> --- a/gcc/tree-ssa-loop-im.c >>>> +++ b/gcc/tree-ssa-loop-im.c >>>> @@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, sbitmap >>>> @@ contains_call) >>>> if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>>> last = bb; >>>>+ if (inn_loop != loop >>>> +&& flow_loop_nested_p (bb->loop_father, inn_loop)) >>>> + inn_loop = bb->loop_father; >>>> + >>> >>> The comment says >>> >>> /* In a loop that is always entered we may proceed anyway. >>>But record that we entered it and stop once we leave it. >>> */ >>> inn_loop = bb->loop_father; >>> >>> and your change would defeat that early return, no? >> >> The issue is the search method exits too early when iterating the outer >> loop. For example of a nested loop, loop 1 includes 5,8,3,10,4,9 >> and loop2 includes 3,10. Currently, it breaks when bb is 3 as bb 3 >> doesn't dominate bb 9 of loop 1. But actually, both bb 5 and bb 4 are >> ALWAYS_EXECUTED for loop 1, so if there are store instructions in bb 4 >> they won't be processed by store motion again. >> >> >> 5< >> |\ | >> 8 \ 9 >> | \ | >> --->3--->4 >> || >> 10---| >> >> >> SET_ALWAYS_EXECUTED_IN is only set to bb 5 on master code now, with this >> patch, it will continue search when meet bb 3 until bb 4, then last is >> updated >> to bb 4, it will break until exit edge is found at bb 4 by >> "if (!flow_bb_inside_loop_p (loop, e->dest))". Then the followed loop code >> will >> set bb 4 as ALWAYS_EXEUCTED and all it's idoms bb 5. >> >> >> while (1) >> { >>SET_ALWAYS_EXECUTED_IN (last, loop); >>if (last == loop->header) >> break; >>last =
Re: [PATCH v2] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/17 17:10, Xionghu Luo via Gcc-patches wrote: > > > On 2021/8/17 15:12, Richard Biener wrote: >> On Tue, 17 Aug 2021, Xionghu Luo wrote: >> >>> Hi, >>> >>> On 2021/8/16 19:46, Richard Biener wrote: >>>> On Mon, 16 Aug 2021, Xiong Hu Luo wrote: >>>> >>>>> It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for >>>>> nested loops. inn_loop is updated to inner loop, so it need be restored >>>>> when exiting from innermost loop. With this patch, the store instruction >>>>> in outer loop could also be moved out of outer loop by store motion. >>>>> Any comments? Thanks. >>>> >>>>> gcc/ChangeLog: >>>>> >>>>>* tree-ssa-loop-im.c (fill_always_executed_in_1): Restore >>>>>inn_loop when exiting from innermost loop. >>>>> >>>>> gcc/testsuite/ChangeLog: >>>>> >>>>> * gcc.dg/tree-ssa/ssa-lim-19.c: New test. >>>>> --- >>>>> gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++ >>>>> gcc/tree-ssa-loop-im.c | 6 +- >>>>> 2 files changed, 29 insertions(+), 1 deletion(-) >>>>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>>> >>>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>>> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>>> new file mode 100644 >>>>> index 000..097a5ee4a4b >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>>> @@ -0,0 +1,24 @@ >>>>> +/* PR/101293 */ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ >>>>> + >>>>> +struct X { int i; int j; int k;}; >>>>> + >>>>> +void foo(struct X *x, int n, int l) >>>>> +{ >>>>> + for (int j = 0; j < l; j++) >>>>> +{ >>>>> + for (int i = 0; i < n; ++i) >>>>> + { >>>>> + int *p = &x->j; >>>>> + int tem = *p; >>>>> + x->j += tem * i; >>>>> + } >>>>> + int *r = &x->k; >>>>> + int tem2 = *r; >>>>> + x->k += tem2 * j; >>>>> +} >>>>> +} >>>>> + >>>>> +/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } >>>>> } >>>>> */ >>>>> + >>>>> diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c >>>>> index b24bc64f2a7..5ca4738b20e 100644 >>>>> --- a/gcc/tree-ssa-loop-im.c >>>>> +++ b/gcc/tree-ssa-loop-im.c >>>>> @@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, >>>>> sbitmap >>>>> @@ contains_call) >>>>>if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>>>> last = bb; >>>>> + if (inn_loop != loop >>>>> + && flow_loop_nested_p (bb->loop_father, inn_loop)) >>>>> + inn_loop = bb->loop_father; >>>>> + >>>> >>>> The comment says >>>> >>>> /* In a loop that is always entered we may proceed anyway. >>>> But record that we entered it and stop once we leave >>>> it. >>>> */ >>>> inn_loop = bb->loop_father; >>>> >>>> and your change would defeat that early return, no? >>> >>> The issue is the search method exits too early when iterating the outer >>> loop. For example of a nested loop, loop 1 includes 5,8,3,10,4,9 >>> and loop2 includes 3,10. Currently, it breaks when bb is 3 as bb 3 >>> doesn't dominate bb 9 of loop 1. But actually, both bb 5 and bb 4 are >>> ALWAYS_EXECUTED for loop 1, so if there are store instructions in bb 4 >>> they won't be processed by store motion again. >>> >>> >>> 5< >>> |\ | >>> 8 \ 9 >>> | \ | >>> --->3--->4 >>> || >>> 10---| >>> >>> >>> SET_ALWAYS_EXECUTED_IN is only set to bb 5 on master code now, with this >>> patch
[PATCH v2] Don't move cold code out of loop by checking bb count
On 2021/8/10 12:25, Ulrich Drepper wrote: > On Tue, Aug 10, 2021 at 4:03 AM Xionghu Luo via Gcc-patches > wrote: >> For this case, theorotically I think the master GCC will optimize it to: >> >>invariant; >>for (;;) >> if (unlikely_cond) >>for (;;) >> ; >> >> 'invariant' is moved out of outer loop, but with the patch, it will get: >> >>for (;;) >> if (unlikely_cond) >>{ >> invariant; >> for (;;) >> ; >>} >> >> 'invariant' is *cold* for outer loop, but it is still *hot* for inner loop, >> so hoist it out of inner loop, this is exactly what we want, right? > > Is relying on absolute numbers really what you want? If the > 'unlikely_cond' condition depends on the iteration count of the outer > loop the probability of it being true in each individual iteration can > be low (at least that's how I use unlikely) but the overall > probability of needing the code is higher 1 - (1 - p)^n if 'p' is the > probability of 'unlikely_cond' and 'n' is the number of iterations. > Assuming complete independence of the loop iterations, otherwise it's > rather an upper limit. > > At the very least I'd generate code like this: > >first = true; >for (;;) > if (unlikely_cond) >{ > if (first) >{ > invariant; > first = false; >} > for (;;) > ; >} > > If it's worth hoisting the code the the extra test and flag should be > small in cost in comparison. > > If 'unlikely_cond' does not in any way depend on the loop iteration > then I think your code generation is fine. Thanks for your good suggestion, I am also not sure whether it is necessary to do it this way:) But I found that even the first step of for (;;) if (unlikely_cond) { invariant; for (;;) ; } is not supported yet. So I added a new function *find_coldest_out_loop* to search the coldest function between outermost invariant loop and own loop in compute_invariantness to move invariant out to cold loop first: [PATCH v2] Don't move cold code out of loop by checking bb count There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". In gimple LIM analysis, add find_coldest_out_loop to move invariants to expected target loop, then in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if profile count check find the loop bb is colder than target loop preheader, don't hoist it out of loop. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. gcc/ChangeLog: * loop-invariant.c (find_invariants_bb): Check profile count before motion. (find_invariants_body): Add argument. * tree-ssa-loop-im.c (find_coldest_out_loop): New function. (outermost_invariant_loop): Use find_coldest_out_loop. (determine_max_movement): Likewise. (move_computations_worker): Check profile count before motion. (execute_sm): Likewise. (execute_sm_exit): Check pointer validness. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/recip-3.c: Adjust. * gcc.dg/tree-ssa/ssa-lim-16.c: New test. * gcc.dg/tree-ssa/ssa-lim-17.c: New test. 
--- gcc/loop-invariant.c | 10 +- gcc/tree-ssa-loop-im.c | 186 +++-- gcc/testsuite/gcc.dg/tree-ssa/recip-3.c| 2 +- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-16.c | 21 +++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-17.c | 26 +++ 5 files changed, 231 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-16.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-17.c diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c index fca0c2b24be..5c3be7bf0eb 100644 --- a/gcc/loop-invariant.c +++ b/gcc/loop-invariant.c @@ -1183,9 +1183,14 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed) call. */ static void -find_invariants_bb (basic_block bb, bool always_reached, bool always_executed) +find_invariants_bb (class loop *loop, ba
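A rough sketch of the find_coldest_out_loop idea described above, using hypothetical plain-C types rather than GCC's loop and profile_count structures: walk from the outermost loop the expression is invariant in down to the loop that contains it, and pick the level whose preheader is coldest as the hoisting target.

	struct sloop
	{
	  struct sloop *inner;       /* next loop on the path toward the stmt */
	  long preheader_count;      /* simplified stand-in for profile_count */
	};

	/* Hypothetical helper mirroring the idea only; not the GCC function.  */
	static struct sloop *
	coldest_out_loop_sketch (struct sloop *outermost_invariant,
	                         struct sloop *own)
	{
	  struct sloop *coldest = own;
	  for (struct sloop *l = outermost_invariant; l; l = l->inner)
	    {
	      if (l->preheader_count < coldest->preheader_count)
	        coldest = l;
	      if (l == own)
	        break;
	    }
	  return coldest;
	}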
Re: [PATCH v2] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/19 20:11, Richard Biener wrote: >> - class loop *inn_loop = loop; >> >> if (ALWAYS_EXECUTED_IN (loop->header) == NULL) >> { >> @@ -3232,19 +3231,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap >> contains_call) >> to disprove this if possible). */ >>if (bb->flags & BB_IRREDUCIBLE_LOOP) >> break; >> - >> - if (!flow_bb_inside_loop_p (inn_loop, bb)) >> -break; >> - >> - if (bb->loop_father->header == bb) >> -{ >> - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >> -break; >> - >> - /* In a loop that is always entered we may proceed anyway. >> - But record that we entered it and stop once we leave it. */ >> - inn_loop = bb->loop_father; >> -} >> } >> >> while (1) > I'm not sure this will work correct (I'm not sure how the existing > code makes it so either...). That said, I can't poke any hole > into the change. What I see is that definitely > >if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) > last = bb; > >if (bitmap_bit_p (contains_call, bb->index)) > break; > > doesn't work reliably since the DOM ordering will process blocks > A B and C in random order for > >for (;;) > { >if (cond) > { >A: foo (); > } >else B:; >C:; > } > > and thus we can end up setting 'last' to C_before_ processing > 'A' and thus arriving at the call foo () ... > > get_loop_body_in_dom_order does some "special sauce" but not > to address the above problem - but it might be that a subtle > issue like the above is the reason for the inner loop handling. > The inner loop block order does_not_ adhere to this "special sauce", > that is - the "Additionally, if a basic block s dominates > the latch, then only blocks dominated by s are be after it." > guarantee holds for the outer loop latch, not for the inner. > > Digging into the history of fill_always_executed_in_1 doesn't > reveal anything - the inner loop handling has been present > since introduction by Zdenek - but usually Zdenek has a reason > for doing things as he does;) Yes, this is really complicated usage, thanks for point it out. :) I constructed two cases to verify this with inner loop includes "If A; else B; C". Finding that fill_sons_in_loop in get_loop_body_in_dom_order will also checks whether the bb domintes outer loop’s latch, if C dominate outer loop’s latch, C is postponed, the access order is ABC, 'last' won’t be set to C if A or B contains call; Otherwise if C doesn’t dominate outer loop’s latch in fill_sons_in_loop, the access order is CAB, but 'last' also won’t be updated to C in fill_always_executed_in_1 since there is also dominate check, then if A or B contains call, it could break successfully. C won't be set to ALWAYS EXECUTED for both circumstance. > > Note it might be simply a measure against quadratic complexity, > esp. since with your patch we also dive into not always executed > subloops as you remove the > >if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) > break; > > check. I suggest to evaluate behavior of the patch on a testcase > like > > void foo (int n, int **k) > { >for (int i = 0; i < n; ++i) > if (k[0][i]) >for (int j = 0; j < n; ++j) > if (k[1][j]) >for (int l = 0; l < n; ++l) > if (k[2][l]) >... > } Theoretically the complexity is changing from L1(bbs) to L1(bbs)+L2(bbs)+L3(bbs)+…+Ln(bbs), so fill_always_executed_in_1's execution time is supposed to be increase from O(n) to O(n2)? The time should depend on loop depth and bb counts. I also drafted a test case has 73-depth loop function with 25 no-ipa function copies each compiled in lim2 and lim4 dependently. 
Total execution time of fill_always_executed_in_1 increases from 32ms to 58ms, almost doubled but not quadratic?  It seems reasonable that compile time gets longer since most bbs are checked more often, but it is a must to ensure the early break happens correctly at every loop level...  Though the number of loop nodes could be huge, loop depth will never be that large in actual code?

> I suspect you'll see quadratic behavior with your patch.  You
> should be at least able to preserve a check like
>
>    /* Do not process not always executed subloops to avoid
>       quadratic behavior.  */
>    if (bb->loop_father->header == bb
>        && !dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
>      break;
>
> which is of course not optimistic for cases like
>
>    for (..)
>      {
>        if (cond)
>          for (..)
>            x = 1; // this is always executed if the inner loop is finite
>      }
>
> but we need to have an eye on the complexity of this function.  I would
> have suggested to do greedy visiting of the loop header successors,
> proce
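For the compile-time measurements, a compilable three-level version of the nest sketched in the review (the innermost body is my own filler; the real stress test used a 73-depth nest and many function copies as described above):

	void
	foo (int n, int **k)
	{
	  for (int i = 0; i < n; ++i)
	    if (k[0][i])
	      for (int j = 0; j < n; ++j)
	        if (k[1][j])
	          for (int l = 0; l < n; ++l)
	            if (k[2][l])
	              k[0][i] += k[1][j] + k[2][l];   /* filler body */
	}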
Re: [PATCH v3] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/24 16:20, Richard Biener wrote: > On Tue, 24 Aug 2021, Xionghu Luo wrote: > >> >> >> On 2021/8/19 20:11, Richard Biener wrote: >>>> - class loop *inn_loop = loop; >>>> >>>> if (ALWAYS_EXECUTED_IN (loop->header) == NULL) >>>>{ >>>> @@ -3232,19 +3231,6 @@ fill_always_executed_in_1 (class loop *loop, >>>> sbitmap contains_call) >>>> to disprove this if possible). */ >>>> if (bb->flags & BB_IRREDUCIBLE_LOOP) >>>>break; >>>> - >>>> -if (!flow_bb_inside_loop_p (inn_loop, bb)) >>>> - break; >>>> - >>>> -if (bb->loop_father->header == bb) >>>> - { >>>> -if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>>> - break; >>>> - >>>> -/* In a loop that is always entered we may proceed anyway. >>>> - But record that we entered it and stop once we leave it. */ >>>> -inn_loop = bb->loop_father; >>>> - } >>>>} >>>> >>>> while (1) >>> I'm not sure this will work correct (I'm not sure how the existing >>> code makes it so either...). That said, I can't poke any hole >>> into the change. What I see is that definitely >>> >>> if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>> last = bb; >>> >>> if (bitmap_bit_p (contains_call, bb->index)) >>> break; >>> >>> doesn't work reliably since the DOM ordering will process blocks >>> A B and C in random order for >>> >>> for (;;) >>> { >>> if (cond) >>> { >>> A: foo (); >>> } >>> else B:; >>> C:; >>> } >>> >>> and thus we can end up setting 'last' to C_before_ processing >>> 'A' and thus arriving at the call foo () ... >>> >>> get_loop_body_in_dom_order does some "special sauce" but not >>> to address the above problem - but it might be that a subtle >>> issue like the above is the reason for the inner loop handling. >>> The inner loop block order does_not_ adhere to this "special sauce", >>> that is - the "Additionally, if a basic block s dominates >>> the latch, then only blocks dominated by s are be after it." >>> guarantee holds for the outer loop latch, not for the inner. >>> >>> Digging into the history of fill_always_executed_in_1 doesn't >>> reveal anything - the inner loop handling has been present >>> since introduction by Zdenek - but usually Zdenek has a reason >>> for doing things as he does;) >> >> Yes, this is really complicated usage, thanks for point it out. :) >> I constructed two cases to verify this with inner loop includes "If A; else >> B; C". >> Finding that fill_sons_in_loop in get_loop_body_in_dom_order will also checks >> whether the bb domintes outer loop’s latch, if C dominate outer loop’s latch, >> C is postponed, the access order is ABC, 'last' won’t be set to C if A or B >> contains call; > > But it depends on the order of visiting ABC and that's hard to put into > a testcase since it depends on the order of edges and the processing > of the dominance computation. ABC are simply unordered with respect > to a dominator walk. > >> Otherwise if C doesn’t dominate outer loop’s latch in fill_sons_in_loop, >> the access order is CAB, but 'last' also won’t be updated to C in >> fill_always_executed_in_1 >> since there is also dominate check, then if A or B contains call, it could >> break >> successfully. >> >> C won't be set to ALWAYS EXECUTED for both circumstance. >> >>> >>> Note it might be simply a measure against quadratic complexity, >>> esp. since with your patch we also dive into not always executed >>> subloops as you remove the >>> >>> if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>> break; >>> >>> check. 
I suggest to evaluate behavior of the patch on a testcase >>> like >>> >>> void foo (int n, int **k) >>> { >>> for (int i = 0; i < n; ++i) >
Re: [PATCH v3] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/27 15:45, Richard Biener wrote: On Thu, 26 Aug 2021, Xionghu Luo wrote: On 2021/8/24 16:20, Richard Biener wrote: On Tue, 24 Aug 2021, Xionghu Luo wrote: On 2021/8/19 20:11, Richard Biener wrote: - class loop *inn_loop = loop; if (ALWAYS_EXECUTED_IN (loop->header) == NULL) { @@ -3232,19 +3231,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) to disprove this if possible). */ if (bb->flags & BB_IRREDUCIBLE_LOOP) break; - - if (!flow_bb_inside_loop_p (inn_loop, bb)) - break; - - if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; - - /* In a loop that is always entered we may proceed anyway. -But record that we entered it and stop once we leave it. */ - inn_loop = bb->loop_father; - } } while (1) I'm not sure this will work correct (I'm not sure how the existing code makes it so either...). That said, I can't poke any hole into the change. What I see is that definitely if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; if (bitmap_bit_p (contains_call, bb->index)) break; doesn't work reliably since the DOM ordering will process blocks A B and C in random order for for (;;) { if (cond) { A: foo (); } else B:; C:; } and thus we can end up setting 'last' to C_before_ processing 'A' and thus arriving at the call foo () ... get_loop_body_in_dom_order does some "special sauce" but not to address the above problem - but it might be that a subtle issue like the above is the reason for the inner loop handling. The inner loop block order does_not_ adhere to this "special sauce", that is - the "Additionally, if a basic block s dominates the latch, then only blocks dominated by s are be after it." guarantee holds for the outer loop latch, not for the inner. Digging into the history of fill_always_executed_in_1 doesn't reveal anything - the inner loop handling has been present since introduction by Zdenek - but usually Zdenek has a reason for doing things as he does;) Yes, this is really complicated usage, thanks for point it out. :) I constructed two cases to verify this with inner loop includes "If A; else B; C". Finding that fill_sons_in_loop in get_loop_body_in_dom_order will also checks whether the bb domintes outer loop’s latch, if C dominate outer loop’s latch, C is postponed, the access order is ABC, 'last' won’t be set to C if A or B contains call; But it depends on the order of visiting ABC and that's hard to put into a testcase since it depends on the order of edges and the processing of the dominance computation. ABC are simply unordered with respect to a dominator walk. Otherwise if C doesn’t dominate outer loop’s latch in fill_sons_in_loop, the access order is CAB, but 'last' also won’t be updated to C in fill_always_executed_in_1 since there is also dominate check, then if A or B contains call, it could break successfully. C won't be set to ALWAYS EXECUTED for both circumstance. Note it might be simply a measure against quadratic complexity, esp. since with your patch we also dive into not always executed subloops as you remove the if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) break; check. I suggest to evaluate behavior of the patch on a testcase like void foo (int n, int **k) { for (int i = 0; i < n; ++i) if (k[0][i]) for (int j = 0; j < n; ++j) if (k[1][j]) for (int l = 0; l < n; ++l) if (k[2][l]) ... 
} Theoretically the complexity is changing from L1(bbs) to L1(bbs)+L2(bbs)+L3(bbs)+…+Ln(bbs), so fill_always_executed_in_1's execution time is supposed to be increase from O(n) to O(n2)? The time should depend on loop depth and bb counts. I also drafted a test case has 73-depth loop function with 25 no-ipa function copies each compiled in lim2 and lim4 dependently. Total execution time of fill_always_executed_in_1 is increased from 32ms to 58ms, almost doubled but not quadratic? It's more like n + (n-1) + (n-2) + ... + 1 which is n^2/2 but that's still O(n^2). It seems reasonable to see compiling time getting longer since most bbs are checked more but a MUST to ensure early break correctly in every loop level... Though loop nodes could be huge, loop depth will never be so large in actual code? The "in practice" argument is almost always defeated by automatic program generators ;) I suspect you'll see quadratic
Re: [PATCH v3] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/30 17:19, Richard Biener wrote: bitmap_set_bit (work_set, loop->header->index); + unsigned bb_index; - for (i = 0; i < loop->num_nodes; i++) - { - edge_iterator ei; - bb = bbs[i]; + unsigned array_size = last_basic_block_for_fn (cfun) + 1; + int *bbd = XNEWVEC (int, array_size); + bbd = XDUPVEC (int, bbi, array_size); I don't think you need to copy 'bbi' but you can re-use the state from the outer loop processing. Did you run into any issues with that? Yes. For example, adding a small if-else block to ssa-lim-19.c, Then block "x->j += tem * i;" of bb 6 is always executed for loop 2, when call fill_always_executed_in_1 for loop 1, bbi[6] is decreased from 2 to 1 to 0, then if fill_always_executed_in_1 is called again for loop 2, it's value is not reset so bbi[6] won't be set ALWAYS EXECUTE, this is wrong. struct X { int i; int j; int k;}; void foo(struct X *x, int n, int l, int m) { for (int j = 0; j < l; j++) // loop 1 { for (int i = 0; i < n; ++i) // loop 2 { if (m) x->j++; else x->j = m+n+l; int *p = &x->j; // bb 6 int tem = *p; x->j += tem * i; } int *r = &x->k; int tem2 = *r; x->k += tem2 * j; } } Hmm, but if the outer loop processing reaches bb 6 then it should have set it ALWAYS_EXECUTED in loop 1 already? But bb 6 is NOT ALWAYS_EXECUTED for loop 1, it is only ALWAYS_EXECUTED for loop 2 as it requires n>0. Please refer to the attached file ssa-lim-19.c.138t.lim2. ;; ;; Loop 1 ;; header 8, latch 12 ;; depth 1, outer 0 ;; nodes: 8 12 7 6 4 5 3 13 11 ;; ;; Loop 2 ;; header 3, latch 13 ;; depth 2, outer 1 ;; nodes: 3 13 6 4 5 ;; 2 succs { 10 9 } ;; 10 succs { 8 } ;; 11 succs { 3 } ;; 3 succs { 4 5 } ;; 4 succs { 6 } ;; 5 succs { 6 } ;; 6 succs { 13 7 } ;; 13 succs { 3 } ;; 7 succs { 12 9 } ;; 12 succs { 8 } ;; 8 succs { 11 7 } ;; 9 succs { 1 } always executed: bb->index:8, loop->num: 1 always executed: bb->index:7, loop->num: 1 always executed: bb->index:3, loop->num: 2 always executed: bb->index:6, loop->num: 2 8<--- / \ | 11 \ | / \| 3<--- \ | /\| \ | 4 5 | \ | \/|\| 6| \ | |-->13 \ | |--> 7 | /\| 9 12--- (gdb) x /15x bbd 0x1354c9b0: 0x 0x 0x0001 0x0001 0x1354c9c0: 0x0001 0x0001 0x0002 0x0002 0x1354c9d0: 0x0001 0x0002 0x0001 0x0001 0x1354c9e0: 0x0001 0x0001 0x our algorithm will walk through 8->11->3->4->5->6->7, for loop 1, exit at edge 7->9. (gdb) x /15x bbd 0x1354c9b0: 0x 0x 0x0001 0x 0x1354c9c0: 0x 0x 0x 0x 0x1354c9d0: 0x0001 0x0002 0x0001 0x 0x1354c9e0: 0x0001 0x 0x If we don't reset bbd to incoming_edge by memcpy, bbd[3],bbd[4],bbd[5] and bbd[6] is 0 now for loop 2, fill_always_executed_in_1 couldn't set ALWAYS_EXECUTED correctly for loop 2 at bb 3 and bb 6. + while (!bitmap_empty_p (work_set)) + { + bb_index = bitmap_first_set_bit (work_set); + bitmap_clear_bit (work_set, bb_index); + bb = BASIC_BLOCK_FOR_FN (cfun, bb_index); if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - last = bb; - + SET_ALWAYS_EXECUTED_IN (bb, loop); if (bitmap_bit_p (contains_call, bb->index)) break; I think you want to continue; here (process remaining worklist but not continue greedy walking this block) Same as above, if use 'continue' instead of 'break', the algorithm seems also not work again. If inner loop contains a jump to outmost loop, the blocks after the jump block will be set to ALWAYS EXECUTE incorrectly. - + edge_iterator ei; FOR_EACH_EDGE (e, ei, bb->succs) { - /* If there is an exit from this BB. */ if (!flow_bb_inside_loop_p (loop, e->dest)) break; in particular this should keep the outer 'bbi' valid to re-use. 
But again, you want 'continue;' the greedy walk to other edges. If that's not valid (I'd need to think about this) then with your patch whether we process an edge depends on the order of the edge visit so you'd have to walk successors twice, once to determine whether we can greedily walk any of it and once to actually do the greedy walk. So thinking about it an exit edge is like a not returning call and thus we indeed should not process any outgoing edges of this block. + /* Or we enter a possibly non-finite loop. */ if (flow_loop_nested_p (bb->loop_father, e->dest->loop_father) && ! finite_loop_p (e->dest->loop_father)) break; I think this is no longer necessary? In any case it would again be 'conti
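A self-contained sketch (plain C, hypothetical arrays instead of GCC's bitmaps and basic_block structures) of the worklist scheme being discussed: seed the loop header, only push a successor once all of its in-loop incoming edges have been seen, stop the walk at blocks containing calls or loop exits, and mark a visited block only if it dominates the latch.  Call/exit and dominance information are taken as precomputed inputs here.

	#include <stdbool.h>

	#define MAX_BB 64

	/* edge[i][j] is true for an in-loop edge i -> j; in_preds[j] counts the
	   in-loop predecessors of j (the header's back edge is ignored because
	   the header is seeded directly).  */
	void
	mark_always_executed_sketch (int n_bb, bool edge[MAX_BB][MAX_BB],
	                             int in_preds[MAX_BB],
	                             bool dominates_latch[MAX_BB],
	                             bool stops_walk[MAX_BB],   /* call or exit */
	                             int header, bool always_executed[MAX_BB])
	{
	  int remaining[MAX_BB], work[MAX_BB], top = 0;
	  for (int i = 0; i < n_bb; i++)
	    {
	      remaining[i] = in_preds[i];
	      always_executed[i] = false;
	    }
	  work[top++] = header;
	  while (top > 0)
	    {
	      int b = work[--top];
	      if (dominates_latch[b])
	        always_executed[b] = true;   /* every finished iteration runs b */
	      if (stops_walk[b])
	        continue;                    /* do not walk past calls/exits */
	      for (int s = 0; s < n_bb; s++)
	        if (edge[b][s] && --remaining[s] == 0)
	          work[top++] = s;           /* all in-loop paths into s seen */
	    }
	}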
[PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
ult: + break; + } + } +} + return 0; } diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 8efc77d986b..36837381c04 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -115,6 +115,7 @@ init_internal_fns () #define vec_condeq_direct { 0, 0, false } #define scatter_store_direct { 3, 1, false } #define len_store_direct { 3, 3, false } +#define vec_set_direct { 3, 3, false } #define unary_direct { 0, 0, true } #define binary_direct { 0, 0, true } #define ternary_direct { 0, 0, true } @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn +static void +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) all new functions require a function level comment Done. +{ + tree lhs = gimple_call_lhs (stmt); + tree op0 = gimple_call_arg (stmt, 0); + tree op1 = gimple_call_arg (stmt, 1); + tree op2 = gimple_call_arg (stmt, 2); + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); + + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); + scalar_mode innermode = GET_MODE_INNER (outermode); + + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); + + class expand_operand ops[3]; + enum insn_code icode = optab_handler (optab, outermode); + + if (icode != CODE_FOR_nothing) +{ + pos = convert_to_mode (E_SImode, pos, 0); + + create_fixed_operand (&ops[0], src); + create_input_operand (&ops[1], value, innermode); + create_input_operand (&ops[2], pos, GET_MODE (pos)); + if (maybe_expand_insn (icode, 3, ops)) + { + emit_move_insn (target, src); I think you need to assert that we end up here. Added gcc_unreachable at the end of this function. + return; + } +} +} + static void expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) { @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_fold_left_optab_supported_p direct_optab_supported_p #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p #define direct_check_ptrs_optab_supported_p direct_optab_supported_p +#define direct_vec_set_optab_supported_p direct_optab_supported_p /* Return the optab used by internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 13e60828fcf..e6cfe1b6159 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) + DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) diff --git a/gcc/optabs.c b/gcc/optabs.c index 184827fdf4e..c8125670d2d 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, && insn_operand_matches (icode, 3, test); } +bool +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode, + machine_mode value_mode, machine_mode idx_mode) toplevel comment missing +{ + gcc_assert (code == VECTOR_TYPE); what's the point of pasing 'code' here then? Since the optab only has a single mode, the vector mode, the value_mode is redundant as well. And I guess we might want to handle "arbitrary" index modes? 
That is, the .md expanders should not restrict its mode - I guess it simply uses VOIDmode at the moment (for integer constants). Not sure how to best do this without an explicit mode in the optab ... Yes, removed 'code' and value_mode by checking VECTOR_MODE_P and use GET_MODE_INNER for value_mode. ".md expanders" shall support for integer constants index mode, but I guess they shouldn't be expanded by IFN as this function is for variable index insert only? Anyway, the v3 patch used VOIDmode check... Thanks, Xionghu From 571717aea126380d3e36fdb4504f9a6337eed206 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo Date: Mon, 14 Sep 2020 21:08:11 -0500 Subject: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-22 Xionghu Luo * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_
Re: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
Hi, On 2020/9/23 19:33, Richard Biener wrote: >> The first loop is for rhs stmt process, this loop is for lhs stmt process. >> I thought vec_extract also need to generate IFN before, but seems not >> necessary now? And that the first loop needs to update the lhs stmt while >> then second doesn't. > That's not good reasons to separate them, please move all the processing > into one loop. > > + gassign *stmt = dyn_cast (gsi_stmt (gsi)); > + if (!stmt) > + continue; > + > + enum tree_code code; > + code = TREE_CODE (gimple_assign_lhs (stmt)); > + switch (code) > + { > + case ARRAY_REF: > + gimple_expand_vec_set_expr (&gsi); > > you also do the assign and ARRAY_REF checking duplicate. > > The patch likely wasn't bootstrapped because I've seen unused and > set-but-not-used > variables. > > Otherwise the patch looks good to me - I guess you want to add the > vec_extract bits as well so you can overall assess the affect of the patch > on altivec code? That said, the patch misses a testcase where we verify > we properly expand the vector to a pseudo now. Thanks, fixed the bootstrap error. Actually the "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]" includes typed vec_insert tests for V4SI/V4SF/V8HI/V16QI/V2DI/V2DF of expanding the IFN VEC_SET and instruction count check, but I am discussing and refining with Segher's comments, will split and send it later once we reached agreement. Not sure whether this is the testcase you mentioned? (As you said *vec_extract*, but this patch series target for vec_insert only.) FYI, We are trying below or even better code generations: rlwinm 6,6,2,28,29 mtvsrwz 0,5 lvsr 1,0,6 lvsl 0,0,6 xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 Second thing is I removed the second loop and move the "gimple_expand_vec_set_expr (&gsi);" up as your comments. Thanks again. IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-24 Xionghu Luo * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_vec_set_expr. * internal-fn.c (vec_set_direct): New define. (expand_vec_set_optab_fn): New function. (direct_vec_set_optab_supported_p): New define. * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. * optabs.c (can_vec_set_var_idx_p): New function. * optabs.h (can_vec_set_var_idx_p): New declaration. --- gcc/gimple-isel.cc | 75 +++-- gcc/internal-fn.c | 39 +++ gcc/internal-fn.def | 2 ++ gcc/optabs.c| 21 + gcc/optabs.h| 4 +++ 5 files changed, 139 insertions(+), 2 deletions(-) diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc index b330cf4c20e..02513e04900 100644 --- a/gcc/gimple-isel.cc +++ b/gcc/gimple-isel.cc @@ -35,6 +35,74 @@ along with GCC; see the file COPYING3. If not see #include "tree-cfg.h" #include "bitmap.h" #include "tree-ssa-dce.h" +#include "memmodel.h" +#include "optabs.h" + +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to + internal function based on vector type of selected expansion. 
+ i.e.: + VIEW_CONVERT_EXPR(u)[_1] = = i_4(D); + => + _7 = u; + _8 = .VEC_SET (_7, i_4(D), _1); + u = _8; */ + +static gimple * +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) +{ + enum tree_code code; + gcall *new_stmt = NULL; + gassign *ass_stmt = NULL; + + /* Only consider code == GIMPLE_ASSIGN. */ + gassign *stmt = dyn_cast (gsi_stmt (*gsi)); + if (!stmt) +return NULL; + + tree lhs = gimple_assign_lhs (stmt); + code = TREE_CODE (lhs); + if (code != ARRAY_REF) +return NULL; + + tree val = gimple_assign_rhs1 (stmt); + tree op0 = TREE_OPERAND (lhs, 0); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) + && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) + && TYPE_MODE (TREE_TYPE (lhs)) + == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0) +{ + tree pos = TREE_OPERAND (lhs, 1); + tree view_op0 = TREE_OPERAND (op0, 0); + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); + if (auto_var_in_fn_p (view_op0, cfun->decl) + && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx
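An example of the C-level source that produces the ARRAY_REF(VIEW_CONVERT_EXPR) pattern handled above, using GCC's generic vector extension (my own illustration, not necessarily one of the testcases in the series):

	typedef int v4si __attribute__ ((vector_size (16)));

	v4si
	set_element (v4si v, int value, unsigned idx)
	{
	  v[idx & 3] = value;   /* variable-index element store on a vector */
	  return v;
	}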
Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
Hi Segher, The attached two patches are updated and split from "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]" as your comments. [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param This one is preparation work of fix lvsl&lvsr arg mode and rs6000_expand_vector_set parameter support for both constant and variable index input. [PATCH v3 2/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251] This one is Building VIEW_CONVERT_EXPR and expand the IFN VEC_SET to fast. Thanks, Xionghu From 9d74c488ad3c7cad8c276cc49749ec05158d1e96 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo Date: Thu, 24 Sep 2020 00:52:35 -0500 Subject: [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param lvsl and lvsr looks only at the low 4 bits, use SI for index param. rs6000_expand_vector_set could accept insert either to constant position or variable position, so change the operand to reg_or_cint_operand. gcc/ChangeLog: 2020-09-24 Xionghu Luo * config/rs6000/altivec.md (altivec_lvsl_reg): Change to SImode. (altivec_lvsr_reg): Likewise. * config/rs6000/rs6000-call.c (altivec_expand_vec_set_builtin): Change call param 2 from type int to rtx. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set): Likewise. * config/rs6000/rs6000.c (rs6000_expand_vector_init): Change call param 2 from type int to rtx. (rs6000_expand_vector_set): Likewise. * config/rs6000/vector.md (vec_set): Support both constant and variable index vec_set. * config/rs6000/vsx.md: Call gen_altivec_lvsl_reg with SImode. --- gcc/config/rs6000/altivec.md | 4 ++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.c| 16 +--- gcc/config/rs6000/vector.md | 4 ++-- gcc/config/rs6000/vsx.md | 3 ++- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0a2e634d6b0..a1c06c9ab8c 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2775,7 +2775,7 @@ (define_expand "altivec_lvsl" (define_insn "altivec_lvsl_reg" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SI 1 "gpc_reg_operand" "b")] UNSPEC_LVSL_REG))] "TARGET_ALTIVEC" "lvsl %0,0,%1" @@ -2813,7 +2813,7 @@ (define_expand "altivec_lvsr" (define_insn "altivec_lvsr_reg" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SI 1 "gpc_reg_operand" "b")] UNSPEC_LVSR_REG))] "TARGET_ALTIVEC" "lvsr %0,0,%1" diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index e39cfcf672b..51f278933bd 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -10655,7 +10655,7 @@ altivec_expand_vec_set_builtin (tree exp) op0 = force_reg (tmode, op0); op1 = force_reg (mode1, op1); - rs6000_expand_vector_set (op0, op1, elt); + rs6000_expand_vector_set (op0, op1, GEN_INT (elt)); return op0; } diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 28e859f4381..6a0fbc3ba2e 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -57,7 +57,7 @@ extern bool rs6000_move_128bit_ok_p (rtx []); extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); -extern void 
rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_set (rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fe93cf6ff2b..c46ec14f060 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6669,7 +6669,8 @@ rs6000_expand_vector_init (rtx target, rtx vals) rs6000_expand_vector_init (target, copy); /* Insert variable. */ - rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), one_var); + rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), + GEN_INT (one_var)); return; } @@ -6683,10 +6684,10 @@ rs6000_expand_vector_init (rtx target, rtx vals) emit_move_insn (targe
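For context, here is a minimal user-level illustration (mine, not part of the patch) of the two element-number shapes the widened interface is meant to serve once the rest of the series wires up the variable case; compile with -mvsx:

#include <altivec.h>

/* Element number known at compile time: reaches rs6000_expand_vector_set
   as a CONST_INT.  */
vector int
insert_cst (vector int v, int x)
{
  return vec_insert (x, v, 3);
}

/* Element number only known at run time: the index ends up in a register,
   which is what the new reg_or_cint_operand predicate on vec_set allows.  */
vector int
insert_var (vector int v, int x, unsigned long n)
{
  return vec_insert (x, v, n);
}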
Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
Hi, On 2020/9/24 21:27, Richard Biener wrote: > On Thu, Sep 24, 2020 at 10:21 AM xionghu luo wrote: > > I'll just comment that > > xxperm 34,34,33 > xxinsertw 34,0,12 > xxperm 34,34,32 > > doesn't look like a variable-position insert instruction but > this is a variable whole-vector rotate plus an insert at index zero > followed by a variable whole-vector rotate. I'm not fluend in > ppc assembly but > > rlwinm 6,6,2,28,29 > mtvsrwz 0,5 > lvsr 1,0,6 > lvsl 0,0,6 > > possibly computes the shift masks for r33/r32? though > I do not see those registers mentioned... For V4SI: rlwinm 6,6,2,28,29 // r6*4 mtvsrwz 0,5 // vs0 <- r5 (0xfe) lvsr 1,0,6 // vs33 <- lvsr[r6] lvsl 0,0,6 // vs32 <- lvsl[r6] xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 blr idx = idx * 4; 00 0x4000300020001 xxperm:0x4000300020001 vs33:0x101112131415161718191a1b1c1d1e1f vs32:0x102030405060708090a0b0c0d0e0f 14 0x4000300020001 xxperm:0x1000400030002 vs33:0xc0d0e0f101112131415161718191a1b vs32:0x405060708090a0b0c0d0e0f10111213 28 0x4000300020001 xxperm:0x2000100040003 vs33:0x8090a0b0c0d0e0f1011121314151617 vs32:0x8090a0b0c0d0e0f1011121314151617 312 0x4000300020001 xxperm:0x3000200010004 vs33:0x405060708090a0b0c0d0e0f10111213 vs32:0xc0d0e0f101112131415161718191a1b vs34: 0x40003000200fe 0x4000300fe0001 0x400fe00020001 0xfe000300020001 "xxinsertw 34,0,12" will always insert vs0[32:63] content to the forth word of target vector, bits[96:127]. Then the second xxperm rotate the modified vector back. All the instructions are register based operation, as Segher replied, power9 supports only fixed position inserts, so we need do some trick here to support it instead of generate short store wide load instructions. > > This might be a generic viable expansion strathegy btw, > which is why I asked before whether the CPU supports > inserts at a variable position ... the building blocks are > already there with vec_set at constant zero position > plus vec_perm_const for the rotates. > > But well, I did ask this question. Multiple times. > > ppc does _not_ have a VSX instruction > like xxinsertw r34, r8, r12 where r8 denotes > the vector element (or byte position or whatever). > > So I don't think vec_set with a variable index is the > best approach. > Xionghu - you said even without the patch the stack > storage is eventually elided but > > addi 9,1,-16 > rldic 6,6,2,60 > stxv 34,-16(1) > stwx 5,9,6 > lxv 34,-16(1) > > still shows stack(?) store/load with a bad STLF penalty. Sorry that if I didn't describe clearly and misunderstood you, I mean if insert many instructions(tested with a loop inserted) between "stwx 5,9,6" and "lxv 34,-16(1)", the store hit load performance issue could be elided, but this is not the solution we want. I also changed your test as below and build for X86, seems it also generates inefficient code? What my patch does maybe different usage from your pasted case? 
#define N 32 typedef int T; typedef T V __attribute__((vector_size(N))); V setg3 (V v, int idx, T val) { v[idx&31] = val; return v; } -O2 -S -mavx -march=znver2: setg3: pushrbp and edi, 31 mov rbp, rsp and rsp, -32 vmovdqa YMMWORD PTR [rsp-32], ymm0 mov DWORD PTR [rsp-32+rdi*4], esi vmovdqa ymm0, YMMWORD PTR [rsp-32] leave ret While idx is constant: setg3: vpinsrd xmm1, xmm0, esi, 3 vinserti128 ymm0, ymm0, xmm1, 0x0 ret And ARM with -O2 -S -march=armv8.2-a+sve (N change to 16): setg3: sub sp, sp, #16 and x0, x0, 15 str q0, [sp] str w1, [sp, x0, lsl 2] ldr q0, [sp] add sp, sp, 16 ret While idx is constant: setg3: ins v0.s[3], w1 ret Though I've no idea how to optimize this on X86 and ARM with vector instructions to avoid short store with wide load followed on stack. Thanks, Xionghu
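For reference, the rotate/fixed-insert/rotate-back idea can be modelled target-independently with GNU vector extensions. This is only a sketch of the strategy (the lane it inserts into and the rotation direction are my simplification and do not match the exact lvsr/xxinsertw/lvsl lanes shown above), not what the expander emits:

typedef int v4si __attribute__ ((vector_size (16)));

v4si
insert_var_model (v4si v, int val, unsigned idx)
{
  idx &= 3;
  /* Rotate so that element IDX lands in lane 0.  */
  v4si fwd = { idx, (idx + 1) & 3, (idx + 2) & 3, (idx + 3) & 3 };
  v4si t = __builtin_shuffle (v, fwd);
  /* Fixed-position insert, the part an instruction like xxinsertw can do.  */
  t[0] = val;
  /* Rotate back.  */
  v4si back = { (0 - idx) & 3, (1 - idx) & 3, (2 - idx) & 3, (3 - idx) & 3 };
  return __builtin_shuffle (t, back);
}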
[PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
Hi, On 2020/9/24 20:39, Richard Sandiford wrote: > xionghu luo writes: >> @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall >> *stmt, convert_optab optab) >> >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn >> >> +/* Expand VEC_SET internal functions. */ >> + >> +static void >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> +{ >> + tree lhs = gimple_call_lhs (stmt); >> + tree op0 = gimple_call_arg (stmt, 0); >> + tree op1 = gimple_call_arg (stmt, 1); >> + tree op2 = gimple_call_arg (stmt, 2); >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); >> + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); > > I'm not sure about the expand_expr here. ISTM that op0 is a normal > input and so should be expanded by expand_normal rather than > EXPAND_WRITE. Also: > >> + >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); >> + scalar_mode innermode = GET_MODE_INNER (outermode); >> + >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + >> + class expand_operand ops[3]; >> + enum insn_code icode = optab_handler (optab, outermode); >> + >> + if (icode != CODE_FOR_nothing) >> +{ >> + pos = convert_to_mode (E_SImode, pos, 0); >> + >> + create_fixed_operand (&ops[0], src); > > ...this would mean that if SRC happens to be a MEM, the pattern > must also accept a MEM. > > ISTM that we're making more work for ourselves by not “fixing” the optab > to have a natural pure-input + pure-output interface. :-) But if we > stick with the current optab interface, I think we need to: > > - create a temporary register > - move SRC into the temporary register before the insn > - use create_fixed_operand with the temporary register for operand 0 > - move the temporary register into TARGET after the insn > >> + create_input_operand (&ops[1], value, innermode); >> + create_input_operand (&ops[2], pos, GET_MODE (pos)); > > For this I think we should use convert_operand_from on the original “pos”, > so that the target gets to choose what the mode of the operand is. > Thanks a lot for the nice suggestions, fixed them all and updated the patch as below. [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-25 Xionghu Luo * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_vec_set_expr. * internal-fn.c (vec_set_direct): New define. (expand_vec_set_optab_fn): New function. (direct_vec_set_optab_supported_p): New define. * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. * optabs.c (can_vec_set_var_idx_p): New function. * optabs.h (can_vec_set_var_idx_p): New declaration. --- gcc/gimple-isel.cc | 75 +++-- gcc/internal-fn.c | 41 + gcc/internal-fn.def | 2 ++ gcc/optabs.c| 21 + gcc/optabs.h| 4 +++ 5 files changed, 141 insertions(+), 2 deletions(-) diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc index b330cf4c20e..02513e04900 100644 --- a/gcc/gimple-isel.cc +++ b/gcc/gimple-isel.cc @@ -35,6 +35,74 @@ along with GCC; see the file COPYING3. 
If not see #include "tree-cfg.h" #include "bitmap.h" #include "tree-ssa-dce.h" +#include "memmodel.h" +#include "optabs.h" + +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to + internal function based on vector type of selected expansion. + i.e.: + VIEW_CONVERT_EXPR(u)[_1] = = i_4(D); + => + _7 = u; + _8 = .VEC_SET (_7, i_4(D), _1); + u = _8; */ + +static gimple * +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) +{ + enum tree_code code; + gcall *new_stmt = NULL; + gassign *ass_stmt = NULL; + + /* Only consider code == GIMPLE_ASSIGN. */ + gassign *stmt = dyn_cast (gsi_stmt (*gsi)); + if (!stmt) +return NULL; + + tree lhs = gimple_assign_lhs (stmt); + code = TREE_CODE (lhs); + if (code != ARRAY_REF) +return NULL; + + tree val = gimple_assign_rhs1 (stmt); + tree op0 = TREE_OPERAND (lhs, 0); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) + &
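To make the trigger concrete: a minimal, target-independent input (my own example, not one of the new tests) whose gimple has exactly the ARRAY_REF-of-VIEW_CONVERT_EXPR shape shown in the comment above. On a target whose vec_set pattern accepts a variable index (the can_vec_set_var_idx_p check), the new isel hook rewrites the store into a .VEC_SET call:

typedef int v4si __attribute__ ((vector_size (16)));

v4si
set_lane (v4si u, int i, unsigned long n)
{
  /* Roughly: _1 = n & 3; VIEW_CONVERT_EXPR<int[4]>(u)[_1] = i;  */
  u[n & 3] = i;
  return u;
}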
Re: [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
On 2020/9/25 21:28, Richard Sandiford wrote: > xionghu luo writes: >> @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall >> *stmt, convert_optab optab) >> >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn >> >> +/* Expand VEC_SET internal functions. */ >> + >> +static void >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> +{ >> + tree lhs = gimple_call_lhs (stmt); >> + tree op0 = gimple_call_arg (stmt, 0); >> + tree op1 = gimple_call_arg (stmt, 1); >> + tree op2 = gimple_call_arg (stmt, 2); >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); >> + rtx src = expand_normal (op0); >> + >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); >> + scalar_mode innermode = GET_MODE_INNER (outermode); >> + >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); > > These two can just use expand_normal. Might be easier to read if > they come immediately after the expand_normal (op0). > > LGTM with that change for the internal-fn.c stuff, thanks. > Thank you, updated and committed as r11-3486. Tested and confirmed Power/X86/ARM still not supporting vec_set with register index, so there are no ICE regressions caused by generating IFN VEC_SET but not properly expanded. Thanks, Xionghu
[PATCH 1/4] rs6000: Change rs6000_expand_vector_set param
rs6000_expand_vector_set could accept insert either to constant position or variable position, so change the operand to reg_or_cint_operand. gcc/ChangeLog: 2020-10-10 Xionghu Luo * config/rs6000/rs6000-call.c (altivec_expand_vec_set_builtin): Change call param 2 from type int to rtx. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set): Likewise. * config/rs6000/rs6000.c (rs6000_expand_vector_init): Change call param 2 from type int to rtx. (rs6000_expand_vector_set): Likewise. * config/rs6000/vector.md (vec_set): Support both constant and variable index vec_set. --- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.c| 16 +--- gcc/config/rs6000/vector.md | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index a8b520834c7..2608a2a0797 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -10655,7 +10655,7 @@ altivec_expand_vec_set_builtin (tree exp) op0 = force_reg (tmode, op0); op1 = force_reg (mode1, op1); - rs6000_expand_vector_set (op0, op1, elt); + rs6000_expand_vector_set (op0, op1, GEN_INT (elt)); return op0; } diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 25fa5dd57cd..3578136e79b 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -57,7 +57,7 @@ extern bool rs6000_move_128bit_ok_p (rtx []); extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); -extern void rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_set (rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 375fff59928..a5b59395abd 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6678,7 +6678,8 @@ rs6000_expand_vector_init (rtx target, rtx vals) rs6000_expand_vector_init (target, copy); /* Insert variable. */ - rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), one_var); + rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), + GEN_INT (one_var)); return; } @@ -6692,10 +6693,10 @@ rs6000_expand_vector_init (rtx target, rtx vals) emit_move_insn (target, mem); } -/* Set field ELT of TARGET to VAL. */ +/* Set field ELT_RTX of TARGET to VAL. */ void -rs6000_expand_vector_set (rtx target, rtx val, int elt) +rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); @@ -6709,7 +6710,6 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) if (VECTOR_MEM_VSX_P (mode)) { rtx insn = NULL_RTX; - rtx elt_rtx = GEN_INT (elt); if (mode == V2DFmode) insn = gen_vsx_set_v2df (target, target, val, elt_rtx); @@ -6736,8 +6736,11 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) } } + gcc_assert (CONST_INT_P (elt_rtx)); + /* Simplify setting single element vectors like V1TImode. 
*/ - if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode) && elt == 0) + if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode) + && INTVAL (elt_rtx) == 0) { emit_move_insn (target, gen_lowpart (mode, val)); return; @@ -6760,8 +6763,7 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) /* Set permute mask to insert element into target. */ for (i = 0; i < width; ++i) -XVECEXP (mask, 0, elt*width + i) - = GEN_INT (i + 0x10); +XVECEXP (mask, 0, INTVAL (elt_rtx) * width + i) = GEN_INT (i + 0x10); x = gen_rtx_CONST_VECTOR (V16QImode, XVEC (mask, 0)); if (BYTES_BIG_ENDIAN) diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 796345c80d3..7aab1887cf5 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1227,10 +1227,10 @@ (define_expand "vec_init" (define_expand "vec_set" [(match_operand:VEC_E 0 "vlogical_operand") (match_operand: 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "reg_or_cint_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" { - rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2])); + rs6000_expand_vector_set (operands[0], operands[1], operands[2]); DONE; }) -- 2.25.1
[PATCH 4/4] rs6000: Update testcases' instruction count
gcc/testsuite/ChangeLog: 2020-10-10 Xionghu Luo * gcc.target/powerpc/fold-vec-insert-char-p8.c: Adjust instruction counts. * gcc.target/powerpc/fold-vec-insert-char-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-double.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-longlong.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p9.c: Likewise. * gcc.target/powerpc/vsx-builtin-7.c: Likewise. --- .../gcc.target/powerpc/fold-vec-insert-char-p8.c | 11 ++- .../gcc.target/powerpc/fold-vec-insert-char-p9.c | 12 ++-- .../gcc.target/powerpc/fold-vec-insert-double.c | 11 --- .../gcc.target/powerpc/fold-vec-insert-float-p8.c| 6 +++--- .../gcc.target/powerpc/fold-vec-insert-float-p9.c| 10 +- .../gcc.target/powerpc/fold-vec-insert-int-p8.c | 9 + .../gcc.target/powerpc/fold-vec-insert-int-p9.c | 11 +-- .../gcc.target/powerpc/fold-vec-insert-longlong.c| 10 +++--- .../gcc.target/powerpc/fold-vec-insert-short-p8.c| 9 + .../gcc.target/powerpc/fold-vec-insert-short-p9.c| 8 gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c | 4 ++-- 11 files changed, 52 insertions(+), 49 deletions(-) diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p8.c b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p8.c index b13c8ca19c7..1ad23de99a9 100644 --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p8.c +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p8.c @@ -44,15 +44,16 @@ vector unsigned char testuu_cst (unsigned char x, vector unsigned char v) return vec_insert (x, v, 12); } -/* one store per _var test */ -/* { dg-final { scan-assembler-times {\mstvx\M|\mstxvw4x\M} 4 } } */ +/* no store per _var test */ +/* { dg-final { scan-assembler-times {\mstvx\M|\mstxvw4x\M} 0 } } */ /* one store-byte per test */ -/* { dg-final { scan-assembler-times {\mstb\M} 8 } } */ +/* { dg-final { scan-assembler-times {\mstb\M} 4 } } */ /* one load per test */ -/* { dg-final { scan-assembler-times {\mlvx\M|\mlxvw4x\M} 8 } } */ +/* { dg-final { scan-assembler-times {\mlvx\M|\mlxvw4x\M} 8 { target le } } } */ +/* { dg-final { scan-assembler-times {\mlvx\M|\mlxvw4x\M} 4 { target be } } } */ /* one lvebx per _cst test.*/ /* { dg-final { scan-assembler-times {\mlvebx\M} 4 } } */ /* one vperm per _cst test.*/ -/* { dg-final { scan-assembler-times {\mvperm\M} 4 } } */ +/* { dg-final { scan-assembler-times {\mvperm\M} 12 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p9.c b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p9.c index 16432289d68..400caa31bb4 100644 --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p9.c +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p9.c @@ -44,13 +44,13 @@ vector unsigned char testuu_cst (unsigned char x, vector unsigned char v) return vec_insert (x, v, 12); } -/* load immediate, add, store, stb, load variable test. */ -/* { dg-final { scan-assembler-times {\mstxv\M|\mstvx\M} 4 { target lp64 } } } */ -/* { dg-final { scan-assembler-times {\mstb\M} 4 { target lp64 } } } */ -/* { dg-final { scan-assembler-times {\mlvebx\M|\mlxv\M|\mlvx\M} 4 { target lp64} } } */ +/* no store per _var test. 
*/ +/* { dg-final { scan-assembler-times {\mstxv\M|\mstvx\M} 0 { target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mstb\M} 0 { target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mlvebx\M|\mlxv\M|\mlvx\M} 0 { target lp64} } } */ /* an insert and a move per constant test. */ -/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 4 { target lp64 } } } */ -/* { dg-final { scan-assembler-times {\mvinsertb\M} 4 { target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 8 { target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mvinsertb\M} 8 { target lp64 } } } */ /* -m32 codegen. */ /* { dg-final { scan-assembler-times {\mrlwinm\M} 4 { target ilp32 } } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-double.c b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-double.c index 435d28d5420..842fe9bbcad 100644 --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-double.c +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-double.c @@ -23,7 +23,12 @@ testd_cst (double d, vector double vd) /* { dg-final { scan-assembler {\mxxpermdi\M} } } */ /* { dg-final { scan-assembler-times {\mrldic\M|\mrlwinm\M} 1 } } */ -/* { dg-final { scan-assembler-times {\mstxvd2x\M|\mstxv\M|\mstvx\M} 1 } } */ -/* { dg-final { scan-assembler-times {\mstfdx
[PATCH 3/4] rs6000: Enable vec_insert for P8 with rs6000_expand_vector_set_var_p8
gcc/ChangeLog: 2020-10-10 Xionghu Luo * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Generate ARRAY_REF(VIEW_CONVERT_EXPR) for P8 and later platforms. * config/rs6000/rs6000.c (rs6000_expand_vector_set_var): Update to call different path for P8 and P9. (rs6000_expand_vector_set_var_p9): New function. (rs6000_expand_vector_set_var_p8): New function. gcc/testsuite/ChangeLog: 2020-10-10 Xionghu Luo * gcc.target/powerpc/pr79251.p8.c: New test. --- gcc/config/rs6000/rs6000-c.c | 27 +++- gcc/config/rs6000/rs6000.c| 117 +- gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 +++ 3 files changed, 155 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p8.c diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 5551a21d738..4bea8001ec6 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1599,10 +1599,29 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, SET_EXPR_LOCATION (stmt, loc); stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - stmt = build_array_ref (loc, stmt, arg2); - stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, - convert (TREE_TYPE (stmt), arg0)); - stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + + if (TARGET_P8_VECTOR) + { + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, + convert (TREE_TYPE (stmt), arg0)); + stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + } + else + { + tree arg1_inner_type; + tree innerptrtype; + arg1_inner_type = TREE_TYPE (arg1_type); + innerptrtype = build_pointer_type (arg1_inner_type); + + stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); + stmt = convert (innerptrtype, stmt); + stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); + stmt = build_indirect_ref (loc, stmt, RO_NULL); + stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, +convert (TREE_TYPE (stmt), arg0)); + stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + } return stmt; } diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 96f76c7a74c..33ca839cb28 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6806,10 +6806,10 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) } /* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX - is variable and also counts by vector element size. */ + is variable and also counts by vector element size for p9 and above. */ void -rs6000_expand_vector_set_var (rtx target, rtx val, rtx idx) +rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) { machine_mode mode = GET_MODE (target); @@ -6852,6 +6852,119 @@ rs6000_expand_vector_set_var (rtx target, rtx val, rtx idx) emit_insn (perml); } +/* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX + is variable and also counts by vector element size for p8. */ + +void +rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) +{ + machine_mode mode = GET_MODE (target); + + gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); + + gcc_assert (GET_MODE (idx) == E_SImode); + + machine_mode inner_mode = GET_MODE (val); + HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); + + rtx tmp = gen_reg_rtx (GET_MODE (idx)); + int width = GET_MODE_SIZE (inner_mode); + + gcc_assert (width >= 1 && width <= 4); + + if (!BYTES_BIG_ENDIAN) +{ + /* idx = idx * width. */ + emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); + /* idx = idx + 8. 
*/ + emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (8))); +} + else +{ + emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); + emit_insn (gen_subsi3 (tmp, GEN_INT (24 - width), tmp)); +} + + /* lxv vs33, mask. + DImode: 0x + SImode: 0x + HImode: 0x. + QImode: 0x00ff. */ + rtx mask = gen_reg_rtx (V16QImode); + rtx mask_v2di = gen_reg_rtx (V2DImode); + rtvec v = rtvec_alloc (2); + if (!BYTES_BIG_ENDIAN) +{ + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); +} + else +{ + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); +} + emit_insn (gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); + rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0); +
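Since the hunk above is truncated before the final merge, here is a sketch in GNU vector extensions of what the Power8 path does for V4SI, as I read it: rotate a constant all-ones element mask to the variable lane, then select between the splatted scalar and the original vector. Lane numbering, byte order and the concluding select are simplified guesses, not the emitted RTL:

typedef int v4si __attribute__ ((vector_size (16)));

v4si
insert_var_p8_model (v4si v, int val, unsigned idx)
{
  idx &= 3;
  /* All-ones mask in lane 0, rotated so the all-ones lane becomes IDX
     (modelling the lvsl/vperm of the loaded mask constant).  */
  v4si mask0 = { -1, 0, 0, 0 };
  v4si rot = { (0 - idx) & 3, (1 - idx) & 3, (2 - idx) & 3, (3 - idx) & 3 };
  v4si mask = __builtin_shuffle (mask0, rot);
  /* Splat the scalar and merge under the mask (vsel-style).  */
  v4si splat = { val, val, val, val };
  return (splat & mask) | (v & ~mask);
}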
[PATCH 2/4] rs6000: Support variable insert and Expand vec_insert in expander [PR79251]
vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value to be insert, arg2 is the place to insert arg1 to arg0. Current expander generates stxv+stwx+lxv if arg2 is variable instead of constant, which causes serious store hit load performance issue on Power. This patch tries 1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n&3] = i to unify the gimple code, then expander could use vec_set_optab to expand. 2) Expand the IFN VEC_SET to fast instructions: lvsr+insert+lvsl. In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too early in gimple stage if arg2 is variable, avoid generating store hit load instructions. For Power9 V4SI: addi 9,1,-16 rldic 6,6,2,60 stxv 34,-16(1) stwx 5,9,6 lxv 34,-16(1) => rlwinm 6,6,2,28,29 mtvsrwz 0,5 lvsr 1,0,6 lvsl 0,0,6 xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 Though instructions increase from 5 to 7, the performance is improved 60% in typical cases. Tested with V2DI, V2DF V4SI, V4SF, V8HI, V16QI on Power9-LE. gcc/ChangeLog: 2020-10-10 Xionghu Luo * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Ajdust variable index vec_insert from address dereference to ARRAY_REF(VIEW_CONVERT_EXPR) tree expression. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var): New declaration. * config/rs6000/rs6000.c (rs6000_expand_vector_set_var): New function. * config/rs6000/vector.md (vec_set): Support both constant and variable index vec_set. gcc/testsuite/ChangeLog: 2020-10-10 Xionghu Luo * gcc.target/powerpc/pr79251.p9.c: New test. * gcc.target/powerpc/pr79251-run.c: New test. * gcc.target/powerpc/pr79251.h: New header. --- gcc/config/rs6000/rs6000-c.c | 25 - gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c| 53 +++ .../gcc.target/powerpc/pr79251-run.c | 28 ++ gcc/testsuite/gcc.target/powerpc/pr79251.h| 19 +++ gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 18 +++ 6 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p9.c diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index cc1e997524e..5551a21d738 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1512,9 +1512,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, tree arg1; tree arg2; tree arg1_type; - tree arg1_inner_type; tree decl, stmt; - tree innerptrtype; machine_mode mode; /* No second or third arguments. */ @@ -1566,8 +1564,13 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return build_call_expr (call, 3, arg1, arg0, arg2); } - /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ - arg1_inner_type = TREE_TYPE (arg1_type); + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with +VIEW_CONVERT_EXPR. 
i.e.: +D.3192 = v1; +_1 = n & 3; +VIEW_CONVERT_EXPR(D.3192)[_1] = i; +v1 = D.3192; +D.3194 = v1; */ if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) arg2 = build_int_cst (TREE_TYPE (arg2), 0); else @@ -1582,6 +1585,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, TREE_USED (decl) = 1; TREE_TYPE (decl) = arg1_type; TREE_READONLY (decl) = TYPE_READONLY (arg1_type); + TREE_ADDRESSABLE (decl) = 1; if (c_dialect_cxx ()) { stmt = build4 (TARGET_EXPR, arg1_type, decl, arg1, @@ -1592,19 +1596,12 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, { DECL_INITIAL (decl) = arg1; stmt = build1 (DECL_EXPR, arg1_type, decl); - TREE_ADDRESSABLE (decl) = 1; SET_EXPR_LOCATION (stmt, loc); stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - - innerptrtype = build_pointer_type (arg1_inner_type); - - stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); - stmt = convert (innerptrtype, stmt); - stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); - stmt = build_indirect_ref (loc, stmt, RO_NULL); - stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, -convert (TREE_TYPE (stmt), arg0)); + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, + convert (TREE_TYPE (stmt), arg0)); stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); return stmt; } d
[PATCH 0/4] rs6000: Enable variable vec_insert with IFN VEC_SET
Originated from https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554240.html with patch split and some refinement per review comments. Patch of IFN VEC_SET for ARRAY_REF(VIEW_CONVERT_EXPR) is committed, this patch set enables expanding IFN VEC_SET for Power9 and Power8 with specfic instruction sequences. Xionghu Luo (4): rs6000: Change rs6000_expand_vector_set param rs6000: Support variable insert and Expand vec_insert in expander [PR79251] rs6000: Enable vec_insert for P8 with rs6000_expand_vector_set_var_p8 rs6000: Update testcases' instruction count gcc/config/rs6000/rs6000-c.c | 44 +++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 3 +- gcc/config/rs6000/rs6000.c| 181 +- gcc/config/rs6000/vector.md | 4 +- .../powerpc/fold-vec-insert-char-p8.c | 8 +- .../powerpc/fold-vec-insert-char-p9.c | 12 +- .../powerpc/fold-vec-insert-double.c | 11 +- .../powerpc/fold-vec-insert-float-p8.c| 6 +- .../powerpc/fold-vec-insert-float-p9.c| 10 +- .../powerpc/fold-vec-insert-int-p8.c | 6 +- .../powerpc/fold-vec-insert-int-p9.c | 11 +- .../powerpc/fold-vec-insert-longlong.c| 10 +- .../powerpc/fold-vec-insert-short-p8.c| 6 +- .../powerpc/fold-vec-insert-short-p9.c| 8 +- .../gcc.target/powerpc/pr79251-run.c | 28 +++ gcc/testsuite/gcc.target/powerpc/pr79251.h| 19 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 18 ++ .../gcc.target/powerpc/vsx-builtin-7.c| 4 +- 20 files changed, 337 insertions(+), 71 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p8.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p9.c -- 2.25.1
[PATCH] Fix incorrect loop exit edge probability [PR103270]
r12-4526 cancelled jump thread path rotates loop. It exposes a issue in profile-estimate when predict_extra_loop_exits, outer loop's exit edge is marked as inner loop's extra loop exit and set with incorrect prediction, then a hot inner loop will become cold loop finally through optimizations, this patch ignores the EDGE_DFS_BACK edge when searching extra exit edges to avoid unexpected predict_edge. gcc/ChangeLog: PR middle-end/103270 * predict.c (predict_extra_loop_exits): Ignore EDGE_DFS_BACK edge. gcc/ChangeLog: PR middle-end/103270 * predict.c (predict_extra_loop_exits): New. --- gcc/predict.c | 4 1 file changed, 4 insertions(+) diff --git a/gcc/predict.c b/gcc/predict.c index 68b11135680..1ae8ccff72c 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -1910,6 +1910,10 @@ predict_extra_loop_exits (edge exit_edge) continue; if ((check_value_one ^ integer_onep (val)) == 1) continue; +#if 0 + if (e->flags & EDGE_DFS_BACK) + continue; +#endif if (EDGE_COUNT (e->src->succs) != 1) { predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); -- 2.25.1
Re: [PATCH] Fix incorrect loop exit edge probability [PR103270]
On 2021/11/23 13:51, Xionghu Luo wrote: > r12-4526 cancelled jump thread path rotates loop. It exposes a issue in > profile-estimate when predict_extra_loop_exits, outer loop's exit edge > is marked as inner loop's extra loop exit and set with incorrect > prediction, then a hot inner loop will become cold loop finally through > optimizations, this patch ignores the EDGE_DFS_BACK edge when searching > extra exit edges to avoid unexpected predict_edge. > > gcc/ChangeLog: > > PR middle-end/103270 > * predict.c (predict_extra_loop_exits): Ignore EDGE_DFS_BACK edge. > > gcc/ChangeLog: > > PR middle-end/103270 > * predict.c (predict_extra_loop_exits): New. > --- > gcc/predict.c | 4 > 1 file changed, 4 insertions(+) > > diff --git a/gcc/predict.c b/gcc/predict.c > index 68b11135680..1ae8ccff72c 100644 > --- a/gcc/predict.c > +++ b/gcc/predict.c > @@ -1910,6 +1910,10 @@ predict_extra_loop_exits (edge exit_edge) > continue; >if ((check_value_one ^ integer_onep (val)) == 1) > continue; > + if (e->flags & EDGE_DFS_BACK) > + continue; Sorry, made a mistake before send the patch, #if 0 #endif should be removed... >if (EDGE_COUNT (e->src->succs) != 1) > { > predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); > -- Thanks, Xionghu
[PATCH v2] Fix incorrect loop exit edge probability [PR103270]
On 2021/11/23 17:50, Jan Hubicka wrote: >> On Tue, Nov 23, 2021 at 6:52 AM Xionghu Luo wrote: >>> >>> r12-4526 cancelled jump thread path rotates loop. It exposes a issue in >>> profile-estimate when predict_extra_loop_exits, outer loop's exit edge >>> is marked as inner loop's extra loop exit and set with incorrect >>> prediction, then a hot inner loop will become cold loop finally through >>> optimizations, this patch ignores the EDGE_DFS_BACK edge when searching >>> extra exit edges to avoid unexpected predict_edge. >> >> Not sure how outer vs. inner loop exit correlates with EDGE_DFS_BACK, >> I have expected a check based on which loop is exited by the edge instead? >> A backedge should never be an exit, no? >> >> Note that the profile pass does not yet mark backedges so EDGE_DFS_BACK >> settings are unreliable. > > So we have two nested loops and an exit which goes from inner loop and > exists both loops. While processing outer loop we set pretty high exit > probability that is not good for inner loop? No, the edge only belongs to outer loop only. Can an exit edge belongs to two different loops at the same time? Exit edges are iterated with LI_FROM_INNERMOST in predict_loops, if an edge already has prediction by querying edge_predicted_by_p, maybe_predict_edge will early return to not set it again. The CFG is: 2 | 8< // l1 | \ | 10 9 | | | 7 6 <// l2 | | 11| | | 4<- |// l3 | \| | 5 3 | | | -- l2's edge (6->11,6->7) is set to (33%,67%) by l3 unexpectedly. FYI: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103270#c5 > > I guess we could just check if exit edge source basic block has same > loop depth as the loop we are processing? > Thanks for the suggestion, it works. Loop checks already existed in predict_paths_for_bb, just need pass down the loop argument. Updated as v2 patch. v2-0001-Fix-incorrect-loop-exit-edge-probability-PR103270.patch r12-4526 cancelled jump thread path rotates loop. It exposes a issue in profile-estimate when predict_extra_loop_exits, outer loop's exit edge is marked as inner loop's extra loop exit and set with incorrect prediction, then a hot inner loop will become cold loop finally through optimizations, this patch add loop check when searching extra exit edges to avoid unexpected predict_edge from predict_paths_for_bb. Regression tested pass on P8 & x86, OK for master? gcc/ChangeLog: PR middle-end/103270 * predict.c (predict_extra_loop_exits): Add loop parameter. (predict_loops): Call with loop argument. gcc/testsuite/ChangeLog: PR middle-end/103270 * gcc.dg/pr103270.c: New test. --- gcc/predict.c | 10 ++ gcc/testsuite/gcc.dg/pr103270.c | 19 +++ 2 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr103270.c diff --git a/gcc/predict.c b/gcc/predict.c index 68b11135680..082782ec4e9 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -1859,7 +1859,7 @@ predict_iv_comparison (class loop *loop, basic_block bb, exits to predict them using PRED_LOOP_EXTRA_EXIT. 
*/ static void -predict_extra_loop_exits (edge exit_edge) +predict_extra_loop_exits (class loop *loop, edge exit_edge) { unsigned i; bool check_value_one; @@ -1912,12 +1912,14 @@ predict_extra_loop_exits (edge exit_edge) continue; if (EDGE_COUNT (e->src->succs) != 1) { - predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); + predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, +loop); continue; } FOR_EACH_EDGE (e1, ei, e->src->preds) - predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); + predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, + loop); } } @@ -2009,7 +2011,7 @@ predict_loops (void) ex->src->index, ex->dest->index); continue; } - predict_extra_loop_exits (ex); + predict_extra_loop_exits (loop, ex); if (number_of_iterations_exit (loop, ex, &niter_desc, false, false)) niter = niter_desc.niter; diff --git a/gcc/testsuite/gcc.dg/pr103270.c b/gcc/testsuite/gcc.dg/pr103270.c new file mode 100644 index 000..819310e360e --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr103270.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-profile_estimate" } */ + +void test(int a, int* i) +{ + for (; a < 5; ++a) +{ + int b = 0; + int c = 0; + for (; b != -11; b--) + for (int d = 0; d ==0; d++) + { + *i += c & a; + c = b; + } +} +} + +/* { dg-final { scan-tree-dump-not "extra loop exit heuristics of edge\[^:\]*:" "profile_estimate"} } */ -- 2.25.1
Re: [PATCH v3 1/4] Fix loop split incorrect count and probability
Gentle ping, thanks. [PATCH v3] Fix loop split incorrect count and probability https://gcc.gnu.org/pipermail/gcc-patches/2021-November/583626.html On 2021/11/8 14:09, Xionghu Luo via Gcc-patches wrote: > > > On 2021/10/27 15:44, Jan Hubicka wrote: >>> On Wed, 27 Oct 2021, Jan Hubicka wrote: >>> >>>>> >>>>> gcc/ChangeLog: >>>>> >>>>> * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. >>>>> (do_split_loop_on_cond): Likewise. >>>>> --- >>>>> gcc/tree-ssa-loop-split.c | 25 - >>>>> 1 file changed, 16 insertions(+), 9 deletions(-) >>>>> >>>>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >>>>> index 3f6ad046623..d30782888f3 100644 >>>>> --- a/gcc/tree-ssa-loop-split.c >>>>> +++ b/gcc/tree-ssa-loop-split.c >>>>> @@ -575,7 +575,11 @@ split_loop (class loop *loop1) >>>>> stmts2); >>>>> tree cond = build2 (guard_code, boolean_type_node, guard_init, border); >>>>> if (!initial_true) >>>>> - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); >>>>> + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); >>>>> + >>>>> + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE >>>>> +? EDGE_SUCC (bbs[i], 0) >>>>> +: EDGE_SUCC (bbs[i], 1); >>>>> >>>>> /* Now version the loop, placing loop2 after loop1 connecting >>>>> them, and fix up SSA form for that. */ >>>>> @@ -583,10 +587,10 @@ split_loop (class loop *loop1) >>>>> basic_block cond_bb; >>>>> >>>>> class loop *loop2 = loop_version (loop1, cond, &cond_bb, >>>>> -profile_probability::always (), >>>>> -profile_probability::always (), >>>>> -profile_probability::always (), >>>>> -profile_probability::always (), >>>>> +true_edge->probability, >>>>> +true_edge->probability.invert (), >>>>> +true_edge->probability, >>>>> +true_edge->probability.invert (), >>>>> true); >>>> >>>> As discussed yesterday, for loop of form >>>> >>>> for (...) >>>> if (cond) >>>> cond = something(); >>>> else >>>> something2 >>>> >>>> Split as >>> >>> Note that you are missing to conditionalize loop1 execution >>> on 'cond' (not sure if that makes a difference). >> You are right - forgot to mention that. >> >> Entry conditional makes no difference on scaling stmts inside loop but >> affects its header and expected trip count. We however need to set up >> probability of this conditional (and preheader count if it exists) >> There is no general way to read the probability of this initial >> conditional from cfg profile. So I guess we are stuck with guessing >> some arbitrary value. I guess common case is that cond is true first >> iteration tough and often we can easily see that fromo PHI node >> initializing the test variable. >> >> Other thing that changes is expected number of iterations of the split >> loops, so we may want to update the exit conditinal probability >> accordingly... >> > Sorry for the late reply. The below updated patch mainly solves the issues > you pointed out: > - profile count proportion for both original loop and copied loop > without dropping down the true branch's count; > - probability update in the two loops and between the two loops; > - number of iterations update/check for split_loop. > > > [PATCH v3] Fix loop split incorrect count and probability > > In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two > kind of split. split_loop only works for single loop and insert edge at > exit when split, while split_loop_on_cond is not limited to single loop > and insert edge at latch when split. Both split behavior should consider > loop count and probability update. 
For split_loop, loop split condition
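For readers without the whole patch in front of them, this is the canonical shape split_loop versions (my own illustration, not a test from the patch); the guard selecting between the two loop copies is exactly where the count and probability scaling discussed above has to be applied:

/* The IV comparison i < m holds for a leading range of iterations and
   fails for the rest, so the pass splits the loop into two copies and
   removes the branch from both bodies.  */
void
split_me (int *a, int *b, int *c, int n, int m)
{
  for (int i = 0; i < n; i++)
    {
      if (i < m)
        a[i] = b[i];
      else
        a[i] = c[i];
    }
}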
Ping: [PATCH v7 2/2] Don't move cold code out of loop by checking bb count
Gentle ping and is this patch still suitable for stage 3? Thanks. [PATCH v7 2/2] Don't move cold code out of loop by checking bb count https://gcc.gnu.org/pipermail/gcc-patches/2021-November/583911.html On 2021/11/10 11:08, Xionghu Luo via Gcc-patches wrote: > > > On 2021/11/4 21:00, Richard Biener wrote: >> On Wed, Nov 3, 2021 at 2:29 PM Xionghu Luo wrote: >>> >>> >>>> + while (outmost_loop != loop) >>>> +{ >>>> + if (bb_colder_than_loop_preheader (loop_preheader_edge >>>> (outmost_loop)->src, >>>> +loop_preheader_edge >>>> (cold_loop)->src)) >>>> + cold_loop = outmost_loop; >>>> + outmost_loop = superloop_at_depth (loop, loop_depth (outmost_loop) >>>> + 1); >>>> +} >>>> >>>> could be instead written as >>>> >>>> coldest_loop = coldest_outermost_loop[loop->num]; >>>> if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) >>>> return outermost_loop; >>>> return coldest_loop; >>>> >>>> ? And in the usual case coldest_outermost_loop[L] would be the loop tree >>>> root. >>>> It should be possible to compute such cache in a DFS walk of the loop tree >>>> (the loop iterator by default visits in such order). >>> >>> >>> Thanks. Updated the patch with your suggestion. Not sure whether it >>> strictly >>> conforms to your comments. Though the patch passed all my added >>> tests(coverage not enough), >>> I am still a bit worried if pre-computed coldest_loop is outside of >>> outermost_loop, but >>> outermost_loop is not the COLDEST LOOP, i.e. (outer->inner) >>> >>> [loop tree root, coldest_loop, outermost_loop,..., second_coldest_loop, >>> ..., loop], >>> >>> then function find_coldest_out_loop will return a loop NOT accord with our >>> expectation, that should return second_coldest_loop instead of >>> outermost_loop? >> Hmm, interesting - yes. I guess the common case will be that the >> pre-computed >> outermost loop will be the loop at depth 1 since outer loops tend to >> be colder than >> inner loops? That would then defeat the whole exercise. > > It is not easy to construct such cases, But finally I got below results, > > 1) many cases inner loop is hotter than outer loop, for example: > > loop 1's coldest_outermost_loop is 1, colder_than_inner_loop is NULL > loop 2's coldest_outermost_loop is 1, colder_than_inner_loop is 1 > loop 3's coldest_outermost_loop is 1, colder_than_inner_loop is 2 > loop 4's coldest_outermost_loop is 1, colder_than_inner_loop is 2 > > > 2) But there are also cases inner loop is colder than outer loop, like: > > loop 1's coldest outermost loop is 1, colder_than_inner_loop is NULL > loop 2's coldest outermost loop is 2, colder_than_inner_loop is NULL > loop 3's coldest outermost loop is 3, colder_than_inner_loop is NULL > > >> >> To optimize the common case but not avoiding iteration in the cases we care >> about we could instead cache the next outermost loop that is _not_ colder >> than loop. So for your [ ... ] example above we'd have> >> hotter_than_inner_loop[loop] == outer (second_coldest_loop), where the >> candidate would then be 'second_coldest_loop' and we'd then iterate >> to hotter_than_inner_loop[hotter_than_inner_loop[loop]] to find the next >> cold candidate we can compare against? For the common case we'd >> have hotter_than_inner_loop[looo] == NULL (no such loop) and we then >> simply pick 'outermost_loop'. > > Thanks. 
It was difficult to understand, but finally I got to know what you > want to express :) > > We should cache the next loop that is *colder* than loop instead of '_not_ > colder > than loop', and 'hotter_than_inner_loop' should be 'colder_than_inner_loop', > then it makes sense if the coldest loop is outside of outermost loop, > continue to > find a colder loop between outermost loop and current loop in > colder_than_inner_loop[loop->num]? Hope I understood you correctly... > >> >> One comment on the patch itself below. >> > > The loop in fill_cold_out_loop is also removed in the updated v7 patch. > > > > [PATCH v7 2/2] Don't move cold code out of loop by checking bb count > > From: Xiong Hu Luo > > v7 changes: > 1. Refine get_coldest
Re: [PATCH v8 2/2] Don't move cold code out of loop by checking bb count
On 2021/12/1 18:09, Richard Biener wrote: > On Wed, Nov 10, 2021 at 4:08 AM Xionghu Luo wrote: >> >> >> >> On 2021/11/4 21:00, Richard Biener wrote: >>> On Wed, Nov 3, 2021 at 2:29 PM Xionghu Luo wrote: >>>> >>>> >>>>> + while (outmost_loop != loop) >>>>> +{ >>>>> + if (bb_colder_than_loop_preheader (loop_preheader_edge >>>>> (outmost_loop)->src, >>>>> +loop_preheader_edge >>>>> (cold_loop)->src)) >>>>> + cold_loop = outmost_loop; >>>>> + outmost_loop = superloop_at_depth (loop, loop_depth (outmost_loop) >>>>> + 1); >>>>> +} >>>>> >>>>> could be instead written as >>>>> >>>>> coldest_loop = coldest_outermost_loop[loop->num]; >>>>> if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) >>>>> return outermost_loop; >>>>> return coldest_loop; >>>>> >>>>> ? And in the usual case coldest_outermost_loop[L] would be the loop tree >>>>> root. >>>>> It should be possible to compute such cache in a DFS walk of the loop tree >>>>> (the loop iterator by default visits in such order). >>>> >>>> >>>> Thanks. Updated the patch with your suggestion. Not sure whether it >>>> strictly >>>> conforms to your comments. Though the patch passed all my added >>>> tests(coverage not enough), >>>> I am still a bit worried if pre-computed coldest_loop is outside of >>>> outermost_loop, but >>>> outermost_loop is not the COLDEST LOOP, i.e. (outer->inner) >>>> >>>> [loop tree root, coldest_loop, outermost_loop,..., second_coldest_loop, >>>> ..., loop], >>>> >>>> then function find_coldest_out_loop will return a loop NOT accord with our >>>> expectation, that should return second_coldest_loop instead of >>>> outermost_loop? >>> Hmm, interesting - yes. I guess the common case will be that the >>> pre-computed >>> outermost loop will be the loop at depth 1 since outer loops tend to >>> be colder than >>> inner loops? That would then defeat the whole exercise. >> >> It is not easy to construct such cases, But finally I got below results, >> >> 1) many cases inner loop is hotter than outer loop, for example: >> >> loop 1's coldest_outermost_loop is 1, colder_than_inner_loop is NULL >> loop 2's coldest_outermost_loop is 1, colder_than_inner_loop is 1 >> loop 3's coldest_outermost_loop is 1, colder_than_inner_loop is 2 >> loop 4's coldest_outermost_loop is 1, colder_than_inner_loop is 2 >> >> >> 2) But there are also cases inner loop is colder than outer loop, like: >> >> loop 1's coldest outermost loop is 1, colder_than_inner_loop is NULL >> loop 2's coldest outermost loop is 2, colder_than_inner_loop is NULL >> loop 3's coldest outermost loop is 3, colder_than_inner_loop is NULL >> >> >>> >>> To optimize the common case but not avoiding iteration in the cases we care >>> about we could instead cache the next outermost loop that is _not_ colder >>> than loop. So for your [ ... ] example above we'd have> >>> hotter_than_inner_loop[loop] == outer (second_coldest_loop), where the >>> candidate would then be 'second_coldest_loop' and we'd then iterate >>> to hotter_than_inner_loop[hotter_than_inner_loop[loop]] to find the next >>> cold candidate we can compare against? For the common case we'd >>> have hotter_than_inner_loop[looo] == NULL (no such loop) and we then >>> simply pick 'outermost_loop'. >> >> Thanks. 
It was difficult to understand, but finally I got to know what you >> want to express :) >> >> We should cache the next loop that is *colder* than loop instead of '_not_ >> colder >> than loop', and 'hotter_than_inner_loop' should be 'colder_than_inner_loop', >> then it makes sense if the coldest loop is outside of outermost loop, >> continue to >> find a colder loop between outermost loop and current loop in >> colder_than_inner_loop[loop->num]? Hope I understood you correctly... > > Heh, looking at the patch - I don't know. > > To make the calls to bb_colder_than_loop_preheader more obvious can you > change that
Ping: [PATCH v2] Fix incorrect loop exit edge probability [PR103270]
Hi Honza, Gentle ping for this :), thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585289.html On 2021/11/24 13:03, Xionghu Luo via Gcc-patches wrote: > On 2021/11/23 17:50, Jan Hubicka wrote: >>> On Tue, Nov 23, 2021 at 6:52 AM Xionghu Luo wrote: >>>> >>>> r12-4526 cancelled jump thread path rotates loop. It exposes a issue in >>>> profile-estimate when predict_extra_loop_exits, outer loop's exit edge >>>> is marked as inner loop's extra loop exit and set with incorrect >>>> prediction, then a hot inner loop will become cold loop finally through >>>> optimizations, this patch ignores the EDGE_DFS_BACK edge when searching >>>> extra exit edges to avoid unexpected predict_edge. >>> >>> Not sure how outer vs. inner loop exit correlates with EDGE_DFS_BACK, >>> I have expected a check based on which loop is exited by the edge instead? >>> A backedge should never be an exit, no? >>> >>> Note that the profile pass does not yet mark backedges so EDGE_DFS_BACK >>> settings are unreliable. >> >> So we have two nested loops and an exit which goes from inner loop and >> exists both loops. While processing outer loop we set pretty high exit >> probability that is not good for inner loop? > > No, the edge only belongs to outer loop only. Can an exit edge belongs to > two different loops at the same time? > Exit edges are iterated with LI_FROM_INNERMOST in predict_loops, if an edge > already has prediction by querying edge_predicted_by_p, maybe_predict_edge > will early return to not set it again. > > The CFG is: > > 2 > | > 8< // l1 > | \ | > 10 9 | > | | > 7 > 6 <// l2 > | | > 11| > | | > 4<- |// l3 > | \| | > 5 3 | > | | > -- > > l2's edge (6->11,6->7) is set to (33%,67%) by l3 unexpectedly. > > FYI: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103270#c5 > >> >> I guess we could just check if exit edge source basic block has same >> loop depth as the loop we are processing? >> > > > Thanks for the suggestion, it works. Loop checks already existed in > predict_paths_for_bb, just need pass down the loop argument. > Updated as v2 patch. > > > v2-0001-Fix-incorrect-loop-exit-edge-probability-PR103270.patch > > r12-4526 cancelled jump thread path rotates loop. It exposes a issue in > profile-estimate when predict_extra_loop_exits, outer loop's exit edge > is marked as inner loop's extra loop exit and set with incorrect > prediction, then a hot inner loop will become cold loop finally through > optimizations, this patch add loop check when searching extra exit edges > to avoid unexpected predict_edge from predict_paths_for_bb. > > Regression tested pass on P8 & x86, OK for master? > > gcc/ChangeLog: > > PR middle-end/103270 > * predict.c (predict_extra_loop_exits): Add loop parameter. > (predict_loops): Call with loop argument. > > gcc/testsuite/ChangeLog: > > PR middle-end/103270 > * gcc.dg/pr103270.c: New test. > --- > gcc/predict.c | 10 ++ > gcc/testsuite/gcc.dg/pr103270.c | 19 +++ > 2 files changed, 25 insertions(+), 4 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/pr103270.c > > diff --git a/gcc/predict.c b/gcc/predict.c > index 68b11135680..082782ec4e9 100644 > --- a/gcc/predict.c > +++ b/gcc/predict.c > @@ -1859,7 +1859,7 @@ predict_iv_comparison (class loop *loop, basic_block bb, > exits to predict them using PRED_LOOP_EXTRA_EXIT. 
*/ > > static void > -predict_extra_loop_exits (edge exit_edge) > +predict_extra_loop_exits (class loop *loop, edge exit_edge) > { >unsigned i; >bool check_value_one; > @@ -1912,12 +1912,14 @@ predict_extra_loop_exits (edge exit_edge) > continue; >if (EDGE_COUNT (e->src->succs) != 1) > { > - predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); > + predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, > + loop); > continue; > } > >FOR_EACH_EDGE (e1, ei, e->src->preds) > - predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); > + predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, > +loop); > } > } > > @@ -2009,7 +2011,7 @@ predict_loops (void) >ex->src->index, ex->dest->index); > continue; >
Re: [PATCH v8 2/2] Don't move cold code out of loop by checking bb count
On 2021/12/6 13:09, Xionghu Luo via Gcc-patches wrote: > > > On 2021/12/1 18:09, Richard Biener wrote: >> On Wed, Nov 10, 2021 at 4:08 AM Xionghu Luo wrote: >>> >>> >>> >>> On 2021/11/4 21:00, Richard Biener wrote: >>>> On Wed, Nov 3, 2021 at 2:29 PM Xionghu Luo wrote: >>>>> >>>>> >>>>>> + while (outmost_loop != loop) >>>>>> +{ >>>>>> + if (bb_colder_than_loop_preheader (loop_preheader_edge >>>>>> (outmost_loop)->src, >>>>>> +loop_preheader_edge >>>>>> (cold_loop)->src)) >>>>>> + cold_loop = outmost_loop; >>>>>> + outmost_loop = superloop_at_depth (loop, loop_depth >>>>>> (outmost_loop) + 1); >>>>>> +} >>>>>> >>>>>> could be instead written as >>>>>> >>>>>> coldest_loop = coldest_outermost_loop[loop->num]; >>>>>> if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) >>>>>> return outermost_loop; >>>>>> return coldest_loop; >>>>>> >>>>>> ? And in the usual case coldest_outermost_loop[L] would be the loop >>>>>> tree root. >>>>>> It should be possible to compute such cache in a DFS walk of the loop >>>>>> tree >>>>>> (the loop iterator by default visits in such order). >>>>> >>>>> >>>>> Thanks. Updated the patch with your suggestion. Not sure whether it >>>>> strictly >>>>> conforms to your comments. Though the patch passed all my added >>>>> tests(coverage not enough), >>>>> I am still a bit worried if pre-computed coldest_loop is outside of >>>>> outermost_loop, but >>>>> outermost_loop is not the COLDEST LOOP, i.e. (outer->inner) >>>>> >>>>> [loop tree root, coldest_loop, outermost_loop,..., second_coldest_loop, >>>>> ..., loop], >>>>> >>>>> then function find_coldest_out_loop will return a loop NOT accord with our >>>>> expectation, that should return second_coldest_loop instead of >>>>> outermost_loop? >>>> Hmm, interesting - yes. I guess the common case will be that the >>>> pre-computed >>>> outermost loop will be the loop at depth 1 since outer loops tend to >>>> be colder than >>>> inner loops? That would then defeat the whole exercise. >>> >>> It is not easy to construct such cases, But finally I got below results, >>> >>> 1) many cases inner loop is hotter than outer loop, for example: >>> >>> loop 1's coldest_outermost_loop is 1, colder_than_inner_loop is NULL >>> loop 2's coldest_outermost_loop is 1, colder_than_inner_loop is 1 >>> loop 3's coldest_outermost_loop is 1, colder_than_inner_loop is 2 >>> loop 4's coldest_outermost_loop is 1, colder_than_inner_loop is 2 >>> >>> >>> 2) But there are also cases inner loop is colder than outer loop, like: >>> >>> loop 1's coldest outermost loop is 1, colder_than_inner_loop is NULL >>> loop 2's coldest outermost loop is 2, colder_than_inner_loop is NULL >>> loop 3's coldest outermost loop is 3, colder_than_inner_loop is NULL >>> >>> >>>> >>>> To optimize the common case but not avoiding iteration in the cases we care >>>> about we could instead cache the next outermost loop that is _not_ colder >>>> than loop. So for your [ ... ] example above we'd have> >>>> hotter_than_inner_loop[loop] == outer (second_coldest_loop), where the >>>> candidate would then be 'second_coldest_loop' and we'd then iterate >>>> to hotter_than_inner_loop[hotter_than_inner_loop[loop]] to find the next >>>> cold candidate we can compare against? For the common case we'd >>>> have hotter_than_inner_loop[looo] == NULL (no such loop) and we then >>>> simply pick 'outermost_loop'. >>> >>> Thanks. 
It was difficult to understand, but finally I got to know what you >>> want to express :) >>> >>> We should cache the next loop that is *colder* than loop instead of '_not_ >>> colder >>> than loop', and 'hotter_than_inner_loop' should be 'c
[PATCH 0/3] Dependency patches for hoisting LIM code to cold loop
This patchset is a re-collection of previously sent patches. Thanks Richard, "Don't move cold code out of loop by checking bb count" is approved [1], but there are still 3 prerequisite patches needed to supplement it and avoid regressions.

1) Patch [1/3] is the RTL part of not hoisting LIM code out of a cold loop; it improves perlbench by 7.69% [2].

2) Patch [2/3] fixes a test case regression in pr103270.c: after enabling the gimple part of hoisting LIM code to the coldest loop [1], the store instruction is no longer moved out of the inner loop. This is caused by a jump-threading patch unexpectedly turning a hot inner loop into a cold one; this patch recovers the inner loop to be hot [3].

3) As the data in [2] showed, besides the improvement there is also a small regression on SPEC2017 544.nab_r (-1.55%). After investigation, it turned out that the profile count and probability are not correctly adjusted in loop split; with patch [3/3], the only regression is also fixed. This version slightly updates [4] to fix ICEs.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-December/586319.html
[2] https://gcc.gnu.org/pipermail/gcc-patches/2021-September/580109.html
[3] https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585195.html
[4] https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585290.html

Xionghu Luo (3):
  loop-invariant: Don't move cold bb instructions to preheader in RTL
  Fix incorrect loop exit edge probability [PR103270]
  Fix loop split incorrect count and probability

 gcc/loop-invariant.c            | 10 ++--
 gcc/predict.c                   | 10 ++--
 gcc/tree-ssa-loop-split.c       | 85 +
 gcc/testsuite/gcc.dg/pr103270.c | 19
 4 files changed, 109 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr103270.c

-- 
2.25.1
[PATCH 1/3] loop-invariant: Don't move cold bb instructions to preheader in RTL
gcc/ChangeLog:

	* loop-invariant.c (find_invariants_bb): Check profile count
	before motion.
	(find_invariants_body): Add argument.
---
 gcc/loop-invariant.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c
index 5eee2e5c9f8..c61c8612fae 100644
--- a/gcc/loop-invariant.c
+++ b/gcc/loop-invariant.c
@@ -1183,9 +1183,14 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed)
    call.  */

 static void
-find_invariants_bb (basic_block bb, bool always_reached, bool always_executed)
+find_invariants_bb (class loop *loop, basic_block bb, bool always_reached,
+		    bool always_executed)
 {
   rtx_insn *insn;
+  basic_block preheader = loop_preheader_edge (loop)->src;
+
+  if (preheader->count > bb->count)
+    return;

   FOR_BB_INSNS (bb, insn)
     {
@@ -1214,8 +1219,7 @@ find_invariants_body (class loop *loop, basic_block *body,
   unsigned i;

   for (i = 0; i < loop->num_nodes; i++)
-    find_invariants_bb (body[i],
-			bitmap_bit_p (always_reached, i),
+    find_invariants_bb (loop, body[i], bitmap_bit_p (always_reached, i),
 			bitmap_bit_p (always_executed, i));
 }

--
2.25.1
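To illustrate the intent of the new profile-count check, consider a hand-written sketch like the one below (this is not the added testcase, and whether the guard actually fires depends on the real or estimated profile): an invariant computation sitting in a block that the profile says runs less often than the loop is entered.

  /* Illustrative only: the multiplication is loop-invariant, but it lives
     in a block whose count can be below the preheader's, so with this
     change RTL LIM leaves it inside the loop instead of hoisting cold
     code onto the hot path.  */
  extern void consume (long);

  void
  foo (long *a, long n, int rare)
  {
    for (long i = 0; i < n; i++)
      if (__builtin_expect (rare, 0))
	consume (a[0] * 12345);
  }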
[PATCH 2/3] Fix incorrect loop exit edge probability [PR103270]
r12-4526 cancelled jump thread path rotates loop. It exposes a issue in profile-estimate when predict_extra_loop_exits, outer loop's exit edge is marked as inner loop's extra loop exit and set with incorrect prediction, then a hot inner loop will become cold loop finally through optimizations, this patch add loop check when searching extra exit edges to avoid unexpected predict_edge from predict_paths_for_bb. Regression tested on P8LE, OK for master? gcc/ChangeLog: PR middle-end/103270 * predict.c (predict_extra_loop_exits): Add loop parameter. (predict_loops): Call with loop argument. gcc/testsuite/ChangeLog: PR middle-end/103270 * gcc.dg/pr103270.c: New test. --- gcc/predict.c | 10 ++ gcc/testsuite/gcc.dg/pr103270.c | 19 +++ 2 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr103270.c diff --git a/gcc/predict.c b/gcc/predict.c index 3cb4e3c0eb5..5b6e0cf722b 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -1859,7 +1859,7 @@ predict_iv_comparison (class loop *loop, basic_block bb, exits to predict them using PRED_LOOP_EXTRA_EXIT. */ static void -predict_extra_loop_exits (edge exit_edge) +predict_extra_loop_exits (class loop *loop, edge exit_edge) { unsigned i; bool check_value_one; @@ -1912,12 +1912,14 @@ predict_extra_loop_exits (edge exit_edge) continue; if (EDGE_COUNT (e->src->succs) != 1) { - predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); + predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, +loop); continue; } FOR_EACH_EDGE (e1, ei, e->src->preds) - predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); + predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, + loop); } } @@ -2008,7 +2010,7 @@ predict_loops (void) ex->src->index, ex->dest->index); continue; } - predict_extra_loop_exits (ex); + predict_extra_loop_exits (loop, ex); if (number_of_iterations_exit (loop, ex, &niter_desc, false, false)) niter = niter_desc.niter; diff --git a/gcc/testsuite/gcc.dg/pr103270.c b/gcc/testsuite/gcc.dg/pr103270.c new file mode 100644 index 000..819310e360e --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr103270.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-profile_estimate" } */ + +void test(int a, int* i) +{ + for (; a < 5; ++a) +{ + int b = 0; + int c = 0; + for (; b != -11; b--) + for (int d = 0; d ==0; d++) + { + *i += c & a; + c = b; + } +} +} + +/* { dg-final { scan-tree-dump-not "extra loop exit heuristics of edge\[^:\]*:" "profile_estimate"} } */ -- 2.25.1
[PATCH 3/3] Fix loop split incorrect count and probability
In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two kind of split. split_loop only works for single loop and insert edge at exit when split, while split_loop_on_cond is not limited to single loop and insert edge at latch when split. Both split behavior should consider loop count and probability update. For split_loop, loop split condition is moved in front of loop1 and loop2; But split_loop_on_cond moves the condition between loop1 and loop2, this patch does: 1) profile count proportion for both original loop and copied loop without dropping down the true branch's count; 2) probability update in the two loops and between the two loops. Regression tested pass, OK for master? Changes diff for split_loop and split_loop_on_cond cases: 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit ... [local count: 118111600]: if (beg_5(D) < end_8(D)) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: if (beg2_6(D) < c_9(D)) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [100.00%] +goto ; [67.00%] - [local count: 105119324]: + [local count: 34689377]: _25 = beg_5(D) + 1; _26 = end_8(D) - beg_5(D); _27 = beg2_6(D) + _26; _28 = MIN_EXPR ; - [local count: 955630225]: + [local count: 315357973]: # i_16 = PHI # j_17 = PHI printf ("a: %d %d\n", i_16, j_17); i_11 = i_16 + 1; j_12 = j_17 + 1; if (j_12 < _28) -goto ; [89.00%] +goto ; [29.37%] else -goto ; [11.00%] +goto ; [70.63%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [local count: 105119324]: + [local count: 70429947]: # i_22 = PHI # j_23 = PHI [local count: 955630225]: # i_2 = PHI # j_1 = PHI i_20 = i_2 + 1; j_21 = j_1 + 1; if (end_8(D) > i_20) -goto ; [89.00%] +goto ; [59.63%] else -goto ; [11.00%] +goto ; [40.37%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] [local count: 105119324]: # i_29 = PHI # j_30 = PHI if (end_8(D) > i_29) goto ; [80.00%] else goto ; [20.00%] [local count: 105119324]: [local count: 118111600]: return 0; } [local count: 118111600]: - if (beg_5(D) < end_8(D)) + _1 = end_6(D) - beg_7(D); + j_9 = _1 + beg2_8(D); + if (end_6(D) > beg_7(D)) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: - if (beg2_6(D) < c_9(D)) -goto ; [100.00%] + if (j_9 >= c_11(D)) +goto ; [33.00%] else -goto ; [100.00%] +goto ; [67.00%] - [local count: 105119324]: - _25 = beg_5(D) + 1; - _26 = end_8(D) - beg_5(D); - _27 = beg2_6(D) + _26; - _28 = MIN_EXPR ; - - [local count: 955630225]: - # i_16 = PHI - # j_17 = PHI - printf ("a: %d %d\n", i_16, j_17); - i_11 = i_16 + 1; - j_12 = j_17 + 1; - if (j_12 < _28) -goto ; [89.00%] + [local count: 34689377]: + _27 = end_6(D) + -1; + _28 = beg_7(D) - end_6(D); + _29 = j_9 + _28; + _30 = _29 + 1; + _31 = MAX_EXPR ; + + [local count: 315357973]: + # i_18 = PHI + # j_19 = PHI + printf ("a: %d %d\n", i_18, j_19); + i_13 = i_18 + -1; + j_14 = j_19 + -1; + if (j_14 >= _31) +goto ; [29.37%] else -goto ; [11.00%] +goto ; [70.63%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [local count: 105119324]: - # i_22 = PHI - # j_23 = PHI + [local count: 70429947]: + # i_24 = PHI + # j_25 = PHI [local count: 955630225]: - # i_2 = PHI - # j_1 = PHI - i_20 = i_2 + 1; - j_21 = j_1 + 1; - if (end_8(D) > i_20) + # i_3 = PHI + # j_2 = PHI + i_22 = i_3 + -1; + j_23 = j_2 + -1; + if (beg_7(D) < i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] [local count: 105119324]: - # i_29 = PHI - # j_30 = PHI - if (end_8(D) > i_29) + # i_32 = 
PHI + # j_33 = PHI + if (beg_7(D) < i_32) goto ; [80.00%] else goto ; [20.00%] [local count: 105119324]: [local count: 118111600]: return 0; } 2) diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] [local count: 315357972]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +
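To make points 1) and 2) above concrete, here is a minimal sketch of the count bookkeeping using GCC's profile_count/profile_probability API (variable names are illustrative, not the patch hunks themselves): each loop copy is scaled by the probability of the split condition, so the two copies together still account for the original count instead of the true branch being dropped down.

  /* Sketch only: cond_prob is the probability of the split condition.  */
  profile_count orig = bb->count;
  bb->count      = orig.apply_probability (cond_prob);            /* first loop copy */
  bb_copy->count = orig.apply_probability (cond_prob.invert ());  /* second loop copy */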
Re: [PATCH v8 2/2] Don't move cold code out of loop by checking bb count
On 2021/12/7 20:17, Richard Biener wrote: >>> + class loop *coldest_loop = coldest_outermost_loop[loop->num]; >>> + if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) >>> +{ >>> + class loop *hotter_loop = hotter_than_inner_loop[loop->num]; >>> + if (!hotter_loop >>> + || loop_depth (hotter_loop) < loop_depth (outermost_loop)) >>> + return outermost_loop; >>> + >>> + /* hotter_loop is between OUTERMOST_LOOP and LOOP like: >>> + [loop tree root, ..., coldest_loop, ..., outermost_loop, ..., >>> + hotter_loop, second_coldest_loop, ..., loop] >>> + return second_coldest_loop to be the hoist target. */ >>> + class loop *aloop; >>> + for (aloop = hotter_loop->inner; aloop; aloop = aloop->next) >>> + if (flow_loop_nested_p (aloop, loop)) >> should be: >> >> if (aloop == loop || flow_loop_nested_p (aloop, loop)) > OK with that fixed. > > Are necessary prerequesites committed to avoid regressions? > I guess we need to keep a watchful eye and eventually revert > (or gate with a --param disabled by default) the new behavior if > severe regressions are discovered. > > Thanks and sorry for the repeated delays. > Richard. > Thanks for your review, I learned quite a lot and gained very useful comments & help through the period :) There are still 3 patches required to avoid regression or so, I've reorganized them and sent it out. https://gcc.gnu.org/pipermail/gcc-patches/2021-December/586371.html In addition, cooked the patch to add option for disable/enable it. Is it OK to merge it to current patch? [PATCH] Add option -fhoist-to-cold-loop gcc/ChangeLog: * common.opt: New. * loop-invariant.c (find_invariants_bb): * tree-ssa-loop-im.c (get_coldest_out_loop): (can_sm_ref_p): (loop_invariant_motion_in_fun): --- gcc/common.opt | 4 gcc/loop-invariant.c | 2 +- gcc/tree-ssa-loop-im.c | 33 ++--- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/gcc/common.opt b/gcc/common.opt index b921f5e3b25..62b82bd8b95 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1171,6 +1171,10 @@ fcode-hoisting Common Var(flag_code_hoisting) Optimization Enable code hoisting. +fhoist-to-cold-loop +Common Var(flag_hoist_to_cold_loop) Init(1) Optimization +Enable hoisting code to cold loop. + fcombine-stack-adjustments Common Var(flag_combine_stack_adjustments) Optimization Looks for opportunities to reduce stack adjustments and stack references. diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c index 5c3be7bf0eb..75b9dd47cd7 100644 --- a/gcc/loop-invariant.c +++ b/gcc/loop-invariant.c @@ -1189,7 +1189,7 @@ find_invariants_bb (class loop *loop, basic_block bb, bool always_reached, rtx_insn *insn; basic_block preheader = loop_preheader_edge (loop)->src; - if (preheader->count > bb->count) + if (flag_hoist_to_cold_loop && preheader->count > bb->count) return; FOR_BB_INSNS (bb, insn) diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 565ee62d3f7..d745f66851b 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -450,6 +450,9 @@ static class loop * get_coldest_out_loop (class loop *outermost_loop, class loop *loop, basic_block curr_bb) { + if (!flag_hoist_to_cold_loop) +return outermost_loop; + gcc_assert (outermost_loop == loop || flow_loop_nested_p (outermost_loop, loop)); @@ -3031,8 +3034,9 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref) /* Verify whether the candidate is hot for LOOP. Only do store motion if the candidate's profile count is hot. Statement in cold BB shouldn't be moved out of it's loop_father. 
*/ - if (!for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body (loop))) -return false; + if (flag_hoist_to_cold_loop) +if (!for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body (loop))) + return false; return true; } @@ -3373,8 +3377,11 @@ tree_ssa_lim_finalize (void) free (bb_loop_postorder); - coldest_outermost_loop.release (); - hotter_than_inner_loop.release (); + if (flag_hoist_to_cold_loop) +{ + coldest_outermost_loop.release (); + hotter_than_inner_loop.release (); +} } /* Moves invariants from loops. Only "expensive" invariants are moved out -- @@ -3396,13 +3403,17 @@ loop_invariant_motion_in_fun (function *fun, bool store_motion) /* Pre-compute coldest outermost loop and nearest hotter loop of each loop. */ - class loop *loop; - coldest_outermost_loop.create (number_of_loops (cfun)); - coldest_outermost_loop.safe_grow_cleared (number_of_loops (cfun)); - hotter_than_inner_loop.create (number_of_loops (cfun)); - hotter_than_inner_loop.safe_grow_cleared (number_of_loops (cfun)); - for (loop = current_loops->tree_root->inner; loop != NULL; loop = loop->next) -fill_coldest_and_hotter_out_loop (loop, NULL, loop); + if (flag_hoist_to_cold_loop)
[PATCH] rs6000: powerpc suboptimal boolean test of contiguous bits [PR102239]
Add specialized version to combine two instructions from 9: {r123:CC=cmp(r124:DI&0x6,0);clobber scratch;} REG_DEAD r124:DI 10: pc={(r123:CC==0)?L15:pc} REG_DEAD r123:CC to: 10: {pc={(r123:DI&0x6==0)?L15:pc};clobber scratch;clobber %0:CC;} then split2 will split it to one rotate dot instruction (to save one rotate back instruction) as shifted result doesn't matter when comparing to 0 in CCEQmode. Bootstrapped and regression tested pass on Power 8/9/10, OK for master? gcc/ChangeLog: PR target/102239 * config/rs6000/rs6000.md (*anddi3_insn_dot): New. gcc/testsuite/ChangeLog: PR target/102239 * gcc.target/powerpc/pr102239.c: New test. --- gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c | 7 gcc/config/rs6000/rs6000.md | 38 + gcc/testsuite/gcc.target/powerpc/pr102239.c | 13 +++ 4 files changed, 59 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102239.c diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 14f6b313105..3644c524376 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -73,6 +73,7 @@ extern int expand_block_move (rtx[], bool); extern bool expand_block_compare (rtx[]); extern bool expand_strn_compare (rtx[], int); extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode); +extern bool rs6000_is_valid_rotate_dot_mask (rtx mask, machine_mode mode); extern bool rs6000_is_valid_and_mask (rtx, machine_mode); extern bool rs6000_is_valid_shift_mask (rtx, rtx, machine_mode); extern bool rs6000_is_valid_insert_mask (rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 5e129986516..57a38cf954a 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -11606,6 +11606,13 @@ rs6000_is_valid_mask (rtx mask, int *b, int *e, machine_mode mode) return true; } +bool +rs6000_is_valid_rotate_dot_mask (rtx mask, machine_mode mode) +{ + int nb, ne; + return rs6000_is_valid_mask (mask, &nb, &ne, mode) && nb >= ne && ne > 0; +} + /* Return whether MASK (a CONST_INT) is a valid mask for any rlwinm, rldicl, or rldicr instruction, to implement an AND with it in mode MODE. 
*/ diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 6bec2bddbde..014dc9612ea 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -3762,6 +3762,44 @@ (define_insn_and_split "*and3_2insn_dot2" (set_attr "dot" "yes") (set_attr "length" "8,12")]) +(define_insn_and_split "*anddi3_insn_dot" + [(set (pc) +(if_then_else (eq (and:DI (match_operand:DI 1 "gpc_reg_operand" "%r,r") + (match_operand:DI 2 "const_int_operand" "n,n")) + (const_int 0)) + (label_ref (match_operand 3 "")) + (pc))) + (clobber (match_scratch:DI 0 "=r,r")) + (clobber (reg:CC CR0_REGNO))] + "rs6000_is_valid_rotate_dot_mask (operands[2], DImode) + && TARGET_POWERPC64" + "#" + "&& reload_completed" + [(pc)] +{ + int nb, ne; + if (rs6000_is_valid_mask (operands[2], &nb, &ne, DImode) + && nb >= ne + && ne > 0) + { + unsigned HOST_WIDE_INT val = INTVAL (operands[2]); + int shift = 63 - nb; + rtx tmp = gen_rtx_ASHIFT (DImode, operands[1], GEN_INT (shift)); + tmp = gen_rtx_AND (DImode, tmp, GEN_INT (val << shift)); + rtx cr0 = gen_rtx_REG (CCmode, CR0_REGNO); + rs6000_emit_dot_insn (operands[0], tmp, 1, cr0); + rtx loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[3]); + rtx cond = gen_rtx_EQ (CCEQmode, cr0, const0_rtx); + rtx ite = gen_rtx_IF_THEN_ELSE (VOIDmode, cond, loc_ref, pc_rtx); + emit_jump_insn (gen_rtx_SET (pc_rtx, ite)); + DONE; + } + else + FAIL; +} + [(set_attr "type" "shift") + (set_attr "dot" "yes") + (set_attr "length" "8,12")]) (define_expand "3" [(set (match_operand:SDI 0 "gpc_reg_operand") diff --git a/gcc/testsuite/gcc.target/powerpc/pr102239.c b/gcc/testsuite/gcc.target/powerpc/pr102239.c new file mode 100644 index 000..1bafc9fe18e --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr102239.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2" } */ + +void foo(long arg) +{ + if (arg & ((1UL << 33) | (1UL << 34))) +asm volatile("# if"); + else +asm volatile("# else"); +} + +/* { dg-final { scan-assembler-times "rldicr." 1 } } */ -- 2.25.1
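The key observation behind dropping the rotate-back instruction can be written out in plain C (names are illustrative; MASK and SHIFT stand for the values the split computes from nb): for a compare against zero, testing the shifted-and-masked value gives the same answer as testing the original bits, as long as no mask bit is shifted out.

  #include <stdint.h>

  /* Sketch: same truth value as ((x & mask) == 0) whenever (mask << shift)
     keeps all of mask's set bits, so a single record-form AND of the
     shifted value suffices.  */
  static inline int
  bits_clear_after_shift (uint64_t x, uint64_t mask, int shift)
  {
    return (((x << shift) & (mask << shift)) == 0);
  }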
Re: [PATCH 3/3] Fix loop split incorrect count and probability
On 2021/12/9 07:47, Jeff Law wrote: >> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >> index 3f6ad046623..33128061aab 100644 >> --- a/gcc/tree-ssa-loop-split.c >> +++ b/gcc/tree-ssa-loop-split.c >> >> @@ -607,6 +610,38 @@ split_loop (class loop *loop1) >> tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge >> (loop1)); >> patch_loop_exit (loop1, guard_stmt, guard_next, newend, >> initial_true); >> + update_ssa (TODO_update_ssa); >> + >> + /* Proportion first loop's bb counts except those dominated by true >> + branch to avoid drop 1s down. */ >> + basic_block *bbs1, *bbs2; >> + bbs1 = get_loop_body (loop1); >> + unsigned j; >> + for (j = 0; j < loop1->num_nodes; j++) >> + if (bbs1[j] == loop1->latch >> + || !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest)) >> + bbs1[j]->count >> + = bbs1[j]->count.apply_probability (true_edge->probability); >> + free (bbs1); > It looks like there's two copies of this code in this patch, one in > split_loop and the other in do_split_loop_on_cond. Would it make sense > to factor it out into its own little function? > > >> + >> + /* Proportion second loop's bb counts except those dominated by >> false >> + branch to avoid drop 1s down. */ >> + basic_block bbi_copy = get_bb_copy (false_edge->dest); >> + bbs2 = get_loop_body (loop2); >> + for (j = 0; j < loop2->num_nodes; j++) >> + if (bbs2[j] == loop2->latch >> + || !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy)) >> + bbs2[j]->count = bbs2[j]->count.apply_probability ( >> + true_edge->probability.invert ()); >> + free (bbs2); > Similarly for this block of code. > > If those can be reasonably factored out into two helper functions to be > called from split_loop and do_split_loop_on_cond, then this is OK with > the refactoring. > > jeff Thanks for the comments, updated as below. Will commit this patchset and the approved patch for LIM if there are no objections: [PATCH v2 3/3] Fix loop split incorrect count and probability In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two kind of split. split_loop only works for single loop and insert edge at exit when split, while split_loop_on_cond is not limited to single loop and insert edge at latch when split. Both split behavior should consider loop count and probability update. For split_loop, loop split condition is moved in front of loop1 and loop2; But split_loop_on_cond moves the condition between loop1 and loop2, this patch does: 1) profile count proportion for both original loop and copied loop without dropping down the true branch's count; 2) probability update in the two loops and between the two loops. Regression tested pass, OK for master? Changes diff for split_loop and split_loop_on_cond cases: 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit ... 
[local count: 118111600]: _1 = end_6(D) - beg_7(D); j_9 = _1 + beg2_8(D); if (end_6(D) > beg_7(D)) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: if (j_9 >= c_11(D)) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [100.00%] +goto ; [67.00%] - [local count: 105119324]: + [local count: 34689377]: _27 = end_6(D) + -1; _28 = beg_7(D) - end_6(D); _29 = j_9 + _28; _30 = _29 + 1; _31 = MAX_EXPR ; - [local count: 955630225]: + [local count: 315357973]: # i_18 = PHI # j_19 = PHI printf ("a: %d %d\n", i_18, j_19); i_13 = i_18 + -1; j_14 = j_19 + -1; if (j_14 >= _31) -goto ; [89.00%] +goto ; [29.37%] else -goto ; [11.00%] +goto ; [70.63%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [local count: 105119324]: + [local count: 70429947]: # i_24 = PHI # j_25 = PHI [local count: 955630225]: # i_3 = PHI # j_2 = PHI i_22 = i_3 + -1; j_23 = j_2 + -1; if (beg_7(D) < i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] [local count: 105119324]: # i_32 = PHI # j_33 = PHI if (beg_7(D) < i_32) goto ; [80.00%] else goto ; [20.00%] [local count: 105119324]: [local count: 118111600]: return 0; } 2) diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] [local count: 315357972]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_
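A possible shape for the little helper Jeff asks for above (the name and exact signature here are mine, not necessarily what was committed): it folds the two near-identical blocks from split_loop and do_split_loop_on_cond into one function.

  /* Scale LOOP's latch and every block of LOOP not dominated by DOM
     by PROB, mirroring the two copies of code quoted above.  */
  static void
  scale_loop_bbs_not_dominated (class loop *loop, basic_block dom,
				profile_probability prob)
  {
    basic_block *bbs = get_loop_body (loop);
    for (unsigned j = 0; j < loop->num_nodes; j++)
      if (bbs[j] == loop->latch
	  || !dominated_by_p (CDI_DOMINATORS, bbs[j], dom))
	bbs[j]->count = bbs[j]->count.apply_probability (prob);
    free (bbs);
  }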
Re: [PATCH v4] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
On 2022/8/16 14:53, Kewen.Lin wrote: Hi Xionghu, Thanks for the updated version of patch, some comments are inlined. on 2022/8/11 14:15, Xionghu Luo wrote: On 2022/8/11 01:07, Segher Boessenkool wrote: On Wed, Aug 10, 2022 at 02:39:02PM +0800, Xionghu Luo wrote: On 2022/8/9 11:01, Kewen.Lin wrote: I have some concern on those changed "altivec_*_direct", IMHO the suffix "_direct" is normally to indicate the define_insn is mapped to the corresponding hw insn directly. With this change, for example, altivec_vmrghb_direct can be mapped into vmrghb or vmrglb, this looks misleading. Maybe we can add the corresponding _direct_le and _direct_be versions, both are mapped into the same insn but have different RTL patterns. Looking forward to Segher's and David's suggestions. Thanks! Do you mean same RTL patterns with different hw insn? A pattern called altivec_vmrghb_direct_le should always emit a vmrghb instruction, never a vmrglb instead. Misleading names are an expensive problem. Thanks. Then on LE platforms, if user calls altivec_vmrghw,it will be expanded to RTL (vec_select (vec_concat (R0 R1 (0 4 1 5))), and finally matched to altivec_vmrglw_direct_v4si_le with ASM "vmrglw". For BE just strict forward, seems more clear :-), OK for master? [PATCH v3] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069] v3: rename altivec_vmrghb_direct_le to altivec_vmrglb_direct_le to match the actual output ASM vmrglb. Likewise for all similar xxx_direct_le patterns. v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{64} Linux(Thanks to Kewen). gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb_direct): Remove. (altivec_vmrghb_direct_be): New pattern for BE. (altivec_vmrglb_direct_le): New pattern for LE. (altivec_vmrghh_direct): Remove. (altivec_vmrghh_direct_be): New pattern for BE. (altivec_vmrglh_direct_le): New pattern for LE. (altivec_vmrghw_direct_): Remove. (altivec_vmrghw_direct__be): New pattern for BE. (altivec_vmrglw_direct__le): New pattern for LE. (altivec_vmrglb_direct): Remove. (altivec_vmrglb_direct_be): New pattern for BE. (altivec_vmrghb_direct_le): New pattern for LE. (altivec_vmrglh_direct): Remove. (altivec_vmrglh_direct_be): New pattern for BE. (altivec_vmrghh_direct_le): New pattern for LE. (altivec_vmrglw_direct_): Remove. (altivec_vmrglw_direct__be): New pattern for BE. (altivec_vmrghw_direct__le): New pattern for LE. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Adjust. * config/rs6000/vsx.md: Likewise. 
gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md | 223 ++-- gcc/config/rs6000/rs6000.cc | 36 ++-- gcc/config/rs6000/vsx.md | 24 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 120 +++ 4 files changed, 305 insertions(+), 98 deletions(-) create mode 100644 gcc/testsuite/g++.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..78245f470e9 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,15 +1144,17 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + rtvec v = gen_rtvec (16, GEN_INT (0), GEN_INT (16), GEN_INT (1), GEN_INT (17), + GEN_INT (2), GEN_INT (18), GEN_IN
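A small source-level illustration of the nested vec_select simplification described in the commit message (this is not the pr106069 testcase; it only shows the shape combine can now fold identically on BE and LE):

  #include <altivec.h>

  int
  merge_then_extract (vector int a, vector int b)
  {
    /* vec_mergeh (a, b) is vec_select (vec_concat (a, b), [0 4 1 5]);
       extracting lane 3 of the result can be folded to reading lane 1
       of b, so no merge instruction needs to survive.  */
    vector int m = vec_mergeh (a, b);
    return vec_extract (m, 3);
  }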
Ping: [PATCH v4] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
Hi Segher, I'd like to resend and ping for this patch. Thanks. From 23bffdacdf0eb1140c7a3571e6158797f4818d57 Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Thu, 4 Aug 2022 03:44:58 + Subject: [PATCH v4] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069] v4: Update per comments. v3: rename altivec_vmrghb_direct_le to altivec_vmrglb_direct_le to match the actual output ASM vmrglb. Likewise for all similar xxx_direct_le patterns. v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux. gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb_direct): Remove. (altivec_vmrghb_direct_be): New pattern for BE. (altivec_vmrghb_direct_le): New pattern for LE. (altivec_vmrghh_direct): Remove. (altivec_vmrghh_direct_be): New pattern for BE. (altivec_vmrghh_direct_le): New pattern for LE. (altivec_vmrghw_direct_): Remove. (altivec_vmrghw_direct__be): New pattern for BE. (altivec_vmrghw_direct__le): New pattern for LE. (altivec_vmrglb_direct): Remove. (altivec_vmrglb_direct_be): New pattern for BE. (altivec_vmrglb_direct_le): New pattern for LE. (altivec_vmrglh_direct): Remove. (altivec_vmrglh_direct_be): New pattern for BE. (altivec_vmrglh_direct_le): New pattern for LE. (altivec_vmrglw_direct_): Remove. (altivec_vmrglw_direct__be): New pattern for BE. (altivec_vmrglw_direct__le): New pattern for LE. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Adjust. * config/rs6000/vsx.md: Likewise. gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 222 ++-- gcc/config/rs6000/rs6000.cc | 24 +-- gcc/config/rs6000/vsx.md| 28 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 118 +++ 4 files changed, 307 insertions(+), 85 deletions(-) create mode 100644 gcc/testsuite/g++.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..c6a381908cb 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,15 +1144,16 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + if (BYTES_BIG_ENDIAN) +emit_insn ( + gen_altivec_vmrghb_direct_be (operands[0], operands[1], operands[2])); + else +emit_insn ( + gen_altivec_vmrglb_direct_le (operands[0], operands[2], operands[1])); DONE; }) -(define_insn "altivec_vmrghb_direct" +(define_insn "altivec_vmrghb_direct_be" [(set (match_operand:V16QI 0 "register_operand" "=v") (vec_select:V16QI (vec_concat:V32QI @@ -1166,7 +1167,25 @@ (define_insn "altivec_vmrghb_direct" (const_int 5) (const_int 21) (const_int 6) (const_int 22) (const_int 7) (const_int 23)])))] - "TARGET_ALTIVEC" + "TARGET_ALTIVEC && BYTES_BIG_ENDIAN" + "vmrghb %0,%1,%2" + [(set_attr "type" "vecperm")]) + +(define_insn "altivec_vmrghb_direct_le" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 2 "register_operand" "v")
Ping: [PATCH 0/4] rs6000: Enable variable vec_insert with IFN VEC_SET
Ping. On 2020/10/10 16:08, Xionghu Luo wrote: Originated from https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554240.html with patch split and some refinement per review comments. Patch of IFN VEC_SET for ARRAY_REF(VIEW_CONVERT_EXPR) is committed, this patch set enables expanding IFN VEC_SET for Power9 and Power8 with specfic instruction sequences. Xionghu Luo (4): rs6000: Change rs6000_expand_vector_set param rs6000: Support variable insert and Expand vec_insert in expander [PR79251] rs6000: Enable vec_insert for P8 with rs6000_expand_vector_set_var_p8 rs6000: Update testcases' instruction count gcc/config/rs6000/rs6000-c.c | 44 +++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 3 +- gcc/config/rs6000/rs6000.c| 181 +- gcc/config/rs6000/vector.md | 4 +- .../powerpc/fold-vec-insert-char-p8.c | 8 +- .../powerpc/fold-vec-insert-char-p9.c | 12 +- .../powerpc/fold-vec-insert-double.c | 11 +- .../powerpc/fold-vec-insert-float-p8.c| 6 +- .../powerpc/fold-vec-insert-float-p9.c| 10 +- .../powerpc/fold-vec-insert-int-p8.c | 6 +- .../powerpc/fold-vec-insert-int-p9.c | 11 +- .../powerpc/fold-vec-insert-longlong.c| 10 +- .../powerpc/fold-vec-insert-short-p8.c| 6 +- .../powerpc/fold-vec-insert-short-p9.c| 8 +- .../gcc.target/powerpc/pr79251-run.c | 28 +++ gcc/testsuite/gcc.target/powerpc/pr79251.h| 19 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 18 ++ .../gcc.target/powerpc/vsx-builtin-7.c| 4 +- 20 files changed, 337 insertions(+), 71 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p8.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p9.c -- Thanks, Xionghu
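For context, the kind of source the series is about looks like the sketch below (illustrative, not one of the added testcases): vec_insert with a run-time index, which the series expands through IFN VEC_SET into the Power9/Power8 specific instruction sequences mentioned above.

  #include <altivec.h>

  vector int
  set_lane (vector int v, int val, unsigned long idx)
  {
    /* IDX is not a compile-time constant.  */
    return vec_insert (val, v, idx);
  }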
Ping^2: [PATCH 0/4] rs6000: Enable variable vec_insert with IFN VEC_SET
Ping^2, thanks. On 2020/11/5 09:34, Xionghu Luo via Gcc-patches wrote: Ping. On 2020/10/10 16:08, Xionghu Luo wrote: Originated from https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554240.html with patch split and some refinement per review comments. Patch of IFN VEC_SET for ARRAY_REF(VIEW_CONVERT_EXPR) is committed, this patch set enables expanding IFN VEC_SET for Power9 and Power8 with specfic instruction sequences. Xionghu Luo (4): rs6000: Change rs6000_expand_vector_set param rs6000: Support variable insert and Expand vec_insert in expander [PR79251] rs6000: Enable vec_insert for P8 with rs6000_expand_vector_set_var_p8 rs6000: Update testcases' instruction count gcc/config/rs6000/rs6000-c.c | 44 +++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 3 +- gcc/config/rs6000/rs6000.c | 181 +- gcc/config/rs6000/vector.md | 4 +- .../powerpc/fold-vec-insert-char-p8.c | 8 +- .../powerpc/fold-vec-insert-char-p9.c | 12 +- .../powerpc/fold-vec-insert-double.c | 11 +- .../powerpc/fold-vec-insert-float-p8.c | 6 +- .../powerpc/fold-vec-insert-float-p9.c | 10 +- .../powerpc/fold-vec-insert-int-p8.c | 6 +- .../powerpc/fold-vec-insert-int-p9.c | 11 +- .../powerpc/fold-vec-insert-longlong.c | 10 +- .../powerpc/fold-vec-insert-short-p8.c | 6 +- .../powerpc/fold-vec-insert-short-p9.c | 8 +- .../gcc.target/powerpc/pr79251-run.c | 28 +++ gcc/testsuite/gcc.target/powerpc/pr79251.h | 19 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 18 ++ .../gcc.target/powerpc/vsx-builtin-7.c | 4 +- 20 files changed, 337 insertions(+), 71 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p8.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p9.c -- Thanks, Xionghu
Re: [PATCH] rs6000: Don't split constant operator add before reload, move to temp register for future optimization
Hi, On 2020/10/27 05:10, Segher Boessenkool wrote: > On Wed, Oct 21, 2020 at 03:25:29AM -0500, Xionghu Luo wrote: >> Don't split code from add3 for SDI to allow a later pass to split. > > This is very problematic. > >> This allows later logic to hoist out constant load in add instructions. > > Later logic should be able to do that any way (I do not say that works > perfectly, mind; it no doubt could be improved). > >> In loop, lis+ori could be hoisted out to improve performance compared with >> previous addis+addi (About 15% on typical case), weak point is >> one more register is used and one more instruction is generated. i.e.: > > Yes, better performance on one testcase, and worse code always :-( > >> addis 3,3,0x6765 >> addi 3,3,0x4321 >> >> => >> >> lis 9,0x6765 >> ori 9,9,0x4321 >> add 3,3,9 > > This is the typical kind of clumsy code you get if you generate RTL that > matches actual machine instructions too late ("split too late"). > > So, please make it possible to hoist 2-insn-immediate sequences out of > loops, *without* changing them to fake 1-insn things. > As we discussed offline, addis+addi is not quite possible to be hoisted out of loops as not invariant, update the patch as below, thanks: [PATCH v2] rs6000: Split constant operator add in split1 instead of expander Currently, ADD with positive 32bit constant is split to addis+addi in expander, which seems too early to optimize the constant load out of loop compared with other targets. This patch use a temp register to load the constant and do two register addition in expander same as negative 32bit constant add. This allows loop invariant pass to hoist out constant load before add instructions, then split1 pass will split the load to lis+ori after combine. Performance could be improved by 15% on typical case compared with previous addis+addi in loop. (1) 0x67654321 addis 3,3,0x6765 addi 3,3,0x4321 => lis 9,0x6765 ori 9,9,0x4321 add 3,3,9 (2) 0x8fff addis 9,9,0x1 addi 3,9,-28673 => li 10,0 ori 10,10,0x8fff add 3,3,10 Regression and bootstrap tested pass on P8LE. gcc/ChangeLog: 2020-10-21 Xiong Hu Luo * config/rs6000/rs6000.md (add3 for SDI): Don't split before reload, move constant to temp register for add. (define_split): Split const from split1. gcc/testsuite/ChangeLog: 2020-10-21 Xiong Hu Luo * gcc.target/powerpc/add-const.c: New test. --- gcc/config/rs6000/rs6000.md | 38 gcc/testsuite/gcc.target/powerpc/add-const.c | 18 ++ 2 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/add-const.c diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 5e5ad9f7c3d..b52e9555962 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -1750,18 +1750,26 @@ (define_expand "add3" if (CONST_INT_P (operands[2]) && !add_operand (operands[2], mode)) { - rtx tmp = ((!can_create_pseudo_p () - || rtx_equal_p (operands[0], operands[1])) -? operands[0] : gen_reg_rtx (mode)); - - /* Adding a constant to r0 is not a valid insn, so use a different -strategy in that case. */ - if (reg_or_subregno (operands[1]) == 0 || reg_or_subregno (tmp) == 0) + bool reg0 = reg_or_subregno (operands[0]) == 0; + if (can_create_pseudo_p () || reg0) { - if (operands[0] == operands[1]) - FAIL; - rs6000_emit_move (operands[0], operands[2], mode); - emit_insn (gen_add3 (operands[0], operands[1], operands[0])); + + rtx tmp = (!can_create_pseudo_p () + || rtx_equal_p (operands[0], operands[1])) + ? 
operands[0] : gen_reg_rtx (mode); + + /* Adding a constant to r0 is not a valid insn, so use a different +strategy in that case. See stack-limit.c, need generate +"24: %0:DI=0x20fa0; 25: %0:DI=%14:DI+%0:DI" in pro_and_epilogue +when can_create_pseudo_p is false. */ + if (reg0 == 0 || reg_or_subregno (tmp) == 0) + { + if (operands[0] == operands[1]) + FAIL; + } + + rs6000_emit_move (tmp, operands[2], mode); + emit_insn (gen_add3 (operands[0], operands[1], tmp)); DONE; } @@ -1775,8 +1783,8 @@ (define_expand "add3" /* The ordering here is important for the prolog expander. When space is allocated from the stack, adding 'low' first may produce a temporary deallocation (which would be bad). */ - emit_insn (gen_add3 (tmp, operands[1], GEN_INT (rest))); - emit_insn (gen_add3 (operands[0], tmp, GEN_INT (low))); + emit_insn (gen_add3 (operands[0], operands[1], GE
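The kind of loop the change is aimed at can be sketched as below (illustrative, not the added add-const.c testcase): once the constant addend is materialized in a register by a separate lis+ori pair, loop-invariant motion can keep it outside the loop instead of re-forming it with addis+addi on every iteration.

  void
  accumulate (long *a, long n)
  {
    for (long i = 0; i < n; i++)
      a[i] += 0x67654321;  /* constant can be loaded once before the loop */
  }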
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/1 17:58, Richard Biener wrote: This fixes the CFG walk order of fill_always_executed_in to use RPO oder rather than the dominator based order computed by get_loop_body_in_dom_order. That fixes correctness issues with unordered dominator children. The RPO order computed by rev_post_order_and_mark_dfs_back_seme in its for-iteration mode is a good match for the algorithm. Xionghu, I've tried to only fix the CFG walk order issue and not change anything else with this so we have a more correct base to work against. The code still walks inner loop bodies up to loop depth times and thus is quadratic in the loop depth. Bootstrapped and tested on x86_64-unknown-linux-gnu, if you don't have any comments I plan to push this and then revisit what we were circling around. LGTM, thanks. Richard. 2021-09-01 Richard Biener PR tree-optimization/102155 * tree-ssa-loop-im.c (fill_always_executed_in_1): Iterate over a part of the RPO array and do not recurse here. Dump blocks marked as always executed. (fill_always_executed_in): Walk over the RPO array and process loops whose header we run into. (loop_invariant_motion_in_fun): Compute the first RPO using rev_post_order_and_mark_dfs_back_seme in iteration order and pass that to fill_always_executed_in. --- gcc/tree-ssa-loop-im.c | 136 ++--- 1 file changed, 73 insertions(+), 63 deletions(-) diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index d9f75d5025e..f3706dcdb8a 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3025,77 +3025,74 @@ do_store_motion (void) /* Fills ALWAYS_EXECUTED_IN information for basic blocks of LOOP, i.e. for each such basic block bb records the outermost loop for that execution of its header implies execution of bb. CONTAINS_CALL is the bitmap of - blocks that contain a nonpure call. */ + blocks that contain a nonpure call. The blocks of LOOP start at index + START of the RPO array of size N. */ static void -fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) +fill_always_executed_in_1 (function *fun, class loop *loop, + int *rpo, int start, int n, sbitmap contains_call) { - basic_block bb = NULL, *bbs, last = NULL; - unsigned i; - edge e; + basic_block last = NULL; class loop *inn_loop = loop; - if (ALWAYS_EXECUTED_IN (loop->header) == NULL) + for (int i = start; i < n; i++) { - bbs = get_loop_body_in_dom_order (loop); - - for (i = 0; i < loop->num_nodes; i++) - { - edge_iterator ei; - bb = bbs[i]; - - if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - last = bb; + basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]); + /* Stop when we iterated over all blocks in this loop. */ + if (!flow_bb_inside_loop_p (loop, bb)) + break; - if (bitmap_bit_p (contains_call, bb->index)) - break; + if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) + last = bb; - FOR_EACH_EDGE (e, ei, bb->succs) - { - /* If there is an exit from this BB. */ - if (!flow_bb_inside_loop_p (loop, e->dest)) - break; - /* Or we enter a possibly non-finite loop. */ - if (flow_loop_nested_p (bb->loop_father, - e->dest->loop_father) - && ! finite_loop_p (e->dest->loop_father)) - break; - } - if (e) - break; + if (bitmap_bit_p (contains_call, bb->index)) + break; - /* A loop might be infinite (TODO use simple loop analysis -to disprove this if possible). */ - if (bb->flags & BB_IRREDUCIBLE_LOOP) + edge_iterator ei; + edge e; + FOR_EACH_EDGE (e, ei, bb->succs) + { + /* If there is an exit from this BB. 
*/ + if (!flow_bb_inside_loop_p (loop, e->dest)) break; - - if (!flow_bb_inside_loop_p (inn_loop, bb)) + /* Or we enter a possibly non-finite loop. */ + if (flow_loop_nested_p (bb->loop_father, + e->dest->loop_father) + && ! finite_loop_p (e->dest->loop_father)) break; + } + if (e) + break; - if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; + /* A loop might be infinite (TODO use simple loop analysis +to disprove this if possible). */ + if (bb->flags & BB_IRREDUCIBLE_LOOP) + break; - /* In a loop that is always entered we may proceed anyway. -But record that we entered it and stop once we leave it. */ - inn_loop = bb->loop_father; - } - } + if (!flow_bb_inside_loop_p (inn_loop, bb)) + break; -
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/2 16:50, Richard Biener wrote: > On Thu, 2 Sep 2021, Richard Biener wrote: > >> On Thu, 2 Sep 2021, Xionghu Luo wrote: >> >>> >>> >>> On 2021/9/1 17:58, Richard Biener wrote: >>>> This fixes the CFG walk order of fill_always_executed_in to use >>>> RPO oder rather than the dominator based order computed by >>>> get_loop_body_in_dom_order. That fixes correctness issues with >>>> unordered dominator children. >>>> >>>> The RPO order computed by rev_post_order_and_mark_dfs_back_seme in >>>> its for-iteration mode is a good match for the algorithm. >>>> >>>> Xionghu, I've tried to only fix the CFG walk order issue and not >>>> change anything else with this so we have a more correct base >>>> to work against. The code still walks inner loop bodies >>>> up to loop depth times and thus is quadratic in the loop depth. >>>> >>>> Bootstrapped and tested on x86_64-unknown-linux-gnu, if you don't >>>> have any comments I plan to push this and then revisit what we >>>> were circling around. >>> >>> LGTM, thanks. >> >> I pushed it, thought again in the attempt to build a testcase and >> concluded I was wrong with the appearant mishandling of >> contains_call - get_loop_body_in_dom_order seems to be exactly >> correct for this specific case. So I reverted the commit again. > > And I figured what the > >/* In a loop that is always entered we may proceed anyway. > But record that we entered it and stop once we leave it. > */ > > comment was about. The code was present before the fix for PR78185 > and it was supposed to catch the case where the entered inner loop > is not finite. Just as the testcase from PR78185 shows the > stopping was done too late when the exit block was already marked > as to be always executed. A simpler fix for PR78185 would have been > to move > >if (!flow_bb_inside_loop_p (inn_loop, bb)) > break; > > before setting of last = bb. In fact the installed fix was more > pessimistic than that given it terminated already when entering > a possibly infinite loop. So we can improve that by doing > sth like which should also improve the situation for some of > the cases you were looking at? > > What remains is that we continue to stop when entering a > not always executed loop: > >if (bb->loop_father->header == bb) > { >if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) > break; Yes. This will cause blocks after inner loop missed to be check if they are actually ALWAYS_EXECUTED. I am afraid O(N^2) is inevitable here... > > that I can at this point only explain by possible efficiency > concerns? Any better idea on that one? >From experiment, early break from inner loop seems not cost shorter time than full inner loop walk. I will take more precise measurement and larger data set on the function fill_always_executed_in_1 if necessary. My previous v2 patch also tried to update inn_loop level by level when exiting from inn_loops, but it is proved to be unnecessary but you worried about the dominance order by get_loop_body_in_dom_order. 
> > I'm going to test the patch below which improves the situation for > > volatile int flag, bar; > double foo (double *valp) > { >double sum = 0; >for (int i = 0; i < 256; ++i) > { >for (int j = 0; j < 256; ++j) > bar = flag; >if (flag) > sum += 1.; >sum += *valp; > } >return sum; > } The patch still fails to handle cases like this: struct X { int i; int j; int k;}; volatile int m; void bar (struct X *x, int n, int l, int k) { for (int i = 0; i < l; i++) { if (k) for (int j = 0; j < l; j++) { if (m) x->i = m; else x->i = 1 - m; int *r = &x->k; int tem2 = *r; x->k += tem2 * j; } x->i = m; } } x->i is still not marked ALWAYS_EXECUTED for outer loop. > > Thanks, > Richard. > > diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c > index d9f75d5025e..f0c93d6a882 100644 > --- a/gcc/tree-ssa-loop-im.c > +++ b/gcc/tree-ssa-loop-im.c > @@ -3044,23 +3044,27 @@ fill_always_executed_in_1 (class loop *loop, > sbitmap contains_call) >edge_iterator ei; >bb = bbs[i]; > > + if (!flow_bb_inside_loop_p (inn_loop, bb)) > + { > +
Re: Ping ^ 2: [PATCH] rs6000: Expand fmod and remainder when built with fast-math [PR97142]
Resend the patch that addressed Will's comments. fmod/fmodf and remainder/remainderf could be expanded instead of library call when fast-math build, which is much faster. fmodf: fdivs f0,f1,f2 frizf0,f0 fnmsubs f1,f2,f0,f1 remainderf: fdivs f0,f1,f2 frinf0,f0 fnmsubs f1,f2,f0,f1 SPEC2017 Ofast P8LE: 511.povray_r +1.14%, 526.blender_r +1.72% gcc/ChangeLog: 2021-09-03 Xionghu Luo PR target/97142 * config/rs6000/rs6000.md (fmod3): New define_expand. (remainder3): Likewise. gcc/testsuite/ChangeLog: 2021-09-03 Xionghu Luo PR target/97142 * gcc.target/powerpc/pr97142.c: New test. --- gcc/config/rs6000/rs6000.md| 36 ++ gcc/testsuite/gcc.target/powerpc/pr97142.c | 35 + 2 files changed, 71 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr97142.c diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index c8cdc42533c..84820d3b5cb 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -4932,6 +4932,42 @@ (define_insn "fre" [(set_attr "type" "fp") (set_attr "isa" "*,")]) +(define_expand "fmod3" + [(use (match_operand:SFDF 0 "gpc_reg_operand")) + (use (match_operand:SFDF 1 "gpc_reg_operand")) + (use (match_operand:SFDF 2 "gpc_reg_operand"))] + "TARGET_HARD_FLOAT + && TARGET_FPRND + && flag_unsafe_math_optimizations" +{ + rtx div = gen_reg_rtx (mode); + emit_insn (gen_div3 (div, operands[1], operands[2])); + + rtx friz = gen_reg_rtx (mode); + emit_insn (gen_btrunc2 (friz, div)); + + emit_insn (gen_nfms4 (operands[0], operands[2], friz, operands[1])); + DONE; + }) + +(define_expand "remainder3" + [(use (match_operand:SFDF 0 "gpc_reg_operand")) + (use (match_operand:SFDF 1 "gpc_reg_operand")) + (use (match_operand:SFDF 2 "gpc_reg_operand"))] + "TARGET_HARD_FLOAT + && TARGET_FPRND + && flag_unsafe_math_optimizations" +{ + rtx div = gen_reg_rtx (mode); + emit_insn (gen_div3 (div, operands[1], operands[2])); + + rtx frin = gen_reg_rtx (mode); + emit_insn (gen_round2 (frin, div)); + + emit_insn (gen_nfms4 (operands[0], operands[2], frin, operands[1])); + DONE; + }) + (define_insn "*rsqrt2" [(set (match_operand:SFDF 0 "gpc_reg_operand" "=,wa") (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" ",wa")] diff --git a/gcc/testsuite/gcc.target/powerpc/pr97142.c b/gcc/testsuite/gcc.target/powerpc/pr97142.c new file mode 100644 index 000..e5306eb681b --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr97142.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast" } */ + +#include + +float test1 (float x, float y) +{ + return fmodf (x, y); +} + +double test2 (double x, double y) +{ + return fmod (x, y); +} + +float test3 (float x, float y) +{ + return remainderf (x, y); +} + +double test4 (double x, double y) +{ + return remainder (x, y); +} + +/* { dg-final { scan-assembler-not {\mbl fmod\M} } } */ +/* { dg-final { scan-assembler-not {\mbl fmodf\M} } } */ +/* { dg-final { scan-assembler-not {\mbl remainder\M} } } */ +/* { dg-final { scan-assembler-not {\mbl remainderf\M} } } */ +/* { dg-final { scan-assembler-times {\mfdiv\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfdivs\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfnmsub\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfnmsubs\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfriz\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfrin\M} 2 } } */ -- 2.25.1
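The expansion implements the textbook identities below; the C sketch is only an illustration and ignores the special cases (zero divisor, infinities, the exact tie-breaking of remainder) that flag_unsafe_math_optimizations allows the expansion to skip.

  #include <math.h>

  /* fmodf:      x - truncf (x / y) * y   -> fdivs; friz; fnmsubs
     remainderf: x - roundf (x / y) * y   -> fdivs; frin; fnmsubs  */
  static inline float
  fast_fmodf (float x, float y)
  {
    return x - truncf (x / y) * y;
  }

  static inline float
  fast_remainderf (float x, float y)
  {
    return x - roundf (x / y) * y;
  }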
Ping ^ 2: [PATCH] rs6000: Fix wrong code generation for vec_sel [PR94613]
Ping^2, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/6/30 09:42, Xionghu Luo via Gcc-patches wrote: Gentle ping, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/5/14 14:57, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/5/13 18:49, Segher Boessenkool wrote: Hi! On Fri, Apr 30, 2021 at 01:32:58AM -0500, Xionghu Luo wrote: The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. Per element selection is a subset of per bit-wise selection,with the patch the pattern is written using bit operations. But there are 8 different patterns to define "op0 := (op1 & ~op3) | (op2 & op3)": (~op3&op1) | (op3&op2), (~op3&op1) | (op2&op3), (op3&op2) | (~op3&op1), (op2&op3) | (~op3&op1), (op1&~op3) | (op3&op2), (op1&~op3) | (op2&op3), (op3&op2) | (op1&~op3), (op2&op3) | (op1&~op3), Combine pass will swap (op1&~op3) to (~op3&op1) due to commutative canonical, which could reduce it to the FIRST 4 patterns, but it won't swap (op2&op3) | (~op3&op1) to (~op3&op1) | (op2&op3), so this patch handles it with two patterns with different NOT op3 position and check equality inside it. Yup, that latter case does not have canonicalisation rules. Btw, not only combine does this canonicalisation: everything should, non-canonical RTL is invalid RTL (in the instruction stream, you can do everything in temporary code of course, as long as the RTL isn't malformed). -(define_insn "*altivec_vsel" +(define_insn "altivec_vsel" [(set (match_operand:VM 0 "altivec_register_operand" "=v") - (if_then_else:VM - (ne:CC (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 4 "zero_constant" "")) - (match_operand:VM 2 "altivec_register_operand" "v") - (match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" + (ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_operand:VM 2 "altivec_register_operand" "v") + (match_operand:VM 4 "altivec_register_operand" "v"] + "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) + && (rtx_equal_p (operands[2], operands[3]) + || rtx_equal_p (operands[4], operands[3]))" + { + if (rtx_equal_p (operands[2], operands[3])) + return "vsel %0,%1,%4,%3"; + else + return "vsel %0,%1,%2,%3"; + } [(set_attr "type" "vecmove")]) That rtx_equal_p stuff is nice and tricky, but it is a bit too tricky I think. So please write this as two patterns (and keep the expand if that helps). I was a bit concerned that there would be a lot of duplicate code if we write two patterns for each vsel, totally 4 similar patterns in altivec.md and another 4 in vsx.md make it difficult to maintain, however I updated it since you prefer this way, as you pointed out the xxsel in vsx.md could be folded by later patch. +(define_insn "altivec_vsel2" (same here of course). ;; Fused multiply add. 
diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index f5676255387..d65bdc01055 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -3362,11 +3362,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI }, { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, - { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, + { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI_UNS, Are the _uns things still used for anything? But, let's not change this until Bill's stuff is in :-) Why do you want to change this here, btw? I don't understand. OK, they are actually "unsigned type" overload builtin functions, change it or not so far won't cause functionality issue, I will revert this change in the updated patch. + if (target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) No space after ! and other unary operators (except for casts and other operators you write with alphanumerics, like "sizeof"). I know you copied this code, but :-) OK, thanks. @@ -15608,8 +15606,6 @@ rs6000_emit_vector_cond_expr (rtx dest, rtx op_true, rtx op_false, case GEU: case LTU:
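For reference, the bit-wise select that vsel performs, i.e. the "op0 := (op1 & ~op3) | (op2 & op3)" form the discussion above revolves around, written out as scalar C for a single 64-bit lane (illustrative only):

  #include <stdint.h>

  static inline uint64_t
  bit_select (uint64_t op1, uint64_t op2, uint64_t mask)
  {
    /* Each result bit comes from op2 where the mask bit is set,
       otherwise from op1: a per-bit, not per-element, selection.  */
    return (op1 & ~mask) | (op2 & mask);
  }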
Ping ^ 2: [PATCH] rs6000: Remove unspecs for vec_mrghl[bhw]
Ping^2, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html On 2021/6/30 09:47, Xionghu Luo via Gcc-patches wrote: Gentle ping, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html On 2021/6/9 16:03, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/6/9 07:25, Segher Boessenkool wrote: On Mon, May 24, 2021 at 04:02:13AM -0500, Xionghu Luo wrote: vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for vmrghlb. (vmrglb) + if (BYTES_BIG_ENDIAN) + emit_insn ( + gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2])); + else + emit_insn ( + gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1])); Please don't indent like that, it doesn't match what we do elsewhere. For better or for worse (for worse imo), we use deep hanging indents. If you have to, you can do something like rtx insn; if (BYTES_BIG_ENDIAN) insn = gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2]); else insn = gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1]); emit_insn (insn); (this is better even, in that it has only one emit_insn), or even rtx (*fun) () = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct : gen_altivec_vmrglb_direct; if (!BYTES_BIG_ENDIAN) std::swap (operands[1], operands[2]); emit_insn (fun (operands[0], operands[1], operands[2])); Well, C++ does not allow that last example like that, sigh, so rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct : gen_altivec_vmrglb_direct; This is shorter than the other two options ;-) Changed. +(define_insn "altivec_vmrghb_direct" [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI This should be indented one space more. "TARGET_ALTIVEC" "@ - xxmrghw %x0,%x1,%x2 - vmrghw %0,%1,%2" + xxmrghw %x0,%x1,%x2 + vmrghw %0,%1,%2" The original indent was correct, please restore. - emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo)); + emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo)); When you see a mode as part of a pattern name, chances are that it will be a good candidate for using parameterized names with. (But don't do that now, just keep it in mind as a nice cleanup to do). OK. @@ -23022,8 +23022,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1, : CODE_FOR_altivec_vmrglh_direct), { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } }, { OPTION_MASK_ALTIVEC, - (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct - : CODE_FOR_altivec_vmrglw_direct), + (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si + : CODE_FOR_altivec_vmrglw_direct_v4si), The correct way is to align the ? and the : (or put everything on one line of course, if that fits) The parens around this are not needed btw, and are a distraction. Changed. 
--- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c @@ -317,10 +317,10 @@ int main () /* { dg-final { scan-assembler-times "vctuxs" 2 } } */ /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */ -/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */ +/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */ /* { dg-final { scan-assembler-times "vmrghh" 8 } } */ -/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */ -/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */ +/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */ +/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */ /* { dg-final { scan-assembler-times "vmrglh" 8 } } */ /* { dg-final { scan-assembler-times "xxlnor" 6 } } */ /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */ @@ -347,7 +347,7 @@ int main () /* { dg-final { scan-assembler-times "vspltb" 6 } } */ /* { dg-final { scan-assembler-times "vspltw" 0 } } */ /* { dg-final { scan-assembler-times "vmrgow" 8 } } */ -/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */ +/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */ /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */ /* { dg-final { scan-assembler-times "vmrgew" 8 } } */ /* { dg-final { scan-assembler-times "vsplth" 8 } } */ Are those changes correct? It looks like a vmrglb became a vmrghb, and that 4 each of xxmrghw and xxmrglw disappeared? Both seem wrong? This case is built with "-mdejagnu-cpu=power8 -O0 -mno-fold-gimple -dp" and it also counted the g
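To make the fixed permute order quoted earlier in this thread concrete, here is a hand-written scalar model of vmrghb (an illustration only, not code from the patch). Indices 0-15 select bytes from the first input and 16-31 from the second; the instruction always uses this order, which is why the expander, not the instruction pattern, deals with little-endian element order by swapping operands and emitting vmrglb instead.

#include <stdint.h>

/* Scalar model of vmrghb: interleave the first eight byte elements of
   A and B using the fixed permute index {0,16,1,17,...,7,23}.  */
static void
vmrghb_model (const uint8_t a[16], const uint8_t b[16], uint8_t out[16])
{
  static const uint8_t idx[16]
    = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
  for (int i = 0; i < 16; i++)
    out[i] = idx[i] < 16 ? a[idx[i]] : b[idx[i] - 16];
}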
Re: Ping ^ 2: [PATCH] rs6000: Expand fmod and remainder when built with fast-math [PR97142]
On 2021/9/4 05:44, Segher Boessenkool wrote: Hi! On Fri, Sep 03, 2021 at 10:31:24AM +0800, Xionghu Luo wrote: fmod/fmodf and remainder/remainderf could be expanded instead of library call when fast-math build, which is much faster. Thank you very much for this patch. Some trivial comments if you haven't commmitted it yet: +(define_expand "fmod3" + [(use (match_operand:SFDF 0 "gpc_reg_operand")) + (use (match_operand:SFDF 1 "gpc_reg_operand")) + (use (match_operand:SFDF 2 "gpc_reg_operand"))] + "TARGET_HARD_FLOAT + && TARGET_FPRND + && flag_unsafe_math_optimizations" It should have one extra space before each && here: OK. "TARGET_HARD_FLOAT && TARGET_FPRND && flag_unsafe_math_optimizations" (so that everything inside of the string aligns). +(define_expand "remainder3" (same here). +/* { dg-final { scan-assembler-not {\mbl fmod\M} } } */ +/* { dg-final { scan-assembler-not {\mbl fmodf\M} } } */ +/* { dg-final { scan-assembler-not {\mbl remainder\M} } } */ +/* { dg-final { scan-assembler-not {\mbl remainderf\M} } } */ These are negative tests, so won't spuriously fail, but this does not test for the function prefixes we can have. See gcc.target/powerpc/builtins-1.c for example. Thanks. Verified that different calls are generated on different platforms without this patch. P8BE-64: bl __fmodf_finite P8BE-32: b __fmodf_finite P8LE-64: bl fmodf "l", "__" and "_finite" are optional, so is it OK to check them with below patterns? +/* { dg-final { scan-assembler-not {\mbl? (__)?fmod(_finite)?\M} } } */ +/* { dg-final { scan-assembler-not {\mbl? (__)?fmodf(_finite)?\M} } } */ +/* { dg-final { scan-assembler-not {\mbl? (__)?remainder(_finite)?\M} } } */ +/* { dg-final { scan-assembler-not {\mbl? (__)?remainderf(_finite)?\M} } } */ Again, thank you, and thanks to everyone else for the patch review action :-) Segher -- Thanks, Xionghu
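As a minimal, hand-written example of the behaviour being tested (the function names and options here are assumptions, not the patch's dg- directives): building the code below with -O2 -ffast-math for a TARGET_FPRND target should leave no calls to fmod/remainder, or their __*_finite variants, in the generated assembly, because the operations are expanded inline.

#include <math.h>

/* fmod (x, y) is mathematically x - trunc (x / y) * y, which is what
   the inline expansion computes.  */
double
use_fmod (double x, double y)
{
  return fmod (x, y);
}

/* remainder uses the round-to-nearest quotient instead of trunc.  */
float
use_remainderf (float x, float y)
{
  return remainderf (x, y);
}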
Re: [RFC] Don't move cold code out of loop by checking bb count
On 2021/8/26 19:33, Richard Biener wrote: On Tue, Aug 10, 2021 at 4:03 AM Xionghu Luo wrote: Hi, On 2021/8/6 20:15, Richard Biener wrote: On Mon, Aug 2, 2021 at 7:05 AM Xiong Hu Luo wrote: There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". This patch does this profile count check in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if the loop bb is colder than loop preheader, don't hoist it out of loop. Also, the profile count in loop split pass should be corrected to avoid lim2 and lim4 mismatch behavior, currently, the new loop preheader generated by loop_version is set to "[count: 0]:", then lim4 after lsplt pass will move statement out of loop unexpectely when lim2 didn't move it. This change could fix regression on 544.nab_r from -1.55% to +0.46%. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. While I'm not familiar with the RTL invariant motion pass the patch there looks reasonable. Note that we should assess the profile quality somehow - I'm not sure how to do that, CCed Honza for that. Thanks. For the GIMPLE part the patch looks quite complicated - but note it probably has to be since LIM performs kind of a "CSE" on loads (and stores for store-motion), so when there are multiple stmts affected by a hoisting decision the biggest block count has to be accounted. Likewise when there are dependent stmts involved that might include conditional stmts (a "PHI"), but the overall cost should be looked at. Currently, The gimple code check two situations with the patch: 1) The statement or PHI‘s BB is *colder* then preheader, don't move it out of loop; 2) The statement or PHI's BB is *hotter* then preheader, but any of it's rhs couldn't be moved out of loop, also don't move it out of loop to avoid definition not dominates use error. But part 2) is obviously already done. What I tried to say is your heuristic doesn't integrate nicely with the pass but I admitted that it might be a bit difficult to find a place to add this heuristic. There is lim_data->cost which we could bias negatively but then this is a cost that is independent on the hoisting distance. But doing this would work at least for the case where the immediately enclosing loop preheader is hotter than the stmt and with this it would be a patch that's similarly simple as the RTL one. Another possibility is to simply only adjust PHI processing in compute_invariantness, capping movement according to the hotness heuristic. The same could be done for regular stmts there but I'm not sure that will do good in the end since this function is supposed to compute "correctness" (well, it also has the cost stuff), and it's not the place to do overall cost considerations. Thanks. I found that adding a function find_coldest_out_loop and check it in outermost_invariant_loop to find the coldest invariant loop between outermost loop and itself could also reach the purpose. Then the gimple code check is redundant and could be removed. 
May be I could collect the number of instructions not hoisted with the patch on regression tests and SPEC2017 to do a estimation for "multiple stmts affected" and "overall cost" need to be considered? But it seems move_computations_worker couldn't rollback if we still want to hoist multiple stmts out during the iterations? Now - GIMPLE LIM "costing" is somewhat backward right now and it isn't set up to consider those multiple involved stmts. Plus the store-motion part does not have any cost part (but it depends on previously decided invariant motions). I think the way you implemented the check will cause no hoisting to be performed instead of, say, hoisting to a different loop level only. Possibly shown when you consider a loop nest like for (;;) if (unlikely_cond) for (;;) invariant; we want to hoist 'invariant' but only from the inner loop even if it is invariant also in the outer loop. For this case, theorotically I think the master GCC will optimize it to: invariant; for (;;) if (unlikely_cond) for (;;) ; 'invariant' is moved out of outer loop, but with the patch, it will get: for (;;) if (unlikely_cond) { invariant; for (;;) ;
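A self-contained, compilable version of the loop-nest example discussed in this exchange may help (hand-written here, not a testcase from the thread): the computation is invariant in both loops, but when the guarding condition is cold the preferable hoisting target is the inner loop's preheader, which sits under the condition, rather than the outer loop's preheader.

extern int unlikely_cond;

void
f (double *a, int n, double x)
{
  for (int i = 0; i < n; i++)
    if (unlikely_cond)
      for (int j = 0; j < n; j++)
        {
          /* Invariant in both loops; hoisting it only out of the inner
             loop keeps it on the cold path guarded by unlikely_cond.  */
          double inv = x * x + 1.0;
          a[j] += inv;
        }
}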
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/2 18:37, Richard Biener wrote: On Thu, 2 Sep 2021, Xionghu Luo wrote: On 2021/9/2 16:50, Richard Biener wrote: On Thu, 2 Sep 2021, Richard Biener wrote: On Thu, 2 Sep 2021, Xionghu Luo wrote: On 2021/9/1 17:58, Richard Biener wrote: This fixes the CFG walk order of fill_always_executed_in to use RPO oder rather than the dominator based order computed by get_loop_body_in_dom_order. That fixes correctness issues with unordered dominator children. The RPO order computed by rev_post_order_and_mark_dfs_back_seme in its for-iteration mode is a good match for the algorithm. Xionghu, I've tried to only fix the CFG walk order issue and not change anything else with this so we have a more correct base to work against. The code still walks inner loop bodies up to loop depth times and thus is quadratic in the loop depth. Bootstrapped and tested on x86_64-unknown-linux-gnu, if you don't have any comments I plan to push this and then revisit what we were circling around. LGTM, thanks. I pushed it, thought again in the attempt to build a testcase and concluded I was wrong with the appearant mishandling of contains_call - get_loop_body_in_dom_order seems to be exactly correct for this specific case. So I reverted the commit again. And I figured what the /* In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it. */ comment was about. The code was present before the fix for PR78185 and it was supposed to catch the case where the entered inner loop is not finite. Just as the testcase from PR78185 shows the stopping was done too late when the exit block was already marked as to be always executed. A simpler fix for PR78185 would have been to move if (!flow_bb_inside_loop_p (inn_loop, bb)) break; before setting of last = bb. In fact the installed fix was more pessimistic than that given it terminated already when entering a possibly infinite loop. So we can improve that by doing sth like which should also improve the situation for some of the cases you were looking at? What remains is that we continue to stop when entering a not always executed loop: if (bb->loop_father->header == bb) { if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) break; Yes. This will cause blocks after inner loop missed to be check if they are actually ALWAYS_EXECUTED. I am afraid O(N^2) is inevitable here... Yes. What we can try is pre-computing whether a loop has a call or an inner loop that might not terminate and then when that's true for the loop to be entered continue to break; but when not, skip processing that loop blocks (but we still fill the blocks array, and we do need to do this in the order for the loop we're processing ...). So what I was thinking was to somehow embed the dominator walk of get_loop_body_in_dom_order and instead of pre-recording the above info (call, infinite loop) for loops, pre-record it on the dominator tree so that we can ask "in any of our dominator children, is there a call or an infinite loop" and thus cut the dominator walk at loop header blocks that are not dominating the outer loop latch ... Of course the simplistic solution might be to simply do if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb) && ((loop_depth (bb->loop_father) - loop_depth (loop)) > param_max_lim_loop_depth_lookahead))) break; and thus limit the processing of conditionally executed inner loops by relative depth ... 
as you say the actual processing is unlikely to be the bottleneck for the degenerate cases of a very deep nest of conditionally executed loops. But still for this case get_loop_body_in_dom_order is doing quadratic processing so we can also say that another linear walk over the produced array does not increase complexity.> volatile int flag, bar; double foo (double *valp) { double sum = 0; for (int i = 0; i < 256; ++i) { for (int j = 0; j < 256; ++j) bar = flag; if (flag) sum += 1.; sum += *valp; } return sum; } The patch still fails to handle cases like this: struct X { int i; int j; int k;}; volatile int m; void bar (struct X *x, int n, int l, int k) { for (int i = 0; i < l; i++) { if (k) for (int j = 0; j < l; j++) { if (m) x->i = m; else x->i = 1 - m; int *r = &x->k; int tem2 = *r; x->k += tem2 * j; } x->i = m; } } x->i is still not marked ALWAYS_EXECUTED for outer loop. Collected data when build gcc stage1 and bootstrap. There are still about 9% bbs are missed to be marked with ALWAYS_EXECUTED. Execution time of fill_always_exec
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/9 18:55, Richard Biener wrote: diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 5d6845478e7..4b187c2cdaf 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3074,15 +3074,13 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) break; if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; - - /* In a loop that is always entered we may proceed anyway. -But record that we entered it and stop once we leave it -since it might not be finite. */ - inn_loop = bb->loop_father; - } + /* Record that we enter into a subloop since it might not + be finite. */ + /* ??? Entering into a not always executed subloop makes + fill_always_executed_in quadratic in loop depth since + we walk those loops N times. This is not a problem + in practice though, see PR102253 for a worst-case testcase. */ + inn_loop = bb->loop_father; Yes your two patches extracted the get_loop_body_in_dom_order out and removed the inn_loop break logic when it doesn't dominate outer loop. Confirmed the replacement could improve for saving ~10% build time due to not full DOM walker and marked the previously ignored ALWAYS_EXECUTED bbs. But if we don't break for inner loop again, why still keep the *inn_loop* variable? It seems unnecessary and confusing, could we just remove it and restore the original infinte loop check in bb->succs for better understanding? diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index d1e2104233b..82a0509e0c4 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3200,7 +3200,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) { basic_block bb = NULL, last = NULL; edge e; - class loop *inn_loop = loop; if (ALWAYS_EXECUTED_IN (loop->header) == NULL) { @@ -3213,17 +3212,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) edge_iterator ei; bb = worklist.pop (); - if (!flow_bb_inside_loop_p (inn_loop, bb)) - { - /* When we are leaving a possibly infinite inner loop -we have to stop processing. */ - if (!finite_loop_p (inn_loop)) - break; - /* If the loop was finite we can continue with processing -the loop we exited to. */ - inn_loop = bb->loop_father; - } - if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; @@ -3232,8 +3220,15 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) /* If LOOP exits from this BB stop processing. */ FOR_EACH_EDGE (e, ei, bb->succs) + { if (!flow_bb_inside_loop_p (loop, e->dest)) break; + /* Or we enter a possibly non-finite loop. */ + if (flow_loop_nested_p (bb->loop_father, + e->dest->loop_father) + && ! finite_loop_p (e->dest->loop_father)) + break; + } if (e) break; @@ -3242,15 +3237,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (bb->flags & BB_IRREDUCIBLE_LOOP) break; - if (bb->loop_father->header == bb) - /* Record that we enter into a subloop since it might not - be finite. */ - /* ??? Entering into a not always executed subloop makes - fill_always_executed_in quadratic in loop depth since - we walk those loops N times. This is not a problem - in practice though, see PR102253 for a worst-case testcase. */ - inn_loop = bb->loop_father; - /* Walk the body of LOOP sorted by dominance relation. Additionally, if a basic block S dominates the latch, then only blocks dominated by S are after it. /* Walk the body of LOOP sorted by dominance relation. Additionally, if a basic block S dominates the latch, then only blocks dominated -- Thanks, Xionghu
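The finite versus possibly-infinite distinction this exchange settles on can be illustrated with a small hand-written example (not one of the testcases from the thread):

volatile int m;

void
f (int *x, int n)
{
  for (int i = 0; i < n; i++)
    {
      for (int j = 0; j < n; j++)   /* finite inner loop */
        x[2] += m;

      /* Reached on every outer iteration because the loop above is
         known to terminate, so it may be marked ALWAYS_EXECUTED.  */
      x[1] = m;

      while (m)                     /* possibly infinite */
        ;

      /* We may never leave the while loop, so this must not be
         treated as always executed in the outer loop.  */
      x[0] = m;
    }
}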
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/10 21:54, Xionghu Luo via Gcc-patches wrote: On 2021/9/9 18:55, Richard Biener wrote: diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 5d6845478e7..4b187c2cdaf 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3074,15 +3074,13 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) break; if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; - - /* In a loop that is always entered we may proceed anyway. - But record that we entered it and stop once we leave it - since it might not be finite. */ - inn_loop = bb->loop_father; - } + /* Record that we enter into a subloop since it might not + be finite. */ + /* ??? Entering into a not always executed subloop makes + fill_always_executed_in quadratic in loop depth since + we walk those loops N times. This is not a problem + in practice though, see PR102253 for a worst-case testcase. */ + inn_loop = bb->loop_father; Yes your two patches extracted the get_loop_body_in_dom_order out and removed the inn_loop break logic when it doesn't dominate outer loop. Confirmed the replacement could improve for saving ~10% build time due to not full DOM walker and marked the previously ignored ALWAYS_EXECUTED bbs. But if we don't break for inner loop again, why still keep the *inn_loop* variable? It seems unnecessary and confusing, could we just remove it and restore the original infinte loop check in bb->succs for better understanding? What's more, the refine of this fix is incorrect for PR78185. commit 483e400870601f650c80f867ec781cd5f83507d6 Author: Richard Biener Date: Thu Sep 2 10:47:35 2021 +0200 Refine fix for PR78185, improve LIM for code after inner loops This refines the fix for PR78185 after understanding that the code regarding to the comment 'In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it.' was supposed to protect us from leaving possibly infinite inner loops. The simpler fix of moving the misplaced stopping code can then be refined to continue processing when the exited inner loop is finite, improving invariant motion for cases like in the added testcase. 2021-09-02 Richard Biener * tree-ssa-loop-im.c (fill_always_executed_in_1): Refine fix for PR78185 and continue processing when leaving finite inner loops. * gcc.dg/tree-ssa/ssa-lim-16.c: New testcase. 3<--- || 6<---| | \ | | | \ | | 48 | |--- | | | | 5 7-- | 1 loop 2 is an infinite loop, it is only ALWAYS_EXECUTED for loop 2, but r12-3313-g483e40087 sets it ALWAYS_EXECUTED for loop 1. 
We need to restore it like this: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579195.html diff of pr78185.c.138t.lim2: ;; ;; Loop 1 ;; header 3, latch 7 ;; depth 1, outer 0 ;; nodes: 3 7 4 6 8 ;; ;; Loop 2 ;; header 6, latch 8 ;; depth 2, outer 1 ;; nodes: 6 8 ;; 2 succs { 3 } ;; 3 succs { 6 } ;; 6 succs { 4 8 } ;; 8 succs { 6 } ;; 4 succs { 7 5 } ;; 7 succs { 3 } ;; 5 succs { 1 } Memory reference 1: var1 -BB 6 is always executed in loop 1 BB 3 is always executed in loop 1 +BB 6 is always executed in loop 2 Basic block 3 (loop 1 -- depth 1): Basic block 6 (loop 2 -- depth 2): diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index d1e2104233b..82a0509e0c4 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3200,7 +3200,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) { basic_block bb = NULL, last = NULL; edge e; - class loop *inn_loop = loop; if (ALWAYS_EXECUTED_IN (loop->header) == NULL) { @@ -3213,17 +3212,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) edge_iterator ei; bb = worklist.pop (); - if (!flow_bb_inside_loop_p (inn_loop, bb)) - { - /* When we are leaving a possibly infinite inner loop - we have to stop processing. */ - if (!finite_loop_p (inn_loop)) - break; - /* If the loop was finite we can continue with processing - the loop we exited to. */ - inn_loop = bb->loop_father; - } - if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; @@ -3232,8 +3220,15 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) /* If LOOP exits from this BB stop processing. */ FOR_EACH_EDGE (e, ei, bb->succs) + { if (!flow_bb_inside_loop_p (loop, e->dest))
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/13 16:17, Richard Biener wrote: On Mon, 13 Sep 2021, Xionghu Luo wrote: On 2021/9/10 21:54, Xionghu Luo via Gcc-patches wrote: On 2021/9/9 18:55, Richard Biener wrote: diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 5d6845478e7..4b187c2cdaf 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3074,15 +3074,13 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) break; if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; - - /* In a loop that is always entered we may proceed anyway. - But record that we entered it and stop once we leave it - since it might not be finite. */ - inn_loop = bb->loop_father; - } + /* Record that we enter into a subloop since it might not + be finite. */ + /* ??? Entering into a not always executed subloop makes + fill_always_executed_in quadratic in loop depth since + we walk those loops N times. This is not a problem + in practice though, see PR102253 for a worst-case testcase. */ + inn_loop = bb->loop_father; Yes your two patches extracted the get_loop_body_in_dom_order out and removed the inn_loop break logic when it doesn't dominate outer loop. Confirmed the replacement could improve for saving ~10% build time due to not full DOM walker and marked the previously ignored ALWAYS_EXECUTED bbs. But if we don't break for inner loop again, why still keep the *inn_loop* variable? It seems unnecessary and confusing, could we just remove it and restore the original infinte loop check in bb->succs for better understanding? What's more, the refine of this fix is incorrect for PR78185. commit 483e400870601f650c80f867ec781cd5f83507d6 Author: Richard Biener Date: Thu Sep 2 10:47:35 2021 +0200 Refine fix for PR78185, improve LIM for code after inner loops This refines the fix for PR78185 after understanding that the code regarding to the comment 'In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it.' was supposed to protect us from leaving possibly infinite inner loops. The simpler fix of moving the misplaced stopping code can then be refined to continue processing when the exited inner loop is finite, improving invariant motion for cases like in the added testcase. 2021-09-02 Richard Biener * tree-ssa-loop-im.c (fill_always_executed_in_1): Refine fix for PR78185 and continue processing when leaving finite inner loops. * gcc.dg/tree-ssa/ssa-lim-16.c: New testcase. 3<--- || 6<---| | \ | | | \ | | 48 | |--- | | | | 5 7-- | 1 loop 2 is an infinite loop, it is only ALWAYS_EXECUTED for loop 2, but r12-3313-g483e40087 sets it ALWAYS_EXECUTED for loop 1. We need to restore it like this: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579195.html I don't understand - BB6 is the header block of loop 2 which is always entered and thus BB6 is always executed at least once. The important part is that BB4 which follows the inner loop is _not_ always executed because we don't know if we will exit the inner loop. What am I missing? Oh, I see. I only noticed the functionality change of the patch on the case and no failure check of it, misunderstood it was a regression instead of an improvement to also hoisting invariants from infinite loop, sorry about that. Finally, the function fill_always_executed_in_1 could mark all ALWAYS_EXECUTED bb both including and after all subloops' bb but break after exiting from infinite subloops with better performance, thanks. 
The only thing to be worried about is that replacing get_loop_body_in_dom_order makes the code a bit more complicated for later readers, as loop depth and DOM order are not a problem here any more? ;) Richard. -- Thanks, Xionghu
Re: Ping ^ 3: [PATCH] rs6000: Fix wrong code generation for vec_sel [PR94613]
Ping^3, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/9/6 08:52, Xionghu Luo via Gcc-patches wrote: Ping^2, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/6/30 09:42, Xionghu Luo via Gcc-patches wrote: Gentle ping, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/5/14 14:57, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/5/13 18:49, Segher Boessenkool wrote: Hi! On Fri, Apr 30, 2021 at 01:32:58AM -0500, Xionghu Luo wrote: The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. Per element selection is a subset of per bit-wise selection,with the patch the pattern is written using bit operations. But there are 8 different patterns to define "op0 := (op1 & ~op3) | (op2 & op3)": (~op3&op1) | (op3&op2), (~op3&op1) | (op2&op3), (op3&op2) | (~op3&op1), (op2&op3) | (~op3&op1), (op1&~op3) | (op3&op2), (op1&~op3) | (op2&op3), (op3&op2) | (op1&~op3), (op2&op3) | (op1&~op3), Combine pass will swap (op1&~op3) to (~op3&op1) due to commutative canonical, which could reduce it to the FIRST 4 patterns, but it won't swap (op2&op3) | (~op3&op1) to (~op3&op1) | (op2&op3), so this patch handles it with two patterns with different NOT op3 position and check equality inside it. Yup, that latter case does not have canonicalisation rules. Btw, not only combine does this canonicalisation: everything should, non-canonical RTL is invalid RTL (in the instruction stream, you can do everything in temporary code of course, as long as the RTL isn't malformed). -(define_insn "*altivec_vsel" +(define_insn "altivec_vsel" [(set (match_operand:VM 0 "altivec_register_operand" "=v") - (if_then_else:VM - (ne:CC (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 4 "zero_constant" "")) - (match_operand:VM 2 "altivec_register_operand" "v") - (match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" + (ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_operand:VM 2 "altivec_register_operand" "v") + (match_operand:VM 4 "altivec_register_operand" "v"] + "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) + && (rtx_equal_p (operands[2], operands[3]) + || rtx_equal_p (operands[4], operands[3]))" + { + if (rtx_equal_p (operands[2], operands[3])) + return "vsel %0,%1,%4,%3"; + else + return "vsel %0,%1,%2,%3"; + } [(set_attr "type" "vecmove")]) That rtx_equal_p stuff is nice and tricky, but it is a bit too tricky I think. So please write this as two patterns (and keep the expand if that helps). I was a bit concerned that there would be a lot of duplicate code if we write two patterns for each vsel, totally 4 similar patterns in altivec.md and another 4 in vsx.md make it difficult to maintain, however I updated it since you prefer this way, as you pointed out the xxsel in vsx.md could be folded by later patch. +(define_insn "altivec_vsel2" (same here of course). ;; Fused multiply add. 
diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index f5676255387..d65bdc01055 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -3362,11 +3362,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI }, { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, - { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, + { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI_UNS, Are the _uns things still used for anything? But, let's not change this until Bill's stuff is in :-) Why do you want to change this here, btw? I don't understand. OK, they are actually "unsigned type" overload builtin functions, change it or not so far won't cause functionality issue, I will revert this change in the updated patch. + if (target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) No space after ! and other unary operators (except for casts and other operators you write with alphanumerics, like "sizeof"). I know you copied this code, but :-)
[PATCH v2 2/2] rs6000: Fold xxsel to vsel since they have same semantics
Fold xxsel to vsel like xxperm/vperm to avoid duplicate code. gcc/ChangeLog: 2021-09-17 Xionghu Luo * config/rs6000/altivec.md: Add vsx register constraints. * config/rs6000/vsx.md (vsx_xxsel): Delete. (vsx_xxsel2): Likewise. (vsx_xxsel3): Likewise. (vsx_xxsel4): Likewise. --- gcc/config/rs6000/altivec.md | 60 +++ gcc/config/rs6000/vsx.md | 57 -- gcc/testsuite/gcc.target/powerpc/builtins-1.c | 2 +- 3 files changed, 37 insertions(+), 82 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index a3424e1a458..4b4ca2c5d17 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -684,56 +684,68 @@ (define_insn "*altivec_gev4sf" [(set_attr "type" "veccmp")]) (define_insn "altivec_vsel" - [(set (match_operand:VM 0 "altivec_register_operand" "=v") + [(set (match_operand:VM 0 "register_operand" "=wa,v") (ior:VM (and:VM - (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) - (match_operand:VM 1 "altivec_register_operand" "v")) + (not:VM (match_operand:VM 3 "register_operand" "wa,v")) + (match_operand:VM 1 "register_operand" "wa,v")) (and:VM (match_dup 3) - (match_operand:VM 2 "altivec_register_operand" "v"] + (match_operand:VM 2 "register_operand" "wa,v"] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" - "vsel %0,%1,%2,%3" - [(set_attr "type" "vecmove")]) + "@ + xxsel %x0,%x1,%x2,%x3 + vsel %0,%1,%2,%3" + [(set_attr "type" "vecmove") + (set_attr "isa" "")]) (define_insn "altivec_vsel2" - [(set (match_operand:VM 0 "altivec_register_operand" "=v") + [(set (match_operand:VM 0 "register_operand" "=wa,v") (ior:VM (and:VM - (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) - (match_operand:VM 1 "altivec_register_operand" "v")) + (not:VM (match_operand:VM 3 "register_operand" "wa,v")) + (match_operand:VM 1 "register_operand" "wa,v")) (and:VM - (match_operand:VM 2 "altivec_register_operand" "v") + (match_operand:VM 2 "register_operand" "wa,v") (match_dup 3] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" - "vsel %0,%1,%2,%3" - [(set_attr "type" "vecmove")]) + "@ + xxsel %x0,%x1,%x2,%x3 + vsel %0,%1,%2,%3" + [(set_attr "type" "vecmove") + (set_attr "isa" "")]) (define_insn "altivec_vsel3" - [(set (match_operand:VM 0 "altivec_register_operand" "=v") + [(set (match_operand:VM 0 "register_operand" "=wa,v") (ior:VM (and:VM - (match_operand:VM 3 "altivec_register_operand" "v") - (match_operand:VM 1 "altivec_register_operand" "v")) + (match_operand:VM 3 "register_operand" "wa,v") + (match_operand:VM 1 "register_operand" "wa,v")) (and:VM (not:VM (match_dup 3)) - (match_operand:VM 2 "altivec_register_operand" "v"] + (match_operand:VM 2 "register_operand" "wa,v"] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" - "vsel %0,%2,%1,%3" - [(set_attr "type" "vecmove")]) + "@ + xxsel %x0,%x2,%x1,%x3 + vsel %0,%2,%1,%3" + [(set_attr "type" "vecmove") + (set_attr "isa" "")]) (define_insn "altivec_vsel4" - [(set (match_operand:VM 0 "altivec_register_operand" "=v") + [(set (match_operand:VM 0 "register_operand" "=wa,v") (ior:VM (and:VM - (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "register_operand" "wa,v") + (match_operand:VM 3 "register_operand" "wa,v")) (and:VM (not:VM (match_dup 3)) - (match_operand:VM 2 "altivec_register_operand" "v"] + (match_operand:VM 2 "register_operand" "wa,v"] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" - "vsel
[PATCH v2 0/2] Fix vec_sel code generation and merge xxsel to vsel
These two patches are updated version from: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579490.html Changes: 1. Fix alignment error in md files. 2. Replace rtx_equal_p with match_dup. 3. Use register_operand instead of gpc_reg_operand to align with vperm/xxperm. 4. Regression tested pass on P8LE. Xionghu Luo (2): rs6000: Fix wrong code generation for vec_sel [PR94613] rs6000: Fold xxsel to vsel since they have same semantics gcc/config/rs6000/altivec.md | 84 ++- gcc/config/rs6000/rs6000-call.c | 62 ++ gcc/config/rs6000/rs6000.c| 19 ++--- gcc/config/rs6000/vector.md | 26 +++--- gcc/config/rs6000/vsx.md | 25 -- gcc/testsuite/gcc.target/powerpc/builtins-1.c | 2 +- gcc/testsuite/gcc.target/powerpc/pr94613.c| 47 +++ 7 files changed, 193 insertions(+), 72 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr94613.c -- 2.25.1
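Both patches rest on vsel/xxsel being a bit-wise selection, op0 := (op1 & ~op3) | (op2 & op3), rather than a per-element if_then_else. A hand-written scalar model (illustration only, not code from the series) shows the difference: with a mask that is neither all-zeros nor all-ones, the bit-wise definition mixes bits of both inputs, which a whole-value NE-against-zero model cannot express.

#include <assert.h>
#include <stdint.h>

/* Bit-wise select, the actual vsel semantics.  */
static uint32_t
bitwise_sel (uint32_t a, uint32_t b, uint32_t mask)
{
  return (a & ~mask) | (b & mask);
}

int
main (void)
{
  uint32_t a = 0x11223344, b = 0xaabbccdd, mask = 0x0000ffff;
  /* An if_then_else model (mask != 0 selects b wholesale) would give
     0xaabbccdd here; the bit-wise definition mixes A and B.  */
  assert (bitwise_sel (a, b, mask) == 0x1122ccdd);
  return 0;
}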
[PATCH v2 1/2] rs6000: Fix wrong code generation for vec_sel [PR94613]
The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. Per element selection is a subset of per bit-wise selection,with the patch the pattern is written using bit operations. But there are 8 different patterns to define "op0 := (op1 & ~op3) | (op2 & op3)": (~op3&op1) | (op3&op2), (~op3&op1) | (op2&op3), (op3&op2) | (~op3&op1), (op2&op3) | (~op3&op1), (op1&~op3) | (op3&op2), (op1&~op3) | (op2&op3), (op3&op2) | (op1&~op3), (op2&op3) | (op1&~op3), The latter 4 cases does not follow canonicalisation rules, non-canonical RTL is invalid RTL in vregs pass. Secondly, combine pass will swap (op1&~op3) to (~op3&op1) by commutative canonical, which could reduce it to the FIRST 4 patterns, but it won't swap (op2&op3) | (~op3&op1) to (~op3&op1) | (op2&op3), so this patch handles it with 4 patterns with different NOT op3 position and check equality inside it. Tested pass on Power8LE, any comments? gcc/ChangeLog: 2021-09-17 Xionghu Luo * config/rs6000/altivec.md (*altivec_vsel): Change to ... (altivec_vsel): ... this and update define. (*altivec_vsel_uns): Delete. (altivec_vsel2): New define_insn. (altivec_vsel3): Likewise. (altivec_vsel4): Likewise. * config/rs6000/rs6000-call.c (altivec_expand_vec_sel_builtin): New. (altivec_expand_builtin): Call altivec_expand_vec_sel_builtin to expand vel_sel. * config/rs6000/rs6000.c (rs6000_emit_vector_cond_expr): Use bit-wise selection instead of per element. * config/rs6000/vector.md: * config/rs6000/vsx.md (*vsx_xxsel): Change to ... (vsx_xxsel): ... this and update define. (*vsx_xxsel_uns): Delete. (vsx_xxsel2): New define_insn. (vsx_xxsel3): Likewise. (vsx_xxsel4): Likewise. gcc/testsuite/ChangeLog: 2021-09-17 Xionghu Luo * gcc.target/powerpc/pr94613.c: New test. 
--- gcc/config/rs6000/altivec.md | 62 -- gcc/config/rs6000/rs6000-call.c| 62 ++ gcc/config/rs6000/rs6000.c | 19 +++ gcc/config/rs6000/vector.md| 26 + gcc/config/rs6000/vsx.md | 60 - gcc/testsuite/gcc.target/powerpc/pr94613.c | 47 6 files changed, 221 insertions(+), 55 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr94613.c diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 93d237156d5..a3424e1a458 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -683,26 +683,56 @@ (define_insn "*altivec_gev4sf" "vcmpgefp %0,%1,%2" [(set_attr "type" "veccmp")]) -(define_insn "*altivec_vsel" +(define_insn "altivec_vsel" [(set (match_operand:VM 0 "altivec_register_operand" "=v") - (if_then_else:VM -(ne:CC (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 4 "zero_constant" "")) -(match_operand:VM 2 "altivec_register_operand" "v") -(match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" + (ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_dup 3) + (match_operand:VM 2 "altivec_register_operand" "v"] + "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" + "vsel %0,%1,%2,%3" [(set_attr "type" "vecmove")]) -(define_insn "*altivec_vsel_uns" +(define_insn "altivec_vsel2" [(set (match_operand:VM 0 "altivec_register_operand" "=v") - (if_then_else:VM -(ne:CCUNS (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 4 "zero_constant" "")) -(match_operand:VM 2 "altivec_register_operand" "v") -(match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" + (ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_operand:VM 2 "altivec_register_operand" "v") + (match_dup 3] + "
Re: Ping ^ 3: [PATCH] rs6000: Fix wrong code generation for vec_sel [PR94613]
On 2021/9/15 21:11, David Edelsohn wrote: Hi, Xionhu Should "altivec_vsel2" .. 3 .. 4 be "*altivec_vsel2", etc. because they are combiner patterns and never referenced by name? Only the first, named pattern is referenced by the builtin code. Thanks, updated the patchset with Segher's review comments, he didn't mention about this and sorry to forget change this part, I am also not sure whether "altivec_vsel2" .. 3 .. 4 will be used/generated or optimized by expander in future, is there any benefit to add "*" to the define_insn patterns? Other than that question / suggestion, this patch is okay. Please coordinate with Bill and his builtin patches. OK. Thanks, David On Wed, Sep 15, 2021 at 3:50 AM Xionghu Luo wrote: Ping^3, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/9/6 08:52, Xionghu Luo via Gcc-patches wrote: Ping^2, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/6/30 09:42, Xionghu Luo via Gcc-patches wrote: Gentle ping, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/5/14 14:57, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/5/13 18:49, Segher Boessenkool wrote: Hi! On Fri, Apr 30, 2021 at 01:32:58AM -0500, Xionghu Luo wrote: The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. Per element selection is a subset of per bit-wise selection,with the patch the pattern is written using bit operations. But there are 8 different patterns to define "op0 := (op1 & ~op3) | (op2 & op3)": (~op3&op1) | (op3&op2), (~op3&op1) | (op2&op3), (op3&op2) | (~op3&op1), (op2&op3) | (~op3&op1), (op1&~op3) | (op3&op2), (op1&~op3) | (op2&op3), (op3&op2) | (op1&~op3), (op2&op3) | (op1&~op3), Combine pass will swap (op1&~op3) to (~op3&op1) due to commutative canonical, which could reduce it to the FIRST 4 patterns, but it won't swap (op2&op3) | (~op3&op1) to (~op3&op1) | (op2&op3), so this patch handles it with two patterns with different NOT op3 position and check equality inside it. Yup, that latter case does not have canonicalisation rules. Btw, not only combine does this canonicalisation: everything should, non-canonical RTL is invalid RTL (in the instruction stream, you can do everything in temporary code of course, as long as the RTL isn't malformed). -(define_insn "*altivec_vsel" +(define_insn "altivec_vsel" [(set (match_operand:VM 0 "altivec_register_operand" "=v") -(if_then_else:VM - (ne:CC (match_operand:VM 1 "altivec_register_operand" "v") -(match_operand:VM 4 "zero_constant" "")) - (match_operand:VM 2 "altivec_register_operand" "v") - (match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" +(ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_operand:VM 2 "altivec_register_operand" "v") + (match_operand:VM 4 "altivec_register_operand" "v"] + "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) + && (rtx_equal_p (operands[2], operands[3]) + || rtx_equal_p (operands[4], operands[3]))" + { +if (rtx_equal_p (operands[2], operands[3])) + return "vsel %0,%1,%4,%3"; +else + return "vsel %0,%1,%2,%3"; + } [(set_attr "type" "vecmove")]) That rtx_equal_p stuff is nice and tricky, but it is a bit too tricky I think. So please write this as two patterns (and keep the expand if that helps). 
I was a bit concerned that there would be a lot of duplicate code if we write two patterns for each vsel, totally 4 similar patterns in altivec.md and another 4 in vsx.md make it difficult to maintain, however I updated it since you prefer this way, as you pointed out the xxsel in vsx.md could be folded by later patch. +(define_insn "altivec_vsel2" (same here of course). ;; Fused multiply add. diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index f5676255387..d65bdc01055 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -3362,11 +3362,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI }, { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, -
Re: [PATCH] Fix loop split incorrect count and probability
On 2021/8/11 17:16, Richard Biener wrote: On Wed, 11 Aug 2021, Xionghu Luo wrote: On 2021/8/10 22:47, Richard Biener wrote: On Mon, 9 Aug 2021, Xionghu Luo wrote: Thanks, On 2021/8/6 19:46, Richard Biener wrote: On Tue, 3 Aug 2021, Xionghu Luo wrote: loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] - [local count: 315357972]: + [local count: 104068130]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [INV] +goto ; [67.00%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [count: 0]: + [local count: 70429947]: # i_23 = PHI # prephitmp_25 = PHI - [local count: 955630225]: + [local count: 640272252]: # i_15 = PHI # prephitmp_16 = PHI i_22 = inc (i_15); if (n_7(D) > i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3a09bbc39e5..8e5a7ded0f7 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -583,10 +583,10 @@ split_loop (class loop *loop1) basic_block cond_bb; if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); there is no 'true_edge' variable at this point. Sorry, missed the above hunk when split the patch. gcc_assert (loop2); @@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, -profile_probability::always (), -profile_probability::never (), -profile_probability::always (), -profile_probability::always (), +invar_branch->probability.invert (), +invar_branch->probability, +invar_branch->probability.invert (), +invar_branch->probability, true); if (!loop2) { The patch introduction seems to talk about do_split_loop_on_cond only. split_loop faces similar issue though it sets the two branches to 100% vs 100% and no scaling which seems also incorrect. 
Since loop versioning inserts a condition with the passed probabilities but in this case a 'boolean_true_node' condition the th
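For context, here is a hand-written sketch of the transformation whose edge probabilities the patch adjusts. It illustrates classic split_loop on a guard that holds for a leading range of iterations; do_split_loop_on_cond handles the related semi-invariant-condition case shown in the dump above. This is an illustration, not one of the testcases.

void
before (int *a, int *b, int n, int m)
{
  for (int i = 0; i < n; i++)
    {
      if (i < m)        /* m is loop-invariant */
        a[i] = 0;
      b[i] = 1;
    }
}

/* Conceptually what split_loop produces: two loops whose iteration
   counts, and hence profile counts, must stay proportional to the
   original loop instead of both being scaled to 100%.  */
void
after (int *a, int *b, int n, int m)
{
  int i = 0;
  for (; i < n && i < m; i++)
    {
      a[i] = 0;
      b[i] = 1;
    }
  for (; i < n; i++)
    b[i] = 1;
}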
Re: [RFC] Don't move cold code out of loop by checking bb count
On 2021/9/22 17:14, Richard Biener wrote: On Thu, Sep 9, 2021 at 3:56 AM Xionghu Luo wrote: On 2021/8/26 19:33, Richard Biener wrote: On Tue, Aug 10, 2021 at 4:03 AM Xionghu Luo wrote: Hi, On 2021/8/6 20:15, Richard Biener wrote: On Mon, Aug 2, 2021 at 7:05 AM Xiong Hu Luo wrote: There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". This patch does this profile count check in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if the loop bb is colder than loop preheader, don't hoist it out of loop. Also, the profile count in loop split pass should be corrected to avoid lim2 and lim4 mismatch behavior, currently, the new loop preheader generated by loop_version is set to "[count: 0]:", then lim4 after lsplt pass will move statement out of loop unexpectely when lim2 didn't move it. This change could fix regression on 544.nab_r from -1.55% to +0.46%. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. While I'm not familiar with the RTL invariant motion pass the patch there looks reasonable. Note that we should assess the profile quality somehow - I'm not sure how to do that, CCed Honza for that. Thanks. For the GIMPLE part the patch looks quite complicated - but note it probably has to be since LIM performs kind of a "CSE" on loads (and stores for store-motion), so when there are multiple stmts affected by a hoisting decision the biggest block count has to be accounted. Likewise when there are dependent stmts involved that might include conditional stmts (a "PHI"), but the overall cost should be looked at. Currently, The gimple code check two situations with the patch: 1) The statement or PHI‘s BB is *colder* then preheader, don't move it out of loop; 2) The statement or PHI's BB is *hotter* then preheader, but any of it's rhs couldn't be moved out of loop, also don't move it out of loop to avoid definition not dominates use error. But part 2) is obviously already done. What I tried to say is your heuristic doesn't integrate nicely with the pass but I admitted that it might be a bit difficult to find a place to add this heuristic. There is lim_data->cost which we could bias negatively but then this is a cost that is independent on the hoisting distance. But doing this would work at least for the case where the immediately enclosing loop preheader is hotter than the stmt and with this it would be a patch that's similarly simple as the RTL one. Another possibility is to simply only adjust PHI processing in compute_invariantness, capping movement according to the hotness heuristic. The same could be done for regular stmts there but I'm not sure that will do good in the end since this function is supposed to compute "correctness" (well, it also has the cost stuff), and it's not the place to do overall cost considerations. Thanks. I found that adding a function find_coldest_out_loop and check it in outermost_invariant_loop to find the coldest invariant loop between outermost loop and itself could also reach the purpose. 
Then the gimple code check is redundant and could be removed. May be I could collect the number of instructions not hoisted with the patch on regression tests and SPEC2017 to do a estimation for "multiple stmts affected" and "overall cost" need to be considered? But it seems move_computations_worker couldn't rollback if we still want to hoist multiple stmts out during the iterations? Now - GIMPLE LIM "costing" is somewhat backward right now and it isn't set up to consider those multiple involved stmts. Plus the store-motion part does not have any cost part (but it depends on previously decided invariant motions). I think the way you implemented the check will cause no hoisting to be performed instead of, say, hoisting to a different loop level only. Possibly shown when you consider a loop nest like for (;;) if (unlikely_cond) for (;;) invariant; we want to hoist 'invariant' but only from the inner loop even if it is invariant also in the outer loop. For this case, theorotically I think the master GCC will optimize it to: invariant; for (;;) if (unlikely_cond) for (;;) ; 'invariant' is moved out of outer loop, but with the patch, it
Re: [RFC] Don't move cold code out of loop by checking bb count
On 2021/9/23 10:13, Xionghu Luo via Gcc-patches wrote: On 2021/9/22 17:14, Richard Biener wrote: On Thu, Sep 9, 2021 at 3:56 AM Xionghu Luo wrote: On 2021/8/26 19:33, Richard Biener wrote: On Tue, Aug 10, 2021 at 4:03 AM Xionghu Luo wrote: Hi, On 2021/8/6 20:15, Richard Biener wrote: On Mon, Aug 2, 2021 at 7:05 AM Xiong Hu Luo wrote: There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". This patch does this profile count check in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if the loop bb is colder than loop preheader, don't hoist it out of loop. Also, the profile count in loop split pass should be corrected to avoid lim2 and lim4 mismatch behavior, currently, the new loop preheader generated by loop_version is set to "[count: 0]:", then lim4 after lsplt pass will move statement out of loop unexpectely when lim2 didn't move it. This change could fix regression on 544.nab_r from -1.55% to +0.46%. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. While I'm not familiar with the RTL invariant motion pass the patch there looks reasonable. Note that we should assess the profile quality somehow - I'm not sure how to do that, CCed Honza for that. Thanks. For the GIMPLE part the patch looks quite complicated - but note it probably has to be since LIM performs kind of a "CSE" on loads (and stores for store-motion), so when there are multiple stmts affected by a hoisting decision the biggest block count has to be accounted. Likewise when there are dependent stmts involved that might include conditional stmts (a "PHI"), but the overall cost should be looked at. Currently, The gimple code check two situations with the patch: 1) The statement or PHI‘s BB is *colder* then preheader, don't move it out of loop; 2) The statement or PHI's BB is *hotter* then preheader, but any of it's rhs couldn't be moved out of loop, also don't move it out of loop to avoid definition not dominates use error. But part 2) is obviously already done. What I tried to say is your heuristic doesn't integrate nicely with the pass but I admitted that it might be a bit difficult to find a place to add this heuristic. There is lim_data->cost which we could bias negatively but then this is a cost that is independent on the hoisting distance. But doing this would work at least for the case where the immediately enclosing loop preheader is hotter than the stmt and with this it would be a patch that's similarly simple as the RTL one. Another possibility is to simply only adjust PHI processing in compute_invariantness, capping movement according to the hotness heuristic. The same could be done for regular stmts there but I'm not sure that will do good in the end since this function is supposed to compute "correctness" (well, it also has the cost stuff), and it's not the place to do overall cost considerations. Thanks. 
I found that adding a function find_coldest_out_loop and check it in outermost_invariant_loop to find the coldest invariant loop between outermost loop and itself could also reach the purpose. Then the gimple code check is redundant and could be removed. May be I could collect the number of instructions not hoisted with the patch on regression tests and SPEC2017 to do a estimation for "multiple stmts affected" and "overall cost" need to be considered? But it seems move_computations_worker couldn't rollback if we still want to hoist multiple stmts out during the iterations? Now - GIMPLE LIM "costing" is somewhat backward right now and it isn't set up to consider those multiple involved stmts. Plus the store-motion part does not have any cost part (but it depends on previously decided invariant motions). I think the way you implemented the check will cause no hoisting to be performed instead of, say, hoisting to a different loop level only. Possibly shown when you consider a loop nest like for (;;) if (unlikely_cond) for (;;) invariant; we want to hoist 'invariant' but only from the inner loop even if it is invariant also in the outer loop. For this case, theorotically I think the master GCC will optimize it to: invariant; for (;;) if (unlikely_cond)
Re: [RFC] Don't move cold code out of loop by checking bb count
Update the patch to v3, not sure whether you prefer the paste style and continue to link the previous thread as Segher dislikes this... [PATCH v3] Don't move cold code out of loop by checking bb count Changes: 1. Handle max_loop in determine_max_movement instead of outermost_invariant_loop. 2. Remove unnecessary changes. 3. Add for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body) in can_sm_ref_p. 4. "gsi_next (&bsi);" in move_computations_worker is kept since it caused infinite loop when implementing v1 and the iteration is missed to be updated actually. v1: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html v2: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579086.html There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". In gimple LIM analysis, add find_coldest_out_loop to move invariants to expected target loop, if profile count of the loop bb is colder than target loop preheader, it won't be hoisted out of loop. Likely for store motion, if all locations of the REF in loop is cold, don't do store motion of it. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. gcc/ChangeLog: * loop-invariant.c (find_invariants_bb): Check profile count before motion. (find_invariants_body): Add argument. * tree-ssa-loop-im.c (find_coldest_out_loop): New function. (determine_max_movement): Use find_coldest_out_loop. (move_computations_worker): Adjust and fix iteration udpate. (execute_sm_exit): Check pointer validness. (class ref_in_loop_hot_body): New functor. (ref_in_loop_hot_body::operator): New. (can_sm_ref_p): Use for_all_locs_in_loop. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/recip-3.c: Adjust. * gcc.dg/tree-ssa/ssa-lim-18.c: New test. * gcc.dg/tree-ssa/ssa-lim-19.c: New test. * gcc.dg/tree-ssa/ssa-lim-20.c: New test. --- gcc/loop-invariant.c | 10 ++-- gcc/tree-ssa-loop-im.c | 61 -- gcc/testsuite/gcc.dg/tree-ssa/recip-3.c| 2 +- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c | 20 +++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 27 ++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c | 25 + gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c | 28 ++ 7 files changed, 165 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c index fca0c2b24be..5c3be7bf0eb 100644 --- a/gcc/loop-invariant.c +++ b/gcc/loop-invariant.c @@ -1183,9 +1183,14 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed) call. 
*/ static void -find_invariants_bb (basic_block bb, bool always_reached, bool always_executed) +find_invariants_bb (class loop *loop, basic_block bb, bool always_reached, + bool always_executed) { rtx_insn *insn; + basic_block preheader = loop_preheader_edge (loop)->src; + + if (preheader->count > bb->count) +return; FOR_BB_INSNS (bb, insn) { @@ -1214,8 +1219,7 @@ find_invariants_body (class loop *loop, basic_block *body, unsigned i; for (i = 0; i < loop->num_nodes; i++) -find_invariants_bb (body[i], - bitmap_bit_p (always_reached, i), +find_invariants_bb (loop, body[i], bitmap_bit_p (always_reached, i), bitmap_bit_p (always_executed, i)); } diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 4b187c2cdaf..655fab03442 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -417,6 +417,28 @@ movement_possibility (gimple *stmt) return ret; } +/* Find coldest loop between outmost_loop and loop by comapring profile count. */ + +static class loop * +find_coldest_out_loop (class loop *outmost_loop, class loop *loop, + basic_block curr_bb) +{ + class loop *cold_loop, *min_loop; + cold_loop = min_loop = outmost_loop; + profile_count min_count = loop_preheader_edge (min_loop)->src->count; + + if (curr_bb && curr_bb->count < loop_preheader_edge (loop)->src->count) +return NULL; + + while (min_loop != loop) +{ + min_loop = superloop_at_depth (loop, loop_depth (min_loop) + 1); + if (loop_preheader_edge (min_loop)->src->count < min_count) + cold_loop = min
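For the store-motion side of the change (item 3 in the changes listed above), a hypothetical reduced case, not one of the new testsuite files, would look like the following: every location of the store to *p inside the loop sits on a cold path, so the ref_in_loop_hot_body check is expected to make can_sm_ref_p reject it.

/* Hypothetical sketch: the only store to *p in the loop is on a cold
   path, so with the patch store motion should leave it in place
   instead of moving it out of the loop.  */
void bar (int n, int cold_flag, int *p)
{
  for (int i = 0; i < n; i++)
    if (__builtin_expect (cold_flag, 0))
      *p = i;  /* cold store; no store motion expected */
}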
Re: [PATCH v2 2/4] Refactor loop_version
On 2021/10/29 19:52, Richard Biener wrote: > On Wed, 27 Oct 2021, Xionghu Luo wrote: > >> loop_version currently does lv_adjust_loop_entry_edge >> before it loopifies the copy inserted on the header. This patch moves >> the condition generation later and thus we have four pieces to help >> understand how the adjustment works: >> 1) duplicating the loop on the entry edge. >> 2) loopify the duplicated new loop. >> 3) adjusting the CFG to insert a condition branching to either loop >> with lv_adjust_loop_entry_edge. >> 4) From loopify extract the scale_loop_frequencies bits. >> >> Also removed some pieces of code that seem obviously useless, though I am not >> completely sure: >> - redirect_all_edges since it is false and loopify is only called once. >> - extract_cond_bb_edges and lv_flush_pending_stmts (false_edge) as the >> edge is not redirected actually. > > This is OK (you can also commit this independently), thanks for the > cleanup. Thanks, committed this and [PATCH v2 4/4] as r12-4818 and r12-4819. -- Thanks, Xionghu
Re: [RFC] Don't move cold code out of loop by checking bb count
On 2021/10/29 19:48, Richard Biener wrote: > I'm talking about the can_sm_ref_p call, in that context 'loop' will > be the outermost loop of > interest, and we are calling this for all stores in a loop. We're doing > > +bool > +ref_in_loop_hot_body::operator () (mem_ref_loc *loc) > +{ > + basic_block curr_bb = gimple_bb (loc->stmt); > + class loop *inner_loop = curr_bb->loop_father; > + return find_coldest_out_loop (l, inner_loop, curr_bb); > > for each location the ref is accessed and the intent was to see > whether there's at least one > that we would like to move to 'loop'. Indeed since we only know the > common outer loop > but not the inner we are hoisting from there's not a single "coldest" > loop to cache and so > any caching we might want to perform could be applied to the other case as > well. > > I suppose the most natural thing to cache is for each loop the outer loop > where > its outer loop preheader would be hotter than the outer loops preheader so > that > > + while (outmost_loop != loop) > +{ > + if (bb_colder_than_loop_preheader (loop_preheader_edge > (outmost_loop)->src, > +loop_preheader_edge > (cold_loop)->src)) > + cold_loop = outmost_loop; > + outmost_loop = superloop_at_depth (loop, loop_depth (outmost_loop) + > 1); > +} > > could be instead written as > > coldest_loop = coldest_outermost_loop[loop->num]; > if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) > return outermost_loop; > return coldest_loop; > > ? And in the usual case coldest_outermost_loop[L] would be the loop tree > root. > It should be possible to compute such cache in a DFS walk of the loop tree > (the loop iterator by default visits in such order). Thanks. Updated the patch with your suggestion. Not sure whether it strictly conforms to your comments. Though the patch passed all my added tests (coverage not enough), I am still a bit worried: if the pre-computed coldest_loop is outside of outermost_loop, but outermost_loop is not the COLDEST LOOP, i.e. (outer->inner) [loop tree root, coldest_loop, outermost_loop, ..., second_coldest_loop, ..., loop], then function find_coldest_out_loop will return a loop NOT in accord with our expectation; it should return second_coldest_loop instead of outermost_loop? Changes: 1. Add function fill_coldest_out_loop to precompute the coldest outermost loop for each loop. 2. Rename find_coldest_out_loop to get_coldest_out_loop. 3. Add testcase ssa-lim-22.c to differentiate it from ssa-lim-19.c. v5 changes: 1. Refine comments for new functions. 2. Use basic_block instead of count in bb_colder_than_loop_preheader to align with the function name. 3. Refine with a simpler implementation for get_coldest_out_loop and ref_in_loop_hot_body::operator for better understanding. v4 changes: 1. Sort out profile_count comparison into function bb_cold_than_loop_preheader. 2. Update ref_in_loop_hot_body::operator () to find cold_loop before comparing. 3. Split RTL invariant motion part out. 4. Remove aux changes. v3 changes: 1. Handle max_loop in determine_max_movement instead of outermost_invariant_loop. 2. Remove unnecessary changes. 3. Add for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body) in can_sm_ref_p. 4. "gsi_next (&bsi);" in move_computations_worker is kept since it caused an infinite loop when implementing v1 and the iteration update was actually missed. 
v1: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html v2: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579086.html v3: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/580211.html v4: https://gcc.gnu.org/pipermail/gcc-patches/2021-October/581231.html v5: https://gcc.gnu.org/pipermail/gcc-patches/2021-October/581961.html There was a patch trying to avoid moving cold blocks out of loops: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". In gimple LIM analysis, add get_coldest_out_loop to move invariants to the expected target loop: if the profile count of the loop bb is colder than the target loop preheader, it won't be hoisted out of the loop. Likewise for store motion, if all locations of the REF in the loop are cold, don't do store motion of it. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r +1.98%, 526.blender_r +1.00% on P8LE. gcc/ChangeLog: * tree-ssa-loop-im.c (bb_colder_than_loop_preheader): New function. (get_coldest_out_loop): New function. (determine_max_movement): Use get_coldest_out_loop. (move_computations_worker): Adjust and fix iteration update. (class ref_in_loop_hot_body): New functor.
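As a self-contained sketch of the caching scheme discussed above (toy types and names, not the GCC implementation): a pre-order walk of the loop tree records, for each loop, the coldest loop on the path from the root, and the query then reduces to the depth comparison quoted from Richard's mail.

/* Toy model of the coldest-outermost-loop cache; all names here are
   illustrative, not GCC internals.  */
#include <stddef.h>

struct toy_loop
{
  int num;                   /* index into the cache */
  int depth;                 /* 0 for the loop-tree root */
  long preheader_count;      /* stand-in for the preheader profile count */
  struct toy_loop *parent;   /* immediate superloop, NULL for the root */
};

enum { TOY_MAX_LOOPS = 64 };
static struct toy_loop *coldest_outermost[TOY_MAX_LOOPS];

/* Fill the cache; must be called on parents before children
   (a pre-order walk of the loop tree).  */
static void
toy_fill_coldest (struct toy_loop *loop)
{
  if (loop->parent == NULL)
    coldest_outermost[loop->num] = loop;
  else
    {
      struct toy_loop *up = coldest_outermost[loop->parent->num];
      coldest_outermost[loop->num]
        = loop->preheader_count < up->preheader_count ? loop : up;
    }
}

/* Query: never hoist further out than OUTERMOST; otherwise prefer the
   cached coldest loop on the root-to-LOOP path.  */
static struct toy_loop *
toy_get_coldest (struct toy_loop *outermost, struct toy_loop *loop)
{
  struct toy_loop *coldest = coldest_outermost[loop->num];
  return coldest->depth < outermost->depth ? outermost : coldest;
}

Note that, exactly as worried about above, when the cached coldest loop lies outside OUTERMOST this query falls back to OUTERMOST rather than searching for a second-coldest loop inside the allowed range; that is the trade-off of keeping the cache O(1) per query.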
[PATCH] rs6000: Fix incorrect fusion constraint [PR102991]
The clobber constraint should match operand's constraint. fusion.md was generated by genfusion.pl, but it is disabled now, update both places with correct clobber constraint. gcc/ChangeLog: * config/rs6000/fusion.md: Fix incorrect clobber constraint. * config/rs6000/genfusion.pl: Likewise. --- gcc/config/rs6000/fusion.md| 128 - gcc/config/rs6000/genfusion.pl | 2 +- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 516baa0bb0b..d11cecb11ee 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -1874,7 +1874,7 @@ (define_insn "*fuse_vand_vand" (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vand %3,%1,%0\;vand %3,%3,%2 @@ -1892,7 +1892,7 @@ (define_insn "*fuse_vandc_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vandc %3,%1,%0\;vand %3,%3,%2 @@ -1910,7 +1910,7 @@ (define_insn "*fuse_veqv_vand" (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ veqv %3,%1,%0\;vand %3,%3,%2 @@ -1928,7 +1928,7 @@ (define_insn "*fuse_vnand_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vnand %3,%1,%0\;vand %3,%3,%2 @@ -1946,7 +1946,7 @@ (define_insn "*fuse_vnor_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vnor %3,%1,%0\;vand %3,%3,%2 @@ -1964,7 +1964,7 @@ (define_insn "*fuse_vor_vand" (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vor %3,%1,%0\;vand %3,%3,%2 @@ -1982,7 +1982,7 @@ (define_insn "*fuse_vorc_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && 
TARGET_P10_FUSION_2LOGICAL)" "@ vorc %3,%1,%0\;vand %3,%3,%2 @@ -2000,7 +2000,7 @@ (define_insn "*fuse_vxor_vand" (and:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vxor %3,%1,%0\;vand %3,%3,%2 @@ -2018,7 +2018,7 @@ (define_insn "*fuse_vand_vandc" (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v" - (clobber (match_scratch
Re: [PATCH] rs6000: Fix incorrect fusion constraint [PR102991]
On 2021/11/3 23:13, David Edelsohn wrote: > Did you manually change fusion.md or did you regenerate it after > fixing genfusion.pl? > > If you regenerated it, the ChangeLog entry should be "Regenerated" and > the "Fix incorrect clobber constraint." should refer to the > genfusion.pl change. > > I want to ensure that genfusion.pl generates the correct constraint > the next time it is used. > Aaron mentioned he disabled the auto generation here[1], but before than that, Segher suggested to enable it in stage1. [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-February/564652.html [2] https://gcc.gnu.org/pipermail/gcc-patches/2021-January/564244.html Thus re-enable it with the followed v2 patch(Confirmed the fusion.md is exactly same with v1 patch.) [PATCH v2] rs6000: Fix incorrect fusion constraint [PR102991] gcc/ChangeLog: * config/rs6000/fusion.md: Regenerate. * config/rs6000/genfusion.pl: Fix incorrect clobber constraint. * config/rs6000/t-rs6000: Uncomment regeneration of fusion.md. --- gcc/config/rs6000/fusion.md| 128 - gcc/config/rs6000/genfusion.pl | 2 +- gcc/config/rs6000/t-rs6000 | 4 +- 3 files changed, 67 insertions(+), 67 deletions(-) diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 516baa0bb0b..d11cecb11ee 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -1874,7 +1874,7 @@ (define_insn "*fuse_vand_vand" (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vand %3,%1,%0\;vand %3,%3,%2 @@ -1892,7 +1892,7 @@ (define_insn "*fuse_vandc_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vandc %3,%1,%0\;vand %3,%3,%2 @@ -1910,7 +1910,7 @@ (define_insn "*fuse_veqv_vand" (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ veqv %3,%1,%0\;vand %3,%3,%2 @@ -1928,7 +1928,7 @@ (define_insn "*fuse_vnand_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vnand %3,%1,%0\;vand %3,%3,%2 @@ -1946,7 +1946,7 @@ (define_insn "*fuse_vnor_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vnor %3,%1,%0\;vand %3,%3,%2 @@ -1964,7 +1964,7 @@ (define_insn 
"*fuse_vor_vand" (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vor %3,%1,%0\;vand %3,%3,%2 @@ -1982,7 +1982,7 @@ (define_insn "*fuse_vorc_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vorc %3,%1,%0\;vand %3,%3,%2 @@ -2000,7 +2000,7 @@ (define_insn "*fuse_vxor_vand"
Re: [PATCH] rs6000: Fix incorrect fusion constraint [PR102991]
On 2021/11/4 09:59, David Edelsohn wrote: > On Wed, Nov 3, 2021 at 9:46 PM Xionghu Luo wrote: >> >> On 2021/11/3 23:13, David Edelsohn wrote: >>> Did you manually change fusion.md or did you regenerate it after >>> fixing genfusion.pl? >>> >>> If you regenerated it, the ChangeLog entry should be "Regenerated" and >>> the "Fix incorrect clobber constraint." should refer to the >>> genfusion.pl change. >>> >>> I want to ensure that genfusion.pl generates the correct constraint >>> the next time it is used. >>> >> >> Aaron mentioned he disabled the auto generation here[1], but before >> than that, Segher suggested to enable it in stage1. >> >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-February/564652.html >> [2] https://gcc.gnu.org/pipermail/gcc-patches/2021-January/564244.html >> >> >> Thus re-enable it with the followed v2 patch(Confirmed the fusion.md is >> exactly same with v1 patch.) >> >> >> [PATCH v2] rs6000: Fix incorrect fusion constraint [PR102991] >> >> >> gcc/ChangeLog: >> >> * config/rs6000/fusion.md: Regenerate. >> * config/rs6000/genfusion.pl: Fix incorrect clobber constraint. >> * config/rs6000/t-rs6000: Uncomment regeneration of fusion.md. > > I believe that there is some confusion about my request. I am not > requesting that the patch enable genfusion.pl . The Makefile fragment > rule to generate fusion.md is disabled for a reason and normally > should not be enabled. But fusion.md should be generated by > genfusion.pl when there is a change, and any changes should be made in > genfusion.pl. In other words, change genfusion.pl, temporarily enable > the Makefile fragment rule, generate fusion.md, disable genfusion.pl. > My request was an effort to ensure that genfusion.pl correctly > regenerates the new, corrected fusion.md file. I don't want a manual > change to fusion.md that differs from the automatically generated > file. Only the updated fusion.md and genfusion.pl should be checked > in. > > Has Aaron reviewed and confirmed the change to genfusion.pl? > Regenerate and update the ChangeLog description from v1: [PATCH] rs6000: Fix incorrect fusion constraint [PR102991] gcc/ChangeLog: * config/rs6000/fusion.md: Regenerate. * config/rs6000/genfusion.pl: Fix incorrect clobber constraint. 
--- gcc/config/rs6000/fusion.md| 128 - gcc/config/rs6000/genfusion.pl | 2 +- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 516baa0bb0b..d11cecb11ee 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -1874,7 +1874,7 @@ (define_insn "*fuse_vand_vand" (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vand %3,%1,%0\;vand %3,%3,%2 @@ -1892,7 +1892,7 @@ (define_insn "*fuse_vandc_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vandc %3,%1,%0\;vand %3,%3,%2 @@ -1910,7 +1910,7 @@ (define_insn "*fuse_veqv_vand" (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ veqv %3,%1,%0\;vand %3,%3,%2 @@ -1928,7 +1928,7 @@ (define_insn "*fuse_vnand_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v"))
Re: [PATCH] rs6000: Fix incorrect fusion constraint [PR102991]
On 2021/11/5 08:58, David Edelsohn wrote: > On Thu, Nov 4, 2021 at 8:50 PM Xionghu Luo wrote: > >> [PATCH] rs6000: Fix incorrect fusion constraint [PR102991] >> >> gcc/ChangeLog: >> >> * config/rs6000/fusion.md: Regenerate. >> * config/rs6000/genfusion.pl: Fix incorrect clobber constraint. > > Okay. > > Thanks, David > Committed as r12-4930; the gcc-11 branch also has fusion.md & genfusion.pl and needs a backport. -- Thanks, Xionghu
Re: [PATCH v3 1/4] Fix loop split incorrect count and probability
On 2021/10/27 15:44, Jan Hubicka wrote: >> On Wed, 27 Oct 2021, Jan Hubicka wrote: >> gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 25 - 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3f6ad046623..d30782888f3 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -575,7 +575,11 @@ split_loop (class loop *loop1) stmts2); tree cond = build2 (guard_code, boolean_type_node, guard_init, border); if (!initial_true) -cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); +cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); /* Now version the loop, placing loop2 after loop1 connecting them, and fix up SSA form for that. */ @@ -583,10 +587,10 @@ split_loop (class loop *loop1) basic_block cond_bb; class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); >>> >>> As discussed yesterday, for loop of form >>> >>> for (...) >>> if (cond) >>> cond = something(); >>> else >>> something2 >>> >>> Split as >> >> Note that you are missing to conditionalize loop1 execution >> on 'cond' (not sure if that makes a difference). > You are right - forgot to mention that. > > Entry conditional makes no difference on scaling stmts inside loop but > affects its header and expected trip count. We however need to set up > probability of this conditional (and preheader count if it exists). > There is no general way to read the probability of this initial > conditional from cfg profile. So I guess we are stuck with guessing > some arbitrary value. I guess the common case is that cond is true the first > iteration though and often we can easily see that from the PHI node > initializing the test variable. > > The other thing that changes is the expected number of iterations of the split > loops, so we may want to update the exit conditional probability > accordingly... > Sorry for the late reply. The updated patch below mainly solves the issues you pointed out: - profile count proportion for both the original loop and the copied loop without dropping down the true branch's count; - probability update in the two loops and between the two loops; - number of iterations update/check for split_loop. [PATCH v3] Fix loop split incorrect count and probability In tree-ssa-loop-split.c, split_loop and split_loop_on_cond do two kinds of split. split_loop only works for a single loop and inserts the edge at the exit when splitting, while split_loop_on_cond is not limited to a single loop and inserts the edge at the latch when splitting. Both kinds of split should consider loop count and probability updates. For split_loop, the split condition is moved in front of loop1 and loop2, but split_loop_on_cond moves the condition between loop1 and loop2. This patch does: 1) profile count proportion for both the original loop and the copied loop without dropping down the true branch's count; 2) probability update in and between the two loops; 3) number of iterations update for split_loop. Regression tests pass, OK for master? 
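For orientation, the dump diff that follows comes from a split_loop-style case roughly of the following shape (a reconstruction for illustration only; the actual test source is not part of this mail, and the "b:" branch is an assumption):

/* Illustrative source: j < c is true for the first iterations and then
   stays false, so split_loop can version the loop into one copy where
   the condition always holds and one where it never does.  */
extern int printf (const char *, ...);

void
foo (int beg, int end, int beg2, int c)
{
  for (int i = beg, j = beg2; i < end; i++, j++)
    {
      if (j < c)
        printf ("a: %d %d\n", i, j);
      else
        printf ("b: %d %d\n", i, j);
    }
}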
Changes diff for split_loop and split_loop_on_cond cases: 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit ... [local count: 118111600]: if (beg_5(D) < end_8(D)) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: if (beg2_6(D) < c_9(D)) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [100.00%] +goto ; [67.00%] - [local count: 105119324]: + [local count: 34689377]: _25 = beg_5(D) + 1; _26 = end_8(D) - beg_5(D); _27 = beg2_6(D) + _26; _28 = MIN_EXPR ; - [local count: 955630225]: + [local count: 315357973]: # i_16 = PHI # j_17 = PHI printf ("a: %d %d\n", i_16, j_17); i_11 = i_16 + 1; j_12 = j_17 + 1; if (j_12 < _28) -goto ; [89.00%] +goto ; [29.37%] else -goto ; [11.00%] +goto ; [70.63%] -
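Reading the counts in the fragment above (assuming the 33%/67% split shown is the probability estimated for the guarding condition j < c): the entry count of loop1 is scaled from 105119324 to 105119324 * 33% ~= 34689377, the loop body count from 955630225 to approximately 955630225 * 33% ~= 315357973, and the latch probability from 89% to 89% * 33% = 29.37%, which matches point 1) above: each copy of the loop now carries a proportional share of the original profile instead of the full count that the old always() probabilities produced.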