[PATCH] libgcov: Fix gcov overlap bugs of divide by 0
Fix the long lasting issue of `gcov-tool overlap xxx yyy`, divide to 0 caused the output shows a lot of nans, another problem is the counts in file are never acculated leads to incorrect results. Signed-off-by: Xionghu Luo libgcc/ChangeLog: * libgcov-util.c (compute_one_gcov): Avoid divide to 0. (accumulate_sum_counts): New. (calculate_overlap): Call accumulate_sum_countes. --- libgcc/libgcov-util.c | 58 --- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/libgcc/libgcov-util.c b/libgcc/libgcov-util.c index d547c103cab..26a02e66567 100644 --- a/libgcc/libgcov-util.c +++ b/libgcc/libgcov-util.c @@ -1072,6 +1072,8 @@ compute_one_gcov (const struct gcov_info *gcov_info1, for (f_ix = 0; f_ix < gcov_info1->n_functions; f_ix++) { + double func_1 = 0.0; + double func_2 = 0.0; double func_cum_1 = 0.0; double func_cum_2 = 0.0; double func_val = 0.0; @@ -1096,11 +1098,15 @@ compute_one_gcov (const struct gcov_info *gcov_info1, ci_ptr2->values[c_num], sum_1, sum_2); - func_cum_1 += ci_ptr1->values[c_num] / sum_1; - func_cum_2 += ci_ptr2->values[c_num] / sum_2; + if (sum_1) + func_1 = ci_ptr1->values[c_num] / sum_1; + func_cum_1 += func_1; + if (sum_2) + func_2 = ci_ptr2->values[c_num] / sum_2; + func_cum_2 += func_2; nonzero = 1; - if (ci_ptr1->values[c_num] / sum_1 >= overlap_hot_threshold - || ci_ptr2->values[c_num] / sum_2 >= overlap_hot_threshold) + if (func_1 >= overlap_hot_threshold + || func_2 >= overlap_hot_threshold) hot = 1; } } @@ -1322,6 +1328,47 @@ matched_gcov_info (const struct gcov_info *info1, const struct gcov_info *info2) return 1; } +static int +accumuate_sum_counts (const struct gcov_info *gcov_info1, + const struct gcov_info *gcov_info2) +{ + gcc_assert (gcov_info1 || gcov_info2); + unsigned f_ix; + + if (gcov_info1) +{ + gcov_type cum_1 = 0; + for (f_ix = 0; f_ix < gcov_info1->n_functions; f_ix++) + { + const struct gcov_fn_info *gfi_ptr = gcov_info1->functions[f_ix]; + if (!gfi_ptr || gfi_ptr->key != gcov_info1) + continue; + const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs; + unsigned c_num; + for (c_num = 0; c_num < ci_ptr->num; c_num++) + cum_1 += ci_ptr->values[c_num]; + } + overlap_sum_1 = cum_1; +} + + if (gcov_info2) +{ + gcov_type cum_2 = 0; + for (f_ix = 0; f_ix < gcov_info2->n_functions; f_ix++) + { + const struct gcov_fn_info *gfi_ptr = gcov_info2->functions[f_ix]; + if (!gfi_ptr || gfi_ptr->key != gcov_info2) + continue; + const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs; + unsigned c_num; + for (c_num = 0; c_num < ci_ptr->num; c_num++) + cum_2 += ci_ptr->values[c_num]; + } + overlap_sum_2 = cum_2; +} + return 0; +} + /* Compute the overlap score of two profiles with the head of GCOV_LIST1 and GCOV_LIST1. Return a number ranging from [0.0, 1.0], with 0.0 meaning no match and 1.0 meaning a perfect match. */ @@ -1410,6 +1457,9 @@ calculate_overlap (struct gcov_info *gcov_list1, if (overlap_func_level) printf("\n processing %36s:\n", filename); + overlap_sum_1 = overlap_sum_2 = 0.0; + accumuate_sum_counts (all_infos[i].obj1, all_infos[i].obj2); + val = compute_one_gcov (all_infos[i].obj1, all_infos[i].obj2, overlap_sum_1, overlap_sum_2, &cum_1, &cum_2); -- 2.39.3
Re: [PATCH] libgcov: Fix gcov overlap bugs of divide by 0
+cc maintainers. On 2023/10/26 11:25, Xionghu Luo wrote: Fix the long lasting issue of `gcov-tool overlap xxx yyy`, divide to 0 caused the output shows a lot of nans, another problem is the counts in file are never acculated leads to incorrect results. Signed-off-by: Xionghu Luo libgcc/ChangeLog: * libgcov-util.c (compute_one_gcov): Avoid divide to 0. (accumulate_sum_counts): New. (calculate_overlap): Call accumulate_sum_countes. --- libgcc/libgcov-util.c | 58 --- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/libgcc/libgcov-util.c b/libgcc/libgcov-util.c index d547c103cab..26a02e66567 100644 --- a/libgcc/libgcov-util.c +++ b/libgcc/libgcov-util.c @@ -1072,6 +1072,8 @@ compute_one_gcov (const struct gcov_info *gcov_info1, for (f_ix = 0; f_ix < gcov_info1->n_functions; f_ix++) { + double func_1 = 0.0; + double func_2 = 0.0; double func_cum_1 = 0.0; double func_cum_2 = 0.0; double func_val = 0.0; @@ -1096,11 +1098,15 @@ compute_one_gcov (const struct gcov_info *gcov_info1, ci_ptr2->values[c_num], sum_1, sum_2); - func_cum_1 += ci_ptr1->values[c_num] / sum_1; - func_cum_2 += ci_ptr2->values[c_num] / sum_2; + if (sum_1) + func_1 = ci_ptr1->values[c_num] / sum_1; + func_cum_1 += func_1; + if (sum_2) + func_2 = ci_ptr2->values[c_num] / sum_2; + func_cum_2 += func_2; nonzero = 1; - if (ci_ptr1->values[c_num] / sum_1 >= overlap_hot_threshold - || ci_ptr2->values[c_num] / sum_2 >= overlap_hot_threshold) + if (func_1 >= overlap_hot_threshold + || func_2 >= overlap_hot_threshold) hot = 1; } } @@ -1322,6 +1328,47 @@ matched_gcov_info (const struct gcov_info *info1, const struct gcov_info *info2) return 1; } +static int +accumuate_sum_counts (const struct gcov_info *gcov_info1, + const struct gcov_info *gcov_info2) +{ + gcc_assert (gcov_info1 || gcov_info2); + unsigned f_ix; + + if (gcov_info1) +{ + gcov_type cum_1 = 0; + for (f_ix = 0; f_ix < gcov_info1->n_functions; f_ix++) + { + const struct gcov_fn_info *gfi_ptr = gcov_info1->functions[f_ix]; + if (!gfi_ptr || gfi_ptr->key != gcov_info1) + continue; + const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs; + unsigned c_num; + for (c_num = 0; c_num < ci_ptr->num; c_num++) + cum_1 += ci_ptr->values[c_num]; + } + overlap_sum_1 = cum_1; +} + + if (gcov_info2) +{ + gcov_type cum_2 = 0; + for (f_ix = 0; f_ix < gcov_info2->n_functions; f_ix++) + { + const struct gcov_fn_info *gfi_ptr = gcov_info2->functions[f_ix]; + if (!gfi_ptr || gfi_ptr->key != gcov_info2) + continue; + const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs; + unsigned c_num; + for (c_num = 0; c_num < ci_ptr->num; c_num++) + cum_2 += ci_ptr->values[c_num]; + } + overlap_sum_2 = cum_2; +} + return 0; +} + /* Compute the overlap score of two profiles with the head of GCOV_LIST1 and GCOV_LIST1. Return a number ranging from [0.0, 1.0], with 0.0 meaning no match and 1.0 meaning a perfect match. */ @@ -1410,6 +1457,9 @@ calculate_overlap (struct gcov_info *gcov_list1, if (overlap_func_level) printf("\n processing %36s:\n", filename); + overlap_sum_1 = overlap_sum_2 = 0.0; + accumuate_sum_counts (all_infos[i].obj1, all_infos[i].obj2); + val = compute_one_gcov (all_infos[i].obj1, all_infos[i].obj2, overlap_sum_1, overlap_sum_2, &cum_1, &cum_2);
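To make the failure mode concrete, here is a minimal standalone C sketch of the guard the patch introduces; it is illustrative only and not gcov-tool code (the helper name safe_ratio is made up for the example).

#include <stdio.h>

/* Illustrative helper mirroring the "if (sum_1) func_1 = ... / sum_1;"
   guard from the patch: divide only when the accumulated sum is nonzero,
   otherwise keep the ratio at 0.0 instead of producing NaN.  */
static double
safe_ratio (long long value, double sum)
{
  double ratio = 0.0;
  if (sum != 0.0)
    ratio = value / sum;
  return ratio;
}

int
main (void)
{
  printf ("%f\n", safe_ratio (0, 0.0));   /* 0.000000 instead of nan (0.0/0.0) */
  printf ("%f\n", safe_ratio (5, 10.0));  /* 0.500000 */
  return 0;
}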
[RFC] Run store-merging pass once more before pass fre/pre
Store-merging pass should run twice, the reason is pass fre/pre will do some kind of optimizations to instructions by: 1. Converting the load from address to load from function arguments (store_merging_30.c:foo1). 2. Converting the byte access to BIT_FIELD_REF(store_merging_30.c:foo2). 3. Other bitfield combinations or potential interference optimizations etc. These optimizations will break the store chain, store-merging pass fails to catch such kind of pattern so stores are not merged in middle end, then consecutive stb/sth instructions(should be merged to stw) are emitted finally. And why not directly move store-merging pass(numbered 194) just before fre1(numbered 35) is for case store_merging_14.c, 5 merges are done by store_merging1, and 4 merges are done fore store_merge2. So, keep the original store_merge as store_merge2 as store merge may be still available after other pass optimizations. Most of the 30 store_merging_N.c test case dg-final pass name would be updated from store-merging to store-merging1 once this RFC patch idea got confirmed. Any comments? Thanks. PS: Before this patch, store_merging_30.c.035t.fre1: ... foo1: Inserted _13 = (short unsigned int) counters_new_5(D); Replaced tmp.D.2912.D.2911.D.2910.D.2909.inuse with _13 in all uses of _1 = tmp.D.2912.D.2911.D.2910.D.2909.inuse; Removing dead stmt _1 = tmp.D.2912.D.2911.D.2910.D.2909.inuse; ... foo2: Inserted _17 = BIT_FIELD_REF <_1, 8, 16>; Replaced tmp.D.2926.D.2925.D.2924.D.2923.objects with _17 in all uses of _3 = tmp.D.2926.D.2925.D.2924.D.2923.objects; Removing dead stmt _3 = tmp.D.2926.D.2925.D.2924.D.2923.objects; foo1 asm: rldicl 9,4,48,48 sth 4,0(3) sth 9,2(3) blr With this patch(similar for foo2): stw r4,0(r3) blr gcc/ChangeLog 2020-02-18 Xiong Hu Luo Part of PR middle-end/71509 gimple-ssa-store-merging.c (clone): New. passes.def (pass_store_merging): New. gcc/testsuite/ChangeLog 2020-02-18 Xiong Hu Luo Part of PR middle-end/71509 testsuite/gcc.dg/store_merging_14.c: Update. testsuite/gcc.dg/store_merging_30.c: New. --- gcc/gimple-ssa-store-merging.c | 2 + gcc/passes.def | 1 + gcc/testsuite/gcc.dg/store_merging_14.c | 3 +- gcc/testsuite/gcc.dg/store_merging_30.c | 86 + 4 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/store_merging_30.c diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c index 8371323ef4a..9a5bd49fc3a 100644 --- a/gcc/gimple-ssa-store-merging.c +++ b/gcc/gimple-ssa-store-merging.c @@ -2156,6 +2156,8 @@ public: { } + opt_pass * clone () { return new pass_store_merging (m_ctxt); } + /* Pass not supported for PDP-endian, nor for insane hosts or target character sizes where native_{encode,interpret}_expr doesn't work properly. */ diff --git a/gcc/passes.def b/gcc/passes.def index 2bf2cb78fc5..e531531cb14 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -85,6 +85,7 @@ along with GCC; see the file COPYING3. If not see /* pass_build_ealias is a dummy pass that ensures that we execute TODO_rebuild_alias at this point. 
*/ NEXT_PASS (pass_build_ealias); + NEXT_PASS (pass_store_merging); NEXT_PASS (pass_fre, true /* may_iterate */); NEXT_PASS (pass_early_vrp); NEXT_PASS (pass_merge_phi); diff --git a/gcc/testsuite/gcc.dg/store_merging_14.c b/gcc/testsuite/gcc.dg/store_merging_14.c index 9310aaf3489..bd120d18ac6 100644 --- a/gcc/testsuite/gcc.dg/store_merging_14.c +++ b/gcc/testsuite/gcc.dg/store_merging_14.c @@ -214,4 +214,5 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "Merging successful" 9 "store-merging" } } */ +/* { dg-final { scan-tree-dump-times "Merging successful" 5 "store-merging1" } } */ +/* { dg-final { scan-tree-dump-times "Merging successful" 4 "store-merging2" } } */ diff --git a/gcc/testsuite/gcc.dg/store_merging_30.c b/gcc/testsuite/gcc.dg/store_merging_30.c new file mode 100644 index 000..71369c3b196 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_30.c @@ -0,0 +1,86 @@ +/* { dg-do run } */ +/* { dg-require-effective-target store_merge } */ +/* { dg-options "-O2 -fdump-tree-store-merging" } */ + +typedef unsigned int atomic_t; + +struct page +{ + union + { +unsigned long counters; +struct +{ + union + { + struct + { + unsigned inuse : 16; + unsigned objects : 15; + unsigned frozen : 1; + }; + }; +}; + }; +}; + +struct page2 +{ + union + { +unsigned counters; +struct +{ + union + { + struct + { + unsigned inuse : 16; + unsigned objects : 8; + unsigned frozen : 8; + }; + }; +}; + }; +}; + +__attribute__((noipa)) void +foo1 (struct page *page, unsigned long counters_new) +{ +struct page tmp; +tmp.c
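As a minimal standalone illustration of the pattern the RFC is about (types and field names here are made up; this is not the testcase above), consider two adjacent halfword stores that should become a single word store:

struct pair
{
  unsigned short lo;
  unsigned short hi;
};

void
set_pair (struct pair *p, unsigned short a, unsigned short b)
{
  /* Two adjacent 2-byte stores; store merging can combine them into one
     4-byte store (sth + sth -> stw on Power).  The RFC's point is that
     fre/pre may rewrite such accesses (e.g. into BIT_FIELD_REFs) before
     the existing store-merging pass runs, hiding the opportunity.  */
  p->lo = a;
  p->hi = b;
}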
[PATCH] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux(Thanks to Kewen), OK for master? Or should we revert r12-4496 to restore to the UNSPEC implementation? gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb): Emit same native RTL for BE and LE. (altivec_vmrghh): Likewise. (altivec_vmrghw): Likewise. (*altivec_vmrghsf): Adjust. (altivec_vmrglb): Likewise. (altivec_vmrglh): Likewise. (altivec_vmrglw): Likewise. (*altivec_vmrglsf): Adjust. (altivec_vmrghb_direct): Emit different ASM for BE and LE. (altivec_vmrghh_direct): Likewise. (altivec_vmrghw_direct_): Likewise. (altivec_vmrglb_direct): Likewise. (altivec_vmrglh_direct): Likewise. (altivec_vmrglw_direct_): Likewise. (vec_widen_smult_hi_v16qi): Adjust. (vec_widen_smult_lo_v16qi): Adjust. (vec_widen_umult_hi_v16qi): Adjust. (vec_widen_umult_lo_v16qi): Adjust. (vec_widen_smult_hi_v8hi): Adjust. (vec_widen_smult_lo_v8hi): Adjust. (vec_widen_umult_hi_v8hi): Adjust. (vec_widen_umult_lo_v8hi): Adjust. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Emit same native RTL for BE and LE. * config/rs6000/vsx.md (vsx_xxmrghw_): Likewise. (vsx_xxmrglw_): Likewise. gcc/testsuite/ChangeLog: PR target/106069 * gcc.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 122 gcc/config/rs6000/rs6000.cc | 36 +++--- gcc/config/rs6000/vsx.md| 16 +-- gcc/testsuite/gcc.target/powerpc/pr106069.C | 118 +++ 4 files changed, 209 insertions(+), 83 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..8d9c0109559 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,11 +1144,7 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1167,7 +1163,12 @@ (define_insn "altivec_vmrghb_direct" (const_int 6) (const_int 22) (const_int 7) (const_int 23)])))] "TARGET_ALTIVEC" - "vmrghb %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrghb %0,%1,%2"; +else + return "vmrglb %0,%2,%1"; + } [(set_attr "type" "vecperm")]) (define_expand "altivec_vmrghh" @@ -1176,11 +1177,7 @@ (define_expand "altivec_vmrghh" (use (match_operand:V8HI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghh_direct - : gen_altivec_vmrglh_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrghh_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1195,7 +1192,12 @@ (define_insn "altivec_vmrghh_direct" (const_int 2) (const_int 10) (const_int 3) (const_int 11)])))] "TARGET_ALTIVEC" - "vmrghh %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrghh %0,%1,%2"; +else + return "vmrglh
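For reference, a small intrinsics-level example of the merge-high operation whose expansion the patch changes (plain AltiVec code compiled with -maltivec, not part of the patch):

#include <altivec.h>

vector unsigned int
merge_high (vector unsigned int a, vector unsigned int b)
{
  /* Selects {a[0], b[0], a[1], b[1]} in big-endian element order, i.e. the
     [0 4 1 5] vec_select/vec_concat permute described above; the patch makes
     both BE and LE expand to that same RTL and only chooses between vmrghw
     and vmrglw when the assembly is emitted.  */
  return vec_mergeh (a, b);
}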
Re: [PATCH v2] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
On 2022/8/9 11:01, Kewen.Lin wrote: Hi Xionghu, Thanks for the fix. on 2022/8/8 11:42, Xionghu Luo wrote: The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Sorry, no -m32 for LE testing. I noticed the attachement in that PR didn't include the test case (though the changelog has it), so I re-tested it again, nothing changed. :) Linux(Thanks to Kewen), OK for master? Or should we revert r12-4496 to restore to the UNSPEC implementation? I have some concern on those changed "altivec_*_direct", IMHO the suffix "_direct" is normally to indicate the define_insn is mapped to the corresponding hw insn directly. With this change, for example, altivec_vmrghb_direct can be mapped into vmrghb or vmrglb, this looks misleading. Maybe we can add the corresponding _direct_le and _direct_be versions, both are mapped into the same insn but have different RTL patterns. Looking forward to Segher's and David's suggestions. Thanks! Do you mean same RTL patterns with different hw insn? Updated as: v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux(Thanks to Kewen), OK for master? Or should we revert r12-4496 to restore to the UNSPEC implementation? gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb): Emit same native RTL for BE and LE. (altivec_vmrghh): Likewise. (altivec_vmrghw): Likewise. (*altivec_vmrghsf): Adjust. (altivec_vmrglb): Likewise. (altivec_vmrglh): Likewise. (altivec_vmrglw): Likewise. (*altivec_vmrglsf): Adjust. (altivec_vmrghb_direct): Emit different ASM for BE and LE. (altivec_vmrghh_direct): Likewise. (altivec_vmrghw_direct_): Likewise. 
(altivec_vmrglb_direct): Likewise. (altivec_vmrglh_direct): Likewise. (altivec_vmrglw_direct_): Likewise. (vec_widen_smult_hi_v16qi): Adjust. (vec_widen_smult_lo_v16qi): Adjust. (vec_widen_umult_hi_v16qi): Adjust. (vec_widen_umult_lo_v16qi): Adjust. (vec_widen_smult_hi_v8hi): Adjust. (vec_widen_smult_lo_v8hi): Adjust. (vec_widen_umult_hi_v8hi): Adjust. (vec_widen_umult_lo_v8hi): Adjust. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Emit same native RTL for BE and LE. * config/rs6000/vsx.md (vsx_xxmrghw_): Likewise. (vsx_xxmrglw_): Likewise. gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 223 ++-- gcc/config/rs6000/rs6000.cc | 36 ++-- gcc/config/rs6000/vsx.md| 26 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 120 +++ 4 files changed, 303 insertions(+), 102 deletions(-) create mode 100644 gc
Re: [PATCH v2] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
On 2022/8/11 01:07, Segher Boessenkool wrote: On Wed, Aug 10, 2022 at 02:39:02PM +0800, Xionghu Luo wrote: On 2022/8/9 11:01, Kewen.Lin wrote: I have some concern on those changed "altivec_*_direct", IMHO the suffix "_direct" is normally to indicate the define_insn is mapped to the corresponding hw insn directly. With this change, for example, altivec_vmrghb_direct can be mapped into vmrghb or vmrglb, this looks misleading. Maybe we can add the corresponding _direct_le and _direct_be versions, both are mapped into the same insn but have different RTL patterns. Looking forward to Segher's and David's suggestions. Thanks! Do you mean same RTL patterns with different hw insn? A pattern called altivec_vmrghb_direct_le should always emit a vmrghb instruction, never a vmrglb instead. Misleading names are an expensive problem. Thanks. Then on LE platforms, if user calls altivec_vmrghw,it will be expanded to RTL (vec_select (vec_concat (R0 R1 (0 4 1 5))), and finally matched to altivec_vmrglw_direct_v4si_le with ASM "vmrglw". For BE just strict forward, seems more clear :-), OK for master? [PATCH v3] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069] v3: rename altivec_vmrghb_direct_le to altivec_vmrglb_direct_le to match the actual output ASM vmrglb. Likewise for all similar xxx_direct_le patterns. v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{64} Linux(Thanks to Kewen). gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb_direct): Remove. (altivec_vmrghb_direct_be): New pattern for BE. (altivec_vmrglb_direct_le): New pattern for LE. (altivec_vmrghh_direct): Remove. (altivec_vmrghh_direct_be): New pattern for BE. (altivec_vmrglh_direct_le): New pattern for LE. (altivec_vmrghw_direct_): Remove. (altivec_vmrghw_direct__be): New pattern for BE. (altivec_vmrglw_direct__le): New pattern for LE. (altivec_vmrglb_direct): Remove. (altivec_vmrglb_direct_be): New pattern for BE. (altivec_vmrghb_direct_le): New pattern for LE. (altivec_vmrglh_direct): Remove. (altivec_vmrglh_direct_be): New pattern for BE. (altivec_vmrghh_direct_le): New pattern for LE. (altivec_vmrglw_direct_): Remove. (altivec_vmrglw_direct__be): New pattern for BE. (altivec_vmrghw_direct__le): New pattern for LE. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Adjust. * config/rs6000/vsx.md: Likewise. gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. 
Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 223 ++-- gcc/config/rs6000/rs6000.cc | 36 ++-- gcc/config/rs6000/vsx.md| 24 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 120 +++ 4 files changed, 305 insertions(+), 98 deletions(-) create mode 100644 gcc/testsuite/g++.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..78245f470e9 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,15 +1144,17 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + rtvec v = gen_rtvec (16, GEN_INT (0), GEN_INT (16), GEN_INT (1), GEN_INT (17), + GEN_INT (2), GEN_INT (18), GEN_INT (3), GEN_INT (19),
Ping: [PATCH v4] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
Hi Segher, Ping this for stage 4... On 2023/2/10 10:59, Xionghu Luo via Gcc-patches wrote: Resend this patch... v4: Update per comments. v3: rename altivec_vmrghb_direct_le to altivec_vmrglb_direct_le to match the actual output ASM vmrglb. Likewise for all similar xxx_direct_le patterns. v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux. gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb_direct): Remove. (altivec_vmrghb_direct_be): New pattern for BE. (altivec_vmrghb_direct_le): New pattern for LE. (altivec_vmrghh_direct): Remove. (altivec_vmrghh_direct_be): New pattern for BE. (altivec_vmrghh_direct_le): New pattern for LE. (altivec_vmrghw_direct_): Remove. (altivec_vmrghw_direct__be): New pattern for BE. (altivec_vmrghw_direct__le): New pattern for LE. (altivec_vmrglb_direct): Remove. (altivec_vmrglb_direct_be): New pattern for BE. (altivec_vmrglb_direct_le): New pattern for LE. (altivec_vmrglh_direct): Remove. (altivec_vmrglh_direct_be): New pattern for BE. (altivec_vmrglh_direct_le): New pattern for LE. (altivec_vmrglw_direct_): Remove. (altivec_vmrglw_direct__be): New pattern for BE. (altivec_vmrglw_direct__le): New pattern for LE. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Adjust. * config/rs6000/vsx.md: Likewise. gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 222 ++-- gcc/config/rs6000/rs6000.cc | 24 +-- gcc/config/rs6000/vsx.md| 28 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 118 +++ 4 files changed, 307 insertions(+), 85 deletions(-) create mode 100644 gcc/testsuite/g++.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 30606b8ab21..4bfeecec224 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,15 +1144,16 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + if (BYTES_BIG_ENDIAN) +emit_insn ( + gen_altivec_vmrghb_direct_be (operands[0], operands[1], operands[2])); + else +emit_insn ( + gen_altivec_vmrglb_direct_le (operands[0], operands[2], operands[1])); DONE; }) -(define_insn "altivec_vmrghb_direct" +(define_insn "altivec_vmrghb_direct_be" [(set (match_operand:V16QI 0 "register_operand" "=v") (vec_select:V16QI (vec_concat:V32QI @@ -1166,7 +1167,25 @@ (define_insn "altivec_vmrghb_direct" (const_int 5) (const_int 21) (const_int 6) (const_int 22) (const_int 7) (const_int 23)])))] - "TARGET_ALTIVEC" + "TARGET_ALTIVEC && BYTES_BIG_ENDIAN" + "vmrghb %0,%1,%2" + [(set_attr "type" "vecperm")]) + +(define_insn "altivec_vmrghb_direct_le" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 2 "register_operand" "v") + (match_operand:V16QI 1 "register_operand" "v")) + (parallel [(const_int 8) (const_int 24) +(const
[PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
When spliting edge with self loop, the split edge should be placed just next to the edge_in->src, otherwise it may generate different position latch bbs for two consecutive self loops. For details, please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93680#c4 Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/93680 * tree-cfg.cc (split_edge_bb_loc): Return edge_in->src for self loop. gcc/testsuite/ChangeLog: PR gcov/93680 * gcc.misc-tests/gcov-pr93680.c: New test. Signed-off-by: Xionghu Luo --- gcc/testsuite/gcc.misc-tests/gcov-pr93680.c | 24 + gcc/tree-cfg.cc | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr93680.c diff --git a/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c b/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c new file mode 100644 index 000..b2bf9e626fc --- /dev/null +++ b/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c @@ -0,0 +1,24 @@ +/* { dg-options "-fprofile-arcs -ftest-coverage" } */ +/* { dg-do run { target native } } */ + +int f(int s, int n) +{ + int p = 0; + + switch (s) + { +case 0: /* count(5) */ + do { p++; } while (--n); /* count(5) */ + return p; /* count(1) */ + +case 1: /* count(5) */ + do { p++; } while (--n); /* count(5) */ + return p; /* count(1) */ + } + + return 0; +} + +int main() { f(0, 5); f(1, 5); return 0; } + +/* { dg-final { run-gcov gcov-pr93680.c } } */ diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index a9fcc7fd050..6fa1d83d366 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -3009,7 +3009,7 @@ split_edge_bb_loc (edge edge_in) if (dest_prev) { edge e = find_edge (dest_prev, dest); - if (e && !(e->flags & EDGE_COMPLEX)) + if ((e && !(e->flags & EDGE_COMPLEX)) || edge_in->src == edge_in->dest) return edge_in->src; } return dest_prev; -- 2.27.0
[PATCH 2/2] gcov: Fix incorrect gimple line LOCATION [PR97923]
For case like belowi test.c: 1:int foo(char c) 2:{ 3: return ((c >= 'A' && c <= 'Z') 4: || (c >= 'a' && c <= 'z') 5: || (c >= '0' && c <='0'));} the generated line number is incorrect for condition c>='A' of block 2: Thus correct the condition op0 location. gcno diff before and with this patch: test.gcno: 575: block 11: 1:0001(tree) test.gcno: 583:0145: 35:LINES -test.gcno: 595: block 2:`test.c':1, 5 +test.gcno: 595: block 2:`test.c':1, 3 test.gcno: 626:0145: 31:LINES test.gcno: 638: block 3:`test.c':3 test.gcno: 665:0145: 31:LINES test.gcno: 677: block 4:`test.c':4 test.gcno: 704:0145: 31:LINES test.gcno: 716: block 5:`test.c':4 test.gcno: 743:0145: 31:LINES test.gcno: 755: block 6:`test.c':5 Also save line id in line vector for gcov debug use. Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/97923 * gcov.cc (line_info::line_info): Init id. (solve_flow_graph): Fix typo. (add_line_counts): Set line->id. * gimplify.cc (shortcut_cond_r): Correct cond expr op0 location. gcc/testsuite/ChangeLog: PR gcov/97923 * gcc.misc-tests/gcov-pr97923.c: New test. Signed-off-by: Xionghu Luo --- gcc/gcov.cc | 9 ++--- gcc/gimplify.cc | 6 -- gcc/testsuite/gcc.misc-tests/gcov-pr97923.c | 13 + 3 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr97923.c diff --git a/gcc/gcov.cc b/gcc/gcov.cc index 2ec7248cc0e..77ca94c71c4 100644 --- a/gcc/gcov.cc +++ b/gcc/gcov.cc @@ -205,6 +205,8 @@ public: /* Execution count. */ gcov_type count; + unsigned id; + /* Branches from blocks that end on this line. */ vector branches; @@ -216,8 +218,8 @@ public: unsigned has_unexecuted_block : 1; }; -line_info::line_info (): count (0), branches (), blocks (), exists (false), - unexceptional (0), has_unexecuted_block (0) +line_info::line_info (): count (0), id (0), branches (), blocks (), + exists (false), unexceptional (0), has_unexecuted_block (0) { } @@ -2370,7 +2372,7 @@ solve_flow_graph (function_info *fn) /* If the graph has been correctly solved, every block will have a valid count. */ - for (unsigned i = 0; ix < fn->blocks.size (); i++) + for (unsigned i = 0; i < fn->blocks.size (); i++) if (!fn->blocks[i].count_valid) { fnotice (stderr, "%s:graph is unsolvable for '%s'\n", @@ -2730,6 +2732,7 @@ add_line_counts (coverage_info *coverage, function_info *fn) } line->count += block->count; } + line->id = ln; } has_any_line = true; diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index ade6e335da7..341a27b033e 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -3915,7 +3915,8 @@ shortcut_cond_r (tree pred, tree *true_label_p, tree *false_label_p, false_label_p = &local_label; /* Keep the original source location on the first 'if'. */ - t = shortcut_cond_r (TREE_OPERAND (pred, 0), NULL, false_label_p, locus); + tree op0 = TREE_OPERAND (pred, 0); + t = shortcut_cond_r (op0, NULL, false_label_p, EXPR_LOCATION (op0)); append_to_statement_list (t, &expr); /* Set the source location of the && on the second 'if'. */ @@ -3938,7 +3939,8 @@ shortcut_cond_r (tree pred, tree *true_label_p, tree *false_label_p, true_label_p = &local_label; /* Keep the original source location on the first 'if'. */ - t = shortcut_cond_r (TREE_OPERAND (pred, 0), true_label_p, NULL, locus); + tree op0 = TREE_OPERAND (pred, 0); + t = shortcut_cond_r (op0, true_label_p, NULL, EXPR_LOCATION (op0)); append_to_statement_list (t, &expr); /* Set the source location of the || on the second 'if'. 
*/ diff --git a/gcc/testsuite/gcc.misc-tests/gcov-pr97923.c b/gcc/testsuite/gcc.misc-tests/gcov-pr97923.c new file mode 100644 index 000..ad4f7d40817 --- /dev/null +++ b/gcc/testsuite/gcc.misc-tests/gcov-pr97923.c @@ -0,0 +1,13 @@ +/* { dg-options "-fprofile-arcs -ftest-coverage" } */ +/* { dg-do run { target native } } */ + +int foo(int c) +{ + return ((c >= 'A' && c <= 'Z') /* count(1*) */ + || (c >= 'a' && c <= 'z') /* count(1*) */ + || (c >= '0' && c <= '0')); /* count(1*) */ +} + +int main() { foo(0); } + +/* { dg-final { run-gcov gcov-pr97923-1.c } } */ -- 2.27.0
Re: [PATCH 2/2] gcov: Fix incorrect gimple line LOCATION [PR97923]
On 2023/3/2 16:16, Richard Biener wrote: On Thu, Mar 2, 2023 at 3:31 AM Xionghu Luo via Gcc-patches wrote: For case like belowi test.c: 1:int foo(char c) 2:{ 3: return ((c >= 'A' && c <= 'Z') 4: || (c >= 'a' && c <= 'z') 5: || (c >= '0' && c <='0'));} the generated line number is incorrect for condition c>='A' of block 2: Thus correct the condition op0 location. gcno diff before and with this patch: test.gcno: 575: block 11: 1:0001(tree) test.gcno: 583:0145: 35:LINES -test.gcno: 595: block 2:`test.c':1, 5 +test.gcno: 595: block 2:`test.c':1, 3 test.gcno: 626:0145: 31:LINES test.gcno: 638: block 3:`test.c':3 test.gcno: 665:0145: 31:LINES test.gcno: 677: block 4:`test.c':4 test.gcno: 704:0145: 31:LINES test.gcno: 716: block 5:`test.c':4 test.gcno: 743:0145: 31:LINES test.gcno: 755: block 6:`test.c':5 Also save line id in line vector for gcov debug use. Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/97923 * gcov.cc (line_info::line_info): Init id. (solve_flow_graph): Fix typo. (add_line_counts): Set line->id. * gimplify.cc (shortcut_cond_r): Correct cond expr op0 location. gcc/testsuite/ChangeLog: PR gcov/97923 * gcc.misc-tests/gcov-pr97923.c: New test. Signed-off-by: Xionghu Luo --- gcc/gcov.cc | 9 ++--- gcc/gimplify.cc | 6 -- gcc/testsuite/gcc.misc-tests/gcov-pr97923.c | 13 + 3 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr97923.c diff --git a/gcc/gcov.cc b/gcc/gcov.cc index 2ec7248cc0e..77ca94c71c4 100644 --- a/gcc/gcov.cc +++ b/gcc/gcov.cc @@ -205,6 +205,8 @@ public: /* Execution count. */ gcov_type count; + unsigned id; + /* Branches from blocks that end on this line. */ vector branches; @@ -216,8 +218,8 @@ public: unsigned has_unexecuted_block : 1; }; -line_info::line_info (): count (0), branches (), blocks (), exists (false), - unexceptional (0), has_unexecuted_block (0) +line_info::line_info (): count (0), id (0), branches (), blocks (), + exists (false), unexceptional (0), has_unexecuted_block (0) { } @@ -2370,7 +2372,7 @@ solve_flow_graph (function_info *fn) /* If the graph has been correctly solved, every block will have a valid count. */ - for (unsigned i = 0; ix < fn->blocks.size (); i++) + for (unsigned i = 0; i < fn->blocks.size (); i++) if (!fn->blocks[i].count_valid) { fnotice (stderr, "%s:graph is unsolvable for '%s'\n", @@ -2730,6 +2732,7 @@ add_line_counts (coverage_info *coverage, function_info *fn) } line->count += block->count; } + line->id = ln; } has_any_line = true; diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index ade6e335da7..341a27b033e 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -3915,7 +3915,8 @@ shortcut_cond_r (tree pred, tree *true_label_p, tree *false_label_p, false_label_p = &local_label; /* Keep the original source location on the first 'if'. */ - t = shortcut_cond_r (TREE_OPERAND (pred, 0), NULL, false_label_p, locus); + tree op0 = TREE_OPERAND (pred, 0); + t = shortcut_cond_r (op0, NULL, false_label_p, EXPR_LOCATION (op0)); append_to_statement_list (t, &expr); The comment now no longer is true? For the else arm we use rexpr_location, why not here as well? To quote the following lines: /* Set the source location of the && on the second 'if'. */ new_locus = rexpr_location (pred, locus); t = shortcut_cond_r (TREE_OPERAND (pred, 1), true_label_p, false_label_p, new_locus); append_to_statement_list (t, &expr); Thanks, should use rexpr_location with each operand like below. with your change the location of the outer COND_EXPR is lost? 
Can we guarantee that it's used for the first operand of a if (a && b && c)? It would be nice to expand the leading comment for such a three operand case and explain how it's supposed to work. I tested the three operand case, it will iteratively call shortcut_cond_r and also works as expected. Seems the outer COND_EXPR is useless if we do the followed conversion? if (TREE_CODE (pred) == TRUTH_ANDIF_EXPR) { location_t new_locus; /* Turn if (a && b) into if (a); else goto no; if
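For the three-operand case, the lowering the quoted comment describes can be written out by hand; the following is an illustrative C equivalent (not gimplify output), showing that each generated 'if' naturally carries the location of its own operand:

int
lowered (int a, int b, int c)
{
  int p = 0;
  /* Hand-expanded form of "if (a && b && c) p = 1;".  */
  if (a) ; else goto no;          /* location of op0 ('a'), what the patch fixes */
  if (b) ; else goto no;          /* location of the first '&&' / 'b' */
  if (c) goto yes; else goto no;  /* location of the second '&&' / 'c' */
 yes:
  p = 1;
 no:
  return p;
}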
Re: [PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/2 16:41, Richard Biener wrote: On Thu, Mar 2, 2023 at 3:31 AM Xionghu Luo via Gcc-patches wrote: When spliting edge with self loop, the split edge should be placed just next to the edge_in->src, otherwise it may generate different position latch bbs for two consecutive self loops. For details, please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93680#c4 Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/93680 * tree-cfg.cc (split_edge_bb_loc): Return edge_in->src for self loop. gcc/testsuite/ChangeLog: PR gcov/93680 * gcc.misc-tests/gcov-pr93680.c: New test. Signed-off-by: Xionghu Luo --- gcc/testsuite/gcc.misc-tests/gcov-pr93680.c | 24 + gcc/tree-cfg.cc | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr93680.c diff --git a/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c b/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c new file mode 100644 index 000..b2bf9e626fc --- /dev/null +++ b/gcc/testsuite/gcc.misc-tests/gcov-pr93680.c @@ -0,0 +1,24 @@ +/* { dg-options "-fprofile-arcs -ftest-coverage" } */ +/* { dg-do run { target native } } */ + +int f(int s, int n) +{ + int p = 0; + + switch (s) + { +case 0: /* count(5) */ + do { p++; } while (--n); /* count(5) */ + return p; /* count(1) */ + +case 1: /* count(5) */ + do { p++; } while (--n); /* count(5) */ + return p; /* count(1) */ + } + + return 0; +} + +int main() { f(0, 5); f(1, 5); return 0; } + +/* { dg-final { run-gcov gcov-pr93680.c } } */ diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index a9fcc7fd050..6fa1d83d366 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -3009,7 +3009,7 @@ split_edge_bb_loc (edge edge_in) if (dest_prev) { edge e = find_edge (dest_prev, dest); - if (e && !(e->flags & EDGE_COMPLEX)) + if ((e && !(e->flags & EDGE_COMPLEX)) || edge_in->src == edge_in->dest) I think this should eventually apply to all backedge edge_in, correct? But of course we cannot easily test for this here. Still since this affects ordering in the {next,prev}_bb chain only but not CFG semantics I wonder how it can affect coverage? Isn't it only by chance that this block order survives? For case: 1 int f(int s, int n) 2 { 3 int p = 0; 4 int q = 0; 5 6 switch (s) 7{ 8case 0: 9 do { p++; } while (--n); 10 return p; 11 12case 1: 13 do { p++; } while (--n); 14 return p; 15} 16 17 return 0; 18 } 19 20 int main() { f(0, 5); f(1, 5);} current GCC generates: : ... :<= first loop ... goto ; [INV] else goto ; [INV] : <= first latch bb goto ; [100.00%] : ... goto ; [INV] : <= second latch bb :<= second loop ... goto ; [INV] else goto ; [INV] and are created by split_edge->split_edge_bb_loc, is located after the loop, but is located before the loop. First call of split_edge_bb_loc, the dest_prev is , and find_edge did find a edge from to , the returned afte_bb is , so latch is put after the loop but second call of split_edge_bb_loc, the dest_prev is , so find_edge return 0, and the returned after_bb is , then the created latch is put before the loop... Different latch bb position caused different gcno, while gcov has poor information and not that smart to recognize it:(, is it reasonable to keep this kind of loops same order? 
small.gcno: 648: block 2:`small.c':1, 3, 4, 6 small.gcno: 688:0145: 36:LINES small.gcno: 700: block 3:`small.c':8, 9 small.gcno: 732:0145: 32:LINES small.gcno: 744: block 5:`small.c':10 -small.gcno: 772:0145: 32:LINES -small.gcno: 784: block 6:`small.c':12 -small.gcno: 812:0145: 36:LINES -small.gcno: 824: block 7:`small.c':12, 13 +small.gcno: 772:0145: 36:LINES +small.gcno: 784: block 6:`small.c':12, 13 +small.gcno: 816:0145: 32:LINES +small.gcno: 828: block 8:`small.c':14 small.gcno: 856:0145: 32:LINES -small.gcno: 868: block 8:`small.c':14 -small.gcno: 896:0145: 32:LINES -small.gcno: 908: block 9:`small.c':17 +small.gcno: 868: block 9:`small.c':17 For the case when both edge_in->src has more than one successor and edge_in->dest has more than one predecessor there isn't any good heuristic to make printing the blocks in chain order "nice" (well, the backedge one maybe). But as said - this order shouldn't have any effect on semantics ... return edge_in->src; } return dest_prev; -- 2.27.0
Re: [PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/2 18:45, Richard Biener wrote: small.gcno: 648: block 2:`small.c':1, 3, 4, 6 small.gcno: 688:0145: 36:LINES small.gcno: 700: block 3:`small.c':8, 9 small.gcno: 732:0145: 32:LINES small.gcno: 744: block 5:`small.c':10 -small.gcno: 772:0145: 32:LINES -small.gcno: 784: block 6:`small.c':12 -small.gcno: 812:0145: 36:LINES -small.gcno: 824: block 7:`small.c':12, 13 +small.gcno: 772:0145: 36:LINES +small.gcno: 784: block 6:`small.c':12, 13 +small.gcno: 816:0145: 32:LINES +small.gcno: 828: block 8:`small.c':14 small.gcno: 856:0145: 32:LINES -small.gcno: 868: block 8:`small.c':14 -small.gcno: 896:0145: 32:LINES -small.gcno: 908: block 9:`small.c':17 +small.gcno: 868: block 9:`small.c':17 Looking at the CFG and the instrumentation shows : PROF_edge_counter_17 = __gcov0.f[0]; PROF_edge_counter_18 = PROF_edge_counter_17 + 1; __gcov0.f[0] = PROF_edge_counter_18; [t.c:3:7] p_6 = 0; [t.c:5:3] switch (s_7(D)) [INV], [t.c:7:5] case 0: [INV], [t.c:11:5] case 1: [INV]> : # n_1 = PHI # p_3 = PHI <[t.c:3:7] p_6(2), [t.c:8:15] p_12(4)> [t.c:7:5] : [t.c:8:15] p_12 = p_3 + 1; [t.c:8:28] n_13 = n_1 + -1; [t.c:8:28] if (n_13 != 0) goto ; [INV] else goto ; [INV] : PROF_edge_counter_21 = __gcov0.f[2]; PROF_edge_counter_22 = PROF_edge_counter_21 + 1; __gcov0.f[2] = PROF_edge_counter_22; [t.c:7:5] goto ; [100.00%] : PROF_edge_counter_23 = __gcov0.f[3]; PROF_edge_counter_24 = PROF_edge_counter_23 + 1; __gcov0.f[3] = PROF_edge_counter_24; [t.c:9:16] _14 = p_12; [t.c:9:16] goto ; [INV] so the reason this goes wrong is that gcov associates the "wrong" counter with the block containing the 'case' label(s), for the case 0 it should have chosen the counter from bb5 but it likely computed the count of bb3? It might be that ordering blocks differently puts the instrumentation to different blocks or it makes gcovs association chose different blocks but that means it's just luck and not fixing the actual issue? To me it looks like the correct thing to investigate is switch statement and/or case label handling. One can also see that having line number 7 is wrong to the extent that the position of the label doesn't match the number of times it executes in the source. So placement of the label is wrong here, possibly caused by CFG cleanup after CFG build (but generally labels are not used for anything once the CFG is built and coverage instrumentation is late so it might fail due to us moving labels). It might be OK to avoid moving labels for --coverage but then coverage should possibly look at edges rather than labels? 
Thanks, I investigated the Labels, it seems wrong at the beginning from .gimple to .cfg very early quite like PR90574: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90574 .gimple: int f (int s, int n) [small.c:2:1] { int D.2755; int p; [small.c:3:7] p = 0; [small.c:5:3] switch (s) , [small.c:7:5] case 0: , [small.c:11:5] case 1: > [small.c:7:5] : <= case label :<= loop label [small.c:8:13] p = p + 1; [small.c:8:26] n = n + -1; [small.c:8:26] if (n != 0) goto ; else goto ; : [small.c:9:14] D.2755 = p; [small.c:9:14] return D.2755; [small.c:11:5] : : [small.c:12:13] p = p + 1; [small.c:12:26] n = n + -1; [small.c:12:26] if (n != 0) goto ; else goto ; : [small.c:13:14] D.2755 = p; [small.c:13:14] return D.2755; : [small.c:16:10] D.2755 = 0; [small.c:16:10] return D.2755; } .cfg: int f (int s, int n) { int p; int D.2755; : [small.c:3:7] p = 0; [small.c:5:3] switch (s) [INV], [small.c:7:5] case 0: [INV], [small.c:11:5] case 1: [INV]> : [small.c:7:5] : <= case 0 [small.c:8:13 discrim 1] p = p + 1; [small.c:8:26 discrim 1] n = n + -1; [small.c:8:26 discrim 1] if (n != 0) goto ; [INV] else goto ; [INV] : [small.c:9:14] D.2755 = p; [small.c:9:14] goto ; [INV] : [small.c:11:5] : <= case 1 [small.c:12:13 discrim 1] p = p + 1; [small.c:12:26 discrim 1] n = n + -1; [small.c:12:26 discrim 1] if (n != 0) goto ; [INV] else goto ; [INV] The labels are merged into the loop unexpected, so I tried below fix for --coverage if two labels are not on same line to start new basic block: index 10ca86714f4..b788198ac31 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -2860,6 +2860,13 @@ stmt_starts_bb_p (gimple *stmt, gimple *prev_stmt) || !DECL_ARTIFICIAL (gimple_label_label (plabel))) return true; + location_t loc_prev = gimple_location (plabel); + location_t locus = gimple_location (label_stmt); + expanded_location locus_e = expand_location
Re: [PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/6 16:11, Richard Biener wrote: On Mon, Mar 6, 2023 at 8:22 AM Xionghu Luo wrote: On 2023/3/2 18:45, Richard Biener wrote: small.gcno: 648: block 2:`small.c':1, 3, 4, 6 small.gcno: 688:0145: 36:LINES small.gcno: 700: block 3:`small.c':8, 9 small.gcno: 732:0145: 32:LINES small.gcno: 744: block 5:`small.c':10 -small.gcno: 772:0145: 32:LINES -small.gcno: 784: block 6:`small.c':12 -small.gcno: 812:0145: 36:LINES -small.gcno: 824: block 7:`small.c':12, 13 +small.gcno: 772:0145: 36:LINES +small.gcno: 784: block 6:`small.c':12, 13 +small.gcno: 816:0145: 32:LINES +small.gcno: 828: block 8:`small.c':14 small.gcno: 856:0145: 32:LINES -small.gcno: 868: block 8:`small.c':14 -small.gcno: 896:0145: 32:LINES -small.gcno: 908: block 9:`small.c':17 +small.gcno: 868: block 9:`small.c':17 Looking at the CFG and the instrumentation shows : PROF_edge_counter_17 = __gcov0.f[0]; PROF_edge_counter_18 = PROF_edge_counter_17 + 1; __gcov0.f[0] = PROF_edge_counter_18; [t.c:3:7] p_6 = 0; [t.c:5:3] switch (s_7(D)) [INV], [t.c:7:5] case 0: [INV], [t.c:11:5] case 1: [INV]> : # n_1 = PHI # p_3 = PHI <[t.c:3:7] p_6(2), [t.c:8:15] p_12(4)> [t.c:7:5] : [t.c:8:15] p_12 = p_3 + 1; [t.c:8:28] n_13 = n_1 + -1; [t.c:8:28] if (n_13 != 0) goto ; [INV] else goto ; [INV] : PROF_edge_counter_21 = __gcov0.f[2]; PROF_edge_counter_22 = PROF_edge_counter_21 + 1; __gcov0.f[2] = PROF_edge_counter_22; [t.c:7:5] goto ; [100.00%] : PROF_edge_counter_23 = __gcov0.f[3]; PROF_edge_counter_24 = PROF_edge_counter_23 + 1; __gcov0.f[3] = PROF_edge_counter_24; [t.c:9:16] _14 = p_12; [t.c:9:16] goto ; [INV] so the reason this goes wrong is that gcov associates the "wrong" counter with the block containing the 'case' label(s), for the case 0 it should have chosen the counter from bb5 but it likely computed the count of bb3? It might be that ordering blocks differently puts the instrumentation to different blocks or it makes gcovs association chose different blocks but that means it's just luck and not fixing the actual issue? To me it looks like the correct thing to investigate is switch statement and/or case label handling. One can also see that having line number 7 is wrong to the extent that the position of the label doesn't match the number of times it executes in the source. So placement of the label is wrong here, possibly caused by CFG cleanup after CFG build (but generally labels are not used for anything once the CFG is built and coverage instrumentation is late so it might fail due to us moving labels). It might be OK to avoid moving labels for --coverage but then coverage should possibly look at edges rather than labels? 
Thanks, I investigated the Labels, it seems wrong at the beginning from .gimple to .cfg very early quite like PR90574: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90574 .gimple: int f (int s, int n) [small.c:2:1] { int D.2755; int p; [small.c:3:7] p = 0; [small.c:5:3] switch (s) , [small.c:7:5] case 0: , [small.c:11:5] case 1: > [small.c:7:5] : <= case label :<= loop label [small.c:8:13] p = p + 1; [small.c:8:26] n = n + -1; [small.c:8:26] if (n != 0) goto ; else goto ; : [small.c:9:14] D.2755 = p; [small.c:9:14] return D.2755; [small.c:11:5] : : [small.c:12:13] p = p + 1; [small.c:12:26] n = n + -1; [small.c:12:26] if (n != 0) goto ; else goto ; : [small.c:13:14] D.2755 = p; [small.c:13:14] return D.2755; : [small.c:16:10] D.2755 = 0; [small.c:16:10] return D.2755; } .cfg: int f (int s, int n) { int p; int D.2755; : [small.c:3:7] p = 0; [small.c:5:3] switch (s) [INV], [small.c:7:5] case 0: [INV], [small.c:11:5] case 1: [INV]> : [small.c:7:5] : <= case 0 [small.c:8:13 discrim 1] p = p + 1; [small.c:8:26 discrim 1] n = n + -1; [small.c:8:26 discrim 1] if (n != 0) goto ; [INV] else goto ; [INV] : [small.c:9:14] D.2755 = p; [small.c:9:14] goto ; [INV] : [small.c:11:5] : <= case 1 [small.c:12:13 discrim 1] p = p + 1; [small.c:12:26 discrim 1] n = n + -1; [small.c:12:26 discrim 1] if (n != 0) goto ; [INV] else goto ; [INV] The labels are merged into the loop unexpected, so I tried below fix for --coverage if two labels are not on same line to start new basic block: index 10ca86714f4..b788198ac31 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@
Re: [PATCH 1/2] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/7 16:53, Richard Biener wrote: On Tue, 7 Mar 2023, Xionghu Luo wrote: Unfortunately this change (flag_test_coverage -> !optimize ) caused hundred of gfortran cases execution failure with O0. Take gfortran.dg/index.f90 for example: .gimple: __attribute__((fn spec (". "))) void p () [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:6:9] { [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:13:28] L.1: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:14:28] L.2: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:15:28] L.3: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:16:28] L.4: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:17:28] L.5: [/data/RocksDB_Docker/tgcc-master/gcc/testsuite/gfortran.dg/index_4.f90:18:72] L.6: } .cfg: ... Removing basic block 7 ;; basic block 7, loop depth 0 ;; pred: return; ;; succ: EXIT ;; 1 loops found ;; ;; Loop 0 ;; header 0, latch 1 ;; depth 0, outer -1 ;; nodes: 0 1 2 ;;2 succs { } __attribute__((fn spec (". "))) void p () { : } Due to the "return;" is removed in bb 7. OK, the issue is that make_edges_bb does nothing for an empty block but it should at least create a fallthru edge here. Thus, if (!last) fallthru = true; else switch (gimple_code (last)) { instead of simply returning if (!last). The alternative would be to make sure that cleanup_dead_labels preserves at least one statement in a block. Looking at the testcases I wonder if preserving all the fallthru labels is really necessary - for coverage we should have a counter ready. For the testcase we arrive with L.1: L.2: L.3: L.4: i = 1; It was: : : L.1: : L.2: : L.3: : L.4: : L.5: : L.6: return; : before the second call of cleanup_dead_labels, after it, all labels are removed, then tree_forwarder_block_p remove all forworders. Yes, it creates blocks and remove blocks immediately... where the frontend simplified things but put labels at each line. I suppose we could optimize this by re-computing TREE_USED and only splitting before labels reached by a control statement? That would cover the backedge case in the original testcase. cleanup_dead_labels does something like that already. actually in build_gimple_cfg, cleanup_dead_labels will remove all labels L.1 to L.6 first, then make_edges fail to create edges for to due to they are all EMPTY bb in make_edges_bb... 240│ /* To speed up statement iterator walks, we first purge dead labels. */ 241│ cleanup_dead_labels (); 242│ 243│ /* Group case nodes to reduce the number of edges. 244│ We do this after cleaning up dead labels because otherwise we miss 245│ a lot of obvious case merging opportunities. */ 246│ group_case_labels (); 247│ 248│ /* Create the edges of the flowgraph. */ 249│ discriminator_per_locus = new hash_table (13); 250├> make_edges (); : : : : : : : return; : Seems deadlock here as you said to set goto_locus as labels are removed before edges are created, the case could pass if I comment out the function cleanup_dead_labels(), so also not call it when !optimize? if (!!optimize) cleanup_dead_labels (); That probably makes sense. Looking at group_case_labels () that also seems to do unwanted things (to debugging and coverage), its comment says that for switch (i) { case 1: /* fallthru */ case 2: /* fallthru */ case 3: k = 0; it would replace that with case 1..3: k = 0; but that also fails to produce correct coverage, right? Likewise setting breakpoints. Yes. Should also exclude this. 
Does preserving the labels help setting a goto_locus for the fallthru edges? I don't see any code doing that, so CFG cleanup will remove the forwarders we created again. For the backedge case with switch-case-do-while, tree_forwarder_block_p returns false when iterating the statement check. The new created with only one case label instruction still owns location information in it, so CFG cleanup won't remove the forwarders. 390│ for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi)) 391│ { 392│ gimple *stmt = gsi_stmt (gsi); 393│ 394│ switch (gimple_code (stmt)) 395│ { 396│ case GIMPLE_LABEL: 397│ if (DECL_NONLOCAL (gimple_label_label (as_a (stmt 398│ return false; 399│ if (!optimize 400│ && (gimple_has_location (stmt) 401│ || LOCATION_LOCUS (locus) != UNKNOWN_LOCATION) 402│ && gimple_location (stmt) != locus) 403├>return false; 404│ break; (gdb) ps stmt : (gdb) p gimple_location (stmt) $154 = 2147483656 (gdb) pel $154 {file = 0x3e41af0 "small.c", line = 7, column = 5,
[PATCH v3] gcov: Fix "do-while" structure in case statement leading to incorrect code coverage [PR93680]
On 2023/3/7 19:25, Richard Biener wrote: It would be nice to avoid creating blocks / preserving labels we'll immediately remove again. For that we do need some analysis before creating basic-blocks that determines whether a label is possibly reached by a non-falltru edge. : p = 0; switch (s) , case 0: , case 1: > : : <= prev_stmt : <= stmt p = p + 1; n = n + -1; if (n != 0) goto ; else goto ; Check if is a case label and is a goto target then return true in stmt_starts_bb_p to start a new basic block? This would avoid creating and removing blocks, but cleanup_dead_labels has all bbs setup while stmt_starts_bb_p does't yet to iterate bbs/labels to establish label_for_bb[] map? Yes. I think we'd need something more pragmatic before make_blocks (), like re-computing TREE_USED of the label decls or computing a bitmap of targeted labels (targeted by goto, switch or any other means). I'll note that doing a cleanup_dead_labels () like optimization before we create blocks will help keeping LABEL_DECL_UID and thus label_to_block_map dense. But it does look like a bit of an chicken-and-egg problem and the question is how effective the dead label removal is in practice. Tried to add function compute_target_labels(not sure whether the function name is suitable) in the front of make_blocks_1, now the fortran case doesn't create/removing blocks now, but I still have several questions: 1. I used hash_set to save the target labels instead of bitmap, as labels are tree type value instead of block index so bitmap is not good for it since we don't have LABEL_DECL_UID now? 2. Is the compute_target_labels still only for !optimize? And if we compute the target labels before create bbs, it is unnessary to guard the first cleanup_dead_labels under !optimize now, because the switch-case-do-while case already create new block for CASE_LABEL already. 3. I only added GIMPLE_SWITCH/GIMPLE_COND in compute_target_labels so far, is it needed to also handle GIMPLE_ASM/GIMPLE_TRANSACTION and even labels_eh? PS1: The v3 patch will cause one test case fail: Number of regressions in total: 1 FAIL: gcc.c-torture/compile/limits-caselabels.c -O0 (test for excess errors) due to this exausting case has labels from L0 to L11, they won't be optimized to a simple if-else expression like before... PS2: The GIMPLE_GOTO piece of code would cause some fortran cases run fail due to __builtin_unreachable trap generated in .fixup_cfg1, I didn't dig into it so just skip these label... + case GIMPLE_GOTO: +#if 0 + if (!computed_goto_p (stmt)) + { + tree dest = gimple_goto_dest (stmt); + target_labels->add (dest); + } +#endif + break; Change the #if 0 to #if 1 result in: Number of regressions in total: 8 FAIL: gcc.c-torture/compile/limits-caselabels.c -O0 (test for excess errors) FAIL: gcc.dg/analyzer/explode-2a.c (test for excess errors) FAIL: gcc.dg/analyzer/pragma-2.c (test for excess errors) FAIL: gfortran.dg/bound_2.f90 -O0 execution test FAIL: gfortran.dg/bound_7.f90 -O0 execution test FAIL: gfortran.dg/char_result_14.f90 -O0 execution test FAIL: gfortran.dg/pointer_array_1.f90 -O0 execution test FAIL: gfortran.dg/select_type_15.f03 -O0 execution test Paste the updated patch v3: v3: Add compute_target_labels and call it in the front of make_blocks_1. Start a new basic block if two labels have different location when test-coverage. Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/93680 * tree-cfg.cc (stmt_starts_bb_p): Check whether the label is in target_labels. 
(compute_target_labels): New function. (make_blocks_1): Call compute_target_labels. gcc/testsuite/ChangeLog: PR gcov/93680 * g++.dg/gcov/gcov-1.C: Correct counts. * gcc.misc-tests/gcov-4.c: Likewise. * gcc.misc-tests/gcov-pr85332.c: Likewise. * lib/gcov.exp: Also clean gcda if fail. * gcc.misc-tests/gcov-pr93680.c: New test. Signed-off-by: Xionghu Luo --- gcc/tree-cfg.cc | 68 - gcc/testsuite/g++.dg/gcov/gcov-1.C | 2 +- gcc/testsuite/gcc.dg/analyzer/paths-4.c | 8 +-- gcc/testsuite/gcc.misc-tests/gcov-pr85332.c | 2 +- gcc/testsuite/gcc.misc-tests/gcov-pr93680.c | 24 gcc/testsuite/lib/gcov.exp | 4 +- 6 files changed, 96 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.misc-tests/gcov-pr93680.c diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index a9fcc7fd050..0f8efcf4aa3 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -164,7 +164,7 @@ static edge gimple_redirect_edge_and_branch (edge, basic_block); static edge gimple_try_redirect_by_replacing_jump (edge, basic_block); /* Various helpers. */ -static inline bool stmt_s
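To make the approach easier to follow, here is a rough sketch of what compute_target_labels does: one linear walk over the statement sequence that records every label targeted by a non-fallthru construct, so stmt_starts_bb_p can simply test membership instead of creating blocks that are deleted again. This is only an illustration of the idea, not the exact hunk from the patch; the signature and the GIMPLE_ASM case are assumptions.

/* Sketch only: collect every label reachable by a non-fallthru construct
   (conditional, switch, asm goto) into TARGET_LABELS.  */
static void
compute_target_labels (gimple_seq seq, hash_set<tree> *target_labels)
{
  for (gimple_stmt_iterator gsi = gsi_start (seq); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      switch (gimple_code (stmt))
        {
        case GIMPLE_COND:
          {
            gcond *c = as_a <gcond *> (stmt);
            if (gimple_cond_true_label (c))
              target_labels->add (gimple_cond_true_label (c));
            if (gimple_cond_false_label (c))
              target_labels->add (gimple_cond_false_label (c));
            break;
          }
        case GIMPLE_SWITCH:
          {
            gswitch *s = as_a <gswitch *> (stmt);
            for (unsigned i = 0; i < gimple_switch_num_labels (s); i++)
              target_labels->add (CASE_LABEL (gimple_switch_label (s, i)));
            break;
          }
        case GIMPLE_ASM:
          {
            gasm *a = as_a <gasm *> (stmt);
            for (unsigned i = 0; i < gimple_asm_nlabels (a); i++)
              target_labels->add (TREE_VALUE (gimple_asm_label_op (a, i)));
            break;
          }
        default:
          break;
        }
    }
}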
Re: [PATCH v4] gcov: Fix "do-while" structure in case statement leads to incorrect code coverage [PR93680]
On 2023/3/9 20:02, Richard Biener wrote: On Wed, 8 Mar 2023, Xionghu Luo wrote: On 2023/3/7 19:25, Richard Biener wrote: It would be nice to avoid creating blocks / preserving labels we'll immediately remove again. For that we do need some analysis before creating basic-blocks that determines whether a label is possibly reached by a non-falltru edge. : p = 0; switch (s) , case 0: , case 1: > : : <= prev_stmt : <= stmt p = p + 1; n = n + -1; if (n != 0) goto ; else goto ; Check if is a case label and is a goto target then return true in stmt_starts_bb_p to start a new basic block? This would avoid creating and removing blocks, but cleanup_dead_labels has all bbs setup while stmt_starts_bb_p does't yet to iterate bbs/labels to establish label_for_bb[] map? Yes. I think we'd need something more pragmatic before make_blocks (), like re-computing TREE_USED of the label decls or computing a bitmap of targeted labels (targeted by goto, switch or any other means). I'll note that doing a cleanup_dead_labels () like optimization before we create blocks will help keeping LABEL_DECL_UID and thus label_to_block_map dense. But it does look like a bit of an chicken-and-egg problem and the question is how effective the dead label removal is in practice. Tried to add function compute_target_labels(not sure whether the function name is suitable) in the front of make_blocks_1, now the fortran case doesn't create/removing blocks now, but I still have several questions: 1. I used hash_set to save the target labels instead of bitmap, as labels are tree type value instead of block index so bitmap is not good for it since we don't have LABEL_DECL_UID now? We don't have LABEL_DECL_UID, we have DECL_UID though, but the choice of hash_set vs. bitmap is somewhat arbitrary here. The real cost is the extra walk over all stmts. 2. Is the compute_target_labels still only for !optimize? And if we compute the target labels before create bbs, it is unnessary to guard the first cleanup_dead_labels under !optimize now, because the switch-case-do-while case already create new block for CASE_LABEL already. OK. 3. I only added GIMPLE_SWITCH/GIMPLE_COND in compute_target_labels so far, is it needed to also handle GIMPLE_ASM/GIMPLE_TRANSACTION and even labels_eh? I'd add GIMPLE_ASM handling, the rest should be OK wrt debugging and coverage already? Added in patch v4. PS1: The v3 patch will cause one test case fail: Number of regressions in total: 1 FAIL: gcc.c-torture/compile/limits-caselabels.c -O0 (test for excess errors) due to this exausting case has labels from L0 to L11, they won't be optimized to a simple if-else expression like before... Hmm, that's somewhat unexpected. It could be fixed by not start a new block if two locus are on same line as the labels are expanded by MACRO with same location info. BTW, I found that two UNKOWN_LOCATION variable may have different value but return true in same_line_p? 2: locus1 = 2147483670 3: locus2 = 2147483652 (gdb) pel locus1 {file = 0x0, line = 0, column = 0, data = 0x76bdc300, sysp = false} (gdb) pel locus2 {file = 0x0, line = 0, column = 0, data = 0x76bdc4e0, sysp = false} (gdb) p LOCATION_LOCUS (locus1) $16 = 0 (gdb) p LOCATION_LOCUS (locus2) $17 = 0 So fix the function like this? 
@@ -1152,6 +1218,10 @@ same_line_p (location_t locus1, expanded_location *from, location_t locus2)
 {
   expanded_location to;

+  if (LOCATION_LOCUS (locus1) == UNKNOWN_LOCATION
+      && LOCATION_LOCUS (locus2) == UNKNOWN_LOCATION)
+    return false;
+
   if (locus1 == locus2)
     return true;

PS2: The GIMPLE_GOTO piece of code causes some fortran cases to fail at run time due to a __builtin_unreachable trap generated in .fixup_cfg1; I didn't dig into it, so just skip these labels...

Please investigate, we might be missing a corner case here.

Yes. Take pointer_array_1.f90 as an example: it has an UNUSED label "L.7" with locus info in it; not sure why it exists, it is already there in .original.

[pointer_array_1.f90:39:10] if (test.14 != 0) goto ; else goto ; : [pointer_array_1.f90:39:52] _gfortran_stop_numeric (3, 0); : parm.16 = {CLOBBER(eol)}; [pointer_array_1.f90:39:52] L.7: <= UNUSED label : [pointer_array_1.f90:39:52] L.3: atmp.0 = {CLOBBER(eol)}; A.1 = {CLOBBER(eol)}; atmp.5 = {CLOBBER(eol)}; A.6 = {CLOBBER(eol)}; d = {CLOBBER(eol)}; [pointer_array_1.f90:41:14] return;

stmt_starts_bb_p will return true for L.7 because the prev_stmt "parm.16 = {CLOBBER(eol)};" is not a label statement, and then it will also return true in stmt_starts_bb_p because the label_stmt and prev_stmt are NOT on the same line.

: L.9: L.8: if (test.14 != 0) goto ; else goto ; : : _gfortran_stop_numeric (3, 0); : : parm.16 = {CLOBBER(eol)}; : <= empty block L.7: : : L.3: atmp.0 = {CLOBBER(eol)}; A.1 = {CLOBBER(eo
[PATCH v4] gcov: Fix "do-while" structure in case statement leads to incorrect code coverage [PR93680]
On 2023/3/9 20:02, Richard Biener wrote: On Wed, 8 Mar 2023, Xionghu Luo wrote: On 2023/3/7 19:25, Richard Biener wrote: It would be nice to avoid creating blocks / preserving labels we'll immediately remove again. For that we do need some analysis before creating basic-blocks that determines whether a label is possibly reached by a non-falltru edge. : p = 0; switch (s) , case 0: , case 1: > : : <= prev_stmt : <= stmt p = p + 1; n = n + -1; if (n != 0) goto ; else goto ; Check if is a case label and is a goto target then return true in stmt_starts_bb_p to start a new basic block? This would avoid creating and removing blocks, but cleanup_dead_labels has all bbs setup while stmt_starts_bb_p does't yet to iterate bbs/labels to establish label_for_bb[] map? Yes. I think we'd need something more pragmatic before make_blocks (), like re-computing TREE_USED of the label decls or computing a bitmap of targeted labels (targeted by goto, switch or any other means). I'll note that doing a cleanup_dead_labels () like optimization before we create blocks will help keeping LABEL_DECL_UID and thus label_to_block_map dense. But it does look like a bit of an chicken-and-egg problem and the question is how effective the dead label removal is in practice. Tried to add function compute_target_labels(not sure whether the function name is suitable) in the front of make_blocks_1, now the fortran case doesn't create/removing blocks now, but I still have several questions: 1. I used hash_set to save the target labels instead of bitmap, as labels are tree type value instead of block index so bitmap is not good for it since we don't have LABEL_DECL_UID now? We don't have LABEL_DECL_UID, we have DECL_UID though, but the choice of hash_set vs. bitmap is somewhat arbitrary here. The real cost is the extra walk over all stmts. 2. Is the compute_target_labels still only for !optimize? And if we compute the target labels before create bbs, it is unnessary to guard the first cleanup_dead_labels under !optimize now, because the switch-case-do-while case already create new block for CASE_LABEL already. OK. 3. I only added GIMPLE_SWITCH/GIMPLE_COND in compute_target_labels so far, is it needed to also handle GIMPLE_ASM/GIMPLE_TRANSACTION and even labels_eh? I'd add GIMPLE_ASM handling, the rest should be OK wrt debugging and coverage already? PS1: The v3 patch will cause one test case fail: Number of regressions in total: 1 FAIL: gcc.c-torture/compile/limits-caselabels.c -O0 (test for excess errors) due to this exausting case has labels from L0 to L11, they won't be optimized to a simple if-else expression like before... Hmm, that's somewhat unexpected. PS2: The GIMPLE_GOTO piece of code would cause some fortran cases run fail due to __builtin_unreachable trap generated in .fixup_cfg1, I didn't dig into it so just skip these label... Please investigate, we might be missing a corner case here. I think the *previous fix* for labels “in the middle of block” is *incorrect*, it should be handled in make_edges_bb when a basic block only has Label in it, just create a fallthrough edge for it to avoid wrong cfg and unreachable trap generated? @@ -853,6 +922,12 @@ make_edges_bb (basic_block bb, struct omp_region **pcur_region, int *pomp_index) bool fallthru = false; int ret = 0; + if (!optimize && !last) +{ + make_edge (bb, bb->next_bb, EDGE_FALLTHRU); + return 0; +} + if (!last) return ret; With the fix, the attached version could pass bootstrap and regression test on x86_64-linux-gnu. 
From ec505cc7952707db805802af83dd82776a1d949f Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Tue, 28 Feb 2023 17:46:18 +0800 Subject: [PATCH v4] gcov: Fix "do-while" structure in case statement leads to incorrect code coverage [PR93680] v4: Address comments. 4.1. Handle GIMPLE_GOTO and GIMPLE_ASM. 4.2. Fix failure of limit-caselabels.c (labels on same line), pointer_array_1.f90 (unused labels) etc. v3: Add compute_target_labels and call it in the front of make_blocks_1. v2: Check whether two locus are on same line. Start a new basic block if two labels have different location when test-coverage. Regression tested pass on x86_64-linux-gnu and aarch64-linux-gnu, OK for master? gcc/ChangeLog: PR gcov/93680 * tree-cfg.cc (stmt_starts_bb_p): Check whether the label is in target_labels. (compute_target_labels): New function. (make_blocks_1): Call compute_target_labels. (same_line_p): Return false if two locus are both UNKOWN_LOCATION. gcc/testsuite/ChangeLog: PR gcov/93680 * g++.dg/gcov/gcov-1.C: Correct counts. * gcc.misc-tests/gcov-4.c: Likewise. * gcc.misc-tests/gcov-pr85332.c: Likewise. * lib/gcov.exp: Also clean gcda if fail. * gcc.m
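For background, the shape of code PR93680 is about is roughly the following; this is a hedged sketch rather than the committed gcc.misc-tests/gcov-pr93680.c, and the comments describe the intended counts, not verified gcov output. Before the fix the case label and the do-while's branch target ended up in the same basic block, so the label was reported with the loop's iteration count.

int g;

void
f (int s, int n)
{
  switch (s)
    {
    case 0:        /* expected count: once per call with s == 0 */
      do
        g++;       /* expected count: n times */
      while (--n);
      break;
    case 1:
      g--;
      break;
    }
}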
[PATCH] rs6000: Fix vec insert ilp32 ICE and test failures [PR98799]
From: "luo...@cn.ibm.com" UNSPEC_SI_FROM_SF is not supported when TARGET_DIRECT_MOVE_64BIT is false for -m32, don't generate VIEW_CONVERT_EXPR(ARRAY_REF) for variable vector insert. Remove rs6000_expand_vector_set_var helper function, adjust the p8 and p9 definitions position and make them static. The previous commit r11-6858 missed check m32, This patch is tested pass on P7BE{m32,m64}/P8BE{m32,m64}/P8LE/P9LE with RUNTESTFLAGS="--target_board =unix'{-m32,-m64}" for BE targets. gcc/ChangeLog: 2021-01-26 Xionghu Luo David Edelsohn PR target/98799 * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Don't generate VIEW_CONVERT_EXPR for m32. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var): Delete. * config/rs6000/rs6000.c (rs6000_expand_vector_set): Remove the wrapper call rs6000_expand_vector_set_var. Call rs6000_expand_vector_set_var_p9 and rs6000_expand_vector_set_var_p8 directly. (rs6000_expand_vector_set_var): Delete. gcc/testsuite/ChangeLog: 2021-01-26 Xionghu Luo PR target/98827 * gcc.target/powerpc/fold-vec-insert-char-p8.c: Adjust ilp32. * gcc.target/powerpc/fold-vec-insert-char-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-double.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-longlong.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p9.c: Likewise. * gcc.target/powerpc/pr79251.p8.c: Likewise. * gcc.target/powerpc/pr79251.p9.c: Likewise. * gcc.target/powerpc/vsx-builtin-7.c: Likewise. --- gcc/config/rs6000/rs6000-c.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 1 - gcc/config/rs6000/rs6000.c| 236 +- .../powerpc/fold-vec-insert-char-p8.c | 14 +- .../powerpc/fold-vec-insert-char-p9.c | 6 +- .../powerpc/fold-vec-insert-double.c | 10 +- .../powerpc/fold-vec-insert-float-p8.c| 12 +- .../powerpc/fold-vec-insert-float-p9.c| 6 +- .../powerpc/fold-vec-insert-int-p8.c | 13 +- .../powerpc/fold-vec-insert-int-p9.c | 9 +- .../powerpc/fold-vec-insert-longlong.c| 8 +- .../powerpc/fold-vec-insert-short-p8.c| 10 +- .../powerpc/fold-vec-insert-short-p9.c| 13 +- gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 +- gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 16 +- .../gcc.target/powerpc/vsx-builtin-7.c| 2 +- 16 files changed, 203 insertions(+), 172 deletions(-) diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index f6ee1e61b56..656cdb39f3f 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1600,7 +1600,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - if (TARGET_P8_VECTOR) + if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) { stmt = build_array_ref (loc, stmt, arg2); stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 9a46a414743..9cca7325d0d 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -58,7 +58,6 @@ extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, rtx); -extern void rs6000_expand_vector_set_var (rtx, rtx, rtx); extern void 
rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index f5565a1a253..471bf5660bd 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6977,122 +6977,10 @@ rs6000_expand_vector_init (rtx target, rtx vals) emit_move_insn (target, mem); } -/* Set field ELT_RTX of TARGET to VAL. */ - -void -rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) -{ - machine_mode mode = GET_MODE (target); - machine_mode inner_mode = GET_MODE_INNER (mode); - rtx reg = gen_reg_rtx (mode); - rtx mask, mem, x; - int width = GET_MODE_SIZE (inner_mode); - int i; - - val = force_reg (GET_MODE (val), val); - - if (VECTOR_MEM_VSX_P (mode)) -{ - if (!CONST_INT_P (elt_rtx)) - { -
Re: [PATCH] rs6000: Fix vec insert ilp32 ICE and test failures [PR98799]
Hi, On 2021/1/27 03:00, David Edelsohn wrote: > On Tue, Jan 26, 2021 at 2:46 AM Xionghu Luo wrote: >> >> From: "luo...@cn.ibm.com" >> >> UNSPEC_SI_FROM_SF is not supported when TARGET_DIRECT_MOVE_64BIT >> is false for -m32, don't generate VIEW_CONVERT_EXPR(ARRAY_REF) for >> variable vector insert. Remove rs6000_expand_vector_set_var helper >> function, adjust the p8 and p9 definitions position and make them >> static. >> >> The previous commit r11-6858 missed check m32, This patch is tested pass >> on P7BE{m32,m64}/P8BE{m32,m64}/P8LE/P9LE with >> RUNTESTFLAGS="--target_board =unix'{-m32,-m64}" for BE targets. > > Hi, Xionghu > > Thanks for addressing these failures and the cleanups. > > This patch addresses most of the failures. > > pr79251-run.c continues to fail. The directives are not complete. > I'm not certain if your intention is to run the testcase on all > targets or only on Power7 and above. The testcase relies on vector > "long long", which only is available with -mvsx, but the testcase only > enables -maltivec. I believe that the testcase happens to pass on the > Linux platforms you tested because GCC defaulted to Power7 or Power8 > ISA and the ABI specifies VSX. The testcase probably needs to be > restricted to only run on some level of VSX enabled processor (VSX? > Power8? Power9?) and also needs some additional compiler options when > compiling the testcase instead of relying upon the default > configuration of the compiler. P8BE: gcc/testsuite/gcc/gcc.sum(it didn't run before due to no 'dg-do run'): Running target unix/-m32 Running /home/luoxhu/workspace/gcc/gcc/testsuite/gcc.target/powerpc/powerpc.exp ... PASS: gcc.target/powerpc/pr79251-run.c (test for excess errors) PASS: gcc.target/powerpc/pr79251-run.c execution test === gcc Summary for unix/-m32 === # of expected passes2 Running target unix/-m64 Running /home/luoxhu/workspace/gcc/gcc/testsuite/gcc.target/powerpc/powerpc.exp ... PASS: gcc.target/powerpc/pr79251-run.c (test for excess errors) PASS: gcc.target/powerpc/pr79251-run.c execution test === gcc Summary for unix/-m64 === # of expected passes2 How did you get the failure of pr79251-run.c, please? I tested it all passes on P7BE{m32,m64}/P8BE{m32,m64}/P8LE/P9LE of Linux. This case is just verifying the *functionality* of "u = vec_insert (254, v, k)" and compare whether u[k] is changed to 254, it must work on all platforms, no matter with the optimization or not, otherwise there is a functional error. As to "long long", add target vsx_hw and powerpc like below? (Also change the -maltive to -mvsx for pr79251.p8.c/pr79251.p9.c.) --- a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c @@ -1,4 +1,6 @@ -/* { dg-options "-O2 -maltivec" } */ +/* { dg-do run { target powerpc*-*-* } } */ +/* { dg-require-effective-target vsx_hw { target powerpc*-*-* } } */ +/* { dg-options "-O2 -mvsx" } */ Any other options necessary to limit the testcases? :) > > Also, part of the change seems to be > >> - if (TARGET_P9_VECTOR || GET_MODE_SIZE (inner_mode) == 8) >> -rs6000_expand_vector_set_var_p9 (target, val, idx); >> + if ((TARGET_P9_VECTOR && TARGET_POWERPC64) || width == 8) >> + { >> + rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); >> + return; >> + } > > Does the P9 case need TARGET_POWERPC64? This optimization seemed to > be functioning on P9 in 32 bit mode prior to this fix. It would be a > shame to unnecessarily disable this optimization in 32 bit mode. 
> Or maybe it generated a functioning sequence but didn't utilize the
> optimization.  Would you please check / clarify?

>> -  if (TARGET_P8_VECTOR)
>> +  if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT)
>>      {
>>        stmt = build_array_ref (loc, stmt, arg2);
>>        stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt,

This change in rs6000-c.c means VIEW_CONVERT_EXPR(ARRAY_REF) gimple is no longer generated for P9 32-bit, so the IFN VEC_SET won't be matched and rs6000.c:rs6000_expand_vector_set_var_p9 won't be called to produce the optimized "lvsl+xxperm+lvsr" sequence for P9 32-bit anymore. It's a pity, but without this change it ICEs on P8BE 32-bit because UNSPEC_SI_FROM_SF is not supported for -m32. If we need to support P9 32-bit, why not also support P8 32-bit, since only the float vec_insert ICEs; is there any way to move SI from SF for P8 32-bit? (I verified the -m32 optimized and non-optimized binaries for int vec_insert on P8 BE 32-bit; the performance gain is also huge, about 8x with this patch.)

rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx)
{
...
  /* mtvsrd[wz] f0,tmp_val.  */
  rtx tmp_val = gen_reg_rtx (SImode);
  if (inner_mode == E_SFmode)
    emit_insn (gen_movsi_from_sf (tmp_val, val));
  else
    tmp_val = force_reg (SImode, val);
...
}

Thanks
Xionghu
[PATCH] testsuite: Run vec_insert case on P8 and P9 with option specified
Move common functions to header file for cleanup. gcc/testsuite/ChangeLog: 2021-01-27 Xionghu Luo * gcc.target/powerpc/pr79251.p8.c: Move definition to ... * gcc.target/powerpc/pr79251.h: ...this. * gcc.target/powerpc/pr79251.p9.c: Likewise. * gcc.target/powerpc/pr79251-run.c: Rename to... * gcc.target/powerpc/pr79251-run.p8.c: ...this. * gcc.target/powerpc/pr79251-run.p9.c: New test. --- .../gcc.target/powerpc/pr79251-run.c | 30 --- .../gcc.target/powerpc/pr79251-run.p8.c | 14 + .../gcc.target/powerpc/pr79251-run.p9.c | 14 + gcc/testsuite/gcc.target/powerpc/pr79251.h| 17 +++ gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 2 -- gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 2 -- 6 files changed, 45 insertions(+), 34 deletions(-) delete mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.p8.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.p9.c diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c deleted file mode 100644 index 6afd357c7ba..000 --- a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c +++ /dev/null @@ -1,30 +0,0 @@ -/* { dg-do run } */ -/* { dg-require-effective-target vsx_hw } */ -/* { dg-options "-O2 -mvsx" } */ - -#include -#include -#include "pr79251.h" - -TEST_VEC_INSERT_ALL (test) - -#define run_test(TYPE, num) \ - { \ -vector TYPE v; \ -vector TYPE u = {0x0}; \ -for (long k = 0; k < 16 / sizeof (TYPE); k++) \ - v[k] = 0xaa; \ -for (long k = 0; k < 16 / sizeof (TYPE); k++) \ - { \ - u = test##num (v, 254, k); \ - if (u[k] != (TYPE) 254)\ - __builtin_abort (); \ - } \ - } - -int -main (void) -{ - TEST_VEC_INSERT_ALL (run_test) - return 0; -} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.p8.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.p8.c new file mode 100644 index 000..47d4d288f3c --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.p8.c @@ -0,0 +1,14 @@ +/* { dg-do run } */ +/* { dg-require-effective-target p8vector_hw } */ +/* { dg-options "-O2 -mvsx -mdejagnu-cpu=power8" } */ + +#include +#include +#include "pr79251.h" + +int +main (void) +{ + TEST_VEC_INSERT_ALL (run_test) + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.p9.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.p9.c new file mode 100644 index 000..fd56b2356f4 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.p9.c @@ -0,0 +1,14 @@ +/* { dg-do run } */ +/* { dg-require-effective-target p9vector_hw } */ +/* { dg-options "-O2 -mvsx -mdejagnu-cpu=power9" } */ + +#include +#include +#include "pr79251.h" + +int +main (void) +{ + TEST_VEC_INSERT_ALL (run_test) + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.h b/gcc/testsuite/gcc.target/powerpc/pr79251.h index addb067f9ed..2684b660966 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr79251.h +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.h @@ -17,3 +17,20 @@ T (unsigned long long, 7) \ T (float, 8) \ T (double, 9) + +TEST_VEC_INSERT_ALL (test) + +#define run_test(TYPE, num) \ + { \ +vector TYPE v; \ +vector TYPE u = {0x0}; \ +for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + v[k] = 0xaa; \ +for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + { \ + u = test##num (v, 254, k); \ + if (u[k] != (TYPE) 254)
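For readers without the tree handy, each T (TYPE, num) entry expanded by TEST_VEC_INSERT_ALL in pr79251.h defines a small vec_insert wrapper that the run_test macro above exercises. One instantiation looks roughly like the following; the wrapper number, attributes and index type are illustrative and may differ from the real header.

__attribute__ ((noinline, noclone)) vector int
test4 (vector int v, int i, unsigned int n)
{
  return vec_insert (i, v, n);
}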
[PATCH] testsuite: Update pr79251 ilp32 store regex.
BE ilp32 Linux generates extra stack stwu instructions which shouldn't be counted in, \m … \M is needed around each instruction, not just the beginning and end of the entire pattern. Pre-approved, committing. gcc/testsuite/ChangeLog: 2021-02-01 Xionghu Luo * gcc.target/powerpc/pr79251.p8.c: Update regex. * gcc.target/powerpc/pr79251.p9.c: Likewise. --- gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 2 +- gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.p8.c b/gcc/testsuite/gcc.target/powerpc/pr79251.p8.c index b0e7732a38b..178e02fc866 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr79251.p8.c +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.p8.c @@ -16,7 +16,7 @@ /* { dg-final { scan-assembler-times {\mrlwinm\M} 10 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mstxvw4x\M} 6 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mstxvd2x\M} 4 { target ilp32 } } } */ -/* { dg-final { scan-assembler-times {\mstb|sth|stw|stfs|stfd\M} 12 { target ilp32 } } } */ +/* { dg-final { scan-assembler-times {\mstb\M|\msth\M|\mstw\M|\mstfs\M|\mstfd\M} 12 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mlxvw4x\M} 6 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mlxvd2x\M} 4 { target ilp32 } } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.p9.c b/gcc/testsuite/gcc.target/powerpc/pr79251.p9.c index cedb0bf7da8..2ae2da8c72e 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr79251.p9.c +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.p9.c @@ -17,6 +17,6 @@ /* { dg-final { scan-assembler-times {\mrlwinm\M} 10 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mstxv\M} 10 { target ilp32 } } } */ -/* { dg-final { scan-assembler-times {\mstb|sth|stw|stfs|stfd\M} 12 { target ilp32 } } } */ +/* { dg-final { scan-assembler-times {\mstb\M|\msth\M|\mstw\M|\mstfs\M|\mstfd\M} 12 { target ilp32 } } } */ /* { dg-final { scan-assembler-times {\mlxv\M} 10 { target ilp32 } } } */ -- 2.25.1
[PATCH] rs6000: Convert the vector element register to SImode [PR98914]
v[k] will also be expanded to IFN VEC_SET if k is long type when built with -Og. -O0 didn't exposed the issue due to v is TREE_ADDRESSABLE, -O1 and above also didn't capture it because of v[k] is not optimized to VIEW_CONVERT_EXPR(v)[k_1]. vec_insert defines the element argument type to be signed int by ELFv2 ABI, so convert it to SImode if it wasn't for Power target requirements. gcc/ChangeLog: 2021-02-03 Xionghu Luo * config/rs6000/rs6000.c (rs6000_expand_vector_set): Convert elt_rtx to SImode if it wasn't. gcc/testsuite/ChangeLog: 2021-02-03 Xionghu Luo * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 17 ++--- gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 +++ 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ec068c58aa5..9f7f8da56c6 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7000,8 +7000,6 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); rtx tmp = gen_reg_rtx (GET_MODE (idx)); @@ -7047,8 +7045,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); @@ -7144,7 +7140,7 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); rtx reg = gen_reg_rtx (mode); - rtx mask, mem, x; + rtx mask, mem, x, elt_si; int width = GET_MODE_SIZE (inner_mode); int i; @@ -7154,16 +7150,23 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { if (!CONST_INT_P (elt_rtx)) { + /* elt_rtx should be SImode from ELFv2 ABI. */ + elt_si = gen_reg_rtx (E_SImode); + if (GET_MODE (elt_rtx) != E_SImode) + convert_move (elt_si, elt_rtx, 0); + else + elt_si = elt_rtx; + /* For V2DI/V2DF, could leverage the P9 version to generate xxpermdi when elt_rtx is variable. */ if ((TARGET_P9_VECTOR && TARGET_POWERPC64) || width == 8) { - rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p9 (target, val, elt_si); return; } else if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) { - rs6000_expand_vector_set_var_p8 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p8 (target, val, elt_si); return; } } diff --git a/gcc/testsuite/gcc.target/powerpc/pr98914.c b/gcc/testsuite/gcc.target/powerpc/pr98914.c new file mode 100644 index 000..e4d78e3e6b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr98914.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-Og -mvsx" } */ + +vector int +foo (vector int v) +{ + for (long k = 0; k < 1; ++k) +v[k] = 0; + return v; +} -- 2.25.1
Ping: [PATCH] rs6000: Convert the vector element register to SImode [PR98914]
Gentle ping, thanks. On 2021/2/3 17:01, Xionghu Luo wrote: v[k] will also be expanded to IFN VEC_SET if k is long type when built with -Og. -O0 didn't exposed the issue due to v is TREE_ADDRESSABLE, -O1 and above also didn't capture it because of v[k] is not optimized to VIEW_CONVERT_EXPR(v)[k_1]. vec_insert defines the element argument type to be signed int by ELFv2 ABI, so convert it to SImode if it wasn't for Power target requirements. gcc/ChangeLog: 2021-02-03 Xionghu Luo * config/rs6000/rs6000.c (rs6000_expand_vector_set): Convert elt_rtx to SImode if it wasn't. gcc/testsuite/ChangeLog: 2021-02-03 Xionghu Luo * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 17 ++--- gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 +++ 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ec068c58aa5..9f7f8da56c6 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7000,8 +7000,6 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); rtx tmp = gen_reg_rtx (GET_MODE (idx)); @@ -7047,8 +7045,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); @@ -7144,7 +7140,7 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); rtx reg = gen_reg_rtx (mode); - rtx mask, mem, x; + rtx mask, mem, x, elt_si; int width = GET_MODE_SIZE (inner_mode); int i; @@ -7154,16 +7150,23 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { if (!CONST_INT_P (elt_rtx)) { + /* elt_rtx should be SImode from ELFv2 ABI. */ + elt_si = gen_reg_rtx (E_SImode); + if (GET_MODE (elt_rtx) != E_SImode) + convert_move (elt_si, elt_rtx, 0); + else + elt_si = elt_rtx; + /* For V2DI/V2DF, could leverage the P9 version to generate xxpermdi when elt_rtx is variable. */ if ((TARGET_P9_VECTOR && TARGET_POWERPC64) || width == 8) { - rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p9 (target, val, elt_si); return; } else if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) { - rs6000_expand_vector_set_var_p8 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p8 (target, val, elt_si); return; } } diff --git a/gcc/testsuite/gcc.target/powerpc/pr98914.c b/gcc/testsuite/gcc.target/powerpc/pr98914.c new file mode 100644 index 000..e4d78e3e6b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr98914.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-Og -mvsx" } */ + +vector int +foo (vector int v) +{ + for (long k = 0; k < 1; ++k) +v[k] = 0; + return v; +} -- Thanks, Xionghu
[PATCH v2] rs6000: Convert the vector element register to SImode [PR98914]
vec_insert defines the element argument type to be signed int by ELFv2 ABI, When expanding a vector with a variable rtx, convert the rtx type SImode. gcc/ChangeLog: 2021-02-24 Xionghu Luo PR target/98914 * config/rs6000/rs6000.c (rs6000_expand_vector_set): Convert elt_rtx to SImode. (rs6000_expand_vector_set_var_p9): Remove assert. (rs6000_expand_vector_set_var_p8): Likewise. gcc/testsuite/ChangeLog: 2021-02-24 Xionghu Luo * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 17 ++--- gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 +++ 2 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ec068c58aa5..9f7f8da56c6 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7000,8 +7000,6 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); rtx tmp = gen_reg_rtx (GET_MODE (idx)); @@ -7047,8 +7045,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); @@ -7144,7 +7140,7 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); rtx reg = gen_reg_rtx (mode); - rtx mask, mem, x; + rtx mask, mem, x, elt_si; int width = GET_MODE_SIZE (inner_mode); int i; @@ -7154,16 +7150,23 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { if (!CONST_INT_P (elt_rtx)) { + /* elt_rtx should be SImode from ELFv2 ABI. */ + elt_si = gen_reg_rtx (E_SImode); + if (GET_MODE (elt_rtx) != E_SImode) + convert_move (elt_si, elt_rtx, 0); + else + elt_si = elt_rtx; + /* For V2DI/V2DF, could leverage the P9 version to generate xxpermdi when elt_rtx is variable. */ if ((TARGET_P9_VECTOR && TARGET_POWERPC64) || width == 8) { - rs6000_expand_vector_set_var_p9 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p9 (target, val, elt_si); return; } else if (TARGET_P8_VECTOR && TARGET_DIRECT_MOVE_64BIT) { - rs6000_expand_vector_set_var_p8 (target, val, elt_rtx); + rs6000_expand_vector_set_var_p8 (target, val, elt_si); return; } } diff --git a/gcc/testsuite/gcc.target/powerpc/pr98914.c b/gcc/testsuite/gcc.target/powerpc/pr98914.c new file mode 100644 index 000..e4d78e3e6b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr98914.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-Og -mvsx" } */ + +vector int +foo (vector int v) +{ + for (long k = 0; k < 1; ++k) +v[k] = 0; + return v; +} -- 2.25.1
Re: [PATCH v2] rs6000: Convert the vector element register to SImode [PR98914]
On 2021/2/25 00:57, Segher Boessenkool wrote: > Hi! > > On Wed, Feb 24, 2021 at 09:06:24AM +0800, Xionghu Luo wrote: >> vec_insert defines the element argument type to be signed int by ELFv2 >> ABI, When expanding a vector with a variable rtx, convert the rtx type >> SImode. > > But that is true for the intrinsics, not for all other callers of > rs6000_expand_vector_init. See > <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98914#c2> as well? > > So I don't think you do this in the right place. You can convince me > with good arguments of course :-) Thanks for pointing out, it seems we should convert the type to DImode in rs6000_expand_vector_set_var_p9 and rs6000_expand_vector_set_var_p8 to support both usage? PS: for "vec_insert (i, u, n)" usage when n is long, what should the front-end do in altivec_resolve_overloaded_builtin to follow the ELFv2 rule? Currently, no warning/error message or conversion there, INTEGRAL_TYPE_P range is much larger than signed int. gcc/config/rs6000/rs6000-c.c altivec_resolve_overloaded_builtin { ... if (!INTEGRAL_TYPE_P (TREE_TYPE (arg2))) goto bad; ... } Updated the back-end patch as below. 0001-rs6000-Convert-the-vector-set-variable-idx-to-DImode.patch vec_insert defines the element argument type to be signed int by ELFv2 ABI. When expanding a vector with a variable rtx, convert the rtx type to DImode to support both intrinsic usage and other callers from rs6000_expand_vector_init produced by v[k] = val when k is long type. gcc/ChangeLog: 2021-02-25 Xionghu Luo PR target/98914 * config/rs6000/rs6000.c (rs6000_expand_vector_set_var_p9): Convert idx to DImode. (rs6000_expand_vector_set_var_p8): Likewise. gcc/testsuite/ChangeLog: 2021-02-25 Xionghu Luo PR target/98914 * gcc.target/powerpc/pr98914.c: New test. --- gcc/config/rs6000/rs6000.c | 33 +- gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 2 files changed, 30 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index ec068c58aa5..48eb91132a9 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7000,11 +7000,15 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); - rtx tmp = gen_reg_rtx (GET_MODE (idx)); + machine_mode idx_mode = GET_MODE (idx); + rtx tmp = gen_reg_rtx (DImode); + if (idx_mode != DImode) +tmp = convert_modes (DImode, idx_mode, idx, 0); + else +tmp = idx; + int width = GET_MODE_SIZE (inner_mode); gcc_assert (width >= 1 && width <= 8); @@ -7012,9 +7016,7 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) int shift = exact_log2 (width); /* Generate the IDX for permute shift, width is the vector element size. idx = idx * width. */ - emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (shift))); - - tmp = convert_modes (DImode, SImode, tmp, 1); + emit_insn (gen_ashldi3 (tmp, tmp, GEN_INT (shift))); /* lvsrv1,0,idx. 
*/ rtx pcvr = gen_reg_rtx (V16QImode); @@ -7047,27 +7049,31 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); - gcc_assert (GET_MODE (idx) == E_SImode); - machine_mode inner_mode = GET_MODE (val); HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); - rtx tmp = gen_reg_rtx (GET_MODE (idx)); int width = GET_MODE_SIZE (inner_mode); + machine_mode idx_mode = GET_MODE (idx); + rtx tmp = gen_reg_rtx (DImode); + if (idx_mode != DImode) +tmp = convert_modes (DImode, idx_mode, idx, 0); + else +tmp = idx; + gcc_assert (width >= 1 && width <= 4); if (!BYTES_BIG_ENDIAN) { /* idx = idx * width. */ - emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); + emit_insn (gen_muldi3 (tmp, tmp, GEN_INT (width))); /* idx = idx + 8. */ - emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (8))); + emit_insn (gen_adddi3 (tmp, tmp, GEN_INT (8))); } else { - emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); - emit_insn (gen_subsi3 (tmp, GEN_INT (24 - width), tmp)); + emit_insn (gen_muldi3 (tmp, idx, GEN_INT (width))); + emit_insn (gen_subdi3 (tmp, GEN_INT (24 - width), tmp)); } /* lxv vs33, mask. @@ -7118,7 +7124,6 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) emit_insn (gen_rtx_SET (val_v16qi, sub_val)); /* lvsl13,0,idx. */ - tmp = convert_modes (DImode, SImode, tmp, 1); rtx pcv = gen_reg_rtx (V16QImode); emit_insn (gen_altivec_lvsl_reg (pcv, tmp)); diff --git a/gc
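As a concrete illustration of the front-end question above about "vec_insert (i, u, n)" with a long n, the source-level usage is simply the following (a hedged example, not taken from the testsuite; the function name is made up and it needs -maltivec/-mvsx to compile):

#include <altivec.h>

vector int
insert_with_long_index (vector int u, int i, long n)
{
  /* The ELFv2 ABI documents the element number of vec_insert as signed
     int; a 'long' n reaches the builtin through the usual implicit
     conversion.  */
  return vec_insert (i, u, n);
}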
Ping: [PATCH v2] rs6000: Convert the vector element register to SImode [PR98914]
On 2021/2/25 14:33, Xionghu Luo via Gcc-patches wrote: > > > On 2021/2/25 00:57, Segher Boessenkool wrote: >> Hi! >> >> On Wed, Feb 24, 2021 at 09:06:24AM +0800, Xionghu Luo wrote: >>> vec_insert defines the element argument type to be signed int by ELFv2 >>> ABI, When expanding a vector with a variable rtx, convert the rtx type >>> SImode. >> >> But that is true for the intrinsics, not for all other callers of >> rs6000_expand_vector_init. See >> <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98914#c2> as well? >> >> So I don't think you do this in the right place. You can convince me >> with good arguments of course :-) > > Thanks for pointing out, it seems we should convert the type to DImode in > rs6000_expand_vector_set_var_p9 and rs6000_expand_vector_set_var_p8 > to support both usage? > > > PS: for "vec_insert (i, u, n)" usage when n is long, what should the front-end > do in altivec_resolve_overloaded_builtin to follow the ELFv2 rule? Currently, > no warning/error message or conversion there, INTEGRAL_TYPE_P range is much > larger > than signed int. long to int should follow implicit transformation, so no need change here. Ping the patch, thanks. BR, Xionghu > > > > Updated the back-end patch as below. > > > 0001-rs6000-Convert-the-vector-set-variable-idx-to-DImode.patch > > > vec_insert defines the element argument type to be signed int by ELFv2 > ABI. When expanding a vector with a variable rtx, convert the rtx type > to DImode to support both intrinsic usage and other callers from > rs6000_expand_vector_init produced by v[k] = val when k is long type. > > gcc/ChangeLog: > > 2021-02-25 Xionghu Luo > > PR target/98914 > * config/rs6000/rs6000.c (rs6000_expand_vector_set_var_p9): > Convert idx to DImode. > (rs6000_expand_vector_set_var_p8): Likewise. > > gcc/testsuite/ChangeLog: > > 2021-02-25 Xionghu Luo > > PR target/98914 > * gcc.target/powerpc/pr98914.c: New test. > --- > gcc/config/rs6000/rs6000.c | 33 +- > gcc/testsuite/gcc.target/powerpc/pr98914.c | 11 > 2 files changed, 30 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr98914.c > > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c > index ec068c58aa5..48eb91132a9 100644 > --- a/gcc/config/rs6000/rs6000.c > +++ b/gcc/config/rs6000/rs6000.c > @@ -7000,11 +7000,15 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, > rtx idx) > > gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); > > - gcc_assert (GET_MODE (idx) == E_SImode); > - > machine_mode inner_mode = GET_MODE (val); > > - rtx tmp = gen_reg_rtx (GET_MODE (idx)); > + machine_mode idx_mode = GET_MODE (idx); > + rtx tmp = gen_reg_rtx (DImode); > + if (idx_mode != DImode) > +tmp = convert_modes (DImode, idx_mode, idx, 0); > + else > +tmp = idx; > + > int width = GET_MODE_SIZE (inner_mode); > > gcc_assert (width >= 1 && width <= 8); > @@ -7012,9 +7016,7 @@ rs6000_expand_vector_set_var_p9 (rtx target, rtx val, > rtx idx) > int shift = exact_log2 (width); > /* Generate the IDX for permute shift, width is the vector element size. >idx = idx * width. */ > - emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (shift))); > - > - tmp = convert_modes (DImode, SImode, tmp, 1); > + emit_insn (gen_ashldi3 (tmp, tmp, GEN_INT (shift))); > > /* lvsrv1,0,idx. 
*/ > rtx pcvr = gen_reg_rtx (V16QImode); > @@ -7047,27 +7049,31 @@ rs6000_expand_vector_set_var_p8 (rtx target, rtx val, > rtx idx) > > gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); > > - gcc_assert (GET_MODE (idx) == E_SImode); > - > machine_mode inner_mode = GET_MODE (val); > HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); > > - rtx tmp = gen_reg_rtx (GET_MODE (idx)); > int width = GET_MODE_SIZE (inner_mode); > > + machine_mode idx_mode = GET_MODE (idx); > + rtx tmp = gen_reg_rtx (DImode); > + if (idx_mode != DImode) > +tmp = convert_modes (DImode, idx_mode, idx, 0); > + else > +tmp = idx; > + > gcc_assert (width >= 1 && width <= 4); > > if (!BYTES_BIG_ENDIAN) > { > /* idx = idx * width. */ > - emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); > + emit_insn (gen_muldi3 (tmp, tmp, GEN_INT (width))); > /* idx = idx + 8. */ > - emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (8))); > + emit_insn (ge
[PATCH] Fix loop split incorrect count and probability
loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. Regression tested pass, OK for master? diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] - [local count: 315357972]: + [local count: 104068130]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [INV] +goto ; [67.00%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [count: 0]: + [local count: 70429947]: # i_23 = PHI # prephitmp_25 = PHI - [local count: 955630225]: + [local count: 640272252]: # i_15 = PHI # prephitmp_16 = PHI i_22 = inc (i_15); if (n_7(D) > i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3a09bbc39e5..8e5a7ded0f7 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -583,10 +583,10 @@ split_loop (class loop *loop1) basic_block cond_bb; class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); gcc_assert (loop2); @@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, -profile_probability::always (), -profile_probability::never (), -profile_probability::always (), -profile_probability::always (), +invar_branch->probability.invert (), +invar_branch->probability, +invar_branch->probability.invert (), +invar_branch->probability, true); if (!loop2) { -- 2.25.1
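For context, the dump above comes from a loop of roughly the following shape (a sketch in the spirit of the loop-cond-split-1.c testcase, not a verbatim copy). The condition on ga is semi-invariant: it can only flip once during the loop because ga is only rewritten on the taken path, which is what do_split_loop_on_cond keys on when it creates the two loop versions whose counts and probabilities this patch fixes.

extern int inc (int);
extern int do_something (void);
int ga;

void
test (int n)
{
  for (int i = 0; n > i; i = inc (i))
    {
      /* Semi-invariant: once ga becomes zero the branch is never taken
         again, so ga cannot become nonzero inside the loop.  */
      if (ga)
        ga = do_something ();
    }
}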
Re: [PATCH] Fix loop split incorrect count and probability
I' like to split this patch: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html to two patches: 0001-Fix-loop-split-incorrect-count-and-probability.patch 0002-Don-t-move-cold-code-out-of-loop-by-checking-bb-coun.patch since they are solving two different things, please help to review the attached series. They show obvious performance improvement on both P8 and P9 for CPU2017, and I am not sure how it will affect other platforms like X86 and AArch64, it will be grateful if someone could try it. Thanks. Xionghu From 4e1ef5b1f423484a6789750e7cc0cf2e94517f20 Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Tue, 3 Aug 2021 03:44:14 -0500 Subject: [PATCH 1/2] Fix loop split incorrect count and probability loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. Regression tested pass, OK for master? diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] - [local count: 315357972]: + [local count: 104068130]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [INV] +goto ; [67.00%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [count: 0]: + [local count: 70429947]: # i_23 = PHI # prephitmp_25 = PHI - [local count: 955630225]: + [local count: 640272252]: # i_15 = PHI # prephitmp_16 = PHI i_22 = inc (i_15); if (n_7(D) > i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 25 - 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3f6ad046623..d30782888f3 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -575,7 +575,11 @@ split_loop (class loop *loop1) stmts2); tree cond = build2 (guard_code, boolean_type_node, guard_init, border); if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); /* Now version the loop, placing loop2 after loop1 connecting them, and fix up SSA form for that. 
*/ @@ -583,10 +587,10 @@ split_loop (class loop *loop1) basic_block cond_bb; class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); gcc_assert (loop2); @@ -1486,10 +1490,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, -profile_probability::always (), -profile_probability::never (), -profile_probability::always (), -profile_probability::always (), +invar_branch->probability.invert (), +
Re: [PATCH] Fix loop split incorrect count and probability
Thanks, On 2021/8/6 19:46, Richard Biener wrote: > On Tue, 3 Aug 2021, Xionghu Luo wrote: > >> loop split condition is moved between loop1 and loop2, the split bb's >> count and probability should also be duplicated instead of (100% vs INV), >> secondly, the original loop1 and loop2 count need be propotional from the >> original loop. >> >> >> diff base/loop-cond-split-1.c.151t.lsplit >> patched/loop-cond-split-1.c.151t.lsplit: >> ... >> int prephitmp_16; >> int prephitmp_25; >> >> [local count: 118111600]: >> if (n_7(D) > 0) >> goto ; [89.00%] >> else >> goto ; [11.00%] >> >> [local count: 118111600]: >> return; >> >> [local count: 105119324]: >> pretmp_3 = ga; >> >> - [local count: 955630225]: >> + [local count: 315357973]: >> # i_13 = PHI >> # prephitmp_12 = PHI >> if (prephitmp_12 != 0) >> goto ; [33.00%] >> else >> goto ; [67.00%] >> >> - [local count: 315357972]: >> + [local count: 104068130]: >> _2 = do_something (); >> ga = _2; >> >> - [local count: 955630225]: >> + [local count: 315357973]: >> # prephitmp_5 = PHI >> i_10 = inc (i_13); >> if (n_7(D) > i_10) >> goto ; [89.00%] >> else >> goto ; [11.00%] >> >> [local count: 105119324]: >> goto ; [100.00%] >> >> - [local count: 850510901]: >> + [local count: 280668596]: >> if (prephitmp_12 != 0) >> -goto ; [100.00%] >> +goto ; [33.00%] >> else >> -goto ; [INV] >> +goto ; [67.00%] >> >> - [local count: 850510901]: >> + [local count: 280668596]: >> goto ; [100.00%] >> >> - [count: 0]: >> + [local count: 70429947]: >> # i_23 = PHI >> # prephitmp_25 = PHI >> >> - [local count: 955630225]: >> + [local count: 640272252]: >> # i_15 = PHI >> # prephitmp_16 = PHI >> i_22 = inc (i_15); >> if (n_7(D) > i_22) >> goto ; [89.00%] >> else >> goto ; [11.00%] >> >> - [local count: 850510901]: >> + [local count: 569842305]: >> goto ; [100.00%] >> >> } >> >> gcc/ChangeLog: >> >> * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. >> (do_split_loop_on_cond): Likewise. >> --- >> gcc/tree-ssa-loop-split.c | 16 >> 1 file changed, 8 insertions(+), 8 deletions(-) >> >> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >> index 3a09bbc39e5..8e5a7ded0f7 100644 >> --- a/gcc/tree-ssa-loop-split.c >> +++ b/gcc/tree-ssa-loop-split.c >> @@ -583,10 +583,10 @@ split_loop (class loop *loop1) >> basic_block cond_bb; if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); >> >> class loop *loop2 = loop_version (loop1, cond, &cond_bb, >> - profile_probability::always (), >> - profile_probability::always (), >> - profile_probability::always (), >> - profile_probability::always (), >> + true_edge->probability, >> + true_edge->probability.invert (), >> + true_edge->probability, >> + true_edge->probability.invert (), >> true); > > there is no 'true_edge' variable at this point. Sorry, missed the above hunk when split the patch. > >> gcc_assert (loop2); >> >> @@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge >> invar_branch) >> initialize_original_copy_tables (); >> >> struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, >> - profile_probability::always (), >> - profile_probability::never (), >> - profile_probability::always (), >> -
Re: [RFC] Don't move cold code out of loop by checking bb count
Hi, On 2021/8/6 20:15, Richard Biener wrote: > On Mon, Aug 2, 2021 at 7:05 AM Xiong Hu Luo wrote: >> >> There was a patch trying to avoid move cold block out of loop: >> >> https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html >> >> Richard suggested to "never hoist anything from a bb with lower execution >> frequency to a bb with higher one in LIM invariantness_dom_walker >> before_dom_children". >> >> This patch does this profile count check in both gimple LIM >> move_computations_worker and RTL loop-invariant.c find_invariants_bb, >> if the loop bb is colder than loop preheader, don't hoist it out of >> loop. >> >> Also, the profile count in loop split pass should be corrected to avoid >> lim2 and lim4 mismatch behavior, currently, the new loop preheader generated >> by loop_version is set to "[count: 0]:", then lim4 after lsplt pass will >> move statement out of loop unexpectely when lim2 didn't move it. This >> change could fix regression on 544.nab_r from -1.55% to +0.46%. >> >> SPEC2017 performance evaluation shows 1% performance improvement for >> intrate GEOMEAN and no obvious regression for others. Especially, >> 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is >> largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% >> on P8LE. >> >> Regression and bootstrap tested pass on P8LE, any comments? Thanks. > > While I'm not familiar with the RTL invariant motion pass the patch there > looks reasonable. Note that we should assess the profile quality > somehow - I'm not sure how to do that, CCed Honza for that. Thanks. > > For the GIMPLE part the patch looks quite complicated - but note it > probably has to be since LIM performs kind of a "CSE" on loads > (and stores for store-motion), so when there are multiple stmts > affected by a hoisting decision the biggest block count has to be > accounted. Likewise when there are dependent stmts involved > that might include conditional stmts (a "PHI"), but the overall > cost should be looked at. Currently, The gimple code check two situations with the patch: 1) The statement or PHI‘s BB is *colder* then preheader, don't move it out of loop; 2) The statement or PHI's BB is *hotter* then preheader, but any of it's rhs couldn't be moved out of loop, also don't move it out of loop to avoid definition not dominates use error. May be I could collect the number of instructions not hoisted with the patch on regression tests and SPEC2017 to do a estimation for "multiple stmts affected" and "overall cost" need to be considered? But it seems move_computations_worker couldn't rollback if we still want to hoist multiple stmts out during the iterations? > > Now - GIMPLE LIM "costing" is somewhat backward right now > and it isn't set up to consider those multiple involved stmts. Plus > the store-motion part does not have any cost part (but it depends > on previously decided invariant motions). > > I think the way you implemented the check will cause no hoisting > to be performed instead of, say, hoisting to a different loop level > only. Possibly shown when you consider a loop nest like > >for (;;) > if (unlikely_cond) >for (;;) > invariant; > > we want to hoist 'invariant' but only from the inner loop even if it > is invariant also in the outer loop. 
For this case, theoretically I think the master GCC will optimize it to:

   invariant;
   for (;;)
     if (unlikely_cond)
       for (;;)
         ;

'invariant' is moved out of the outer loop, but with the patch, it will get:

   for (;;)
     if (unlikely_cond)
       {
         invariant;
         for (;;)
           ;
       }

'invariant' is *cold* for the outer loop, but it is still *hot* for the inner loop, so it is only hoisted out of the inner loop, which is exactly what we want, right?

> But for example if there is
> a store motion opportunity like
>
>    for (;;)
>      {
>        if (unlikely_cond)
>          for (;;)
>            a = ...;
>        a = ...;
>      }
>
> we'd still want to perform the store motion on the outer loop.
>
> Note that store-motion already performs part of the transform
> before dependent code is moved in move_computations (that
> you patched).

Yes.  do_store_motion runs before move_computations_worker; store motion happens earlier in execute_sm, and I also added a check in execute_sm to stop cold stores from being moved out of the loop.  So for your case, I think my patch will similarly optimize it to:

   for (;;)
     {
       if (unlikely_cond)
         {
           for (;;)
             ;
           a = ...;
         }
     }
   a = ...;

Is this better?  I will construct cases to verify it.

> IIRC your main concern were the COND_EXPRs we insert
> for hoisted conditional stmts?

I'm not sure what you mean by COND_EXPRs here?

Thanks,
Xionghu

> Thanks,
> Richard.
>
>> gcc/ChangeLog:
>>
>>      * loop-invariant.c (find_invariants_bb): Check profile count
>>      before motion.
>>      (find_invar
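A compilable variant of the shape being discussed (my own sketch, not from the patch's testsuite), where the invariant computation sits under an unlikely condition and should only be hoisted to the inner loop's preheader, not above the outer loop:

	extern int a, b;

	void
	g (int n, int unlikely_cond, int *out)
	{
	  for (int i = 0; i < n; i++)
	    if (__builtin_expect (unlikely_cond, 0))
	      for (int j = 0; j < n; j++)
	        out[j] = a * b;   /* a * b is invariant in both loops */
	}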
Re: [PATCH] Fix loop split incorrect count and probability
On 2021/8/10 22:47, Richard Biener wrote: > On Mon, 9 Aug 2021, Xionghu Luo wrote: > >> Thanks, >> >> On 2021/8/6 19:46, Richard Biener wrote: >>> On Tue, 3 Aug 2021, Xionghu Luo wrote: >>> >>>> loop split condition is moved between loop1 and loop2, the split bb's >>>> count and probability should also be duplicated instead of (100% vs INV), >>>> secondly, the original loop1 and loop2 count need be propotional from the >>>> original loop. >>>> >>>> >>>> diff base/loop-cond-split-1.c.151t.lsplit >>>> patched/loop-cond-split-1.c.151t.lsplit: >>>> ... >>>> int prephitmp_16; >>>> int prephitmp_25; >>>> >>>> [local count: 118111600]: >>>> if (n_7(D) > 0) >>>>goto ; [89.00%] >>>> else >>>>goto ; [11.00%] >>>> >>>> [local count: 118111600]: >>>> return; >>>> >>>> [local count: 105119324]: >>>> pretmp_3 = ga; >>>> >>>> - [local count: 955630225]: >>>> + [local count: 315357973]: >>>> # i_13 = PHI >>>> # prephitmp_12 = PHI >>>> if (prephitmp_12 != 0) >>>>goto ; [33.00%] >>>> else >>>>goto ; [67.00%] >>>> >>>> - [local count: 315357972]: >>>> + [local count: 104068130]: >>>> _2 = do_something (); >>>> ga = _2; >>>> >>>> - [local count: 955630225]: >>>> + [local count: 315357973]: >>>> # prephitmp_5 = PHI >>>> i_10 = inc (i_13); >>>> if (n_7(D) > i_10) >>>>goto ; [89.00%] >>>> else >>>>goto ; [11.00%] >>>> >>>> [local count: 105119324]: >>>> goto ; [100.00%] >>>> >>>> - [local count: 850510901]: >>>> + [local count: 280668596]: >>>> if (prephitmp_12 != 0) >>>> -goto ; [100.00%] >>>> +goto ; [33.00%] >>>> else >>>> -goto ; [INV] >>>> +goto ; [67.00%] >>>> >>>> - [local count: 850510901]: >>>> + [local count: 280668596]: >>>> goto ; [100.00%] >>>> >>>> - [count: 0]: >>>> + [local count: 70429947]: >>>> # i_23 = PHI >>>> # prephitmp_25 = PHI >>>> >>>> - [local count: 955630225]: >>>> + [local count: 640272252]: >>>> # i_15 = PHI >>>> # prephitmp_16 = PHI >>>> i_22 = inc (i_15); >>>> if (n_7(D) > i_22) >>>>goto ; [89.00%] >>>> else >>>>goto ; [11.00%] >>>> >>>> - [local count: 850510901]: >>>> + [local count: 569842305]: >>>> goto ; [100.00%] >>>> >>>>} >>>> >>>> gcc/ChangeLog: >>>> >>>>* tree-ssa-loop-split.c (split_loop): Fix incorrect probability. >>>>(do_split_loop_on_cond): Likewise. >>>> --- >>>>gcc/tree-ssa-loop-split.c | 16 >>>>1 file changed, 8 insertions(+), 8 deletions(-) >>>> >>>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >>>> index 3a09bbc39e5..8e5a7ded0f7 100644 >>>> --- a/gcc/tree-ssa-loop-split.c >>>> +++ b/gcc/tree-ssa-loop-split.c >>>> @@ -583,10 +583,10 @@ split_loop (class loop *loop1) >>>>basic_block cond_bb; >> >> if (!initial_true) >> - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); >> + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); >> + >> +edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE >> + ? EDGE_SUCC (bbs[i], 0) >> + : EDGE_SUCC (bbs[i], 1); >> >>>> >>>>class loop *loop2 = loop_version (loop1, cond, &cond_bb, >>>> - profile_probability::always (), >>>> - profile_probability::always (), >>>> - profile_probability::always (), >>
Re: [PATCH] Fix loop split incorrect count and probability
On 2021/8/11 17:16, Richard Biener wrote: On Wed, 11 Aug 2021, Xionghu Luo wrote: On 2021/8/10 22:47, Richard Biener wrote: On Mon, 9 Aug 2021, Xionghu Luo wrote: Thanks, On 2021/8/6 19:46, Richard Biener wrote: On Tue, 3 Aug 2021, Xionghu Luo wrote: loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] - [local count: 315357972]: + [local count: 104068130]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [INV] +goto ; [67.00%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [count: 0]: + [local count: 70429947]: # i_23 = PHI # prephitmp_25 = PHI - [local count: 955630225]: + [local count: 640272252]: # i_15 = PHI # prephitmp_16 = PHI i_22 = inc (i_15); if (n_7(D) > i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3a09bbc39e5..8e5a7ded0f7 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -583,10 +583,10 @@ split_loop (class loop *loop1) basic_block cond_bb; if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); there is no 'true_edge' variable at this point. Sorry, missed the above hunk when split the patch. gcc_assert (loop2); @@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, -profile_probability::always (), -profile_probability::never (), -profile_probability::always (), -profile_probability::always (), +invar_branch->probability.invert (), +invar_branch->probability, +invar_branch->probability.invert (), +invar_branch->probability, true); if (!loop2) { The patch introduction seems to talk about do_split_loop_on_cond only. split_loop faces similar issue though it sets the two branches to 100% vs 100% and no scaling which seems also incorrect. 
Since loop versioning inserts a condition with the passed probabilities but in this case a 'boolean_true_node' condition the th
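As a quick sanity check of the scaled counts in the lsplit dump quoted earlier (a standalone sketch; the pass uses fixed-point profile_probability arithmetic, so the exact values differ by a small rounding error):

	#include <stdio.h>

	int
	main (void)
	{
	  long long orig = 955630225;            /* original loop body count */
	  printf ("%lld\n", orig * 33 / 100);    /* ~315357974: loop1 body (33%) */
	  printf ("%lld\n", orig * 67 / 100);    /* ~640272250: loop2 body (67%) */
	  return 0;
	}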
Re: [PATCH] Fix incorrect computation in fill_always_executed_in_1
Hi, On 2021/8/16 19:46, Richard Biener wrote: On Mon, 16 Aug 2021, Xiong Hu Luo wrote: It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for nested loops. inn_loop is updated to inner loop, so it need be restored when exiting from innermost loop. With this patch, the store instruction in outer loop could also be moved out of outer loop by store motion. Any comments? Thanks. gcc/ChangeLog: * tree-ssa-loop-im.c (fill_always_executed_in_1): Restore inn_loop when exiting from innermost loop. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/ssa-lim-19.c: New test. --- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++ gcc/tree-ssa-loop-im.c | 6 +- 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c new file mode 100644 index 000..097a5ee4a4b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c @@ -0,0 +1,24 @@ +/* PR/101293 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ + +struct X { int i; int j; int k;}; + +void foo(struct X *x, int n, int l) +{ + for (int j = 0; j < l; j++) +{ + for (int i = 0; i < n; ++i) + { + int *p = &x->j; + int tem = *p; + x->j += tem * i; + } + int *r = &x->k; + int tem2 = *r; + x->k += tem2 * j; +} +} + +/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } } */ + diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index b24bc64f2a7..5ca4738b20e 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; + if (inn_loop != loop + && flow_loop_nested_p (bb->loop_father, inn_loop)) + inn_loop = bb->loop_father; + The comment says /* In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it. */ inn_loop = bb->loop_father; and your change would defeat that early return, no? The issue is the search method exits too early when iterating the outer loop. For example of a nested loop, loop 1 includes 5,8,3,10,4,9 and loop2 includes 3,10. Currently, it breaks when bb is 3 as bb 3 doesn't dominate bb 9 of loop 1. But actually, both bb 5 and bb 4 are ALWAYS_EXECUTED for loop 1, so if there are store instructions in bb 4 they won't be processed by store motion again. 5< |\ | 8 \ 9 | \ | --->3--->4 || 10---| SET_ALWAYS_EXECUTED_IN is only set to bb 5 on master code now, with this patch, it will continue search when meet bb 3 until bb 4, then last is updated to bb 4, it will break until exit edge is found at bb 4 by "if (!flow_bb_inside_loop_p (loop, e->dest))". Then the followed loop code will set bb 4 as ALWAYS_EXEUCTED and all it's idoms bb 5. while (1) { SET_ALWAYS_EXECUTED_IN (last, loop); if (last == loop->header) break; last = get_immediate_dominator (CDI_DOMINATORS, last); } After further discussion with Kewen, we found that the inn_loop variable is totally useless and could be removed. if (bitmap_bit_p (contains_call, bb->index)) break; @@ -3238,7 +3242,7 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (bb->loop_father->header == bb) { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) + if (!dominated_by_p (CDI_DOMINATORS, bb->loop_father->latch, bb)) break; That's now a always false condition - a loops latch is always dominated by its header. 
The condition as written tries to verify whether the loop is always entered - mind we visit all blocks, not only those always executed. Thanks for the catch! I am afraid the piece of code should be removed since it stops search of potential ALWAYS EXECUTED bb after inner loop... In fact for your testcase the x->j ref is _not_ always executed since the inner loop is conditional on n > 0. Yes. But I want to move x->k (not x->j) out of loop 1 when l > 0 in store-motion. Attached the diff file without and with my patch to show the extra optimization. x->j is already moved out of loop 2 on master code. If change n and l to constant numbers like 100, master code could also do 2 store motions as expected. The edge from bb 5 to bb 4 doesn't exist now, so bb 4, bb 3 and bb 5 are ALWAYS EXECUTED for loop 1. struct X { int i; int j; int k;}; void foo(struct X *x, int n, int l) { for (int j = 0; j < l; j++) // loop 1 { for (int i = 0; i < n; ++i) // loop 2 { int *p = &x->j;
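As noted above, with constant trip counts the current code already performs both store motions; a minimal variant of the testcase making that concrete (my own sketch, derived from the ssa-lim-19.c testcase in the patch):

	struct X { int i; int j; int k; };

	void
	foo_const (struct X *x)
	{
	  for (int j = 0; j < 100; j++)        /* loop 1, known to iterate */
	    {
	      for (int i = 0; i < 100; ++i)    /* loop 2, known to iterate */
	        x->j += x->j * i;
	      x->k += x->k * j;                /* candidate for outer store motion */
	    }
	}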
Re: [PATCH] Fix incorrect computation in fill_always_executed_in_1
On 2021/8/17 13:17, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/8/16 19:46, Richard Biener wrote: On Mon, 16 Aug 2021, Xiong Hu Luo wrote: It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for nested loops. inn_loop is updated to inner loop, so it need be restored when exiting from innermost loop. With this patch, the store instruction in outer loop could also be moved out of outer loop by store motion. Any comments? Thanks. gcc/ChangeLog: * tree-ssa-loop-im.c (fill_always_executed_in_1): Restore inn_loop when exiting from innermost loop. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/ssa-lim-19.c: New test. --- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++ gcc/tree-ssa-loop-im.c | 6 +- 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c new file mode 100644 index 000..097a5ee4a4b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c @@ -0,0 +1,24 @@ +/* PR/101293 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ + +struct X { int i; int j; int k;}; + +void foo(struct X *x, int n, int l) +{ + for (int j = 0; j < l; j++) + { + for (int i = 0; i < n; ++i) + { + int *p = &x->j; + int tem = *p; + x->j += tem * i; + } + int *r = &x->k; + int tem2 = *r; + x->k += tem2 * j; + } +} + +/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } } */ + diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index b24bc64f2a7..5ca4738b20e 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; + if (inn_loop != loop + && flow_loop_nested_p (bb->loop_father, inn_loop)) + inn_loop = bb->loop_father; + The comment says /* In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it. */ inn_loop = bb->loop_father; and your change would defeat that early return, no? The issue is the search method exits too early when iterating the outer loop. For example of a nested loop, loop 1 includes 5,8,3,10,4,9 and loop2 includes 3,10. Currently, it breaks when bb is 3 as bb 3 doesn't dominate bb 9 of loop 1. But actually, both bb 5 and bb 4 are ALWAYS_EXECUTED for loop 1, so if there are store instructions in bb 4 they won't be processed by store motion again. 5< |\ | 8 \ 9 | \ | --->3--->4 | | 10---| Correct the graph display: 5< |\ | 8 \ 9 | \ | --->3--->4 | | ---10 SET_ALWAYS_EXECUTED_IN is only set to bb 5 on master code now, with this patch, it will continue search when meet bb 3 until bb 4, then last is updated to bb 4, it will break until exit edge is found at bb 4 by "if (!flow_bb_inside_loop_p (loop, e->dest))". Then the followed loop code will set bb 4 as ALWAYS_EXEUCTED and all it's idoms bb 5. while (1) { SET_ALWAYS_EXECUTED_IN (last, loop); if (last == loop->header) break; last = get_immediate_dominator (CDI_DOMINATORS, last); } After further discussion with Kewen, we found that the inn_loop variable is totally useless and could be removed. 
if (bitmap_bit_p (contains_call, bb->index)) break; @@ -3238,7 +3242,7 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (bb->loop_father->header == bb) { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) + if (!dominated_by_p (CDI_DOMINATORS, bb->loop_father->latch, bb)) break; That's now a always false condition - a loops latch is always dominated by its header. The condition as written tries to verify whether the loop is always entered - mind we visit all blocks, not only those always executed. Thanks for the catch! I am afraid the piece of code should be removed since it stops search of potential ALWAYS EXECUTED bb after inner loop... In fact for your testcase the x->j ref is _not_ always executed since the inner loop is conditional on n > 0. Yes. But I want to move x->k (not x->j) out of loop 1 when l > 0 in store-motion. Attached the diff file without and with my patch to show the extra optimization. x->j is already moved out of loop 2 on master code. If change n and l to constant numbers like 100, master code could also do 2 store motions as expected. The edge from bb 5 to bb 4 does
[PATCH v2] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/17 15:12, Richard Biener wrote: > On Tue, 17 Aug 2021, Xionghu Luo wrote: > >> Hi, >> >> On 2021/8/16 19:46, Richard Biener wrote: >>> On Mon, 16 Aug 2021, Xiong Hu Luo wrote: >>> >>>> It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for >>>> nested loops. inn_loop is updated to inner loop, so it need be restored >>>> when exiting from innermost loop. With this patch, the store instruction >>>> in outer loop could also be moved out of outer loop by store motion. >>>> Any comments? Thanks. >>> >>>> gcc/ChangeLog: >>>> >>>> * tree-ssa-loop-im.c (fill_always_executed_in_1): Restore >>>> inn_loop when exiting from innermost loop. >>>> >>>> gcc/testsuite/ChangeLog: >>>> >>>>* gcc.dg/tree-ssa/ssa-lim-19.c: New test. >>>> --- >>>>gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++ >>>>gcc/tree-ssa-loop-im.c | 6 +- >>>>2 files changed, 29 insertions(+), 1 deletion(-) >>>>create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>> >>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>> new file mode 100644 >>>> index 000..097a5ee4a4b >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>> @@ -0,0 +1,24 @@ >>>> +/* PR/101293 */ >>>> +/* { dg-do compile } */ >>>> +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ >>>> + >>>> +struct X { int i; int j; int k;}; >>>> + >>>> +void foo(struct X *x, int n, int l) >>>> +{ >>>> + for (int j = 0; j < l; j++) >>>> +{ >>>> + for (int i = 0; i < n; ++i) >>>> + { >>>> +int *p = &x->j; >>>> +int tem = *p; >>>> +x->j += tem * i; >>>> + } >>>> + int *r = &x->k; >>>> + int tem2 = *r; >>>> + x->k += tem2 * j; >>>> +} >>>> +} >>>> + >>>> +/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } } >>>> */ >>>> + >>>> diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c >>>> index b24bc64f2a7..5ca4738b20e 100644 >>>> --- a/gcc/tree-ssa-loop-im.c >>>> +++ b/gcc/tree-ssa-loop-im.c >>>> @@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, sbitmap >>>> @@ contains_call) >>>> if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>>> last = bb; >>>>+ if (inn_loop != loop >>>> +&& flow_loop_nested_p (bb->loop_father, inn_loop)) >>>> + inn_loop = bb->loop_father; >>>> + >>> >>> The comment says >>> >>> /* In a loop that is always entered we may proceed anyway. >>>But record that we entered it and stop once we leave it. >>> */ >>> inn_loop = bb->loop_father; >>> >>> and your change would defeat that early return, no? >> >> The issue is the search method exits too early when iterating the outer >> loop. For example of a nested loop, loop 1 includes 5,8,3,10,4,9 >> and loop2 includes 3,10. Currently, it breaks when bb is 3 as bb 3 >> doesn't dominate bb 9 of loop 1. But actually, both bb 5 and bb 4 are >> ALWAYS_EXECUTED for loop 1, so if there are store instructions in bb 4 >> they won't be processed by store motion again. >> >> >> 5< >> |\ | >> 8 \ 9 >> | \ | >> --->3--->4 >> || >> 10---| >> >> >> SET_ALWAYS_EXECUTED_IN is only set to bb 5 on master code now, with this >> patch, it will continue search when meet bb 3 until bb 4, then last is >> updated >> to bb 4, it will break until exit edge is found at bb 4 by >> "if (!flow_bb_inside_loop_p (loop, e->dest))". Then the followed loop code >> will >> set bb 4 as ALWAYS_EXEUCTED and all it's idoms bb 5. >> >> >> while (1) >> { >>SET_ALWAYS_EXECUTED_IN (last, loop); >>if (last == loop->header) >> break; >>last =
Re: [PATCH v2] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/17 17:10, Xionghu Luo via Gcc-patches wrote: > > > On 2021/8/17 15:12, Richard Biener wrote: >> On Tue, 17 Aug 2021, Xionghu Luo wrote: >> >>> Hi, >>> >>> On 2021/8/16 19:46, Richard Biener wrote: >>>> On Mon, 16 Aug 2021, Xiong Hu Luo wrote: >>>> >>>>> It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for >>>>> nested loops. inn_loop is updated to inner loop, so it need be restored >>>>> when exiting from innermost loop. With this patch, the store instruction >>>>> in outer loop could also be moved out of outer loop by store motion. >>>>> Any comments? Thanks. >>>> >>>>> gcc/ChangeLog: >>>>> >>>>>* tree-ssa-loop-im.c (fill_always_executed_in_1): Restore >>>>>inn_loop when exiting from innermost loop. >>>>> >>>>> gcc/testsuite/ChangeLog: >>>>> >>>>> * gcc.dg/tree-ssa/ssa-lim-19.c: New test. >>>>> --- >>>>> gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++ >>>>> gcc/tree-ssa-loop-im.c | 6 +- >>>>> 2 files changed, 29 insertions(+), 1 deletion(-) >>>>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>>> >>>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>>> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>>> new file mode 100644 >>>>> index 000..097a5ee4a4b >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c >>>>> @@ -0,0 +1,24 @@ >>>>> +/* PR/101293 */ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -fdump-tree-lim2-details" } */ >>>>> + >>>>> +struct X { int i; int j; int k;}; >>>>> + >>>>> +void foo(struct X *x, int n, int l) >>>>> +{ >>>>> + for (int j = 0; j < l; j++) >>>>> +{ >>>>> + for (int i = 0; i < n; ++i) >>>>> + { >>>>> + int *p = &x->j; >>>>> + int tem = *p; >>>>> + x->j += tem * i; >>>>> + } >>>>> + int *r = &x->k; >>>>> + int tem2 = *r; >>>>> + x->k += tem2 * j; >>>>> +} >>>>> +} >>>>> + >>>>> +/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } >>>>> } >>>>> */ >>>>> + >>>>> diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c >>>>> index b24bc64f2a7..5ca4738b20e 100644 >>>>> --- a/gcc/tree-ssa-loop-im.c >>>>> +++ b/gcc/tree-ssa-loop-im.c >>>>> @@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, >>>>> sbitmap >>>>> @@ contains_call) >>>>>if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>>>> last = bb; >>>>> + if (inn_loop != loop >>>>> + && flow_loop_nested_p (bb->loop_father, inn_loop)) >>>>> + inn_loop = bb->loop_father; >>>>> + >>>> >>>> The comment says >>>> >>>> /* In a loop that is always entered we may proceed anyway. >>>> But record that we entered it and stop once we leave >>>> it. >>>> */ >>>> inn_loop = bb->loop_father; >>>> >>>> and your change would defeat that early return, no? >>> >>> The issue is the search method exits too early when iterating the outer >>> loop. For example of a nested loop, loop 1 includes 5,8,3,10,4,9 >>> and loop2 includes 3,10. Currently, it breaks when bb is 3 as bb 3 >>> doesn't dominate bb 9 of loop 1. But actually, both bb 5 and bb 4 are >>> ALWAYS_EXECUTED for loop 1, so if there are store instructions in bb 4 >>> they won't be processed by store motion again. >>> >>> >>> 5< >>> |\ | >>> 8 \ 9 >>> | \ | >>> --->3--->4 >>> || >>> 10---| >>> >>> >>> SET_ALWAYS_EXECUTED_IN is only set to bb 5 on master code now, with this >>> patch
[PATCH v2] Don't move cold code out of loop by checking bb count
On 2021/8/10 12:25, Ulrich Drepper wrote: > On Tue, Aug 10, 2021 at 4:03 AM Xionghu Luo via Gcc-patches > wrote: >> For this case, theorotically I think the master GCC will optimize it to: >> >>invariant; >>for (;;) >> if (unlikely_cond) >>for (;;) >> ; >> >> 'invariant' is moved out of outer loop, but with the patch, it will get: >> >>for (;;) >> if (unlikely_cond) >>{ >> invariant; >> for (;;) >> ; >>} >> >> 'invariant' is *cold* for outer loop, but it is still *hot* for inner loop, >> so hoist it out of inner loop, this is exactly what we want, right? > > Is relying on absolute numbers really what you want? If the > 'unlikely_cond' condition depends on the iteration count of the outer > loop the probability of it being true in each individual iteration can > be low (at least that's how I use unlikely) but the overall > probability of needing the code is higher 1 - (1 - p)^n if 'p' is the > probability of 'unlikely_cond' and 'n' is the number of iterations. > Assuming complete independence of the loop iterations, otherwise it's > rather an upper limit. > > At the very least I'd generate code like this: > >first = true; >for (;;) > if (unlikely_cond) >{ > if (first) >{ > invariant; > first = false; >} > for (;;) > ; >} > > If it's worth hoisting the code the the extra test and flag should be > small in cost in comparison. > > If 'unlikely_cond' does not in any way depend on the loop iteration > then I think your code generation is fine. Thanks for your good suggestion, I am also not sure whether it is necessary to do it this way:) But I found that even the first step of for (;;) if (unlikely_cond) { invariant; for (;;) ; } is not supported yet. So I added a new function *find_coldest_out_loop* to search the coldest function between outermost invariant loop and own loop in compute_invariantness to move invariant out to cold loop first: [PATCH v2] Don't move cold code out of loop by checking bb count There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". In gimple LIM analysis, add find_coldest_out_loop to move invariants to expected target loop, then in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if profile count check find the loop bb is colder than target loop preheader, don't hoist it out of loop. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. gcc/ChangeLog: * loop-invariant.c (find_invariants_bb): Check profile count before motion. (find_invariants_body): Add argument. * tree-ssa-loop-im.c (find_coldest_out_loop): New function. (outermost_invariant_loop): Use find_coldest_out_loop. (determine_max_movement): Likewise. (move_computations_worker): Check profile count before motion. (execute_sm): Likewise. (execute_sm_exit): Check pointer validness. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/recip-3.c: Adjust. * gcc.dg/tree-ssa/ssa-lim-16.c: New test. * gcc.dg/tree-ssa/ssa-lim-17.c: New test. 
--- gcc/loop-invariant.c | 10 +- gcc/tree-ssa-loop-im.c | 186 +++-- gcc/testsuite/gcc.dg/tree-ssa/recip-3.c| 2 +- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-16.c | 21 +++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-17.c | 26 +++ 5 files changed, 231 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-16.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-17.c diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c index fca0c2b24be..5c3be7bf0eb 100644 --- a/gcc/loop-invariant.c +++ b/gcc/loop-invariant.c @@ -1183,9 +1183,14 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed) call. */ static void -find_invariants_bb (basic_block bb, bool always_reached, bool always_executed) +find_invariants_bb (class loop *loop, ba
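A rough sketch of the find_coldest_out_loop idea described above, using hypothetical plain-C types rather than GCC's loop and profile_count structures: walk from the outermost loop the expression is invariant in down to the loop that contains it, and pick the level whose preheader is coldest as the hoisting target.

	struct sloop
	{
	  struct sloop *inner;       /* next loop on the path toward the stmt */
	  long preheader_count;      /* simplified stand-in for profile_count */
	};

	/* Hypothetical helper mirroring the idea only; not the GCC function.  */
	static struct sloop *
	coldest_out_loop_sketch (struct sloop *outermost_invariant,
	                         struct sloop *own)
	{
	  struct sloop *coldest = own;
	  for (struct sloop *l = outermost_invariant; l; l = l->inner)
	    {
	      if (l->preheader_count < coldest->preheader_count)
	        coldest = l;
	      if (l == own)
	        break;
	    }
	  return coldest;
	}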
Re: [PATCH v2] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/19 20:11, Richard Biener wrote: >> - class loop *inn_loop = loop; >> >> if (ALWAYS_EXECUTED_IN (loop->header) == NULL) >> { >> @@ -3232,19 +3231,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap >> contains_call) >> to disprove this if possible). */ >>if (bb->flags & BB_IRREDUCIBLE_LOOP) >> break; >> - >> - if (!flow_bb_inside_loop_p (inn_loop, bb)) >> -break; >> - >> - if (bb->loop_father->header == bb) >> -{ >> - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >> -break; >> - >> - /* In a loop that is always entered we may proceed anyway. >> - But record that we entered it and stop once we leave it. */ >> - inn_loop = bb->loop_father; >> -} >> } >> >> while (1) > I'm not sure this will work correct (I'm not sure how the existing > code makes it so either...). That said, I can't poke any hole > into the change. What I see is that definitely > >if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) > last = bb; > >if (bitmap_bit_p (contains_call, bb->index)) > break; > > doesn't work reliably since the DOM ordering will process blocks > A B and C in random order for > >for (;;) > { >if (cond) > { >A: foo (); > } >else B:; >C:; > } > > and thus we can end up setting 'last' to C_before_ processing > 'A' and thus arriving at the call foo () ... > > get_loop_body_in_dom_order does some "special sauce" but not > to address the above problem - but it might be that a subtle > issue like the above is the reason for the inner loop handling. > The inner loop block order does_not_ adhere to this "special sauce", > that is - the "Additionally, if a basic block s dominates > the latch, then only blocks dominated by s are be after it." > guarantee holds for the outer loop latch, not for the inner. > > Digging into the history of fill_always_executed_in_1 doesn't > reveal anything - the inner loop handling has been present > since introduction by Zdenek - but usually Zdenek has a reason > for doing things as he does;) Yes, this is really complicated usage, thanks for point it out. :) I constructed two cases to verify this with inner loop includes "If A; else B; C". Finding that fill_sons_in_loop in get_loop_body_in_dom_order will also checks whether the bb domintes outer loop’s latch, if C dominate outer loop’s latch, C is postponed, the access order is ABC, 'last' won’t be set to C if A or B contains call; Otherwise if C doesn’t dominate outer loop’s latch in fill_sons_in_loop, the access order is CAB, but 'last' also won’t be updated to C in fill_always_executed_in_1 since there is also dominate check, then if A or B contains call, it could break successfully. C won't be set to ALWAYS EXECUTED for both circumstance. > > Note it might be simply a measure against quadratic complexity, > esp. since with your patch we also dive into not always executed > subloops as you remove the > >if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) > break; > > check. I suggest to evaluate behavior of the patch on a testcase > like > > void foo (int n, int **k) > { >for (int i = 0; i < n; ++i) > if (k[0][i]) >for (int j = 0; j < n; ++j) > if (k[1][j]) >for (int l = 0; l < n; ++l) > if (k[2][l]) >... > } Theoretically the complexity is changing from L1(bbs) to L1(bbs)+L2(bbs)+L3(bbs)+…+Ln(bbs), so fill_always_executed_in_1's execution time is supposed to be increase from O(n) to O(n2)? The time should depend on loop depth and bb counts. I also drafted a test case has 73-depth loop function with 25 no-ipa function copies each compiled in lim2 and lim4 dependently. 
Total execution time of fill_always_executed_in_1 increases from 32ms to 58ms, almost doubled but not quadratic?  It seems reasonable that compile time gets longer since most bbs are checked more often, but it is a must to ensure the early break happens correctly at every loop level...  Though the number of loop nodes could be huge, loop depth will never be that large in actual code?

> I suspect you'll see quadratic behavior with your patch.  You
> should be at least able to preserve a check like
>
>    /* Do not process not always executed subloops to avoid
>       quadratic behavior.  */
>    if (bb->loop_father->header == bb
>        && !dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
>      break;
>
> which is of course not optimistic for cases like
>
>    for (..)
>      {
>        if (cond)
>          for (..)
>            x = 1; // this is always executed if the inner loop is finite
>      }
>
> but we need to have an eye on the complexity of this function.  I would
> have suggested to do greedy visiting of the loop header successors,
> proce
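For the compile-time measurements, a compilable three-level version of the nest sketched in the review (the innermost body is my own filler; the real stress test used a 73-depth nest and many function copies as described above):

	void
	foo (int n, int **k)
	{
	  for (int i = 0; i < n; ++i)
	    if (k[0][i])
	      for (int j = 0; j < n; ++j)
	        if (k[1][j])
	          for (int l = 0; l < n; ++l)
	            if (k[2][l])
	              k[0][i] += k[1][j] + k[2][l];   /* filler body */
	}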
Re: [PATCH v3] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/24 16:20, Richard Biener wrote: > On Tue, 24 Aug 2021, Xionghu Luo wrote: > >> >> >> On 2021/8/19 20:11, Richard Biener wrote: >>>> - class loop *inn_loop = loop; >>>> >>>> if (ALWAYS_EXECUTED_IN (loop->header) == NULL) >>>>{ >>>> @@ -3232,19 +3231,6 @@ fill_always_executed_in_1 (class loop *loop, >>>> sbitmap contains_call) >>>> to disprove this if possible). */ >>>> if (bb->flags & BB_IRREDUCIBLE_LOOP) >>>>break; >>>> - >>>> -if (!flow_bb_inside_loop_p (inn_loop, bb)) >>>> - break; >>>> - >>>> -if (bb->loop_father->header == bb) >>>> - { >>>> -if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>>> - break; >>>> - >>>> -/* In a loop that is always entered we may proceed anyway. >>>> - But record that we entered it and stop once we leave it. */ >>>> -inn_loop = bb->loop_father; >>>> - } >>>>} >>>> >>>> while (1) >>> I'm not sure this will work correct (I'm not sure how the existing >>> code makes it so either...). That said, I can't poke any hole >>> into the change. What I see is that definitely >>> >>> if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>> last = bb; >>> >>> if (bitmap_bit_p (contains_call, bb->index)) >>> break; >>> >>> doesn't work reliably since the DOM ordering will process blocks >>> A B and C in random order for >>> >>> for (;;) >>> { >>> if (cond) >>> { >>> A: foo (); >>> } >>> else B:; >>> C:; >>> } >>> >>> and thus we can end up setting 'last' to C_before_ processing >>> 'A' and thus arriving at the call foo () ... >>> >>> get_loop_body_in_dom_order does some "special sauce" but not >>> to address the above problem - but it might be that a subtle >>> issue like the above is the reason for the inner loop handling. >>> The inner loop block order does_not_ adhere to this "special sauce", >>> that is - the "Additionally, if a basic block s dominates >>> the latch, then only blocks dominated by s are be after it." >>> guarantee holds for the outer loop latch, not for the inner. >>> >>> Digging into the history of fill_always_executed_in_1 doesn't >>> reveal anything - the inner loop handling has been present >>> since introduction by Zdenek - but usually Zdenek has a reason >>> for doing things as he does;) >> >> Yes, this is really complicated usage, thanks for point it out. :) >> I constructed two cases to verify this with inner loop includes "If A; else >> B; C". >> Finding that fill_sons_in_loop in get_loop_body_in_dom_order will also checks >> whether the bb domintes outer loop’s latch, if C dominate outer loop’s latch, >> C is postponed, the access order is ABC, 'last' won’t be set to C if A or B >> contains call; > > But it depends on the order of visiting ABC and that's hard to put into > a testcase since it depends on the order of edges and the processing > of the dominance computation. ABC are simply unordered with respect > to a dominator walk. > >> Otherwise if C doesn’t dominate outer loop’s latch in fill_sons_in_loop, >> the access order is CAB, but 'last' also won’t be updated to C in >> fill_always_executed_in_1 >> since there is also dominate check, then if A or B contains call, it could >> break >> successfully. >> >> C won't be set to ALWAYS EXECUTED for both circumstance. >> >>> >>> Note it might be simply a measure against quadratic complexity, >>> esp. since with your patch we also dive into not always executed >>> subloops as you remove the >>> >>> if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) >>> break; >>> >>> check. 
I suggest to evaluate behavior of the patch on a testcase >>> like >>> >>> void foo (int n, int **k) >>> { >>> for (int i = 0; i < n; ++i) >
Re: [PATCH v3] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/27 15:45, Richard Biener wrote: On Thu, 26 Aug 2021, Xionghu Luo wrote: On 2021/8/24 16:20, Richard Biener wrote: On Tue, 24 Aug 2021, Xionghu Luo wrote: On 2021/8/19 20:11, Richard Biener wrote: - class loop *inn_loop = loop; if (ALWAYS_EXECUTED_IN (loop->header) == NULL) { @@ -3232,19 +3231,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) to disprove this if possible). */ if (bb->flags & BB_IRREDUCIBLE_LOOP) break; - - if (!flow_bb_inside_loop_p (inn_loop, bb)) - break; - - if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; - - /* In a loop that is always entered we may proceed anyway. -But record that we entered it and stop once we leave it. */ - inn_loop = bb->loop_father; - } } while (1) I'm not sure this will work correct (I'm not sure how the existing code makes it so either...). That said, I can't poke any hole into the change. What I see is that definitely if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; if (bitmap_bit_p (contains_call, bb->index)) break; doesn't work reliably since the DOM ordering will process blocks A B and C in random order for for (;;) { if (cond) { A: foo (); } else B:; C:; } and thus we can end up setting 'last' to C_before_ processing 'A' and thus arriving at the call foo () ... get_loop_body_in_dom_order does some "special sauce" but not to address the above problem - but it might be that a subtle issue like the above is the reason for the inner loop handling. The inner loop block order does_not_ adhere to this "special sauce", that is - the "Additionally, if a basic block s dominates the latch, then only blocks dominated by s are be after it." guarantee holds for the outer loop latch, not for the inner. Digging into the history of fill_always_executed_in_1 doesn't reveal anything - the inner loop handling has been present since introduction by Zdenek - but usually Zdenek has a reason for doing things as he does;) Yes, this is really complicated usage, thanks for point it out. :) I constructed two cases to verify this with inner loop includes "If A; else B; C". Finding that fill_sons_in_loop in get_loop_body_in_dom_order will also checks whether the bb domintes outer loop’s latch, if C dominate outer loop’s latch, C is postponed, the access order is ABC, 'last' won’t be set to C if A or B contains call; But it depends on the order of visiting ABC and that's hard to put into a testcase since it depends on the order of edges and the processing of the dominance computation. ABC are simply unordered with respect to a dominator walk. Otherwise if C doesn’t dominate outer loop’s latch in fill_sons_in_loop, the access order is CAB, but 'last' also won’t be updated to C in fill_always_executed_in_1 since there is also dominate check, then if A or B contains call, it could break successfully. C won't be set to ALWAYS EXECUTED for both circumstance. Note it might be simply a measure against quadratic complexity, esp. since with your patch we also dive into not always executed subloops as you remove the if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) break; check. I suggest to evaluate behavior of the patch on a testcase like void foo (int n, int **k) { for (int i = 0; i < n; ++i) if (k[0][i]) for (int j = 0; j < n; ++j) if (k[1][j]) for (int l = 0; l < n; ++l) if (k[2][l]) ... 
} Theoretically the complexity is changing from L1(bbs) to L1(bbs)+L2(bbs)+L3(bbs)+…+Ln(bbs), so fill_always_executed_in_1's execution time is supposed to be increase from O(n) to O(n2)? The time should depend on loop depth and bb counts. I also drafted a test case has 73-depth loop function with 25 no-ipa function copies each compiled in lim2 and lim4 dependently. Total execution time of fill_always_executed_in_1 is increased from 32ms to 58ms, almost doubled but not quadratic? It's more like n + (n-1) + (n-2) + ... + 1 which is n^2/2 but that's still O(n^2). It seems reasonable to see compiling time getting longer since most bbs are checked more but a MUST to ensure early break correctly in every loop level... Though loop nodes could be huge, loop depth will never be so large in actual code? The "in practice" argument is almost always defeated by automatic program generators ;) I suspect you'll see quadratic
Re: [PATCH v3] Fix incomplete computation in fill_always_executed_in_1
On 2021/8/30 17:19, Richard Biener wrote: bitmap_set_bit (work_set, loop->header->index); + unsigned bb_index; - for (i = 0; i < loop->num_nodes; i++) - { - edge_iterator ei; - bb = bbs[i]; + unsigned array_size = last_basic_block_for_fn (cfun) + 1; + int *bbd = XNEWVEC (int, array_size); + bbd = XDUPVEC (int, bbi, array_size); I don't think you need to copy 'bbi' but you can re-use the state from the outer loop processing. Did you run into any issues with that? Yes. For example, adding a small if-else block to ssa-lim-19.c, Then block "x->j += tem * i;" of bb 6 is always executed for loop 2, when call fill_always_executed_in_1 for loop 1, bbi[6] is decreased from 2 to 1 to 0, then if fill_always_executed_in_1 is called again for loop 2, it's value is not reset so bbi[6] won't be set ALWAYS EXECUTE, this is wrong. struct X { int i; int j; int k;}; void foo(struct X *x, int n, int l, int m) { for (int j = 0; j < l; j++) // loop 1 { for (int i = 0; i < n; ++i) // loop 2 { if (m) x->j++; else x->j = m+n+l; int *p = &x->j; // bb 6 int tem = *p; x->j += tem * i; } int *r = &x->k; int tem2 = *r; x->k += tem2 * j; } } Hmm, but if the outer loop processing reaches bb 6 then it should have set it ALWAYS_EXECUTED in loop 1 already? But bb 6 is NOT ALWAYS_EXECUTED for loop 1, it is only ALWAYS_EXECUTED for loop 2 as it requires n>0. Please refer to the attached file ssa-lim-19.c.138t.lim2. ;; ;; Loop 1 ;; header 8, latch 12 ;; depth 1, outer 0 ;; nodes: 8 12 7 6 4 5 3 13 11 ;; ;; Loop 2 ;; header 3, latch 13 ;; depth 2, outer 1 ;; nodes: 3 13 6 4 5 ;; 2 succs { 10 9 } ;; 10 succs { 8 } ;; 11 succs { 3 } ;; 3 succs { 4 5 } ;; 4 succs { 6 } ;; 5 succs { 6 } ;; 6 succs { 13 7 } ;; 13 succs { 3 } ;; 7 succs { 12 9 } ;; 12 succs { 8 } ;; 8 succs { 11 7 } ;; 9 succs { 1 } always executed: bb->index:8, loop->num: 1 always executed: bb->index:7, loop->num: 1 always executed: bb->index:3, loop->num: 2 always executed: bb->index:6, loop->num: 2 8<--- / \ | 11 \ | / \| 3<--- \ | /\| \ | 4 5 | \ | \/|\| 6| \ | |-->13 \ | |--> 7 | /\| 9 12--- (gdb) x /15x bbd 0x1354c9b0: 0x 0x 0x0001 0x0001 0x1354c9c0: 0x0001 0x0001 0x0002 0x0002 0x1354c9d0: 0x0001 0x0002 0x0001 0x0001 0x1354c9e0: 0x0001 0x0001 0x our algorithm will walk through 8->11->3->4->5->6->7, for loop 1, exit at edge 7->9. (gdb) x /15x bbd 0x1354c9b0: 0x 0x 0x0001 0x 0x1354c9c0: 0x 0x 0x 0x 0x1354c9d0: 0x0001 0x0002 0x0001 0x 0x1354c9e0: 0x0001 0x 0x If we don't reset bbd to incoming_edge by memcpy, bbd[3],bbd[4],bbd[5] and bbd[6] is 0 now for loop 2, fill_always_executed_in_1 couldn't set ALWAYS_EXECUTED correctly for loop 2 at bb 3 and bb 6. + while (!bitmap_empty_p (work_set)) + { + bb_index = bitmap_first_set_bit (work_set); + bitmap_clear_bit (work_set, bb_index); + bb = BASIC_BLOCK_FOR_FN (cfun, bb_index); if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - last = bb; - + SET_ALWAYS_EXECUTED_IN (bb, loop); if (bitmap_bit_p (contains_call, bb->index)) break; I think you want to continue; here (process remaining worklist but not continue greedy walking this block) Same as above, if use 'continue' instead of 'break', the algorithm seems also not work again. If inner loop contains a jump to outmost loop, the blocks after the jump block will be set to ALWAYS EXECUTE incorrectly. - + edge_iterator ei; FOR_EACH_EDGE (e, ei, bb->succs) { - /* If there is an exit from this BB. */ if (!flow_bb_inside_loop_p (loop, e->dest)) break; in particular this should keep the outer 'bbi' valid to re-use. 
But again, you want 'continue;' the greedy walk to other edges. If that's not valid (I'd need to think about this) then with your patch whether we process an edge depends on the order of the edge visit so you'd have to walk successors twice, once to determine whether we can greedily walk any of it and once to actually do the greedy walk. So thinking about it an exit edge is like a not returning call and thus we indeed should not process any outgoing edges of this block. + /* Or we enter a possibly non-finite loop. */ if (flow_loop_nested_p (bb->loop_father, e->dest->loop_father) && ! finite_loop_p (e->dest->loop_father)) break; I think this is no longer necessary? In any case it would again be 'conti
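A self-contained sketch (plain C, hypothetical arrays instead of GCC's bitmaps and basic_block structures) of the worklist scheme being discussed: seed the loop header, only push a successor once all of its in-loop incoming edges have been seen, stop the walk at blocks containing calls or loop exits, and mark a visited block only if it dominates the latch.  Call/exit and dominance information are taken as precomputed inputs here.

	#include <stdbool.h>

	#define MAX_BB 64

	/* edge[i][j] is true for an in-loop edge i -> j; in_preds[j] counts the
	   in-loop predecessors of j (the header's back edge is ignored because
	   the header is seeded directly).  */
	void
	mark_always_executed_sketch (int n_bb, bool edge[MAX_BB][MAX_BB],
	                             int in_preds[MAX_BB],
	                             bool dominates_latch[MAX_BB],
	                             bool stops_walk[MAX_BB],   /* call or exit */
	                             int header, bool always_executed[MAX_BB])
	{
	  int remaining[MAX_BB], work[MAX_BB], top = 0;
	  for (int i = 0; i < n_bb; i++)
	    {
	      remaining[i] = in_preds[i];
	      always_executed[i] = false;
	    }
	  work[top++] = header;
	  while (top > 0)
	    {
	      int b = work[--top];
	      if (dominates_latch[b])
	        always_executed[b] = true;   /* every finished iteration runs b */
	      if (stops_walk[b])
	        continue;                    /* do not walk past calls/exits */
	      for (int s = 0; s < n_bb; s++)
	        if (edge[b][s] && --remaining[s] == 0)
	          work[top++] = s;           /* all in-loop paths into s seen */
	    }
	}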
[PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
ult: + break; + } + } +} + return 0; } diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 8efc77d986b..36837381c04 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -115,6 +115,7 @@ init_internal_fns () #define vec_condeq_direct { 0, 0, false } #define scatter_store_direct { 3, 1, false } #define len_store_direct { 3, 3, false } +#define vec_set_direct { 3, 3, false } #define unary_direct { 0, 0, true } #define binary_direct { 0, 0, true } #define ternary_direct { 0, 0, true } @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn +static void +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) all new functions require a function level comment Done. +{ + tree lhs = gimple_call_lhs (stmt); + tree op0 = gimple_call_arg (stmt, 0); + tree op1 = gimple_call_arg (stmt, 1); + tree op2 = gimple_call_arg (stmt, 2); + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); + + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); + scalar_mode innermode = GET_MODE_INNER (outermode); + + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); + + class expand_operand ops[3]; + enum insn_code icode = optab_handler (optab, outermode); + + if (icode != CODE_FOR_nothing) +{ + pos = convert_to_mode (E_SImode, pos, 0); + + create_fixed_operand (&ops[0], src); + create_input_operand (&ops[1], value, innermode); + create_input_operand (&ops[2], pos, GET_MODE (pos)); + if (maybe_expand_insn (icode, 3, ops)) + { + emit_move_insn (target, src); I think you need to assert that we end up here. Added gcc_unreachable at the end of this function. + return; + } +} +} + static void expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) { @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_fold_left_optab_supported_p direct_optab_supported_p #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p #define direct_check_ptrs_optab_supported_p direct_optab_supported_p +#define direct_vec_set_optab_supported_p direct_optab_supported_p /* Return the optab used by internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 13e60828fcf..e6cfe1b6159 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) + DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) diff --git a/gcc/optabs.c b/gcc/optabs.c index 184827fdf4e..c8125670d2d 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, && insn_operand_matches (icode, 3, test); } +bool +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode, + machine_mode value_mode, machine_mode idx_mode) toplevel comment missing +{ + gcc_assert (code == VECTOR_TYPE); what's the point of pasing 'code' here then? Since the optab only has a single mode, the vector mode, the value_mode is redundant as well. And I guess we might want to handle "arbitrary" index modes? 
That is, the .md expanders should not restrict its mode - I guess it simply uses VOIDmode at the moment (for integer constants). Not sure how to best do this without an explicit mode in the optab ... Yes, removed 'code' and value_mode by checking VECTOR_MODE_P and use GET_MODE_INNER for value_mode. ".md expanders" shall support for integer constants index mode, but I guess they shouldn't be expanded by IFN as this function is for variable index insert only? Anyway, the v3 patch used VOIDmode check... Thanks, Xionghu From 571717aea126380d3e36fdb4504f9a6337eed206 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo Date: Mon, 14 Sep 2020 21:08:11 -0500 Subject: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-22 Xionghu Luo * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_
Re: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
Hi, On 2020/9/23 19:33, Richard Biener wrote: >> The first loop is for rhs stmt process, this loop is for lhs stmt process. >> I thought vec_extract also need to generate IFN before, but seems not >> necessary now? And that the first loop needs to update the lhs stmt while >> then second doesn't. > That's not good reasons to separate them, please move all the processing > into one loop. > > + gassign *stmt = dyn_cast (gsi_stmt (gsi)); > + if (!stmt) > + continue; > + > + enum tree_code code; > + code = TREE_CODE (gimple_assign_lhs (stmt)); > + switch (code) > + { > + case ARRAY_REF: > + gimple_expand_vec_set_expr (&gsi); > > you also do the assign and ARRAY_REF checking duplicate. > > The patch likely wasn't bootstrapped because I've seen unused and > set-but-not-used > variables. > > Otherwise the patch looks good to me - I guess you want to add the > vec_extract bits as well so you can overall assess the affect of the patch > on altivec code? That said, the patch misses a testcase where we verify > we properly expand the vector to a pseudo now. Thanks, fixed the bootstrap error. Actually the "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]" includes typed vec_insert tests for V4SI/V4SF/V8HI/V16QI/V2DI/V2DF of expanding the IFN VEC_SET and instruction count check, but I am discussing and refining with Segher's comments, will split and send it later once we reached agreement. Not sure whether this is the testcase you mentioned? (As you said *vec_extract*, but this patch series target for vec_insert only.) FYI, We are trying below or even better code generations: rlwinm 6,6,2,28,29 mtvsrwz 0,5 lvsr 1,0,6 lvsl 0,0,6 xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 Second thing is I removed the second loop and move the "gimple_expand_vec_set_expr (&gsi);" up as your comments. Thanks again. IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-24 Xionghu Luo * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_vec_set_expr. * internal-fn.c (vec_set_direct): New define. (expand_vec_set_optab_fn): New function. (direct_vec_set_optab_supported_p): New define. * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. * optabs.c (can_vec_set_var_idx_p): New function. * optabs.h (can_vec_set_var_idx_p): New declaration. --- gcc/gimple-isel.cc | 75 +++-- gcc/internal-fn.c | 39 +++ gcc/internal-fn.def | 2 ++ gcc/optabs.c| 21 + gcc/optabs.h| 4 +++ 5 files changed, 139 insertions(+), 2 deletions(-) diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc index b330cf4c20e..02513e04900 100644 --- a/gcc/gimple-isel.cc +++ b/gcc/gimple-isel.cc @@ -35,6 +35,74 @@ along with GCC; see the file COPYING3. If not see #include "tree-cfg.h" #include "bitmap.h" #include "tree-ssa-dce.h" +#include "memmodel.h" +#include "optabs.h" + +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to + internal function based on vector type of selected expansion. 
+ i.e.: + VIEW_CONVERT_EXPR(u)[_1] = = i_4(D); + => + _7 = u; + _8 = .VEC_SET (_7, i_4(D), _1); + u = _8; */ + +static gimple * +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) +{ + enum tree_code code; + gcall *new_stmt = NULL; + gassign *ass_stmt = NULL; + + /* Only consider code == GIMPLE_ASSIGN. */ + gassign *stmt = dyn_cast (gsi_stmt (*gsi)); + if (!stmt) +return NULL; + + tree lhs = gimple_assign_lhs (stmt); + code = TREE_CODE (lhs); + if (code != ARRAY_REF) +return NULL; + + tree val = gimple_assign_rhs1 (stmt); + tree op0 = TREE_OPERAND (lhs, 0); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) + && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) + && TYPE_MODE (TREE_TYPE (lhs)) + == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0) +{ + tree pos = TREE_OPERAND (lhs, 1); + tree view_op0 = TREE_OPERAND (op0, 0); + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); + if (auto_var_in_fn_p (view_op0, cfun->decl) + && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx
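An example of the C-level source that produces the ARRAY_REF(VIEW_CONVERT_EXPR) pattern handled above, using GCC's generic vector extension (my own illustration, not necessarily one of the testcases in the series):

	typedef int v4si __attribute__ ((vector_size (16)));

	v4si
	set_element (v4si v, int value, unsigned idx)
	{
	  v[idx & 3] = value;   /* variable-index element store on a vector */
	  return v;
	}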
Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
Hi Segher, The attached two patches are updated and split from "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]" as your comments. [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param This one is preparation work of fix lvsl&lvsr arg mode and rs6000_expand_vector_set parameter support for both constant and variable index input. [PATCH v3 2/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251] This one is Building VIEW_CONVERT_EXPR and expand the IFN VEC_SET to fast. Thanks, Xionghu From 9d74c488ad3c7cad8c276cc49749ec05158d1e96 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo Date: Thu, 24 Sep 2020 00:52:35 -0500 Subject: [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param lvsl and lvsr looks only at the low 4 bits, use SI for index param. rs6000_expand_vector_set could accept insert either to constant position or variable position, so change the operand to reg_or_cint_operand. gcc/ChangeLog: 2020-09-24 Xionghu Luo * config/rs6000/altivec.md (altivec_lvsl_reg): Change to SImode. (altivec_lvsr_reg): Likewise. * config/rs6000/rs6000-call.c (altivec_expand_vec_set_builtin): Change call param 2 from type int to rtx. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set): Likewise. * config/rs6000/rs6000.c (rs6000_expand_vector_init): Change call param 2 from type int to rtx. (rs6000_expand_vector_set): Likewise. * config/rs6000/vector.md (vec_set): Support both constant and variable index vec_set. * config/rs6000/vsx.md: Call gen_altivec_lvsl_reg with SImode. --- gcc/config/rs6000/altivec.md | 4 ++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.c| 16 +--- gcc/config/rs6000/vector.md | 4 ++-- gcc/config/rs6000/vsx.md | 3 ++- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0a2e634d6b0..a1c06c9ab8c 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2775,7 +2775,7 @@ (define_expand "altivec_lvsl" (define_insn "altivec_lvsl_reg" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SI 1 "gpc_reg_operand" "b")] UNSPEC_LVSL_REG))] "TARGET_ALTIVEC" "lvsl %0,0,%1" @@ -2813,7 +2813,7 @@ (define_expand "altivec_lvsr" (define_insn "altivec_lvsr_reg" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SI 1 "gpc_reg_operand" "b")] UNSPEC_LVSR_REG))] "TARGET_ALTIVEC" "lvsr %0,0,%1" diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index e39cfcf672b..51f278933bd 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -10655,7 +10655,7 @@ altivec_expand_vec_set_builtin (tree exp) op0 = force_reg (tmode, op0); op1 = force_reg (mode1, op1); - rs6000_expand_vector_set (op0, op1, elt); + rs6000_expand_vector_set (op0, op1, GEN_INT (elt)); return op0; } diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 28e859f4381..6a0fbc3ba2e 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -57,7 +57,7 @@ extern bool rs6000_move_128bit_ok_p (rtx []); extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); -extern void 
rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_set (rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fe93cf6ff2b..c46ec14f060 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6669,7 +6669,8 @@ rs6000_expand_vector_init (rtx target, rtx vals) rs6000_expand_vector_init (target, copy); /* Insert variable. */ - rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), one_var); + rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), + GEN_INT (one_var)); return; } @@ -6683,10 +6684,10 @@ rs6000_expand_vector_init (rtx target, rtx vals) emit_move_insn (targe
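For context, here is a minimal user-level illustration (mine, not part of the patch) of the two element-number shapes the widened interface is meant to serve once the rest of the series wires up the variable case; compile with -mvsx:

#include <altivec.h>

/* Element number known at compile time: reaches rs6000_expand_vector_set
   as a CONST_INT.  */
vector int
insert_cst (vector int v, int x)
{
  return vec_insert (x, v, 3);
}

/* Element number only known at run time: the index ends up in a register,
   which is what the new reg_or_cint_operand predicate on vec_set allows.  */
vector int
insert_var (vector int v, int x, unsigned long n)
{
  return vec_insert (x, v, n);
}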
Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
Hi, On 2020/9/24 21:27, Richard Biener wrote: > On Thu, Sep 24, 2020 at 10:21 AM xionghu luo wrote: > > I'll just comment that > > xxperm 34,34,33 > xxinsertw 34,0,12 > xxperm 34,34,32 > > doesn't look like a variable-position insert instruction but > this is a variable whole-vector rotate plus an insert at index zero > followed by a variable whole-vector rotate. I'm not fluend in > ppc assembly but > > rlwinm 6,6,2,28,29 > mtvsrwz 0,5 > lvsr 1,0,6 > lvsl 0,0,6 > > possibly computes the shift masks for r33/r32? though > I do not see those registers mentioned... For V4SI: rlwinm 6,6,2,28,29 // r6*4 mtvsrwz 0,5 // vs0 <- r5 (0xfe) lvsr 1,0,6 // vs33 <- lvsr[r6] lvsl 0,0,6 // vs32 <- lvsl[r6] xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 blr idx = idx * 4; 00 0x4000300020001 xxperm:0x4000300020001 vs33:0x101112131415161718191a1b1c1d1e1f vs32:0x102030405060708090a0b0c0d0e0f 14 0x4000300020001 xxperm:0x1000400030002 vs33:0xc0d0e0f101112131415161718191a1b vs32:0x405060708090a0b0c0d0e0f10111213 28 0x4000300020001 xxperm:0x2000100040003 vs33:0x8090a0b0c0d0e0f1011121314151617 vs32:0x8090a0b0c0d0e0f1011121314151617 312 0x4000300020001 xxperm:0x3000200010004 vs33:0x405060708090a0b0c0d0e0f10111213 vs32:0xc0d0e0f101112131415161718191a1b vs34: 0x40003000200fe 0x4000300fe0001 0x400fe00020001 0xfe000300020001 "xxinsertw 34,0,12" will always insert vs0[32:63] content to the forth word of target vector, bits[96:127]. Then the second xxperm rotate the modified vector back. All the instructions are register based operation, as Segher replied, power9 supports only fixed position inserts, so we need do some trick here to support it instead of generate short store wide load instructions. > > This might be a generic viable expansion strathegy btw, > which is why I asked before whether the CPU supports > inserts at a variable position ... the building blocks are > already there with vec_set at constant zero position > plus vec_perm_const for the rotates. > > But well, I did ask this question. Multiple times. > > ppc does _not_ have a VSX instruction > like xxinsertw r34, r8, r12 where r8 denotes > the vector element (or byte position or whatever). > > So I don't think vec_set with a variable index is the > best approach. > Xionghu - you said even without the patch the stack > storage is eventually elided but > > addi 9,1,-16 > rldic 6,6,2,60 > stxv 34,-16(1) > stwx 5,9,6 > lxv 34,-16(1) > > still shows stack(?) store/load with a bad STLF penalty. Sorry that if I didn't describe clearly and misunderstood you, I mean if insert many instructions(tested with a loop inserted) between "stwx 5,9,6" and "lxv 34,-16(1)", the store hit load performance issue could be elided, but this is not the solution we want. I also changed your test as below and build for X86, seems it also generates inefficient code? What my patch does maybe different usage from your pasted case? 
#define N 32 typedef int T; typedef T V __attribute__((vector_size(N))); V setg3 (V v, int idx, T val) { v[idx&31] = val; return v; } -O2 -S -mavx -march=znver2: setg3: pushrbp and edi, 31 mov rbp, rsp and rsp, -32 vmovdqa YMMWORD PTR [rsp-32], ymm0 mov DWORD PTR [rsp-32+rdi*4], esi vmovdqa ymm0, YMMWORD PTR [rsp-32] leave ret While idx is constant: setg3: vpinsrd xmm1, xmm0, esi, 3 vinserti128 ymm0, ymm0, xmm1, 0x0 ret And ARM with -O2 -S -march=armv8.2-a+sve (N change to 16): setg3: sub sp, sp, #16 and x0, x0, 15 str q0, [sp] str w1, [sp, x0, lsl 2] ldr q0, [sp] add sp, sp, 16 ret While idx is constant: setg3: ins v0.s[3], w1 ret Though I've no idea how to optimize this on X86 and ARM with vector instructions to avoid short store with wide load followed on stack. Thanks, Xionghu
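For reference, the rotate/fixed-insert/rotate-back idea can be modelled target-independently with GNU vector extensions. This is only a sketch of the strategy (the lane it inserts into and the rotation direction are my simplification and do not match the exact lvsr/xxinsertw/lvsl lanes shown above), not what the expander emits:

typedef int v4si __attribute__ ((vector_size (16)));

v4si
insert_var_model (v4si v, int val, unsigned idx)
{
  idx &= 3;
  /* Rotate so that element IDX lands in lane 0.  */
  v4si fwd = { idx, (idx + 1) & 3, (idx + 2) & 3, (idx + 3) & 3 };
  v4si t = __builtin_shuffle (v, fwd);
  /* Fixed-position insert, the part an instruction like xxinsertw can do.  */
  t[0] = val;
  /* Rotate back.  */
  v4si back = { (0 - idx) & 3, (1 - idx) & 3, (2 - idx) & 3, (3 - idx) & 3 };
  return __builtin_shuffle (t, back);
}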
[PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
Hi, On 2020/9/24 20:39, Richard Sandiford wrote: > xionghu luo writes: >> @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall >> *stmt, convert_optab optab) >> >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn >> >> +/* Expand VEC_SET internal functions. */ >> + >> +static void >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> +{ >> + tree lhs = gimple_call_lhs (stmt); >> + tree op0 = gimple_call_arg (stmt, 0); >> + tree op1 = gimple_call_arg (stmt, 1); >> + tree op2 = gimple_call_arg (stmt, 2); >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); >> + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); > > I'm not sure about the expand_expr here. ISTM that op0 is a normal > input and so should be expanded by expand_normal rather than > EXPAND_WRITE. Also: > >> + >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); >> + scalar_mode innermode = GET_MODE_INNER (outermode); >> + >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + >> + class expand_operand ops[3]; >> + enum insn_code icode = optab_handler (optab, outermode); >> + >> + if (icode != CODE_FOR_nothing) >> +{ >> + pos = convert_to_mode (E_SImode, pos, 0); >> + >> + create_fixed_operand (&ops[0], src); > > ...this would mean that if SRC happens to be a MEM, the pattern > must also accept a MEM. > > ISTM that we're making more work for ourselves by not “fixing” the optab > to have a natural pure-input + pure-output interface. :-) But if we > stick with the current optab interface, I think we need to: > > - create a temporary register > - move SRC into the temporary register before the insn > - use create_fixed_operand with the temporary register for operand 0 > - move the temporary register into TARGET after the insn > >> + create_input_operand (&ops[1], value, innermode); >> + create_input_operand (&ops[2], pos, GET_MODE (pos)); > > For this I think we should use convert_operand_from on the original “pos”, > so that the target gets to choose what the mode of the operand is. > Thanks a lot for the nice suggestions, fixed them all and updated the patch as below. [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-25 Xionghu Luo * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_vec_set_expr. * internal-fn.c (vec_set_direct): New define. (expand_vec_set_optab_fn): New function. (direct_vec_set_optab_supported_p): New define. * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. * optabs.c (can_vec_set_var_idx_p): New function. * optabs.h (can_vec_set_var_idx_p): New declaration. --- gcc/gimple-isel.cc | 75 +++-- gcc/internal-fn.c | 41 + gcc/internal-fn.def | 2 ++ gcc/optabs.c| 21 + gcc/optabs.h| 4 +++ 5 files changed, 141 insertions(+), 2 deletions(-) diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc index b330cf4c20e..02513e04900 100644 --- a/gcc/gimple-isel.cc +++ b/gcc/gimple-isel.cc @@ -35,6 +35,74 @@ along with GCC; see the file COPYING3. 
If not see #include "tree-cfg.h" #include "bitmap.h" #include "tree-ssa-dce.h" +#include "memmodel.h" +#include "optabs.h" + +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to + internal function based on vector type of selected expansion. + i.e.: + VIEW_CONVERT_EXPR(u)[_1] = = i_4(D); + => + _7 = u; + _8 = .VEC_SET (_7, i_4(D), _1); + u = _8; */ + +static gimple * +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) +{ + enum tree_code code; + gcall *new_stmt = NULL; + gassign *ass_stmt = NULL; + + /* Only consider code == GIMPLE_ASSIGN. */ + gassign *stmt = dyn_cast (gsi_stmt (*gsi)); + if (!stmt) +return NULL; + + tree lhs = gimple_assign_lhs (stmt); + code = TREE_CODE (lhs); + if (code != ARRAY_REF) +return NULL; + + tree val = gimple_assign_rhs1 (stmt); + tree op0 = TREE_OPERAND (lhs, 0); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) + &
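To make the trigger concrete: a minimal, target-independent input (my own example, not one of the new tests) whose gimple has exactly the ARRAY_REF-of-VIEW_CONVERT_EXPR shape shown in the comment above. On a target whose vec_set pattern accepts a variable index (the can_vec_set_var_idx_p check), the new isel hook rewrites the store into a .VEC_SET call:

typedef int v4si __attribute__ ((vector_size (16)));

v4si
set_lane (v4si u, int i, unsigned long n)
{
  /* Roughly: _1 = n & 3; VIEW_CONVERT_EXPR<int[4]>(u)[_1] = i;  */
  u[n & 3] = i;
  return u;
}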
Re: [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
On 2020/9/25 21:28, Richard Sandiford wrote: > xionghu luo writes: >> @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall >> *stmt, convert_optab optab) >> >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn >> >> +/* Expand VEC_SET internal functions. */ >> + >> +static void >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> +{ >> + tree lhs = gimple_call_lhs (stmt); >> + tree op0 = gimple_call_arg (stmt, 0); >> + tree op1 = gimple_call_arg (stmt, 1); >> + tree op2 = gimple_call_arg (stmt, 2); >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); >> + rtx src = expand_normal (op0); >> + >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); >> + scalar_mode innermode = GET_MODE_INNER (outermode); >> + >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); > > These two can just use expand_normal. Might be easier to read if > they come immediately after the expand_normal (op0). > > LGTM with that change for the internal-fn.c stuff, thanks. > Thank you, updated and committed as r11-3486. Tested and confirmed Power/X86/ARM still not supporting vec_set with register index, so there are no ICE regressions caused by generating IFN VEC_SET but not properly expanded. Thanks, Xionghu
[PATCH 1/4] rs6000: Change rs6000_expand_vector_set param
rs6000_expand_vector_set could accept insert either to constant position or variable position, so change the operand to reg_or_cint_operand. gcc/ChangeLog: 2020-10-10 Xionghu Luo * config/rs6000/rs6000-call.c (altivec_expand_vec_set_builtin): Change call param 2 from type int to rtx. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set): Likewise. * config/rs6000/rs6000.c (rs6000_expand_vector_init): Change call param 2 from type int to rtx. (rs6000_expand_vector_set): Likewise. * config/rs6000/vector.md (vec_set): Support both constant and variable index vec_set. --- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.c| 16 +--- gcc/config/rs6000/vector.md | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index a8b520834c7..2608a2a0797 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -10655,7 +10655,7 @@ altivec_expand_vec_set_builtin (tree exp) op0 = force_reg (tmode, op0); op1 = force_reg (mode1, op1); - rs6000_expand_vector_set (op0, op1, elt); + rs6000_expand_vector_set (op0, op1, GEN_INT (elt)); return op0; } diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 25fa5dd57cd..3578136e79b 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -57,7 +57,7 @@ extern bool rs6000_move_128bit_ok_p (rtx []); extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); -extern void rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_set (rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 375fff59928..a5b59395abd 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6678,7 +6678,8 @@ rs6000_expand_vector_init (rtx target, rtx vals) rs6000_expand_vector_init (target, copy); /* Insert variable. */ - rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), one_var); + rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), + GEN_INT (one_var)); return; } @@ -6692,10 +6693,10 @@ rs6000_expand_vector_init (rtx target, rtx vals) emit_move_insn (target, mem); } -/* Set field ELT of TARGET to VAL. */ +/* Set field ELT_RTX of TARGET to VAL. */ void -rs6000_expand_vector_set (rtx target, rtx val, int elt) +rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); @@ -6709,7 +6710,6 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) if (VECTOR_MEM_VSX_P (mode)) { rtx insn = NULL_RTX; - rtx elt_rtx = GEN_INT (elt); if (mode == V2DFmode) insn = gen_vsx_set_v2df (target, target, val, elt_rtx); @@ -6736,8 +6736,11 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) } } + gcc_assert (CONST_INT_P (elt_rtx)); + /* Simplify setting single element vectors like V1TImode. 
*/ - if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode) && elt == 0) + if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode) + && INTVAL (elt_rtx) == 0) { emit_move_insn (target, gen_lowpart (mode, val)); return; @@ -6760,8 +6763,7 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) /* Set permute mask to insert element into target. */ for (i = 0; i < width; ++i) -XVECEXP (mask, 0, elt*width + i) - = GEN_INT (i + 0x10); +XVECEXP (mask, 0, INTVAL (elt_rtx) * width + i) = GEN_INT (i + 0x10); x = gen_rtx_CONST_VECTOR (V16QImode, XVEC (mask, 0)); if (BYTES_BIG_ENDIAN) diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 796345c80d3..7aab1887cf5 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1227,10 +1227,10 @@ (define_expand "vec_init" (define_expand "vec_set" [(match_operand:VEC_E 0 "vlogical_operand") (match_operand: 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "reg_or_cint_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" { - rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2])); + rs6000_expand_vector_set (operands[0], operands[1], operands[2]); DONE; }) -- 2.25.1
[PATCH 4/4] rs6000: Update testcases' instruction count
gcc/testsuite/ChangeLog: 2020-10-10 Xionghu Luo * gcc.target/powerpc/fold-vec-insert-char-p8.c: Adjust instruction counts. * gcc.target/powerpc/fold-vec-insert-char-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-double.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-float-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-int-p9.c: Likewise. * gcc.target/powerpc/fold-vec-insert-longlong.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p8.c: Likewise. * gcc.target/powerpc/fold-vec-insert-short-p9.c: Likewise. * gcc.target/powerpc/vsx-builtin-7.c: Likewise. --- .../gcc.target/powerpc/fold-vec-insert-char-p8.c | 11 ++- .../gcc.target/powerpc/fold-vec-insert-char-p9.c | 12 ++-- .../gcc.target/powerpc/fold-vec-insert-double.c | 11 --- .../gcc.target/powerpc/fold-vec-insert-float-p8.c| 6 +++--- .../gcc.target/powerpc/fold-vec-insert-float-p9.c| 10 +- .../gcc.target/powerpc/fold-vec-insert-int-p8.c | 9 + .../gcc.target/powerpc/fold-vec-insert-int-p9.c | 11 +-- .../gcc.target/powerpc/fold-vec-insert-longlong.c| 10 +++--- .../gcc.target/powerpc/fold-vec-insert-short-p8.c| 9 + .../gcc.target/powerpc/fold-vec-insert-short-p9.c| 8 gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c | 4 ++-- 11 files changed, 52 insertions(+), 49 deletions(-) diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p8.c b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p8.c index b13c8ca19c7..1ad23de99a9 100644 --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p8.c +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p8.c @@ -44,15 +44,16 @@ vector unsigned char testuu_cst (unsigned char x, vector unsigned char v) return vec_insert (x, v, 12); } -/* one store per _var test */ -/* { dg-final { scan-assembler-times {\mstvx\M|\mstxvw4x\M} 4 } } */ +/* no store per _var test */ +/* { dg-final { scan-assembler-times {\mstvx\M|\mstxvw4x\M} 0 } } */ /* one store-byte per test */ -/* { dg-final { scan-assembler-times {\mstb\M} 8 } } */ +/* { dg-final { scan-assembler-times {\mstb\M} 4 } } */ /* one load per test */ -/* { dg-final { scan-assembler-times {\mlvx\M|\mlxvw4x\M} 8 } } */ +/* { dg-final { scan-assembler-times {\mlvx\M|\mlxvw4x\M} 8 { target le } } } */ +/* { dg-final { scan-assembler-times {\mlvx\M|\mlxvw4x\M} 4 { target be } } } */ /* one lvebx per _cst test.*/ /* { dg-final { scan-assembler-times {\mlvebx\M} 4 } } */ /* one vperm per _cst test.*/ -/* { dg-final { scan-assembler-times {\mvperm\M} 4 } } */ +/* { dg-final { scan-assembler-times {\mvperm\M} 12 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p9.c b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p9.c index 16432289d68..400caa31bb4 100644 --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p9.c +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-char-p9.c @@ -44,13 +44,13 @@ vector unsigned char testuu_cst (unsigned char x, vector unsigned char v) return vec_insert (x, v, 12); } -/* load immediate, add, store, stb, load variable test. */ -/* { dg-final { scan-assembler-times {\mstxv\M|\mstvx\M} 4 { target lp64 } } } */ -/* { dg-final { scan-assembler-times {\mstb\M} 4 { target lp64 } } } */ -/* { dg-final { scan-assembler-times {\mlvebx\M|\mlxv\M|\mlvx\M} 4 { target lp64} } } */ +/* no store per _var test. 
*/ +/* { dg-final { scan-assembler-times {\mstxv\M|\mstvx\M} 0 { target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mstb\M} 0 { target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mlvebx\M|\mlxv\M|\mlvx\M} 0 { target lp64} } } */ /* an insert and a move per constant test. */ -/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 4 { target lp64 } } } */ -/* { dg-final { scan-assembler-times {\mvinsertb\M} 4 { target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 8 { target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mvinsertb\M} 8 { target lp64 } } } */ /* -m32 codegen. */ /* { dg-final { scan-assembler-times {\mrlwinm\M} 4 { target ilp32 } } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-double.c b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-double.c index 435d28d5420..842fe9bbcad 100644 --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-double.c +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-insert-double.c @@ -23,7 +23,12 @@ testd_cst (double d, vector double vd) /* { dg-final { scan-assembler {\mxxpermdi\M} } } */ /* { dg-final { scan-assembler-times {\mrldic\M|\mrlwinm\M} 1 } } */ -/* { dg-final { scan-assembler-times {\mstxvd2x\M|\mstxv\M|\mstvx\M} 1 } } */ -/* { dg-final { scan-assembler-times {\mstfdx
[PATCH 3/4] rs6000: Enable vec_insert for P8 with rs6000_expand_vector_set_var_p8
gcc/ChangeLog: 2020-10-10 Xionghu Luo * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Generate ARRAY_REF(VIEW_CONVERT_EXPR) for P8 and later platforms. * config/rs6000/rs6000.c (rs6000_expand_vector_set_var): Update to call different path for P8 and P9. (rs6000_expand_vector_set_var_p9): New function. (rs6000_expand_vector_set_var_p8): New function. gcc/testsuite/ChangeLog: 2020-10-10 Xionghu Luo * gcc.target/powerpc/pr79251.p8.c: New test. --- gcc/config/rs6000/rs6000-c.c | 27 +++- gcc/config/rs6000/rs6000.c| 117 +- gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 +++ 3 files changed, 155 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p8.c diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 5551a21d738..4bea8001ec6 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1599,10 +1599,29 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, SET_EXPR_LOCATION (stmt, loc); stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - stmt = build_array_ref (loc, stmt, arg2); - stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, - convert (TREE_TYPE (stmt), arg0)); - stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + + if (TARGET_P8_VECTOR) + { + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, + convert (TREE_TYPE (stmt), arg0)); + stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + } + else + { + tree arg1_inner_type; + tree innerptrtype; + arg1_inner_type = TREE_TYPE (arg1_type); + innerptrtype = build_pointer_type (arg1_inner_type); + + stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); + stmt = convert (innerptrtype, stmt); + stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); + stmt = build_indirect_ref (loc, stmt, RO_NULL); + stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, +convert (TREE_TYPE (stmt), arg0)); + stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + } return stmt; } diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 96f76c7a74c..33ca839cb28 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6806,10 +6806,10 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) } /* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX - is variable and also counts by vector element size. */ + is variable and also counts by vector element size for p9 and above. */ void -rs6000_expand_vector_set_var (rtx target, rtx val, rtx idx) +rs6000_expand_vector_set_var_p9 (rtx target, rtx val, rtx idx) { machine_mode mode = GET_MODE (target); @@ -6852,6 +6852,119 @@ rs6000_expand_vector_set_var (rtx target, rtx val, rtx idx) emit_insn (perml); } +/* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX + is variable and also counts by vector element size for p8. */ + +void +rs6000_expand_vector_set_var_p8 (rtx target, rtx val, rtx idx) +{ + machine_mode mode = GET_MODE (target); + + gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); + + gcc_assert (GET_MODE (idx) == E_SImode); + + machine_mode inner_mode = GET_MODE (val); + HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); + + rtx tmp = gen_reg_rtx (GET_MODE (idx)); + int width = GET_MODE_SIZE (inner_mode); + + gcc_assert (width >= 1 && width <= 4); + + if (!BYTES_BIG_ENDIAN) +{ + /* idx = idx * width. */ + emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); + /* idx = idx + 8. 
*/ + emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (8))); +} + else +{ + emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); + emit_insn (gen_subsi3 (tmp, GEN_INT (24 - width), tmp)); +} + + /* lxv vs33, mask. + DImode: 0x + SImode: 0x + HImode: 0x. + QImode: 0x00ff. */ + rtx mask = gen_reg_rtx (V16QImode); + rtx mask_v2di = gen_reg_rtx (V2DImode); + rtvec v = rtvec_alloc (2); + if (!BYTES_BIG_ENDIAN) +{ + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); +} + else +{ + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); +} + emit_insn (gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); + rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0); +
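Since the hunk above is truncated before the final merge, here is a sketch in GNU vector extensions of what the Power8 path does for V4SI, as I read it: rotate a constant all-ones element mask to the variable lane, then select between the splatted scalar and the original vector. Lane numbering, byte order and the concluding select are simplified guesses, not the emitted RTL:

typedef int v4si __attribute__ ((vector_size (16)));

v4si
insert_var_p8_model (v4si v, int val, unsigned idx)
{
  idx &= 3;
  /* All-ones mask in lane 0, rotated so the all-ones lane becomes IDX
     (modelling the lvsl/vperm of the loaded mask constant).  */
  v4si mask0 = { -1, 0, 0, 0 };
  v4si rot = { (0 - idx) & 3, (1 - idx) & 3, (2 - idx) & 3, (3 - idx) & 3 };
  v4si mask = __builtin_shuffle (mask0, rot);
  /* Splat the scalar and merge under the mask (vsel-style).  */
  v4si splat = { val, val, val, val };
  return (splat & mask) | (v & ~mask);
}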
[PATCH 2/4] rs6000: Support variable insert and Expand vec_insert in expander [PR79251]
vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value to be insert, arg2 is the place to insert arg1 to arg0. Current expander generates stxv+stwx+lxv if arg2 is variable instead of constant, which causes serious store hit load performance issue on Power. This patch tries 1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n&3] = i to unify the gimple code, then expander could use vec_set_optab to expand. 2) Expand the IFN VEC_SET to fast instructions: lvsr+insert+lvsl. In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too early in gimple stage if arg2 is variable, avoid generating store hit load instructions. For Power9 V4SI: addi 9,1,-16 rldic 6,6,2,60 stxv 34,-16(1) stwx 5,9,6 lxv 34,-16(1) => rlwinm 6,6,2,28,29 mtvsrwz 0,5 lvsr 1,0,6 lvsl 0,0,6 xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 Though instructions increase from 5 to 7, the performance is improved 60% in typical cases. Tested with V2DI, V2DF V4SI, V4SF, V8HI, V16QI on Power9-LE. gcc/ChangeLog: 2020-10-10 Xionghu Luo * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Ajdust variable index vec_insert from address dereference to ARRAY_REF(VIEW_CONVERT_EXPR) tree expression. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var): New declaration. * config/rs6000/rs6000.c (rs6000_expand_vector_set_var): New function. * config/rs6000/vector.md (vec_set): Support both constant and variable index vec_set. gcc/testsuite/ChangeLog: 2020-10-10 Xionghu Luo * gcc.target/powerpc/pr79251.p9.c: New test. * gcc.target/powerpc/pr79251-run.c: New test. * gcc.target/powerpc/pr79251.h: New header. --- gcc/config/rs6000/rs6000-c.c | 25 - gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c| 53 +++ .../gcc.target/powerpc/pr79251-run.c | 28 ++ gcc/testsuite/gcc.target/powerpc/pr79251.h| 19 +++ gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 18 +++ 6 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p9.c diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index cc1e997524e..5551a21d738 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1512,9 +1512,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, tree arg1; tree arg2; tree arg1_type; - tree arg1_inner_type; tree decl, stmt; - tree innerptrtype; machine_mode mode; /* No second or third arguments. */ @@ -1566,8 +1564,13 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return build_call_expr (call, 3, arg1, arg0, arg2); } - /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ - arg1_inner_type = TREE_TYPE (arg1_type); + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with +VIEW_CONVERT_EXPR. 
i.e.: +D.3192 = v1; +_1 = n & 3; +VIEW_CONVERT_EXPR(D.3192)[_1] = i; +v1 = D.3192; +D.3194 = v1; */ if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) arg2 = build_int_cst (TREE_TYPE (arg2), 0); else @@ -1582,6 +1585,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, TREE_USED (decl) = 1; TREE_TYPE (decl) = arg1_type; TREE_READONLY (decl) = TYPE_READONLY (arg1_type); + TREE_ADDRESSABLE (decl) = 1; if (c_dialect_cxx ()) { stmt = build4 (TARGET_EXPR, arg1_type, decl, arg1, @@ -1592,19 +1596,12 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, { DECL_INITIAL (decl) = arg1; stmt = build1 (DECL_EXPR, arg1_type, decl); - TREE_ADDRESSABLE (decl) = 1; SET_EXPR_LOCATION (stmt, loc); stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - - innerptrtype = build_pointer_type (arg1_inner_type); - - stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); - stmt = convert (innerptrtype, stmt); - stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); - stmt = build_indirect_ref (loc, stmt, RO_NULL); - stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, -convert (TREE_TYPE (stmt), arg0)); + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, + convert (TREE_TYPE (stmt), arg0)); stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); return stmt; } d
[PATCH 0/4] rs6000: Enable variable vec_insert with IFN VEC_SET
Originated from https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554240.html with patch split and some refinement per review comments. Patch of IFN VEC_SET for ARRAY_REF(VIEW_CONVERT_EXPR) is committed, this patch set enables expanding IFN VEC_SET for Power9 and Power8 with specfic instruction sequences. Xionghu Luo (4): rs6000: Change rs6000_expand_vector_set param rs6000: Support variable insert and Expand vec_insert in expander [PR79251] rs6000: Enable vec_insert for P8 with rs6000_expand_vector_set_var_p8 rs6000: Update testcases' instruction count gcc/config/rs6000/rs6000-c.c | 44 +++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 3 +- gcc/config/rs6000/rs6000.c| 181 +- gcc/config/rs6000/vector.md | 4 +- .../powerpc/fold-vec-insert-char-p8.c | 8 +- .../powerpc/fold-vec-insert-char-p9.c | 12 +- .../powerpc/fold-vec-insert-double.c | 11 +- .../powerpc/fold-vec-insert-float-p8.c| 6 +- .../powerpc/fold-vec-insert-float-p9.c| 10 +- .../powerpc/fold-vec-insert-int-p8.c | 6 +- .../powerpc/fold-vec-insert-int-p9.c | 11 +- .../powerpc/fold-vec-insert-longlong.c| 10 +- .../powerpc/fold-vec-insert-short-p8.c| 6 +- .../powerpc/fold-vec-insert-short-p9.c| 8 +- .../gcc.target/powerpc/pr79251-run.c | 28 +++ gcc/testsuite/gcc.target/powerpc/pr79251.h| 19 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 18 ++ .../gcc.target/powerpc/vsx-builtin-7.c| 4 +- 20 files changed, 337 insertions(+), 71 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p8.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p9.c -- 2.25.1
[PATCH] Fix incorrect loop exit edge probability [PR103270]
r12-4526 cancelled jump thread path rotates loop. It exposes a issue in profile-estimate when predict_extra_loop_exits, outer loop's exit edge is marked as inner loop's extra loop exit and set with incorrect prediction, then a hot inner loop will become cold loop finally through optimizations, this patch ignores the EDGE_DFS_BACK edge when searching extra exit edges to avoid unexpected predict_edge. gcc/ChangeLog: PR middle-end/103270 * predict.c (predict_extra_loop_exits): Ignore EDGE_DFS_BACK edge. gcc/ChangeLog: PR middle-end/103270 * predict.c (predict_extra_loop_exits): New. --- gcc/predict.c | 4 1 file changed, 4 insertions(+) diff --git a/gcc/predict.c b/gcc/predict.c index 68b11135680..1ae8ccff72c 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -1910,6 +1910,10 @@ predict_extra_loop_exits (edge exit_edge) continue; if ((check_value_one ^ integer_onep (val)) == 1) continue; +#if 0 + if (e->flags & EDGE_DFS_BACK) + continue; +#endif if (EDGE_COUNT (e->src->succs) != 1) { predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); -- 2.25.1
Re: [PATCH] Fix incorrect loop exit edge probability [PR103270]
On 2021/11/23 13:51, Xionghu Luo wrote: > r12-4526 cancelled jump thread path rotates loop. It exposes a issue in > profile-estimate when predict_extra_loop_exits, outer loop's exit edge > is marked as inner loop's extra loop exit and set with incorrect > prediction, then a hot inner loop will become cold loop finally through > optimizations, this patch ignores the EDGE_DFS_BACK edge when searching > extra exit edges to avoid unexpected predict_edge. > > gcc/ChangeLog: > > PR middle-end/103270 > * predict.c (predict_extra_loop_exits): Ignore EDGE_DFS_BACK edge. > > gcc/ChangeLog: > > PR middle-end/103270 > * predict.c (predict_extra_loop_exits): New. > --- > gcc/predict.c | 4 > 1 file changed, 4 insertions(+) > > diff --git a/gcc/predict.c b/gcc/predict.c > index 68b11135680..1ae8ccff72c 100644 > --- a/gcc/predict.c > +++ b/gcc/predict.c > @@ -1910,6 +1910,10 @@ predict_extra_loop_exits (edge exit_edge) > continue; >if ((check_value_one ^ integer_onep (val)) == 1) > continue; > + if (e->flags & EDGE_DFS_BACK) > + continue; Sorry, made a mistake before send the patch, #if 0 #endif should be removed... >if (EDGE_COUNT (e->src->succs) != 1) > { > predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); > -- Thanks, Xionghu
[PATCH v2] Fix incorrect loop exit edge probability [PR103270]
On 2021/11/23 17:50, Jan Hubicka wrote: >> On Tue, Nov 23, 2021 at 6:52 AM Xionghu Luo wrote: >>> >>> r12-4526 cancelled jump thread path rotates loop. It exposes a issue in >>> profile-estimate when predict_extra_loop_exits, outer loop's exit edge >>> is marked as inner loop's extra loop exit and set with incorrect >>> prediction, then a hot inner loop will become cold loop finally through >>> optimizations, this patch ignores the EDGE_DFS_BACK edge when searching >>> extra exit edges to avoid unexpected predict_edge. >> >> Not sure how outer vs. inner loop exit correlates with EDGE_DFS_BACK, >> I have expected a check based on which loop is exited by the edge instead? >> A backedge should never be an exit, no? >> >> Note that the profile pass does not yet mark backedges so EDGE_DFS_BACK >> settings are unreliable. > > So we have two nested loops and an exit which goes from inner loop and > exists both loops. While processing outer loop we set pretty high exit > probability that is not good for inner loop? No, the edge only belongs to outer loop only. Can an exit edge belongs to two different loops at the same time? Exit edges are iterated with LI_FROM_INNERMOST in predict_loops, if an edge already has prediction by querying edge_predicted_by_p, maybe_predict_edge will early return to not set it again. The CFG is: 2 | 8< // l1 | \ | 10 9 | | | 7 6 <// l2 | | 11| | | 4<- |// l3 | \| | 5 3 | | | -- l2's edge (6->11,6->7) is set to (33%,67%) by l3 unexpectedly. FYI: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103270#c5 > > I guess we could just check if exit edge source basic block has same > loop depth as the loop we are processing? > Thanks for the suggestion, it works. Loop checks already existed in predict_paths_for_bb, just need pass down the loop argument. Updated as v2 patch. v2-0001-Fix-incorrect-loop-exit-edge-probability-PR103270.patch r12-4526 cancelled jump thread path rotates loop. It exposes a issue in profile-estimate when predict_extra_loop_exits, outer loop's exit edge is marked as inner loop's extra loop exit and set with incorrect prediction, then a hot inner loop will become cold loop finally through optimizations, this patch add loop check when searching extra exit edges to avoid unexpected predict_edge from predict_paths_for_bb. Regression tested pass on P8 & x86, OK for master? gcc/ChangeLog: PR middle-end/103270 * predict.c (predict_extra_loop_exits): Add loop parameter. (predict_loops): Call with loop argument. gcc/testsuite/ChangeLog: PR middle-end/103270 * gcc.dg/pr103270.c: New test. --- gcc/predict.c | 10 ++ gcc/testsuite/gcc.dg/pr103270.c | 19 +++ 2 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr103270.c diff --git a/gcc/predict.c b/gcc/predict.c index 68b11135680..082782ec4e9 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -1859,7 +1859,7 @@ predict_iv_comparison (class loop *loop, basic_block bb, exits to predict them using PRED_LOOP_EXTRA_EXIT. 
*/ static void -predict_extra_loop_exits (edge exit_edge) +predict_extra_loop_exits (class loop *loop, edge exit_edge) { unsigned i; bool check_value_one; @@ -1912,12 +1912,14 @@ predict_extra_loop_exits (edge exit_edge) continue; if (EDGE_COUNT (e->src->succs) != 1) { - predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); + predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, +loop); continue; } FOR_EACH_EDGE (e1, ei, e->src->preds) - predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); + predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, + loop); } } @@ -2009,7 +2011,7 @@ predict_loops (void) ex->src->index, ex->dest->index); continue; } - predict_extra_loop_exits (ex); + predict_extra_loop_exits (loop, ex); if (number_of_iterations_exit (loop, ex, &niter_desc, false, false)) niter = niter_desc.niter; diff --git a/gcc/testsuite/gcc.dg/pr103270.c b/gcc/testsuite/gcc.dg/pr103270.c new file mode 100644 index 000..819310e360e --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr103270.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-profile_estimate" } */ + +void test(int a, int* i) +{ + for (; a < 5; ++a) +{ + int b = 0; + int c = 0; + for (; b != -11; b--) + for (int d = 0; d ==0; d++) + { + *i += c & a; + c = b; + } +} +} + +/* { dg-final { scan-tree-dump-not "extra loop exit heuristics of edge\[^:\]*:" "profile_estimate"} } */ -- 2.25.1
Re: [PATCH v3 1/4] Fix loop split incorrect count and probability
Gentle ping, thanks. [PATCH v3] Fix loop split incorrect count and probability https://gcc.gnu.org/pipermail/gcc-patches/2021-November/583626.html On 2021/11/8 14:09, Xionghu Luo via Gcc-patches wrote: > > > On 2021/10/27 15:44, Jan Hubicka wrote: >>> On Wed, 27 Oct 2021, Jan Hubicka wrote: >>> >>>>> >>>>> gcc/ChangeLog: >>>>> >>>>> * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. >>>>> (do_split_loop_on_cond): Likewise. >>>>> --- >>>>> gcc/tree-ssa-loop-split.c | 25 - >>>>> 1 file changed, 16 insertions(+), 9 deletions(-) >>>>> >>>>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >>>>> index 3f6ad046623..d30782888f3 100644 >>>>> --- a/gcc/tree-ssa-loop-split.c >>>>> +++ b/gcc/tree-ssa-loop-split.c >>>>> @@ -575,7 +575,11 @@ split_loop (class loop *loop1) >>>>> stmts2); >>>>> tree cond = build2 (guard_code, boolean_type_node, guard_init, border); >>>>> if (!initial_true) >>>>> - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); >>>>> + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); >>>>> + >>>>> + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE >>>>> +? EDGE_SUCC (bbs[i], 0) >>>>> +: EDGE_SUCC (bbs[i], 1); >>>>> >>>>> /* Now version the loop, placing loop2 after loop1 connecting >>>>> them, and fix up SSA form for that. */ >>>>> @@ -583,10 +587,10 @@ split_loop (class loop *loop1) >>>>> basic_block cond_bb; >>>>> >>>>> class loop *loop2 = loop_version (loop1, cond, &cond_bb, >>>>> -profile_probability::always (), >>>>> -profile_probability::always (), >>>>> -profile_probability::always (), >>>>> -profile_probability::always (), >>>>> +true_edge->probability, >>>>> +true_edge->probability.invert (), >>>>> +true_edge->probability, >>>>> +true_edge->probability.invert (), >>>>> true); >>>> >>>> As discussed yesterday, for loop of form >>>> >>>> for (...) >>>> if (cond) >>>> cond = something(); >>>> else >>>> something2 >>>> >>>> Split as >>> >>> Note that you are missing to conditionalize loop1 execution >>> on 'cond' (not sure if that makes a difference). >> You are right - forgot to mention that. >> >> Entry conditional makes no difference on scaling stmts inside loop but >> affects its header and expected trip count. We however need to set up >> probability of this conditional (and preheader count if it exists) >> There is no general way to read the probability of this initial >> conditional from cfg profile. So I guess we are stuck with guessing >> some arbitrary value. I guess common case is that cond is true first >> iteration tough and often we can easily see that fromo PHI node >> initializing the test variable. >> >> Other thing that changes is expected number of iterations of the split >> loops, so we may want to update the exit conditinal probability >> accordingly... >> > Sorry for the late reply. The below updated patch mainly solves the issues > you pointed out: > - profile count proportion for both original loop and copied loop > without dropping down the true branch's count; > - probability update in the two loops and between the two loops; > - number of iterations update/check for split_loop. > > > [PATCH v3] Fix loop split incorrect count and probability > > In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two > kind of split. split_loop only works for single loop and insert edge at > exit when split, while split_loop_on_cond is not limited to single loop > and insert edge at latch when split. Both split behavior should consider > loop count and probability update. 
For split_loop, loop split condition
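For readers without the whole patch in front of them, this is the canonical shape split_loop versions (my own illustration, not a test from the patch); the guard selecting between the two loop copies is exactly where the count and probability scaling discussed above has to be applied:

/* The IV comparison i < m holds for a leading range of iterations and
   fails for the rest, so the pass splits the loop into two copies and
   removes the branch from both bodies.  */
void
split_me (int *a, int *b, int *c, int n, int m)
{
  for (int i = 0; i < n; i++)
    {
      if (i < m)
        a[i] = b[i];
      else
        a[i] = c[i];
    }
}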
Ping: [PATCH v7 2/2] Don't move cold code out of loop by checking bb count
Gentle ping and is this patch still suitable for stage 3? Thanks. [PATCH v7 2/2] Don't move cold code out of loop by checking bb count https://gcc.gnu.org/pipermail/gcc-patches/2021-November/583911.html On 2021/11/10 11:08, Xionghu Luo via Gcc-patches wrote: > > > On 2021/11/4 21:00, Richard Biener wrote: >> On Wed, Nov 3, 2021 at 2:29 PM Xionghu Luo wrote: >>> >>> >>>> + while (outmost_loop != loop) >>>> +{ >>>> + if (bb_colder_than_loop_preheader (loop_preheader_edge >>>> (outmost_loop)->src, >>>> +loop_preheader_edge >>>> (cold_loop)->src)) >>>> + cold_loop = outmost_loop; >>>> + outmost_loop = superloop_at_depth (loop, loop_depth (outmost_loop) >>>> + 1); >>>> +} >>>> >>>> could be instead written as >>>> >>>> coldest_loop = coldest_outermost_loop[loop->num]; >>>> if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) >>>> return outermost_loop; >>>> return coldest_loop; >>>> >>>> ? And in the usual case coldest_outermost_loop[L] would be the loop tree >>>> root. >>>> It should be possible to compute such cache in a DFS walk of the loop tree >>>> (the loop iterator by default visits in such order). >>> >>> >>> Thanks. Updated the patch with your suggestion. Not sure whether it >>> strictly >>> conforms to your comments. Though the patch passed all my added >>> tests(coverage not enough), >>> I am still a bit worried if pre-computed coldest_loop is outside of >>> outermost_loop, but >>> outermost_loop is not the COLDEST LOOP, i.e. (outer->inner) >>> >>> [loop tree root, coldest_loop, outermost_loop,..., second_coldest_loop, >>> ..., loop], >>> >>> then function find_coldest_out_loop will return a loop NOT accord with our >>> expectation, that should return second_coldest_loop instead of >>> outermost_loop? >> Hmm, interesting - yes. I guess the common case will be that the >> pre-computed >> outermost loop will be the loop at depth 1 since outer loops tend to >> be colder than >> inner loops? That would then defeat the whole exercise. > > It is not easy to construct such cases, But finally I got below results, > > 1) many cases inner loop is hotter than outer loop, for example: > > loop 1's coldest_outermost_loop is 1, colder_than_inner_loop is NULL > loop 2's coldest_outermost_loop is 1, colder_than_inner_loop is 1 > loop 3's coldest_outermost_loop is 1, colder_than_inner_loop is 2 > loop 4's coldest_outermost_loop is 1, colder_than_inner_loop is 2 > > > 2) But there are also cases inner loop is colder than outer loop, like: > > loop 1's coldest outermost loop is 1, colder_than_inner_loop is NULL > loop 2's coldest outermost loop is 2, colder_than_inner_loop is NULL > loop 3's coldest outermost loop is 3, colder_than_inner_loop is NULL > > >> >> To optimize the common case but not avoiding iteration in the cases we care >> about we could instead cache the next outermost loop that is _not_ colder >> than loop. So for your [ ... ] example above we'd have> >> hotter_than_inner_loop[loop] == outer (second_coldest_loop), where the >> candidate would then be 'second_coldest_loop' and we'd then iterate >> to hotter_than_inner_loop[hotter_than_inner_loop[loop]] to find the next >> cold candidate we can compare against? For the common case we'd >> have hotter_than_inner_loop[looo] == NULL (no such loop) and we then >> simply pick 'outermost_loop'. > > Thanks. 
It was difficult to understand, but finally I got to know what you > want to express :) > > We should cache the next loop that is *colder* than loop instead of '_not_ > colder > than loop', and 'hotter_than_inner_loop' should be 'colder_than_inner_loop', > then it makes sense if the coldest loop is outside of outermost loop, > continue to > find a colder loop between outermost loop and current loop in > colder_than_inner_loop[loop->num]? Hope I understood you correctly... > >> >> One comment on the patch itself below. >> > > The loop in fill_cold_out_loop is also removed in the updated v7 patch. > > > > [PATCH v7 2/2] Don't move cold code out of loop by checking bb count > > From: Xiong Hu Luo > > v7 changes: > 1. Refine get_coldest
Re: [PATCH v8 2/2] Don't move cold code out of loop by checking bb count
On 2021/12/1 18:09, Richard Biener wrote: > On Wed, Nov 10, 2021 at 4:08 AM Xionghu Luo wrote: >> >> >> >> On 2021/11/4 21:00, Richard Biener wrote: >>> On Wed, Nov 3, 2021 at 2:29 PM Xionghu Luo wrote: >>>> >>>> >>>>> + while (outmost_loop != loop) >>>>> +{ >>>>> + if (bb_colder_than_loop_preheader (loop_preheader_edge >>>>> (outmost_loop)->src, >>>>> +loop_preheader_edge >>>>> (cold_loop)->src)) >>>>> + cold_loop = outmost_loop; >>>>> + outmost_loop = superloop_at_depth (loop, loop_depth (outmost_loop) >>>>> + 1); >>>>> +} >>>>> >>>>> could be instead written as >>>>> >>>>> coldest_loop = coldest_outermost_loop[loop->num]; >>>>> if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) >>>>> return outermost_loop; >>>>> return coldest_loop; >>>>> >>>>> ? And in the usual case coldest_outermost_loop[L] would be the loop tree >>>>> root. >>>>> It should be possible to compute such cache in a DFS walk of the loop tree >>>>> (the loop iterator by default visits in such order). >>>> >>>> >>>> Thanks. Updated the patch with your suggestion. Not sure whether it >>>> strictly >>>> conforms to your comments. Though the patch passed all my added >>>> tests(coverage not enough), >>>> I am still a bit worried if pre-computed coldest_loop is outside of >>>> outermost_loop, but >>>> outermost_loop is not the COLDEST LOOP, i.e. (outer->inner) >>>> >>>> [loop tree root, coldest_loop, outermost_loop,..., second_coldest_loop, >>>> ..., loop], >>>> >>>> then function find_coldest_out_loop will return a loop NOT accord with our >>>> expectation, that should return second_coldest_loop instead of >>>> outermost_loop? >>> Hmm, interesting - yes. I guess the common case will be that the >>> pre-computed >>> outermost loop will be the loop at depth 1 since outer loops tend to >>> be colder than >>> inner loops? That would then defeat the whole exercise. >> >> It is not easy to construct such cases, But finally I got below results, >> >> 1) many cases inner loop is hotter than outer loop, for example: >> >> loop 1's coldest_outermost_loop is 1, colder_than_inner_loop is NULL >> loop 2's coldest_outermost_loop is 1, colder_than_inner_loop is 1 >> loop 3's coldest_outermost_loop is 1, colder_than_inner_loop is 2 >> loop 4's coldest_outermost_loop is 1, colder_than_inner_loop is 2 >> >> >> 2) But there are also cases inner loop is colder than outer loop, like: >> >> loop 1's coldest outermost loop is 1, colder_than_inner_loop is NULL >> loop 2's coldest outermost loop is 2, colder_than_inner_loop is NULL >> loop 3's coldest outermost loop is 3, colder_than_inner_loop is NULL >> >> >>> >>> To optimize the common case but not avoiding iteration in the cases we care >>> about we could instead cache the next outermost loop that is _not_ colder >>> than loop. So for your [ ... ] example above we'd have> >>> hotter_than_inner_loop[loop] == outer (second_coldest_loop), where the >>> candidate would then be 'second_coldest_loop' and we'd then iterate >>> to hotter_than_inner_loop[hotter_than_inner_loop[loop]] to find the next >>> cold candidate we can compare against? For the common case we'd >>> have hotter_than_inner_loop[looo] == NULL (no such loop) and we then >>> simply pick 'outermost_loop'. >> >> Thanks. 
It was difficult to understand, but finally I got to know what you >> want to express :) >> >> We should cache the next loop that is *colder* than loop instead of '_not_ >> colder >> than loop', and 'hotter_than_inner_loop' should be 'colder_than_inner_loop', >> then it makes sense if the coldest loop is outside of outermost loop, >> continue to >> find a colder loop between outermost loop and current loop in >> colder_than_inner_loop[loop->num]? Hope I understood you correctly... > > Heh, looking at the patch - I don't know. > > To make the calls to bb_colder_than_loop_preheader more obvious can you > change that
Ping: [PATCH v2] Fix incorrect loop exit edge probability [PR103270]
Hi Honza, Gentle ping for this :), thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585289.html On 2021/11/24 13:03, Xionghu Luo via Gcc-patches wrote: > On 2021/11/23 17:50, Jan Hubicka wrote: >>> On Tue, Nov 23, 2021 at 6:52 AM Xionghu Luo wrote: >>>> >>>> r12-4526 cancelled jump thread path rotates loop. It exposes a issue in >>>> profile-estimate when predict_extra_loop_exits, outer loop's exit edge >>>> is marked as inner loop's extra loop exit and set with incorrect >>>> prediction, then a hot inner loop will become cold loop finally through >>>> optimizations, this patch ignores the EDGE_DFS_BACK edge when searching >>>> extra exit edges to avoid unexpected predict_edge. >>> >>> Not sure how outer vs. inner loop exit correlates with EDGE_DFS_BACK, >>> I have expected a check based on which loop is exited by the edge instead? >>> A backedge should never be an exit, no? >>> >>> Note that the profile pass does not yet mark backedges so EDGE_DFS_BACK >>> settings are unreliable. >> >> So we have two nested loops and an exit which goes from inner loop and >> exists both loops. While processing outer loop we set pretty high exit >> probability that is not good for inner loop? > > No, the edge only belongs to outer loop only. Can an exit edge belongs to > two different loops at the same time? > Exit edges are iterated with LI_FROM_INNERMOST in predict_loops, if an edge > already has prediction by querying edge_predicted_by_p, maybe_predict_edge > will early return to not set it again. > > The CFG is: > > 2 > | > 8< // l1 > | \ | > 10 9 | > | | > 7 > 6 <// l2 > | | > 11| > | | > 4<- |// l3 > | \| | > 5 3 | > | | > -- > > l2's edge (6->11,6->7) is set to (33%,67%) by l3 unexpectedly. > > FYI: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103270#c5 > >> >> I guess we could just check if exit edge source basic block has same >> loop depth as the loop we are processing? >> > > > Thanks for the suggestion, it works. Loop checks already existed in > predict_paths_for_bb, just need pass down the loop argument. > Updated as v2 patch. > > > v2-0001-Fix-incorrect-loop-exit-edge-probability-PR103270.patch > > r12-4526 cancelled jump thread path rotates loop. It exposes a issue in > profile-estimate when predict_extra_loop_exits, outer loop's exit edge > is marked as inner loop's extra loop exit and set with incorrect > prediction, then a hot inner loop will become cold loop finally through > optimizations, this patch add loop check when searching extra exit edges > to avoid unexpected predict_edge from predict_paths_for_bb. > > Regression tested pass on P8 & x86, OK for master? > > gcc/ChangeLog: > > PR middle-end/103270 > * predict.c (predict_extra_loop_exits): Add loop parameter. > (predict_loops): Call with loop argument. > > gcc/testsuite/ChangeLog: > > PR middle-end/103270 > * gcc.dg/pr103270.c: New test. > --- > gcc/predict.c | 10 ++ > gcc/testsuite/gcc.dg/pr103270.c | 19 +++ > 2 files changed, 25 insertions(+), 4 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/pr103270.c > > diff --git a/gcc/predict.c b/gcc/predict.c > index 68b11135680..082782ec4e9 100644 > --- a/gcc/predict.c > +++ b/gcc/predict.c > @@ -1859,7 +1859,7 @@ predict_iv_comparison (class loop *loop, basic_block bb, > exits to predict them using PRED_LOOP_EXTRA_EXIT. 
*/ > > static void > -predict_extra_loop_exits (edge exit_edge) > +predict_extra_loop_exits (class loop *loop, edge exit_edge) > { >unsigned i; >bool check_value_one; > @@ -1912,12 +1912,14 @@ predict_extra_loop_exits (edge exit_edge) > continue; >if (EDGE_COUNT (e->src->succs) != 1) > { > - predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); > + predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, > + loop); > continue; > } > >FOR_EACH_EDGE (e1, ei, e->src->preds) > - predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); > + predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, > +loop); > } > } > > @@ -2009,7 +2011,7 @@ predict_loops (void) >ex->src->index, ex->dest->index); > continue; >
Re: [PATCH v8 2/2] Don't move cold code out of loop by checking bb count
On 2021/12/6 13:09, Xionghu Luo via Gcc-patches wrote: > > > On 2021/12/1 18:09, Richard Biener wrote: >> On Wed, Nov 10, 2021 at 4:08 AM Xionghu Luo wrote: >>> >>> >>> >>> On 2021/11/4 21:00, Richard Biener wrote: >>>> On Wed, Nov 3, 2021 at 2:29 PM Xionghu Luo wrote: >>>>> >>>>> >>>>>> + while (outmost_loop != loop) >>>>>> +{ >>>>>> + if (bb_colder_than_loop_preheader (loop_preheader_edge >>>>>> (outmost_loop)->src, >>>>>> +loop_preheader_edge >>>>>> (cold_loop)->src)) >>>>>> + cold_loop = outmost_loop; >>>>>> + outmost_loop = superloop_at_depth (loop, loop_depth >>>>>> (outmost_loop) + 1); >>>>>> +} >>>>>> >>>>>> could be instead written as >>>>>> >>>>>> coldest_loop = coldest_outermost_loop[loop->num]; >>>>>> if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) >>>>>> return outermost_loop; >>>>>> return coldest_loop; >>>>>> >>>>>> ? And in the usual case coldest_outermost_loop[L] would be the loop >>>>>> tree root. >>>>>> It should be possible to compute such cache in a DFS walk of the loop >>>>>> tree >>>>>> (the loop iterator by default visits in such order). >>>>> >>>>> >>>>> Thanks. Updated the patch with your suggestion. Not sure whether it >>>>> strictly >>>>> conforms to your comments. Though the patch passed all my added >>>>> tests(coverage not enough), >>>>> I am still a bit worried if pre-computed coldest_loop is outside of >>>>> outermost_loop, but >>>>> outermost_loop is not the COLDEST LOOP, i.e. (outer->inner) >>>>> >>>>> [loop tree root, coldest_loop, outermost_loop,..., second_coldest_loop, >>>>> ..., loop], >>>>> >>>>> then function find_coldest_out_loop will return a loop NOT accord with our >>>>> expectation, that should return second_coldest_loop instead of >>>>> outermost_loop? >>>> Hmm, interesting - yes. I guess the common case will be that the >>>> pre-computed >>>> outermost loop will be the loop at depth 1 since outer loops tend to >>>> be colder than >>>> inner loops? That would then defeat the whole exercise. >>> >>> It is not easy to construct such cases, But finally I got below results, >>> >>> 1) many cases inner loop is hotter than outer loop, for example: >>> >>> loop 1's coldest_outermost_loop is 1, colder_than_inner_loop is NULL >>> loop 2's coldest_outermost_loop is 1, colder_than_inner_loop is 1 >>> loop 3's coldest_outermost_loop is 1, colder_than_inner_loop is 2 >>> loop 4's coldest_outermost_loop is 1, colder_than_inner_loop is 2 >>> >>> >>> 2) But there are also cases inner loop is colder than outer loop, like: >>> >>> loop 1's coldest outermost loop is 1, colder_than_inner_loop is NULL >>> loop 2's coldest outermost loop is 2, colder_than_inner_loop is NULL >>> loop 3's coldest outermost loop is 3, colder_than_inner_loop is NULL >>> >>> >>>> >>>> To optimize the common case but not avoiding iteration in the cases we care >>>> about we could instead cache the next outermost loop that is _not_ colder >>>> than loop. So for your [ ... ] example above we'd have> >>>> hotter_than_inner_loop[loop] == outer (second_coldest_loop), where the >>>> candidate would then be 'second_coldest_loop' and we'd then iterate >>>> to hotter_than_inner_loop[hotter_than_inner_loop[loop]] to find the next >>>> cold candidate we can compare against? For the common case we'd >>>> have hotter_than_inner_loop[looo] == NULL (no such loop) and we then >>>> simply pick 'outermost_loop'. >>> >>> Thanks. 
It was difficult to understand, but finally I got to know what you >>> want to express :) >>> >>> We should cache the next loop that is *colder* than loop instead of '_not_ >>> colder >>> than loop', and 'hotter_than_inner_loop' should be 'c
[PATCH 0/3] Dependency patches for hoisting LIM code to cold loop
This patchset is a re-collection of previously sent patches. Thanks Richard, "Don't move cold code out of loop by checking bb count" is approved [1], but there are still 3 prerequisite patches needed to supplement it and avoid regressions.

1) Patch [1/3] is the RTL part of not hoisting LIM code out of a cold loop; it improves perlbench by 7.69% [2].

2) Patch [2/3] fixes a test case regression in pr103270.c: after enabling the gimple part of hoisting LIM code to the coldest loop [1], the store instruction is no longer moved out of the inner loop. This is caused by a jump-threading patch unexpectedly turning a hot inner loop into a cold one; this patch recovers the inner loop to be hot [3].

3) As the data in [2] showed, besides the improvement there is also a small regression on SPEC2017 544.nab_r (-1.55%). After investigation, it turned out that the profile count and probability are not correctly adjusted in loop split; with patch [3/3], the only regression is also fixed. This version slightly updates [4] to fix ICEs.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-December/586319.html
[2] https://gcc.gnu.org/pipermail/gcc-patches/2021-September/580109.html
[3] https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585195.html
[4] https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585290.html

Xionghu Luo (3):
  loop-invariant: Don't move cold bb instructions to preheader in RTL
  Fix incorrect loop exit edge probability [PR103270]
  Fix loop split incorrect count and probability

 gcc/loop-invariant.c            | 10 ++--
 gcc/predict.c                   | 10 ++--
 gcc/tree-ssa-loop-split.c       | 85 +
 gcc/testsuite/gcc.dg/pr103270.c | 19
 4 files changed, 109 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr103270.c

-- 
2.25.1
[PATCH 1/3] loop-invariant: Don't move cold bb instructions to preheader in RTL
gcc/ChangeLog:

	* loop-invariant.c (find_invariants_bb): Check profile count
	before motion.
	(find_invariants_body): Add argument.
---
 gcc/loop-invariant.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c
index 5eee2e5c9f8..c61c8612fae 100644
--- a/gcc/loop-invariant.c
+++ b/gcc/loop-invariant.c
@@ -1183,9 +1183,14 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed)
    call.  */

 static void
-find_invariants_bb (basic_block bb, bool always_reached, bool always_executed)
+find_invariants_bb (class loop *loop, basic_block bb, bool always_reached,
+		    bool always_executed)
 {
   rtx_insn *insn;
+  basic_block preheader = loop_preheader_edge (loop)->src;
+
+  if (preheader->count > bb->count)
+    return;

   FOR_BB_INSNS (bb, insn)
     {
@@ -1214,8 +1219,7 @@ find_invariants_body (class loop *loop, basic_block *body,
   unsigned i;

   for (i = 0; i < loop->num_nodes; i++)
-    find_invariants_bb (body[i],
-			bitmap_bit_p (always_reached, i),
+    find_invariants_bb (loop, body[i], bitmap_bit_p (always_reached, i),
 			bitmap_bit_p (always_executed, i));
 }

--
2.25.1
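To illustrate the intent of the new profile-count check, consider a hand-written sketch like the one below (this is not the added testcase, and whether the guard actually fires depends on the real or estimated profile): an invariant computation sitting in a block that the profile says runs less often than the loop is entered.

  /* Illustrative only: the multiplication is loop-invariant, but it lives
     in a block whose count can be below the preheader's, so with this
     change RTL LIM leaves it inside the loop instead of hoisting cold
     code onto the hot path.  */
  extern void consume (long);

  void
  foo (long *a, long n, int rare)
  {
    for (long i = 0; i < n; i++)
      if (__builtin_expect (rare, 0))
	consume (a[0] * 12345);
  }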
[PATCH 2/3] Fix incorrect loop exit edge probability [PR103270]
r12-4526 cancelled jump thread path rotates loop. It exposes a issue in profile-estimate when predict_extra_loop_exits, outer loop's exit edge is marked as inner loop's extra loop exit and set with incorrect prediction, then a hot inner loop will become cold loop finally through optimizations, this patch add loop check when searching extra exit edges to avoid unexpected predict_edge from predict_paths_for_bb. Regression tested on P8LE, OK for master? gcc/ChangeLog: PR middle-end/103270 * predict.c (predict_extra_loop_exits): Add loop parameter. (predict_loops): Call with loop argument. gcc/testsuite/ChangeLog: PR middle-end/103270 * gcc.dg/pr103270.c: New test. --- gcc/predict.c | 10 ++ gcc/testsuite/gcc.dg/pr103270.c | 19 +++ 2 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr103270.c diff --git a/gcc/predict.c b/gcc/predict.c index 3cb4e3c0eb5..5b6e0cf722b 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -1859,7 +1859,7 @@ predict_iv_comparison (class loop *loop, basic_block bb, exits to predict them using PRED_LOOP_EXTRA_EXIT. */ static void -predict_extra_loop_exits (edge exit_edge) +predict_extra_loop_exits (class loop *loop, edge exit_edge) { unsigned i; bool check_value_one; @@ -1912,12 +1912,14 @@ predict_extra_loop_exits (edge exit_edge) continue; if (EDGE_COUNT (e->src->succs) != 1) { - predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); + predict_paths_leading_to_edge (e, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, +loop); continue; } FOR_EACH_EDGE (e1, ei, e->src->preds) - predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN); + predict_paths_leading_to_edge (e1, PRED_LOOP_EXTRA_EXIT, NOT_TAKEN, + loop); } } @@ -2008,7 +2010,7 @@ predict_loops (void) ex->src->index, ex->dest->index); continue; } - predict_extra_loop_exits (ex); + predict_extra_loop_exits (loop, ex); if (number_of_iterations_exit (loop, ex, &niter_desc, false, false)) niter = niter_desc.niter; diff --git a/gcc/testsuite/gcc.dg/pr103270.c b/gcc/testsuite/gcc.dg/pr103270.c new file mode 100644 index 000..819310e360e --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr103270.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-profile_estimate" } */ + +void test(int a, int* i) +{ + for (; a < 5; ++a) +{ + int b = 0; + int c = 0; + for (; b != -11; b--) + for (int d = 0; d ==0; d++) + { + *i += c & a; + c = b; + } +} +} + +/* { dg-final { scan-tree-dump-not "extra loop exit heuristics of edge\[^:\]*:" "profile_estimate"} } */ -- 2.25.1
[PATCH 3/3] Fix loop split incorrect count and probability
In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two kind of split. split_loop only works for single loop and insert edge at exit when split, while split_loop_on_cond is not limited to single loop and insert edge at latch when split. Both split behavior should consider loop count and probability update. For split_loop, loop split condition is moved in front of loop1 and loop2; But split_loop_on_cond moves the condition between loop1 and loop2, this patch does: 1) profile count proportion for both original loop and copied loop without dropping down the true branch's count; 2) probability update in the two loops and between the two loops. Regression tested pass, OK for master? Changes diff for split_loop and split_loop_on_cond cases: 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit ... [local count: 118111600]: if (beg_5(D) < end_8(D)) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: if (beg2_6(D) < c_9(D)) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [100.00%] +goto ; [67.00%] - [local count: 105119324]: + [local count: 34689377]: _25 = beg_5(D) + 1; _26 = end_8(D) - beg_5(D); _27 = beg2_6(D) + _26; _28 = MIN_EXPR ; - [local count: 955630225]: + [local count: 315357973]: # i_16 = PHI # j_17 = PHI printf ("a: %d %d\n", i_16, j_17); i_11 = i_16 + 1; j_12 = j_17 + 1; if (j_12 < _28) -goto ; [89.00%] +goto ; [29.37%] else -goto ; [11.00%] +goto ; [70.63%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [local count: 105119324]: + [local count: 70429947]: # i_22 = PHI # j_23 = PHI [local count: 955630225]: # i_2 = PHI # j_1 = PHI i_20 = i_2 + 1; j_21 = j_1 + 1; if (end_8(D) > i_20) -goto ; [89.00%] +goto ; [59.63%] else -goto ; [11.00%] +goto ; [40.37%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] [local count: 105119324]: # i_29 = PHI # j_30 = PHI if (end_8(D) > i_29) goto ; [80.00%] else goto ; [20.00%] [local count: 105119324]: [local count: 118111600]: return 0; } [local count: 118111600]: - if (beg_5(D) < end_8(D)) + _1 = end_6(D) - beg_7(D); + j_9 = _1 + beg2_8(D); + if (end_6(D) > beg_7(D)) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: - if (beg2_6(D) < c_9(D)) -goto ; [100.00%] + if (j_9 >= c_11(D)) +goto ; [33.00%] else -goto ; [100.00%] +goto ; [67.00%] - [local count: 105119324]: - _25 = beg_5(D) + 1; - _26 = end_8(D) - beg_5(D); - _27 = beg2_6(D) + _26; - _28 = MIN_EXPR ; - - [local count: 955630225]: - # i_16 = PHI - # j_17 = PHI - printf ("a: %d %d\n", i_16, j_17); - i_11 = i_16 + 1; - j_12 = j_17 + 1; - if (j_12 < _28) -goto ; [89.00%] + [local count: 34689377]: + _27 = end_6(D) + -1; + _28 = beg_7(D) - end_6(D); + _29 = j_9 + _28; + _30 = _29 + 1; + _31 = MAX_EXPR ; + + [local count: 315357973]: + # i_18 = PHI + # j_19 = PHI + printf ("a: %d %d\n", i_18, j_19); + i_13 = i_18 + -1; + j_14 = j_19 + -1; + if (j_14 >= _31) +goto ; [29.37%] else -goto ; [11.00%] +goto ; [70.63%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [local count: 105119324]: - # i_22 = PHI - # j_23 = PHI + [local count: 70429947]: + # i_24 = PHI + # j_25 = PHI [local count: 955630225]: - # i_2 = PHI - # j_1 = PHI - i_20 = i_2 + 1; - j_21 = j_1 + 1; - if (end_8(D) > i_20) + # i_3 = PHI + # j_2 = PHI + i_22 = i_3 + -1; + j_23 = j_2 + -1; + if (beg_7(D) < i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] [local count: 105119324]: - # i_29 = PHI - # j_30 = PHI - if (end_8(D) > i_29) + # i_32 = 
PHI + # j_33 = PHI + if (beg_7(D) < i_32) goto ; [80.00%] else goto ; [20.00%] [local count: 105119324]: [local count: 118111600]: return 0; } 2) diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] [local count: 315357972]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +
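To make points 1) and 2) above concrete, here is a minimal sketch of the count bookkeeping using GCC's profile_count/profile_probability API (variable names are illustrative, not the patch hunks themselves): each loop copy is scaled by the probability of the split condition, so the two copies together still account for the original count instead of the true branch being dropped down.

  /* Sketch only: cond_prob is the probability of the split condition.  */
  profile_count orig = bb->count;
  bb->count      = orig.apply_probability (cond_prob);            /* first loop copy */
  bb_copy->count = orig.apply_probability (cond_prob.invert ());  /* second loop copy */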
Re: [PATCH v8 2/2] Don't move cold code out of loop by checking bb count
On 2021/12/7 20:17, Richard Biener wrote: >>> + class loop *coldest_loop = coldest_outermost_loop[loop->num]; >>> + if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) >>> +{ >>> + class loop *hotter_loop = hotter_than_inner_loop[loop->num]; >>> + if (!hotter_loop >>> + || loop_depth (hotter_loop) < loop_depth (outermost_loop)) >>> + return outermost_loop; >>> + >>> + /* hotter_loop is between OUTERMOST_LOOP and LOOP like: >>> + [loop tree root, ..., coldest_loop, ..., outermost_loop, ..., >>> + hotter_loop, second_coldest_loop, ..., loop] >>> + return second_coldest_loop to be the hoist target. */ >>> + class loop *aloop; >>> + for (aloop = hotter_loop->inner; aloop; aloop = aloop->next) >>> + if (flow_loop_nested_p (aloop, loop)) >> should be: >> >> if (aloop == loop || flow_loop_nested_p (aloop, loop)) > OK with that fixed. > > Are necessary prerequesites committed to avoid regressions? > I guess we need to keep a watchful eye and eventually revert > (or gate with a --param disabled by default) the new behavior if > severe regressions are discovered. > > Thanks and sorry for the repeated delays. > Richard. > Thanks for your review, I learned quite a lot and gained very useful comments & help through the period :) There are still 3 patches required to avoid regression or so, I've reorganized them and sent it out. https://gcc.gnu.org/pipermail/gcc-patches/2021-December/586371.html In addition, cooked the patch to add option for disable/enable it. Is it OK to merge it to current patch? [PATCH] Add option -fhoist-to-cold-loop gcc/ChangeLog: * common.opt: New. * loop-invariant.c (find_invariants_bb): * tree-ssa-loop-im.c (get_coldest_out_loop): (can_sm_ref_p): (loop_invariant_motion_in_fun): --- gcc/common.opt | 4 gcc/loop-invariant.c | 2 +- gcc/tree-ssa-loop-im.c | 33 ++--- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/gcc/common.opt b/gcc/common.opt index b921f5e3b25..62b82bd8b95 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1171,6 +1171,10 @@ fcode-hoisting Common Var(flag_code_hoisting) Optimization Enable code hoisting. +fhoist-to-cold-loop +Common Var(flag_hoist_to_cold_loop) Init(1) Optimization +Enable hoisting code to cold loop. + fcombine-stack-adjustments Common Var(flag_combine_stack_adjustments) Optimization Looks for opportunities to reduce stack adjustments and stack references. diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c index 5c3be7bf0eb..75b9dd47cd7 100644 --- a/gcc/loop-invariant.c +++ b/gcc/loop-invariant.c @@ -1189,7 +1189,7 @@ find_invariants_bb (class loop *loop, basic_block bb, bool always_reached, rtx_insn *insn; basic_block preheader = loop_preheader_edge (loop)->src; - if (preheader->count > bb->count) + if (flag_hoist_to_cold_loop && preheader->count > bb->count) return; FOR_BB_INSNS (bb, insn) diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 565ee62d3f7..d745f66851b 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -450,6 +450,9 @@ static class loop * get_coldest_out_loop (class loop *outermost_loop, class loop *loop, basic_block curr_bb) { + if (!flag_hoist_to_cold_loop) +return outermost_loop; + gcc_assert (outermost_loop == loop || flow_loop_nested_p (outermost_loop, loop)); @@ -3031,8 +3034,9 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref) /* Verify whether the candidate is hot for LOOP. Only do store motion if the candidate's profile count is hot. Statement in cold BB shouldn't be moved out of it's loop_father. 
*/ - if (!for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body (loop))) -return false; + if (flag_hoist_to_cold_loop) +if (!for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body (loop))) + return false; return true; } @@ -3373,8 +3377,11 @@ tree_ssa_lim_finalize (void) free (bb_loop_postorder); - coldest_outermost_loop.release (); - hotter_than_inner_loop.release (); + if (flag_hoist_to_cold_loop) +{ + coldest_outermost_loop.release (); + hotter_than_inner_loop.release (); +} } /* Moves invariants from loops. Only "expensive" invariants are moved out -- @@ -3396,13 +3403,17 @@ loop_invariant_motion_in_fun (function *fun, bool store_motion) /* Pre-compute coldest outermost loop and nearest hotter loop of each loop. */ - class loop *loop; - coldest_outermost_loop.create (number_of_loops (cfun)); - coldest_outermost_loop.safe_grow_cleared (number_of_loops (cfun)); - hotter_than_inner_loop.create (number_of_loops (cfun)); - hotter_than_inner_loop.safe_grow_cleared (number_of_loops (cfun)); - for (loop = current_loops->tree_root->inner; loop != NULL; loop = loop->next) -fill_coldest_and_hotter_out_loop (loop, NULL, loop); + if (flag_hoist_to_cold_loop)
[PATCH] rs6000: powerpc suboptimal boolean test of contiguous bits [PR102239]
Add specialized version to combine two instructions from 9: {r123:CC=cmp(r124:DI&0x6,0);clobber scratch;} REG_DEAD r124:DI 10: pc={(r123:CC==0)?L15:pc} REG_DEAD r123:CC to: 10: {pc={(r123:DI&0x6==0)?L15:pc};clobber scratch;clobber %0:CC;} then split2 will split it to one rotate dot instruction (to save one rotate back instruction) as shifted result doesn't matter when comparing to 0 in CCEQmode. Bootstrapped and regression tested pass on Power 8/9/10, OK for master? gcc/ChangeLog: PR target/102239 * config/rs6000/rs6000.md (*anddi3_insn_dot): New. gcc/testsuite/ChangeLog: PR target/102239 * gcc.target/powerpc/pr102239.c: New test. --- gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c | 7 gcc/config/rs6000/rs6000.md | 38 + gcc/testsuite/gcc.target/powerpc/pr102239.c | 13 +++ 4 files changed, 59 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102239.c diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 14f6b313105..3644c524376 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -73,6 +73,7 @@ extern int expand_block_move (rtx[], bool); extern bool expand_block_compare (rtx[]); extern bool expand_strn_compare (rtx[], int); extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode); +extern bool rs6000_is_valid_rotate_dot_mask (rtx mask, machine_mode mode); extern bool rs6000_is_valid_and_mask (rtx, machine_mode); extern bool rs6000_is_valid_shift_mask (rtx, rtx, machine_mode); extern bool rs6000_is_valid_insert_mask (rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 5e129986516..57a38cf954a 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -11606,6 +11606,13 @@ rs6000_is_valid_mask (rtx mask, int *b, int *e, machine_mode mode) return true; } +bool +rs6000_is_valid_rotate_dot_mask (rtx mask, machine_mode mode) +{ + int nb, ne; + return rs6000_is_valid_mask (mask, &nb, &ne, mode) && nb >= ne && ne > 0; +} + /* Return whether MASK (a CONST_INT) is a valid mask for any rlwinm, rldicl, or rldicr instruction, to implement an AND with it in mode MODE. 
*/ diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 6bec2bddbde..014dc9612ea 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -3762,6 +3762,44 @@ (define_insn_and_split "*and3_2insn_dot2" (set_attr "dot" "yes") (set_attr "length" "8,12")]) +(define_insn_and_split "*anddi3_insn_dot" + [(set (pc) +(if_then_else (eq (and:DI (match_operand:DI 1 "gpc_reg_operand" "%r,r") + (match_operand:DI 2 "const_int_operand" "n,n")) + (const_int 0)) + (label_ref (match_operand 3 "")) + (pc))) + (clobber (match_scratch:DI 0 "=r,r")) + (clobber (reg:CC CR0_REGNO))] + "rs6000_is_valid_rotate_dot_mask (operands[2], DImode) + && TARGET_POWERPC64" + "#" + "&& reload_completed" + [(pc)] +{ + int nb, ne; + if (rs6000_is_valid_mask (operands[2], &nb, &ne, DImode) + && nb >= ne + && ne > 0) + { + unsigned HOST_WIDE_INT val = INTVAL (operands[2]); + int shift = 63 - nb; + rtx tmp = gen_rtx_ASHIFT (DImode, operands[1], GEN_INT (shift)); + tmp = gen_rtx_AND (DImode, tmp, GEN_INT (val << shift)); + rtx cr0 = gen_rtx_REG (CCmode, CR0_REGNO); + rs6000_emit_dot_insn (operands[0], tmp, 1, cr0); + rtx loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[3]); + rtx cond = gen_rtx_EQ (CCEQmode, cr0, const0_rtx); + rtx ite = gen_rtx_IF_THEN_ELSE (VOIDmode, cond, loc_ref, pc_rtx); + emit_jump_insn (gen_rtx_SET (pc_rtx, ite)); + DONE; + } + else + FAIL; +} + [(set_attr "type" "shift") + (set_attr "dot" "yes") + (set_attr "length" "8,12")]) (define_expand "3" [(set (match_operand:SDI 0 "gpc_reg_operand") diff --git a/gcc/testsuite/gcc.target/powerpc/pr102239.c b/gcc/testsuite/gcc.target/powerpc/pr102239.c new file mode 100644 index 000..1bafc9fe18e --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr102239.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2" } */ + +void foo(long arg) +{ + if (arg & ((1UL << 33) | (1UL << 34))) +asm volatile("# if"); + else +asm volatile("# else"); +} + +/* { dg-final { scan-assembler-times "rldicr." 1 } } */ -- 2.25.1
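The key observation behind dropping the rotate-back instruction can be written out in plain C (names are illustrative; MASK and SHIFT stand for the values the split computes from nb): for a compare against zero, testing the shifted-and-masked value gives the same answer as testing the original bits, as long as no mask bit is shifted out.

  #include <stdint.h>

  /* Sketch: same truth value as ((x & mask) == 0) whenever (mask << shift)
     keeps all of mask's set bits, so a single record-form AND of the
     shifted value suffices.  */
  static inline int
  bits_clear_after_shift (uint64_t x, uint64_t mask, int shift)
  {
    return (((x << shift) & (mask << shift)) == 0);
  }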
Re: [PATCH 3/3] Fix loop split incorrect count and probability
On 2021/12/9 07:47, Jeff Law wrote: >> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >> index 3f6ad046623..33128061aab 100644 >> --- a/gcc/tree-ssa-loop-split.c >> +++ b/gcc/tree-ssa-loop-split.c >> >> @@ -607,6 +610,38 @@ split_loop (class loop *loop1) >> tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge >> (loop1)); >> patch_loop_exit (loop1, guard_stmt, guard_next, newend, >> initial_true); >> + update_ssa (TODO_update_ssa); >> + >> + /* Proportion first loop's bb counts except those dominated by true >> + branch to avoid drop 1s down. */ >> + basic_block *bbs1, *bbs2; >> + bbs1 = get_loop_body (loop1); >> + unsigned j; >> + for (j = 0; j < loop1->num_nodes; j++) >> + if (bbs1[j] == loop1->latch >> + || !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest)) >> + bbs1[j]->count >> + = bbs1[j]->count.apply_probability (true_edge->probability); >> + free (bbs1); > It looks like there's two copies of this code in this patch, one in > split_loop and the other in do_split_loop_on_cond. Would it make sense > to factor it out into its own little function? > > >> + >> + /* Proportion second loop's bb counts except those dominated by >> false >> + branch to avoid drop 1s down. */ >> + basic_block bbi_copy = get_bb_copy (false_edge->dest); >> + bbs2 = get_loop_body (loop2); >> + for (j = 0; j < loop2->num_nodes; j++) >> + if (bbs2[j] == loop2->latch >> + || !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy)) >> + bbs2[j]->count = bbs2[j]->count.apply_probability ( >> + true_edge->probability.invert ()); >> + free (bbs2); > Similarly for this block of code. > > If those can be reasonably factored out into two helper functions to be > called from split_loop and do_split_loop_on_cond, then this is OK with > the refactoring. > > jeff Thanks for the comments, updated as below. Will commit this patchset and the approved patch for LIM if there are no objections: [PATCH v2 3/3] Fix loop split incorrect count and probability In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two kind of split. split_loop only works for single loop and insert edge at exit when split, while split_loop_on_cond is not limited to single loop and insert edge at latch when split. Both split behavior should consider loop count and probability update. For split_loop, loop split condition is moved in front of loop1 and loop2; But split_loop_on_cond moves the condition between loop1 and loop2, this patch does: 1) profile count proportion for both original loop and copied loop without dropping down the true branch's count; 2) probability update in the two loops and between the two loops. Regression tested pass, OK for master? Changes diff for split_loop and split_loop_on_cond cases: 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit ... 
[local count: 118111600]: _1 = end_6(D) - beg_7(D); j_9 = _1 + beg2_8(D); if (end_6(D) > beg_7(D)) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: if (j_9 >= c_11(D)) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [100.00%] +goto ; [67.00%] - [local count: 105119324]: + [local count: 34689377]: _27 = end_6(D) + -1; _28 = beg_7(D) - end_6(D); _29 = j_9 + _28; _30 = _29 + 1; _31 = MAX_EXPR ; - [local count: 955630225]: + [local count: 315357973]: # i_18 = PHI # j_19 = PHI printf ("a: %d %d\n", i_18, j_19); i_13 = i_18 + -1; j_14 = j_19 + -1; if (j_14 >= _31) -goto ; [89.00%] +goto ; [29.37%] else -goto ; [11.00%] +goto ; [70.63%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [local count: 105119324]: + [local count: 70429947]: # i_24 = PHI # j_25 = PHI [local count: 955630225]: # i_3 = PHI # j_2 = PHI i_22 = i_3 + -1; j_23 = j_2 + -1; if (beg_7(D) < i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] [local count: 105119324]: # i_32 = PHI # j_33 = PHI if (beg_7(D) < i_32) goto ; [80.00%] else goto ; [20.00%] [local count: 105119324]: [local count: 118111600]: return 0; } 2) diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] [local count: 315357972]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_
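A possible shape for the little helper Jeff asks for above (the name and exact signature here are mine, not necessarily what was committed): it folds the two near-identical blocks from split_loop and do_split_loop_on_cond into one function.

  /* Scale LOOP's latch and every block of LOOP not dominated by DOM
     by PROB, mirroring the two copies of code quoted above.  */
  static void
  scale_loop_bbs_not_dominated (class loop *loop, basic_block dom,
				profile_probability prob)
  {
    basic_block *bbs = get_loop_body (loop);
    for (unsigned j = 0; j < loop->num_nodes; j++)
      if (bbs[j] == loop->latch
	  || !dominated_by_p (CDI_DOMINATORS, bbs[j], dom))
	bbs[j]->count = bbs[j]->count.apply_probability (prob);
    free (bbs);
  }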
Re: [PATCH v4] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
On 2022/8/16 14:53, Kewen.Lin wrote: Hi Xionghu, Thanks for the updated version of patch, some comments are inlined. on 2022/8/11 14:15, Xionghu Luo wrote: On 2022/8/11 01:07, Segher Boessenkool wrote: On Wed, Aug 10, 2022 at 02:39:02PM +0800, Xionghu Luo wrote: On 2022/8/9 11:01, Kewen.Lin wrote: I have some concern on those changed "altivec_*_direct", IMHO the suffix "_direct" is normally to indicate the define_insn is mapped to the corresponding hw insn directly. With this change, for example, altivec_vmrghb_direct can be mapped into vmrghb or vmrglb, this looks misleading. Maybe we can add the corresponding _direct_le and _direct_be versions, both are mapped into the same insn but have different RTL patterns. Looking forward to Segher's and David's suggestions. Thanks! Do you mean same RTL patterns with different hw insn? A pattern called altivec_vmrghb_direct_le should always emit a vmrghb instruction, never a vmrglb instead. Misleading names are an expensive problem. Thanks. Then on LE platforms, if user calls altivec_vmrghw,it will be expanded to RTL (vec_select (vec_concat (R0 R1 (0 4 1 5))), and finally matched to altivec_vmrglw_direct_v4si_le with ASM "vmrglw". For BE just strict forward, seems more clear :-), OK for master? [PATCH v3] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069] v3: rename altivec_vmrghb_direct_le to altivec_vmrglb_direct_le to match the actual output ASM vmrglb. Likewise for all similar xxx_direct_le patterns. v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{64} Linux(Thanks to Kewen). gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb_direct): Remove. (altivec_vmrghb_direct_be): New pattern for BE. (altivec_vmrglb_direct_le): New pattern for LE. (altivec_vmrghh_direct): Remove. (altivec_vmrghh_direct_be): New pattern for BE. (altivec_vmrglh_direct_le): New pattern for LE. (altivec_vmrghw_direct_): Remove. (altivec_vmrghw_direct__be): New pattern for BE. (altivec_vmrglw_direct__le): New pattern for LE. (altivec_vmrglb_direct): Remove. (altivec_vmrglb_direct_be): New pattern for BE. (altivec_vmrghb_direct_le): New pattern for LE. (altivec_vmrglh_direct): Remove. (altivec_vmrglh_direct_be): New pattern for BE. (altivec_vmrghh_direct_le): New pattern for LE. (altivec_vmrglw_direct_): Remove. (altivec_vmrglw_direct__be): New pattern for BE. (altivec_vmrghw_direct__le): New pattern for LE. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Adjust. * config/rs6000/vsx.md: Likewise. 
gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md | 223 ++-- gcc/config/rs6000/rs6000.cc | 36 ++-- gcc/config/rs6000/vsx.md | 24 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 120 +++ 4 files changed, 305 insertions(+), 98 deletions(-) create mode 100644 gcc/testsuite/g++.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..78245f470e9 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,15 +1144,17 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + rtvec v = gen_rtvec (16, GEN_INT (0), GEN_INT (16), GEN_INT (1), GEN_INT (17), + GEN_INT (2), GEN_INT (18), GEN_IN
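A small source-level illustration of the nested vec_select simplification described in the commit message (this is not the pr106069 testcase; it only shows the shape combine can now fold identically on BE and LE):

  #include <altivec.h>

  int
  merge_then_extract (vector int a, vector int b)
  {
    /* vec_mergeh (a, b) is vec_select (vec_concat (a, b), [0 4 1 5]);
       extracting lane 3 of the result can be folded to reading lane 1
       of b, so no merge instruction needs to survive.  */
    vector int m = vec_mergeh (a, b);
    return vec_extract (m, 3);
  }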
Ping: [PATCH v4] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069]
Hi Segher, I'd like to resend and ping for this patch. Thanks. From 23bffdacdf0eb1140c7a3571e6158797f4818d57 Mon Sep 17 00:00:00 2001 From: Xionghu Luo Date: Thu, 4 Aug 2022 03:44:58 + Subject: [PATCH v4] rs6000: Fix incorrect RTL for Power LE when removing the UNSPECS [PR106069] v4: Update per comments. v3: rename altivec_vmrghb_direct_le to altivec_vmrglb_direct_le to match the actual output ASM vmrglb. Likewise for all similar xxx_direct_le patterns. v2: Split the direct pattern to be and le with same RTL but different insn. The native RTL expression for vec_mrghw should be same for BE and LE as they are register and endian-independent. So both BE and LE need generate exactly same RTL with index [0 4 1 5] when expanding vec_mrghw with vec_select and vec_concat. (set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then combine pass could do the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 also on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check need only once at ASM generation finally. ASM would be better due to nested vec_select simplified to simple scalar load. Regression tested pass for Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux. gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb_direct): Remove. (altivec_vmrghb_direct_be): New pattern for BE. (altivec_vmrghb_direct_le): New pattern for LE. (altivec_vmrghh_direct): Remove. (altivec_vmrghh_direct_be): New pattern for BE. (altivec_vmrghh_direct_le): New pattern for LE. (altivec_vmrghw_direct_): Remove. (altivec_vmrghw_direct__be): New pattern for BE. (altivec_vmrghw_direct__le): New pattern for LE. (altivec_vmrglb_direct): Remove. (altivec_vmrglb_direct_be): New pattern for BE. (altivec_vmrglb_direct_le): New pattern for LE. (altivec_vmrglh_direct): Remove. (altivec_vmrglh_direct_be): New pattern for BE. (altivec_vmrglh_direct_le): New pattern for LE. (altivec_vmrglw_direct_): Remove. (altivec_vmrglw_direct__be): New pattern for BE. (altivec_vmrglw_direct__le): New pattern for LE. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Adjust. * config/rs6000/vsx.md: Likewise. gcc/testsuite/ChangeLog: PR target/106069 * g++.target/powerpc/pr106069.C: New test. Signed-off-by: Xionghu Luo --- gcc/config/rs6000/altivec.md| 222 ++-- gcc/config/rs6000/rs6000.cc | 24 +-- gcc/config/rs6000/vsx.md| 28 +-- gcc/testsuite/g++.target/powerpc/pr106069.C | 118 +++ 4 files changed, 307 insertions(+), 85 deletions(-) create mode 100644 gcc/testsuite/g++.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..c6a381908cb 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,15 +1144,16 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) -std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + if (BYTES_BIG_ENDIAN) +emit_insn ( + gen_altivec_vmrghb_direct_be (operands[0], operands[1], operands[2])); + else +emit_insn ( + gen_altivec_vmrglb_direct_le (operands[0], operands[2], operands[1])); DONE; }) -(define_insn "altivec_vmrghb_direct" +(define_insn "altivec_vmrghb_direct_be" [(set (match_operand:V16QI 0 "register_operand" "=v") (vec_select:V16QI (vec_concat:V32QI @@ -1166,7 +1167,25 @@ (define_insn "altivec_vmrghb_direct" (const_int 5) (const_int 21) (const_int 6) (const_int 22) (const_int 7) (const_int 23)])))] - "TARGET_ALTIVEC" + "TARGET_ALTIVEC && BYTES_BIG_ENDIAN" + "vmrghb %0,%1,%2" + [(set_attr "type" "vecperm")]) + +(define_insn "altivec_vmrghb_direct_le" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 2 "register_operand" "v")
Ping: [PATCH 0/4] rs6000: Enable variable vec_insert with IFN VEC_SET
Ping. On 2020/10/10 16:08, Xionghu Luo wrote: Originated from https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554240.html with patch split and some refinement per review comments. Patch of IFN VEC_SET for ARRAY_REF(VIEW_CONVERT_EXPR) is committed, this patch set enables expanding IFN VEC_SET for Power9 and Power8 with specfic instruction sequences. Xionghu Luo (4): rs6000: Change rs6000_expand_vector_set param rs6000: Support variable insert and Expand vec_insert in expander [PR79251] rs6000: Enable vec_insert for P8 with rs6000_expand_vector_set_var_p8 rs6000: Update testcases' instruction count gcc/config/rs6000/rs6000-c.c | 44 +++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 3 +- gcc/config/rs6000/rs6000.c| 181 +- gcc/config/rs6000/vector.md | 4 +- .../powerpc/fold-vec-insert-char-p8.c | 8 +- .../powerpc/fold-vec-insert-char-p9.c | 12 +- .../powerpc/fold-vec-insert-double.c | 11 +- .../powerpc/fold-vec-insert-float-p8.c| 6 +- .../powerpc/fold-vec-insert-float-p9.c| 10 +- .../powerpc/fold-vec-insert-int-p8.c | 6 +- .../powerpc/fold-vec-insert-int-p9.c | 11 +- .../powerpc/fold-vec-insert-longlong.c| 10 +- .../powerpc/fold-vec-insert-short-p8.c| 6 +- .../powerpc/fold-vec-insert-short-p9.c| 8 +- .../gcc.target/powerpc/pr79251-run.c | 28 +++ gcc/testsuite/gcc.target/powerpc/pr79251.h| 19 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 18 ++ .../gcc.target/powerpc/vsx-builtin-7.c| 4 +- 20 files changed, 337 insertions(+), 71 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p8.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p9.c -- Thanks, Xionghu
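For context, the kind of source the series is about looks like the sketch below (illustrative, not one of the added testcases): vec_insert with a run-time index, which the series expands through IFN VEC_SET into the Power9/Power8 specific instruction sequences mentioned above.

  #include <altivec.h>

  vector int
  set_lane (vector int v, int val, unsigned long idx)
  {
    /* IDX is not a compile-time constant.  */
    return vec_insert (val, v, idx);
  }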
Ping^2: [PATCH 0/4] rs6000: Enable variable vec_insert with IFN VEC_SET
Ping^2, thanks. On 2020/11/5 09:34, Xionghu Luo via Gcc-patches wrote: Ping. On 2020/10/10 16:08, Xionghu Luo wrote: Originated from https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554240.html with patch split and some refinement per review comments. Patch of IFN VEC_SET for ARRAY_REF(VIEW_CONVERT_EXPR) is committed, this patch set enables expanding IFN VEC_SET for Power9 and Power8 with specfic instruction sequences. Xionghu Luo (4): rs6000: Change rs6000_expand_vector_set param rs6000: Support variable insert and Expand vec_insert in expander [PR79251] rs6000: Enable vec_insert for P8 with rs6000_expand_vector_set_var_p8 rs6000: Update testcases' instruction count gcc/config/rs6000/rs6000-c.c | 44 +++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 3 +- gcc/config/rs6000/rs6000.c | 181 +- gcc/config/rs6000/vector.md | 4 +- .../powerpc/fold-vec-insert-char-p8.c | 8 +- .../powerpc/fold-vec-insert-char-p9.c | 12 +- .../powerpc/fold-vec-insert-double.c | 11 +- .../powerpc/fold-vec-insert-float-p8.c | 6 +- .../powerpc/fold-vec-insert-float-p9.c | 10 +- .../powerpc/fold-vec-insert-int-p8.c | 6 +- .../powerpc/fold-vec-insert-int-p9.c | 11 +- .../powerpc/fold-vec-insert-longlong.c | 10 +- .../powerpc/fold-vec-insert-short-p8.c | 6 +- .../powerpc/fold-vec-insert-short-p9.c | 8 +- .../gcc.target/powerpc/pr79251-run.c | 28 +++ gcc/testsuite/gcc.target/powerpc/pr79251.h | 19 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p8.c | 17 ++ gcc/testsuite/gcc.target/powerpc/pr79251.p9.c | 18 ++ .../gcc.target/powerpc/vsx-builtin-7.c | 4 +- 20 files changed, 337 insertions(+), 71 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p8.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.p9.c -- Thanks, Xionghu
Re: [PATCH] rs6000: Don't split constant operator add before reload, move to temp register for future optimization
Hi, On 2020/10/27 05:10, Segher Boessenkool wrote: > On Wed, Oct 21, 2020 at 03:25:29AM -0500, Xionghu Luo wrote: >> Don't split code from add3 for SDI to allow a later pass to split. > > This is very problematic. > >> This allows later logic to hoist out constant load in add instructions. > > Later logic should be able to do that any way (I do not say that works > perfectly, mind; it no doubt could be improved). > >> In loop, lis+ori could be hoisted out to improve performance compared with >> previous addis+addi (About 15% on typical case), weak point is >> one more register is used and one more instruction is generated. i.e.: > > Yes, better performance on one testcase, and worse code always :-( > >> addis 3,3,0x6765 >> addi 3,3,0x4321 >> >> => >> >> lis 9,0x6765 >> ori 9,9,0x4321 >> add 3,3,9 > > This is the typical kind of clumsy code you get if you generate RTL that > matches actual machine instructions too late ("split too late"). > > So, please make it possible to hoist 2-insn-immediate sequences out of > loops, *without* changing them to fake 1-insn things. > As we discussed offline, addis+addi is not quite possible to be hoisted out of loops as not invariant, update the patch as below, thanks: [PATCH v2] rs6000: Split constant operator add in split1 instead of expander Currently, ADD with positive 32bit constant is split to addis+addi in expander, which seems too early to optimize the constant load out of loop compared with other targets. This patch use a temp register to load the constant and do two register addition in expander same as negative 32bit constant add. This allows loop invariant pass to hoist out constant load before add instructions, then split1 pass will split the load to lis+ori after combine. Performance could be improved by 15% on typical case compared with previous addis+addi in loop. (1) 0x67654321 addis 3,3,0x6765 addi 3,3,0x4321 => lis 9,0x6765 ori 9,9,0x4321 add 3,3,9 (2) 0x8fff addis 9,9,0x1 addi 3,9,-28673 => li 10,0 ori 10,10,0x8fff add 3,3,10 Regression and bootstrap tested pass on P8LE. gcc/ChangeLog: 2020-10-21 Xiong Hu Luo * config/rs6000/rs6000.md (add3 for SDI): Don't split before reload, move constant to temp register for add. (define_split): Split const from split1. gcc/testsuite/ChangeLog: 2020-10-21 Xiong Hu Luo * gcc.target/powerpc/add-const.c: New test. --- gcc/config/rs6000/rs6000.md | 38 gcc/testsuite/gcc.target/powerpc/add-const.c | 18 ++ 2 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/add-const.c diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 5e5ad9f7c3d..b52e9555962 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -1750,18 +1750,26 @@ (define_expand "add3" if (CONST_INT_P (operands[2]) && !add_operand (operands[2], mode)) { - rtx tmp = ((!can_create_pseudo_p () - || rtx_equal_p (operands[0], operands[1])) -? operands[0] : gen_reg_rtx (mode)); - - /* Adding a constant to r0 is not a valid insn, so use a different -strategy in that case. */ - if (reg_or_subregno (operands[1]) == 0 || reg_or_subregno (tmp) == 0) + bool reg0 = reg_or_subregno (operands[0]) == 0; + if (can_create_pseudo_p () || reg0) { - if (operands[0] == operands[1]) - FAIL; - rs6000_emit_move (operands[0], operands[2], mode); - emit_insn (gen_add3 (operands[0], operands[1], operands[0])); + + rtx tmp = (!can_create_pseudo_p () + || rtx_equal_p (operands[0], operands[1])) + ? 
operands[0] : gen_reg_rtx (mode); + + /* Adding a constant to r0 is not a valid insn, so use a different +strategy in that case. See stack-limit.c, need generate +"24: %0:DI=0x20fa0; 25: %0:DI=%14:DI+%0:DI" in pro_and_epilogue +when can_create_pseudo_p is false. */ + if (reg0 == 0 || reg_or_subregno (tmp) == 0) + { + if (operands[0] == operands[1]) + FAIL; + } + + rs6000_emit_move (tmp, operands[2], mode); + emit_insn (gen_add3 (operands[0], operands[1], tmp)); DONE; } @@ -1775,8 +1783,8 @@ (define_expand "add3" /* The ordering here is important for the prolog expander. When space is allocated from the stack, adding 'low' first may produce a temporary deallocation (which would be bad). */ - emit_insn (gen_add3 (tmp, operands[1], GEN_INT (rest))); - emit_insn (gen_add3 (operands[0], tmp, GEN_INT (low))); + emit_insn (gen_add3 (operands[0], operands[1], GE
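The kind of loop the change is aimed at can be sketched as below (illustrative, not the added add-const.c testcase): once the constant addend is materialized in a register by a separate lis+ori pair, loop-invariant motion can keep it outside the loop instead of re-forming it with addis+addi on every iteration.

  void
  accumulate (long *a, long n)
  {
    for (long i = 0; i < n; i++)
      a[i] += 0x67654321;  /* constant can be loaded once before the loop */
  }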
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/1 17:58, Richard Biener wrote: This fixes the CFG walk order of fill_always_executed_in to use RPO oder rather than the dominator based order computed by get_loop_body_in_dom_order. That fixes correctness issues with unordered dominator children. The RPO order computed by rev_post_order_and_mark_dfs_back_seme in its for-iteration mode is a good match for the algorithm. Xionghu, I've tried to only fix the CFG walk order issue and not change anything else with this so we have a more correct base to work against. The code still walks inner loop bodies up to loop depth times and thus is quadratic in the loop depth. Bootstrapped and tested on x86_64-unknown-linux-gnu, if you don't have any comments I plan to push this and then revisit what we were circling around. LGTM, thanks. Richard. 2021-09-01 Richard Biener PR tree-optimization/102155 * tree-ssa-loop-im.c (fill_always_executed_in_1): Iterate over a part of the RPO array and do not recurse here. Dump blocks marked as always executed. (fill_always_executed_in): Walk over the RPO array and process loops whose header we run into. (loop_invariant_motion_in_fun): Compute the first RPO using rev_post_order_and_mark_dfs_back_seme in iteration order and pass that to fill_always_executed_in. --- gcc/tree-ssa-loop-im.c | 136 ++--- 1 file changed, 73 insertions(+), 63 deletions(-) diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index d9f75d5025e..f3706dcdb8a 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3025,77 +3025,74 @@ do_store_motion (void) /* Fills ALWAYS_EXECUTED_IN information for basic blocks of LOOP, i.e. for each such basic block bb records the outermost loop for that execution of its header implies execution of bb. CONTAINS_CALL is the bitmap of - blocks that contain a nonpure call. */ + blocks that contain a nonpure call. The blocks of LOOP start at index + START of the RPO array of size N. */ static void -fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) +fill_always_executed_in_1 (function *fun, class loop *loop, + int *rpo, int start, int n, sbitmap contains_call) { - basic_block bb = NULL, *bbs, last = NULL; - unsigned i; - edge e; + basic_block last = NULL; class loop *inn_loop = loop; - if (ALWAYS_EXECUTED_IN (loop->header) == NULL) + for (int i = start; i < n; i++) { - bbs = get_loop_body_in_dom_order (loop); - - for (i = 0; i < loop->num_nodes; i++) - { - edge_iterator ei; - bb = bbs[i]; - - if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - last = bb; + basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]); + /* Stop when we iterated over all blocks in this loop. */ + if (!flow_bb_inside_loop_p (loop, bb)) + break; - if (bitmap_bit_p (contains_call, bb->index)) - break; + if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) + last = bb; - FOR_EACH_EDGE (e, ei, bb->succs) - { - /* If there is an exit from this BB. */ - if (!flow_bb_inside_loop_p (loop, e->dest)) - break; - /* Or we enter a possibly non-finite loop. */ - if (flow_loop_nested_p (bb->loop_father, - e->dest->loop_father) - && ! finite_loop_p (e->dest->loop_father)) - break; - } - if (e) - break; + if (bitmap_bit_p (contains_call, bb->index)) + break; - /* A loop might be infinite (TODO use simple loop analysis -to disprove this if possible). */ - if (bb->flags & BB_IRREDUCIBLE_LOOP) + edge_iterator ei; + edge e; + FOR_EACH_EDGE (e, ei, bb->succs) + { + /* If there is an exit from this BB. 
*/ + if (!flow_bb_inside_loop_p (loop, e->dest)) break; - - if (!flow_bb_inside_loop_p (inn_loop, bb)) + /* Or we enter a possibly non-finite loop. */ + if (flow_loop_nested_p (bb->loop_father, + e->dest->loop_father) + && ! finite_loop_p (e->dest->loop_father)) break; + } + if (e) + break; - if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; + /* A loop might be infinite (TODO use simple loop analysis +to disprove this if possible). */ + if (bb->flags & BB_IRREDUCIBLE_LOOP) + break; - /* In a loop that is always entered we may proceed anyway. -But record that we entered it and stop once we leave it. */ - inn_loop = bb->loop_father; - } - } + if (!flow_bb_inside_loop_p (inn_loop, bb)) + break; -
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/2 16:50, Richard Biener wrote: > On Thu, 2 Sep 2021, Richard Biener wrote: > >> On Thu, 2 Sep 2021, Xionghu Luo wrote: >> >>> >>> >>> On 2021/9/1 17:58, Richard Biener wrote: >>>> This fixes the CFG walk order of fill_always_executed_in to use >>>> RPO oder rather than the dominator based order computed by >>>> get_loop_body_in_dom_order. That fixes correctness issues with >>>> unordered dominator children. >>>> >>>> The RPO order computed by rev_post_order_and_mark_dfs_back_seme in >>>> its for-iteration mode is a good match for the algorithm. >>>> >>>> Xionghu, I've tried to only fix the CFG walk order issue and not >>>> change anything else with this so we have a more correct base >>>> to work against. The code still walks inner loop bodies >>>> up to loop depth times and thus is quadratic in the loop depth. >>>> >>>> Bootstrapped and tested on x86_64-unknown-linux-gnu, if you don't >>>> have any comments I plan to push this and then revisit what we >>>> were circling around. >>> >>> LGTM, thanks. >> >> I pushed it, thought again in the attempt to build a testcase and >> concluded I was wrong with the appearant mishandling of >> contains_call - get_loop_body_in_dom_order seems to be exactly >> correct for this specific case. So I reverted the commit again. > > And I figured what the > >/* In a loop that is always entered we may proceed anyway. > But record that we entered it and stop once we leave it. > */ > > comment was about. The code was present before the fix for PR78185 > and it was supposed to catch the case where the entered inner loop > is not finite. Just as the testcase from PR78185 shows the > stopping was done too late when the exit block was already marked > as to be always executed. A simpler fix for PR78185 would have been > to move > >if (!flow_bb_inside_loop_p (inn_loop, bb)) > break; > > before setting of last = bb. In fact the installed fix was more > pessimistic than that given it terminated already when entering > a possibly infinite loop. So we can improve that by doing > sth like which should also improve the situation for some of > the cases you were looking at? > > What remains is that we continue to stop when entering a > not always executed loop: > >if (bb->loop_father->header == bb) > { >if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) > break; Yes. This will cause blocks after inner loop missed to be check if they are actually ALWAYS_EXECUTED. I am afraid O(N^2) is inevitable here... > > that I can at this point only explain by possible efficiency > concerns? Any better idea on that one? >From experiment, early break from inner loop seems not cost shorter time than full inner loop walk. I will take more precise measurement and larger data set on the function fill_always_executed_in_1 if necessary. My previous v2 patch also tried to update inn_loop level by level when exiting from inn_loops, but it is proved to be unnecessary but you worried about the dominance order by get_loop_body_in_dom_order. 
> > I'm going to test the patch below which improves the situation for > > volatile int flag, bar; > double foo (double *valp) > { >double sum = 0; >for (int i = 0; i < 256; ++i) > { >for (int j = 0; j < 256; ++j) > bar = flag; >if (flag) > sum += 1.; >sum += *valp; > } >return sum; > } The patch still fails to handle cases like this: struct X { int i; int j; int k;}; volatile int m; void bar (struct X *x, int n, int l, int k) { for (int i = 0; i < l; i++) { if (k) for (int j = 0; j < l; j++) { if (m) x->i = m; else x->i = 1 - m; int *r = &x->k; int tem2 = *r; x->k += tem2 * j; } x->i = m; } } x->i is still not marked ALWAYS_EXECUTED for outer loop. > > Thanks, > Richard. > > diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c > index d9f75d5025e..f0c93d6a882 100644 > --- a/gcc/tree-ssa-loop-im.c > +++ b/gcc/tree-ssa-loop-im.c > @@ -3044,23 +3044,27 @@ fill_always_executed_in_1 (class loop *loop, > sbitmap contains_call) >edge_iterator ei; >bb = bbs[i]; > > + if (!flow_bb_inside_loop_p (inn_loop, bb)) > + { > +
Re: Ping ^ 2: [PATCH] rs6000: Expand fmod and remainder when built with fast-math [PR97142]
Resend the patch that addressed Will's comments. fmod/fmodf and remainder/remainderf could be expanded instead of library call when fast-math build, which is much faster. fmodf: fdivs f0,f1,f2 frizf0,f0 fnmsubs f1,f2,f0,f1 remainderf: fdivs f0,f1,f2 frinf0,f0 fnmsubs f1,f2,f0,f1 SPEC2017 Ofast P8LE: 511.povray_r +1.14%, 526.blender_r +1.72% gcc/ChangeLog: 2021-09-03 Xionghu Luo PR target/97142 * config/rs6000/rs6000.md (fmod3): New define_expand. (remainder3): Likewise. gcc/testsuite/ChangeLog: 2021-09-03 Xionghu Luo PR target/97142 * gcc.target/powerpc/pr97142.c: New test. --- gcc/config/rs6000/rs6000.md| 36 ++ gcc/testsuite/gcc.target/powerpc/pr97142.c | 35 + 2 files changed, 71 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr97142.c diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index c8cdc42533c..84820d3b5cb 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -4932,6 +4932,42 @@ (define_insn "fre" [(set_attr "type" "fp") (set_attr "isa" "*,")]) +(define_expand "fmod3" + [(use (match_operand:SFDF 0 "gpc_reg_operand")) + (use (match_operand:SFDF 1 "gpc_reg_operand")) + (use (match_operand:SFDF 2 "gpc_reg_operand"))] + "TARGET_HARD_FLOAT + && TARGET_FPRND + && flag_unsafe_math_optimizations" +{ + rtx div = gen_reg_rtx (mode); + emit_insn (gen_div3 (div, operands[1], operands[2])); + + rtx friz = gen_reg_rtx (mode); + emit_insn (gen_btrunc2 (friz, div)); + + emit_insn (gen_nfms4 (operands[0], operands[2], friz, operands[1])); + DONE; + }) + +(define_expand "remainder3" + [(use (match_operand:SFDF 0 "gpc_reg_operand")) + (use (match_operand:SFDF 1 "gpc_reg_operand")) + (use (match_operand:SFDF 2 "gpc_reg_operand"))] + "TARGET_HARD_FLOAT + && TARGET_FPRND + && flag_unsafe_math_optimizations" +{ + rtx div = gen_reg_rtx (mode); + emit_insn (gen_div3 (div, operands[1], operands[2])); + + rtx frin = gen_reg_rtx (mode); + emit_insn (gen_round2 (frin, div)); + + emit_insn (gen_nfms4 (operands[0], operands[2], frin, operands[1])); + DONE; + }) + (define_insn "*rsqrt2" [(set (match_operand:SFDF 0 "gpc_reg_operand" "=,wa") (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" ",wa")] diff --git a/gcc/testsuite/gcc.target/powerpc/pr97142.c b/gcc/testsuite/gcc.target/powerpc/pr97142.c new file mode 100644 index 000..e5306eb681b --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr97142.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast" } */ + +#include + +float test1 (float x, float y) +{ + return fmodf (x, y); +} + +double test2 (double x, double y) +{ + return fmod (x, y); +} + +float test3 (float x, float y) +{ + return remainderf (x, y); +} + +double test4 (double x, double y) +{ + return remainder (x, y); +} + +/* { dg-final { scan-assembler-not {\mbl fmod\M} } } */ +/* { dg-final { scan-assembler-not {\mbl fmodf\M} } } */ +/* { dg-final { scan-assembler-not {\mbl remainder\M} } } */ +/* { dg-final { scan-assembler-not {\mbl remainderf\M} } } */ +/* { dg-final { scan-assembler-times {\mfdiv\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfdivs\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfnmsub\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfnmsubs\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfriz\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mfrin\M} 2 } } */ -- 2.25.1
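The expansion implements the textbook identities below; the C sketch is only an illustration and ignores the special cases (zero divisor, infinities, the exact tie-breaking of remainder) that flag_unsafe_math_optimizations allows the expansion to skip.

  #include <math.h>

  /* fmodf:      x - truncf (x / y) * y   -> fdivs; friz; fnmsubs
     remainderf: x - roundf (x / y) * y   -> fdivs; frin; fnmsubs  */
  static inline float
  fast_fmodf (float x, float y)
  {
    return x - truncf (x / y) * y;
  }

  static inline float
  fast_remainderf (float x, float y)
  {
    return x - roundf (x / y) * y;
  }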
Ping ^ 2: [PATCH] rs6000: Fix wrong code generation for vec_sel [PR94613]
Ping^2, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/6/30 09:42, Xionghu Luo via Gcc-patches wrote: Gentle ping, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/5/14 14:57, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/5/13 18:49, Segher Boessenkool wrote: Hi! On Fri, Apr 30, 2021 at 01:32:58AM -0500, Xionghu Luo wrote: The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. Per element selection is a subset of per bit-wise selection,with the patch the pattern is written using bit operations. But there are 8 different patterns to define "op0 := (op1 & ~op3) | (op2 & op3)": (~op3&op1) | (op3&op2), (~op3&op1) | (op2&op3), (op3&op2) | (~op3&op1), (op2&op3) | (~op3&op1), (op1&~op3) | (op3&op2), (op1&~op3) | (op2&op3), (op3&op2) | (op1&~op3), (op2&op3) | (op1&~op3), Combine pass will swap (op1&~op3) to (~op3&op1) due to commutative canonical, which could reduce it to the FIRST 4 patterns, but it won't swap (op2&op3) | (~op3&op1) to (~op3&op1) | (op2&op3), so this patch handles it with two patterns with different NOT op3 position and check equality inside it. Yup, that latter case does not have canonicalisation rules. Btw, not only combine does this canonicalisation: everything should, non-canonical RTL is invalid RTL (in the instruction stream, you can do everything in temporary code of course, as long as the RTL isn't malformed). -(define_insn "*altivec_vsel" +(define_insn "altivec_vsel" [(set (match_operand:VM 0 "altivec_register_operand" "=v") - (if_then_else:VM - (ne:CC (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 4 "zero_constant" "")) - (match_operand:VM 2 "altivec_register_operand" "v") - (match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" + (ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_operand:VM 2 "altivec_register_operand" "v") + (match_operand:VM 4 "altivec_register_operand" "v"] + "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) + && (rtx_equal_p (operands[2], operands[3]) + || rtx_equal_p (operands[4], operands[3]))" + { + if (rtx_equal_p (operands[2], operands[3])) + return "vsel %0,%1,%4,%3"; + else + return "vsel %0,%1,%2,%3"; + } [(set_attr "type" "vecmove")]) That rtx_equal_p stuff is nice and tricky, but it is a bit too tricky I think. So please write this as two patterns (and keep the expand if that helps). I was a bit concerned that there would be a lot of duplicate code if we write two patterns for each vsel, totally 4 similar patterns in altivec.md and another 4 in vsx.md make it difficult to maintain, however I updated it since you prefer this way, as you pointed out the xxsel in vsx.md could be folded by later patch. +(define_insn "altivec_vsel2" (same here of course). ;; Fused multiply add. 
diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index f5676255387..d65bdc01055 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -3362,11 +3362,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI }, { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, - { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, + { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI_UNS, Are the _uns things still used for anything? But, let's not change this until Bill's stuff is in :-) Why do you want to change this here, btw? I don't understand. OK, they are actually "unsigned type" overload builtin functions, change it or not so far won't cause functionality issue, I will revert this change in the updated patch. + if (target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) No space after ! and other unary operators (except for casts and other operators you write with alphanumerics, like "sizeof"). I know you copied this code, but :-) OK, thanks. @@ -15608,8 +15606,6 @@ rs6000_emit_vector_cond_expr (rtx dest, rtx op_true, rtx op_false, case GEU: case LTU:
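For reference, the bit-wise select that vsel performs, i.e. the "op0 := (op1 & ~op3) | (op2 & op3)" form the discussion above revolves around, written out as scalar C for a single 64-bit lane (illustrative only):

  #include <stdint.h>

  static inline uint64_t
  bit_select (uint64_t op1, uint64_t op2, uint64_t mask)
  {
    /* Each result bit comes from op2 where the mask bit is set,
       otherwise from op1: a per-bit, not per-element, selection.  */
    return (op1 & ~mask) | (op2 & mask);
  }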
Ping ^ 2: [PATCH] rs6000: Remove unspecs for vec_mrghl[bhw]
Ping^2, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html On 2021/6/30 09:47, Xionghu Luo via Gcc-patches wrote: Gentle ping, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572330.html On 2021/6/9 16:03, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/6/9 07:25, Segher Boessenkool wrote: On Mon, May 24, 2021 at 04:02:13AM -0500, Xionghu Luo wrote: vmrghb only accepts permute index {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23} no matter for BE or LE in ISA, similarly for vmrghlb. (vmrglb) + if (BYTES_BIG_ENDIAN) + emit_insn ( + gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2])); + else + emit_insn ( + gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1])); Please don't indent like that, it doesn't match what we do elsewhere. For better or for worse (for worse imo), we use deep hanging indents. If you have to, you can do something like rtx insn; if (BYTES_BIG_ENDIAN) insn = gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2]); else insn = gen_altivec_vmrglb_direct (operands[0], operands[2], operands[1]); emit_insn (insn); (this is better even, in that it has only one emit_insn), or even rtx (*fun) () = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct : gen_altivec_vmrglb_direct; if (!BYTES_BIG_ENDIAN) std::swap (operands[1], operands[2]); emit_insn (fun (operands[0], operands[1], operands[2])); Well, C++ does not allow that last example like that, sigh, so rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct : gen_altivec_vmrglb_direct; This is shorter than the other two options ;-) Changed. +(define_insn "altivec_vmrghb_direct" [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI This should be indented one space more. "TARGET_ALTIVEC" "@ - xxmrghw %x0,%x1,%x2 - vmrghw %0,%1,%2" + xxmrghw %x0,%x1,%x2 + vmrghw %0,%1,%2" The original indent was correct, please restore. - emit_insn (gen_altivec_vmrghw_direct (operands[0], ve, vo)); + emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo)); When you see a mode as part of a pattern name, chances are that it will be a good candidate for using parameterized names with. (But don't do that now, just keep it in mind as a nice cleanup to do). OK. @@ -23022,8 +23022,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1, : CODE_FOR_altivec_vmrglh_direct), { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } }, { OPTION_MASK_ALTIVEC, - (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct - : CODE_FOR_altivec_vmrglw_direct), + (BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si + : CODE_FOR_altivec_vmrglw_direct_v4si), The correct way is to align the ? and the : (or put everything on one line of course, if that fits) The parens around this are not needed btw, and are a distraction. Changed. 
--- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c @@ -317,10 +317,10 @@ int main () /* { dg-final { scan-assembler-times "vctuxs" 2 } } */ /* { dg-final { scan-assembler-times "vmrghb" 4 { target be } } } */ -/* { dg-final { scan-assembler-times "vmrghb" 5 { target le } } } */ +/* { dg-final { scan-assembler-times "vmrghb" 6 { target le } } } */ /* { dg-final { scan-assembler-times "vmrghh" 8 } } */ -/* { dg-final { scan-assembler-times "xxmrghw" 8 } } */ -/* { dg-final { scan-assembler-times "xxmrglw" 8 } } */ +/* { dg-final { scan-assembler-times "xxmrghw" 4 } } */ +/* { dg-final { scan-assembler-times "xxmrglw" 4 } } */ /* { dg-final { scan-assembler-times "vmrglh" 8 } } */ /* { dg-final { scan-assembler-times "xxlnor" 6 } } */ /* { dg-final { scan-assembler-times {\mvpkudus\M} 1 } } */ @@ -347,7 +347,7 @@ int main () /* { dg-final { scan-assembler-times "vspltb" 6 } } */ /* { dg-final { scan-assembler-times "vspltw" 0 } } */ /* { dg-final { scan-assembler-times "vmrgow" 8 } } */ -/* { dg-final { scan-assembler-times "vmrglb" 5 { target le } } } */ +/* { dg-final { scan-assembler-times "vmrglb" 4 { target le } } } */ /* { dg-final { scan-assembler-times "vmrglb" 6 { target be } } } */ /* { dg-final { scan-assembler-times "vmrgew" 8 } } */ /* { dg-final { scan-assembler-times "vsplth" 8 } } */ Are those changes correct? It looks like a vmrglb became a vmrghb, and that 4 each of xxmrghw and xxmrglw disappeared? Both seem wrong? This case is built with "-mdejagnu-cpu=power8 -O0 -mno-fold-gimple -dp" and it also counted the g
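To make the fixed permute order quoted earlier in this thread concrete, here is a hand-written scalar model of vmrghb (an illustration only, not code from the patch). Indices 0-15 select bytes from the first input and 16-31 from the second; the instruction always uses this order, which is why the expander, not the instruction pattern, deals with little-endian element order by swapping operands and emitting vmrglb instead.

#include <stdint.h>

/* Scalar model of vmrghb: interleave the first eight byte elements of
   A and B using the fixed permute index {0,16,1,17,...,7,23}.  */
static void
vmrghb_model (const uint8_t a[16], const uint8_t b[16], uint8_t out[16])
{
  static const uint8_t idx[16]
    = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
  for (int i = 0; i < 16; i++)
    out[i] = idx[i] < 16 ? a[idx[i]] : b[idx[i] - 16];
}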
Re: Ping ^ 2: [PATCH] rs6000: Expand fmod and remainder when built with fast-math [PR97142]
On 2021/9/4 05:44, Segher Boessenkool wrote: Hi! On Fri, Sep 03, 2021 at 10:31:24AM +0800, Xionghu Luo wrote: fmod/fmodf and remainder/remainderf could be expanded instead of library call when fast-math build, which is much faster. Thank you very much for this patch. Some trivial comments if you haven't commmitted it yet: +(define_expand "fmod3" + [(use (match_operand:SFDF 0 "gpc_reg_operand")) + (use (match_operand:SFDF 1 "gpc_reg_operand")) + (use (match_operand:SFDF 2 "gpc_reg_operand"))] + "TARGET_HARD_FLOAT + && TARGET_FPRND + && flag_unsafe_math_optimizations" It should have one extra space before each && here: OK. "TARGET_HARD_FLOAT && TARGET_FPRND && flag_unsafe_math_optimizations" (so that everything inside of the string aligns). +(define_expand "remainder3" (same here). +/* { dg-final { scan-assembler-not {\mbl fmod\M} } } */ +/* { dg-final { scan-assembler-not {\mbl fmodf\M} } } */ +/* { dg-final { scan-assembler-not {\mbl remainder\M} } } */ +/* { dg-final { scan-assembler-not {\mbl remainderf\M} } } */ These are negative tests, so won't spuriously fail, but this does not test for the function prefixes we can have. See gcc.target/powerpc/builtins-1.c for example. Thanks. Verified that different calls are generated on different platforms without this patch. P8BE-64: bl __fmodf_finite P8BE-32: b __fmodf_finite P8LE-64: bl fmodf "l", "__" and "_finite" are optional, so is it OK to check them with below patterns? +/* { dg-final { scan-assembler-not {\mbl? (__)?fmod(_finite)?\M} } } */ +/* { dg-final { scan-assembler-not {\mbl? (__)?fmodf(_finite)?\M} } } */ +/* { dg-final { scan-assembler-not {\mbl? (__)?remainder(_finite)?\M} } } */ +/* { dg-final { scan-assembler-not {\mbl? (__)?remainderf(_finite)?\M} } } */ Again, thank you, and thanks to everyone else for the patch review action :-) Segher -- Thanks, Xionghu
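As a minimal, hand-written example of the behaviour being tested (the function names and options here are assumptions, not the patch's dg- directives): building the code below with -O2 -ffast-math for a TARGET_FPRND target should leave no calls to fmod/remainder, or their __*_finite variants, in the generated assembly, because the operations are expanded inline.

#include <math.h>

/* fmod (x, y) is mathematically x - trunc (x / y) * y, which is what
   the inline expansion computes.  */
double
use_fmod (double x, double y)
{
  return fmod (x, y);
}

/* remainder uses the round-to-nearest quotient instead of trunc.  */
float
use_remainderf (float x, float y)
{
  return remainderf (x, y);
}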
Re: [RFC] Don't move cold code out of loop by checking bb count
On 2021/8/26 19:33, Richard Biener wrote: On Tue, Aug 10, 2021 at 4:03 AM Xionghu Luo wrote: Hi, On 2021/8/6 20:15, Richard Biener wrote: On Mon, Aug 2, 2021 at 7:05 AM Xiong Hu Luo wrote: There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". This patch does this profile count check in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if the loop bb is colder than loop preheader, don't hoist it out of loop. Also, the profile count in loop split pass should be corrected to avoid lim2 and lim4 mismatch behavior, currently, the new loop preheader generated by loop_version is set to "[count: 0]:", then lim4 after lsplt pass will move statement out of loop unexpectely when lim2 didn't move it. This change could fix regression on 544.nab_r from -1.55% to +0.46%. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. While I'm not familiar with the RTL invariant motion pass the patch there looks reasonable. Note that we should assess the profile quality somehow - I'm not sure how to do that, CCed Honza for that. Thanks. For the GIMPLE part the patch looks quite complicated - but note it probably has to be since LIM performs kind of a "CSE" on loads (and stores for store-motion), so when there are multiple stmts affected by a hoisting decision the biggest block count has to be accounted. Likewise when there are dependent stmts involved that might include conditional stmts (a "PHI"), but the overall cost should be looked at. Currently, The gimple code check two situations with the patch: 1) The statement or PHI‘s BB is *colder* then preheader, don't move it out of loop; 2) The statement or PHI's BB is *hotter* then preheader, but any of it's rhs couldn't be moved out of loop, also don't move it out of loop to avoid definition not dominates use error. But part 2) is obviously already done. What I tried to say is your heuristic doesn't integrate nicely with the pass but I admitted that it might be a bit difficult to find a place to add this heuristic. There is lim_data->cost which we could bias negatively but then this is a cost that is independent on the hoisting distance. But doing this would work at least for the case where the immediately enclosing loop preheader is hotter than the stmt and with this it would be a patch that's similarly simple as the RTL one. Another possibility is to simply only adjust PHI processing in compute_invariantness, capping movement according to the hotness heuristic. The same could be done for regular stmts there but I'm not sure that will do good in the end since this function is supposed to compute "correctness" (well, it also has the cost stuff), and it's not the place to do overall cost considerations. Thanks. I found that adding a function find_coldest_out_loop and check it in outermost_invariant_loop to find the coldest invariant loop between outermost loop and itself could also reach the purpose. Then the gimple code check is redundant and could be removed. 
May be I could collect the number of instructions not hoisted with the patch on regression tests and SPEC2017 to do a estimation for "multiple stmts affected" and "overall cost" need to be considered? But it seems move_computations_worker couldn't rollback if we still want to hoist multiple stmts out during the iterations? Now - GIMPLE LIM "costing" is somewhat backward right now and it isn't set up to consider those multiple involved stmts. Plus the store-motion part does not have any cost part (but it depends on previously decided invariant motions). I think the way you implemented the check will cause no hoisting to be performed instead of, say, hoisting to a different loop level only. Possibly shown when you consider a loop nest like for (;;) if (unlikely_cond) for (;;) invariant; we want to hoist 'invariant' but only from the inner loop even if it is invariant also in the outer loop. For this case, theorotically I think the master GCC will optimize it to: invariant; for (;;) if (unlikely_cond) for (;;) ; 'invariant' is moved out of outer loop, but with the patch, it will get: for (;;) if (unlikely_cond) { invariant; for (;;) ;
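A self-contained, compilable version of the loop-nest example discussed in this exchange may help (hand-written here, not a testcase from the thread): the computation is invariant in both loops, but when the guarding condition is cold the preferable hoisting target is the inner loop's preheader, which sits under the condition, rather than the outer loop's preheader.

extern int unlikely_cond;

void
f (double *a, int n, double x)
{
  for (int i = 0; i < n; i++)
    if (unlikely_cond)
      for (int j = 0; j < n; j++)
        {
          /* Invariant in both loops; hoisting it only out of the inner
             loop keeps it on the cold path guarded by unlikely_cond.  */
          double inv = x * x + 1.0;
          a[j] += inv;
        }
}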
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/2 18:37, Richard Biener wrote: On Thu, 2 Sep 2021, Xionghu Luo wrote: On 2021/9/2 16:50, Richard Biener wrote: On Thu, 2 Sep 2021, Richard Biener wrote: On Thu, 2 Sep 2021, Xionghu Luo wrote: On 2021/9/1 17:58, Richard Biener wrote: This fixes the CFG walk order of fill_always_executed_in to use RPO oder rather than the dominator based order computed by get_loop_body_in_dom_order. That fixes correctness issues with unordered dominator children. The RPO order computed by rev_post_order_and_mark_dfs_back_seme in its for-iteration mode is a good match for the algorithm. Xionghu, I've tried to only fix the CFG walk order issue and not change anything else with this so we have a more correct base to work against. The code still walks inner loop bodies up to loop depth times and thus is quadratic in the loop depth. Bootstrapped and tested on x86_64-unknown-linux-gnu, if you don't have any comments I plan to push this and then revisit what we were circling around. LGTM, thanks. I pushed it, thought again in the attempt to build a testcase and concluded I was wrong with the appearant mishandling of contains_call - get_loop_body_in_dom_order seems to be exactly correct for this specific case. So I reverted the commit again. And I figured what the /* In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it. */ comment was about. The code was present before the fix for PR78185 and it was supposed to catch the case where the entered inner loop is not finite. Just as the testcase from PR78185 shows the stopping was done too late when the exit block was already marked as to be always executed. A simpler fix for PR78185 would have been to move if (!flow_bb_inside_loop_p (inn_loop, bb)) break; before setting of last = bb. In fact the installed fix was more pessimistic than that given it terminated already when entering a possibly infinite loop. So we can improve that by doing sth like which should also improve the situation for some of the cases you were looking at? What remains is that we continue to stop when entering a not always executed loop: if (bb->loop_father->header == bb) { if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) break; Yes. This will cause blocks after inner loop missed to be check if they are actually ALWAYS_EXECUTED. I am afraid O(N^2) is inevitable here... Yes. What we can try is pre-computing whether a loop has a call or an inner loop that might not terminate and then when that's true for the loop to be entered continue to break; but when not, skip processing that loop blocks (but we still fill the blocks array, and we do need to do this in the order for the loop we're processing ...). So what I was thinking was to somehow embed the dominator walk of get_loop_body_in_dom_order and instead of pre-recording the above info (call, infinite loop) for loops, pre-record it on the dominator tree so that we can ask "in any of our dominator children, is there a call or an infinite loop" and thus cut the dominator walk at loop header blocks that are not dominating the outer loop latch ... Of course the simplistic solution might be to simply do if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb) && ((loop_depth (bb->loop_father) - loop_depth (loop)) > param_max_lim_loop_depth_lookahead))) break; and thus limit the processing of conditionally executed inner loops by relative depth ... 
as you say the actual processing is unlikely to be the bottleneck for the degenerate cases of a very deep nest of conditionally executed loops. But still for this case get_loop_body_in_dom_order is doing quadratic processing so we can also say that another linear walk over the produced array does not increase complexity.> volatile int flag, bar; double foo (double *valp) { double sum = 0; for (int i = 0; i < 256; ++i) { for (int j = 0; j < 256; ++j) bar = flag; if (flag) sum += 1.; sum += *valp; } return sum; } The patch still fails to handle cases like this: struct X { int i; int j; int k;}; volatile int m; void bar (struct X *x, int n, int l, int k) { for (int i = 0; i < l; i++) { if (k) for (int j = 0; j < l; j++) { if (m) x->i = m; else x->i = 1 - m; int *r = &x->k; int tem2 = *r; x->k += tem2 * j; } x->i = m; } } x->i is still not marked ALWAYS_EXECUTED for outer loop. Collected data when build gcc stage1 and bootstrap. There are still about 9% bbs are missed to be marked with ALWAYS_EXECUTED. Execution time of fill_always_exec
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/9 18:55, Richard Biener wrote: diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 5d6845478e7..4b187c2cdaf 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3074,15 +3074,13 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) break; if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; - - /* In a loop that is always entered we may proceed anyway. -But record that we entered it and stop once we leave it -since it might not be finite. */ - inn_loop = bb->loop_father; - } + /* Record that we enter into a subloop since it might not + be finite. */ + /* ??? Entering into a not always executed subloop makes + fill_always_executed_in quadratic in loop depth since + we walk those loops N times. This is not a problem + in practice though, see PR102253 for a worst-case testcase. */ + inn_loop = bb->loop_father; Yes your two patches extracted the get_loop_body_in_dom_order out and removed the inn_loop break logic when it doesn't dominate outer loop. Confirmed the replacement could improve for saving ~10% build time due to not full DOM walker and marked the previously ignored ALWAYS_EXECUTED bbs. But if we don't break for inner loop again, why still keep the *inn_loop* variable? It seems unnecessary and confusing, could we just remove it and restore the original infinte loop check in bb->succs for better understanding? diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index d1e2104233b..82a0509e0c4 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3200,7 +3200,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) { basic_block bb = NULL, last = NULL; edge e; - class loop *inn_loop = loop; if (ALWAYS_EXECUTED_IN (loop->header) == NULL) { @@ -3213,17 +3212,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) edge_iterator ei; bb = worklist.pop (); - if (!flow_bb_inside_loop_p (inn_loop, bb)) - { - /* When we are leaving a possibly infinite inner loop -we have to stop processing. */ - if (!finite_loop_p (inn_loop)) - break; - /* If the loop was finite we can continue with processing -the loop we exited to. */ - inn_loop = bb->loop_father; - } - if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; @@ -3232,8 +3220,15 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) /* If LOOP exits from this BB stop processing. */ FOR_EACH_EDGE (e, ei, bb->succs) + { if (!flow_bb_inside_loop_p (loop, e->dest)) break; + /* Or we enter a possibly non-finite loop. */ + if (flow_loop_nested_p (bb->loop_father, + e->dest->loop_father) + && ! finite_loop_p (e->dest->loop_father)) + break; + } if (e) break; @@ -3242,15 +3237,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) if (bb->flags & BB_IRREDUCIBLE_LOOP) break; - if (bb->loop_father->header == bb) - /* Record that we enter into a subloop since it might not - be finite. */ - /* ??? Entering into a not always executed subloop makes - fill_always_executed_in quadratic in loop depth since - we walk those loops N times. This is not a problem - in practice though, see PR102253 for a worst-case testcase. */ - inn_loop = bb->loop_father; - /* Walk the body of LOOP sorted by dominance relation. Additionally, if a basic block S dominates the latch, then only blocks dominated by S are after it. /* Walk the body of LOOP sorted by dominance relation. Additionally, if a basic block S dominates the latch, then only blocks dominated -- Thanks, Xionghu
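The finite versus possibly-infinite distinction this exchange settles on can be illustrated with a small hand-written example (not one of the testcases from the thread):

volatile int m;

void
f (int *x, int n)
{
  for (int i = 0; i < n; i++)
    {
      for (int j = 0; j < n; j++)   /* finite inner loop */
        x[2] += m;

      /* Reached on every outer iteration because the loop above is
         known to terminate, so it may be marked ALWAYS_EXECUTED.  */
      x[1] = m;

      while (m)                     /* possibly infinite */
        ;

      /* We may never leave the while loop, so this must not be
         treated as always executed in the outer loop.  */
      x[0] = m;
    }
}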
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/10 21:54, Xionghu Luo via Gcc-patches wrote: On 2021/9/9 18:55, Richard Biener wrote: diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 5d6845478e7..4b187c2cdaf 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3074,15 +3074,13 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) break; if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; - - /* In a loop that is always entered we may proceed anyway. - But record that we entered it and stop once we leave it - since it might not be finite. */ - inn_loop = bb->loop_father; - } + /* Record that we enter into a subloop since it might not + be finite. */ + /* ??? Entering into a not always executed subloop makes + fill_always_executed_in quadratic in loop depth since + we walk those loops N times. This is not a problem + in practice though, see PR102253 for a worst-case testcase. */ + inn_loop = bb->loop_father; Yes your two patches extracted the get_loop_body_in_dom_order out and removed the inn_loop break logic when it doesn't dominate outer loop. Confirmed the replacement could improve for saving ~10% build time due to not full DOM walker and marked the previously ignored ALWAYS_EXECUTED bbs. But if we don't break for inner loop again, why still keep the *inn_loop* variable? It seems unnecessary and confusing, could we just remove it and restore the original infinte loop check in bb->succs for better understanding? What's more, the refine of this fix is incorrect for PR78185. commit 483e400870601f650c80f867ec781cd5f83507d6 Author: Richard Biener Date: Thu Sep 2 10:47:35 2021 +0200 Refine fix for PR78185, improve LIM for code after inner loops This refines the fix for PR78185 after understanding that the code regarding to the comment 'In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it.' was supposed to protect us from leaving possibly infinite inner loops. The simpler fix of moving the misplaced stopping code can then be refined to continue processing when the exited inner loop is finite, improving invariant motion for cases like in the added testcase. 2021-09-02 Richard Biener * tree-ssa-loop-im.c (fill_always_executed_in_1): Refine fix for PR78185 and continue processing when leaving finite inner loops. * gcc.dg/tree-ssa/ssa-lim-16.c: New testcase. 3<--- || 6<---| | \ | | | \ | | 48 | |--- | | | | 5 7-- | 1 loop 2 is an infinite loop, it is only ALWAYS_EXECUTED for loop 2, but r12-3313-g483e40087 sets it ALWAYS_EXECUTED for loop 1. 
We need to restore it like this: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579195.html diff of pr78185.c.138t.lim2: ;; ;; Loop 1 ;; header 3, latch 7 ;; depth 1, outer 0 ;; nodes: 3 7 4 6 8 ;; ;; Loop 2 ;; header 6, latch 8 ;; depth 2, outer 1 ;; nodes: 6 8 ;; 2 succs { 3 } ;; 3 succs { 6 } ;; 6 succs { 4 8 } ;; 8 succs { 6 } ;; 4 succs { 7 5 } ;; 7 succs { 3 } ;; 5 succs { 1 } Memory reference 1: var1 -BB 6 is always executed in loop 1 BB 3 is always executed in loop 1 +BB 6 is always executed in loop 2 Basic block 3 (loop 1 -- depth 1): Basic block 6 (loop 2 -- depth 2): diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index d1e2104233b..82a0509e0c4 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3200,7 +3200,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) { basic_block bb = NULL, last = NULL; edge e; - class loop *inn_loop = loop; if (ALWAYS_EXECUTED_IN (loop->header) == NULL) { @@ -3213,17 +3212,6 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) edge_iterator ei; bb = worklist.pop (); - if (!flow_bb_inside_loop_p (inn_loop, bb)) - { - /* When we are leaving a possibly infinite inner loop - we have to stop processing. */ - if (!finite_loop_p (inn_loop)) - break; - /* If the loop was finite we can continue with processing - the loop we exited to. */ - inn_loop = bb->loop_father; - } - if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) last = bb; @@ -3232,8 +3220,15 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) /* If LOOP exits from this BB stop processing. */ FOR_EACH_EDGE (e, ei, bb->succs) + { if (!flow_bb_inside_loop_p (loop, e->dest))
Re: [PATCH] tree-optimization/102155 - fix LIM fill_always_executed_in CFG walk
On 2021/9/13 16:17, Richard Biener wrote: On Mon, 13 Sep 2021, Xionghu Luo wrote: On 2021/9/10 21:54, Xionghu Luo via Gcc-patches wrote: On 2021/9/9 18:55, Richard Biener wrote: diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 5d6845478e7..4b187c2cdaf 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -3074,15 +3074,13 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call) break; if (bb->loop_father->header == bb) - { - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) - break; - - /* In a loop that is always entered we may proceed anyway. - But record that we entered it and stop once we leave it - since it might not be finite. */ - inn_loop = bb->loop_father; - } + /* Record that we enter into a subloop since it might not + be finite. */ + /* ??? Entering into a not always executed subloop makes + fill_always_executed_in quadratic in loop depth since + we walk those loops N times. This is not a problem + in practice though, see PR102253 for a worst-case testcase. */ + inn_loop = bb->loop_father; Yes your two patches extracted the get_loop_body_in_dom_order out and removed the inn_loop break logic when it doesn't dominate outer loop. Confirmed the replacement could improve for saving ~10% build time due to not full DOM walker and marked the previously ignored ALWAYS_EXECUTED bbs. But if we don't break for inner loop again, why still keep the *inn_loop* variable? It seems unnecessary and confusing, could we just remove it and restore the original infinte loop check in bb->succs for better understanding? What's more, the refine of this fix is incorrect for PR78185. commit 483e400870601f650c80f867ec781cd5f83507d6 Author: Richard Biener Date: Thu Sep 2 10:47:35 2021 +0200 Refine fix for PR78185, improve LIM for code after inner loops This refines the fix for PR78185 after understanding that the code regarding to the comment 'In a loop that is always entered we may proceed anyway. But record that we entered it and stop once we leave it.' was supposed to protect us from leaving possibly infinite inner loops. The simpler fix of moving the misplaced stopping code can then be refined to continue processing when the exited inner loop is finite, improving invariant motion for cases like in the added testcase. 2021-09-02 Richard Biener * tree-ssa-loop-im.c (fill_always_executed_in_1): Refine fix for PR78185 and continue processing when leaving finite inner loops. * gcc.dg/tree-ssa/ssa-lim-16.c: New testcase. 3<--- || 6<---| | \ | | | \ | | 48 | |--- | | | | 5 7-- | 1 loop 2 is an infinite loop, it is only ALWAYS_EXECUTED for loop 2, but r12-3313-g483e40087 sets it ALWAYS_EXECUTED for loop 1. We need to restore it like this: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579195.html I don't understand - BB6 is the header block of loop 2 which is always entered and thus BB6 is always executed at least once. The important part is that BB4 which follows the inner loop is _not_ always executed because we don't know if we will exit the inner loop. What am I missing? Oh, I see. I only noticed the functionality change of the patch on the case and no failure check of it, misunderstood it was a regression instead of an improvement to also hoisting invariants from infinite loop, sorry about that. Finally, the function fill_always_executed_in_1 could mark all ALWAYS_EXECUTED bb both including and after all subloops' bb but break after exiting from infinite subloops with better performance, thanks. 
The only thing to be worried about is that replacing get_loop_body_in_dom_order makes the code a bit more complicated for later readers, as loop depth and DOM order are not a problem here any more? ;) Richard. -- Thanks, Xionghu
Re: Ping ^ 3: [PATCH] rs6000: Fix wrong code generation for vec_sel [PR94613]
Ping^3, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/9/6 08:52, Xionghu Luo via Gcc-patches wrote: Ping^2, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/6/30 09:42, Xionghu Luo via Gcc-patches wrote: Gentle ping, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/5/14 14:57, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/5/13 18:49, Segher Boessenkool wrote: Hi! On Fri, Apr 30, 2021 at 01:32:58AM -0500, Xionghu Luo wrote: The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. Per element selection is a subset of per bit-wise selection,with the patch the pattern is written using bit operations. But there are 8 different patterns to define "op0 := (op1 & ~op3) | (op2 & op3)": (~op3&op1) | (op3&op2), (~op3&op1) | (op2&op3), (op3&op2) | (~op3&op1), (op2&op3) | (~op3&op1), (op1&~op3) | (op3&op2), (op1&~op3) | (op2&op3), (op3&op2) | (op1&~op3), (op2&op3) | (op1&~op3), Combine pass will swap (op1&~op3) to (~op3&op1) due to commutative canonical, which could reduce it to the FIRST 4 patterns, but it won't swap (op2&op3) | (~op3&op1) to (~op3&op1) | (op2&op3), so this patch handles it with two patterns with different NOT op3 position and check equality inside it. Yup, that latter case does not have canonicalisation rules. Btw, not only combine does this canonicalisation: everything should, non-canonical RTL is invalid RTL (in the instruction stream, you can do everything in temporary code of course, as long as the RTL isn't malformed). -(define_insn "*altivec_vsel" +(define_insn "altivec_vsel" [(set (match_operand:VM 0 "altivec_register_operand" "=v") - (if_then_else:VM - (ne:CC (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 4 "zero_constant" "")) - (match_operand:VM 2 "altivec_register_operand" "v") - (match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" + (ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_operand:VM 2 "altivec_register_operand" "v") + (match_operand:VM 4 "altivec_register_operand" "v"] + "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) + && (rtx_equal_p (operands[2], operands[3]) + || rtx_equal_p (operands[4], operands[3]))" + { + if (rtx_equal_p (operands[2], operands[3])) + return "vsel %0,%1,%4,%3"; + else + return "vsel %0,%1,%2,%3"; + } [(set_attr "type" "vecmove")]) That rtx_equal_p stuff is nice and tricky, but it is a bit too tricky I think. So please write this as two patterns (and keep the expand if that helps). I was a bit concerned that there would be a lot of duplicate code if we write two patterns for each vsel, totally 4 similar patterns in altivec.md and another 4 in vsx.md make it difficult to maintain, however I updated it since you prefer this way, as you pointed out the xxsel in vsx.md could be folded by later patch. +(define_insn "altivec_vsel2" (same here of course). ;; Fused multiply add. 
diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index f5676255387..d65bdc01055 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -3362,11 +3362,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI }, { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, - { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, + { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI_UNS, Are the _uns things still used for anything? But, let's not change this until Bill's stuff is in :-) Why do you want to change this here, btw? I don't understand. OK, they are actually "unsigned type" overload builtin functions, change it or not so far won't cause functionality issue, I will revert this change in the updated patch. + if (target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) No space after ! and other unary operators (except for casts and other operators you write with alphanumerics, like "sizeof"). I know you copied this code, but :-)
[PATCH v2 2/2] rs6000: Fold xxsel to vsel since they have same semantics
Fold xxsel to vsel like xxperm/vperm to avoid duplicate code. gcc/ChangeLog: 2021-09-17 Xionghu Luo * config/rs6000/altivec.md: Add vsx register constraints. * config/rs6000/vsx.md (vsx_xxsel): Delete. (vsx_xxsel2): Likewise. (vsx_xxsel3): Likewise. (vsx_xxsel4): Likewise. --- gcc/config/rs6000/altivec.md | 60 +++ gcc/config/rs6000/vsx.md | 57 -- gcc/testsuite/gcc.target/powerpc/builtins-1.c | 2 +- 3 files changed, 37 insertions(+), 82 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index a3424e1a458..4b4ca2c5d17 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -684,56 +684,68 @@ (define_insn "*altivec_gev4sf" [(set_attr "type" "veccmp")]) (define_insn "altivec_vsel" - [(set (match_operand:VM 0 "altivec_register_operand" "=v") + [(set (match_operand:VM 0 "register_operand" "=wa,v") (ior:VM (and:VM - (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) - (match_operand:VM 1 "altivec_register_operand" "v")) + (not:VM (match_operand:VM 3 "register_operand" "wa,v")) + (match_operand:VM 1 "register_operand" "wa,v")) (and:VM (match_dup 3) - (match_operand:VM 2 "altivec_register_operand" "v"] + (match_operand:VM 2 "register_operand" "wa,v"] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" - "vsel %0,%1,%2,%3" - [(set_attr "type" "vecmove")]) + "@ + xxsel %x0,%x1,%x2,%x3 + vsel %0,%1,%2,%3" + [(set_attr "type" "vecmove") + (set_attr "isa" "")]) (define_insn "altivec_vsel2" - [(set (match_operand:VM 0 "altivec_register_operand" "=v") + [(set (match_operand:VM 0 "register_operand" "=wa,v") (ior:VM (and:VM - (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) - (match_operand:VM 1 "altivec_register_operand" "v")) + (not:VM (match_operand:VM 3 "register_operand" "wa,v")) + (match_operand:VM 1 "register_operand" "wa,v")) (and:VM - (match_operand:VM 2 "altivec_register_operand" "v") + (match_operand:VM 2 "register_operand" "wa,v") (match_dup 3] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" - "vsel %0,%1,%2,%3" - [(set_attr "type" "vecmove")]) + "@ + xxsel %x0,%x1,%x2,%x3 + vsel %0,%1,%2,%3" + [(set_attr "type" "vecmove") + (set_attr "isa" "")]) (define_insn "altivec_vsel3" - [(set (match_operand:VM 0 "altivec_register_operand" "=v") + [(set (match_operand:VM 0 "register_operand" "=wa,v") (ior:VM (and:VM - (match_operand:VM 3 "altivec_register_operand" "v") - (match_operand:VM 1 "altivec_register_operand" "v")) + (match_operand:VM 3 "register_operand" "wa,v") + (match_operand:VM 1 "register_operand" "wa,v")) (and:VM (not:VM (match_dup 3)) - (match_operand:VM 2 "altivec_register_operand" "v"] + (match_operand:VM 2 "register_operand" "wa,v"] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" - "vsel %0,%2,%1,%3" - [(set_attr "type" "vecmove")]) + "@ + xxsel %x0,%x2,%x1,%x3 + vsel %0,%2,%1,%3" + [(set_attr "type" "vecmove") + (set_attr "isa" "")]) (define_insn "altivec_vsel4" - [(set (match_operand:VM 0 "altivec_register_operand" "=v") + [(set (match_operand:VM 0 "register_operand" "=wa,v") (ior:VM (and:VM - (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "register_operand" "wa,v") + (match_operand:VM 3 "register_operand" "wa,v")) (and:VM (not:VM (match_dup 3)) - (match_operand:VM 2 "altivec_register_operand" "v"] + (match_operand:VM 2 "register_operand" "wa,v"] "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" - "vsel
[PATCH v2 0/2] Fix vec_sel code generation and merge xxsel to vsel
These two patches are updated version from: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579490.html Changes: 1. Fix alignment error in md files. 2. Replace rtx_equal_p with match_dup. 3. Use register_operand instead of gpc_reg_operand to align with vperm/xxperm. 4. Regression tested pass on P8LE. Xionghu Luo (2): rs6000: Fix wrong code generation for vec_sel [PR94613] rs6000: Fold xxsel to vsel since they have same semantics gcc/config/rs6000/altivec.md | 84 ++- gcc/config/rs6000/rs6000-call.c | 62 ++ gcc/config/rs6000/rs6000.c| 19 ++--- gcc/config/rs6000/vector.md | 26 +++--- gcc/config/rs6000/vsx.md | 25 -- gcc/testsuite/gcc.target/powerpc/builtins-1.c | 2 +- gcc/testsuite/gcc.target/powerpc/pr94613.c| 47 +++ 7 files changed, 193 insertions(+), 72 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr94613.c -- 2.25.1
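Both patches rest on vsel/xxsel being a bit-wise selection, op0 := (op1 & ~op3) | (op2 & op3), rather than a per-element if_then_else. A hand-written scalar model (illustration only, not code from the series) shows the difference: with a mask that is neither all-zeros nor all-ones, the bit-wise definition mixes bits of both inputs, which a whole-value NE-against-zero model cannot express.

#include <assert.h>
#include <stdint.h>

/* Bit-wise select, the actual vsel semantics.  */
static uint32_t
bitwise_sel (uint32_t a, uint32_t b, uint32_t mask)
{
  return (a & ~mask) | (b & mask);
}

int
main (void)
{
  uint32_t a = 0x11223344, b = 0xaabbccdd, mask = 0x0000ffff;
  /* An if_then_else model (mask != 0 selects b wholesale) would give
     0xaabbccdd here; the bit-wise definition mixes A and B.  */
  assert (bitwise_sel (a, b, mask) == 0x1122ccdd);
  return 0;
}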
[PATCH v2 1/2] rs6000: Fix wrong code generation for vec_sel [PR94613]
The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. Per element selection is a subset of per bit-wise selection,with the patch the pattern is written using bit operations. But there are 8 different patterns to define "op0 := (op1 & ~op3) | (op2 & op3)": (~op3&op1) | (op3&op2), (~op3&op1) | (op2&op3), (op3&op2) | (~op3&op1), (op2&op3) | (~op3&op1), (op1&~op3) | (op3&op2), (op1&~op3) | (op2&op3), (op3&op2) | (op1&~op3), (op2&op3) | (op1&~op3), The latter 4 cases does not follow canonicalisation rules, non-canonical RTL is invalid RTL in vregs pass. Secondly, combine pass will swap (op1&~op3) to (~op3&op1) by commutative canonical, which could reduce it to the FIRST 4 patterns, but it won't swap (op2&op3) | (~op3&op1) to (~op3&op1) | (op2&op3), so this patch handles it with 4 patterns with different NOT op3 position and check equality inside it. Tested pass on Power8LE, any comments? gcc/ChangeLog: 2021-09-17 Xionghu Luo * config/rs6000/altivec.md (*altivec_vsel): Change to ... (altivec_vsel): ... this and update define. (*altivec_vsel_uns): Delete. (altivec_vsel2): New define_insn. (altivec_vsel3): Likewise. (altivec_vsel4): Likewise. * config/rs6000/rs6000-call.c (altivec_expand_vec_sel_builtin): New. (altivec_expand_builtin): Call altivec_expand_vec_sel_builtin to expand vel_sel. * config/rs6000/rs6000.c (rs6000_emit_vector_cond_expr): Use bit-wise selection instead of per element. * config/rs6000/vector.md: * config/rs6000/vsx.md (*vsx_xxsel): Change to ... (vsx_xxsel): ... this and update define. (*vsx_xxsel_uns): Delete. (vsx_xxsel2): New define_insn. (vsx_xxsel3): Likewise. (vsx_xxsel4): Likewise. gcc/testsuite/ChangeLog: 2021-09-17 Xionghu Luo * gcc.target/powerpc/pr94613.c: New test. 
--- gcc/config/rs6000/altivec.md | 62 -- gcc/config/rs6000/rs6000-call.c| 62 ++ gcc/config/rs6000/rs6000.c | 19 +++ gcc/config/rs6000/vector.md| 26 + gcc/config/rs6000/vsx.md | 60 - gcc/testsuite/gcc.target/powerpc/pr94613.c | 47 6 files changed, 221 insertions(+), 55 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr94613.c diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 93d237156d5..a3424e1a458 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -683,26 +683,56 @@ (define_insn "*altivec_gev4sf" "vcmpgefp %0,%1,%2" [(set_attr "type" "veccmp")]) -(define_insn "*altivec_vsel" +(define_insn "altivec_vsel" [(set (match_operand:VM 0 "altivec_register_operand" "=v") - (if_then_else:VM -(ne:CC (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 4 "zero_constant" "")) -(match_operand:VM 2 "altivec_register_operand" "v") -(match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" + (ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_dup 3) + (match_operand:VM 2 "altivec_register_operand" "v"] + "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)" + "vsel %0,%1,%2,%3" [(set_attr "type" "vecmove")]) -(define_insn "*altivec_vsel_uns" +(define_insn "altivec_vsel2" [(set (match_operand:VM 0 "altivec_register_operand" "=v") - (if_then_else:VM -(ne:CCUNS (match_operand:VM 1 "altivec_register_operand" "v") - (match_operand:VM 4 "zero_constant" "")) -(match_operand:VM 2 "altivec_register_operand" "v") -(match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" + (ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_operand:VM 2 "altivec_register_operand" "v") + (match_dup 3] + "
Re: Ping ^ 3: [PATCH] rs6000: Fix wrong code generation for vec_sel [PR94613]
On 2021/9/15 21:11, David Edelsohn wrote: Hi, Xionhu Should "altivec_vsel2" .. 3 .. 4 be "*altivec_vsel2", etc. because they are combiner patterns and never referenced by name? Only the first, named pattern is referenced by the builtin code. Thanks, updated the patchset with Segher's review comments, he didn't mention about this and sorry to forget change this part, I am also not sure whether "altivec_vsel2" .. 3 .. 4 will be used/generated or optimized by expander in future, is there any benefit to add "*" to the define_insn patterns? Other than that question / suggestion, this patch is okay. Please coordinate with Bill and his builtin patches. OK. Thanks, David On Wed, Sep 15, 2021 at 3:50 AM Xionghu Luo wrote: Ping^3, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/9/6 08:52, Xionghu Luo via Gcc-patches wrote: Ping^2, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/6/30 09:42, Xionghu Luo via Gcc-patches wrote: Gentle ping, thanks. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570333.html On 2021/5/14 14:57, Xionghu Luo via Gcc-patches wrote: Hi, On 2021/5/13 18:49, Segher Boessenkool wrote: Hi! On Fri, Apr 30, 2021 at 01:32:58AM -0500, Xionghu Luo wrote: The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. Per element selection is a subset of per bit-wise selection,with the patch the pattern is written using bit operations. But there are 8 different patterns to define "op0 := (op1 & ~op3) | (op2 & op3)": (~op3&op1) | (op3&op2), (~op3&op1) | (op2&op3), (op3&op2) | (~op3&op1), (op2&op3) | (~op3&op1), (op1&~op3) | (op3&op2), (op1&~op3) | (op2&op3), (op3&op2) | (op1&~op3), (op2&op3) | (op1&~op3), Combine pass will swap (op1&~op3) to (~op3&op1) due to commutative canonical, which could reduce it to the FIRST 4 patterns, but it won't swap (op2&op3) | (~op3&op1) to (~op3&op1) | (op2&op3), so this patch handles it with two patterns with different NOT op3 position and check equality inside it. Yup, that latter case does not have canonicalisation rules. Btw, not only combine does this canonicalisation: everything should, non-canonical RTL is invalid RTL (in the instruction stream, you can do everything in temporary code of course, as long as the RTL isn't malformed). -(define_insn "*altivec_vsel" +(define_insn "altivec_vsel" [(set (match_operand:VM 0 "altivec_register_operand" "=v") -(if_then_else:VM - (ne:CC (match_operand:VM 1 "altivec_register_operand" "v") -(match_operand:VM 4 "zero_constant" "")) - (match_operand:VM 2 "altivec_register_operand" "v") - (match_operand:VM 3 "altivec_register_operand" "v")))] - "VECTOR_MEM_ALTIVEC_P (mode)" - "vsel %0,%3,%2,%1" +(ior:VM + (and:VM + (not:VM (match_operand:VM 3 "altivec_register_operand" "v")) + (match_operand:VM 1 "altivec_register_operand" "v")) + (and:VM + (match_operand:VM 2 "altivec_register_operand" "v") + (match_operand:VM 4 "altivec_register_operand" "v"] + "VECTOR_UNIT_ALTIVEC_OR_VSX_P (mode) + && (rtx_equal_p (operands[2], operands[3]) + || rtx_equal_p (operands[4], operands[3]))" + { +if (rtx_equal_p (operands[2], operands[3])) + return "vsel %0,%1,%4,%3"; +else + return "vsel %0,%1,%2,%3"; + } [(set_attr "type" "vecmove")]) That rtx_equal_p stuff is nice and tricky, but it is a bit too tricky I think. So please write this as two patterns (and keep the expand if that helps). 
I was a bit concerned that there would be a lot of duplicate code if we write two patterns for each vsel, totally 4 similar patterns in altivec.md and another 4 in vsx.md make it difficult to maintain, however I updated it since you prefer this way, as you pointed out the xxsel in vsx.md could be folded by later patch. +(define_insn "altivec_vsel2" (same here of course). ;; Fused multiply add. diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index f5676255387..d65bdc01055 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -3362,11 +3362,11 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_unsigned_V2DI }, { ALTIVEC_BUILTIN_VEC_SEL, ALTIVEC_BUILTIN_VSEL_2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V2DI }, -
Re: [PATCH] Fix loop split incorrect count and probability
On 2021/8/11 17:16, Richard Biener wrote: On Wed, 11 Aug 2021, Xionghu Luo wrote: On 2021/8/10 22:47, Richard Biener wrote: On Mon, 9 Aug 2021, Xionghu Luo wrote: Thanks, On 2021/8/6 19:46, Richard Biener wrote: On Tue, 3 Aug 2021, Xionghu Luo wrote: loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; [local count: 118111600]: if (n_7(D) > 0) goto ; [89.00%] else goto ; [11.00%] [local count: 118111600]: return; [local count: 105119324]: pretmp_3 = ga; - [local count: 955630225]: + [local count: 315357973]: # i_13 = PHI # prephitmp_12 = PHI if (prephitmp_12 != 0) goto ; [33.00%] else goto ; [67.00%] - [local count: 315357972]: + [local count: 104068130]: _2 = do_something (); ga = _2; - [local count: 955630225]: + [local count: 315357973]: # prephitmp_5 = PHI i_10 = inc (i_13); if (n_7(D) > i_10) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: goto ; [100.00%] - [local count: 850510901]: + [local count: 280668596]: if (prephitmp_12 != 0) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [INV] +goto ; [67.00%] - [local count: 850510901]: + [local count: 280668596]: goto ; [100.00%] - [count: 0]: + [local count: 70429947]: # i_23 = PHI # prephitmp_25 = PHI - [local count: 955630225]: + [local count: 640272252]: # i_15 = PHI # prephitmp_16 = PHI i_22 = inc (i_15); if (n_7(D) > i_22) goto ; [89.00%] else goto ; [11.00%] - [local count: 850510901]: + [local count: 569842305]: goto ; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3a09bbc39e5..8e5a7ded0f7 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -583,10 +583,10 @@ split_loop (class loop *loop1) basic_block cond_bb; if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); there is no 'true_edge' variable at this point. Sorry, missed the above hunk when split the patch. gcc_assert (loop2); @@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, -profile_probability::always (), -profile_probability::never (), -profile_probability::always (), -profile_probability::always (), +invar_branch->probability.invert (), +invar_branch->probability, +invar_branch->probability.invert (), +invar_branch->probability, true); if (!loop2) { The patch introduction seems to talk about do_split_loop_on_cond only. split_loop faces similar issue though it sets the two branches to 100% vs 100% and no scaling which seems also incorrect. 
Since loop versioning inserts a condition with the passed probabilities but in this case a 'boolean_true_node' condition the th
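For context, here is a hand-written sketch of the transformation whose edge probabilities the patch adjusts. It illustrates classic split_loop on a guard that holds for a leading range of iterations; do_split_loop_on_cond handles the related semi-invariant-condition case shown in the dump above. This is an illustration, not one of the testcases.

void
before (int *a, int *b, int n, int m)
{
  for (int i = 0; i < n; i++)
    {
      if (i < m)        /* m is loop-invariant */
        a[i] = 0;
      b[i] = 1;
    }
}

/* Conceptually what split_loop produces: two loops whose iteration
   counts, and hence profile counts, must stay proportional to the
   original loop instead of both being scaled to 100%.  */
void
after (int *a, int *b, int n, int m)
{
  int i = 0;
  for (; i < n && i < m; i++)
    {
      a[i] = 0;
      b[i] = 1;
    }
  for (; i < n; i++)
    b[i] = 1;
}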
Re: [RFC] Don't move cold code out of loop by checking bb count
On 2021/9/22 17:14, Richard Biener wrote: On Thu, Sep 9, 2021 at 3:56 AM Xionghu Luo wrote: On 2021/8/26 19:33, Richard Biener wrote: On Tue, Aug 10, 2021 at 4:03 AM Xionghu Luo wrote: Hi, On 2021/8/6 20:15, Richard Biener wrote: On Mon, Aug 2, 2021 at 7:05 AM Xiong Hu Luo wrote: There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". This patch does this profile count check in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if the loop bb is colder than loop preheader, don't hoist it out of loop. Also, the profile count in loop split pass should be corrected to avoid lim2 and lim4 mismatch behavior, currently, the new loop preheader generated by loop_version is set to "[count: 0]:", then lim4 after lsplt pass will move statement out of loop unexpectely when lim2 didn't move it. This change could fix regression on 544.nab_r from -1.55% to +0.46%. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. While I'm not familiar with the RTL invariant motion pass the patch there looks reasonable. Note that we should assess the profile quality somehow - I'm not sure how to do that, CCed Honza for that. Thanks. For the GIMPLE part the patch looks quite complicated - but note it probably has to be since LIM performs kind of a "CSE" on loads (and stores for store-motion), so when there are multiple stmts affected by a hoisting decision the biggest block count has to be accounted. Likewise when there are dependent stmts involved that might include conditional stmts (a "PHI"), but the overall cost should be looked at. Currently, The gimple code check two situations with the patch: 1) The statement or PHI‘s BB is *colder* then preheader, don't move it out of loop; 2) The statement or PHI's BB is *hotter* then preheader, but any of it's rhs couldn't be moved out of loop, also don't move it out of loop to avoid definition not dominates use error. But part 2) is obviously already done. What I tried to say is your heuristic doesn't integrate nicely with the pass but I admitted that it might be a bit difficult to find a place to add this heuristic. There is lim_data->cost which we could bias negatively but then this is a cost that is independent on the hoisting distance. But doing this would work at least for the case where the immediately enclosing loop preheader is hotter than the stmt and with this it would be a patch that's similarly simple as the RTL one. Another possibility is to simply only adjust PHI processing in compute_invariantness, capping movement according to the hotness heuristic. The same could be done for regular stmts there but I'm not sure that will do good in the end since this function is supposed to compute "correctness" (well, it also has the cost stuff), and it's not the place to do overall cost considerations. Thanks. I found that adding a function find_coldest_out_loop and check it in outermost_invariant_loop to find the coldest invariant loop between outermost loop and itself could also reach the purpose. 
Then the gimple code check is redundant and could be removed. May be I could collect the number of instructions not hoisted with the patch on regression tests and SPEC2017 to do a estimation for "multiple stmts affected" and "overall cost" need to be considered? But it seems move_computations_worker couldn't rollback if we still want to hoist multiple stmts out during the iterations? Now - GIMPLE LIM "costing" is somewhat backward right now and it isn't set up to consider those multiple involved stmts. Plus the store-motion part does not have any cost part (but it depends on previously decided invariant motions). I think the way you implemented the check will cause no hoisting to be performed instead of, say, hoisting to a different loop level only. Possibly shown when you consider a loop nest like for (;;) if (unlikely_cond) for (;;) invariant; we want to hoist 'invariant' but only from the inner loop even if it is invariant also in the outer loop. For this case, theorotically I think the master GCC will optimize it to: invariant; for (;;) if (unlikely_cond) for (;;) ; 'invariant' is moved out of outer loop, but with the patch, it
Re: [RFC] Don't move cold code out of loop by checking bb count
On 2021/9/23 10:13, Xionghu Luo via Gcc-patches wrote: On 2021/9/22 17:14, Richard Biener wrote: On Thu, Sep 9, 2021 at 3:56 AM Xionghu Luo wrote: On 2021/8/26 19:33, Richard Biener wrote: On Tue, Aug 10, 2021 at 4:03 AM Xionghu Luo wrote: Hi, On 2021/8/6 20:15, Richard Biener wrote: On Mon, Aug 2, 2021 at 7:05 AM Xiong Hu Luo wrote: There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". This patch does this profile count check in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if the loop bb is colder than loop preheader, don't hoist it out of loop. Also, the profile count in loop split pass should be corrected to avoid lim2 and lim4 mismatch behavior, currently, the new loop preheader generated by loop_version is set to "[count: 0]:", then lim4 after lsplt pass will move statement out of loop unexpectely when lim2 didn't move it. This change could fix regression on 544.nab_r from -1.55% to +0.46%. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. While I'm not familiar with the RTL invariant motion pass the patch there looks reasonable. Note that we should assess the profile quality somehow - I'm not sure how to do that, CCed Honza for that. Thanks. For the GIMPLE part the patch looks quite complicated - but note it probably has to be since LIM performs kind of a "CSE" on loads (and stores for store-motion), so when there are multiple stmts affected by a hoisting decision the biggest block count has to be accounted. Likewise when there are dependent stmts involved that might include conditional stmts (a "PHI"), but the overall cost should be looked at. Currently, The gimple code check two situations with the patch: 1) The statement or PHI‘s BB is *colder* then preheader, don't move it out of loop; 2) The statement or PHI's BB is *hotter* then preheader, but any of it's rhs couldn't be moved out of loop, also don't move it out of loop to avoid definition not dominates use error. But part 2) is obviously already done. What I tried to say is your heuristic doesn't integrate nicely with the pass but I admitted that it might be a bit difficult to find a place to add this heuristic. There is lim_data->cost which we could bias negatively but then this is a cost that is independent on the hoisting distance. But doing this would work at least for the case where the immediately enclosing loop preheader is hotter than the stmt and with this it would be a patch that's similarly simple as the RTL one. Another possibility is to simply only adjust PHI processing in compute_invariantness, capping movement according to the hotness heuristic. The same could be done for regular stmts there but I'm not sure that will do good in the end since this function is supposed to compute "correctness" (well, it also has the cost stuff), and it's not the place to do overall cost considerations. Thanks. 
I found that adding a function find_coldest_out_loop and check it in outermost_invariant_loop to find the coldest invariant loop between outermost loop and itself could also reach the purpose. Then the gimple code check is redundant and could be removed. May be I could collect the number of instructions not hoisted with the patch on regression tests and SPEC2017 to do a estimation for "multiple stmts affected" and "overall cost" need to be considered? But it seems move_computations_worker couldn't rollback if we still want to hoist multiple stmts out during the iterations? Now - GIMPLE LIM "costing" is somewhat backward right now and it isn't set up to consider those multiple involved stmts. Plus the store-motion part does not have any cost part (but it depends on previously decided invariant motions). I think the way you implemented the check will cause no hoisting to be performed instead of, say, hoisting to a different loop level only. Possibly shown when you consider a loop nest like for (;;) if (unlikely_cond) for (;;) invariant; we want to hoist 'invariant' but only from the inner loop even if it is invariant also in the outer loop. For this case, theorotically I think the master GCC will optimize it to: invariant; for (;;) if (unlikely_cond)
Re: [RFC] Don't move cold code out of loop by checking bb count
Update the patch to v3, not sure whether you prefer the paste style and continue to link the previous thread as Segher dislikes this... [PATCH v3] Don't move cold code out of loop by checking bb count Changes: 1. Handle max_loop in determine_max_movement instead of outermost_invariant_loop. 2. Remove unnecessary changes. 3. Add for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body) in can_sm_ref_p. 4. "gsi_next (&bsi);" in move_computations_worker is kept since it caused infinite loop when implementing v1 and the iteration is missed to be updated actually. v1: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html v2: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579086.html There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". In gimple LIM analysis, add find_coldest_out_loop to move invariants to expected target loop, if profile count of the loop bb is colder than target loop preheader, it won't be hoisted out of loop. Likely for store motion, if all locations of the REF in loop is cold, don't do store motion of it. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. gcc/ChangeLog: * loop-invariant.c (find_invariants_bb): Check profile count before motion. (find_invariants_body): Add argument. * tree-ssa-loop-im.c (find_coldest_out_loop): New function. (determine_max_movement): Use find_coldest_out_loop. (move_computations_worker): Adjust and fix iteration udpate. (execute_sm_exit): Check pointer validness. (class ref_in_loop_hot_body): New functor. (ref_in_loop_hot_body::operator): New. (can_sm_ref_p): Use for_all_locs_in_loop. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/recip-3.c: Adjust. * gcc.dg/tree-ssa/ssa-lim-18.c: New test. * gcc.dg/tree-ssa/ssa-lim-19.c: New test. * gcc.dg/tree-ssa/ssa-lim-20.c: New test. --- gcc/loop-invariant.c | 10 ++-- gcc/tree-ssa-loop-im.c | 61 -- gcc/testsuite/gcc.dg/tree-ssa/recip-3.c| 2 +- gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c | 20 +++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 27 ++ gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c | 25 + gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c | 28 ++ 7 files changed, 165 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-18.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-20.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-21.c diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c index fca0c2b24be..5c3be7bf0eb 100644 --- a/gcc/loop-invariant.c +++ b/gcc/loop-invariant.c @@ -1183,9 +1183,14 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed) call. 
*/ static void -find_invariants_bb (basic_block bb, bool always_reached, bool always_executed) +find_invariants_bb (class loop *loop, basic_block bb, bool always_reached, + bool always_executed) { rtx_insn *insn; + basic_block preheader = loop_preheader_edge (loop)->src; + + if (preheader->count > bb->count) +return; FOR_BB_INSNS (bb, insn) { @@ -1214,8 +1219,7 @@ find_invariants_body (class loop *loop, basic_block *body, unsigned i; for (i = 0; i < loop->num_nodes; i++) -find_invariants_bb (body[i], - bitmap_bit_p (always_reached, i), +find_invariants_bb (loop, body[i], bitmap_bit_p (always_reached, i), bitmap_bit_p (always_executed, i)); } diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 4b187c2cdaf..655fab03442 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -417,6 +417,28 @@ movement_possibility (gimple *stmt) return ret; } +/* Find coldest loop between outmost_loop and loop by comapring profile count. */ + +static class loop * +find_coldest_out_loop (class loop *outmost_loop, class loop *loop, + basic_block curr_bb) +{ + class loop *cold_loop, *min_loop; + cold_loop = min_loop = outmost_loop; + profile_count min_count = loop_preheader_edge (min_loop)->src->count; + + if (curr_bb && curr_bb->count < loop_preheader_edge (loop)->src->count) +return NULL; + + while (min_loop != loop) +{ + min_loop = superloop_at_depth (loop, loop_depth (min_loop) + 1); + if (loop_preheader_edge (min_loop)->src->count < min_count) + cold_loop = min
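For the store-motion side of the change (item 3 in the changes listed above), a hypothetical reduced case, not one of the new testsuite files, would look like the following: every location of the store to *p inside the loop sits on a cold path, so the ref_in_loop_hot_body check is expected to make can_sm_ref_p reject it.

/* Hypothetical sketch: the only store to *p in the loop is on a cold
   path, so with the patch store motion should leave it in place
   instead of moving it out of the loop.  */
void bar (int n, int cold_flag, int *p)
{
  for (int i = 0; i < n; i++)
    if (__builtin_expect (cold_flag, 0))
      *p = i;  /* cold store; no store motion expected */
}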
Re: [PATCH v2 2/4] Refactor loop_version
On 2021/10/29 19:52, Richard Biener wrote: > On Wed, 27 Oct 2021, Xionghu Luo wrote: > >> loop_version currently does lv_adjust_loop_entry_edge >> before it loopifies the copy inserted on the header. This patch moves >> the condition generation later and thus we have four pieces to help >> understand how the adjustment works: >> 1) duplicating the loop on the entry edge. >> 2) loopify the duplicated new loop. >> 3) adjusting the CFG to insert a condition branching to either loop >> with lv_adjust_loop_entry_edge. >> 4) From loopify extract the scale_loop_frequencies bits. >> >> Also removed some pieces of code that seem obviously useless, though I am not >> completely sure: >> - redirect_all_edges since it is false and loopify is only called once. >> - extract_cond_bb_edges and lv_flush_pending_stmts (false_edge) as the >> edge is not redirected actually. > > This is OK (you can also commit this independently), thanks for the > cleanup. Thanks, committed this and [PATCH v2 4/4] as r12-4818 and r12-4819. -- Thanks, Xionghu
Re: [RFC] Don't move cold code out of loop by checking bb count
On 2021/10/29 19:48, Richard Biener wrote: > I'm talking about the can_sm_ref_p call, in that context 'loop' will > be the outermost loop of > interest, and we are calling this for all stores in a loop. We're doing > > +bool > +ref_in_loop_hot_body::operator () (mem_ref_loc *loc) > +{ > + basic_block curr_bb = gimple_bb (loc->stmt); > + class loop *inner_loop = curr_bb->loop_father; > + return find_coldest_out_loop (l, inner_loop, curr_bb); > > for each location the ref is accessed and the intent was to see > whether there's at least one > that we would like to move to 'loop'. Indeed since we only know the > common outer loop > but not the inner we are hoisting from there's not a single "coldest" > loop to cache and so > any caching we might want to perform could be applied to the other case as > well. > > I suppose the most natural thing to cache is for each loop the outer loop > where > its outer loop preheader would be hotter than the outer loops preheader so > that > > + while (outmost_loop != loop) > +{ > + if (bb_colder_than_loop_preheader (loop_preheader_edge > (outmost_loop)->src, > +loop_preheader_edge > (cold_loop)->src)) > + cold_loop = outmost_loop; > + outmost_loop = superloop_at_depth (loop, loop_depth (outmost_loop) + > 1); > +} > > could be instead written as > > coldest_loop = coldest_outermost_loop[loop->num]; > if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) > return outermost_loop; > return coldest_loop; > > ? And in the usual case coldest_outermost_loop[L] would be the loop tree > root. > It should be possible to compute such cache in a DFS walk of the loop tree > (the loop iterator by default visits in such order). Thanks. Updated the patch with your suggestion. Not sure whether it strictly conforms to your comments. Though the patch passed all my added tests (coverage not enough), I am still a bit worried: if the pre-computed coldest_loop is outside of outermost_loop, but outermost_loop is not the COLDEST LOOP, i.e. (outer->inner) [loop tree root, coldest_loop, outermost_loop, ..., second_coldest_loop, ..., loop], then function find_coldest_out_loop will return a loop NOT in accord with our expectation; it should return second_coldest_loop instead of outermost_loop? Changes: 1. Add function fill_coldest_out_loop to precompute the coldest outermost loop for each loop. 2. Rename find_coldest_out_loop to get_coldest_out_loop. 3. Add testcase ssa-lim-22.c to differentiate it from ssa-lim-19.c. v5 changes: 1. Refine comments for new functions. 2. Use basic_block instead of count in bb_colder_than_loop_preheader to align with the function name. 3. Refine with a simpler implementation for get_coldest_out_loop and ref_in_loop_hot_body::operator for better understanding. v4 changes: 1. Sort out profile_count comparison into function bb_cold_than_loop_preheader. 2. Update ref_in_loop_hot_body::operator () to find cold_loop before comparing. 3. Split RTL invariant motion part out. 4. Remove aux changes. v3 changes: 1. Handle max_loop in determine_max_movement instead of outermost_invariant_loop. 2. Remove unnecessary changes. 3. Add for_all_locs_in_loop (loop, ref, ref_in_loop_hot_body) in can_sm_ref_p. 4. "gsi_next (&bsi);" in move_computations_worker is kept since it caused an infinite loop when implementing v1 and the iteration update was actually missed. 
v1: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html v2: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579086.html v3: https://gcc.gnu.org/pipermail/gcc-patches/2021-September/580211.html v4: https://gcc.gnu.org/pipermail/gcc-patches/2021-October/581231.html v5: https://gcc.gnu.org/pipermail/gcc-patches/2021-October/581961.html There was a patch trying to avoid moving cold blocks out of loops: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". In gimple LIM analysis, add get_coldest_out_loop to move invariants to the expected target loop: if the profile count of the loop bb is colder than the target loop preheader, it won't be hoisted out of the loop. Likewise for store motion, if all locations of the REF in the loop are cold, don't do store motion of it. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r +1.98%, 526.blender_r +1.00% on P8LE. gcc/ChangeLog: * tree-ssa-loop-im.c (bb_colder_than_loop_preheader): New function. (get_coldest_out_loop): New function. (determine_max_movement): Use get_coldest_out_loop. (move_computations_worker): Adjust and fix iteration update. (class ref_in_loop_hot_body): New functor.
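As a self-contained sketch of the caching scheme discussed above (toy types and names, not the GCC implementation): a pre-order walk of the loop tree records, for each loop, the coldest loop on the path from the root, and the query then reduces to the depth comparison quoted from Richard's mail.

/* Toy model of the coldest-outermost-loop cache; all names here are
   illustrative, not GCC internals.  */
#include <stddef.h>

struct toy_loop
{
  int num;                   /* index into the cache */
  int depth;                 /* 0 for the loop-tree root */
  long preheader_count;      /* stand-in for the preheader profile count */
  struct toy_loop *parent;   /* immediate superloop, NULL for the root */
};

enum { TOY_MAX_LOOPS = 64 };
static struct toy_loop *coldest_outermost[TOY_MAX_LOOPS];

/* Fill the cache; must be called on parents before children
   (a pre-order walk of the loop tree).  */
static void
toy_fill_coldest (struct toy_loop *loop)
{
  if (loop->parent == NULL)
    coldest_outermost[loop->num] = loop;
  else
    {
      struct toy_loop *up = coldest_outermost[loop->parent->num];
      coldest_outermost[loop->num]
        = loop->preheader_count < up->preheader_count ? loop : up;
    }
}

/* Query: never hoist further out than OUTERMOST; otherwise prefer the
   cached coldest loop on the root-to-LOOP path.  */
static struct toy_loop *
toy_get_coldest (struct toy_loop *outermost, struct toy_loop *loop)
{
  struct toy_loop *coldest = coldest_outermost[loop->num];
  return coldest->depth < outermost->depth ? outermost : coldest;
}

Note that, exactly as worried about above, when the cached coldest loop lies outside OUTERMOST this query falls back to OUTERMOST rather than searching for a second-coldest loop inside the allowed range; that is the trade-off of keeping the cache O(1) per query.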
[PATCH] rs6000: Fix incorrect fusion constraint [PR102991]
The clobber constraint should match operand's constraint. fusion.md was generated by genfusion.pl, but it is disabled now, update both places with correct clobber constraint. gcc/ChangeLog: * config/rs6000/fusion.md: Fix incorrect clobber constraint. * config/rs6000/genfusion.pl: Likewise. --- gcc/config/rs6000/fusion.md| 128 - gcc/config/rs6000/genfusion.pl | 2 +- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 516baa0bb0b..d11cecb11ee 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -1874,7 +1874,7 @@ (define_insn "*fuse_vand_vand" (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vand %3,%1,%0\;vand %3,%3,%2 @@ -1892,7 +1892,7 @@ (define_insn "*fuse_vandc_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vandc %3,%1,%0\;vand %3,%3,%2 @@ -1910,7 +1910,7 @@ (define_insn "*fuse_veqv_vand" (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ veqv %3,%1,%0\;vand %3,%3,%2 @@ -1928,7 +1928,7 @@ (define_insn "*fuse_vnand_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vnand %3,%1,%0\;vand %3,%3,%2 @@ -1946,7 +1946,7 @@ (define_insn "*fuse_vnor_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vnor %3,%1,%0\;vand %3,%3,%2 @@ -1964,7 +1964,7 @@ (define_insn "*fuse_vor_vand" (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vor %3,%1,%0\;vand %3,%3,%2 @@ -1982,7 +1982,7 @@ (define_insn "*fuse_vorc_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && 
TARGET_P10_FUSION_2LOGICAL)" "@ vorc %3,%1,%0\;vand %3,%3,%2 @@ -2000,7 +2000,7 @@ (define_insn "*fuse_vxor_vand" (and:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vxor %3,%1,%0\;vand %3,%3,%2 @@ -2018,7 +2018,7 @@ (define_insn "*fuse_vand_vandc" (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 2 "altivec_register_operand" "v,v,v,v" - (clobber (match_scratch
Re: [PATCH] rs6000: Fix incorrect fusion constraint [PR102991]
On 2021/11/3 23:13, David Edelsohn wrote: > Did you manually change fusion.md or did you regenerate it after > fixing genfusion.pl? > > If you regenerated it, the ChangeLog entry should be "Regenerated" and > the "Fix incorrect clobber constraint." should refer to the > genfusion.pl change. > > I want to ensure that genfusion.pl generates the correct constraint > the next time it is used. > Aaron mentioned he disabled the auto generation here[1], but before than that, Segher suggested to enable it in stage1. [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-February/564652.html [2] https://gcc.gnu.org/pipermail/gcc-patches/2021-January/564244.html Thus re-enable it with the followed v2 patch(Confirmed the fusion.md is exactly same with v1 patch.) [PATCH v2] rs6000: Fix incorrect fusion constraint [PR102991] gcc/ChangeLog: * config/rs6000/fusion.md: Regenerate. * config/rs6000/genfusion.pl: Fix incorrect clobber constraint. * config/rs6000/t-rs6000: Uncomment regeneration of fusion.md. --- gcc/config/rs6000/fusion.md| 128 - gcc/config/rs6000/genfusion.pl | 2 +- gcc/config/rs6000/t-rs6000 | 4 +- 3 files changed, 67 insertions(+), 67 deletions(-) diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 516baa0bb0b..d11cecb11ee 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -1874,7 +1874,7 @@ (define_insn "*fuse_vand_vand" (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vand %3,%1,%0\;vand %3,%3,%2 @@ -1892,7 +1892,7 @@ (define_insn "*fuse_vandc_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vandc %3,%1,%0\;vand %3,%3,%2 @@ -1910,7 +1910,7 @@ (define_insn "*fuse_veqv_vand" (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ veqv %3,%1,%0\;vand %3,%3,%2 @@ -1928,7 +1928,7 @@ (define_insn "*fuse_vnand_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vnand %3,%1,%0\;vand %3,%3,%2 @@ -1946,7 +1946,7 @@ (define_insn "*fuse_vnor_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (not:VM (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vnor %3,%1,%0\;vand %3,%3,%2 @@ -1964,7 +1964,7 @@ (define_insn 
"*fuse_vor_vand" (and:VM (ior:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vor %3,%1,%0\;vand %3,%3,%2 @@ -1982,7 +1982,7 @@ (define_insn "*fuse_vorc_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vorc %3,%1,%0\;vand %3,%3,%2 @@ -2000,7 +2000,7 @@ (define_insn "*fuse_vxor_vand"
Re: [PATCH] rs6000: Fix incorrect fusion constraint [PR102991]
On 2021/11/4 09:59, David Edelsohn wrote: > On Wed, Nov 3, 2021 at 9:46 PM Xionghu Luo wrote: >> >> On 2021/11/3 23:13, David Edelsohn wrote: >>> Did you manually change fusion.md or did you regenerate it after >>> fixing genfusion.pl? >>> >>> If you regenerated it, the ChangeLog entry should be "Regenerated" and >>> the "Fix incorrect clobber constraint." should refer to the >>> genfusion.pl change. >>> >>> I want to ensure that genfusion.pl generates the correct constraint >>> the next time it is used. >>> >> >> Aaron mentioned he disabled the auto generation here[1], but before >> than that, Segher suggested to enable it in stage1. >> >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-February/564652.html >> [2] https://gcc.gnu.org/pipermail/gcc-patches/2021-January/564244.html >> >> >> Thus re-enable it with the followed v2 patch(Confirmed the fusion.md is >> exactly same with v1 patch.) >> >> >> [PATCH v2] rs6000: Fix incorrect fusion constraint [PR102991] >> >> >> gcc/ChangeLog: >> >> * config/rs6000/fusion.md: Regenerate. >> * config/rs6000/genfusion.pl: Fix incorrect clobber constraint. >> * config/rs6000/t-rs6000: Uncomment regeneration of fusion.md. > > I believe that there is some confusion about my request. I am not > requesting that the patch enable genfusion.pl . The Makefile fragment > rule to generate fusion.md is disabled for a reason and normally > should not be enabled. But fusion.md should be generated by > genfusion.pl when there is a change, and any changes should be made in > genfusion.pl. In other words, change genfusion.pl, temporarily enable > the Makefile fragment rule, generate fusion.md, disable genfusion.pl. > My request was an effort to ensure that genfusion.pl correctly > regenerates the new, corrected fusion.md file. I don't want a manual > change to fusion.md that differs from the automatically generated > file. Only the updated fusion.md and genfusion.pl should be checked > in. > > Has Aaron reviewed and confirmed the change to genfusion.pl? > Regenerate and update the ChangeLog description from v1: [PATCH] rs6000: Fix incorrect fusion constraint [PR102991] gcc/ChangeLog: * config/rs6000/fusion.md: Regenerate. * config/rs6000/genfusion.pl: Fix incorrect clobber constraint. 
--- gcc/config/rs6000/fusion.md| 128 - gcc/config/rs6000/genfusion.pl | 2 +- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md index 516baa0bb0b..d11cecb11ee 100644 --- a/gcc/config/rs6000/fusion.md +++ b/gcc/config/rs6000/fusion.md @@ -1874,7 +1874,7 @@ (define_insn "*fuse_vand_vand" (and:VM (and:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "%v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vand %3,%1,%0\;vand %3,%3,%2 @@ -1892,7 +1892,7 @@ (define_insn "*fuse_vandc_vand" (and:VM (and:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 1 "altivec_register_operand" "v,v,v,v")) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ vandc %3,%1,%0\;vand %3,%3,%2 @@ -1910,7 +1910,7 @@ (define_insn "*fuse_veqv_vand" (and:VM (not:VM (xor:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v") (match_operand:VM 1 "altivec_register_operand" "v,v,v,v"))) (match_operand:VM 2 "altivec_register_operand" "v,v,v,v"))) - (clobber (match_scratch:VM 4 "=X,X,X,&r"))] + (clobber (match_scratch:VM 4 "=X,X,X,&v"))] "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)" "@ veqv %3,%1,%0\;vand %3,%3,%2 @@ -1928,7 +1928,7 @@ (define_insn "*fuse_vnand_vand" (and:VM (ior:VM (not:VM (match_operand:VM 0 "altivec_register_operand" "v,v,v,v"))
Re: [PATCH] rs6000: Fix incorrect fusion constraint [PR102991]
On 2021/11/5 08:58, David Edelsohn wrote: > On Thu, Nov 4, 2021 at 8:50 PM Xionghu Luo wrote: > >> [PATCH] rs6000: Fix incorrect fusion constraint [PR102991] >> >> gcc/ChangeLog: >> >> * config/rs6000/fusion.md: Regenerate. >> * config/rs6000/genfusion.pl: Fix incorrect clobber constraint. > > Okay. > > Thanks, David > Committed as r12-4930; the gcc-11 branch also has fusion.md & genfusion.pl and needs a backport. -- Thanks, Xionghu
Re: [PATCH v3 1/4] Fix loop split incorrect count and probability
On 2021/10/27 15:44, Jan Hubicka wrote: >> On Wed, 27 Oct 2021, Jan Hubicka wrote: >> gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 25 - 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3f6ad046623..d30782888f3 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -575,7 +575,11 @@ split_loop (class loop *loop1) stmts2); tree cond = build2 (guard_code, boolean_type_node, guard_init, border); if (!initial_true) -cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); +cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); /* Now version the loop, placing loop2 after loop1 connecting them, and fix up SSA form for that. */ @@ -583,10 +587,10 @@ split_loop (class loop *loop1) basic_block cond_bb; class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); >>> >>> As discussed yesterday, for loop of form >>> >>> for (...) >>> if (cond) >>> cond = something(); >>> else >>> something2 >>> >>> Split as >> >> Note that you are missing to conditionalize loop1 execution >> on 'cond' (not sure if that makes a difference). > You are right - forgot to mention that. > > Entry conditional makes no difference on scaling stmts inside loop but > affects its header and expected trip count. We however need to set up > probability of this conditional (and preheader count if it exists). > There is no general way to read the probability of this initial > conditional from cfg profile. So I guess we are stuck with guessing > some arbitrary value. I guess the common case is that cond is true the first > iteration though and often we can easily see that from the PHI node > initializing the test variable. > > The other thing that changes is the expected number of iterations of the split > loops, so we may want to update the exit conditional probability > accordingly... > Sorry for the late reply. The updated patch below mainly solves the issues you pointed out: - profile count proportion for both the original loop and the copied loop without dropping down the true branch's count; - probability update in the two loops and between the two loops; - number of iterations update/check for split_loop. [PATCH v3] Fix loop split incorrect count and probability In tree-ssa-loop-split.c, split_loop and split_loop_on_cond do two kinds of split. split_loop only works for a single loop and inserts the edge at the exit when splitting, while split_loop_on_cond is not limited to a single loop and inserts the edge at the latch when splitting. Both kinds of split should consider loop count and probability updates. For split_loop, the split condition is moved in front of loop1 and loop2, but split_loop_on_cond moves the condition between loop1 and loop2. This patch does: 1) profile count proportion for both the original loop and the copied loop without dropping down the true branch's count; 2) probability update in and between the two loops; 3) number of iterations update for split_loop. Regression tests pass, OK for master? 
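For orientation, the dump diff that follows comes from a split_loop-style case roughly of the following shape (a reconstruction for illustration only; the actual test source is not part of this mail, and the "b:" branch is an assumption):

/* Illustrative source: j < c is true for the first iterations and then
   stays false, so split_loop can version the loop into one copy where
   the condition always holds and one where it never does.  */
extern int printf (const char *, ...);

void
foo (int beg, int end, int beg2, int c)
{
  for (int i = beg, j = beg2; i < end; i++, j++)
    {
      if (j < c)
        printf ("a: %d %d\n", i, j);
      else
        printf ("b: %d %d\n", i, j);
    }
}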
Changes diff for split_loop and split_loop_on_cond cases: 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit ... [local count: 118111600]: if (beg_5(D) < end_8(D)) goto ; [89.00%] else goto ; [11.00%] [local count: 105119324]: if (beg2_6(D) < c_9(D)) -goto ; [100.00%] +goto ; [33.00%] else -goto ; [100.00%] +goto ; [67.00%] - [local count: 105119324]: + [local count: 34689377]: _25 = beg_5(D) + 1; _26 = end_8(D) - beg_5(D); _27 = beg2_6(D) + _26; _28 = MIN_EXPR ; - [local count: 955630225]: + [local count: 315357973]: # i_16 = PHI # j_17 = PHI printf ("a: %d %d\n", i_16, j_17); i_11 = i_16 + 1; j_12 = j_17 + 1; if (j_12 < _28) -goto ; [89.00%] +goto ; [29.37%] else -goto ; [11.00%] +goto ; [70.63%] -
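Reading the counts in the fragment above (assuming the 33%/67% split shown is the probability estimated for the guarding condition j < c): the entry count of loop1 is scaled from 105119324 to 105119324 * 33% ~= 34689377, the loop body count from 955630225 to approximately 955630225 * 33% ~= 315357973, and the latch probability from 89% to 89% * 33% = 29.37%, which matches point 1) above: each copy of the loop now carries a proportional share of the original profile instead of the full count that the old always() probabilities produced.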