From: Xiong Hu Luo <[email protected]>
adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
on Power. For example, after it adjusts the position of the iv update
statement, the two memory references use mismatched address offsets:
<bb 32> [local count: 70988443]:
_84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
ivtmp.30_415 = ivtmp.30_414 + 1;
_34 = ref_180 + 18446744073709551615;
_86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
if (_84 == _86)
goto <bb 56>; [94.50%]
else
goto <bb 87>; [5.50%]
Disabling it produces:
<bb 32> [local count: 70988443]:
_84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
_86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
ivtmp.30_415 = ivtmp.30_414 + 1;
if (_84 == _86)
goto <bb 56>; [94.50%]
else
goto <bb 87>; [5.50%]
A later loop unrolling pass can then benefit from both memory references
using the same address offset with different base addresses, which reduces
register dependencies.
This patch can improve performance by 10% for a typical case on Power; no
performance change was observed on x86 or AArch64, because such small loops
are not unrolled on those platforms. Any comments?
.L67:
lbzx %r7,%r8,%r6
lbzx %r12,%r25,%r4
cmpw %cr0,%r7,%r12
bne %cr0,.L11
lbzx %r7,%r8,%r4
mr %r6,%r4
addi %r4,%r4,1
lbzx %r12,%r25,%r4
mr %r11,%r6
cmpw %cr0,%r7,%r12
bne %cr0,.L11
mr %r6,%r4
.L12:
cmpdi %cr0,%r10,1
addi %r4,%r6,1
mr %r11,%r6
addi %r10,%r10,-1
bne %cr0,.L67
vs.
.L67:
lbzx %r25,%r8,%r6
lbzx %r12,%r7,%r6
addi %r4,%r6,1
cmpw %cr0,%r25,%r12
bne %cr0,.L11
lbzx %r12,%r8,%r4
lbzx %r25,%r7,%r4
mr %r6,%r4
mr %r11,%r4
cmpw %cr0,%r12,%r25
bne %cr0,.L11
addi %r6,%r4,1
.L12:
cmpdi %cr0,%r10,1
mr %r11,%r6
addi %r10,%r10,-1
bne %cr0,.L67
gcc/ChangeLog:
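* config/rs6000/rs6000.c (TARGET_ADJUST_IV_UPDATE_POS): Define.
(rs6000_adjust_iv_update_pos): New function.
* doc/tm.texi: Regenerate.
* doc/tm.texi.in (TARGET_ADJUST_IV_UPDATE_POS): Add hook.
* target.def (adjust_iv_update_pos): New hook.
* targhooks.c (default_adjust_iv_update_pos): New function.
* targhooks.h (default_adjust_iv_update_pos): Declare.
* tree-ssa-loop-ivopts.c (rewrite_use_address): Call
adjust_iv_update_pos only when targetm.adjust_iv_update_pos returns true.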
---
gcc/config/rs6000/rs6000.c | 11 +++++++++++
gcc/doc/tm.texi | 5 +++++
gcc/doc/tm.texi.in | 2 ++
gcc/target.def | 7 +++++++
gcc/targhooks.c | 6 ++++++
gcc/targhooks.h | 2 ++
gcc/tree-ssa-loop-ivopts.c | 3 ++-
7 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index cd130dea611..e7725997793 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1455,6 +1455,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
+#undef TARGET_ADJUST_IV_UPDATE_POS
+#define TARGET_ADJUST_IV_UPDATE_POS rs6000_adjust_iv_update_pos
+
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS rs6000_init_builtins
#undef TARGET_BUILTIN_DECL
@@ -5457,6 +5460,14 @@ rs6000_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
return nunroll;
}
+/* Implement targetm.adjust_iv_update_pos. */
+
+bool
+rs6000_adjust_iv_update_pos (void)
+{
+ return false;
+}
+
/* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
library with vectorized intrinsics. */
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index b272fa4806d..07ce40eb053 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11768,6 +11768,11 @@ By default, the RTL loop optimizer does not use a present doloop pattern for
loops containing function calls or branch on table instructions.
@end deftypefn
+@deftypefn {Target Hook} bool TARGET_ADJUST_IV_UPDATE_POS (void)
+if adjust_iv_update_pos is enabled, reorder the iv update statement,
+ then mem ref uses the iv value after update.
+@end deftypefn
+
@deftypefn {Target Hook} bool TARGET_LEGITIMATE_COMBINED_INSN (rtx_insn
*@var{insn})
Take an instruction in @var{insn} and return @code{false} if the instruction
is not appropriate as a combination of two or more instructions. The default
is to accept all instructions.
@end deftypefn
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index bf724dc093c..87d02089588 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7979,6 +7979,8 @@ to by @var{ce_info}.
@hook TARGET_INVALID_WITHIN_DOLOOP
+@hook TARGET_ADJUST_IV_UPDATE_POS
+
@hook TARGET_LEGITIMATE_COMBINED_INSN
@hook TARGET_CAN_FOLLOW_JUMP
diff --git a/gcc/target.def b/gcc/target.def
index d7b94bd8e5d..aead7cb79ff 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4398,6 +4398,13 @@ loops containing function calls or branch on table instructions.",
const char *, (const rtx_insn *insn),
default_invalid_within_doloop)
+/* Function to adjust iv update statement position.  */
+DEFHOOK
+(adjust_iv_update_pos,
+ "if adjust_iv_update_pos is enabled, reorder the iv update statement,\n\
+ then mem ref uses the iv value after update.",
+ bool, (void), default_adjust_iv_update_pos)
+
/* Returns true for a legitimate combined insn. */
DEFHOOK
(legitimate_combined_insn,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index d69c9a2d819..2a93a3489e6 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -679,6 +679,12 @@ default_invalid_within_doloop (const rtx_insn *insn)
return NULL;
}
+bool
+default_adjust_iv_update_pos (void)
+{
+ return true;
+}
+
/* Mapping of builtin functions to vectorized variants. */
tree
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 39a6f82f143..298ecd4fc99 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -90,6 +90,8 @@ extern bool default_has_ifunc_p (void);
extern bool default_predict_doloop_p (class loop *);
extern const char * default_invalid_within_doloop (const rtx_insn *);
+extern bool default_adjust_iv_update_pos (void);
+
extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
extern tree default_builtin_md_vectorized_function (tree, tree, tree);
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 4012ae3f19d..5dbc306862c 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -7438,7 +7438,8 @@ rewrite_use_address (struct ivopts_data *data,
aff_tree aff;
bool ok;
- adjust_iv_update_pos (cand, use);
+ if (targetm.adjust_iv_update_pos ())
+ adjust_iv_update_pos (cand, use);
ok = get_computation_aff (data->current_loop, use->stmt, use, cand, &aff);
gcc_assert (ok);
unshare_aff_combination (&aff);
--
2.25.1