On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> From: Xiong Hu Luo <luo...@linux.ibm.com>
>
> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
> on Power.  For example, it generates mismatched address offset after
> adjust iv update statement position:
>
> <bb 32> [local count: 70988443]:
> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> ivtmp.30_415 = ivtmp.30_414 + 1;
> _34 = ref_180 + 18446744073709551615;
> _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
> if (_84 == _86)
>   goto <bb 56>; [94.50%]
>   else
>   goto <bb 87>; [5.50%]
>
> Disable it will produce:
>
> <bb 32> [local count: 70988443]:
> _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
> _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
> ivtmp.30_415 = ivtmp.30_414 + 1;
> if (_84 == _86)
>   goto <bb 56>; [94.50%]
>   else
>   goto <bb 87>; [5.50%]
>
> Then later pass loop unroll could benefit from same address offset
> with different base address and reduces register dependency.
> This patch could improve performance by 10% for typical case on Power,
> no performance change observed for X86 or Aarch64 due to small loops
> not unrolled on these platforms.  Any comments?

The case you quote is special in that if we hoisted the IV update before
the other MEM _also_ used in the condition it would be fine again.

Now, adjust_iv_update_pos doesn't seem to check that the
condition actually uses the IV use stmt def, so it likely applies to
too many cases.

Unfortunately the introducing rev didn't come with a testcase,
but still I think fixing up adjust_iv_update_pos is better than
introducing a way to short-cut it per target decision.

One "fix" might be to add a check that either the condition
lhs or rhs is the def of the IV use and the other operand
is invariant.  Or if it's of similar structure hoist across the
other iv-use as well.  Not that I understand the argument
about the overlapping life-range.

You also don't provide a complete testcase ...

Richard.


> .L67:
>   lbzx %r7,%r8,%r6
>   lbzx %r12,%r25,%r4
>   cmpw %cr0,%r7,%r12
>   bne %cr0,.L11
>   lbzx %r7,%r8,%r4
>   mr %r6,%r4
>   addi %r4,%r4,1
>   lbzx %r12,%r25,%r4
>   mr %r11,%r6
>   cmpw %cr0,%r7,%r12
>   bne %cr0,.L11
>   mr %r6,%r4
> .L12:
>   cmpdi %cr0,%r10,1
>   addi %r4,%r6,1
>   mr %r11,%r6
>   addi %r10,%r10,-1
>   bne %cr0,.L67
>
> vs.
>
> .L67:
>   lbzx %r25,%r8,%r6
>   lbzx %r12,%r7,%r6
>   addi %r4,%r6,1
>   cmpw %cr0,%r25,%r12
>   bne %cr0,.L11
>   lbzx %r12,%r8,%r4
>   lbzx %r25,%r7,%r4
>   mr %r6,%r4
>   mr %r11,%r4
>   cmpw %cr0,%r12,%r25
>   bne %cr0,.L11
>   addi %r6,%r4,1
> .L12:
>   cmpdi %cr0,%r10,1
>   mr %r11,%r6
>   addi %r10,%r10,-1
>   bne %cr0,.L67
>
> gcc/ChangeLog:
>
>         * config/rs6000/rs6000.c (TARGET_ADJUST_IV_UPDATE_POS):
>         (rs6000_adjust_iv_update_pos):
>         * doc/tm.texi:
>         * doc/tm.texi.in:
>         * target.def:
>         * targhooks.c (default_adjust_iv_update_pos):
>         * targhooks.h (default_adjust_iv_update_pos):
>         * tree-ssa-loop-ivopts.c (rewrite_use_address):
> ---
>  gcc/config/rs6000/rs6000.c | 11 +++++++++++
>  gcc/doc/tm.texi            |  5 +++++
>  gcc/doc/tm.texi.in         |  2 ++
>  gcc/target.def             |  7 +++++++
>  gcc/targhooks.c            |  6 ++++++
>  gcc/targhooks.h            |  2 ++
>  gcc/tree-ssa-loop-ivopts.c |  3 ++-
>  7 files changed, 35 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index cd130dea611..e7725997793 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -1455,6 +1455,9 @@ static const struct attribute_spec 
> rs6000_attribute_table[] =
>  #undef TARGET_LOOP_UNROLL_ADJUST
>  #define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
>
> +#undef TARGET_ADJUST_IV_UPDATE_POS
> +#define TARGET_ADJUST_IV_UPDATE_POS rs6000_adjust_iv_update_pos
> +
>  #undef TARGET_INIT_BUILTINS
>  #define TARGET_INIT_BUILTINS rs6000_init_builtins
>  #undef TARGET_BUILTIN_DECL
> @@ -5457,6 +5460,14 @@ rs6000_loop_unroll_adjust (unsigned nunroll, struct 
> loop *loop)
>    return nunroll;
>  }
>
> +/* Implement targetm.adjust_iv_update_pos.  */
> +
> +bool
> +rs6000_adjust_iv_update_pos (void)
> +{
> +  return false;
> +}
> +
>  /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
>     library with vectorized intrinsics.  */
>
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index b272fa4806d..07ce40eb053 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -11768,6 +11768,11 @@ By default, the RTL loop optimizer does not use a 
> present doloop pattern for
>  loops containing function calls or branch on table instructions.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_ADJUST_IV_UPDATE_POS (void)
> +if adjust_iv_update_pos is enabled, reorder the iv update statement,
> + then mem ref uses the iv value after update.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} bool TARGET_LEGITIMATE_COMBINED_INSN (rtx_insn 
> *@var{insn})
>  Take an instruction in @var{insn} and return @code{false} if the instruction 
> is not appropriate as a combination of two or more instructions.  The default 
> is to accept all instructions.
>  @end deftypefn
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index bf724dc093c..87d02089588 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7979,6 +7979,8 @@ to by @var{ce_info}.
>
>  @hook TARGET_INVALID_WITHIN_DOLOOP
>
> +@hook TARGET_ADJUST_IV_UPDATE_POS
> +
>  @hook TARGET_LEGITIMATE_COMBINED_INSN
>
>  @hook TARGET_CAN_FOLLOW_JUMP
> diff --git a/gcc/target.def b/gcc/target.def
> index d7b94bd8e5d..aead7cb79ff 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -4398,6 +4398,13 @@ loops containing function calls or branch on table 
> instructions.",
>   const char *, (const rtx_insn *insn),
>   default_invalid_within_doloop)
>
> +/* Function to adjust iv update statment position.  */
> +DEFHOOK
> +(adjust_iv_update_pos,
> + "if adjust_iv_update_pos is enabled, reorder the iv update statement,\n\
> + then mem ref uses the iv value after update.",
> + bool, (void), default_adjust_iv_update_pos)
> +
>  /* Returns true for a legitimate combined insn.  */
>  DEFHOOK
>  (legitimate_combined_insn,
> diff --git a/gcc/targhooks.c b/gcc/targhooks.c
> index d69c9a2d819..2a93a3489e6 100644
> --- a/gcc/targhooks.c
> +++ b/gcc/targhooks.c
> @@ -679,6 +679,12 @@ default_invalid_within_doloop (const rtx_insn *insn)
>    return NULL;
>  }
>
> +bool
> +default_adjust_iv_update_pos (void)
> +{
> +  return true;
> +}
> +
>  /* Mapping of builtin functions to vectorized variants.  */
>
>  tree
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 39a6f82f143..298ecd4fc99 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -90,6 +90,8 @@ extern bool default_has_ifunc_p (void);
>  extern bool default_predict_doloop_p (class loop *);
>  extern const char * default_invalid_within_doloop (const rtx_insn *);
>
> +extern bool default_adjust_iv_update_pos (void);
> +
>  extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
>  extern tree default_builtin_md_vectorized_function (tree, tree, tree);
>
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index 4012ae3f19d..5dbc306862c 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -7438,7 +7438,8 @@ rewrite_use_address (struct ivopts_data *data,
>    aff_tree aff;
>    bool ok;
>
> -  adjust_iv_update_pos (cand, use);
> +  if (targetm.adjust_iv_update_pos ())
> +    adjust_iv_update_pos (cand, use);
>    ok = get_computation_aff (data->current_loop, use->stmt, use, cand, &aff);
>    gcc_assert (ok);
>    unshare_aff_combination (&aff);
> --
> 2.25.1
>

Reply via email to