"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> scratch register to avoid stack realignment when expanding memset.
>
>       PR middle-end/90773
>       * builtins.c (gen_memset_value_from_prev): New function.
>       (gen_memset_broadcast): Likewise.
>       (builtin_memset_read_str): Use gen_memset_value_from_prev
>       and gen_memset_broadcast.
>       (builtin_memset_gen_str): Likewise.
>       * target.def (gen_memset_scratch_rtx): New hook.
>       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
>       * doc/tm.texi: Regenerated.
> ---
>  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------
>  gcc/doc/tm.texi    |   5 ++
>  gcc/doc/tm.texi.in |   2 +
>  gcc/target.def     |   7 +++
>  4 files changed, 116 insertions(+), 21 deletions(-)
>
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 39ab139b7e1..c1758ae2efc 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
>    return NULL_RTX;
>  }
>  
> -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> -   bytes from constant string DATA + OFFSET and return it as target
> -   constant.  If PREV isn't nullptr, it has the RTL info from the
> +/* Return the RTL of a register in MODE generated from PREV in the
>     previous iteration.  */
>  
> -rtx
> -builtin_memset_read_str (void *data, void *prevp,
> -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> -                      scalar_int_mode mode)
> +static rtx
> +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
>  {
> +  rtx target = nullptr;
>    by_pieces_prev *prev = (by_pieces_prev *) prevp;
>    if (prev != nullptr && prev->data != nullptr)
>      {
>        /* Use the previous data in the same mode.  */
>        if (prev->mode == mode)
>       return prev->data;
> +
> +      rtx prev_rtx = prev->data;
> +      machine_mode prev_mode = prev->mode;
> +      unsigned int word_size = GET_MODE_SIZE (word_mode);
> +      if (word_size < GET_MODE_SIZE (prev->mode)
> +       && word_size > GET_MODE_SIZE (mode))
> +     {
> +       /* First generate subreg of word mode if the previous mode is
> +          wider than word mode and word mode is wider than MODE.  */
> +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> +                                       prev_mode, 0);
> +       prev_mode = word_mode;
> +     }
> +      if (prev_rtx != nullptr)
> +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>      }
> +  return target;
> +}
> +
> +/* Return the RTL of a register in MODE broadcasted from DATA.  */
> +
> +static rtx
> +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> +{
> +  /* Skip if regno_reg_rtx isn't initialized.  */
> +  if (!regno_reg_rtx)
> +    return nullptr;
> +
> +  rtx target = nullptr;
> +
> +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> +  machine_mode vector_mode;
> +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> +    gcc_unreachable ();

Sorry, I realise it's a bit late to be raising this objection now,
but I don't think it's a good idea to use scalar integer modes as
a proxy for vector modes.  In principle there's no reason why a
target has to define an integer mode for every vector mode.

If we want the mode to be a vector then I think the by-pieces
infrastructure should be extended to support vectors directly,
rather than assuming that each piece can be represented as
a scalar_int_mode.

Thanks,
Richard

> +
> +  enum insn_code icode = optab_handler (vec_duplicate_optab,
> +                                     vector_mode);
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
> +      if (CONST_INT_P (data))
> +     {
> +       /* Use the move expander with CONST_VECTOR.  */
> +       rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
> +       emit_move_insn (reg, const_vec);
> +     }
> +      else
> +     {
> +
> +       class expand_operand ops[2];
> +       create_output_operand (&ops[0], reg, vector_mode);
> +       create_input_operand (&ops[1], data, QImode);
> +       expand_insn (icode, 2, ops);
> +       if (!rtx_equal_p (reg, ops[0].value))
> +         emit_move_insn (reg, ops[0].value);
> +     }
> +      target = lowpart_subreg (mode, reg, vector_mode);
> +    }
> +
> +  return target;
> +}
> +
> +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> +   bytes from constant string DATA + OFFSET and return it as target
> +   constant.  If PREV isn't nullptr, it has the RTL info from the
> +   previous iteration.  */
>  
> +rtx
> +builtin_memset_read_str (void *data, void *prev,
> +                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> +                      scalar_int_mode mode)
> +{
> +  rtx target;
>    const char *c = (const char *) data;
> -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> +  char *p;
> +
> +  /* Don't use the previous value if size is 1.  */
> +  if (GET_MODE_SIZE (mode) != 1)
> +    {
> +      target = gen_memset_value_from_prev (prev, mode);
> +      if (target != nullptr)
> +     return target;
> +
> +      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> +      memset (p, *c, GET_MODE_SIZE (QImode));
> +      rtx src = c_readstr (p, QImode);
> +      target = gen_memset_broadcast (src, mode);
> +      if (target != nullptr)
> +     return target;
> +    }
> +
> +  p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
>  
>    memset (p, *c, GET_MODE_SIZE (mode));
>  
> @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp,
>     nullptr, it has the RTL info from the previous iteration.  */
>  
>  static rtx
> -builtin_memset_gen_str (void *data, void *prevp,
> +builtin_memset_gen_str (void *data, void *prev,
>                       HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>                       scalar_int_mode mode)
>  {
> @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp,
>    size_t size;
>    char *p;
>  
> -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> -  if (prev != nullptr && prev->data != nullptr)
> -    {
> -      /* Use the previous data in the same mode.  */
> -      if (prev->mode == mode)
> -     return prev->data;
> -
> -      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> -      if (target != nullptr)
> -     return target;
> -    }
> -
>    size = GET_MODE_SIZE (mode);
>    if (size == 1)
>      return (rtx) data;
>  
> +  target = gen_memset_value_from_prev (prev, mode);
> +  if (target != nullptr)
> +    return target;
> +
> +  target = gen_memset_broadcast ((rtx) data, mode);
> +  if (target != nullptr)
> +    return target;
> +
>    p = XALLOCAVEC (char, size);
>    memset (p, 1, size);
>    coeff = c_readstr (p, mode);
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 2a41ae5fba1..8849711fcf8 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional 
> comparison within a sequence
>   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode 
> @var{mode})
> +This hook should return an rtx for scratch register in @var{mode} to
> +be used by memset broadcast. The default is @code{gen_reg_rtx}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned 
> @var{nunroll}, class loop *@var{loop})
>  This target hook returns a new value for the number of times @var{loop}
>  should be unrolled. The parameter @var{nunroll} is the number of times
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index f881cdabe9e..a6bbf4f2667 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7958,6 +7958,8 @@ lists.
>  
>  @hook TARGET_GEN_CCMP_NEXT
>  
> +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> +
>  @hook TARGET_LOOP_UNROLL_ADJUST
>  
>  @defmac POWI_MAX_MULTS
> diff --git a/gcc/target.def b/gcc/target.def
> index c009671c583..5fb287db3bd 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -2726,6 +2726,13 @@ DEFHOOK
>   rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree 
> op0, tree op1, int bit_code),
>   NULL)
>  
> +DEFHOOK
> +(gen_memset_scratch_rtx,
> + "This hook should return an rtx for scratch register in @var{mode} to\n\
> +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> + rtx, (machine_mode mode),
> + gen_reg_rtx)
> +
>  /* Return a new value for loop unroll size.  */
>  DEFHOOK
>  (loop_unroll_adjust,

Reply via email to