HAO CHEN GUI <guih...@linux.ibm.com> writes:
> Hi,
>   This patch adds const0 move checking for CLEAR_BY_PIECES. The original
> vec_duplicate handles duplicates of non-constant inputs. But 0 is a
> constant. So even a platform doesn't support vec_duplicate, it could
> still do clear by pieces if it supports const0 move by that mode.
>
>   Compared to the previous version, the main changes are to create a
> new class for clear by pieces and add an additional argument to
> indicate if the object is constant in pieces_addr.
> https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658337.html

Rather than add the additional argument, could we instead provide a
constfn that always returns zero?  ISTM that, under the current pieces_addr
framework, clear by pieces is essentially a memcpy from arbitrarily many
zeros.  E.g.:

  clear_by_pieces_d (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
    : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true,
                      read_zero, NULL, len, align, false, CLEAR_BY_PIECES)

where read_zero is something like:

static rtx
read_zero (void *, void *, HOST_WIDE_INT, fixed_size_mode mode)
{
  return CONST0_RTX (mode);
}

(completely untested).

The changes to by_pieces_mode_supported_p look good.

Thanks,
Richard

>   I didn't convert const0 move predicate check to an assertion as it
> caused ICEs on i386. On i386, some modes (V8QI V4HI V2SI V1DI) have
> move expand defined but their predicates don't include const0.
>
>   Bootstrapped and tested on powerpc64-linux BE and LE with no
> regressions.
>
>   On i386, it got several regressions. One issue is the predicate of
> V16QI move expand doesn't include const0. Thus V16QI mode can't be used
> for clear by pieces with the patch. The second issue is the const0 is
> passed directly to the move expand with the patch. Originally it is
> forced to a pseudo and i386 can leverage the previous data to do
> optimization.
>
>   The patch also raises several regressions on aarch64. The V2x8QImode
> replaces TImode to do 16-byte clear by pieces as V2x8QImode move expand
> supports const0 and vector mode is preferable. I drafted a patch to
> address the issue. It will be sent for review in a separate email.
> Another problem is V8QImode replaces DImode to do 8-byte clear by pieces.
> It seems cause different sequences of instructions but the actually
> instructions are the same.
>
>
> ChangeLog
> expand: Add const0 move checking for CLEAR_BY_PIECES optabs
>
> vec_duplicate handles duplicates of non-constant inputs.  The 0 is a
> constant.  So even a platform doesn't support vec_duplicate, it could
> still do clear by pieces if it supports const0 move.  This patch adds
> the checking.
>
> gcc/
>       * expr.cc (by_pieces_mode_supported_p): Add const0 move checking
>       for CLEAR_BY_PIECES.
>       (pieces_addr::pieces_addr): Add fifth argument is_const to
>       indicate if object is a constant.  Do not set m_addr_inc if object
>       is a constant.
>       (op_by_pieces_d::op_by_pieces_d): Initialize m_from by setting
>       is_const to true for CLEAR_BY_PIECES.
>       (class clear_by_pieces_d): New.
>       (clear_by_pieces_d::prepare_mode): New.
>       (clear_by_pieces_d::generate): New.
>       (clear_by_pieces): Replace store_by_pieces_d with clear_by_pieces_d.
>
> patch.diff
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index 9f66d479445..abf69c8d698 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -1014,14 +1014,20 @@ can_use_qi_vectors (by_pieces_operation op)
>  static bool
>  by_pieces_mode_supported_p (fixed_size_mode mode, by_pieces_operation op)
>  {
> -  if (optab_handler (mov_optab, mode) == CODE_FOR_nothing)
> +  enum insn_code icode = optab_handler (mov_optab, mode);
> +  if (icode == CODE_FOR_nothing)
>      return false;
>
> -  if ((op == SET_BY_PIECES || op == CLEAR_BY_PIECES)
> +  if (op == SET_BY_PIECES
>        && VECTOR_MODE_P (mode)
>        && optab_handler (vec_duplicate_optab, mode) == CODE_FOR_nothing)
>      return false;
>
> +  if (op == CLEAR_BY_PIECES
> +      && VECTOR_MODE_P (mode)
> +      && !insn_operand_matches (icode, 1, CONST0_RTX (mode)))
> +   return false;
> +
>    if (op == COMPARE_BY_PIECES
>        && !can_compare_p (EQ, mode, ccp_jump))
>      return false;
> @@ -1184,7 +1190,7 @@ class pieces_addr
>    by_pieces_constfn m_constfn;
>    void *m_cfndata;
>  public:
> -  pieces_addr (rtx, bool, by_pieces_constfn, void *);
> +  pieces_addr (rtx, bool, by_pieces_constfn, void *, bool = false);
>    rtx adjust (fixed_size_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
>    void increment_address (HOST_WIDE_INT);
>    void maybe_predec (HOST_WIDE_INT);
> @@ -1204,7 +1210,7 @@ public:
>     memory load.  */
>
>  pieces_addr::pieces_addr (rtx obj, bool is_load, by_pieces_constfn constfn,
> -                       void *cfndata)
> +                       void *cfndata, bool is_const)
>    : m_obj (obj), m_is_load (is_load), m_constfn (constfn), m_cfndata 
> (cfndata)
>  {
>    m_addr_inc = 0;
> @@ -1228,7 +1234,9 @@ pieces_addr::pieces_addr (rtx obj, bool is_load, 
> by_pieces_constfn constfn,
>    else
>      {
>        m_addr = NULL_RTX;
> -      if (!is_load)
> +      if (is_const)
> +     ;
> +      else if (!is_load)
>       {
>         m_auto = true;
>         if (STACK_GROWS_DOWNWARD)
> @@ -1391,7 +1399,7 @@ op_by_pieces_d::op_by_pieces_d (unsigned int 
> max_pieces, rtx to,
>                               unsigned int align, bool push,
>                               by_pieces_operation op)
>    : m_to (to, to_load, NULL, NULL),
> -    m_from (from, from_load, from_cfn, from_cfn_data),
> +    m_from (from, from_load, from_cfn, from_cfn_data, op == CLEAR_BY_PIECES),
>      m_len (len), m_max_size (max_pieces + 1),
>      m_push (push), m_op (op)
>  {
> @@ -1724,6 +1732,48 @@ store_by_pieces_d::finish_retmode (memop_ret retmode)
>    return m_to.adjust (QImode, m_offset);
>  }
>
> +/* Derived class from op_by_pieces_d, providing support for block clear
> +   operations.  */
> +
> +class clear_by_pieces_d : public op_by_pieces_d
> +{
> +  insn_gen_fn m_gen_fun;
> +
> +  void generate (rtx, rtx, machine_mode) final override;
> +  bool prepare_mode (machine_mode, unsigned int) final override;
> +
> + public:
> +  clear_by_pieces_d (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
> +    : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, false, NULL,
> +                   NULL, len, align, false, CLEAR_BY_PIECES)
> +  {
> +  }
> +};
> +
> +/* Return true if MODE can be used for a set of stores, given an
> +   alignment ALIGN.  Prepare whatever data is necessary for later
> +   calls to generate.  */
> +
> +bool
> +clear_by_pieces_d::prepare_mode (machine_mode mode, unsigned int align)
> +{
> +  insn_code icode = optab_handler (mov_optab, mode);
> +  m_gen_fun = GEN_FCN (icode);
> +  return icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode);
> +}
> +
> +/* A callback used when iterating for a clear_by_pieces_operation.
> +   The input op1 should be NULL for clear_by_pieces and set it to
> +   const0 of that mode.  */
> +
> +void
> +clear_by_pieces_d::generate (rtx op0, rtx op1, machine_mode mode)
> +{
> +  gcc_assert (!op1);
> +  op1 = CONST0_RTX (mode);
> +  emit_insn (m_gen_fun (op0, op1));
> +}
> +
>  /* Determine whether the LEN bytes generated by CONSTFUN can be
>     stored to memory using several move instructions.  CONSTFUNDATA is
>     a pointer which will be passed as argument in every CONSTFUN call.
> @@ -1849,10 +1899,7 @@ clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, 
> unsigned int align)
>    if (len == 0)
>      return;
>
> -  /* Use builtin_memset_read_str to support vector mode broadcast.  */
> -  char c = 0;
> -  store_by_pieces_d data (to, builtin_memset_read_str, &c, len, align,
> -                       CLEAR_BY_PIECES);
> +  clear_by_pieces_d data (to, len, align);
>    data.run ();
>  }

Reply via email to