On Sat, Dec 9, 2023 at 8:05 AM Alexandre Oliva <ol...@adacore.com> wrote:
>
> Scratch the previous one, the "slightly different version" I had before
> it was not entirely broken due to unnecessary, suboptimal and incorrect
> use of ctz.  Here I have yet another implementation of that loop that
> should perform better and even work correctly ;-)
>
>
> This one has so far regstrapped on x86_64-linux-gnu (v1 failed in
> regression testing, sorry), and bootstrapped with -finline-stringops on
> ppc64le-linux-gnu (still ongoing on x86-64-linux-gnu and
> aarch64-linux-gnu).  Ok to install?

OK

>
> The recently-added logic for -finline-stringops=memset introduced an
> assumption that doesn't necessarily hold, namely, that
> can_store_by_pieces of a larger size implies can_store_by_pieces by
> smaller sizes.  Checks for all sizes the by-multiple-pieces machinery
> might use before committing to an expansion pattern.
>
>
> for  gcc/ChangeLog
>
>         PR target/112778
>         * builtins.cc (can_store_by_multiple_pieces): New.
>         (try_store_by_multiple_pieces): Call it.
>
> for  gcc/testsuite/ChangeLog
>
>         PR target/112778
>         * gcc.dg/inline-mem-cmp-pr112778.c: New.
> ---
>  gcc/builtins.cc                                |   57 
> ++++++++++++++++++++----
>  gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c |   10 ++++
>  2 files changed, 58 insertions(+), 9 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c
>
> diff --git a/gcc/builtins.cc b/gcc/builtins.cc
> index 12a535d313f12..f6c96498f0783 100644
> --- a/gcc/builtins.cc
> +++ b/gcc/builtins.cc
> @@ -4284,6 +4284,40 @@ expand_builtin_memset (tree exp, rtx target, 
> machine_mode mode)
>    return expand_builtin_memset_args (dest, val, len, target, mode, exp);
>  }
>
> +/* Check that store_by_pieces allows BITS + LEN (so that we don't
> +   expand something too unreasonably long), and every power of 2 in
> +   BITS.  It is assumed that LEN has already been tested by
> +   itself.  */
> +static bool
> +can_store_by_multiple_pieces (unsigned HOST_WIDE_INT bits,
> +                             by_pieces_constfn constfun,
> +                             void *constfundata, unsigned int align,
> +                             bool memsetp,
> +                             unsigned HOST_WIDE_INT len)
> +{
> +  if (bits
> +      && !can_store_by_pieces (bits + len, constfun, constfundata,
> +                              align, memsetp))
> +    return false;
> +
> +  /* BITS set are expected to be generally in the low range and
> +     contiguous.  We do NOT want to repeat the test above in case BITS
> +     has a single bit set, so we terminate the loop when BITS == BIT.
> +     In the unlikely case that BITS has the MSB set, also terminate in
> +     case BIT gets shifted out.  */
> +  for (unsigned HOST_WIDE_INT bit = 1; bit < bits && bit; bit <<= 1)
> +    {
> +      if ((bits & bit) == 0)
> +       continue;
> +
> +      if (!can_store_by_pieces (bit, constfun, constfundata,
> +                               align, memsetp))
> +       return false;
> +    }
> +
> +  return true;
> +}
> +
>  /* Try to store VAL (or, if NULL_RTX, VALC) in LEN bytes starting at TO.
>     Return TRUE if successful, FALSE otherwise.  TO is assumed to be
>     aligned at an ALIGN-bits boundary.  LEN must be a multiple of
> @@ -4341,7 +4375,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, 
> unsigned int ctz_len,
>    else
>      /* Huh, max_len < min_len?  Punt.  See pr100843.c.  */
>      return false;
> -  if (min_len >= blksize)
> +  if (min_len >= blksize
> +      /* ??? Maybe try smaller fixed-prefix blksizes before
> +        punting?  */
> +      && can_store_by_pieces (blksize, builtin_memset_read_str,
> +                             &valc, align, true))
>      {
>        min_len -= blksize;
>        min_bits = floor_log2 (min_len);
> @@ -4367,8 +4405,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned 
> int ctz_len,
>       happen because of the way max_bits and blksize are related, but
>       it doesn't hurt to test.  */
>    if (blksize > xlenest
> -      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
> -                              &valc, align, true))
> +      || !can_store_by_multiple_pieces (xlenest - blksize,
> +                                       builtin_memset_read_str,
> +                                       &valc, align, true, blksize))
>      {
>        if (!(flag_inline_stringops & ILSOP_MEMSET))
>         return false;
> @@ -4386,17 +4425,17 @@ try_store_by_multiple_pieces (rtx to, rtx len, 
> unsigned int ctz_len,
>              of overflow.  */
>           if (max_bits < orig_max_bits
>               && xlenest + blksize >= xlenest
> -             && can_store_by_pieces (xlenest + blksize,
> -                                     builtin_memset_read_str,
> -                                     &valc, align, true))
> +             && can_store_by_multiple_pieces (xlenest,
> +                                              builtin_memset_read_str,
> +                                              &valc, align, true, blksize))
>             {
>               max_loop = true;
>               break;
>             }
>           if (blksize
> -             && can_store_by_pieces (xlenest,
> -                                     builtin_memset_read_str,
> -                                     &valc, align, true))
> +             && can_store_by_multiple_pieces (xlenest,
> +                                              builtin_memset_read_str,
> +                                              &valc, align, true, 0))
>             {
>               max_len += blksize;
>               min_len += blksize;
> diff --git a/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c 
> b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c
> new file mode 100644
> index 0000000000000..fdfc5b6f28c8e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-finline-stringops" } */
> +
> +char buf[3];
> +
> +int
> +f ()
> +{
> +  __builtin_memset (buf, 'v', 3);
> +}
>
>
> --
> Alexandre Oliva, happy hacker            https://FSFLA.org/blogs/lxo/
>    Free Software Activist                   GNU Toolchain Engineer
> More tolerance and less prejudice are key for inclusion and diversity
> Excluding neuro-others for not behaving ""normal"" is *not* inclusive

Reply via email to