On Sat, Dec 9, 2023 at 8:05 AM Alexandre Oliva <ol...@adacore.com> wrote: > > Scratch the previous one, the "slightly different version" I had before > it was not entirely broken due to unnecessary, suboptimal and incorrect > use of ctz. Here I have yet another implementation of that loop that > should perform better and even work correctly ;-) > > > This one has so far regstrapped on x86_64-linux-gnu (v1 failed in > regression testing, sorry), and bootstrapped with -finline-stringops on > ppc64le-linux-gnu (still ongoing on x86-64-linux-gnu and > aarch64-linux-gnu). Ok to install?
OK > > The recently-added logic for -finline-stringops=memset introduced an > assumption that doesn't necessarily hold, namely, that > can_store_by_pieces of a larger size implies can_store_by_pieces by > smaller sizes. Checks for all sizes the by-multiple-pieces machinery > might use before committing to an expansion pattern. > > > for gcc/ChangeLog > > PR target/112778 > * builtins.cc (can_store_by_multiple_pieces): New. > (try_store_by_multiple_pieces): Call it. > > for gcc/testsuite/ChangeLog > > PR target/112778 > * gcc.dg/inline-mem-cmp-pr112778.c: New. > --- > gcc/builtins.cc | 57 > ++++++++++++++++++++---- > gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c | 10 ++++ > 2 files changed, 58 insertions(+), 9 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c > > diff --git a/gcc/builtins.cc b/gcc/builtins.cc > index 12a535d313f12..f6c96498f0783 100644 > --- a/gcc/builtins.cc > +++ b/gcc/builtins.cc > @@ -4284,6 +4284,40 @@ expand_builtin_memset (tree exp, rtx target, > machine_mode mode) > return expand_builtin_memset_args (dest, val, len, target, mode, exp); > } > > +/* Check that store_by_pieces allows BITS + LEN (so that we don't > + expand something too unreasonably long), and every power of 2 in > + BITS. It is assumed that LEN has already been tested by > + itself. */ > +static bool > +can_store_by_multiple_pieces (unsigned HOST_WIDE_INT bits, > + by_pieces_constfn constfun, > + void *constfundata, unsigned int align, > + bool memsetp, > + unsigned HOST_WIDE_INT len) > +{ > + if (bits > + && !can_store_by_pieces (bits + len, constfun, constfundata, > + align, memsetp)) > + return false; > + > + /* BITS set are expected to be generally in the low range and > + contiguous. We do NOT want to repeat the test above in case BITS > + has a single bit set, so we terminate the loop when BITS == BIT. > + In the unlikely case that BITS has the MSB set, also terminate in > + case BIT gets shifted out. */ > + for (unsigned HOST_WIDE_INT bit = 1; bit < bits && bit; bit <<= 1) > + { > + if ((bits & bit) == 0) > + continue; > + > + if (!can_store_by_pieces (bit, constfun, constfundata, > + align, memsetp)) > + return false; > + } > + > + return true; > +} > + > /* Try to store VAL (or, if NULL_RTX, VALC) in LEN bytes starting at TO. > Return TRUE if successful, FALSE otherwise. TO is assumed to be > aligned at an ALIGN-bits boundary. LEN must be a multiple of > @@ -4341,7 +4375,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, > unsigned int ctz_len, > else > /* Huh, max_len < min_len? Punt. See pr100843.c. */ > return false; > - if (min_len >= blksize) > + if (min_len >= blksize > + /* ??? Maybe try smaller fixed-prefix blksizes before > + punting? */ > + && can_store_by_pieces (blksize, builtin_memset_read_str, > + &valc, align, true)) > { > min_len -= blksize; > min_bits = floor_log2 (min_len); > @@ -4367,8 +4405,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned > int ctz_len, > happen because of the way max_bits and blksize are related, but > it doesn't hurt to test. */ > if (blksize > xlenest > - || !can_store_by_pieces (xlenest, builtin_memset_read_str, > - &valc, align, true)) > + || !can_store_by_multiple_pieces (xlenest - blksize, > + builtin_memset_read_str, > + &valc, align, true, blksize)) > { > if (!(flag_inline_stringops & ILSOP_MEMSET)) > return false; > @@ -4386,17 +4425,17 @@ try_store_by_multiple_pieces (rtx to, rtx len, > unsigned int ctz_len, > of overflow. */ > if (max_bits < orig_max_bits > && xlenest + blksize >= xlenest > - && can_store_by_pieces (xlenest + blksize, > - builtin_memset_read_str, > - &valc, align, true)) > + && can_store_by_multiple_pieces (xlenest, > + builtin_memset_read_str, > + &valc, align, true, blksize)) > { > max_loop = true; > break; > } > if (blksize > - && can_store_by_pieces (xlenest, > - builtin_memset_read_str, > - &valc, align, true)) > + && can_store_by_multiple_pieces (xlenest, > + builtin_memset_read_str, > + &valc, align, true, 0)) > { > max_len += blksize; > min_len += blksize; > diff --git a/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c > b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c > new file mode 100644 > index 0000000000000..fdfc5b6f28c8e > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/inline-mem-cmp-pr112778.c > @@ -0,0 +1,10 @@ > +/* { dg-do compile } */ > +/* { dg-options "-finline-stringops" } */ > + > +char buf[3]; > + > +int > +f () > +{ > + __builtin_memset (buf, 'v', 3); > +} > > > -- > Alexandre Oliva, happy hacker https://FSFLA.org/blogs/lxo/ > Free Software Activist GNU Toolchain Engineer > More tolerance and less prejudice are key for inclusion and diversity > Excluding neuro-others for not behaving ""normal"" is *not* inclusive