On Fri, Aug 29, 2025 at 2:55 AM H.J. Lu <[email protected]> wrote:
>
> Since
>
> commit 401199377c50045ede560daf3f6e8b51749c2a87
> Author: H.J. Lu <[email protected]>
> Date:   Tue Jun 17 10:17:17 2025 +0800
>
>     x86: Improve vector_loop/unrolled_loop for memset/memcpy
>
> uses move_by_pieces and store_by_pieces to expand the memcpy/memset
> epilogue with vector_loop even when targetm.use_by_pieces_infrastructure_p
> returns false, it triggers
>
>   gcc_assert (targetm.use_by_pieces_infrastructure_p
>                 (len, align,
>                  memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
>                  optimize_insn_for_speed_p ()));
>
> in store_by_pieces.  Fix it by:
>
> 1. Add by_pieces_in_use to machine_function to indicate that a by_pieces
> op is currently in use.
> 2. Set and clear by_pieces_in_use when expanding the memcpy/memset
> epilogue with move_by_pieces and store_by_pieces.
> 3. Define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P to return true if
> by_pieces_in_use is true.
>
> gcc/
>
>         PR target/121096
>         * config/i386/i386-expand.cc (expand_cpymem_epilogue): Set and
>         clear by_pieces_in_use when using by_pieces op.
>         (expand_setmem_epilogue): Likewise.
>         * config/i386/i386.cc (ix86_use_by_pieces_infrastructure_p): New.
>         (TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Likewise.
>         * config/i386/i386.h (machine_function): Add by_pieces_in_use.
>
> gcc/testsuite/
>
>         PR target/121096
>         * gcc.target/i386/memcpy-strategy-14.c: New test.
>         * gcc.target/i386/memcpy-strategy-15.c: Likewise.
>         * gcc.target/i386/memset-strategy-10.c: Likewise.
>         * gcc.target/i386/memset-strategy-11.c: Likewise.
>         * gcc.target/i386/memset-strategy-12.c: Likewise.
>         * gcc.target/i386/memset-strategy-13.c: Likewise.
>         * gcc.target/i386/memset-strategy-14.c: Likewise.
>         * gcc.target/i386/memset-strategy-15.c: Likewise.

OK, although it looks a bit hackish.

Thanks,
Uros.

>
> Signed-off-by: H.J. Lu <[email protected]>
> ---
>  gcc/config/i386/i386-expand.cc                |  4 +++
>  gcc/config/i386/i386.cc                       | 21 +++++++++++++++
>  gcc/config/i386/i386.h                        |  3 +++
>  .../gcc.target/i386/memcpy-strategy-14.c      | 10 +++++++
>  .../gcc.target/i386/memcpy-strategy-15.c      | 10 +++++++
>  .../gcc.target/i386/memset-strategy-10.c      | 24 +++++++++++++++++
>  .../gcc.target/i386/memset-strategy-11.c      |  9 +++++++
>  .../gcc.target/i386/memset-strategy-12.c      |  8 ++++++
>  .../gcc.target/i386/memset-strategy-13.c      | 26 +++++++++++++++++++
>  .../gcc.target/i386/memset-strategy-14.c      |  8 ++++++
>  .../gcc.target/i386/memset-strategy-15.c      |  9 +++++++
>  11 files changed, 132 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-14.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-15.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-12.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-13.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-14.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-15.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index ef6c12cd569..ec211d11a94 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -8241,8 +8241,10 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem,
>        unsigned HOST_WIDE_INT countval = UINTVAL (count);
>        unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
>        unsigned int destalign = MEM_ALIGN (destmem);
> +      cfun->machine->by_pieces_in_use = true;
>        move_by_pieces (destmem, srcmem, epilogue_size, destalign,
>                       RETURN_BEGIN);
> +      cfun->machine->by_pieces_in_use = false;
>        return;
>      }
>    if (max_size > 8)
> @@ -8490,9 +8492,11 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx 
> value, rtx vec_value,
>        unsigned HOST_WIDE_INT countval = UINTVAL (count);
>        unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
>        unsigned int destalign = MEM_ALIGN (destmem);
> +      cfun->machine->by_pieces_in_use = true;
>        store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
>                        vec_value ? vec_value : value, destalign, true,
>                        RETURN_BEGIN);
> +      cfun->machine->by_pieces_in_use = false;
>        return;
>      }
>    if (max_size > 32)
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 1ca6c612137..471be3e8615 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -11382,6 +11382,23 @@ ix86_address_cost (rtx x, machine_mode, 
> addr_space_t, bool)
>
>    return cost;
>  }
> +
> +/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */
> +
> +bool
> +ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
> +                                    unsigned int align,
> +                                    enum by_pieces_operation op,
> +                                    bool speed_p)
> +{
> +  /* Return true when we are currently expanding memcpy/memset epilogue
> +     with move_by_pieces or store_by_pieces.  */
> +  if (cfun->machine->by_pieces_in_use)
> +    return true;
> +
> +  return default_use_by_pieces_infrastructure_p (size, align, op,
> +                                                speed_p);
> +}
>
>  /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
>     this is used for to form addresses to local data when -fPIC is in
> @@ -27934,6 +27951,10 @@ static const scoped_attribute_specs *const 
> ix86_attribute_table[] =
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST ix86_address_cost
>
> +#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
> +#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
> +  ix86_use_by_pieces_infrastructure_p
> +
>  #undef TARGET_OVERLAP_OP_BY_PIECES_P
>  #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 2eb141bab1a..ac0ce687f36 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -2954,6 +2954,9 @@ struct GTY(()) machine_function {
>    /* True if this is a recursive function.  */
>    BOOL_BITFIELD recursive_function : 1;
>
> +  /* True if by_pieces op is currently in use.  */
> +  BOOL_BITFIELD by_pieces_in_use : 1;
> +
>    /* The largest alignment, in bytes, of stack slot actually used.  */
>    unsigned int max_used_stack_alignment;
>
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-14.c 
> b/gcc/testsuite/gcc.target/i386/memcpy-strategy-14.c
> new file mode 100644
> index 00000000000..44cd6523029
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-14.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops 
> -mstringop-strategy=vector_loop" } */
> +/* { dg-final { scan-assembler-times "movaps" 8 } } */
> +
> +char a[2048];
> +char b[2048];
> +void t (void)
> +{
> +  __builtin_memcpy (a, b, 2048);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-15.c 
> b/gcc/testsuite/gcc.target/i386/memcpy-strategy-15.c
> new file mode 100644
> index 00000000000..ea8e4be4ac4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-15.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops 
> -mstringop-strategy=vector_loop" } */
> +/* { dg-final { scan-assembler-times "movups" 8 } } */
> +
> +char *a;
> +char *b;
> +void t (void)
> +{
> +  __builtin_memcpy (a, b, 2048);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c 
> b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
> new file mode 100644
> index 00000000000..d6f2f4ed7ff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -march=x86-64 -mstringop-strategy=vector_loop" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +
> +/*
> +**foo:
> +**.LFB[0-9]+:
> +**     .cfi_startproc
> +**     xorps   %xmm0, %xmm0
> +**     xorl    %eax, %eax
> +**     movq    %rax, 48\(%(e|r)di\)
> +**     movups  %xmm0, \(%(e|r)di\)
> +**     movups  %xmm0, 16\(%(e|r)di\)
> +**     movups  %xmm0, 32\(%(e|r)di\)
> +**     ret
> +**...
> +*/
> +
> +void
> +foo (char *a)
> +{
> +  __builtin_memset (a, 0, 56);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c 
> b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
> new file mode 100644
> index 00000000000..851c6faaa09
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops 
> -mstringop-strategy=vector_loop" } */
> +/* { dg-final { scan-assembler-times "movaps" 4 } } */
> +
> +char a[2048];
> +void t (void)
> +{
> +  __builtin_memset (a, 0, 2048);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-12.c 
> b/gcc/testsuite/gcc.target/i386/memset-strategy-12.c
> new file mode 100644
> index 00000000000..06cac03426a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-12.c
> @@ -0,0 +1,8 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -mno-sse -mstringop-strategy=vector_loop" } */
> +
> +void
> +foo (char *a)
> +{
> +  __builtin_memset (a, 0, 56);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-13.c 
> b/gcc/testsuite/gcc.target/i386/memset-strategy-13.c
> new file mode 100644
> index 00000000000..cc2129f60eb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-13.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -mno-sse -mstringop-strategy=unrolled_loop" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +
> +/*
> +**foo:
> +**.LFB[0-9]+:
> +**     .cfi_startproc
> +**     xorl    %eax, %eax
> +**     movq    %rax, \(%(e|r)di\)
> +**     movq    %rax, 8\(%(e|r)di\)
> +**     movq    %rax, 16\(%(e|r)di\)
> +**     movq    %rax, 24\(%(e|r)di\)
> +**     movq    %rax, 32\(%(e|r)di\)
> +**     movq    %rax, 40\(%(e|r)di\)
> +**     movq    %rax, 48\(%(e|r)di\)
> +**     ret
> +**...
> +*/
> +
> +void
> +foo (char *a)
> +{
> +  __builtin_memset (a, 0, 56);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-14.c 
> b/gcc/testsuite/gcc.target/i386/memset-strategy-14.c
> new file mode 100644
> index 00000000000..144235ee082
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-14.c
> @@ -0,0 +1,8 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -march=x86-64 -mstringop-strategy=vector_loop" } */
> +
> +void
> +foo (char *a, int c)
> +{
> +  __builtin_memset (a, c, 56);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-15.c 
> b/gcc/testsuite/gcc.target/i386/memset-strategy-15.c
> new file mode 100644
> index 00000000000..66f9fa60049
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-15.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -mno-avx -msse2 -mtune=generic 
> -mstringop-strategy=vector_loop" } */
> +/* { dg-final { scan-assembler-times "movups" 4} } */
> +
> +char *a;
> +void t (void)
> +{
> +  __builtin_memset (a, 0, 2048);
> +}
> --
> 2.51.0
>

Reply via email to