1. Don't generate the loop if the loop count is 1.
2. For memset with a vector value and a small size, use a narrower vector
if the size supports one; otherwise use the scalar value.
3. Duplicate the promoted scalar value for vector.
4. Always expand vector-version of memset for vector_loop.
5. Use the misaligned prologue if alignment isn't needed.  When the
misaligned prologue is used, check if the destination is actually aligned
and update the destination alignment if it is.

The included tests show that codegen of vector_loop/unrolled_loop for
memset/memcpy is significantly improved.  For

---
void
foo (void *p1, size_t len)
{
  __builtin_memset (p1, 0, len);
}
---

with

-O2 -minline-all-stringops
-mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -march=x86-64

we used to generate

foo:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rax
        pxor    %xmm0, %xmm0
        cmpq    $64, %rsi
        jnb     .L18
.L2:
        andl    $63, %esi
        je      .L1
        xorl    %edx, %edx
        testb   $1, %sil
        je      .L5
        movl    $1, %edx
        movb    $0, (%rax)
        cmpq    %rsi, %rdx
        jnb     .L19
.L5:
        movb    $0, (%rax,%rdx)
        movb    $0, 1(%rax,%rdx)
        addq    $2, %rdx
        cmpq    %rsi, %rdx
        jb      .L5
.L1:
        ret
        .p2align 4,,10
        .p2align 3
.L18:
        movq    %rsi, %rdx
        xorl    %eax, %eax
        andq    $-64, %rdx
.L3:
        movups  %xmm0, (%rdi,%rax)
        movups  %xmm0, 16(%rdi,%rax)
        movups  %xmm0, 32(%rdi,%rax)
        movups  %xmm0, 48(%rdi,%rax)
        addq    $64, %rax
        cmpq    %rdx, %rax
        jb      .L3
        addq    %rdi, %rax
        jmp     .L2
.L19:
        ret
        .cfi_endproc

with very poor prologue/epilogue.  With this patch, we now generate:

foo:
.LFB0:
        .cfi_startproc
        pxor    %xmm0, %xmm0
        cmpq    $64, %rsi
        jnb     .L2
        testb   $32, %sil
        jne     .L19
        testb   $16, %sil
        jne     .L20
        testb   $8, %sil
        jne     .L21
        testb   $4, %sil
        jne     .L22
        testq   %rsi, %rsi
        jne     .L23
.L1:
        ret
        .p2align 4,,10
        .p2align 3
.L2:
        movups  %xmm0, -64(%rdi,%rsi)
        movups  %xmm0, -48(%rdi,%rsi)
        movups  %xmm0, -32(%rdi,%rsi)
        movups  %xmm0, -16(%rdi,%rsi)
        subq    $1, %rsi
        cmpq    $64, %rsi
        jb      .L1
        andq    $-64, %rsi
        xorl    %eax, %eax
.L9:
        movups  %xmm0, (%rdi,%rax)
        movups  %xmm0, 16(%rdi,%rax)
        movups  %xmm0, 32(%rdi,%rax)
        movups  %xmm0, 48(%rdi,%rax)
        addq    $64, %rax
        cmpq    %rsi, %rax
        jb      .L9
        ret
        .p2align 4,,10
        .p2align 3
.L23:
        movb    $0, (%rdi)
        testb   $2, %sil
        je      .L1
        xorl    %eax, %eax
        movw    %ax, -2(%rdi,%rsi)
        ret
        .p2align 4,,10
        .p2align 3
.L19:
        movups  %xmm0, (%rdi)
        movups  %xmm0, 16(%rdi)
        movups  %xmm0, -32(%rdi,%rsi)
        movups  %xmm0, -16(%rdi,%rsi)
        ret
        .p2align 4,,10
        .p2align 3
.L20:
        movups  %xmm0, (%rdi)
        movups  %xmm0, -16(%rdi,%rsi)
        ret
        .p2align 4,,10
        .p2align 3
.L21:
        movq    $0, (%rdi)
        movq    $0, -8(%rdi,%rsi)
        ret
        .p2align 4,,10
        .p2align 3
.L22:
        movl    $0, (%rdi)
        movl    $0, -4(%rdi,%rsi)
        ret
        .cfi_endproc

gcc/

        PR target/120683
        * config/i386/i386-expand.cc (expand_set_or_cpymem_via_loop):
        Don't generate the loop if the loop count is 1.
        (expand_small_cpymem_or_setmem): Choose cpymem mode from MOVE_MAX.
        For memset with vector and the size is smaller than the vector
        size, first try the narrower vector, otherwise, use the scalar
        value.
        (expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves):
        Add an argument to indicate if destination is aligned and update
        the destination MEM alignment if it is.
        (promote_duplicated_reg): Duplicate the scalar value for vector.
        (ix86_expand_set_or_cpymem): Always expand vector-version of
        memset for vector_loop.  Use misaligned prologue if alignment
        isn't needed.  When misaligned prologue is used, check if
        destination is actually aligned and update destination alignment
        if needed.

gcc/testsuite/

        PR target/120683
        * gcc.target/i386/memcpy-pr120683-1.c: New test.
        * gcc.target/i386/memcpy-pr120683-2.c: Likewise.
        * gcc.target/i386/memcpy-pr120683-3.c: Likewise.
        * gcc.target/i386/memcpy-pr120683-4.c: Likewise.
        * gcc.target/i386/memcpy-pr120683-5.c: Likewise.
        * gcc.target/i386/memcpy-pr120683-6.c: Likewise.
        * gcc.target/i386/memcpy-pr120683-7.c: Likewise.
        * gcc.target/i386/memset-pr120683-1.c: Likewise.
        * gcc.target/i386/memset-pr120683-2.c: Likewise.
        * gcc.target/i386/memset-pr120683-3.c: Likewise.
        * gcc.target/i386/memset-pr120683-4.c: Likewise.
        * gcc.target/i386/memset-pr120683-5.c: Likewise.
        * gcc.target/i386/memset-pr120683-6.c: Likewise.
        * gcc.target/i386/memset-pr120683-7.c: Likewise.
        * gcc.target/i386/memset-pr120683-8.c: Likewise.
        * gcc.target/i386/memset-pr120683-9.c: Likewise.
        * gcc.target/i386/memset-pr120683-10.c: Likewise.
        * gcc.target/i386/memset-pr120683-11.c: Likewise.
        * gcc.target/i386/memset-pr120683-12.c: Likewise.
        * gcc.target/i386/memset-pr120683-13.c: Likewise.
        * gcc.target/i386/memset-pr120683-14.c: Likewise.
        * gcc.target/i386/memset-pr120683-15.c: Likewise.
        * gcc.target/i386/memset-pr120683-16.c: Likewise.
        * gcc.target/i386/memset-pr120683-17.c: Likewise.

Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
---
 gcc/config/i386/i386-expand.cc                | 169 ++++++++++++++----
 .../gcc.target/i386/memcpy-pr120683-1.c       |  42 +++++
 .../gcc.target/i386/memcpy-pr120683-2.c       |  47 +++++
 .../gcc.target/i386/memcpy-pr120683-3.c       |  47 +++++
 .../gcc.target/i386/memcpy-pr120683-4.c       |  48 +++++
 .../gcc.target/i386/memcpy-pr120683-5.c       |  48 +++++
 .../gcc.target/i386/memcpy-pr120683-6.c       |  48 +++++
 .../gcc.target/i386/memcpy-pr120683-7.c       |  48 +++++
 .../gcc.target/i386/memset-pr120683-1.c       |  35 ++++
 .../gcc.target/i386/memset-pr120683-10.c      |  28 +++
 .../gcc.target/i386/memset-pr120683-11.c      |  29 +++
 .../gcc.target/i386/memset-pr120683-12.c      |  31 ++++
 .../gcc.target/i386/memset-pr120683-13.c      |  37 ++++
 .../gcc.target/i386/memset-pr120683-14.c      |  91 ++++++++++
 .../gcc.target/i386/memset-pr120683-15.c      | 103 +++++++++++
 .../gcc.target/i386/memset-pr120683-16.c      | 112 ++++++++++++
 .../gcc.target/i386/memset-pr120683-17.c      |  37 ++++
 .../gcc.target/i386/memset-pr120683-2.c       |  30 ++++
 .../gcc.target/i386/memset-pr120683-3.c       |  26 +++
 .../gcc.target/i386/memset-pr120683-4.c       |  93 ++++++++++
 .../gcc.target/i386/memset-pr120683-5.c       | 102 +++++++++++
 .../gcc.target/i386/memset-pr120683-6.c       | 109 +++++++++++
 .../gcc.target/i386/memset-pr120683-7.c       |  94 ++++++++++
 .../gcc.target/i386/memset-pr120683-8.c       | 103 +++++++++++
 .../gcc.target/i386/memset-pr120683-9.c       | 110 ++++++++++++
 25 files changed, 1635 insertions(+), 32 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-9.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4946f87a131..9a07e026d62 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -7899,7 +7899,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
                               rtx count, machine_mode mode, int unroll,
                               int expected_size, bool issetmem)
 {
-  rtx_code_label *out_label, *top_label;
+  rtx_code_label *out_label = nullptr;
+  rtx_code_label *top_label = nullptr;
   rtx iter, tmp;
   machine_mode iter_mode = counter_mode (count);
   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
@@ -7907,9 +7908,19 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
   rtx size;
   int i;
+  int loop_count;
 
-  top_label = gen_label_rtx ();
-  out_label = gen_label_rtx ();
+  if (expected_size != -1 && CONST_INT_P (count))
+    loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
+  else
+    loop_count = -1;
+
+  /* Don't generate the loop if the loop count is 1.  */
+  if (loop_count != 1)
+    {
+      top_label = gen_label_rtx ();
+      out_label = gen_label_rtx ();
+    }
   iter = gen_reg_rtx (iter_mode);
 
   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
@@ -7923,7 +7934,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
     }
   emit_move_insn (iter, const0_rtx);
 
-  emit_label (top_label);
+  if (loop_count != 1)
+    emit_label (top_label);
 
   tmp = convert_modes (Pmode, iter_mode, iter, true);
 
@@ -7991,21 +8003,25 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
-                          true, top_label);
-  if (expected_size != -1)
+  if (loop_count != 1)
     {
-      expected_size /= GET_MODE_SIZE (mode) * unroll;
-      if (expected_size == 0)
-       predict_jump (0);
-      else if (expected_size > REG_BR_PROB_BASE)
-       predict_jump (REG_BR_PROB_BASE - 1);
+      emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+                              true, top_label);
+      if (expected_size != -1)
+       {
+         expected_size /= GET_MODE_SIZE (mode) * unroll;
+         if (expected_size == 0)
+           predict_jump (0);
+         else if (expected_size > REG_BR_PROB_BASE)
+           predict_jump (REG_BR_PROB_BASE - 1);
+         else
+           predict_jump (REG_BR_PROB_BASE
+                         - (REG_BR_PROB_BASE + expected_size / 2)
+                           / expected_size);
+       }
       else
-        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
-                     / expected_size);
+       predict_jump (REG_BR_PROB_BASE * 80 / 100);
     }
-  else
-    predict_jump (REG_BR_PROB_BASE * 80 / 100);
   iter = ix86_zero_extend_to_Pmode (iter);
   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
                             true, OPTAB_LIB_WIDEN);
@@ -8018,7 +8034,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
       if (tmp != srcptr)
        emit_move_insn (srcptr, tmp);
     }
-  emit_label (out_label);
+  if (loop_count != 1)
+    emit_label (out_label);
 }
 
 /* Divide COUNTREG by SCALE.  */
@@ -8552,6 +8569,7 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
   rtx modesize;
+  rtx scalar_value = value;
   int n;
 
   /* If we do not have vector value to copy, we must reduce size.  */
@@ -8571,11 +8589,57 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
     {
       /* Choose appropriate vector mode.  */
       if (size >= 32)
-       mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+       switch (MOVE_MAX)
+         {
+         case 64:
+           if (size >= 64)
+             {
+               mode = V64QImode;
+               break;
+             }
+           /* FALLTHRU */
+         case 32:
+           mode = V32QImode;
+           break;
+         case 16:
+           mode = V16QImode;
+           break;
+         case 8:
+           mode = DImode;
+           break;
+         default:
+           gcc_unreachable ();
+         }
       else if (size >= 16)
        mode = TARGET_SSE ? V16QImode : DImode;
       srcmem = change_address (srcmem, mode, srcptr);
     }
+  if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
+    {
+      /* For memset with vector and the size is smaller than the vector
+        size, first try the narrower vector, otherwise, use the
+        original value.  */
+      machine_mode inner_mode = GET_MODE_INNER (mode);
+      unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
+      if (nunits > 1)
+       {
+         mode = mode_for_vector (GET_MODE_INNER (mode),
+                                 nunits).require ();
+         value = gen_rtx_SUBREG (mode, value, 0);
+       }
+      else
+       {
+         scalar_int_mode smode
+           = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
+         gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
+                     >= GET_MODE_SIZE (smode));
+         mode = smode;
+         if (GET_MODE (scalar_value) == mode)
+           value = scalar_value;
+         else
+           value = gen_rtx_SUBREG (mode, scalar_value, 0);
+       }
+    }
   destmem = change_address (destmem, mode, destptr);
   modesize = GEN_INT (GET_MODE_SIZE (mode));
   gcc_assert (GET_MODE_SIZE (mode) <= size);
@@ -8631,6 +8695,8 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
    DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
    we will dispatch to a library call for large blocks.
 
+   If ALIGNED_DESTMEM is true, destination is aligned.
+
    In pseudocode we do:
 
    if (COUNT < SIZE)
@@ -8680,7 +8746,8 @@ expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx src
                                                            int align,
                                                            unsigned HOST_WIDE_INT *min_size,
                                                            bool dynamic_check,
-                                                           bool issetmem)
+                                                           bool issetmem,
+                                                           bool aligned_destmem)
 {
   rtx_code_label *loop_label = NULL, *label;
   int n;
@@ -8784,6 +8851,19 @@ expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx src
   destmem = offset_address (destmem,
                            GEN_INT (-size - prolog_size),
                            1);
+  if (aligned_destmem)
+    {
+      /* Check if destination is still aligned after adjustment.  */
+      aligned_destmem = false;
+      if (CONST_INT_P (*count))
+       {
+         int mode_align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
+         int offset = INTVAL (*count) - size - prolog_size;
+         aligned_destmem = (offset % mode_align) == 0;
+       }
+      if (aligned_destmem)
+       set_mem_align (destmem, GET_MODE_ALIGNMENT (mode));
+    }
   if (issetmem)
     emit_move_insn (destmem, mode_value);
   else
@@ -8797,6 +8877,8 @@ expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx src
   for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
     {
       destmem = offset_address (destmem, modesize, 1);
+      if (aligned_destmem)
+       set_mem_align (destmem, GET_MODE_ALIGNMENT (mode));
       if (issetmem)
        emit_move_insn (destmem, mode_value);
       else
@@ -9179,13 +9261,25 @@ decide_alignment (int align,
 static rtx
 promote_duplicated_reg (machine_mode mode, rtx val)
 {
+  if (val == const0_rtx)
+    return copy_to_mode_reg (mode, CONST0_RTX (mode));
+
   machine_mode valmode = GET_MODE (val);
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      /* Duplicate the scalar value for integer vector.  */
+      gcc_assert (GET_MODE_INNER (mode) == valmode);
+      rtx dup = gen_reg_rtx (mode);
+      bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
+                                                  val);
+      gcc_assert (ok);
+      return dup;
+    }
+
   rtx tmp;
   int nops = mode == DImode ? 3 : 2;
 
-  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
-  if (val == const0_rtx)
-    return copy_to_mode_reg (mode, CONST0_RTX (mode));
+  gcc_assert (mode == SImode || mode == DImode);
   if (CONST_INT_P (val))
     {
       HOST_WIDE_INT v = INTVAL (val) & 255;
@@ -9414,11 +9508,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
     return false;
   gcc_assert (alg != no_stringop);
 
-  /* For now vector-version of memset is generated only for memory zeroing, as
-     creating of promoted vector value is very cheap in this case.  */
-  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
-    alg = unrolled_loop;
-
   if (!count)
     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
@@ -9523,6 +9612,13 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
        && ((desired_align > align && !align_bytes)
           || (!count && epilogue_size_needed > 1)));
 
+  /* Destination is aligned after the misaligned prologue.  */
+  bool aligned_dstmem = misaligned_prologue_used;
+
+  /* Also use misaligned prologue if alignment isn't needed.  The
+     aligned store will be used if destination is actually aligned.  */
+  misaligned_prologue_used |= noalign;
+
   /* Do the cheap promotion to allow better CSE across the
      main loop and epilogue (ie one load of the big constant in the
      front of all code.
@@ -9532,11 +9628,12 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
     {
       if (alg == vector_loop)
        {
-         gcc_assert (val_exp == const0_rtx);
-         vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
          promoted_val = promote_duplicated_reg_to_size (val_exp,
                                                         GET_MODE_SIZE (word_mode),
                                                         desired_align, align);
+         /* Duplicate the promoted scalar value.  */
+         vec_promoted_val = promote_duplicated_reg (move_mode,
+                                                    promoted_val);
        }
       else
        {
@@ -9549,7 +9646,13 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
      also avoids redundant job when sizes are known precisely.  */
   if (misaligned_prologue_used)
     {
-      /* Misaligned move prologue handled small blocks by itself.  */
+      /* Misaligned move prologue handled small blocks by itself.
+        When alignment isn't needed, check if destination is
+        actually aligned and update destination alignment if
+        aligned.  */
+      if (noalign)
+       aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
+                         <= MEM_ALIGN (dst));
       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
           (dst, src, &destreg, &srcreg,
            move_mode, promoted_val, vec_promoted_val,
@@ -9557,11 +9660,13 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
            &jump_around_label,
             desired_align < align
            ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
-           desired_align, align, &min_size, dynamic_check, issetmem);
+           desired_align, align, &min_size, dynamic_check, issetmem,
+           aligned_dstmem);
       if (!issetmem)
         src = change_address (src, BLKmode, srcreg);
       dst = change_address (dst, BLKmode, destreg);
-      set_mem_align (dst, desired_align * BITS_PER_UNIT);
+      if (aligned_dstmem)
+       set_mem_align (dst, desired_align * BITS_PER_UNIT);
       epilogue_size_needed = 0;
       if (need_zero_guard
          && min_size < (unsigned HOST_WIDE_INT) size_needed)
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c
new file mode 100644
index 00000000000..753238e35fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     movq    221\(%rsi\), %rax
+**     xorl    %edx, %edx
+**     movq    %rax, 221\(%rdi\)
+**     movq    229\(%rsi\), %rax
+**     movq    %rax, 229\(%rdi\)
+**     movq    237\(%rsi\), %rax
+**     movq    %rax, 237\(%rdi\)
+**     movq    245\(%rsi\), %rax
+**     movq    %rax, 245\(%rdi\)
+**.L[0-9]+:
+**     movl    %edx, %eax
+**     addl    \$32, %edx
+**     movq    \(%rsi,%rax\), %r10
+**     movq    8\(%rsi,%rax\), %r9
+**     movq    16\(%rsi,%rax\), %r8
+**     movq    24\(%rsi,%rax\), %rcx
+**     movq    %r10, \(%rdi,%rax\)
+**     movq    %r9, 8\(%rdi,%rax\)
+**     movq    %r8, 16\(%rdi,%rax\)
+**     movq    %rcx, 24\(%rdi,%rax\)
+**     cmpl    \$224, %edx
+**     jb      .L[0-9]+
+**     ret
+**...
+*/
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 253);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c
new file mode 100644
index 00000000000..b7ea7c2d489
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     movdqu  src\+208\(%rip\), %xmm0
+**     xorl    %edx, %edx
+**     movaps  %xmm0, dest\+208\(%rip\)
+**     movdqu  src\+224\(%rip\), %xmm0
+**     movaps  %xmm0, dest\+224\(%rip\)
+**     movdqu  src\+240\(%rip\), %xmm0
+**     movaps  %xmm0, dest\+240\(%rip\)
+**     movdqu  src\+256\(%rip\), %xmm0
+**     movaps  %xmm0, dest\+256\(%rip\)
+**.L[0-9]+:
+**     movl    %edx, %eax
+**     addl    \$64, %edx
+**     movdqu  src\(%rax\), %xmm3
+**     movdqu  src\+16\(%rax\), %xmm2
+**     movdqu  src\+32\(%rax\), %xmm1
+**     movdqu  src\+48\(%rax\), %xmm0
+**     movaps  %xmm3, dest\(%rax\)
+**     movaps  %xmm2, dest\+16\(%rax\)
+**     movaps  %xmm1, dest\+32\(%rax\)
+**     movaps  %xmm0, dest\+48\(%rax\)
+**     cmpl    \$256, %edx
+**     jb      .L[0-9]+
+**     ret
+**...
+*/
+
+#define SIZE (16 + 1) * 16
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c
new file mode 100644
index 00000000000..75295702952
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     movdqu  src\+223\(%rip\), %xmm0
+**     xorl    %edx, %edx
+**     movups  %xmm0, dest\+223\(%rip\)
+**     movdqu  src\+239\(%rip\), %xmm0
+**     movups  %xmm0, dest\+239\(%rip\)
+**     movdqu  src\+255\(%rip\), %xmm0
+**     movups  %xmm0, dest\+255\(%rip\)
+**     movdqu  src\+271\(%rip\), %xmm0
+**     movups  %xmm0, dest\+271\(%rip\)
+**.L[0-9]+:
+**     movl    %edx, %eax
+**     addl    \$64, %edx
+**     movdqu  src\(%rax\), %xmm3
+**     movdqu  src\+16\(%rax\), %xmm2
+**     movdqu  src\+32\(%rax\), %xmm1
+**     movdqu  src\+48\(%rax\), %xmm0
+**     movaps  %xmm3, dest\(%rax\)
+**     movaps  %xmm2, dest\+16\(%rax\)
+**     movaps  %xmm1, dest\+32\(%rax\)
+**     movaps  %xmm0, dest\+48\(%rax\)
+**     cmpl    \$256, %edx
+**     jb      .L[0-9]+
+**     ret
+**...
+*/
+
+#define SIZE 16 * 16 + 31
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c
new file mode 100644
index 00000000000..e83ec64a8ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     vmovdqu src\+416\(%rip\), %ymm0
+**     xorl    %edx, %edx
+**     vmovdqa %ymm0, dest\+416\(%rip\)
+**     vmovdqu src\+448\(%rip\), %ymm0
+**     vmovdqa %ymm0, dest\+448\(%rip\)
+**     vmovdqu src\+480\(%rip\), %ymm0
+**     vmovdqa %ymm0, dest\+480\(%rip\)
+**     vmovdqu src\+512\(%rip\), %ymm0
+**     vmovdqa %ymm0, dest\+512\(%rip\)
+**.L[0-9]+:
+**     movl    %edx, %eax
+**     subl    \$-128, %edx
+**     vmovdqu src\(%rax\), %ymm3
+**     vmovdqu src\+32\(%rax\), %ymm2
+**     vmovdqu src\+64\(%rax\), %ymm1
+**     vmovdqu src\+96\(%rax\), %ymm0
+**     vmovdqa %ymm3, dest\(%rax\)
+**     vmovdqa %ymm2, dest\+32\(%rax\)
+**     vmovdqa %ymm1, dest\+64\(%rax\)
+**     vmovdqa %ymm0, dest\+96\(%rax\)
+**     cmpl    \$512, %edx
+**     jb      .L[0-9]+
+**     vzeroupper
+**     ret
+**...
+*/
+
+#define SIZE (16 + 1) * 32
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c
new file mode 100644
index 00000000000..4ce7e2bb221
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     vmovdqu src\+447\(%rip\), %ymm0
+**     xorl    %edx, %edx
+**     vmovdqu %ymm0, dest\+447\(%rip\)
+**     vmovdqu src\+479\(%rip\), %ymm0
+**     vmovdqu %ymm0, dest\+479\(%rip\)
+**     vmovdqu src\+511\(%rip\), %ymm0
+**     vmovdqu %ymm0, dest\+511\(%rip\)
+**     vmovdqu src\+543\(%rip\), %ymm0
+**     vmovdqu %ymm0, dest\+543\(%rip\)
+**.L[0-9]+:
+**     movl    %edx, %eax
+**     subl    \$-128, %edx
+**     vmovdqu src\(%rax\), %ymm3
+**     vmovdqu src\+32\(%rax\), %ymm2
+**     vmovdqu src\+64\(%rax\), %ymm1
+**     vmovdqu src\+96\(%rax\), %ymm0
+**     vmovdqa %ymm3, dest\(%rax\)
+**     vmovdqa %ymm2, dest\+32\(%rax\)
+**     vmovdqa %ymm1, dest\+64\(%rax\)
+**     vmovdqa %ymm0, dest\+96\(%rax\)
+**     cmpl    \$512, %edx
+**     jb      .L[0-9]+
+**     vzeroupper
+**     ret
+**...
+*/
+
+#define SIZE 16 * 32 + 32 + 31
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c
new file mode 100644
index 00000000000..69048633c23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     vmovdqu64       src\+832\(%rip\), %zmm0
+**     xorl    %edx, %edx
+**     vmovdqa64       %zmm0, dest\+832\(%rip\)
+**     vmovdqu64       src\+896\(%rip\), %zmm0
+**     vmovdqa64       %zmm0, dest\+896\(%rip\)
+**     vmovdqu64       src\+960\(%rip\), %zmm0
+**     vmovdqa64       %zmm0, dest\+960\(%rip\)
+**     vmovdqu64       src\+1024\(%rip\), %zmm0
+**     vmovdqa64       %zmm0, dest\+1024\(%rip\)
+**.L[0-9]+:
+**     movl    %edx, %eax
+**     addl    \$256, %edx
+**     vmovdqu64       src\(%rax\), %zmm3
+**     vmovdqu64       src\+64\(%rax\), %zmm2
+**     vmovdqu64       src\+128\(%rax\), %zmm1
+**     vmovdqu64       src\+192\(%rax\), %zmm0
+**     vmovdqa64       %zmm3, dest\(%rax\)
+**     vmovdqa64       %zmm2, dest\+64\(%rax\)
+**     vmovdqa64       %zmm1, dest\+128\(%rax\)
+**     vmovdqa64       %zmm0, dest\+192\(%rax\)
+**     cmpl    \$1024, %edx
+**     jb      .L[0-9]+
+**     vzeroupper
+**     ret
+**...
+*/
+
+#define SIZE (16 + 1) * 64
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c
new file mode 100644
index 00000000000..f517ca48a14
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     vmovdqu64       src\+831\(%rip\), %zmm0
+**     xorl    %edx, %edx
+**     vmovdqu64       %zmm0, dest\+831\(%rip\)
+**     vmovdqu64       src\+895\(%rip\), %zmm0
+**     vmovdqu64       %zmm0, dest\+895\(%rip\)
+**     vmovdqu64       src\+959\(%rip\), %zmm0
+**     vmovdqu64       %zmm0, dest\+959\(%rip\)
+**     vmovdqu64       src\+1023\(%rip\), %zmm0
+**     vmovdqu64       %zmm0, dest\+1023\(%rip\)
+**.L[0-9]+:
+**     movl    %edx, %eax
+**     addl    \$256, %edx
+**     vmovdqu64       src\(%rax\), %zmm3
+**     vmovdqu64       src\+64\(%rax\), %zmm2
+**     vmovdqu64       src\+128\(%rax\), %zmm1
+**     vmovdqu64       src\+192\(%rax\), %zmm0
+**     vmovdqa64       %zmm3, dest\(%rax\)
+**     vmovdqa64       %zmm2, dest\+64\(%rax\)
+**     vmovdqa64       %zmm1, dest\+128\(%rax\)
+**     vmovdqa64       %zmm0, dest\+192\(%rax\)
+**     cmpl    \$1024, %edx
+**     jb      .L[0-9]+
+**     vzeroupper
+**     ret
+**...
+*/
+
+#define SIZE 16 * 64 + 63
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c
new file mode 100644
index 00000000000..90e544df7ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     pxor    %xmm0, %xmm0
+**     xorl    %eax, %eax
+**     movups  %xmm0, 190\(%rdi\)
+**     movups  %xmm0, 206\(%rdi\)
+**     movups  %xmm0, 222\(%rdi\)
+**     movups  %xmm0, 238\(%rdi\)
+**.L[0-9]+:
+**     movl    %eax, %edx
+**     addl    \$64, %eax
+**     movups  %xmm0, \(%rdi,%rdx\)
+**     movups  %xmm0, 16\(%rdi,%rdx\)
+**     movups  %xmm0, 32\(%rdi,%rdx\)
+**     movups  %xmm0, 48\(%rdi,%rdx\)
+**     cmpl    \$192, %eax
+**     jb      .L[0-9]+
+**     ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c
new file mode 100644
index 00000000000..21e86a3ee0e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     movq    \$0, 29\(%rdi\)
+**     movq    \$0, 37\(%rdi\)
+**     movq    \$0, 45\(%rdi\)
+**     movq    \$0, 53\(%rdi\)
+**     movq    \$0, \(%rdi\)
+**     movq    \$0, 8\(%rdi\)
+**     movq    \$0, 16\(%rdi\)
+**     movq    \$0, 24\(%rdi\)
+**     ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c
new file mode 100644
index 00000000000..30b0cad04e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     movabsq \$289360691352306692, %rax
+**     movq    %rax, 48\(%rdi\)
+**     movq    %rax, \(%rdi\)
+**     movq    %rax, 8\(%rdi\)
+**     movq    %rax, 16\(%rdi\)
+**     movq    %rax, 24\(%rdi\)
+**     movq    %rax, 32\(%rdi\)
+**     movq    %rax, 40\(%rdi\)
+**     movq    %rax, 53\(%rdi\)
+**     ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 4, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c
new file mode 100644
index 00000000000..15987a6451f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     movabsq \$72340172838076673, %rax
+**     movzbl  %sil, %esi
+**     imulq   %rax, %rsi
+**     movq    %rsi, 48\(%rdi\)
+**     movq    %rsi, \(%rdi\)
+**     movq    %rsi, 8\(%rdi\)
+**     movq    %rsi, 16\(%rdi\)
+**     movq    %rsi, 24\(%rdi\)
+**     movq    %rsi, 32\(%rdi\)
+**     movq    %rsi, 40\(%rdi\)
+**     movq    %rsi, 53\(%rdi\)
+**     ret
+**...
+*/
+
+void
+foo (char *dest, int c)
+{
+  __builtin_memset (dest, c, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c
new file mode 100644
index 00000000000..35c34fc0502
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     pxor    %xmm0, %xmm0
+**     xorl    %eax, %eax
+**     movaps  %xmm0, dest\+176\(%rip\)
+**     movaps  %xmm0, dest\+192\(%rip\)
+**     movaps  %xmm0, dest\+208\(%rip\)
+**     movaps  %xmm0, dest\+224\(%rip\)
+**.L[0-9]+:
+**     movl    %eax, %edx
+**     addl    \$64, %eax
+**     movaps  %xmm0, dest\(%rdx\)
+**     movaps  %xmm0, dest\+16\(%rdx\)
+**     movaps  %xmm0, dest\+32\(%rdx\)
+**     movaps  %xmm0, dest\+48\(%rdx\)
+**     cmpl    \$192, %eax
+**     jb      .L[0-9]+
+**     ret
+**...
+*/
+
+char dest[240];
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c
new file mode 100644
index 00000000000..7ec9b3fe1bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c
@@ -0,0 +1,91 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     pxor    %xmm0, %xmm0
+**     cmpq    \$64, %rsi
+**     jnb     .L2
+**     testb   \$32, %sil
+**     jne     .L19
+**     testb   \$16, %sil
+**     jne     .L20
+**     testb   \$8, %sil
+**     jne     .L21
+**     testb   \$4, %sil
+**     jne     .L22
+**     testq   %rsi, %rsi
+**     jne     .L23
+**.L1:
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     movups  %xmm0, -64\(%rdi,%rsi\)
+**     movups  %xmm0, -48\(%rdi,%rsi\)
+**     movups  %xmm0, -32\(%rdi,%rsi\)
+**     movups  %xmm0, -16\(%rdi,%rsi\)
+**     subq    \$1, %rsi
+**     cmpq    \$64, %rsi
+**     jb      .L1
+**     andq    \$-64, %rsi
+**     xorl    %eax, %eax
+**.L9:
+**     movups  %xmm0, \(%rdi,%rax\)
+**     movups  %xmm0, 16\(%rdi,%rax\)
+**     movups  %xmm0, 32\(%rdi,%rax\)
+**     movups  %xmm0, 48\(%rdi,%rax\)
+**     addq    \$64, %rax
+**     cmpq    %rsi, %rax
+**     jb      .L9
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     movb    \$0, \(%rdi\)
+**     testb   \$2, %sil
+**     je      .L1
+**     xorl    %eax, %eax
+**     movw    %ax, -2\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L19:
+**     movups  %xmm0, \(%rdi\)
+**     movups  %xmm0, 16\(%rdi\)
+**     movups  %xmm0, -32\(%rdi,%rsi\)
+**     movups  %xmm0, -16\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L20:
+**     movups  %xmm0, \(%rdi\)
+**     movups  %xmm0, -16\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L21:
+**     movq    \$0, \(%rdi\)
+**     movq    \$0, -8\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L22:
+**     movl    \$0, \(%rdi\)
+**     movl    \$0, -4\(%rdi,%rsi\)
+**     ret
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c
new file mode 100644
index 00000000000..e7544057994
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     vpxor   %xmm0, %xmm0, %xmm0
+**     cmpq    \$128, %rsi
+**     jnb     .L2
+**     testb   \$64, %sil
+**     jne     .L22
+**     testb   \$32, %sil
+**     jne     .L23
+**     testb   \$16, %sil
+**     jne     .L24
+**     testb   \$8, %sil
+**     jne     .L25
+**     testb   \$4, %sil
+**     jne     .L26
+**     testq   %rsi, %rsi
+**     jne     .L27
+**.L20:
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     vmovdqu %ymm0, -128\(%rdi,%rsi\)
+**     vmovdqu %ymm0, -96\(%rdi,%rsi\)
+**     vmovdqu %ymm0, -64\(%rdi,%rsi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rsi\)
+**     subq    \$1, %rsi
+**     cmpq    \$128, %rsi
+**     jb      .L19
+**     andq    \$-128, %rsi
+**     xorl    %eax, %eax
+**.L10:
+**     vmovdqu %ymm0, \(%rdi,%rax\)
+**     vmovdqu %ymm0, 32\(%rdi,%rax\)
+**     vmovdqu %ymm0, 64\(%rdi,%rax\)
+**     vmovdqu %ymm0, 96\(%rdi,%rax\)
+**     subq    \$-128, %rax
+**     cmpq    %rsi, %rax
+**     jb      .L10
+**.L19:
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L27:
+**     movb    \$0, \(%rdi\)
+**     testb   \$2, %sil
+**     je      .L20
+**     xorl    %eax, %eax
+**     movw    %ax, -2\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L22:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, 32\(%rdi\)
+**     vmovdqu %ymm0, -64\(%rdi,%rsi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rsi\)
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rsi\)
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L24:
+**     vmovdqu %xmm0, \(%rdi\)
+**     vmovdqu %xmm0, -16\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L25:
+**     movq    \$0, \(%rdi\)
+**     movq    \$0, -8\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L26:
+**     movl    \$0, \(%rdi\)
+**     movl    \$0, -4\(%rdi,%rsi\)
+**     ret
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c
new file mode 100644
index 00000000000..c519bf36fb0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c
@@ -0,0 +1,112 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     vpxor   %xmm0, %xmm0, %xmm0
+**     cmpq    \$256, %rsi
+**     jnb     .L2
+**     testb   \$-128, %sil
+**     jne     .L23
+**     testb   \$64, %sil
+**     jne     .L24
+**     testb   \$32, %sil
+**     jne     .L25
+**     testb   \$16, %sil
+**     jne     .L26
+**     testb   \$8, %sil
+**     jne     .L27
+**     testb   \$4, %sil
+**     jne     .L28
+**     testq   %rsi, %rsi
+**     jne     .L29
+**.L21:
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     vmovdqu64       %zmm0, -256\(%rdi,%rsi\)
+**     vmovdqu64       %zmm0, -192\(%rdi,%rsi\)
+**     vmovdqu64       %zmm0, -128\(%rdi,%rsi\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rsi\)
+**     subq    \$1, %rsi
+**     cmpq    \$256, %rsi
+**     jb      .L20
+**     xorb    %sil, %sil
+**     xorl    %eax, %eax
+**.L11:
+**     vmovdqu64       %zmm0, \(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 64\(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 128\(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 192\(%rdi,%rax\)
+**     addq    \$256, %rax
+**     cmpq    %rsi, %rax
+**     jb      .L11
+**.L20:
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L29:
+**     movb    \$0, \(%rdi\)
+**     testb   \$2, %sil
+**     je      .L21
+**     xorl    %eax, %eax
+**     movw    %ax, -2\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     vmovdqu64       %zmm0, \(%rdi\)
+**     vmovdqu64       %zmm0, 64\(%rdi\)
+**     vmovdqu64       %zmm0, -128\(%rdi,%rsi\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rsi\)
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L24:
+**     vmovdqu64       %zmm0, \(%rdi\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rsi\)
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L25:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rsi\)
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L26:
+**     vmovdqu %xmm0, \(%rdi\)
+**     vmovdqu %xmm0, -16\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L27:
+**     movq    \$0, \(%rdi\)
+**     movq    \$0, -8\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L28:
+**     movl    \$0, \(%rdi\)
+**     movl    \$0, -4\(%rdi,%rsi\)
+**     ret
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c
new file mode 100644
index 00000000000..4fb87774ee8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     pxor    %xmm0, %xmm0
+**     xorl    %eax, %eax
+**     movups  %xmm0, dest\+120\(%rip\)
+**     movups  %xmm0, dest\+136\(%rip\)
+**     movups  %xmm0, dest\+152\(%rip\)
+**     movups  %xmm0, dest\+168\(%rip\)
+**.L[0-9]+:
+**     movl    %eax, %edx
+**     addl    \$64, %eax
+**     movaps  %xmm0, dest\(%rdx\)
+**     movaps  %xmm0, dest\+16\(%rdx\)
+**     movaps  %xmm0, dest\+32\(%rdx\)
+**     movaps  %xmm0, dest\+48\(%rdx\)
+**     cmpl    \$128, %eax
+**     jb      .L[0-9]+
+**     ret
+**...
+*/
+
+char dest[184];
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c
new file mode 100644
index 00000000000..775fb4ce96d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     vpxor   %xmm0, %xmm0, %xmm0
+**     vmovdqu %ymm0, 126\(%rdi\)
+**     vmovdqu %ymm0, 158\(%rdi\)
+**     vmovdqu %ymm0, 190\(%rdi\)
+**     vmovdqu %ymm0, 222\(%rdi\)
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, 32\(%rdi\)
+**     vmovdqu %ymm0, 64\(%rdi\)
+**     vmovdqu %ymm0, 96\(%rdi\)
+**     vzeroupper
+**     ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c
new file mode 100644
index 00000000000..621baf7b9fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**     .cfi_startproc
+**     vpxor   %xmm0, %xmm0, %xmm0
+**     vmovdqu8        %zmm0, 128\(%rdi\)
+**     vmovdqu8        %zmm0, \(%rdi\)
+**     vmovdqu8        %zmm0, 64\(%rdi\)
+**     vmovdqu8        %zmm0, 190\(%rdi\)
+**     vzeroupper
+**     ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c
new file mode 100644
index 00000000000..712404be416
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c
@@ -0,0 +1,93 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     movabsq \$289360691352306692, %rax
+**     movq    %rax, %xmm0
+**     punpcklqdq      %xmm0, %xmm0
+**     cmpq    \$64, %rsi
+**     jnb     .L2
+**     testb   \$32, %sil
+**     jne     .L19
+**     testb   \$16, %sil
+**     jne     .L20
+**     testb   \$8, %sil
+**     jne     .L21
+**     testb   \$4, %sil
+**     jne     .L22
+**     testq   %rsi, %rsi
+**     jne     .L23
+**.L1:
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     movups  %xmm0, -64\(%rdi,%rsi\)
+**     movups  %xmm0, -48\(%rdi,%rsi\)
+**     movups  %xmm0, -32\(%rdi,%rsi\)
+**     movups  %xmm0, -16\(%rdi,%rsi\)
+**     subq    \$1, %rsi
+**     cmpq    \$64, %rsi
+**     jb      .L1
+**     andq    \$-64, %rsi
+**     xorl    %eax, %eax
+**.L9:
+**     movups  %xmm0, \(%rdi,%rax\)
+**     movups  %xmm0, 16\(%rdi,%rax\)
+**     movups  %xmm0, 32\(%rdi,%rax\)
+**     movups  %xmm0, 48\(%rdi,%rax\)
+**     addq    \$64, %rax
+**     cmpq    %rsi, %rax
+**     jb      .L9
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     movb    \$4, \(%rdi\)
+**     testb   \$2, %sil
+**     je      .L1
+**     movl    \$1028, %eax
+**     movw    %ax, -2\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L19:
+**     movups  %xmm0, \(%rdi\)
+**     movups  %xmm0, 16\(%rdi\)
+**     movups  %xmm0, -32\(%rdi,%rsi\)
+**     movups  %xmm0, -16\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L20:
+**     movups  %xmm0, \(%rdi\)
+**     movups  %xmm0, -16\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L21:
+**     movq    %rax, \(%rdi\)
+**     movq    %rax, -8\(%rdi,%rsi\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L22:
+**     movl    \$67372036, \(%rdi\)
+**     movl    \$67372036, -4\(%rdi,%rsi\)
+**     ret
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c
new file mode 100644
index 00000000000..f597395b38b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c
@@ -0,0 +1,102 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     movabsq \$289360691352306692, %rax
+**     vmovq   %rax, %xmm1
+**     vpbroadcastq    %xmm1, %ymm0
+**     cmpq    \$128, %rsi
+**     jnb     .L2
+**     testb   \$64, %sil
+**     jne     .L21
+**     testb   \$32, %sil
+**     jne     .L22
+**     testb   \$16, %sil
+**     jne     .L23
+**     testb   \$8, %sil
+**     jne     .L24
+**     testb   \$4, %sil
+**     jne     .L25
+**     testq   %rsi, %rsi
+**     jne     .L26
+**.L19:
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     vmovdqu %ymm0, -128\(%rdi,%rsi\)
+**     vmovdqu %ymm0, -96\(%rdi,%rsi\)
+**     vmovdqu %ymm0, -64\(%rdi,%rsi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rsi\)
+**     subq    \$1, %rsi
+**     cmpq    \$128, %rsi
+**     jb      .L19
+**     andq    \$-128, %rsi
+**     xorl    %eax, %eax
+**.L10:
+**     vmovdqu %ymm0, \(%rdi,%rax\)
+**     vmovdqu %ymm0, 32\(%rdi,%rax\)
+**     vmovdqu %ymm0, 64\(%rdi,%rax\)
+**     vmovdqu %ymm0, 96\(%rdi,%rax\)
+**     subq    \$-128, %rax
+**     cmpq    %rsi, %rax
+**     jb      .L10
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L26:
+**     movb    \$4, \(%rdi\)
+**     testb   \$2, %sil
+**     je      .L19
+**     movl    \$1028, %eax
+**     movw    %ax, -2\(%rdi,%rsi\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L21:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, 32\(%rdi\)
+**     vmovdqu %ymm0, -64\(%rdi,%rsi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rsi\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L22:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rsi\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     vmovdqu %xmm0, \(%rdi\)
+**     vmovdqu %xmm0, -16\(%rdi,%rsi\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L24:
+**     movq    %rax, \(%rdi\)
+**     movq    %rax, -8\(%rdi,%rsi\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L25:
+**     movl    \$67372036, \(%rdi\)
+**     movl    \$67372036, -4\(%rdi,%rsi\)
+**     jmp     .L19
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c
new file mode 100644
index 00000000000..7ba1b742076
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c
@@ -0,0 +1,109 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     movabsq \$289360691352306692, %rax
+**     vpbroadcastq    %rax, %zmm0
+**     cmpq    \$256, %rsi
+**     jnb     .L2
+**     testb   \$-128, %sil
+**     jne     .L22
+**     testb   \$64, %sil
+**     jne     .L23
+**     testb   \$32, %sil
+**     jne     .L24
+**     testb   \$16, %sil
+**     jne     .L25
+**     testb   \$8, %sil
+**     jne     .L26
+**     testb   \$4, %sil
+**     jne     .L27
+**     testq   %rsi, %rsi
+**     jne     .L28
+**.L20:
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     vmovdqu64       %zmm0, -256\(%rdi,%rsi\)
+**     vmovdqu64       %zmm0, -192\(%rdi,%rsi\)
+**     vmovdqu64       %zmm0, -128\(%rdi,%rsi\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rsi\)
+**     subq    \$1, %rsi
+**     cmpq    \$256, %rsi
+**     jb      .L20
+**     xorb    %sil, %sil
+**     xorl    %eax, %eax
+**.L11:
+**     vmovdqu64       %zmm0, \(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 64\(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 128\(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 192\(%rdi,%rax\)
+**     addq    \$256, %rax
+**     cmpq    %rsi, %rax
+**     jb      .L11
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L28:
+**     movb    \$4, \(%rdi\)
+**     testb   \$2, %sil
+**     je      .L20
+**     movl    \$1028, %eax
+**     movw    %ax, -2\(%rdi,%rsi\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L22:
+**     vmovdqu64       %zmm0, \(%rdi\)
+**     vmovdqu64       %zmm0, 64\(%rdi\)
+**     vmovdqu64       %zmm0, -128\(%rdi,%rsi\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rsi\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     vmovdqu64       %zmm0, \(%rdi\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rsi\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L24:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rsi\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L25:
+**     vmovdqu %xmm0, \(%rdi\)
+**     vmovdqu %xmm0, -16\(%rdi,%rsi\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L26:
+**     movq    %rax, \(%rdi\)
+**     movq    %rax, -8\(%rdi,%rsi\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L27:
+**     movl    \$67372036, \(%rdi\)
+**     movl    \$67372036, -4\(%rdi,%rsi\)
+**     jmp     .L20
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c
new file mode 100644
index 00000000000..62f61c54ed0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     movabsq \$72340172838076673, %rax
+**     movzbl  %sil, %esi
+**     imulq   %rax, %rsi
+**     movq    %rsi, %xmm0
+**     punpcklqdq      %xmm0, %xmm0
+**     cmpq    \$64, %rdx
+**     jnb     .L2
+**     testb   \$32, %dl
+**     jne     .L19
+**     testb   \$16, %dl
+**     jne     .L20
+**     testb   \$8, %dl
+**     jne     .L21
+**     testb   \$4, %dl
+**     jne     .L22
+**     testq   %rdx, %rdx
+**     jne     .L23
+**.L1:
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     movups  %xmm0, -64\(%rdi,%rdx\)
+**     movups  %xmm0, -48\(%rdi,%rdx\)
+**     movups  %xmm0, -32\(%rdi,%rdx\)
+**     movups  %xmm0, -16\(%rdi,%rdx\)
+**     subq    \$1, %rdx
+**     cmpq    \$64, %rdx
+**     jb      .L1
+**     andq    \$-64, %rdx
+**     xorl    %eax, %eax
+**.L9:
+**     movups  %xmm0, \(%rdi,%rax\)
+**     movups  %xmm0, 16\(%rdi,%rax\)
+**     movups  %xmm0, 32\(%rdi,%rax\)
+**     movups  %xmm0, 48\(%rdi,%rax\)
+**     addq    \$64, %rax
+**     cmpq    %rdx, %rax
+**     jb      .L9
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     movb    %sil, \(%rdi\)
+**     testb   \$2, %dl
+**     je      .L1
+**     movw    %si, -2\(%rdi,%rdx\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L19:
+**     movups  %xmm0, \(%rdi\)
+**     movups  %xmm0, 16\(%rdi\)
+**     movups  %xmm0, -32\(%rdi,%rdx\)
+**     movups  %xmm0, -16\(%rdi,%rdx\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L20:
+**     movups  %xmm0, \(%rdi\)
+**     movups  %xmm0, -16\(%rdi,%rdx\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L21:
+**     movq    %rsi, \(%rdi\)
+**     movq    %rsi, -8\(%rdi,%rdx\)
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L22:
+**     movl    %esi, \(%rdi\)
+**     movl    %esi, -4\(%rdi,%rdx\)
+**     ret
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c
new file mode 100644
index 00000000000..d12ab157494
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     movabsq \$72340172838076673, %rax
+**     movzbl  %sil, %esi
+**     imulq   %rax, %rsi
+**     vmovq   %rsi, %xmm1
+**     vpbroadcastq    %xmm1, %ymm0
+**     cmpq    \$128, %rdx
+**     jnb     .L2
+**     testb   \$64, %dl
+**     jne     .L21
+**     testb   \$32, %dl
+**     jne     .L22
+**     testb   \$16, %dl
+**     jne     .L23
+**     testb   \$8, %dl
+**     jne     .L24
+**     testb   \$4, %dl
+**     jne     .L25
+**     testq   %rdx, %rdx
+**     jne     .L26
+**.L19:
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     vmovdqu %ymm0, -128\(%rdi,%rdx\)
+**     vmovdqu %ymm0, -96\(%rdi,%rdx\)
+**     vmovdqu %ymm0, -64\(%rdi,%rdx\)
+**     vmovdqu %ymm0, -32\(%rdi,%rdx\)
+**     subq    \$1, %rdx
+**     cmpq    \$128, %rdx
+**     jb      .L19
+**     andq    \$-128, %rdx
+**     xorl    %eax, %eax
+**.L10:
+**     vmovdqu %ymm0, \(%rdi,%rax\)
+**     vmovdqu %ymm0, 32\(%rdi,%rax\)
+**     vmovdqu %ymm0, 64\(%rdi,%rax\)
+**     vmovdqu %ymm0, 96\(%rdi,%rax\)
+**     subq    \$-128, %rax
+**     cmpq    %rdx, %rax
+**     jb      .L10
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L26:
+**     movb    %sil, \(%rdi\)
+**     testb   \$2, %dl
+**     je      .L19
+**     movw    %si, -2\(%rdi,%rdx\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L21:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, 32\(%rdi\)
+**     vmovdqu %ymm0, -64\(%rdi,%rdx\)
+**     vmovdqu %ymm0, -32\(%rdi,%rdx\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L22:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rdx\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     vmovdqu %xmm0, \(%rdi\)
+**     vmovdqu %xmm0, -16\(%rdi,%rdx\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L24:
+**     movq    %rsi, \(%rdi\)
+**     movq    %rsi, -8\(%rdi,%rdx\)
+**     jmp     .L19
+**     .p2align 4,,10
+**     .p2align 3
+**.L25:
+**     movl    %esi, \(%rdi\)
+**     movl    %esi, -4\(%rdi,%rdx\)
+**     jmp     .L19
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c
new file mode 100644
index 00000000000..1a0abe6614f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c
@@ -0,0 +1,110 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**     .cfi_startproc
+**     movabsq \$72340172838076673, %rax
+**     movzbl  %sil, %esi
+**     imulq   %rax, %rsi
+**     vpbroadcastq    %rsi, %zmm0
+**     cmpq    \$256, %rdx
+**     jnb     .L2
+**     testb   \$-128, %dl
+**     jne     .L22
+**     testb   \$64, %dl
+**     jne     .L23
+**     testb   \$32, %dl
+**     jne     .L24
+**     testb   \$16, %dl
+**     jne     .L25
+**     testb   \$8, %dl
+**     jne     .L26
+**     testb   \$4, %dl
+**     jne     .L27
+**     testq   %rdx, %rdx
+**     jne     .L28
+**.L20:
+**     vzeroupper
+**     ret
+**     .p2align 4,,10
+**     .p2align 3
+**.L2:
+**     vmovdqu64       %zmm0, -256\(%rdi,%rdx\)
+**     vmovdqu64       %zmm0, -192\(%rdi,%rdx\)
+**     vmovdqu64       %zmm0, -128\(%rdi,%rdx\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rdx\)
+**     subq    \$1, %rdx
+**     cmpq    \$256, %rdx
+**     jb      .L20
+**     xorb    %dl, %dl
+**     xorl    %eax, %eax
+**.L11:
+**     vmovdqu64       %zmm0, \(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 64\(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 128\(%rdi,%rax\)
+**     vmovdqu64       %zmm0, 192\(%rdi,%rax\)
+**     addq    \$256, %rax
+**     cmpq    %rdx, %rax
+**     jb      .L11
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L28:
+**     movb    %sil, \(%rdi\)
+**     testb   \$2, %dl
+**     je      .L20
+**     movw    %si, -2\(%rdi,%rdx\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L22:
+**     vmovdqu64       %zmm0, \(%rdi\)
+**     vmovdqu64       %zmm0, 64\(%rdi\)
+**     vmovdqu64       %zmm0, -128\(%rdi,%rdx\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rdx\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L23:
+**     vmovdqu64       %zmm0, \(%rdi\)
+**     vmovdqu64       %zmm0, -64\(%rdi,%rdx\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L24:
+**     vmovdqu %ymm0, \(%rdi\)
+**     vmovdqu %ymm0, -32\(%rdi,%rdx\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L25:
+**     vmovdqu %xmm0, \(%rdi\)
+**     vmovdqu %xmm0, -16\(%rdi,%rdx\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L26:
+**     movq    %rsi, \(%rdi\)
+**     movq    %rsi, -8\(%rdi,%rdx\)
+**     jmp     .L20
+**     .p2align 4,,10
+**     .p2align 3
+**.L27:
+**     movl    %esi, \(%rdi\)
+**     movl    %esi, -4\(%rdi,%rdx\)
+**     jmp     .L20
+**     .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
-- 
2.49.0

Reply via email to