1. Don't generate the loop if the loop count is 1. 2. For memset with a vector value on a small size, use a narrower vector if the small size supports one; otherwise use the scalar value. 3. Duplicate the promoted scalar value for the vector. 4. Always expand the vector version of memset for vector_loop. 5. Use the misaligned prologue if alignment isn't needed. When the misaligned prologue is used, check whether the destination is actually aligned and update the destination alignment if it is.
The included tests show that codegen of vector_loop/unrolled_loop for memset/memcpy are significantly improved. For --- void foo (void *p1, size_t len) { __builtin_memset (p1, 0, len); } --- with -O2 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -march=x86-64 we used to generate foo: .LFB0: .cfi_startproc movq %rdi, %rax pxor %xmm0, %xmm0 cmpq $64, %rsi jnb .L18 .L2: andl $63, %esi je .L1 xorl %edx, %edx testb $1, %sil je .L5 movl $1, %edx movb $0, (%rax) cmpq %rsi, %rdx jnb .L19 .L5: movb $0, (%rax,%rdx) movb $0, 1(%rax,%rdx) addq $2, %rdx cmpq %rsi, %rdx jb .L5 .L1: ret .p2align 4,,10 .p2align 3 .L18: movq %rsi, %rdx xorl %eax, %eax andq $-64, %rdx .L3: movups %xmm0, (%rdi,%rax) movups %xmm0, 16(%rdi,%rax) movups %xmm0, 32(%rdi,%rax) movups %xmm0, 48(%rdi,%rax) addq $64, %rax cmpq %rdx, %rax jb .L3 addq %rdi, %rax jmp .L2 .L19: ret .cfi_endproc with very poor prologue/epilogue. With this patch, we now generate: foo: .LFB0: .cfi_startproc pxor %xmm0, %xmm0 cmpq $64, %rsi jnb .L2 testb $32, %sil jne .L19 testb $16, %sil jne .L20 testb $8, %sil jne .L21 testb $4, %sil jne .L22 testq %rsi, %rsi jne .L23 .L1: ret .p2align 4,,10 .p2align 3 .L2: movups %xmm0, -64(%rdi,%rsi) movups %xmm0, -48(%rdi,%rsi) movups %xmm0, -32(%rdi,%rsi) movups %xmm0, -16(%rdi,%rsi) subq $1, %rsi cmpq $64, %rsi jb .L1 andq $-64, %rsi xorl %eax, %eax .L9: movups %xmm0, (%rdi,%rax) movups %xmm0, 16(%rdi,%rax) movups %xmm0, 32(%rdi,%rax) movups %xmm0, 48(%rdi,%rax) addq $64, %rax cmpq %rsi, %rax jb .L9 ret .p2align 4,,10 .p2align 3 .L23: movb $0, (%rdi) testb $2, %sil je .L1 xorl %eax, %eax movw %ax, -2(%rdi,%rsi) ret .p2align 4,,10 .p2align 3 .L19: movups %xmm0, (%rdi) movups %xmm0, 16(%rdi) movups %xmm0, -32(%rdi,%rsi) movups %xmm0, -16(%rdi,%rsi) ret .p2align 4,,10 .p2align 3 .L20: movups %xmm0, (%rdi) movups %xmm0, -16(%rdi,%rsi) ret .p2align 4,,10 .p2align 3 .L21: movq $0, (%rdi) movq $0, -8(%rdi,%rsi) ret .p2align 4,,10 .p2align 3 .L22: movl $0, (%rdi) 
movl $0, -4(%rdi,%rsi) ret .cfi_endproc gcc/ PR target/120683 * config/i386/i386-expand.cc (expand_set_or_cpymem_via_loop): Don't generate the loop if the loop count is 1. (expand_small_cpymem_or_setmem): Choose cpymem mode from MOVE_MAX. For memset with vector and the size is smaller than the vector size, first try the narrower vector, otherwise, use the scalar value. (expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves): Add an argument to indicate if destination is aligned and align destination if aligned. (promote_duplicated_reg): Duplicate the scalar value for vector. (ix86_expand_set_or_cpymem): Always expand vector-version of memset for vector_loop. Use misaligned prologue if alignment isn't needed. When misaligned prologue is used, check if destination is actually aligned and update destination alignment if needed. gcc/testsuite/ PR target/120683 * gcc.target/i386/memcpy-pr120683-1.c: New test. * gcc.target/i386/memcpy-pr120683-2.c: Likewise. * gcc.target/i386/memcpy-pr120683-3.c: Likewise. * gcc.target/i386/memcpy-pr120683-4.c: Likewise. * gcc.target/i386/memcpy-pr120683-5.c: Likewise. * gcc.target/i386/memcpy-pr120683-6.c: Likewise. * gcc.target/i386/memcpy-pr120683-7.c: Likewise. * gcc.target/i386/memset-pr120683-1.c: Likewise. * gcc.target/i386/memset-pr120683-2.c: Likewise. * gcc.target/i386/memset-pr120683-3.c: Likewise. * gcc.target/i386/memset-pr120683-4.c: Likewise. * gcc.target/i386/memset-pr120683-5.c: Likewise. * gcc.target/i386/memset-pr120683-6.c: Likewise. * gcc.target/i386/memset-pr120683-7.c: Likewise. * gcc.target/i386/memset-pr120683-8.c: Likewise. * gcc.target/i386/memset-pr120683-9.c: Likewise. * gcc.target/i386/memset-pr120683-10.c: Likewise. * gcc.target/i386/memset-pr120683-11.c: Likewise. * gcc.target/i386/memset-pr120683-12.c: Likewise. * gcc.target/i386/memset-pr120683-13.c: Likewise. * gcc.target/i386/memset-pr120683-14.c: Likewise. * gcc.target/i386/memset-pr120683-15.c: Likewise. 
* gcc.target/i386/memset-pr120683-16.c: Likewise. * gcc.target/i386/memset-pr120683-17.c: Likewise. Signed-off-by: H.J. Lu <hjl.to...@gmail.com> --- gcc/config/i386/i386-expand.cc | 169 ++++++++++++++---- .../gcc.target/i386/memcpy-pr120683-1.c | 42 +++++ .../gcc.target/i386/memcpy-pr120683-2.c | 47 +++++ .../gcc.target/i386/memcpy-pr120683-3.c | 47 +++++ .../gcc.target/i386/memcpy-pr120683-4.c | 48 +++++ .../gcc.target/i386/memcpy-pr120683-5.c | 48 +++++ .../gcc.target/i386/memcpy-pr120683-6.c | 48 +++++ .../gcc.target/i386/memcpy-pr120683-7.c | 48 +++++ .../gcc.target/i386/memset-pr120683-1.c | 35 ++++ .../gcc.target/i386/memset-pr120683-10.c | 28 +++ .../gcc.target/i386/memset-pr120683-11.c | 29 +++ .../gcc.target/i386/memset-pr120683-12.c | 31 ++++ .../gcc.target/i386/memset-pr120683-13.c | 37 ++++ .../gcc.target/i386/memset-pr120683-14.c | 91 ++++++++++ .../gcc.target/i386/memset-pr120683-15.c | 103 +++++++++++ .../gcc.target/i386/memset-pr120683-16.c | 112 ++++++++++++ .../gcc.target/i386/memset-pr120683-17.c | 37 ++++ .../gcc.target/i386/memset-pr120683-2.c | 30 ++++ .../gcc.target/i386/memset-pr120683-3.c | 26 +++ .../gcc.target/i386/memset-pr120683-4.c | 93 ++++++++++ .../gcc.target/i386/memset-pr120683-5.c | 102 +++++++++++ .../gcc.target/i386/memset-pr120683-6.c | 109 +++++++++++ .../gcc.target/i386/memset-pr120683-7.c | 94 ++++++++++ .../gcc.target/i386/memset-pr120683-8.c | 103 +++++++++++ .../gcc.target/i386/memset-pr120683-9.c | 110 ++++++++++++ 25 files changed, 1635 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c create mode 100644 
gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-1.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-10.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-11.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-12.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-13.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-14.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-15.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-16.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-17.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-2.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-3.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-4.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-5.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-6.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-7.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-8.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-9.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 4946f87a131..9a07e026d62 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -7899,7 +7899,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem, rtx count, machine_mode mode, int unroll, int expected_size, bool issetmem) { - rtx_code_label *out_label, *top_label; + rtx_code_label *out_label = nullptr; + rtx_code_label *top_label = nullptr; rtx iter, tmp; machine_mode iter_mode = counter_mode (count); int piece_size_n = GET_MODE_SIZE (mode) * unroll; @@ -7907,9 +7908,19 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem, rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); rtx size; int i; + int loop_count; - 
top_label = gen_label_rtx (); - out_label = gen_label_rtx (); + if (expected_size != -1 && CONST_INT_P (count)) + loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll; + else + loop_count = -1; + + /* Don't generate the loop if the loop count is 1. */ + if (loop_count != 1) + { + top_label = gen_label_rtx (); + out_label = gen_label_rtx (); + } iter = gen_reg_rtx (iter_mode); size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, @@ -7923,7 +7934,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem, } emit_move_insn (iter, const0_rtx); - emit_label (top_label); + if (loop_count != 1) + emit_label (top_label); tmp = convert_modes (Pmode, iter_mode, iter, true); @@ -7991,21 +8003,25 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem, if (tmp != iter) emit_move_insn (iter, tmp); - emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, - true, top_label); - if (expected_size != -1) + if (loop_count != 1) { - expected_size /= GET_MODE_SIZE (mode) * unroll; - if (expected_size == 0) - predict_jump (0); - else if (expected_size > REG_BR_PROB_BASE) - predict_jump (REG_BR_PROB_BASE - 1); + emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + true, top_label); + if (expected_size != -1) + { + expected_size /= GET_MODE_SIZE (mode) * unroll; + if (expected_size == 0) + predict_jump (0); + else if (expected_size > REG_BR_PROB_BASE) + predict_jump (REG_BR_PROB_BASE - 1); + else + predict_jump (REG_BR_PROB_BASE + - (REG_BR_PROB_BASE + expected_size / 2) + / expected_size); + } else - predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) - / expected_size); + predict_jump (REG_BR_PROB_BASE * 80 / 100); } - else - predict_jump (REG_BR_PROB_BASE * 80 / 100); iter = ix86_zero_extend_to_Pmode (iter); tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, true, OPTAB_LIB_WIDEN); @@ -8018,7 +8034,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem, if (tmp != srcptr) emit_move_insn (srcptr, tmp); } - 
emit_label (out_label); + if (loop_count != 1) + emit_label (out_label); } /* Divide COUNTREG by SCALE. */ @@ -8552,6 +8569,7 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem, rtx_code_label *label = ix86_expand_aligntest (count, size, false); machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk (); rtx modesize; + rtx scalar_value = value; int n; /* If we do not have vector value to copy, we must reduce size. */ @@ -8571,11 +8589,57 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem, { /* Choose appropriate vector mode. */ if (size >= 32) - mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode; + switch (MOVE_MAX) + { + case 64: + if (size >= 64) + { + mode = V64QImode; + break; + } + /* FALLTHRU */ + case 32: + mode = V32QImode; + break; + case 16: + mode = V16QImode; + break; + case 8: + mode = DImode; + break; + default: + gcc_unreachable (); + } else if (size >= 16) mode = TARGET_SSE ? V16QImode : DImode; srcmem = change_address (srcmem, mode, srcptr); } + if (issetmem && vec_value && GET_MODE_SIZE (mode) > size) + { + /* For memset with vector and the size is smaller than the vector + size, first try the narrower vector, otherwise, use the + original value. 
*/ + machine_mode inner_mode = GET_MODE_INNER (mode); + unsigned int nunits = size / GET_MODE_SIZE (inner_mode); + if (nunits > 1) + { + mode = mode_for_vector (GET_MODE_INNER (mode), + nunits).require (); + value = gen_rtx_SUBREG (mode, value, 0); + } + else + { + scalar_int_mode smode + = smallest_int_mode_for_size (size * BITS_PER_UNIT).require (); + gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value)) + >= GET_MODE_SIZE (smode)); + mode = smode; + if (GET_MODE (scalar_value) == mode) + value = scalar_value; + else + value = gen_rtx_SUBREG (mode, scalar_value, 0); + } + } destmem = change_address (destmem, mode, destptr); modesize = GEN_INT (GET_MODE_SIZE (mode)); gcc_assert (GET_MODE_SIZE (mode) <= size); @@ -8631,6 +8695,8 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem, DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether we will dispatch to a library call for large blocks. + If ALIGNED_DESTMEM is true, destination is aligned. + In pseudocode we do: if (COUNT < SIZE) @@ -8680,7 +8746,8 @@ expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx src int align, unsigned HOST_WIDE_INT *min_size, bool dynamic_check, - bool issetmem) + bool issetmem, + bool aligned_destmem) { rtx_code_label *loop_label = NULL, *label; int n; @@ -8784,6 +8851,19 @@ expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx src destmem = offset_address (destmem, GEN_INT (-size - prolog_size), 1); + if (aligned_destmem) + { + /* Check if destination is still aligned after adjustment. 
*/ + aligned_destmem = false; + if (CONST_INT_P (*count)) + { + int mode_align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT; + int offset = INTVAL (*count) - size - prolog_size; + aligned_destmem = (offset % mode_align) == 0; + } + if (aligned_destmem) + set_mem_align (destmem, GET_MODE_ALIGNMENT (mode)); + } if (issetmem) emit_move_insn (destmem, mode_value); else @@ -8797,6 +8877,8 @@ expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx src for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) { destmem = offset_address (destmem, modesize, 1); + if (aligned_destmem) + set_mem_align (destmem, GET_MODE_ALIGNMENT (mode)); if (issetmem) emit_move_insn (destmem, mode_value); else @@ -9179,13 +9261,25 @@ decide_alignment (int align, static rtx promote_duplicated_reg (machine_mode mode, rtx val) { + if (val == const0_rtx) + return copy_to_mode_reg (mode, CONST0_RTX (mode)); + machine_mode valmode = GET_MODE (val); + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + /* Duplicate the scalar value for integer vector. */ + gcc_assert (GET_MODE_INNER (mode) == valmode); + rtx dup = gen_reg_rtx (mode); + bool ok = ix86_expand_vector_init_duplicate (false, mode, dup, + val); + gcc_assert (ok); + return dup; + } + rtx tmp; int nops = mode == DImode ? 3 : 2; - gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); - if (val == const0_rtx) - return copy_to_mode_reg (mode, CONST0_RTX (mode)); + gcc_assert (mode == SImode || mode == DImode); if (CONST_INT_P (val)) { HOST_WIDE_INT v = INTVAL (val) & 255; @@ -9414,11 +9508,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, return false; gcc_assert (alg != no_stringop); - /* For now vector-version of memset is generated only for memory zeroing, as - creating of promoted vector value is very cheap in this case. 
*/ - if (issetmem && alg == vector_loop && val_exp != const0_rtx) - alg = unrolled_loop; - if (!count) count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); @@ -9523,6 +9612,13 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, && ((desired_align > align && !align_bytes) || (!count && epilogue_size_needed > 1))); + /* Destination is aligned after the misaligned prologue. */ + bool aligned_dstmem = misaligned_prologue_used; + + /* Also use misaligned prologue if alignment isn't needed. The + aligned store will be used if destination is actually aligned. */ + misaligned_prologue_used |= noalign; + /* Do the cheap promotion to allow better CSE across the main loop and epilogue (ie one load of the big constant in the front of all code. @@ -9532,11 +9628,12 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, { if (alg == vector_loop) { - gcc_assert (val_exp == const0_rtx); - vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); promoted_val = promote_duplicated_reg_to_size (val_exp, GET_MODE_SIZE (word_mode), desired_align, align); + /* Duplicate the promoted scalar value. */ + vec_promoted_val = promote_duplicated_reg (move_mode, + promoted_val); } else { @@ -9549,7 +9646,13 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, also avoids redundant job when sizes are known precisely. */ if (misaligned_prologue_used) { - /* Misaligned move prologue handled small blocks by itself. */ + /* Misaligned move prologue handled small blocks by itself. + When alignment isn't needed, check if destination is + actually aligned and update destination alignment if + aligned. 
*/ + if (noalign) + aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode) + <= MEM_ALIGN (dst)); expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (dst, src, &destreg, &srcreg, move_mode, promoted_val, vec_promoted_val, @@ -9557,11 +9660,13 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, &jump_around_label, desired_align < align ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed, - desired_align, align, &min_size, dynamic_check, issetmem); + desired_align, align, &min_size, dynamic_check, issetmem, + aligned_dstmem); if (!issetmem) src = change_address (src, BLKmode, srcreg); dst = change_address (dst, BLKmode, destreg); - set_mem_align (dst, desired_align * BITS_PER_UNIT); + if (aligned_dstmem) + set_mem_align (dst, desired_align * BITS_PER_UNIT); epilogue_size_needed = 0; if (need_zero_guard && min_size < (unsigned HOST_WIDE_INT) size_needed) diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c new file mode 100644 index 00000000000..753238e35fc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-sse -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** movq 221\(%rsi\), %rax +** xorl %edx, %edx +** movq %rax, 221\(%rdi\) +** movq 229\(%rsi\), %rax +** movq %rax, 229\(%rdi\) +** movq 237\(%rsi\), %rax +** movq %rax, 237\(%rdi\) +** movq 245\(%rsi\), %rax +** movq %rax, 245\(%rdi\) +**.L[0-9]+: +** movl %edx, %eax +** addl \$32, %edx +** movq \(%rsi,%rax\), %r10 +** movq 8\(%rsi,%rax\), %r9 +** movq 16\(%rsi,%rax\), %r8 +** movq 24\(%rsi,%rax\), %rcx +** movq %r10, \(%rdi,%rax\) +** movq %r9, 8\(%rdi,%rax\) +** movq %r8, 16\(%rdi,%rax\) +** movq %rcx, 24\(%rdi,%rax\) +** cmpl \$224, %edx +** jb .L[0-9]+ +** ret +**... +*/ + +void +foo (char *dest, char *src) +{ + __builtin_memcpy (dest, src, 253); +} + +/* { dg-final { scan-assembler-not "rep mov" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c new file mode 100644 index 00000000000..b7ea7c2d489 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** movdqu src\+208\(%rip\), %xmm0 +** xorl %edx, %edx +** movaps %xmm0, dest\+208\(%rip\) +** movdqu src\+224\(%rip\), %xmm0 +** movaps %xmm0, dest\+224\(%rip\) +** movdqu src\+240\(%rip\), %xmm0 +** movaps %xmm0, dest\+240\(%rip\) +** movdqu src\+256\(%rip\), %xmm0 +** movaps %xmm0, dest\+256\(%rip\) +**.L[0-9]+: +** movl %edx, %eax +** addl \$64, %edx +** movdqu src\(%rax\), %xmm3 +** movdqu src\+16\(%rax\), %xmm2 +** movdqu src\+32\(%rax\), %xmm1 +** movdqu src\+48\(%rax\), %xmm0 +** movaps %xmm3, dest\(%rax\) +** movaps %xmm2, dest\+16\(%rax\) +** movaps %xmm1, dest\+32\(%rax\) +** movaps %xmm0, dest\+48\(%rax\) +** cmpl \$256, %edx +** jb .L[0-9]+ +** ret +**... +*/ + +#define SIZE (16 + 1) * 16 + +char dest[SIZE]; +char src[SIZE]; + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-not "rep mov" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c new file mode 100644 index 00000000000..75295702952 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** movdqu src\+223\(%rip\), %xmm0 +** xorl %edx, %edx +** movups %xmm0, dest\+223\(%rip\) +** movdqu src\+239\(%rip\), %xmm0 +** movups %xmm0, dest\+239\(%rip\) +** movdqu src\+255\(%rip\), %xmm0 +** movups %xmm0, dest\+255\(%rip\) +** movdqu src\+271\(%rip\), %xmm0 +** movups %xmm0, dest\+271\(%rip\) +**.L[0-9]+: +** movl %edx, %eax +** addl \$64, %edx +** movdqu src\(%rax\), %xmm3 +** movdqu src\+16\(%rax\), %xmm2 +** movdqu src\+32\(%rax\), %xmm1 +** movdqu src\+48\(%rax\), %xmm0 +** movaps %xmm3, dest\(%rax\) +** movaps %xmm2, dest\+16\(%rax\) +** movaps %xmm1, dest\+32\(%rax\) +** movaps %xmm0, dest\+48\(%rax\) +** cmpl \$256, %edx +** jb .L[0-9]+ +** ret +**... +*/ + +#define SIZE 16 * 16 + 31 + +char dest[SIZE]; +char src[SIZE]; + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-not "rep mov" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c new file mode 100644 index 00000000000..e83ec64a8ad --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu src\+416\(%rip\), %ymm0 +** xorl %edx, %edx +** vmovdqa %ymm0, dest\+416\(%rip\) +** vmovdqu src\+448\(%rip\), %ymm0 +** vmovdqa %ymm0, dest\+448\(%rip\) +** vmovdqu src\+480\(%rip\), %ymm0 +** vmovdqa %ymm0, dest\+480\(%rip\) +** vmovdqu src\+512\(%rip\), %ymm0 +** vmovdqa %ymm0, dest\+512\(%rip\) +**.L[0-9]+: +** movl %edx, %eax +** subl \$-128, %edx +** vmovdqu src\(%rax\), %ymm3 +** vmovdqu src\+32\(%rax\), %ymm2 +** vmovdqu src\+64\(%rax\), %ymm1 +** vmovdqu src\+96\(%rax\), %ymm0 +** vmovdqa %ymm3, dest\(%rax\) +** vmovdqa %ymm2, dest\+32\(%rax\) +** vmovdqa %ymm1, dest\+64\(%rax\) +** vmovdqa %ymm0, dest\+96\(%rax\) +** cmpl \$512, %edx +** jb .L[0-9]+ +** vzeroupper +** ret +**... +*/ + +#define SIZE (16 + 1) * 32 + +char dest[SIZE]; +char src[SIZE]; + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-not "rep mov" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c new file mode 100644 index 00000000000..4ce7e2bb221 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu src\+447\(%rip\), %ymm0 +** xorl %edx, %edx +** vmovdqu %ymm0, dest\+447\(%rip\) +** vmovdqu src\+479\(%rip\), %ymm0 +** vmovdqu %ymm0, dest\+479\(%rip\) +** vmovdqu src\+511\(%rip\), %ymm0 +** vmovdqu %ymm0, dest\+511\(%rip\) +** vmovdqu src\+543\(%rip\), %ymm0 +** vmovdqu %ymm0, dest\+543\(%rip\) +**.L[0-9]+: +** movl %edx, %eax +** subl \$-128, %edx +** vmovdqu src\(%rax\), %ymm3 +** vmovdqu src\+32\(%rax\), %ymm2 +** vmovdqu src\+64\(%rax\), %ymm1 +** vmovdqu src\+96\(%rax\), %ymm0 +** vmovdqa %ymm3, dest\(%rax\) +** vmovdqa %ymm2, dest\+32\(%rax\) +** vmovdqa %ymm1, dest\+64\(%rax\) +** vmovdqa %ymm0, dest\+96\(%rax\) +** cmpl \$512, %edx +** jb .L[0-9]+ +** vzeroupper +** ret +**... +*/ + +#define SIZE 16 * 32 + 32 + 31 + +char dest[SIZE]; +char src[SIZE]; + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-not "rep mov" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c new file mode 100644 index 00000000000..69048633c23 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu64 src\+832\(%rip\), %zmm0 +** xorl %edx, %edx +** vmovdqa64 %zmm0, dest\+832\(%rip\) +** vmovdqu64 src\+896\(%rip\), %zmm0 +** vmovdqa64 %zmm0, dest\+896\(%rip\) +** vmovdqu64 src\+960\(%rip\), %zmm0 +** vmovdqa64 %zmm0, dest\+960\(%rip\) +** vmovdqu64 src\+1024\(%rip\), %zmm0 +** vmovdqa64 %zmm0, dest\+1024\(%rip\) +**.L[0-9]+: +** movl %edx, %eax +** addl \$256, %edx +** vmovdqu64 src\(%rax\), %zmm3 +** vmovdqu64 src\+64\(%rax\), %zmm2 +** vmovdqu64 src\+128\(%rax\), %zmm1 +** vmovdqu64 src\+192\(%rax\), %zmm0 +** vmovdqa64 %zmm3, dest\(%rax\) +** vmovdqa64 %zmm2, dest\+64\(%rax\) +** vmovdqa64 %zmm1, dest\+128\(%rax\) +** vmovdqa64 %zmm0, dest\+192\(%rax\) +** cmpl \$1024, %edx +** jb .L[0-9]+ +** vzeroupper +** ret +**... +*/ + +#define SIZE (16 + 1) * 64 + +char dest[SIZE] __attribute__((aligned(64))); +char src[SIZE] __attribute__((aligned(64))); + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-not "rep mov" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c new file mode 100644 index 00000000000..f517ca48a14 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu64 src\+831\(%rip\), %zmm0 +** xorl %edx, %edx +** vmovdqu64 %zmm0, dest\+831\(%rip\) +** vmovdqu64 src\+895\(%rip\), %zmm0 +** vmovdqu64 %zmm0, dest\+895\(%rip\) +** vmovdqu64 src\+959\(%rip\), %zmm0 +** vmovdqu64 %zmm0, dest\+959\(%rip\) +** vmovdqu64 src\+1023\(%rip\), %zmm0 +** vmovdqu64 %zmm0, dest\+1023\(%rip\) +**.L[0-9]+: +** movl %edx, %eax +** addl \$256, %edx +** vmovdqu64 src\(%rax\), %zmm3 +** vmovdqu64 src\+64\(%rax\), %zmm2 +** vmovdqu64 src\+128\(%rax\), %zmm1 +** vmovdqu64 src\+192\(%rax\), %zmm0 +** vmovdqa64 %zmm3, dest\(%rax\) +** vmovdqa64 %zmm2, dest\+64\(%rax\) +** vmovdqa64 %zmm1, dest\+128\(%rax\) +** vmovdqa64 %zmm0, dest\+192\(%rax\) +** cmpl \$1024, %edx +** jb .L[0-9]+ +** vzeroupper +** ret +**... +*/ + +#define SIZE 16 * 64 + 63 + +char dest[SIZE] __attribute__((aligned(64))); +char src[SIZE] __attribute__((aligned(64))); + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-not "rep mov" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c new file mode 100644 index 00000000000..90e544df7ab --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** pxor %xmm0, %xmm0 +** xorl %eax, %eax +** movups %xmm0, 190\(%rdi\) +** movups %xmm0, 206\(%rdi\) +** movups %xmm0, 222\(%rdi\) +** movups %xmm0, 238\(%rdi\) +**.L[0-9]+: +** movl %eax, %edx +** addl \$64, %eax +** movups %xmm0, \(%rdi,%rdx\) +** movups %xmm0, 16\(%rdi,%rdx\) +** movups %xmm0, 32\(%rdi,%rdx\) +** movups %xmm0, 48\(%rdi,%rdx\) +** cmpl \$192, %eax +** jb .L[0-9]+ +** ret +**... +*/ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 254); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c new file mode 100644 index 00000000000..21e86a3ee0e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** movq \$0, 29\(%rdi\) +** movq \$0, 37\(%rdi\) +** movq \$0, 45\(%rdi\) +** movq \$0, 53\(%rdi\) +** movq \$0, \(%rdi\) +** movq \$0, 8\(%rdi\) +** movq \$0, 16\(%rdi\) +** movq \$0, 24\(%rdi\) +** ret +**... 
+*/ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 61); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c new file mode 100644 index 00000000000..30b0cad04e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** movabsq \$289360691352306692, %rax +** movq %rax, 48\(%rdi\) +** movq %rax, \(%rdi\) +** movq %rax, 8\(%rdi\) +** movq %rax, 16\(%rdi\) +** movq %rax, 24\(%rdi\) +** movq %rax, 32\(%rdi\) +** movq %rax, 40\(%rdi\) +** movq %rax, 53\(%rdi\) +** ret +**... +*/ + +void +foo (char *dest) +{ + __builtin_memset (dest, 4, 61); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c new file mode 100644 index 00000000000..15987a6451f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** movabsq \$72340172838076673, %rax +** movzbl %sil, %esi +** imulq %rax, %rsi +** movq %rsi, 48\(%rdi\) +** movq %rsi, \(%rdi\) +** movq %rsi, 8\(%rdi\) +** movq %rsi, 16\(%rdi\) +** movq %rsi, 24\(%rdi\) +** movq %rsi, 32\(%rdi\) +** movq %rsi, 40\(%rdi\) +** movq %rsi, 53\(%rdi\) +** ret +**... 
+*/ + +void +foo (char *dest, int c) +{ + __builtin_memset (dest, c, 61); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c new file mode 100644 index 00000000000..35c34fc0502 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** pxor %xmm0, %xmm0 +** xorl %eax, %eax +** movaps %xmm0, dest\+176\(%rip\) +** movaps %xmm0, dest\+192\(%rip\) +** movaps %xmm0, dest\+208\(%rip\) +** movaps %xmm0, dest\+224\(%rip\) +**.L[0-9]+: +** movl %eax, %edx +** addl \$64, %eax +** movaps %xmm0, dest\(%rdx\) +** movaps %xmm0, dest\+16\(%rdx\) +** movaps %xmm0, dest\+32\(%rdx\) +** movaps %xmm0, dest\+48\(%rdx\) +** cmpl \$192, %eax +** jb .L[0-9]+ +** ret +**... +*/ + +char dest[240]; + +void +foo (void) +{ + __builtin_memset (dest, 0, sizeof (dest)); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c new file mode 100644 index 00000000000..7ec9b3fe1bd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c @@ -0,0 +1,91 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** pxor %xmm0, %xmm0 +** cmpq \$64, %rsi +** jnb .L2 +** testb \$32, %sil +** jne .L19 +** testb \$16, %sil +** jne .L20 +** testb \$8, %sil +** jne .L21 +** testb \$4, %sil +** jne .L22 +** testq %rsi, %rsi +** jne .L23 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** movups %xmm0, -64\(%rdi,%rsi\) +** movups %xmm0, -48\(%rdi,%rsi\) +** movups %xmm0, -32\(%rdi,%rsi\) +** movups %xmm0, -16\(%rdi,%rsi\) +** subq \$1, %rsi +** cmpq \$64, %rsi +** jb .L1 +** andq \$-64, %rsi +** xorl %eax, %eax +**.L9: +** movups %xmm0, \(%rdi,%rax\) +** movups %xmm0, 16\(%rdi,%rax\) +** movups %xmm0, 32\(%rdi,%rax\) +** movups %xmm0, 48\(%rdi,%rax\) +** addq \$64, %rax +** cmpq %rsi, %rax +** jb .L9 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L23: +** movb \$0, \(%rdi\) +** testb \$2, %sil +** je .L1 +** xorl %eax, %eax +** movw %ax, -2\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** movups %xmm0, \(%rdi\) +** movups %xmm0, 16\(%rdi\) +** movups %xmm0, -32\(%rdi,%rsi\) +** movups %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** movups %xmm0, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L21: +** movq \$0, \(%rdi\) +** movq \$0, -8\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L22: +** movl \$0, \(%rdi\) +** movl \$0, -4\(%rdi,%rsi\) +** ret +** .cfi_endproc +**... 
+*/ + +void +foo (char *dest, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, 0, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c new file mode 100644 index 00000000000..e7544057994 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c @@ -0,0 +1,103 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** vpxor %xmm0, %xmm0, %xmm0 +** cmpq \$128, %rsi +** jnb .L2 +** testb \$64, %sil +** jne .L22 +** testb \$32, %sil +** jne .L23 +** testb \$16, %sil +** jne .L24 +** testb \$8, %sil +** jne .L25 +** testb \$4, %sil +** jne .L26 +** testq %rsi, %rsi +** jne .L27 +**.L20: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** vmovdqu %ymm0, -128\(%rdi,%rsi\) +** vmovdqu %ymm0, -96\(%rdi,%rsi\) +** vmovdqu %ymm0, -64\(%rdi,%rsi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** subq \$1, %rsi +** cmpq \$128, %rsi +** jb .L19 +** andq \$-128, %rsi +** xorl %eax, %eax +**.L10: +** vmovdqu %ymm0, \(%rdi,%rax\) +** vmovdqu %ymm0, 32\(%rdi,%rax\) +** vmovdqu %ymm0, 64\(%rdi,%rax\) +** vmovdqu %ymm0, 96\(%rdi,%rax\) +** subq \$-128, %rax +** cmpq %rsi, %rax +** jb .L10 +**.L19: +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L27: +** movb \$0, \(%rdi\) +** testb \$2, %sil +** je .L20 +** xorl %eax, %eax +** movw %ax, -2\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L22: +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, 32\(%rdi\) +** vmovdqu %ymm0, -64\(%rdi,%rsi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L23: +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, 
-32\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L24: +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L25: +** movq \$0, \(%rdi\) +** movq \$0, -8\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L26: +** movl \$0, \(%rdi\) +** movl \$0, -4\(%rdi,%rsi\) +** ret +** .cfi_endproc +**... +*/ + +void +foo (char *dest, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, 0, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c new file mode 100644 index 00000000000..c519bf36fb0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c @@ -0,0 +1,112 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** vpxor %xmm0, %xmm0, %xmm0 +** cmpq \$256, %rsi +** jnb .L2 +** testb \$-128, %sil +** jne .L23 +** testb \$64, %sil +** jne .L24 +** testb \$32, %sil +** jne .L25 +** testb \$16, %sil +** jne .L26 +** testb \$8, %sil +** jne .L27 +** testb \$4, %sil +** jne .L28 +** testq %rsi, %rsi +** jne .L29 +**.L21: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** vmovdqu64 %zmm0, -256\(%rdi,%rsi\) +** vmovdqu64 %zmm0, -192\(%rdi,%rsi\) +** vmovdqu64 %zmm0, -128\(%rdi,%rsi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rsi\) +** subq \$1, %rsi +** cmpq \$256, %rsi +** jb .L20 +** xorb %sil, %sil +** xorl %eax, %eax +**.L11: +** vmovdqu64 %zmm0, \(%rdi,%rax\) +** vmovdqu64 %zmm0, 64\(%rdi,%rax\) +** vmovdqu64 %zmm0, 128\(%rdi,%rax\) +** vmovdqu64 %zmm0, 192\(%rdi,%rax\) +** addq \$256, %rax +** cmpq %rsi, %rax +** jb .L11 +**.L20: +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L29: +** movb \$0, \(%rdi\) +** testb \$2, %sil +** je .L21 +** xorl %eax, %eax +** movw %ax, -2\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L23: +** vmovdqu64 %zmm0, \(%rdi\) +** vmovdqu64 %zmm0, 64\(%rdi\) +** vmovdqu64 %zmm0, -128\(%rdi,%rsi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L24: +** vmovdqu64 %zmm0, \(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L25: +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L26: +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L27: +** movq \$0, \(%rdi\) +** movq \$0, -8\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L28: +** movl \$0, \(%rdi\) +** movl \$0, -4\(%rdi,%rsi\) +** ret +** .cfi_endproc +**... 
+*/ + +void +foo (char *dest, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, 0, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c new file mode 100644 index 00000000000..4fb87774ee8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** pxor %xmm0, %xmm0 +** xorl %eax, %eax +** movups %xmm0, dest\+120\(%rip\) +** movups %xmm0, dest\+136\(%rip\) +** movups %xmm0, dest\+152\(%rip\) +** movups %xmm0, dest\+168\(%rip\) +**.L[0-9]+: +** movl %eax, %edx +** addl \$64, %eax +** movaps %xmm0, dest\(%rdx\) +** movaps %xmm0, dest\+16\(%rdx\) +** movaps %xmm0, dest\+32\(%rdx\) +** movaps %xmm0, dest\+48\(%rdx\) +** cmpl \$128, %eax +** jb .L[0-9]+ +** ret +**... +*/ + +char dest[184]; + +void +foo (void) +{ + __builtin_memset (dest, 0, sizeof (dest)); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c new file mode 100644 index 00000000000..775fb4ce96d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** vpxor %xmm0, %xmm0, %xmm0 +** vmovdqu %ymm0, 126\(%rdi\) +** vmovdqu %ymm0, 158\(%rdi\) +** vmovdqu %ymm0, 190\(%rdi\) +** vmovdqu %ymm0, 222\(%rdi\) +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, 32\(%rdi\) +** vmovdqu %ymm0, 64\(%rdi\) +** vmovdqu %ymm0, 96\(%rdi\) +** vzeroupper +** ret +**... +*/ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 254); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c new file mode 100644 index 00000000000..621baf7b9fe --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** vpxor %xmm0, %xmm0, %xmm0 +** vmovdqu8 %zmm0, 128\(%rdi\) +** vmovdqu8 %zmm0, \(%rdi\) +** vmovdqu8 %zmm0, 64\(%rdi\) +** vmovdqu8 %zmm0, 190\(%rdi\) +** vzeroupper +** ret +**... +*/ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 254); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c new file mode 100644 index 00000000000..712404be416 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c @@ -0,0 +1,93 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movabsq \$289360691352306692, %rax +** movq %rax, %xmm0 +** punpcklqdq %xmm0, %xmm0 +** cmpq \$64, %rsi +** jnb .L2 +** testb \$32, %sil +** jne .L19 +** testb \$16, %sil +** jne .L20 +** testb \$8, %sil +** jne .L21 +** testb \$4, %sil +** jne .L22 +** testq %rsi, %rsi +** jne .L23 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** movups %xmm0, -64\(%rdi,%rsi\) +** movups %xmm0, -48\(%rdi,%rsi\) +** movups %xmm0, -32\(%rdi,%rsi\) +** movups %xmm0, -16\(%rdi,%rsi\) +** subq \$1, %rsi +** cmpq \$64, %rsi +** jb .L1 +** andq \$-64, %rsi +** xorl %eax, %eax +**.L9: +** movups %xmm0, \(%rdi,%rax\) +** movups %xmm0, 16\(%rdi,%rax\) +** movups %xmm0, 32\(%rdi,%rax\) +** movups %xmm0, 48\(%rdi,%rax\) +** addq \$64, %rax +** cmpq %rsi, %rax +** jb .L9 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L23: +** movb \$4, \(%rdi\) +** testb \$2, %sil +** je .L1 +** movl \$1028, %eax +** movw %ax, -2\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** movups %xmm0, \(%rdi\) +** movups %xmm0, 16\(%rdi\) +** movups %xmm0, -32\(%rdi,%rsi\) +** movups %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** movups %xmm0, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L21: +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L22: +** movl \$67372036, \(%rdi\) +** movl \$67372036, -4\(%rdi,%rsi\) +** ret +** .cfi_endproc +**... 
+*/ + +void +foo (char *dest, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, 4, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c new file mode 100644 index 00000000000..f597395b38b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c @@ -0,0 +1,102 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movabsq \$289360691352306692, %rax +** vmovq %rax, %xmm1 +** vpbroadcastq %xmm1, %ymm0 +** cmpq \$128, %rsi +** jnb .L2 +** testb \$64, %sil +** jne .L21 +** testb \$32, %sil +** jne .L22 +** testb \$16, %sil +** jne .L23 +** testb \$8, %sil +** jne .L24 +** testb \$4, %sil +** jne .L25 +** testq %rsi, %rsi +** jne .L26 +**.L19: +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** vmovdqu %ymm0, -128\(%rdi,%rsi\) +** vmovdqu %ymm0, -96\(%rdi,%rsi\) +** vmovdqu %ymm0, -64\(%rdi,%rsi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** subq \$1, %rsi +** cmpq \$128, %rsi +** jb .L19 +** andq \$-128, %rsi +** xorl %eax, %eax +**.L10: +** vmovdqu %ymm0, \(%rdi,%rax\) +** vmovdqu %ymm0, 32\(%rdi,%rax\) +** vmovdqu %ymm0, 64\(%rdi,%rax\) +** vmovdqu %ymm0, 96\(%rdi,%rax\) +** subq \$-128, %rax +** cmpq %rsi, %rax +** jb .L10 +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L26: +** movb \$4, \(%rdi\) +** testb \$2, %sil +** je .L19 +** movl \$1028, %eax +** movw %ax, -2\(%rdi,%rsi\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L21: +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, 32\(%rdi\) +** vmovdqu %ymm0, -64\(%rdi,%rsi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L22: +** 
vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L23: +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rsi\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L24: +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rsi\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L25: +** movl \$67372036, \(%rdi\) +** movl \$67372036, -4\(%rdi,%rsi\) +** jmp .L19 +** .cfi_endproc +**... +*/ + +void +foo (char *dest, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, 4, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c new file mode 100644 index 00000000000..7ba1b742076 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c @@ -0,0 +1,109 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movabsq \$289360691352306692, %rax +** vpbroadcastq %rax, %zmm0 +** cmpq \$256, %rsi +** jnb .L2 +** testb \$-128, %sil +** jne .L22 +** testb \$64, %sil +** jne .L23 +** testb \$32, %sil +** jne .L24 +** testb \$16, %sil +** jne .L25 +** testb \$8, %sil +** jne .L26 +** testb \$4, %sil +** jne .L27 +** testq %rsi, %rsi +** jne .L28 +**.L20: +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** vmovdqu64 %zmm0, -256\(%rdi,%rsi\) +** vmovdqu64 %zmm0, -192\(%rdi,%rsi\) +** vmovdqu64 %zmm0, -128\(%rdi,%rsi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rsi\) +** subq \$1, %rsi +** cmpq \$256, %rsi +** jb .L20 +** xorb %sil, %sil +** xorl %eax, %eax +**.L11: +** vmovdqu64 %zmm0, \(%rdi,%rax\) +** vmovdqu64 %zmm0, 64\(%rdi,%rax\) +** vmovdqu64 %zmm0, 128\(%rdi,%rax\) +** vmovdqu64 %zmm0, 192\(%rdi,%rax\) +** addq \$256, %rax +** cmpq %rsi, %rax +** jb .L11 +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L28: +** movb \$4, \(%rdi\) +** testb \$2, %sil +** je .L20 +** movl \$1028, %eax +** movw %ax, -2\(%rdi,%rsi\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L22: +** vmovdqu64 %zmm0, \(%rdi\) +** vmovdqu64 %zmm0, 64\(%rdi\) +** vmovdqu64 %zmm0, -128\(%rdi,%rsi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rsi\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L23: +** vmovdqu64 %zmm0, \(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rsi\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L24: +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L25: +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rsi\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L26: +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rsi\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L27: +** movl \$67372036, \(%rdi\) +** movl \$67372036, -4\(%rdi,%rsi\) +** jmp .L20 +** .cfi_endproc +**... 
+*/ + +void +foo (char *dest, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, 4, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c new file mode 100644 index 00000000000..62f61c54ed0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movabsq \$72340172838076673, %rax +** movzbl %sil, %esi +** imulq %rax, %rsi +** movq %rsi, %xmm0 +** punpcklqdq %xmm0, %xmm0 +** cmpq \$64, %rdx +** jnb .L2 +** testb \$32, %dl +** jne .L19 +** testb \$16, %dl +** jne .L20 +** testb \$8, %dl +** jne .L21 +** testb \$4, %dl +** jne .L22 +** testq %rdx, %rdx +** jne .L23 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** movups %xmm0, -64\(%rdi,%rdx\) +** movups %xmm0, -48\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** movups %xmm0, -16\(%rdi,%rdx\) +** subq \$1, %rdx +** cmpq \$64, %rdx +** jb .L1 +** andq \$-64, %rdx +** xorl %eax, %eax +**.L9: +** movups %xmm0, \(%rdi,%rax\) +** movups %xmm0, 16\(%rdi,%rax\) +** movups %xmm0, 32\(%rdi,%rax\) +** movups %xmm0, 48\(%rdi,%rax\) +** addq \$64, %rax +** cmpq %rdx, %rax +** jb .L9 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L23: +** movb %sil, \(%rdi\) +** testb \$2, %dl +** je .L1 +** movw %si, -2\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** movups %xmm0, \(%rdi\) +** movups %xmm0, 16\(%rdi\) +** movups %xmm0, -32\(%rdi,%rdx\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** movups %xmm0, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 
4,,10 +** .p2align 3 +**.L21: +** movq %rsi, \(%rdi\) +** movq %rsi, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L22: +** movl %esi, \(%rdi\) +** movl %esi, -4\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +foo (char *dest, int c, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, c, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c new file mode 100644 index 00000000000..d12ab157494 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c @@ -0,0 +1,103 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movabsq \$72340172838076673, %rax +** movzbl %sil, %esi +** imulq %rax, %rsi +** vmovq %rsi, %xmm1 +** vpbroadcastq %xmm1, %ymm0 +** cmpq \$128, %rdx +** jnb .L2 +** testb \$64, %dl +** jne .L21 +** testb \$32, %dl +** jne .L22 +** testb \$16, %dl +** jne .L23 +** testb \$8, %dl +** jne .L24 +** testb \$4, %dl +** jne .L25 +** testq %rdx, %rdx +** jne .L26 +**.L19: +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** vmovdqu %ymm0, -128\(%rdi,%rdx\) +** vmovdqu %ymm0, -96\(%rdi,%rdx\) +** vmovdqu %ymm0, -64\(%rdi,%rdx\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** subq \$1, %rdx +** cmpq \$128, %rdx +** jb .L19 +** andq \$-128, %rdx +** xorl %eax, %eax +**.L10: +** vmovdqu %ymm0, \(%rdi,%rax\) +** vmovdqu %ymm0, 32\(%rdi,%rax\) +** vmovdqu %ymm0, 64\(%rdi,%rax\) +** vmovdqu %ymm0, 96\(%rdi,%rax\) +** subq \$-128, %rax +** cmpq %rdx, %rax +** jb .L10 +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L26: +** movb %sil, \(%rdi\) +** testb \$2, %dl +** je .L19 +** movw %si, -2\(%rdi,%rdx\) +** 
jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L21: +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, 32\(%rdi\) +** vmovdqu %ymm0, -64\(%rdi,%rdx\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L22: +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L23: +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L24: +** movq %rsi, \(%rdi\) +** movq %rsi, -8\(%rdi,%rdx\) +** jmp .L19 +** .p2align 4,,10 +** .p2align 3 +**.L25: +** movl %esi, \(%rdi\) +** movl %esi, -4\(%rdi,%rdx\) +** jmp .L19 +** .cfi_endproc +**... +*/ + +void +foo (char *dest, int c, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, c, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c new file mode 100644 index 00000000000..1a0abe6614f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c @@ -0,0 +1,110 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movabsq \$72340172838076673, %rax +** movzbl %sil, %esi +** imulq %rax, %rsi +** vpbroadcastq %rsi, %zmm0 +** cmpq \$256, %rdx +** jnb .L2 +** testb \$-128, %dl +** jne .L22 +** testb \$64, %dl +** jne .L23 +** testb \$32, %dl +** jne .L24 +** testb \$16, %dl +** jne .L25 +** testb \$8, %dl +** jne .L26 +** testb \$4, %dl +** jne .L27 +** testq %rdx, %rdx +** jne .L28 +**.L20: +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** vmovdqu64 %zmm0, -256\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -192\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -64\(%rdi,%rdx\) +** subq \$1, %rdx +** cmpq \$256, %rdx +** jb .L20 +** xorb %dl, %dl +** xorl %eax, %eax +**.L11: +** vmovdqu64 %zmm0, \(%rdi,%rax\) +** vmovdqu64 %zmm0, 64\(%rdi,%rax\) +** vmovdqu64 %zmm0, 128\(%rdi,%rax\) +** vmovdqu64 %zmm0, 192\(%rdi,%rax\) +** addq \$256, %rax +** cmpq %rdx, %rax +** jb .L11 +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L28: +** movb %sil, \(%rdi\) +** testb \$2, %dl +** je .L20 +** movw %si, -2\(%rdi,%rdx\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L22: +** vmovdqu64 %zmm0, \(%rdi\) +** vmovdqu64 %zmm0, 64\(%rdi\) +** vmovdqu64 %zmm0, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -64\(%rdi,%rdx\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L23: +** vmovdqu64 %zmm0, \(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rdx\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L24: +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L25: +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L26: +** movq %rsi, \(%rdi\) +** movq %rsi, -8\(%rdi,%rdx\) +** jmp .L20 +** .p2align 4,,10 +** .p2align 3 +**.L27: +** movl %esi, \(%rdi\) +** movl %esi, -4\(%rdi,%rdx\) +** jmp .L20 +** .cfi_endproc +**... 
+*/ + +void +foo (char *dest, int c, __SIZE_TYPE__ n) +{ + __builtin_memset (dest, c, n); +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ -- 2.49.0