The problem here is that aarch64_expand_setmem does not take the alignment into account for the strict-alignment case. This is version 4 of the fix; the major change from the last version is in how store pairs are handled, which now allows storing two SImode values at a time. This version also adds a testcase showing that with -mstrict-align we can still emit store word pair stores.
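To illustrate the new size selection, here is a small example of my own (the names a, f, t are illustrative, and the assembly is hand-written to show the expected shape of the expansion, not copied from a build): with -mstrict-align and a destination known to be only 4-byte aligned, copy_limit is restricted to SImode, the new SImode store-pair support applies, and max_set_size becomes (32 * 2 * 8) / 8 = 64 bytes. So a 24-byte memset of a word-aligned array can now expand inline to three SImode store pairs instead of falling back to a libcall:

int a[6];

void
f (int t)
{
  __builtin_memset (a, t, 6 * sizeof (int));
}

/* Expected shape at -Os -mstrict-align (register allocation illustrative):
     dup  v0.16b, w0
     adrp x1, a
     add  x1, x1, :lo12:a
     stp  s0, s0, [x1]
     stp  s0, s0, [x1, 8]
     stp  s0, s0, [x1, 16]
     ret  */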
OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.

	PR target/103100

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_gen_store_pair): Add support
	for SImode.
	(aarch64_set_one_block_and_progress_pointer): Add use_pairs
	argument and rewrite to simplify the code.
	(aarch64_can_use_pair_load_stores): New function.
	(aarch64_expand_setmem): Rewrite mode selection to better handle
	strict alignment and the case where ldp/stp pairs are not used.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/memset-strict-align-1.c: Update test.
	Reduce the size to 207 bytes and make s1 global and 16-byte
	aligned.
	* gcc.target/aarch64/memset-strict-align-2.c: New test.
	* gcc.target/aarch64/memset-strict-align-3.c: New test.
---
 gcc/config/aarch64/aarch64.cc                 | 136 ++++++++++--------
 .../aarch64/memset-strict-align-1.c           |  19 ++-
 .../aarch64/memset-strict-align-2.c           |  14 ++
 .../aarch64/memset-strict-align-3.c           |  15 ++
 4 files changed, 113 insertions(+), 71 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5c40b6ed22a..3eaf9bd608a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8850,6 +8850,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
 {
   switch (mode)
     {
+    case E_SImode:
+      return gen_store_pair_sw_sisi (mem1, reg1, mem2, reg2);
+
     case E_DImode:
       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
 
@@ -24896,42 +24899,49 @@ aarch64_expand_cpymem (rtx *operands)
    SRC is a register we have created with the duplicated value to be set.  */
 static void
 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-					    machine_mode mode)
+					    machine_mode mode, bool use_pairs)
 {
+  rtx reg = src;
   /* If we are copying 128bits or 256bits, we can do that straight from
      the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-					 aarch64_progress_pointer (*dst), src));
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
   if (known_eq (GET_MODE_BITSIZE (mode), 128))
-    {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
-      return;
-    }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
+    mode = GET_MODE (src);
+  else
+    /* For copying less, we have to extract the right amount from src.  */
+    reg = lowpart_subreg (mode, src, GET_MODE (src));
 
   /* "Cast" the *dst to the correct mode.  */
   *dst = adjust_address (*dst, mode, 0);
   /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
+  if (use_pairs)
+    emit_insn (aarch64_gen_store_pair (mode, *dst, reg,
+				       aarch64_progress_pointer (*dst),
+				       reg));
+  else
+    emit_move_insn (*dst, reg);
+
   /* Move the pointer forward.  */
   *dst = aarch64_progress_pointer (*dst);
+  if (use_pairs)
+    *dst = aarch64_progress_pointer (*dst);
+}
+
+/* Return true if a block of SIZE bits can be set using a store/load pair.
+   This is a helper function for aarch64_expand_setmem and others.  */
+static bool
+aarch64_can_use_pair_load_stores (unsigned HOST_WIDE_INT size)
+{
+  /* For DI and SI modes, we can use store pairs.  */
+  if (size == GET_MODE_BITSIZE (DImode)
+      || size == GET_MODE_BITSIZE (SImode))
+    return true;
+  /* For TI mode, we will use store pairs only if
+     the target wants to.  */
+  else if (size == GET_MODE_BITSIZE (TImode)
+	   && !(aarch64_tune_params.extra_tuning_flags
+		& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    return true;
+  return false;
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -24974,9 +24984,21 @@ aarch64_expand_setmem (rtx *operands)
 
   bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Default the maximum to 256-bytes when considering only libcall vs
-     SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  /* Maximum amount to copy in one go (without taking store pairs
+     into account).  */
+  int copy_limit = GET_MODE_BITSIZE (TImode);
+
+  /* For the strict-alignment case, restrict the copy limit to the
+     compile-time-known alignment of the memory.  */
+  if (STRICT_ALIGNMENT)
+    copy_limit = MIN (copy_limit, (int) MEM_ALIGN (dst));
+
+  bool use_pairs = aarch64_can_use_pair_load_stores (copy_limit);
+
+  /* The max set size is 8 instructions of copy_limit-sized stores
+     (also taking into account whether pair stores are used);
+     for the non-strict-alignment case this is 256 bytes.  */
+  unsigned max_set_size;
+  max_set_size = (copy_limit * (use_pairs ? 2 : 1) * 8) / BITS_PER_UNIT;
 
   len = INTVAL (operands[1]);
   if (len > max_set_size && !TARGET_MOPS)
@@ -24990,8 +25012,10 @@
      (zero constants can use XZR directly).  */
   unsigned mops_cost = 3 + 1 + cst_val;
   /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
+     the arguments + 1 for the call.
+     When not optimizing for size, the cost of doing a libcall
+     is assumed to be 17 instructions.  */
+  unsigned libcall_cost = size_p ? 4 : 17;
 
   /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
      when available.  */
@@ -25001,12 +25025,12 @@
 
   /* Attempt a sequence with a vector broadcast followed by stores.
      Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
+     against the alternatives.  A simple counter inlined_ops on the
     algorithmically-relevant operations is used rather than an rtx_insn count
     as all the pointer adjusmtents and mode reinterprets will be optimized
     away later.  */
   start_sequence ();
-  unsigned simd_ops = 0;
+  unsigned inlined_ops = 0;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -25014,16 +25038,10 @@
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  simd_ops++;
+  inlined_ops++;
 
   /* Convert len to bits to make the rest of the code simpler.  */
   n = len * BITS_PER_UNIT;
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
-			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-			  ? GET_MODE_BITSIZE (TImode) : 256;
-
   while (n > 0)
     {
       /* Find the largest mode in which to do the copy without
@@ -25036,15 +25054,18 @@
       gcc_assert (cur_mode != BLKmode);
 
       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      bool pairs = aarch64_can_use_pair_load_stores (mode_bits);
+      if (n < (mode_bits * 2))
+	pairs = false;
+      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode, pairs);
+      inlined_ops++;
+      n -= mode_bits * (pairs ? 2 : 1);
 
       /* Do certain trailing copies as overlapping if it's going to be
	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
	 byte copy it's more efficient to do two overlapping 8 byte copies than
	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+      if (n > 0 && n < copy_limit && !STRICT_ALIGNMENT)
	{
	  next_mode = smallest_mode_for_size (n, MODE_INT);
	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
@@ -25056,24 +25077,17 @@
   rtx_insn *seq = get_insns ();
   end_sequence ();
 
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-	 call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-	  && mops_cost <= libcall_cost
-	  && mops_cost <= simd_ops)
-	return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-	 sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-	return false;
-      emit_insn (seq);
-      return true;
-    }
+  /* When optimizing for size we have 3 options: the inlined sequence,
+     a call to memset or the MOPS expansion.  */
+  if (size_p
+      && TARGET_MOPS
+      && mops_cost <= libcall_cost
+      && mops_cost <= inlined_ops)
+    return aarch64_expand_setmem_mops (operands);
+  /* Pick a libcall if the inlined sequence is too long.  */
+  else if (libcall_cost < inlined_ops)
+    return false;
 
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
   emit_insn (seq);
   return true;
 }
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
index 664d43aee13..63c864b25b0 100644
--- a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
@@ -1,28 +1,27 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mstrict-align" } */
 
-struct s { char x[255]; };
+struct s { char x[207]; };
+struct s s1 __attribute__((aligned(16)));
 void foo (struct s *);
-void bar (void) { struct s s1 = {}; foo (&s1); }
+void bar (void) { s1 = (struct s){}; foo (&s1); }
 
-/* memset (s1 = {}, sizeof = 255) should be expanded out
+/* memset (s1 = {}, sizeof = 207) should be expanded out
    such that there are no overlap stores when -mstrict-align
    is in use.
-   so 7 pairs of 16 bytes stores (224 bytes).
-   1 16 byte stores
+   so 6 pairs of 16 byte stores (192 bytes).
   1 8 byte store
   1 4 byte store
   1 2 byte store
   1 1 byte store
 */
 
-/* { dg-final { scan-assembler-times "stp\tq" 7 } } */
-/* { dg-final { scan-assembler-times "str\tq" 1 } } */
+/* { dg-final { scan-assembler-times "stp\tq" 6 } } */
+/* { dg-final { scan-assembler-times "str\tq" 0 } } */
 /* { dg-final { scan-assembler-times "str\txzr" 1 } } */
 /* { dg-final { scan-assembler-times "str\twzr" 1 } } */
 /* { dg-final { scan-assembler-times "strh\twzr" 1 } } */
 /* { dg-final { scan-assembler-times "strb\twzr" 1 } } */
 
-/* Also one store pair for the frame-pointer and the LR.  */
-/* { dg-final { scan-assembler-times "stp\tx" 1 } } */
-
+/* No store pair of 8-byte words is needed as foo is called via a sibcall.  */
+/* { dg-final { scan-assembler-times "stp\tx" 0 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
new file mode 100644
index 00000000000..650be86604b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mstrict-align" } */
+
+struct s { char x[7]; };
+void foo (struct s *);
+void bar (struct s *s1) { *s1 = (struct s){}; foo (s1); }
+
+/* memset (s1 = {}, sizeof = 7) should be expanded out
+   such that there are no overlap stores when -mstrict-align
+   is in use.  As the alignment of s1 is unknown, byte stores are needed.
+   so 7 byte stores
+ */
+
+/* { dg-final { scan-assembler-times "strb\twzr" 7 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
new file mode 100644
index 00000000000..09cb9a654dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Os -mstrict-align" } */
+
+int a[2*3];
+int f(int t)
+{
+  __builtin_memset(a, t, 2*3*sizeof(int));
+}
+
+/* memset (a, (val), sizeof = 2*3*4) should be expanded out
+   such that there are no overlap stores when -mstrict-align
+   is in use.  As a is only 4-byte aligned (due to -Os),
+   store word pairs are needed, so 3 stp instructions are used.  */
+
+/* { dg-final { scan-assembler-times "stp\ts" 3 } } */
-- 
2.31.1