https://gcc.gnu.org/g:f2009b5cb0d0c18e19c0e2badfea5777fb7cbcae
commit f2009b5cb0d0c18e19c0e2badfea5777fb7cbcae
Author: Craig Blackmore <craig.blackm...@embecosm.com>
Date: Sat Oct 19 07:08:31 2024 -0600

[PATCH 7/7] RISC-V: Disable by pieces for vector setmem length > UNITS_PER_WORD

For fast unaligned access targets, by pieces uses up to UNITS_PER_WORD size pieces resulting in more store instructions than needed. For example gcc.target/riscv/rvv/base/setmem-1.c:f1 built with `-O3 -march=rv64gcv -mtune=thead-c906`:

```
f1:
    vsetivli zero,8,e8,mf2,ta,ma
    vmv.v.x v1,a1
    vsetivli zero,0,e32,mf2,ta,ma
    sb a1,14(a0)
    vmv.x.s a4,v1
    vsetivli zero,8,e16,m1,ta,ma
    vmv.x.s a5,v1
    vse8.v v1,0(a0)
    sw a4,8(a0)
    sh a5,12(a0)
    ret
```

The slow unaligned access version built with `-O3 -march=rv64gcv` used 15 sb instructions:

```
f1:
    sb a1,0(a0)
    sb a1,1(a0)
    sb a1,2(a0)
    sb a1,3(a0)
    sb a1,4(a0)
    sb a1,5(a0)
    sb a1,6(a0)
    sb a1,7(a0)
    sb a1,8(a0)
    sb a1,9(a0)
    sb a1,10(a0)
    sb a1,11(a0)
    sb a1,12(a0)
    sb a1,13(a0)
    sb a1,14(a0)
    ret
```

After this patch, the following is generated in both cases:

```
f1:
    vsetivli zero,15,e8,m1,ta,ma
    vmv.v.x v1,a1
    vse8.v v1,0(a0)
    ret
```

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_use_by_pieces_infrastructure_p): New function.
(TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Define.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr113469.c: Expect mf2 setmem.
* gcc.target/riscv/rvv/base/setmem-2.c: Update f1 to expect straight-line vector memset.
* gcc.target/riscv/rvv/base/setmem-3.c: Likewise.
(cherry picked from commit 72ceddbfb78dbb95f0808c3eca1765e8cd48b023) Diff: --- gcc/config/riscv/riscv.cc | 19 +++++++++++++++++++ gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c | 3 ++- gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c | 12 +++++++----- gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c | 12 +++++++----- 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 2f7250a0cf60..e4dcc3a0ba7f 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -12570,6 +12570,22 @@ riscv_stack_clash_protection_alloca_probe_range (void) return STACK_CLASH_CALLER_GUARD; } +static bool +riscv_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, + unsigned alignment, + enum by_pieces_operation op, bool speed_p) +{ + /* For set/clear with size > UNITS_PER_WORD, by pieces uses vector broadcasts + with UNITS_PER_WORD size pieces. Use setmem<mode> instead which can use + bigger chunks. */ + if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR + && (op == CLEAR_BY_PIECES || op == SET_BY_PIECES) + && speed_p && size > UNITS_PER_WORD) + return false; + + return default_use_by_pieces_infrastructure_p (size, alignment, op, speed_p); +} + /* Initialize the GCC target structure. 
*/ #undef TARGET_ASM_ALIGNED_HI_OP #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" @@ -12932,6 +12948,9 @@ riscv_stack_clash_protection_alloca_probe_range (void) #undef TARGET_GET_RAW_RESULT_MODE #define TARGET_GET_RAW_RESULT_MODE riscv_get_raw_result_mode +#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P +#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P riscv_use_by_pieces_infrastructure_p + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-riscv.h" diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c index d1c118c02d6e..f86084bdb40f 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c @@ -51,4 +51,5 @@ void p(int buf, __builtin_va_list ab, int q) { } while (k); } -/* { dg-final { scan-assembler-times {vsetivli\tzero,\s*4,\s*e8,\s*mf4,\s*t[au],\s*m[au]} 2 } } */ +/* { dg-final { scan-assembler-times {vsetivli\tzero,\s*4,\s*e8,\s*mf4,\s*t[au],\s*m[au]} 1 } } */ +/* { dg-final { scan-assembler-times {vsetivli\tzero,\s*8,\s*e8,\s*mf2,\s*t[au],\s*m[au]} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c index faea442a4bdc..838fbebadff3 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c @@ -5,15 +5,17 @@ #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8) -/* Small memsets shouldn't be vectorised. +/* Vectorise with no loop. ** f1: ** ( -** sb\s+a1,0\(a0\) -** ... 
+** vsetivli\s+zero,\d+,e8,m1,ta,ma ** | -** li\s+a2,\d+ -** tail\s+memset +** li\s+a\d+,\d+ +** vsetvli\s+zero,a\d+,e8,m1,ta,ma ** ) +** vmv\.v\.x\s+v\d+,a1 +** vse8\.v\s+v\d+,0\(a0\) +** ret */ void * f1 (void *a, int const b) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c index 25be694d248a..02de9a94cc47 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c @@ -5,15 +5,17 @@ #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8) -/* Small memsets shouldn't be vectorised. +/* Vectorise with no loop. ** f1: ** ( -** sb\s+a1,0\(a0\) -** ... +** vsetivli\s+zero,\d+,e8,m1,ta,ma ** | -** li\s+a2,\d+ -** tail\s+memset +** li\s+a\d+,\d+ +** vsetvli\s+zero,a\d+,e8,m1,ta,ma ** ) +** vmv\.v\.x\s+v\d+,a1 +** vse8\.v\s+v\d+,0\(a0\) +** ret */ void * f1 (void *a, int const b)