On Mon, Dec 11, 2023 at 5:48 PM Sergei Lewis <sle...@rivosinc.com> wrote:
>
> gcc/ChangeLog
>
>         * config/riscv/riscv-protos.h (riscv_vector::expand_vec_setmem): New
>         function declaration.
>
>         * config/riscv/riscv-string.cc (riscv_vector::expand_vec_setmem): New
>         function: this generates an inline vectorised memory set, if and only
>         if we know the entire operation can be performed in a single vector
>         store
>
>         * config/riscv/riscv.md (setmem<mode>): Try
>         riscv_vector::expand_vec_setmem for constant lengths
>
> gcc/testsuite/ChangeLog
>
>         * gcc.target/riscv/rvv/base/setmem-1.c: New tests
> ---
>  gcc/config/riscv/riscv-protos.h               |  1 +
>  gcc/config/riscv/riscv-string.cc              | 82 +++++++++++++++
>  gcc/config/riscv/riscv.md                     | 14 +++
>  .../gcc.target/riscv/rvv/base/setmem-1.c      | 99 +++++++++++++++++++
>  4 files changed, 196 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 20bbb5b859c..950cb65c910 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -560,6 +560,7 @@ void expand_popcount (rtx *);
>  void expand_rawmemchr (machine_mode, rtx, rtx, rtx, bool = false);
>  bool expand_strcmp (rtx, rtx, rtx, rtx, unsigned HOST_WIDE_INT, bool);
>  void emit_vec_extract (rtx, rtx, poly_int64);
> +bool expand_vec_setmem (rtx, rtx, rtx, rtx);
>
>  /* Rounding mode bitfield for fixed point VXRM.  */
>  enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
> index 11c1f74d0b3..0abbd5f8b28 100644
> --- a/gcc/config/riscv/riscv-string.cc
> +++ b/gcc/config/riscv/riscv-string.cc
> @@ -1247,4 +1247,86 @@ expand_strcmp (rtx result, rtx src1, rtx src2, rtx nbytes,
>    return true;
>  }
>
> +
> +/* Select appropriate LMUL for a single vector operation based on
> +   byte size of data to be processed.
> +   On success, return true and populate lmul_out.
> +   If length_in is too wide for a single vector operation, return false
> +   and leave lmul_out unchanged. */
> +
> +static bool
> +select_appropriate_lmul (HOST_WIDE_INT length_in,
> +                        HOST_WIDE_INT &lmul_out)
> +{
> +  /* if it's tiny, default operation is likely better; maybe worth
> +     considering fractional lmul in the future as well. */
> +  if (length_in < (TARGET_MIN_VLEN/8))
(TARGET_MIN_VLEN / 8) - GNU style wants spaces around binary operators.

> +    return false;
> +
> +  /* find smallest lmul large enough for entire op. */
> +  HOST_WIDE_INT lmul = 1;
> +  while ((lmul <= 8) && (length_in > ((lmul*TARGET_MIN_VLEN)/8)))

Likewise: ((lmul * TARGET_MIN_VLEN) / 8)

> +    {
> +      lmul <<= 1;
> +    }
> +
> +  if (lmul > 8)
> +    return false;
> +
> +  lmul_out = lmul;
> +  return true;
> +}
> +
> +/* Used by setmemdi in riscv.md. */
> +bool
> +expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in,
> +                  rtx alignment_in)
> +{
> +  /* we're generating vector code. */
> +  if (!TARGET_VECTOR)
> +    return false;
> +  /* if we can't reason about the length, let libc handle the operation. */
> +  if (!CONST_INT_P (length_in))
> +    return false;
> +
> +  HOST_WIDE_INT length = INTVAL (length_in);
> +  HOST_WIDE_INT lmul;
> +
> +  /* select an lmul such that the data just fits into one vector operation;
> +     bail if we can't. */
> +  if (!select_appropriate_lmul (length, lmul))
> +    return false;
> +
> +  machine_mode vmode = riscv_vector::get_vector_mode (QImode,
> +      BYTES_PER_RISCV_VECTOR * lmul).require ();
> +  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
> +  rtx dst = change_address (dst_in, vmode, dst_addr);
> +
> +  rtx fill_value = gen_reg_rtx (vmode);
> +  rtx broadcast_ops[] = {fill_value, fill_value_in};
> +
> +  /* If the length is exactly vlmax for the selected mode, do that.
> +     Otherwise, use a predicated store. */
> +  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
> +    {
> +      emit_vlmax_insn (code_for_pred_broadcast (vmode),
> +                      UNARY_OP, broadcast_ops);
> +      emit_move_insn (dst, fill_value);
> +    }
> +  else
> +    {
> +      if (!satisfies_constraint_K (length_in))
> +       length_in= force_reg (Pmode, length_in);
> +      emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
> +                         broadcast_ops, length_in);
> +      machine_mode mask_mode = riscv_vector::get_vector_mode
> +         (BImode, GET_MODE_NUNITS (vmode)).require ();
> +      rtx mask = CONSTM1_RTX (mask_mode);
> +      emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in,
> +                                get_avl_type_rtx (riscv_vector::NONVLMAX)));
> +    }
> +
> +  return true;
> +}
> +
>  }
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 88fde290a8a..29d3b1aa342 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -2381,6 +2381,20 @@
>      FAIL;
>  })
>
> +(define_expand "setmemsi"
> +  [(set (match_operand:BLK 0 "memory_operand")     ;; Dest
> +       (match_operand:QI 2 "nonmemory_operand"))  ;; Value
> +   (use (match_operand:SI 1 "const_int_operand")) ;; Length
> +   (match_operand:SI 3 "const_int_operand")]      ;; Align
> +  "TARGET_VECTOR"
> +{
> +  if (riscv_vector::expand_vec_setmem (operands[0], operands[1], operands[2],
> +                                      operands[3]))
> +    DONE;
> +  else
> +    FAIL;
> +})
> +
>  ;; Expand in-line code to clear the instruction cache between operand[0] and
>  ;; operand[1].
>  (define_expand "clear_cache"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
> new file mode 100644
> index 00000000000..d1a5ff002a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
> @@ -0,0 +1,99 @@
> +/* { dg-do compile } */
> +/* { dg-add-options riscv_v } */
> +/* { dg-additional-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <string.h>

Drop this to prevent multilib testing issues.
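Since every memset call below should become __builtin_memset anyway (see
the comments that follow), the builtin needs no declaration and the test
stays self-contained without the header.  A rough sketch of the first
test with both changes applied (untested):

  void * f1 (void *a, int const b)
  {
    /* The builtin requires no #include, avoiding the multilib issue.  */
    return __builtin_memset (a, b, 1);
  }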
> +
> +#define MIN_VECTOR_BYTES (__riscv_v_min_vlen/8)
> +
> +/* tiny memsets should use scalar ops
> +** f1:
> +** sb\s+a1,0\(a0\)
> +** ret
> +*/
> +void * f1 (void *a, int const b)
> +{
> +  return memset (a, b, 1);

Use __builtin_memset instead of memset here.

> +}
> +
> +/* tiny memsets should use scalar ops
> +** f2:
> +** sb\s+a1,0\(a0\)
> +** sb\s+a1,1\(a0\)
> +** ret
> +*/
> +void * f2 (void *a, int const b)
> +{
> +  return memset (a, b, 2);

Ditto.

> +}
> +
> +/* tiny memsets should use scalar ops
> +** f3:
> +** sb\s+a1,0\(a0\)
> +** sb\s+a1,1\(a0\)
> +** sb\s+a1,2\(a0\)
> +** ret
> +*/
> +void * f3 (void *a, int const b)
> +{
> +  return memset (a, b, 3);

Ditto.

> +}
> +
> +/* vectorise+inline minimum vector register width with LMUL=1
> +** f4:
> +** (
> +** vsetivli\s+zero,\d+,e8,m1,ta,ma
> +** |
> +** li\s+a\d+,\d+
> +** vsetvli\s+zero,a\d+,e8,m1,ta,ma
> +** )
> +** vmv\.v\.x\s+v\d+,a1
> +** vse8\.v\s+v\d+,0\(a0\)
> +** ret
> +*/
> +void * f4 (void *a, int const b)
> +{
> +  return memset (a, b, MIN_VECTOR_BYTES);

Ditto.

> +}
> +
> +/* vectorised code should use smallest lmul known to fit length
> +** f5:
> +** (
> +** vsetivli\s+zero,\d+,e8,m2,ta,ma
> +** |
> +** li\s+a\d+,\d+
> +** vsetvli\s+zero,a\d+,e8,m2,ta,ma
> +** )
> +** vmv\.v\.x\s+v\d+,a1
> +** vse8\.v\s+v\d+,0\(a0\)
> +** ret
> +*/
> +void * f5 (void *a, int const b)
> +{
> +  return memset (a, b, MIN_VECTOR_BYTES+1);

Ditto.

> +}
> +
> +/* vectorise+inline up to LMUL=8
> +** f6:
> +** li\s+a\d+,\d+
> +** vsetvli\s+zero,a\d+,e8,m8,ta,ma
> +** vmv\.v\.x\s+v\d+,a1
> +** vse8\.v\s+v\d+,0\(a0\)
> +** ret
> +*/
> +void * f6 (void *a, int const b)
> +{
> +  return memset (a, b, MIN_VECTOR_BYTES*8);

Ditto.

> +}
> +
> +/* don't vectorise if the move is too large for one operation
> +** f7:
> +** li\s+a2,\d+
> +** tail\s+memset
> +*/
> +void * f7 (void *a, int const b)
> +{
> +  return memset (a, b, MIN_VECTOR_BYTES*8+1);

Ditto.

> +}
> +
> --
> 2.34.1
>
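FWIW, the size thresholds these tests exercise work out as follows,
assuming the minimum VLEN of 128 implied by the riscv_v options (so
MIN_VECTOR_BYTES == 16): f1-f3 stay scalar below one vector register;
f4's 16 bytes exactly fill an m1 register; f5's 17 bytes need m2; f6's
128 bytes just fit in m8; and f7's 129 bytes exceed a single m8 store,
so the expander FAILs and the call tails to the libc memset - matching
the bounds of the lmul selection loop above.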