On Mon, Dec 11, 2023 at 5:48 PM Sergei Lewis <[email protected]> wrote:
>
> gcc/ChangeLog
>
> * config/riscv/riscv-protos.h (riscv_vector::expand_vec_setmem): New
> function
> declaration.
>
> * config/riscv/riscv-string.cc (riscv_vector::expand_vec_setmem): New
> function: this generates an inline vectorised memory set, if and only if
> we
> know the entire operation can be performed in a single vector store
>
> * config/riscv/riscv.md (setmem<mode>): Try
> riscv_vector::expand_vec_setmem
> for constant lengths
>
> gcc/testsuite/ChangeLog
> * gcc.target/riscv/rvv/base/setmem-1.c: New tests
> ---
> gcc/config/riscv/riscv-protos.h | 1 +
> gcc/config/riscv/riscv-string.cc | 82 +++++++++++++++
> gcc/config/riscv/riscv.md | 14 +++
> .../gcc.target/riscv/rvv/base/setmem-1.c | 99 +++++++++++++++++++
> 4 files changed, 196 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 20bbb5b859c..950cb65c910 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -560,6 +560,7 @@ void expand_popcount (rtx *);
> void expand_rawmemchr (machine_mode, rtx, rtx, rtx, bool = false);
> bool expand_strcmp (rtx, rtx, rtx, rtx, unsigned HOST_WIDE_INT, bool);
> void emit_vec_extract (rtx, rtx, poly_int64);
> +bool expand_vec_setmem (rtx, rtx, rtx, rtx);
>
> /* Rounding mode bitfield for fixed point VXRM. */
> enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-string.cc
> b/gcc/config/riscv/riscv-string.cc
> index 11c1f74d0b3..0abbd5f8b28 100644
> --- a/gcc/config/riscv/riscv-string.cc
> +++ b/gcc/config/riscv/riscv-string.cc
> @@ -1247,4 +1247,86 @@ expand_strcmp (rtx result, rtx src1, rtx src2, rtx
> nbytes,
> return true;
> }
>
> +
> +/* Select appropriate LMUL for a single vector operation based on
> + byte size of data to be processed.
> + On success, return true and populate lmul_out.
> + If length_in is too wide for a single vector operation, return false
> + and leave lmul_out unchanged. */
> +
> +static bool
> +select_appropriate_lmul (HOST_WIDE_INT length_in,
> + HOST_WIDE_INT &lmul_out)
> +{
> + /* if it's tiny, default operation is likely better; maybe worth
> + considering fractional lmul in the future as well. */
> + if (length_in < (TARGET_MIN_VLEN/8))
Formatting nit: add spaces around the division operator, i.e. (TARGET_MIN_VLEN / 8).
> + return false;
> +
> + /* find smallest lmul large enough for entire op. */
> + HOST_WIDE_INT lmul = 1;
> + while ((lmul <= 8) && (length_in > ((lmul*TARGET_MIN_VLEN)/8)))
Formatting nit here as well: add spaces around the operators, i.e. ((lmul * TARGET_MIN_VLEN) / 8).
> + {
> + lmul <<= 1;
> + }
> +
> + if (lmul > 8)
> + return false;
> +
> + lmul_out = lmul;
> + return true;
> +}
> +
> +/* Used by setmemdi in riscv.md. */
> +bool
> +expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in,
> + rtx alignment_in)
> +{
> + /* we're generating vector code. */
> + if (!TARGET_VECTOR)
> + return false;
> + /* if we can't reason about the length, let libc handle the operation. */
> + if (!CONST_INT_P (length_in))
> + return false;
> +
> + HOST_WIDE_INT length = INTVAL (length_in);
> + HOST_WIDE_INT lmul;
> +
> + /* select an lmul such that the data just fits into one vector operation;
> + bail if we can't. */
> + if (!select_appropriate_lmul (length, lmul))
> + return false;
> +
> + machine_mode vmode = riscv_vector::get_vector_mode (QImode,
> + BYTES_PER_RISCV_VECTOR * lmul).require ();
> + rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
> + rtx dst = change_address (dst_in, vmode, dst_addr);
> +
> + rtx fill_value = gen_reg_rtx (vmode);
> + rtx broadcast_ops[] = {fill_value, fill_value_in};
> +
> + /* If the length is exactly vlmax for the selected mode, do that.
> + Otherwise, use a predicated store. */
> + if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
> + {
> + emit_vlmax_insn (code_for_pred_broadcast (vmode),
> + UNARY_OP, broadcast_ops);
> + emit_move_insn (dst, fill_value);
> + }
> + else
> + {
> + if (!satisfies_constraint_K (length_in))
> + length_in= force_reg (Pmode, length_in);
> + emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
> + broadcast_ops, length_in);
> + machine_mode mask_mode = riscv_vector::get_vector_mode
> + (BImode, GET_MODE_NUNITS (vmode)).require ();
> + rtx mask = CONSTM1_RTX (mask_mode);
> + emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in,
> + get_avl_type_rtx (riscv_vector::NONVLMAX)));
> + }
> +
> + return true;
> +}
> +
> }
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 88fde290a8a..29d3b1aa342 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -2381,6 +2381,20 @@
> FAIL;
> })
>
> +(define_expand "setmemsi"
> + [(set (match_operand:BLK 0 "memory_operand") ;; Dest
> + (match_operand:QI 2 "nonmemory_operand")) ;; Value
> + (use (match_operand:SI 1 "const_int_operand")) ;; Length
> + (match_operand:SI 3 "const_int_operand")] ;; Align
> + "TARGET_VECTOR"
> +{
> + if (riscv_vector::expand_vec_setmem (operands[0], operands[1], operands[2],
> + operands[3]))
> + DONE;
> + else
> + FAIL;
> +})
> +
> ;; Expand in-line code to clear the instruction cache between operand[0] and
> ;; operand[1].
> (define_expand "clear_cache"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
> b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
> new file mode 100644
> index 00000000000..d1a5ff002a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
> @@ -0,0 +1,99 @@
> +/* { dg-do compile } */
> +/* { dg-add-options riscv_v } */
> +/* { dg-additional-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <string.h>
Drop this to prevent multilib testing issues.
> +
> +#define MIN_VECTOR_BYTES (__riscv_v_min_vlen/8)
> +
> +/* tiny memsets should use scalar ops
> +** f1:
> +** sb\s+a1,0\(a0\)
> +** ret
> +*/
> +void * f1 (void *a, int const b)
> +{
> + return memset (a, b, 1);
Use __builtin_memset instead of memset here.
> +}
> +
> +/* tiny memsets should use scalar ops
> +** f2:
> +** sb\s+a1,0\(a0\)
> +** sb\s+a1,1\(a0\)
> +** ret
> +*/
> +void * f2 (void *a, int const b)
> +{
> + return memset (a, b, 2);
Ditto.
> +}
> +
> +/* tiny memsets should use scalar ops
> +** f3:
> +** sb\s+a1,0\(a0\)
> +** sb\s+a1,1\(a0\)
> +** sb\s+a1,2\(a0\)
> +** ret
> +*/
> +void * f3 (void *a, int const b)
> +{
> + return memset (a, b, 3);
Ditto.
> +}
> +
> +/* vectorise+inline minimum vector register width with LMUL=1
> +** f4:
> +** (
> +** vsetivli\s+zero,\d+,e8,m1,ta,ma
> +** |
> +** li\s+a\d+,\d+
> +** vsetvli\s+zero,a\d+,e8,m1,ta,ma
> +** )
> +** vmv\.v\.x\s+v\d+,a1
> +** vse8\.v\s+v\d+,0\(a0\)
> +** ret
> +*/
> +void * f4 (void *a, int const b)
> +{
> + return memset (a, b, MIN_VECTOR_BYTES);
Ditto.
> +}
> +
> +/* vectorised code should use smallest lmul known to fit length
> +** f5:
> +** (
> +** vsetivli\s+zero,\d+,e8,m2,ta,ma
> +** |
> +** li\s+a\d+,\d+
> +** vsetvli\s+zero,a\d+,e8,m2,ta,ma
> +** )
> +** vmv\.v\.x\s+v\d+,a1
> +** vse8\.v\s+v\d+,0\(a0\)
> +** ret
> +*/
> +void * f5 (void *a, int const b)
> +{
> + return memset (a, b, MIN_VECTOR_BYTES+1);
Ditto.
> +}
> +
> +/* vectorise+inline up to LMUL=8
> +** f6:
> +** li\s+a\d+,\d+
> +** vsetvli\s+zero,a\d+,e8,m8,ta,ma
> +** vmv\.v\.x\s+v\d+,a1
> +** vse8\.v\s+v\d+,0\(a0\)
> +** ret
> +*/
> +void * f6 (void *a, int const b)
> +{
> + return memset (a, b, MIN_VECTOR_BYTES*8);
Ditto.
> +}
> +
> +/* don't vectorise if the move is too large for one operation
> +** f7:
> +** li\s+a2,\d+
> +** tail\s+memset
> +*/
> +void * f7 (void *a, int const b)
> +{
> + return memset (a, b, MIN_VECTOR_BYTES*8+1);
Ditto.
> +}
> +
> --
> 2.34.1
>