This patch depends on middle-end patches that are still under review. I will commit it once those middle-end patches are approved.
Consider the following case:

void foo (int * __restrict a, int * __restrict b, int stride, int n)
{
  for (int i = 0; i < n; i++)
    a[i*stride] = b[i*stride] + 100;
}

Before this patch:

	slli	a6,a2,2
	vid.v	v1
	vmul.vx	v1,v1,a2
	vsetvli	zero,zero,e64,m2,ta,ma
	vsext.vf2	v4,v1
	vsll.vi	v4,v4,2
.L4:
	vsetvli	a5,a3,e32,m1,ta,ma
	mul	a4,a6,a5
	vluxei64.v	v1,(a1),v4
	sub	a3,a3,a5
	vadd.vv	v1,v1,v2
	vsuxei64.v	v1,(a0),v4
	add	a1,a1,a4
	add	a0,a0,a4
	bne	a3,zero,.L4
	ret

After this patch:

	slli	a6,a2,2
	mv	a4,a6
.L4:
	vsetvli	a5,a3,e32,m1,ta,ma
	mul	a2,a6,a5
	vlse32.v	v1,0(a1),a4
	sub	a3,a3,a5
	vadd.vv	v1,v1,v2
	vsse32.v	v1,0(a0),a4
	add	a1,a1,a2
	add	a0,a0,a2
	bne	a3,zero,.L4
	ret

gcc/ChangeLog:

	* config/riscv/autovec.md (mask_len_strided_load<V:mode><ANYI:mode>): New pattern.
	(mask_len_strided_store<V:mode><ANYI:mode>): Ditto.
	* config/riscv/predicates.md (vector_stride_extension_operand): New predicate.
	* config/riscv/riscv-protos.h (expand_strided_load_store): New function.
	* config/riscv/riscv-v.cc (expand_strided_load_store): Ditto.
	* config/riscv/vector-iterators.md: New attribute.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-1.c: Adapt test.
	* gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-2.c: Ditto.
	* gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-1.c: Ditto.
	* gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-2.c: Ditto.
	* gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load-1.c: New test.
	* gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load_run-1.c: New test.
	* gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store-1.c: New test.
	* gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store_run-1.c: New test.
	* gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-3.c: New test.
	* gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-3.c: New test.
	* gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-3.c: New test.
--- gcc/config/riscv/autovec.md | 34 +++++++ gcc/config/riscv/predicates.md | 9 ++ gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 76 +++++++++++++++ gcc/config/riscv/vector-iterators.md | 5 + .../gather-scatter/mask_strided_load-1.c | 47 +++++++++ .../gather-scatter/mask_strided_load_run-1.c | 97 +++++++++++++++++++ .../gather-scatter/mask_strided_store-1.c | 48 +++++++++ .../gather-scatter/mask_strided_store_run-1.c | 89 +++++++++++++++++ .../autovec/gather-scatter/strided_load-1.c | 2 +- .../autovec/gather-scatter/strided_load-2.c | 2 +- .../autovec/gather-scatter/strided_load-3.c | 45 +++++++++ .../gather-scatter/strided_load_run-3.c | 84 ++++++++++++++++ .../autovec/gather-scatter/strided_store-1.c | 2 +- .../autovec/gather-scatter/strided_store-2.c | 2 +- .../autovec/gather-scatter/strided_store-3.c | 45 +++++++++ 16 files changed, 584 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load_run-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store_run-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-3.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-3.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-3.c diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index f5e3e347ace..3e4493c42cc 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -272,6 +272,40 @@ DONE; }) +;; ========================================================================= +;; == Strided Load/Store +;; ========================================================================= + 
+(define_expand "mask_len_strided_load<V:mode><ANYI:mode>" + [(match_operand:V 0 "register_operand") + (match_operand 1 "pmode_reg_or_0_operand") + (match_operand:ANYI 2 "register_operand") + (match_operand 3 "<stride_extension>") + (match_operand 4 "<gs_scale>") + (match_operand:<VM> 5 "vector_mask_operand") + (match_operand 6 "autovec_length_operand") + (match_operand 7 "const_0_operand")] + "TARGET_VECTOR" +{ + riscv_vector::expand_strided_load_store (<V:MODE>mode, operands, true); + DONE; +}) + +(define_expand "mask_len_strided_store<V:mode><ANYI:mode>" + [(match_operand 0 "pmode_reg_or_0_operand") + (match_operand:ANYI 1 "register_operand") + (match_operand 2 "<stride_extension>") + (match_operand 3 "<gs_scale>") + (match_operand:V 4 "register_operand") + (match_operand:<VM> 5 "vector_mask_operand") + (match_operand 6 "autovec_length_operand") + (match_operand 7 "const_0_operand")] + "TARGET_VECTOR" +{ + riscv_vector::expand_strided_load_store (<V:MODE>mode, operands, false); + DONE; +}) + ;; ========================================================================= ;; == Array Load/Store ;; ========================================================================= diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index a37d035fa61..719c2e70229 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -489,6 +489,15 @@ (and (match_operand 0 "const_0_operand") (match_test "Pmode == SImode")))) +;; The byte stride of vlse/vsse is a signed 32-bit value on RV32, so +;; forbid strided load/store vectorization for unsigned strides there, +;; since they may overflow the 32-bit address calculation. 
+(define_predicate "vector_stride_extension_operand" + (ior (and (match_operand 0 "immediate_operand") + (match_test "Pmode == DImode")) + (and (match_operand 0 "const_0_operand") + (match_test "Pmode == SImode")))) + (define_predicate "vector_gs_scale_operand_16_rv32" (and (match_code "const_int") (match_test "INTVAL (op) == 1 diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index fde360327c1..02056591ec6 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -536,6 +536,7 @@ void expand_vec_init (rtx, rtx); void expand_vec_perm (rtx, rtx, rtx, rtx); void expand_select_vl (rtx *); void expand_load_store (rtx *, bool); +void expand_strided_load_store (machine_mode, rtx *, bool); void expand_gather_scatter (rtx *, bool); void expand_cond_len_ternop (unsigned, rtx *); void prepare_ternary_operands (rtx *); diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 3a49ae76426..668f3cd706b 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -3202,6 +3202,82 @@ expand_load_store (rtx *ops, bool is_load) } } +/* Expand MASK_LEN_STRIDED_{LOAD,STORE}. */ +void +expand_strided_load_store (machine_mode mode, rtx *ops, bool is_load) +{ + rtx ptr, stride, vec_reg; + bool zero_extend_p; + int scale_log2; + rtx mask = ops[5]; + rtx len = ops[6]; + poly_int64 value; + if (is_load) + { + vec_reg = ops[0]; + ptr = ops[1]; + stride = ops[2]; + zero_extend_p = INTVAL (ops[3]); + scale_log2 = exact_log2 (INTVAL (ops[4])); + } + else + { + vec_reg = ops[4]; + ptr = ops[0]; + stride = ops[1]; + zero_extend_p = INTVAL (ops[2]); + scale_log2 = exact_log2 (INTVAL (ops[3])); + } + + /* Calculate the stride. 
*/ + if (GET_MODE (stride) != Pmode) + { + rtx tmp = gen_reg_rtx (Pmode); + emit_insn ( + gen_extend_insn (tmp, stride, Pmode, GET_MODE (stride), zero_extend_p)); + stride = tmp; + } + if (scale_log2 != 0) + { + stride = expand_binop (Pmode, ashl_optab, stride, + gen_int_mode (scale_log2, Pmode), NULL_RTX, 0, + OPTAB_DIRECT); + } + + if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + { + /* If the length operand is equal to VF, it is VLMAX load/store. */ + if (is_load) + { + rtx m_ops[] = {vec_reg, mask, gen_rtx_MEM (mode, ptr), stride}; + emit_vlmax_insn (code_for_pred_strided_load (mode), BINARY_OP_TAMA, + m_ops); + } + else + { + len = gen_reg_rtx (Pmode); + emit_vlmax_vsetvl (mode, len); + emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, ptr), + mask, stride, vec_reg, len, + get_avl_type_rtx (VLMAX))); + } + } + else + { + if (!satisfies_constraint_K (len)) + len = force_reg (Pmode, len); + if (is_load) + { + rtx m_ops[] = {vec_reg, mask, gen_rtx_MEM (mode, ptr), stride}; + emit_nonvlmax_insn (code_for_pred_strided_load (mode), BINARY_OP_TAMA, + m_ops, len); + } + else + emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, ptr), mask, + stride, vec_reg, len, + get_avl_type_rtx (NONVLMAX))); + } +} /* Return true if the operation is the floating-point operation need FRM. 
*/ static bool diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md index d9b5dec5edb..e18723e4454 100644 --- a/gcc/config/riscv/vector-iterators.md +++ b/gcc/config/riscv/vector-iterators.md @@ -3385,6 +3385,11 @@ (RVVM2DF "immediate_operand") (RVVM1DF "immediate_operand") ]) +(define_mode_attr stride_extension [ + (QI "immediate_operand") (HI "immediate_operand") + (SI "vector_stride_extension_operand") (DI "immediate_operand") +]) + (define_mode_attr gs_scale [ (RVVM8QI "const_1_operand") (RVVM4QI "const_1_operand") (RVVM2QI "const_1_operand") (RVVM1QI "const_1_operand") (RVVMF2QI "const_1_operand") diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load-1.c new file mode 100644 index 00000000000..b2b6a03189d --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load-1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -ffast-math -fdump-tree-optimized-details" } */ + +#include <stdint-gcc.h> + +#ifndef INDEX8 +#define INDEX8 int8_t +#define INDEX16 int16_t +#define INDEX32 int32_t +#define INDEX64 int64_t +#endif + +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + INDEX##BITS stride, DATA_TYPE *restrict cond, \ + INDEX##BITS n) \ + { \ + for (INDEX##BITS i = 0; i < n; ++i) \ + if (cond[i * stride]) \ + dest[i] += src[i * stride]; \ + } + +#define TEST_TYPE(T, DATA_TYPE) \ + T (DATA_TYPE, 8) \ + T (DATA_TYPE, 16) \ + T (DATA_TYPE, 32) \ + T (DATA_TYPE, 64) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, 
 uint32_t) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 132 "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load_run-1.c new file mode 100644 index 00000000000..08e70ad4e44 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_load_run-1.c @@ -0,0 +1,97 @@ +/* { dg-do run { target { riscv_v } } } */ +/* { dg-additional-options "-mcmodel=medany" } */ + +#include "mask_strided_load-1.c" +#include <assert.h> + +int +main (void) +{ + /* FIXME: The purpose of this assembly is to ensure that the vtype register is + initialized before instructions such as vmv1r.v are executed. Otherwise you + will get illegal instruction errors when running with spike+pk. This is an + interim solution to reduce unnecessary failures and a unified solution + will come later. 
*/ + asm volatile("vsetivli x0, 0, e8, m1, ta, ma"); +#define RUN_LOOP(DATA_TYPE, BITS) \ + DATA_TYPE dest_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + DATA_TYPE dest2_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + DATA_TYPE src_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + DATA_TYPE cond_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + INDEX##BITS stride_##DATA_TYPE##_##BITS = (BITS - 3); \ + INDEX##BITS n_##DATA_TYPE##_##BITS = (BITS + 13); \ + for (INDEX##BITS i = 0; \ + i < stride_##DATA_TYPE##_##BITS * n_##DATA_TYPE##_##BITS; i++) \ + { \ + dest_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \ + dest2_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \ + src_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 13 + 9107) & (BITS - 1)); \ + cond_##DATA_TYPE##_##BITS[i] = (DATA_TYPE) (i & 1); \ + } \ + f_##DATA_TYPE##_##BITS (dest_##DATA_TYPE##_##BITS, src_##DATA_TYPE##_##BITS, \ + stride_##DATA_TYPE##_##BITS, \ + cond_##DATA_TYPE##_##BITS, n_##DATA_TYPE##_##BITS); \ + for (int i = 0; i < n_##DATA_TYPE##_##BITS; i++) \ + { \ + if (cond_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS]) \ + assert ( \ + dest_##DATA_TYPE##_##BITS[i] \ + == (dest2_##DATA_TYPE##_##BITS[i] \ + + src_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS])); \ + else \ + assert (dest_##DATA_TYPE##_##BITS[i] \ + == dest2_##DATA_TYPE##_##BITS[i]); \ + } + + RUN_LOOP (int8_t, 8) + RUN_LOOP (uint8_t, 8) + RUN_LOOP (int16_t, 8) + RUN_LOOP (uint16_t, 8) + RUN_LOOP (_Float16, 8) + RUN_LOOP (int32_t, 8) + RUN_LOOP (uint32_t, 8) + RUN_LOOP (float, 8) + RUN_LOOP (int64_t, 8) + RUN_LOOP (uint64_t, 8) + RUN_LOOP (double, 8) + + RUN_LOOP (int8_t, 16) + RUN_LOOP (uint8_t, 16) + RUN_LOOP (int16_t, 16) + RUN_LOOP (uint16_t, 16) + RUN_LOOP (_Float16, 16) + RUN_LOOP (int32_t, 16) + RUN_LOOP (uint32_t, 16) + RUN_LOOP (float, 16) + RUN_LOOP (int64_t, 16) + RUN_LOOP (uint64_t, 16) + RUN_LOOP (double, 16) + + RUN_LOOP (int8_t, 32) + 
RUN_LOOP (uint8_t, 32) + RUN_LOOP (int16_t, 32) + RUN_LOOP (uint16_t, 32) + RUN_LOOP (_Float16, 32) + RUN_LOOP (int32_t, 32) + RUN_LOOP (uint32_t, 32) + RUN_LOOP (float, 32) + RUN_LOOP (int64_t, 32) + RUN_LOOP (uint64_t, 32) + RUN_LOOP (double, 32) + + RUN_LOOP (int8_t, 64) + RUN_LOOP (uint8_t, 64) + RUN_LOOP (int16_t, 64) + RUN_LOOP (uint16_t, 64) + RUN_LOOP (_Float16, 64) + RUN_LOOP (int32_t, 64) + RUN_LOOP (uint32_t, 64) + RUN_LOOP (float, 64) + RUN_LOOP (int64_t, 64) + RUN_LOOP (uint64_t, 64) + RUN_LOOP (double, 64) + return 0; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store-1.c new file mode 100644 index 00000000000..a832af2ba57 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store-1.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -ffast-math -fdump-tree-optimized-details" } */ + +#include <stdint-gcc.h> + +#ifndef INDEX8 +#define INDEX8 int8_t +#define INDEX16 int16_t +#define INDEX32 int32_t +#define INDEX64 int64_t +#endif + +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + INDEX##BITS stride, DATA_TYPE *restrict cond, \ + INDEX##BITS n) \ + { \ + for (INDEX##BITS i = 0; i < n; ++i) \ + if (cond[i * stride]) \ + dest[i * stride] = src[i] + BITS; \ + } + +#define TEST_TYPE(T, DATA_TYPE) \ + T (DATA_TYPE, 8) \ + T (DATA_TYPE, 16) \ + T (DATA_TYPE, 32) \ + T (DATA_TYPE, 64) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) \ + 
TEST_TYPE (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_STORE" 66 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 66 "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \.SCATTER_STORE" "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \.MASK_SCATTER_STORE" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store_run-1.c new file mode 100644 index 00000000000..58956bd6925 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/mask_strided_store_run-1.c @@ -0,0 +1,89 @@ +/* { dg-do run { target { riscv_v } } } */ + +#include "mask_strided_store-1.c" +#include <assert.h> + +int +main (void) +{ +#define RUN_LOOP(DATA_TYPE, BITS) \ + DATA_TYPE dest_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + DATA_TYPE dest2_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + DATA_TYPE src_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + DATA_TYPE cond_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + INDEX##BITS stride_##DATA_TYPE##_##BITS = (BITS - 3); \ + INDEX##BITS n_##DATA_TYPE##_##BITS = (BITS + 13); \ + for (INDEX##BITS i = 0; \ + i < stride_##DATA_TYPE##_##BITS * n_##DATA_TYPE##_##BITS; i++) \ + { \ + dest_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \ + dest2_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \ + src_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 13 + 9107) & (BITS - 1)); \ + cond_##DATA_TYPE##_##BITS[i] = (DATA_TYPE) (i & 1); \ + } \ + f_##DATA_TYPE##_##BITS (dest_##DATA_TYPE##_##BITS, src_##DATA_TYPE##_##BITS, \ + stride_##DATA_TYPE##_##BITS, \ + cond_##DATA_TYPE##_##BITS, n_##DATA_TYPE##_##BITS); \ + for (int i = 0; i < n_##DATA_TYPE##_##BITS; i++) \ + { \ + if (cond_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS]) \ + assert 
(dest_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS] \ + == (src_##DATA_TYPE##_##BITS[i] + BITS)); \ + else \ + assert ( \ + dest_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS] \ + == dest2_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS]); \ + } + + RUN_LOOP (int8_t, 8) + RUN_LOOP (uint8_t, 8) + RUN_LOOP (int16_t, 8) + RUN_LOOP (uint16_t, 8) + RUN_LOOP (_Float16, 8) + RUN_LOOP (int32_t, 8) + RUN_LOOP (uint32_t, 8) + RUN_LOOP (float, 8) + RUN_LOOP (int64_t, 8) + RUN_LOOP (uint64_t, 8) + RUN_LOOP (double, 8) + + RUN_LOOP (int8_t, 16) + RUN_LOOP (uint8_t, 16) + RUN_LOOP (int16_t, 16) + RUN_LOOP (uint16_t, 16) + RUN_LOOP (_Float16, 16) + RUN_LOOP (int32_t, 16) + RUN_LOOP (uint32_t, 16) + RUN_LOOP (float, 16) + RUN_LOOP (int64_t, 16) + RUN_LOOP (uint64_t, 16) + RUN_LOOP (double, 16) + + RUN_LOOP (int8_t, 32) + RUN_LOOP (uint8_t, 32) + RUN_LOOP (int16_t, 32) + RUN_LOOP (uint16_t, 32) + RUN_LOOP (_Float16, 32) + RUN_LOOP (int32_t, 32) + RUN_LOOP (uint32_t, 32) + RUN_LOOP (float, 32) + RUN_LOOP (int64_t, 32) + RUN_LOOP (uint64_t, 32) + RUN_LOOP (double, 32) + + RUN_LOOP (int8_t, 64) + RUN_LOOP (uint8_t, 64) + RUN_LOOP (int16_t, 64) + RUN_LOOP (uint16_t, 64) + RUN_LOOP (_Float16, 64) + RUN_LOOP (int32_t, 64) + RUN_LOOP (uint32_t, 64) + RUN_LOOP (float, 64) + RUN_LOOP (int64_t, 64) + RUN_LOOP (uint64_t, 64) + RUN_LOOP (double, 64) + return 0; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-1.c index b1e6a17543f..8f383069b0e 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-1.c @@ -40,6 +40,6 @@ TEST_ALL (TEST_LOOP) -/* { dg-final { scan-tree-dump-times " \.MASK_LEN_GATHER_LOAD" 66 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 66 "optimized" } } */ /* { dg-final { scan-tree-dump-not " 
\.GATHER_LOAD" "optimized" } } */ /* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-2.c index 2c9e7dd14a8..d5fcbcbcd66 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-2.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-2.c @@ -40,6 +40,6 @@ TEST_ALL (TEST_LOOP) -/* { dg-final { scan-tree-dump-times " \.MASK_LEN_GATHER_LOAD" 33 "optimized" } } */ +/* FIXME: Currently, we can't lower strided load in loop vectorizer if step is INTEGER_CST. */ /* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "optimized" } } */ /* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-3.c new file mode 100644 index 00000000000..77e34a85575 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load-3.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -ffast-math -fdump-tree-optimized-details" } */ + +#include <stdint-gcc.h> + +#ifndef INDEX8 +#define INDEX8 uint8_t +#define INDEX16 uint16_t +#define INDEX32 uint32_t +#define INDEX64 uint64_t +#endif + +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + INDEX##BITS stride, INDEX##BITS n) \ + { \ + for (INDEX##BITS i = 0; i < n; ++i) \ + dest[i] += src[i * stride]; \ + } + +#define TEST_TYPE(T, DATA_TYPE) \ + T (DATA_TYPE, 8) \ + T (DATA_TYPE, 16) \ + T (DATA_TYPE, 32) \ + T (DATA_TYPE, 64) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) 
\ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 55 "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-3.c new file mode 100644 index 00000000000..2835e502cfa --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-3.c @@ -0,0 +1,84 @@ +/* { dg-do run { target { riscv_v } } } */ + +#include "strided_load-3.c" +#include <assert.h> + +int +main (void) +{ +#define RUN_LOOP(DATA_TYPE, BITS) \ + DATA_TYPE dest_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + DATA_TYPE dest2_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + DATA_TYPE src_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \ + INDEX##BITS stride_##DATA_TYPE##_##BITS = (BITS - 3); \ + INDEX##BITS n_##DATA_TYPE##_##BITS = (BITS + 13); \ + for (INDEX##BITS i = 0; \ + i < stride_##DATA_TYPE##_##BITS * n_##DATA_TYPE##_##BITS; i++) \ + { \ + dest_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \ + dest2_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \ + src_##DATA_TYPE##_##BITS[i] \ + = (DATA_TYPE) ((i * 13 + 9107) & (BITS - 1)); \ + } \ + f_##DATA_TYPE##_##BITS (dest_##DATA_TYPE##_##BITS, src_##DATA_TYPE##_##BITS, \ + stride_##DATA_TYPE##_##BITS, \ + n_##DATA_TYPE##_##BITS); \ + for (int i = 0; i < n_##DATA_TYPE##_##BITS; i++) \ + { \ + assert ( \ + dest_##DATA_TYPE##_##BITS[i] \ + == (dest2_##DATA_TYPE##_##BITS[i] \ + + src_##DATA_TYPE##_##BITS[i * 
stride_##DATA_TYPE##_##BITS])); \ + } + + RUN_LOOP (int8_t, 8) + RUN_LOOP (uint8_t, 8) + RUN_LOOP (int16_t, 8) + RUN_LOOP (uint16_t, 8) + RUN_LOOP (_Float16, 8) + RUN_LOOP (int32_t, 8) + RUN_LOOP (uint32_t, 8) + RUN_LOOP (float, 8) + RUN_LOOP (int64_t, 8) + RUN_LOOP (uint64_t, 8) + RUN_LOOP (double, 8) + + RUN_LOOP (int8_t, 16) + RUN_LOOP (uint8_t, 16) + RUN_LOOP (int16_t, 16) + RUN_LOOP (uint16_t, 16) + RUN_LOOP (_Float16, 16) + RUN_LOOP (int32_t, 16) + RUN_LOOP (uint32_t, 16) + RUN_LOOP (float, 16) + RUN_LOOP (int64_t, 16) + RUN_LOOP (uint64_t, 16) + RUN_LOOP (double, 16) + + RUN_LOOP (int8_t, 32) + RUN_LOOP (uint8_t, 32) + RUN_LOOP (int16_t, 32) + RUN_LOOP (uint16_t, 32) + RUN_LOOP (_Float16, 32) + RUN_LOOP (int32_t, 32) + RUN_LOOP (uint32_t, 32) + RUN_LOOP (float, 32) + RUN_LOOP (int64_t, 32) + RUN_LOOP (uint64_t, 32) + RUN_LOOP (double, 32) + + RUN_LOOP (int8_t, 64) + RUN_LOOP (uint8_t, 64) + RUN_LOOP (int16_t, 64) + RUN_LOOP (uint16_t, 64) + RUN_LOOP (_Float16, 64) + RUN_LOOP (int32_t, 64) + RUN_LOOP (uint32_t, 64) + RUN_LOOP (float, 64) + RUN_LOOP (int64_t, 64) + RUN_LOOP (uint64_t, 64) + RUN_LOOP (double, 64) + return 0; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-1.c index 3e6a34029b3..d8da51f3ac5 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-1.c @@ -40,6 +40,6 @@ TEST_ALL (TEST_LOOP) -/* { dg-final { scan-tree-dump-times " \.MASK_LEN_SCATTER_STORE" 66 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_STORE" 66 "optimized" } } */ /* { dg-final { scan-tree-dump-not " \.SCATTER_STORE" "optimized" } } */ /* { dg-final { scan-tree-dump-not " \.MASK_SCATTER_STORE" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-2.c index 6906af17d84..9ffea4e0d99 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-2.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-2.c @@ -40,6 +40,6 @@ TEST_ALL (TEST_LOOP) -/* { dg-final { scan-tree-dump-times " \.MASK_LEN_SCATTER_STORE" 44 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_STORE" 44 "optimized" } } */ /* { dg-final { scan-tree-dump-not " \.SCATTER_STORE" "optimized" } } */ /* { dg-final { scan-tree-dump-not " \.MASK_SCATTER_STORE" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-3.c new file mode 100644 index 00000000000..eb0bcddaade --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/strided_store-3.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -ffast-math -fdump-tree-optimized-details" } */ + +#include <stdint-gcc.h> + +#ifndef INDEX8 +#define INDEX8 uint8_t +#define INDEX16 uint16_t +#define INDEX32 uint32_t +#define INDEX64 uint64_t +#endif + +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + INDEX##BITS stride, INDEX##BITS n) \ + { \ + for (INDEX##BITS i = 0; i < n; ++i) \ + dest[i * stride] = src[i] + BITS; \ + } + +#define TEST_TYPE(T, DATA_TYPE) \ + T (DATA_TYPE, 8) \ + T (DATA_TYPE, 16) \ + T (DATA_TYPE, 32) \ + T (DATA_TYPE, 64) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, 
int64_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_STORE" 55 "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \.SCATTER_STORE" "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \.MASK_SCATTER_STORE" "optimized" } } */ -- 2.36.3