OK. How about the following situation: for (i = 0; i < 64; i++) { a[i] = a[i] + b[i]; }
When TARGET_MAX_LMUL is 1 or 2, can the auto-vectorizer use LMUL 8 in VLS mode to vectorize the case above? juzhe.zh...@rivai.ai From: Kito Cheng Date: 2025-08-14 17:39 To: 钟居哲 CC: kito.cheng; gcc-patches; palmer; jeffreyalaw; rdapp; Li, Pan2; vineetg; patrick Subject: Re: [PATCH] RISC-V: Allow VLS types using up to LMUL 8 Yeah, I guess you definitely hit something wrong at that time; I also ran into some trouble when I relaxed the constraint... and then I finally found the right way to enable that without changing the behavior of the auto-vectorizer: only constrain the VLS modes in autovectorize_vector_modes. On Thu, Aug 14, 2025 at 5:29 PM 钟居哲 <juzhe.zh...@rivai.ai> wrote: > > As I remember, I once tried to use a larger LMUL in VLS modes than in VLA > modes. > But it may cause an ICE. I don't remember which scenario now... That's why > originally I used MAX_LMUL to control both VLA modes and VLS modes. > > ________________________________ > juzhe.zh...@rivai.ai > > > From: Kito Cheng > Date: 2025-08-14 15:17 > To: gcc-patches; kito.cheng; palmer; jeffreyalaw; rdapp; juzhe.zhong; > pan2.li; vineetg; patrick > CC: Kito Cheng > Subject: [PATCH] RISC-V: Allow VLS types using up to LMUL 8 > We used to apply -mrvv-max-lmul= to limit VLS code gen, auto vectorizer, > and builtin string function expansion. But I think the VLS code gen part > doesn't > need this limit, since it only happens when the user explicitly writes vector > types. > > For example, int32x8_t under -mrvv-max-lmul=m1 with VLEN=128 would be split > into > two int32x4_t, which generates more instructions and runs slower. > > In this patch, I changed -mrvv-max-lmul= to only affect auto vectorization and > builtin string function expansion. Actually, the option's help text already > says it only controls the LMUL used by auto-vectorization, so I believe this > change makes sense :) > > gcc/ChangeLog: > > * config/riscv/riscv-protos.h (vls_mode_valid_p): New argument > allow_up_to_lmul_8. 
> * config/riscv/riscv-v.cc (autovectorize_vector_modes): Set > allow_up_to_lmul_8 to false. > (vls_mode_valid_p): Add new argument allow_up_to_lmul_8, and use > it to determine whether to allow LMUL 8. > > gcc/testsuite/ChangeLog: > > * gcc.target.riscv/rvv/vls-type-rvv-max-lmul.c: New test. > --- > gcc/config/riscv/riscv-protos.h | 2 +- > gcc/config/riscv/riscv-v.cc | 31 ++++++++++--------- > .../riscv/rvv/vls-type-rvv-max-lmul.c | 12 +++++++ > 3 files changed, 29 insertions(+), 16 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vls-type-rvv-max-lmul.c > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index 539321ff95b..045ee09b23f 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -763,7 +763,7 @@ opt_machine_mode vectorize_related_mode (machine_mode, > scalar_mode, > unsigned int autovectorize_vector_modes (vec<machine_mode> *, bool); > bool cmp_lmul_le_one (machine_mode); > bool cmp_lmul_gt_one (machine_mode); > -bool vls_mode_valid_p (machine_mode); > +bool vls_mode_valid_p (machine_mode, bool allow_up_to_lmul_8 = true); > bool vlmax_avl_type_p (rtx_insn *); > bool has_vl_op (rtx_insn *); > bool tail_agnostic_p (rtx_insn *); > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index c9c83282732..3484f6442e7 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -2910,7 +2910,7 @@ autovectorize_vector_modes (vector_modes *modes, bool) > machine_mode mode; > while (size > 0 && get_vector_mode (QImode, size).exists (&mode)) > { > - if (vls_mode_valid_p (mode)) > + if (vls_mode_valid_p (mode, /* allow_up_to_lmul_8 */ false)) > modes->safe_push (mode); > i++; > @@ -5027,26 +5027,27 @@ cmp_lmul_gt_one (machine_mode mode) > Then we can have the condition for VLS mode in fixed-vlmax, aka: > PRECISION (VLSmode) < VLEN / (64 / PRECISION(VLS_inner_mode)). 
*/ > bool > -vls_mode_valid_p (machine_mode vls_mode) > +vls_mode_valid_p (machine_mode vls_mode, bool allow_up_to_lmul_8) > { > if (!TARGET_VECTOR || TARGET_XTHEADVECTOR) > return false; > if (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE) > { > - if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL > - && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR, > - GET_MODE_PRECISION (vls_mode))) > - /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and > - BITS_PER_RISCV_VECTOR. > - > - e.g. When TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128,128). > - We enable VLS modes have fixed size <= 128bit. Since ordered_p is > - false between VLA modes with size = (128, 128) bits and VLS mode > - with size = 128 bits, we will end up with multiple ICEs in > - middle-end generic codes. */ > - return false; > - return true; > + if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL) > + return true; > + if (allow_up_to_lmul_8) > + return true; > + /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and > + BITS_PER_RISCV_VECTOR. > + > + e.g. When TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128,128). > + We enable VLS modes have fixed size <= 128bit. Since ordered_p is > + false between VLA modes with size = (128, 128) bits and VLS mode > + with size = 128 bits, we will end up with multiple ICEs in > + middle-end generic codes. 
*/ > + return !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR, > + GET_MODE_PRECISION (vls_mode)); > } > if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL) > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vls-type-rvv-max-lmul.c > b/gcc/testsuite/gcc.target/riscv/rvv/vls-type-rvv-max-lmul.c > new file mode 100644 > index 00000000000..5d52f7798d5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/vls-type-rvv-max-lmul.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -mrvv-max-lmul=m1 > -fdump-tree-optimized" } */ > + > +typedef long long int64x8_t __attribute__((vector_size(64))); > + > +int64x8_t foo(int64x8_t a, int64x8_t b) > +{ > + return a + b; > +} > +/* Make sure we can us up to LMUL 4 to process int64x8_t at once rather than > + break that into 4 LMUL 1 operations. */ > +/* { dg-final { scan-assembler {vsetivli\s+zero,8,e64,m4,t[au],m[au]} } } */ > -- > 2.34.1 > >