On Fri, 2 Jul 2021 at 10:53, Christophe Lyon <christophe.l...@linaro.org> wrote: > > Hi, > > On Wed, 9 Jun 2021 at 17:04, Richard Sandiford > <richard.sandif...@arm.com> wrote: > > > > Christophe Lyon <christophe.l...@linaro.org> writes: > > > The problem in this PR is that we call VPSEL with a mask of vector > > > type instead of HImode. This happens because operand 3 in vcond_mask > > > is the pre-computed vector comparison and has vector type. The fix is > > > to transfer this value to VPR.P0 by comparing operand 3 with a vector > > > of constant 1 of the same type as operand 3. > > > > The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE > > and return HImode for MVE. This is how AVX512 handles masks. > > > > It might be worth trying that to see how it works. I'm not sure > > off-hand whether it'll produce worse code or better code. However, > > using HImode as the mask mode would help when defining other > > predicated optabs in future. > > > > Here is my v2 of this patch, hopefully implementing what you suggested. > > Sorry it took me so long, but as implementing this hook was of course > not sufficient, and it took me a while to figure out I needed to keep the > non-HI expanders (vec_cmp ,...). Each time I fixed a bug, I created > another one... I shouldn't have added so many tests ;-) > > I'm not sure how to improve the vectorizer doc, to better describe the > vec_cmp/vcond patterns and see which ones the vectorizer is trying > to use (to understand which ones I should implement). > > Then I realized I was about to break Neon support, so I decided > it was safer to add Neon tests ;-) > > Is that version OK? > I forgot to mention that I merged the original patch 2/2 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572300.html into this one.
> Thanks, > > Christophe > > > > Thanks, > > Richard > > > > > The pr100757*.c testcases are derived from > > > gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using > > > different types and return values different from 0 and 1 to avoid > > > commonalization with boolean masks. > > > > > > Reducing the number of iterations in pr100757-3.c from 32 to 8, we > > > generate the code below: > > > > > > float a[32]; > > > float fn1(int d) { > > > int c = 4; > > > for (int b = 0; b < 8; b++) > > > if (a[b] != 2.0f) > > > c = 5; > > > return c; > > > } > > > > > > fn1: > > > ldr r3, .L4+80 > > > vpush.64 {d8, d9} > > > vldrw.32 q3, [r3] // q3=a[0..3] > > > vldr.64 d8, .L4 // q4=(2.0,2.0,2.0,2.0) > > > vldr.64 d9, .L4+8 > > > adds r3, r3, #16 > > > vcmp.f32 eq, q3, q4 // cmp a[0..3] == (2.0,2.0,2.0,2.0) > > > vldr.64 d2, .L4+16 // q1=(1,1,1,1) > > > vldr.64 d3, .L4+24 > > > vldrw.32 q3, [r3] // q3=a[4..7] > > > vldr.64 d4, .L4+32 // q2=(0,0,0,0) > > > vldr.64 d5, .L4+40 > > > vpsel q0, q1, q2 // q0=select (a[0..3]) > > > vcmp.f32 eq, q3, q4 // cmp a[4..7] == (2.0,2.0,2.0,2.0) > > > vldm sp!, {d8-d9} > > > vpsel q2, q1, q2 // q2=select (a[4..7]) > > > vand q2, q0, q2 // q2=select (a[0..3]) && select > > > (a[4..7]) > > > vldr.64 d6, .L4+48 // q3=(4.0,4.0,4.0,4.0) > > > vldr.64 d7, .L4+56 > > > vldr.64 d0, .L4+64 // q0=(5.0,5.0,5.0,5.0) > > > vldr.64 d1, .L4+72 > > > vcmp.i32 eq, q2, q1 // cmp mask(a[0..7]) == (1,1,1,1) > > > vpsel q3, q3, q0 // q3= vcond_mask(4.0,5.0) > > > vmov.32 r3, q3[0] // keep the scalar > > > maxv2-0001-arm-Fix-vcond_mask-expander-for-MVE-PR-target-100.patch > > > vmov.32 r1, q3[1] > > > vmov.32 r0, q3[3] > > > vmov.32 r2, q3[2] > > > vmov s14, r1 > > > vmov s15, r3 > > > vmaxnm.f32 s15, s15, s14 > > > vmov s14, r2 > > > vmaxnm.f32 s15, s15, s14 > > > vmov s14, r0 > > > vmaxnm.f32 s15, s15, s14 > > > vmov r0, s15 > > > bx lr > > > .L5: > > > .align 3 > > > .L4: > > > .word 1073741824 > > > .word 1073741824 > > > .word 1073741824 > > > .word 1073741824 > > > .word 1 > > > .word 1 > > > .word 1 > > > .word 1 > > > .word 0 > > > .word 0 > > > .word 0 > > > .word 0 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1084227584 > > > .word 1084227584 > > > .word 1084227584 > > > .word 1084227584 > > > > > > 2021-06-09 Christophe Lyon <christophe.l...@linaro.org> > > > > > > PR target/100757 > > > gcc/ > > > * config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix > > > expansion for MVE. > > > > > > gcc/testsuite/ > > > * gcc.target/arm/simd/pr100757.c: New test. > > > * gcc.target/arm/simd/pr100757-2.c: New test. > > > * gcc.target/arm/simd/pr100757-3.c: New test. > > > --- > > > gcc/config/arm/vec-common.md | 24 +++++++++++++++++-- > > > .../gcc.target/arm/simd/pr100757-2.c | 20 ++++++++++++++++ > > > .../gcc.target/arm/simd/pr100757-3.c | 20 ++++++++++++++++ > > > gcc/testsuite/gcc.target/arm/simd/pr100757.c | 19 +++++++++++++++ > > > 4 files changed, 81 insertions(+), 2 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > > > index 0ffc7a9322c..ccdfaa8321f 100644 > > > --- a/gcc/config/arm/vec-common.md > > > +++ b/gcc/config/arm/vec-common.md > > > @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>" > > > } > > > else if (TARGET_HAVE_MVE) > > > { > > > - emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], > > > - operands[1], operands[2], operands[3])); > > > + /* Convert pre-computed vector comparison into VPR.P0 by comparing > > > + operand 3 with a vector of '1', then use VPSEL. */ > > > + machine_mode cmp_mode = GET_MODE (operands[3]); > > > + rtx vpr_p0 = gen_reg_rtx (HImode); > > > + rtx one = gen_reg_rtx (cmp_mode); > > > + emit_move_insn (one, CONST1_RTX (cmp_mode)); > > > + emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one)); > > > + > > > + switch (GET_MODE_CLASS (<MODE>mode)) > > > + { > > > + case MODE_VECTOR_INT: > > > + emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, > > > operands[0], operands[1], operands[2], vpr_p0)); > > > + break; > > > + case MODE_VECTOR_FLOAT: > > > + if (TARGET_HAVE_MVE_FLOAT) > > > + emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], > > > operands[1], operands[2], vpr_p0)); > > > + else > > > + gcc_unreachable (); > > > + break; > > > + default: > > > + gcc_unreachable (); > > > + } > > > } > > > else > > > gcc_unreachable (); > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > new file mode 100644 > > > index 00000000000..993ce369090 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > @@ -0,0 +1,20 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve_fp } */ > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > > > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +float a[32]; > > > +int fn1(int d) { > > > + int c = 4; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b] != 2.0f) > > > + c = 5; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* > > > Constant 2.0f. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' > > > mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' > > > mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial > > > value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible > > > value for c. */ > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > new file mode 100644 > > > index 00000000000..b94a73b2d2c > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > @@ -0,0 +1,20 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve_fp } */ > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > > > +/* Copied from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +float a[32]; > > > +float fn1(int d) { > > > + float c = 4; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b] != 2.0f) > > > + c = 5; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* > > > Constant 2.0f. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' > > > mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' > > > mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* > > > Initial value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* > > > Possible value for c. */ > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > new file mode 100644 > > > index 00000000000..e51e716b4ec > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > @@ -0,0 +1,19 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve } */ > > > +/* { dg-additional-options "-O3" } */ > > > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +int a[32]; > > > +int fn1(int d) { > > > + int c = 2; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b]) > > > + c = 3; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' > > > mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' > > > mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial > > > value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible > > > value for c. */