Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics

Christophe Lyon via Gcc-patches Fri, 15 Jan 2021 02:47:54 -0800

ping?


On Fri, 6 Nov 2020 at 16:22, Christophe Lyon <christophe.l...@linaro.org> wrote:
>
> On Thu, 5 Nov 2020 at 12:55, Christophe Lyon <christophe.l...@linaro.org> 
> wrote:
> >
> > On Thu, 5 Nov 2020 at 10:36, Kyrylo Tkachov <kyrylo.tkac...@arm.com> wrote:
> > >
> > > > Hi Christophe,
> > >
> > > > -----Original Message-----
> > > > From: Gcc-patches <gcc-patches-boun...@gcc.gnu.org> On Behalf Of
> > > > Christophe Lyon via Gcc-patches
> > > > Sent: 15 October 2020 18:23
> > > > To: gcc-patches@gcc.gnu.org
> > > > Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
> > > > intrinsics
> > > >
> > > > This patch adds implementations for vceqq_p64, vceqz_p64 and
> > > > vceqzq_p64 intrinsics.
> > > >
> > > > vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> > > > into their high and low halves.
> > > >
> > > > vceqz[q] simply call the vceq and vceqq with a second argument equal
> > > > to zero.
> > > >
> > > > The added (executable) testcases make sure that the poly64x2_t
> > > > variants have results with one element of all zeroes (false) and the
> > > > other element with all bits set to one (true).
> > > >
> > > > 2020-10-15  Christophe Lyon  <christophe.l...@linaro.org>
> > > >
> > > >       gcc/
> > > >       * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64):
> > > > New.
> > > >
> > > >       gcc/testsuite/
> > > >       * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> > > >       vceqz_p64, vceqq_p64 and vceqzq_p64.
> > > > ---
> > > >  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
> > > >  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46
> > > > +++++++++++++++++++++-
> > > >  2 files changed, 76 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> > > > index aa21730..f7eff37 100644
> > > > --- a/gcc/config/arm/arm_neon.h
> > > > +++ b/gcc/config/arm/arm_neon.h
> > > > @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
> > > >    return vreinterpret_u64_u32 (__m);
> > > >  }
> > > >
> > > > +__extension__ extern __inline uint64x1_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqz_p64 (poly64x1_t __a)
> > > > +{
> > > > +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> > > > +  return vceq_p64 (__a, __b);
> > > > +}
> > >
> > > This approach is okay, but can we have some kind of test to confirm it 
> > > generates the VCEQ instruction with immediate zero rather than having a 
> > > separate DUP...
> >
> > I had checked that manually, but I'll add a test.
> > However, I have noticed that although vceqz_p64 uses vceq.i32 dX, dY, #0,
> > the vceqzq_p64 version below first sets
> > vmov dZ, #0
> > and then emits two
> > vceq.i32 dX, dY, dZ
> >
> > I'm looking at why this happens.
> >
>
> Hi,
>
> Here is an updated version, which adds two tests (arm/simd/vceqz_p64.c
> and arm/simd/vceqzq_p64.c).
>
> The vceqzq_p64 test does not currently expect instructions with
> immediate zero, because we generate:
> vmov.i32        q9, #0  @ v4si
> [...]
> vceq.i32        d16, d16, d19
> vceq.i32        d17, d17, d19
>
> Looking at the traces, I can see this in reload:
> (insn 19 8 15 2 (set (reg:V2SI 48 d16 [orig:128 _18 ] [128])
>         (neg:V2SI (eq:V2SI (reg:V2SI 48 d16 [orig:139 v1 ] [139])
>                 (reg:V2SI 54 d19 [ _5+8 ]))))
> "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22
> 1650 {neon_vceqv2si_insn}
>      (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 48
> d16 [orig:139 v1 ] [139]) 0)
>                 (const_vector:V2SI [
>                         (const_int 0 [0]) repeated x2
>                     ])))
>         (nil)))
> (insn 15 19 20 2 (set (reg:V2SI 50 d17 [orig:121 _11 ] [121])
>         (neg:V2SI (eq:V2SI (reg:V2SI 50 d17 [orig:141 v2 ] [141])
>                 (reg:V2SI 54 d19 [ _5+8 ]))))
> "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22
> 1650 {neon_vceqv2si_insn}
>      (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 50
> d17 [orig:141 v2 ] [141]) 0)
>                 (const_vector:V2SI [
>                         (const_int 0 [0]) repeated x2
>                     ])))
>         (nil)))
>
> but it says:
>          Choosing alt 0 in insn 19:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
>           alt=0,overall=0,losers=0,rld_nregs=0
>          Choosing alt 0 in insn 15:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
>           alt=0,overall=0,losers=0,rld_nregs=0
>
> Why isn't it picking alternative 1 with the Dz constraint?
>
> Christophe
>
>
> > Thanks,
> >
> > Christophe
> >
> >
> > > Thanks,
> > > Kyrill
> > >
> > > > +
> > > > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > > > +__extension__ extern __inline uint64x2_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > > > +{
> > > > +  poly64_t __high_a = vget_high_p64 (__a);
> > > > +  poly64_t __high_b = vget_high_p64 (__b);
> > > > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> > > > +
> > > > +  poly64_t __low_a = vget_low_p64 (__a);
> > > > +  poly64_t __low_b = vget_low_p64 (__b);
> > > > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> > > > +  return vcombine_u64 (__low, __high);
> > > > +}
> > > > +
> > > > +__extension__ extern __inline uint64x2_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqzq_p64 (poly64x2_t __a)
> > > > +{
> > > > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > > > +  return vceqq_p64 (__a, __b);
> > > > +}
> > > > +
> > > >  /* The vtst_p64 intrinsic does not map to a single instruction.
> > > >     We emulate it in way similar to vceq_p64 above but here we do
> > > >     a reduction with max since if any two corresponding bits
> > > > diff --git 
> > > > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > index a3210a9..6aed096 100644
> > > > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] =
> > > > { 0xfffffff1,
> > > >
> > > >  /* Expected results: vceq.  */
> > > >  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> > > > +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff 
> > > > };
> > > > +
> > > > +/* Expected results: vceqz.  */
> > > > +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> > > > +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff 
> > > > };
> > > >
> > > >  /* Expected results: vcombine.  */
> > > >  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
> > > > 0x88 };
> > > > @@ -213,7 +218,7 @@ int main (void)
> > > >
> > > >    /* vceq_p64 tests. */
> > > >  #undef TEST_MSG
> > > > -#define TEST_MSG "VCEQ"
> > > > +#define TEST_MSG "VCEQ/VCEQQ"
> > > >
> > > >  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
> > > >       \
> > > >    VECT_VAR(vceq_vector_res, T3, W, N) =
> > > >       \
> > > > @@ -227,16 +232,55 @@ int main (void)
> > > >    DECL_VARIABLE(vceq_vector, poly, 64, 1);
> > > >    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
> > > >    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> > > > +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
> > > >
> > > >    CLEAN(result, uint, 64, 1);
> > > > +  CLEAN(result, uint, 64, 2);
> > > >
> > > >    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> > > > +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
> > > >
> > > >    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> > > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> > > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
> > > >
> > > >    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> > > > +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
> > > >
> > > >    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> > > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> > > > +
> > > > +  /* vceqz_p64 tests. */
> > > > +#undef TEST_MSG
> > > > +#define TEST_MSG "VCEQZ/VCEQZQ"
> > > > +
> > > > +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > > >       \
> > > > +  VECT_VAR(vceqz_vector_res, T3, W, N) =                             \
> > > > +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));             \
> > > > +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N),
> > > > VECT_VAR(vceqz_vector_res, T3, W, N))
> > > > +
> > > > +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)
> > > >       \
> > > > +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > > > +
> > > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> > > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> > > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> > > > +
> > > > +  CLEAN(result, uint, 64, 1);
> > > > +  CLEAN(result, uint, 64, 2);
> > > > +
> > > > +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> > > > +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> > > > +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> > > > +
> > > > +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> > > > +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> > > > +
> > > > +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> > > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
> > > >
> > > >    /* vcombine_p64 tests.  */
> > > >  #undef TEST_MSG
> > > > --
> > > > 2.7.4
> > >

Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics

Reply via email to