On Thu, Oct 5, 2023 at 12:48 PM Tamar Christina <tamar.christ...@arm.com> wrote:
>
> > -----Original Message-----
> > From: Richard Sandiford <richard.sandif...@arm.com>
> > Sent: Thursday, October 5, 2023 8:29 PM
> > To: Tamar Christina <tamar.christ...@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> > <richard.earns...@arm.com>; Marcus Shawcroft
> > <marcus.shawcr...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> > Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> >
> > Tamar Christina <tamar.christ...@arm.com> writes:
> > > Hi All,
> > >
> > > This adds an implementation for masked copysign along with an
> > > optimized pattern for masked copysign (x, -1).
> >
> > It feels like we're ending up with a lot of AArch64-specific code that
> > just hard-codes the observation that changing the sign is equivalent to
> > changing the top bit.  We then need to make sure that we choose the best
> > way of changing the top bit for any given situation.
> >
> > Hard-coding the -1/negative case is one instance of that.  But it looks
> > like we also fail to use the best sequence for SVE2.  E.g.
> > [https://godbolt.org/z/ajh3MM5jv]:
> >
> > #include <stdint.h>
> >
> > void f(double *restrict a, double *restrict b) {
> >     for (int i = 0; i < 100; ++i)
> >         a[i] = __builtin_copysign(a[i], b[i]);
> > }
> >
> > void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> >     for (int i = 0; i < 100; ++i)
> >         a[i] = (a[i] & ~c) | (b[i] & c);
> > }
> >
> > gives:
> >
> > f:
> >         mov     x2, 0
> >         mov     w3, 100
> >         whilelo p7.d, wzr, w3
> > .L2:
> >         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> >         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> >         and     z30.d, z30.d, #0x7fffffffffffffff
> >         and     z31.d, z31.d, #0x8000000000000000
> >         orr     z31.d, z31.d, z30.d
> >         st1d    z31.d, p7, [x0, x2, lsl 3]
> >         incd    x2
> >         whilelo p7.d, w2, w3
> >         b.any   .L2
> >         ret
> > g:
> >         mov     x3, 0
> >         mov     w4, 100
> >         mov     z29.d, x2
> >         whilelo p7.d, wzr, w4
> > .L6:
> >         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> >         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> >         bsl     z31.d, z31.d, z30.d, z29.d
> >         st1d    z31.d, p7, [x0, x3, lsl 3]
> >         incd    x3
> >         whilelo p7.d, w3, w4
> >         b.any   .L6
> >         ret
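> >
> > For illustration only (an untested sketch, assuming the ACLE SVE2
> > svbsl intrinsic and -march=...+sve2): g() wins because BSL computes
> > (op1 & op3) | (op2 & ~op3) in a single instruction, and copysign is
> > exactly that selection with the sign bit as the mask:
> >
> > #include <arm_sve.h>
> >
> > /* Sign of y, magnitude of x: (yi & SIGN) | (xi & ~SIGN).  */
> > svfloat64_t copysign_bsl (svfloat64_t x, svfloat64_t y)
> > {
> >     svuint64_t sign = svdup_u64 (0x8000000000000000ULL);
> >     svuint64_t xi = svreinterpret_u64_f64 (x);
> >     svuint64_t yi = svreinterpret_u64_f64 (y);
> >     return svreinterpret_f64_u64 (svbsl_u64 (yi, xi, sign));
> > }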
> >
> > I saw that you originally tried to do this in match.pd and that the
> > decision was to fold to copysign instead.  But perhaps there's a
> > compromise where isel does something with the (new) copysign canonical
> > form?  I.e. could we go with your new version of the match.pd patch,
> > and add some isel stuff as a follow-on?
> >
>
> Sure, if that's what's desired... But...
>
> The example you posted above is, for instance, worse for x86
> (https://godbolt.org/z/x9ccqxW6T), where the first operation has a
> dependency chain of 2 and the latter of 3.  It's likely that any open
> coding of this operation is going to hurt a target.
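>
> For illustration, a scalar sketch of the two chains (assuming no fused
> and-not is available):
>
> #include <stdint.h>
>
> /* copysign form: the two ANDs are independent, so the critical path
>    is AND -> ORR, depth 2.  */
> uint64_t cs (uint64_t a, uint64_t b)
> {
>   uint64_t mant = a & 0x7fffffffffffffffULL;
>   uint64_t sign = b & 0x8000000000000000ULL;
>   return mant | sign;
> }
>
> /* Open-coded select: without a fused and-not, the critical path is
>    NOT -> AND -> ORR, depth 3.  */
> uint64_t sel (uint64_t a, uint64_t b, uint64_t c)
> {
>   return (a & ~c) | (b & c);
> }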

But that is because x86 is not using andn when it should be.
That would be https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94790
(fixed for scalar but not for vector) and
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90323, IIRC.
AArch64 already has a pattern to match the above, which is why it works
there but not on x86_64.
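
A minimal sketch of the difference (illustrative only): with a fused
and-not (x86 andn/vpandn, AArch64 bic), a & ~c becomes one instruction
that runs in parallel with b & c, so the select is back to a 2-deep
chain:

#include <stdint.h>

uint64_t sel_andn (uint64_t a, uint64_t b, uint64_t c)
{
  uint64_t t1 = a & ~c;  /* one andn/bic when the pattern matches */
  uint64_t t2 = b & c;   /* independent of t1 */
  return t1 | t2;        /* critical path: depth 2 */
}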

Thanks,
Andrew

>
> So I'm unsure what isel would transform this into...
>
> Tamar
>
> > Not saying no to this patch, just thought that the above was worth
> > considering.
> >
> > [I agree with Andrew's comments FWIW.]
> >
> > Thanks,
> > Richard
> >
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > >     PR tree-optimization/109154
> > >     * config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >     PR tree-optimization/109154
> > >     * gcc.target/aarch64/sve/fneg-abs_5.c: New test.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/config/aarch64/aarch64-sve.md
> > > b/gcc/config/aarch64/aarch64-sve.md
> > > index 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254568f45b61a14aa11c305 100644
> > > --- a/gcc/config/aarch64/aarch64-sve.md
> > > +++ b/gcc/config/aarch64/aarch64-sve.md
> > > @@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
> > >    }
> > >  )
> > >
> > > +(define_expand "cond_copysign<mode>"
> > > +  [(match_operand:SVE_FULL_F 0 "register_operand")
> > > +   (match_operand:<VPRED> 1 "register_operand")
> > > +   (match_operand:SVE_FULL_F 2 "register_operand")
> > > +   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
> > > +   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> > > +  "TARGET_SVE"
> > > +  {
> > > +    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> > > +    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
> > > +    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
> > > +    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
> > > +
> > > +    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
> > > +    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
> > > +    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4], <MODE>mode);
> > > +
> > > +    rtx v_sign_bitmask
> > > +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> > > +                                      HOST_WIDE_INT_M1U << bits);
> > > +
> > > +    /* copysign (x, -1) should instead be expanded as orr with the sign
> > > +       bit.  */
> > > +    if (!REG_P (operands[3]))
> > > +      {
> > > +   auto r0
> > > +     = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[3]));
> > > +   if (-1 == real_to_integer (r0))
> > > +     {
> > > +       arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
> > > +       emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
> > > +                                             arg3, arg4));
> > > +       emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> > > +       DONE;
> > > +     }
> > > +      }
> > > +
> > > +    /* operands[3] may be a constant; force it into a register and
> > > +       recompute its integer view.  */
> > > +    operands[3] = force_reg (<MODE>mode, operands[3]);
> > > +    arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
> > > +    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
> > > +    emit_insn (gen_and<v_int_equiv>3
> > > +          (mant, arg2,
> > > +           aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> > > +                                              ~(HOST_WIDE_INT_M1U
> > > +                                                << bits))));
> > > +    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
> > > +                                     arg4));
> > > +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> > > +    DONE;
> > > +  }
> > > +)
> > > +
> > >  (define_expand "xorsign<mode>3"
> > >    [(match_operand:SVE_FULL_F 0 "register_operand")
> > >     (match_operand:SVE_FULL_F 1 "register_operand")
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > > new file mode 100644
> > > index 0000000000000000000000000000000000000000..f4ecbeecbe1290134e688f46a4389d17155e4a0a
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > > @@ -0,0 +1,36 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> > > +
> > > +#include <arm_neon.h>
> > > +#include <math.h>
> > > +
> > > +/*
> > > +** f1:
> > > +** ...
> > > +** orr     z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> > > +** ...
> > > +*/
> > > +void f1 (float32_t *a, int n)
> > > +{
> > > +  for (int i = 0; i < (n & -8); i++)
> > > +   if (a[i] > n)
> > > +     a[i] = -fabsf (a[i]);
> > > +   else
> > > +     a[i] = n;
> > > +}
> > > +
> > > +/*
> > > +** f2:
> > > +** ...
> > > +** orr     z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > > +** ...
> > > +*/
> > > +void f2 (float64_t *a, int n)
> > > +{
> > > +  for (int i = 0; i < (n & -8); i++)
> > > +   if (a[i] > n)
> > > +     a[i] = -fabs (a[i]);
> > > +   else
> > > +     a[i] = n;
> > > +}
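
For reference, a scalar model of what the expander above computes per
lane (illustrative only, not GCC code; the real pattern operates on
whole SVE vectors under the governing predicate):

#include <stdint.h>
#include <string.h>

/* One lane of cond_copysign: active lanes get sign(op3) | magnitude(op2);
   inactive lanes take the fallback (operand 4).  */
static double lane_cond_copysign (int active, double op2, double op3,
                                  double fallback)
{
  if (!active)
    return fallback;
  uint64_t xi, si;
  memcpy (&xi, &op2, sizeof xi);
  memcpy (&si, &op3, sizeof si);
  uint64_t res = (si & 0x8000000000000000ULL)    /* sign bit of op3 */
                 | (xi & 0x7fffffffffffffffULL); /* magnitude of op2 */
  double r;
  memcpy (&r, &res, sizeof r);
  return r;
}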
