> -----Original Message-----
> From: Richard Sandiford <richard.sandif...@arm.com>
> Sent: Thursday, October 5, 2023 8:29 PM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; Marcus Shawcroft <marcus.shawcr...@arm.com>;
> Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
>
> Tamar Christina <tamar.christ...@arm.com> writes:
> > Hi All,
> >
> > This adds an implementation for masked copysign along with an
> > optimized pattern for masked copysign (x, -1).
>
> It feels like we're ending up with a lot of AArch64-specific code that
> just hard-codes the observation that changing the sign is equivalent to
> changing the top bit.  We then need to make sure that we choose the best
> way of changing the top bit for any given situation.
>
> Hard-coding the -1/negative case is one instance of that.  But it looks
> like we also fail to use the best sequence for SVE2.  E.g.
> [https://godbolt.org/z/ajh3MM5jv]:
>
> #include <stdint.h>
>
> void f(double *restrict a, double *restrict b) {
>     for (int i = 0; i < 100; ++i)
>         a[i] = __builtin_copysign(a[i], b[i]);
> }
>
> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
>     for (int i = 0; i < 100; ++i)
>         a[i] = (a[i] & ~c) | (b[i] & c);
> }
>
> gives:
>
> f:
>         mov     x2, 0
>         mov     w3, 100
>         whilelo p7.d, wzr, w3
> .L2:
>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
>         and     z30.d, z30.d, #0x7fffffffffffffff
>         and     z31.d, z31.d, #0x8000000000000000
>         orr     z31.d, z31.d, z30.d
>         st1d    z31.d, p7, [x0, x2, lsl 3]
>         incd    x2
>         whilelo p7.d, w2, w3
>         b.any   .L2
>         ret
> g:
>         mov     x3, 0
>         mov     w4, 100
>         mov     z29.d, x2
>         whilelo p7.d, wzr, w4
> .L6:
>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
>         bsl     z31.d, z31.d, z30.d, z29.d
>         st1d    z31.d, p7, [x0, x3, lsl 3]
>         incd    x3
>         whilelo p7.d, w3, w4
>         b.any   .L6
>         ret
>
> I saw that you originally tried to do this in match.pd and that the
> decision was to fold to copysign instead.  But perhaps there's a
> compromise where isel does something with the (new) copysign canonical
> form?  I.e. could we go with your new version of the match.pd patch, and
> add some isel stuff as a follow-on?
>

Sure, if that's what's desired... But the example you posted above is, for
instance, worse for x86 (https://godbolt.org/z/x9ccqxW6T), where the first
operation has a dependency chain of 2 and the latter of 3.  It's likely that
any open coding of this operation is going to hurt some target.  So I'm
unsure what isel should transform this into...
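
For reference, here is a minimal scalar sketch of the two open-codings being
compared (the helper names are purely illustrative and not part of the patch):

#include <stdint.h>

/* copysign as bit operations: the two ANDs are independent of each other,
   so the OR completes the result after two dependent steps.  */
static inline uint64_t copysign_bits (uint64_t x, uint64_t y)
{
  uint64_t mant = x & 0x7fffffffffffffffULL;  /* drop the sign of x  */
  uint64_t sign = y & 0x8000000000000000ULL;  /* keep the sign of y  */
  return mant | sign;
}

/* Generic bitwise select on mask c: maps to a single BSL on SVE2, but needs
   the mask inverted on one operand where no such instruction exists.  */
static inline uint64_t select_bits (uint64_t x, uint64_t y, uint64_t c)
{
  return (x & ~c) | (y & c);
}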

Tamar

> Not saying no to this patch, just thought that the above was worth
> considering.
>
> [I agree with Andrew's comments FWIW.]
>
> Thanks,
> Richard
>
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/109154
> >         * config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR tree-optimization/109154
> >         * gcc.target/aarch64/sve/fneg-abs_5.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> > index 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254568f45b61a14aa11c305 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
> >    }
> >  )
> >
> > +(define_expand "cond_copysign<mode>"
> > +  [(match_operand:SVE_FULL_F 0 "register_operand")
> > +   (match_operand:<VPRED> 1 "register_operand")
> > +   (match_operand:SVE_FULL_F 2 "register_operand")
> > +   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
> > +   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> > +  "TARGET_SVE"
> > +  {
> > +    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> > +    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
> > +    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
> > +    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
> > +
> > +    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
> > +    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
> > +    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4], <MODE>mode);
> > +
> > +    rtx v_sign_bitmask
> > +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> > +                                           HOST_WIDE_INT_M1U << bits);
> > +
> > +    /* copysign (x, -1) should instead be expanded as orr with the sign
> > +       bit.  */
> > +    if (!REG_P (operands[3]))
> > +      {
> > +        auto r0
> > +          = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[3]));
> > +        if (-1 == real_to_integer (r0))
> > +          {
> > +            arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
> > +            emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
> > +                                                  arg3, arg4));
> > +            emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> > +            DONE;
> > +          }
> > +      }
> > +
> > +    operands[2] = force_reg (<MODE>mode, operands[3]);
> > +    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
> > +    emit_insn (gen_and<v_int_equiv>3
> > +               (mant, arg2,
> > +                aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> > +                                                   ~(HOST_WIDE_INT_M1U
> > +                                                     << bits))));
> > +    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
> > +                                          arg4));
> > +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> > +    DONE;
> > +  }
> > +)
> > +
> >  (define_expand "xorsign<mode>3"
> >    [(match_operand:SVE_FULL_F 0 "register_operand")
> >     (match_operand:SVE_FULL_F 1 "register_operand")
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..f4ecbeecbe1290134e688f46a4389d17155e4a0a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > @@ -0,0 +1,36 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> > +
> > +#include <arm_neon.h>
> > +#include <math.h>
> > +
> > +/*
> > +** f1:
> > +**   ...
> > +**   orr     z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> > +**   ...
> > +*/
> > +void f1 (float32_t *a, int n)
> > +{
> > +  for (int i = 0; i < (n & -8); i++)
> > +    if (a[i] > n)
> > +      a[i] = -fabsf (a[i]);
> > +    else
> > +      a[i] = n;
> > +}
> > +
> > +/*
> > +** f2:
> > +**   ...
> > +**   orr     z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > +**   ...
> > +*/
> > +void f2 (float64_t *a, int n)
> > +{
> > +  for (int i = 0; i < (n & -8); i++)
> > +    if (a[i] > n)
> > +      a[i] = -fabs (a[i]);
> > +    else
> > +      a[i] = n;
> > +}
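
For completeness, a scalar sketch of the identity that both the
copysign (x, -1) special case and the tests above rely on: forcing the sign
bit computes -fabs (x), which is why a single predicated ORR with
0x8000000000000000 is enough (illustrative only, not part of the patch):

#include <stdint.h>
#include <string.h>

/* Equivalent to copysign (x, -1.0), i.e. -fabs (x): the result always has
   the sign bit set, so one OR with the sign-bit mask does the whole job.  */
static inline double copysign_neg1 (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);   /* view the double as raw bits */
  bits |= 0x8000000000000000ULL;     /* force the sign bit */
  memcpy (&x, &bits, sizeof bits);
  return x;
}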