Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> copysign (x, -1) is effectively fneg (abs (x)), which on AArch64 can be
> done most efficiently with an OR of the sign bit.
>
> The middle-end will now optimize fneg (abs (x)) to copysign as the
> canonical form, so this patch optimizes the expansion accordingly.
>
> If the target has an inclusive-OR that takes an immediate, the
> transformed instruction is both shorter and faster.  For targets that
> don't, the immediate has to be constructed separately, but this still
> ends up faster because the immediate construction is not on the
> critical path.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Note that this is part of another patch series; the additional testcases
> are mutually dependent on the match.pd patch.  As such the tests are
> added there instead of here.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         PR tree-optimization/109154
>         * config/aarch64/aarch64.md (copysign<GPF:mode>3): Handle
>         copysign (x, -1).
>         * config/aarch64/aarch64-simd.md (copysign<mode>3): Likewise.
>         * config/aarch64/aarch64-sve.md (copysign<mode>3): Likewise.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 25a1e4e8ecf767636c0ff3cdab6cad6e1482f73e..a78e77dcc3473445108b06c50f9c28a8369f3e3f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -754,15 +754,33 @@ (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle><
>  (define_expand "copysign<mode>3"
>    [(match_operand:VHSDF 0 "register_operand")
>     (match_operand:VHSDF 1 "register_operand")
> -   (match_operand:VHSDF 2 "register_operand")]
> +   (match_operand:VHSDF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx v_bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
> +  machine_mode int_mode = <V_INT_EQUIV>mode;
> +  rtx v_bitmask = gen_reg_rtx (int_mode);
>    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
>
>    emit_move_insn (v_bitmask,
>                    aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
>                                                       HOST_WIDE_INT_M1U << bits));
> +
> +  /* copysign (x, -1) should instead be expanded as orr with the sign
> +     bit.  */
> +  if (!REG_P (operands[2]))
> +    {
> +      auto r0
> +        = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[2]));
> +      if (-1 == real_to_integer (r0))
Just restating Andrew's review really, but I think this is safer and more
general as:

    rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
    if (GET_CODE (op2_elt) == CONST_DOUBLE
        && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))

For vectors, the condition (in principle) is instead whether each element
is individually negative, but we can leave that for later.

OK with that change if the patch to reverse the current
(copysign x -1) -> (fneg (fabs x)) canonicalisation is accepted.
(As said, I'm nervous that it'll regress powerpc.)

Thanks,
Richard

> +        {
> +          emit_insn (gen_ior<v_int_equiv>3 (
> +            lowpart_subreg (int_mode, operands[0], <MODE>mode),
> +            lowpart_subreg (int_mode, operands[1], <MODE>mode), v_bitmask));
> +          DONE;
> +        }
> +    }
> +
> +  operands[2] = force_reg (<MODE>mode, operands[2]);
>    emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], v_bitmask,
>                                           operands[2], operands[1]));
>    DONE;
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 5a652d8536a0ef9461f40da7b22834e683e73ceb..071400c820a5b106ddf9dc9faebb117975d74ea0 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -6387,7 +6387,7 @@ (define_insn "*<optab><mode>3"
>  (define_expand "copysign<mode>3"
>    [(match_operand:SVE_FULL_F 0 "register_operand")
>     (match_operand:SVE_FULL_F 1 "register_operand")
> -   (match_operand:SVE_FULL_F 2 "register_operand")]
> +   (match_operand:SVE_FULL_F 2 "nonmemory_operand")]
>    "TARGET_SVE"
>    {
>      rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> @@ -6398,11 +6398,26 @@ (define_expand "copysign<mode>3"
>      rtx arg1 = lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode);
>      rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
>
> -    emit_insn (gen_and<v_int_equiv>3
> -               (sign, arg2,
> -                aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> -                                                   HOST_WIDE_INT_M1U
> -                                                   << bits)));
> +    rtx v_sign_bitmask
> +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +                                           HOST_WIDE_INT_M1U << bits);
> +
> +    /* copysign (x, -1) should instead be expanded as orr with the sign
> +       bit.  */
> +    if (!REG_P (operands[2]))
> +      {
> +        auto r0
> +          = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[2]));
> +        if (-1 == real_to_integer (r0))
> +          {
> +            emit_insn (gen_ior<v_int_equiv>3 (int_res, arg1, v_sign_bitmask));
> +            emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +            DONE;
> +          }
> +      }
> +
> +    operands[2] = force_reg (<MODE>mode, operands[2]);
> +    emit_insn (gen_and<v_int_equiv>3 (sign, arg2, v_sign_bitmask));
>      emit_insn (gen_and<v_int_equiv>3
>                 (mant, arg1,
>                  aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 24349ecdbbab875f21975f116732a9e53762d4c1..d6c581ad81615b4feb095391cbcf4f5b78fa72f1 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -6940,12 +6940,25 @@ (define_expand "lrint<GPF:mode><GPI:mode>2"
>  (define_expand "copysign<GPF:mode>3"
>    [(match_operand:GPF 0 "register_operand")
>     (match_operand:GPF 1 "register_operand")
> -   (match_operand:GPF 2 "register_operand")]
> +   (match_operand:GPF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
> +  machine_mode int_mode = <V_INT_EQUIV>mode;
> +  rtx bitmask = gen_reg_rtx (int_mode);
>    emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
>                                      << (GET_MODE_BITSIZE (<MODE>mode) - 1)));
> +  /* copysign (x, -1) should instead be expanded as orr with the sign
> +     bit.  */
> +  auto r0 = CONST_DOUBLE_REAL_VALUE (operands[2]);
> +  if (-1 == real_to_integer (r0))
> +    {
> +      emit_insn (gen_ior<v_int_equiv>3 (
> +        lowpart_subreg (int_mode, operands[0], <MODE>mode),
> +        lowpart_subreg (int_mode, operands[1], <MODE>mode), bitmask));
> +      DONE;
> +    }
> +
> +  operands[2] = force_reg (<MODE>mode, operands[2]);
>    emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
>                                         bitmask));
>    DONE;
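
P.S. Not part of the patch, and the helper name below is purely
illustrative, but the scalar identity all three expanders rely on can be
sketched in a few lines of C: forcing the sign bit on with a single
inclusive-OR of the integer view of x gives copysign (x, -1.0), i.e.
-fabs (x), which is what the orr form emits.

    #include <stdint.h>
    #include <string.h>

    static double copysign_neg1 (double x)
    {
      uint64_t bits;
      memcpy (&bits, &x, sizeof bits);   /* reinterpret the bits of x */
      bits |= UINT64_C (1) << 63;        /* orr with the sign-bit mask */
      memcpy (&x, &bits, sizeof bits);
      return x;                          /* == copysign (x, -1.0) == -fabs (x) */
    }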