Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> We can optimize an AND with certain vector immediates into an FMOV when the
> result of the AND is the same as if the upper lanes of the input vector were
> set to zero and the lower lanes left unchanged.
>
> For example, at present:
>
> v4hi
> f_v4hi (v4hi x)
> {
>   return x & (v4hi){ 0xffff, 0xffff, 0, 0 };
> }
>
> generates:
>
> f_v4hi:
>       movi    d31, 0xffffffff
>       and     v0.8b, v0.8b, v31.8b
>       ret
>
> With this patch, it generates:
>
> f_v4hi:
>       fmov    s0, s0
>       ret
>
>       PR target/100165
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64-protos.h (aarch64_output_fmov): New prototype.
>       (aarch64_simd_valid_and_imm_fmov): Likewise.
>       * config/aarch64/aarch64-simd.md (and<mode>3<vczle><vczbe>): Allow FMOV
>       codegen.
>       * config/aarch64/aarch64.cc (aarch64_simd_valid_and_imm_fmov): New
>       function.
>       (aarch64_output_fmov): Likewise.
>       * config/aarch64/constraints.md (Df): New constraint.
>       * config/aarch64/predicates.md (aarch64_reg_or_and_imm): Update
>       predicate to support FMOV codegen.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.target/aarch64/fmov-1.c: New test.
>       * gcc.target/aarch64/fmov-2.c: New test.
>       * gcc.target/aarch64/fmov-be-1.c: New test.
>       * gcc.target/aarch64/fmov-be-2.c: New test.

Thanks for doing this.  Mostly looks good, but some comments below:

> Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64-protos.h          |   2 +
>  gcc/config/aarch64/aarch64-simd.md           |  10 +-
>  gcc/config/aarch64/aarch64.cc                |  75 ++++++++++
>  gcc/config/aarch64/constraints.md            |   7 +
>  gcc/config/aarch64/predicates.md             |   3 +-
>  gcc/testsuite/gcc.target/aarch64/fmov-1.c    | 149 +++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/fmov-2.c    |  90 +++++++++++
>  gcc/testsuite/gcc.target/aarch64/fmov-be-1.c | 149 +++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/fmov-be-2.c |  90 +++++++++++
>  9 files changed, 569 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-2.c
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 1ca86c9d175..c461fce8896 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -933,6 +933,7 @@ char *aarch64_output_simd_mov_imm (rtx, unsigned);
>  char *aarch64_output_simd_orr_imm (rtx, unsigned);
>  char *aarch64_output_simd_and_imm (rtx, unsigned);
>  char *aarch64_output_simd_xor_imm (rtx, unsigned);
> +char *aarch64_output_fmov (rtx);
>  
>  char *aarch64_output_sve_mov_immediate (rtx);
>  char *aarch64_output_sve_ptrues (rtx);
> @@ -948,6 +949,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
>  bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
>  bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
>  bool aarch64_simd_valid_and_imm (rtx);
> +bool aarch64_simd_valid_and_imm_fmov (rtx, unsigned int * = NULL);
>  bool aarch64_simd_valid_mov_imm (rtx);
>  bool aarch64_simd_valid_orr_imm (rtx);
>  bool aarch64_simd_valid_xor_imm (rtx);
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index e2afe87e513..e051e6459a5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1117,17 +1117,17 @@ (define_insn "fabd<mode>3<vczle><vczbe>"
>    [(set_attr "type" "neon_fp_abd_<stype><q>")]
>  )
>  
> -;; For AND (vector, register) and BIC (vector, immediate)
> +;; For AND (vector, register), BIC (vector, immediate) and FMOV (register)
>  (define_insn "and<mode>3<vczle><vczbe>"
>    [(set (match_operand:VDQ_I 0 "register_operand")
>       (and:VDQ_I (match_operand:VDQ_I 1 "register_operand")
>                  (match_operand:VDQ_I 2 "aarch64_reg_or_and_imm")))]
>    "TARGET_SIMD"
> -  {@ [ cons: =0 , 1 , 2   ]
> -     [ w        , w , w   ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> -     [ w        , 0 , Db  ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
> +  {@ [ cons: =0 , 1 , 2  ; attrs: type   ]
> +     [ w        , w , w  ; neon_logic<q> ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> +     [ w        , 0 , Db ; neon_logic<q> ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
> +     [ w        , w , Df ; fmov          ] << aarch64_output_fmov (operands[2]);

I think the fmov alternative should have priority over the AND immediate
alternative, so that we use fmov even for SVE.
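Something like this reordering, maybe (just a sketch of the intended
priority, untested):

  {@ [ cons: =0 , 1 , 2  ; attrs: type   ]
     [ w        , w , w  ; neon_logic<q> ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
     [ w        , w , Df ; fmov          ] << aarch64_output_fmov (operands[2]);
     [ w        , 0 , Db ; neon_logic<q> ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
  }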

>    }
> -  [(set_attr "type" "neon_logic<q>")]
>  )
>  
>  ;; For ORR (vector, register) and ORR (vector, immediate)
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 38c112cc92f..54895e3f456 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23516,6 +23516,61 @@ aarch64_simd_valid_and_imm (rtx op)
>    return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND);
>  }
>  
> +/* Return true if OP is a valid SIMD AND immediate which allows the AND to
> +   be optimized as an FMOV.  If ELT_SIZE is nonnull, it represents the size
> +   in bytes of the FMOV register.  */
> +bool
> +aarch64_simd_valid_and_imm_fmov (rtx op, unsigned int *elt_size)
> +{
> +  machine_mode mode = GET_MODE (op);
> +  gcc_assert (!aarch64_sve_mode_p (mode));
> +
> +  auto_vec<target_unit, 128> buffer;

16 should be enough for Advanced SIMD vectors.
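I.e. (assuming only Advanced SIMD modes, at most 128 bits, reach this
function):

  auto_vec<target_unit, 16> buffer;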

> +  unsigned int n_bytes
> +      = GET_MODE_BITSIZE (mode).to_constant () / BITS_PER_UNIT;

This can be simplified to GET_MODE_SIZE (mode)
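i.e. something like (presumably still with .to_constant (), since
GET_MODE_SIZE is a poly_int on aarch64):

  unsigned int n_bytes = GET_MODE_SIZE (mode).to_constant ();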

> +  buffer.reserve (n_bytes);
> +
> +  bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
> +  gcc_assert (ok);
> +
> +  unsigned int n_ones = 0;
> +  unsigned int i = 0;
> +  auto idx = [] (unsigned int n, unsigned int i)
> +    {
> +      return BYTES_BIG_ENDIAN ? n - i - 1 : i;
> +    };
> +
> +  /* Find the number of consecutive 0xFFs at the beginning of the AND mask.  */
> +  for (; i < n_bytes; i++)
> +    {
> +      if (i > 8)
> +     return false;
> +
> +      if (buffer[idx (n_bytes, i)] != 0xff)
> +     break;
> +    }
> +
> +  n_ones = i;
> +  /* Bail out if the AND mask is not valid.  The valid n_ones are 2 (h with
> +     TARGET_SIMD_F16INST), 4 (s) and 8 (d).  */
> +  if ((n_ones != 2 && n_ones != 4 && n_ones != 8)
> +      || (n_ones == 2 && !TARGET_SIMD_F16INST))
> +    return false;
> +
> +  /* The rest of the AND mask must be all 0s.  */
> +  for (; i < n_bytes; i++)
> +    if (buffer[idx (n_bytes, i)] != 0)
> +      break;
> +
> +  if (i != n_bytes)
> +    return false;
> +
> +  if (elt_size)
> +    *elt_size = n_ones;
> +
> +  return true;
> +}
> +

With the native_decode_int patch I submitted here:

  https://gcc.gnu.org/pipermail/gcc-patches/2025-April/682003.html

this could be simplified to the following (MASK + 1 is a power of two
exactly when MASK is a block of low-order set bits, so wi::exact_log2
recovers the width of that block in bits):

  bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
  gcc_assert (ok);

  auto mask = native_decode_int (buffer, 0, n_bytes, n_bytes * BITS_PER_UNIT);
  int set_bit = wi::exact_log2 (mask + 1);
  if ((set_bit == 16 && TARGET_SIMD_F16INST)
      || set_bit == 32
      || set_bit == 64)
    {
      if (elt_size)
        *elt_size = set_bit / BITS_PER_UNIT;
      return true;
    }

  return false;

>  /* Return true if OP is a valid SIMD xor immediate for SVE.  */
>  bool
>  aarch64_simd_valid_xor_imm (rtx op)
> @@ -25642,6 +25697,26 @@ aarch64_float_const_representable_p (rtx x)
>    return aarch64_real_float_const_representable_p (r);
>  }
>  
> +/* Return the string with the FMOV instruction which is equivalent to an AND
> +   instruction with the SIMD immediate CONST_VECTOR.  */
> +char*
> +aarch64_output_fmov (rtx const_vector)
> +{
> +  bool is_valid;
> +  static char templ[40];
> +  char element_char;
> +  unsigned int elt_size;
> +
> +  is_valid = aarch64_simd_valid_and_imm_fmov (const_vector, &elt_size);
> +  gcc_assert (is_valid);
> +
> +  element_char = sizetochar (elt_size * BITS_PER_UNIT);
> +  snprintf (templ, sizeof (templ), "fmov\t%%%c0, %%%c1",
> +         element_char, element_char);
> +
> +  return templ;
> +}
> +
>  /* Returns the string with the instruction for the SIMD immediate
>  * CONST_VECTOR of MODE and WIDTH.  WHICH selects a move, and(bic) or orr.  */
>  char*
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index e8321c4d2fb..253603c173a 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -466,6 +466,13 @@ (define_constraint "Do"
>   (and (match_code "const_vector")
>        (match_test "aarch64_simd_valid_orr_imm (op)")))
>  
> +(define_constraint "Df"
> +  "@internal
> +   A constraint that matches vector of immediates for and which can be\

matches a vector

The trailing backslash shouldn't be necessary.

> +   optimized as fmov."
> + (and (match_code "const_vector")
> +      (match_test "aarch64_simd_valid_and_imm_fmov (op)")))
> +
>  (define_constraint "Db"
>    "@internal
>     A constraint that matches vector of immediates for and/bic."
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index 1ab1c696c62..2c6af831eae 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -123,7 +123,8 @@ (define_predicate "aarch64_reg_or_orr_imm"
>  (define_predicate "aarch64_reg_or_and_imm"
>     (ior (match_operand 0 "register_operand")
>       (and (match_code "const_vector")
> -          (match_test "aarch64_simd_valid_and_imm (op)"))))
> +          (ior (match_test "aarch64_simd_valid_and_imm (op)")
> +               (match_test "aarch64_simd_valid_and_imm_fmov (op)")))))
>  
>  (define_predicate "aarch64_reg_or_xor_imm"
>     (ior (match_operand 0 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-1.c
> new file mode 100644
> index 00000000000..d657dfdee88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-1.c
> @@ -0,0 +1,149 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */

Since this test is specific to little-endian, it might be better to
call it fmov-1-le.c.  The options should include -mlittle-endian,
so that the tests still pass on aarch64_be targets.
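I.e. something like:

  /* { dg-options "-O2 -mlittle-endian" } */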

Same for fmov-2.c.

Thanks,
Richard
