> Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> > We can optimize an AND with certain vectors of immediates into an FMOV
> > if the result of the AND is as if the upper lanes of the input vector
> > were set to zero and the lower lanes remained unchanged.
> >
> > For example, at present:
> >
> > v4hi
> > f_v4hi (v4hi x)
> > {
> >   return x & (v4hi){ 0xffff, 0xffff, 0, 0 };
> > }
> >
> > generates:
> >
> > f_v4hi:
> >     movi    d31, 0xffffffff
> >     and     v0.8b, v0.8b, v31.8b
> >     ret
> >
> > With this patch, it generates:
> >
> > f_v4hi:
> >     fmov    s0, s0
> >     ret
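> >
> > A 32-bit example (hypothetical; not taken from the added tests) would
> > likewise become a single "fmov s0, s0", since an fmov to an S register
> > zeroes bits 32 and above of the destination vector register:
> >
> > v2si
> > f_v2si (v2si x)
> > {
> >   return x & (v2si){ 0xffffffff, 0 };
> > }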
> >
> >     PR target/100165
> >
> > gcc/ChangeLog:
> >
> >     * config/aarch64/aarch64-protos.h (aarch64_output_fmov): New prototype.
> >     (aarch64_simd_valid_and_imm_fmov): Likewise.
> >     * config/aarch64/aarch64-simd.md (and<mode>3<vczle><vczbe>): Allow FMOV
> >     codegen.
> >     * config/aarch64/aarch64.cc (aarch64_simd_valid_and_imm_fmov): New
> >     function.
> >     (aarch64_output_fmov): Likewise.
> >     * config/aarch64/constraints.md (Df): New constraint.
> >     * config/aarch64/predicates.md (aarch64_reg_or_and_imm): Update
> >     predicate to support FMOV codegen.
> >
> > gcc/testsuite/ChangeLog:
> >
> >     * gcc.target/aarch64/fmov-1.c: New test.
> >     * gcc.target/aarch64/fmov-2.c: New test.
> >     * gcc.target/aarch64/fmov-be-1.c: New test.
> >     * gcc.target/aarch64/fmov-be-2.c: New test.
> 
> Thanks for doing this.  Mostly looks good, but some comments below:

Thanks for the review and all the suggestions! I've updated the patches
accordingly; please let me know if you have any further comments.

https://gcc.gnu.org/pipermail/gcc-patches/2025-May/683390.html
https://gcc.gnu.org/pipermail/gcc-patches/2025-May/683391.html

Thanks,
Pengxuan
> 
> > Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> > ---
> >  gcc/config/aarch64/aarch64-protos.h          |   2 +
> >  gcc/config/aarch64/aarch64-simd.md           |  10 +-
> >  gcc/config/aarch64/aarch64.cc                |  75 ++++++++++
> >  gcc/config/aarch64/constraints.md            |   7 +
> >  gcc/config/aarch64/predicates.md             |   3 +-
> >  gcc/testsuite/gcc.target/aarch64/fmov-1.c    | 149 +++++++++++++++++++
> >  gcc/testsuite/gcc.target/aarch64/fmov-2.c    |  90 +++++++++++
> >  gcc/testsuite/gcc.target/aarch64/fmov-be-1.c | 149 +++++++++++++++++++
> >  gcc/testsuite/gcc.target/aarch64/fmov-be-2.c |  90 +++++++++++
> >  9 files changed, 569 insertions(+), 6 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-2.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> > index 1ca86c9d175..c461fce8896 100644
> > --- a/gcc/config/aarch64/aarch64-protos.h
> > +++ b/gcc/config/aarch64/aarch64-protos.h
> > @@ -933,6 +933,7 @@ char *aarch64_output_simd_mov_imm (rtx, unsigned);
> >  char *aarch64_output_simd_orr_imm (rtx, unsigned);
> >  char *aarch64_output_simd_and_imm (rtx, unsigned);
> >  char *aarch64_output_simd_xor_imm (rtx, unsigned);
> > +char *aarch64_output_fmov (rtx);
> >
> >  char *aarch64_output_sve_mov_immediate (rtx);
> >  char *aarch64_output_sve_ptrues (rtx);
> > @@ -948,6 +949,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
> >  bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
> >  bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
> >  bool aarch64_simd_valid_and_imm (rtx);
> > +bool aarch64_simd_valid_and_imm_fmov (rtx, unsigned int * = NULL);
> >  bool aarch64_simd_valid_mov_imm (rtx);
> >  bool aarch64_simd_valid_orr_imm (rtx);
> >  bool aarch64_simd_valid_xor_imm (rtx);
> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > index e2afe87e513..e051e6459a5 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -1117,17 +1117,17 @@ (define_insn "fabd<mode>3<vczle><vczbe>"
> >    [(set_attr "type" "neon_fp_abd_<stype><q>")]
> >  )
> >
> > -;; For AND (vector, register) and BIC (vector, immediate)
> > +;; For AND (vector, register), BIC (vector, immediate) and FMOV (register)
> >  (define_insn "and<mode>3<vczle><vczbe>"
> >    [(set (match_operand:VDQ_I 0 "register_operand")
> >     (and:VDQ_I (match_operand:VDQ_I 1 "register_operand")
> >                (match_operand:VDQ_I 2 "aarch64_reg_or_and_imm")))]
> >    "TARGET_SIMD"
> > -  {@ [ cons: =0 , 1 , 2   ]
> > -     [ w        , w , w   ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> > -     [ w        , 0 , Db  ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
> > +  {@ [ cons: =0 , 1 , 2  ; attrs: type   ]
> > +     [ w        , w , w  ; neon_logic<q> ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> > +     [ w        , 0 , Db ; neon_logic<q> ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
> > +     [ w        , w , Df ; fmov          ] << aarch64_output_fmov (operands[2]);
> 
> I think the fmov alternative should have priority over the AND immediate
> alternative, so that we use fmov even for SVE.
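> 
> Something like the following ordering (untested sketch, reusing the
> alternatives from the patch):
> 
>   {@ [ cons: =0 , 1 , 2  ; attrs: type   ]
>      [ w        , w , w  ; neon_logic<q> ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
>      [ w        , w , Df ; fmov          ] << aarch64_output_fmov (operands[2]);
>      [ w        , 0 , Db ; neon_logic<q> ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
>   }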
> 
> >    }
> > -  [(set_attr "type" "neon_logic<q>")]
> >  )
> >
> >  ;; For ORR (vector, register) and ORR (vector, immediate)
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 38c112cc92f..54895e3f456 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23516,6 +23516,61 @@ aarch64_simd_valid_and_imm (rtx op)
> >    return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND);
> >  }
> >
> > +/* Return true if OP is a valid SIMD and immediate which allows the and
> > +   to be optimized as fmov.  If ELT_SIZE is nonnull, it represents the
> > +   size of the register for fmov.  */
> > +bool
> > +aarch64_simd_valid_and_imm_fmov (rtx op, unsigned int *elt_size)
> > +{
> > +  machine_mode mode = GET_MODE (op);
> > +  gcc_assert (!aarch64_sve_mode_p (mode));
> > +
> > +  auto_vec<target_unit, 128> buffer;
> 
> 16 should be enough for Advanced SIMD vectors.
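> 
> That is (sketch):
> 
>   auto_vec<target_unit, 16> buffer;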
> 
> > +  unsigned int n_bytes
> > +      = GET_MODE_BITSIZE (mode).to_constant () / BITS_PER_UNIT;
> 
> This can be simplified to GET_MODE_SIZE (mode)
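> 
> i.e. (sketch; the non-SVE modes used here always have a constant size):
> 
>   unsigned int n_bytes = GET_MODE_SIZE (mode).to_constant ();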
> 
> > +  buffer.reserve (n_bytes);
> > +
> > +  bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
> > +  gcc_assert (ok);
> > +
> > +  unsigned int n_ones = 0;
> > +  unsigned int i = 0;
> > +  auto idx = [] (unsigned int n, unsigned int i)
> > +    {
> > +      return BYTES_BIG_ENDIAN ? n - i - 1 : i;
> > +    };
> > +
> > +  /* Find the number of consecutive 0xFFs at the beginning of the AND
> > +     mask.  */
> > +  for (; i < n_bytes; i++)
> > +    {
> > +      if (i > 8)
> > +   return false;
> > +
> > +      if (buffer[idx (n_bytes, i)] != 0xff)
> > +   break;
> > +    }
> > +
> > +  n_ones = i;
> > +  /* Bail out if the AND mask is not valid.  The valid n_ones are 2 (h
with
> > +     TARGET_SIMD_F16INST), 4 (s) and 8 (d).  */  if ((n_ones != 2 &&
> > + n_ones != 4 && n_ones != 8)
> > +      || (n_ones == 2 && !TARGET_SIMD_F16INST))
> > +    return false;
> > +
> > +  /* The rest of the AND mask must be all 0s.  */
> > +  for (; i < n_bytes; i++)
> > +    if (buffer[idx (n_bytes, i)] != 0)
> > +      break;
> > +
> > +  if (i != n_bytes)
> > +    return false;
> > +
> > +  if (elt_size)
> > +    *elt_size = n_ones;
> > +
> > +  return true;
> > +}
> > +
> 
> With the decode_native_int patch I submitted here:
> 
>   https://gcc.gnu.org/pipermail/gcc-patches/2025-April/682003.html
> 
> this could be simplified to:
> 
>   bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
>   gcc_assert (ok);
> 
>   auto mask = native_decode_int (buffer, 0, n_bytes, n_bytes * BITS_PER_UNIT);
>   int set_bit = wi::exact_log2 (mask + 1);
>   if ((set_bit == 16 && TARGET_SIMD_F16INST)
>       || set_bit == 32
>       || set_bit == 64)
>     {
>       if (elt_size)
>       *elt_size = set_bit / BITS_PER_UNIT;
>       return true;
>     }
> 
>   return false;
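> 
> (This works because an fmov-able mask is 2^k - 1 with k in {16, 32, 64},
> so mask + 1 is a power of two and wi::exact_log2 recovers k; e.g. for
> { 0xffff, 0xffff, 0, 0 } the decoded mask is 0xffffffff and set_bit is 32.)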
> 
> >  /* Return true if OP is a valid SIMD xor immediate for SVE.  */
> >  bool
> >  aarch64_simd_valid_xor_imm (rtx op)
> > @@ -25642,6 +25697,26 @@ aarch64_float_const_representable_p (rtx x)
> >    return aarch64_real_float_const_representable_p (r);
> >  }
> >
> > +/* Returns the string with the fmov instruction which is equivalent to an
> > +   and instruction with the SIMD immediate CONST_VECTOR.  */
> > +char*
> > +aarch64_output_fmov (rtx const_vector)
> > +{
> > +  bool is_valid;
> > +  static char templ[40];
> > +  char element_char;
> > +  unsigned int elt_size;
> > +
> > +  is_valid = aarch64_simd_valid_and_imm_fmov (const_vector, &elt_size);
> > +  gcc_assert (is_valid);
> > +
> > +  element_char = sizetochar (elt_size * BITS_PER_UNIT);
> > +  snprintf (templ, sizeof (templ), "fmov\t%%%c0, %%%c1",
> > +       element_char, element_char);
> > +
> > +  return templ;
> > +}
> > +
> >  /* Returns the string with the instruction for the SIMD immediate
> >   * CONST_VECTOR of MODE and WIDTH.  WHICH selects a move, and(bic) or orr.  */
> >  char*
> > diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> > index e8321c4d2fb..253603c173a 100644
> > --- a/gcc/config/aarch64/constraints.md
> > +++ b/gcc/config/aarch64/constraints.md
> > @@ -466,6 +466,13 @@ (define_constraint "Do"
> >   (and (match_code "const_vector")
> >        (match_test "aarch64_simd_valid_orr_imm (op)")))
> >
> > +(define_constraint "Df"
> > +  "@internal
> > +   A constraint that matches vector of immediates for and which can be\
> 
> matches a vector
> 
> The trailing backslash shouldn't be necessary.
> 
> > +   optimized as fmov."
> > + (and (match_code "const_vector")
> > +      (match_test "aarch64_simd_valid_and_imm_fmov (op)")))
> > +
> >  (define_constraint "Db"
> >    "@internal
> >     A constraint that matches vector of immediates for and/bic."
> > diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> > index 1ab1c696c62..2c6af831eae 100644
> > --- a/gcc/config/aarch64/predicates.md
> > +++ b/gcc/config/aarch64/predicates.md
> > @@ -123,7 +123,8 @@ (define_predicate "aarch64_reg_or_orr_imm"
> >  (define_predicate "aarch64_reg_or_and_imm"
> >     (ior (match_operand 0 "register_operand")
> >     (and (match_code "const_vector")
> > -        (match_test "aarch64_simd_valid_and_imm (op)"))))
> > +        (ior (match_test "aarch64_simd_valid_and_imm (op)")
> > +             (match_test "aarch64_simd_valid_and_imm_fmov (op)")))))
> >
> >  (define_predicate "aarch64_reg_or_xor_imm"
> >     (ior (match_operand 0 "register_operand")
> > diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-1.c
> > new file mode 100644
> > index 00000000000..d657dfdee88
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/fmov-1.c
> > @@ -0,0 +1,149 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> 
> Since this test is specific to little-endian, it might be better to call it
> fmov-1-le.c.  The options should include -mlittle-endian, so that the tests
> still pass on aarch64_be targets.
> 
> Same for fmov-2.c.
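> 
> For example, the header of the renamed fmov-1-le.c could look like
> (sketch):
> 
>   /* { dg-do compile } */
>   /* { dg-options "-O2 -mlittle-endian" } */
>   /* { dg-final { check-function-bodies "**" "" "" } } */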
> 
> Thanks,
> Richard
