Spencer Abson <spencer.ab...@arm.com> writes:
> @@ -9487,21 +9489,39 @@
>  ;; - FCVTZU
>  ;; -------------------------------------------------------------------------
>  
> -;; Unpredicated conversion of floats to integers of the same size (HF to HI,
> -;; SF to SI or DF to DI).
> -(define_expand "<optab><mode><v_int_equiv>2"
> -  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
> -     (unspec:<V_INT_EQUIV>
> +;; Unpredicated conversion of floats to integers of the same size or wider,
> +;; excluding conversions from DF (see below).
> +(define_expand "<optab><SVE_HSF:mode><SVE_HSDI:mode>2"
> +  [(set (match_operand:SVE_HSDI 0 "register_operand")
> +     (unspec:SVE_HSDI
> +       [(match_dup 2)
> +        (match_dup 3)
> +        (match_operand:SVE_HSF 1 "register_operand")]
> +       SVE_COND_FCVTI))]
> +  "TARGET_SVE
> +   && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_HSF:self_mask>) == 0"
> +  {
> +    operands[2] = aarch64_sve_fp_pred (<SVE_HSDI:MODE>mode, &operands[3]);
> +  }
> +)
> +
> +;; SI <- DF can't use SI <- trunc (DI <- DF) without -ffast-math, so this
> +;; truncating variant of FCVTZ{S,U} is useful for auto-vectorization.
> +;;
> +;; DF is the only source mode for which the mask used above doesn't apply,
> +;; so we define a separate pattern for it here.
> +(define_expand "<optab><VNx2DF_ONLY:mode><SVE_2SDI:mode>2"
> +  [(set (match_operand:SVE_2SDI 0 "register_operand")
> +     (unspec:SVE_2SDI
>         [(match_dup 2)
>          (const_int SVE_RELAXED_GP)
> -        (match_operand:SVE_FULL_F 1 "register_operand")]
> +        (match_operand:VNx2DF_ONLY 1 "register_operand")]
>         SVE_COND_FCVTI))]
>    "TARGET_SVE"
>    {
> -    operands[2] = aarch64_ptrue_reg (<VPRED>mode);
> +    operands[2] = aarch64_ptrue_reg (VNx2BImode);
>    }
>  )
> -

Sorry for the formatting nit, but: please keep the blank line.

>  ;; Predicated float-to-integer conversion, either to the same width or wider.
>  (define_insn "@aarch64_sve_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>"
>    [(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
> @@ -9517,18 +9537,34 @@
>    }
>  )
>  
> +;; As above, for pairs used by the auto-vectorizer only.
> +(define_insn "*aarch64_sve_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>"
> +  [(set (match_operand:SVE_HSDI 0 "register_operand")
> +     (unspec:SVE_HSDI
> +       [(match_operand:<SVE_HSDI:VPRED> 1 "aarch64_predicate_operand")
> +        (match_operand:SI 3 "aarch64_sve_gp_strictness")
> +        (match_operand:SVE_PARTIAL_F 2 "register_operand")]
> +       SVE_COND_FCVTI))]
> +   "TARGET_SVE
> +   && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
> +  {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> +     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
> +     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
> +  }
> +)
> +
>  ;; Predicated narrowing float-to-integer conversion.

I think it would be worth extending the comment here, in case it isn't
obvious what's going on:

;; Predicated narrowing float-to-integer conversion.  The VNx2DF->VNx4SI
;; variant is provided for the ACLE, where the zeroed odd-indexed lanes are
;; significant.  The VNx2DF->VNx2SI variant is provided for autovectorization,
;; where the odd-indexed lanes are ignored.
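
For reference, the kind of autovectorization input I have in mind is a
narrowing loop along the lines of the following untested sketch, where
nothing reads the odd-indexed lanes of the VNx2SI result:

  #include <stdint.h>

  void
  f (int32_t *restrict x, double *restrict y, int n)
  {
    for (int i = 0; i < n; i++)
      x[i] = (int32_t) y[i];
  }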

> -(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><VNx4SI_ONLY:mode>"
> -  [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
> -     (unspec:VNx4SI_ONLY
> +(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><SVE_SI:mode>"
> +  [(set (match_operand:SVE_SI 0 "register_operand")
> +     (unspec:SVE_SI
>         [(match_operand:VNx2BI 1 "register_operand")
>          (match_operand:SI 3 "aarch64_sve_gp_strictness")
>          (match_operand:VNx2DF_ONLY 2 "register_operand")]
>         SVE_COND_FCVTI))]
>    "TARGET_SVE"
>    {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> -     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> -     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> +     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> +     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
>    }
>  )
>  
> [...]
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 59ac08483f4..b13fce2a859 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -3855,6 +3855,44 @@ aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
>    return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
>  }
>  
> +
> +/* Generate a predicate to control partial SVE mode DATA_MODE as if it
> +   were fully packed, enabling the defined elements only.  */
> +rtx
> +aarch64_sve_packed_pred (machine_mode data_mode)
> +{
> +  unsigned int container_bytes
> +    = aarch64_sve_container_bits (data_mode) / BITS_PER_UNIT;
> +  /* Enable the significand of each container only.  */
> +  rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all (container_bytes));
> +  /* Predicate at the element size.  */
> +  machine_mode pmode
> +    = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (data_mode)).require ();
> +  return gen_lowpart (pmode, ptrue);
> +}
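
(FWIW, my mental model here: for a partial mode such as VNx2SI this
returns a VNx4BI predicate in which only the even-indexed .s lanes are
active, i.e. one lane per 64-bit container.)
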
> +
> +/* Generate a predicate and strictness value to govern a floating-point
> +   operation with SVE mode DATA_MODE.
> +
> +   If DATA_MODE is partial vector mode, this pair prevents the operation

is a partial vector mode

> +   from interpreting undefined elements - unless we don't need to control
> +   it's trapping behavior.  */

its

> +rtx
> +aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
> +{
> +  unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
> +  if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
> +    {
> +      if (strictness)
> +        *strictness = gen_int_mode (SVE_STRICT_GP, SImode);
> +      return aarch64_sve_packed_pred (data_mode);
> +    }
> +  if (strictness)
> +    *strictness = gen_int_mode (SVE_RELAXED_GP, SImode);
> +  /* Use the natural predicate mode.  */
> +  return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
> +}
> +
>  /* Emit a comparison CMP between OP0 and OP1, both of which have mode
>     DATA_MODE, and return the result in a predicate of mode PRED_MODE.
>     Use TARGET as the target register if nonnull and convenient.  */
> [...]
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index 2c6af831eae..ed36cd72ea3 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -587,6 +587,10 @@
>    return aarch64_simd_shift_imm_p (op, mode, false);
>  })
>  
> +(define_special_predicate "aarch64_predicate_operand"
> +  (and (match_test "register_operand (op, GET_MODE (op))")
> +       (match_test "aarch64_sve_valid_pred_p (op, mode)")))

Since this is (rightly) invoking register_operand directly,
rather than using:

  (match_operand 0 "register_operand")

(which would pass the wrong mode), the gen* tools won't be able to infer that
the predicate only accepts reg and subreg.  So I think we should make it:

  (and (match_code "reg,subreg")
       (match_test "register_operand (op, GET_MODE (op))")
       (match_test "aarch64_sve_valid_pred_p (op, mode)")))

> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c
> new file mode 100644
> index 00000000000..115182faf3a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c
> @@ -0,0 +1,217 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns2" } */

Might be good to pass -fno-schedule-insns too, in case it gets enabled
again by default (e.g. if it is finally tuned for OoO cores, and if the
compile-time problems are addressed).
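
I.e. something like:

  /* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns -fno-schedule-insns2" } */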

Same for other tests in the series.

> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c
> new file mode 100644
> index 00000000000..964264c4114
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#include <stdint.h>
> +
> +void f64_i32 (double *restrict x, int32_t  *restrict y, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    x[i] = (double)y[i];
> +}
> +
> +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */

I think we should accept \.[sd] for the source, since IMO it would be
more natural to do this without the sign-extension to 64 bits.
Or have I misunderstood the point of the test?
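
I.e. relaxing it to:

  /* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.[sd]\n} 1 } } */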

Looks really good otherwise, so ok with those changes.

Thanks,
Richard
