Spencer Abson <spencer.ab...@arm.com> writes:
> @@ -9487,21 +9489,39 @@
>
>  ;; - FCVTZU
>  ;; -------------------------------------------------------------------------
>
> -;; Unpredicated conversion of floats to integers of the same size (HF to HI,
> -;; SF to SI or DF to DI).
> -(define_expand "<optab><mode><v_int_equiv>2"
> -  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
> -        (unspec:<V_INT_EQUIV>
> +;; Unpredicated conversion of floats to integers of the same size or wider,
> +;; excluding conversions from DF (see below).
> +(define_expand "<optab><SVE_HSF:mode><SVE_HSDI:mode>2"
> +  [(set (match_operand:SVE_HSDI 0 "register_operand")
> +        (unspec:SVE_HSDI
> +          [(match_dup 2)
> +           (match_dup 3)
> +           (match_operand:SVE_HSF 1 "register_operand")]
> +          SVE_COND_FCVTI))]
> +  "TARGET_SVE
> +   && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_HSF:self_mask>) == 0"
> +  {
> +    operands[2] = aarch64_sve_fp_pred (<SVE_HSDI:MODE>mode, &operands[3]);
> +  }
> +)
> +
> +;; SI <- DF can't use SI <- trunc (DI <- DF) without -ffast-math, so this
> +;; truncating variant of FCVTZ{S,U} is useful for auto-vectorization.
> +;;
> +;; DF is the only source mode for which the mask used above doesn't apply,
> +;; so we define a separate pattern for it here.
> +(define_expand "<optab><VNx2DF_ONLY:mode><SVE_2SDI:mode>2"
> +  [(set (match_operand:SVE_2SDI 0 "register_operand")
> +        (unspec:SVE_2SDI
>            [(match_dup 2)
>             (const_int SVE_RELAXED_GP)
> -           (match_operand:SVE_FULL_F 1 "register_operand")]
> +           (match_operand:VNx2DF_ONLY 1 "register_operand")]
>            SVE_COND_FCVTI))]
>    "TARGET_SVE"
>    {
> -    operands[2] = aarch64_ptrue_reg (<VPRED>mode);
> +    operands[2] = aarch64_ptrue_reg (VNx2BImode);
>    }
>  )
> -

Sorry for the formatting nit, but: please keep the blank line.
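FWIW, a minimal sketch of the kind of loop that the new truncating
VNx2DF expand targets (the function name here is made up, not taken
from the patch):

  #include <stdint.h>

  /* Without -ffast-math, vectorizing this needs a direct truncating
     .s <- .d FCVTZ{S,U}; it can't be done as SI <- trunc (DI <- DF).  */
  void
  f64_to_i32 (int32_t *restrict y, double *restrict x, int n)
  {
    for (int i = 0; i < n; i++)
      y[i] = (int32_t) x[i];
  }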
>  ;; Predicated float-to-integer conversion, either to the same width or wider.
>  (define_insn "@aarch64_sve_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>"
>    [(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
> @@ -9517,18 +9537,34 @@
>    }
>  )
>
> +;; As above, for pairs used by the auto-vectorizer only.
> +(define_insn "*aarch64_sve_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>"
> +  [(set (match_operand:SVE_HSDI 0 "register_operand")
> +        (unspec:SVE_HSDI
> +          [(match_operand:<SVE_HSDI:VPRED> 1 "aarch64_predicate_operand")
> +           (match_operand:SI 3 "aarch64_sve_gp_strictness")
> +           (match_operand:SVE_PARTIAL_F 2 "register_operand")]
> +          SVE_COND_FCVTI))]
> +  "TARGET_SVE
> +   && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
> +  {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> +     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
> +     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
> +  }
> +)
> +
>  ;; Predicated narrowing float-to-integer conversion.

I think it would be worth extending the comment here, in case it isn't
obvious what's going on:

;; Predicated narrowing float-to-integer conversion.  The VNx2DF->VNx4SI
;; variant is provided for the ACLE, where the zeroed odd-indexed lanes are
;; significant.  The VNx2DF->VNx2SI variant is provided for autovectorization,
;; where the odd-indexed lanes are ignored.

> -(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><VNx4SI_ONLY:mode>"
> -  [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
> -        (unspec:VNx4SI_ONLY
> +(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><SVE_SI:mode>"
> +  [(set (match_operand:SVE_SI 0 "register_operand")
> +        (unspec:SVE_SI
>            [(match_operand:VNx2BI 1 "register_operand")
>             (match_operand:SI 3 "aarch64_sve_gp_strictness")
>             (match_operand:VNx2DF_ONLY 2 "register_operand")]
>            SVE_COND_FCVTI))]
>    "TARGET_SVE"
>    {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> -     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> -     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> +     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> +     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
>    }
>  )
>
> [...]
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 59ac08483f4..b13fce2a859 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -3855,6 +3855,44 @@ aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
>    return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
>  }
>
> +
> +/* Generate a predicate to control partial SVE mode DATA_MODE as if it
> +   were fully packed, enabling the defined elements only.  */
> +rtx
> +aarch64_sve_packed_pred (machine_mode data_mode)
> +{
> +  unsigned int container_bytes
> +    = aarch64_sve_container_bits (data_mode) / BITS_PER_UNIT;
> +  /* Enable the significand of each container only.  */
> +  rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all (container_bytes));
> +  /* Predicate at the element size.  */
> +  machine_mode pmode
> +    = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (data_mode)).require ();
> +  return gen_lowpart (pmode, ptrue);
> +}
> +
> +/* Generate a predicate and strictness value to govern a floating-point
> +   operation with SVE mode DATA_MODE.
> +
> +   If DATA_MODE is partial vector mode, this pair prevents the operation

is a partial vector mode

> +   from interpreting undefined elements - unless we don't need to control
> +   it's trapping behavior.  */

its

> +rtx
> +aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
> +{
> +  unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
> +  if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
> +    {
> +      if (strictness)
> +        *strictness = gen_int_mode (SVE_STRICT_GP, SImode);
> +      return aarch64_sve_packed_pred (data_mode);
> +    }
> +  if (strictness)
> +    *strictness = gen_int_mode (SVE_RELAXED_GP, SImode);
> +  /* Use the natural predicate mode.  */
> +  return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
> +}
> +
>  /* Emit a comparison CMP between OP0 and OP1, both of which have mode
>     DATA_MODE, and return the result in a predicate of mode PRED_MODE.
>     Use TARGET as the target register if nonnull and convenient.  */
> [...]
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index 2c6af831eae..ed36cd72ea3 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -587,6 +587,10 @@
>      return aarch64_simd_shift_imm_p (op, mode, false);
>  })
>
> +(define_special_predicate "aarch64_predicate_operand"
> +  (and (match_test "register_operand (op, GET_MODE (op))")
> +       (match_test "aarch64_sve_valid_pred_p (op, mode)")))

Since this is (rightly) invoking register_operand directly, rather than using:

  (match_operand 0 "register_operand")

(which would pass the wrong mode), the gen* tools won't be able to infer
that the predicate only accepts reg and subreg.  So I think we should
make it:

  (and (match_code "reg,subreg")
       (match_test "register_operand (op, GET_MODE (op))")
       (match_test "aarch64_sve_valid_pred_p (op, mode)")))

> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c
> new file mode 100644
> index 00000000000..115182faf3a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c
> @@ -0,0 +1,217 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns2" } */

Might be good to pass -fno-schedule-insns too, in case it gets enabled
again by default (e.g. if it is finally tuned for OoO cores, and if the
compile-time problems are addressed).  Same for the other tests in the
series.
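That is, something like:

  /* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns -fno-schedule-insns2" } */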
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c
> new file mode 100644
> index 00000000000..964264c4114
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#include <stdint.h>
> +
> +void f64_i32 (double *restrict x, int32_t *restrict y, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    x[i] = (double)y[i];
> +}
> +
> +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */

I think we should accept \.[sd] for the source, since IMO it would be
more natural to do this without the sign-extension to 64 bits.  Or have
I misunderstood the point of the test?
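If not, that would be:

  /* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.[sd]\n} 1 } } */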
Looks really good otherwise, so ok with those changes.

Thanks,
Richard