Tamar Christina <tamar.christ...@arm.com> writes: > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index > 7db2938bb84e04d066a7b07574e5cf344a3a8fb6..2cdc6338902216760622a39b14f0076994458c98 > 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -8657,6 +8657,22 @@ (define_insn "@aarch64_sve_<perm_insn><mode>" > "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>" > ) > > +;; Special purpose permute used by the predicate generation instructions. > +;; This version only accepts VNx16BI as input but can output as any predicate > +;; type and will reinterpet the input registers as the type in operand 3.
I think it would be more accurate to say something like: ;; Special purpose permute used by the predicate generation instructions. ;; Unlike the normal permute patterns, these instructions operate on VNx16BI ;; regardless of the element size, so that all input and output bits are ;; well-defined. Operand 3 then indicates the size of the permute. > +(define_insn "@aarch64_sve_trn1_conv<mode>" > + [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") > + (unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand" "Upa") > + (match_operand:VNx16BI 2 "register_operand" "Upa") > + (clobber > + (match_operand:PRED_ALL 3 "register_operand" "=Upa")) I don't think we need a register for operand 3. We could just use the CONST0_RTX of the mode: (match_operand:PRED_ALL 3 "aarch64_simd_imm_zero") (no need for a constraint). > + ] Formatting nit: ] is usually on the previous line. > + UNSPEC_TRN1_CONV))] > + "TARGET_SVE" > + "trn1\t%0.<PRED_ALL:Vetype>, %1.<PRED_ALL:Vetype>, %2.<PRED_ALL:Vetype>" > +) > + > + Just one blank line here (sorry for the nitpick). > ;; ========================================================================= > ;; == Conversions > ;; ========================================================================= > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > 994fafc2dc857ca5c7f345e49b47cc7e7dcf5900..61337881bfd05dbf6e84ada6810b87fa36dc989d > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -5481,12 +5481,13 @@ aarch64_expand_sve_const_pred_trn (rtx target, > rtx_vector_builder &builder, > } > } > > - /* Emit the TRN1 itself. */ > + /* Emit the TRN1 itself. We emit a TRN that will always take a > + input registers as VNx16BI but re-interpret the results to > + MODE. */ Here too I think the output register mode is as important as the input register mode, since we rely on all bits of the output being well-defined. How about something like: /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI operands but permutes them as though they had mode MODE. */ Thanks, Richard > machine_mode mode = aarch64_sve_pred_mode (permute_size).require (); > - target = aarch64_target_reg (target, mode); > - emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target, > - gen_lowpart (mode, a), > - gen_lowpart (mode, b))); > + target = aarch64_target_reg (target, GET_MODE (a)); > + rtx type_reg = gen_reg_rtx (mode); > + emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg)); > return target; > } > > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > 5f5abd60525ba52fdb466e94a92ff4d011bee5cd..cac33ae812b382cd55611b0da8a6e9eac3a513c4 > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -649,6 +649,7 @@ (define_c_enum "unspec" > UNSPEC_UZP2Q ; Used in aarch64-sve.md. > UNSPEC_ZIP1Q ; Used in aarch64-sve.md. > UNSPEC_ZIP2Q ; Used in aarch64-sve.md. > + UNSPEC_TRN1_CONV ; Used in aarch64-sve.md. > UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md. > UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md. > UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md. > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c > b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..525933863f7d67d76ba7afa4321346efa27ba000 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c > @@ -0,0 +1,25 @@ > +/* { dg-additional-options "-O2 -fno-schedule-insns" } */ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ > + > +#include "arm_sve.h" > + > +/* > +** foo: > +** ptrue (p[0-7])\.d, all > +** pfalse (p[0-7])\.b > +** ptrue (p[0-7])\.s, all > +** trn1 (p[0-7])\.d, \2\.d, \3\.d > +** trn1 \2\.d, \1\.d, \3\.d > +** faddv (h[0-31]), \4\, (z[0-31]).h > +** faddv (h[0-31]), \2\, \6\.h > +** str \5, [x0] > +** str \7, [x0, 2] > +** ret > +*/ > +void foo(svfloat16_t in, float16_t *dst) { > + const svbool_t pg_q0 = svdupq_n_b16(1, 0, 1, 0, 0, 0, 0, 0); > + const svbool_t pg_f0 = svdupq_n_b16(1, 0, 0, 0, 0, 0, 0, 0); > + dst[0] = svaddv_f16(pg_f0, in); > + dst[1] = svaddv_f16(pg_q0, in); > +} > +