Tamar Christina <tamar.christ...@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-sve.md 
> b/gcc/config/aarch64/aarch64-sve.md
> index 
> 7db2938bb84e04d066a7b07574e5cf344a3a8fb6..2cdc6338902216760622a39b14f0076994458c98
>  100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -8657,6 +8657,22 @@ (define_insn "@aarch64_sve_<perm_insn><mode>"
>    "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
>  )
>  
> +;; Special purpose permute used by the predicate generation instructions.
> +;; This version only accepts VNx16BI as input but can output as any predicate
> +;; type and will reinterpet the input registers as the type in operand 3.

I think it would be more accurate to say something like:

;; Special purpose permute used by the predicate generation instructions.
;; Unlike the normal permute patterns, these instructions operate on VNx16BI
;; regardless of the element size, so that all input and output bits are
;; well-defined.  Operand 3 then indicates the size of the permute.

> +(define_insn "@aarch64_sve_trn1_conv<mode>"
> +  [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
> +     (unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand" "Upa")
> +                      (match_operand:VNx16BI 2 "register_operand" "Upa")
> +                      (clobber
> +                       (match_operand:PRED_ALL 3 "register_operand" "=Upa"))

I don't think we need a register for operand 3.  We could just use the
CONST0_RTX of the mode:

   (match_operand:PRED_ALL 3 "aarch64_simd_imm_zero")

(no need for a constraint).

> +                     ]

Formatting nit: ] is usually on the previous line.

> +                     UNSPEC_TRN1_CONV))]
> +  "TARGET_SVE"
> +  "trn1\t%0.<PRED_ALL:Vetype>, %1.<PRED_ALL:Vetype>, %2.<PRED_ALL:Vetype>"
> +)
> +
> +

Just one blank line here (sorry for the nitpick).

>  ;; =========================================================================
>  ;; == Conversions
>  ;; =========================================================================
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 994fafc2dc857ca5c7f345e49b47cc7e7dcf5900..61337881bfd05dbf6e84ada6810b87fa36dc989d
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -5481,12 +5481,13 @@ aarch64_expand_sve_const_pred_trn (rtx target, 
> rtx_vector_builder &builder,
>       }
>      }
>  
> -  /* Emit the TRN1 itself.  */
> +  /* Emit the TRN1 itself.  We emit a TRN that will always take a
> +     input registers as VNx16BI but re-interpret the results to
> +     MODE.  */

Here too I think the output register mode is as important as the
input register mode, since we rely on all bits of the output being
well-defined.  How about something like:

  /* Emit the TRN1 itself.  We emit a TRN that operates on VNx16BI
     operands but permutes them as though they had mode MODE.  */

Thanks,
Richard

>    machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
> -  target = aarch64_target_reg (target, mode);
> -  emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
> -                           gen_lowpart (mode, a),
> -                           gen_lowpart (mode, b)));
> +  target = aarch64_target_reg (target, GET_MODE (a));
> +  rtx type_reg = gen_reg_rtx (mode);
> +  emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
>    return target;
>  }
>  
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 
> 5f5abd60525ba52fdb466e94a92ff4d011bee5cd..cac33ae812b382cd55611b0da8a6e9eac3a513c4
>  100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -649,6 +649,7 @@ (define_c_enum "unspec"
>      UNSPEC_UZP2Q     ; Used in aarch64-sve.md.
>      UNSPEC_ZIP1Q     ; Used in aarch64-sve.md.
>      UNSPEC_ZIP2Q     ; Used in aarch64-sve.md.
> +    UNSPEC_TRN1_CONV ; Used in aarch64-sve.md.
>      UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md.
>      UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md.
>      UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md.
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..525933863f7d67d76ba7afa4321346efa27ba000
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c
> @@ -0,0 +1,25 @@
> +/* { dg-additional-options "-O2 -fno-schedule-insns" } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "arm_sve.h"
> +
> +/*
> +** foo:
> +**        ptrue   (p[0-7])\.d, all
> +**        pfalse  (p[0-7])\.b
> +**        ptrue   (p[0-7])\.s, all
> +**        trn1    (p[0-7])\.d, \2\.d, \3\.d
> +**        trn1    \2\.d, \1\.d, \3\.d
> +**        faddv   (h[0-31]), \4\, (z[0-31]).h
> +**        faddv   (h[0-31]), \2\, \6\.h
> +**        str     \5, [x0]
> +**        str     \7, [x0, 2]
> +**        ret
> +*/
> +void foo(svfloat16_t in, float16_t *dst) {
> +  const svbool_t pg_q0 = svdupq_n_b16(1, 0, 1, 0, 0, 0, 0, 0);
> +  const svbool_t pg_f0 = svdupq_n_b16(1, 0, 0, 0, 0, 0, 0, 0);
> +  dst[0] = svaddv_f16(pg_f0, in);
> +  dst[1] = svaddv_f16(pg_q0, in);
> +}
> +

Reply via email to