Tamar Christina <[email protected]> writes:
> Hi All,
>
> This patch adds support for vector constructors that combine two partial SVE
> vectors into a full SVE vector.  It also implements support for the standard
> vec_init optab to do this.
>
> gcc/ChangeLog:
>
> PR target/96342
> * config/aarch64/aarch64-sve.md (vec_init<mode><Vhalf>): New.
> (@aarch64_pack_partial<mode>): New.
> * config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init): Special
> case constructors of two vectors.
> * config/aarch64/iterators.md (SVE_NO2E, SVE_PARTIAL_NO2E): New.
> (VHALF, Vhalf, Vwstype): Add SVE partial vectors.
>
> gcc/testsuite/ChangeLog:
>
> PR target/96342
> * gcc.target/aarch64/vect-simd-clone-2.c: New test.
This triggers an ICE for:

typedef unsigned int v8si __attribute__((vector_size(32)));
typedef unsigned int v16si __attribute__((vector_size(64)));

v16si __GIMPLE
foo (v8si x, v8si y)
{
  v16si res;
  res = _Literal (v16si) { x, y };
  return res;
}

compiled with -O2 -march=armv8-a+sve -msve-vector-bits=512 -fgimple.
Suggested fix below.
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu (-m32 and -m64)
> with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 9afd11d347626eeb640722fdba2ab763b8479aa7..9e3577be6e943d7a5c951196463873d4bcfee07c 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2840,6 +2840,16 @@ (define_expand "vec_init<mode><Vel>"
> }
> )
>
> +(define_expand "vec_init<mode><Vhalf>"
> + [(match_operand:SVE_NO2E 0 "register_operand")
> + (match_operand 1 "")]
Nit: excess indentation.
> + "TARGET_SVE"
> + {
> + aarch64_sve_expand_vector_init (operands[0], operands[1]);
> + DONE;
> + }
> +)
> +
> ;; Shift an SVE vector left and insert a scalar into element 0.
> (define_insn "vec_shl_insert_<mode>"
> [(set (match_operand:SVE_FULL 0 "register_operand")
> @@ -9347,6 +9357,20 @@ (define_insn "vec_pack_trunc_<Vwide>"
> "uzp1\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
> )
>
> +;; Integer partial pack, packing two partial SVE types into a single full
> +;; SVE type of the same element type.  Use UZP1 on the wider type, which
> +;; discards the high part of each wide element.  This allows two SVE partial
> +;; types to be concatenated into a wider vector.
> +(define_insn "@aarch64_pack_partial<mode>"
> + [(set (match_operand:SVE_PARTIAL_NO2E 0 "register_operand" "=w")
> + (unspec:SVE_PARTIAL_NO2E
> + [(match_operand:<VHALF> 1 "register_operand" "w")
> + (match_operand:<VHALF> 2 "register_operand" "w")]
> + UNSPEC_PACK))]
> + "TARGET_SVE"
> + "uzp1\t%0.<Vwstype>, %1.<Vwstype>, %2.<Vwstype>"
> +)
> +
To fix the ICE above, I think we should define this pattern for
all SVE_NO2E. We can also make it a vec_concat, which should work
for both endiannesses.
Rather than use Vwstype, I think this is conceptually a permute of the
containers, so should use Vctype. That will change VNx4QI from using .h
(as in the patch) to .s (to match VNx4SI), but both work.
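Concretely, I was thinking of something like this (untested sketch,
using vec_concat and Vctype as suggested above):

(define_insn "@aarch64_pack_partial<mode>"
  [(set (match_operand:SVE_NO2E 0 "register_operand" "=w")
        (vec_concat:SVE_NO2E
          (match_operand:<VHALF> 1 "register_operand" "w")
          (match_operand:<VHALF> 2 "register_operand" "w")))]
  "TARGET_SVE"
  "uzp1\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
)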
> ;; -------------------------------------------------------------------------
> ;; ---- [INT<-INT] Unpacks
> ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index af6fede102c2be6673c24f8020d000ea56322997..690d54b0a2954327e00d559f96c414c81c2e18cd 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24790,6 +24790,17 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
> v.quick_push (XVECEXP (vals, 0, i));
> v.finalize ();
>
> + /* If we have two elements and are concatenating vectors.  */
> + machine_mode elem_mode = GET_MODE (v.elt (0));
> + if (nelts == 2 && VECTOR_MODE_P (elem_mode))
> + {
> + /* We've failed expansion using a dup.  Try using a cheeky truncate.  */
> + rtx arg0 = force_reg (elem_mode, v.elt (0));
> + rtx arg1 = force_reg (elem_mode, v.elt (1));
> + emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> + return;
> + }
> +
I think it'd be better to use an independent routine for this,
since there's not really any overlap with the scalar-element code.
In particular, we might as well get the vectors directly from
XVECEXP (vals, 0, ...), since we don't need the rtx_vector_builder
for the expansion.
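For instance (untested, and the routine name is just a placeholder):

/* Initialize TARGET from the two vector parts in the PARALLEL rtx VALS,
   taking the parts directly from XVECEXP rather than going through an
   rtx_vector_builder.  */
static void
aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  gcc_assert (XVECLEN (vals, 0) == 2);

  machine_mode sub_mode = GET_MODE (XVECEXP (vals, 0, 0));
  rtx arg0 = force_reg (sub_mode, XVECEXP (vals, 0, 0));
  rtx arg1 = force_reg (sub_mode, XVECEXP (vals, 0, 1));
  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
}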
> /* If neither sub-vectors of v could be initialized specially,
> then use INSR to insert all elements from v into TARGET.
> ??? This might not be optimal for vectors with large
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 023893d35f3e955e222c322ce370e84c95c29ee6..77d23d6ad795630d3d5fb5c076c086a479d46fee 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -138,6 +138,14 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
> ;; VQ without 2 element modes.
> (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
>
> +;; SVE modes without 2 element modes.
> +(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
> + VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
> +
> +;; Partial SVE modes without 2 element modes.
> +(define_mode_iterator SVE_PARTIAL_NO2E [VNx8QI VNx4QI VNx4HI
> + VNx4HF VNx8BF VNx4BF])
> +
> ;; 2 element quad vector modes.
> (define_mode_iterator VQ_2E [V2DI V2DF])
>
> @@ -1678,7 +1686,15 @@ (define_mode_attr VHALF [(V8QI "V4QI") (V16QI "V8QI")
> (V2DI "DI") (V2SF "SF")
> (V4SF "V2SF") (V4HF "V2HF")
> (V8HF "V4HF") (V2DF "DF")
> - (V8BF "V4BF")])
> + (V8BF "V4BF")
> + (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
> + (VNx4QI "VNx2QI") (VNx2QI "QI")
> + (VNx8HI "VNx4HI") (VNx4HI "VNx2HI") (VNx2HI "HI")
> + (VNx8HF "VNx4HF") (VNx4HF "VNx2HF") (VNx2HF "HF")
> + (VNx8BF "VNx4BF") (VNx4BF "VNx2BF") (VNx2BF "BF")
> + (VNx4SI "VNx2SI") (VNx2SI "SI")
> + (VNx4SF "VNx2SF") (VNx2SF "SF")
> + (VNx2DI "DI") (VNx2DF "DF")])
Are the x2 entries necessary, given that the new uses are restricted
to NO2E?
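(If not, it would presumably be enough to add just the modes that
SVE_NO2E iterates over, e.g. for VHALF:

  (VNx16QI "VNx8QI") (VNx8QI "VNx4QI") (VNx4QI "VNx2QI")
  (VNx8HI "VNx4HI") (VNx4HI "VNx2HI")
  (VNx8HF "VNx4HF") (VNx4HF "VNx2HF")
  (VNx8BF "VNx4BF") (VNx4BF "VNx2BF")
  (VNx4SI "VNx2SI") (VNx4SF "VNx2SF")

and similarly for the lower-case Vhalf entries.)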
Thanks,
Richard
> ;; Half modes of all vector modes, in lower-case.
> (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi")
> @@ -1686,7 +1702,15 @@ (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi")
> (V8HF "v4hf") (V8BF "v4bf")
> (V2SI "si") (V4SI "v2si")
> (V2DI "di") (V2SF "sf")
> - (V4SF "v2sf") (V2DF "df")])
> + (V4SF "v2sf") (V2DF "df")
> + (VNx16QI "vnx8qi") (VNx8QI "vnx4qi")
> + (VNx4QI "vnx2qi") (VNx2QI "qi")
> + (VNx8HI "vnx4hi") (VNx4HI "vnx2hi") (VNx2HI "hi")
> + (VNx8HF "vnx4hf") (VNx4HF "vnx2hf") (VNx2HF "hf")
> + (VNx8BF "vnx4bf") (VNx4BF "vnx2bf") (VNx2BF "bf")
> + (VNx4SI "vnx2si") (VNx2SI "si")
> + (VNx4SF "vnx2sf") (VNx2SF "sf")
> + (VNx2DI "di") (VNx2DF "df")])
>
> ;; Single-element half modes of quad vector modes.
> (define_mode_attr V1HALF [(V2DI "V1DI") (V2DF "V1DF")])
> @@ -1815,7 +1839,10 @@ (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
> ;; Widened scalar register suffixes.
> (define_mode_attr Vwstype [(V8QI "h") (V4HI "s")
> (V2SI "") (V16QI "h")
> - (V8HI "s") (V4SI "d")])
> + (V8HI "s") (V4SI "d")
> + (VNx8QI "h") (VNx4QI "h")
> + (VNx4HF "s") (VNx4HI "s")
> + (VNx8BF "h") (VNx4BF "h")])
> ;; Add a .1d for V2SI.
> (define_mode_attr Vwsuf [(V8QI "") (V4HI "")
> (V2SI ".1d") (V16QI "")
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a25cae2708dd18cc91a7732f845419bbdb06c5c1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-std=c99" } */
> +/* { dg-additional-options "-O3 -march=armv8-a" } */
> +
> +#pragma GCC target ("+sve")
> +extern char __attribute__ ((simd, const)) fn3 (int, char);
> +void test_fn3 (int *a, int *b, char *c, int n)
> +{
> + for (int i = 0; i < n; ++i)
> + a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
> +}
> +
> +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */