[resending without HTML]

On 6/3/19 11:48 AM, Kyrill Tkachov wrote:

On 6/3/19 11:32 AM, James Greenhalgh wrote:
On Fri, May 10, 2019 at 10:32:22AM +0100, Kyrill Tkachov wrote:
Hi all,

This patch fixes the failing gcc.dg/vect/slp-reduc-sad-2.c testcase on
aarch64
by implementing a vec_init optab that can handle two half-width vectors
producing a full-width one
by concatenating them.

In the gcc.dg/vect/slp-reduc-sad-2.c case it's a V8QI reg concatenated
with a V8QI const_vector of zeroes.
This can be implemented efficiently using the aarch64_combinez pattern
that just loads a D-register to make
use of the implicit zero-extending semantics of that load.
Otherwise it concatenates the two vectors using aarch64_simd_combine.

With this patch I'm seeing the effect from richi's original patch that
added gcc.dg/vect/slp-reduc-sad-2.c on aarch64
and 525.x264_r improves by about 1.5%.

Bootstrapped and tested on aarch64-none-linux-gnu. Also tested on
aarch64_be-none-elf.

Ok for trunk?
I have a question on the patch. Otherwise, this is OK for trunk.

2019-10-05  Kyrylo Tkachov <kyrylo.tkac...@arm.com>

      PR tree-optimization/90332
      * config/aarch64/aarch64.c (aarch64_expand_vector_init):
      Handle VALS containing two vectors.
      * config/aarch64/aarch64-simd.md (*aarch64_combinez<mode>): Rename
      to...
      (@aarch64_combinez<mode>): ... This.
      (*aarch64_combinez_be<mode>): Rename to...
      (@aarch64_combinez_be<mode>): ... This.
      (vec_init<mode><Vhalf>): New define_expand.
      * config/aarch64/iterators.md (Vhalf): Handle V8HF.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0c2c17ed8269923723d066b250974ee1ff423d26..52c933cfdac20c5c566c13ae2528f039efda4c46 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15075,6 +15075,43 @@ aarch64_expand_vector_init (rtx target, rtx vals)
    rtx v0 = XVECEXP (vals, 0, 0);
    bool all_same = true;
  +  /* This is a special vec_init<M><N> where N is not an element mode but a +     vector mode with half the elements of M.  We expect to find two entries +     of mode N in VALS and we must put their concatentation into TARGET.  */ +  if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
Should you validate the two vector modes are actually half-size vectors here,
and not something unexpected?


From my reading it can't ever be anything but half-size vectors on this path (the optabs are only defined for such modes)

I'll add an assert to that effect, as if that changes in the optab, this will blow up.

And this is what I committed (added an assert)

Thanks for the review.

Kyrill


Thanks,

Kyrill


Thanks,
James


+    {
+      rtx lo = XVECEXP (vals, 0, 0);
+      rtx hi = XVECEXP (vals, 0, 1);
+      machine_mode narrow_mode = GET_MODE (lo);
+      gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
+      gcc_assert (narrow_mode == GET_MODE (hi));
+
+      /* When we want to concatenate a half-width vector with zeroes we can
+     use the aarch64_combinez[_be] patterns.  Just make sure that the
+     zeroes are in the right half.  */
+      if (BYTES_BIG_ENDIAN
+      && aarch64_simd_imm_zero (lo, narrow_mode)
+      && general_operand (hi, narrow_mode))
+    emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
+      else if (!BYTES_BIG_ENDIAN
+           && aarch64_simd_imm_zero (hi, narrow_mode)
+           && general_operand (lo, narrow_mode))
+    emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
+      else
+    {
+      /* Else create the two half-width registers and combine them.  */
+      if (!REG_P (lo))
+        lo = force_reg (GET_MODE (lo), lo);
+      if (!REG_P (hi))
+        hi = force_reg (GET_MODE (hi), hi);
+
+      if (BYTES_BIG_ENDIAN)
+        std::swap (lo, hi);
+      emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
+    }
+     return;
+   }
+
    /* Count the number of variable elements to initialise. */
    for (int i = 0; i < n_elts; ++i)
      {
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9faec77a8c58b3722bc5906135bc927010c94011..4e12e8c71fb11c654336b773f85e3e764b85dbf6 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3226,7 +3226,7 @@
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
-(define_insn "*aarch64_combinez<mode>"
+(define_insn "@aarch64_combinez<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
 	(vec_concat:<VDBL>
 	  (match_operand:VDC 1 "general_operand" "w,?r,m")
@@ -3240,7 +3240,7 @@
    (set_attr "arch" "simd,fp,simd")]
 )
 
-(define_insn "*aarch64_combinez_be<mode>"
+(define_insn "@aarch64_combinez_be<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
         (vec_concat:<VDBL>
 	  (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")
@@ -5969,6 +5969,15 @@
   DONE;
 })
 
+(define_expand "vec_init<mode><Vhalf>"
+  [(match_operand:VQ_NO2E 0 "register_operand" "")
+   (match_operand 1 "" "")]
+  "TARGET_SIMD"
+{
+  aarch64_expand_vector_init (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_simd_ld1r<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_duplicate:VALL_F16
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b6df3e6ff6dbdaf0b1c7e646fbf7422fe995b263..bffb7e255df3e9709b16c14c75756a22563d850a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15112,6 +15112,45 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
 
+  /* This is a special vec_init<M><N> where N is not an element mode but a
+     vector mode with half the elements of M.  We expect to find two entries
+     of mode N in VALS and we must put their concatentation into TARGET.  */
+  if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
+    {
+      gcc_assert (known_eq (GET_MODE_SIZE (mode),
+		  2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
+      rtx lo = XVECEXP (vals, 0, 0);
+      rtx hi = XVECEXP (vals, 0, 1);
+      machine_mode narrow_mode = GET_MODE (lo);
+      gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
+      gcc_assert (narrow_mode == GET_MODE (hi));
+
+      /* When we want to concatenate a half-width vector with zeroes we can
+	 use the aarch64_combinez[_be] patterns.  Just make sure that the
+	 zeroes are in the right half.  */
+      if (BYTES_BIG_ENDIAN
+	  && aarch64_simd_imm_zero (lo, narrow_mode)
+	  && general_operand (hi, narrow_mode))
+	emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
+      else if (!BYTES_BIG_ENDIAN
+	       && aarch64_simd_imm_zero (hi, narrow_mode)
+	       && general_operand (lo, narrow_mode))
+	emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
+      else
+	{
+	  /* Else create the two half-width registers and combine them.  */
+	  if (!REG_P (lo))
+	    lo = force_reg (GET_MODE (lo), lo);
+	  if (!REG_P (hi))
+	    hi = force_reg (GET_MODE (hi), hi);
+
+	  if (BYTES_BIG_ENDIAN)
+	    std::swap (lo, hi);
+	  emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
+	}
+     return;
+   }
+
   /* Count the number of variable elements to initialise.  */
   for (int i = 0; i < n_elts; ++i)
     {
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e51b6c0cba3df92ae86c5044232d3ca6fd9a713d..bb04ebd605dd9ddcde23adebade866c2a8aa1026 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -771,6 +771,7 @@
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 			 (V4HI "v2hi")  (V8HI  "v4hi")
+			 (V8HF  "v4hf")
 			 (V2SI "si")    (V4SI  "v2si")
 			 (V2DI "di")    (V2SF  "sf")
 			 (V4SF "v2sf")  (V2DF  "df")])

Reply via email to