This patch uses predicated MLA, MLS, MAD and MSB to implement
conditional "FMA"s on integers.  This also requires providing
the unpredicated optabs (fma and fnma) since otherwise
tree-ssa-math-opts.c won't try to use the conditional forms.

We still want to use shifts and adds in preference to multiplications,
so the patch makes the optab expanders check for that.

The tests cover floating-point types too, which are already handled,
and which were already tested to some extent by gcc.dg/vect.

Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
Applied as r274509.

Richard


2019-08-15  Richard Sandiford  <richard.sandif...@arm.com>
            Kugan Vivekanandarajah  <kugan.vivekanandara...@linaro.org>

gcc/
        * config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
        (aarch64_prepare_sve_cond_int_fma): Declare.
        * config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
        (aarch64_prepare_sve_int_fma): New functions.
        (aarch64_prepare_sve_cond_int_fma): Likewise.
        * config/aarch64/aarch64-sve.md
        (cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
        (fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
        (*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
        (cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
        (*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
        (*madd<mode>): Rename to...
        (*fma<mode>4): ...this.
        (*msub<mode>): Rename to...
        (*fnma<mode>4): ...this.

gcc/testsuite/
        * gcc.target/aarch64/sve/cond_mla_1.c: New test.
        * gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
        * gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.

Index: gcc/config/aarch64/aarch64-protos.h
===================================================================
--- gcc/config/aarch64/aarch64-protos.h 2019-08-15 09:20:26.000000000 +0100
+++ gcc/config/aarch64/aarch64-protos.h 2019-08-15 09:20:53.712070358 +0100
@@ -630,6 +630,9 @@ bool aarch64_gen_adjusted_ldpstp (rtx *,
 void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
 bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
 void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *);
+
+bool aarch64_prepare_sve_int_fma (rtx *, rtx_code);
+bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code);
 #endif /* RTX_CODE */
 
 void aarch64_init_builtins (void);
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c        2019-08-15 09:20:26.000000000 +0100
+++ gcc/config/aarch64/aarch64.c        2019-08-15 09:20:53.712070358 +0100
@@ -16469,6 +16469,98 @@ aarch64_sve_expand_vector_init (rtx targ
     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
 }
 
+/* Check whether VALUE is a vector constant in which every element
+   is either a power of 2 or a negated power of 2.  If so, return
+   a constant vector of log2s, and flip CODE between PLUS and MINUS
+   if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
+
+static rtx
+aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
+{
+  if (GET_CODE (value) != CONST_VECTOR)
+    return NULL_RTX;
+
+  rtx_vector_builder builder;
+  if (!builder.new_unary_operation (GET_MODE (value), value, false))
+    return NULL_RTX;
+
+  scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
+  /* 1 if the result of the multiplication must be negated,
+     0 if it mustn't, or -1 if we don't yet care.  */
+  int negate = -1;
+  unsigned int encoded_nelts = const_vector_encoded_nelts (value);
+  for (unsigned int i = 0; i < encoded_nelts; ++i)
+    {
+      rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
+      if (!CONST_SCALAR_INT_P (elt))
+       return NULL_RTX;
+      rtx_mode_t val (elt, int_mode);
+      wide_int pow2 = wi::neg (val);
+      if (val != pow2)
+       {
+         /* It matters whether we negate or not.  Make that choice,
+            and make sure that it's consistent with previous elements.  */
+         if (negate == !wi::neg_p (val))
+           return NULL_RTX;
+         negate = wi::neg_p (val);
+         if (!negate)
+           pow2 = val;
+       }
+      /* POW2 is now the value that we want to be a power of 2.  */
+      int shift = wi::exact_log2 (pow2);
+      if (shift < 0)
+       return NULL_RTX;
+      builder.quick_push (gen_int_mode (shift, int_mode));
+    }
+  if (negate == -1)
+    /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
+    code = PLUS;
+  else if (negate == 1)
+    code = code == PLUS ? MINUS : PLUS;
+  return builder.build ();
+}
+
+/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
+   CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
+   operands array, in the same order as for fma_optab.  Return true if
+   the function emitted all the necessary instructions, false if the caller
+   should generate the pattern normally with the new OPERANDS array.  */
+
+bool
+aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
+    {
+      rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
+                                 NULL_RTX, true, OPTAB_DIRECT);
+      force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
+                         operands[3], product, operands[0], true,
+                         OPTAB_DIRECT);
+      return true;
+    }
+  operands[2] = force_reg (mode, operands[2]);
+  return false;
+}
+
+/* Likewise, but for a conditional pattern.  */
+
+bool
+aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
+    {
+      rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
+                                 NULL_RTX, true, OPTAB_DIRECT);
+      emit_insn (gen_cond (code, mode, operands[0], operands[1],
+                          operands[4], product, operands[5]));
+      return true;
+    }
+  operands[3] = force_reg (mode, operands[3]);
+  return false;
+}
+
 static unsigned HOST_WIDE_INT
 aarch64_shift_truncation_mask (machine_mode mode)
 {
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md   2019-08-15 09:20:26.000000000 +0100
+++ gcc/config/aarch64/aarch64-sve.md   2019-08-15 09:20:53.712070358 +0100
@@ -1844,7 +1844,7 @@ (define_insn "*post_ra_<optab><mode>3"
 )
 
 ;; Predicated integer operations with merging.
-(define_expand "cond_<optab><mode>"
+(define_expand "@cond_<optab><mode>"
   [(set (match_operand:SVE_I 0 "register_operand")
        (unspec:SVE_I
          [(match_operand:<VPRED> 1 "register_operand")
@@ -3384,8 +3384,26 @@ (define_insn "*<logical_nn><mode>3"
 ;; - MLA
 ;; -------------------------------------------------------------------------
 
+;; Unpredicated integer addition of product.
+(define_expand "fma<mode>4"
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (plus:SVE_I
+         (unspec:SVE_I
+           [(match_dup 4)
+            (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+                        (match_operand:SVE_I 2 "nonmemory_operand"))]
+           UNSPEC_PRED_X)
+         (match_operand:SVE_I 3 "register_operand")))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_int_fma (operands, PLUS))
+      DONE;
+    operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
 ;; Predicated integer addition of product.
-(define_insn "*madd<mode>"
+(define_insn "*fma<mode>4"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
        (plus:SVE_I
          (unspec:SVE_I
@@ -3402,6 +3420,97 @@ (define_insn "*madd<mode>"
   [(set_attr "movprfx" "*,*,yes")]
 )
 
+;; Predicated integer addition of product with merging.
+(define_expand "cond_fma<mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand")
+          (plus:SVE_I
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+                        (match_operand:SVE_I 3 "general_operand"))
+            (match_operand:SVE_I 4 "register_operand"))
+          (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_cond_int_fma (operands, PLUS))
+      DONE;
+    /* Swap the multiplication operands if the fallback value is the
+       second of the two.  */
+    if (rtx_equal_p (operands[3], operands[5]))
+      std::swap (operands[2], operands[3]);
+  }
+)
+
+;; Predicated integer addition of product, merging with the first input.
+(define_insn "*cond_fma<mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (plus:SVE_I
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w"))
+            (match_operand:SVE_I 4 "register_operand" "w, w"))
+          (match_dup 2)]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0, %2\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with the third input.
+(define_insn "*cond_fma<mode>_4"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (plus:SVE_I
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w"))
+            (match_operand:SVE_I 4 "register_operand" "0, w"))
+          (match_dup 4)]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %4\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with an independent value.
+(define_insn_and_rewrite "*cond_fma<mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, 
Upl, Upl")
+          (plus:SVE_I
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, 
w, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, 
w, w"))
+            (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w"))
+          (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 
0, w")]
+         UNSPEC_SEL))]
+  "TARGET_SVE
+   && !rtx_equal_p (operands[2], operands[5])
+   && !rtx_equal_p (operands[3], operands[5])
+   && !rtx_equal_p (operands[4], operands[5])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, 
%2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mla\t%0.<Vetype>, %1/m, 
%2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, 
%3.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, 
%2.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, 
%2.<Vetype>, %3.<Vetype>
+   #"
+  "&& reload_completed
+   && register_operand (operands[5], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[5])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+                                            operands[5], operands[1]));
+    operands[5] = operands[4] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] MLS and MSB
 ;; -------------------------------------------------------------------------
@@ -3410,8 +3519,26 @@ (define_insn "*madd<mode>"
 ;; - MSB
 ;; -------------------------------------------------------------------------
 
+;; Unpredicated integer subtraction of product.
+(define_expand "fnma<mode>4"
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (minus:SVE_I
+         (match_operand:SVE_I 3 "register_operand")
+         (unspec:SVE_I
+           [(match_dup 4)
+            (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+                        (match_operand:SVE_I 2 "general_operand"))]
+           UNSPEC_PRED_X)))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_int_fma (operands, MINUS))
+      DONE;
+    operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
 ;; Predicated integer subtraction of product.
-(define_insn "*msub<mode>3"
+(define_insn "*fnma<mode>3"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
        (minus:SVE_I
          (match_operand:SVE_I 4 "register_operand" "w, 0, w")
@@ -3428,6 +3555,98 @@ (define_insn "*msub<mode>3"
   [(set_attr "movprfx" "*,*,yes")]
 )
 
+;; Predicated integer subtraction of product with merging.
+(define_expand "cond_fnma<mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+   (unspec:SVE_I
+       [(match_operand:<VPRED> 1 "register_operand")
+        (minus:SVE_I
+          (match_operand:SVE_I 4 "register_operand")
+          (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+                      (match_operand:SVE_I 3 "general_operand")))
+        (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+       UNSPEC_SEL))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_cond_int_fma (operands, MINUS))
+      DONE;
+    /* Swap the multiplication operands if the fallback value is the
+       second of the two.  */
+    if (rtx_equal_p (operands[3], operands[5]))
+      std::swap (operands[2], operands[3]);
+  }
+)
+
+;; Predicated integer subtraction of product, merging with the first input.
+(define_insn "*cond_fnma<mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (minus:SVE_I
+            (match_operand:SVE_I 4 "register_operand" "w, w")
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w")))
+          (match_dup 2)]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0, %2\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with the third input.
+(define_insn "*cond_fnma<mode>_4"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (minus:SVE_I
+            (match_operand:SVE_I 4 "register_operand" "0, w")
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w")))
+          (match_dup 4)]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %4\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with an
+;; independent value.
+(define_insn_and_rewrite "*cond_fnma<mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, 
Upl, Upl")
+          (minus:SVE_I
+            (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w")
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, 
w, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, 
w, w")))
+          (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 
0, w")]
+         UNSPEC_SEL))]
+  "TARGET_SVE
+   && !rtx_equal_p (operands[2], operands[5])
+   && !rtx_equal_p (operands[3], operands[5])
+   && !rtx_equal_p (operands[4], operands[5])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, 
%2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mls\t%0.<Vetype>, %1/m, 
%2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, 
%3.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, 
%2.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, 
%2.<Vetype>, %3.<Vetype>
+   #"
+  "&& reload_completed
+   && register_operand (operands[5], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[5])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+                                            operands[5], operands[1]));
+    operands[5] = operands[4] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Dot product
 ;; -------------------------------------------------------------------------
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c   2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : b[i];   \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c       2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_1.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i]; \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c   2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : c;      \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c       2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_2.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected = (pred[i] != 1                           \
+                        ? a[i] OP b[i] * (TYPE) FACTOR         \
+                        : (TYPE) FACTOR);                      \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c   2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i];   \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c       2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_3.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : a[i]; \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c   2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] == 1 ? a[i] OP b[i] * c : pred[i];        \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } 
*/
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } 
*/
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } 
*/
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } 
*/
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c       2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_4.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3;                                        \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected = (pred[i] == 1                           \
+                        ? a[i] OP b[i] * (TYPE) FACTOR         \
+                        : pred[i]);                            \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c   2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] ? a[i] OP b[i] * c : 0;           \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 
} } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 
} } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 
} } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 
} } */
+
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 
} } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 
} } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 
} } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 
} } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } 
*/
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } 
*/
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } 
*/
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } 
*/
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c       2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_5.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 0;         \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c   2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] ? a[i] OP b[i] * c : 5;           \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c       2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_6.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 5; \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c   2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST)                                \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r,          \
+                                 TYPE *__restrict a,           \
+                                 TYPE *__restrict b,           \
+                                 TYPE *__restrict pred, int n) \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i];       \
+  }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+  T (TYPE, add, +, CONST) \
+  T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+  TEST_COUNT (T, TYPE, 2) \
+  TEST_COUNT (T, TYPE, 4) \
+  TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t, 0x80) \
+  TEST_TYPE (T, uint16_t, 0x8000) \
+  TEST_TYPE (T, uint32_t, 0x80000000) \
+  TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } 
} */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 
} } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 
} } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 
} } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, 
z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, 
z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, 
z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, 
z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, 
z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, 
z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, 
z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, 
z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c       2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_7.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST)                       \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N);         \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i];         \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c   2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST)                                \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r,          \
+                                 TYPE *__restrict a,           \
+                                 TYPE *__restrict b,           \
+                                 TYPE *__restrict pred, int n) \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i];      \
+  }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+  T (TYPE, add, +, CONST) \
+  T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+  TEST_COUNT (T, TYPE, 2) \
+  TEST_COUNT (T, TYPE, 4) \
+  TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t, 0x80) \
+  TEST_TYPE (T, uint16_t, 0x8000) \
+  TEST_TYPE (T, uint32_t, 0x80000000) \
+  TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } 
} */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 
} } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 
} } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } 
} */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 
} } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, 
z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, 
z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, 
z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, 
z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, 
z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, 
z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, 
z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, 
z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c
===================================================================
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c       2019-08-15 
09:20:53.716070328 +0100
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_8.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST)                       \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N);         \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i];        \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}

Reply via email to