[PATCH/GCC16 v2 1/1] AArch64: Emit half-precision FCMP/FCMPE

2025-01-31 Thread Spencer Abson
Enable a target with FEAT_FP16 to emit the half-precision variants
of FCMP/FCMPE.

gcc/ChangeLog:

* config/aarch64/aarch64.md: Update cbranch, cstore, fcmp
and fcmpe to use the GPF_F16 iterator for floating-point
modes.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/_Float16_cmp_1.c: New test.
* gcc.target/aarch64/_Float16_cmp_2.c: New (negative) test.
---
 gcc/config/aarch64/aarch64.md | 29 +-
 .../gcc.target/aarch64/_Float16_cmp_1.c   | 54 +++
 .../gcc.target/aarch64/_Float16_cmp_2.c   |  7 +++
 3 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 071058dbeb3..f63e4d79b3c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -707,11 +707,12 @@
 )
 
 (define_expand "cbranch4"
-  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
-   [(match_operand:GPF 1 "register_operand")
-    (match_operand:GPF 2 "aarch64_fp_compare_operand")])
-  (label_ref (match_operand 3 "" ""))
-  (pc)))]
+  [(set (pc) (if_then_else
+   (match_operator 0 "aarch64_comparison_operator"
+[(match_operand:GPF_F16 1 "register_operand")
+ (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
+   (label_ref (match_operand 3 "" ""))
+   (pc)))]
   ""
   "
   operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
@@ -4338,26 +4339,28 @@
 
 (define_insn "fcmp"
   [(set (reg:CCFP CC_REGNUM)
-(compare:CCFP (match_operand:GPF 0 "register_operand")
- (match_operand:GPF 1 "aarch64_fp_compare_operand")))]
+   (compare:CCFP
+ (match_operand:GPF_F16 0 "register_operand")
+ (match_operand:GPF_F16 1 "aarch64_fp_compare_operand")))]
"TARGET_FLOAT"
{@ [ cons: 0 , 1  ]
   [ w   , Y  ] fcmp\t%<s>0, #0.0
   [ w   , w  ] fcmp\t%<s>0, %<s>1
   }
-  [(set_attr "type" "fcmp")]
+  [(set_attr "type" "fcmp")]
 )
 
 (define_insn "fcmpe"
   [(set (reg:CCFPE CC_REGNUM)
-(compare:CCFPE (match_operand:GPF 0 "register_operand")
-  (match_operand:GPF 1 "aarch64_fp_compare_operand")))]
+   (compare:CCFPE
+ (match_operand:GPF_F16 0 "register_operand")
+ (match_operand:GPF_F16 1 "aarch64_fp_compare_operand")))]
"TARGET_FLOAT"
{@ [ cons: 0 , 1  ]
   [ w   , Y  ] fcmpe\t%<s>0, #0.0
   [ w   , w  ] fcmpe\t%<s>0, %<s>1
   }
-  [(set_attr "type" "fcmp")]
+  [(set_attr "type" "fcmp")]
 )
 
 (define_insn "*cmp_swp__reg"
@@ -4425,8 +4428,8 @@
 (define_expand "cstore4"
   [(set (match_operand:SI 0 "register_operand")
(match_operator:SI 1 "aarch64_comparison_operator_mode"
-[(match_operand:GPF 2 "register_operand")
- (match_operand:GPF 3 "aarch64_fp_compare_operand")]))]
+[(match_operand:GPF_F16 2 "register_operand")
+ (match_operand:GPF_F16 3 "aarch64_fp_compare_operand")]))]
   ""
   "
   operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2],
diff --git a/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
new file mode 100644
index 000..e49ace1d7dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+fp16" } */
+
+/*
+** test_fcmp_store:
+**	fcmp	h0, h1
+**	cset	w0, eq
+** ret
+*/
+int
+test_fcmp_store(_Float16 a, _Float16 b)
+{
+return a == b;
+}
+
+/*
+** test_fcmpe_store:
+** fcmpe   h0, h1
+**	cset	w0, mi
+** ret
+*/
+int
+test_fcmpe_store(_Float16 a, _Float16 b)
+{
+return a < b;
+}
+
+/*
+** test_fcmp_branch:
+**	fcmp	h0, h1
+** ...
+*/
+_Float16
+test_fcmp_branch(_Float16 a, _Float16 b)
+{
+if (a == b)
+return a * b;
+return a;
+}
+
+/*
+** test_fcmpe_branch:
+** fcmpe   h0, h1
+** ...
+*/
+_Float16
+test_fcmpe_branch(_Float16 a, _Float16 b)
+{
+if (a < b)
+return a * b;
+return a;
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c
new file mode 100644
index 000..0ff7cda8796
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+nofp16" } */
+
+#include "_Float16_cmp_1.c"
+
+/* { dg-final { scan-assembler-not {\tfcmp\th[0-9]+} } } */
+/* { dg-final { scan-assembler-not {\tfcmpe\th[0-9]+} } } */
-- 
2.34.1



[PATCH/GCC16 v2 0/1] AArch64: Emit half-precision FCMP/FCMPE

2025-01-31 Thread Spencer Abson
Applied the fixups suggested in the previous review, cheers.


This patch allows the AArch64 back end to emit the half-precision variants of
FCMP and FCMPE, given the target supports FEAT_FP16. Previously, such
comparisons would be unnecessarily promoted to single-precision.
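
As a minimal illustration (not part of the patch, and mirroring the new tests
it adds), a comparison such as the following can now compile to a single FCMP
on the h registers with -march=armv8.2-a+fp16, instead of first widening both
operands to single precision:

int
cmp_f16 (_Float16 a, _Float16 b)
{
  return a == b;
}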

The latest documentation of these instructions can be found here:
https://developer.arm.com/documentation/ddi0602/2024-12

Successfully bootstrapped and regtested on aarch64-linux-gnu.

OK for stage 1?

Spencer Abson (1):
  AArch64: Emit half-precision FCMP/FCMPE

 gcc/config/aarch64/aarch64.md | 29 +-
 .../gcc.target/aarch64/_Float16_cmp_1.c   | 54 +++
 .../gcc.target/aarch64/_Float16_cmp_2.c   |  7 +++
 3 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c

-- 
2.34.1



[PATCH/GCC16 0/1] AArch64: Define the spaceship optab [PR117013]

2025-01-23 Thread Spencer Abson
This patch defines spaceship{sf,df,si,di} for AArch64. It fixes the poor
codegen on floating-point types raised by the PR, and also improves the
integer codegen where this optab applies.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117013.

Successfully bootstrapped and regtested on aarch64-linux-gnu.

OK for stage 1?

Spencer Abson (1):
  AArch64: Define the spaceship optab [PR117013]

 gcc/config/aarch64/aarch64-protos.h   |   1 +
 gcc/config/aarch64/aarch64.cc |  73 +++
 gcc/config/aarch64/aarch64.md |  43 
 .../g++.target/aarch64/spaceship_1.C  | 192 ++
 .../g++.target/aarch64/spaceship_2.C  |  72 +++
 .../g++.target/aarch64/spaceship_3.C  |   9 +
 6 files changed, 390 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_1.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_2.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_3.C

-- 
2.34.1



[PATCH/GCC16 1/1] AArch64: Define the spaceship optab [PR117013]

2025-01-23 Thread Spencer Abson
This expansion ensures that exactly one comparison is emitted for
spaceship-like sequences on floating-point operands, including when
the result of such sequences is compared against members of std.

For both integer and floating-point types, we optimize for the case
in which the result of a spaceship-like operation is written to a GPR.
The PR highlights this issue for floating-point operands, but we also
make an improvement for integers, preferring:

cmp w0, w1
cset    w1, gt
csinv   w0, w1, wzr, ge

over:

cmp w0, w1
mov w0, 1
csinv   w0, w0, wzr, ge
csel    w0, w0, wzr, ne

to compute:

auto test(int a, int b) { return a <=> b;}
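
For illustration only (not part of the patch), the floating-point cases this
expansion targets look like the following; with this change, each sequence
should need exactly one FCMPE, whether the result is stored in a GPR or tested
against a std::partial_ordering constant:

#include <compare>

auto
test_fp (double a, double b)
{
  return a <=> b;
}

bool
test_fp_less (double a, double b)
{
  return (a <=> b) == std::partial_ordering::less;
}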

gcc/ChangeLog:
PR target/117013
* config/aarch64/aarch64-protos.h (aarch64_expand_fp_spaceship):
Declare optab expander function for floating-point types.
* config/aarch64/aarch64.cc (aarch64_expand_fp_spaceship):
Define optab expansion for floating-point types (new function).
* config/aarch64/aarch64.md (spaceship<mode>4):
Add define_expands for spaceship<mode>4 on integer and
floating-point types.

gcc/testsuite/ChangeLog:
PR target/117013
* g++.target/aarch64/spaceship_1.C: New test.
* g++.target/aarch64/spaceship_2.C: New test.
* g++.target/aarch64/spaceship_3.C: New test.
---
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 gcc/config/aarch64/aarch64.cc |  73 +++
 gcc/config/aarch64/aarch64.md |  43 
 .../g++.target/aarch64/spaceship_1.C  | 192 ++
 .../g++.target/aarch64/spaceship_2.C  |  72 +++
 .../g++.target/aarch64/spaceship_3.C  |   9 +
 6 files changed, 390 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_1.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_2.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_3.C

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index fa7bc8029be..39a1dae4e8b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1240,6 +1240,7 @@ void aarch64_restore_za (rtx);
 void aarch64_expand_crc_using_pmull (scalar_mode, scalar_mode, rtx *);
 void aarch64_expand_reversed_crc_using_pmull (scalar_mode, scalar_mode, rtx *);
 
+void aarch64_expand_fp_spaceship (rtx, rtx, rtx, rtx);
 
 extern bool aarch64_gcs_enabled ();
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index dba779a8e51..ea5dd0d5047 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -31427,6 +31427,79 @@ aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode,
 }
 }
 
+/* Expand the spaceship optab for floating-point operands.
+
+   If the result is compared against (-1, 0, 1 , 2), expand into
+   fcmpe + conditional branch insns.
+
+   Otherwise (the result is just stored as an integer), expand into
+   fcmpe + a sequence of conditional select/increment/invert insns.  */
+void
+aarch64_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx hint)
+{
+  rtx cc_reg = gen_rtx_REG (CCFPEmode, CC_REGNUM);
+  emit_set_insn (cc_reg, gen_rtx_COMPARE (CCFPEmode, op0, op1));
+
+  rtx cc_gt = gen_rtx_GT (VOIDmode, cc_reg, const0_rtx);
+  rtx cc_lt = gen_rtx_LT (VOIDmode, cc_reg, const0_rtx);
+  rtx cc_un = gen_rtx_UNORDERED (VOIDmode, cc_reg, const0_rtx);
+
+  if (hint == const0_rtx)
+{
+  rtx un_label = gen_label_rtx ();
+  rtx lt_label = gen_label_rtx ();
+  rtx gt_label = gen_label_rtx ();
+  rtx end_label = gen_label_rtx ();
+
+  rtx temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_un,
+   gen_rtx_LABEL_REF (Pmode, un_label), pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, temp));
+
+  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_lt,
+   gen_rtx_LABEL_REF (Pmode, lt_label), pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
+
+  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_gt,
+   gen_rtx_LABEL_REF (Pmode, gt_label), pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
+
+  /* Equality.  */
+  emit_move_insn (dest, const0_rtx);
+  emit_jump (end_label);
+
+  emit_label (un_label);
+  emit_move_insn (dest, const2_rtx);
+  emit_jump (end_label);
+
+  emit_label (gt_label);
+  emit_move_insn (dest, const1_rtx);
+  emit_jump (end_label);
+
+  emit_label (lt_label);
+  emit_move_insn (dest, constm1_rtx);
+
+  emit_label (end_label);
+}
+  else
+{
+  rtx temp0 = gen_reg_rtx (SImode);
+  rtx temp1 = gen_reg_rtx (SImode);
+  rtx cc_ungt = gen_rtx_UNGT (VOIDmode, cc_reg, const0_rtx);
+
+  /* The value of hint is stored if the operands are unordered.  */
+  rtx temp_un = gen_int_mode (UINTVAL (hint) - 1, SImode);
+  if (!aarch64_reg_zero_or_m1_or_1 (temp_un, SImode))
+   temp_un = force_reg (SImode, temp_un);
+
+ 

[PATCH/GCC16 0/1] AArch64: Emit half-precision FCMP/FCMPE

2025-01-27 Thread Spencer Abson
This patch allows the AArch64 back end to emit the half-precision variants of
FCMP and FCMPE, given the target supports FEAT_FP16. Previously, such
comparisons would be unnecessarily promoted to single-precision.

The latest documentation of these instructions can be found here:
https://developer.arm.com/documentation/ddi0602/2024-12

Successfully bootstrapped and regtested on aarch64-linux-gnu.

OK for stage 1?

Spencer Abson (1):
  AArch64: Emit half-precision FCMP/FCMPE

 gcc/config/aarch64/aarch64.md | 29 +-
 .../gcc.target/aarch64/_Float16_cmp_1.c   | 54 +++
 .../gcc.target/aarch64/_Float16_cmp_2.c   |  7 +++
 3 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c

-- 
2.34.1



[PATCH/GCC16 1/1] AArch64: Emit half-precision FCMP/FCMPE

2025-01-27 Thread Spencer Abson
Enable a target with FEAT_FP16 to emit the half-precision variants
of FCMP/FCMPE.

gcc/ChangeLog:

* config/aarch64/aarch64.md: Update cbranch, cstore, fcmp
and fcmpe to use the GPF_F16 iterator for floating-point
modes.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/_Float16_cmp_1.c: New test.
* gcc.target/aarch64/_Float16_cmp_2.c: New (negative) test.
---
 gcc/config/aarch64/aarch64.md | 29 +-
 .../gcc.target/aarch64/_Float16_cmp_1.c   | 54 +++
 .../gcc.target/aarch64/_Float16_cmp_2.c   |  7 +++
 3 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 071058dbeb3..8721bf5d4f3 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -707,11 +707,12 @@
 )
 
 (define_expand "cbranch4"
-  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
-   [(match_operand:GPF 1 "register_operand")
-    (match_operand:GPF 2 "aarch64_fp_compare_operand")])
-  (label_ref (match_operand 3 "" ""))
-  (pc)))]
+  [(set (pc) (if_then_else
+   (match_operator 0 "aarch64_comparison_operator"
+[(match_operand:GPF_F16 1 "register_operand")
+ (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
+   (label_ref (match_operand 3 "" ""))
+   (pc)))]
   ""
   "
   operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
@@ -4338,26 +4339,28 @@
 
 (define_insn "fcmp"
   [(set (reg:CCFP CC_REGNUM)
-(compare:CCFP (match_operand:GPF 0 "register_operand")
- (match_operand:GPF 1 "aarch64_fp_compare_operand")))]
+   (compare:CCFP
+   (match_operand:GPF_F16 0 "register_operand")
+   (match_operand:GPF_F16 1 "aarch64_fp_compare_operand")))]
"TARGET_FLOAT"
{@ [ cons: 0 , 1  ]
   [ w   , Y  ] fcmp\t%<s>0, #0.0
   [ w   , w  ] fcmp\t%<s>0, %<s>1
   }
-  [(set_attr "type" "fcmp")]
+  [(set_attr "type" "fcmp")]
 )
 
 (define_insn "fcmpe"
   [(set (reg:CCFPE CC_REGNUM)
-(compare:CCFPE (match_operand:GPF 0 "register_operand")
-  (match_operand:GPF 1 "aarch64_fp_compare_operand")))]
+   (compare:CCFPE
+   (match_operand:GPF_F16 0 "register_operand")
+   (match_operand:GPF_F16 1 "aarch64_fp_compare_operand")))]
"TARGET_FLOAT"
{@ [ cons: 0 , 1  ]
   [ w   , Y  ] fcmpe\t%<s>0, #0.0
   [ w   , w  ] fcmpe\t%<s>0, %<s>1
   }
-  [(set_attr "type" "fcmp")]
+  [(set_attr "type" "fcmp")]
 )
 
 (define_insn "*cmp_swp__reg"
@@ -4425,8 +4428,8 @@
 (define_expand "cstore4"
   [(set (match_operand:SI 0 "register_operand")
(match_operator:SI 1 "aarch64_comparison_operator_mode"
-[(match_operand:GPF 2 "register_operand")
- (match_operand:GPF 3 "aarch64_fp_compare_operand")]))]
+[(match_operand:GPF_F16 2 "register_operand")
+ (match_operand:GPF_F16 3 "aarch64_fp_compare_operand")]))]
   ""
   "
   operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2],
diff --git a/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
new file mode 100644
index 000..e49ace1d7dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+fp16" } */
+
+/*
+** test_fcmp_store:
+**	fcmp	h0, h1
+**	cset	w0, eq
+** ret
+*/
+int
+test_fcmp_store(_Float16 a, _Float16 b)
+{
+return a == b;
+}
+
+/*
+** test_fcmpe_store:
+** fcmpe   h0, h1
+**	cset	w0, mi
+** ret
+*/
+int
+test_fcmpe_store(_Float16 a, _Float16 b)
+{
+return a < b;
+}
+
+/*
+** test_fcmp_branch:
+**	fcmp	h0, h1
+** ...
+*/
+_Float16
+test_fcmp_branch(_Float16 a, _Float16 b)
+{
+if (a == b)
+return a * b;
+return a;
+}
+
+/*
+** test_fcmpe_branch:
+** fcmpe   h0, h1
+** ...
+*/
+_Float16
+test_fcmpe_branch(_Float16 a, _Float16 b)
+{
+if (a < b)
+return a * b;
+return a;
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c
new file mode 100644
index 000..e714304970b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+nofp16" } */
+
+#include "_Float16_cmp_1.c"
+
+/* { dg-final { scan-assembler-not "\tfcmp\th\[0-9\]\+" } } */
+/* { dg-final { scan-assembler-not "\tfcmpe\th\[0-9\]\+" } } */
\ No newline at end of file
--

[PATCH v2 1/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-21 Thread Spencer Abson
Add a fold at gimple_fold_builtin to prefer the highpart variant of a builtin
if the arguments are better suited to it. This helps us avoid copying data
between lanes before the operation.

E.g. we prefer to use UMULL2 rather than DUP+UMULL for the following:

uint16x8_t
foo(const uint8x16_t s) {
const uint8x16_t f0 = vdupq_n_u8(4);
return vmull_u8(vget_high_u8(s), vget_high_u8(f0));
}
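
Illustratively (not part of the patch, and still assuming arm_neon.h as above),
the fold makes the call behave as if it had been written with the highpart
intrinsic directly, avoiding the lane copies implied by vget_high_u8:

uint16x8_t
foo_folded (const uint8x16_t s)
{
    const uint8x16_t f0 = vdupq_n_u8 (4);
    return vmull_high_u8 (s, f0);
}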

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (LO_HI_PAIRINGS): New macro.
Cover every lo/hi pairing in builtin-pairs.def.
(aarch64_get_highpart_builtin): New function.  Get the fndecl for
the hi builtin paired with FCODE.
(LO_HI_PAIR): New macro.
(aarch64_object_of_bfr): New function.  Parse BIT_FIELD_REF expressions.
(aarch64_duplicate_vector_cst): New function.
(aarch64_nbit_vector_type_p): New function.  Check if a type describes
an n-bit vector.
(aarch64_vq_high_half): New function. Helper to identify vector
highparts.
(aarch64_fold_lo_call_to_hi): New function.  Perform the fold described
here.
(aarch64_general_gimple_fold_builtin): Add cases for lo builtins.
* config/aarch64/aarch64-builtin-pairs.def: New file.  Declare pairings
of lo/hi builtins.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/simd/vabal_combine.c: Removed.
* gcc.target/aarch64/simd/fold_to_highpart_1.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_2.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_3.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_4.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_5.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_6.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_7.c: New test.
---
 gcc/config/aarch64/aarch64-builtin-pairs.def  |  81 ++
 gcc/config/aarch64/aarch64-builtins.cc| 206 +
 .../aarch64/simd/fold_to_highpart_1.c | 733 ++
 .../aarch64/simd/fold_to_highpart_2.c |  86 ++
 .../aarch64/simd/fold_to_highpart_3.c |  81 ++
 .../aarch64/simd/fold_to_highpart_4.c |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c |  38 +
 .../aarch64/simd/fold_to_highpart_6.c |  94 +++
 .../aarch64/simd/fold_to_highpart_7.c |  36 +
 .../gcc.target/aarch64/simd/vabal_combine.c   |  72 --
 10 files changed, 1432 insertions(+), 72 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c

diff --git a/gcc/config/aarch64/aarch64-builtin-pairs.def b/gcc/config/aarch64/aarch64-builtin-pairs.def
new file mode 100644
index 000..e1dc0b71a1c
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtin-pairs.def
@@ -0,0 +1,81 @@
+/* Pairings of AArch64 builtins that can be folded into each other.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* LO/HI widenable integer modes.  */
+#define LO_HI_PAIR_V_WI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi) \
+  LO_HI_PAIR (T##_##LO##v8qi, T##_##HI##v16qi)
+
+/* LO/HI Single/Half integer modes.  */
+#define LO_HI_PAIR_V_HSI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi)
+
+#define UNOP_LONG_LH_PAIRS \
+  LO_HI_PAIR (UNOP_sxtlv8hi,  UNOP_vec_unpacks_hi_v16qi) \
+  LO_HI_PAIR (UNOP_sxtlv4si,  UNOP_vec_unpacks_hi_v8hi) \
+  LO_HI_PAIR (UNOP_sxtlv2di,  UNOP_vec_unpacks_hi_v4si) \
+  LO_HI_PAIR (UNOPU_uxtlv8hi, UNOPU_vec_unpacku_hi_v16qi) \
+  LO_HI_PAIR (UNOPU_uxtlv4si, UNOPU_vec_unpacku_hi_v8hi) \
+  LO_HI_PAIR (UNOPU_uxtlv2di, UNOPU_vec_unpa

[PATCH v2 0/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-21 Thread Spencer Abson
Hi all,

This patch implements the missed optimisation noted in PR117850.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117850

Changes since the last revision:
- Now processing different function signatures programmatically
- Fixed an ICE regarding the type referred to by a BIT_FIELD_REF
- Modified testcase regex to account for commutativity
- Removed vabal_combine.c in favour of coverage in fold_to_highpart_1.c
- Supported widening floating-point conversions
- General response to feedback from Richard S. and Christophe Lyon.

Notes:
- I'm on the fence about adding more conservative assertions to
  aarch64_duplicate_vector_cst.  E.g.
    gcc_assert (types_compatible_p (TREE_TYPE (TREE_TYPE (vec_in)),
                                    TREE_TYPE (out_ty)));

- I give this test in fold_to_highpart_7.c; https://godbolt.org/z/sG6GdEdGb.
  It's interesting that the current behavior of GCC is worse than GCC 14,
  trunk targeting aarch64_be, and Clang.

  Maybe it's worth a thought?

Bootstrapped and regtested on aarch64-none-linux-gnu. This work was also
tested on a cross-compiler targeting aarch64_be-none-linux-gnu.

OK for stage-1?

Thanks,
Spencer

Spencer Abson (1):
  AArch64: Fold builtins with highpart args to highpart equivalent
[PR117850]

 gcc/config/aarch64/aarch64-builtin-pairs.def  |  81 ++
 gcc/config/aarch64/aarch64-builtins.cc| 206 +
 .../aarch64/simd/fold_to_highpart_1.c | 733 ++
 .../aarch64/simd/fold_to_highpart_2.c |  86 ++
 .../aarch64/simd/fold_to_highpart_3.c |  81 ++
 .../aarch64/simd/fold_to_highpart_4.c |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c |  38 +
 .../aarch64/simd/fold_to_highpart_6.c |  94 +++
 .../aarch64/simd/fold_to_highpart_7.c |  36 +
 .../gcc.target/aarch64/simd/vabal_combine.c   |  72 --
 10 files changed, 1432 insertions(+), 72 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c

-- 
2.34.1



Re: [PATCH 1/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-18 Thread Spencer Abson
Hi Kyrill,

Thanks for your comments, and for answering my question RE your work. Happy to
apply those changes in the next revision.

Cheers,
Spencer


Re: [PATCH 1/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-18 Thread Spencer Abson
On Tue, Feb 18, 2025 at 10:27:46AM +, Richard Sandiford wrote:
> Thanks, this generally looks really good.  Some comments on top of
> Kyrill's, and Christophe's comment internally about -save-temps.
> 
> Spencer Abson  writes:
> > +/* Build and return a new VECTOR_CST that is the concatenation of
> > +   VEC_IN with itself.  */
> > +static tree
> > +aarch64_self_concat_vec_cst (tree vec_in)
> > +{
> > +  gcc_assert ((TREE_CODE (vec_in) == VECTOR_CST));
> > +  unsigned HOST_WIDE_INT nelts
> > += VECTOR_CST_NELTS (vec_in).to_constant ();
> > +
> > +  tree out_type = build_vector_type (TREE_TYPE (TREE_TYPE (vec_in)),
> > +nelts * 2);
> 
> It would be good to pass in the type that the caller wants.
> More about that below.

Yeah, I can see the advantage of that.

> 
> > +
> > +  /* Avoid decoding/encoding if the encoding won't change.  */
> > +  if (VECTOR_CST_DUPLICATE_P (vec_in))
> > +{
> > +  tree vec_out = make_vector (exact_log2
> > +(VECTOR_CST_NPATTERNS (vec_in)), 1);
> > +  unsigned int encoded_size
> > +   = vector_cst_encoded_nelts (vec_in) * sizeof (tree);
> > +
> > +  memcpy (VECTOR_CST_ENCODED_ELTS (vec_out),
> > + VECTOR_CST_ENCODED_ELTS (vec_in), encoded_size);
> > +
> > +  TREE_TYPE (vec_out) = out_type;
> > +  return vec_out;
> > +}
> 
> I'm not sure this is worth it.  The approach below shouldn't be that
> much less efficient, since all the temporaries are generally on the
> stack.  Also:
> 
> > +
> > +  tree_vector_builder vec_out (out_type, nelts, 1);
> 
> This call rightly describes a duplicated sequence of NELTS elements so...
> 
> > +  for (unsigned i = 0; i < nelts * 2; i++)
> > +vec_out.quick_push (VECTOR_CST_ELT (vec_in, i % nelts));
> 
> ...it should only be necessary to push nelts elements here.

Good point!

> 
> > +
> > +  return vec_out.build ();
> > +}
> > +
> > +/* If the SSA_NAME_DEF_STMT of ARG is an assignement to a
> > +   BIT_FIELD_REF with SIZE and OFFSET, return the object of the
> > +   BIT_FIELD_REF.  Otherwise, return NULL_TREE.  */
> > +static tree
> > +aarch64_object_of_bfr (tree arg, unsigned HOST_WIDE_INT size,
> > +  unsigned HOST_WIDE_INT offset)
> > +{
> > +  if (TREE_CODE (arg) != SSA_NAME)
> > +return NULL_TREE;
> > +
> > +  gassign *stmt = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (arg));
> > +
> > +  if (!stmt)
> > +return NULL_TREE;
> > +
> > +  if (gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
> > +return NULL_TREE;
> > +
> > +  tree bf_ref = gimple_assign_rhs1 (stmt);
> > +
> > +  if (bit_field_size (bf_ref).to_constant () != size
> > +  || bit_field_offset (bf_ref).to_constant () != offset)
> > +return NULL_TREE;
> > +
> > +  return TREE_OPERAND (bf_ref, 0);
> 
> I think this also needs to check that operand 0 of the BIT_FIELD_REF
> is a 128-bit vector.  A 64-bit reference at offset 64 could instead
> be into something else, such as a 256-bit vector.
> 
> An example is:
> 
> --
> #include <arm_neon.h>
> 
> typedef int16_t int16x16_t __attribute__((vector_size(32)));
> 
> int32x4_t
> f (int16x16_t foo)
> {
>   return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
> }
> --
> 
> which triggers an ICE.
> 
> Even if the argument is a 128-bit vector, it could be a 128-bit
> vector of a different type, such as in:
> 
> --
> #include <arm_neon.h>
> 
> int32x4_t
> f (int32x4_t foo)
> {
>   return vmovl_s16 (vget_high_s16 (vreinterpretq_s16_s32 (foo)));
> }
> --
> 
> I think we should still accept this second case, but emit a VIEW_CONVERT_EXPR
> before the call to convert the argument to the right type.
> 

Thanks for raising these, serious tunnel vision on my part...

> > +}
> > +
> > +/*  Prefer to use the highpart builtin when:
> > +
> > +1) All lowpart arguments are references to the highparts of other
> > +vectors.
> > +
> > +2) For calls with two lowpart arguments, if either refers to a
> > +vector highpart and the other is a VECTOR_CST.  We can copy the
> > +VECTOR_CST to 128b in this case.  */
> > +static bool
> > +aarch64_fold_lo_call_

[PATCH 1/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-17 Thread Spencer Abson
Add a fold at gimple_fold_builtin to prefer the highpart variant of a builtin
if the arguments are better suited to it. This helps us avoid copying data
between lanes before the operation.

E.g. we prefer to use UMULL2 rather than DUP+UMULL for the following:

uint16x8_t
foo(const uint8x16_t s) {
const uint8x16_t f0 = vdupq_n_u8(4);
return vmull_u8(vget_high_u8(s), vget_high_u8(f0));
}

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (LO_HI_PAIRINGS): New macro.
Covers every LO_HI_PAIR.
(aarch64_get_highpart_builtin): New function. Get the highpart builtin
paired with the input FCODE.
(LO_HI_PAIR): New macro.
(aarch64_self_concat_vec_cst): New function. Concatenate a
VECTOR_CST with itself.
(aarch64_object_of_bfr): New function. Helper to check arguments
for vector highparts.
(aarch64_fold_lo_call_to_hi): New function.
(aarch64_general_gimple_fold_builtin): Add cases for the lowpart
builtins.
* config/aarch64/aarch64-builtin-pairs.def: New file. Declare
pairings of lowpart/highpart builtins.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/vabal_combine.c: Test changed to
pass after earlier builtin fold.
* gcc.target/aarch64/simd/fold_to_highpart_1.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_2.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_3.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_4.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_5.c: New test.
---
 gcc/config/aarch64/aarch64-builtin-pairs.def  |  77 ++
 gcc/config/aarch64/aarch64-builtins.cc| 232 ++
 .../aarch64/simd/fold_to_highpart_1.c | 708 ++
 .../aarch64/simd/fold_to_highpart_2.c |  82 ++
 .../aarch64/simd/fold_to_highpart_3.c |  80 ++
 .../aarch64/simd/fold_to_highpart_4.c |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c |  71 ++
 .../gcc.target/aarch64/simd/vabal_combine.c   |  12 +-
 8 files changed, 1333 insertions(+), 6 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c

diff --git a/gcc/config/aarch64/aarch64-builtin-pairs.def b/gcc/config/aarch64/aarch64-builtin-pairs.def
new file mode 100644
index 000..d3ca69a1887
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtin-pairs.def
@@ -0,0 +1,77 @@
+/* Pairings of AArch64 builtins that can be folded into each other.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* LO/HI widenable integer modes.  */
+#define LO_HI_PAIR_V_WI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi) \
+  LO_HI_PAIR (T##_##LO##v8qi, T##_##HI##v16qi)
+
+/* LO/HI Single/Half integer modes.  */
+#define LO_HI_PAIR_V_HSI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi)
+
+#define UNOP_LONG_LH_PAIRS \
+  LO_HI_PAIR (UNOP_sxtlv8hi,  UNOP_vec_unpacks_hi_v16qi) \
+  LO_HI_PAIR (UNOP_sxtlv4si,  UNOP_vec_unpacks_hi_v8hi) \
+  LO_HI_PAIR (UNOP_sxtlv2di,  UNOP_vec_unpacks_hi_v4si) \
+  LO_HI_PAIR (UNOPU_uxtlv8hi, UNOPU_vec_unpacku_hi_v16qi) \
+  LO_HI_PAIR (UNOPU_uxtlv4si, UNOPU_vec_unpacku_hi_v8hi) \
+  LO_HI_PAIR (UNOPU_uxtlv2di, UNOPU_vec_unpacku_hi_v4si)
+
+#define BINOP_LONG_LH_PAIRS \
+  LO_HI_PAIR_V_WI (BINOP,  saddl, saddl2) \
+  LO_HI_PAIR_V_WI (BINOPU, uaddl, uaddl2) \
+  LO_HI_PAIR_V_WI (BINOP,  ssubl, ssubl2) \
+  LO_HI_PAIR_V_WI (BINOPU, usubl, usubl2) \
+  LO_HI_PAIR_V_WI (BINOP,  sabdl, sabdl2) \
+  LO_HI_PAIR_V_WI (BINOPU, uabdl, uabdl2) \
+  LO_HI_PAIR_V_WI (BINOP,  intrinsic_vec_smult_lo_, vec_widen_smult_hi_) \
+  LO_HI_PAIR_V_WI (BINOPU, intrinsic_vec_umult_lo_, vec_widen_umult_hi_) \
+  LO_HI_PAIR_V_HSI (BINOP,  sqdmull, sqdmull2)
+
+#define BINOP_LONG_N_LH_PAIRS \
+  LO_HI_PAIR_V_HSI (BINOP,  smull_n, smull_hi_n) \
+  

[PATCH 0/1] AArch64: Fold builtin calls w/ highpart args to highpart equivalent [PR117850]

2025-02-17 Thread Spencer Abson
Hi all,

This patch implements the missed optimisation noted in PR117850.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117850

It covers all the AArch64 builtins that I can imagine this is sensible for,
excluding vshll/vshll_n (for now) due to a discrepancy in their declarations.

Bootstrapped and regtested on aarch64-none-linux-gnu. This work was also
tested on a cross-compiler targeting aarch64_be-none-linux-gnu.

CC'ing Kyrylo as it looks like this patch interferes with his earlier work -
I'm wondering what to do about simd/vabal_combine.c without losing coverage?

OK for stage-1?

Spencer

Spencer Abson (1):
  AArch64: Fold builtins with highpart args to highpart equivalent
[PR117850]

 gcc/config/aarch64/aarch64-builtin-pairs.def  |  77 ++
 gcc/config/aarch64/aarch64-builtins.cc| 232 ++
 .../aarch64/simd/fold_to_highpart_1.c | 708 ++
 .../aarch64/simd/fold_to_highpart_2.c |  82 ++
 .../aarch64/simd/fold_to_highpart_3.c |  80 ++
 .../aarch64/simd/fold_to_highpart_4.c |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c |  71 ++
 .../gcc.target/aarch64/simd/vabal_combine.c   |  12 +-
 8 files changed, 1333 insertions(+), 6 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c

-- 
2.34.1



[PATCH 0/1][RFC] middle-end: target support checks for vectorizable_induction

2025-03-20 Thread Spencer Abson
Hi all,

While tinkering with AArch64's SVE port, I noticed (by means of an ICE) that
vectorizable_induction does not accurately test target support of the
vectorized operations it emits.

This would only give an ICE for variable-length vectors (see
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103523), so the patch I've
attached here covers those only.

The question I'd like to raise is whether we should apply more scrutiny here;
a vectorized MULT_EXPR is emitted to calculate the step vector for each IV in
SLP induction vectorization, as well as whenever we need to calculate the
initial values of float inductions with variable-length vectors.

Is it worth moving some code around to test for support of MULT_EXPR with the
mode of STEP_VECTYPE whenever we know that the transformation will use it?
Is there a reason that testing for target support was omitted from the
original code?

While this is an RFC, the patch itself has been bootstrapped and regtested on 
aarch64-linux-gnu.

Thank you very much for any discussion.
Spencer Abson

Spencer Abson (1):
  Induction vectorizer: prevent ICE for scalable types

 gcc/tree-vect-loop.cc | 39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

-- 
2.34.1



[PATCH 1/1][RFC] Induction vectorizer: prevent ICE for scalable types

2025-03-20 Thread Spencer Abson
We currently check that the target supports PLUS_EXPR and MINUS_EXPR
with step_vectype (a fix for PR103523).  However, vectorizable_induction
can emit a vectorized MULT_EXPR when calculating the step of each IV for
SLP, and both MULT_EXPR/FLOAT_EXPR when calculating VEC_INIT for float
inductions.
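
As a hypothetical reproducer sketch (not taken from the PR), a float induction
like the one below, vectorized for a variable-length vector target (e.g.
-O3 -march=armv8.2-a+sve -ffast-math), needs its initial vector built as
base + {0, 1, 2, ...} * step, i.e. a FLOAT_EXPR of the index vector followed
by a vectorized MULT_EXPR:

void
f (float *x, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    {
      x[i] = s;
      s += 0.5f;
    }
}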

gcc/ChangeLog:

* tree-vect-loop.cc (vectorizable_induction): Add target support
checks for vectorized MULT_EXPR and FLOAT_EXPR where necessary for
scalable types.
Prefer target_supports_op_p over directly_supported_p for these tree
codes.
(vect_update_nonlinear_iv): Fix a doc comment while I'm here.
---
 gcc/tree-vect-loop.cc | 39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 9413dcef702..cce57978ae2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10053,7 +10053,7 @@ vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
 
 }
 
-/* Function vectorizable_induction
+/* Function vectorizable_nonlinear_induction
 
Check if STMT_INFO performs an nonlinear induction computation that can be
vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
@@ -10402,6 +10402,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned i;
   tree expr;
+  tree index_vectype = NULL_TREE;
   gimple_stmt_iterator si;
   enum vect_induction_op_type induction_type
 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
@@ -10513,12 +10514,29 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 "supported.\n");
   return false;
 }
-  tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
+  tree stept = TREE_TYPE (step_expr);
+  tree step_vectype = get_same_sized_vectype (stept, vectype);
 
-  /* Check for backend support of PLUS/MINUS_EXPR. */
-  if (!directly_supported_p (PLUS_EXPR, step_vectype)
-  || !directly_supported_p (MINUS_EXPR, step_vectype))
-return false;
+  /* Check for target support of the vectorized arithmetic used here.  */
+  if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
+  || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
+  return false;
+  if (!nunits.is_constant ())
+{
+  if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
+   return false;
+  /* FLOAT_EXPR when computing VEC_INIT for float inductions.  */
+  if (SCALAR_FLOAT_TYPE_P (stept))
+   {
+ tree index_type = build_nonstandard_integer_type
+   (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
+
+ index_vectype = build_vector_type (index_type, nunits);
+ if (!can_float_p (TYPE_MODE (step_vectype),
+   TYPE_MODE (index_vectype), 1))
+   return false;
+   }
+}
 
   if (!vec_stmt) /* transformation not required.  */
 {
@@ -10637,7 +10655,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
  nivs = 1;
}
   gimple_seq init_stmts = NULL;
-  tree stept = TREE_TYPE (step_vectype);
   tree lupdate_mul = NULL_TREE;
   if (!nested_in_vect_loop)
{
@@ -10741,7 +10758,9 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
  gcc_assert (flag_associative_math);
- tree index = build_index_vector (step_vectype, 0, 1);
+ gcc_assert (index_vectype != NULL_TREE);
+
+ tree index = build_index_vector (index_vectype, 0, 1);
  new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
 inits[0]);
  tree base_vec = gimple_build_vector_from_val (&init_stmts,
@@ -11016,7 +11035,9 @@ vectorizable_induction (loop_vec_info loop_vinfo,
+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
  gcc_assert (flag_associative_math);
- tree index = build_index_vector (step_vectype, 0, 1);
+ gcc_assert (index_vectype != NULL_TREE);
+
+ tree index = build_index_vector (index_vectype, 0, 1);
  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
new_name);
  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
-- 
2.34.1