This migrates AArch64 over to the new reduc_plus_scal optab for 'plus'
reductions: the define_expands now produce the scalar result directly, by
extracting the (endian-corrected) lane 0 of the reduction, which for the
integer modes means a move to a GPR. Effectively, this moves the vget_lane
that previously sat inside every arm_neon.h intrinsic into the define_expand.
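For illustration only (a hypothetical test, not part of the patch): the
user-visible behaviour of the intrinsics is unchanged, only the expansion
path differs.

  #include <arm_neon.h>
  #include <assert.h>

  int
  main (void)
  {
    int32x2_t v = {3, 4};
    /* vaddv_s32 now calls __builtin_aarch64_reduc_plus_scal_v2si, which
       returns the scalar directly; previously it wrapped the vector-returning
       __builtin_aarch64_reduc_splus_v2si in vget_lane_s32 (..., 0).  */
    assert (vaddv_s32 (v) == 7);
    return 0;
  }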
Tested: aarch64.exp and vect.exp on aarch64-none-elf and aarch64_be-none-elf
(a full check-gcc run follows with the next patch, for reduc_min/max).
gcc/ChangeLog:
* config/aarch64/aarch64-simd-builtins.def
(reduc_splus_<mode>/VALL, reduc_uplus_<mode>/VDQ): Remove.
(reduc_plus_scal_<mode>/VALL): New.
* config/aarch64/aarch64-simd.md
(reduc_splus_<mode>, reduc_uplus_<mode>, reduc_plus_scal_<mode>): New.
(reduc_<sur>plus_<mode>): Change SUADDV -> UNSPEC_ADDV, rename to...
(aarch64_reduc_plus_internal<mode>): ...this.
(reduc_<sur>plus_v2si): Change SUADDV -> UNSPEC_ADDV, rename to...
(aarch64_reduc_plus_internalv2si): ...this.
(reduc_splus_<mode>/V2F): Rename to...
(aarch64_reduc_plus_internal<mode>): ...this.
(reduc_splus_v4sf): Rename to...
(reduc_plus_scal_v4sf): ...this, returning the scalar result via
aarch64_get_lanev4sf.
* config/aarch64/iterators.md
(UNSPEC_SADDV, UNSPEC_UADDV, SUADDV): Remove.
(UNSPEC_ADDV): New.
(sur): Remove elements for UNSPEC_SADDV and UNSPEC_UADDV.
* config/aarch64/arm_neon.h (vaddv_s8, vaddv_s16, vaddv_s32, vaddv_u8,
vaddv_u16, vaddv_u32, vaddvq_s8, vaddvq_s16, vaddvq_s32, vaddvq_s64,
vaddvq_u8, vaddvq_u16, vaddvq_u32, vaddvq_u64, vaddv_f32, vaddvq_f32,
vaddvq_f64, vpaddd_f64): Change __builtin_aarch64_reduc_[us]plus_... to
__builtin_aarch64_reduc_plus_scal_..., remove vget_lane wrapper.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4f3bd12c8447e7125dfeba3f06536cdf9acc2440..ae4ab42e3e3df7de4e4b2c5e46a1476a2ed64175 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -248,9 +248,8 @@
BUILTIN_VSDQ_I_DI (BINOP, cmgtu, 0)
BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
- /* Implemented by reduc_<sur>plus_<mode>. */
- BUILTIN_VALL (UNOP, reduc_splus_, 10)
- BUILTIN_VDQ (UNOP, reduc_uplus_, 10)
+ /* Implemented by reduc_plus_scal_<mode>. */
+ BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
/* Implemented by reduc_<maxmin_uns>_<mode>. */
BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f5fa4aebe4cafe1430b31ca3a89ec5f3698d23bd..23b89584d9ba1d88ff49bfa28d210b325e7dea7f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1719,25 +1719,74 @@
;; 'across lanes' add.
-(define_insn "reduc_<sur>plus_<mode>"
+(define_expand "reduc_splus_<mode>"
+ [(match_operand:VALL 0 "register_operand" "=w")
+ (match_operand:VALL 1 "register_operand" "w")]
+ "TARGET_SIMD"
+ {
+ /* Old optab/standard name; it should not be used since we now provide
+ the newer reduc_plus_scal_<mode>. */
+ gcc_unreachable ();
+ }
+)
+
+(define_expand "reduc_uplus_<mode>"
+ [(match_operand:VALL 0 "register_operand" "=w")
+ (match_operand:VALL 1 "register_operand" "w")]
+ "TARGET_SIMD"
+ {
+ /* Old optab/standard name; it should not be used since we now provide
+ the newer reduc_plus_scal_<mode>. */
+ gcc_unreachable ();
+ }
+)
+
+(define_expand "reduc_plus_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand" "=w")
+ (unspec:VDQ [(match_operand:VDQ 1 "register_operand" "w")]
+ UNSPEC_ADDV)]
+ "TARGET_SIMD"
+ {
+ rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+ rtx scratch = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
+ emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+ DONE;
+ }
+)
+
+(define_expand "reduc_plus_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand" "=w")
+ (match_operand:V2F 1 "register_operand" "w")]
+ "TARGET_SIMD"
+ {
+ rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+ rtx scratch = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
+ emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+ DONE;
+ }
+)
+
+(define_insn "aarch64_reduc_plus_internal<mode>"
[(set (match_operand:VDQV 0 "register_operand" "=w")
(unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
- SUADDV))]
+ UNSPEC_ADDV))]
"TARGET_SIMD"
"add<VDQV:vp>\\t%<Vetype>0, %1.<Vtype>"
[(set_attr "type" "neon_reduc_add<q>")]
)
-(define_insn "reduc_<sur>plus_v2si"
+(define_insn "aarch64_reduc_plus_internalv2si"
[(set (match_operand:V2SI 0 "register_operand" "=w")
(unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
- SUADDV))]
+ UNSPEC_ADDV))]
"TARGET_SIMD"
"addp\\t%0.2s, %1.2s, %1.2s"
[(set_attr "type" "neon_reduc_add")]
)
-(define_insn "reduc_splus_<mode>"
+(define_insn "aarch64_reduc_plus_internal<mode>"
[(set (match_operand:V2F 0 "register_operand" "=w")
(unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
UNSPEC_FADDV))]
@@ -1755,14 +1804,17 @@
[(set_attr "type" "neon_fp_reduc_add_s_q")]
)
-(define_expand "reduc_splus_v4sf"
- [(set (match_operand:V4SF 0 "register_operand")
+(define_expand "reduc_plus_scal_v4sf"
+ [(set (match_operand:SF 0 "register_operand")
(unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
UNSPEC_FADDV))]
"TARGET_SIMD"
{
- emit_insn (gen_aarch64_addpv4sf (operands[0], operands[1]));
- emit_insn (gen_aarch64_addpv4sf (operands[0], operands[0]));
+ rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
+ rtx scratch = gen_reg_rtx (V4SFmode);
+ emit_insn (gen_aarch64_addpv4sf (scratch, operands[1]));
+ emit_insn (gen_aarch64_addpv4sf (scratch, scratch));
+ emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
DONE;
})
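(As a sketch of the intended expansion path, assuming the usual instruction
selection rather than anything this patch asserts, a 128-bit integer
reduction such as

  #include <arm_neon.h>

  int32_t
  sum_q (int32x4_t a)
  {
    /* reduc_plus_scal_v4si: aarch64_reduc_plus_internalv4si reduces into a
       scratch vector register, then aarch64_get_lanev4si extracts the
       ENDIAN_LANE_N-corrected lane 0.  */
    return vaddvq_s32 (a);
  }

should assemble to roughly "addv s0, v0.4s" followed by a single lane move to
the GPR result, e.g. "umov w0, v0.s[0]".)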
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0a86172ccb9aa9ab026f4aa020fd4418098e0923..734788e1c0fc81f6bf7efc126b357a74c22692f5 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -13456,121 +13456,103 @@ vaddd_u64 (uint64_t __a, uint64_t __b)
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vaddv_s8 (int8x8_t __a)
{
- return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0);
+ return __builtin_aarch64_reduc_plus_scal_v8qi (__a);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vaddv_s16 (int16x4_t __a)
{
- return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0);
+ return __builtin_aarch64_reduc_plus_scal_v4hi (__a);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vaddv_s32 (int32x2_t __a)
{
- return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0);
+ return __builtin_aarch64_reduc_plus_scal_v2si (__a);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vaddv_u8 (uint8x8_t __a)
{
- return vget_lane_u8 ((uint8x8_t)
- __builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a),
- 0);
+ return (uint8_t) __builtin_aarch64_reduc_plus_scal_v8qi ((int8x8_t) __a);
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vaddv_u16 (uint16x4_t __a)
{
- return vget_lane_u16 ((uint16x4_t)
- __builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a),
- 0);
+ return (uint16_t) __builtin_aarch64_reduc_plus_scal_v4hi ((int16x4_t) __a);
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vaddv_u32 (uint32x2_t __a)
{
- return vget_lane_u32 ((uint32x2_t)
- __builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a),
- 0);
+ return (uint32_t) __builtin_aarch64_reduc_plus_scal_v2si ((int32x2_t) __a);
}
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vaddvq_s8 (int8x16_t __a)
{
- return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a),
- 0);
+ return __builtin_aarch64_reduc_plus_scal_v16qi (__a);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vaddvq_s16 (int16x8_t __a)
{
- return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0);
+ return __builtin_aarch64_reduc_plus_scal_v8hi (__a);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vaddvq_s32 (int32x4_t __a)
{
- return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0);
+ return __builtin_aarch64_reduc_plus_scal_v4si (__a);
}
__extension__ static __inline int64_t __attribute__ ((__always_inline__))
vaddvq_s64 (int64x2_t __a)
{
- return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0);
+ return __builtin_aarch64_reduc_plus_scal_v2di (__a);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vaddvq_u8 (uint8x16_t __a)
{
- return vgetq_lane_u8 ((uint8x16_t)
- __builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a),
- 0);
+ return (uint8_t) __builtin_aarch64_reduc_plus_scal_v16qi ((int8x16_t) __a);
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vaddvq_u16 (uint16x8_t __a)
{
- return vgetq_lane_u16 ((uint16x8_t)
- __builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a),
- 0);
+ return (uint16_t) __builtin_aarch64_reduc_plus_scal_v8hi ((int16x8_t) __a);
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vaddvq_u32 (uint32x4_t __a)
{
- return vgetq_lane_u32 ((uint32x4_t)
- __builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a),
- 0);
+ return (uint32_t) __builtin_aarch64_reduc_plus_scal_v4si ((int32x4_t) __a);
}
__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
vaddvq_u64 (uint64x2_t __a)
{
- return vgetq_lane_u64 ((uint64x2_t)
- __builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a),
- 0);
+ return (uint64_t) __builtin_aarch64_reduc_plus_scal_v2di ((int64x2_t) __a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vaddv_f32 (float32x2_t __a)
{
- float32x2_t __t = __builtin_aarch64_reduc_splus_v2sf (__a);
- return vget_lane_f32 (__t, 0);
+ return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vaddvq_f32 (float32x4_t __a)
{
- float32x4_t __t = __builtin_aarch64_reduc_splus_v4sf (__a);
- return vgetq_lane_f32 (__t, 0);
+ return __builtin_aarch64_reduc_plus_scal_v4sf (__a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vaddvq_f64 (float64x2_t __a)
{
- float64x2_t __t = __builtin_aarch64_reduc_splus_v2df (__a);
- return vgetq_lane_f64 (__t, 0);
+ return __builtin_aarch64_reduc_plus_scal_v2df (__a);
}
/* vbsl */
@@ -19234,7 +19216,7 @@ vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vpaddd_f64 (float64x2_t __a)
{
- return vgetq_lane_f64 (__builtin_aarch64_reduc_splus_v2df (__a), 0);
+ return __builtin_aarch64_reduc_plus_scal_v2df (__a);
}
__extension__ static __inline int64_t __attribute__ ((__always_inline__))
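(Note, the unsigned vaddv* forms above still cast to and from the signed
vector and element types, because the builtin prototype uses the signed types
(hence the casts in the header); the reduction is modular, so the bit pattern
is unaffected. A hypothetical sanity check, not part of the patch:

  #include <arm_neon.h>
  #include <assert.h>

  int
  main (void)
  {
    /* Eight lanes of 200: the 8-bit reduction wraps, 1600 % 256 == 64.  */
    uint8x8_t v = vdup_n_u8 (200);
    assert (vaddv_u8 (v) == (uint8_t) (8 * 200));
    return 0;
  }
)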
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 3203c3da7e293d566d1ea329856cbef8fb73a825..f738c298252736716077238d7c23478195481468 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -207,8 +207,7 @@
UNSPEC_FMINNMV ; Used in aarch64-simd.md.
UNSPEC_FMINV ; Used in aarch64-simd.md.
UNSPEC_FADDV ; Used in aarch64-simd.md.
- UNSPEC_SADDV ; Used in aarch64-simd.md.
- UNSPEC_UADDV ; Used in aarch64-simd.md.
+ UNSPEC_ADDV ; Used in aarch64-simd.md.
UNSPEC_SMAXV ; Used in aarch64-simd.md.
UNSPEC_SMINV ; Used in aarch64-simd.md.
UNSPEC_UMAXV ; Used in aarch64-simd.md.
@@ -845,8 +844,6 @@
(define_int_iterator FMAXMINV [UNSPEC_FMAXV UNSPEC_FMINV
UNSPEC_FMAXNMV UNSPEC_FMINNMV])
-(define_int_iterator SUADDV [UNSPEC_SADDV UNSPEC_UADDV])
-
(define_int_iterator HADDSUB [UNSPEC_SHADD UNSPEC_UHADD
UNSPEC_SRHADD UNSPEC_URHADD
UNSPEC_SHSUB UNSPEC_UHSUB
@@ -951,7 +948,6 @@
(UNSPEC_SUBHN2 "") (UNSPEC_RSUBHN2 "r")
(UNSPEC_SQXTN "s") (UNSPEC_UQXTN "u")
(UNSPEC_USQADD "us") (UNSPEC_SUQADD "su")
- (UNSPEC_SADDV "s") (UNSPEC_UADDV "u")
(UNSPEC_SSLI "s") (UNSPEC_USLI "u")
(UNSPEC_SSRI "s") (UNSPEC_USRI "u")
(UNSPEC_USRA "u") (UNSPEC_SSRA "s")