Hi,
The attached patch fixes all the reduc_* expansions to be big-endian-safe by
moving the scalar result to the LSB, where RTL expects it. It also adds
combiner patterns that give GCC the freedom to choose, in 2-lane cases, between
ADDP Dd, Vn.2D
DUP Vd.2D, Vd.D[0]
and
ADDP Vd.2D, Vn.2D, Vn.2D
based on what it thinks is profitable. Most of the time the DUP is optimized
away and the scalar result is used directly; nevertheless, the choice is there.
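For illustration only (not part of the patch), here is the kind of reduction
this affects; the function name is made up. With the fixed expansions, the
intrinsic reads the scalar result from the lane arm_neon.h now indexes via
__LANE0, so it returns the correct value on big-endian as well:

#include <arm_neon.h>
#include <stdint.h>

/* Example: sums the two lanes of a 64x2 vector.  vaddvq_s64 expands
   through __builtin_aarch64_reduc_splus_v2di; with this patch the
   combiner is free to pick either of the two sequences above.  */
int64_t
sum_lanes_s64 (int64x2_t v)
{
  return vaddvq_s64 (v);
}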
Tested on aarch64-none-elf and aarch64_be-none-elf. OK for trunk?
Thanks,
Tejas Belagod
ARM.
Changelog:
2013-11-15  Tejas Belagod  <tejas.bela...@arm.com>
gcc/
* config/aarch64/aarch64-simd.md (aarch64_simd_reduc_<sur>plus_<mode>):
New pattern.
(reduc_<sur>plus_<mode>, reduc_<sur>plus_v2di, reduc_<sur>plus_v4sf,
reduc_<maxmin_uns>_<mode>, reduc_<maxmin_uns>_v2di,
reduc_<maxmin_uns>_v4sf): Make these big-endian-safe.
(*addp_addv<mode>, *<maxmin_uns><mode>): New combiner patterns.
* config/aarch64/aarch64.h (ENDIAN_LANE_N): New.
* config/aarch64/arm_neon.h: Make all the reduction intrinsics
big-endian-safe.
* config/aarch64/iterators.md (V2I, VDQV_S): New mode iterators.
(VDQV): Add V2DI.
(vp): New mode attribute.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index a747ee8..6f50dd9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1899,32 +1899,39 @@
;; 'across lanes' add.
-(define_insn "reduc_<sur>plus_<mode>"
- [(set (match_operand:VDQV 0 "register_operand" "=w")
- (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
+(define_insn "aarch64_simd_reduc_<sur>plus_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+ (unspec:<VEL> [(match_operand:VDQV 1 "register_operand" "w")]
SUADDV))]
"TARGET_SIMD"
- "addv\\t%<Vetype>0, %1.<Vtype>"
+ "add<VDQV:vp>\\t%<Vetype>0, %1.<Vtype>"
[(set_attr "simd_type" "simd_addv")
(set_attr "type" "neon_reduc_add<q>")
(set_attr "simd_mode" "<MODE>")]
)
-(define_insn "reduc_<sur>plus_v2di"
- [(set (match_operand:V2DI 0 "register_operand" "=w")
- (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "w")]
+(define_expand "reduc_<sur>plus_<mode>"
+ [(set (match_operand:VDQV 0 "register_operand" "=w")
+ (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
SUADDV))]
"TARGET_SIMD"
- "addp\\t%d0, %1.2d"
- [(set_attr "simd_type" "simd_addv")
- (set_attr "type" "neon_reduc_add_q")
- (set_attr "simd_mode" "V2DI")]
-)
+{
+ rtx temp = gen_reg_rtx (<VEL>mode);
+ int elt = ENDIAN_LANE_N (<MODE>mode, 0);
+
+ emit_insn (gen_aarch64_simd_reduc_<sur>plus_<mode> (temp, operands[1]));
+ if (GET_MODE_NUNITS (<MODE>mode) == 2 && BYTES_BIG_ENDIAN)
+ emit_insn (gen_aarch64_simd_dup<mode> (operands[0], temp));
+ else
+ emit_insn (gen_vec_set<mode> (operands[0], temp, GEN_INT (elt)));
+
+ DONE;
+})
(define_insn "reduc_<sur>plus_v2si"
[(set (match_operand:V2SI 0 "register_operand" "=w")
(unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
- SUADDV))]
+ SUADDV))]
"TARGET_SIMD"
"addp\\t%0.2s, %1.2s, %1.2s"
[(set_attr "simd_type" "simd_addv")
@@ -1932,17 +1939,35 @@
(set_attr "simd_mode" "V2SI")]
)
-(define_insn "reduc_<sur>plus_<mode>"
- [(set (match_operand:V2F 0 "register_operand" "=w")
- (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
- SUADDV))]
+(define_insn "aarch64_simd_reduc_splus_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+ (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
+ UNSPEC_FADDV))]
"TARGET_SIMD"
"faddp\\t%<Vetype>0, %1.<Vtype>"
- [(set_attr "simd_type" "simd_fadd")
+ [(set_attr "simd_type" "simd_addv")
(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")
(set_attr "simd_mode" "<MODE>")]
)
+(define_expand "reduc_splus_<mode>"
+ [(set (match_operand:V2F 0 "register_operand" "=w")
+ (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
+ UNSPEC_FADDV))]
+ "TARGET_SIMD"
+{
+ rtx temp = gen_reg_rtx (<VEL>mode);
+ int elt = ENDIAN_LANE_N (<MODE>mode, 0);
+
+ emit_insn (gen_aarch64_simd_reduc_splus_<mode> (temp, operands[1]));
+ if (GET_MODE_NUNITS (<MODE>mode) == 2 && BYTES_BIG_ENDIAN)
+ emit_insn (gen_aarch64_simd_dup<mode> (operands[0], temp));
+ else
+ emit_insn (gen_vec_set<mode> (operands[0], temp, GEN_INT (elt)));
+
+ DONE;
+})
+
(define_insn "aarch64_addpv4sf"
[(set (match_operand:V4SF 0 "register_operand" "=w")
(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
@@ -1954,15 +1979,21 @@
(set_attr "simd_mode" "V4SF")]
)
-(define_expand "reduc_<sur>plus_v4sf"
+(define_expand "reduc_splus_v4sf"
[(set (match_operand:V4SF 0 "register_operand")
(unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
- SUADDV))]
+ UNSPEC_FADDV))]
"TARGET_SIMD"
{
- rtx tmp = gen_reg_rtx (V4SFmode);
- emit_insn (gen_aarch64_addpv4sf (tmp, operands[1]));
- emit_insn (gen_aarch64_addpv4sf (operands[0], tmp));
+ rtx tmp1 = gen_reg_rtx (V4SFmode);
+ rtx tmp2 = gen_reg_rtx (V4SFmode);
+ rtx sc = gen_reg_rtx (SFmode);
+ int elt = ENDIAN_LANE_N (V4SFmode, 0);
+
+ emit_insn (gen_aarch64_addpv4sf (tmp1, operands[1]));
+ emit_insn (gen_aarch64_addpv4sf (tmp2, tmp1));
+ emit_insn (gen_aarch64_get_lanev4sf (sc, tmp2, const0_rtx));
+ emit_insn (gen_vec_setv4sf (operands[0], sc, GEN_INT (elt)));
DONE;
})
@@ -1978,9 +2009,9 @@
;; 'across lanes' max and min ops.
-(define_insn "reduc_<maxmin_uns>_<mode>"
- [(set (match_operand:VDQV 0 "register_operand" "=w")
- (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
+(define_insn "aarch64_simd_reduc_<maxmin_uns>_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+ (unspec:<VEL> [(match_operand:VDQV_S 1 "register_operand" "w")]
MAXMINV))]
"TARGET_SIMD"
"<maxmin_uns_op>v\\t%<Vetype>0, %1.<Vtype>"
@@ -1989,16 +2020,19 @@
(set_attr "simd_mode" "<MODE>")]
)
-(define_insn "reduc_<maxmin_uns>_v2di"
- [(set (match_operand:V2DI 0 "register_operand" "=w")
- (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "w")]
- MAXMINV))]
+(define_expand "reduc_<maxmin_uns>_<mode>"
+ [(set (match_operand:VDQV_S 0 "register_operand" "=w")
+ (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
+ MAXMINV))]
"TARGET_SIMD"
- "<maxmin_uns_op>p\\t%d0, %1.2d"
- [(set_attr "simd_type" "simd_minmaxv")
- (set_attr "type" "neon_reduc_minmax_q")
- (set_attr "simd_mode" "V2DI")]
-)
+{
+ rtx temp = gen_reg_rtx (<VEL>mode);
+ int elt = ENDIAN_LANE_N (<MODE>mode, 0);
+
+ emit_insn (gen_aarch64_simd_reduc_<maxmin_uns>_<mode> (temp, operands[1]));
+ emit_insn (gen_vec_set<mode> (operands[0], temp, GEN_INT (elt)));
+ DONE;
+})
(define_insn "reduc_<maxmin_uns>_v2si"
[(set (match_operand:V2SI 0 "register_operand" "=w")
@@ -2011,9 +2045,9 @@
(set_attr "simd_mode" "V2SI")]
)
-(define_insn "reduc_<maxmin_uns>_<mode>"
- [(set (match_operand:V2F 0 "register_operand" "=w")
- (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
+(define_insn "aarch64_simd_reduc_<maxmin_uns>_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+ (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
FMAXMINV))]
"TARGET_SIMD"
"<maxmin_uns_op>p\\t%<Vetype>0, %1.<Vtype>"
@@ -2022,9 +2056,23 @@
(set_attr "simd_mode" "<MODE>")]
)
-(define_insn "reduc_<maxmin_uns>_v4sf"
- [(set (match_operand:V4SF 0 "register_operand" "=w")
- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
+(define_expand "reduc_<maxmin_uns>_<mode>"
+ [(set (match_operand:V2F 0 "register_operand" "=w")
+ (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
+ FMAXMINV))]
+ "TARGET_SIMD"
+{
+ rtx temp = gen_reg_rtx (<VEL>mode);
+ int elt = ENDIAN_LANE_N (<MODE>mode, 0);
+
+ emit_insn (gen_aarch64_simd_reduc_<maxmin_uns>_<mode> (temp, operands[1]));
+ emit_insn (gen_aarch64_simd_dup<mode> (operands[0], temp));
+ DONE;
+})
+
+(define_insn "aarch64_simd_reduc_<maxmin_uns>_v4sf"
+ [(set (match_operand:SF 0 "register_operand" "=w")
+ (unspec:SF [(match_operand:V4SF 1 "register_operand" "w")]
FMAXMINV))]
"TARGET_SIMD"
"<maxmin_uns_op>v\\t%s0, %1.4s"
@@ -2033,6 +2081,21 @@
(set_attr "simd_mode" "V4SF")]
)
+(define_expand "reduc_<maxmin_uns>_v4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=w")
+ (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
+ FMAXMINV))]
+ "TARGET_SIMD"
+{
+ rtx temp = gen_reg_rtx (SFmode);
+ int elt = ENDIAN_LANE_N (V4SFmode, 0);
+
+ emit_insn (gen_aarch64_simd_reduc_<maxmin_uns>_v4sf (temp, operands[1]));
+ emit_insn (gen_vec_setv4sf (operands[0], temp, GEN_INT (elt)));
+ DONE;
+})
+
+
;; aarch64_simd_bsl may compile to any of bsl/bif/bit depending on register
;; allocation.
;; Operand 1 is the mask, operands 2 and 3 are the bitfields from which
@@ -3919,6 +3982,47 @@
(set_attr "simd_mode" "DI")]
)
+;; E.g. ADDP Dd, Vn.2D; DUP Vd.2D, Vd.D[0] => ADDP Vd.2D, Vn.2D, Vn.2D
+
+(define_insn "*addp_addv<mode>"
+ [(set (match_operand:V2I 0 "register_operand" "=w")
+ (vec_duplicate:V2I
+ (unspec:<VEL>
+ [(match_operand:V2I 1 "register_operand" "w")]
+ SUADDV)))]
+ "TARGET_SIMD"
+ "addp\\t%0.<Vtype>, %1.<Vtype>, %1.<Vtype>"
+ [(set_attr "simd_type" "simd_add")
+ (set_attr "type" "neon_reduc_add<q>")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*addp_addv<mode>"
+ [(set (match_operand:V2F 0 "register_operand" "=w")
+ (vec_duplicate:V2F
+ (unspec:<VEL>
+ [(match_operand:V2F 1 "register_operand" "w")]
+ UNSPEC_FADDV)))]
+ "TARGET_SIMD"
+ "faddp\\t%0.<Vtype>, %1.<Vtype>, %1.<Vtype>"
+ [(set_attr "simd_type" "simd_add")
+ (set_attr "type" "neon_fp_reduc_add_<Vetype><q>")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*<maxmin_uns><mode>"
+ [(set (match_operand:V2F 0 "register_operand" "=w")
+ (vec_duplicate:V2F
+ (unspec:<VEL>
+ [(match_operand:V2F 1 "register_operand" "w")]
+ FMAXMINV)))]
+ "TARGET_SIMD"
+ "<maxmin_uns_op>p\\t%0.<Vtype>, %1.<Vtype>, %1.<Vtype>"
+ [(set_attr "simd_type" "simd_fminmax")
+ (set_attr "type" "neon_fp_reduc_minmax_<Vetype><q>")
+ (set_attr "simd_mode" "<MODE>")]
+)
+
;; sqrt
(define_insn "sqrt<mode>2"
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 7a80e96..e8706e8 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -854,4 +854,7 @@ extern enum aarch64_code_model aarch64_cmodel;
((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
|| (MODE) == V4SFmode || (MODE) == V2DImode || mode == V2DFmode)
+#define ENDIAN_LANE_N(mode, n) \
+ (BYTES_BIG_ENDIAN ? GET_MODE_NUNITS (mode) - 1 - n : n)
+
#endif /* GCC_AARCH64_H */
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index f7c9db6..98c84a9 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -15904,118 +15904,132 @@ vaddd_u64 (uint64x1_t __a, uint64x1_t __b)
return __a + __b;
}
+#if __AARCH64EB__
+#define __LANE0(__t) ((__t) - 1)
+#else
+#define __LANE0(__t) 0
+#endif
+
/* vaddv */
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vaddv_s8 (int8x8_t __a)
{
- return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0);
+ return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), __LANE0 (8));
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vaddv_s16 (int16x4_t __a)
{
- return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0);
+ return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), __LANE0 (4));
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vaddv_s32 (int32x2_t __a)
{
- return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0);
+ return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), __LANE0 (2));
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vaddv_u8 (uint8x8_t __a)
{
return vget_lane_u8 ((uint8x8_t)
- __builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a), 0);
+ __builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a),
+ __LANE0 (8));
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vaddv_u16 (uint16x4_t __a)
{
return vget_lane_u16 ((uint16x4_t)
- __builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a), 0);
+ __builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a),
+ __LANE0 (4));
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vaddv_u32 (uint32x2_t __a)
{
return vget_lane_u32 ((uint32x2_t)
- __builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a), 0);
+ __builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a),
+ __LANE0 (2));
}
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vaddvq_s8 (int8x16_t __a)
{
- return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a), 0);
+ return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a),
+ __LANE0 (16));
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vaddvq_s16 (int16x8_t __a)
{
- return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0);
+ return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), __LANE0 (8));
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vaddvq_s32 (int32x4_t __a)
{
- return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0);
+ return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), __LANE0 (4));
}
__extension__ static __inline int64_t __attribute__ ((__always_inline__))
vaddvq_s64 (int64x2_t __a)
{
- return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0);
+ return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), __LANE0 (2));
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vaddvq_u8 (uint8x16_t __a)
{
return vgetq_lane_u8 ((uint8x16_t)
- __builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a), 0);
+ __builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a),
+ __LANE0 (16));
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vaddvq_u16 (uint16x8_t __a)
{
return vgetq_lane_u16 ((uint16x8_t)
- __builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a), 0);
+ __builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a),
+ __LANE0 (8));
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vaddvq_u32 (uint32x4_t __a)
{
return vgetq_lane_u32 ((uint32x4_t)
- __builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a), 0);
+ __builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a),
+ __LANE0 (4));
}
__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
vaddvq_u64 (uint64x2_t __a)
{
return vgetq_lane_u64 ((uint64x2_t)
- __builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a), 0);
+ __builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a),
+ __LANE0 (2));
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vaddv_f32 (float32x2_t __a)
{
- float32x2_t t = __builtin_aarch64_reduc_splus_v2sf (__a);
- return vget_lane_f32 (t, 0);
+ float32x2_t __t = __builtin_aarch64_reduc_splus_v2sf (__a);
+ return vget_lane_f32 (__t, __LANE0 (2));
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vaddvq_f32 (float32x4_t __a)
{
- float32x4_t t = __builtin_aarch64_reduc_splus_v4sf (__a);
- return vgetq_lane_f32 (t, 0);
+ float32x4_t __t = __builtin_aarch64_reduc_splus_v4sf (__a);
+ return vgetq_lane_f32 (__t, __LANE0 (4));
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vaddvq_f64 (float64x2_t __a)
{
- float64x2_t t = __builtin_aarch64_reduc_splus_v2df (__a);
- return vgetq_lane_f64 (t, 0);
+ float64x2_t __t = __builtin_aarch64_reduc_splus_v2df (__a);
+ return vgetq_lane_f64 (__t, __LANE0 (2));
}
/* vcage */
@@ -20256,97 +20270,106 @@ vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vmaxv_f32 (float32x2_t __a)
{
- return vget_lane_f32 (__builtin_aarch64_reduc_smax_nan_v2sf (__a), 0);
+ return vget_lane_f32 (__builtin_aarch64_reduc_smax_nan_v2sf (__a),
+ __LANE0 (2));
}
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vmaxv_s8 (int8x8_t __a)
{
- return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0);
+ return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), __LANE0 (8));
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vmaxv_s16 (int16x4_t __a)
{
- return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0);
+ return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), __LANE0 (4));
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vmaxv_s32 (int32x2_t __a)
{
- return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0);
+ return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), __LANE0 (2));
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vmaxv_u8 (uint8x8_t __a)
{
return vget_lane_u8 ((uint8x8_t)
- __builtin_aarch64_reduc_umax_v8qi ((int8x8_t) __a), 0);
+ __builtin_aarch64_reduc_umax_v8qi ((int8x8_t) __a),
+ __LANE0 (8));
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vmaxv_u16 (uint16x4_t __a)
{
return vget_lane_u16 ((uint16x4_t)
- __builtin_aarch64_reduc_umax_v4hi ((int16x4_t) __a), 0);
+ __builtin_aarch64_reduc_umax_v4hi ((int16x4_t) __a),
+ __LANE0 (4));
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vmaxv_u32 (uint32x2_t __a)
{
return vget_lane_u32 ((uint32x2_t)
- __builtin_aarch64_reduc_umax_v2si ((int32x2_t) __a), 0);
+ __builtin_aarch64_reduc_umax_v2si ((int32x2_t) __a),
+ __LANE0 (2));
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vmaxvq_f32 (float32x4_t __a)
{
- return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_nan_v4sf (__a), 0);
+ return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_nan_v4sf (__a),
+ __LANE0 (4));
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vmaxvq_f64 (float64x2_t __a)
{
- return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_nan_v2df (__a), 0);
+ return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_nan_v2df (__a),
+ __LANE0 (2));
}
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vmaxvq_s8 (int8x16_t __a)
{
- return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0);
+ return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), __LANE0 (16));
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vmaxvq_s16 (int16x8_t __a)
{
- return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0);
+ return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), __LANE0 (8));
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vmaxvq_s32 (int32x4_t __a)
{
- return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0);
+ return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), __LANE0 (4));
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vmaxvq_u8 (uint8x16_t __a)
{
return vgetq_lane_u8 ((uint8x16_t)
- __builtin_aarch64_reduc_umax_v16qi ((int8x16_t) __a), 0);
+ __builtin_aarch64_reduc_umax_v16qi ((int8x16_t) __a),
+ __LANE0 (16));
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vmaxvq_u16 (uint16x8_t __a)
{
return vgetq_lane_u16 ((uint16x8_t)
- __builtin_aarch64_reduc_umax_v8hi ((int16x8_t) __a), 0);
+ __builtin_aarch64_reduc_umax_v8hi ((int16x8_t) __a),
+ __LANE0 (8));
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vmaxvq_u32 (uint32x4_t __a)
{
return vgetq_lane_u32 ((uint32x4_t)
- __builtin_aarch64_reduc_umax_v4si ((int32x4_t) __a), 0);
+ __builtin_aarch64_reduc_umax_v4si ((int32x4_t) __a),
+ __LANE0 (4));
}
/* vmaxnmv */
@@ -20354,19 +20377,20 @@ vmaxvq_u32 (uint32x4_t __a)
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vmaxnmv_f32 (float32x2_t __a)
{
- return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a), 0);
+ return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a),
+ __LANE0 (2));
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vmaxnmvq_f32 (float32x4_t __a)
{
- return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0);
+ return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), __LANE0 (4));
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vmaxnmvq_f64 (float64x2_t __a)
{
- return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0);
+ return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), __LANE0 (2));
}
/* vmin */
@@ -20492,97 +20516,107 @@ vminnmq_f64 (float64x2_t __a, float64x2_t __b)
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vminv_f32 (float32x2_t __a)
{
- return vget_lane_f32 (__builtin_aarch64_reduc_smin_nan_v2sf (__a), 0);
+ return vget_lane_f32 (__builtin_aarch64_reduc_smin_nan_v2sf (__a),
+ __LANE0 (2));
}
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vminv_s8 (int8x8_t __a)
{
- return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a), 0);
+ return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a),
+ __LANE0 (8));
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vminv_s16 (int16x4_t __a)
{
- return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0);
+ return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), __LANE0 (4));
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vminv_s32 (int32x2_t __a)
{
- return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0);
+ return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), __LANE0 (2));
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vminv_u8 (uint8x8_t __a)
{
return vget_lane_u8 ((uint8x8_t)
- __builtin_aarch64_reduc_umin_v8qi ((int8x8_t) __a), 0);
+ __builtin_aarch64_reduc_umin_v8qi ((int8x8_t) __a),
+ __LANE0 (8));
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vminv_u16 (uint16x4_t __a)
{
return vget_lane_u16 ((uint16x4_t)
- __builtin_aarch64_reduc_umin_v4hi ((int16x4_t) __a), 0);
+ __builtin_aarch64_reduc_umin_v4hi ((int16x4_t) __a),
+ __LANE0 (4));
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vminv_u32 (uint32x2_t __a)
{
return vget_lane_u32 ((uint32x2_t)
- __builtin_aarch64_reduc_umin_v2si ((int32x2_t) __a), 0);
+ __builtin_aarch64_reduc_umin_v2si ((int32x2_t) __a),
+ __LANE0 (2));
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vminvq_f32 (float32x4_t __a)
{
- return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_nan_v4sf (__a), 0);
+ return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_nan_v4sf (__a),
+ __LANE0 (4));
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vminvq_f64 (float64x2_t __a)
{
- return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_nan_v2df (__a), 0);
+ return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_nan_v2df (__a),
+ __LANE0 (2));
}
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vminvq_s8 (int8x16_t __a)
{
- return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0);
+ return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), __LANE0 (16));
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vminvq_s16 (int16x8_t __a)
{
- return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0);
+ return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), __LANE0 (8));
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vminvq_s32 (int32x4_t __a)
{
- return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0);
+ return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), __LANE0 (4));
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vminvq_u8 (uint8x16_t __a)
{
return vgetq_lane_u8 ((uint8x16_t)
- __builtin_aarch64_reduc_umin_v16qi ((int8x16_t) __a), 0);
+ __builtin_aarch64_reduc_umin_v16qi ((int8x16_t) __a),
+ __LANE0 (16));
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vminvq_u16 (uint16x8_t __a)
{
return vgetq_lane_u16 ((uint16x8_t)
- __builtin_aarch64_reduc_umin_v8hi ((int16x8_t) __a), 0);
+ __builtin_aarch64_reduc_umin_v8hi ((int16x8_t) __a),
+ __LANE0 (8));
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vminvq_u32 (uint32x4_t __a)
{
return vgetq_lane_u32 ((uint32x4_t)
- __builtin_aarch64_reduc_umin_v4si ((int32x4_t) __a), 0);
+ __builtin_aarch64_reduc_umin_v4si ((int32x4_t) __a),
+ __LANE0 (4));
}
/* vminnmv */
@@ -20590,19 +20624,19 @@ vminvq_u32 (uint32x4_t __a)
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vminnmv_f32 (float32x2_t __a)
{
- return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0);
+ return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), __LANE0 (2));
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vminnmvq_f32 (float32x4_t __a)
{
- return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0);
+ return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), __LANE0 (4));
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vminnmvq_f64 (float64x2_t __a)
{
- return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0);
+ return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), __LANE0 (2));
}
/* vmla */
@@ -25435,6 +25469,8 @@ __INTERLEAVE_LIST (zip)
/* End of optimal implementations in approved order. */
+#undef __LANE0
+
#undef __aarch64_vget_lane_any
#undef __aarch64_vget_lane_f32
#undef __aarch64_vget_lane_f64
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 50bdac9..bf1374c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -101,6 +101,9 @@
;; Vector Float modes with 2 elements.
(define_mode_iterator V2F [V2SF V2DF])
+;; Vector Integer modes with 2 elements.
+(define_mode_iterator V2I [V2SI V2DI])
+
;; All modes.
(define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
@@ -108,7 +111,10 @@
(define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])
;; Vector modes for Integer reduction across lanes.
-(define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI])
+(define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI V2DI])
+
+;; Vector modes for Integer reduction across lanes.
+(define_mode_iterator VDQV_S [V8QI V16QI V4HI V8HI V4SI])
;; All double integer narrow-able modes.
(define_mode_iterator VDN [V4HI V2SI DI])
@@ -585,6 +591,12 @@
(V2DF "_q")
(QI "") (HI "") (SI "") (DI "") (SF "") (DF "")])
+(define_mode_attr vp [(V8QI "v") (V16QI "v")
+ (V4HI "v") (V8HI "v")
+ (V2SI "p") (V4SI "v")
+ (V2DI "p") (V2DF "p")
+ (V2SF "p") (V4SF "v")])
+
;; -------------------------------------------------------------------
;; Code Iterators
;; -------------------------------------------------------------------