On 04/09/2024 14:26, Christophe Lyon wrote:
> Implement vadcq using the new MVE builtins framework.
>
> We re-use most of the code introduced by the previous patch to support
> vadciq: we just need to initialize carry from the input parameter.
>
> 2024-08-28  Christophe Lyon  <christophe.l...@linaro.org>
>
> gcc/
>
>         * config/arm/arm-mve-builtins-base.cc (vadcq_vsbc): Add support
>         for vadcq.
>         * config/arm/arm-mve-builtins-base.def (vadcq): New.
>         * config/arm/arm-mve-builtins-base.h (vadcq): New.
>         * config/arm/arm_mve.h (vadcq): Delete.
>         (vadcq_m): Delete.
>         (vadcq_s32): Delete.
>         (vadcq_u32): Delete.
>         (vadcq_m_s32): Delete.
>         (vadcq_m_u32): Delete.
>         (__arm_vadcq_s32): Delete.
>         (__arm_vadcq_u32): Delete.
>         (__arm_vadcq_m_s32): Delete.
>         (__arm_vadcq_m_u32): Delete.
>         (__arm_vadcq): Delete.
>         (__arm_vadcq_m): Delete.
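
For context while reading the expansion code below: as far as I can tell it
transliterates the carry handling from the arm_mve.h wrappers that the patch
removes further down, where the carry travels through bit 29 (the C flag) of
FPSCR_nzcvqc.  Spelled out as plain C, with made-up helper names and purely
for illustration:

    #include <stdint.h>

    /* Hypothetical helpers (not in the patch) spelling out what the
       removed arm_mve.h wrappers do around the VADC builtin.  */

    /* The extra step vadcq needs over vadciq: seed FPSCR.C from the
       carry-in while preserving the other FPSCR_nzcvqc bits.  */
    static uint32_t
    seed_carry_in (uint32_t fpscr_nzcvqc, unsigned carry_in)
    {
      return (fpscr_nzcvqc & ~0x20000000u) | ((carry_in & 1u) << 29);
    }

    /* Both vadciq and vadcq finish by copying FPSCR.C back to *carry.  */
    static unsigned
    extract_carry_out (uint32_t fpscr_nzcvqc)
    {
      return (fpscr_nzcvqc >> 29) & 1u;
    }

So the "initialize carry from the input parameter" step mentioned above is
seed_carry_in; vadciq doesn't need it because VADCI supplies a zero carry-in
by itself.
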
> +    if (!m_init_carry)
> +      {
> +        /* Prepare carry in:
> +           set_fpscr ( (fpscr & ~0x20000000u)
> +                       | ((*carry & 1u) << 29) ) */
> +        rtx carry_in = gen_reg_rtx (SImode);
> +        rtx fpscr = gen_reg_rtx (SImode);
> +        emit_insn (gen_get_fpscr_nzcvqc (fpscr));
> +        emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr)));
> +
> +        emit_insn (gen_rtx_SET (carry_in,
> +                                gen_rtx_ASHIFT (SImode,
> +                                                carry_in,
> +                                                GEN_INT (29))));
> +        emit_insn (gen_rtx_SET (carry_in,
> +                                gen_rtx_AND (SImode,
> +                                             carry_in,
> +                                             GEN_INT (0x20000000))));
> +        emit_insn (gen_rtx_SET (fpscr,
> +                                gen_rtx_AND (SImode,
> +                                             fpscr,
> +                                             GEN_INT (~0x20000000))));
> +        emit_insn (gen_rtx_SET (carry_in,
> +                                gen_rtx_IOR (SImode,
> +                                             carry_in,
> +                                             fpscr)));
> +        emit_insn (gen_set_fpscr_nzcvqc (carry_in));
> +      }

What's the logic here?  Are we just trying to set the C flag to
*carry != 0 (is carry a bool?)?  Do we really need to preserve all the
other bits in NZCV?  I wouldn't have thought so, suggesting that:

    CMP *carry, #1    // Set C if *carry != 0

ought to be enough, without having to generate a read-modify-write
sequence.
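
To make the comparison concrete: if the other FPSCR_nzcvqc bits genuinely
don't need preserving here, the carry-in preparation presumably reduces to
materialising just the C bit from *carry, along the lines of this sketch
(illustrative only, not a proposed patch):

    /* Compute only FPSCR.C (bit 29) from the carry-in, instead of the
       load/shift/mask/or read-modify-write of the whole register.  */
    static unsigned int
    carry_in_flag_only (unsigned int carry_in)
    {
      return carry_in != 0 ? (1u << 29) : 0u;
    }
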
R.

> ---
>  gcc/config/arm/arm-mve-builtins-base.cc  | 61 +++++++++++++++--
>  gcc/config/arm/arm-mve-builtins-base.def |  1 +
>  gcc/config/arm/arm-mve-builtins-base.h   |  1 +
>  gcc/config/arm/arm_mve.h                 | 87 ------------------------
>  4 files changed, 56 insertions(+), 94 deletions(-)
>
> diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc
> index 6f3b18c2915..9c2e11356ef 100644
> --- a/gcc/config/arm/arm-mve-builtins-base.cc
> +++ b/gcc/config/arm/arm-mve-builtins-base.cc
> @@ -559,10 +559,19 @@ public:
>  class vadc_vsbc_impl : public function_base
>  {
>  public:
> +  CONSTEXPR vadc_vsbc_impl (bool init_carry)
> +    : m_init_carry (init_carry)
> +  {}
> +
> +  /* Initialize carry with 0 (vadci).  */
> +  bool m_init_carry;
> +
>    unsigned int
>    call_properties (const function_instance &) const override
>    {
>      unsigned int flags = CP_WRITE_MEMORY | CP_READ_FPCR;
> +    if (!m_init_carry)
> +      flags |= CP_READ_MEMORY;
>      return flags;
>    }
>
> @@ -605,22 +614,59 @@ public:
>      carry_ptr = e.args[carry_out_arg_no];
>      e.args.ordered_remove (carry_out_arg_no);
>
> +    if (!m_init_carry)
> +      {
> +        /* Prepare carry in:
> +           set_fpscr ( (fpscr & ~0x20000000u)
> +                       | ((*carry & 1u) << 29) ) */
> +        rtx carry_in = gen_reg_rtx (SImode);
> +        rtx fpscr = gen_reg_rtx (SImode);
> +        emit_insn (gen_get_fpscr_nzcvqc (fpscr));
> +        emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr)));
> +
> +        emit_insn (gen_rtx_SET (carry_in,
> +                                gen_rtx_ASHIFT (SImode,
> +                                                carry_in,
> +                                                GEN_INT (29))));
> +        emit_insn (gen_rtx_SET (carry_in,
> +                                gen_rtx_AND (SImode,
> +                                             carry_in,
> +                                             GEN_INT (0x20000000))));
> +        emit_insn (gen_rtx_SET (fpscr,
> +                                gen_rtx_AND (SImode,
> +                                             fpscr,
> +                                             GEN_INT (~0x20000000))));
> +        emit_insn (gen_rtx_SET (carry_in,
> +                                gen_rtx_IOR (SImode,
> +                                             carry_in,
> +                                             fpscr)));
> +        emit_insn (gen_set_fpscr_nzcvqc (carry_in));
> +      }
> +
>      switch (e.pred)
>        {
>        case PRED_none:
>         /* No predicate.  */
> -       unspec = e.type_suffix (0).unsigned_p
> -         ? VADCIQ_U
> -         : VADCIQ_S;
> +       unspec = m_init_carry
> +         ? (e.type_suffix (0).unsigned_p
> +            ? VADCIQ_U
> +            : VADCIQ_S)
> +         : (e.type_suffix (0).unsigned_p
> +            ? VADCQ_U
> +            : VADCQ_S);
>         code = code_for_mve_q_v4si (unspec, unspec);
>         insns = e.use_exact_insn (code);
>         break;
>
>        case PRED_m:
>         /* "m" predicate.  */
> -       unspec = e.type_suffix (0).unsigned_p
> -         ? VADCIQ_M_U
> -         : VADCIQ_M_S;
> +       unspec = m_init_carry
> +         ? (e.type_suffix (0).unsigned_p
> +            ? VADCIQ_M_U
> +            : VADCIQ_M_S)
> +         : (e.type_suffix (0).unsigned_p
> +            ? VADCQ_M_U
> +            : VADCQ_M_S);
>         code = code_for_mve_q_m_v4si (unspec, unspec);
>         insns = e.use_cond_insn (code, 0);
>         break;
> @@ -816,7 +862,8 @@ namespace arm_mve {
>  FUNCTION_PRED_P_S_U (vabavq, VABAVQ)
>  FUNCTION_WITHOUT_N (vabdq, VABDQ)
>  FUNCTION (vabsq, unspec_based_mve_function_exact_insn, (ABS, ABS, ABS, -1, -1, -1, VABSQ_M_S, -1, VABSQ_M_F, -1, -1, -1))
> -FUNCTION (vadciq, vadc_vsbc_impl,)
> +FUNCTION (vadciq, vadc_vsbc_impl, (true))
> +FUNCTION (vadcq, vadc_vsbc_impl, (false))
>  FUNCTION_WITH_RTX_M_N (vaddq, PLUS, VADDQ)
>  FUNCTION_PRED_P_S_U (vaddlvaq, VADDLVAQ)
>  FUNCTION_PRED_P_S_U (vaddlvq, VADDLVQ)
> diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def
> index 72d6461c4e4..37efa6bf13e 100644
> --- a/gcc/config/arm/arm-mve-builtins-base.def
> +++ b/gcc/config/arm/arm-mve-builtins-base.def
> @@ -22,6 +22,7 @@ DEF_MVE_FUNCTION (vabavq, binary_acca_int32, all_integer, p_or_none)
>  DEF_MVE_FUNCTION (vabdq, binary, all_integer, mx_or_none)
>  DEF_MVE_FUNCTION (vabsq, unary, all_signed, mx_or_none)
>  DEF_MVE_FUNCTION (vadciq, vadc_vsbc, integer_32, m_or_none)
> +DEF_MVE_FUNCTION (vadcq, vadc_vsbc, integer_32, m_or_none)
>  DEF_MVE_FUNCTION (vaddlvaq, unary_widen_acc, integer_32, p_or_none)
>  DEF_MVE_FUNCTION (vaddlvq, unary_acc, integer_32, p_or_none)
>  DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_integer, mx_or_none)
> diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h
> index 2dfc2e18062..eb8423c3fe2 100644
> --- a/gcc/config/arm/arm-mve-builtins-base.h
> +++ b/gcc/config/arm/arm-mve-builtins-base.h
> @@ -27,6 +27,7 @@ extern const function_base *const vabavq;
>  extern const function_base *const vabdq;
>  extern const function_base *const vabsq;
>  extern const function_base *const vadciq;
> +extern const function_base *const vadcq;
>  extern const function_base *const vaddlvaq;
>  extern const function_base *const vaddlvq;
>  extern const function_base *const vaddq;
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index 3a0b3041c42..dd7b6f5cdab 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -85,8 +85,6 @@
>  #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p)
>  #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p)
>  #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value)
> -#define vadcq(__a, __b, __carry) __arm_vadcq(__a, __b, __carry)
> -#define vadcq_m(__inactive, __a, __b, __carry, __p) __arm_vadcq_m(__inactive, __a, __b, __carry, __p)
>  #define vsbciq(__a, __b, __carry_out) __arm_vsbciq(__a, __b, __carry_out)
>  #define vsbciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m(__inactive, __a, __b, __carry_out, __p)
>  #define vsbcq(__a, __b, __carry) __arm_vsbcq(__a, __b, __carry)
> @@ -319,10 +317,6 @@
>  #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value)
>  #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value)
>  #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value)
> -#define vadcq_s32(__a, __b, __carry) __arm_vadcq_s32(__a, __b, __carry)
> -#define vadcq_u32(__a, __b, __carry) __arm_vadcq_u32(__a, __b, __carry)
> -#define vadcq_m_s32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_s32(__inactive, __a, __b, __carry, __p)
> -#define vadcq_m_u32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_u32(__inactive, __a, __b, __carry, __p)
>  #define vsbciq_s32(__a, __b, __carry_out) __arm_vsbciq_s32(__a, __b, __carry_out)
>  #define vsbciq_u32(__a, __b, __carry_out) __arm_vsbciq_u32(__a, __b, __carry_out)
>  #define vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p)
> @@ -1684,46 +1678,6 @@ __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3
>    *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value);
>  }
>
> -__extension__ extern __inline int32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vadcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry)
> -{
> -  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
> -  int32x4_t __res = __builtin_mve_vadcq_sv4si (__a, __b);
> -  *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vadcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry)
> -{
> -  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
> -  uint32x4_t __res = __builtin_mve_vadcq_uv4si (__a, __b);
> -  *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
> -  return __res;
> -}
> -
> -__extension__ extern __inline int32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vadcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p)
> -{
> -  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
> -  int32x4_t __res = __builtin_mve_vadcq_m_sv4si (__inactive, __a, __b, __p);
> -  *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vadcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p)
> -{
> -  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
> -  uint32x4_t __res = __builtin_mve_vadcq_m_uv4si (__inactive, __a, __b, __p);
> -  *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
> -  return __res;
> -}
> -
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vsbciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out)
> @@ -3600,34 +3554,6 @@ __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_
>    __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value);
>  }
>
> -__extension__ extern __inline int32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vadcq (int32x4_t __a, int32x4_t __b, unsigned * __carry)
> -{
> -  return __arm_vadcq_s32 (__a, __b, __carry);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vadcq (uint32x4_t __a, uint32x4_t __b, unsigned * __carry)
> -{
> -  return __arm_vadcq_u32 (__a, __b, __carry);
> -}
> -
> -__extension__ extern __inline int32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vadcq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p)
> -{
> -  return __arm_vadcq_m_s32 (__inactive, __a, __b, __carry, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vadcq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p)
> -{
> -  return __arm_vadcq_m_u32 (__inactive, __a, __b, __carry, __p);
> -}
> -
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vsbciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out)
> @@ -5245,19 +5171,6 @@ extern void *__ARM_undef;
>    int (*)[__ARM_mve_type_int64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_s64 (__ARM_mve_coerce_s64_ptr(p0, int64_t *), p1, p2), \
>    int (*)[__ARM_mve_type_uint64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_u64 (__ARM_mve_coerce_u64_ptr(p0, uint64_t *), p1, p2)))
>
> -#define __arm_vadcq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \
> -  __typeof(p1) __p1 = (p1); \
> -  __typeof(p2) __p2 = (p2); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
> -  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \
> -  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));})
> -
> -#define __arm_vadcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
> -  __typeof(p1) __p1 = (p1); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
> -  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \
> -  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));})
> -
>  #define __arm_vsbciq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \
>    __typeof(p1) __p1 = (p1); \
>    __typeof(p2) __p2 = (p2); \