https://gcc.gnu.org/g:d54a66c1d81ca3874be4c086652f205b1d6ebe49
commit d54a66c1d81ca3874be4c086652f205b1d6ebe49
Author: Saurabh Jha <saurabh....@arm.com>
Date:   Tue Nov 19 22:38:51 2024 +0000

    Work in progress for refactoring SIMD intrinsics

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc        |  138 ++-
 .../aarch64/aarch64-simd-pragma-builtins.def  |  156 +++
 gcc/config/aarch64/aarch64-simd.md            |   21 +-
 gcc/config/aarch64/arm_neon.h                 | 1183 ++++----------------
 gcc/config/aarch64/iterators.md               |    5 +
 5 files changed, 518 insertions(+), 985 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 7b2decf671fa..62adc62976c8 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -696,6 +696,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
   VREINTERPRET_BUILTINS \
   VREINTERPRETQ_BUILTINS
 
+/* TODO: Add fp8 entries here and in the corresponding VGET_HIGH list.  */
 #define AARCH64_SIMD_VGET_LOW_BUILTINS \
   VGET_LOW_BUILTIN(f16) \
   VGET_LOW_BUILTIN(f32) \
@@ -1608,31 +1609,85 @@ namespace simd_types {
   constexpr simd_type f8 { V8QImode, qualifier_modal_float };
   constexpr simd_type f8q { V16QImode, qualifier_modal_float };
 
+  constexpr simd_type s8_scalar_const_ptr
+    { QImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type s8_scalar { QImode, qualifier_none };
   constexpr simd_type s8 { V8QImode, qualifier_none };
-  constexpr simd_type u8 { V8QImode, qualifier_unsigned };
   constexpr simd_type s8q { V16QImode, qualifier_none };
+  constexpr simd_type u8_scalar_const_ptr
+    { QImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type u8_scalar { QImode, qualifier_unsigned };
+  constexpr simd_type u8 { V8QImode, qualifier_unsigned };
   constexpr simd_type u8q { V16QImode, qualifier_unsigned };
 
+  constexpr simd_type s16_scalar_const_ptr
+    { HImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type s16_scalar { HImode, qualifier_none };
   constexpr simd_type s16 { V4HImode, qualifier_none };
+  constexpr simd_type u16_scalar_const_ptr
+    { HImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type u16_scalar { HImode, qualifier_unsigned };
   constexpr simd_type u16 { V4HImode, qualifier_unsigned };
   constexpr simd_type s16q { V8HImode, qualifier_none };
   constexpr simd_type u16q { V8HImode, qualifier_unsigned };
 
+  constexpr simd_type s32_scalar_const_ptr
+    { SImode, qualifier_const_pointer_map_mode };
   constexpr simd_type s32_index { SImode, qualifier_lane_index };
+  constexpr simd_type s32_scalar { SImode, qualifier_none };
   constexpr simd_type s32 { V2SImode, qualifier_none };
+  constexpr simd_type u32_scalar_const_ptr
+    { SImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type u32_scalar { SImode, qualifier_unsigned };
+  constexpr simd_type u32 { V2SImode, qualifier_unsigned };
   constexpr simd_type s32q { V4SImode, qualifier_none };
-
+  constexpr simd_type u32q { V4SImode, qualifier_unsigned };
+
+  constexpr simd_type s64_scalar_const_ptr
+    { DImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type s64_scalar { DImode, qualifier_none };
+  constexpr simd_type s64 { V1DImode, qualifier_none };
+  constexpr simd_type u64_scalar_const_ptr
+    { DImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type u64_scalar { DImode, qualifier_unsigned };
+  constexpr simd_type u64 { V1DImode, qualifier_unsigned };
   constexpr simd_type s64q { V2DImode, qualifier_none };
+  constexpr simd_type u64q { V2DImode, qualifier_unsigned };
+
+  constexpr simd_type p8_scalar_const_ptr
+    { QImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type p8_scalar { QImode, qualifier_poly };
   constexpr simd_type p8 { V8QImode, qualifier_poly };
   constexpr simd_type p8q { V16QImode, qualifier_poly };
+
+  constexpr simd_type p16_scalar_const_ptr
+    { HImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type p16_scalar { HImode, qualifier_poly };
   constexpr simd_type p16 { V4HImode, qualifier_poly };
   constexpr simd_type p16q { V8HImode, qualifier_poly };
+  constexpr simd_type p64_scalar_const_ptr
+    { DImode, qualifier_const_pointer_map_mode };
+  constexpr simd_type p64_scalar { DImode, qualifier_poly };
+  constexpr simd_type p64 { V1DImode, qualifier_poly };
+  constexpr simd_type p64q { V2DImode, qualifier_poly };
+
+  constexpr simd_type f16_scalar_const_ptr
+    { HFmode, qualifier_const_pointer_map_mode };
+  constexpr simd_type f16_scalar { HFmode, qualifier_none };
   constexpr simd_type f16 { V4HFmode, qualifier_none };
   constexpr simd_type f16q { V8HFmode, qualifier_none };
+
+  constexpr simd_type f32_scalar_const_ptr
+    { SFmode, qualifier_const_pointer_map_mode };
+  constexpr simd_type f32_scalar { SFmode, qualifier_none };
   constexpr simd_type f32 { V2SFmode, qualifier_none };
   constexpr simd_type f32q { V4SFmode, qualifier_none };
+
+  constexpr simd_type f64_scalar_const_ptr
+    { DFmode, qualifier_const_pointer_map_mode };
+  constexpr simd_type f64_scalar { DFmode, qualifier_none };
+  constexpr simd_type f64 { V1DFmode, qualifier_none };
   constexpr simd_type f64q { V2DFmode, qualifier_none };
 
   constexpr simd_type bf16 { V4BFmode, qualifier_none };
@@ -3592,8 +3647,87 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
       gcc_unreachable ();
 
       expand_insn (icode, nargs + 1, ops);
+      target = ops[0].value;
+      break;
+
+    case UNSPEC_VCREATE:
+      target = force_lowpart_subreg (builtin_data->types[0].mode,
+				     expand_normal (CALL_EXPR_ARG (exp, 0)),
+				     DImode);
+      break;
+
+    case UNSPEC_VEC_COPY:
+      {
+	/* TODO: Perform lane bounds checks and set the lane indexes
+	   correctly here.  */
+	expand_operand vget_ops[3];
+	rtx vget_target = NULL_RTX;
+	auto vget_output_mode = GET_MODE_INNER (builtin_data->types[0].mode);
+	create_output_operand (&vget_ops[0], vget_target, vget_output_mode);
+	vget_ops[1] = ops[3];
+	vget_ops[2] = ops[4];
+	auto vget_icode
+	  = code_for_aarch64_get_lane (builtin_data->types[0].mode);
+	expand_insn (vget_icode, 3, vget_ops);
+	vget_target = vget_ops[0].value;
+
+	expand_operand vset_ops[4];
+	create_output_operand (&vset_ops[0], target,
+			       builtin_data->types[0].mode);
+	vset_ops[1] = vget_ops[0];
+	vset_ops[2] = ops[2];
+	vset_ops[3] = ops[1];
+	auto vset_icode
+	  = code_for_aarch64_simd_vec_set (builtin_data->types[0].mode);
+	expand_insn (vset_icode, 4, vset_ops);
+
+	target = vset_ops[0].value;
+	break;
+      }
+
+    case UNSPEC_DUP:
+      target = expand_vector_broadcast (builtin_data->types[0].mode,
+					expand_normal (CALL_EXPR_ARG (exp, 0)));
+      break;
+
+    case UNSPEC_DUPB:
+      icode = code_for_aarch64_get_lane (builtin_data->types[1].mode);
+      expand_insn (icode, nargs + 1, ops);
+      target = ops[0].value;
       break;
 
+    case UNSPEC_LD1:
+      {
+	if (builtin_data->types[0].mode == V1DFmode)
+	  target
+	    = expand_vector_broadcast (builtin_data->types[0].mode,
+				       expand_normal (CALL_EXPR_ARG (exp, 0)));
+	else
+	  {
+	    icode = code_for_aarch64_ld1 (builtin_data->types[0].mode);
+	    auto input
+	      = convert_memory_address (Pmode,
+					expand_normal (CALL_EXPR_ARG (exp, 0)));
+	    create_input_operand (&ops[1], input, Pmode);
+	    expand_insn (icode, nargs + 1, ops);
+	  }
+	target = ops[0].value;
+	break;
+      }
+
+    case UNSPEC_DUP_LANE:
+      {
+	/* We need to do lane checks here.
*/ + auto lane = INTVAL (expand_normal (CALL_EXPR_ARG (exp, 1))); + auto vector_mode = builtin_data->types[1].mode; + auto nunits = GET_MODE_NUNITS (vector_mode).to_constant (); + create_input_operand(&ops[2], + gen_int_mode ((ENDIAN_LANE_N (nunits, lane)), + SImode), + SImode); + icode = code_for_aarch64_dup_lane (builtin_data->types[0].mode); + expand_insn (icode, nargs + 1, ops); + target = ops[0].value; + break; + } default: gcc_unreachable (); } diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def index c7857123ca03..deea4b9eb1a2 100644 --- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def @@ -27,6 +27,10 @@ #define ENTRY_BINARY_FPM(N, T0, T1, T2, U) \ ENTRY (N, binary, T0, T1, T2, none, none, true, U) +#undef ENTRY_BINARY_TWO_LANES +#define ENTRY_BINARY_TWO_LANES(N, T0, T1, T2, U) \ + ENTRY (N, quaternary, T0, T1, s32_index, T2, s32_index, false, U) + #undef ENTRY_TERNARY_FPM #define ENTRY_TERNARY_FPM(N, T0, T1, T2, T3, U) \ ENTRY (N, ternary, T0, T1, T2, T3, none, true, U) @@ -35,6 +39,14 @@ #define ENTRY_TERNARY_FPM_LANE(N, T0, T1, T2, T3, U) \ ENTRY (N, quaternary, T0, T1, T2, T3, s32_index, true, U) +#undef ENTRY_UNARY +#define ENTRY_UNARY(N, T0, T1, U) \ + ENTRY (N, unary, T0, T1, none, none, none, false, U) + +#undef ENTRY_UNARY_LANE +#define ENTRY_UNARY_LANE(N, T0, T1, U) \ + ENTRY_BINARY (N, T0, T1, s32_index, U) \ + #undef ENTRY_UNARY_FPM #define ENTRY_UNARY_FPM(N, T0, T1, U) \ ENTRY (N, unary, T0, T1, none, none, none, true, U) @@ -70,6 +82,99 @@ ENTRY_BINARY (NAME##q_f32, f32q, f32q, s32q, UNSPEC) \ ENTRY_BINARY (NAME##q_f64, f64q, f64q, s64q, UNSPEC) +#undef ENTRY_UNARY_N_VALL_F16_SCALAR +#define ENTRY_UNARY_N_VALL_F16_SCALAR(NAME, UNSPEC) \ + ENTRY_UNARY (NAME##_n_p8, p8, p8_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_p8, p8q, p8_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_p16, p16, p16_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_p16, p16q, p16_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_p64, p64, p64_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_p64, p64q, p64_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_s8, s8, s8_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_s8, s8q, s8_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_s16, s16, s16_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_s16, s16q, s16_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_s32, s32, s32_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_s32, s32q, s32_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_s64, s64, s64_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_s64, s64q, s64_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_u8, u8, u8_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_u8, u8q, u8_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_u16, u16, u16_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_u16, u16q, u16_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_u32, u32, u32_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_u32, u32q, u32_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_u64, u64, u64_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_u64, u64q, u64_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_f16, f16, f16_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_f16, f16q, f16_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_f32, f32, f32_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_f32, f32q, f32_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##_n_f64, f64, f64_scalar, UNSPEC) \ + ENTRY_UNARY (NAME##q_n_f64, f64q, f64_scalar, UNSPEC) \ + +#undef ENTRY_UNARY_VALL_F16_CONST_PTR +#define ENTRY_UNARY_VALL_F16_CONST_PTR(NAME, UNSPEC) \ + ENTRY_UNARY (NAME##_p8, p8, p8_scalar_const_ptr, UNSPEC) \ + 
ENTRY_UNARY (NAME##q_p8, p8q, p8_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_p16, p16, p16_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_p16, p16q, p16_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_p64, p64, p64_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_p64, p64q, p64_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_s8, s8, s8_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_s8, s8q, s8_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_s16, s16, s16_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_s16, s16q, s16_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_s32, s32, s32_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_s32, s32q, s32_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_s64, s64, s64_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_s64, s64q, s64_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_u8, u8, u8_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_u8, u8q, u8_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_u16, u16, u16_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_u16, u16q, u16_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_u32, u32, u32_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_u32, u32q, u32_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_u64, u64, u64_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_u64, u64q, u64_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_f16, f16, f16_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_f16, f16q, f16_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_f32, f32, f32_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_f32, f32q, f32_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##_f64, f64, f64_scalar_const_ptr, UNSPEC) \ + ENTRY_UNARY (NAME##q_f64, f64q, f64_scalar_const_ptr, UNSPEC) \ + +#undef ENTRY_UNARY_LANE_VALL_F16 +#define ENTRY_UNARY_LANE_VALL_F16(NAME, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_p8, p8, p8, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_p8, p8, p8q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_p16, p16, p16, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_p16, p16, p16q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_p64, p64, p64, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_p64, p64, p64q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_s8, s8, s8, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_s8, s8, s8q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_s16, s16, s16, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_s16, s16, s16q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_s32, s32, s32, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_s32, s32, s32q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_s64, s64, s64, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_s64, s64, s64q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_u8, u8, u8, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_u8, u8, u8q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_u16, u16, u16, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_u16, u16, u16q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_u32, u32, u32, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_u32, u32, u32q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_u64, u64, u64, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_u64, u64, u64q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_f16, f16, f16, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_f16, f16, f16q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_f32, f32, f32, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_f32, f32, f32q, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_lane_f64, f64, f64, UNSPEC) \ + ENTRY_UNARY_LANE (NAME##_laneq_f64, f64, f64q, UNSPEC) \ + // faminmax #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX) ENTRY_VHSDF (vamax, UNSPEC_FAMAX) @@ 
-122,3 +227,54 @@ ENTRY_FMA_FPM (vmlallbtq, f32, UNSPEC_FMLALLBT)
 ENTRY_FMA_FPM (vmlalltbq, f32, UNSPEC_FMLALLTB)
 ENTRY_FMA_FPM (vmlallttq, f32, UNSPEC_FMLALLTT)
 #undef REQUIRED_EXTENSIONS
+
+// dup
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
+ENTRY_UNARY_N_VALL_F16_SCALAR (vdup, UNSPEC_DUP)
+ENTRY_UNARY_LANE_VALL_F16 (vdup, UNSPEC_DUP_LANE)
+#undef REQUIRED_EXTENSIONS
+
+// mov
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
+ENTRY_UNARY_N_VALL_F16_SCALAR (vmov, UNSPEC_DUP)
+#undef REQUIRED_EXTENSIONS
+
+// vcreate
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
+ENTRY_UNARY (vcreate_p8, p8, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_p16, p16, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_p64, p64, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_s8, s8, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_s16, s16, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_s32, s32, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_s64, s64, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_u8, u8, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_u16, u16, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_u32, u32, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_u64, u64, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_f16, f16, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_f32, f32, u64_scalar, UNSPEC_VCREATE)
+ENTRY_UNARY (vcreate_f64, f64, u64_scalar, UNSPEC_VCREATE)
+#undef REQUIRED_EXTENSIONS
+
+// vcopy_lane
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
+ENTRY_BINARY_TWO_LANES (vcopy_lane_p8, p8, p8, p8, UNSPEC_VEC_COPY)
+ENTRY_BINARY_TWO_LANES (vcopy_lane_s8, s8, s8, s8, UNSPEC_VEC_COPY)
+ENTRY_BINARY_TWO_LANES (vcopy_lane_u8, u8, u8, u8, UNSPEC_VEC_COPY)
+#undef REQUIRED_EXTENSIONS
+
+// vdupb_lane
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
+ENTRY_UNARY_LANE (vdupb_lane_s8, s8_scalar, s8, UNSPEC_DUPB)
+ENTRY_UNARY_LANE (vdupb_lane_u8, u8_scalar, u8, UNSPEC_DUPB)
+ENTRY_UNARY_LANE (vdupb_lane_p8, p8_scalar, p8, UNSPEC_DUPB)
+ENTRY_UNARY_LANE (vdupb_laneq_s8, s8_scalar, s8q, UNSPEC_DUPB)
+ENTRY_UNARY_LANE (vdupb_laneq_u8, u8_scalar, u8q, UNSPEC_DUPB)
+ENTRY_UNARY_LANE (vdupb_laneq_p8, p8_scalar, p8q, UNSPEC_DUPB)
+#undef REQUIRED_EXTENSIONS
+
+// ld1
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
+ENTRY_UNARY_VALL_F16_CONST_PTR (vld1, UNSPEC_LD1)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index df0d30af6a11..ffad1623fa7e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -112,7 +112,7 @@
 }
 )
 
-(define_insn "aarch64_dup_lane<mode>"
+(define_insn "@aarch64_dup_lane<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_duplicate:VALL_F16
 	  (vec_select:<VEL>
@@ -121,6 +121,7 @@
 	  )))]
   "TARGET_SIMD"
   {
+    /* TODO: Need to use ENDIAN_LANE_N in existing intrinsics too.  We still need the next line.
*/ operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "dup\\t%0.<Vtype>, %1.<Vetype>[%2]"; } @@ -1164,7 +1165,7 @@ [(set_attr "type" "neon_logic<q>")] ) -(define_insn "aarch64_simd_vec_set<mode>" +(define_insn "@aarch64_simd_vec_set<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w") (vec_merge:VALL_F16 (vec_duplicate:VALL_F16 @@ -1178,9 +1179,9 @@ switch (which_alternative) { case 0: - return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]"; + return "ins1\\t%0.<Vetype>[%p2], %1.<Vetype>[0]"; case 1: - return "ins\\t%0.<Vetype>[%p2], %<vwcore>1"; + return "ins2\\t%0.<Vetype>[%p2], %<vwcore>1"; case 2: return "ld1\\t{%0.<Vetype>}[%p2], %1"; default: @@ -1190,7 +1191,7 @@ [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")] ) -(define_insn "aarch64_simd_vec_set_zero<mode>" +(define_insn "@aarch64_simd_vec_set_zero<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (vec_merge:VALL_F16 (match_operand:VALL_F16 1 "aarch64_simd_imm_zero" "") @@ -1200,7 +1201,7 @@ { int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))); operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt); - return "ins\\t%0.<Vetype>[%p2], <vwcore>zr"; + return "ins3\\t%0.<Vetype>[%p2], <vwcore>zr"; } ) @@ -1220,7 +1221,7 @@ operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt); operands[4] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[4])); - return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]"; + return "ins4\t%0.<Vetype>[%p2], %3.<Vetype>[%4]"; } [(set_attr "type" "neon_ins<q>")] ) @@ -1242,7 +1243,7 @@ operands[4] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[4])); - return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]"; + return "ins5\t%0.<Vetype>[%p2], %3.<Vetype>[%4]"; } [(set_attr "type" "neon_ins<q>")] ) @@ -4357,7 +4358,7 @@ ;; RTL uses GCC vector extension indices throughout so flip only for assembly. ;; Extracting lane zero is split into a simple move when it is between SIMD ;; registers or a store. 
-(define_insn_and_split "aarch64_get_lane<mode>" +(define_insn_and_split "@aarch64_get_lane<mode>" [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv") (vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w, w, w") @@ -8401,7 +8402,7 @@ DONE; }) -(define_expand "aarch64_ld1<VALL_F16:mode>" +(define_expand "@aarch64_ld1<VALL_F16:mode>" [(match_operand:VALL_F16 0 "register_operand") (match_operand:DI 1 "register_operand")] "TARGET_SIMD" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index c727302ac75f..45f24d3bb489 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -2490,104 +2490,6 @@ vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b) return (int32x4_t) __builtin_aarch64_sqrdmulhv4si (__a, __b); } -__extension__ extern __inline int8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_s8 (uint64_t __a) -{ - return (int8x8_t) __a; -} - -__extension__ extern __inline int16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_s16 (uint64_t __a) -{ - return (int16x4_t) __a; -} - -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_s32 (uint64_t __a) -{ - return (int32x2_t) __a; -} - -__extension__ extern __inline int64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_s64 (uint64_t __a) -{ - return (int64x1_t) {__a}; -} - -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_f16 (uint64_t __a) -{ - return (float16x4_t) __a; -} - -__extension__ extern __inline float32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_f32 (uint64_t __a) -{ - return (float32x2_t) __a; -} - -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_u8 (uint64_t __a) -{ - return (uint8x8_t) __a; -} - -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_u16 (uint64_t __a) -{ - return (uint16x4_t) __a; -} - -__extension__ extern __inline uint32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_u32 (uint64_t __a) -{ - return (uint32x2_t) __a; -} - -__extension__ extern __inline uint64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_u64 (uint64_t __a) -{ - return (uint64x1_t) {__a}; -} - -__extension__ extern __inline float64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_f64 (uint64_t __a) -{ - return (float64x1_t) __a; -} - -__extension__ extern __inline poly8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_p8 (uint64_t __a) -{ - return (poly8x8_t) __a; -} - -__extension__ extern __inline poly16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_p16 (uint64_t __a) -{ - return (poly16x4_t) __a; -} - -__extension__ extern __inline poly64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcreate_p64 (uint64_t __a) -{ - return (poly64x1_t) __a; -} - /* vget_lane */ __extension__ extern __inline float16_t @@ -9245,14 +9147,14 @@ vcopy_lane_f64 (float64x1_t __a, const int __lane1, __a, __lane1); } -__extension__ extern __inline poly8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcopy_lane_p8 (poly8x8_t __a, const int __lane1, - poly8x8_t __b, 
const int __lane2) -{ - return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), - __a, __lane1); -} +/* __extension__ extern __inline poly8x8_t */ +/* __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) */ +/* vcopy_lane_p8 (poly8x8_t __a, const int __lane1, */ +/* poly8x8_t __b, const int __lane2) */ +/* { */ +/* return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), */ +/* __a, __lane1); */ +/* } */ __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -9308,14 +9210,14 @@ vcopy_lane_s64 (int64x1_t __a, const int __lane1, __a, __lane1); } -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcopy_lane_u8 (uint8x8_t __a, const int __lane1, - uint8x8_t __b, const int __lane2) -{ - return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), - __a, __lane1); -} +/* __extension__ extern __inline uint8x8_t */ +/* __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) */ +/* vcopy_lane_u8 (uint8x8_t __a, const int __lane1, */ +/* uint8x8_t __b, const int __lane2) */ +/* { */ +/* return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), */ +/* __a, __lane1); */ +/* } */ __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -10456,575 +10358,332 @@ vcvtpq_u64_f64 (float64x2_t __a) /* vdup_n */ +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_p64 (poly64_t __a) +{ + return (poly64x1_t) {__a}; +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_f64 (float64_t __a) +{ + return (float64x1_t) {__a}; +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_s64 (int64_t __a) +{ + return (int64x1_t) {__a}; +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_u64 (uint64_t __a) +{ + return (uint64x1_t) {__a}; +} + +/* vdup_lane */ + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_f64 (float64x1_t __a, const int __b) +{ + return __aarch64_vdup_lane_f64 (__a, __b); +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_p64 (poly64x1_t __a, const int __b) +{ + return __aarch64_vdup_lane_p64 (__a, __b); +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_s64 (int64x1_t __a, const int __b) +{ + return __aarch64_vdup_lane_s64 (__a, __b); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_u64 (uint64x1_t __a, const int __b) +{ + return __aarch64_vdup_lane_u64 (__a, __b); +} + +/* vdup_laneq */ + __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_f16 (float16_t __a) +vdup_laneq_f16 (float16x8_t __a, const int __b) { - return (float16x4_t) {__a, __a, __a, __a}; + return __aarch64_vdup_laneq_f16 (__a, __b); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_f32 (float32_t __a) +vdup_laneq_f32 (float32x4_t __a, const int __b) { - return (float32x2_t) {__a, 
__a}; + return __aarch64_vdup_laneq_f32 (__a, __b); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_f64 (float64_t __a) +vdup_laneq_f64 (float64x2_t __a, const int __b) { - return (float64x1_t) {__a}; + return __aarch64_vdup_laneq_f64 (__a, __b); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_p8 (poly8_t __a) +vdup_laneq_p8 (poly8x16_t __a, const int __b) { - return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdup_laneq_p8 (__a, __b); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_p16 (poly16_t __a) +vdup_laneq_p16 (poly16x8_t __a, const int __b) { - return (poly16x4_t) {__a, __a, __a, __a}; + return __aarch64_vdup_laneq_p16 (__a, __b); } __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_p64 (poly64_t __a) +vdup_laneq_p64 (poly64x2_t __a, const int __b) { - return (poly64x1_t) {__a}; + return __aarch64_vdup_laneq_p64 (__a, __b); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_s8 (int8_t __a) +vdup_laneq_s8 (int8x16_t __a, const int __b) { - return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdup_laneq_s8 (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_s16 (int16_t __a) +vdup_laneq_s16 (int16x8_t __a, const int __b) { - return (int16x4_t) {__a, __a, __a, __a}; + return __aarch64_vdup_laneq_s16 (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_s32 (int32_t __a) +vdup_laneq_s32 (int32x4_t __a, const int __b) { - return (int32x2_t) {__a, __a}; + return __aarch64_vdup_laneq_s32 (__a, __b); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_s64 (int64_t __a) +vdup_laneq_s64 (int64x2_t __a, const int __b) { - return (int64x1_t) {__a}; + return __aarch64_vdup_laneq_s64 (__a, __b); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_u8 (uint8_t __a) +vdup_laneq_u8 (uint8x16_t __a, const int __b) { - return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdup_laneq_u8 (__a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_u16 (uint16_t __a) +vdup_laneq_u16 (uint16x8_t __a, const int __b) { - return (uint16x4_t) {__a, __a, __a, __a}; + return __aarch64_vdup_laneq_u16 (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_u32 (uint32_t __a) +vdup_laneq_u32 (uint32x4_t __a, const int __b) { - return (uint32x2_t) {__a, __a}; + return __aarch64_vdup_laneq_u32 (__a, __b); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_n_u64 (uint64_t __a) +vdup_laneq_u64 (uint64x2_t __a, const int __b) { - return (uint64x1_t) {__a}; + return __aarch64_vdup_laneq_u64 (__a, __b); } -/* vdupq_n */ +/* vdupq_lane */ __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_f16 (float16_t __a) +vdupq_lane_f16 (float16x4_t __a, const 
int __b) { - return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdupq_lane_f16 (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_f32 (float32_t __a) +vdupq_lane_f32 (float32x2_t __a, const int __b) { - return (float32x4_t) {__a, __a, __a, __a}; + return __aarch64_vdupq_lane_f32 (__a, __b); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_f64 (float64_t __a) +vdupq_lane_f64 (float64x1_t __a, const int __b) { - return (float64x2_t) {__a, __a}; + return __aarch64_vdupq_lane_f64 (__a, __b); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p8 (poly8_t __a) +vdupq_lane_p8 (poly8x8_t __a, const int __b) { - return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, - __a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdupq_lane_p8 (__a, __b); } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p16 (poly16_t __a) +vdupq_lane_p16 (poly16x4_t __a, const int __b) { - return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdupq_lane_p16 (__a, __b); } __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p64 (poly64_t __a) +vdupq_lane_p64 (poly64x1_t __a, const int __b) { - return (poly64x2_t) {__a, __a}; + return __aarch64_vdupq_lane_p64 (__a, __b); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s8 (int8_t __a) +vdupq_lane_s8 (int8x8_t __a, const int __b) { - return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, - __a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdupq_lane_s8 (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s16 (int16_t __a) +vdupq_lane_s16 (int16x4_t __a, const int __b) { - return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdupq_lane_s16 (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s32 (int32_t __a) +vdupq_lane_s32 (int32x2_t __a, const int __b) { - return (int32x4_t) {__a, __a, __a, __a}; + return __aarch64_vdupq_lane_s32 (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s64 (int64_t __a) +vdupq_lane_s64 (int64x1_t __a, const int __b) { - return (int64x2_t) {__a, __a}; + return __aarch64_vdupq_lane_s64 (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u8 (uint8_t __a) +vdupq_lane_u8 (uint8x8_t __a, const int __b) { - return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, - __a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdupq_lane_u8 (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u16 (uint16_t __a) +vdupq_lane_u16 (uint16x4_t __a, const int __b) { - return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; + return __aarch64_vdupq_lane_u16 (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u32 (uint32_t __a) +vdupq_lane_u32 (uint32x2_t 
__a, const int __b) { - return (uint32x4_t) {__a, __a, __a, __a}; + return __aarch64_vdupq_lane_u32 (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u64 (uint64_t __a) +vdupq_lane_u64 (uint64x1_t __a, const int __b) { - return (uint64x2_t) {__a, __a}; + return __aarch64_vdupq_lane_u64 (__a, __b); } -/* vdup_lane */ +/* vdupq_laneq */ -__extension__ extern __inline float16x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_f16 (float16x4_t __a, const int __b) +vdupq_laneq_f16 (float16x8_t __a, const int __b) { - return __aarch64_vdup_lane_f16 (__a, __b); + return __aarch64_vdupq_laneq_f16 (__a, __b); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_f32 (float32x2_t __a, const int __b) +vdupq_laneq_f32 (float32x4_t __a, const int __b) { - return __aarch64_vdup_lane_f32 (__a, __b); + return __aarch64_vdupq_laneq_f32 (__a, __b); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_f64 (float64x1_t __a, const int __b) +vdupq_laneq_f64 (float64x2_t __a, const int __b) { - return __aarch64_vdup_lane_f64 (__a, __b); + return __aarch64_vdupq_laneq_f64 (__a, __b); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_p8 (poly8x8_t __a, const int __b) +vdupq_laneq_p8 (poly8x16_t __a, const int __b) { - return __aarch64_vdup_lane_p8 (__a, __b); + return __aarch64_vdupq_laneq_p8 (__a, __b); } -__extension__ extern __inline poly16x4_t +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_p16 (poly16x4_t __a, const int __b) +vdupq_laneq_p16 (poly16x8_t __a, const int __b) { - return __aarch64_vdup_lane_p16 (__a, __b); + return __aarch64_vdupq_laneq_p16 (__a, __b); } -__extension__ extern __inline poly64x1_t +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_p64 (poly64x1_t __a, const int __b) +vdupq_laneq_p64 (poly64x2_t __a, const int __b) { - return __aarch64_vdup_lane_p64 (__a, __b); + return __aarch64_vdupq_laneq_p64 (__a, __b); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_s8 (int8x8_t __a, const int __b) +vdupq_laneq_s8 (int8x16_t __a, const int __b) { - return __aarch64_vdup_lane_s8 (__a, __b); + return __aarch64_vdupq_laneq_s8 (__a, __b); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_s16 (int16x4_t __a, const int __b) +vdupq_laneq_s16 (int16x8_t __a, const int __b) { - return __aarch64_vdup_lane_s16 (__a, __b); + return __aarch64_vdupq_laneq_s16 (__a, __b); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_s32 (int32x2_t __a, const int __b) +vdupq_laneq_s32 (int32x4_t __a, const int __b) { - return __aarch64_vdup_lane_s32 (__a, __b); + return __aarch64_vdupq_laneq_s32 (__a, __b); } -__extension__ extern __inline int64x1_t 
+__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_s64 (int64x1_t __a, const int __b) -{ - return __aarch64_vdup_lane_s64 (__a, __b); -} - -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_u8 (uint8x8_t __a, const int __b) -{ - return __aarch64_vdup_lane_u8 (__a, __b); -} - -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_u16 (uint16x4_t __a, const int __b) -{ - return __aarch64_vdup_lane_u16 (__a, __b); -} - -__extension__ extern __inline uint32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_u32 (uint32x2_t __a, const int __b) -{ - return __aarch64_vdup_lane_u32 (__a, __b); -} - -__extension__ extern __inline uint64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_lane_u64 (uint64x1_t __a, const int __b) -{ - return __aarch64_vdup_lane_u64 (__a, __b); -} - -/* vdup_laneq */ - -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_f16 (float16x8_t __a, const int __b) -{ - return __aarch64_vdup_laneq_f16 (__a, __b); -} - -__extension__ extern __inline float32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_f32 (float32x4_t __a, const int __b) -{ - return __aarch64_vdup_laneq_f32 (__a, __b); -} - -__extension__ extern __inline float64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_f64 (float64x2_t __a, const int __b) -{ - return __aarch64_vdup_laneq_f64 (__a, __b); -} - -__extension__ extern __inline poly8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_p8 (poly8x16_t __a, const int __b) -{ - return __aarch64_vdup_laneq_p8 (__a, __b); -} - -__extension__ extern __inline poly16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_p16 (poly16x8_t __a, const int __b) -{ - return __aarch64_vdup_laneq_p16 (__a, __b); -} - -__extension__ extern __inline poly64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_p64 (poly64x2_t __a, const int __b) -{ - return __aarch64_vdup_laneq_p64 (__a, __b); -} - -__extension__ extern __inline int8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_s8 (int8x16_t __a, const int __b) -{ - return __aarch64_vdup_laneq_s8 (__a, __b); -} - -__extension__ extern __inline int16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_s16 (int16x8_t __a, const int __b) -{ - return __aarch64_vdup_laneq_s16 (__a, __b); -} - -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_s32 (int32x4_t __a, const int __b) -{ - return __aarch64_vdup_laneq_s32 (__a, __b); -} - -__extension__ extern __inline int64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_s64 (int64x2_t __a, const int __b) -{ - return __aarch64_vdup_laneq_s64 (__a, __b); -} - -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_u8 (uint8x16_t __a, const int __b) -{ - return __aarch64_vdup_laneq_u8 (__a, __b); -} - -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_u16 (uint16x8_t __a, const int 
__b) -{ - return __aarch64_vdup_laneq_u16 (__a, __b); -} - -__extension__ extern __inline uint32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_u32 (uint32x4_t __a, const int __b) -{ - return __aarch64_vdup_laneq_u32 (__a, __b); -} - -__extension__ extern __inline uint64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdup_laneq_u64 (uint64x2_t __a, const int __b) -{ - return __aarch64_vdup_laneq_u64 (__a, __b); -} - -/* vdupq_lane */ - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_f16 (float16x4_t __a, const int __b) -{ - return __aarch64_vdupq_lane_f16 (__a, __b); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_f32 (float32x2_t __a, const int __b) -{ - return __aarch64_vdupq_lane_f32 (__a, __b); -} - -__extension__ extern __inline float64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_f64 (float64x1_t __a, const int __b) -{ - return __aarch64_vdupq_lane_f64 (__a, __b); -} - -__extension__ extern __inline poly8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_p8 (poly8x8_t __a, const int __b) -{ - return __aarch64_vdupq_lane_p8 (__a, __b); -} - -__extension__ extern __inline poly16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_p16 (poly16x4_t __a, const int __b) -{ - return __aarch64_vdupq_lane_p16 (__a, __b); -} - -__extension__ extern __inline poly64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_p64 (poly64x1_t __a, const int __b) -{ - return __aarch64_vdupq_lane_p64 (__a, __b); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_s8 (int8x8_t __a, const int __b) -{ - return __aarch64_vdupq_lane_s8 (__a, __b); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_s16 (int16x4_t __a, const int __b) -{ - return __aarch64_vdupq_lane_s16 (__a, __b); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_s32 (int32x2_t __a, const int __b) -{ - return __aarch64_vdupq_lane_s32 (__a, __b); -} - -__extension__ extern __inline int64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_s64 (int64x1_t __a, const int __b) -{ - return __aarch64_vdupq_lane_s64 (__a, __b); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_u8 (uint8x8_t __a, const int __b) -{ - return __aarch64_vdupq_lane_u8 (__a, __b); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_u16 (uint16x4_t __a, const int __b) -{ - return __aarch64_vdupq_lane_u16 (__a, __b); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_u32 (uint32x2_t __a, const int __b) -{ - return __aarch64_vdupq_lane_u32 (__a, __b); -} - -__extension__ extern __inline uint64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_lane_u64 (uint64x1_t __a, const int __b) -{ - return __aarch64_vdupq_lane_u64 (__a, __b); -} - -/* vdupq_laneq */ - -__extension__ extern __inline float16x8_t -__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_f16 (float16x8_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_f16 (__a, __b); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_f32 (float32x4_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_f32 (__a, __b); -} - -__extension__ extern __inline float64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_f64 (float64x2_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_f64 (__a, __b); -} - -__extension__ extern __inline poly8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_p8 (poly8x16_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_p8 (__a, __b); -} - -__extension__ extern __inline poly16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_p16 (poly16x8_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_p16 (__a, __b); -} - -__extension__ extern __inline poly64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_p64 (poly64x2_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_p64 (__a, __b); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_s8 (int8x16_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_s8 (__a, __b); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_s16 (int16x8_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_s16 (__a, __b); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_s32 (int32x4_t __a, const int __b) -{ - return __aarch64_vdupq_laneq_s32 (__a, __b); -} - -__extension__ extern __inline int64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_laneq_s64 (int64x2_t __a, const int __b) +vdupq_laneq_s64 (int64x2_t __a, const int __b) { return __aarch64_vdupq_laneq_s64 (__a, __b); } @@ -11057,28 +10716,6 @@ vdupq_laneq_u64 (uint64x2_t __a, const int __b) return __aarch64_vdupq_laneq_u64 (__a, __b); } -/* vdupb_lane */ -__extension__ extern __inline poly8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupb_lane_p8 (poly8x8_t __a, const int __b) -{ - return __aarch64_vget_lane_any (__a, __b); -} - -__extension__ extern __inline int8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupb_lane_s8 (int8x8_t __a, const int __b) -{ - return __aarch64_vget_lane_any (__a, __b); -} - -__extension__ extern __inline uint8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupb_lane_u8 (uint8x8_t __a, const int __b) -{ - return __aarch64_vget_lane_any (__a, __b); -} - /* vduph_lane */ __extension__ extern __inline float16_t @@ -11157,28 +10794,6 @@ vdupd_lane_u64 (uint64x1_t __a, const int __b) return __a[0]; } -/* vdupb_laneq */ -__extension__ extern __inline poly8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupb_laneq_p8 (poly8x16_t __a, const int __b) -{ - return __aarch64_vget_lane_any (__a, __b); -} - -__extension__ extern __inline int8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupb_laneq_s8 (int8x16_t __a, const int __b) -{ - return __aarch64_vget_lane_any (__a, __b); -} - -__extension__ extern __inline uint8_t -__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vdupb_laneq_u8 (uint8x16_t __a, const int __b) -{ - return __aarch64_vget_lane_any (__a, __b); -} - /* vduph_laneq */ __extension__ extern __inline float16_t @@ -11887,184 +11502,79 @@ vfmss_lane_f32 (float32_t __a, float32_t __b, __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfms_laneq_f32 (float32x2_t __a, float32x2_t __b, - float32x4_t __c, const int __lane) -{ - return __builtin_aarch64_fmav2sf (-__b, - __aarch64_vdup_laneq_f32 (__c, __lane), - __a); -} - -__extension__ extern __inline float64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfms_laneq_f64 (float64x1_t __a, float64x1_t __b, - float64x2_t __c, const int __lane) -{ - float64_t __c0 = __aarch64_vget_lane_any (__c, __lane); - return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])}; -} - -__extension__ extern __inline float64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsd_laneq_f64 (float64_t __a, float64_t __b, - float64x2_t __c, const int __lane) -{ - return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a); -} - -__extension__ extern __inline float32_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmss_laneq_f32 (float32_t __a, float32_t __b, - float32x4_t __c, const int __lane) -{ - return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a); -} - -/* vfmsq_lane */ - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b, - float32x2_t __c, const int __lane) -{ - return __builtin_aarch64_fmav4sf (-__b, - __aarch64_vdupq_lane_f32 (__c, __lane), - __a); -} - -__extension__ extern __inline float64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b, - float64x1_t __c, const int __lane) -{ - return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a); -} - -/* vfmsq_laneq */ - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b, - float32x4_t __c, const int __lane) -{ - return __builtin_aarch64_fmav4sf (-__b, - __aarch64_vdupq_laneq_f32 (__c, __lane), - __a); -} - -__extension__ extern __inline float64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b, - float64x2_t __c, const int __lane) -{ - return __builtin_aarch64_fmav2df (-__b, - __aarch64_vdupq_laneq_f64 (__c, __lane), - __a); -} - -/* vld1 */ - -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f16 (const float16_t *__a) -{ - return __builtin_aarch64_ld1v4hf (__a); -} - -__extension__ extern __inline float32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f32 (const float32_t *__a) -{ - return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) __a); -} - -__extension__ extern __inline float64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f64 (const float64_t *__a) -{ - return (float64x1_t) {*__a}; -} - -__extension__ extern __inline poly8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p8 (const poly8_t *__a) -{ - return __builtin_aarch64_ld1v8qi_ps ( - (const __builtin_aarch64_simd_qi *) __a); -} - -__extension__ extern 
__inline poly16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_p16 (const poly16_t *__a)
-{
-  return __builtin_aarch64_ld1v4hi_ps (
-    (const __builtin_aarch64_simd_hi *) __a);
-}
-
-__extension__ extern __inline poly64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_p64 (const poly64_t *__a)
-{
-  return (poly64x1_t) {*__a};
-}
-
-__extension__ extern __inline int8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_s8 (const int8_t *__a)
+vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
+                float32x4_t __c, const int __lane)
 {
-  return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  return __builtin_aarch64_fmav2sf (-__b,
+                                    __aarch64_vdup_laneq_f32 (__c, __lane),
+                                    __a);
 }
 
-__extension__ extern __inline int16x4_t
+__extension__ extern __inline float64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_s16 (const int16_t *__a)
+vfms_laneq_f64 (float64x1_t __a, float64x1_t __b,
+                float64x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
+  return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])};
 }
 
-__extension__ extern __inline int32x2_t
+__extension__ extern __inline float64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_s32 (const int32_t *__a)
+vfmsd_laneq_f64 (float64_t __a, float64_t __b,
+                 float64x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) __a);
+  return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
 }
 
-__extension__ extern __inline int64x1_t
+__extension__ extern __inline float32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_s64 (const int64_t *__a)
+vfmss_laneq_f32 (float32_t __a, float32_t __b,
+                 float32x4_t __c, const int __lane)
 {
-  return (int64x1_t) {*__a};
+  return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
 }
 
-__extension__ extern __inline uint8x8_t
+/* vfmsq_lane */
+
+__extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_u8 (const uint8_t *__a)
+vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+                float32x2_t __c, const int __lane)
 {
-  return __builtin_aarch64_ld1v8qi_us (
-    (const __builtin_aarch64_simd_qi *) __a);
+  return __builtin_aarch64_fmav4sf (-__b,
+                                    __aarch64_vdupq_lane_f32 (__c, __lane),
+                                    __a);
 }
 
-__extension__ extern __inline uint16x4_t
+__extension__ extern __inline float64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_u16 (const uint16_t *__a)
+vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
+                float64x1_t __c, const int __lane)
 {
-  return __builtin_aarch64_ld1v4hi_us (
-    (const __builtin_aarch64_simd_hi *) __a);
+  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a);
 }
 
-__extension__ extern __inline uint32x2_t
+/* vfmsq_laneq */
+
+__extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_u32 (const uint32_t *__a)
+vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+                 float32x4_t __c, const int __lane)
 {
-  return __builtin_aarch64_ld1v2si_us (
-    (const __builtin_aarch64_simd_si *) __a);
+  return __builtin_aarch64_fmav4sf (-__b,
+                                    __aarch64_vdupq_laneq_f32 (__c, __lane),
+                                    __a);
 }
 
-__extension__ extern __inline uint64x1_t
+__extension__ extern __inline float64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1_u64 (const uint64_t *__a)
+vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+                 float64x2_t __c, const int __lane)
 {
-  return (uint64x1_t) {*__a};
+  return __builtin_aarch64_fmav2df (-__b,
+                                    __aarch64_vdupq_laneq_f64 (__c, __lane),
+                                    __a);
 }
 
 /* vld1x3 */
@@ -12282,87 +11792,6 @@ vld1q_p64_x3 (const poly64_t *__a)
 
 /* vld1q */
 
-__extension__ extern __inline float16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_f16 (const float16_t *__a)
-{
-  return __builtin_aarch64_ld1v8hf (__a);
-}
-
-__extension__ extern __inline float32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_f32 (const float32_t *__a)
-{
-  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) __a);
-}
-
-__extension__ extern __inline float64x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_f64 (const float64_t *__a)
-{
-  return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) __a);
-}
-
-__extension__ extern __inline poly8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_p8 (const poly8_t *__a)
-{
-  return __builtin_aarch64_ld1v16qi_ps (
-    (const __builtin_aarch64_simd_qi *) __a);
-}
-
-__extension__ extern __inline poly16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_p16 (const poly16_t *__a)
-{
-  return __builtin_aarch64_ld1v8hi_ps (
-    (const __builtin_aarch64_simd_hi *) __a);
-}
-
-__extension__ extern __inline poly64x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_p64 (const poly64_t *__a)
-{
-  return __builtin_aarch64_ld1v2di_ps (
-    (const __builtin_aarch64_simd_di *) __a);
-}
-
-__extension__ extern __inline int8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_s8 (const int8_t *__a)
-{
-  return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a);
-}
-
-__extension__ extern __inline int16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_s16 (const int16_t *__a)
-{
-  return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a);
-}
-
-__extension__ extern __inline int32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_s32 (const int32_t *__a)
-{
-  return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a);
-}
-
-__extension__ extern __inline int64x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_s64 (const int64_t *__a)
-{
-  return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_u8 (const uint8_t *__a)
-{
-  return __builtin_aarch64_ld1v16qi_us (
-    (const __builtin_aarch64_simd_qi *) __a);
-}
-
 __extension__ extern __inline uint8x8x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld1_u8_x2 (const uint8_t *__a)
@@ -12574,30 +12003,6 @@ vld1q_p64_x2 (const poly64_t *__a)
 	    (const __builtin_aarch64_simd_di *) __a);
 }
 
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_u16 (const uint16_t *__a)
-{
-  return __builtin_aarch64_ld1v8hi_us (
-    (const __builtin_aarch64_simd_hi *) __a);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_u32 (const uint32_t *__a)
-{
-  return __builtin_aarch64_ld1v4si_us (
-    (const __builtin_aarch64_simd_si *) __a);
-}
-
-__extension__ extern __inline uint64x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vld1q_u64 (const uint64_t *__a)
-{
-  return __builtin_aarch64_ld1v2di_us (
-    (const __builtin_aarch64_simd_di *) __a);
-}
-
 /* vld1(q)_x4.  */
 
 __extension__ extern __inline int8x8x4_t
@@ -16709,18 +16114,11 @@ vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
 
 /* vmov_n_ */
 
-__extension__ extern __inline float16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_f16 (float16_t __a)
-{
-  return vdup_n_f16 (__a);
-}
-
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline poly64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_f32 (float32_t __a)
+vmov_n_p64 (poly64_t __a)
 {
-  return vdup_n_f32 (__a);
+  return (poly64x1_t) {__a};
 }
 
 __extension__ extern __inline float64x1_t
@@ -16730,48 +16128,6 @@ vmov_n_f64 (float64_t __a)
   return (float64x1_t) {__a};
 }
 
-__extension__ extern __inline poly8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_p8 (poly8_t __a)
-{
-  return vdup_n_p8 (__a);
-}
-
-__extension__ extern __inline poly16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_p16 (poly16_t __a)
-{
-  return vdup_n_p16 (__a);
-}
-
-__extension__ extern __inline poly64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_p64 (poly64_t __a)
-{
-  return vdup_n_p64 (__a);
-}
-
-__extension__ extern __inline int8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_s8 (int8_t __a)
-{
-  return vdup_n_s8 (__a);
-}
-
-__extension__ extern __inline int16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_s16 (int16_t __a)
-{
-  return vdup_n_s16 (__a);
-}
-
-__extension__ extern __inline int32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_s32 (int32_t __a)
-{
-  return vdup_n_s32 (__a);
-}
-
 __extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s64 (int64_t __a)
@@ -16779,27 +16135,6 @@ vmov_n_s64 (int64_t __a)
   return (int64x1_t) {__a};
 }
 
-__extension__ extern __inline uint8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_u8 (uint8_t __a)
-{
-  return vdup_n_u8 (__a);
-}
-
-__extension__ extern __inline uint16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_u16 (uint16_t __a)
-{
-  return vdup_n_u16 (__a);
-}
-
-__extension__ extern __inline uint32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_u32 (uint32_t __a)
-{
-  return vdup_n_u32 (__a);
-}
-
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u64 (uint64_t __a)
@@ -16807,104 +16142,6 @@ vmov_n_u64 (uint64_t __a)
   return (uint64x1_t) {__a};
 }
 
-__extension__ extern __inline float16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_f16 (float16_t __a)
-{
-  return vdupq_n_f16 (__a);
-}
-
-__extension__ extern __inline float32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_f32 (float32_t __a)
-{
-  return vdupq_n_f32 (__a);
-}
-
-__extension__ extern __inline float64x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_f64 (float64_t __a)
-{
-  return vdupq_n_f64 (__a);
-}
-
-__extension__ extern __inline poly8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_p8 (poly8_t __a)
-{
-  return vdupq_n_p8 (__a);
-}
-
-__extension__ extern __inline poly16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_p16 (poly16_t __a)
-{
-  return vdupq_n_p16 (__a);
-}
-
-__extension__ extern __inline poly64x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_p64 (poly64_t __a)
-{
-  return vdupq_n_p64 (__a);
-}
-
-__extension__ extern __inline int8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_s8 (int8_t __a)
-{
-  return vdupq_n_s8 (__a);
-}
-
-__extension__ extern __inline int16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_s16 (int16_t __a)
-{
-  return vdupq_n_s16 (__a);
-}
-
-__extension__ extern __inline int32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_s32 (int32_t __a)
-{
-  return vdupq_n_s32 (__a);
-}
-
-__extension__ extern __inline int64x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_s64 (int64_t __a)
-{
-  return vdupq_n_s64 (__a);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_u8 (uint8_t __a)
-{
-  return vdupq_n_u8 (__a);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_u16 (uint16_t __a)
-{
-  return vdupq_n_u16 (__a);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_u32 (uint32_t __a)
-{
-  return vdupq_n_u32 (__a);
-}
-
-__extension__ extern __inline uint64x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_u64 (uint64_t __a)
-{
-  return vdupq_n_u64 (__a);
-}
-
 /* vmul_lane */
 
 __extension__ extern __inline float32x2_t
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 82dc7dcf7621..2cc3686ca13a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -715,6 +715,9 @@
     UNSPEC_ASHIFT_SIGNED	; Used in aarch-simd.md.
     UNSPEC_ASHIFT_UNSIGNED	; Used in aarch64-simd.md.
     UNSPEC_ABS		; Used in aarch64-simd.md.
+    UNSPEC_DUP		; Used in aarch64-simd.md.
+    UNSPEC_DUPB		; Used in aarch64-simd.md.
+    UNSPEC_DUP_LANE	; Used in aarch64-simd.md.
     UNSPEC_FMAX		; Used in aarch64-simd.md.
     UNSPEC_FMAXNMV	; Used in aarch64-simd.md.
     UNSPEC_FMAXV	; Used in aarch64-simd.md.
@@ -765,6 +768,7 @@
     UNSPEC_SSHLL	; Used in aarch64-simd.md.
     UNSPEC_USHLL	; Used in aarch64-simd.md.
     UNSPEC_ADDP		; Used in aarch64-simd.md.
+    UNSPEC_VCREATE	; Used in aarch64-simd.md.
     UNSPEC_VCVT		; Used in aarch64-simd.md.
     UNSPEC_VCVT_HIGH	; Used in aarch64-simd.md.
     UNSPEC_VCVT1	; Used in aarch64-simd.md.
@@ -812,6 +816,7 @@
     UNSPEC_PMULL	; Used in aarch64-simd.md.
     UNSPEC_PMULL2	; Used in aarch64-simd.md.
     UNSPEC_REV_REGLIST	; Used in aarch64-simd.md.
+    UNSPEC_VEC_COPY	; Used in aarch64-simd.md.
     UNSPEC_VEC_SHR	; Used in aarch64-simd.md.
     UNSPEC_SQRDMLAH	; Used in aarch64-simd.md.
     UNSPEC_SQRDMLSH	; Used in aarch64-simd.md.
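
For reviewers checking the refactor against the ACLE spec: the hunks above move or reimplement several intrinsic wrappers, but their user-visible semantics should be unchanged. Below is a minimal standalone sketch, not part of the patch, that exercises two of the affected families; the file name and build flags are illustrative, and it assumes an AArch64 target with any compiler shipping arm_neon.h.

/* check.c: illustrative only; build with e.g. "gcc -O2 check.c" on AArch64.  */
#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  /* vfmsq_laneq_f32 computes a - b * c[lane] in each element, which is
     the fused negate-multiply-add the new inline bodies expand to.  */
  float32x4_t a = vdupq_n_f32 (10.0f);
  float32x4_t b = vdupq_n_f32 (2.0f);
  float32x4_t c = {1.0f, 2.0f, 3.0f, 4.0f};
  float32x4_t r = vfmsq_laneq_f32 (a, b, c, 2);  /* 10 - 2*3 = 4 per lane.  */
  printf ("vfmsq_laneq_f32 lane 0: %f (expect 4.0)\n",
          vgetq_lane_f32 (r, 0));

  /* The vmov(q)_n_* wrappers removed from arm_neon.h are specified by
     ACLE to behave exactly like the corresponding vdup(q)_n_*.  */
  uint32x4_t m = vmovq_n_u32 (7);
  uint32x4_t d = vdupq_n_u32 (7);
  printf ("vmovq_n_u32 == vdupq_n_u32: %d (expect 1)\n",
          vgetq_lane_u32 (m, 0) == vgetq_lane_u32 (d, 0));
  return 0;
}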