Hi, This patch rewrites the vst2_lane, vst3_lane and vst4_lane intrinsics (and their q-suffixed forms) in terms of RTL builtins.
Tested on aarch64-none-elf with no issues. OK to queue for Stage 1? Thanks, James --- gcc/ 2014-02-13 James Greenhalgh <james.greenha...@arm.com> * config/aarch64/aarch64-builtins.c (aarch64_types_storestruct_lane_qualifiers): New. (TYPES_STORESTRUCT_LANE): Likewise. * config/aarch64/aarch64-simd-builtins.def (st2_lane): New. (st3_lane): Likewise. (st4_lane): Likewise. * config/aarch64/aarch64-simd.md (vec_store_lanesoi_lane<mode>): New. (vec_store_lanesci_lane<mode>): Likewise. (vec_store_lanesxi_lane<mode>): Likewise. (aarch64_st2_lane<VQ:mode>): Likewise. (aarch64_st3_lane<VQ:mode>): Likewise. (aarch64_st4_lane<VQ:mode>): Likewise. * config/aarch64/aarch64.md (unspec): Add UNSPEC_ST{2,3,4}_LANE. * config/aarch64/arm_neon.h (__ST2_LANE_FUNC): Rewrite using builtins, update use points to use new macro arguments. (__ST3_LANE_FUNC): Likewise. (__ST4_LANE_FUNC): Likewise. * config/aarch64/iterators.md (V_TWO_ELEM): New. (V_THREE_ELEM): Likewise. (V_FOUR_ELEM): Likewise.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index ebab2ce..a12a1aa 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -226,6 +226,11 @@ aarch64_types_store1_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer_map_mode, qualifier_none }; #define TYPES_STORE1 (aarch64_types_store1_qualifiers) #define TYPES_STORESTRUCT (aarch64_types_store1_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_void, qualifier_pointer_map_mode, + qualifier_none, qualifier_none }; +#define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers) #define CF0(N, X) CODE_FOR_aarch64_##N##X #define CF1(N, X) CODE_FOR_##N##X##1 diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index e5f71b4..7bfdfca 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -107,6 +107,10 @@ BUILTIN_VQ (STORESTRUCT, st3, 0) BUILTIN_VQ (STORESTRUCT, st4, 0) + BUILTIN_VQ (STORESTRUCT_LANE, st2_lane, 0) + BUILTIN_VQ (STORESTRUCT_LANE, st3_lane, 0) + BUILTIN_VQ (STORESTRUCT_LANE, st4_lane, 0) + BUILTIN_VQW (BINOP, saddl2, 0) BUILTIN_VQW (BINOP, uaddl2, 0) BUILTIN_VQW (BINOP, ssubl2, 0) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 4dffb59e856aeaafb79007255d3b91a73ef1ef13..f19b7d5123b5a6249026d48f943445f8167b1c45 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3584,6 +3584,17 @@ (define_insn "vec_store_lanesoi<mode>" [(set_attr "type" "neon_store2_2reg<q>")] ) +(define_insn "vec_store_lanesoi_lane<mode>" + [(set (match_operand:<V_TWO_ELEM> 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:<V_TWO_ELEM> [(match_operand:OI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) + (match_operand:SI 2 
"immediate_operand" "i")] + UNSPEC_ST2_LANE))] + "TARGET_SIMD" + "st2\\t{%S1.<Vetype> - %T1.<Vetype>}[%2], %0" + [(set_attr "type" "neon_store3_one_lane<q>")] +) + (define_insn "vec_load_lanesci<mode>" [(set (match_operand:CI 0 "register_operand" "=w") (unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv") @@ -3604,6 +3615,17 @@ (define_insn "vec_store_lanesci<mode>" [(set_attr "type" "neon_store3_3reg<q>")] ) +(define_insn "vec_store_lanesci_lane<mode>" + [(set (match_operand:<V_THREE_ELEM> 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:<V_THREE_ELEM> [(match_operand:CI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_ST3_LANE))] + "TARGET_SIMD" + "st3\\t{%S1.<Vetype> - %U1.<Vetype>}[%2], %0" + [(set_attr "type" "neon_store3_one_lane<q>")] +) + (define_insn "vec_load_lanesxi<mode>" [(set (match_operand:XI 0 "register_operand" "=w") (unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv") @@ -3624,6 +3646,17 @@ (define_insn "vec_store_lanesxi<mode>" [(set_attr "type" "neon_store4_4reg<q>")] ) +(define_insn "vec_store_lanesxi_lane<mode>" + [(set (match_operand:<V_FOUR_ELEM> 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:<V_FOUR_ELEM> [(match_operand:XI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_ST4_LANE))] + "TARGET_SIMD" + "st4\\t{%S1.<Vetype> - %V1.<Vetype>}[%2], %0" + [(set_attr "type" "neon_store4_one_lane<q>")] +) + ;; Reload patterns for AdvSIMD register list operands. 
(define_expand "mov<mode>" @@ -4118,6 +4151,57 @@ (define_expand "aarch64_st<VSTRUCT:nregs DONE; }) +(define_expand "aarch64_st2_lane<VQ:mode>" + [(match_operand:DI 0 "register_operand" "r") + (match_operand:OI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) + (match_operand:SI 2 "immediate_operand")] + "TARGET_SIMD" +{ + enum machine_mode mode = <V_TWO_ELEM>mode; + rtx mem = gen_rtx_MEM (mode, operands[0]); + operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + + emit_insn (gen_vec_store_lanesoi_lane<VQ:mode> (mem, + operands[1], + operands[2])); + DONE; +}) + +(define_expand "aarch64_st3_lane<VQ:mode>" + [(match_operand:DI 0 "register_operand" "r") + (match_operand:CI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) + (match_operand:SI 2 "immediate_operand")] + "TARGET_SIMD" +{ + enum machine_mode mode = <V_THREE_ELEM>mode; + rtx mem = gen_rtx_MEM (mode, operands[0]); + operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + + emit_insn (gen_vec_store_lanesci_lane<VQ:mode> (mem, + operands[1], + operands[2])); + DONE; +}) + +(define_expand "aarch64_st4_lane<VQ:mode>" + [(match_operand:DI 0 "register_operand" "r") + (match_operand:XI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) + (match_operand:SI 2 "immediate_operand")] + "TARGET_SIMD" +{ + enum machine_mode mode = <V_FOUR_ELEM>mode; + rtx mem = gen_rtx_MEM (mode, operands[0]); + operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + + emit_insn (gen_vec_store_lanesxi_lane<VQ:mode> (mem, + operands[1], + operands[2])); + DONE; +}) + (define_expand "aarch64_st1<VALL:mode>" [(match_operand:DI 0 "register_operand") (match_operand:VALL 1 "register_operand")] diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 99a6ac8fcbdcd24a0ea18cc037bef9cf72070281..243ec8978578d69f5addcf3ad706c548bff7b88e 100644 --- a/gcc/config/aarch64/aarch64.md +++ 
b/gcc/config/aarch64/aarch64.md @@ -98,6 +98,9 @@ (define_c_enum "unspec" [ UNSPEC_ST2 UNSPEC_ST3 UNSPEC_ST4 + UNSPEC_ST2_LANE + UNSPEC_ST3_LANE + UNSPEC_ST4_LANE UNSPEC_TLS UNSPEC_TLSDESC UNSPEC_USHL_2S diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 6af99361..b85dabf 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -14205,131 +14205,224 @@ __LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) __LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) __LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) -#define __ST2_LANE_FUNC(intype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - typedef struct { ptrtype __x[2]; } __ST2_LANE_STRUCTURE_##intype; \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst2 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr, \ - intype b, const int c) \ - { \ - __ST2_LANE_STRUCTURE_##intype *__p = \ - (__ST2_LANE_STRUCTURE_##intype *)ptr; \ - __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \ - "st2 {v16." #lnsuffix ", v17." 
#lnsuffix "}[%2], %0\n\t" \ - : "=Q"(*__p) \ - : "Q"(b), "i"(c) \ - : "v16", "v17"); \ - } - -__ST2_LANE_FUNC (int8x8x2_t, int8_t, 8b, b, s8,) -__ST2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,) -__ST2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,) -__ST2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,) -__ST2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,) -__ST2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,) -__ST2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,) -__ST2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,) -__ST2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,) -__ST2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,) -__ST2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,) -__ST2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,) -__ST2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q) -__ST2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q) -__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q) -__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q) -__ST2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q) -__ST2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q) -__ST2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q) -__ST2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q) -__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q) -__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q) -__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q) -__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q) - -#define __ST3_LANE_FUNC(intype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - typedef struct { ptrtype __x[3]; } __ST3_LANE_STRUCTURE_##intype; \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst3 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr, \ - intype b, const int c) \ - { \ - __ST3_LANE_STRUCTURE_##intype *__p = \ - (__ST3_LANE_STRUCTURE_##intype *)ptr; \ - __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ - "st3 {v16." #lnsuffix " - v18." 
#lnsuffix "}[%2], %0\n\t" \ - : "=Q"(*__p) \ - : "Q"(b), "i"(c) \ - : "v16", "v17", "v18"); \ - } - -__ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,) -__ST3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,) -__ST3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,) -__ST3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,) -__ST3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,) -__ST3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,) -__ST3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,) -__ST3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,) -__ST3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,) -__ST3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,) -__ST3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,) -__ST3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,) -__ST3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q) -__ST3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q) -__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q) -__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q) -__ST3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q) -__ST3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q) -__ST3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q) -__ST3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q) -__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q) -__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q) -__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q) -__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q) - -#define __ST4_LANE_FUNC(intype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - typedef struct { ptrtype __x[4]; } __ST4_LANE_STRUCTURE_##intype; \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst4 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr, \ - intype b, const int c) \ - { \ - __ST4_LANE_STRUCTURE_##intype *__p = \ - (__ST4_LANE_STRUCTURE_##intype *)ptr; \ - __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ - "st4 {v16." #lnsuffix " - v19." 
#lnsuffix "}[%2], %0\n\t" \ - : "=Q"(*__p) \ - : "Q"(b), "i"(c) \ - : "v16", "v17", "v18", "v19"); \ - } - -__ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,) -__ST4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) -__ST4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) -__ST4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) -__ST4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) -__ST4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) -__ST4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) -__ST4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) -__ST4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) -__ST4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) -__ST4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) -__ST4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) -__ST4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) -__ST4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) -__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) -__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) -__ST4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) -__ST4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) -__ST4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) -__ST4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) -__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) -__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) -__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) -__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) +#define __ST2_LANE_FUNC(intype, largetype, ptrtype, \ + mode, ptr_mode, funcsuffix, signedtype) \ +__extension__ static __inline void \ +__attribute__ ((__always_inline__)) \ +vst2_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_oi __o; \ + largetype __temp; \ + __temp.val[0] \ + = vcombine_##funcsuffix (__b.val[0], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[1] \ + = vcombine_##funcsuffix (__b.val[1], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __o = 
__builtin_aarch64_set_qregoi##mode (__o, \ + (signedtype) __temp.val[0], 0); \ + __o = __builtin_aarch64_set_qregoi##mode (__o, \ + (signedtype) __temp.val[1], 1); \ + __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __o, __c); \ +} + +__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v4sf, sf, f32, + float32x4_t) +__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, v2df, df, f64, + float64x2_t) +__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v16qi, qi, p8, int8x16_t) +__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v8hi, hi, p16, + int16x8_t) +__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v16qi, qi, s8, int8x16_t) +__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v8hi, hi, s16, int16x8_t) +__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v4si, si, s32, int32x4_t) +__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, v2di, di, s64, int64x2_t) +__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v16qi, qi, u8, int8x16_t) +__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v8hi, hi, u16, + int16x8_t) +__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v4si, si, u32, + int32x4_t) +__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, v2di, di, u64, + int64x2_t) + +#undef __ST2_LANE_FUNC +#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +__extension__ static __inline void \ +__attribute__ ((__always_inline__)) \ +vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_oi __o; } __temp = { __b }; \ + __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __temp.__o, __c); \ +} + +__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) +__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) +__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) +__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) +__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) 
+__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) +__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) +__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) +__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) +__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) +__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) +__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) + +#define __ST3_LANE_FUNC(intype, largetype, ptrtype, \ + mode, ptr_mode, funcsuffix, signedtype) \ +__extension__ static __inline void \ +__attribute__ ((__always_inline__)) \ +vst3_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_ci __o; \ + largetype __temp; \ + __temp.val[0] \ + = vcombine_##funcsuffix (__b.val[0], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[1] \ + = vcombine_##funcsuffix (__b.val[1], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[2] \ + = vcombine_##funcsuffix (__b.val[2], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __o = __builtin_aarch64_set_qregci##mode (__o, \ + (signedtype) __temp.val[0], 0); \ + __o = __builtin_aarch64_set_qregci##mode (__o, \ + (signedtype) __temp.val[1], 1); \ + __o = __builtin_aarch64_set_qregci##mode (__o, \ + (signedtype) __temp.val[2], 2); \ + __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __o, __c); \ +} + +__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v4sf, sf, f32, + float32x4_t) +__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, v2df, df, f64, + float64x2_t) +__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v16qi, qi, p8, int8x16_t) +__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v8hi, hi, p16, + int16x8_t) +__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v16qi, qi, s8, int8x16_t) +__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v8hi, hi, s16, int16x8_t) +__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v4si, si, s32, int32x4_t) 
+__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, v2di, di, s64, int64x2_t) +__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v16qi, qi, u8, int8x16_t) +__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v8hi, hi, u16, + int16x8_t) +__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v4si, si, u32, + int32x4_t) +__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, v2di, di, u64, + int64x2_t) + +#undef __ST3_LANE_FUNC +#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +__extension__ static __inline void \ +__attribute__ ((__always_inline__)) \ +vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_ci __o; } __temp = { __b }; \ + __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __temp.__o, __c); \ +} + +__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) +__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) +__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) +__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) +__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) +__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) +__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) +__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) +__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) +__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) +__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) +__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) + +#define __ST4_LANE_FUNC(intype, largetype, ptrtype, \ + mode, ptr_mode, funcsuffix, signedtype) \ +__extension__ static __inline void \ +__attribute__ ((__always_inline__)) \ +vst4_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_xi __o; \ + largetype __temp; \ + __temp.val[0] \ + = vcombine_##funcsuffix (__b.val[0], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[1] \ + = 
vcombine_##funcsuffix (__b.val[1], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[2] \ + = vcombine_##funcsuffix (__b.val[2], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[3] \ + = vcombine_##funcsuffix (__b.val[3], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __o = __builtin_aarch64_set_qregxi##mode (__o, \ + (signedtype) __temp.val[0], 0); \ + __o = __builtin_aarch64_set_qregxi##mode (__o, \ + (signedtype) __temp.val[1], 1); \ + __o = __builtin_aarch64_set_qregxi##mode (__o, \ + (signedtype) __temp.val[2], 2); \ + __o = __builtin_aarch64_set_qregxi##mode (__o, \ + (signedtype) __temp.val[3], 3); \ + __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __o, __c); \ +} + +__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v4sf, sf, f32, + float32x4_t) +__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, v2df, df, f64, + float64x2_t) +__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v16qi, qi, p8, int8x16_t) +__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v8hi, hi, p16, + int16x8_t) +__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v16qi, qi, s8, int8x16_t) +__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v8hi, hi, s16, int16x8_t) +__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v4si, si, s32, int32x4_t) +__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, v2di, di, s64, int64x2_t) +__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v16qi, qi, u8, int8x16_t) +__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v8hi, hi, u16, + int16x8_t) +__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v4si, si, u32, + int32x4_t) +__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, v2di, di, u64, + int64x2_t) + +#undef __ST4_LANE_FUNC +#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +__extension__ static __inline void \ +__attribute__ ((__always_inline__)) \ +vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int 
__c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ + __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __temp.__o, __c); \ +} + +__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) +__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) +__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) +__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) +__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) +__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) +__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) +__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) +__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) +__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) +__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) +__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) __extension__ static __inline int64_t __attribute__ ((__always_inline__)) vaddlv_s32 (int32x2_t a) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index f1339b8cc80a76a70ced138f2702a324781db93d..3af5d3925dc12ffa392959debd88f2384ee688d1 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -554,6 +554,32 @@ (define_mode_attr VRL4 [(V8QI "V64QI") ( (define_mode_attr VSTRUCT_DREG [(OI "TI") (CI "EI") (XI "OI")]) +;; Mode of pair of elements for each vector mode, to define transfer +;; size for structure lane/dup loads and stores. +(define_mode_attr V_TWO_ELEM [(V8QI "HI") (V16QI "HI") + (V4HI "SI") (V8HI "SI") + (V2SI "V2SI") (V4SI "V2SI") + (DI "V2DI") (V2DI "V2DI") + (V2SF "V2SF") (V4SF "V2SF") + (DF "V2DI") (V2DF "V2DI")]) + +;; Similar, for three elements. +(define_mode_attr V_THREE_ELEM [(V8QI "BLK") (V16QI "BLK") + (V4HI "BLK") (V8HI "BLK") + (V2SI "BLK") (V4SI "BLK") + (DI "EI") (V2DI "EI") + (V2SF "BLK") (V4SF "BLK") + (DF "EI") (V2DF "EI")]) + +;; Similar, for four elements. 
+(define_mode_attr V_FOUR_ELEM [(V8QI "SI") (V16QI "SI") + (V4HI "V4HI") (V8HI "V4HI") + (V2SI "V4SI") (V4SI "V4SI") + (DI "OI") (V2DI "OI") + (V2SF "V4SF") (V4SF "V4SF") + (DF "OI") (V2DF "OI")]) + + ;; Mode for atomic operation suffixes (define_mode_attr atomic_sfx [(QI "b") (HI "h") (SI "") (DI "")])