This patch adds the Armv8.6-A ACLE BFloat16 load intrinsics vld<n>{q}_bf16 and vld<n>{q}_dup_bf16 as part of the BFloat16 extension (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics). The intrinsics are declared in arm_neon.h and a new test checks the generated assembler output.
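For anyone unfamiliar with the new API, here is a minimal usage sketch (illustrative only: the function names are made up, and it assumes compiler options that define __ARM_FEATURE_BF16_VECTOR_ARITHMETIC, e.g. the ones added by arm_v8_2a_bf16_neon in the testsuite):

  #include <arm_neon.h>

  /* De-interleaving load: reads 8 bf16 values starting at ptr and splits
     them into two bfloat16x4_t vectors.  Expected to assemble to vld2.16.  */
  bfloat16x4x2_t
  load_two (const bfloat16_t *ptr)
  {
    return vld2_bf16 (ptr);
  }

  /* Quad-register form: reads 32 bf16 values into four bfloat16x8_t
     vectors.  Expected to assemble to vld4.16.  */
  bfloat16x8x4_t
  load_four_q (const bfloat16_t *ptr)
  {
    return vld4q_bf16 (ptr);
  }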
This patch depends on the Arm back-end patch (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html). Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have commit rights, so if this is OK can someone please commit it for me?

gcc/ChangeLog:

2019-11-14  Delia Burduv  <delia.bur...@arm.com>

	* config/arm/arm_neon.h (bfloat16x4x2_t): New typedef.
	(bfloat16x8x2_t): New typedef.
	(bfloat16x4x3_t): New typedef.
	(bfloat16x8x3_t): New typedef.
	(bfloat16x4x4_t): New typedef.
	(bfloat16x8x4_t): New typedef.
	(vld2_bf16): New.
	(vld2q_bf16): New.
	(vld3_bf16): New.
	(vld3q_bf16): New.
	(vld4_bf16): New.
	(vld4q_bf16): New.
	(vld2_dup_bf16): New.
	(vld2q_dup_bf16): New.
	(vld3_dup_bf16): New.
	(vld3q_dup_bf16): New.
	(vld4_dup_bf16): New.
	(vld4q_dup_bf16): New.
	* config/arm/arm-builtins.c (v2bf_UP): New define.
	(VAR13): New.
	(arm_simd_types[Bfloat16x2_t]): New type.
	* config/arm/arm-modes.def (V2BF): New mode.
	* config/arm/arm-simd-builtin-types.def (Bfloat16x2_t): New entry.
	* config/arm/arm_neon_builtins.def (vld2): Changed to VAR13 and added v4bf, v8bf.
	(vld2_dup): Changed to VAR8 and added v4bf, v8bf.
	(vld3): Changed to VAR13 and added v4bf, v8bf.
	(vld3_dup): Changed to VAR8 and added v4bf, v8bf.
	(vld4): Changed to VAR13 and added v4bf, v8bf.
	(vld4_dup): Changed to VAR8 and added v4bf, v8bf.
	* config/arm/iterators.md (VDXBF): New iterator.
	(VDXBF2): New iterator.
	(VQ2BF): New iterator.
	(V_two_elem): Added V4BF, V8BF.
	(V_three_elem): Added V4BF, V8BF.
	(V_four_elem): Added V4BF, V8BF.
	(V_sz_elem): Added V4BF, V8BF.
	(V_mode_nunits): Added V4BF, V8BF.
	(q): Added V4BF, V8BF.
	* config/arm/neon.md (neon_vld2<mode>): Used new iterators.
	(neon_vld2_dup<mode>): Used new iterators.
	(neon_vld2_dupv8bf): New.
	(neon_vld3<mode>): Used new iterators.
	(neon_vld3qa<mode>): Used new iterators.
	(neon_vld3qb<mode>): Used new iterators.
	(neon_vld3_dup<mode>): Used new iterators.
	(neon_vld3_dupv8bf): New.
	(neon_vld4<mode>): Used new iterators.
	(neon_vld4qa<mode>): Used new iterators.
	(neon_vld4qb<mode>): Used new iterators.
	(neon_vld4_dup<mode>): Used new iterators.
	(neon_vld4_dupv8bf): New.

gcc/testsuite/ChangeLog:

2019-11-14  Delia Burduv  <delia.bur...@arm.com>

	* gcc.target/arm/simd/bf16_vldn_1.c: New test.
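For reference before the diff: the _dup forms load a single <n>-element structure and replicate it across all lanes of the result vectors. A short sketch of the intended behaviour (hand-written expectations, not compiler output; register numbers are illustrative):

  #include <arm_neon.h>

  bfloat16x4x2_t
  load_pair_all_lanes (const bfloat16_t *ptr)
  {
    /* Expected to assemble to something like: vld2.16 {d0[], d1[]}, [r0]  */
    return vld2_dup_bf16 (ptr);
  }

  bfloat16x8x4_t
  load_quad_all_lanes_q (const bfloat16_t *ptr)
  {
    /* Expected to assemble to something like:
       vld4.16 {d0[], d1[], d2[], d3[]}, [r0]  */
    return vld4q_dup_bf16 (ptr);
  }

The new test (bf16_vldn_1.c at the end of the diff) checks these instruction patterns for each intrinsic.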
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index df09a6bb1fce5f9216337d71cba51a890fd57baf..551d76a44fadc58a35a6155486ec1fb16c959da0 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -318,6 +318,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define v4bf_UP E_V4BFmode #define v2si_UP E_V2SImode #define v2sf_UP E_V2SFmode +#define v2bf_UP E_V2BFmode #define di_UP E_DImode #define v16qi_UP E_V16QImode #define v8hi_UP E_V8HImode @@ -381,6 +382,9 @@ typedef struct { #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \ VAR1 (T, N, L) +#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ + VAR1 (T, N, M) /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def and arm_acle_builtins.def. The entries in arm_neon_builtins.def require @@ -1013,6 +1017,7 @@ arm_init_simd_builtin_types (void) arm_simd_types[Float32x4_t].eltype = float_type_node; /* Init Bfloat vector types with underlying __bf16 scalar type. */ + arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node; arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node; arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node; diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def index 80c3c1a6eb258d116b07ad71fafafc9befb76e8b..9533d177059d98fa2a9e9d1d6321f3d92dad7592 100644 --- a/gcc/config/arm/arm-modes.def +++ b/gcc/config/arm/arm-modes.def @@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ FLOAT_MODE (BF, 2, 0); ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format); +VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */ VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */ VECTOR_MODE (FLOAT, BF, 8); /* V8BF. 
*/ diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def index ee240f85c5618417fff039ec43b81641b187c126..f52f679156d5041ab109909393dc37fda33a390d 100644 --- a/gcc/config/arm/arm-simd-builtin-types.def +++ b/gcc/config/arm/arm-simd-builtin-types.def @@ -48,5 +48,6 @@ ENTRY (Float16x8_t, V8HF, none, 128, float16, 19) ENTRY (Float32x4_t, V4SF, none, 128, float32, 19) + ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20) ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20) ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20) diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 71e7568e4315a9354062dee5442ca4af9d9660a9..c47f3cdd2d51066067d2ef341cc12a6db4b6f785 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -91,6 +91,145 @@ typedef float float32_t; #ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC typedef __simd128_bfloat16_t bfloat16x8_t; typedef __simd64_bfloat16_t bfloat16x4_t; + +typedef struct bfloat16x4x2_t +{ + bfloat16x4_t val[2]; +} bfloat16x4x2_t; + +typedef struct bfloat16x8x2_t +{ + bfloat16x8_t val[2]; +} bfloat16x8x2_t; + +typedef struct bfloat16x4x3_t +{ + bfloat16x4_t val[3]; +} bfloat16x4x3_t; + +typedef struct bfloat16x8x3_t +{ + bfloat16x8_t val[3]; +} bfloat16x8x3_t; + +typedef struct bfloat16x4x4_t +{ + bfloat16x4_t val[4]; +} bfloat16x4x4_t; + +typedef struct bfloat16x8x4_t +{ + bfloat16x8_t val[4]; +} bfloat16x8x4_t; + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (bfloat16_t const * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + 
__rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + #endif #pragma GCC pop_options #pragma GCC pop_options diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index bcccf93f7fa2750e9006e5856efecbec0fb331b9..b9b56fc3d8b767eac0734d75e3fc5b61188ddca7 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -320,29 +320,29 @@ VAR12 (STORE1, vst1, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR12 (STORE1LANE, vst1_lane, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) -VAR11 (LOAD1, vld2, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld2, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst2, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld3, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld3, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst3, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld4, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld4, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, 
v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR11 (STORE1, vst4, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) VAR9 (STORE1LANE, vst4_lane, diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 439021fa0733ac31706287c4f98d62b080afc3a1..f8b98bd57af223cacba05907d25e3d4b9d58eb8a 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -86,6 +86,12 @@ ;; Double-width vector modes plus 64-bit elements. (define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI]) +;; Double-width vector modes plus 64-bit elements and V4BF. +(define_mode_iterator VDXBF [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD")]) + +;; Double-width vector modes plus 64-bit elements, V4BF and V8BF. +(define_mode_iterator VDXBF2 [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD") (V8BF ("TARGET_BF16_SIMD"))]) + ;; Double-width vector modes plus 64-bit elements, ;; with V4BFmode added, suitable for moves. (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) @@ -102,6 +108,9 @@ ;; Quad-width vector modes, including V8HF. (define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF]) +;; Quad-width vector modes, including V8HF and V8BF. +(define_mode_iterator VQ2BF [V16QI V8HI V8HF V4SI V4SF (V8BF "TARGET_BF16_SIMD")]) + ;; Quad-width vector modes with 16- or 32-bit elements (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) @@ -546,6 +555,7 @@ (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") (V4HI "SI") (V8HI "SI") (V4HF "SF") (V8HF "SF") + (V4BF "V2BF") (V8BF "V2BF") (V2SI "V2SI") (V4SI "V2SI") (V2SF "V2SF") (V4SF "V2SF") (DI "V2DI") (V2DI "V2DI")]) @@ -566,6 +576,7 @@ (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") (V4HI "BLK") (V8HI "BLK") (V4HF "BLK") (V8HF "BLK") + (V4BF "BLK") (V8BF "BLK") (V2SI "BLK") (V4SI "BLK") (V2SF "BLK") (V4SF "BLK") (DI "EI") (V2DI "EI")]) @@ -574,6 +585,7 @@ (define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI") (V4HI "V4HI") (V8HI "V4HI") (V4HF "V4HF") (V8HF "V4HF") + (V4BF "V4BF") (V8BF "V4BF") (V2SI "V4SI") (V4SI "V4SI") (V2SF "V4SF") (V4SF "V4SF") (DI "OI") (V2DI "OI")]) @@ -697,6 +709,7 @@ (V2SI "32") (V4SI "32") (DI "64") (V2DI "64") (V4HF "16") (V8HF "16") + (V4BF "16") (V8BF "16") (V2SF "32") (V4SF "32")]) (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b") @@ -772,6 +785,7 @@ (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16") (V4HF "4") (V8HF "8") (V4HI "4") (V8HI "8") + (V4BF "4") (V8BF "8") (V2SI "2") (V4SI "4") (V2SF "2") (V4SF "4") (DI "1") (V2DI "2") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index b724aab65f720bf0e48bb828f0874426effd235c..4109e7f84838e48eebd95290eeeefc9d3e48ec7d 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5383,7 +5383,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" { @@ -5408,7 +5408,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" 
"vld2.<V_sz_elem>\t%h0, %A1" @@ -5471,7 +5471,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2_dup<mode>" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2_DUP))] "TARGET_NEON" { @@ -5486,6 +5486,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld2_dupv8bf" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld2.16\t{%P0, %P1, %P2, %P3}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load2_all_lanes_q")] +) + (define_expand "vec_store_lanesti<mode>" [(set (match_operand:TI 0 "neon_struct_operand") (unspec:TI [(match_operand:TI 1 "s_register_operand") @@ -5592,7 +5613,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3))] "TARGET_NEON" { @@ -5620,7 +5641,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld3<mode>" [(match_operand:CI 0 "s_register_operand") (match_operand:CI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5635,7 +5656,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3qa<mode>" [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3A))] "TARGET_NEON" { @@ -5655,7 +5676,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (match_operand:CI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3B))] "TARGET_NEON" { @@ -5732,7 +5753,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3_dup<mode>" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3_DUP))] "TARGET_NEON" { @@ -5755,6 +5776,26 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load3_all_lanes<q>") (const_string "neon_load1_1reg<q>")))]) +(define_insn "neon_vld3_dupv8bf" + [(set (match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[4]; + int tabbase = REGNO (operands[0]); + + ops[3] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + output_asm_insn ("vld3.16\t{%P0[], %P1[], %P2[]}, %A3", ops); + return 
""; + } + [(set_attr "type" "neon_load3_all_lanes_q")] +) + (define_expand "vec_store_lanesei<mode>" [(set (match_operand:EI 0 "neon_struct_operand") (unspec:EI [(match_operand:EI 1 "s_register_operand") @@ -5910,7 +5951,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4))] "TARGET_NEON" { @@ -5938,7 +5979,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld4<mode>" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5953,7 +5994,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4qa<mode>" [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4A))] "TARGET_NEON" { @@ -5974,7 +6015,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (match_operand:XI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4B))] "TARGET_NEON" { @@ -6054,7 +6095,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4_dup<mode>" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4_DUP))] "TARGET_NEON" { @@ -6080,6 +6121,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg<q>")))] ) +(define_insn "neon_vld4_dupv8bf" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld4.16\t{%P0[], %P1[], %P2[], %P3[]}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load4_all_lanes_q")] +) + (define_expand "vec_store_lanesoi<mode>" [(set (match_operand:OI 0 "neon_struct_operand") (unspec:OI [(match_operand:OI 1 "s_register_operand") diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..8db8dfbe28d2136bd2d943e2aae80e32cea34133 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c @@ -0,0 +1,152 @@ +/* { dg-do assemble } */ +/* { dg-options "-save-temps" } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + + +/* +**test_vld2_bf16: +** ... +** vld2.16\t{d[0-9]+-d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x2_t +test_vld2_bf16 (bfloat16_t * ptr) +{ + vld2_bf16 (ptr); +} + +/* +**test_vld2q_bf16: +** ... +** vld2.16\t{d[0-9]+-d[0-9]+}, \[r[0-9]+\] +** ... 
+*/ +bfloat16x8x2_t +test_vld2q_bf16 (bfloat16_t * ptr) +{ + return vld2q_bf16 (ptr); +} + +/* +**test_vld2_dup_bf16: +** ... +** vld2.16\t{d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x2_t +test_vld2_dup_bf16 (bfloat16_t * ptr) +{ + return vld2_dup_bf16 (ptr); +} + +/* +**test_vld2q_dup_bf16: +** ... +** vld2.16\t{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x8x2_t +test_vld2q_dup_bf16 (bfloat16_t * ptr) +{ + return vld2q_dup_bf16 (ptr); +} + +/* +**test_vld3_bf16: +** ... +** vld3.16\t{d[0-9]+-d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x3_t +test_vld3_bf16 (bfloat16_t * ptr) +{ + return vld3_bf16 (ptr); +} + +/* +**test_vld3q_bf16: +** ... +** vld3.16\t{d[0-9]+, d[0-9]+, d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x8x3_t +test_vld3q_bf16 (bfloat16_t * ptr) +{ + return vld3q_bf16 (ptr); +} + +/* +**test_vld3_dup_bf16: +** ... +** vld3.16\t{d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x3_t +test_vld3_dup_bf16 (bfloat16_t * ptr) +{ + return vld3_dup_bf16 (ptr); +} + +/* +**test_vld3q_dup_bf16: +** ... +** vld3.16\t{d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x8x3_t +test_vld3q_dup_bf16 (bfloat16_t * ptr) +{ + return vld3q_dup_bf16 (ptr); +} + +/* +**test_vld4_bf16: +** ... +** vld4.16\t{d[0-9]+-d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x4_t +test_vld4_bf16 (bfloat16_t * ptr) +{ + return vld4_bf16 (ptr); +} + +/* +**test_vld4q_bf16: +** ... +** vld4.16\t{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}, \[r[0-9]+\] +** ... +*/ +bfloat16x8x4_t +test_vld4q_bf16 (bfloat16_t * ptr) +{ + return vld4q_bf16 (ptr); +} + +/* +**test_vld4_dup_bf16: +** ... +** vld4.16\t{d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x4x4_t +test_vld4_dup_bf16 (bfloat16_t * ptr) +{ + return vld4_dup_bf16 (ptr); +} + +/* +**test_vld4q_dup_bf16: +** ... +** vld4.16\t{d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\], d[0-9]+\[\]}, \[r[0-9]+\] +** ... +*/ +bfloat16x8x4_t +test_vld4q_dup_bf16 (bfloat16_t * ptr) +{ + return vld4q_dup_bf16 (ptr); +}