Hi All, This adds an implementation of the optabs for complex additions. With this, the following C code:
void f90 (float complex a[restrict N], float complex b[restrict N], float complex c[restrict N]) { for (int i=0; i < N; i++) c[i] = a[i] + (b[i] * I); } generates f90: add r3, r2, #1600 .L2: vld1.32 {q8}, [r0]! vld1.32 {q9}, [r1]! vcadd.f32 q8, q8, q9, #90 vst1.32 {q8}, [r2]! cmp r3, r2 bne .L2 bx lr instead of f90: add r3, r2, #1600 .L2: vld2.32 {d24-d27}, [r0]! vld2.32 {d20-d23}, [r1]! vsub.f32 q8, q12, q11 vadd.f32 q9, q13, q10 vst2.32 {d16-d19}, [r2]! cmp r3, r2 bne .L2 bx lr Bootstrapped and regtested on arm-none-linux-gnueabihf with no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/arm/iterators.md (rot): Add UNSPEC_VCMLS, UNSPEC_VCMUL and UNSPEC_VCMUL180. (rot_op, rotsplit1, rotsplit2, fcmac1, VCMLA_OP, VCMUL_OP): New. * config/arm/neon.md (cadd<rot><mode>3, cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New. * config/arm/unspecs.md (UNSPEC_VCMUL, UNSPEC_VCMUL180, UNSPEC_VCMLS, UNSPEC_VCMLS180): New. --
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 0bc9eba0722689aff4c1a143e952f6eb91c0cd86..f5693c0524274da1eb1c767713574c01ec6d544c 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1146,10 +1146,38 @@ (define_int_attr crypto_mode [(UNSPEC_SHA1H "V4SI") (UNSPEC_AESMC "V16QI") (define_int_attr rot [(UNSPEC_VCADD90 "90") (UNSPEC_VCADD270 "270") + (UNSPEC_VCMLS "0") (UNSPEC_VCMLA "0") (UNSPEC_VCMLA90 "90") (UNSPEC_VCMLA180 "180") - (UNSPEC_VCMLA270 "270")]) + (UNSPEC_VCMLA270 "270") + (UNSPEC_VCMUL "0") + (UNSPEC_VCMUL180 "180")]) + +;; A conjugate is a rotation of 180* around the Argand plane, or * I. +(define_int_attr rot_op [(UNSPEC_VCMLS "") + (UNSPEC_VCMLS180 "_conj") + (UNSPEC_VCMLA "") + (UNSPEC_VCMLA180 "_conj") + (UNSPEC_VCMUL "") + (UNSPEC_VCMUL180 "_conj")]) + +(define_int_attr rotsplit1 [(UNSPEC_VCMLA "0") + (UNSPEC_VCMLA180 "0") + (UNSPEC_VCMUL "0") + (UNSPEC_VCMUL180 "0") + (UNSPEC_VCMLS "270") + (UNSPEC_VCMLS180 "90")]) + +(define_int_attr rotsplit2 [(UNSPEC_VCMLA "90") + (UNSPEC_VCMLA180 "270") + (UNSPEC_VCMUL "90") + (UNSPEC_VCMUL180 "270") + (UNSPEC_VCMLS "180") + (UNSPEC_VCMLS180 "180")]) + +(define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA180 "a") + (UNSPEC_VCMLS "s") (UNSPEC_VCMLS180 "s")]) (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8") (UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8") @@ -1256,3 +1284,12 @@ (define_int_attr bt [(UNSPEC_BFMAB "b") (UNSPEC_BFMAT "t")]) ;; An iterator for CDE MVE accumulator/non-accumulator versions. 
(define_int_attr a [(UNSPEC_VCDE "") (UNSPEC_VCDEA "a")]) + +;; Define iterators for VCMLA operations +(define_int_iterator VCMLA_OP [UNSPEC_VCMLA + UNSPEC_VCMLA180 + UNSPEC_VCMLS]) + +;; Define iterators for VCMLA operations as MUL +(define_int_iterator VCMUL_OP [UNSPEC_VCMUL + UNSPEC_VCMUL180]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 3e7b51d8ab60007901392df0ca1cb09fead4d0e9..1611bcea1ba8cb416d27368e4dc39ce15b3a4cd8 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3217,6 +3217,14 @@ (define_insn "neon_vcadd<rot><mode>" [(set_attr "type" "neon_fcadd")] ) +(define_expand "cadd<rot><mode>3" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF [(match_operand:VF 1 "register_operand") + (match_operand:VF 2 "register_operand")] + VCADD))] + "TARGET_COMPLEX" +) + (define_insn "neon_vcmla<rot><mode>" [(set (match_operand:VF 0 "register_operand" "=w") (plus:VF (match_operand:VF 1 "register_operand" "0") @@ -3274,6 +3282,43 @@ (define_insn "neon_vcmlaq_lane<rot><mode>" ) +;; The complex mla/mls operations always need to expand to two instructions. +;; The first operation does half the computation and the second does the +;; remainder. Because of this, expand early. +(define_expand "cml<fcmac1><rot_op><mode>4" + [(set (match_operand:VF 0 "register_operand") + (plus:VF (match_operand:VF 1 "register_operand") + (unspec:VF [(match_operand:VF 2 "register_operand") + (match_operand:VF 3 "register_operand")] + VCMLA_OP)))] + "TARGET_COMPLEX" +{ + emit_insn (gen_neon_vcmla<rotsplit1><mode> (operands[0], operands[1], + operands[2], operands[3])); + emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], operands[0], + operands[2], operands[3])); + DONE; +}) + +;; The complex mul operations always need to expand to two instructions. +;; The first operation does half the computation and the second does the +;; remainder. Because of this, expand early. 
+(define_expand "cmul<rot_op><mode>3" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF [(match_operand:VF 1 "register_operand") + (match_operand:VF 2 "register_operand")] + VCMUL_OP))] + "TARGET_COMPLEX" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_move_insn (tmp, CONST0_RTX (<MODE>mode)); + emit_insn (gen_neon_vcmla<rotsplit1><mode> (operands[0], tmp, + operands[1], operands[2])); + emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], operands[0], + operands[1], operands[2])); + DONE; +}) + ;; These instructions map to the __builtins for the Dot Product operations. (define_insn "neon_<sup>dot<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 0a2399d4fb7bdef6c9ff2b31a743cf357fd271d5..d1b2824a0fe76f62d69c18dcec2f47dfb75b586e 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -510,6 +510,10 @@ (define_c_enum "unspec" [ UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270 + UNSPEC_VCMUL + UNSPEC_VCMUL180 + UNSPEC_VCMLS + UNSPEC_VCMLS180 UNSPEC_MATMUL_S UNSPEC_MATMUL_U UNSPEC_MATMUL_US