[PATCH] D92930: [Clang] Add vcmla and rotated variants for Arm ACLE.

Florian Hahn via Phabricator via cfe-commits Wed, 09 Dec 2020 13:17:59 -0800

fhahn updated this revision to Diff 310632.
fhahn added a comment.

Updated to use new VCMLA_ROTS multiclass to define rotated variants.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D92930/new/

https://reviews.llvm.org/D92930

Files:
  clang/include/clang/Basic/arm_neon.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-neon-vcmla.c

Index: clang/test/CodeGen/aarch64-neon-vcmla.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -0,0 +1,105 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
+// RUN:  -target-feature +v8.3a -target-feature +fullfp16 -S -emit-llvm -o - %s \
+// RUN:  | FileCheck %s
+
+#include <arm_neon.h>
+
+void foo_16x4(float16x4_t a, float16x4_t b, float16x4_t c) {
+ // CHECK: call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16
+  float16x4_t result = vcmla_f16(a, b, c);
+}
+
+void foo_rot90_16x4(float16x4_t a, float16x4_t b, float16x4_t c) {
+ // CHECK: call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16
+  float16x4_t result = vcmla_rot90_f16(a, b, c);
+}
+
+void foo_rot180_16x4(float16x4_t a, float16x4_t b, float16x4_t c) {
+ // CHECK: call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16
+  float16x4_t result = vcmla_rot180_f16(a, b, c);
+}
+
+void foo_rot270_16x4(float16x4_t a, float16x4_t b, float16x4_t c) {
+ // CHECK: call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16
+  float16x4_t result = vcmla_rot270_f16(a, b, c);
+}
+
+void foo_16x8(float16x8_t a, float16x8_t b, float16x8_t c) {
+ // CHECK: call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16
+  float16x8_t result = vcmlaq_f16(a, b, c);
+}
+
+void foo_rot90_16x8(float16x8_t a, float16x8_t b, float16x8_t c) {
+ // CHECK: call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16
+  float16x8_t result = vcmlaq_rot90_f16(a, b, c);
+}
+
+void foo_rot180_16x8(float16x8_t a, float16x8_t b, float16x8_t c) {
+ // CHECK: call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16
+  float16x8_t result = vcmlaq_rot180_f16(a, b, c);
+}
+
+void foo_rot270_16x8(float16x8_t a, float16x8_t b, float16x8_t c) {
+ // CHECK: call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16
+  float16x8_t result = vcmlaq_rot270_f16(a, b, c);
+}
+
+void foo_32x2(float32x2_t a, float32x2_t b, float32x2_t c) {
+ // CHECK: call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32
+  float32x2_t result = vcmla_f32(a, b, c);
+}
+
+void foo_rot90_32x2(float32x2_t a, float32x2_t b, float32x2_t c) {
+ // CHECK: call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32
+  float32x2_t result = vcmla_rot90_f32(a, b, c);
+}
+
+void foo_rot180_32x2(float32x2_t a, float32x2_t b, float32x2_t c) {
+ // CHECK: call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32
+  float32x2_t result = vcmla_rot180_f32(a, b, c);
+}
+
+void foo_rot270_32x2(float32x2_t a, float32x2_t b, float32x2_t c) {
+ // CHECK: call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32
+  float32x2_t result = vcmla_rot270_f32(a, b, c);
+}
+
+void foo_32x4(float32x4_t a, float32x4_t b, float32x4_t c) {
+ // CHECK: call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32
+  float32x4_t result = vcmlaq_f32(a, b, c);
+}
+
+void foo_rot90_32x4(float32x4_t a, float32x4_t b, float32x4_t c) {
+ // CHECK: call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32
+  float32x4_t result = vcmlaq_rot90_f32(a, b, c);
+}
+
+void foo_rot180_32x4(float32x4_t a, float32x4_t b, float32x4_t c) {
+ // CHECK: call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32
+  float32x4_t result = vcmlaq_rot180_f32(a, b, c);
+}
+
+void foo_rot270_32x4(float32x4_t a, float32x4_t b, float32x4_t c) {
+ // CHECK: call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32
+  float32x4_t result = vcmlaq_rot270_f32(a, b, c);
+}
+
+void foo_64x2(float64x2_t a, float64x2_t b, float64x2_t c) {
+ // CHECK: call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64
+  float64x2_t result = vcmlaq_f64(a, b, c);
+}
+
+void foo_rot90_64x2(float64x2_t a, float64x2_t b, float64x2_t c) {
+ // CHECK: call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64
+  float64x2_t result = vcmlaq_rot90_f64(a, b, c);
+}
+
+void foo_rot180_64x2(float64x2_t a, float64x2_t b, float64x2_t c) {
+ // CHECK: call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64
+  float64x2_t result = vcmlaq_rot180_f64(a, b, c);
+}
+
+void foo_rot270_64x2(float64x2_t a, float64x2_t b, float64x2_t c) {
+ // CHECK: call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64
+  float64x2_t result = vcmlaq_rot270_f64(a, b, c);
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -5548,6 +5548,14 @@
   NEONMAP0(vcltzq_v),
   NEONMAP1(vclz_v, ctlz, Add1ArgType),
   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
+  NEONMAP1(vcmla_rot180_v, aarch64_neon_vcmla_rot180, Add1ArgType),
+  NEONMAP1(vcmla_rot270_v, aarch64_neon_vcmla_rot270, Add1ArgType),
+  NEONMAP1(vcmla_rot90_v, aarch64_neon_vcmla_rot90, Add1ArgType),
+  NEONMAP1(vcmla_v, aarch64_neon_vcmla_rot0, Add1ArgType),
+  NEONMAP1(vcmlaq_rot180_v, aarch64_neon_vcmla_rot180, Add1ArgType),
+  NEONMAP1(vcmlaq_rot270_v, aarch64_neon_vcmla_rot270, Add1ArgType),
+  NEONMAP1(vcmlaq_rot90_v, aarch64_neon_vcmla_rot90, Add1ArgType),
+  NEONMAP1(vcmlaq_v, aarch64_neon_vcmla_rot0, Add1ArgType),
   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
   NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -1902,22 +1902,38 @@
   def VFMLALT_LANEQ_BF : SOpInst<"vbfmlalt_laneq", "..B(BQ)I", "Qf", OP_BFMLALT_LN>;
 }
 
+multiclass VCMLA_ROTS<string args, string type, string qstr> { // One SInst per rotation of "vcmla" # qstr.
+  def ROT0 : SInst<"vcmla" # qstr, args, type>;
+  def ROT90 : SInst<"vcmla" # qstr # "_rot90", args, type>;
+  def ROT180 : SInst<"vcmla" # qstr # "_rot180", args, type>;
+  def ROT270 : SInst<"vcmla" # qstr # "_rot270", args, type>; // was ROT3270 (typo); intrinsic name string unchanged
+}
+
 // v8.3-A Vector complex addition intrinsics
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
   def VCADD_ROT90_FP16   : SInst<"vcadd_rot90", "...", "h">;
   def VCADD_ROT270_FP16  : SInst<"vcadd_rot270", "...", "h">;
   def VCADDQ_ROT90_FP16  : SInst<"vcaddq_rot90", "QQQ", "h">;
   def VCADDQ_ROT270_FP16 : SInst<"vcaddq_rot270", "QQQ", "h">;
+
+
+  defm VCMLA_FP16  : VCMLA_ROTS<"....", "h", "">;
+  defm VCMLAQ_FP16 : VCMLA_ROTS<"QQQQ", "h", "q">;
 }
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {
   def VCADD_ROT90   : SInst<"vcadd_rot90", "...", "f">;
   def VCADD_ROT270  : SInst<"vcadd_rot270", "...", "f">;
   def VCADDQ_ROT90  : SInst<"vcaddq_rot90", "QQQ", "f">;
   def VCADDQ_ROT270 : SInst<"vcaddq_rot270", "QQQ", "f">;
+
+  defm VCMLA_F32  : VCMLA_ROTS<"....", "f", "">;
+  defm VCMLAQ_F32 : VCMLA_ROTS<"QQQQ", "f", "q">;
 }
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
+
+  defm VCMLAQ_FP64 : VCMLA_ROTS<"QQQQ", "d", "q">;
 }
 
 // V8.2-A BFloat intrinsics

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D92930: [Clang] Add vcmla and rotated variants for Arm ACLE.

Reply via email to