aemerson created this revision.
aemerson added reviewers: paquette, eli.friedman, t.p.northover.
aemerson added projects: LLVM, clang.
Herald added subscribers: Petar.Avramovic, hiraditya, kristof.beyls, javed.absar.
This is the result of discussions on the list about how to deal with intrinsics that are overloaded on both integer and FP vector types, so that only the argument types, not the intrinsic name, disambiguate them during codegen. This causes problems for GlobalISel, because some of that type information is lost during IR translation, whereas for other operations, such as IR instructions, the integer/FP distinction is encoded in the opcode itself. This patch changes clang to emit the new faddp intrinsic when the vector operands to the builtin have FP element types. LLVM IR AutoUpgrade has been taught to upgrade existing calls to aarch64.neon.addp with FP vector arguments, and the GlobalISel workarounds introduced in r355865 are removed. This is a more permanent solution to PR40968.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D59655

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-neon-intrinsics.c
  clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/IR/AutoUpgrade.cpp
  llvm/lib/Target/AArch64/AArch64InstrInfo.td
  llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
  llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
  llvm/test/CodeGen/AArch64/GlobalISel/fallback-ambiguous-addp-intrinsic.mir
  llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
  llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
  llvm/test/CodeGen/AArch64/arm64-vadd.ll
  llvm/test/CodeGen/AArch64/autoupgrade-aarch64-neon-addp-float.ll
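For reference, the net effect of the AutoUpgrade change on existing IR is sketched below. This is a minimal illustration only; the value and argument names are placeholders, not taken from the patch:

  ; Bitcode from older releases uses the overloaded intrinsic for FP vectors:
  %res = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %a, <2 x float> %b)

  ; After AutoUpgrade, the call is rewritten to the new FP-specific intrinsic:
  %res = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %a, <2 x float> %b)

  ; Integer element types continue to use aarch64.neon.addp unchanged:
  %sum = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b)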
Index: llvm/test/CodeGen/AArch64/autoupgrade-aarch64-neon-addp-float.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/autoupgrade-aarch64-neon-addp-float.ll
@@ -0,0 +1,10 @@
+; RUN: opt -S < %s -mtriple=arm64 | FileCheck %s
+declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>)
+
+; CHECK: @llvm.aarch64.neon.faddp.v4f32
+define <4 x float> @upgrade_aarch64_neon_addp_float(<4 x float> %a, <4 x float> %b) {
+  %res = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %res
+}
+
Index: llvm/test/CodeGen/AArch64/arm64-vadd.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -712,7 +712,7 @@
 ;CHECK: faddp.2s
   %tmp1 = load <2 x float>, <2 x float>* %A
   %tmp2 = load <2 x float>, <2 x float>* %B
-  %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+  %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
   ret <2 x float> %tmp3
 }

@@ -721,7 +721,7 @@
 ;CHECK: faddp.4s
   %tmp1 = load <4 x float>, <4 x float>* %A
   %tmp2 = load <4 x float>, <4 x float>* %B
-  %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+  %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
   ret <4 x float> %tmp3
 }

@@ -730,13 +730,13 @@
 ;CHECK: faddp.2d
   %tmp1 = load <2 x double>, <2 x double>* %A
   %tmp2 = load <2 x double>, <2 x double>* %B
-  %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+  %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
   ret <2 x double> %tmp3
 }

-declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone

 define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: uaddl_duprhs
Index: llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
@@ -65,27 +65,27 @@
   ret <2 x i64> %val
 }

-declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>)
+declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>)

 define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
 ; CHECK: test_faddp_v2f32:
-  %val = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+  %val = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
 ; CHECK: faddp v0.2s, v0.2s, v1.2s
   ret <2 x float> %val
 }

 define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
 ; CHECK: test_faddp_v4f32:
-  %val = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+  %val = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
 ; CHECK: faddp v0.4s, v0.4s, v1.4s
   ret <4 x float> %val
 }

 define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
 ; CHECK: test_faddp_v2f64:
-  %val = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+  %val = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
 ; CHECK: faddp v0.2d, v0.2d, v1.2d
   ret <2 x double> %val
 }
Index: llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -151,7 +151,7 @@
 # DEBUG: .. the first uncovered type index: 1, OK
 #
 # DEBUG-NEXT: G_INTRINSIC (opcode {{[0-9]+}}): 0 type indices
-# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG: .. type index coverage check SKIPPED: no rules defined
 #
 # DEBUG-NEXT: G_INTRINSIC_W_SIDE_EFFECTS (opcode {{[0-9]+}}): 0 type indices
 # DEBUG: .. type index coverage check SKIPPED: no rules defined
Index: llvm/test/CodeGen/AArch64/GlobalISel/fallback-ambiguous-addp-intrinsic.mir
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/fallback-ambiguous-addp-intrinsic.mir
+++ /dev/null
@@ -1,32 +0,0 @@
-# RUN: llc -mtriple aarch64-unknown-unknown -O0 -start-before=legalizer -pass-remarks-missed=gisel* %s -o - 2>&1 | FileCheck %s
-#
-# Check that we fall back on @llvm.aarch64.neon.addp and ensure that we get the
-# correct instruction.
-# https://bugs.llvm.org/show_bug.cgi?id=40968
-
---- |
-  define <2 x float> @foo(<2 x float> %v1, <2 x float> %v2) {
-  entry:
-    %v3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %v1, <2 x float> %v2)
-    ret <2 x float> %v3
-  }
-  declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>)
-...
----
-name:            foo
-alignment:       2
-tracksRegLiveness: true
-body:             |
-  bb.1.entry:
-    liveins: $d0, $d1
-    ; CHECK: remark:
-    ; CHECK-SAME: unable to legalize instruction: %2:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.addp), %0:_(<2 x s32>), %1:_(<2 x s32>)
-    ; CHECK: faddp
-    ; CHECK-NOT: addp
-    %0:_(<2 x s32>) = COPY $d0
-    %1:_(<2 x s32>) = COPY $d1
-    %2:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.addp), %0(<2 x s32>), %1(<2 x s32>)
-    $d0 = COPY %2(<2 x s32>)
-    RET_ReallyLR implicit $d0
-
-...
Index: llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -34,9 +34,6 @@
 private:
   bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
                      MachineIRBuilder &MIRBuilder) const;
-
-  bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
-                         MachineIRBuilder &MIRBuilder) const;
 };
 } // End llvm namespace.
 #endif
Index: llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -64,11 +64,6 @@
     return std::make_pair(0, EltTy);
   });

-  // HACK: Check that the intrinsic isn't ambiguous.
-  // (See: https://bugs.llvm.org/show_bug.cgi?id=40968)
-  getActionDefinitionsBuilder(G_INTRINSIC)
-      .custom();
-
   getActionDefinitionsBuilder(G_PHI)
       .legalFor({p0, s16, s32, s64})
       .clampScalar(0, s16, s64)
@@ -517,30 +512,11 @@
     return false;
   case TargetOpcode::G_VAARG:
     return legalizeVaArg(MI, MRI, MIRBuilder);
-  case TargetOpcode::G_INTRINSIC:
-    return legalizeIntrinsic(MI, MRI, MIRBuilder);
   }

   llvm_unreachable("expected switch to return");
 }

-bool AArch64LegalizerInfo::legalizeIntrinsic(
-    MachineInstr &MI, MachineRegisterInfo &MRI,
-    MachineIRBuilder &MIRBuilder) const {
-  // HACK: Don't allow faddp/addp for now. We don't pass down the type info
-  // necessary to get this right today.
-  //
-  // It looks like addp/faddp is the only intrinsic that's impacted by this.
-  // All other intrinsics fully describe the required types in their names.
-  //
-  // (See: https://bugs.llvm.org/show_bug.cgi?id=40968)
-  const MachineOperand &IntrinOp = MI.getOperand(1);
-  if (IntrinOp.isIntrinsicID() &&
-      IntrinOp.getIntrinsicID() == Intrinsic::aarch64_neon_addp)
-    return false;
-  return true;
-}
-
 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &MIRBuilder) const {
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3499,7 +3499,7 @@
 }
 defm FACGE   : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
 defm FACGT   : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
-defm FADDP   : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
+defm FADDP   : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>;
 defm FADD    : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
 defm FCMEQ   : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
 defm FCMGE   : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
Index: llvm/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/lib/IR/AutoUpgrade.cpp
+++ llvm/lib/IR/AutoUpgrade.cpp
@@ -568,6 +568,17 @@
       NewFn = Intrinsic::getDeclaration(F->getParent(),
                                         Intrinsic::thread_pointer);
       return true;
     }
+    if (Name.startswith("aarch64.neon.addp")) {
+      VectorType *ArgTy = cast<VectorType>(F->arg_begin()->getType());
+      if (ArgTy->getElementType()->isFloatingPointTy()) {
+        auto fArgs = F->getFunctionType()->params();
+        Type *Tys[] = {fArgs[0], fArgs[1]};
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          Intrinsic::aarch64_neon_faddp, Tys);
+        return true;
+      }
+      return false;
+    }
     break;
   }
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -289,6 +289,7 @@

   // Pairwise Add
   def int_aarch64_neon_addp : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_faddp : AdvSIMD_2VectorArg_Intrinsic;

   // Long Pairwise Add
   // FIXME: In theory, we shouldn't need intrinsics for saddlp or
Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
===================================================================
--- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -736,14 +736,14 @@
 }

 // CHECK-LABEL: test_vpadd_f16
-// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.aarch64.neon.addp.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> %a, <4 x half> %b)
 // CHECK: ret <4 x half> [[ADD]]
 float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
   return vpadd_f16(a, b);
 }

 // CHECK-LABEL: test_vpaddq_f16
-// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.aarch64.neon.addp.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> %a, <8 x half> %b)
 // CHECK: ret <8 x half> [[ADD]]
 float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
   return vpaddq_f16(a, b);
Index: clang/test/CodeGen/aarch64-neon-intrinsics.c
===================================================================
--- clang/test/CodeGen/aarch64-neon-intrinsics.c
+++ clang/test/CodeGen/aarch64-neon-intrinsics.c
@@ -4411,7 +4411,7 @@
 // CHECK-LABEL: @test_vpadd_f32(
 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %a, <2 x float> %b)
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %a, <2 x float> %b)
 // CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
 // CHECK:   ret <2 x float> [[VPADD_V2_I]]
 float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
@@ -4475,7 +4475,7 @@
 // CHECK-LABEL: @test_vpaddq_f32(
 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %a, <4 x float> %b)
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %a, <4 x float> %b)
 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8>
 // CHECK:   ret <4 x float> [[VPADDQ_V2_I]]
 float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
@@ -4485,7 +4485,7 @@
 // CHECK-LABEL: @test_vpaddq_f64(
 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %a, <2 x double> %b)
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %a, <2 x double> %b)
 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8>
 // CHECK:   ret <2 x double> [[VPADDQ_V2_I]]
 float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -5095,6 +5095,12 @@
   switch (BuiltinID) {
   default: break;
+  case NEON::BI__builtin_neon_vpadd_v:
+  case NEON::BI__builtin_neon_vpaddq_v:
+    // We don't allow fp/int overloading of intrinsics.
+    if (VTy->getElementType()->isFloatingPointTy())
+      Int = Intrinsic::aarch64_neon_faddp;
+    break;
   case NEON::BI__builtin_neon_vabs_v:
   case NEON::BI__builtin_neon_vabsq_v:
     if (VTy->getElementType()->isFloatingPointTy())