Conanap updated this revision to Diff 293474.
Conanap marked 2 inline comments as done.
Conanap added a comment.
Changed the implementation of vrlqnm as per Nemanja's suggestion.
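
For context, a minimal usage sketch of the new 128-bit overloads (the function
and variable names below are illustrative only, not part of the patch; assumes
a Power10 target, e.g. -mcpu=pwr10 on powerpc64le-unknown-linux-gnu):

  #include <altivec.h>

  vector unsigned __int128 demo(vector unsigned __int128 Val,
                                vector unsigned __int128 Amt,
                                vector unsigned __int128 Ctl,
                                vector unsigned __int128 Ins) {
    // Rotate each element of Val left by Amt; selected as vrlq.
    vector unsigned __int128 R = vec_rl(Val, Amt);
    // Rotate, then AND with a mask described by the packed control
    // operands; selected as vrlqnm.
    R = vec_rlnm(R, Amt, Ctl);
    // Rotate R and insert it under a mask into Ins; Ctl carries the mask
    // bounds and rotate amount. Selected as vrlqmi.
    return vec_rlmi(R, Ins, Ctl);
  }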
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D86819/new/
https://reviews.llvm.org/D86819
Files:
clang/include/clang/Basic/BuiltinsPPC.def
clang/lib/Headers/altivec.h
clang/test/CodeGen/builtins-ppc-p10vector.c
llvm/include/llvm/IR/IntrinsicsPowerPC.td
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/lib/Target/PowerPC/PPCInstrPrefix.td
llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll
Index: llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s
+
+; This test case aims to test the vector rotate instructions and intrinsics
+; on Power10.
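+; The first three functions use the shl/lshr/or idiom that the DAG combiner
+; folds into ISD::ROTL (matched to vrlq); the last two exercise the vrlqmi
+; and vrlqnm intrinsics directly.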
+
+define <1 x i128> @test_vrlq(<1 x i128> %x, <1 x i128> %y) {
+; CHECK-LABEL: test_vrlq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrlq v2, v3, v2
+; CHECK-NEXT: blr
+ %shl.i = shl <1 x i128> %y, %x
+ %sub.i = sub <1 x i128> <i128 128>, %x
+ %lshr.i = lshr <1 x i128> %y, %sub.i
+ %tmp = or <1 x i128> %shl.i, %lshr.i
+ ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vrlq_cost_mult8(<1 x i128> %x) {
+; CHECK-LABEL: test_vrlq_cost_mult8:
+; CHECK: # %bb.0:
+; CHECK: vrlq v2, v3, v2
+; CHECK-NEXT: blr
+ %shl.i = shl <1 x i128> <i128 16>, %x
+ %sub.i = sub <1 x i128> <i128 128>, %x
+ %lshr.i = lshr <1 x i128> <i128 16>, %sub.i
+ %tmp = or <1 x i128> %shl.i, %lshr.i
+ ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vrlq_cost_non_mult8(<1 x i128> %x) {
+; CHECK-LABEL: test_vrlq_cost_non_mult8:
+; CHECK: # %bb.0:
+; CHECK: vrlq v2, v3, v2
+; CHECK-NEXT: blr
+ %shl.i = shl <1 x i128> <i128 4>, %x
+ %sub.i = sub <1 x i128> <i128 128>, %x
+ %lshr.i = lshr <1 x i128> <i128 4>, %sub.i
+ %tmp = or <1 x i128> %shl.i, %lshr.i
+ ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+define <1 x i128> @test_vrlqmi(<1 x i128> %a, <1 x i128> %b, <1 x i128> %c) {
+; CHECK-LABEL: test_vrlqmi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vrlqmi v3, v2, v4
+; CHECK-NEXT: vmr v2, v3
+; CHECK-NEXT: blr
+entry:
+ %tmp = tail call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128> %a, <1 x i128> %c, <1 x i128> %b)
+ ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+define <1 x i128> @test_vrlqnm(<1 x i128> %a, <1 x i128> %b, <1 x i128> %c) {
+; CHECK-LABEL: test_vrlqnm:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vrlqnm v2, v2, v3
+; CHECK-NEXT: xxland v2, v2, v4
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128> %a, <1 x i128> %b)
+ %tmp = and <1 x i128> %0, %c
+ ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>, <1 x i128>, <1 x i128>)
+
+; Function Attrs: nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>, <1 x i128>)
Index: llvm/lib/Target/PowerPC/PPCInstrPrefix.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1446,19 +1446,25 @@
"vcmpuq $BF, $vA, $vB", IIC_VecGeneral, []>;
def VCMPSQ : VXForm_BF3_VAB5<321, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB),
"vcmpsq $BF, $vA, $vB", IIC_VecGeneral, []>;
- def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm", []>;
- def VRLQMI : VXForm_1<69, (outs vrrc:$vD),
- (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
- "vrlqmi $vD, $vA, $vB", IIC_VecFP, []>,
- RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>;
- def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>;
def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>;
def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>;
def XSCVUQQP : X_VT5_XO5_VB5<63, 3, 836, "xscvuqqp", []>;
def XSCVSQQP : X_VT5_XO5_VB5<63, 11, 836, "xscvsqqp", []>;
+ def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>;
+ def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm",
+ [(set v1i128:$vD,
+ (int_ppc_altivec_vrlqnm v1i128:$vA,
+ v1i128:$vB))]>;
+ def VRLQMI : VXForm_1<69, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
+ "vrlqmi $vD, $vA, $vB", IIC_VecFP,
+ [(set v1i128:$vD,
+ (int_ppc_altivec_vrlqmi v1i128:$vA, v1i128:$vB,
+ v1i128:$vDi))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
}
let Predicates = [IsISA3_1, HasVSX] in {
@@ -1510,6 +1516,9 @@
(v1i128 (COPY_TO_REGCLASS (LXVRWX xoaddr:$src), VRRC))>;
def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 64)),
(v1i128 (COPY_TO_REGCLASS (LXVRDX xoaddr:$src), VRRC))>;
+
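+  // ISD::ROTL is marked Legal for v1i128 in PPCISelLowering.cpp, so generic
+  // rotl nodes reach instruction selection and are matched to VRLQ here.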
+ def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>;
}
let Predicates = [IsISA3_1, HasVSX] in {
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -890,6 +890,7 @@
setOperationAction(ISD::SREM, MVT::v4i32, Legal);
setOperationAction(ISD::UDIV, MVT::v1i128, Legal);
setOperationAction(ISD::SDIV, MVT::v1i128, Legal);
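+ // Treating ROTL as Legal lets the DAG combiner fold the or(shl, lshr)
+ // rotate idiom into ISD::ROTL for v1i128, which ISel then matches to VRLQ.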
+ setOperationAction(ISD::ROTL, MVT::v1i128, Legal);
}
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
Index: llvm/include/llvm/IR/IntrinsicsPowerPC.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1002,6 +1002,15 @@
[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;
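+// Vector Rotate Quadword Intrinsics.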
+def int_ppc_altivec_vrlqnm :
+ PowerPC_Vec_Intrinsic<"vrlqnm", [llvm_v1i128_ty],
+ [llvm_v1i128_ty, llvm_v1i128_ty],
+ [IntrNoMem]>;
+def int_ppc_altivec_vrlqmi :
+ PowerPC_Vec_Intrinsic<"vrlqmi", [llvm_v1i128_ty],
+ [llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty],
+ [IntrNoMem]>;
+
// Vector Divide Extended Intrinsics.
def int_ppc_altivec_vdivesw : PowerPC_Vec_WWW_Intrinsic<"vdivesw">;
def int_ppc_altivec_vdiveuw : PowerPC_Vec_WWW_Intrinsic<"vdiveuw">;
Index: clang/test/CodeGen/builtins-ppc-p10vector.c
===================================================================
--- clang/test/CodeGen/builtins-ppc-p10vector.c
+++ clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -17,7 +17,7 @@
vector unsigned int vuia, vuib, vuic;
vector signed long long vslla, vsllb;
vector unsigned long long vulla, vullb, vullc;
-vector signed __int128 vsi128a, vsi128b;
+vector signed __int128 vsi128a, vsi128b, vsi128c;
vector unsigned __int128 vui128a, vui128b, vui128c;
vector float vfa, vfb;
vector double vda, vdb;
@@ -1157,3 +1157,49 @@
// CHECK: ret <1 x i128>
return vec_xl_zext(llb, ullap);
}
+
+vector signed __int128 test_vec_rl_s128(void) {
+ // CHECK-LABEL: @test_vec_rl_s128(
+ // CHECK: sub <1 x i128>
+ // CHECK-NEXT: lshr <1 x i128>
+ // CHECK-NEXT: or <1 x i128>
+ // CHECK-NEXT: ret <1 x i128>
+ return vec_rl(vsi128a, vsi128b);
+}
+
+vector unsigned __int128 test_vec_rl_u128(void) {
+ // CHECK-LABEL: @test_vec_rl_u128(
+ // CHECK: sub <1 x i128>
+ // CHECK: lshr <1 x i128>
+ // CHECK: or <1 x i128>
+ // CHECK-NEXT: ret <1 x i128>
+ return vec_rl(vui128a, vui128b);
+}
+
+vector signed __int128 test_vec_rlnm_s128(void) {
+ // CHECK-LABEL: @test_vec_rlnm_s128(
+ // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>
+ // CHECK-NEXT: ret <1 x i128>
+ return vec_rlnm(vsi128a, vsi128b, vsi128c);
+}
+
+vector unsigned __int128 test_vec_rlnm_u128(void) {
+ // CHECK-LABEL: @test_vec_rlnm_u128(
+ // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>
+ // CHECK-NEXT: ret <1 x i128>
+ return vec_rlnm(vui128a, vui128b, vui128c);
+}
+
+vector signed __int128 test_vec_rlmi_s128(void) {
+ // CHECK-LABEL: @test_vec_rlmi_s128(
+ // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>
+ // CHECK-NEXT: ret <1 x i128>
+ return vec_rlmi(vsi128a, vsi128b, vsi128c);
+}
+
+vector unsigned __int128 test_vec_rlmi_u128(void) {
+ // CHECK-LABEL: @test_vec_rlmi_u128(
+ // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>
+ // CHECK-NEXT: ret <1 x i128>
+ return vec_rlmi(vui128a, vui128b, vui128c);
+}
Index: clang/lib/Headers/altivec.h
===================================================================
--- clang/lib/Headers/altivec.h
+++ clang/lib/Headers/altivec.h
@@ -7789,6 +7789,18 @@
}
#endif
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rl(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return (__a << __b) |
+         (__a >> ((__CHAR_BIT__ * sizeof(vector unsigned __int128)) - __b));
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rl(vector signed __int128 __a, vector unsigned __int128 __b) {
+  // Use the unsigned overload so the right shift is logical, not arithmetic.
+  return (vector signed __int128)vec_rl((vector unsigned __int128)__a, __b);
+}
+#endif
+
/* vec_rlmi */
#ifdef __POWER9_VECTOR__
static __inline__ vector unsigned int __ATTRS_o_ai
@@ -7802,8 +7814,24 @@
vector unsigned long long __c) {
return __builtin_altivec_vrldmi(__a, __c, __b);
}
+#endif
+
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rlmi(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c) {
+ return __builtin_altivec_vrlqmi(__a, __c, __b);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rlmi(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c) {
+ return __builtin_altivec_vrlqmi(__a, __c, __b);
+}
+#endif
/* vec_rlnm */
+#ifdef __POWER9_VECTOR__
static __inline__ vector unsigned int __ATTRS_o_ai
vec_rlnm(vector unsigned int __a, vector unsigned int __b,
vector unsigned int __c) {
@@ -7819,6 +7847,42 @@
}
#endif
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rlnm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c) {
+ // Merge __b and __c using an appropriate shuffle.
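+  // vrlqnm expects the rotate count and the mask begin/end bounds packed
+  // into a single control vector; the shuffle below moves the relevant
+  // low-order bytes of __b and __c into the byte positions the instruction
+  // reads them from.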
+ vector unsigned char TmpB = (vector unsigned char)__b;
+ vector unsigned char TmpC = (vector unsigned char)__c;
+ vector unsigned char MaskAndShift =
+#ifdef __LITTLE_ENDIAN__
+ __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 1,
+ 0, -1, -1, -1, -1, -1);
+#else
+ __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 30, 31, 15, -1,
+ -1, -1, -1, -1, -1, -1, -1);
+#endif
+ return __builtin_altivec_vrlqnm(__a, MaskAndShift);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rlnm(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c) {
+ // Merge __b and __c using an appropriate shuffle.
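+  // See the unsigned vec_rlnm overload above for the control-vector layout.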
+ vector unsigned char TmpB = (vector unsigned char)__b;
+ vector unsigned char TmpC = (vector unsigned char)__c;
+ vector unsigned char MaskAndShift =
+#ifdef __LITTLE_ENDIAN__
+ __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 1,
+ 0, -1, -1, -1, -1, -1);
+#else
+ __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 30, 31, 15, -1,
+ -1, -1, -1, -1, -1, -1, -1);
+#endif
+ return __builtin_altivec_vrlqnm(__a, MaskAndShift);
+}
+#endif
+
/* vec_vrlb */
static __inline__ vector signed char __ATTRS_o_ai
Index: clang/include/clang/Basic/BuiltinsPPC.def
===================================================================
--- clang/include/clang/Basic/BuiltinsPPC.def
+++ clang/include/clang/Basic/BuiltinsPPC.def
@@ -390,6 +390,10 @@
BUILTIN(__builtin_altivec_vextddvlx, "V2ULLiV2ULLiV2ULLiUi", "")
BUILTIN(__builtin_altivec_vextddvrx, "V2ULLiV2ULLiV2ULLiUi", "")
+// P10 Vector rotate built-ins.
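+// The type string 'V1ULLLi' denotes a vector of one unsigned __int128
+// ('LLLi' encodes __int128).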
+BUILTIN(__builtin_altivec_vrlqmi, "V1ULLLiV1ULLLiV1ULLLiV1ULLLi", "")
+BUILTIN(__builtin_altivec_vrlqnm, "V1ULLLiV1ULLLiV1ULLLi", "")
+
// VSX built-ins.
BUILTIN(__builtin_vsx_lxvd2x, "V2divC*", "")