[clang] 56d5c46 - [X86] Support __tile_stream_loadd intrinsic for new AMX interface

2021-06-11 Thread Bing1 Yu via cfe-commits

Author: Bing1 Yu
Date: 2021-06-11T17:28:43+08:00
New Revision: 56d5c46b494d2232792a46e9b95de40b082f4164

URL: 
https://github.com/llvm/llvm-project/commit/56d5c46b494d2232792a46e9b95de40b082f4164
DIFF: 
https://github.com/llvm/llvm-project/commit/56d5c46b494d2232792a46e9b95de40b082f4164.diff

LOG: [X86] Support __tile_stream_loadd intrinsic for new AMX interface

Adding support for __tile_stream_loadd intrinsic.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D103784

Added: 


Modified: 
clang/include/clang/Basic/BuiltinsX86_64.def
clang/lib/Headers/amxintrin.h
clang/test/CodeGen/X86/amx_api.c
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/lib/Target/X86/X86ExpandPseudo.cpp
llvm/lib/Target/X86/X86FastTileConfig.cpp
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/lib/Target/X86/X86InstrAMX.td
llvm/lib/Target/X86/X86LowerAMXType.cpp
llvm/lib/Target/X86/X86PreAMXConfig.cpp
llvm/lib/Target/X86/X86RegisterInfo.cpp
llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Removed: 




diff  --git a/clang/include/clang/Basic/BuiltinsX86_64.def 
b/clang/include/clang/Basic/BuiltinsX86_64.def
index 57bf1b477d10b..ce2b1decdf6ca 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -103,6 +103,7 @@ TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", 
"uintr")
 // AMX internal builtin
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", 
"amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", 
"amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", 
"amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", 
"n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", 
"n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", 
"n", "amx-int8")

diff  --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index 6dc0c1f031c4f..ec601a58e7c34 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -239,6 +239,14 @@ _tile_loadd_internal(unsigned short m, unsigned short n, 
const void *base,
  (__SIZE_TYPE__)(stride));
 }
 
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
+   __SIZE_TYPE__ stride) {
+  return __builtin_ia32_tileloaddt164_internal(m, n, base,
+   (__SIZE_TYPE__)(stride));
+}
+
 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
@@ -311,6 +319,27 @@ static void __tile_loadd(__tile1024i *dst, const void 
*base,
   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
 }
 
+/// Load tile rows from memory specified by "base" address and "stride" into
+/// destination tile "dst". This intrinsic provides a hint to the 
implementation
+/// that the data will likely not be reused in the near future and the data
+/// caching can be optimized accordingly.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
+///
+/// \param dst
+///A destination tile. Max size is 1024 Bytes.
+/// \param base
+///A pointer to base address.
+/// \param stride
+///The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TILE
+static void __tile_stream_loadd(__tile1024i *dst, const void *base,
+__SIZE_TYPE__ stride) {
+  dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
+}
+
 /// Compute dot-product of bytes in tiles with a source/destination 
accumulator.
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 
32-bit

diff  --git a/clang/test/CodeGen/X86/amx_api.c 
b/clang/test/CodeGen/X86/amx_api.c
index 3bfe887c0445b..fda6d6e8ee4f2 100644
--- a/clang/test/CodeGen/X86/amx_api.c
+++ b/clang/test/CodeGen/X86/amx_api.c
@@ -39,6 +39,14 @@ void test_tile_loadd(short row, short col) {
   __tile_loadd(&a, buf, STRIDE);
 }
 
+void test_tile_stream_loadd(short row, short col) {
+  //CHECK-LABEL: @test_tile_stream_loadd
+  //CHECK: call x86_amx @llvm.x86.tileloaddt164.internal
+  //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
+  __tile1024i a = {row, col};
+  __tile_stream_loadd(&a, buf, STRIDE);
+}
+
 void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) {
   //CHECK-LABEL: @test_tile_dpbssd
   //CHECK: call x86_amx @llvm.x86.tdpbssd.internal

[clang] 07e3476 - [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-23 Thread Bing1 Yu via cfe-commits

Author: Bing1 Yu
Date: 2022-08-24T09:28:55+08:00
New Revision: 07e34763b02728857e1d6e8ccd2b82820eb3c0cc

URL: 
https://github.com/llvm/llvm-project/commit/07e34763b02728857e1d6e8ccd2b82820eb3c0cc
DIFF: 
https://github.com/llvm/llvm-project/commit/07e34763b02728857e1d6e8ccd2b82820eb3c0cc.diff

LOG: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D132141

Added: 


Modified: 
clang/lib/Headers/immintrin.h
clang/test/CodeGen/X86/rdrand-builtins.c

Removed: 




diff  --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index cca34783efaf4..9dfe3bfe6a6cd 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -287,6 +287,23 @@ _rdrand64_step(unsigned long long *__p)
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, 
__target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int __lo, __hi;
+  int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
+  int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
+  if (__res_lo && __res_hi) {
+*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
+return 1;
+  } else {
+*__p = 0;
+return 0;
+  }
+}
 #endif
 #endif /* __RDRND__ */
 

diff  --git a/clang/test/CodeGen/X86/rdrand-builtins.c 
b/clang/test/CodeGen/X86/rdrand-builtins.c
index 4eb17a400fa8b..b3ad463c55f24 100644
--- a/clang/test/CodeGen/X86/rdrand-builtins.c
+++ b/clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include <immintrin.h>
 
@@ -17,14 +17,61 @@ int rdrand32(unsigned *p) {
 // CHECK: store i32
 }
 
-#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
+
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label 
[[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label 
[[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[TMP11]], align 4
+

[clang] 0d8f952 - Revert "[X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit"

2022-08-23 Thread Bing1 Yu via cfe-commits

Author: Bing1 Yu
Date: 2022-08-24T09:38:46+08:00
New Revision: 0d8f9520c5d8912e80b3f245d369c7b86fbd2d5d

URL: 
https://github.com/llvm/llvm-project/commit/0d8f9520c5d8912e80b3f245d369c7b86fbd2d5d
DIFF: 
https://github.com/llvm/llvm-project/commit/0d8f9520c5d8912e80b3f245d369c7b86fbd2d5d.diff

LOG: Revert "[X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit"

This reverts commit 07e34763b02728857e1d6e8ccd2b82820eb3c0cc.

Added: 


Modified: 
clang/lib/Headers/immintrin.h
clang/test/CodeGen/X86/rdrand-builtins.c

Removed: 




diff  --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 9dfe3bfe6a6cd..cca34783efaf4 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -287,23 +287,6 @@ _rdrand64_step(unsigned long long *__p)
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
-#else
-// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
-// rdrand instructions.
-static __inline__ int __attribute__((__always_inline__, __nodebug__, 
__target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
-  unsigned int __lo, __hi;
-  int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
-  int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
-  if (__res_lo && __res_hi) {
-*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
-return 1;
-  } else {
-*__p = 0;
-return 0;
-  }
-}
 #endif
 #endif /* __RDRND__ */
 

diff  --git a/clang/test/CodeGen/X86/rdrand-builtins.c 
b/clang/test/CodeGen/X86/rdrand-builtins.c
index b3ad463c55f24..4eb17a400fa8b 100644
--- a/clang/test/CodeGen/X86/rdrand-builtins.c
+++ b/clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
 
 #include <immintrin.h>
 
@@ -17,61 +17,14 @@ int rdrand32(unsigned *p) {
 // CHECK: store i32
 }
 
+#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
-
-// X86-LABEL: @rdrand64(
-// X86-NEXT:  entry:
-// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
-// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
-// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4
-// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4
-// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4
-// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4
-// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
-// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
-// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
-// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
-// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
-// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
-// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4
-// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
-// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4
-// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
-// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
-// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4
-// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
-// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4
-// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4
-// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0
-// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label 
[[IF_ELSE_I:%.*]]
-// X86:   land.lhs.true.i:
-// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4
-// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0
-// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label 
[[IF_ELSE_I]]
-// X86:   if.then.i:
-// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4
-// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64
-// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
-// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4
-// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
-// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
-// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
-// X86-NEXT:store i64 [[OR_I]], i64* [[TMP11]], align 4
-// X86-

[clang] 6d8ddf5 - [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

2022-08-23 Thread Bing1 Yu via cfe-commits

Author: Bing1 Yu
Date: 2022-08-24T10:22:46+08:00
New Revision: 6d8ddf53cc8026748a27a8964e117da371f2ccf2

URL: 
https://github.com/llvm/llvm-project/commit/6d8ddf53cc8026748a27a8964e117da371f2ccf2
DIFF: 
https://github.com/llvm/llvm-project/commit/6d8ddf53cc8026748a27a8964e117da371f2ccf2.diff

LOG: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D132141

Added: 


Modified: 
clang/lib/Headers/immintrin.h
clang/test/CodeGen/X86/rdrand-builtins.c

Removed: 




diff  --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index cca34783efaf4..f4e4ceaefb2e3 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -287,6 +287,23 @@ _rdrand64_step(unsigned long long *__p)
 {
   return (int)__builtin_ia32_rdrand64_step(__p);
 }
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, 
__target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  unsigned int __lo, __hi;
+  unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
+  unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
+  if (__res_lo && __res_hi) {
+*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
+return 1;
+  } else {
+*__p = 0;
+return 0;
+  }
+}
 #endif
 #endif /* __RDRND__ */
 

diff  --git a/clang/test/CodeGen/X86/rdrand-builtins.c 
b/clang/test/CodeGen/X86/rdrand-builtins.c
index 4eb17a400fa8b..b3ad463c55f24 100644
--- a/clang/test/CodeGen/X86/rdrand-builtins.c
+++ b/clang/test/CodeGen/X86/rdrand-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s 
-triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
 
 #include <immintrin.h>
 
@@ -17,14 +17,61 @@ int rdrand32(unsigned *p) {
 // CHECK: store i32
 }
 
-#if __x86_64__
 int rdrand64(unsigned long long *p) {
   return _rdrand64_step(p);
 // X64: @rdrand64
 // X64: call { i64, i32 } @llvm.x86.rdrand.64
 // X64: store i64
+
+// X86-LABEL: @rdrand64(
+// X86-NEXT:  entry:
+// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4
+// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4
+// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4
+// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4
+// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4
+// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0
+// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4
+// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1
+// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32()
+// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0
+// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4
+// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1
+// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4
+// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0
+// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label 
[[IF_ELSE_I:%.*]]
+// X86:   land.lhs.true.i:
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4
+// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0
+// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label 
[[IF_ELSE_I]]
+// X86:   if.then.i:
+// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4
+// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64
+// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32
+// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4
+// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64
+// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]]
+// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4
+// X86-NEXT:store i64 [[OR_I]], i64* [[

[clang] 6ee497a - [X86][Regcall] Add an option to respect regcall ABI v.4 in win64&win32

2023-08-02 Thread Bing1 Yu via cfe-commits

Author: Bing1 Yu
Date: 2023-08-03T13:58:33+08:00
New Revision: 6ee497aa0b48ad892447f29a90b4e61241949295

URL: 
https://github.com/llvm/llvm-project/commit/6ee497aa0b48ad892447f29a90b4e61241949295
DIFF: 
https://github.com/llvm/llvm-project/commit/6ee497aa0b48ad892447f29a90b4e61241949295.diff

LOG: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64&win32

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D155863

Added: 
clang/test/CodeGen/check-regcall4-moduleflag.c
clang/test/CodeGen/regcall4.c
clang/test/CodeGenCXX/regcall4.cpp
llvm/test/CodeGen/X86/sse-regcall4.ll

Modified: 
clang/include/clang/Basic/LangOptions.def
clang/include/clang/Driver/Options.td
clang/lib/AST/ItaniumMangle.cpp
clang/lib/AST/Mangle.cpp
clang/lib/AST/MicrosoftMangle.cpp
clang/lib/CodeGen/CodeGenModule.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/test/Driver/cl-cc-flags.c
llvm/lib/Target/X86/X86CallingConv.td

Removed: 




diff  --git a/clang/include/clang/Basic/LangOptions.def 
b/clang/include/clang/Basic/LangOptions.def
index 007b3737f83e62..b6bb5e969e130c 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -429,6 +429,8 @@ LANGOPT(PaddingOnUnsignedFixedPoint, 1, 0,
 
 LANGOPT(RegisterStaticDestructors, 1, 1, "Register C++ static destructors")
 
+LANGOPT(RegCall4, 1, 0, "Set __regcall4 as a default calling convention to 
respect __regcall ABI v.4")
+
 LANGOPT(MatrixTypes, 1, 0, "Enable or disable the builtin matrix type")
 
 ENUM_LANGOPT(StrictFlexArraysLevel, StrictFlexArraysLevelKind, 2,

diff  --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 9e25a5e0b58a58..296fa1fcc38a02 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4505,6 +4505,9 @@ def no_offload_add_rpath: Flag<["--"], 
"no-offload-add-rpath">, Flags<[NoArgumen
   Alias;
 def r : Flag<["-"], "r">, Flags<[LinkerInput,NoArgumentUnused]>,
 Group;
+def regcall4 : Flag<["-"], "regcall4">, Group, Flags<[CC1Option]>,
+  HelpText<"Set __regcall4 as a default calling convention to respect 
__regcall ABI v.4">,
+  MarshallingInfoFlag<LangOpts<"RegCall4">>;
 def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[CC1Option, 
FlangOption, FC1Option, NoXarchOption]>,
   HelpText<"Save intermediate compilation results.">;
 def save_temps : Flag<["-", "--"], "save-temps">, Flags<[FlangOption, 
FC1Option, NoXarchOption]>,
@@ -7292,6 +7295,8 @@ def _SLASH_Gv : CLFlag<"Gv">,
   HelpText<"Set __vectorcall as a default calling convention">;
 def _SLASH_Gregcall : CLFlag<"Gregcall">,
   HelpText<"Set __regcall as a default calling convention">;
+def _SLASH_Gregcall4 : CLFlag<"Gregcall4">,
+  HelpText<"Set __regcall4 as a default calling convention to respect 
__regcall ABI v.4">;
 
 // GNU Driver aliases
 

diff  --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 16f0d90451f7ad..153f6dc2e9cf12 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -1688,8 +1688,12 @@ void CXXNameMangler::mangleRegCallName(const 
IdentifierInfo *II) {
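   // <source-name> ::= <positive length number> __regcall3__ <identifier>
   // <number> ::= [n] <non-negative decimal integer>
   // <identifier> ::= <unqualified source code identifier>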
-  Out << II->getLength() + sizeof("__regcall3__") - 1 << "__regcall3__"
-  << II->getName();
+  if (getASTContext().getLangOpts().RegCall4)
+Out << II->getLength() + sizeof("__regcall4__") - 1 << "__regcall4__"
+<< II->getName();
+  else
+Out << II->getLength() + sizeof("__regcall3__") - 1 << "__regcall3__"
+<< II->getName();
 }
 
 void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) {

diff  --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp
index 31cdad4c8fdd4e..53af9fc4d51897 100644
--- a/clang/lib/AST/Mangle.cpp
+++ b/clang/lib/AST/Mangle.cpp
@@ -198,8 +198,12 @@ void MangleContext::mangleName(GlobalDecl GD, raw_ostream 
&Out) {
 Out << '_';
   else if (CC == CCM_Fast)
 Out << '@';
-  else if (CC == CCM_RegCall)
-Out << "__regcall3__";
+  else if (CC == CCM_RegCall) {
+if (getASTContext().getLangOpts().RegCall4)
+  Out << "__regcall4__";
+else
+  Out << "__regcall3__";
+  }
 
   if (!MCXX)
 Out << D->getIdentifier()->getName();

diff  --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index 3306d90dc85664..91af18d6119796 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -2853,6 +2853,7 @@ void 
MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) {
   //  ::= T # __attribute__((__swiftasynccall__))
   //// Clang-only
   //  ::= w # __regcall
+  //  ::= x # __regcall4
   // The 'export' calling conventions are from a bygone era
   // (*cough*Win16*cough*) when functions were declared fo

[clang] 320b72e - [X86][AMX] Rename amx-bf16 intrinsic according to correct naming convention

2021-03-16 Thread Bing1 Yu via cfe-commits

Author: Bing1 Yu
Date: 2021-03-17T11:22:52+08:00
New Revision: 320b72e9cd77504054bd2c837149df2f2bd4c149

URL: 
https://github.com/llvm/llvm-project/commit/320b72e9cd77504054bd2c837149df2f2bd4c149
DIFF: 
https://github.com/llvm/llvm-project/commit/320b72e9cd77504054bd2c837149df2f2bd4c149.diff

LOG: [X86][AMX] Rename amx-bf16 intrinsic according to correct naming convention

__tile_tdpbf16ps should be renamed to __tile_dpbf16ps.

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D98685

Added: 


Modified: 
clang/lib/Headers/amxintrin.h
clang/test/CodeGen/X86/amx_api.c

Removed: 




diff  --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index 8c276519e362..12d21d40bcff 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -267,8 +267,8 @@ _tile_stored_internal(unsigned short m, unsigned short n, 
void *base,
 }
 
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
-_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+_tile1024i dst, _tile1024i src1, _tile1024i src2) {
   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 }
 
@@ -323,10 +323,10 @@ static void __tile_zero(__tile1024i *dst) {
 }
 
 __DEFAULT_FN_ATTRS_BF16
-static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1,
- __tile1024i src2) {
-  dst->tile = _tile_tdpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
-   src1.tile, src2.tile);
+static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src1,
+__tile1024i src2) {
+  dst->tile = _tile_dpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
+  src1.tile, src2.tile);
 }
 
 #undef __DEFAULT_FN_ATTRS_TILE

diff  --git a/clang/test/CodeGen/X86/amx_api.c 
b/clang/test/CodeGen/X86/amx_api.c
index 824a3aec20ec..3bfe887c0445 100644
--- a/clang/test/CodeGen/X86/amx_api.c
+++ b/clang/test/CodeGen/X86/amx_api.c
@@ -81,9 +81,9 @@ void test_tile_zero(__tile1024i c) {
   __tile_zero(&c);
 }
 
-void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
-  //CHECK-LABEL: @test_tile_tdpbf16ps
+void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbf16ps
   //CHECK: call x86_amx @llvm.x86.tdpbf16ps.internal
   //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
-  __tile_tdpbf16ps(&a, b, c);
+  __tile_dpbf16ps(&a, b, c);
 }



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits