[clang] 56d5c46 - [X86] Support __tile_stream_loadd intrinsic for new AMX interface
Author: Bing1 Yu Date: 2021-06-11T17:28:43+08:00 New Revision: 56d5c46b494d2232792a46e9b95de40b082f4164 URL: https://github.com/llvm/llvm-project/commit/56d5c46b494d2232792a46e9b95de40b082f4164 DIFF: https://github.com/llvm/llvm-project/commit/56d5c46b494d2232792a46e9b95de40b082f4164.diff LOG: [X86] Support __tile_stream_loadd intrinsic for new AMX interface Adding support for __tile_stream_loadd intrinsic. Reviewed By: LuoYuanke Differential Revision: https://reviews.llvm.org/D103784 Added: Modified: clang/include/clang/Basic/BuiltinsX86_64.def clang/lib/Headers/amxintrin.h clang/test/CodeGen/X86/amx_api.c llvm/include/llvm/IR/IntrinsicsX86.td llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86FastTileConfig.cpp llvm/lib/Target/X86/X86ISelDAGToDAG.cpp llvm/lib/Target/X86/X86InstrAMX.td llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86PreAMXConfig.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll Removed: diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 57bf1b477d10b..ce2b1decdf6ca 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -103,6 +103,7 @@ TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr") // AMX internal builtin TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index 6dc0c1f031c4f..ec601a58e7c34 100644 --- 
a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -239,6 +239,14 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, (__SIZE_TYPE__)(stride)); } +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddt164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} + /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, @@ -311,6 +319,27 @@ static void __tile_loadd(__tile1024i *dst, const void *base, dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); } +/// Load tile rows from memory specifieid by "base" address and "stride" into +/// destination tile "dst". This intrinsic provides a hint to the implementation +/// that the data will likely not be reused in the near future and the data +/// caching can be optimized accordingly. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADDT1 instruction. +/// +/// \param dst +///A destination tile. Max size is 1024 Bytes. +/// \param base +///A pointer to base address. +/// \param stride +///The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TILE +static void __tile_stream_loadd(__tile1024i *dst, const void *base, +__SIZE_TYPE__ stride) { + dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); +} + /// Compute dot-product of bytes in tiles with a source/destination accumulator. 
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c index 3bfe887c0445b..fda6d6e8ee4f2 100644 --- a/clang/test/CodeGen/X86/amx_api.c +++ b/clang/test/CodeGen/X86/amx_api.c @@ -39,6 +39,14 @@ void test_tile_loadd(short row, short col) { __tile_loadd(&a, buf, STRIDE); } +void test_tile_stream_loadd(short row, short col) { + //CHECK-LABEL: @test_tile_stream_loadd + //CHECK: call x86_amx @llvm.x86.tileloaddt164.internal + //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32> + __tile1024i a = {row, col}; + __tile_stream_loadd(&a, buf, STRIDE); +} + void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbssd //CHECK: call x86_amx @llvm.x86.tdpbssd.internal
[clang] 07e3476 - [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit
Author: Bing1 Yu Date: 2022-08-24T09:28:55+08:00 New Revision: 07e34763b02728857e1d6e8ccd2b82820eb3c0cc URL: https://github.com/llvm/llvm-project/commit/07e34763b02728857e1d6e8ccd2b82820eb3c0cc DIFF: https://github.com/llvm/llvm-project/commit/07e34763b02728857e1d6e8ccd2b82820eb3c0cc.diff LOG: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D132141 Added: Modified: clang/lib/Headers/immintrin.h clang/test/CodeGen/X86/rdrand-builtins.c Removed: diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index cca34783efaf4..9dfe3bfe6a6cd 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -287,6 +287,23 @@ _rdrand64_step(unsigned long long *__p) { return (int)__builtin_ia32_rdrand64_step(__p); } +#else +// We need to emulate the functionality of 64-bit rdrand with 2 32-bit +// rdrand instructions. +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) +_rdrand64_step(unsigned long long *__p) +{ + unsigned int __lo, __hi; + int __res_lo = __builtin_ia32_rdrand32_step(&__lo); + int __res_hi = __builtin_ia32_rdrand32_step(&__hi); + if (__res_lo && __res_hi) { +*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo; +return 1; + } else { +*__p = 0; +return 0; + } +} #endif #endif /* __RDRND__ */ diff --git a/clang/test/CodeGen/X86/rdrand-builtins.c b/clang/test/CodeGen/X86/rdrand-builtins.c index 4eb17a400fa8b..b3ad463c55f24 100644 --- a/clang/test/CodeGen/X86/rdrand-builtins.c +++ b/clang/test/CodeGen/X86/rdrand-builtins.c @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64 -// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall 
-Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86 #include @@ -17,14 +17,61 @@ int rdrand32(unsigned *p) { // CHECK: store i32 } -#if __x86_64__ int rdrand64(unsigned long long *p) { return _rdrand64_step(p); // X64: @rdrand64 // X64: call { i64, i32 } @llvm.x86.rdrand.64 // X64: store i64 + +// X86-LABEL: @rdrand64( +// X86-NEXT: entry: +// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4 +// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4 +// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4 +// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4 +// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4 +// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32() +// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0 +// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4 +// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1 +// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4 +// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32() +// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0 +// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4 +// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1 +// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4 +// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4 +// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0 +// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[IF_ELSE_I:%.*]] +// X86: land.lhs.true.i: +// 
X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4 +// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0 +// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I]] +// X86: if.then.i: +// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4 +// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64 +// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32 +// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4 +// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64 +// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]] +// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4 +// X86-NEXT:store i64 [[OR_I]], i64* [[TMP11]], align 4 +
[clang] 0d8f952 - Revert "[X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit"
Author: Bing1 Yu Date: 2022-08-24T09:38:46+08:00 New Revision: 0d8f9520c5d8912e80b3f245d369c7b86fbd2d5d URL: https://github.com/llvm/llvm-project/commit/0d8f9520c5d8912e80b3f245d369c7b86fbd2d5d DIFF: https://github.com/llvm/llvm-project/commit/0d8f9520c5d8912e80b3f245d369c7b86fbd2d5d.diff LOG: Revert "[X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit" This reverts commit 07e34763b02728857e1d6e8ccd2b82820eb3c0cc. Added: Modified: clang/lib/Headers/immintrin.h clang/test/CodeGen/X86/rdrand-builtins.c Removed: diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 9dfe3bfe6a6cd..cca34783efaf4 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -287,23 +287,6 @@ _rdrand64_step(unsigned long long *__p) { return (int)__builtin_ia32_rdrand64_step(__p); } -#else -// We need to emulate the functionality of 64-bit rdrand with 2 32-bit -// rdrand instructions. -static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) -_rdrand64_step(unsigned long long *__p) -{ - unsigned int __lo, __hi; - int __res_lo = __builtin_ia32_rdrand32_step(&__lo); - int __res_hi = __builtin_ia32_rdrand32_step(&__hi); - if (__res_lo && __res_hi) { -*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo; -return 1; - } else { -*__p = 0; -return 0; - } -} #endif #endif /* __RDRND__ */ diff --git a/clang/test/CodeGen/X86/rdrand-builtins.c b/clang/test/CodeGen/X86/rdrand-builtins.c index b3ad463c55f24..4eb17a400fa8b 100644 --- a/clang/test/CodeGen/X86/rdrand-builtins.c +++ b/clang/test/CodeGen/X86/rdrand-builtins.c @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64 -// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall 
-Werror | FileCheck %s --check-prefixes=CHECK,X86 +// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK #include @@ -17,61 +17,14 @@ int rdrand32(unsigned *p) { // CHECK: store i32 } +#if __x86_64__ int rdrand64(unsigned long long *p) { return _rdrand64_step(p); // X64: @rdrand64 // X64: call { i64, i32 } @llvm.x86.rdrand.64 // X64: store i64 - -// X86-LABEL: @rdrand64( -// X86-NEXT: entry: -// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4 -// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4 -// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4 -// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4 -// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4 -// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4 -// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4 -// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4 -// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4 -// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4 -// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32() -// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0 -// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4 -// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1 -// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4 -// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32() -// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0 -// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4 -// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1 -// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4 -// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4 -// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0 -// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[IF_ELSE_I:%.*]] -// X86: land.lhs.true.i: -// 
X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4 -// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0 -// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I]] -// X86: if.then.i: -// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4 -// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64 -// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32 -// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4 -// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64 -// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]] -// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4 -// X86-NEXT:store i64 [[OR_I]], i64* [[TMP11]], align 4 -// X86-
[clang] 6d8ddf5 - [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit
Author: Bing1 Yu Date: 2022-08-24T10:22:46+08:00 New Revision: 6d8ddf53cc8026748a27a8964e117da371f2ccf2 URL: https://github.com/llvm/llvm-project/commit/6d8ddf53cc8026748a27a8964e117da371f2ccf2 DIFF: https://github.com/llvm/llvm-project/commit/6d8ddf53cc8026748a27a8964e117da371f2ccf2.diff LOG: [X86] Emulate _rdrand64_step with two rdrand32 if it is 32bit Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D132141 Added: Modified: clang/lib/Headers/immintrin.h clang/test/CodeGen/X86/rdrand-builtins.c Removed: diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index cca34783efaf4..f4e4ceaefb2e3 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -287,6 +287,23 @@ _rdrand64_step(unsigned long long *__p) { return (int)__builtin_ia32_rdrand64_step(__p); } +#else +// We need to emulate the functionality of 64-bit rdrand with 2 32-bit +// rdrand instructions. +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) +_rdrand64_step(unsigned long long *__p) +{ + unsigned int __lo, __hi; + unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo); + unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi); + if (__res_lo && __res_hi) { +*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo; +return 1; + } else { +*__p = 0; +return 0; + } +} #endif #endif /* __RDRND__ */ diff --git a/clang/test/CodeGen/X86/rdrand-builtins.c b/clang/test/CodeGen/X86/rdrand-builtins.c index 4eb17a400fa8b..b3ad463c55f24 100644 --- a/clang/test/CodeGen/X86/rdrand-builtins.c +++ b/clang/test/CodeGen/X86/rdrand-builtins.c @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64 -// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -no-opaque-pointers -ffreestanding %s -triple=i386-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86 #include @@ -17,14 +17,61 @@ int rdrand32(unsigned *p) { // CHECK: store i32 } -#if __x86_64__ int rdrand64(unsigned long long *p) { return _rdrand64_step(p); // X64: @rdrand64 // X64: call { i64, i32 } @llvm.x86.rdrand.64 // X64: store i64 + +// X86-LABEL: @rdrand64( +// X86-NEXT: entry: +// X86-NEXT:[[RETVAL_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[__P_ADDR_I:%.*]] = alloca i64*, align 4 +// X86-NEXT:[[__LO_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[__HI_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[__RES_LO_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[__RES_HI_I:%.*]] = alloca i32, align 4 +// X86-NEXT:[[P_ADDR:%.*]] = alloca i64*, align 4 +// X86-NEXT:store i64* [[P:%.*]], i64** [[P_ADDR]], align 4 +// X86-NEXT:[[TMP0:%.*]] = load i64*, i64** [[P_ADDR]], align 4 +// X86-NEXT:store i64* [[TMP0]], i64** [[__P_ADDR_I]], align 4 +// X86-NEXT:[[TMP1:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32() +// X86-NEXT:[[TMP2:%.*]] = extractvalue { i32, i32 } [[TMP1]], 0 +// X86-NEXT:store i32 [[TMP2]], i32* [[__LO_I]], align 4 +// X86-NEXT:[[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP1]], 1 +// X86-NEXT:store i32 [[TMP3]], i32* [[__RES_LO_I]], align 4 +// X86-NEXT:[[TMP4:%.*]] = call { i32, i32 } @llvm.x86.rdrand.32() +// X86-NEXT:[[TMP5:%.*]] = extractvalue { i32, i32 } [[TMP4]], 0 +// X86-NEXT:store i32 [[TMP5]], i32* [[__HI_I]], align 4 +// X86-NEXT:[[TMP6:%.*]] = extractvalue { i32, i32 } [[TMP4]], 1 +// X86-NEXT:store i32 [[TMP6]], i32* [[__RES_HI_I]], align 4 +// X86-NEXT:[[TMP7:%.*]] = load i32, i32* [[__RES_LO_I]], align 4 +// X86-NEXT:[[TOBOOL_I:%.*]] = icmp ne i32 [[TMP7]], 0 +// X86-NEXT:br i1 [[TOBOOL_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[IF_ELSE_I:%.*]] +// X86: land.lhs.true.i: 
+// X86-NEXT:[[TMP8:%.*]] = load i32, i32* [[__RES_HI_I]], align 4 +// X86-NEXT:[[TOBOOL1_I:%.*]] = icmp ne i32 [[TMP8]], 0 +// X86-NEXT:br i1 [[TOBOOL1_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I]] +// X86: if.then.i: +// X86-NEXT:[[TMP9:%.*]] = load i32, i32* [[__HI_I]], align 4 +// X86-NEXT:[[CONV_I:%.*]] = zext i32 [[TMP9]] to i64 +// X86-NEXT:[[SHL_I:%.*]] = shl i64 [[CONV_I]], 32 +// X86-NEXT:[[TMP10:%.*]] = load i32, i32* [[__LO_I]], align 4 +// X86-NEXT:[[CONV2_I:%.*]] = zext i32 [[TMP10]] to i64 +// X86-NEXT:[[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV2_I]] +// X86-NEXT:[[TMP11:%.*]] = load i64*, i64** [[__P_ADDR_I]], align 4 +// X86-NEXT:store i64 [[OR_I]], i64* [[
[clang] 6ee497a - [X86][Regcall] Add an option to respect regcall ABI v.4 in win64&win32
Author: Bing1 Yu Date: 2023-08-03T13:58:33+08:00 New Revision: 6ee497aa0b48ad892447f29a90b4e61241949295 URL: https://github.com/llvm/llvm-project/commit/6ee497aa0b48ad892447f29a90b4e61241949295 DIFF: https://github.com/llvm/llvm-project/commit/6ee497aa0b48ad892447f29a90b4e61241949295.diff LOG: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64&win32 Reviewed By: pengfei Differential Revision: https://reviews.llvm.org/D155863 Added: clang/test/CodeGen/check-regcall4-moduleflag.c clang/test/CodeGen/regcall4.c clang/test/CodeGenCXX/regcall4.cpp llvm/test/CodeGen/X86/sse-regcall4.ll Modified: clang/include/clang/Basic/LangOptions.def clang/include/clang/Driver/Options.td clang/lib/AST/ItaniumMangle.cpp clang/lib/AST/Mangle.cpp clang/lib/AST/MicrosoftMangle.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/lib/Driver/ToolChains/Clang.cpp clang/test/Driver/cl-cc-flags.c llvm/lib/Target/X86/X86CallingConv.td Removed: diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 007b3737f83e62..b6bb5e969e130c 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -429,6 +429,8 @@ LANGOPT(PaddingOnUnsignedFixedPoint, 1, 0, LANGOPT(RegisterStaticDestructors, 1, 1, "Register C++ static destructors") +LANGOPT(RegCall4, 1, 0, "Set __regcall4 as a default calling convention to respect __regcall ABI v.4") + LANGOPT(MatrixTypes, 1, 0, "Enable or disable the builtin matrix type") ENUM_LANGOPT(StrictFlexArraysLevel, StrictFlexArraysLevelKind, 2, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 9e25a5e0b58a58..296fa1fcc38a02 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4505,6 +4505,9 @@ def no_offload_add_rpath: Flag<["--"], "no-offload-add-rpath">, Flags<[NoArgumen Alias; def r : Flag<["-"], "r">, Flags<[LinkerInput,NoArgumentUnused]>, Group; +def regcall4 : Flag<["-"], 
"regcall4">, Group, Flags<[CC1Option]>, + HelpText<"Set __regcall4 as a default calling convention to respect __regcall ABI v.4">, + MarshallingInfoFlag>; def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[CC1Option, FlangOption, FC1Option, NoXarchOption]>, HelpText<"Save intermediate compilation results.">; def save_temps : Flag<["-", "--"], "save-temps">, Flags<[FlangOption, FC1Option, NoXarchOption]>, @@ -7292,6 +7295,8 @@ def _SLASH_Gv : CLFlag<"Gv">, HelpText<"Set __vectorcall as a default calling convention">; def _SLASH_Gregcall : CLFlag<"Gregcall">, HelpText<"Set __regcall as a default calling convention">; +def _SLASH_Gregcall4 : CLFlag<"Gregcall4">, + HelpText<"Set __regcall4 as a default calling convention to respect __regcall ABI v.4">; // GNU Driver aliases diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 16f0d90451f7ad..153f6dc2e9cf12 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1688,8 +1688,12 @@ void CXXNameMangler::mangleRegCallName(const IdentifierInfo *II) { // ::= __regcall3__ // ::= [n] // ::= - Out << II->getLength() + sizeof("__regcall3__") - 1 << "__regcall3__" - << II->getName(); + if (getASTContext().getLangOpts().RegCall4) +Out << II->getLength() + sizeof("__regcall4__") - 1 << "__regcall4__" +<< II->getName(); + else +Out << II->getLength() + sizeof("__regcall3__") - 1 << "__regcall3__" +<< II->getName(); } void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) { diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index 31cdad4c8fdd4e..53af9fc4d51897 100644 --- a/clang/lib/AST/Mangle.cpp +++ b/clang/lib/AST/Mangle.cpp @@ -198,8 +198,12 @@ void MangleContext::mangleName(GlobalDecl GD, raw_ostream &Out) { Out << '_'; else if (CC == CCM_Fast) Out << '@'; - else if (CC == CCM_RegCall) -Out << "__regcall3__"; + else if (CC == CCM_RegCall) { +if (getASTContext().getLangOpts().RegCall4) + Out << "__regcall4__"; +else + Out << 
"__regcall3__"; + } if (!MCXX) Out << D->getIdentifier()->getName(); diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 3306d90dc85664..91af18d6119796 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -2853,6 +2853,7 @@ void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) { // ::= T # __attribute__((__swiftasynccall__)) //// Clang-only // ::= w # __regcall + // ::= x # __regcall4 // The 'export' calling conventions are from a bygone era // (*cough*Win16*cough*) when functions were declared fo
[clang] 320b72e - [X86][AMX] Rename amx-bf16 intrinsic according to correct naming convention
Author: Bing1 Yu Date: 2021-03-17T11:22:52+08:00 New Revision: 320b72e9cd77504054bd2c837149df2f2bd4c149 URL: https://github.com/llvm/llvm-project/commit/320b72e9cd77504054bd2c837149df2f2bd4c149 DIFF: https://github.com/llvm/llvm-project/commit/320b72e9cd77504054bd2c837149df2f2bd4c149.diff LOG: [X86][AMX] Rename amx-bf16 intrinsic according to correct naming convention __tile_tdpbf16ps should be renamed with __tile_dpbf16ps Reviewed By: pengfei Differential Revision: https://reviews.llvm.org/D98685 Added: Modified: clang/lib/Headers/amxintrin.h clang/test/CodeGen/X86/amx_api.c Removed: diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index 8c276519e362..12d21d40bcff 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -267,8 +267,8 @@ _tile_stored_internal(unsigned short m, unsigned short n, void *base, } static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16 -_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { +_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, +_tile1024i dst, _tile1024i src1, _tile1024i src2) { return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); } @@ -323,10 +323,10 @@ static void __tile_zero(__tile1024i *dst) { } __DEFAULT_FN_ATTRS_BF16 -static void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src1, - __tile1024i src2) { - dst->tile = _tile_tdpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile, - src1.tile, src2.tile); +static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src1, +__tile1024i src2) { + dst->tile = _tile_dpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); } #undef __DEFAULT_FN_ATTRS_TILE diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c index 824a3aec20ec..3bfe887c0445 100644 --- a/clang/test/CodeGen/X86/amx_api.c +++ b/clang/test/CodeGen/X86/amx_api.c @@ -81,9 +81,9 @@ void 
test_tile_zero(__tile1024i c) { __tile_zero(&c); } -void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) { - //CHECK-LABEL: @test_tile_tdpbf16ps +void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_dpbf16ps //CHECK: call x86_amx @llvm.x86.tdpbf16ps.internal //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32> - __tile_tdpbf16ps(&a, b, c); + __tile_dpbf16ps(&a, b, c); } ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits