Author: Jan Patrick Lehr Date: 2026-03-11T09:27:48-04:00 New Revision: 883aa697db5ebc29502d93cd8d54d22fd1bc2644
URL: https://github.com/llvm/llvm-project/commit/883aa697db5ebc29502d93cd8d54d22fd1bc2644 DIFF: https://github.com/llvm/llvm-project/commit/883aa697db5ebc29502d93cd8d54d22fd1bc2644.diff LOG: Revert "[Clang][AMDGPU] Change __fp16 to _Float16 in builtin definitions" (#185861) Reverts llvm/llvm-project#185446 This breaks CK build downstream. Added: Modified: clang/include/clang/Basic/BuiltinsAMDGPU.td clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip Removed: clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip ################################################################################ diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index 18aebdc38bcfc..acd0a34a79253 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -223,7 +223,7 @@ def __builtin_amdgcn_alignbit : AMDGPUBuiltin<"unsigned int(unsigned int, unsign def __builtin_amdgcn_alignbyte : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_ubfe : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_sbfe : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", [Const]>; -def __builtin_amdgcn_cvt_pkrtz : AMDGPUBuiltin<"_ExtVector<2, _Float16>(float, float)", [Const]>; +def __builtin_amdgcn_cvt_pkrtz : AMDGPUBuiltin<"_ExtVector<2, __fp16>(float, float)", [Const]>; def __builtin_amdgcn_cvt_pknorm_i16 : AMDGPUBuiltin<"_ExtVector<2, short>(float, float)", [Const], "cvt-pknorm-vop2-insts">; def __builtin_amdgcn_cvt_pknorm_u16 : AMDGPUBuiltin<"_ExtVector<2, unsigned short>(float, float)", [Const], "cvt-pknorm-vop2-insts">; def __builtin_amdgcn_cvt_pk_i16 : AMDGPUBuiltin<"_ExtVector<2, short>(int, int)", [Const]>; @@ -319,7 +319,7 @@ def __builtin_amdgcn_ds_gws_sema_release_all : AMDGPUBuiltin<"void(unsigned int) // Interpolation builtins. //===----------------------------------------------------------------------===// def __builtin_amdgcn_interp_p1_f16 : AMDGPUBuiltin<"float(float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; -def __builtin_amdgcn_interp_p2_f16 : AMDGPUBuiltin<"_Float16(float, float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; +def __builtin_amdgcn_interp_p2_f16 : AMDGPUBuiltin<"__fp16(float, float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; def __builtin_amdgcn_interp_p1 : AMDGPUBuiltin<"float(float, unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_interp_p2 : AMDGPUBuiltin<"float(float, float, unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_interp_mov : AMDGPUBuiltin<"float(unsigned int, unsigned int, unsigned int, unsigned int)", [Const]>; @@ -349,7 +349,7 @@ def __builtin_amdgcn_perm : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned i // GFX9+ only builtins. //===----------------------------------------------------------------------===// -def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"_Float16(_Float16, _Float16, _Float16)", [Const], "gfx9-insts">; +def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">; def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">; def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">; @@ -669,7 +669,7 @@ def __builtin_amdgcn_ds_read_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ def __builtin_amdgcn_ds_read_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr16_b64_v4i16 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short> address_space<3> *)", [Const], "gfx950-insts">; -def __builtin_amdgcn_ds_read_tr16_b64_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<3> *)", [Const], "gfx950-insts">; +def __builtin_amdgcn_ds_read_tr16_b64_v4f16 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr16_b64_v4bf16 : AMDGPUBuiltin<"_ExtVector<4, __bf16>(_ExtVector<4, __bf16> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ashr_pk_i8_i32 : AMDGPUBuiltin<"unsigned short(unsigned int, unsigned int, unsigned int)", [Const], "ashr-pk-insts">; @@ -699,11 +699,11 @@ def __builtin_amdgcn_s_buffer_prefetch_data : AMDGPUBuiltin<"void(__amdgpu_buffe def __builtin_amdgcn_global_load_tr_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_global_load_tr_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; +def __builtin_amdgcn_global_load_tr_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr_b64_i32 : AMDGPUBuiltin<"int(int address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_global_load_tr_b128_v4i16 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_global_load_tr_b128_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; +def __builtin_amdgcn_global_load_tr_b128_v4f16 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_global_load_tr_b128_v4bf16 : AMDGPUBuiltin<"_ExtVector<4, __bf16>(_ExtVector<4, __bf16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_ds_bpermute_fi_b32 : AMDGPUBuiltin<"int(int, int)", [Const], "gfx12-insts">; @@ -828,9 +828,9 @@ def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector let ArgNames = ["a_sign", "a", "b_sign", "b", "c", "clamp"]; } -def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; @@ -840,9 +840,9 @@ def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, _Float16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; @@ -947,13 +947,13 @@ def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, in def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_global_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_global_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr16_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<3> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_ds_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_ds_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr16_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_s_setprio_inc_wg : AMDGPUBuiltin<"void(_Constant short)", [], "setprio-inc-wg-inst">; @@ -964,7 +964,7 @@ def __builtin_amdgcn_s_wait_asynccnt : AMDGPUBuiltin<"void(_Constant unsigned sh def __builtin_amdgcn_s_wait_tensorcnt : AMDGPUBuiltin<"void(_Constant unsigned short)", [], "gfx1250-insts">; def __builtin_amdgcn_tanhf : AMDGPUBuiltin<"float(float)", [Const], "tanh-insts">; -def __builtin_amdgcn_tanhh : AMDGPUBuiltin<"_Float16(_Float16)", [Const], "tanh-insts">; +def __builtin_amdgcn_tanhh : AMDGPUBuiltin<"__fp16(__fp16)", [Const], "tanh-insts">; def __builtin_amdgcn_tanh_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; def __builtin_amdgcn_rcp_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; def __builtin_amdgcn_sqrt_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip b/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip deleted file mode 100644 index fc3bf9a87e282..0000000000000 --- a/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip +++ /dev/null @@ -1,88 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 -// REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s - -#define __device__ __attribute__((device)) - -typedef _Float16 v2h __attribute__((ext_vector_type(2))); - -// cvt_pkrtz: _ExtVector<2, _Float16>(float, float) -// CHECK-LABEL: define dso_local void @_Z14test_cvt_pkrtzPDv2_DF16_ff( -// CHECK-SAME: ptr noundef [[OUT:%.*]], float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store float [[B]], ptr [[B_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[TMP0]], float [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4 -// CHECK-NEXT: ret void -// -__device__ void test_cvt_pkrtz(v2h *out, float a, float b) { - *out = __builtin_amdgcn_cvt_pkrtz(a, b); -} - -// interp_p2_f16: _Float16(float, float, unsigned int, unsigned int, bool, unsigned int) -// attr_chan and attr must be compile-time constants -// CHECK-LABEL: define dso_local void @_Z18test_interp_p2_f16PDF16_ffj( -// CHECK-SAME: ptr noundef [[OUT:%.*]], float noundef [[P1:%.*]], float noundef [[J:%.*]], i32 noundef [[M0:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[P1_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: [[J_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: [[M0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[P1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P1_ADDR]] to ptr -// CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr -// CHECK-NEXT: [[M0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M0_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store float [[P1]], ptr [[P1_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store float [[J]], ptr [[J_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[M0]], ptr [[M0_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[P1_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[J_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[M0_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call contract half @llvm.amdgcn.interp.p2.f16(float [[TMP0]], float [[TMP1]], i32 2, i32 3, i1 false, i32 [[TMP2]]) -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store half [[TMP3]], ptr [[TMP4]], align 2 -// CHECK-NEXT: ret void -// -__device__ void test_interp_p2_f16(_Float16 *out, float p1, float j, unsigned int m0) { - *out = __builtin_amdgcn_interp_p2_f16(p1, j, 2, 3, false, m0); -} - -// fmed3h: _Float16(_Float16, _Float16, _Float16) - requires gfx9-insts -// CHECK-LABEL: define dso_local void @_Z11test_fmed3hPDF16_DF16_DF16_DF16_( -// CHECK-SAME: ptr noundef [[OUT:%.*]], half noundef [[A:%.*]], half noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) -// CHECK-NEXT: [[B_ADDR:%.*]] = alloca half, align 2, addrspace(5) -// CHECK-NEXT: [[C_ADDR:%.*]] = alloca half, align 2, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store half [[A]], ptr [[A_ADDR_ASCAST]], align 2 -// CHECK-NEXT: store half [[B]], ptr [[B_ADDR_ASCAST]], align 2 -// CHECK-NEXT: store half [[C]], ptr [[C_ADDR_ASCAST]], align 2 -// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[B_ADDR_ASCAST]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[C_ADDR_ASCAST]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = call contract half @llvm.amdgcn.fmed3.f16(half [[TMP0]], half [[TMP1]], half [[TMP2]]) -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store half [[TMP3]], ptr [[TMP4]], align 2 -// CHECK-NEXT: ret void -// -__device__ void test_fmed3h(_Float16 *out, _Float16 a, _Float16 b, _Float16 c) { - *out = __builtin_amdgcn_fmed3h(a, b, c); -} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip deleted file mode 100644 index a688869be9f38..0000000000000 --- a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip +++ /dev/null @@ -1,96 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 -// REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s - -#define __device__ __attribute__((device)) - -typedef _Float16 v8h __attribute__((ext_vector_type(8))); -typedef _Float16 v16h __attribute__((ext_vector_type(16))); -typedef float v8f __attribute__((ext_vector_type(8))); - -// global_load_tr_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *) -// Requires gfx12-insts,wavefrontsize32 -// CHECK-LABEL: define dso_local void @_Z30test_global_load_tr_b128_v8f16PDv8_DF16_PU3AS1S_( -// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 -// CHECK-NEXT: ret void -// -__device__ void test_global_load_tr_b128_v8f16(v8h *out, v8h __attribute__((address_space(1))) *inptr) { - *out = __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); -} - -// swmmac_f32_16x16x32_f16_w32: _ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>, int) -// Requires wmma-128b-insts,wavefrontsize32 -// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f32_16x16x32_f16_w32PDv8_fDv8_DF16_Dv16_DF16_S_i( -// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) -// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) -// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5) -// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr -// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32 -// CHECK-NEXT: store <8 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 32 -// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR_ASCAST]], align 32 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i32(<8 x half> [[TMP0]], <16 x half> [[TMP1]], <8 x float> [[TMP2]], i32 [[TMP3]]) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP5]], align 32 -// CHECK-NEXT: ret void -// -__device__ void test_swmmac_f32_16x16x32_f16_w32(v8f *out, v8h a, v16h b, v8f c, int index) { - *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); -} - -// swmmac_f16_16x16x32_f16_w32: _ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, int) -// Requires wmma-128b-insts,wavefrontsize32 -// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f16_16x16x32_f16_w32PDv8_DF16_S_Dv16_DF16_S_i( -// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) -// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) -// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) -// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr -// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i32(<8 x half> [[TMP0]], <16 x half> [[TMP1]], <8 x half> [[TMP2]], i32 [[TMP3]]) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x half> [[TMP4]], ptr [[TMP5]], align 16 -// CHECK-NEXT: ret void -// -__device__ void test_swmmac_f16_16x16x32_f16_w32(v8h *out, v8h a, v16h b, v8h c, int index) { - *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); -} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip deleted file mode 100644 index a18fdffe9920a..0000000000000 --- a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip +++ /dev/null @@ -1,96 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 -// REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s - -#define __device__ __attribute__((device)) - -typedef _Float16 v4h __attribute__((ext_vector_type(4))); -typedef _Float16 v8h __attribute__((ext_vector_type(8))); -typedef float v4f __attribute__((ext_vector_type(4))); - -// global_load_tr_b128_v4f16: _ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<1> *) -// Requires gfx12-insts,wavefrontsize64 -// CHECK-LABEL: define dso_local void @_Z30test_global_load_tr_b128_v4f16PDv4_DF16_PU3AS1S_( -// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call contract <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <4 x half> [[TMP1]], ptr [[TMP2]], align 8 -// CHECK-NEXT: ret void -// -__device__ void test_global_load_tr_b128_v4f16(v4h *out, v4h __attribute__((address_space(1))) *inptr) { - *out = __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); -} - -// swmmac_f32_16x16x32_f16_w64: _ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, float>, int) -// Requires wmma-128b-insts,wavefrontsize64 -// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f32_16x16x32_f16_w64PDv4_fDv4_DF16_Dv8_DF16_S_i( -// CHECK-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) -// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) -// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) -// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr -// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store <4 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i32(<4 x half> [[TMP0]], <8 x half> [[TMP1]], <4 x float> [[TMP2]], i32 [[TMP3]]) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP5]], align 16 -// CHECK-NEXT: ret void -// -__device__ void test_swmmac_f32_16x16x32_f16_w64(v4f *out, v4h a, v8h b, v4f c, int index) { - *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); -} - -// swmmac_f16_16x16x32_f16_w64: _ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, _Float16>, int) -// Requires wmma-128b-insts,wavefrontsize64 -// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f16_16x16x32_f16_w64PDv4_DF16_S_Dv8_DF16_S_i( -// CHECK-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) -// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) -// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) -// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr -// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[C_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i32(<4 x half> [[TMP0]], <8 x half> [[TMP1]], <4 x half> [[TMP2]], i32 [[TMP3]]) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <4 x half> [[TMP4]], ptr [[TMP5]], align 8 -// CHECK-NEXT: ret void -// -__device__ void test_swmmac_f16_16x16x32_f16_w64(v4h *out, v4h a, v8h b, v4h c, int index) { - *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); -} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip deleted file mode 100644 index c6f34f789d1ba..0000000000000 --- a/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip +++ /dev/null @@ -1,70 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 -// REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s - -#define __device__ __attribute__((device)) - -typedef _Float16 v8h __attribute__((ext_vector_type(8))); - -// global_load_tr16_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *) -// Requires gfx1250-insts,wavefrontsize32 -// CHECK-LABEL: define dso_local void @_Z32test_global_load_tr16_b128_v8f16PDv8_DF16_PU3AS1S_( -// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 -// CHECK-NEXT: ret void -// -__device__ void test_global_load_tr16_b128_v8f16(v8h *out, v8h __attribute__((address_space(1))) *inptr) { - *out = __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr); -} - -// ds_load_tr16_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<3> *) -// Requires gfx1250-insts,wavefrontsize32 -// CHECK-LABEL: define dso_local void @_Z28test_ds_load_tr16_b128_v8f16PDv8_DF16_PU3AS3S_( -// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(3) noundef [[INPTR:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(3) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[INPTR_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 -// CHECK-NEXT: ret void -// -__device__ void test_ds_load_tr16_b128_v8f16(v8h *out, v8h __attribute__((address_space(3))) *inptr) { - *out = __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr); -} - -// tanhh: _Float16(_Float16) -// Requires tanh-insts -// CHECK-LABEL: define dso_local void @_Z10test_tanhhPDF16_DF16_( -// CHECK-SAME: ptr noundef [[OUT:%.*]], half noundef [[A:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store half [[A]], ptr [[A_ADDR_ASCAST]], align 2 -// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = call contract half @llvm.amdgcn.tanh.f16(half [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store half [[TMP1]], ptr [[TMP2]], align 2 -// CHECK-NEXT: ret void -// -__device__ void test_tanhh(_Float16 *out, _Float16 a) { - *out = __builtin_amdgcn_tanhh(a); -} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip deleted file mode 100644 index 96e22d04ee42f..0000000000000 --- a/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip +++ /dev/null @@ -1,27 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 -// REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s - -#define __device__ __attribute__((device)) - -typedef _Float16 v4h __attribute__((ext_vector_type(4))); - -// ds_read_tr16_b64_v4f16: _ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<3> *) -// CHECK-LABEL: define dso_local void @_Z27test_ds_read_tr16_b64_v4f16PDv4_DF16_PU3AS3S_( -// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(3) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) -// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr -// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(3) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[INPTR_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = call contract <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store <4 x half> [[TMP1]], ptr [[TMP2]], align 8 -// CHECK-NEXT: ret void -// -__device__ void test_ds_read_tr16_b64_v4f16(v4h *out, v4h __attribute__((address_space(3))) *inptr) { - *out = __builtin_amdgcn_ds_read_tr16_b64_v4f16(inptr); -} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl index 467d4fe17f504..1e3a88a41f90e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl @@ -6,7 +6,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v3i __attribute__((ext_vector_type(3))); typedef int v4i __attribute__((ext_vector_type(4))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); // CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl index 0d9bcbfe335fa..af1f434403767 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl @@ -4,7 +4,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v3i __attribute__((ext_vector_type(3))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef _Float16 v4h __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); // GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl index 267b634414692..8242ae6a98c40 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -5,10 +5,10 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef _Float16 v4h __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr, diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl index 5533b6cfa7c8f..6f7a93ef897ac 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -4,7 +4,7 @@ // REQUIRES: amdgpu-registered-target typedef short v4s __attribute__((ext_vector_type(4))); -typedef _Float16 v4h __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl index 2f380bcf57d47..b7323f1b41c2a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -5,7 +5,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl index 012844a90512d..186fc4eacfaaf 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -4,7 +4,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl index 3d84d04f56eb8..b6627f1c8114d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 typedef short v4s __attribute__((ext_vector_type(4))); -typedef _Float16 v4h __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( diff --git a/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip b/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip index db156af516f46..b3238d7b29d3e 100644 --- a/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip +++ b/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip @@ -7,7 +7,7 @@ #define __device__ __attribute__((device)) -typedef __attribute__((__vector_size__(8 * sizeof(_Float16)))) _Float16 fp16x8_t; +typedef __attribute__((__vector_size__(8 * sizeof(__fp16)))) __fp16 fp16x8_t; // CHECK: ImplicitCastExpr {{.*}} <AddressSpaceConversion> // CHECK-NEXT: ImplicitCastExpr {{.*}} <LValueToRValue> _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
