[PATCH] D72841: Add support for pragma float_control, to control precision and exception behavior at the source level
cfang added a comment. -ffast-math flag got lost in the Builder after this change. FMF.isFast() is true before updateFastMathFlags(FMF, FPFeatures), but turns false after. It seems the Builder.FMF has been correctly set before, but I am not clear what FPFeatures should be at this point: +static void setBuilderFlagsFromFPFeatures(CGBuilderTy &Builder, + CodeGenFunction &CGF, + FPOptions FPFeatures) { + auto NewRoundingBehavior = FPFeatures.getRoundingMode(); + Builder.setDefaultConstrainedRounding(NewRoundingBehavior); + auto NewExceptionBehavior = + ToConstrainedExceptMD(FPFeatures.getExceptionMode()); + Builder.setDefaultConstrainedExcept(NewExceptionBehavior); + auto FMF = Builder.getFastMathFlags(); + updateFastMathFlags(FMF, FPFeatures); + Builder.setFastMathFlags(FMF); + assert((CGF.CurFuncDecl == nullptr || Builder.getIsFPConstrained() || + isa<CXXConstructorDecl>(CGF.CurFuncDecl) || + isa<CXXDestructorDecl>(CGF.CurFuncDecl) || + (NewExceptionBehavior == llvm::fp::ebIgnore && + NewRoundingBehavior == llvm::RoundingMode::NearestTiesToEven)) && + "FPConstrained should be enabled on entire function"); +} Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D72841/new/ https://reviews.llvm.org/D72841 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D120265: AMDGPU: Use the implicit kernargs for code object version 5
This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. Closed by commit rGdd5895cc3986: AMDGPU: Use the implicit kernargs for code object version 5 (authored by cfang). Herald added a project: clang. Herald added a subscriber: cfe-commits. Changed prior to commit: https://reviews.llvm.org/D120265?vs=416080&id=416315#toc Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D120265/new/ https://reviews.llvm.org/D120265 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp llvm/lib/Target/AMDGPU/SIDefines.h llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIISelLowering.h llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll Index: llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll === --- /dev/null +++ llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -0,0 +1,550 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=GFX8V3 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=GFX8V4 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=GFX8V5 %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefixes=GFX9V3 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=4 < %s | FileCheck 
--check-prefixes=GFX9V4 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=GFX9V5 %s + +define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) { +; GFX8V3-LABEL: addrspacecast: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT:s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT:s_load_dword s2, s[4:5], 0x44 +; GFX8V3-NEXT:s_load_dword s3, s[4:5], 0x40 +; GFX8V3-NEXT:v_mov_b32_e32 v4, 1 +; GFX8V3-NEXT:s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT:s_cmp_lg_u32 s0, -1 +; GFX8V3-NEXT:v_mov_b32_e32 v0, s2 +; GFX8V3-NEXT:s_cselect_b64 vcc, -1, 0 +; GFX8V3-NEXT:v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V3-NEXT:v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT:s_cmp_lg_u32 s1, -1 +; GFX8V3-NEXT:v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V3-NEXT:v_mov_b32_e32 v2, s3 +; GFX8V3-NEXT:s_cselect_b64 vcc, -1, 0 +; GFX8V3-NEXT:v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V3-NEXT:v_mov_b32_e32 v2, s1 +; GFX8V3-NEXT:v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V3-NEXT:flat_store_dword v[0:1], v4 +; GFX8V3-NEXT:s_waitcnt vmcnt(0) +; GFX8V3-NEXT:v_mov_b32_e32 v0, 2 +; GFX8V3-NEXT:flat_store_dword v[2:3], v0 +; GFX8V3-NEXT:s_waitcnt vmcnt(0) +; GFX8V3-NEXT:s_endpgm +; +; GFX8V4-LABEL: addrspacecast: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT:s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT:s_load_dword s2, s[4:5], 0x44 +; GFX8V4-NEXT:s_load_dword s3, s[4:5], 0x40 +; GFX8V4-NEXT:v_mov_b32_e32 v4, 1 +; GFX8V4-NEXT:s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT:s_cmp_lg_u32 s0, -1 +; GFX8V4-NEXT:v_mov_b32_e32 v0, s2 +; GFX8V4-NEXT:s_cselect_b64 vcc, -1, 0 +; GFX8V4-NEXT:v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V4-NEXT:v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT:s_cmp_lg_u32 s1, -1 +; GFX8V4-NEXT:v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V4-NEXT:v_mov_b32_e32 v2, s3 +; GFX8V4-NEXT:s_cselect_b64 vcc, -1, 0 +; GFX8V4-NEXT:v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V4-NEXT:v_mov_b32_e32 v2, s1 +; GFX8V4-NEXT:v_cndmask_b32_e32 v2, 0, v2, vcc +; 
GFX8V4-NEXT:flat_store_dword v[0:1], v4 +; GFX8V4-NEXT:s_waitcnt vmcnt(0) +; GFX8V4-NEXT:v_mov_b32_e32 v0, 2 +; GFX8V4-NEXT:flat_store_dword v[2:3], v0 +; GFX8V4-NEXT:s_waitcnt vmcnt(0) +; GFX8V4-NEXT:s_endpgm +; +; GFX8V5-LABEL: addrspacecast: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT:s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT:s_load_dword s2, s[4:5], 0xc8 +; GFX8V5-NEXT:s_load_dword s3, s[4:5], 0xcc +; GFX8V5-NEXT:v_mov_b32_e32 v4, 1 +; GFX8V5-NEXT:s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT:s_cmp_lg_u32 s0, -1 +; GFX8V5-NEXT:v_mov_b32_e32 v0, s2 +; GFX8V5-NEXT:s_cselect_b64 vcc, -1, 0 +; GFX8V5-NEXT:v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V5-NEXT:v_mov_b32_e32 v0, s0 +; GF
[PATCH] D134355: [AMDGPU] Emit module flag for all code object versions
cfang accepted this revision. cfang added a comment. This revision is now accepted and ready to land. LGTM. Should the module flag name be amdgpu_code_object_version or amdhsa_code_object_version? CHANGES SINCE LAST ACTION https://reviews.llvm.org/D134355/new/ https://reviews.llvm.org/D134355 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D155986: [clang][AMDGPU]: Don't use byval for struct arguments in function ABI
This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. Closed by commit rGd77c62053c94: [clang][AMDGPU]: Don't use byval for struct arguments in function ABI (authored by cfang). Herald added a project: clang. Herald added a subscriber: cfe-commits. Changed prior to commit: https://reviews.llvm.org/D155986?vs=549545&id=549546#toc Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D155986/new/ https://reviews.llvm.org/D155986 Files: clang/docs/ReleaseNotes.rst clang/lib/CodeGen/CGCall.cpp clang/lib/CodeGen/Targets/AMDGPU.cpp clang/test/CodeGenCUDA/kernel-args.cu clang/test/CodeGenCXX/amdgcn-func-arg.cpp clang/test/CodeGenOpenCL/addr-space-struct-arg.cl clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl clang/test/CodeGenOpenCL/byval.cl llvm/docs/AMDGPUUsage.rst Index: llvm/docs/AMDGPUUsage.rst === --- llvm/docs/AMDGPUUsage.rst +++ llvm/docs/AMDGPUUsage.rst @@ -13812,6 +13812,10 @@ 9. All other registers are unspecified. 10. Any necessary ``s_waitcnt`` has been performed to ensure memory is available to the function. +11: Use pass-by-reference (byref) in stead of pass-by-value (byval) for struct +arguments in C ABI. Callee is responsible for allocating stack memory and +copying the value of the struct if modified. Note that the backend still +supports byval for struct arguments. 
On exit from a function: Index: clang/test/CodeGenOpenCL/byval.cl === --- clang/test/CodeGenOpenCL/byval.cl +++ clang/test/CodeGenOpenCL/byval.cl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn %s | FileCheck %s - +// RUN: %clang_cc1 -emit-llvm -o - -triple i686-pc-darwin %s | FileCheck -check-prefix=X86 %s +// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn %s | FileCheck -check-prefix=AMDGCN %s struct A { int x[100]; }; @@ -8,8 +8,10 @@ int g() { struct A a; - // CHECK: call i32 @f(ptr addrspace(5) noundef byval{{.*}}%a) + // X86:call i32 @f(ptr noundef nonnull byval(%struct.A) align 4 %a) + // AMDGCN: call i32 @f(ptr addrspace(5) noundef byref{{.*}}%a) return f(a); } -// CHECK: declare i32 @f(ptr addrspace(5) noundef byval{{.*}}) +// X86: declare i32 @f(ptr noundef byval(%struct.A) align 4) +// AMDGCN: declare i32 @f(ptr addrspace(5) noundef byref{{.*}}) Index: clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl === --- clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -448,11 +448,11 @@ // CHECK: define{{.*}} void @func_reg_state_lo(<4 x i32> noundef %arg0, <4 x i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 %s.coerce0, float %s.coerce1, i32 %s.coerce2) void func_reg_state_lo(int4 arg0, int4 arg1, int4 arg2, int arg3, struct_arg_t s) { } -// CHECK: define{{.*}} void @func_reg_state_hi(<4 x i32> noundef %arg0, <4 x i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 noundef %arg4, ptr addrspace(5) nocapture noundef readnone byval(%struct.struct_arg) align 4 %s) +// CHECK: define{{.*}} void @func_reg_state_hi(<4 x i32> noundef %arg0, <4 x i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 noundef %arg4, ptr addrspace(5) nocapture noundef readnone byref(%struct.struct_arg) align 4 %{{.*}}) void func_reg_state_hi(int4 arg0, int4 arg1, int4 arg2, int arg3, int arg4, struct_arg_t s) { } // XXX - Why don't the inner structs flatten? 
-// CHECK: define{{.*}} void @func_reg_state_num_regs_nested_struct(<4 x i32> noundef %arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.nested %arg2.coerce1, i32 %arg3.coerce0, %struct.nested %arg3.coerce1, ptr addrspace(5) nocapture noundef readnone byval(%struct.num_regs_nested_struct) align 8 %arg4) +// CHECK: define{{.*}} void @func_reg_state_num_regs_nested_struct(<4 x i32> noundef %arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.nested %arg2.coerce1, i32 %arg3.coerce0, %struct.nested %arg3.coerce1, ptr addrspace(5) nocapture noundef readnone byref(%struct.num_regs_nested_struct) align 8 %{{.*}}) void func_reg_state_num_regs_nested_struct(int4 arg0, int arg1, num_regs_nested_struct arg2, num_regs_nested_struct arg3, num_regs_nested_struct arg4) { } // CHECK: define{{.*}} void @func_double_nested_struct_arg(<4 x i32> noundef %arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.double_nested %arg2.coerce1, i16 %arg2.coerce2) @@ -467,7 +467,7 @@ // CHECK: define{{.*}} void @func_large_struct_padding_arg_direct(i8 %arg.coerce0, i32 %arg.coerce1, i8 %arg.coerce2, i32 %arg.coerce3, i8 %arg.coerce4, i8 %arg.coerce5, i16 %arg.coerce6, i16 %arg.coerce7, [3 x i8] %arg.coerce8, i64 %arg.coerce9, i32 %arg.coerce10, i8 %arg.co
[PATCH] D146023: [AMDGPU] Remove Code Object V2
cfang added a comment. Comment at: llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp:5598 return ParseDirectiveHSAMetadata(); } else { -if (IDVal == ".hsa_code_object_version") Are you sure Non-HSA does not have the four directives you deleted? Comment at: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:122 std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) { if (STI && STI->getTargetTriple().getOS() != Triple::AMDHSA) return std::nullopt; It is fine now. But I think STI could never be null. Comment at: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h:46 enum { - AMDHSA_COV2 = 2, AMDHSA_COV3 = 3, Should we keep this field, and just mention "unsupported"? Comment at: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h:59 /// false otherwise. bool isHsaAbiVersion3(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 4, Are all these "isHsaAbiVersionX" no longer needed? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D146023/new/ https://reviews.llvm.org/D146023 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits