https://github.com/pasaulais created https://github.com/llvm/llvm-project/pull/69229
On some systems and when accessing certain memory areas it is not safe to use atomic xor with AMDGPU. This may be the case when using fine-grained memory allocations (e.g. through `hipAllocManaged`) and when the atomic operation needs to go through the PCIe bus, which does not support atomic xor (PCIe 3.0 does support other atomic operations like fetch_and_add and cmpxchg) [1]. The issue has been worked around in DPC++/HIP by prefetching memory before executing kernels [2], however this adds an overhead that can outweigh the performance benefit of using atomic xor over a cmpxchg loop. This is why a way to switch between 'native' atomic xor and a cmpxchg-loop solution is needed. These changes add `-munsafe-int-atomics` and `-mno-unsafe-int-atomics` options to clang that can be used to switch between the two implementations. The default is the current behaviour of generative 'native' atomic xor instructions. When `-mno-unsafe-int-atomics` is passed to clang, functions are marked with the `amdgpu-unsafe-int-atomics` attribute (set to `false`) at the IR level and this tells the backend to expand atomic xor to a cmpxchg loop. The options only affect the global and flat address spaces. [1]: https://github.com/RadeonOpenCompute/ROCm/issues/2481 [2]: https://github.com/intel/llvm/pull/10430 >From 508e577fa67b590ad297e45aeb5210ec3190ebc9 Mon Sep 17 00:00:00 2001 From: Pierre-Andre Saulais <pierre-an...@codeplay.com> Date: Thu, 12 Oct 2023 16:00:20 +0100 Subject: [PATCH] [AMDGPU] Add an option to disable unsafe uses of atomic xor --- clang/include/clang/Basic/TargetInfo.h | 7 +++ clang/include/clang/Basic/TargetOptions.h | 3 + clang/include/clang/Driver/Options.td | 8 +++ clang/lib/Basic/TargetInfo.cpp | 1 + clang/lib/Basic/Targets/AMDGPU.cpp | 1 + clang/lib/CodeGen/Targets/AMDGPU.cpp | 3 + clang/lib/Driver/ToolChains/Clang.cpp | 2 + .../amdgpu-unsafe-int-atomics.cl | 55 +++++++++++++++++++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 15 +++++ llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 23 ++++++-- 10 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 9d56e97a3d4bb88..96416c4405115cd 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -253,6 +253,8 @@ class TargetInfo : public TransferrableTargetInfo, unsigned AllowAMDGPUUnsafeFPAtomics : 1; + unsigned AllowAMDGPUUnsafeIntAtomics : 1; + unsigned ARMCDECoprocMask : 8; unsigned MaxOpenCLWorkGroupSize; @@ -995,6 +997,11 @@ class TargetInfo : public TransferrableTargetInfo, /// allowed. bool allowAMDGPUUnsafeFPAtomics() const { return AllowAMDGPUUnsafeFPAtomics; } + /// Returns whether or not the AMDGPU unsafe integer atomics are allowed. + bool allowAMDGPUUnsafeIntAtomics() const { + return AllowAMDGPUUnsafeIntAtomics; + } + /// For ARM targets returns a mask defining which coprocessors are configured /// as Custom Datapath. uint32_t getARMCDECoprocMask() const { return ARMCDECoprocMask; } diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index ba3acd029587160..4feb6f2dcd86b83 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -78,6 +78,9 @@ class TargetOptions { /// \brief If enabled, allow AMDGPU unsafe floating point atomics. bool AllowAMDGPUUnsafeFPAtomics = false; + /// \brief If enabled, allow AMDGPU unsafe integer atomics. + bool AllowAMDGPUUnsafeIntAtomics = true; + /// \brief Enumeration value for AMDGPU code object version, which is the /// code object version times 100. enum CodeObjectVersionKind { diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a89d6b6579f1176..265f5c23e7858ee 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4705,6 +4705,14 @@ defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics", "for certain memory destinations. (AMDGPU only)">, NegFlag<SetFalse>>, Group<m_Group>; +defm unsafe_int_atomics : BoolOption<"m", "unsafe-int-atomics", + TargetOpts<"AllowAMDGPUUnsafeIntAtomics">, DefaultTrue, + PosFlag<SetTrue, [], [ClangOption, CC1Option], + "Enable generation of unsafe integer atomic instructions. May " + "generate more efficient code, but may give incorrect results " + "for certain memory destinations. (AMDGPU only)">, + NegFlag<SetFalse, [], [ClangOption, CC1Option]>>, Group<m_Group>; + def faltivec : Flag<["-"], "faltivec">, Group<f_Group>, Flags<[NoXarchOption]>; def fno_altivec : Flag<["-"], "fno-altivec">, Group<f_Group>, Flags<[NoXarchOption]>; diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 6cd5d618a4acaa5..06997cc54642623 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -157,6 +157,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { HasAArch64SVETypes = false; HasRISCVVTypes = false; AllowAMDGPUUnsafeFPAtomics = false; + AllowAMDGPUUnsafeIntAtomics = true; ARMCDECoprocMask = 0; // Default to no types using fpret. diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 409ae32ab424215..20ad8e5c59c2b66 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -232,6 +232,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple, HasFloat16 = true; WavefrontSize = GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32 ? 32 : 64; AllowAMDGPUUnsafeFPAtomics = Opts.AllowAMDGPUUnsafeFPAtomics; + AllowAMDGPUUnsafeIntAtomics = Opts.AllowAMDGPUUnsafeIntAtomics; // Set pointer width and alignment for the generic address space. PointerWidth = PointerAlign = getPointerWidthV(LangAS::Default); diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index f6a614b3e4d54dd..9be15d89d3f2f46 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -409,6 +409,9 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); + if (!M.getContext().getTargetInfo().allowAMDGPUUnsafeIntAtomics()) + F->addFnAttr("amdgpu-unsafe-int-atomics", "false"); + if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts) F->addFnAttr("amdgpu-ieee", "false"); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 94c184435ae14de..be3076cdbdc4a68 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7372,6 +7372,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.addOptInFlag(CmdArgs, options::OPT_munsafe_fp_atomics, options::OPT_mno_unsafe_fp_atomics); + Args.addOptOutFlag(CmdArgs, options::OPT_munsafe_int_atomics, + options::OPT_mno_unsafe_int_atomics); Args.addOptOutFlag(CmdArgs, options::OPT_mamdgpu_ieee, options::OPT_mno_amdgpu_ieee); } diff --git a/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl b/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl new file mode 100644 index 000000000000000..02dfddd5791b405 --- /dev/null +++ b/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl @@ -0,0 +1,55 @@ +// REQUIRES: amdgpu-registered-target +// +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \ +// RUN: -emit-llvm \ +// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-DEFAULT %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \ +// RUN: -emit-llvm -munsafe-int-atomics \ +// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-ON %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \ +// RUN: -emit-llvm -mno-unsafe-int-atomics \ +// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-OFF %s + +// Check AMDGCN ISA generation. + +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \ +// RUN: -munsafe-int-atomics \ +// RUN: | FileCheck -check-prefixes=COMMON-ISA,ISA-ON %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \ +// RUN: -mno-unsafe-int-atomics \ +// RUN: | FileCheck -check-prefixes=COMMON-ISA,ISA-OFF %s + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable + +typedef enum memory_order { + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif +} memory_scope; + +// COMMON-ISA: kern: +// ISA-ON: flat_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc +// ISA-OFF: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc +kernel void kern(global atomic_int *x, int y, global int *z) { + *z = __opencl_atomic_fetch_xor(x, y, memory_order_seq_cst, memory_scope_work_group); +} + +// COMMON: define{{.*}} amdgpu_kernel void @kern{{.*}} [[ATTRS1:#[0-9]+]] +// COMMON: atomicrmw xor ptr addrspace(1) %x, i32 %y syncscope("workgroup") seq_cst, align 4 +// +// UNSAFE-INT-DEFAULT-NOT: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics" +// UNSAFE-INT-ON-NOT: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics" +// UNSAFE-INT-OFF: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics"="false" diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9c5b166c9652238..c379cf4dd64642f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14975,6 +14975,14 @@ bool unsafeFPAtomicsDisabled(Function *F) { "true"; } +// The amdgpu-unsafe-int-atomics attribute disables generation of unsafe +// integer atomic instructions. May generate more efficient code, but may +// give incorrect results for certain memory destinations. +bool unsafeIntAtomicsDisabled(Function *F) { + return F->getFnAttribute("amdgpu-unsafe-int-atomics").getValueAsString() == + "false"; +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -15097,6 +15105,13 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { } break; } + case AtomicRMWInst::Xor: { + if (AMDGPU::isFlatGlobalAddrSpace(AS) && + unsafeIntAtomicsDisabled(RMW->getFunction())) { + return AtomicExpansionKind::CmpXChg; + } + break; + } default: break; } diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 92b70ad797b88ae..6cc97019d3ff5ce 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1100 %s define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX908-LABEL: syncscope_system: @@ -391,4 +391,19 @@ define float @no_unsafe(ptr %addr, float %val) { ret float %res } +define i32 @default_xor(ptr %addr, i32 %val) { + ; GCN-LABEL: default_xor: + ; GCN: flat_atomic_xor + %res = atomicrmw xor ptr %addr, i32 %val seq_cst + ret i32 %res +} + +define i32 @no_unsafe_xor(ptr %addr, i32 %val) #1 { + ; GCN-LABEL: no_unsafe_xor: + ; GCN: flat_atomic_cmpswap + %res = atomicrmw xor ptr %addr, i32 %val seq_cst + ret i32 %res +} + attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } +attributes #1 = { "amdgpu-unsafe-int-atomics"="false" } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits