llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-codegen
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-clang

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>

Note: please only review the last commit.

Allows Vulkan front-ends to properly implement the Vulkan memory model.

---

Patch is 548.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78573.diff

47 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1)
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+27)
- (modified) clang/lib/CodeGen/CodeGenFunction.h (+2)
- (modified) clang/lib/Sema/SemaChecking.cpp (+6-1)
- (added) clang/test/CodeGenCXX/builtin-amdgcn-fence-opencl.cpp (+108)
- (added) llvm/docs/MemoryModelRelaxationAnnotations.rst (+476)
- (modified) llvm/docs/Reference.rst (+4)
- (modified) llvm/include/llvm/Analysis/VectorUtils.h (+1-1)
- (modified) llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h (+9)
- (modified) llvm/include/llvm/CodeGen/MachineFunction.h (+2-1)
- (modified) llvm/include/llvm/CodeGen/MachineInstr.h (+37-10)
- (modified) llvm/include/llvm/CodeGen/MachineInstrBuilder.h (+32-14)
- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+12)
- (modified) llvm/include/llvm/IR/FixedMetadataKinds.def (+1)
- (added) llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h (+101)
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+10-1)
- (modified) llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp (+1)
- (modified) llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp (+3-1)
- (modified) llvm/lib/CodeGen/MIRPrinter.cpp (+7)
- (modified) llvm/lib/CodeGen/MachineFunction.cpp (+2-2)
- (modified) llvm/lib/CodeGen/MachineInstr.cpp (+37-12)
- (modified) llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp (+9)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+1-1)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+11-6)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp (+7)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp (+2)
- (modified) llvm/lib/IR/CMakeLists.txt (+1)
- (modified) llvm/lib/IR/Instruction.cpp (+13-1)
- (added) llvm/lib/IR/MemoryModelRelaxationAnnotations.cpp (+201)
- (modified) llvm/lib/IR/Verifier.cpp (+27)
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+242-72)
- (modified) llvm/lib/Transforms/Scalar/GVNHoist.cpp (+2-1)
- (modified) llvm/lib/Transforms/Utils/FunctionComparator.cpp (+5)
- (modified) llvm/lib/Transforms/Utils/Local.cpp (+14-1)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll (+49)
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll (+1428)
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll (+1188)
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-private.ll (+1188)
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-default.ll (+2550)
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-nonprivate.ll (+4599)
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-private.ll (+414)
- (added) llvm/test/CodeGen/AMDGPU/mmra.ll (+254)
- (added) llvm/test/Verifier/mmra-allowed.ll (+33)
- (added) llvm/test/Verifier/mmra.ll (+37)
- (modified) llvm/unittests/CodeGen/MachineInstrTest.cpp (+51)
- (modified) llvm/unittests/IR/CMakeLists.txt (+1)
- (added) llvm/unittests/IR/MemoryModelRelaxationAnnotationsTest.cpp (+34)

``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..b9cd21bbfe514c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -68,6 +68,7 @@ BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
 BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
 BUILTIN(__builtin_amdgcn_fence, "vUicC*", "n")
+BUILTIN(__builtin_amdgcn_fence_opencl, "vUiUicC*", "n")
 BUILTIN(__builtin_amdgcn_groupstaticsize, "Ui", "n")
 BUILTIN(__builtin_amdgcn_atomic_inc32, "UZiUZiD*UZiUicC*", "n")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index de48b15645b1ab..050539b3816e54 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -55,6 +55,7 @@
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
+#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -17859,6 +17860,25 @@ llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
   return Arg;
 }
 
+void CodeGenFunction::AddAMDGCNAddressSpaceMMRA(llvm::Instruction *Inst,
+                                                llvm::Value *ASMask) {
+  constexpr const char *Tag = "opencl-fence-mem";
+
+  uint64_t Mask = cast<llvm::ConstantInt>(ASMask)->getZExtValue();
+  if (Mask == 0)
+    return;
+
+  // 3 bits can be set: local, global, image in that order.
+  MMRAMetadata MMRAs;
+  if (Mask & (1 << 0))
+    MMRAs.addTag(Tag, "local");
+  if (Mask & (1 << 1))
+    MMRAs.addTag(Tag, "global");
+  if (Mask & (1 << 2))
+    MMRAs.addTag(Tag, "image");
+  Inst->setMetadata(LLVMContext::MD_MMRA, MMRAs.getAsMD(getLLVMContext()));
+}
+
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -18348,6 +18368,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                             EmitScalarExpr(E->getArg(1)), AO, SSID);
     return Builder.CreateFence(AO, SSID);
   }
+  case AMDGPU::BI__builtin_amdgcn_fence_opencl: {
+    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(1)),
+                            EmitScalarExpr(E->getArg(2)), AO, SSID);
+    FenceInst *Fence = Builder.CreateFence(AO, SSID);
+    AddAMDGCNAddressSpaceMMRA(Fence, EmitScalarExpr(E->getArg(0)));
+    return Fence;
+  }
   case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 143ad64e8816b1..93a11c379e82c9 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4401,6 +4401,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitHexagonBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitRISCVBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
                                     ReturnValueSlot ReturnValue);
+
+  void AddAMDGCNAddressSpaceMMRA(llvm::Instruction *Inst, llvm::Value *ASMask);
   void ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope,
                                llvm::AtomicOrdering &AO,
                                llvm::SyncScope::ID &SSID);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index ace3e386988f00..0a98f5504ff4f6 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5098,6 +5098,10 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
     OrderIndex = 0;
     ScopeIndex = 1;
     break;
+  case AMDGPU::BI__builtin_amdgcn_fence_opencl:
+    OrderIndex = 1;
+    ScopeIndex = 2;
+    break;
   default:
     return false;
   }
@@ -5120,7 +5124,8 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   switch (static_cast<llvm::AtomicOrderingCABI>(Ord)) {
   case llvm::AtomicOrderingCABI::relaxed:
   case llvm::AtomicOrderingCABI::consume:
-    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_fence)
+    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_fence ||
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_fence_opencl)
       return Diag(ArgExpr->getBeginLoc(),
                   diag::warn_atomic_op_has_invalid_memory_order)
              << 0 << ArgExpr->getSourceRange();
diff --git a/clang/test/CodeGenCXX/builtin-amdgcn-fence-opencl.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-fence-opencl.cpp
new file mode 100644
index 00000000000000..b4ac7ca8c178d9
--- /dev/null
+++ b/clang/test/CodeGenCXX/builtin-amdgcn-fence-opencl.cpp
@@ -0,0 +1,108 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 3
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \
+// RUN:   -triple=amdgcn-amd-amdhsa -Qn -mcode-object-version=none | FileCheck %s
+
+#define LOCAL_MASK (1 << 0)
+#define GLOBAL_MASK (1 << 1)
+#define IMAGE_MASK (1 << 2)
+
+//.
+// CHECK: @.str = private unnamed_addr addrspace(4) constant [10 x i8] c"workgroup\00", align 1
+// CHECK: @.str.1 = private unnamed_addr addrspace(4) constant [6 x i8] c"agent\00", align 1
+// CHECK: @.str.2 = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
+//.
+// CHECK-LABEL: define dso_local void @_Z10test_localv(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META1:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("agent") acquire, !mmra [[META1]]
+// CHECK-NEXT:    fence seq_cst, !mmra [[META1]]
+// CHECK-NEXT:    fence syncscope("agent") acq_rel, !mmra [[META1]]
+// CHECK-NEXT:    fence syncscope("workgroup") release, !mmra [[META1]]
+// CHECK-NEXT:    ret void
+//
+void test_local() {
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK, __ATOMIC_ACQUIRE, "agent");
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK, __ATOMIC_SEQ_CST, "");
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK, 4, "agent");
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK, 3, "workgroup");
+}
+
+// CHECK-LABEL: define dso_local void @_Z11test_globalv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META2:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("agent") acquire, !mmra [[META2]]
+// CHECK-NEXT:    fence seq_cst, !mmra [[META2]]
+// CHECK-NEXT:    fence syncscope("agent") acq_rel, !mmra [[META2]]
+// CHECK-NEXT:    fence syncscope("workgroup") release, !mmra [[META2]]
+// CHECK-NEXT:    ret void
+//
+void test_global() {
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, __ATOMIC_ACQUIRE, "agent");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, __ATOMIC_SEQ_CST, "");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, 4, "agent");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, 3, "workgroup");
+}
+
+// CHECK-LABEL: define dso_local void @_Z10test_imagev(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("agent") acquire, !mmra [[META3]]
+// CHECK-NEXT:    fence seq_cst, !mmra [[META2]]
+// CHECK-NEXT:    fence syncscope("agent") acq_rel, !mmra [[META3]]
+// CHECK-NEXT:    fence syncscope("workgroup") release, !mmra [[META3]]
+// CHECK-NEXT:    ret void
+//
+void test_image() {
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK, __ATOMIC_SEQ_CST, "workgroup");
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK, __ATOMIC_ACQUIRE, "agent");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, __ATOMIC_SEQ_CST, "");
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK, 4, "agent");
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK, 3, "workgroup");
+}
+
+// CHECK-LABEL: define dso_local void @_Z10test_mixedv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META5]]
+// CHECK-NEXT:    ret void
+//
+void test_mixed() {
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK | GLOBAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK | GLOBAL_MASK | LOCAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
+
+  __builtin_amdgcn_fence_opencl(0xFF, __ATOMIC_SEQ_CST, "workgroup");
+}
+//.
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1]] = !{!"opencl-fence-mem", !"local"}
+// CHECK: [[META2]] = !{!"opencl-fence-mem", !"global"}
+// CHECK: [[META3]] = !{!"opencl-fence-mem", !"image"}
+// CHECK: [[META4]] = !{[[META3]], [[META2]]}
+// CHECK: [[META5]] = !{[[META3]], [[META2]], [[META1]]}
+//.
diff --git a/llvm/docs/MemoryModelRelaxationAnnotations.rst b/llvm/docs/MemoryModelRelaxationAnnotations.rst
new file mode 100644
index 00000000000000..e83e365a3a57de
--- /dev/null
+++ b/llvm/docs/MemoryModelRelaxationAnnotations.rst
@@ -0,0 +1,476 @@
+===================================
+Memory Model Relaxation Annotations
+===================================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+Memory Model Relaxation Annotations (MMRAs) are target-defined properties
+on instructions that can be used to selectively relax constraints placed
+by the memory model. For example:
+
+* The use of ``VulkanMemoryModel`` in a SPIRV program allows certain
+  memory operations to be reordered across ``acquire`` or ``release``
+  operations.
+* OpenCL APIs expose primitives to fence only a specific set of address
+  spaces; carrying that information to the backend can enable the
+  use of faster synchronization instructions, rather than fencing all
+  address spaces.
+
+MMRAs offer an opt-in system for targets to relax the default LLVM
+memory model. As such, they are attached to an operation using LLVM
+metadata, which can always be dropped without affecting correctness.
+
+Definitions
+===========
+
+memory operation
+  A load, a store, an atomic, or a function call that is marked as
+  accessing memory.
+
+synchronizing operation
+  An instruction that synchronizes memory with other threads (e.g.
+  an atomic or a fence).
+
+tag
+  Metadata attached to a memory or synchronizing operation
+  that represents some target-defined property regarding memory
+  synchronization.
+
+  An operation may have multiple tags that each represent a different
+  property.
+
+  A tag is composed of a pair of metadata nodes:
+
+  * a *prefix* string.
+  * a *suffix* integer or string.
+
+  In LLVM IR, the pair is represented using a metadata tuple.
+  In other cases (comments, documentation, etc.), we may use the
+  ``prefix:suffix`` notation.
+
+  For example:
+
+  .. code-block::
+     :caption: Example: Tags in Metadata
+
+     !0 = !{!"scope", !"workgroup"}  # scope:workgroup
+     !1 = !{!"scope", !"device"}     # scope:device
+     !2 = !{!"scope", !"system"}     # scope:system
+     !3 = !{!"sync-as", i32 2}       # sync-as:2
+     !4 = !{!"sync-as", i32 1}       # sync-as:1
+     !5 = !{!"sync-as", i32 0}       # sync-as:0
+
+  .. note::
+
+     The only semantics relevant to the optimizer are the
+     "compatibility" relation defined below. All other
+     semantics are target-defined.
+
+  Tags can also be organised in lists to allow operations
+  to specify all of the tags they belong to. Such a list
+  is referred to as a "set of tags".
+
+  .. code-block::
+     :caption: Example: Set of Tags in Metadata
+
+     !0 = !{!"scope", !"workgroup"}
+     !1 = !{!"sync-as", !"private"}
+     !2 = !{!0, !1}
+
+  .. note::
+
+     If an operation does not have MMRA metadata, it is treated as if
+     it has an empty list (``!{}``) of tags.
+
+  Note that it is not an error if a tag is not recognized by the
+  instruction it is applied to, or by the current target.
+  Such tags are simply ignored.
+
+  Both synchronizing operations and memory operations can have
+  zero or more tags attached to them using the ``!mmra`` syntax.
+
+  For the sake of readability in the examples below,
+  we use a (non-functional) short syntax to represent MMRA metadata:
+
+  .. code-block::
+     :caption: Short Syntax Example
+
+     store %ptr1 # foo:bar
+     store %ptr1 !mmra !{!"foo", !"bar"}
+
+  These two notations are used interchangeably in this document and are
+  strictly equivalent. However, only the second version is functional.
+
+compatibility
+  Two sets of tags are said to be *compatible* iff, for every unique
+  tag prefix P present in at least one set:
+
+  - the other set contains no tag with prefix P, or
+  - at least one tag with prefix P is common to both sets.
+
+  The above definition implies that an empty set is always compatible
+  with any other set. This is an important property as it ensures that
+  if a transform drops the metadata on an operation, it can never affect
+  correctness. In other words, the memory model cannot be relaxed further
+  by deleting metadata from instructions.
+
+.. _HappensBefore:
+
+The *happens-before* Relation
+=============================
+
+Compatibility checks can be used to opt out of the *happens-before* relation
+established between two instructions.
+
+Ordering
+  When two instructions' metadata are not compatible, any program order
+  between them is not in *happens-before*.
+
+  For example, consider two tags ``foo:bar`` and
+  ``foo:baz`` exposed by a target:
+
+  .. code-block::
+
+     A: store %ptr1                 # foo:bar
+     B: store %ptr2                 # foo:baz
+     X: store atomic release %ptr3  # foo:bar
+
+  In the above example, ``A`` is compatible with ``X``, and hence ``A``
+  happens-before ``X``. But ``B`` is not compatible with ``X``, and
+  hence it does not happen-before ``X``.
+
+Synchronization
+  If a synchronizing operation has one or more tags, then whether it
+  participates in the ``seq_cst`` order with other operations is target
+  dependent.
+
+  .. code-block::
+
+     ; Depending on the semantics of foo:bar & foo:bux, this may not
+     ; synchronize with another sequence.
+     fence release      # foo:bar
+     store atomic %ptr1 # foo:bux
+
+Examples
+--------
+
+.. code-block:: text
+   :caption: Example 1
+
+   A: store ptr addrspace(1) %ptr2                # sync-as:1 vulkan:nonprivate
+   B: store atomic release ptr addrspace(1) %ptr3 # sync-as:0 vulkan:nonprivate
+
+A and B are not ordered relative to each other
+(no *happens-before*) because their sets of tags are not compatible.
+
+Note that the ``sync-as`` value does not have to match the ``addrspace`` value,
+e.g. in Example 1, the store-release to a location in ``addrspace(1)`` wants to
+synchronize only with operations happening in ``addrspace(0)``.
+
+.. code-block:: text
+   :caption: Example 2
+
+   A: store ptr addrspace(1) %ptr2                # sync-as:1 vulkan:nonprivate
+   B: store atomic release ptr addrspace(1) %ptr3 # sync-as:1 vulkan:nonprivate
+
+The ordering of A and B is unaffected because their sets of tags are
+compatible.
+
+Note that A and B may or may not be in *happens-before* due to other reasons.
+
+.. code-block:: text
+   :caption: Example 3
+
+   A: store ptr addrspace(1) %ptr2                # sync-as:1 vulkan:nonprivate
+   B: store atomic release ptr addrspace(1) %ptr3 # vulkan:nonprivate
+
+The ordering of A and B is unaffected because their sets of tags are
+compatible.
+
+.. code-block:: text
+   :caption: Example 4
+
+   A: store ptr addrspace(1) %ptr2                # sync-as:1
+   B: store atomic release ptr addrspace(1) %ptr3 # sync-as:2
+
+A and B do not have to be ordered relative to each other
+(no *happens-before*) because their sets of tags are not compatible.
+
+Use-cases
+=========
+
+SPIRV ``NonPrivatePointer``
+---------------------------
+
+MMRAs can support the SPIRV capability
+``VulkanMemoryModel``, where synchronizing operations only affect
+memory operations that specify ``NonPrivatePointer`` semantics.
+
+The example below is generated from a SPIRV program using the
+following recipe:
+
+- Add ``vulkan:nonprivate`` to every synchronizing operation.
+- Add ``vulkan:nonprivate`` to every non-atomic memory operation
+  that is marked ``NonPrivatePointer``.
+- Add ``vulkan:private`` to every non-atomic memory operation
+  that is not marked ``NonPrivatePointer``.
+
+.. code-block::
+
+   Thread T1:
+    A: store %ptr1                 # vulkan:nonprivate
+    B: store %ptr2                 # vulkan:private
+    X: store atomic release %ptr3  # vulkan:nonprivate
+
+   Thread T2:
+    Y: load atomic acquire %ptr3   # vulkan:nonprivate
+    C: load %ptr2                  # vulkan:private
+    D: load %ptr1                  # vulkan:nonprivate
+
+Compatibility ensures that operation ``A`` is ordered
+relative to ``X`` while operation ``D`` is ordered relative to ``Y``.
+If ``X`` synchronizes with ``Y``, then ``A`` happens-before ``D``.
+No such relation can be inferred about operations ``B`` and ``C``.
+
+.. note::
+   The `Vulkan Memory Model <https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#memory-model-non-private>`_
+   considers all atomic operations non-private.
+
+   Whether ``vulkan:nonprivate`` is specified on atomic operations is
+   an implementation detail, as an atomic operation is always ``nonprivate``.
+   The implementation may choose to be explicit and emit IR with
+   ``vulkan:nonprivate`` on every atomic operation, or it could choose to
+   only emit ``vulkan:private`` and assume ``vulkan:nonprivate``
+   by default.
+
+Operations marked with ``vulkan:private`` effectively opt out of the
+happens-before order in a SPIRV program since they are incompatible
+with every synchronizing operation.
+Note that SPIRV operations that
+are not marked ``NonPrivatePointer`` are not entirely private to the
+thread --- they are implicitly synchronized at the start or end of a
+thread by the Vulkan *system-synchronizes-with* relationship. This
+example assumes that the target-defined semantics of
+``vulkan:private`` correctly implement this property.
+
+This scheme is general enough to express the interoperability of SPIRV
+programs with other environments.
+
+.. code-block::
+
+   Thread T1:
+    A: store %ptr1                 # vulkan:nonprivate
+    X: store atomic release %ptr2  # vulkan:nonprivate
+
+   Thread T2:
+    Y: load atomic acquire %ptr2   # foo:bar
+    B: load %ptr1
+
+In the above example, thread ``T1`` originates from a SPIRV program
+while thread ``T2`` originates from a non-SPIRV program. Whether ``X``
+can synchronize with ``Y`` is target-defined. If ``X`` synchronizes
+with ``Y``, then ``A`` happens-before ``B`` (because A/X and
+Y/B are compatible).
+
+Implementation Example
+~~~~~~~~~~~~~~~~~~~~~~
+
+Consider the implementation of SPIRV ``NonPrivatePointer`` on a target
+where all memory operations are cached, and the entire cache is
+flushed or invalidated at a ``release`` or ``acquire``, respectively. A
+possible scheme is that, when translating a SPIRV program, memory
+operations marked ``NonPrivatePointer`` should not be cached, and the
+cache contents should not be touched during ``acquire`` and
+``release`` operations.
+
+This could be implemented using tags that share the ``vulkan:`` prefix,
+as follows:
+
+- For memory operations:
+
+  - Operations with ``vulkan:nonprivate`` should bypass the cache.
+  - Operations with ``vulkan:private`` should be cached.
+  - Operations that specify neither or both should conservatively
+    bypass the cache to ensure correctness.
+
+- For synchronizing operations:
+
+  - Operations with ``vulkan:nonprivate`` should not flush or
+    invalidate the cache.
+  - Operations with ``vulkan:private`` should flush or invalidate the...
[truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/78573
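---

For context, here is a minimal sketch of how the new builtin could be used from device code. The mask bit layout is taken from `AddAMDGCNAddressSpaceMMRA` above (bit 0 = local, bit 1 = global, bit 2 = image); the `LOCAL_FENCE`/`GLOBAL_FENCE`/`IMAGE_FENCE` macro names are hypothetical (the patch defines no such macros), and the IR in the comments is inferred from the autogenerated checks in `builtin-amdgcn-fence-opencl.cpp`:

```c
// Hypothetical convenience names for the mask bits consumed by
// __builtin_amdgcn_fence_opencl (bit 0 = "local", bit 1 = "global",
// bit 2 = "image"); illustrative only, not part of the patch.
#define LOCAL_FENCE  (1 << 0)
#define GLOBAL_FENCE (1 << 1)
#define IMAGE_FENCE  (1 << 2)

void fence_local_only(void) {
  // Fence only LDS (local memory) at workgroup scope. Per the test's
  // autogenerated checks, this should emit roughly:
  //   fence syncscope("workgroup") release, !mmra !{!"opencl-fence-mem", !"local"}
  __builtin_amdgcn_fence_opencl(LOCAL_FENCE, __ATOMIC_RELEASE, "workgroup");
}

void fence_local_and_global(void) {
  // With several mask bits set, the !mmra operand becomes a list of tags,
  // e.g. !{!"opencl-fence-mem", !"global"} and !{!"opencl-fence-mem", !"local"}
  // collected into one metadata node (tag order within the node may differ).
  __builtin_amdgcn_fence_opencl(LOCAL_FENCE | GLOBAL_FENCE,
                                __ATOMIC_SEQ_CST, "agent");
}
```

A mask of `0` appears to produce a plain fence with no `!mmra` metadata (`AddAMDGCNAddressSpaceMMRA` returns early), and `relaxed`/`consume` orderings are diagnosed by the same Sema check that already guards `__builtin_amdgcn_fence`.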