[clang-tools-extra] [flang] [llvm] [mlir] [libcxx] [libc] [compiler-rt] [clang] [mlir][gpu] Support dynamic_shared_memory Op with vector dialect (PR #74475)
https://github.com/grypp updated https://github.com/llvm/llvm-project/pull/74475 >From 2848c9011cb4db5e91754300eb466927738a363a Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Tue, 5 Dec 2023 15:16:20 +0100 Subject: [PATCH] [mlir][gpu] Support dynamic_shared_memory Op with vector dialect `gpu.dynamic_shared_memory` currently does not get lowered when it is used with vector dialect. The reason is that vector-to-llvm conversion is not included in gpu-to-nvvm. This PR includes that and adds a test. --- .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp| 2 ++ .../GPUCommon/lower-memory-space-attrs.mlir | 20 +++ 2 files changed, 22 insertions(+) diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 4855fd187eb58..0e978ca0a6424 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -21,6 +21,7 @@ #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" @@ -282,6 +283,7 @@ struct LowerGpuOpsToNVVMOpsPass populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns); populateGpuToNVVMConversionPatterns(converter, llvmPatterns); populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns); +populateVectorToLLVMConversionPatterns(converter, llvmPatterns); if (this->hasRedux) populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns); LLVMConversionTarget target(getContext()); diff --git a/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir b/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir index 14f5302ac2002..6d50770f53543 100644 --- 
a/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir @@ -46,3 +46,23 @@ gpu.module @kernel { // CHECK: [[value:%.+]] = llvm.load // CHECK-SAME: : !llvm.ptr<1> -> f32 // CHECK: llvm.return [[value]] + +// - + +gpu.module @kernel { + gpu.func @dynamic_shmem_with_vector(%arg1: memref<1xf32>) { +%0 = arith.constant 0 : index +%1 = gpu.dynamic_shared_memory : memref> +%2 = memref.view %1[%0][] : memref> to memref<1xf32, #gpu.address_space> +%3 = vector.load %2[%0] : memref<1xf32, #gpu.address_space>, vector<1xf32> +vector.store %3, %arg1[%0] : memref<1xf32>, vector<1xf32> +gpu.return + } +} + +// ROCDL: llvm.mlir.global internal @__dynamic_shmem__0() {addr_space = 3 : i32} : !llvm.array<0 x i8> +// NVVM: llvm.mlir.global internal @__dynamic_shmem__0() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8> +// CHECK-LABEL: llvm.func @dynamic_shmem_with_vector +// CHECK: llvm.mlir.addressof @__dynamic_shmem__0 : !llvm.ptr<3> +// CHECK: llvm.load %{{.*}} {alignment = 4 : i64} : !llvm.ptr<3> -> vector<1xf32> +// CHECK: llvm.store ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libcxx] [clang] [compiler-rt] [llvm] [flang] [libc] [clang-tools-extra] [mlir] [mlir][gpu] Support dynamic_shared_memory Op with vector dialect (PR #74475)
https://github.com/grypp closed https://github.com/llvm/llvm-project/pull/74475 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libc] [compiler-rt] [clang-tools-extra] [clang] [llvm] [flang] [mlir] [libcxx] [mlir][nvvm] Introduce `fence.mbarrier.init` (PR #74058)
https://github.com/grypp updated https://github.com/llvm/llvm-project/pull/74058 >From 9f35504e81246f97a9d8c14a06043685660ae15e Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Fri, 1 Dec 2023 11:10:40 +0100 Subject: [PATCH 1/3] [mlir][nvvm] Introduce `fence.mbarrier.init` This PR introduce `fence.mbarrier.init` OP --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 10 ++ mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 8 2 files changed, 18 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index ecad1a16eb6c5..f400c18b5f32c 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -422,6 +422,16 @@ def NVVM_SetMaxRegisterOp : NVVM_PTXBuilder_Op<"setmaxregister"> { let hasVerifier = 1; } +def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> { + let arguments = (ins ); + let assemblyFormat = "attr-dict"; + let extraClassDefinition = [{ +std::string $cppClass::getPtx() { + return std::string("fence.mbarrier_init.release.cluster;"); +} + }]; +} + def ShflKindBfly : I32EnumAttrCase<"bfly", 0>; def ShflKindUp : I32EnumAttrCase<"up", 1>; def ShflKindDown : I32EnumAttrCase<"down", 2>; diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 5482cc194192d..8366f1d109b1c 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -629,3 +629,11 @@ func.func @cp_bulk_commit() { nvvm.cp.async.bulk.commit.group func.return } + +// - + +func.func @fence_mbarrier_init() { + //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;" + nvvm.fence.mbarrier.init + func.return +} >From 34e29b2bef58739dbcc2e34efcec644accd5c089 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Fri, 1 Dec 2023 16:00:37 +0100 Subject: [PATCH 2/3] add descripton --- 
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 6 ++ 1 file changed, 6 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index f400c18b5f32c..adc60e72fdf82 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -424,6 +424,12 @@ def NVVM_SetMaxRegisterOp : NVVM_PTXBuilder_Op<"setmaxregister"> { def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> { let arguments = (ins ); +let description = [{ +Fence operation that applies on the prior nvvm.mbarrier.init +[For more information, see PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar) + }]; + let assemblyFormat = "attr-dict"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { >From c5d66888946d4397fb29cafa2555f13b9bec8e42 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Wed, 6 Dec 2023 11:40:56 +0100 Subject: [PATCH 3/3] fix typo in test --- mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index ec79ad3e8c187..a4336a30999a1 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -636,7 +636,8 @@ func.func @cp_bulk_commit() { func.func @fence_mbarrier_init() { //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;" nvvm.fence.mbarrier.init - + func.return +} // - func.func @fence_proxy() { ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [compiler-rt] [clang] [flang] [libcxx] [llvm] [libc] [mlir] [mlir][nvvm] Introduce `fence.mbarrier.init` (PR #74058)
https://github.com/grypp closed https://github.com/llvm/llvm-project/pull/74058 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libc] [libcxx] [llvm] [compiler-rt] [clang-tools-extra] [clang] [mlir] [flang] [mlir][nvvm] Introduce `fence.mbarrier.init` (PR #74058)
https://github.com/grypp updated https://github.com/llvm/llvm-project/pull/74058 >From 9f35504e81246f97a9d8c14a06043685660ae15e Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Fri, 1 Dec 2023 11:10:40 +0100 Subject: [PATCH 1/2] [mlir][nvvm] Introduce `fence.mbarrier.init` This PR introduce `fence.mbarrier.init` OP --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 10 ++ mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 8 2 files changed, 18 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index ecad1a16eb6c5..f400c18b5f32c 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -422,6 +422,16 @@ def NVVM_SetMaxRegisterOp : NVVM_PTXBuilder_Op<"setmaxregister"> { let hasVerifier = 1; } +def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> { + let arguments = (ins ); + let assemblyFormat = "attr-dict"; + let extraClassDefinition = [{ +std::string $cppClass::getPtx() { + return std::string("fence.mbarrier_init.release.cluster;"); +} + }]; +} + def ShflKindBfly : I32EnumAttrCase<"bfly", 0>; def ShflKindUp : I32EnumAttrCase<"up", 1>; def ShflKindDown : I32EnumAttrCase<"down", 2>; diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 5482cc194192d..8366f1d109b1c 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -629,3 +629,11 @@ func.func @cp_bulk_commit() { nvvm.cp.async.bulk.commit.group func.return } + +// - + +func.func @fence_mbarrier_init() { + //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;" + nvvm.fence.mbarrier.init + func.return +} >From 34e29b2bef58739dbcc2e34efcec644accd5c089 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Fri, 1 Dec 2023 16:00:37 +0100 Subject: [PATCH 2/2] add descripton --- 
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 6 ++ 1 file changed, 6 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index f400c18b5f32c..adc60e72fdf82 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -424,6 +424,12 @@ def NVVM_SetMaxRegisterOp : NVVM_PTXBuilder_Op<"setmaxregister"> { def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> { let arguments = (ins ); +let description = [{ +Fence operation that applies on the prior nvvm.mbarrier.init +[For more information, see PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar) + }]; + let assemblyFormat = "attr-dict"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [MLIR] Add SyclRuntimeWrapper (PR #69648)
https://github.com/grypp commented: Looks good overall to me. I left a few comments https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [MLIR] Add SyclRuntimeWrapper (PR #69648)
https://github.com/grypp edited https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Add SyclRuntimeWrapper (PR #69648)
@@ -0,0 +1,222 @@ +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// Implements C wrappers around the sycl runtime library. +// +//===--===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define SYCL_RUNTIME_EXPORT __declspec(dllexport) +#else +#define SYCL_RUNTIME_EXPORT +#endif // _WIN32 + +namespace { + +template +auto catchAll(F &&func) { + try { +return func(); + } catch (const std::exception &e) { +fprintf(stdout, "An exception was thrown: %s\n", e.what()); +fflush(stdout); +abort(); + } catch (...) { +fprintf(stdout, "An unknown exception was thrown\n"); +fflush(stdout); +abort(); + } +} + +#define L0_SAFE_CALL(call) \ + { \ +ze_result_t status = (call); \ +if (status != ZE_RESULT_SUCCESS) { \ + fprintf(stdout, "L0 error %d\n", status); \ + fflush(stdout); \ + abort(); \ +} \ + } + +} // namespace + +static sycl::device getDefaultDevice() { + static sycl::device syclDevice; + static bool isDeviceInitialised = false; + if(!isDeviceInitialised) { + auto platformList = sycl::platform::get_platforms(); + for (const auto &platform : platformList) { +auto platformName = platform.get_info(); +bool isLevelZero = platformName.find("Level-Zero") != std::string::npos; +if (!isLevelZero) + continue; + +syclDevice = platform.get_devices()[0]; +isDeviceInitialised = true; +return syclDevice; + } +throw std::runtime_error("getDefaultDevice failed"); grypp wrote: In our other runtimes, we don't use 'throw' or 'std::runtime_error.' Instead, we use on 'fprintf.' 
(see [CUDA](https://github.com/llvm/llvm-project/blob/main/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp#L44) and [ROCm](https://github.com/llvm/llvm-project/blob/main/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp#L30)). From what I understand, throwing exceptions is heavy and requires a C++ runtime. Do you really want to use exceptions in this context? Our approach keeps the error lightweight. In the event of an error, the runtime prints the error message, and the program proceeds to run. The compiler can implement fallback if needed (we don't do it right now). https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [MLIR] Add SyclRuntimeWrapper (PR #69648)
@@ -0,0 +1,222 @@ +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// Implements C wrappers around the sycl runtime library. +// +//===--===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define SYCL_RUNTIME_EXPORT __declspec(dllexport) +#else +#define SYCL_RUNTIME_EXPORT +#endif // _WIN32 grypp wrote: This macro exist in other runtimes too. Maybe we need a common header for all of them. But let's keep this in our mind, we can do it later. https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [MLIR] Add SyclRuntimeWrapper (PR #69648)
@@ -0,0 +1,222 @@ +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// Implements C wrappers around the sycl runtime library. grypp wrote: Wrappers are in C++ but their linkage is C. Can we say something like below: `Implements wrappers around the sycl runtime library with C linkage` (I see the same wording in other runtimes too) https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [MLIR] Add SyclRuntimeWrapper (PR #69648)
@@ -0,0 +1,222 @@ +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// Implements C wrappers around the sycl runtime library. +// +//===--===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define SYCL_RUNTIME_EXPORT __declspec(dllexport) +#else +#define SYCL_RUNTIME_EXPORT +#endif // _WIN32 + +namespace { + +template +auto catchAll(F &&func) { + try { +return func(); + } catch (const std::exception &e) { +fprintf(stdout, "An exception was thrown: %s\n", e.what()); +fflush(stdout); +abort(); + } catch (...) { +fprintf(stdout, "An unknown exception was thrown\n"); +fflush(stdout); +abort(); + } +} + +#define L0_SAFE_CALL(call) \ + { \ +ze_result_t status = (call); \ +if (status != ZE_RESULT_SUCCESS) { \ + fprintf(stdout, "L0 error %d\n", status); \ + fflush(stdout); \ + abort(); \ +} \ + } + +} // namespace + +static sycl::device getDefaultDevice() { + static sycl::device syclDevice; + static bool isDeviceInitialised = false; + if(!isDeviceInitialised) { + auto platformList = sycl::platform::get_platforms(); + for (const auto &platform : platformList) { +auto platformName = platform.get_info(); +bool isLevelZero = platformName.find("Level-Zero") != std::string::npos; +if (!isLevelZero) + continue; + +syclDevice = platform.get_devices()[0]; +isDeviceInitialised = true; +return syclDevice; + } +throw std::runtime_error("getDefaultDevice failed"); grypp wrote: In our other runtimes, we don't use 'throw' or 'std::runtime_error.' Instead, we use on 'fprintf.' 
(see [CUDA](https://github.com/llvm/llvm-project/blob/main/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp#L44) and [ROCm](https://github.com/llvm/llvm-project/blob/main/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp#L30)). From what I understand, throwing exceptions is heavy and requires a C++ runtime. Do you really want to use exceptions in this context? Our approach keeps the error lightweight. In the event of an error, the runtime prints the error message, and the program proceeds to run. The compiler can implement fallback if needed (we don't do it right now). https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Add SyclRuntimeWrapper (PR #69648)
@@ -0,0 +1,222 @@ +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// Implements C wrappers around the sycl runtime library. grypp wrote: Wrappers are in C++ but their linkage is C. Can we say something like below: `Implements wrappers around the sycl runtime library with C linkage` (I see the same wording in other runtimes too) https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Add SyclRuntimeWrapper (PR #69648)
@@ -0,0 +1,222 @@ +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// Implements C wrappers around the sycl runtime library. +// +//===--===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define SYCL_RUNTIME_EXPORT __declspec(dllexport) +#else +#define SYCL_RUNTIME_EXPORT +#endif // _WIN32 + +namespace { + +template +auto catchAll(F &&func) { + try { +return func(); + } catch (const std::exception &e) { +fprintf(stdout, "An exception was thrown: %s\n", e.what()); +fflush(stdout); +abort(); + } catch (...) { +fprintf(stdout, "An unknown exception was thrown\n"); +fflush(stdout); +abort(); + } +} + +#define L0_SAFE_CALL(call) \ + { \ +ze_result_t status = (call); \ +if (status != ZE_RESULT_SUCCESS) { \ + fprintf(stdout, "L0 error %d\n", status); \ + fflush(stdout); \ + abort(); \ +} \ + } + +} // namespace + +static sycl::device getDefaultDevice() { + static sycl::device syclDevice; + static bool isDeviceInitialised = false; + if(!isDeviceInitialised) { + auto platformList = sycl::platform::get_platforms(); + for (const auto &platform : platformList) { +auto platformName = platform.get_info(); +bool isLevelZero = platformName.find("Level-Zero") != std::string::npos; +if (!isLevelZero) + continue; + +syclDevice = platform.get_devices()[0]; +isDeviceInitialised = true; +return syclDevice; + } +throw std::runtime_error("getDefaultDevice failed"); grypp wrote: Thanks a lot for the explanation! In this case, using `throw std::runtime ` makes perfect sense to me. It aligns the behavior of this small runtime with SYCL. 
I was wondering if you truly needed exceptions here, and it turns out that yes, you do want them. https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Add SyclRuntimeWrapper (PR #69648)
https://github.com/grypp approved this pull request. This looks good to me! I'm not a SYCL expert, but maybe someone with more expertise could take a quick look here. @Hardcode84 ? https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Add SyclRuntimeWrapper (PR #69648)
https://github.com/grypp closed https://github.com/llvm/llvm-project/pull/69648 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [MLIR][NVGPU] Introduce `nvgpu.wargroup.mma.store` Op for Hopper GPUs (PR #65441)
https://github.com/grypp updated https://github.com/llvm/llvm-project/pull/65441 >From 7b71da55fca8fe2a7dbe4982b1959be6a6175fa1 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Thu, 7 Sep 2023 11:52:38 +0200 Subject: [PATCH 1/6] [MLIR][NVGPU] Introduce `nvgpu.warpgroup.mma.store` Op for Hopper GPUs This work introduces a new operation called `warpgroup.mma.store` to the NVGPU dialect of MLIR. The purpose of this operation is to facilitate storing fragmanted results of WGMMA to the given memref. An example of fragmentation is given here : https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d The `warpgroup.mma.store` does followings: 1) Takes one or more fragmented results matrix. 2) Calculates indexes per thread in warp group and stores the data into give memref. Here's an example usage of the `nvgpu.warpgroup.mma` operation: ``` // Performs matmul, results are fragmented and in registers %res, %res2 = nvgpu.warpgroup.mma ... // Stores the fragmented result to the give memory nvgpu.warpgroup.mma.store [%res1, %res2], %matrixD : !nvgpu.warpgroup.result>, !nvgpu.warpgroup.result> to memref<128x128xf32,3> ``` Depends on #65440 --- mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 19 + .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp| 83 ++- mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp| 29 +++ 3 files changed, 129 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td index 90381648dac6acc..e102ae0dc581013 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td @@ -721,4 +721,23 @@ def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> { let hasVerifier = 1; } +def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> { + let description = [{ +The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result +in $matrixD to give memref. 
+ +[See the details of register fragment layout for accumulator matrix D](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d) + +Note that, the op must be run with warp group. + }]; + + let arguments = (ins Variadic:$matrixD, + Arg:$dstMemref); + + let assemblyFormat = [{ +`[` $matrixD `]` `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref) + }]; + let hasVerifier = 1; +} + #endif // NVGPU diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index f74aa05c0c4c4ff..4f1a0bc651e81b7 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -11,6 +11,7 @@ #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" @@ -409,8 +410,8 @@ struct ConvertNVGPUToNVVMPass using Base::Base; void getDependentDialects(DialectRegistry ®istry) const override { -registry -.insert(); +registry.insert(); } void runOnOperation() override { @@ -451,6 +452,7 @@ struct ConvertNVGPUToNVVMPass populateNVGPUToNVVMConversionPatterns(converter, patterns); LLVMConversionTarget target(getContext()); target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); +target.addLegalDialect<::mlir::arith::ArithDialect>(); target.addLegalDialect<::mlir::memref::MemRefDialect>(); target.addLegalDialect<::mlir::NVVM::NVVMDialect>(); mlir::scf::populateSCFStructuralTypeConversionsAndLegality( @@ -1299,11 +1301,88 @@ struct NVGPUWarpgroupMmaOpLowering } }; +struct NVGPUWarpgroupMmaStoreOpLowering +: public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern< + nvgpu::WarpgroupMmaStoreOp>::ConvertOpToLLVMPattern; + + void storeFragmentedMatrix(Value wgmmaResult, nvgpu::WarpgroupMmaStoreOp 
op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + int offset) const { +Location loc = op->getLoc(); +Type i32 = rewriter.getI32Type(); + +auto makeConst = [&](int32_t index) -> Value { + return rewriter.create( + loc, i32, rewriter.getI32IntegerAttr(index)); +}; +Value c4 = makeConst(4); +Value c32 = makeConst(kWarpSize); +Value c8 = makeConst(8); +Value c2 = makeConst(2); +Value c1 = makeConst(1); +Value c16 = makeConst(16); + +auto makeMul = [&](Value lhs, Value rhs) -> Value { + return rewriter.create(loc, lhs.getType(), lhs, rhs); +}; +auto makeAdd = [&](Value lhs, Value rhs) -> Value { + return rewriter.create(loc, lhs.getType(), lhs, rhs)
[clang] [MLIR][NVGPU] Introduce `nvgpu.wargroup.mma.store` Op for Hopper GPUs (PR #65441)
https://github.com/grypp updated https://github.com/llvm/llvm-project/pull/65441 >From 7b71da55fca8fe2a7dbe4982b1959be6a6175fa1 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Thu, 7 Sep 2023 11:52:38 +0200 Subject: [PATCH 1/7] [MLIR][NVGPU] Introduce `nvgpu.warpgroup.mma.store` Op for Hopper GPUs This work introduces a new operation called `warpgroup.mma.store` to the NVGPU dialect of MLIR. The purpose of this operation is to facilitate storing fragmanted results of WGMMA to the given memref. An example of fragmentation is given here : https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d The `warpgroup.mma.store` does followings: 1) Takes one or more fragmented results matrix. 2) Calculates indexes per thread in warp group and stores the data into give memref. Here's an example usage of the `nvgpu.warpgroup.mma` operation: ``` // Performs matmul, results are fragmented and in registers %res, %res2 = nvgpu.warpgroup.mma ... // Stores the fragmented result to the give memory nvgpu.warpgroup.mma.store [%res1, %res2], %matrixD : !nvgpu.warpgroup.result>, !nvgpu.warpgroup.result> to memref<128x128xf32,3> ``` Depends on #65440 --- mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 19 + .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp| 83 ++- mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp| 29 +++ 3 files changed, 129 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td index 90381648dac6acc..e102ae0dc581013 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td @@ -721,4 +721,23 @@ def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> { let hasVerifier = 1; } +def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> { + let description = [{ +The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result +in $matrixD to give memref. 
+ +[See the details of register fragment layout for accumulator matrix D](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d) + +Note that, the op must be run with warp group. + }]; + + let arguments = (ins Variadic:$matrixD, + Arg:$dstMemref); + + let assemblyFormat = [{ +`[` $matrixD `]` `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref) + }]; + let hasVerifier = 1; +} + #endif // NVGPU diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index f74aa05c0c4c4ff..4f1a0bc651e81b7 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -11,6 +11,7 @@ #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" @@ -409,8 +410,8 @@ struct ConvertNVGPUToNVVMPass using Base::Base; void getDependentDialects(DialectRegistry ®istry) const override { -registry -.insert(); +registry.insert(); } void runOnOperation() override { @@ -451,6 +452,7 @@ struct ConvertNVGPUToNVVMPass populateNVGPUToNVVMConversionPatterns(converter, patterns); LLVMConversionTarget target(getContext()); target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); +target.addLegalDialect<::mlir::arith::ArithDialect>(); target.addLegalDialect<::mlir::memref::MemRefDialect>(); target.addLegalDialect<::mlir::NVVM::NVVMDialect>(); mlir::scf::populateSCFStructuralTypeConversionsAndLegality( @@ -1299,11 +1301,88 @@ struct NVGPUWarpgroupMmaOpLowering } }; +struct NVGPUWarpgroupMmaStoreOpLowering +: public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern< + nvgpu::WarpgroupMmaStoreOp>::ConvertOpToLLVMPattern; + + void storeFragmentedMatrix(Value wgmmaResult, nvgpu::WarpgroupMmaStoreOp 
op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + int offset) const { +Location loc = op->getLoc(); +Type i32 = rewriter.getI32Type(); + +auto makeConst = [&](int32_t index) -> Value { + return rewriter.create( + loc, i32, rewriter.getI32IntegerAttr(index)); +}; +Value c4 = makeConst(4); +Value c32 = makeConst(kWarpSize); +Value c8 = makeConst(8); +Value c2 = makeConst(2); +Value c1 = makeConst(1); +Value c16 = makeConst(16); + +auto makeMul = [&](Value lhs, Value rhs) -> Value { + return rewriter.create(loc, lhs.getType(), lhs, rhs); +}; +auto makeAdd = [&](Value lhs, Value rhs) -> Value { + return rewriter.create(loc, lhs.getType(), lhs, rhs)
[clang] [MLIR][NVGPU] Introduce `nvgpu.wargroup.mma.store` Op for Hopper GPUs (PR #65441)
@@ -53,6 +55,16 @@ static Value truncToI32(ConversionPatternRewriter &rewriter, Location loc, return rewriter.create(loc, rewriter.getI32Type(), value); } +/// Returns warp-size as a value. +static Value getWarpSizeValue(ImplicitLocOpBuilder &b) { grypp wrote: it was a bad idea to put a singleton. I thought with my previous compiler's logic, my bad. removed https://github.com/llvm/llvm-project/pull/65441 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [MLIR] Enabling Intel GPU Integration. (PR #65539)
@@ -811,8 +812,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( // descriptor. Type elementPtrType = this->getElementPtrType(memRefType); auto stream = adaptor.getAsyncDependencies().front(); + + auto isHostShared = rewriter.create( + loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared)); + Value allocatedPtr = - allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); + allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) + .getResult(); grypp wrote: > Technically, SYCL provides a more abstract memory management with > sycl::buffer and sycl::accessor defining an implicit asynchronous task graph. > The allocation details are left to the implementation, asynchronous or > synchronous allocation is left to the implementers. I haven't touched SYCL much, thanks for the explanation. Creating a task graph implicitly sounds interesting. In this case, SYCL users are aware of asynchrony while writing their program. In CUDA (or HIP), users choose sync or async execution. > Here the lower-level synchronous USM memory management API of SYCL is used > instead, similar to CUDA/HIP memory management. Yes that's correct. I don't think there is a USM that can do allocation asynchronously. > So, should the async allocation in the example be synchronous instead? Yes, I think this is the correct behaviour. We can disallow `host_shared` and `async` on the Op. Here are the possible IRs: ``` // Valid %memref = gpu.alloc host_shared (): memref<3x3xi64> // Valid %memref = gpu.alloc (): memref<3x3xi64> // Invalid, USM managers don't allocate async %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<3x3xi64> // Valid, only for CUDA. Afaik, SYCL or HIP cannot do that %memref, %asyncToken = gpu.alloc async [%0] (): memref<3x3xi64> ``` https://github.com/llvm/llvm-project/pull/65539 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Enabling Intel GPU Integration. (PR #65539)
@@ -811,8 +812,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( // descriptor. Type elementPtrType = this->getElementPtrType(memRefType); auto stream = adaptor.getAsyncDependencies().front(); + + auto isHostShared = rewriter.create( + loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared)); + Value allocatedPtr = - allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); + allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) + .getResult(); grypp wrote: > Technically, SYCL provides a more abstract memory management with > sycl::buffer and sycl::accessor defining an implicit asynchronous task graph. > The allocation details are left to the implementation, asynchronous or > synchronous allocation is left to the implementers. I haven't touched SYCL much, thanks for the explanation. Creating a task graph implicitly sounds interesting. In this case, SYCL users are aware of asynchrony while writing their program. In CUDA (or HIP), users choose sync or async execution. > Here the lower-level synchronous USM memory management API of SYCL is used > instead, similar to CUDA/HIP memory management. Yes that's correct. I don't think there is a USM that can do allocation asynchronously. > So, should the async allocation in the example be synchronous instead? Yes, I think this is the correct behaviour. We can disallow `host_shared` and `async` on the Op. Here are the possible IRs: ``` // Valid %memref = gpu.alloc host_shared (): memref<3x3xi64> // Valid %memref = gpu.alloc (): memref<3x3xi64> // Invalid, USM managers don't allocate async %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<3x3xi64> // Valid, only for CUDA. Afaik, SYCL or HIP cannot do that %memref, %asyncToken = gpu.alloc async [%0] (): memref<3x3xi64> ``` https://github.com/llvm/llvm-project/pull/65539 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Enabling Intel GPU Integration. (PR #65539)
@@ -811,8 +812,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( // descriptor. Type elementPtrType = this->getElementPtrType(memRefType); auto stream = adaptor.getAsyncDependencies().front(); + + auto isHostShared = rewriter.create( + loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared)); + Value allocatedPtr = - allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); + allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) + .getResult(); grypp wrote: > the upstream GPUToLLVMConversion lowering does not support lowering of > gpu.alloc which is not async. Would that work if omit that check when `host_shared` is present? https://github.com/llvm/llvm-project/pull/65539 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [MLIR] Enabling Intel GPU Integration. (PR #65539)
@@ -811,8 +812,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( // descriptor. Type elementPtrType = this->getElementPtrType(memRefType); auto stream = adaptor.getAsyncDependencies().front(); + + auto isHostShared = rewriter.create( + loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared)); + Value allocatedPtr = - allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); + allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) + .getResult(); grypp wrote: > the upstream GPUToLLVMConversion lowering does not support lowering of > gpu.alloc which is not async. Would that work if omit that check when `host_shared` is present? https://github.com/llvm/llvm-project/pull/65539 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Enabling Intel GPU Integration. (PR #65539)
@@ -811,8 +812,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( // descriptor. Type elementPtrType = this->getElementPtrType(memRefType); auto stream = adaptor.getAsyncDependencies().front(); + + auto isHostShared = rewriter.create( + loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared)); + Value allocatedPtr = - allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); + allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) + .getResult(); grypp wrote: Regarding `host_shared`, I noticed this code in the examples: ``` %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<3x3xi64> ``` Can SYCL's runtime allocate `host_shared` data asynchronously? It might be a good idea to prevent the use of `host_shared` and `async` together. FWIW, CUDA and HIP cannot do that. As far as I can see from the PR, the queue is not used when allocating `host_shared`. Nonetheless, having `async` on `gpu.alloc` is perfectly acceptable. CUDA does support asynchronous device memory allocation. https://github.com/llvm/llvm-project/pull/65539 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [MLIR] Enabling Intel GPU Integration. (PR #65539)
https://github.com/grypp edited https://github.com/llvm/llvm-project/pull/65539 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Convert scalar function nvvm.annotations to attributes (PR #125908)
@@ -227,14 +228,14 @@ class NVVMDialectLLVMIRTranslationInterface } else if (attribute.getName() == grypp wrote: Sure sounds great https://github.com/llvm/llvm-project/pull/125908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Switch front-ends and tests to ptx_kernel cc (PR #120806)
grypp wrote: In MLIR, we also have other NVVM metadata such as `reqntid` and `maxntid`, among others. What is the plan for these? Will they remain as metadata, or will they be expressed differently? Could you please elaborate on the compile-time improvements? https://github.com/llvm/llvm-project/pull/120806 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Switch front-ends and tests to ptx_kernel cc (PR #120806)
https://github.com/grypp approved this pull request. https://github.com/llvm/llvm-project/pull/120806 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Switch front-ends and tests to ptx_kernel cc (PR #120806)
grypp wrote: Thank you for clarifying! I wasn’t aware that this change also benefits nvcc. A 2% improvement is an excellent result! >From the MLIR side, the PR looks good to me. https://github.com/llvm/llvm-project/pull/120806 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Add support for Shared Cluster Memory address space [2/2] (PR #136768)
grypp wrote: Can we please not merge anything without review? https://github.com/llvm/llvm-project/pull/136768 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Convert vector function nvvm.annotations to attributes (PR #127736)
https://github.com/grypp approved this pull request. https://github.com/llvm/llvm-project/pull/127736 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)
https://github.com/grypp edited https://github.com/llvm/llvm-project/pull/140615 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)
@@ -462,24 +462,28 @@ def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, // NVVM synchronization op definitions //===--===// -def NVVM_Barrier0Op : NVVM_IntrOp<"barrier0"> { +def NVVM_Barrier0Op : NVVM_Op<"barrier0"> { let assemblyFormat = "attr-dict"; + string llvmBuilder = [{ + createIntrinsicCall( + builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all, + {builder.getInt32(0)}); + }]; grypp wrote: you can remove this op completely actually. https://github.com/llvm/llvm-project/pull/140615 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)
@@ -462,24 +462,28 @@ def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, // NVVM synchronization op definitions //===--===// -def NVVM_Barrier0Op : NVVM_IntrOp<"barrier0"> { +def NVVM_Barrier0Op : NVVM_Op<"barrier0"> { let assemblyFormat = "attr-dict"; + string llvmBuilder = [{ + createIntrinsicCall( + builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all, + {builder.getInt32(0)}); + }]; } def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> { let arguments = (ins Optional:$barrierId, Optional:$numberOfThreads); string llvmBuilder = [{ -if ($numberOfThreads && $barrierId) { - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier, -{$barrierId, $numberOfThreads}); -} else if($barrierId) { - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_n, -{$barrierId}); -} else { - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier0); -} +auto id = $barrierId ? $barrierId : builder.getInt32(0); grypp wrote: We don't use auto when the type isn't obvious. https://github.com/llvm/llvm-project/pull/140615 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)
@@ -462,24 +462,28 @@ def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, // NVVM synchronization op definitions //===--===// -def NVVM_Barrier0Op : NVVM_IntrOp<"barrier0"> { +def NVVM_Barrier0Op : NVVM_Op<"barrier0"> { grypp wrote: do you need to change NVVM_IntrOp->NVVM_Op? https://github.com/llvm/llvm-project/pull/140615 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)
@@ -71,14 +71,6 @@ define float @nvvm_rcp(float %0) { ret float %2 } -; CHECK-LABEL: @llvm_nvvm_barrier0() -define void @llvm_nvvm_barrier0() { - ; CHECK: nvvm.barrier0 - call void @llvm.nvvm.barrier0() - ret void -} - grypp wrote: test removal here. is it accident? https://github.com/llvm/llvm-project/pull/140615 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [mlir] [NVPTX] Unify and extend barrier{.cta} intrinsic support (PR #140615)
https://github.com/grypp approved this pull request. https://github.com/llvm/llvm-project/pull/140615 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits