https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/66401
>From 4a3211750ede9d375dedc676e2cfb8bc3924aa0b Mon Sep 17 00:00:00 2001 From: Nishant Patel <nishant.b.pa...@intel.com> Date: Wed, 13 Sep 2023 20:58:25 +0000 Subject: [PATCH 1/4] Support lowering of hostShared in gpu.alloc op --- .../Conversion/GPUCommon/GPUToLLVMConversion.cpp | 15 ++++++++++----- mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp | 3 ++- mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp | 3 ++- .../lower-alloc-to-gpu-runtime-calls.mlir | 3 ++- .../test/Conversion/GPUCommon/typed-pointers.mlir | 3 ++- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index eddf3e9a47d0bc8..428d5d1d4b0e944 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -167,7 +167,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> { "mgpuMemAlloc", llvmPointerType /* void * */, {llvmIntPtrType /* intptr_t sizeBytes */, - llvmPointerType /* void *stream */}}; + llvmPointerType /* void *stream */, + llvmInt64Type /* bool isHostShared */}}; FunctionCallBuilder deallocCallBuilder = { "mgpuMemFree", llvmVoidType, @@ -786,9 +787,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite( LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::AllocOp allocOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { - if (adaptor.getHostShared()) - return rewriter.notifyMatchFailure( - allocOp, "host_shared allocation is not supported"); MemRefType memRefType = allocOp.getType(); @@ -799,6 +797,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( auto loc = allocOp.getLoc(); + bool isShared = allocOp.getHostShared(); + // Get shape of the memref as values: static sizes are constant // values and dynamic sizes are passed to 'alloc' as operands. SmallVector<Value, 4> shape; @@ -811,8 +811,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( // descriptor. Type elementPtrType = this->getElementPtrType(memRefType); auto stream = adaptor.getAsyncDependencies().front(); + + auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>( + loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared)); + Value allocatedPtr = - allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); + allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) + .getResult(); if (!getTypeConverter()->useOpaquePointers()) allocatedPtr = rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr); diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 1dba677ebe66365..a0172f85a67a5c0 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -210,7 +210,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event, CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); } -extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) { +extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, + bool /*isHostShared*/) { ScopedContext scopedContext; CUdeviceptr ptr; CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index bd3868a8e196f6f..292159536f5522f 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) { HIP_REPORT_IF_ERROR(hipEventRecord(event, stream)); } -extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) { +extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/, + bool /*isHostShared*/) { void *ptr; HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes)); return ptr; diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir index 2506c6ceb990ef5..f365dcb02daf4c2 100644 --- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir @@ -8,7 +8,8 @@ module attributes {gpu.container_module} { %0 = gpu.wait async // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]] // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]] - // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]]) + // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant + // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]]) %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32> // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0] // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]]) diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir index 2fa6c854c567819..e27162c7dbc1902 100644 --- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir +++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir @@ -8,7 +8,8 @@ module attributes {gpu.container_module} { %0 = gpu.wait async // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]] // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]] - // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]]) + // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant + // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]]) %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32> // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0] // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]] >From 4752f1b1d569aea737cea6b92a34f5d2ef218122 Mon Sep 17 00:00:00 2001 From: Nishant Patel <nishant.b.pa...@intel.com> Date: Fri, 15 Sep 2023 18:10:03 +0000 Subject: [PATCH 2/4] gpu host_shared allocation cannot be async --- mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 428d5d1d4b0e944..9d3d26002ec7f2c 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -791,14 +791,19 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( MemRefType memRefType = allocOp.getType(); if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) || - !isConvertibleAndHasIdentityMaps(memRefType) || - failed(isAsyncWithOneDependency(rewriter, allocOp))) + !isConvertibleAndHasIdentityMaps(memRefType)) return failure(); auto loc = allocOp.getLoc(); bool isShared = allocOp.getHostShared(); + if (isShared && allocOp.getAsyncToken()) + return rewriter.notifyMatchFailure( + allocOp, "Host Shared allocation cannot be done async"); + else if (!isShared && failed(isAsyncWithOneDependency(rewriter, allocOp))) + return failure(); + // Get shape of the memref as values: static sizes are constant // values and dynamic sizes are passed to 'alloc' as operands. SmallVector<Value, 4> shape; >From 49043d9f2b7284fb8a6c82f37f3cec83882baa70 Mon Sep 17 00:00:00 2001 From: Nishant Patel <nishant.b.pa...@intel.com> Date: Mon, 25 Sep 2023 21:16:58 +0000 Subject: [PATCH 3/4] Address PR feedback --- mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 9d3d26002ec7f2c..bcbd5d67de34dcd 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -168,7 +168,7 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> { llvmPointerType /* void * */, {llvmIntPtrType /* intptr_t sizeBytes */, llvmPointerType /* void *stream */, - llvmInt64Type /* bool isHostShared */}}; + llvmInt8Type /* bool isHostShared */}}; FunctionCallBuilder deallocCallBuilder = { "mgpuMemFree", llvmVoidType, @@ -818,7 +818,7 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( auto stream = adaptor.getAsyncDependencies().front(); auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>( - loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared)); + loc, llvmInt8Type, rewriter.getI64IntegerAttr(isShared)); Value allocatedPtr = allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) >From 44f3978dae0b3e73e41566cbab42c2825164f62f Mon Sep 17 00:00:00 2001 From: Nishant Patel <nishant.b.pa...@intel.com> Date: Mon, 25 Sep 2023 21:24:44 +0000 Subject: [PATCH 4/4] Use llvmInt8TypeI8 for bool --- mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index bcbd5d67de34dcd..f72500d7592d049 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -818,7 +818,7 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( auto stream = adaptor.getAsyncDependencies().front(); auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>( - loc, llvmInt8Type, rewriter.getI64IntegerAttr(isShared)); + loc, llvmInt8Type, rewriter.getI8IntegerAttr(isShared)); Value allocatedPtr = allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits