krisb created this revision. krisb added a reviewer: tra. Herald added subscribers: asavonic, hiraditya. krisb requested review of this revision. Herald added subscribers: llvm-commits, cfe-commits, jdoerfert. Herald added projects: clang, LLVM.
NVVM IR specification defines them with i32 return type [0]: The following intrinsics synchronize a subset of threads in a warp and then broadcast and compare a value across threads in the subset. declare i32 @llvm.nvvm.match.any.sync.i64(i32 %membermask, i64 %value) declare {i32, i1} @llvm.nvvm.match.all.sync.i64(i32 %membermask, i64 %value) ... The i32 return value is a 32-bit mask where bit position in mask corresponds to thread’s laneid. as well as PTX ISA spec [1]: 9.7.12.8. Parallel Synchronization and Communication Instructions: match.sync ... Syntax match.any.sync.type d, a, membermask; match.all.sync.type d[|p], a, membermask; Description ... Destination d is a 32-bit mask where bit position in mask corresponds to thread’s laneid. So it doesn't make sense to define them with any other return type. Additionally, ptxas doesn't accept intructions, produced by NVPTX backend. Here is the ptxas output for `llvm/test/CodeGen/NVPTX/match.ll`: ptxas match.ptx, line 44; error : Arguments mismatch for instruction 'match' ptxas match.ptx, line 45; error : Arguments mismatch for instruction 'match' ptxas match.ptx, line 46; error : Arguments mismatch for instruction 'match' ptxas match.ptx, line 47; error : Arguments mismatch for instruction 'match' ptxas match.ptx, line 98; error : Arguments mismatch for instruction 'match' After this patch, it compiles with no issues. [0] https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#unique_341827171 [1] https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D120499 Files: clang/include/clang/Basic/BuiltinsNVPTX.def clang/lib/Headers/__clang_cuda_intrinsics.h clang/test/CodeGen/builtins-nvptx-ptx60.cu llvm/include/llvm/IR/IntrinsicsNVVM.td llvm/lib/Target/NVPTX/NVPTXIntrinsics.td llvm/test/CodeGen/NVPTX/match.ll
Index: llvm/test/CodeGen/NVPTX/match.ll =================================================================== --- llvm/test/CodeGen/NVPTX/match.ll +++ llvm/test/CodeGen/NVPTX/match.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32) -declare i64 @llvm.nvvm.match.any.sync.i64(i32, i64) +declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64) ; CHECK-LABEL: .func{{.*}}match.any.sync.i32 define i32 @match.any.sync.i32(i32 %mask, i32 %value) { @@ -23,26 +23,26 @@ } ; CHECK-LABEL: .func{{.*}}match.any.sync.i64 -define i64 @match.any.sync.i64(i32 %mask, i64 %value) { +define i32 @match.any.sync.i64(i32 %mask, i64 %value) { ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [match.any.sync.i64_param_0]; ; CHECK: ld.param.u64 [[VALUE:%rd[0-9]+]], [match.any.sync.i64_param_1]; - ; CHECK: match.any.sync.b64 [[V0:%rd[0-9]+]], [[VALUE]], [[MASK]]; - %v0 = call i64 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 %value) - ; CHECK: match.any.sync.b64 [[V1:%rd[0-9]+]], [[VALUE]], 1; - %v1 = call i64 @llvm.nvvm.match.any.sync.i64(i32 1, i64 %value) - ; CHECK: match.any.sync.b64 [[V2:%rd[0-9]+]], 2, [[MASK]]; - %v2 = call i64 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 2) - ; CHECK: match.any.sync.b64 [[V3:%rd[0-9]+]], 4, 3; - %v3 = call i64 @llvm.nvvm.match.any.sync.i64(i32 3, i64 4) - %sum1 = add i64 %v0, %v1 - %sum2 = add i64 %v2, %v3 - %sum3 = add i64 %sum1, %sum2 - ret i64 %sum3; + ; CHECK: match.any.sync.b64 [[V0:%r[0-9]+]], [[VALUE]], [[MASK]]; + %v0 = call i32 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 %value) + ; CHECK: match.any.sync.b64 [[V1:%r[0-9]+]], [[VALUE]], 1; + %v1 = call i32 @llvm.nvvm.match.any.sync.i64(i32 1, i64 %value) + ; CHECK: match.any.sync.b64 [[V2:%r[0-9]+]], 2, [[MASK]]; + %v2 = call i32 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 2) + ; CHECK: match.any.sync.b64 [[V3:%r[0-9]+]], 4, 3; + %v3 = call i32 @llvm.nvvm.match.any.sync.i64(i32 3, i64 4) + %sum1 = add i32 %v0, %v1 + %sum2 = add i32 %v2, %v3 + %sum3 = add i32 %sum1, %sum2 + ret i32 %sum3; } declare {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32, i32) -declare {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32, i64) +declare {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32, i64) ; CHECK-LABEL: .func{{.*}}match.all.sync.i32p( define {i32,i1} @match.all.sync.i32p(i32 %mask, i32 %value) { @@ -81,37 +81,37 @@ } ; CHECK-LABEL: .func{{.*}}match.all.sync.i64p( -define {i64,i1} @match.all.sync.i64p(i32 %mask, i64 %value) { +define {i32,i1} @match.all.sync.i64p(i32 %mask, i64 %value) { ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [match.all.sync.i64p_param_0]; ; CHECK: ld.param.u64 [[VALUE:%rd[0-9]+]], [match.all.sync.i64p_param_1]; - ; CHECK: match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]]; - %r1 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 %value) - %v1 = extractvalue {i64, i1} %r1, 0 - %p1 = extractvalue {i64, i1} %r1, 1 - - ; CHECK: match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, 1, [[MASK]]; - %r2 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 1) - %v2 = extractvalue {i64, i1} %r2, 0 - %p2 = extractvalue {i64, i1} %r2, 1 - - ; CHECK: match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, [[VALUE]], 2; - %r3 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 2, i64 %value) - %v3 = extractvalue {i64, i1} %r3, 0 - %p3 = extractvalue {i64, i1} %r3, 1 - - ; CHECK: match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, 4, 3; - %r4 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 3, i64 4) - %v4 = extractvalue {i64, i1} %r4, 0 - %p4 = extractvalue {i64, i1} %r4, 1 - - %vsum1 = add i64 %v1, %v2 - %vsum2 = add i64 %v3, %v4 - %vsum3 = add i64 %vsum1, %vsum2 + ; CHECK: match.all.sync.b64 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]]; + %r1 = call {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 %value) + %v1 = extractvalue {i32, i1} %r1, 0 + %p1 = extractvalue {i32, i1} %r1, 1 + + ; CHECK: match.all.sync.b64 {{%r[0-9]+\|%p[0-9]+}}, 1, [[MASK]]; + %r2 = call {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 1) + %v2 = extractvalue {i32, i1} %r2, 0 + %p2 = extractvalue {i32, i1} %r2, 1 + + ; CHECK: match.all.sync.b64 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], 2; + %r3 = call {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32 2, i64 %value) + %v3 = extractvalue {i32, i1} %r3, 0 + %p3 = extractvalue {i32, i1} %r3, 1 + + ; CHECK: match.all.sync.b64 {{%r[0-9]+\|%p[0-9]+}}, 4, 3; + %r4 = call {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32 3, i64 4) + %v4 = extractvalue {i32, i1} %r4, 0 + %p4 = extractvalue {i32, i1} %r4, 1 + + %vsum1 = add i32 %v1, %v2 + %vsum2 = add i32 %v3, %v4 + %vsum3 = add i32 %vsum1, %vsum2 %psum1 = add i1 %p1, %p2 %psum2 = add i1 %p3, %p4 %psum3 = add i1 %psum1, %psum2 - %ret0 = insertvalue {i64, i1} undef, i64 %vsum3, 0 - %ret1 = insertvalue {i64, i1} %ret0, i1 %psum3, 1 - ret {i64, i1} %ret1; + %ret0 = insertvalue {i32, i1} undef, i32 %vsum3, 0 + %ret1 = insertvalue {i32, i1} %ret0, i1 %psum3, 1 + ret {i32, i1} %ret1; } Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -223,21 +223,21 @@ multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp, Operand ImmOp> { - def ii : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, ImmOp:$value), + def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value), "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", - [(set regclass:$dest, (IntOp imm:$mask, imm:$value))]>, + [(set Int32Regs:$dest, (IntOp imm:$mask, imm:$value))]>, Requires<[hasPTX60, hasSM70]>; - def ir : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, ImmOp:$value), + def ir : NVPTXInst<(outs Int32Regs:$dest), (ins Int32Regs:$mask, ImmOp:$value), "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", - [(set regclass:$dest, (IntOp Int32Regs:$mask, imm:$value))]>, + [(set Int32Regs:$dest, (IntOp Int32Regs:$mask, imm:$value))]>, Requires<[hasPTX60, hasSM70]>; - def ri : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, regclass:$value), + def ri : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, regclass:$value), "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", - [(set regclass:$dest, (IntOp imm:$mask, regclass:$value))]>, + [(set Int32Regs:$dest, (IntOp imm:$mask, regclass:$value))]>, Requires<[hasPTX60, hasSM70]>; - def rr : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, regclass:$value), + def rr : NVPTXInst<(outs Int32Regs:$dest), (ins Int32Regs:$mask, regclass:$value), "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", - [(set regclass:$dest, (IntOp Int32Regs:$mask, regclass:$value))]>, + [(set Int32Regs:$dest, (IntOp Int32Regs:$mask, regclass:$value))]>, Requires<[hasPTX60, hasSM70]>; } @@ -248,25 +248,25 @@ multiclass MATCH_ALLP_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp, Operand ImmOp> { - def ii : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + def ii : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask, ImmOp:$value), "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", - [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$mask, imm:$value))]>, + [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp imm:$mask, imm:$value))]>, Requires<[hasPTX60, hasSM70]>; - def ir : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + def ir : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask, ImmOp:$value), "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", - [(set regclass:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, imm:$value))]>, + [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, imm:$value))]>, Requires<[hasPTX60, hasSM70]>; - def ri : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + def ri : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask, regclass:$value), "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", - [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$mask, regclass:$value))]>, + [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp imm:$mask, regclass:$value))]>, Requires<[hasPTX60, hasSM70]>; - def rr : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + def rr : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask, regclass:$value), "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", - [(set regclass:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, regclass:$value))]>, + [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, regclass:$value))]>, Requires<[hasPTX60, hasSM70]>; } defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_sync_i32p, Index: llvm/include/llvm/IR/IntrinsicsNVVM.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsNVVM.td +++ llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4499,7 +4499,7 @@ GCCBuiltin<"__nvvm_match_any_sync_i32">; // match.any.sync.b64 mask, value def int_nvvm_match_any_sync_i64 : - Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty], + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i64">, GCCBuiltin<"__nvvm_match_any_sync_i64">; @@ -4513,7 +4513,7 @@ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i32p">; // match.all.sync.b64p mask, value def int_nvvm_match_all_sync_i64p : - Intrinsic<[llvm_i64_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], + Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">; // Index: clang/test/CodeGen/builtins-nvptx-ptx60.cu =================================================================== --- clang/test/CodeGen/builtins-nvptx-ptx60.cu +++ clang/test/CodeGen/builtins-nvptx-ptx60.cu @@ -91,13 +91,13 @@ // CHECK: call i32 @llvm.nvvm.match.any.sync.i32(i32 // expected-error@+1 {{'__nvvm_match_any_sync_i32' needs target feature ptx60}} __nvvm_match_any_sync_i32(mask, i); - // CHECK: call i64 @llvm.nvvm.match.any.sync.i64(i32 + // CHECK: call i32 @llvm.nvvm.match.any.sync.i64(i32 // expected-error@+1 {{'__nvvm_match_any_sync_i64' needs target feature ptx60}} __nvvm_match_any_sync_i64(mask, i64); // CHECK: call { i32, i1 } @llvm.nvvm.match.all.sync.i32p(i32 // expected-error@+1 {{'__nvvm_match_all_sync_i32p' needs target feature ptx60}} __nvvm_match_all_sync_i32p(mask, i, &i); - // CHECK: call { i64, i1 } @llvm.nvvm.match.all.sync.i64p(i32 + // CHECK: call { i32, i1 } @llvm.nvvm.match.all.sync.i64p(i32 // expected-error@+1 {{'__nvvm_match_all_sync_i64p' needs target feature ptx60}} __nvvm_match_all_sync_i64p(mask, i64, &i); Index: clang/lib/Headers/__clang_cuda_intrinsics.h =================================================================== --- clang/lib/Headers/__clang_cuda_intrinsics.h +++ clang/lib/Headers/__clang_cuda_intrinsics.h @@ -234,7 +234,7 @@ return __nvvm_match_any_sync_i32(mask, value); } -inline __device__ unsigned long long +inline __device__ unsigned int __match64_any_sync(unsigned int mask, unsigned long long value) { return __nvvm_match_any_sync_i64(mask, value); } @@ -244,7 +244,7 @@ return __nvvm_match_all_sync_i32p(mask, value, pred); } -inline __device__ unsigned long long +inline __device__ unsigned int __match64_all_sync(unsigned int mask, unsigned long long value, int *pred) { return __nvvm_match_all_sync_i64p(mask, value, pred); } Index: clang/include/clang/Basic/BuiltinsNVPTX.def =================================================================== --- clang/include/clang/Basic/BuiltinsNVPTX.def +++ clang/include/clang/Basic/BuiltinsNVPTX.def @@ -474,10 +474,10 @@ // Match TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", PTX60) -TARGET_BUILTIN(__nvvm_match_any_sync_i64, "WiUiWi", "", PTX60) +TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", PTX60) // These return a pair {value, predicate}, which requires custom lowering. TARGET_BUILTIN(__nvvm_match_all_sync_i32p, "UiUiUii*", "", PTX60) -TARGET_BUILTIN(__nvvm_match_all_sync_i64p, "WiUiWii*", "", PTX60) +TARGET_BUILTIN(__nvvm_match_all_sync_i64p, "UiUiWii*", "", PTX60) // Redux TARGET_BUILTIN(__nvvm_redux_sync_add, "iii", "", AND(SM_80,PTX70))
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits