================ @@ -0,0 +1,427 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes +; RUN: opt -S -mtriple=amdgcn-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=AMDGCN +; RUN: opt -S -mtriple=nvptx64-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=NVPTX + +; Used by amdgpu to lower llvm.gpu.num.threads, harmless on nvptx +@__oclc_ABI_version = weak_odr hidden addrspace(4) constant i32 500 + +define i32 @num_blocks_x() { +; AMDGCN-LABEL: @num_blocks_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 12 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0:![0-9]+]], !invariant.load [[META1:![0-9]+]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 12 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 4 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.x() + ret i32 %1 +} + 
+declare i32 @llvm.gpu.num.blocks.x() + +define i32 @num_blocks_y() { +; AMDGCN-LABEL: @num_blocks_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 16 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 14 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 6 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.blocks.y() + +define i32 @num_blocks_z() { +; AMDGCN-LABEL: @num_blocks_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 20 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr 
addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 16 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 8 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.blocks.z() + +define i32 @block_id_x() { +; AMDGCN-LABEL: @block_id_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.block.id.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.x() + +define i32 @block_id_y() { +; AMDGCN-LABEL: @block_id_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.block.id.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.y() + +define i32 @block_id_z() { +; AMDGCN-LABEL: @block_id_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call 
i32 @llvm.amdgcn.workgroup.id.z() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.block.id.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.z() + +define i32 @num_threads_x() { +; AMDGCN-LABEL: @num_threads_x( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 4 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.x() + +define i32 @num_threads_y() { +; AMDGCN-LABEL: @num_threads_y( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 14 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: 
[[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 6 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.y() + +define i32 @num_threads_z() { +; AMDGCN-LABEL: @num_threads_z( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 16 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 8 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.z() + +define i32 @thread_id_x() { +; AMDGCN-LABEL: @thread_id_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_x( +; NVPTX-NEXT: 
[[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.x() + +define i32 @thread_id_y() { +; AMDGCN-LABEL: @thread_id_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.y() + +define i32 @thread_id_z() { +; AMDGCN-LABEL: @thread_id_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.z() + +define i32 @num_lanes() { +; AMDGCN-LABEL: @num_lanes( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.wavefrontsize() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_lanes( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.lanes() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.lanes() + +define i32 @lane_id() { +; AMDGCN-LABEL: @lane_id( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; AMDGCN-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]]) +; AMDGCN-NEXT: ret i32 [[TMP2]] +; +; NVPTX-LABEL: @lane_id( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.laneid() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.lane.id() + ret i32 %1 +} + +declare i32 @llvm.gpu.lane.id() + +define i64 @lane_mask() { +; AMDGCN-LABEL: @lane_mask( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; AMDGCN-NEXT: ret 
i64 [[TMP1]] +; +; NVPTX-LABEL: @lane_mask( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.activemask() +; NVPTX-NEXT: [[CONV:%.*]] = zext i32 [[TMP1]] to i64 +; NVPTX-NEXT: ret i64 [[CONV]] +; + %1 = call i64 @llvm.gpu.lane.mask() + ret i64 %1 +} + +declare i64 @llvm.gpu.lane.mask() + +define i32 @read_first_lane_u32(i64 %lane_mask, i32 %x) { +; AMDGCN-LABEL: @read_first_lane_u32( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X:%.*]]) +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @read_first_lane_u32( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[CONV]], i1 true) +; NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[CONV]], 0 +; NVPTX-NEXT: [[SUB:%.*]] = select i1 [[ISZERO]], i32 -1, i32 [[TMP1]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[SUB]], i32 31) +; NVPTX-NEXT: ret i32 [[TMP2]] +; + %1 = call i32 @llvm.gpu.read.first.lane.u32(i64 %lane_mask, i32 %x) + ret i32 %1 +} + +declare i32 @llvm.gpu.read.first.lane.u32(i64, i32) + +define i64 @ballot(i64 %lane_mask, i1 zeroext %x) { +; AMDGCN-LABEL: @ballot( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[X:%.*]]) +; AMDGCN-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[LANE_MASK:%.*]] +; AMDGCN-NEXT: ret i64 [[TMP2]] +; +; NVPTX-LABEL: @ballot( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[CONV]], i1 [[X:%.*]]) +; NVPTX-NEXT: [[CONV1:%.*]] = zext i32 [[TMP1]] to i64 +; NVPTX-NEXT: ret i64 [[CONV1]] +; + %1 = call i64 @llvm.gpu.ballot(i64 %lane_mask, i1 %x) + ret i64 %1 +} + +declare i64 @llvm.gpu.ballot(i64, i1) + +define void @sync_threads() { +; AMDGCN-LABEL: @sync_threads( +; AMDGCN-NEXT: call void @llvm.amdgcn.s.barrier() +; AMDGCN-NEXT: fence syncscope("workgroup") seq_cst +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @sync_threads( +; 
NVPTX-NEXT: call void @llvm.nvvm.barrier0() +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.sync.threads() + ret void +} + +declare void @llvm.gpu.sync.threads() + +define void @sync_lane(i64 %lane_mask) { +; AMDGCN-LABEL: @sync_lane( +; AMDGCN-NEXT: call void @llvm.amdgcn.wave.barrier() +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @sync_lane( +; NVPTX-NEXT: [[TMP1:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: call void @llvm.nvvm.bar.warp.sync(i32 [[TMP1]]) +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.sync.lane(i64 %lane_mask) + ret void +} + +declare void @llvm.gpu.sync.lane(i64) + +define i32 @shuffle_idx_u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width) { +; AMDGCN-LABEL: @shuffle_idx_u32( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; AMDGCN-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]]) +; AMDGCN-NEXT: [[NOT:%.*]] = sub i32 0, [[WIDTH:%.*]] +; AMDGCN-NEXT: [[AND:%.*]] = and i32 [[TMP2]], [[NOT]] +; AMDGCN-NEXT: [[ADD:%.*]] = add i32 [[AND]], [[IDX:%.*]] +; AMDGCN-NEXT: [[SHL:%.*]] = shl i32 [[ADD]], 2 +; AMDGCN-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL]], i32 [[X:%.*]]) +; AMDGCN-NEXT: ret i32 [[TMP3]] +; +; NVPTX-LABEL: @shuffle_idx_u32( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[SH_PROM:%.*]] = zext i32 [[IDX:%.*]] to i64 +; NVPTX-NEXT: [[TMP1:%.*]] = shl i32 [[WIDTH:%.*]], 8 +; NVPTX-NEXT: [[OR:%.*]] = sub i32 8223, [[TMP1]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[IDX]], i32 [[OR]]) +; NVPTX-NEXT: [[TMP3:%.*]] = shl i64 1, [[SH_PROM]] +; NVPTX-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], [[LANE_MASK]] +; NVPTX-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0 +; NVPTX-NEXT: [[AND4:%.*]] = select i1 [[TMP5]], i32 0, i32 [[TMP2]] +; NVPTX-NEXT: ret i32 [[AND4]] +; + %1 = call i32 @llvm.gpu.shuffle.idx.u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width) + ret i32 %1 +} + 
+declare i32 @llvm.gpu.shuffle.idx.u32(i64, i32, i32, i32) + +define void @gpu_exit() { +; AMDGCN-LABEL: @gpu_exit( +; AMDGCN-NEXT: call void @llvm.amdgcn.endpgm() +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @gpu_exit( +; NVPTX-NEXT: call void @llvm.nvvm.exit() +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.exit() + ret void +} + +declare void @llvm.gpu.exit() + +define void @thread_suspend() { +; AMDGCN-LABEL: @thread_suspend( +; AMDGCN-NEXT: call void @llvm.amdgcn.s.sleep(i32 2) ---------------- arsenm wrote:
Why 2?

https://github.com/llvm/llvm-project/pull/131190
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits