Author: Alex Voicu Date: 2024-11-25T10:29:50Z New Revision: 48ec59c234ce267a0454b15e9a79a326e21a4a97
URL: https://github.com/llvm/llvm-project/commit/48ec59c234ce267a0454b15e9a79a326e21a4a97 DIFF: https://github.com/llvm/llvm-project/commit/48ec59c234ce267a0454b15e9a79a326e21a4a97.diff LOG: [llvm][AMDGPU] Fold `llvm.amdgcn.wavefrontsize` early (#114481) Fold `llvm.amdgcn.wavefrontsize` early, during InstCombine, so that its concrete value is used throughout subsequent optimisation passes. Added: llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll Modified: clang/test/CodeGenOpenCL/builtins-amdgcn.cl llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll Removed: ################################################################################ diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 3bc6107b7fd40d..c22a43146a8c89 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -1,6 +1,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s -// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s #pragma OPENCL EXTENSION cl_khr_fp64 : enable @@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu // CHECK-LABEL test_wavefrontsize( unsigned test_wavefrontsize() { - // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() + // CHECK-AMDGCN: ret i32 {{[0-9]+}} + // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() return __builtin_amdgcn_wavefrontsize(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 087de1bed86f76..18a09c39a06387 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1024,6 +1024,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_wavefrontsize: { + if (ST->isWaveSizeKnown()) + return IC.replaceInstUsesWith( + II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); + break; + } case Intrinsic::amdgcn_wqm_vote: { // wqm_vote is identity when the argument is constant. if (!isa<Constant>(II.getArgOperand(0))) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll index 824d3708c027db..33dd2bd540ad06 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -4,29 +4,15 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s -; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: 
opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s - ; GCN-LABEL: {{^}}fold_wavefrontsize: -; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64 ; GCN: store_{{dword|b32}} v{{.+}}, [[V]] -; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() -; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4 -; OPT-NEXT: ret void define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { + bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 store i32 %tmp, ptr addrspace(1) %arg, align 4 @@ -34,18 +20,12 @@ bb: } ; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize: -; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}} ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}} ; GCN-NOT: cndmask ; GCN: store_{{dword|b32}} v{{.+}}, [[V]] -; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() -; OPT: %tmp1 = icmp ugt i32 %tmp, 32 -; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1 -; OPT: store i32 %tmp2, ptr addrspace(1) %arg -; OPT-NEXT: ret void define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { bb: @@ -57,13 +37,6 @@ bb: } ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize: -; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( - -; OPT: bb: -; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() -; OPT: %tmp1 = icmp ugt i32 %tmp, 32 -; OPT: bb3: -; OPT-NEXT: ret void define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { bb: diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll new file mode 100644 index 00000000000000..d9c105f753e264 --- /dev/null +++ 
b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s + +define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]] +; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-W64-NEXT: [[BB:.*:]] +; 
OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + store i32 %tmp, ptr addrspace(1) %arg, align 4 + ret void +} + +define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]] +; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 +; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 +; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + %tmp2 = select i1 %tmp1, i32 2, i32 1 + store i32 %tmp2, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]] +; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 +; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; OPT: [[BB2]]: +; OPT-NEXT: store i32 1, ptr addrspace(1) 
[[ARG]], align 4 +; OPT-NEXT: br label %[[BB3]] +; OPT: [[BB3]]: +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]] +; OPT-W32: [[BB2]]: +; OPT-W32-NEXT: br label %[[BB3]] +; OPT-W32: [[BB3]]: +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]] +; OPT-W64: [[BB2]]: +; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: br label %[[BB3]] +; OPT-W64: [[BB3]]: +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + br i1 %tmp1, label %bb2, label %bb3 + +bb2: ; preds = %bb + store i32 1, ptr addrspace(1) %arg, align 4 + br label %bb3 + +bb3: ; preds = %bb2, %bb + ret void +} + +declare i32 @llvm.amdgcn.wavefrontsize() #0 + +attributes #0 = { nounwind readnone speculatable } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits