[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/136798 >From 2d75ec2eb1a927513bb92bcb26e313a3831426ef Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 23 Apr 2025 09:17:46 -0400 Subject: [PATCH] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 + llvm/test/CodeGen/AMDGPU/alloca-as0.ll| 122 -- .../InferAddressSpaces/AMDGPU/alloca-as0.ll | 35 + 3 files changed, 90 insertions(+), 70 deletions(-) create mode 100644 llvm/test/Transforms/InferAddressSpaces/AMDGPU/alloca-as0.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b6cc5137d711a..2c4052a30b10f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -951,6 +951,9 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, } unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { + if (isa(V)) +return AMDGPUAS::PRIVATE_ADDRESS; + const auto *LD = dyn_cast(V); if (!LD) // TODO: Handle invariant load like constant. return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; diff --git a/llvm/test/CodeGen/AMDGPU/alloca-as0.ll b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll index 9fcb362c153ba..5172ff011e45f 100644 --- a/llvm/test/CodeGen/AMDGPU/alloca-as0.ll +++ b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll @@ -14,7 +14,7 @@ define i32 @static_alloca() { ; ISEL-NEXT:buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; ISEL-NEXT:s_mov_b64 exec, s[18:19] ; ISEL-NEXT:s_addk_i32 s32, 0x400 -; ISEL-NEXT:v_writelane_b32 v40, s16, 4 +; ISEL-NEXT:v_writelane_b32 v40, s16, 3 ; ISEL-NEXT:s_getpc_b64 s[16:17] ; ISEL-NEXT:s_add_u32 s16, s16, bar@rel32@lo+4 ; ISEL-NEXT:s_addc_u32 s17, s17, bar@rel32@hi+12 @@ -27,25 +27,22 @@ define i32 @static_alloca() { ; ISEL-NEXT:v_writelane_b32 v40, s34, 2 ; ISEL-NEXT:s_cselect_b32 s34, s18, 0 ; ISEL-NEXT:s_mov_b64 s[18:19], src_private_base -; ISEL-NEXT:v_writelane_b32 v40, s35, 3 -; ISEL-NEXT:s_cselect_b32 s35, s19, 0 +; ISEL-NEXT:s_cselect_b32 s18, s19, 0 ; ISEL-NEXT:v_mov_b32_e32 v0, s34 -; ISEL-NEXT:v_mov_b32_e32 v1, s35 +; ISEL-NEXT:v_mov_b32_e32 v1, s18 ; ISEL-NEXT:s_swappc_b64 s[30:31], s[16:17] ; ISEL-NEXT:v_mov_b32_e32 v0, s34 -; ISEL-NEXT:v_mov_b32_e32 v1, s35 -; ISEL-NEXT:flat_load_dword v0, v[0:1] -; ISEL-NEXT:v_readlane_b32 s35, v40, 3 +; ISEL-NEXT:buffer_load_dword v0, v0, s[0:3], 0 offen ; ISEL-NEXT:v_readlane_b32 s34, v40, 2 ; ISEL-NEXT:v_readlane_b32 s31, v40, 1 ; ISEL-NEXT:v_readlane_b32 s30, v40, 0 ; ISEL-NEXT:s_mov_b32 s32, s33 -; ISEL-NEXT:v_readlane_b32 s4, v40, 4 +; ISEL-NEXT:v_readlane_b32 s4, v40, 3 ; ISEL-NEXT:s_or_saveexec_b64 s[6:7], -1 ; ISEL-NEXT:buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; ISEL-NEXT:s_mov_b64 exec, s[6:7] ; ISEL-NEXT:s_mov_b32 s33, s4 -; ISEL-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; ISEL-NEXT:s_waitcnt vmcnt(0) ; ISEL-NEXT:s_setpc_b64 s[30:31] ; ; GI-LABEL: static_alloca: @@ -56,35 +53,27 @@ define i32 @static_alloca() { ; GI-NEXT:s_or_saveexec_b64 s[18:19], -1 ; GI-NEXT:buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GI-NEXT:s_mov_b64 exec, s[18:19] -; GI-NEXT:v_writelane_b32 v40, s16, 4 -; GI-NEXT:v_writelane_b32 v40, s30, 0 -; GI-NEXT:v_writelane_b32 v40, s31, 1 +; GI-NEXT:v_writelane_b32 v40, s16, 2 ; GI-NEXT:s_addk_i32 s32, 0x400 -; GI-NEXT:v_writelane_b32 v40, s34, 2 -; GI-NEXT:s_lshr_b32 s34, s33, 6 ; GI-NEXT:s_mov_b64 s[16:17], src_private_base +; GI-NEXT:v_writelane_b32 
v40, s30, 0 ; GI-NEXT:s_getpc_b64 s[18:19] ; GI-NEXT:s_add_u32 s18, s18, bar@rel32@lo+4 ; GI-NEXT:s_addc_u32 s19, s19, bar@rel32@hi+12 ; GI-NEXT:v_lshrrev_b32_e64 v0, 6, s33 ; GI-NEXT:v_mov_b32_e32 v1, s17 -; GI-NEXT:v_writelane_b32 v40, s35, 3 -; GI-NEXT:s_mov_b32 s35, s17 +; GI-NEXT:v_writelane_b32 v40, s31, 1 ; GI-NEXT:s_swappc_b64 s[30:31], s[18:19] -; GI-NEXT:v_mov_b32_e32 v0, s34 -; GI-NEXT:v_mov_b32_e32 v1, s35 -; GI-NEXT:flat_load_dword v0, v[0:1] -; GI-NEXT:v_readlane_b32 s35, v40, 3 -; GI-NEXT:v_readlane_b32 s34, v40, 2 +; GI-NEXT:buffer_load_dword v0, off, s[0:3], s33 ; GI-NEXT:v_readlane_b32 s31, v40, 1 ; GI-NEXT:v_readlane_b32 s30, v40, 0 ; GI-NEXT:s_mov_b32 s32, s33 -; GI-NEXT:v_readlane_b32 s4, v40, 4 +; GI-NEXT:v_readlane_b32 s4, v40, 2 ; GI-NEXT:s_or_saveexec_b64 s[6:7], -1 ; GI-NEXT:buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GI-NEXT:s_mov_b64 exec, s[6:7] ; GI-NEXT:s_mov_b32 s33, s4 -; GI-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GI-NEXT:s_waitcnt vmcnt(0) ; GI-NEXT:s_setpc
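The functional change in the patch above is small; reassembled from the hunk, it reads roughly as follows. This is a sketch, not the full file, and the `isa<AllocaInst>` / `dyn_cast<LoadInst>` template arguments are inferred from the surrounding variable names, since the angle-bracketed text does not survive in the rendering above.

```cpp
unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  // On AMDGPU an alloca always lives in scratch memory, so even an alloca
  // written with an address-space-0 (flat) pointer type can be assumed to
  // be in AS5 (private).
  if (isa<AllocaInst>(V))
    return AMDGPUAS::PRIVATE_ADDRESS;

  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD) // TODO: Handle invariant load like constant.
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // ... the existing load-from-constant handling continues unchanged ...
}
```

With AS5 assumed for allocas, InferAddressSpaces can rewrite flat accesses to such allocas into scratch accesses, which is what the `alloca-as0.ll` churn above shows: `flat_load_dword v0, v[0:1]` becomes a `buffer_load_dword ... offen` scratch load, and the `lgkmcnt(0)` component of the wait drops away because no flat access remains.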
[llvm-branch-commits] [llvm] llvm-reduce: Reduce with early return of arguments (PR #133627)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/133627 >From bded004e4d4dbaf311de6d1bfbb2d443bad023cc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 24 Mar 2025 14:33:36 +0700 Subject: [PATCH 1/2] llvm-reduce: Reduce with early return of arguments Extend the instruction -> return reduction with one that inserts return of function arguments. Not sure how useful this really is. This has more freedom since we could insert the return anywhere in the function, but this just inserts the return in the entry block. --- .../reduce-values-to-return-args.ll | 77 +++ ...-values-to-return-nonvoid-noncallee-use.ll | 2 +- .../llvm-reduce/reduce-values-to-return.ll| 2 +- llvm/tools/llvm-reduce/DeltaPasses.def| 5 +- .../deltas/ReduceValuesToReturn.cpp | 42 +- .../llvm-reduce/deltas/ReduceValuesToReturn.h | 3 +- 6 files changed, 124 insertions(+), 7 deletions(-) create mode 100644 llvm/test/tools/llvm-reduce/reduce-values-to-return-args.ll diff --git a/llvm/test/tools/llvm-reduce/reduce-values-to-return-args.ll b/llvm/test/tools/llvm-reduce/reduce-values-to-return-args.ll new file mode 100644 index 0..abbc643822033 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/reduce-values-to-return-args.ll @@ -0,0 +1,77 @@ +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=arguments-to-return --test FileCheck --test-arg --check-prefixes=INTERESTING --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck --check-prefixes=RESULT %s < %t + + +; INTERESTING-LABEL: @move_entry_block_use_argument_to_return(i32 %arg, ptr %ptr) { +; INTERESTING: %arg + +; RESULT-LABEL: define i32 @move_entry_block_use_argument_to_return( +; RESULT-NEXT: ret i32 %arg +; RESULT-NEXT: } +define void @move_entry_block_use_argument_to_return(i32 %arg, ptr %ptr) { + store i32 %arg, ptr %ptr + ret void +} + +; INTERESTING-LABEL: @move_entry_block_use_argument_to_return_existing_ret(i32 %arg, ptr %ptr) { +; INTERESTING: %arg + +; RESULT-LABEL: define i32 @move_entry_block_use_argument_to_return_existing_ret( +; RESULT-NEXT: ret i32 %arg +; RESULT-NEXT: } +define i32 @move_entry_block_use_argument_to_return_existing_ret(i32 %arg, ptr %ptr) { + store i32 %arg, ptr %ptr + ret i32 0 +} + +; INTERESTING-LABEL: @move_phi_block_use_argument_to_return(i32 %arg, ptr %ptr0, ptr %ptr1, i1 %cond0, i1 %cond1) { +; INTERESTING: %arg + +; RESULT-LABEL: define i32 @move_phi_block_use_argument_to_return( +; RESULT-NEXT: entry: +; RESULT-NEXT: ret i32 %arg +define void @move_phi_block_use_argument_to_return(i32 %arg, ptr %ptr0, ptr %ptr1, i1 %cond0, i1 %cond1) { +entry: + br i1 %cond0, label %bb0, label %bb1 + +bb0: + %phi = phi i32 [ %arg, %entry ], [ 123, %bb1 ] + store i32 %arg, ptr %ptr0 + store i32 %phi, ptr %ptr1 + br label %bb1 + +bb1: + br i1 %cond1, label %bb0, label %bb2 + +bb2: + ret void +} + +; INTERESTING-LABEL: define {{.*}} @keep_second_arg(i32 %arg0, ptr %arg1) { +; INTERESTING: %arg1 + +; RESULT-LABEL: define ptr @keep_second_arg( +; RESULT-NEXT: ret ptr %arg1 +; RESULT-NEXT: } +define void @keep_second_arg(i32 %arg0, ptr %arg1) { + store i32 %arg0, ptr %arg1 + ret void +} + +; INTERESTING-LABEL: @multi_void_return_arg(i1 %arg0, ptr %arg1, i32 %arg2) { +; INTERESTING: i32 %arg2 + +; RESULT-LABEL: define i32 @multi_void_return_arg(i1 %arg0, ptr %arg1, i32 %arg2) { +; RESULT-NEXT: entry: +; RESULT-NEXT: ret i32 %arg2 +define void @multi_void_return_arg(i1 %arg0, ptr %arg1, i32 %arg2) { +entry: + br i1 %arg0, label %bb0, label %bb1 + +bb0: + store i32 %arg2, ptr %arg1 + ret void + +bb1: + ret 
void +} diff --git a/llvm/test/tools/llvm-reduce/reduce-values-to-return-nonvoid-noncallee-use.ll b/llvm/test/tools/llvm-reduce/reduce-values-to-return-nonvoid-noncallee-use.ll index 215ea97a8be91..11166479318c6 100644 --- a/llvm/test/tools/llvm-reduce/reduce-values-to-return-nonvoid-noncallee-use.ll +++ b/llvm/test/tools/llvm-reduce/reduce-values-to-return-nonvoid-noncallee-use.ll @@ -1,7 +1,7 @@ ; Make sure we don't break on non-callee uses of funtions with a ; non-void return type. -; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=values-to-return --test FileCheck --test-arg --check-prefix=INTERESTING --test-arg %s --test-arg --input-file %s -o %t +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=instructions-to-return --test FileCheck --test-arg --check-prefix=INTERESTING --test-arg %s --test-arg --input-file %s -o %t ; RUN: FileCheck --check-prefix=RESULT %s < %t ; INTERESTING-LABEL: @interesting( diff --git a/llvm/test/tools/llvm-reduce/reduce-values-to-return.ll b/llvm/test/tools/llvm-reduce/reduce-values-to-return.ll index 0c36db8ebc278..2af87aad05169 100644 --- a/llvm/test/tools/llvm-reduce/reduce-values-to-return.ll +++ b/llvm/test/tools/llvm-reduce/reduce-values-to-return.ll @@ -1,7 +1,7 @@ ; Test that llvm-reduce can move intermediate values by inserting ;
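For readers unfamiliar with the pass, the new `arguments-to-return` reduction rewrites a function so that it returns one of its arguments straight from the entry block. A minimal sketch of the core rewrite follows, using a hypothetical helper name that is not from the patch; it assumes the function's return type has already been rewritten to match the argument, as the existing values-to-return machinery does.

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical sketch: make F return Arg from its entry block.
// Assumes F's return type already equals Arg's type; a real pass must also
// rewrite the function signature and its callers, and then delete the dead
// instructions and unreachable blocks left behind the new terminator.
static void returnArgFromEntry(Function &F, Argument &Arg) {
  BasicBlock &Entry = F.getEntryBlock();
  // Insert `ret <ty> %arg` ahead of everything else in the entry block.
  ReturnInst::Create(F.getContext(), &Arg, &Entry.front());
}
```

This matches the RESULT checks in the new test above: each reduced function is printed as a `define` whose body is a single `ret` of the kept argument.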
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/136798 >From 9d2612c4379eb827406642b508f2dce32fc13e59 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 23 Apr 2025 09:17:46 -0400 Subject: [PATCH] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 + llvm/test/CodeGen/AMDGPU/alloca-as0.ll| 122 -- .../InferAddressSpaces/AMDGPU/alloca-as0.ll | 35 + 3 files changed, 90 insertions(+), 70 deletions(-) create mode 100644 llvm/test/Transforms/InferAddressSpaces/AMDGPU/alloca-as0.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b6cc5137d711a..2c4052a30b10f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -951,6 +951,9 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, } unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { + if (isa(V)) +return AMDGPUAS::PRIVATE_ADDRESS; + const auto *LD = dyn_cast(V); if (!LD) // TODO: Handle invariant load like constant. return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; diff --git a/llvm/test/CodeGen/AMDGPU/alloca-as0.ll b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll index 9fcb362c153ba..5172ff011e45f 100644 --- a/llvm/test/CodeGen/AMDGPU/alloca-as0.ll +++ b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll @@ -14,7 +14,7 @@ define i32 @static_alloca() { ; ISEL-NEXT:buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; ISEL-NEXT:s_mov_b64 exec, s[18:19] ; ISEL-NEXT:s_addk_i32 s32, 0x400 -; ISEL-NEXT:v_writelane_b32 v40, s16, 4 +; ISEL-NEXT:v_writelane_b32 v40, s16, 3 ; ISEL-NEXT:s_getpc_b64 s[16:17] ; ISEL-NEXT:s_add_u32 s16, s16, bar@rel32@lo+4 ; ISEL-NEXT:s_addc_u32 s17, s17, bar@rel32@hi+12 @@ -27,25 +27,22 @@ define i32 @static_alloca() { ; ISEL-NEXT:v_writelane_b32 v40, s34, 2 ; ISEL-NEXT:s_cselect_b32 s34, s18, 0 ; ISEL-NEXT:s_mov_b64 s[18:19], src_private_base -; ISEL-NEXT:v_writelane_b32 v40, s35, 3 -; ISEL-NEXT:s_cselect_b32 s35, s19, 0 +; ISEL-NEXT:s_cselect_b32 s18, s19, 0 ; ISEL-NEXT:v_mov_b32_e32 v0, s34 -; ISEL-NEXT:v_mov_b32_e32 v1, s35 +; ISEL-NEXT:v_mov_b32_e32 v1, s18 ; ISEL-NEXT:s_swappc_b64 s[30:31], s[16:17] ; ISEL-NEXT:v_mov_b32_e32 v0, s34 -; ISEL-NEXT:v_mov_b32_e32 v1, s35 -; ISEL-NEXT:flat_load_dword v0, v[0:1] -; ISEL-NEXT:v_readlane_b32 s35, v40, 3 +; ISEL-NEXT:buffer_load_dword v0, v0, s[0:3], 0 offen ; ISEL-NEXT:v_readlane_b32 s34, v40, 2 ; ISEL-NEXT:v_readlane_b32 s31, v40, 1 ; ISEL-NEXT:v_readlane_b32 s30, v40, 0 ; ISEL-NEXT:s_mov_b32 s32, s33 -; ISEL-NEXT:v_readlane_b32 s4, v40, 4 +; ISEL-NEXT:v_readlane_b32 s4, v40, 3 ; ISEL-NEXT:s_or_saveexec_b64 s[6:7], -1 ; ISEL-NEXT:buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; ISEL-NEXT:s_mov_b64 exec, s[6:7] ; ISEL-NEXT:s_mov_b32 s33, s4 -; ISEL-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; ISEL-NEXT:s_waitcnt vmcnt(0) ; ISEL-NEXT:s_setpc_b64 s[30:31] ; ; GI-LABEL: static_alloca: @@ -56,35 +53,27 @@ define i32 @static_alloca() { ; GI-NEXT:s_or_saveexec_b64 s[18:19], -1 ; GI-NEXT:buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GI-NEXT:s_mov_b64 exec, s[18:19] -; GI-NEXT:v_writelane_b32 v40, s16, 4 -; GI-NEXT:v_writelane_b32 v40, s30, 0 -; GI-NEXT:v_writelane_b32 v40, s31, 1 +; GI-NEXT:v_writelane_b32 v40, s16, 2 ; GI-NEXT:s_addk_i32 s32, 0x400 -; GI-NEXT:v_writelane_b32 v40, s34, 2 -; GI-NEXT:s_lshr_b32 s34, s33, 6 ; GI-NEXT:s_mov_b64 s[16:17], src_private_base +; GI-NEXT:v_writelane_b32 
v40, s30, 0 ; GI-NEXT:s_getpc_b64 s[18:19] ; GI-NEXT:s_add_u32 s18, s18, bar@rel32@lo+4 ; GI-NEXT:s_addc_u32 s19, s19, bar@rel32@hi+12 ; GI-NEXT:v_lshrrev_b32_e64 v0, 6, s33 ; GI-NEXT:v_mov_b32_e32 v1, s17 -; GI-NEXT:v_writelane_b32 v40, s35, 3 -; GI-NEXT:s_mov_b32 s35, s17 +; GI-NEXT:v_writelane_b32 v40, s31, 1 ; GI-NEXT:s_swappc_b64 s[30:31], s[18:19] -; GI-NEXT:v_mov_b32_e32 v0, s34 -; GI-NEXT:v_mov_b32_e32 v1, s35 -; GI-NEXT:flat_load_dword v0, v[0:1] -; GI-NEXT:v_readlane_b32 s35, v40, 3 -; GI-NEXT:v_readlane_b32 s34, v40, 2 +; GI-NEXT:buffer_load_dword v0, off, s[0:3], s33 ; GI-NEXT:v_readlane_b32 s31, v40, 1 ; GI-NEXT:v_readlane_b32 s30, v40, 0 ; GI-NEXT:s_mov_b32 s32, s33 -; GI-NEXT:v_readlane_b32 s4, v40, 4 +; GI-NEXT:v_readlane_b32 s4, v40, 2 ; GI-NEXT:s_or_saveexec_b64 s[6:7], -1 ; GI-NEXT:buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GI-NEXT:s_mov_b64 exec, s[6:7] ; GI-NEXT:s_mov_b32 s33, s4 -; GI-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GI-NEXT:s_waitcnt vmcnt(0) ; GI-NEXT:s_setpc
[llvm-branch-commits] [llvm] [llvm] Add option to emit `callgraph` section (PR #87574)
https://github.com/Prabhuk updated https://github.com/llvm/llvm-project/pull/87574 >From 1d7ee612e408ee7e64e984eb08e6d7089a435d09 Mon Sep 17 00:00:00 2001 From: Necip Fazil Yildiran Date: Sun, 2 Feb 2025 00:58:49 + Subject: [PATCH 1/4] Simplify MIR test. Created using spr 1.3.6-beta.1 --- .../CodeGen/MIR/X86/call-site-info-typeid.mir | 21 ++- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir index 5ab797bfcc18f..a99ee50a608fb 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir @@ -8,11 +8,6 @@ # CHECK-NEXT: 123456789 } --- | - ; ModuleID = 'test.ll' - source_filename = "test.ll" - target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - target triple = "x86_64-unknown-linux-gnu" - define dso_local void @foo(i8 signext %a) { entry: ret void @@ -21,10 +16,10 @@ define dso_local i32 @main() { entry: %retval = alloca i32, align 4 -%fp = alloca void (i8)*, align 8 -store i32 0, i32* %retval, align 4 -store void (i8)* @foo, void (i8)** %fp, align 8 -%0 = load void (i8)*, void (i8)** %fp, align 8 +%fp = alloca ptr, align 8 +store i32 0, ptr %retval, align 4 +store ptr @foo, ptr %fp, align 8 +%0 = load ptr, ptr %fp, align 8 call void %0(i8 signext 97) ret i32 0 } @@ -42,12 +37,8 @@ body: | name:main tracksRegLiveness: true stack: - - { id: 0, name: retval, type: default, offset: 0, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: fp, type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 0, name: retval, size: 4, alignment: 4 } + - { id: 1, name: fp, size: 8, alignment: 8 } callSites: - { bb: 0, offset: 6, fwdArgRegs: [], typeId: 123456789 } >From 86e2c9dc37170499252ed50c6bbef2931e106fbb Mon Sep 17 00:00:00 2001 From: prabhukr Date: Thu, 13 Mar 2025 01:03:40 + Subject: [PATCH 2/4] Add requested tests part 1. Created using spr 1.3.6-beta.1 --- ...te-info-ambiguous-indirect-call-typeid.mir | 145 ++ .../call-site-info-direct-calls-typeid.mir| 145 ++ 2 files changed, 290 insertions(+) create mode 100644 llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir create mode 100644 llvm/test/CodeGen/MIR/X86/call-site-info-direct-calls-typeid.mir diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir new file mode 100644 index 0..9d1b099cc9093 --- /dev/null +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir @@ -0,0 +1,145 @@ +# Test MIR printer and parser for type id field in callSites. It is used +# for propogating call site type identifiers to emit in the call graph section. 
+ +# RUN: llc --call-graph-section %s -run-pass=none -o - | FileCheck %s +# CHECK: name: main +# CHECK: callSites: +# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [] +# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], typeId: +# CHECK-NEXT: 1234567890 } + +--- | + ; Function Attrs: mustprogress noinline nounwind optnone uwtable + define dso_local noundef i32 @_Z3addii(i32 noundef %a, i32 noundef %b) #0 !type !6 !type !6 { + entry: +%a.addr = alloca i32, align 4 +%b.addr = alloca i32, align 4 +store i32 %a, ptr %a.addr, align 4 +store i32 %b, ptr %b.addr, align 4 +%0 = load i32, ptr %a.addr, align 4 +%1 = load i32, ptr %b.addr, align 4 +%add = add nsw i32 %0, %1 +ret i32 %add + } + + ; Function Attrs: mustprogress noinline nounwind optnone uwtable + define dso_local noundef i32 @_Z8multiplyii(i32 noundef %a, i32 noundef %b) #0 !type !6 !type !6 { + entry: +%a.addr = alloca i32, align 4 +%b.addr = alloca i32, align 4 +store i32 %a, ptr %a.addr, align 4 +store i32 %b, ptr %b.addr, align 4 +%0 = load i32, ptr %a.addr, align 4 +%1 = load i32, ptr %b.addr, align 4 +%mul = mul nsw i32 %0, %1 +ret i32 %mul + } + + ; Function Attrs: mustprogress noinline nounwind optnone uwtable + define dso_local noundef ptr @_Z13get_operationb(i1 noundef zeroext %is_addition) #0 !type !7 !type !7 { + entry: +%is_addition.addr = alloca i8, align 1 +%storedv = zext i1 %is_addition to i8 +store i8 %storedv, ptr %is_addition.addr, align 1 +%0 = load i8, ptr %is_addition.addr, align 1 +%loadedv = trunc i8 %0 to i1 +br i1 %loade
[llvm-branch-commits] LiveRangeShrink: Early exit when encountering a code motion barrier. (PR #136806)
@@ -95,14 +95,24 @@ static MachineInstr *FindDominatedInstruction(MachineInstr &New, return Old; } +static bool isCodeMotionBarrier(MachineInstr &MI) { + return MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe(); +} + /// Builds Instruction to its dominating order number map \p M by traversing /// from instruction \p Start. static void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) { M.clear(); unsigned i = 0; - for (MachineInstr &I : make_range(Start, Start->getParent()->end())) + bool SawStore = false; + for (MachineInstr &I : make_range(Start, Start->getParent()->end())) { +if (I.mayStore()) + SawStore = true; +if (!I.isSafeToMove(SawStore) && isCodeMotionBarrier(I)) pcc wrote: Correct, updated this code to only call one. While reading the code I observed that the code to update SawStore in this pass is redundant with what isSafeToMove is doing, so I removed it. https://github.com/llvm/llvm-project/pull/136806 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAssumedAddrSpace` to get address space for `AllocaInst` (PR #136865)
@@ -12603,6 +12603,18 @@ struct AAAddressSpaceImpl : public AAAddressSpace { auto CheckAddressSpace = [&](Value &Obj) { if (isa(&Obj)) return true; + // Some targets relax the requirement for alloca to be in an exact address + // space, allowing it in certain other address spaces instead. These + // targets later lower alloca to the correct address space in the + // pipeline. Therefore, we need to query TTI to determine the appropriate + // address space. + if (auto *AI = dyn_cast(&Obj)) { +Function *Fn = AI->getFunction(); +auto *TTI = +A.getInfoCache().getAnalysisResultForFunction( +*Fn); +return takeAddressSpace(TTI->getAssumedAddrSpace(AI)); shiltian wrote: This file is in middle end, so we can't access target machine, same as InferAddressSpacePass. https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
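The hunk above, with the template arguments restored, shows the standard way for a middle-end component to reach target knowledge. In this sketch, `<AllocaInst>` and `<TargetIRAnalysis>` are assumptions inferred from the `AI`/`TTI` variable names, since the angle-bracketed text was dropped in rendering:

```cpp
// Sketch of the quoted hunk; the template arguments are inferred, not quoted.
if (auto *AI = dyn_cast<AllocaInst>(&Obj)) {
  Function *Fn = AI->getFunction();
  auto *TTI = A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
      *Fn);
  return takeAddressSpace(TTI->getAssumedAddrSpace(AI));
}
```

TargetTransformInfo is an analysis result, so it is available to middle-end passes like the Attributor and InferAddressSpaces; `TargetMachine` itself is a backend object they cannot depend on, which is the point being made here.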
[llvm-branch-commits] [llvm] [Attributor] Use `getAssumedAddrSpace` to get address space for `AllocaInst` (PR #136865)
shiltian wrote: > In the real world, people emit address space 0 allocas all over the place and > then report backend bugs when it fails in codegen Technically we can avoid that by just emitting a hard error. https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Fix formatting (PR #136847)
https://github.com/rovka closed https://github.com/llvm/llvm-project/pull/136847 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Propagate DebugLocs on phis in BreakCriticalEdges (PR #133492)
OCHyams wrote: > Seems fine; although why's it needed for key instructions, I have a vague > recollection that LLVM doesn't actually care about the source locations > attached to PHIs? The motivation came from reviewing code duplication sites to update for Key Instructions, finding this, trying to generate a test case and seeing the DebugLocs aren't propagated. That is to say it's not massively principled, and I can't remember off-hand whether this affected any of the "real code" I tested the feature on (I have a feeling the answer is no). I'm also not 100% sure if there's a good "policy" in place for PHI debug locs (paging @SLTozer) - I made this change erring on the side of correct-but-maybe-unnecessary. If we know it's unnecessary, we can ditch this patch and the next. https://github.com/llvm/llvm-project/pull/133492 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [NFC][RootSignatures] Conform to new std::optional calling conventions (PR #136747)
damyanp wrote: Is this ready to retarget to main? https://github.com/llvm/llvm-project/pull/136747 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL] Allow resource annotations to specify only register space (PR #135287)
@@ -4723,20 +4723,25 @@ def HLSLResourceBinding: InheritableAttr { private: RegisterType RegType; - unsigned SlotNumber; + int SlotNumber; // -1 if the register slot was not specified hekota wrote: Good catch! DXC actually ignores the register `u4294967295` and uses `u0` instead, but that is a bug. :) We should support the whole `uint32_t` range. https://github.com/llvm/llvm-project/pull/135287 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm] Extract and propagate indirect call type id (PR #87575)
https://github.com/Prabhuk updated https://github.com/llvm/llvm-project/pull/87575 >From 1a8d810d352fbe84c0521c7614689b60ade693c8 Mon Sep 17 00:00:00 2001 From: Necip Fazil Yildiran Date: Tue, 19 Nov 2024 15:25:34 -0800 Subject: [PATCH 1/5] Fixed the tests and addressed most of the review comments. Created using spr 1.3.6-beta.1 --- llvm/include/llvm/CodeGen/MachineFunction.h | 15 +++-- .../CodeGen/AArch64/call-site-info-typeid.ll | 28 +++-- .../test/CodeGen/ARM/call-site-info-typeid.ll | 28 +++-- .../CodeGen/MIR/X86/call-site-info-typeid.ll | 58 --- .../CodeGen/MIR/X86/call-site-info-typeid.mir | 13 ++--- .../CodeGen/Mips/call-site-info-typeid.ll | 28 +++-- .../test/CodeGen/X86/call-site-info-typeid.ll | 28 +++-- 7 files changed, 71 insertions(+), 127 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index bb0b87a3a04a3..44633df38a651 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -493,7 +493,7 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction { /// Callee type id. ConstantInt *TypeId = nullptr; -CallSiteInfo() {} +CallSiteInfo() = default; /// Extracts the numeric type id from the CallBase's type operand bundle, /// and sets TypeId. This is used as type id for the indirect call in the @@ -503,12 +503,11 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction { if (!CB.isIndirectCall()) return; - auto Opt = CB.getOperandBundle(LLVMContext::OB_type); - if (!Opt.has_value()) { -errs() << "warning: cannot find indirect call type operand bundle for " - "call graph section\n"; + std::optional Opt = + CB.getOperandBundle(LLVMContext::OB_type); + // Return if the operand bundle for call graph section cannot be found. + if (!Opt.has_value()) return; - } // Get generalized type id string auto OB = Opt.value(); @@ -520,9 +519,9 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction { "invalid type identifier"); // Compute numeric type id from generalized type id string - uint64_t TypeIdVal = llvm::MD5Hash(TypeIdStr->getString()); + uint64_t TypeIdVal = MD5Hash(TypeIdStr->getString()); IntegerType *Int64Ty = Type::getInt64Ty(CB.getContext()); - TypeId = llvm::ConstantInt::get(Int64Ty, TypeIdVal, /*IsSigned=*/false); + TypeId = ConstantInt::get(Int64Ty, TypeIdVal, /*IsSigned=*/false); } }; diff --git a/llvm/test/CodeGen/AArch64/call-site-info-typeid.ll b/llvm/test/CodeGen/AArch64/call-site-info-typeid.ll index f0a6b44755c5c..f3b98c2c7a395 100644 --- a/llvm/test/CodeGen/AArch64/call-site-info-typeid.ll +++ b/llvm/test/CodeGen/AArch64/call-site-info-typeid.ll @@ -1,14 +1,9 @@ -; Tests that call site type ids can be extracted and set from type operand -; bundles. +;; Tests that call site type ids can be extracted and set from type operand +;; bundles. -; Verify the exact typeId value to ensure it is not garbage but the value -; computed as the type id from the type operand bundle. -; RUN: llc --call-graph-section -mtriple aarch64-linux-gnu %s -stop-before=finalize-isel -o - | FileCheck %s - -; ModuleID = 'test.c' -source_filename = "test.c" -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-gnu" +;; Verify the exact typeId value to ensure it is not garbage but the value +;; computed as the type id from the type operand bundle. 
+; RUN: llc --call-graph-section -mtriple aarch64-linux-gnu < %s -stop-before=finalize-isel -o - | FileCheck %s define dso_local void @foo(i8 signext %a) !type !3 { entry: @@ -19,10 +14,10 @@ entry: define dso_local i32 @main() !type !4 { entry: %retval = alloca i32, align 4 - %fp = alloca void (i8)*, align 8 - store i32 0, i32* %retval, align 4 - store void (i8)* @foo, void (i8)** %fp, align 8 - %0 = load void (i8)*, void (i8)** %fp, align 8 + %fp = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store ptr @foo, ptr %fp, align 8 + %0 = load ptr, ptr %fp, align 8 ; CHECK: callSites: ; CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], typeId: ; CHECK-NEXT: 7854600665770582568 } @@ -30,10 +25,5 @@ entry: ret i32 0 } -!llvm.module.flags = !{!0, !1, !2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"uwtable", i32 1} -!2 = !{i32 7, !"frame-pointer", i32 2} !3 = !{i64 0, !"_ZTSFvcE.generalized"} !4 = !{i64 0, !"_ZTSFiE.generalized"} diff --git a/llvm/test/CodeGen/ARM/call-site-info-typeid.ll b/llvm/test/CodeGen/ARM/call-site-info-typeid.ll index ec7f8a425051b..9feeef9a564cc 100644 --- a/llvm/test/CodeGen/ARM/call-site-info-typeid.ll +++ b/llvm/test/CodeGen/ARM/call-site-info-typeid.ll @@ -1,14 +1,9 @@ -; Tests that call site type ids can be extracted and set from type operand -; bundles. +;; Tests that ca
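The `typeId` values the tests check (for example `7854600665770582568` for `_ZTSFvcE.generalized`) come straight from the `MD5Hash` call visible in the `MachineFunction.h` hunk above. Below is a self-contained sketch of that computation, useful for verifying CHECK lines by hand; the expected printed value is taken from the AArch64 test above, not computed here.

```cpp
#include "llvm/Support/MD5.h"
#include <cstdint>
#include <iostream>

int main() {
  // Same computation as CallSiteInfo::extractTypeId in the patch: hash the
  // generalized type-id string taken from the call's type operand bundle.
  std::uint64_t Id = llvm::MD5Hash("_ZTSFvcE.generalized");
  std::cout << Id << '\n'; // per the test's CHECK line: 7854600665770582568
  return 0;
}
```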
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
https://github.com/jmorse edited https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [DirectX] Adding support for Root Descriptor in Obj2yaml/Yaml2Obj (PR #136732)
@@ -594,6 +599,25 @@ struct RootConstants { sys::swapByteOrder(Num32BitValues); } }; +struct RootDescriptor_V1_0 { inbelic wrote: Having poked a bit more in `DXContainer.h`, maybe it would be best to follow how it is done for `RuntimeInfo`? Defining a new namespace for each version and then having each later version inherit from the previous version. So something like: ``` namespace v0 { struct RootDescriptor { uint32_t ShaderRegister; uint32_t RegisterSpace; void swapbytes() {...} }; } namespace v1 { struct RootDescriptor : public v0::RootDescriptor { uint32_t Flags; void swapbytes() {...} }; } ``` https://github.com/llvm/llvm-project/pull/136732 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
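Filling in the swap bodies using the `sys::swapByteOrder` pattern visible at the top of the hunk, the reviewer's suggestion would plausibly expand to the following. This is a sketch, not the final patch; the method name `swapBytes` and the field bodies are assumptions based on the existing `DXContainer.h` conventions.

```cpp
#include "llvm/Support/SwapByteOrder.h"
#include <cstdint>

namespace v0 {
struct RootDescriptor {
  uint32_t ShaderRegister;
  uint32_t RegisterSpace;
  void swapBytes() {
    llvm::sys::swapByteOrder(ShaderRegister);
    llvm::sys::swapByteOrder(RegisterSpace);
  }
};
} // namespace v0

namespace v1 {
struct RootDescriptor : public v0::RootDescriptor {
  uint32_t Flags; // new in v1
  void swapBytes() {
    v0::RootDescriptor::swapBytes(); // swap the inherited v0 fields first
    llvm::sys::swapByteOrder(Flags);
  }
};
} // namespace v1
```

Versioned readers can then pick the struct matching the serialized root-signature version while sharing the v0 field handling.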
[llvm-branch-commits] [llvm] Propagate DebugLocs on phis in BreakCriticalEdges (PR #133492)
jmorse wrote: It can't hurt, let's make things slightly more correct! https://github.com/llvm/llvm-project/pull/133492 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Add parsing of ShaderVisibility to DescriptorTable (PR #136751)
https://github.com/bogner approved this pull request. https://github.com/llvm/llvm-project/pull/136751 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Backport to 20.x "[clang][analyzer] Fix error path of builtin overflow (#136345)" (PR #136589)
https://github.com/steakhal edited https://github.com/llvm/llvm-project/pull/136589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Backport to 20.x "[clang][analyzer] Fix error path of builtin overflow (#136345)" (PR #136589)
steakhal wrote: Since I did the update to this PR the way I wanted, I invite another code owner to approve. /cc @Xazax-hun https://github.com/llvm/llvm-project/pull/136589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][AsmPrinter] Emit call graph section (PR #87576)
@@ -1867,6 +1867,30 @@ static StringRef getMIMnemonic(const MachineInstr &MI, MCStreamer &Streamer) { return Name; } +void AsmPrinter::emitIndirectCalleeLabels( +FunctionInfo &FuncInfo, +const MachineFunction::CallSiteInfoMap &CallSitesInfoMap, +MachineInstr &MI) { + // Only indirect calls have type identifiers set. + const auto &CallSiteInfo = CallSitesInfoMap.find(&MI); + if (CallSiteInfo == CallSitesInfoMap.end()) +return; + if (CallSiteInfo->second.CalleeTypeIds.empty()) +return; arsenm wrote: ```suggestion ``` Don't need this empty check https://github.com/llvm/llvm-project/pull/87576 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][AsmPrinter] Emit call graph section (PR #87576)
@@ -356,6 +356,13 @@ class AsmPrinter : public MachineFunctionPass { DwarfUsesRelocationsAcrossSections = Enable; } + /// Generate and emit labels for callees of the indirect callsites which will + /// be used to populate the .callgraph section. + void emitIndirectCalleeLabels( + FunctionInfo &FuncInfo, + const MachineFunction::CallSiteInfoMap &CallSitesInfoMap, + MachineInstr &MI); arsenm wrote: ```suggestion const MachineInstr &MI); ``` https://github.com/llvm/llvm-project/pull/87576 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][AsmPrinter] Emit call graph section (PR #87576)
@@ -1867,6 +1867,30 @@ static StringRef getMIMnemonic(const MachineInstr &MI, MCStreamer &Streamer) { return Name; } +void AsmPrinter::emitIndirectCalleeLabels( +FunctionInfo &FuncInfo, +const MachineFunction::CallSiteInfoMap &CallSitesInfoMap, +MachineInstr &MI) { + // Only indirect calls have type identifiers set. + const auto &CallSiteInfo = CallSitesInfoMap.find(&MI); + if (CallSiteInfo == CallSitesInfoMap.end()) +return; + if (CallSiteInfo->second.CalleeTypeIds.empty()) +return; + + for (auto *CalleeTypeId : CallSiteInfo->second.CalleeTypeIds) { arsenm wrote: No auto https://github.com/llvm/llvm-project/pull/87576 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
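Taken together, the three suggestions in this thread (drop the redundant `empty()` check, take `const MachineInstr &`, and spell out the loop variable's type) would leave the hook looking roughly like this. The element type `ConstantInt *` is an assumption matching the `CalleeTypeIds` usage in the sibling call-site-info patch; the loop body is not quoted in the review, so it is elided.

```cpp
void AsmPrinter::emitIndirectCalleeLabels(
    FunctionInfo &FuncInfo,
    const MachineFunction::CallSiteInfoMap &CallSitesInfoMap,
    const MachineInstr &MI) {
  // Only indirect calls have callee type identifiers attached.
  auto CallSiteInfo = CallSitesInfoMap.find(&MI);
  if (CallSiteInfo == CallSitesInfoMap.end())
    return;

  // No explicit empty() check: iterating an empty range is simply a no-op.
  for (ConstantInt *CalleeTypeId : CallSiteInfo->second.CalleeTypeIds) {
    // ... emit a .callgraph label for this callee type id (body not shown
    // in the review hunks above) ...
    (void)CalleeTypeId;
  }
}
```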
[llvm-branch-commits] LiveRangeShrink: Early exit when encountering a code motion barrier. (PR #136806)
@@ -95,14 +95,24 @@ static MachineInstr *FindDominatedInstruction(MachineInstr &New, return Old; } +static bool isCodeMotionBarrier(MachineInstr &MI) { + return MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe(); +} + /// Builds Instruction to its dominating order number map \p M by traversing /// from instruction \p Start. static void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) { M.clear(); unsigned i = 0; - for (MachineInstr &I : make_range(Start, Start->getParent()->end())) + bool SawStore = false; + for (MachineInstr &I : make_range(Start, Start->getParent()->end())) { +if (I.mayStore()) + SawStore = true; +if (!I.isSafeToMove(SawStore) && isCodeMotionBarrier(I)) arsenm wrote: The MI.hasUnmodeledSideEffects() is redundant with the isSafeToMove check? https://github.com/llvm/llvm-project/pull/136806 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Fix formatting (PR #136847)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Diana Picus (rovka) Changes --- Full diff: https://github.com/llvm/llvm-project/pull/136847.diff 2 Files Affected: - (modified) llvm/lib/CodeGen/PrologEpilogInserter.cpp (+4-4) - (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+1-1) ``diff diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 9b852c0fd49cf..ac4090252cea0 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -608,9 +608,9 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MCRegister Reg = CS.getReg(); if (CS.isSpilledToReg()) { -BuildMI(SaveBlock, I, DebugLoc(), -TII.get(TargetOpcode::COPY), CS.getDstReg()) - .addReg(Reg, getKillRegState(true)); +BuildMI(SaveBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), +CS.getDstReg()) +.addReg(Reg, getKillRegState(true)); } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, @@ -637,7 +637,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, MCRegister Reg = CI.getReg(); if (CI.isSpilledToReg()) { BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg) - .addReg(CI.getDstReg(), getKillRegState(true)); +.addReg(CI.getDstReg(), getKillRegState(true)); } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index c1ac9491b2363..7838fd91a94da 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2510,7 +2510,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); if (IsWWMRegSpill) { TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), - RS->isRegUsed(AMDGPU::SCC)); + RS->isRegUsed(AMDGPU::SCC)); } buildSpillLoadStore( `` https://github.com/llvm/llvm-project/pull/136847 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm] Extract and propagate indirect call type id (PR #87575)
@@ -0,0 +1,19 @@ +;; Tests that call site callee type ids can be extracted and set from +;; callee_type metadata for indirect tail calls. + +;; Verify the exact calleeTypeId value to ensure it is not garbage but the value +;; computed as the type id from the callee_type metadata. +; RUN: llc --call-graph-section -mtriple arm-linux-gnu < %s -stop-after=finalize-isel -o - | FileCheck %s + +define dso_local noundef i32 @_Z13call_indirectPFicEc(ptr noundef readonly captures(none) %func, i8 noundef signext %x) local_unnamed_addr !type !0 { +entry: arsenm wrote: ```suggestion define i32 @_Z13call_indirectPFicEc(ptr %func, i8 %x) local_unnamed_addr !type !0 { entry: ``` Remove unnecessary attributes https://github.com/llvm/llvm-project/pull/87575 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] 334e3a8 - Revert "[AMDGPU] Support block load/store for CSR (#130013)"
Author: Diana Picus Date: 2025-04-23T13:04:37+02:00 New Revision: 334e3a844e6b02e400cc83fed2f71b3fe273a42e URL: https://github.com/llvm/llvm-project/commit/334e3a844e6b02e400cc83fed2f71b3fe273a42e DIFF: https://github.com/llvm/llvm-project/commit/334e3a844e6b02e400cc83fed2f71b3fe273a42e.diff LOG: Revert "[AMDGPU] Support block load/store for CSR (#130013)" This reverts commit 4a58071d87265dfccba72134b25cf4d1595d98c5. Added: Modified: llvm/include/llvm/CodeGen/MachineFrameInfo.h llvm/include/llvm/CodeGen/TargetFrameLowering.h llvm/lib/CodeGen/PrologEpilogInserter.cpp llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp llvm/lib/Target/AMDGPU/AMDGPU.td llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp llvm/lib/Target/AMDGPU/GCNSubtarget.h llvm/lib/Target/AMDGPU/SIFrameLowering.cpp llvm/lib/Target/AMDGPU/SIFrameLowering.h llvm/lib/Target/AMDGPU/SIInstrInfo.h llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp llvm/lib/Target/AMDGPU/SIRegisterInfo.h llvm/unittests/Target/AMDGPU/CMakeLists.txt Removed: llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h index 9d1b536d23331..172c3e8c9a847 100644 --- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h +++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h @@ -61,7 +61,6 @@ class CalleeSavedInfo { MCRegister getReg() const { return Reg; } int getFrameIdx()const { return FrameIdx; } MCRegister getDstReg() const { return DstReg; } - void setReg(MCRegister R) { Reg = R; } void setFrameIdx(int FI) { FrameIdx = FI; SpilledToReg = false; diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index 58b63f1769003..cdbefb36c00c7 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -270,14 +270,6 @@ class TargetFrameLowering { return false; } - /// spillCalleeSavedRegister - Default implementation for spilling a single - /// callee saved register. - void spillCalleeSavedRegister(MachineBasicBlock &SaveBlock, -MachineBasicBlock::iterator MI, -const CalleeSavedInfo &CS, -const TargetInstrInfo *TII, -const TargetRegisterInfo *TRI) const; - /// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee /// saved registers and returns true if it isn't possible / profitable to do /// so by issuing a series of load instructions via loadRegToStackSlot(). @@ -292,15 +284,6 @@ class TargetFrameLowering { return false; } - // restoreCalleeSavedRegister - Default implementation for restoring a single - // callee saved register. Should be called in reverse order. Can insert - // multiple instructions. - void restoreCalleeSavedRegister(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const CalleeSavedInfo &CS, - const TargetInstrInfo *TII, - const TargetRegisterInfo *TRI) const; - /// hasFP - Return true if the specified function should have a dedicated /// frame pointer register. For most targets this is true only if the function /// has variable sized allocas or if frame pointer elimination is disabled. 
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 0cd25c4feb8b9..9b852c0fd49cf 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -476,8 +476,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, // Now that we know which registers need to be saved and restored, allocate // stack slots for them. for (auto &CS : CSI) { - // If the target has spilled this register to another register or already - // handled it , we don't need to allocate a stack slot. + // If the target has spilled this register to another register, we don't + // need to allocate a stack slot. if (CS.isSpilledToReg()) continue; @@ -597,14 +597,25 @@ static void updateLiveness(MachineFunction &MF) { static void insertCSRSaves(MachineBasicBlock &SaveBlock, ArrayRef CSI) { MachineFunction &MF = *SaveBlock.getParent(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const TargetInstrInfo &TII = *
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -2333,6 +2352,170 @@ DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { return PrologEndLoc; } +void DwarfDebug::findKeyInstructions(const MachineFunction *MF) { + // New function - reset KeyInstructions. + KeyInstructions.clear(); + + // The current candidate is_stmt instructions for each source atom. + // Map {(InlinedAt, Group): (Rank, Instructions)}. + DenseMap, + std::pair>> + GroupCandidates; + + // For each instruction: + // * Skip insts without DebugLoc, AtomGroup or AtomRank, and line zeros. + // * Check if insts in this group have been seen already in GroupCandidates. + // * If this instr rank is equal, add this instruction to KeyInstructions. + // Remove existing instructions from KeyInstructions if they have the + // same parent. + // * If this instr rank is higher (lower precedence), ignore it. + // * If this instr rank is lower (higher precedence), erase existing + // instructions from KeyInstructions. Add this instr to KeyInstructions. + + for (auto &MBB : *MF) { +// Rather than apply is_stmt directly to Key Instructions, we "float" +// is_stmt up to the 1st instruction with the same line number in a +// contiguous block. That instruction is called the "buoy". The +// buoy gets reset if we encouner an instruction with an atom +// group. +const MachineInstr *Buoy = nullptr; +// The atom group number associated with Buoy which may be 0 if we haven't +// encountered an atom group yet in this blob of instructions with the same +// line number. +uint64_t BuoyAtom = 0; + +for (auto &MI : MBB) { + if (MI.isMetaInstruction()) +continue; + + if (!MI.getDebugLoc() || !MI.getDebugLoc().getLine()) +continue; + + // Reset the Buoy to this instruciton if it has a different line number. + if (!Buoy || + Buoy->getDebugLoc().getLine() != MI.getDebugLoc().getLine()) { +Buoy = &MI; +BuoyAtom = 0; + } + + // Call instructions are handled specially - we always mark them as key + // regardless of atom info. + const auto &TII = + *MI.getParent()->getParent()->getSubtarget().getInstrInfo(); + if (MI.isCall() || TII.isTailCall(MI)) { +assert(MI.getDebugLoc() && "Unexpectedly missing DL"); + +// Calls are always key. +KeyInstructions.insert(Buoy); + +uint64_t Group = MI.getDebugLoc()->getAtomGroup(); +uint8_t Rank = MI.getDebugLoc()->getAtomRank(); +if (Group && Rank) { + auto *InlinedAt = MI.getDebugLoc()->getInlinedAt(); + auto &[CandidateRank, CandidateInsts] = GroupCandidates[{InlinedAt, Group}]; + + // This looks similar to the non-call handling code, except that + // we don't put the call into CandidateInsts so that they can't be + // made un-key. As a result, we also have to take special care not + // to erase the is_stmt from the buoy, and prevent that happening + // in the future. + + if (CandidateRank == Rank) { +// We've seen other instructions in this group of this rank. Discard +// ones we've seen in this block, keep the others. +assert(!CandidateInsts.empty()); +SmallVector Insts; +Insts.reserve(CandidateInsts.size()); +for (auto &PrevInst : CandidateInsts) { + if (PrevInst->getParent() != MI.getParent()) +Insts.push_back(PrevInst); + else if (PrevInst != Buoy) +KeyInstructions.erase(PrevInst); +} + +if (Insts.empty()) { + CandidateInsts.clear(); + CandidateRank = 0; +} else { + CandidateInsts = std::move(Insts); +} + + } else if (CandidateRank > Rank) { +// We've seen other instructions in this group of lower precedence +// (higher rank). Discard them. +for (auto *Supplanted : CandidateInsts) { + // Don't erase the is_stmt we're using for this call. 
+ if (Supplanted != Buoy) +KeyInstructions.erase(Supplanted); +} +CandidateInsts.clear(); +CandidateRank = 0; + } +} + +// Avoid floating any future is_stmts up to the call. +Buoy = nullptr; +continue; + } + + auto *InlinedAt = MI.getDebugLoc()->getInlinedAt(); + uint64_t Group = MI.getDebugLoc()->getAtomGroup(); + uint8_t Rank = MI.getDebugLoc()->getAtomRank(); + if (!Group || !Rank) +continue; + + // Don't let is_stmts float past instructions from different source atoms. + if (BuoyAtom && BuoyAtom != Group) { +Buoy = &MI; +BuoyAtom = MI.getDebugLoc()->getAtomGroup(); jmorse wrote: ```suggestion BuoyAtom = Group; ``` Avo
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect signing oracles (PR #134146)
https://github.com/kbeyls edited https://github.com/llvm/llvm-project/pull/134146 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -0,0 +1,78 @@ +# RUN: llc %s --start-after=livedebugvalues --dwarf-use-key-instructions --filetype=obj -o - \ +# RUN: | llvm-objdump -d - --no-show-raw-insn \ +# RUN: | FileCheck %s --check-prefix=OBJ + +# RUN: llc %s --start-after=livedebugvalues --dwarf-use-key-instructions --filetype=obj -o - \ +# RUN: | llvm-dwarfdump - --debug-line \ +# RUN: | FileCheck %s --check-prefix=DBG + +# OBJ: <_Z1fPiii>: +# OBJ-NEXT: 0: movl$0x0, %ebx +# OBJ-NEXT: 5: movl$0x1, %ebx +# OBJ-NEXT: a: movl$0x2, %ebx +# OBJ-NEXT: f: movl$0x3, %ebx +# OBJ-NEXT: 14: movl$0x4, %eax +# OBJ-NEXT: 19: movl$0x5, %eax +# OBJ-NEXT: 1e: movl$0x6, %eax +# OBJ-NEXT: 23: movl$0x7, %eax +# OBJ-NEXT: 28: retq + +# DBG: AddressLine Column File ISA Discriminator OpIndex Flags +# DBG-NEXT: -- -- -- -- --- - --- - +# DBG-NEXT: 0x 1 0 0 0 0 0 is_stmt prologue_end +# DBG-NEXT: 0x0005 2 0 0 0 0 0 is_stmt +# DBG-NEXT: 0x000a 2 0 0 0 0 0 +# DBG-NEXT: 0x000f 2 0 0 0 0 0 +# DBG-NEXT: 0x0014 2 0 0 0 0 0 +# DBG-NEXT: 0x0019 2 0 0 0 0 0 +# DBG-NEXT: 0x001e 2 0 0 0 0 0 is_stmt +# DBG-NEXT: 0x0023 2 0 0 0 0 0 is_stmt +# DBG-NEXT: 0x0029 2 0 0 0 0 0 is_stmt end_sequence + +## Check that interleaving atoms on the same line still produces reasonable +## is_stmt placement (the is_stmts want to "float up" to the first instruction +## in a contiguous set with the same line, but we don't let them float past +## other atom groups). + +--- | + target triple = "x86_64-unknown-linux-gnu" + + define hidden noundef i32 @_Z1fPiii(ptr %a, i32 %b, i32 %c, i1 %cond) local_unnamed_addr !dbg !5 { + entry: +ret i32 2 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2, !3} + !llvm.ident = !{!4} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_17, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) + !1 = !DIFile(filename: "test.cpp", directory: "/") + !2 = !{i32 7, !"Dwarf Version", i32 5} + !3 = !{i32 2, !"Debug Info Version", i32 3} + !4 = !{!"clang version 19.0.0"} + !5 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !6, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) + !6 = !DISubroutineType(types: !7) + !7 = !{} + !8 = !DILocalVariable(name: "x", scope: !5, file: !1, line: 1, type: !7) + +... +--- +name:_Z1fPiii +alignment: 16 +body: | + bb.0.entry: +$ebx = MOV32ri 0, debug-location !DILocation(line: 1, scope: !5) +;; is_stmt floats up here from mov 3. +$ebx = MOV32ri 1, debug-location !DILocation(line: 2, scope: !5, atomGroup: 1, atomRank: 1) +$ebx = MOV32ri 2, debug-location !DILocation(line: 2, scope: !5, atomGroup: 1, atomRank: 2) +$ebx = MOV32ri 3, debug-location !DILocation(line: 2, scope: !5, atomGroup: 1, atomRank: 1) +$eax = MOV32ri 4, debug-location !DILocation(line: 2, scope: !5) +$eax = MOV32ri 5, debug-location !DILocation(line: 2, scope: !5, atomGroup: 2, atomRank: 1) jmorse wrote: My understanding is that this instruction should get an is_stmt in its own entry because it's a separate span of atom group 2, is that right? But doesn't (address 0x19 above?) as far as I can tell. https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -0,0 +1,117 @@ +; RUN: llc %s --filetype=obj -o - --dwarf-use-key-instructions \ +; RUN: | llvm-objdump -d - --no-show-raw-insn \ +; RUN: | FileCheck %s --check-prefix=OBJ + +; RUN: llc %s --filetype=obj -o - --dwarf-use-key-instructions \ +; RUN: | llvm-dwarfdump - --debug-line \ +; RUN: | FileCheck %s --check-prefix=DBG + +; OBJ: : +; OBJ-NEXT: 0: pushq %rbp +; OBJ-NEXT: 1: pushq %r14 +; OBJ-NEXT: 3: pushq %rbx +; OBJ-NEXT: 4: movq(%rip), %rax +; OBJ-NEXT: b: movl(%rax), %ebp +; OBJ-NEXT: d: callq 0x12 +; OBJ-NEXT: 12: callq 0x17 +; OBJ-NEXT: 17: movl%eax, %ebx +; OBJ-NEXT: 19: addl%ebp, %ebx +; OBJ-NEXT: 1b: movq(%rip), %r14 +; OBJ-NEXT: 22: movl$0x1, (%r14) +; OBJ-NEXT: 29: callq 0x2e +; OBJ-NEXT: 2e: movl$0x2, (%r14) +; OBJ-NEXT: 35: callq 0x3a +; OBJ-NEXT: 3a: movl$0x3, (%r14) +; OBJ-NEXT: 41: callq 0x46 +; OBJ-NEXT: 46: movl$0x4, (%r14) +; OBJ-NEXT: 4d: callq 0x52 +; OBJ-NEXT: 52: movl%ebx, %eax +; OBJ-NEXT: 54: popq%rbx +; OBJ-NEXT: 55: popq%r14 +; OBJ-NEXT: 57: popq%rbp +; OBJ-NEXT: 58: retq + +; DBG: AddressLine Column File ISA Discriminator OpIndex Flags +; DBG-NEXT: -- -- -- -- --- - --- - +; DBG-NEXT: 0x 1 0 0 0 0 0 is_stmt +; DBG-NEXT: 0x0004 2 0 0 0 0 0 is_stmt prologue_end + +;; Test A: +;; Check the 1st call (line 3) gets is_stmt despite having no atom group. +; DBG-NEXT: 0x000d 3 0 0 0 0 0 is_stmt + +;; Test B: +;; Check the 2nd call (line 4) gets is_stmt applied despite being part of group +;; 1 and having lower precedence than the add. Check that the add still gets +;; is_stmt applied. +;; There are two is_stmt line 4 entries are is_stmt because we don't float jmorse wrote: ```suggestion ;; There are two is_stmt line 4 entries because we don't float ``` https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
https://github.com/jmorse commented: Tests: I'd personally prefer the input source and explanation at the top of the file, although this is a style thing. My understanding of this code is that within a basic block, it should be possible for there to be two sequences of instructions of equal group and rank that both get a buoy and is_stmt, if they're separated by some other atom group. Perhaps I'm wrong; but if I'm right, it probably wants explicit test coverage. There's a risk that the code being generated is brittle and frequently needs updating; I don't know what probability of that we find acceptable, let's just see how often it needs updating I guess. https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect signing oracles (PR #134146)
@@ -339,6 +369,183 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } } + std::optional<std::pair<MCPhysReg, MCInst *>> + getAuthCheckedReg(BinaryBasicBlock &BB) const override { +// Match several possible hard-coded sequences of instructions which can be +// emitted by the LLVM backend to check that the authenticated pointer is +// correct (see AArch64AsmPrinter::emitPtrauthCheckAuthenticatedValue). +// +// This function only matches sequences involving branch instructions. +// All these sequences have the form: +// +// (0) ... regular code that authenticates a pointer in Xn ... +// (1) analyze Xn +// (2) branch to .Lon_success if the pointer is correct +// (3) BRK #imm (fall-through basic block) +// +// In the above pseudocode, (1) + (2) is one of the following sequences: +// +// - eor Xtmp, Xn, Xn, lsl #1 +// tbz Xtmp, #62, .Lon_success +// +// - mov Xtmp, Xn +// xpac(i|d) Xn (or xpaclri if Xn is LR) +// cmp Xtmp, Xn +// b.eq .Lon_success +// +// Note that any branch destination operand is accepted as .Lon_success - +// it is the responsibility of the caller of getAuthCheckedReg to inspect +// the list of successors of this basic block as appropriate. + +// All of the above code sequences assume the fall-through basic block +// is a dead-end BRK instruction (any immediate operand is accepted). +const BinaryBasicBlock *BreakBB = BB.getFallthrough(); +if (!BreakBB || BreakBB->empty() || +BreakBB->front().getOpcode() != AArch64::BRK) + return std::nullopt; + +// Iterate over the instructions of BB in reverse order, matching opcodes +// and operands. +MCPhysReg TestedReg = 0; +MCPhysReg ScratchReg = 0; +auto It = BB.end(); +auto StepAndGetOpcode = [&It, &BB]() -> int { + if (It == BB.begin()) +return -1; + --It; + return It->getOpcode(); +}; + +switch (StepAndGetOpcode()) { +default: + // Not matched the branch instruction. + return std::nullopt; +case AArch64::Bcc: + // Bcc EQ, .Lon_success + if (It->getOperand(0).getImm() != AArch64CC::EQ) +return std::nullopt; + // Not checking .Lon_success (see above). + + // SUBSXrs XZR, TestedReg, ScratchReg, 0 (used by "CMP reg, reg" alias) + if (StepAndGetOpcode() != AArch64::SUBSXrs || + It->getOperand(0).getReg() != AArch64::XZR || + It->getOperand(3).getImm() != 0) +return std::nullopt; + TestedReg = It->getOperand(1).getReg(); + ScratchReg = It->getOperand(2).getReg(); + + // Either XPAC(I|D) ScratchReg, ScratchReg + // or XPACLRI + switch (StepAndGetOpcode()) { + default: +return std::nullopt; + case AArch64::XPACLRI: +// No operands to check, but using XPACLRI forces TestedReg to be X30. +if (TestedReg != AArch64::LR) + return std::nullopt; +break; + case AArch64::XPACI: + case AArch64::XPACD: +if (It->getOperand(0).getReg() != ScratchReg || +It->getOperand(1).getReg() != ScratchReg) + return std::nullopt; +break; + } + + // ORRXrs ScratchReg, XZR, TestedReg, 0 (used by "MOV reg, reg" alias) + if (StepAndGetOpcode() != AArch64::ORRXrs) +return std::nullopt; + if (It->getOperand(0).getReg() != ScratchReg || + It->getOperand(1).getReg() != AArch64::XZR || + It->getOperand(2).getReg() != TestedReg || + It->getOperand(3).getImm() != 0) +return std::nullopt; + + return std::make_pair(TestedReg, &*It); + +case AArch64::TBZX: + // TBZX ScratchReg, 62, .Lon_success + ScratchReg = It->getOperand(0).getReg(); + if (It->getOperand(1).getImm() != 62) +return std::nullopt; + // Not checking .Lon_success (see above).
+ + // EORXrs ScratchReg, TestedReg, TestedReg, 1 + if (StepAndGetOpcode() != AArch64::EORXrs) +return std::nullopt; + TestedReg = It->getOperand(1).getReg(); + if (It->getOperand(0).getReg() != ScratchReg || + It->getOperand(2).getReg() != TestedReg || + It->getOperand(3).getImm() != 1) +return std::nullopt; + + return std::make_pair(TestedReg, &*It); +} + } + + MCPhysReg getAuthCheckedReg(const MCInst &Inst, + bool MayOverwrite) const override { +// Cannot trivially reuse AArch64InstrInfo::getMemOperandWithOffsetWidth() +// method as it accepts an instance of MachineInstr, not MCInst. +const MCInstrDesc &Desc = Info->get(Inst.getOpcode()); + +// If signing oracles are considered, the particular value left in the base +// register after this instruction is important. This function checks that +// if the base register was overwritten, it is due to address write-back. +// +// Note that this function is not needed for authentication oracles, as the +
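A hedged usage sketch for the block-level matcher quoted above. The return type is reconstructed as `std::optional<std::pair<MCPhysReg, MCInst *>>` from the `std::make_pair(TestedReg, &*It)` returns, so treat the exact signature as an assumption:

```cpp
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/MCPlusBuilder.h"
#include "llvm/MC/MCInst.h"

using namespace llvm;

// Sketch: consume the matcher's result. Successor vetting is the caller's
// job, as the comment in the patch stresses.
static void inspectAuthCheck(const bolt::MCPlusBuilder &MIB,
                             bolt::BinaryBasicBlock &BB) {
  if (auto Checked = MIB.getAuthCheckedReg(BB)) {
    MCPhysReg TestedReg = Checked->first; // Register the sequence validates.
    MCInst *First = Checked->second;      // First instruction of the check.
    // The matcher accepted *any* branch destination as .Lon_success, so
    // inspect BB's successors before trusting TestedReg (omitted here).
    (void)TestedReg;
    (void)First;
  }
}
```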
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
@@ -150,15 +138,11 @@ define amdgpu_kernel void @static_alloca_kernel(ptr %p) { ; GI-NEXT:v_mov_b32_e32 v1, s15 ; GI-NEXT:s_mov_b32 s14, s16 ; GI-NEXT:s_movk_i32 s32, 0x400 -; GI-NEXT:s_mov_b32 s36, 0 shiltian wrote: That's why I explicitly added `-O0` in the test case before. https://github.com/llvm/llvm-project/pull/136798 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][SimplifyCFG] Remap atoms when folding br to common succ into pred (PR #133482)
https://github.com/jmorse approved this pull request. I continued my ramblings inline; I see the potential for some slightly unexpected stepping behaviour, but still strictly better than today's "everything is a breakpoint" approach. I don't think I can put my finger on a specific bad behaviour that might come from this, so LGTM. (Would still be nice to have the diagram in the test). https://github.com/llvm/llvm-project/pull/133482 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/136798 >From b5f7d3d1f11da0b48fa5b634700c1bc539f4f413 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 23 Apr 2025 09:17:46 -0400 Subject: [PATCH] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 + llvm/test/CodeGen/AMDGPU/alloca-as0.ll| 122 -- .../InferAddressSpaces/AMDGPU/alloca-as0.ll | 35 + 3 files changed, 90 insertions(+), 70 deletions(-) create mode 100644 llvm/test/Transforms/InferAddressSpaces/AMDGPU/alloca-as0.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b6cc5137d711a..2c4052a30b10f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -951,6 +951,9 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, } unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { + if (isa<AllocaInst>(V)) +return AMDGPUAS::PRIVATE_ADDRESS; + const auto *LD = dyn_cast<LoadInst>(V); if (!LD) // TODO: Handle invariant load like constant. return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; diff --git a/llvm/test/CodeGen/AMDGPU/alloca-as0.ll b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll index 9fcb362c153ba..5172ff011e45f 100644 --- a/llvm/test/CodeGen/AMDGPU/alloca-as0.ll +++ b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll @@ -14,7 +14,7 @@ define i32 @static_alloca() { ; ISEL-NEXT:buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; ISEL-NEXT:s_mov_b64 exec, s[18:19] ; ISEL-NEXT:s_addk_i32 s32, 0x400 -; ISEL-NEXT:v_writelane_b32 v40, s16, 4 +; ISEL-NEXT:v_writelane_b32 v40, s16, 3 ; ISEL-NEXT:s_getpc_b64 s[16:17] ; ISEL-NEXT:s_add_u32 s16, s16, bar@rel32@lo+4 ; ISEL-NEXT:s_addc_u32 s17, s17, bar@rel32@hi+12 @@ -27,25 +27,22 @@ define i32 @static_alloca() { ; ISEL-NEXT:v_writelane_b32 v40, s34, 2 ; ISEL-NEXT:s_cselect_b32 s34, s18, 0 ; ISEL-NEXT:s_mov_b64 s[18:19], src_private_base -; ISEL-NEXT:v_writelane_b32 v40, s35, 3 -; ISEL-NEXT:s_cselect_b32 s35, s19, 0 +; ISEL-NEXT:s_cselect_b32 s18, s19, 0 ; ISEL-NEXT:v_mov_b32_e32 v0, s34 -; ISEL-NEXT:v_mov_b32_e32 v1, s35 +; ISEL-NEXT:v_mov_b32_e32 v1, s18 ; ISEL-NEXT:s_swappc_b64 s[30:31], s[16:17] ; ISEL-NEXT:v_mov_b32_e32 v0, s34 -; ISEL-NEXT:v_mov_b32_e32 v1, s35 -; ISEL-NEXT:flat_load_dword v0, v[0:1] -; ISEL-NEXT:v_readlane_b32 s35, v40, 3 +; ISEL-NEXT:buffer_load_dword v0, v0, s[0:3], 0 offen ; ISEL-NEXT:v_readlane_b32 s34, v40, 2 ; ISEL-NEXT:v_readlane_b32 s31, v40, 1 ; ISEL-NEXT:v_readlane_b32 s30, v40, 0 ; ISEL-NEXT:s_mov_b32 s32, s33 -; ISEL-NEXT:v_readlane_b32 s4, v40, 4 +; ISEL-NEXT:v_readlane_b32 s4, v40, 3 ; ISEL-NEXT:s_or_saveexec_b64 s[6:7], -1 ; ISEL-NEXT:buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; ISEL-NEXT:s_mov_b64 exec, s[6:7] ; ISEL-NEXT:s_mov_b32 s33, s4 -; ISEL-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; ISEL-NEXT:s_waitcnt vmcnt(0) ; ISEL-NEXT:s_setpc_b64 s[30:31] ; ; GI-LABEL: static_alloca: @@ -56,35 +53,27 @@ define i32 @static_alloca() { ; GI-NEXT:s_or_saveexec_b64 s[18:19], -1 ; GI-NEXT:buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GI-NEXT:s_mov_b64 exec, s[18:19] -; GI-NEXT:v_writelane_b32 v40, s16, 4 -; GI-NEXT:v_writelane_b32 v40, s30, 0 -; GI-NEXT:v_writelane_b32 v40, s31, 1 +; GI-NEXT:v_writelane_b32 v40, s16, 2 ; GI-NEXT:s_addk_i32 s32, 0x400 -; GI-NEXT:v_writelane_b32 v40, s34, 2 -; GI-NEXT:s_lshr_b32 s34, s33, 6 ; GI-NEXT:s_mov_b64 s[16:17], src_private_base +; GI-NEXT:v_writelane_b32 
v40, s30, 0 ; GI-NEXT:s_getpc_b64 s[18:19] ; GI-NEXT:s_add_u32 s18, s18, bar@rel32@lo+4 ; GI-NEXT:s_addc_u32 s19, s19, bar@rel32@hi+12 ; GI-NEXT:v_lshrrev_b32_e64 v0, 6, s33 ; GI-NEXT:v_mov_b32_e32 v1, s17 -; GI-NEXT:v_writelane_b32 v40, s35, 3 -; GI-NEXT:s_mov_b32 s35, s17 +; GI-NEXT:v_writelane_b32 v40, s31, 1 ; GI-NEXT:s_swappc_b64 s[30:31], s[18:19] -; GI-NEXT:v_mov_b32_e32 v0, s34 -; GI-NEXT:v_mov_b32_e32 v1, s35 -; GI-NEXT:flat_load_dword v0, v[0:1] -; GI-NEXT:v_readlane_b32 s35, v40, 3 -; GI-NEXT:v_readlane_b32 s34, v40, 2 +; GI-NEXT:buffer_load_dword v0, off, s[0:3], s33 ; GI-NEXT:v_readlane_b32 s31, v40, 1 ; GI-NEXT:v_readlane_b32 s30, v40, 0 ; GI-NEXT:s_mov_b32 s32, s33 -; GI-NEXT:v_readlane_b32 s4, v40, 4 +; GI-NEXT:v_readlane_b32 s4, v40, 2 ; GI-NEXT:s_or_saveexec_b64 s[6:7], -1 ; GI-NEXT:buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GI-NEXT:s_mov_b64 exec, s[6:7] ; GI-NEXT:s_mov_b32 s33, s4 -; GI-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GI-NEXT:s_waitcnt vmcnt(0) ; GI-NEXT:s_setpc
[llvm-branch-commits] [llvm] [KeyInstr] Merge atoms in DILocation::getMergedLocation (PR #133480)
@@ -1243,6 +1243,140 @@ TEST_F(DILocationTest, Merge) { auto *M2 = DILocation::getMergedLocation(A2, B); EXPECT_EQ(M1, M2); } + +#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS +#define EXPECT_ATOM(Loc, Group, Rank) \ + EXPECT_EQ(Group, M->getAtomGroup()); \ + EXPECT_EQ(Rank, M->getAtomRank()); +#else +#define EXPECT_ATOM(Loc, Group, Rank) \ + EXPECT_EQ(0u, M->getAtomGroup()); \ + EXPECT_EQ(0u, M->getAtomRank()); \ + (void)Group; \ + (void)Rank; +#endif + // Identical, including source atom numbers. + { +auto *A = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 1); +auto *B = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 1); +auto *M = DILocation::getMergedLocation(A, B); +EXPECT_ATOM(M, 1u, 1u); +// DILocations are uniqued, so we can check equality by ptr. +EXPECT_EQ(M, DILocation::getMergedLocation(A, B)); + } + + // Identical but different atom ranks (same atom) - choose the lowest nonzero + // rank. + { +auto *A = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 1); +auto *B = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 2); +auto *M = DILocation::getMergedLocation(A, B); +EXPECT_ATOM(M, 1u, 1u); +EXPECT_EQ(M, DILocation::getMergedLocation(B, A)); + +A = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 0); +B = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 2); +M = DILocation::getMergedLocation(A, B); +EXPECT_ATOM(M, 1u, 2u); +EXPECT_EQ(M, DILocation::getMergedLocation(B, A)); + } + + // Identical but different atom ranks (different atom) - choose the lowest + // nonzero rank. + { +auto *A = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 1); +auto *B = DILocation::get(Context, 2, 7, N, nullptr, false, 2, 2); +auto *M = DILocation::getMergedLocation(A, B); +EXPECT_ATOM(M, 1u, 1u); +EXPECT_EQ(M, DILocation::getMergedLocation(B, A)); + +A = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 0); +B = DILocation::get(Context, 2, 7, N, nullptr, false, 2, 2); +M = DILocation::getMergedLocation(A, B); +EXPECT_ATOM(M, 2u, 2u); +EXPECT_EQ(M, DILocation::getMergedLocation(B, A)); + } + + // Identical but equal atom rank (different atom) - choose the lowest non-zero + // group (arbitrary choice for deterministic behaviour). + { +auto *A = DILocation::get(Context, 2, 7, N, nullptr, false, 1, 1); +auto *B = DILocation::get(Context, 2, 7, N, nullptr, false, 2, 1); +auto *M = DILocation::getMergedLocation(A, B); +EXPECT_ATOM(M, 1u, 1u); +EXPECT_EQ(M, DILocation::getMergedLocation(B, A)); + +A = DILocation::get(Context, 2, 7, N, nullptr, false, 0, 1); +B = DILocation::get(Context, 2, 7, N, nullptr, false, 2, 1); +M = DILocation::getMergedLocation(A, B); +EXPECT_ATOM(M, 2u, 1u); +EXPECT_EQ(M, DILocation::getMergedLocation(B, A)); + } + + // Completely different except same atom numbers. Zero out the atoms. + { +auto *I = DILocation::get(Context, 2, 7, N); +auto *A = DILocation::get(Context, 1, 6, S, I, false, 1, 1); +auto *B = +DILocation::get(Context, 2, 7, getSubprogram(), nullptr, false, 1, 1); +auto *M = DILocation::getMergedLocation(A, B); +EXPECT_EQ(0u, M->getLine()); +EXPECT_EQ(0u, M->getColumn()); +EXPECT_TRUE(isa(M->getScope())); +EXPECT_EQ(S, M->getScope()); +EXPECT_EQ(nullptr, M->getInlinedAt()); + } + + // Same inlined-at chain but different atoms. Choose the lowest + // non-zero group (arbitrary choice for deterministic behaviour). 
+ { +auto *I = DILocation::get(Context, 1, 7, N); +auto *F = getSubprogram(); +auto *A = DILocation::get(Context, 1, 1, F, I, false, 1, 2); +auto *B = DILocation::get(Context, 1, 1, F, I, false, 2, 1); +auto *M = DILocation::getMergedLocation(A, B); +EXPECT_ATOM(M, 2u, 1u); jmorse wrote: No real suggestions; I think the C++17 way might be to invent user-defined-literals and an integer-like object, but that seems like waaa overkill here. Or maybe enum classes? The unit tests will also (IMHO YMMV?) be the only place where literals get fed into this. An ugly alternative is to put the pattern in the name, i.e. "DILocation::getWithAtomRank", so that every time it appears you're reminded what argument order it should be. https://github.com/llvm/llvm-project/pull/133480 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
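On the argument-order footgun, a toy sketch of the enum-class option jmorse floats (all names hypothetical): strong types turn a swapped (group, rank) call into a compile error rather than a silent bug.

```cpp
#include <cstdint>

enum class AtomGroup : uint64_t {};
enum class AtomRank : uint8_t {};

static void setAtomInfo(AtomGroup G, AtomRank R) {
  (void)G; // Placeholder body; only the call-site typing matters here.
  (void)R;
}

int main() {
  setAtomInfo(AtomGroup{2}, AtomRank{1}); // OK.
  // setAtomInfo(AtomRank{1}, AtomGroup{2}); // Would not compile.
}
```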
[llvm-branch-commits] [llvm] [KeyInstr] Add Atom Group waterline to LLVMContext (PR #133478)
https://github.com/jmorse approved this pull request. LGTM with the style nits https://github.com/llvm/llvm-project/pull/133478 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [LV] Fix crash when building partial reductions using types that aren't known scale factors (#136680) (PR #136863)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/136863 Backport 1ce709c Requested by: @NickGuy-Arm >From b1cf8e2bf71cd71ce8f5e3499563747bdd3e0b18 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Wed, 23 Apr 2025 13:19:18 +0100 Subject: [PATCH] [LV] Fix crash when building partial reductions using types that aren't known scale factors (#136680) (cherry picked from commit 1ce709cb845b8b0bc4625198afa7a26c0a198fe4) --- .../Transforms/Vectorize/LoopVectorize.cpp| 10 +++- .../AArch64/partial-reduce-no-dotprod.ll | 56 +++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1baec6d6ca37b..3b6166ab1fa9e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8778,9 +8778,13 @@ bool VPRecipeBuilder::getScaledReductions( PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp); - unsigned TargetScaleFactor = - PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( - A->getType()->getPrimitiveSizeInBits()); + TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits(); + TypeSize ASize = A->getType()->getPrimitiveSizeInBits(); + + if (!PHISize.hasKnownScalarFactor(ASize)) +return false; + + unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(ASize); if (LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll index 3561f52df9490..ef82154dfce66 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll @@ -59,3 +59,59 @@ for.body: ; preds = %for.body, %entry for.exit:; preds = %for.body ret i32 %add } + +; Test to ensure that we don't crash when evaluating an extend from a type +; that is not a factor of the target type. 
+define i40 @partial_reduce_not_known_factor(i32 %a, i32 %b, i16 %N) { +; CHECK-LABEL: define i40 @partial_reduce_not_known_factor( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i16 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT:[[SMAX:%.*]] = call i16 @llvm.smax.i16(i16 [[N]], i16 0) +; CHECK-NEXT:[[TMP0:%.*]] = zext nneg i16 [[SMAX]] to i32 +; CHECK-NEXT:[[TMP1:%.*]] = add nuw nsw i32 [[TMP0]], 1 +; CHECK-NEXT:[[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 4 +; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT:[[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 4 +; CHECK-NEXT:[[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT:[[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0 +; CHECK-NEXT:[[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT:[[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0 +; CHECK-NEXT:[[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT:[[TMP2:%.*]] = trunc i32 [[N_VEC]] to i16 +; CHECK-NEXT:[[TMP3:%.*]] = sext <2 x i32> [[BROADCAST_SPLAT2]] to <2 x i40> +; CHECK-NEXT:[[TMP4:%.*]] = sext <2 x i32> [[BROADCAST_SPLAT]] to <2 x i40> +; CHECK-NEXT:[[TMP5:%.*]] = or <2 x i40> [[TMP4]], [[TMP3]] +; CHECK-NEXT:br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT:[[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <2 x i40> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT:[[VEC_PHI3:%.*]] = phi <2 x i40> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT:[[TMP6]] = or <2 x i40> [[VEC_PHI]], [[TMP5]] +; CHECK-NEXT:[[TMP8]] = or <2 x i40> [[VEC_PHI3]], [[TMP5]] +; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT:[[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT:br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT:[[BIN_RDX:%.*]] = or <2 x i40> [[TMP8]], [[TMP6]] +; CHECK-NEXT:[[TMP9:%.*]] = call i40 @llvm.vector.reduce.or.v2i40(<2 x i40> [[BIN_RDX]]) +; CHECK-NEXT:[[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] +; CHECK-NEXT:br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +entry: + br label %for.body + +for.body: + %red = phi i40 [ 0, %entry ], [ %1, %for.body ] + %iv = phi i16 [ 0, %entry ], [ %
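To see why the guard in the backported hunk is needed, here is a small sketch matching the i40/i32 pair from the test: `getKnownScalarFactor` requires an exact multiple, so the code must ask `hasKnownScalarFactor` first.

```cpp
#include "llvm/Support/TypeSize.h"

using llvm::TypeSize;

// i40 accumulator vs. i32 input: 40 is not a multiple of 32, so there is
// no known scale factor and the partial-reduction bundling has to bail out.
static bool hasScaleFactor() {
  TypeSize PHISize = TypeSize::getFixed(40);
  TypeSize ASize = TypeSize::getFixed(32);
  // Pre-fix, getKnownScalarFactor was called unconditionally and crashed.
  return PHISize.hasKnownScalarFactor(ASize);
}
```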
[llvm-branch-commits] [llvm] release/20.x: [LV] Fix crash when building partial reductions using types that aren't known scale factors (#136680) (PR #136863)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/136863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [LV] Fix crash when building partial reductions using types that aren't known scale factors (#136680) (PR #136863)
llvmbot wrote: @fhahn What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/136863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [clang] callee_type metadata for indirect calls (PR #117036)
@@ -1619,9 +1619,12 @@ class CodeGenModule : public CodeGenTypeCache { llvm::Metadata *CreateMetadataIdentifierGeneralized(QualType T); /// Create and attach type metadata to the given function. - void CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, + void createFunctionTypeMetadataForIcall(const FunctionDecl *FD, ilovepi wrote: I'd suggest making this a separate change. It can land independently of your PR. When you make it, CC one of the clang maintainers to make sure this isn't some important public API. I'm like 90% sure it's not, but :shrug: I've been wrong before. https://github.com/llvm/llvm-project/pull/117036 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [clang] callee_type metadata for indirect calls (PR #117036)
@@ -2860,9 +2861,25 @@ static void setLinkageForGV(llvm::GlobalValue *GV, const NamedDecl *ND) { GV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage); } +static bool HasExistingGeneralizedTypeMD(llvm::Function *F) { + llvm::MDNode *MD = F->getMetadata(llvm::LLVMContext::MD_type); + if (!MD || !isa<llvm::MDString>(MD->getOperand(1))) +return false; + + llvm::MDString *TypeIdStr = cast<llvm::MDString>(MD->getOperand(1)); + return TypeIdStr->getString().ends_with(".generalized"); +} + void CodeGenModule::CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, llvm::Function *F) { - // Only if we are checking indirect calls. + if (CodeGenOpts.CallGraphSection && !HasExistingGeneralizedTypeMD(F) && ilovepi wrote: hmm, seems like we'd end up adding type metadata multiple times anyway, then, right? https://github.com/llvm/llvm-project/pull/117036 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
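For concreteness on the duplication question, a sketch of what happens without the guard (function body and type-id string invented): `addTypeMetadata` appends and nothing deduplicates, which is why a `HasExistingGeneralizedTypeMD`-style check exists at all.

```cpp
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"

// Calling the attach hook twice without a guard leaves two identical
// !type entries on the function.
static void tagTwice(llvm::Function &F) {
  llvm::LLVMContext &Ctx = F.getContext();
  llvm::Metadata *Id = llvm::MDString::get(Ctx, "_ZTSFivE.generalized");
  F.addTypeMetadata(0, Id);
  F.addTypeMetadata(0, Id); // Appended again; the duplicate survives.
}
```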
[llvm-branch-commits] [llvm] [DirectX] Adding support for Root Descriptor in Obj2yaml/Yaml2Obj (PR #136732)
@@ -73,24 +75,50 @@ struct ShaderHash { std::vector Digest; }; -#define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; - struct RootConstantsYaml { uint32_t ShaderRegister; uint32_t RegisterSpace; uint32_t Num32BitValues; }; +#define ROOT_DESCRIPTOR_FLAG(Num, Val) bool Val = false; +struct RootDescriptorYaml { + RootDescriptorYaml() = default; + + uint32_t ShaderRegister; + uint32_t RegisterSpace; + + uint32_t getEncodedFlags(); + +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + struct RootParameterYamlDesc { uint32_t Type; uint32_t Visibility; uint32_t Offset; + RootParameterYamlDesc(){}; + RootParameterYamlDesc(uint32_t T) : Type(T) { inbelic wrote: Is the old constructor still used? If so, when would we use that instead of this one? https://github.com/llvm/llvm-project/pull/136732 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [DirectX] Adding support for Root Descriptor in Obj2yaml/Yaml2Obj (PR #136732)
@@ -89,6 +111,15 @@ DXContainerYAML::RootSignatureYamlDesc::create( return RootSigDesc; } +uint32_t DXContainerYAML::RootDescriptorYaml::getEncodedFlags() const { + uint64_t Flag = 0; +#define ROOT_DESCRIPTOR_FLAG(Num, Val) \ + if (Val) \ inbelic wrote: Note: `Val` is a member of the struct which is why this works https://github.com/llvm/llvm-project/pull/136732 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [DirectX] Adding support for Root Descriptor in Obj2yaml/Yaml2Obj (PR #136732)
@@ -594,6 +599,25 @@ struct RootConstants { sys::swapByteOrder(Num32BitValues); } }; +struct RootDescriptor_V1_0 { inbelic wrote: IIUC, this is how the structs were defined and planned to be extended in DXC. And I believe it was also documented there that each new version of a struct must guarantee it will only append data members. What were the reasons for keeping separate structures? From glancing, it seems like we would just need to update the `readParameter` function to determine the size of the struct based on the version. Maybe that is more extensible? It seems like the logic elsewhere would be nicer: Currently it is like: ``` if (Version == 1) { Param1 = Param1; Param2 = Param2; } if (Version == 2) { Param1 = Param1; Param2 = Param2; ExtraParam1 = ExtraParam1; } if (Version == 3) { ... } ``` And it could be like ``` Param1 = Param1; Param2 = Param2; if (Version >= 2) { ExtraParam1 = ExtraParam1; } if (Version >= 3) { ExtraParam2 = ExtraParam2; } ``` And similar for the `sys::write` functionality, etc. Happy to just be told no, but wanted to make sure we have reconsidered the format. https://github.com/llvm/llvm-project/pull/136732 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
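To make the single-struct alternative concrete, a hedged sketch of the append-only read (reader API usage is real LLVM, but the struct, field names, and the pretend-v2 `Flags` field are assumptions for illustration): read the common prefix unconditionally, then gate the tail on the serialized version.

```cpp
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"
#include <cstdint>

struct RootDescriptorData {
  uint32_t ShaderRegister = 0;
  uint32_t RegisterSpace = 0;
  uint32_t Flags = 0; // Pretend-v2 extension; only read when present.
};

// Read the version-independent prefix, then the version-gated tail.
static llvm::Error readRootDescriptor(llvm::BinaryStreamReader &Reader,
                                      uint32_t Version,
                                      RootDescriptorData &Out) {
  if (llvm::Error E = Reader.readInteger(Out.ShaderRegister))
    return E;
  if (llvm::Error E = Reader.readInteger(Out.RegisterSpace))
    return E;
  if (Version >= 2)
    if (llvm::Error E = Reader.readInteger(Out.Flags))
      return E;
  return llvm::Error::success();
}
```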
[llvm-branch-commits] [llvm] [DirectX] Adding support for Root Descriptor in Obj2yaml/Yaml2Obj (PR #136732)
https://github.com/inbelic edited https://github.com/llvm/llvm-project/pull/136732 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAllocaAddrSpace` to get address space for `AllocaInst` (PR #136865)
shiltian wrote: > The A field does not assert anything about the content of the module. It does > not assert that any alloca with a non-A valued alloca can be replaced with an > A address space alloca. An alloca that does not match this address space is > not invalid, and you cannot say anything about it If I understand correctly, you're suggesting that there's no reliable way for the middle end to determine which address space an alloca will ultimately end up in, aside from cases where it's already in one, unless it pulls that information directly from the backend, like what the `InferAddressSpaces` pass does. The data layout itself doesn't assert anything: first, it doesn't necessarily *match* the final code generator; and second, even if it does, the `A` field in the data layout doesn't necessarily guarantee or assert it. @nikic @efriedma-quic what do you think? https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LoopVectorizer] Bundle partial reductions with different extensions (PR #136997)
llvmbot wrote: @llvm/pr-subscribers-llvm-transforms Author: Sam Tebbs (SamTebbs33) Changes This PR adds support for extensions of different signedness to VPMulAccumulateReductionRecipe and allows such partial reductions to be bundled into that class. --- Patch is 25.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136997.diff 5 Files Affected: - (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+27-15) - (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+19-8) - (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+12-13) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll (+28-28) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll (+13-16) ``diff diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 20d272e69e6e7..e11f608d068da 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2493,11 +2493,13 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { /// recipe is abstract and needs to be lowered to concrete recipes before /// codegen. The Operands are {ChainOp, VecOp1, VecOp2, [Condition]}. class VPMulAccumulateReductionRecipe : public VPReductionRecipe { - /// Opcode of the extend recipe. - Instruction::CastOps ExtOp; + /// Opcodes of the extend recipes. + Instruction::CastOps ExtOp0; + Instruction::CastOps ExtOp1; - /// Non-neg flag of the extend recipe. - bool IsNonNeg = false; + /// Non-neg flags of the extend recipe. + bool IsNonNeg0 = false; + bool IsNonNeg1 = false; Type *ResultTy; @@ -2512,7 +2514,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { MulAcc->getCondOp(), MulAcc->isOrdered(), WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), MulAcc->getDebugLoc()), -ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), +ExtOp0(MulAcc->getExt0Opcode()), ExtOp1(MulAcc->getExt1Opcode()), +IsNonNeg0(MulAcc->isNonNeg0()), IsNonNeg1(MulAcc->isNonNeg1()), ResultTy(MulAcc->getResultType()), IsPartialReduction(MulAcc->isPartialReduction()) {} @@ -2526,7 +2529,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { R->getCondOp(), R->isOrdered(), WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), R->getDebugLoc()), -ExtOp(Ext0->getOpcode()), IsNonNeg(Ext0->isNonNeg()), +ExtOp0(Ext0->getOpcode()), ExtOp1(Ext1->getOpcode()), +IsNonNeg0(Ext0->isNonNeg()), IsNonNeg1(Ext1->isNonNeg()), ResultTy(ResultTy), IsPartialReduction(isa(R)) { assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == @@ -2542,7 +2546,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { R->getCondOp(), R->isOrdered(), WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), R->getDebugLoc()), -ExtOp(Instruction::CastOps::CastOpsEnd) { +ExtOp0(Instruction::CastOps::CastOpsEnd), +ExtOp1(Instruction::CastOps::CastOpsEnd) { assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == Instruction::Add && "The reduction instruction in MulAccumulateReductionRecipe must be " @@ -2586,19 +2591,26 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { VPValue *getVecOp1() const { return getOperand(2); } /// Return if this MulAcc recipe contains extend instructions. - bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; } + bool isExtended() const { return ExtOp0 != Instruction::CastOps::CastOpsEnd; } /// Return if the operands of mul instruction come from same extend. 
- bool isSameExtend() const { return getVecOp0() == getVecOp1(); } + bool isSameExtendVal() const { return getVecOp0() == getVecOp1(); } - /// Return the opcode of the underlying extend. - Instruction::CastOps getExtOpcode() const { return ExtOp; } + /// Return the opcode of the underlying extends. + Instruction::CastOps getExt0Opcode() const { return ExtOp0; } + Instruction::CastOps getExt1Opcode() const { return ExtOp1; } + + /// Return if the first extend's opcode is ZExt. + bool isZExt0() const { return ExtOp0 == Instruction::CastOps::ZExt; } + + /// Return if the second extend's opcode is ZExt. + bool isZExt1() const { return ExtOp1 == Instruction::CastOps::ZExt; } - /// Return if the extend opcode is ZExt. - bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; } + /// Return the non negative flag of the first ext recipe. + bool isNonNeg0() const { return IsNonNeg0; } - /// Return the non negative flag of the ext recipe. - bool isNonNeg() const { return IsNonNeg; } + /// Return the non negative flag of the second ext recipe. + bool isNonNeg1() const
[llvm-branch-commits] [clang] [clang][OpenMP] Add AST node for root of compound directive (PR #118878)
https://github.com/jdoerfert commented: Looks reasonable to me. I left one comment and I believe you can verify what is best yourself. https://github.com/llvm/llvm-project/pull/118878 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAllocaAddrSpace` to get address space for `AllocaInst` (PR #136865)
nikic wrote: @shiltian I'm not entirely sure what you're asking here. As @arsenm said, the alloca address space in the data layout is merely a hint on the address space to use when materializing allocas "out of thin air". There are targets that use multiple alloca address spaces, this just specifies a default one. https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
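A short sketch of the distinction being drawn: the DataLayout `A` field only answers `getAllocaAddrSpace` for code that materializes a new alloca "out of thin air"; it constrains nothing about allocas already sitting in other address spaces.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

// Create an alloca in the module's default alloca address space (the DL
// 'A' field). Nothing here invalidates allocas in other address spaces.
static llvm::AllocaInst *makeDefaultASAlloca(llvm::Module &M,
                                             llvm::IRBuilder<> &B) {
  unsigned AS = M.getDataLayout().getAllocaAddrSpace();
  return B.CreateAlloca(B.getInt32Ty(), AS, /*ArraySize=*/nullptr, "tmp");
}
```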
[llvm-branch-commits] [clang] [clang][OpenMP] Add AST node for root of compound directive (PR #118878)
https://github.com/jdoerfert edited https://github.com/llvm/llvm-project/pull/118878 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [clang][OpenMP] Add AST node for root of compound directive (PR #118878)
@@ -9406,6 +9406,14 @@ StmtResult TreeTransform::TransformOMPInformationalDirective( D->getBeginLoc(), D->getEndLoc()); } +template +StmtResult TreeTransform::TransformOMPCompoundRootDirective( +OMPCompoundRootDirective *D) { + // This function should never be found in a template. Directive splitting + // only happens in non-template functions. + llvm_unreachable("TransformOMPCompoundRootDirective in a template"); jdoerfert wrote: I'm not sure llvm_unreachable is the right kind of error here. Maybe it is. Is there precedent? https://github.com/llvm/llvm-project/pull/118878 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
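On the precedent question, a sketch of the two conventional idioms (the message string is taken from the hunk; which idiom fits is exactly what is being asked): `llvm_unreachable` documents a compiler-guaranteed invariant and is undefined behaviour if ever reached in a no-asserts build, while `report_fatal_error` still aborts cleanly when asserts are compiled out.

```cpp
#include "llvm/Support/ErrorHandling.h"

[[noreturn]] static void rejectCompoundRootInTemplate(bool IsInvariant) {
  if (IsInvariant) // "Cannot happen by construction": trap in asserts builds.
    llvm_unreachable("TransformOMPCompoundRootDirective in a template");
  // Conceivably reachable on bad input: diagnose instead of UB.
  llvm::report_fatal_error("TransformOMPCompoundRootDirective in a template");
}
```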
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -2333,6 +2352,170 @@ DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { return PrologEndLoc; } +void DwarfDebug::findKeyInstructions(const MachineFunction *MF) { + // New function - reset KeyInstructions. + KeyInstructions.clear(); + + // The current candidate is_stmt instructions for each source atom. + // Map {(InlinedAt, Group): (Rank, Instructions)}. + DenseMap<std::pair<DILocation *, uint64_t>, + std::pair<uint8_t, SmallVector<const MachineInstr *>>> + GroupCandidates; + + // For each instruction: + // * Skip insts without DebugLoc, AtomGroup or AtomRank, and line zeros. + // * Check if insts in this group have been seen already in GroupCandidates. + // * If this instr rank is equal, add this instruction to KeyInstructions. + // Remove existing instructions from KeyInstructions if they have the + // same parent. + // * If this instr rank is higher (lower precedence), ignore it. + // * If this instr rank is lower (higher precedence), erase existing + // instructions from KeyInstructions. Add this instr to KeyInstructions. + + for (auto &MBB : *MF) { +// Rather than apply is_stmt directly to Key Instructions, we "float" +// is_stmt up to the 1st instruction with the same line number in a +// contiguous block. That instruction is called the "buoy". The +// buoy gets reset if we encounter an instruction with an atom +// group. +const MachineInstr *Buoy = nullptr; +// The atom group number associated with Buoy which may be 0 if we haven't +// encountered an atom group yet in this blob of instructions with the same +// line number. +uint64_t BuoyAtom = 0; + +for (auto &MI : MBB) { + if (MI.isMetaInstruction()) +continue; + + if (!MI.getDebugLoc() || !MI.getDebugLoc().getLine()) +continue; + + // Reset the Buoy to this instruction if it has a different line number. + if (!Buoy || + Buoy->getDebugLoc().getLine() != MI.getDebugLoc().getLine()) { +Buoy = &MI; +BuoyAtom = 0; + } + + // Call instructions are handled specially - we always mark them as key + // regardless of atom info. + const auto &TII = + *MI.getParent()->getParent()->getSubtarget().getInstrInfo(); + if (MI.isCall() || TII.isTailCall(MI)) { +assert(MI.getDebugLoc() && "Unexpectedly missing DL"); + +// Calls are always key. +KeyInstructions.insert(Buoy); + +uint64_t Group = MI.getDebugLoc()->getAtomGroup(); +uint8_t Rank = MI.getDebugLoc()->getAtomRank(); +if (Group && Rank) { + auto *InlinedAt = MI.getDebugLoc()->getInlinedAt(); + auto &[CandidateRank, CandidateInsts] = GroupCandidates[{InlinedAt, Group}]; + + // This looks similar to the non-call handling code, except that + // we don't put the call into CandidateInsts so that they can't be + // made un-key. As a result, we also have to take special care not + // to erase the is_stmt from the buoy, and prevent that happening + // in the future. + + if (CandidateRank == Rank) { +// We've seen other instructions in this group of this rank. Discard +// ones we've seen in this block, keep the others. +assert(!CandidateInsts.empty()); +SmallVector<const MachineInstr *> Insts; +Insts.reserve(CandidateInsts.size()); +for (auto &PrevInst : CandidateInsts) { + if (PrevInst->getParent() != MI.getParent()) +Insts.push_back(PrevInst); + else if (PrevInst != Buoy) +KeyInstructions.erase(PrevInst); +} + +if (Insts.empty()) { + CandidateInsts.clear(); + CandidateRank = 0; +} else { + CandidateInsts = std::move(Insts); +} + + } else if (CandidateRank > Rank) { +// We've seen other instructions in this group of lower precedence +// (higher rank). Discard them. +for (auto *Supplanted : CandidateInsts) { + // Don't erase the is_stmt we're using for this call.
+ if (Supplanted != Buoy) +KeyInstructions.erase(Supplanted); +} +CandidateInsts.clear(); +CandidateRank = 0; + } +} + +// Avoid floating any future is_stmts up to the call. +Buoy = nullptr; +continue; + } + + auto *InlinedAt = MI.getDebugLoc()->getInlinedAt(); + uint64_t Group = MI.getDebugLoc()->getAtomGroup(); + uint8_t Rank = MI.getDebugLoc()->getAtomRank(); + if (!Group || !Rank) +continue; + + // Don't let is_stmts float past instructions from different source atoms. + if (BuoyAtom && BuoyAtom != Group) { +Buoy = &MI; +BuoyAtom = MI.getDebugLoc()->getAtomGroup(); + } + + auto &[CandidateRank, CandidateInsts] = GroupCandidates[{Inli
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -2333,6 +2352,170 @@ DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { return PrologEndLoc; } +void DwarfDebug::findKeyInstructions(const MachineFunction *MF) { + // New function - reset KeyInstructions. + KeyInstructions.clear(); + + // The current candidate is_stmt instructions for each source atom. + // Map {(InlinedAt, Group): (Rank, Instructions)}. + DenseMap<std::pair<DILocation *, uint64_t>, + std::pair<uint8_t, SmallVector<const MachineInstr *>>> + GroupCandidates; jmorse wrote: This will be big due to each dense-map element containing a SmallVector of pointers; IMO we need to do some profiling and set a SmallVector allocation size. https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
https://github.com/jmorse edited https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -2087,13 +2095,18 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { // If we have an ongoing unspecified location, nothing to do here. if (!DL) return; -// We have an explicit location, same as the previous location. -// But we might be coming back to it after a line 0 record. -if ((LastAsmLine == 0 && DL.getLine() != 0) || Flags) { - // Reinstate the source location but not marked as a statement. - RecordSourceLine(DL, Flags); + +// Skip this if the instruction is Key, else we might accidentally miss an +// is_stmt. +if (!IsKey) { jmorse wrote: Can we not fold this test, and the comment, into the existing test? "If we have an explicit non-key location". (Avoids some unnecessary indentation). https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
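Spelled out, the fold being suggested might look like this (a sketch against the quoted hunk, reusing its locals, not a tested patch):

```cpp
// If we have an explicit non-key location, same as the previous location.
// But we might be coming back to it after a line 0 record.
if (!IsKey && ((LastAsmLine == 0 && DL.getLine() != 0) || Flags)) {
  // Reinstate the source location but not marked as a statement.
  RecordSourceLine(DL, Flags);
}
```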
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -2333,6 +2352,170 @@ DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { return PrologEndLoc; } +void DwarfDebug::findKeyInstructions(const MachineFunction *MF) { + // New function - reset KeyInstructions. + KeyInstructions.clear(); + + // The current candidate is_stmt instructions for each source atom. + // Map {(InlinedAt, Group): (Rank, Instructions)}. + DenseMap<std::pair<DILocation *, uint64_t>, + std::pair<uint8_t, SmallVector<const MachineInstr *>>> + GroupCandidates; + + // For each instruction: + // * Skip insts without DebugLoc, AtomGroup or AtomRank, and line zeros. + // * Check if insts in this group have been seen already in GroupCandidates. + // * If this instr rank is equal, add this instruction to KeyInstructions. + // Remove existing instructions from KeyInstructions if they have the + // same parent. + // * If this instr rank is higher (lower precedence), ignore it. + // * If this instr rank is lower (higher precedence), erase existing + // instructions from KeyInstructions. Add this instr to KeyInstructions. + + for (auto &MBB : *MF) { +// Rather than apply is_stmt directly to Key Instructions, we "float" +// is_stmt up to the 1st instruction with the same line number in a +// contiguous block. That instruction is called the "buoy". The +// buoy gets reset if we encounter an instruction with an atom +// group. +const MachineInstr *Buoy = nullptr; +// The atom group number associated with Buoy which may be 0 if we haven't +// encountered an atom group yet in this blob of instructions with the same +// line number. +uint64_t BuoyAtom = 0; + +for (auto &MI : MBB) { + if (MI.isMetaInstruction()) +continue; + + if (!MI.getDebugLoc() || !MI.getDebugLoc().getLine()) +continue; + + // Reset the Buoy to this instruction if it has a different line number. + if (!Buoy || + Buoy->getDebugLoc().getLine() != MI.getDebugLoc().getLine()) { +Buoy = &MI; +BuoyAtom = 0; + } + + // Call instructions are handled specially - we always mark them as key + // regardless of atom info. + const auto &TII = + *MI.getParent()->getParent()->getSubtarget().getInstrInfo(); + if (MI.isCall() || TII.isTailCall(MI)) { +assert(MI.getDebugLoc() && "Unexpectedly missing DL"); + +// Calls are always key. +KeyInstructions.insert(Buoy); + +uint64_t Group = MI.getDebugLoc()->getAtomGroup(); +uint8_t Rank = MI.getDebugLoc()->getAtomRank(); +if (Group && Rank) { jmorse wrote: Early-continue instead perhaps? https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
https://github.com/jmorse commented: I get the impression that `GroupCandidates` and `KeyInstructions` are being kept strictly in sync; thus couldn't one instead just load KeyInstructions from GroupCandidates once the scan is complete? This avoids filling up the dense map with tombstones. Am I right in understanding that the buoy means the "least precedence" instruction will get the is_stmt if the highest precedence appears after it in the contiguous blob? (Seems fine, just making sure I understand). On the whole, the computation function feels like it could be simpler, though in some intangible way I can't immediately pin down. (Still reading the tests). https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
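If the two containers really are kept strictly in sync, the post-scan load being floated here could be as small as the following sketch (the element types are assumed from the quoted `GroupCandidates` comment and may not match the patch exactly):

```cpp
// Derive KeyInstructions from GroupCandidates once the scan over all
// blocks has finished, instead of mutating both containers during it.
for (const auto &[Key, RankAndInsts] : GroupCandidates)
  for (const MachineInstr *Inst : RankAndInsts.second)
    KeyInstructions.insert(Inst);
```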
[llvm-branch-commits] [llvm] [LoopVectorizer] Bundle partial reductions with different extensions (PR #136997)
https://github.com/SamTebbs33 created https://github.com/llvm/llvm-project/pull/136997 This PR adds support for extensions of different signedness to VPMulAccumulateReductionRecipe and allows such partial reductions to be bundled into that class. >From 10c4727074a7f5b4502ad08dc655be8fa5ffa3d2 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 23 Apr 2025 13:16:38 +0100 Subject: [PATCH] [LoopVectorizer] Bundle partial reductions with different extensions This PR adds support for extensions of different signedness to VPMulAccumulateReductionRecipe and allows such partial reductions to be bundled into that class. --- llvm/lib/Transforms/Vectorize/VPlan.h | 42 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 27 ++--- .../Transforms/Vectorize/VPlanTransforms.cpp | 25 - .../partial-reduce-dot-product-mixed.ll | 56 +-- .../LoopVectorize/AArch64/vplan-printing.ll | 29 +- 5 files changed, 99 insertions(+), 80 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 20d272e69e6e7..e11f608d068da 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2493,11 +2493,13 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { /// recipe is abstract and needs to be lowered to concrete recipes before /// codegen. The Operands are {ChainOp, VecOp1, VecOp2, [Condition]}. class VPMulAccumulateReductionRecipe : public VPReductionRecipe { - /// Opcode of the extend recipe. - Instruction::CastOps ExtOp; + /// Opcodes of the extend recipes. + Instruction::CastOps ExtOp0; + Instruction::CastOps ExtOp1; - /// Non-neg flag of the extend recipe. - bool IsNonNeg = false; + /// Non-neg flags of the extend recipe. + bool IsNonNeg0 = false; + bool IsNonNeg1 = false; Type *ResultTy; @@ -2512,7 +2514,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { MulAcc->getCondOp(), MulAcc->isOrdered(), WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), MulAcc->getDebugLoc()), -ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), +ExtOp0(MulAcc->getExt0Opcode()), ExtOp1(MulAcc->getExt1Opcode()), +IsNonNeg0(MulAcc->isNonNeg0()), IsNonNeg1(MulAcc->isNonNeg1()), ResultTy(MulAcc->getResultType()), IsPartialReduction(MulAcc->isPartialReduction()) {} @@ -2526,7 +2529,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { R->getCondOp(), R->isOrdered(), WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), R->getDebugLoc()), -ExtOp(Ext0->getOpcode()), IsNonNeg(Ext0->isNonNeg()), +ExtOp0(Ext0->getOpcode()), ExtOp1(Ext1->getOpcode()), +IsNonNeg0(Ext0->isNonNeg()), IsNonNeg1(Ext1->isNonNeg()), ResultTy(ResultTy), IsPartialReduction(isa(R)) { assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == @@ -2542,7 +2546,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { R->getCondOp(), R->isOrdered(), WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), R->getDebugLoc()), -ExtOp(Instruction::CastOps::CastOpsEnd) { +ExtOp0(Instruction::CastOps::CastOpsEnd), +ExtOp1(Instruction::CastOps::CastOpsEnd) { assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == Instruction::Add && "The reduction instruction in MulAccumulateReductionRecipe must be " @@ -2586,19 +2591,26 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { VPValue *getVecOp1() const { return getOperand(2); } /// Return if this MulAcc recipe contains extend instructions. 
- bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; } + bool isExtended() const { return ExtOp0 != Instruction::CastOps::CastOpsEnd; } /// Return if the operands of mul instruction come from same extend. - bool isSameExtend() const { return getVecOp0() == getVecOp1(); } + bool isSameExtendVal() const { return getVecOp0() == getVecOp1(); } - /// Return the opcode of the underlying extend. - Instruction::CastOps getExtOpcode() const { return ExtOp; } + /// Return the opcode of the underlying extends. + Instruction::CastOps getExt0Opcode() const { return ExtOp0; } + Instruction::CastOps getExt1Opcode() const { return ExtOp1; } + + /// Return if the first extend's opcode is ZExt. + bool isZExt0() const { return ExtOp0 == Instruction::CastOps::ZExt; } + + /// Return if the second extend's opcode is ZExt. + bool isZExt1() const { return ExtOp1 == Instruction::CastOps::ZExt; } - /// Return if the extend opcode is ZExt. - bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; } + /// Return the non negative flag of the first ext recipe. + bool isNonNeg0() const { return IsNonNe
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -2333,6 +2352,170 @@ DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { return PrologEndLoc; } +void DwarfDebug::findKeyInstructions(const MachineFunction *MF) { + // New function - reset KeyInstructions. + KeyInstructions.clear(); + + // The current candidate is_stmt instructions for each source atom. + // Map {(InlinedAt, Group): (Rank, Instructions)}. + DenseMap, + std::pair>> + GroupCandidates; + + // For each instruction: + // * Skip insts without DebugLoc, AtomGroup or AtomRank, and line zeros. + // * Check if insts in this group have been seen already in GroupCandidates. + // * If this instr rank is equal, add this instruction to KeyInstructions. + // Remove existing instructions from KeyInstructions if they have the + // same parent. + // * If this instr rank is higher (lower precedence), ignore it. + // * If this instr rank is lower (higher precedence), erase existing + // instructions from KeyInstructions. Add this instr to KeyInstructions. + + for (auto &MBB : *MF) { +// Rather than apply is_stmt directly to Key Instructions, we "float" +// is_stmt up to the 1st instruction with the same line number in a +// contiguous block. That instruction is called the "buoy". The +// buoy gets reset if we encouner an instruction with an atom +// group. +const MachineInstr *Buoy = nullptr; +// The atom group number associated with Buoy which may be 0 if we haven't +// encountered an atom group yet in this blob of instructions with the same +// line number. +uint64_t BuoyAtom = 0; + +for (auto &MI : MBB) { + if (MI.isMetaInstruction()) +continue; + + if (!MI.getDebugLoc() || !MI.getDebugLoc().getLine()) +continue; + + // Reset the Buoy to this instruciton if it has a different line number. jmorse wrote: ```suggestion // Reset the Buoy to this instruction if it has a different line number. ``` https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][DwarfDebug] Add is_stmt emission support (PR #133495)
@@ -2333,6 +2352,170 @@ DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { return PrologEndLoc; } +void DwarfDebug::findKeyInstructions(const MachineFunction *MF) { + // New function - reset KeyInstructions. + KeyInstructions.clear(); + + // The current candidate is_stmt instructions for each source atom. + // Map {(InlinedAt, Group): (Rank, Instructions)}. + DenseMap, + std::pair>> + GroupCandidates; + + // For each instruction: + // * Skip insts without DebugLoc, AtomGroup or AtomRank, and line zeros. + // * Check if insts in this group have been seen already in GroupCandidates. + // * If this instr rank is equal, add this instruction to KeyInstructions. + // Remove existing instructions from KeyInstructions if they have the + // same parent. + // * If this instr rank is higher (lower precedence), ignore it. + // * If this instr rank is lower (higher precedence), erase existing + // instructions from KeyInstructions. Add this instr to KeyInstructions. + + for (auto &MBB : *MF) { +// Rather than apply is_stmt directly to Key Instructions, we "float" +// is_stmt up to the 1st instruction with the same line number in a +// contiguous block. That instruction is called the "buoy". The +// buoy gets reset if we encouner an instruction with an atom +// group. +const MachineInstr *Buoy = nullptr; +// The atom group number associated with Buoy which may be 0 if we haven't +// encountered an atom group yet in this blob of instructions with the same +// line number. +uint64_t BuoyAtom = 0; + +for (auto &MI : MBB) { + if (MI.isMetaInstruction()) +continue; + + if (!MI.getDebugLoc() || !MI.getDebugLoc().getLine()) +continue; + + // Reset the Buoy to this instruciton if it has a different line number. + if (!Buoy || + Buoy->getDebugLoc().getLine() != MI.getDebugLoc().getLine()) { +Buoy = &MI; +BuoyAtom = 0; + } + + // Call instructions are handled specially - we always mark them as key + // regardless of atom info. + const auto &TII = + *MI.getParent()->getParent()->getSubtarget().getInstrInfo(); + if (MI.isCall() || TII.isTailCall(MI)) { +assert(MI.getDebugLoc() && "Unexpectedly missing DL"); + +// Calls are always key. +KeyInstructions.insert(Buoy); jmorse wrote: ```suggestion KeyInstructions.insert(&MI); ``` Avoids any doubt about what's being inserted. https://github.com/llvm/llvm-project/pull/133495 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] be7adaf - Fix formatting (#136847)
Author: Diana Picus Date: 2025-04-23T13:19:46+02:00 New Revision: be7adaf7ee21096215578816514b119da68e4cc8 URL: https://github.com/llvm/llvm-project/commit/be7adaf7ee21096215578816514b119da68e4cc8 DIFF: https://github.com/llvm/llvm-project/commit/be7adaf7ee21096215578816514b119da68e4cc8.diff LOG: Fix formatting (#136847) Added: Modified: llvm/lib/CodeGen/PrologEpilogInserter.cpp llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp Removed: diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 9b852c0fd49cf..ac4090252cea0 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -608,9 +608,9 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MCRegister Reg = CS.getReg(); if (CS.isSpilledToReg()) { -BuildMI(SaveBlock, I, DebugLoc(), -TII.get(TargetOpcode::COPY), CS.getDstReg()) - .addReg(Reg, getKillRegState(true)); +BuildMI(SaveBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), +CS.getDstReg()) +.addReg(Reg, getKillRegState(true)); } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, @@ -637,7 +637,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, MCRegister Reg = CI.getReg(); if (CI.isSpilledToReg()) { BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg) - .addReg(CI.getDstReg(), getKillRegState(true)); +.addReg(CI.getDstReg(), getKillRegState(true)); } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index c1ac9491b2363..7838fd91a94da 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2510,7 +2510,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); if (IsWWMRegSpill) { TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), - RS->isRegUsed(AMDGPU::SCC)); + RS->isRegUsed(AMDGPU::SCC)); } buildSpillLoadStore( ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAllocaAddrSpace` to get address space for `AllocaInst` (PR #136865)
shiltian wrote: > The address space should just come directly from the alloca. You don't know > if it's correct to just replace the addrspace with whatever the datalayout > says is the alloca addrspace. The datalayout value is for new allocas where > the code has no additional context. If the data layout doesn't match the target, the module is already broken to begin with, and any optimization that relies on data layout information can't be expected to work correctly. If that's the case, what's the point of having a data layout at all? Why not just pull every piece of information from the backend anyway? https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][ArmSVE] Add initial lowering of vector.contract to SVE `*MMLA` instructions (PR #135636)
https://github.com/banach-space commented: Thanks Momchil - this is great! I skimmed through the pattern logic, and it's very neatly written. It's actually quite easy to follow, despite the underlying logic being a bit convoluted - well done! I've left a few minor suggestions, but nothing major. Also, it seems like we should be able to extend this fairly easily to support NEON as well. Worth thinking about 🙂 Now, overall this patch is quite large, and I’d suggest extracting the end-to-end / integration tests into a separate PR. Additionally, the remaining tests currently use `--convert-vector-to-llvm=`, which lowers all the way to LLVM (i.e., it exercises a lot of patterns). Instead, I’d recommend testing `LowerContractionToSVEI8MMPattern` in isolation and only verifying that the correct sequence of ArmSVE ops (plus some Vector ops) is generated - for example: ```mlir (...) %33 = arm_sve.smmla %23, %7, %15 : vector<[16]xi8> to vector<[4]xi32> %34 = arm_sve.smmla %24, %7, %16 : vector<[16]xi8> to vector<[4]xi32> %35 = arm_sve.smmla %31, %13, %15 : vector<[16]xi8> to vector<[4]xi32> %36 = arm_sve.smmla %32, %13, %16 : vector<[16]xi8> to vector<[4]xi32> ``` That way, we will: * reduce noise in the test output (by focusing on a single pattern), * simplify expected output (fewer ops to match), * avoid re-testing functionality already covered elsewhere (e.g., `arm_sve.smmla` → `arm_sve.intr.smmla` lowering). Btw, this is already looking great, and I know I’m asking for a bit of a rewrite (especially around the tests), but I really think it’ll help with long-term maintainability. https://github.com/llvm/llvm-project/pull/135636 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
@@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s -o - | FileCheck %s + +declare void @bar(ptr) + +define i32 @static_alloca() { +; CHECK-LABEL: define i32 @static_alloca() { +; CHECK-NEXT:[[ALLOCA:%.*]] = alloca i32, align 4 +; CHECK-NEXT:[[TMP1:%.*]] = addrspacecast ptr [[ALLOCA]] to ptr addrspace(5) +; CHECK-NEXT:[[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr arsenm wrote: I didn't say anything about a load. I don't think this pair is legally folded out by instcombine https://github.com/llvm/llvm-project/pull/136798 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect signing oracles (PR #134146)
https://github.com/kbeyls commented: Apologies for only reviewing piecemeal. I'm struggling a bit at the moment to reserve longer blocks of time to review this fully in one go. I hope my comments still make sense, though. https://github.com/llvm/llvm-project/pull/134146 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAssumedAddrSpace` to get address space for `AllocaInst` (PR #136865)
shiltian wrote: > This looks like an attempt to fix up broken IR producers, but I guess that's > not it? Yeah, I initially thought that was broken IR too. At first, I was in favor of not allowing alloca in AS0 at all and just making it a verifier error, like what was done in https://github.com/llvm/llvm-project/commit/c9c1eefa7714fccc3661d79e690fc17945ab7fe1. But @arsenm convinced me otherwise in https://github.com/llvm/llvm-project/pull/135820#issuecomment-2818230936, where the idea is to relax the restriction and fix things up later in the backend. https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][SimplifyCFG] Remap atoms when folding br to common succ into pred (PR #133482)
@@ -1129,13 +1130,17 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( Instruction *NewBonusInst = BonusInst.clone(); -if (!isa(BonusInst) && -PTI->getDebugLoc() != NewBonusInst->getDebugLoc()) { - // Unless the instruction has the same !dbg location as the original - // branch, drop it. When we fold the bonus instructions we want to make - // sure we reset their debug locations in order to avoid stepping on - // dead code caused by folding dead branches. - NewBonusInst->setDebugLoc(DebugLoc()); +if (!isa(BonusInst)) { + if (!NewBonusInst->getDebugLoc().isSameSourceLocation( + PTI->getDebugLoc())) { +// Unless the instruction has the same !dbg location as the original +// branch, drop it. When we fold the bonus instructions we want to make +// sure we reset their debug locations in order to avoid stepping on +// dead code caused by folding dead branches. +NewBonusInst->setDebugLoc(DebugLoc()); + } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) { +mapAtomInstance(DL, VMap); jmorse wrote: For the first part I think that answers the question (multiple instructions with the same source-location getting is_stmt). Would there also be potential for an intervening non-is_stmt instruction with the same source line between the two? It feels possible from instruction scheduling if not immediately in this pass, and if so I guess it's a general consequence of the technique. It doesn't seem unreasonable, although perhaps we need some (debugger) test coverage of how to interpret it. After all, we're not trying to program the debugger from the compiler; we're communicating that "this source location really does happen to execute twice". > I'm not sure what you mean about PTI's location becoming key? e.g. looking at > the test - the new branches in b and c are only "key" because they're clones > of the cond br from d (which is already "key"). I was wondering whether there can be a case where the instruction at PTI and the bonus instruction are in the same group, but the one being remapped here has the higher rank, and putting it in a different group causes the previously-lower-ranked instruction to become the highest ranked as a result. At face value this isn't a problem because the whole point of this function is we're duplicating a code path; but could there be scenarios where LLVM uses this utility to implement a move (i.e. clone-then-delete-old-path)? Or to put it another way: I believe (but may be wrong) that `cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses` currently copies/moves instructions from one place to another. But, with this change, what was a plain copy/move now comes with changes to the stepping behaviour. I think this boils down to saying "so what?", and the answer to that is "the stepping is still superior to what it was without key instructions". I'm just trying to think through the consequences of how these changes compose together. https://github.com/llvm/llvm-project/pull/133482 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr][SimplifyCFG] Remap atoms when folding br to common succ into pred (PR #133482)
https://github.com/jmorse edited https://github.com/llvm/llvm-project/pull/133482 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAssumedAddrSpace` to get address space for `AllocaInst` (PR #136865)
shiltian wrote: I've updated the PR to use `getAssumedAddrSpace`, which is the same as what `InferAddressSpacePass` does. @arsenm @nikic https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAssumedAddrSpace` to get address space for `AllocaInst` (PR #136865)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect signing oracles (PR #134146)
@@ -339,6 +369,183 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } } + std::optional> + getAuthCheckedReg(BinaryBasicBlock &BB) const override { +// Match several possible hard-coded sequences of instructions which can be +// emitted by LLVM backend to check that the authenticated pointer is +// correct (see AArch64AsmPrinter::emitPtrauthCheckAuthenticatedValue). +// +// This function only matches sequences involving branch instructions. +// All these sequences have the form: +// +// (0) ... regular code that authenticates a pointer in Xn ... +// (1) analyze Xn +// (2) branch to .Lon_success if the pointer is correct +// (3) BRK #imm (fall-through basic block) +// +// In the above pseudocode, (1) + (2) is one of the following sequences: +// +// - eor Xtmp, Xn, Xn, lsl #1 +// tbz Xtmp, #62, .Lon_success +// +// - mov Xtmp, Xn +// xpac(i|d) Xn (or xpaclri if Xn is LR) +// cmp Xtmp, Xn +// b.eq .Lon_success +// +// Note that any branch destination operand is accepted as .Lon_success - +// it is the responsibility of the caller of getAuthCheckedReg to inspect +// the list of successors of this basic block as appropriate. + +// Any of the above code sequences assume the fall-through basic block +// is a dead-end BRK instruction (any immediate operand is accepted). +const BinaryBasicBlock *BreakBB = BB.getFallthrough(); +if (!BreakBB || BreakBB->empty() || +BreakBB->front().getOpcode() != AArch64::BRK) + return std::nullopt; + +// Iterate over the instructions of BB in reverse order, matching opcodes +// and operands. +MCPhysReg TestedReg = 0; +MCPhysReg ScratchReg = 0; +auto It = BB.end(); +auto StepAndGetOpcode = [&It, &BB]() -> int { + if (It == BB.begin()) +return -1; + --It; + return It->getOpcode(); +}; + +switch (StepAndGetOpcode()) { +default: + // Not matched the branch instruction. + return std::nullopt; +case AArch64::Bcc: + // Bcc EQ, .Lon_success + if (It->getOperand(0).getImm() != AArch64CC::EQ) +return std::nullopt; + // Not checking .Lon_success (see above). + + // SUBSXrs XZR, TestedReg, ScratchReg, 0 (used by "CMP reg, reg" alias) + if (StepAndGetOpcode() != AArch64::SUBSXrs || + It->getOperand(0).getReg() != AArch64::XZR || + It->getOperand(3).getImm() != 0) +return std::nullopt; + TestedReg = It->getOperand(1).getReg(); + ScratchReg = It->getOperand(2).getReg(); + + // Either XPAC(I|D) ScratchReg, ScratchReg + // or XPACLRI + switch (StepAndGetOpcode()) { + default: +return std::nullopt; + case AArch64::XPACLRI: +// No operands to check, but using XPACLRI forces TestedReg to be X30. +if (TestedReg != AArch64::LR) + return std::nullopt; +break; + case AArch64::XPACI: + case AArch64::XPACD: +if (It->getOperand(0).getReg() != ScratchReg || +It->getOperand(1).getReg() != ScratchReg) + return std::nullopt; +break; + } + + // ORRXrs ScratchReg, XZR, TestedReg, 0 (used by "MOV reg, reg" alias) + if (StepAndGetOpcode() != AArch64::ORRXrs) +return std::nullopt; + if (It->getOperand(0).getReg() != ScratchReg || + It->getOperand(1).getReg() != AArch64::XZR || + It->getOperand(2).getReg() != TestedReg || + It->getOperand(3).getImm() != 0) +return std::nullopt; + + return std::make_pair(TestedReg, &*It); + +case AArch64::TBZX: + // TBZX ScratchReg, 62, .Lon_success + ScratchReg = It->getOperand(0).getReg(); + if (It->getOperand(1).getImm() != 62) +return std::nullopt; + // Not checking .Lon_success (see above). 
+ + // EORXrs ScratchReg, TestedReg, TestedReg, 1 + if (StepAndGetOpcode() != AArch64::EORXrs) +return std::nullopt; + TestedReg = It->getOperand(1).getReg(); + if (It->getOperand(0).getReg() != ScratchReg || + It->getOperand(2).getReg() != TestedReg || + It->getOperand(3).getImm() != 1) +return std::nullopt; + + return std::make_pair(TestedReg, &*It); +} + } + + MCPhysReg getAuthCheckedReg(const MCInst &Inst, + bool MayOverwrite) const override { +// Cannot trivially reuse AArch64InstrInfo::getMemOperandWithOffsetWidth() +// method as it accepts an instance of MachineInstr, not MCInst. +const MCInstrDesc &Desc = Info->get(Inst.getOpcode()); + +// If signing oracles are considered, the particular value left in the base +// register after this instruction is important. This function checks that +// if the base register was overwritten, it is due to address write-back. +// +// Note that this function is not needed for authentication oracles, as the +
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
@@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s -o - | FileCheck %s + +declare void @bar(ptr) + +define i32 @static_alloca() { +; CHECK-LABEL: define i32 @static_alloca() { +; CHECK-NEXT:[[ALLOCA:%.*]] = alloca i32, align 4 +; CHECK-NEXT:[[TMP1:%.*]] = addrspacecast ptr [[ALLOCA]] to ptr addrspace(5) +; CHECK-NEXT:[[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr shiltian wrote: K, does a follow-up work? https://github.com/llvm/llvm-project/pull/136798 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] LiveRangeShrink: Early exit when encountering a code motion barrier. (PR #136806)
@@ -95,14 +95,24 @@ static MachineInstr *FindDominatedInstruction(MachineInstr &New, return Old; } +static bool isCodeMotionBarrier(MachineInstr &MI) { + return MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe(); +} arsenm wrote: Document this? What is the exact problematic condition? We have several different barrier concepts already spread in MachineInstr and TargetInstrInfo; can we use one of those? https://github.com/llvm/llvm-project/pull/136806 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
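One possible answer to the "document this" request, sketched here as a doc comment; the stated rationale is a guess at the intent that only the patch author can confirm:

```cpp
/// Return true if it is unsafe to move other instructions across \p MI.
/// Instructions with unmodeled side effects must act as code motion
/// barriers, but pseudo probes are exempted: they are pure markers with
/// no runtime semantics and should not inhibit this optimization.
static bool isCodeMotionBarrier(MachineInstr &MI) {
  return MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe();
}
```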
[llvm-branch-commits] [llvm] [llvm][AsmPrinter] Emit call graph section (PR #87576)
@@ -1867,6 +1867,30 @@ static StringRef getMIMnemonic(const MachineInstr &MI, MCStreamer &Streamer) { return Name; } +void AsmPrinter::emitIndirectCalleeLabels( +FunctionInfo &FuncInfo, +const MachineFunction::CallSiteInfoMap &CallSitesInfoMap, +MachineInstr &MI) { arsenm wrote: ```suggestion const MachineInstr &MI) { ``` https://github.com/llvm/llvm-project/pull/87576 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Backport to 20.x "[clang][analyzer] Fix error path of builtin overflow (#136345)" (PR #136589)
https://github.com/Xazax-hun approved this pull request. https://github.com/llvm/llvm-project/pull/136589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][ArmSVE] Add initial lowering of vector.contract to SVE `*MMLA` instructions (PR #135636)
@@ -0,0 +1,304 @@ +//===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This file implements lowering patterns from vector.contract to +// SVE I8MM operations. +// +//===--- + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h" +#include "mlir/Dialect/ArmSVE/Transforms/Transforms.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "mlir/Dialect/UB/IR/UBOps.h" + +#define DEBUG_TYPE "lower-contract-to-arm-sve-i8mm" + +using namespace mlir; +using namespace mlir::arm_sve; + +namespace { +// Check if the given value is a result of the operation `T` (which must be +// sign- or zero- extend) from i8 to i32. Return the value before the extension. +template +inline std::enable_if_t<(std::is_base_of_v || + std::is_base_of_v), +std::optional> +extractExtOperand(Value v, Type i8Ty, Type i32Ty) { + auto extOp = dyn_cast_or_null(v.getDefiningOp()); + if (!extOp) +return {}; + + auto inOp = extOp.getIn(); + auto inTy = dyn_cast(inOp.getType()); + if (!inTy || inTy.getElementType() != i8Ty) +return {}; + + auto outTy = dyn_cast(extOp.getType()); + if (!outTy || outTy.getElementType() != i32Ty) +return {}; + + return inOp; +} + +// Designate the operation (resp. instruction) used to do sub-tile matrix +// multiplications. +enum class MMLA { + Signed, // smmla + Unsigned,// ummla + Mixed, // usmmla + MixedSwapped // usmmla with LHS and RHS swapped +}; + +// Create the matrix multply and accumulate operation according to `op`. +Value createMMLA(PatternRewriter &rewriter, MMLA op, Location loc, + mlir::VectorType accType, Value acc, Value lhs, Value rhs) { + switch (op) { + case MMLA::Signed: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::Unsigned: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::Mixed: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::MixedSwapped: +// The accumulator comes transposed and the result will be transposed +// later, so all we have to do here is swap the operands. +return rewriter.create(loc, accType, acc, rhs, lhs); + } +} + +class LowerContractionToSVEI8MMPattern +: public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(vector::ContractionOp op, +PatternRewriter &rewriter) const override { + +Location loc = op.getLoc(); +mlir::VectorType lhsType = op.getLhsType(); +mlir::VectorType rhsType = op.getRhsType(); + +// For now handle LHS and RHS<8x[N]> - these are the types we +// eventually expect from MMT4D. M and N dimensions must be even and at +// least 2. +if (!lhsType.hasRank() || lhsType.getRank() != 2 || !rhsType.hasRank() || +rhsType.getRank() != 2) + return failure(); + +if (lhsType.isScalable() || !rhsType.isScalable()) + return failure(); + +// M, N, and K are the conventional names for matrix dimensions in the +// context of matrix multiplication. 
+auto M = lhsType.getDimSize(0); +auto N = rhsType.getDimSize(0); +auto K = rhsType.getDimSize(1); + +if (lhsType.getDimSize(1) != K || K != 8 || M < 2 || M % 2 != 0 || N < 2 || +N % 2 != 0 || !rhsType.getScalableDims()[0]) + return failure(); + +// Check permutation maps. For now only accept +// lhs: (d0, d1, d2) -> (d0, d2) +// rhs: (d0, d1, d2) -> (d1, d2) +// acc: (d0, d1, d2) -> (d0, d1) +// Note: RHS is transposed. +if (op.getIndexingMapsArray()[0] != +AffineMap::getMultiDimMapWithTargets(3, ArrayRef{0u, 2u}, + op.getContext()) || +op.getIndexingMapsArray()[1] != +AffineMap::getMultiDimMapWithTargets(3, ArrayRef{1u, 2u}, + op.getContext()) || +op.getIndexingMapsArray()[2] != +AffineMap::getMultiDimMapWithTargets(3, ArrayRef{0u, 1u}, + op.getContext())) + return failure(); + +// Check iterator types for matrix multiplication. +auto itTypes = op.getIteratorTypesArray(); +if (itTypes.size() != 3 || itTypes[0] != vector::IteratorType::parallel || +itTypes[1] != vector::IteratorType
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect signing oracles (PR #134146)
@@ -339,6 +369,183 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } } + std::optional> + getAuthCheckedReg(BinaryBasicBlock &BB) const override { +// Match several possible hard-coded sequences of instructions which can be +// emitted by LLVM backend to check that the authenticated pointer is +// correct (see AArch64AsmPrinter::emitPtrauthCheckAuthenticatedValue). +// +// This function only matches sequences involving branch instructions. +// All these sequences have the form: +// +// (0) ... regular code that authenticates a pointer in Xn ... +// (1) analyze Xn +// (2) branch to .Lon_success if the pointer is correct +// (3) BRK #imm (fall-through basic block) +// +// In the above pseudocode, (1) + (2) is one of the following sequences: +// +// - eor Xtmp, Xn, Xn, lsl #1 +// tbz Xtmp, #62, .Lon_success +// +// - mov Xtmp, Xn +// xpac(i|d) Xn (or xpaclri if Xn is LR) +// cmp Xtmp, Xn +// b.eq .Lon_success +// +// Note that any branch destination operand is accepted as .Lon_success - +// it is the responsibility of the caller of getAuthCheckedReg to inspect +// the list of successors of this basic block as appropriate. + +// Any of the above code sequences assume the fall-through basic block +// is a dead-end BRK instruction (any immediate operand is accepted). +const BinaryBasicBlock *BreakBB = BB.getFallthrough(); +if (!BreakBB || BreakBB->empty() || +BreakBB->front().getOpcode() != AArch64::BRK) + return std::nullopt; + +// Iterate over the instructions of BB in reverse order, matching opcodes +// and operands. +MCPhysReg TestedReg = 0; +MCPhysReg ScratchReg = 0; +auto It = BB.end(); +auto StepAndGetOpcode = [&It, &BB]() -> int { + if (It == BB.begin()) +return -1; + --It; + return It->getOpcode(); +}; + +switch (StepAndGetOpcode()) { +default: + // Not matched the branch instruction. + return std::nullopt; +case AArch64::Bcc: + // Bcc EQ, .Lon_success + if (It->getOperand(0).getImm() != AArch64CC::EQ) +return std::nullopt; + // Not checking .Lon_success (see above). + + // SUBSXrs XZR, TestedReg, ScratchReg, 0 (used by "CMP reg, reg" alias) + if (StepAndGetOpcode() != AArch64::SUBSXrs || + It->getOperand(0).getReg() != AArch64::XZR || + It->getOperand(3).getImm() != 0) +return std::nullopt; + TestedReg = It->getOperand(1).getReg(); + ScratchReg = It->getOperand(2).getReg(); + + // Either XPAC(I|D) ScratchReg, ScratchReg + // or XPACLRI + switch (StepAndGetOpcode()) { + default: +return std::nullopt; + case AArch64::XPACLRI: +// No operands to check, but using XPACLRI forces TestedReg to be X30. +if (TestedReg != AArch64::LR) + return std::nullopt; +break; + case AArch64::XPACI: + case AArch64::XPACD: +if (It->getOperand(0).getReg() != ScratchReg || +It->getOperand(1).getReg() != ScratchReg) + return std::nullopt; +break; + } + + // ORRXrs ScratchReg, XZR, TestedReg, 0 (used by "MOV reg, reg" alias) + if (StepAndGetOpcode() != AArch64::ORRXrs) +return std::nullopt; + if (It->getOperand(0).getReg() != ScratchReg || + It->getOperand(1).getReg() != AArch64::XZR || + It->getOperand(2).getReg() != TestedReg || + It->getOperand(3).getImm() != 0) +return std::nullopt; + + return std::make_pair(TestedReg, &*It); + +case AArch64::TBZX: + // TBZX ScratchReg, 62, .Lon_success + ScratchReg = It->getOperand(0).getReg(); + if (It->getOperand(1).getImm() != 62) +return std::nullopt; + // Not checking .Lon_success (see above). 
+ + // EORXrs ScratchReg, TestedReg, TestedReg, 1 + if (StepAndGetOpcode() != AArch64::EORXrs) +return std::nullopt; + TestedReg = It->getOperand(1).getReg(); + if (It->getOperand(0).getReg() != ScratchReg || + It->getOperand(2).getReg() != TestedReg || + It->getOperand(3).getImm() != 1) +return std::nullopt; + + return std::make_pair(TestedReg, &*It); +} + } + + MCPhysReg getAuthCheckedReg(const MCInst &Inst, + bool MayOverwrite) const override { +// Cannot trivially reuse AArch64InstrInfo::getMemOperandWithOffsetWidth() +// method as it accepts an instance of MachineInstr, not MCInst. +const MCInstrDesc &Desc = Info->get(Inst.getOpcode()); + +// If signing oracles are considered, the particular value left in the base +// register after this instruction is important. This function checks that +// if the base register was overwritten, it is due to address write-back. +// +// Note that this function is not needed for authentication oracles, as the +
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/136798 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAllocaAddrSpace` to get address space for `AllocaInst` (PR #136865)
shiltian wrote: The LLVM Lang Ref says: > The function of the data layout string may not be what you expect. Notably, > this is not a specification from the frontend of what alignment the code > generator should use. > > Instead, if specified, **the target data layout is required to match what the > ultimate code generator expects**. This string is used by the mid-level > optimizers to improve code, and this only works if it matches what the > ultimate code generator uses. My reading is that it has to match the ultimate code generator; middle-end optimization relies on it to improve the code. https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
@@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s -o - | FileCheck %s + +declare void @bar(ptr) + +define i32 @static_alloca() { +; CHECK-LABEL: define i32 @static_alloca() { +; CHECK-NEXT:[[ALLOCA:%.*]] = alloca i32, align 4 +; CHECK-NEXT:[[TMP1:%.*]] = addrspacecast ptr [[ALLOCA]] to ptr addrspace(5) +; CHECK-NEXT:[[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr shiltian wrote: Oh we don't run instcombine here. https://github.com/llvm/llvm-project/pull/136798 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [GOFF] Add writing of section symbols (PR #133799)
redstar wrote: @uweigand I made all the suggested changes. https://github.com/llvm/llvm-project/pull/133799 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAllocaAddrSpace` to get address space for `AllocaInst` (PR #136865)
arsenm wrote: > My reading is that it has to match the ultimate code generator; middle-end > optimization relies on it to improve the code. The definition of "match" leaves room for interpretation, and it would be a better system if we allowed more dynamic configuration for some fields. However, this conversation is off topic. This is not about whether the datalayout matches the target or not, but the interpretation of the datalayout. The A field does not assert anything about the content of the module. It does not assert that an alloca in a non-A address space can be replaced with an A address space alloca. An alloca that does not match this address space is not invalid, and you cannot say anything about it. https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
@@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s -o - | FileCheck %s + +declare void @bar(ptr) + +define i32 @static_alloca() { +; CHECK-LABEL: define i32 @static_alloca() { +; CHECK-NEXT:[[ALLOCA:%.*]] = alloca i32, align 4 +; CHECK-NEXT:[[TMP1:%.*]] = addrspacecast ptr [[ALLOCA]] to ptr addrspace(5) +; CHECK-NEXT:[[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr arsenm wrote: No, shouldn't run instcombine here. It's unnecessary; we should see the raw pass output. It's a question of whether the full pass pipeline eliminates the cast. There was a recent DAG combine where IIRC the conclusion was instcombine folds these, but it's not necessarily legal. https://github.com/llvm/llvm-project/pull/136798 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
@@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s -o - | FileCheck %s + +declare void @bar(ptr) + +define i32 @static_alloca() { +; CHECK-LABEL: define i32 @static_alloca() { +; CHECK-NEXT:[[ALLOCA:%.*]] = alloca i32, align 4 +; CHECK-NEXT:[[TMP1:%.*]] = addrspacecast ptr [[ALLOCA]] to ptr addrspace(5) +; CHECK-NEXT:[[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr shiltian wrote: No, this pair is for the function call. Not for the load. By "eliminate this cast pair", do you mean to rewrite `alloca` with the right AS? https://github.com/llvm/llvm-project/pull/136798 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr] Remap cloned PHIs in BreakCriticalEdges (PR #133493)
https://github.com/jmorse edited https://github.com/llvm/llvm-project/pull/133493 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr] Remap cloned PHIs in BreakCriticalEdges (PR #133493)
@@ -0,0 +1,52 @@ +; RUN: opt -passes='require,function(codegenprepare)' -S -mtriple=x86_64 < %s \ +; RUN: | FileCheck %s + +;; Check debug locations are propagated onto new PHIs. jmorse wrote: "...and that source locations have their atom groups remapped" https://github.com/llvm/llvm-project/pull/133493 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [KeyInstr] Remap cloned PHIs in BreakCriticalEdges (PR #133493)
https://github.com/jmorse approved this pull request. LGTM with nit https://github.com/llvm/llvm-project/pull/133493 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Make `AllocaInst` return AS5 in `getAssumedAddrSpace` (PR #136798)
@@ -951,6 +951,9 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, } unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { + if (isa(V)) shiltian wrote: I don't think we need to differentiate them. It is either flat or private. It can't be something else, especially after https://github.com/llvm/llvm-project/pull/135820. https://github.com/llvm/llvm-project/pull/136798 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Attributor] Use `getAllocaAddrSpace` to get address space for `AllocaInst` (PR #136865)
https://github.com/arsenm requested changes to this pull request. The address space should just come directly from the alloca. You don't know if it's correct to just replace the addrspace with whatever the datalayout says is the alloca addrspace. The datalayout value is for new allocas where the code has no additional context https://github.com/llvm/llvm-project/pull/136865 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
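To illustrate the distinction being drawn, a minimal sketch using the LLVM C++ API; `DL`, `Builder`, `Int32Ty`, and `Existing` are assumed to be in scope and are not from the patch:

```cpp
// For a brand-new alloca, where the code has no other context, the A field
// of the datalayout supplies the default address space:
unsigned DefaultAS = DL.getAllocaAddrSpace();
AllocaInst *NewAI =
    Builder.CreateAlloca(Int32Ty, DefaultAS, /*ArraySize=*/nullptr, "tmp");

// For an existing alloca, the address space is a property of the
// instruction itself and cannot simply be rewritten to the A value:
unsigned ActualAS = Existing->getAddressSpace();
```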
[llvm-branch-commits] [llvm] [KeyInstr] Add MIR parser support (PR #133494)
https://github.com/jmorse approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/133494 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][ArmSVE] Add initial lowering of vector.contract to SVE `*MMLA` instructions (PR #135636)
@@ -0,0 +1,304 @@ +//===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This file implements lowering patterns from vector.contract to +// SVE I8MM operations. banach-space wrote: Could you add a note that `vector.contract` needs to be accompanied by `arith.extsi` (or `arith.extui`) Ops? Also, is I8MM the official name? Shouldn't that be FEAT_I8MM? Basically, could we document a bit more? https://github.com/llvm/llvm-project/pull/135636 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][ArmSVE] Add initial lowering of vector.contract to SVE `*MMLA` instructions (PR #135636)
@@ -0,0 +1,304 @@ +//===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This file implements lowering patterns from vector.contract to +// SVE I8MM operations. +// +//===--- + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h" +#include "mlir/Dialect/ArmSVE/Transforms/Transforms.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "mlir/Dialect/UB/IR/UBOps.h" + +#define DEBUG_TYPE "lower-contract-to-arm-sve-i8mm" + +using namespace mlir; +using namespace mlir::arm_sve; + +namespace { +// Check if the given value is a result of the operation `T` (which must be +// sign- or zero- extend) from i8 to i32. Return the value before the extension. +template +inline std::enable_if_t<(std::is_base_of_v || + std::is_base_of_v), +std::optional> +extractExtOperand(Value v, Type i8Ty, Type i32Ty) { + auto extOp = dyn_cast_or_null(v.getDefiningOp()); + if (!extOp) +return {}; + + auto inOp = extOp.getIn(); + auto inTy = dyn_cast(inOp.getType()); + if (!inTy || inTy.getElementType() != i8Ty) +return {}; + + auto outTy = dyn_cast(extOp.getType()); + if (!outTy || outTy.getElementType() != i32Ty) +return {}; + + return inOp; +} + +// Designate the operation (resp. instruction) used to do sub-tile matrix +// multiplications. +enum class MMLA { + Signed, // smmla + Unsigned,// ummla + Mixed, // usmmla + MixedSwapped // usmmla with LHS and RHS swapped +}; + +// Create the matrix multply and accumulate operation according to `op`. +Value createMMLA(PatternRewriter &rewriter, MMLA op, Location loc, + mlir::VectorType accType, Value acc, Value lhs, Value rhs) { + switch (op) { + case MMLA::Signed: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::Unsigned: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::Mixed: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::MixedSwapped: +// The accumulator comes transposed and the result will be transposed +// later, so all we have to do here is swap the operands. +return rewriter.create(loc, accType, acc, rhs, lhs); + } +} + +class LowerContractionToSVEI8MMPattern banach-space wrote: It's a very long pattern. Could you document the high-level logic? https://github.com/llvm/llvm-project/pull/135636 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
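The requested documentation is the author's to write, but judging only from the quoted code, the high-level logic might be summarized along these lines (inferred, not authoritative):

```cpp
// High-level shape of LowerContractionToSVEI8MMPattern (sketch):
//  1. Match a vector.contract computing acc(MxN) += lhs(Mx8) * rhs(Nx8)^T
//     over i8 operands extended (sign or zero) to i32, with M and N even,
//     K fixed at 8, and the RHS scalable in its leading dimension.
//  2. Classify the two extensions to pick the instruction flavour: smmla
//     (both signed), ummla (both unsigned), or usmmla for mixed signedness
//     (with operands swapped in one of the two mixed cases).
//  3. Decompose the operands into 2x8 sub-tiles and emit one *MMLA op per
//     2x2 accumulator sub-tile, then reassemble the full result.
```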
[llvm-branch-commits] [mlir] [MLIR][ArmSVE] Add initial lowering of vector.contract to SVE `*MMLA` instructions (PR #135636)
@@ -0,0 +1,304 @@ +//===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This file implements lowering patterns from vector.contract to +// SVE I8MM operations. +// +//===--- + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h" +#include "mlir/Dialect/ArmSVE/Transforms/Transforms.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "mlir/Dialect/UB/IR/UBOps.h" + +#define DEBUG_TYPE "lower-contract-to-arm-sve-i8mm" + +using namespace mlir; +using namespace mlir::arm_sve; + +namespace { +// Check if the given value is a result of the operation `T` (which must be +// sign- or zero- extend) from i8 to i32. Return the value before the extension. +template +inline std::enable_if_t<(std::is_base_of_v || + std::is_base_of_v), +std::optional> +extractExtOperand(Value v, Type i8Ty, Type i32Ty) { + auto extOp = dyn_cast_or_null(v.getDefiningOp()); + if (!extOp) +return {}; + + auto inOp = extOp.getIn(); + auto inTy = dyn_cast(inOp.getType()); + if (!inTy || inTy.getElementType() != i8Ty) +return {}; + + auto outTy = dyn_cast(extOp.getType()); + if (!outTy || outTy.getElementType() != i32Ty) +return {}; + + return inOp; +} + +// Designate the operation (resp. instruction) used to do sub-tile matrix +// multiplications. +enum class MMLA { + Signed, // smmla + Unsigned,// ummla + Mixed, // usmmla + MixedSwapped // usmmla with LHS and RHS swapped +}; + +// Create the matrix multply and accumulate operation according to `op`. +Value createMMLA(PatternRewriter &rewriter, MMLA op, Location loc, + mlir::VectorType accType, Value acc, Value lhs, Value rhs) { + switch (op) { + case MMLA::Signed: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::Unsigned: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::Mixed: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::MixedSwapped: +// The accumulator comes transposed and the result will be transposed +// later, so all we have to do here is swap the operands. +return rewriter.create(loc, accType, acc, rhs, lhs); + } +} + +class LowerContractionToSVEI8MMPattern +: public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(vector::ContractionOp op, +PatternRewriter &rewriter) const override { + +Location loc = op.getLoc(); +mlir::VectorType lhsType = op.getLhsType(); +mlir::VectorType rhsType = op.getRhsType(); + +// For now handle LHS and RHS<8x[N]> - these are the types we +// eventually expect from MMT4D. M and N dimensions must be even and at banach-space wrote: [nit] We shouldn't be concerned with MMT4D in this dialect - it's a much higher-level abstraction and this logic should be valid irrespective of how the input is generated. https://github.com/llvm/llvm-project/pull/135636 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][ArmSVE] Add initial lowering of vector.contract to SVE `*MMLA` instructions (PR #135636)
@@ -0,0 +1,304 @@ +//===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This file implements lowering patterns from vector.contract to +// SVE I8MM operations. +// +//===--- + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h" +#include "mlir/Dialect/ArmSVE/Transforms/Transforms.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "mlir/Dialect/UB/IR/UBOps.h" + +#define DEBUG_TYPE "lower-contract-to-arm-sve-i8mm" + +using namespace mlir; +using namespace mlir::arm_sve; + +namespace { +// Check if the given value is a result of the operation `T` (which must be +// sign- or zero- extend) from i8 to i32. Return the value before the extension. +template +inline std::enable_if_t<(std::is_base_of_v || + std::is_base_of_v), +std::optional> +extractExtOperand(Value v, Type i8Ty, Type i32Ty) { + auto extOp = dyn_cast_or_null(v.getDefiningOp()); + if (!extOp) +return {}; + + auto inOp = extOp.getIn(); + auto inTy = dyn_cast(inOp.getType()); + if (!inTy || inTy.getElementType() != i8Ty) +return {}; + + auto outTy = dyn_cast(extOp.getType()); + if (!outTy || outTy.getElementType() != i32Ty) +return {}; + + return inOp; +} + +// Designate the operation (resp. instruction) used to do sub-tile matrix +// multiplications. +enum class MMLA { + Signed, // smmla + Unsigned,// ummla + Mixed, // usmmla + MixedSwapped // usmmla with LHS and RHS swapped +}; + +// Create the matrix multply and accumulate operation according to `op`. +Value createMMLA(PatternRewriter &rewriter, MMLA op, Location loc, + mlir::VectorType accType, Value acc, Value lhs, Value rhs) { + switch (op) { + case MMLA::Signed: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::Unsigned: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::Mixed: +return rewriter.create(loc, accType, acc, lhs, rhs); + case MMLA::MixedSwapped: +// The accumulator comes transposed and the result will be transposed +// later, so all we have to do here is swap the operands. +return rewriter.create(loc, accType, acc, rhs, lhs); + } +} + +class LowerContractionToSVEI8MMPattern +: public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(vector::ContractionOp op, +PatternRewriter &rewriter) const override { + +Location loc = op.getLoc(); +mlir::VectorType lhsType = op.getLhsType(); +mlir::VectorType rhsType = op.getRhsType(); + +// For now handle LHS and RHS<8x[N]> - these are the types we +// eventually expect from MMT4D. M and N dimensions must be even and at +// least 2. +if (!lhsType.hasRank() || lhsType.getRank() != 2 || !rhsType.hasRank() || +rhsType.getRank() != 2) + return failure(); + +if (lhsType.isScalable() || !rhsType.isScalable()) + return failure(); + +// M, N, and K are the conventional names for matrix dimensions in the +// context of matrix multiplication. 
+auto M = lhsType.getDimSize(0); +auto N = rhsType.getDimSize(0); +auto K = rhsType.getDimSize(1); + +if (lhsType.getDimSize(1) != K || K != 8 || M < 2 || M % 2 != 0 || N < 2 || +N % 2 != 0 || !rhsType.getScalableDims()[0]) + return failure(); + +// Check permutation maps. For now only accept +// lhs: (d0, d1, d2) -> (d0, d2) +// rhs: (d0, d1, d2) -> (d1, d2) +// acc: (d0, d1, d2) -> (d0, d1) +// Note: RHS is transposed. +if (op.getIndexingMapsArray()[0] != +AffineMap::getMultiDimMapWithTargets(3, ArrayRef{0u, 2u}, + op.getContext()) || +op.getIndexingMapsArray()[1] != +AffineMap::getMultiDimMapWithTargets(3, ArrayRef{1u, 2u}, + op.getContext()) || +op.getIndexingMapsArray()[2] != +AffineMap::getMultiDimMapWithTargets(3, ArrayRef{0u, 1u}, + op.getContext())) + return failure(); + +// Check iterator types for matrix multiplication. +auto itTypes = op.getIteratorTypesArray(); +if (itTypes.size() != 3 || itTypes[0] != vector::IteratorType::parallel || +itTypes[1] != vector::IteratorType
[llvm-branch-commits] [mlir] [MLIR][ArmSVE] Add initial lowering of vector.contract to SVE `*MMLA` instructions (PR #135636)
https://github.com/banach-space edited https://github.com/llvm/llvm-project/pull/135636 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [MLIR][ArmSVE] Add initial lowering of vector.contract to SVE `*MMLA` instructions (PR #135636)
@@ -0,0 +1,304 @@ +//===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This file implements lowering patterns from vector.contract to +// SVE I8MM operations. +// +//===--- banach-space wrote: ```suggestion //===--===//``` https://github.com/llvm/llvm-project/pull/135636 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits