[llvm-branch-commits] [mlir] users/banach space/vector/update create write (PR #141567)
llvmbot wrote:

@llvm/pr-subscribers-mlir-vector

Author: Andrzej Warzyński (banach-space)

Changes

- **[mlir][linalg] Refactor vectorization hooks to improve code reuse**
- **[mlir][linalg] Simplify `createWriteOrMaskedWrite` (NFC)**

---

Full diff: https://github.com/llvm/llvm-project/pull/141567.diff

1 Files Affected:

- (modified) mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp (+40-78)

```diff
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 0113ba86a5ae3..2abb2f0ea467c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1590,61 +1590,46 @@ static bool isMaskTriviallyFoldable(SmallVector<OpFoldResult> &maskSizes,
 /// Creates an optionally masked TransferWriteOp
 ///
 /// Generates the following operation:
-///   %res = vector.transfer_write %vectorToStore into %dest
+///   %res = vector.transfer_write %vecToStore into %dest
 ///
-/// If the leading N dimensions of the vector to store do not match
-/// `inputVecSizesForLeadingDims` (N = rank(inputVecSizesForLeadingDims)),
-/// masking is applied to ensure correctness:
+/// If shape(vecToStore) != shape(dest), masking is used to ensure correctness:
 ///
-///   %mask = vector.create_mask(%destShape) : %vectorToStoreShape
+///   %mask = vector.create_mask(%destShape) : %vecToStoreShape
 ///   %res = vector.mask %mask {
-///     vector.transfer_write %vectorToStore into %dest
+///     vector.transfer_write %vecToStore into %dest
 ///   }
 ///
-/// The mask shape is identical to `vectorToStore` (with the element type ==
+/// The mask shape is identical to `vecToStore` (with the element type ==
 /// i1), and the mask values are based on the shape of the `dest` tensor.
 ///
 /// If `useInBoundsInsteadOfMasking` is set to `true`, the `in_bounds` attribute
 /// is used instead of masking:
 ///
-///   %write = vector.transfer_write %vectorToStore into %dest
+///   %write = vector.transfer_write %vecToStore into %dest
 ///       in_bounds_flags = (...)
 ///   %res = vector.transfer_write %input into %dest
 ///       {in_bounds = in_bounds_flags}
 ///
-/// `writeIndices` specifies the offsets to use. If empty, all indices are set
-/// to 0.
-///
-/// NOTE: When N < rank(vectorToStore), the missing vector sizes are taken from
-/// `valueToStore`.
-/// TODO: `inputVecSizesForLeadingDims` should not be required - these sizes are
-/// already provided in `vectorToStore`.
+/// Finally, `writeIndices` specifies the offsets to use. If empty, all indices
+/// are set to 0.
 static Operation *
-createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore,
-                         Value dest,
-                         ArrayRef<int64_t> inputVecSizesForLeadingDims,
-                         SmallVector<Value> writeIndices = {},
+createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vecToStore,
+                         Value dest, SmallVector<Value> writeIndices = {},
                          bool useInBoundsInsteadOfMasking = false) {
   ShapedType destType = cast<ShapedType>(dest.getType());
   int64_t destRank = destType.getRank();
   auto destShape = destType.getShape();
 
-  VectorType vecToStoreType = cast<VectorType>(vectorToStore.getType());
+  VectorType vecToStoreType = cast<VectorType>(vecToStore.getType());
   int64_t vecToStoreRank = vecToStoreType.getRank();
   auto vecToStoreShape = vecToStoreType.getShape();
 
   // Compute the in_bounds attribute
   SmallVector<bool> inBoundsVal(vecToStoreRank, true);
   if (useInBoundsInsteadOfMasking) {
-    // In this case, assume that all the required vector sizes have been
-    // provided.
-    assert(inputVecSizesForLeadingDims.size() ==
-               static_cast<size_t>(vecToStoreType.getRank()) &&
-           "Insufficient number of input vector sizes!");
-    // Update the inBounds attribute.
     for (unsigned i = 0; i < destRank; i++)
-      inBoundsVal[i] = (destShape[i] == inputVecSizesForLeadingDims[i]) &&
+      inBoundsVal[i] = (destShape[i] == vecToStoreShape[i]) &&
                        !ShapedType::isDynamic(destShape[i]);
   }
 
@@ -1660,7 +1645,7 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore,
   // Generate the xfer_write Op
   Operation *write = builder.create<vector::TransferWriteOp>(loc,
-                                              /*vector=*/vectorToStore,
+                                              /*vector=*/vecToStore,
                                               /*source=*/dest,
                                               /*indices=*/writeIndices,
                                               /*inBounds=*/inBoundsVal);
@@ -1669,46 +1654,25 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore,
   if (useInBoundsInsteadOfMasking)
     return write;
 
-  assert(llvm::none_of(
-             destShape.drop_front(inputVecSizesForLeadingDims.size()),
-             [](int64_t size) { return size == ShapedType::kDyna
```
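For illustration, a minimal standalone sketch of the in_bounds computation that the simplified hook now performs: a dimension is in bounds only when the destination extent is static and matches the extent of the vector being stored. The helper name, the kDynamic stand-in, and the sample shapes below are illustrative, not part of the patch:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for ShapedType::kDynamic; only used to mark dynamic extents here.
constexpr int64_t kDynamic = INT64_MIN;

// Mirrors the per-dimension check in createWriteOrMaskedWrite after the
// change: compare destShape[i] against vecToStoreShape[i] directly.
std::vector<bool> computeInBounds(const std::vector<int64_t> &destShape,
                                  const std::vector<int64_t> &vecShape) {
  std::vector<bool> inBounds(vecShape.size(), true);
  for (size_t i = 0; i < destShape.size() && i < vecShape.size(); ++i)
    inBounds[i] = destShape[i] == vecShape[i] && destShape[i] != kDynamic;
  return inBounds;
}

int main() {
  // Hypothetical shapes: dest tensor 8x16, vector to store 8x8.
  for (bool b : computeInBounds({8, 16}, {8, 8}))
    std::cout << (b ? "true " : "false ");
  std::cout << '\n'; // prints: true false
}
```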
[llvm-branch-commits] [mlir] users/banach space/vector/update create write (PR #141567)
https://github.com/banach-space edited https://github.com/llvm/llvm-project/pull/141567 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -665,3 +724,9 @@ def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Re // sir -> sir 0 def : InstAlias<"sir", (SIR 0), 0>; + +// pause reg_or_imm -> wrasr %g0, reg_or_imm, %asr27 +let Predicates = [HasOSA2011] in { +def : InstAlias<"pause $rs2", (WRASRrr ASR27, G0, IntRegs:$rs2), 1>; +def : InstAlias<"pause $simm13", (WRASRri ASR27, G0, simm13Op:$simm13), 1>; s-barannikov wrote: Indentation ```suggestion def : InstAlias<"pause $rs2", (WRASRrr ASR27, G0, IntRegs:$rs2)>; def : InstAlias<"pause $simm13", (WRASRri ASR27, G0, simm13Op:$simm13)>; ``` https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -141,6 +147,26 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
   return 0;
 }
 
+unsigned SparcMCCodeEmitter::getSImm5OpValue(const MCInst &MI, unsigned OpNo,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (MO.isImm())
+    return MO.getImm();
+
+  assert(MO.isExpr() &&
+         "getSImm5OpValue expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+
+  // Constant value, no fixup is needed
+  if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+    return CE->getValue();
+
+  llvm_unreachable("simm5 operands can only be used with constants!");

s-barannikov wrote:

Shouldn't it be a `R_SPARC_5` relocation? Either way, if this code is reachable, it shouldn't be `llvm_unreachable`.

https://github.com/llvm/llvm-project/pull/138403
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -331,6 +331,25 @@ multiclass reg_cond_alias { Requires<[Is64Bit]>; } +// Instruction aliases for compare-and-branch. +multiclass cwb_cond_alias { + def : InstAliashttps://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -408,6 +427,46 @@ defm : reg_cond_alias<"ne", 0b101>;
 defm : reg_cond_alias<"gz", 0b110>;
 defm : reg_cond_alias<"gez", 0b111>;
 
+defm : cwb_cond_alias<"ne", 0b1001>;
+defm : cwb_cond_alias<"e",  0b0001>;
+defm : cwb_cond_alias<"g",  0b1010>;
+defm : cwb_cond_alias<"le", 0b0010>;
+defm : cwb_cond_alias<"ge", 0b1011>;
+defm : cwb_cond_alias<"l",  0b0011>;
+defm : cwb_cond_alias<"gu", 0b1100>;
+defm : cwb_cond_alias<"leu", 0b0100>;
+defm : cwb_cond_alias<"cc", 0b1101>;
+defm : cwb_cond_alias<"cs", 0b0101>;
+defm : cwb_cond_alias<"pos", 0b1110>;
+defm : cwb_cond_alias<"neg", 0b0110>;
+defm : cwb_cond_alias<"vc", 0b1111>;
+defm : cwb_cond_alias<"vs", 0b0111>;
+let EmitPriority = 0 in
+{

s-barannikov wrote:

```suggestion
let EmitPriority = 0 in {
```

https://github.com/llvm/llvm-project/pull/138403
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -102,6 +102,49 @@ class F2_4 pattern = [], InstrItinClass itin = NoItinerary> + : InstSP { s-barannikov wrote: ```suggestion : InstSP { ``` https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -217,6 +243,18 @@ unsigned SparcMCCodeEmitter::getBranchOnRegTargetOpValue(
   return 0;
 }
 
+unsigned SparcMCCodeEmitter::getCompareAndBranchTargetOpValue(
+    const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())

s-barannikov wrote:

It can't be a register, can it?

https://github.com/llvm/llvm-project/pull/138403
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -50,6 +50,15 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { return (d16hi << 20) | d16lo; } + case ELF::R_SPARC_WDISP10: { +// 7.17 Compare and Branch +// Inst{20-19} = d10hi; +// Inst{12-5} = d10lo; +unsigned d10hi = (Value >> 10) & 0x3; s-barannikov wrote: Assert that `Value` is a multiple of 4? (Assuming it is guaranteed elsewhere.) https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
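As a side note, the displacement packing under discussion can be modeled in isolation. The sketch below mirrors the adjustFixupValue logic for R_SPARC_WDISP10 (2-bit high part in instruction bits 20-19, 8-bit low part in bits 12-5) and adds the alignment assertion the reviewer suggests; the function name and sample offset are illustrative, not the actual backend code:

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

// Illustrative model of the R_SPARC_WDISP10 fixup adjustment: Value is the
// byte displacement; the 10-bit word displacement is split across two
// instruction fields.
uint32_t encodeWDISP10(uint64_t Value) {
  assert((Value & 0x3) == 0 && "branch target must be 4-byte aligned");
  uint32_t d10hi = (Value >> 10) & 0x3;  // bits 11-10 of the byte offset
  uint32_t d10lo = (Value >> 2) & 0xff;  // bits 9-2 of the byte offset
  return (d10hi << 19) | (d10lo << 5);   // Inst{20-19} and Inst{12-5}
}

int main() {
  // Hypothetical branch offset of 24 bytes (6 words).
  std::cout << std::hex << encodeWDISP10(24) << '\n'; // prints c0
}
```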
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -102,6 +102,49 @@ class F2_4 pattern = [], InstrItinClass itin = NoItinerary> + : InstSP { + bits<10> imm10; + bits<5> rs1; + bits<5> rs2; + bits<4> cond; + + let op = 0; // op = 0 + + let Inst{29}= cond{3}; + let Inst{28}= 1; + let Inst{27-25} = cond{2-0}; + let Inst{24-22} = 0b011; + let Inst{21}= cc; + let Inst{20-19} = imm10{9-8}; + let Inst{18-14} = rs1; + let Inst{13}= 0; // i = 0 + let Inst{12-5} = imm10{7-0}; + let Inst{4-0} = rs2; +} + +class F2_6 pattern = [], InstrItinClass itin = NoItinerary> + : InstSP { s-barannikov wrote: ```suggestion : InstSP { ``` https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -0,0 +1,267 @@ +! RUN: llvm-mc -triple=sparcv9 -mattr=+osa2011 -filetype=obj %s | llvm-objdump --mattr=+osa2011 --no-print-imm-hex -d - | FileCheck %s --check-prefix=BIN + +!! SPARCv9/SPARC64 BPr branches have different offset encoding from the others, s-barannikov wrote: Indentation looks unnecessary https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LAA] Support monotonic pointers in LoopAccessAnalysis (PR #140721)
skachkov-sc wrote: Gentle ping https://github.com/llvm/llvm-project/pull/140721 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] users/banach space/vector/update create write (PR #141567)
https://github.com/banach-space edited https://github.com/llvm/llvm-project/pull/141567 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [compiler-rt] 0412d56 - Revert "[compiler-rt][XRay] Make `xray_interface.h` C compliant (#140068)"
Author: Jan Patrick Lehr
Date: 2025-05-27T11:14:59+02:00
New Revision: 0412d56eaebb747cc77bd025969ed46e2b5cb12d

URL: https://github.com/llvm/llvm-project/commit/0412d56eaebb747cc77bd025969ed46e2b5cb12d
DIFF: https://github.com/llvm/llvm-project/commit/0412d56eaebb747cc77bd025969ed46e2b5cb12d.diff

LOG: Revert "[compiler-rt][XRay] Make `xray_interface.h` C compliant (#140068)"

This reverts commit 80da58da343620e458e34f01df95b329e7a5763c.

Added: 

Modified: 
    compiler-rt/include/xray/xray_interface.h

Removed: 
    compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c


diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h
index 3ef8ee348540f..675ea0cbc48c8 100644
--- a/compiler-rt/include/xray/xray_interface.h
+++ b/compiler-rt/include/xray/xray_interface.h
@@ -1,4 +1,4 @@
-//===- xray_interface.h ---===//
+//===- xray_interface.h -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,17 +14,10 @@
 #ifndef XRAY_XRAY_INTERFACE_H
 #define XRAY_XRAY_INTERFACE_H
 
-#ifdef __cplusplus
 #include <cstddef>
 #include <cstdint>
-#else
-#include <stddef.h>
-#include <stdint.h>
-#endif
 
-#ifdef __cplusplus
 extern "C" {
-#endif
 
 /// Synchronize this with AsmPrinter::SledKind in LLVM.
 enum XRayEntryType {
@@ -56,7 +49,7 @@ enum XRayEntryType {
 /// achieved by marking them all with: __attribute__((xray_never_instrument))
 ///
 /// Returns 1 on success, 0 on error.
-extern int __xray_set_handler(void (*entry)(int32_t, enum XRayEntryType));
+extern int __xray_set_handler(void (*entry)(int32_t, XRayEntryType));
 
 /// This removes whatever the currently provided handler is. Returns 1 on
 /// success, 0 on error.
@@ -67,7 +60,7 @@ extern int __xray_remove_handler();
 /// start logging their subsequent affected function calls (if patched).
 ///
 /// Returns 1 on success, 0 on error.
-extern int __xray_set_handler_arg1(void (*entry)(int32_t, enum XRayEntryType,
+extern int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType,
                                                   uint64_t));
 
 /// Disables the XRay handler used to log first arguments of function calls.
@@ -75,7 +68,7 @@ extern int __xray_set_handler_arg1(void (*entry)(int32_t, enum XRayEntryType,
 extern int __xray_remove_handler_arg1();
 
 /// Provide a function to invoke when XRay encounters a custom event.
-extern int __xray_set_customevent_handler(void (*entry)(void *, size_t));
+extern int __xray_set_customevent_handler(void (*entry)(void *, std::size_t));
 
 /// This removes whatever the currently provided custom event handler is.
 /// Returns 1 on success, 0 on error.
@@ -102,39 +95,39 @@ enum XRayPatchingStatus {
 
 /// This tells XRay to patch the instrumentation points in all currently loaded
 /// objects. See XRayPatchingStatus for possible result values.
-extern enum XRayPatchingStatus __xray_patch();
+extern XRayPatchingStatus __xray_patch();
 
 /// This tells XRay to patch the instrumentation points in the given object.
 /// See XRayPatchingStatus for possible result values.
-extern enum XRayPatchingStatus __xray_patch_object(int32_t ObjId);
+extern XRayPatchingStatus __xray_patch_object(int32_t ObjId);
 
 /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible
 /// result values.
-extern enum XRayPatchingStatus __xray_unpatch();
+extern XRayPatchingStatus __xray_unpatch();
 
 /// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for
 /// possible result values.
-extern enum XRayPatchingStatus __xray_unpatch_object(int32_t ObjId);
+extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId);
 
 /// This unpacks the given (packed) function id and patches
 /// the corresponding function. See XRayPatchingStatus for possible
 /// result values.
-extern enum XRayPatchingStatus __xray_patch_function(int32_t FuncId);
+extern XRayPatchingStatus __xray_patch_function(int32_t FuncId);
 
 /// This patches a specific function in the given object. See XRayPatchingStatus
 /// for possible result values.
-extern enum XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId,
-                                                                int32_t ObjId);
+extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId,
+                                                           int32_t ObjId);
 
 /// This unpacks the given (packed) function id and unpatches
 /// the corresponding function. See XRayPatchingStatus for possible
 /// result values.
-extern enum XRayPatchingStatus __xray_unpatch_function(int32_t FuncId);
+extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId);
 
 /// This unpatches a specific function in the given object.
 /// See XRayPatchingStatus for pos
[llvm-branch-commits] [llvm] AMDGPU: Add baseline tests for #139317 (PR #140607)
arsenm wrote: ping https://github.com/llvm/llvm-project/pull/140607 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [mlir][linalg] Simplify `createWriteOrMaskedWrite` (NFC) (PR #141567)
https://github.com/banach-space edited https://github.com/llvm/llvm-project/pull/141567 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Fix tracking subreg defs when folding through reg_sequence (PR #140608)
arsenm wrote: ping https://github.com/llvm/llvm-project/pull/140608 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] ELF: Add branch-to-branch optimization. (PR #138366)
@@ -0,0 +1,92 @@ +//===- TargetImpl.h -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLD_ELF_ARCH_TARGETIMPL_H +#define LLD_ELF_ARCH_TARGETIMPL_H + +#include "InputFiles.h" +#include "InputSection.h" +#include "Relocations.h" +#include "Symbols.h" +#include "llvm/BinaryFormat/ELF.h" + +namespace lld { +namespace elf { + +// getControlTransferAddend: If this relocation is used for control transfer +// instructions (e.g. branch, branch-link or call) or code references (e.g. +// virtual function pointers) and indicates an address-insignificant reference, +// return the effective addend for the relocation, otherwise return +// std::nullopt. The effective addend for a relocation is the addend that is +// used to determine its branch destination. +// +// getBranchInfo: If a control transfer relocation referring to is+offset +// directly transfers control to a relocated branch instruction in the specified +// section, return the relocation for the branch target as well as its effective +// addend (see above). Otherwise return {nullptr, 0}. +// +// mergeControlTransferRelocations: Given r1, a relocation for which +// getControlTransferAddend() returned a value, and r2, a relocation returned by +// getBranchInfo(), modify r1 so that it branches directly to the target of r2. +template +inline void applyBranchToBranchOptImpl( +Ctx &ctx, GetBranchInfo getBranchInfo, +GetControlTransferAddend getControlTransferAddend, +MergeControlTransferRelocations mergeControlTransferRelocations) { + // Needs to run serially because it writes to the relocations array as well as + // reading relocations of other sections. + for (ELFFileBase *f : ctx.objectFiles) { +auto getRelocBranchInfo = +[&getBranchInfo](Relocation &r, + uint64_t addend) -> std::pair { + auto *target = dyn_cast_or_null(r.sym); + // We don't allow preemptible symbols or ifuncs (may go somewhere else), + // absolute symbols (runtime behavior unknown), non-executable memory + // (ditto) or non-regular sections (no section data). + if (!target || target->isPreemptible || target->isGnuIFunc() || smithp35 wrote: Yes, just checked and it does copy the relocation addend. I agree that this wouldn't need a test case. As an aside when checking where the addends were read in I ran into this bit of copyRelocations https://github.com/llvm/llvm-project/blob/main/lld/ELF/InputSection.cpp#L433 ``` if (ctx.arg.relax && !ctx.arg.relocatable && (ctx.arg.emachine == EM_RISCV || ctx.arg.emachine == EM_LOONGARCH)) { // On LoongArch and RISC-V, relaxation might change relocations: copy // from internal ones that are updated by relaxation. InputSectionBase *sec = getRelocatedSection(); copyRelocations( ctx, buf, llvm::make_range(sec->relocations.begin(), sec->relocations.end())); ``` I think I mentioned in a previous comment that bolt uses emit-relocations so it may be worth following suite here when the transformation is applied. I suspect that if bolt trusts the original relocation then in worst case the transformation is undone though. https://github.com/llvm/llvm-project/pull/138366 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [Driver] Fix link order of BareMetal toolchain object (PR #132806)
https://github.com/quic-garvgupt updated https://github.com/llvm/llvm-project/pull/132806 >From 597b0e0514f26f22e9425f25652657194aa48fc0 Mon Sep 17 00:00:00 2001 From: Garvit Gupta Date: Mon, 24 Mar 2025 06:17:42 -0700 Subject: [PATCH] [Driver] Fix link order of BareMetal toolchain object The linker job in BareMetal toolchain object will be used by gnuld and lld both. However, gnuld process the arguments in the order in which they appear on command line, whereas there is no such restriction with lld. The previous order was: LibraryPaths -> Libraries -> LTOOptions -> LinkerInputs The new iorder is: LibraryPaths -> LTOOptions -> LinkerInputs -> Libraries LTO options need to be added before adding any linker inputs because file format after compile stage during LTO is bitcode which gnuld natively cannot process. Hence iwill need to pass appropriate plugins before adding any bitcode file on the command line. Object files that are getting linked need to be passed before processing any libraries so that gnuld can appropriately do symbol resolution for the symbols for which no definition is provided through user code. Similar link order is also followed by other linker jobs for gnuld such as in gnutools::Linker in Gnu.cpp This is the 3rd patch in the series of patches of merging RISCVToolchain into BareMetal toolchain object. RFC: https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524 Change-Id: I0e68e403c08b5687cc3346e833981f7b9f3819c4 --- clang/lib/Driver/ToolChains/BareMetal.cpp | 32 - clang/test/Driver/aarch64-toolchain-extra.c | 2 +- clang/test/Driver/aarch64-toolchain.c | 28 clang/test/Driver/arm-toolchain-extra.c | 2 +- clang/test/Driver/arm-toolchain.c | 28 clang/test/Driver/baremetal-multilib.yaml | 3 +- clang/test/Driver/baremetal-sysroot.cpp | 8 ++- clang/test/Driver/baremetal.cpp | 79 + 8 files changed, 102 insertions(+), 80 deletions(-) diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index 8aee32c04d46a..35cc052429270 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -545,8 +545,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, const llvm::Triple::ArchType Arch = TC.getArch(); const llvm::Triple &Triple = getToolChain().getEffectiveTriple(); - AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA); - CmdArgs.push_back("-Bstatic"); if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax)) @@ -596,6 +594,22 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, for (const auto &LibPath : TC.getLibraryPaths()) CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-L", LibPath))); + if (D.isUsingLTO()) { +assert(!Inputs.empty() && "Must have at least one input."); +// Find the first filename InputInfo object. +auto Input = llvm::find_if( +Inputs, [](const InputInfo &II) -> bool { return II.isFilename(); }); +if (Input == Inputs.end()) + // For a very rare case, all of the inputs to the linker are + // InputArg. If that happens, just use the first InputInfo. 
+ Input = Inputs.begin(); + +addLTOOptions(TC, Args, CmdArgs, Output, *Input, + D.getLTOMode() == LTOK_Thin); + } + + AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA); + if (TC.ShouldLinkCXXStdlib(Args)) { bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) && !Args.hasArg(options::OPT_static); @@ -616,20 +630,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--end-group"); } - if (D.isUsingLTO()) { -assert(!Inputs.empty() && "Must have at least one input."); -// Find the first filename InputInfo object. -auto Input = llvm::find_if( -Inputs, [](const InputInfo &II) -> bool { return II.isFilename(); }); -if (Input == Inputs.end()) - // For a very rare case, all of the inputs to the linker are - // InputArg. If that happens, just use the first InputInfo. - Input = Inputs.begin(); - -addLTOOptions(TC, Args, CmdArgs, Output, *Input, - D.getLTOMode() == LTOK_Thin); - } - if ((TC.hasValidGCCInstallation() || hasGCCToolChainAlongSideClang(D)) && NeedCRTs) CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd))); diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c index 2a930e35acd45..a0b5f2902962f 100644 --- a/clang/test/Driver/aarch64-toolchain-extra.c +++ b/clang/test/Driver/aarch64-toolchain-extra.c @@ -31,5 +31,5 @@ // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib/crt0.o" // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtbegin.o" // C-AARCH64-BAREMETAL-NOGCC
[llvm-branch-commits] [clang] [RISCV] Integrate RISCV target in baremetal toolchain object and deprecate RISCVToolchain object (PR #121831)
https://github.com/quic-garvgupt updated https://github.com/llvm/llvm-project/pull/121831 >From 08e30252fc4bd87d84decc81161c081c774d398e Mon Sep 17 00:00:00 2001 From: Garvit Gupta Date: Mon, 6 Jan 2025 10:05:08 -0800 Subject: [PATCH] [RISCV] Integrate RISCV target in baremetal toolchain object and deprecate RISCVToolchain object This patch: - Adds CXXStdlib, runtimelib and unwindlib defaults for riscv target to BareMetal toolchain object. - Add riscv 32 and 64-bit emulation flags to linker job of BareMetal toolchain. - Removes call to RISCVToolChain object from llvm. This PR is last patch in the series of patches of merging RISCVToolchain object into BareMetal toolchain object. RFC: https: //discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524 Change-Id: Ic5d64a4ed3ebc58c30c12d9827e7e57a02eb13ca --- clang/lib/Driver/CMakeLists.txt | 1 - clang/lib/Driver/Driver.cpp | 10 +- clang/lib/Driver/ToolChains/BareMetal.cpp | 20 ++ clang/lib/Driver/ToolChains/BareMetal.h | 11 +- .../lib/Driver/ToolChains/RISCVToolchain.cpp | 232 -- clang/lib/Driver/ToolChains/RISCVToolchain.h | 67 - .../test/Driver/baremetal-undefined-symbols.c | 14 +- clang/test/Driver/riscv32-toolchain-extra.c | 7 +- clang/test/Driver/riscv32-toolchain.c | 26 +- clang/test/Driver/riscv64-toolchain-extra.c | 7 +- clang/test/Driver/riscv64-toolchain.c | 20 +- 11 files changed, 61 insertions(+), 354 deletions(-) delete mode 100644 clang/lib/Driver/ToolChains/RISCVToolchain.cpp delete mode 100644 clang/lib/Driver/ToolChains/RISCVToolchain.h diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 5bdb6614389cf..eee29af5d181a 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt @@ -74,7 +74,6 @@ add_clang_library(clangDriver ToolChains/OHOS.cpp ToolChains/OpenBSD.cpp ToolChains/PS4CPU.cpp - ToolChains/RISCVToolchain.cpp ToolChains/Solaris.cpp ToolChains/SPIRV.cpp ToolChains/SPIRVOpenMP.cpp diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 07e36ea2efba4..cfc0ba63d5749 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -41,7 +41,6 @@ #include "ToolChains/PPCFreeBSD.h" #include "ToolChains/PPCLinux.h" #include "ToolChains/PS4CPU.h" -#include "ToolChains/RISCVToolchain.h" #include "ToolChains/SPIRV.h" #include "ToolChains/SPIRVOpenMP.h" #include "ToolChains/SYCL.h" @@ -6889,16 +6888,11 @@ const ToolChain &Driver::getToolChain(const ArgList &Args, TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::msp430: -TC = -std::make_unique(*this, Target, Args); +TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::riscv32: case llvm::Triple::riscv64: -if (toolchains::RISCVToolChain::hasGCCToolchain(*this, Args)) - TC = - std::make_unique(*this, Target, Args); -else - TC = std::make_unique(*this, Target, Args); +TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::ve: TC = std::make_unique(*this, Target, Args); diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index ac85757442e18..b9302433a1d63 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -352,6 +352,26 @@ BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const { return llvm::reverse(Default); } +ToolChain::CXXStdlibType BareMetal::GetDefaultCXXStdlibType() const { + if (getTriple().isRISCV() && GCCInstallation.isValid()) +return ToolChain::CST_Libstdcxx; + return ToolChain::CST_Libcxx; +} + 
+ToolChain::RuntimeLibType BareMetal::GetDefaultRuntimeLibType() const { + if (getTriple().isRISCV() && GCCInstallation.isValid()) +return ToolChain::RLT_Libgcc; + return ToolChain::RLT_CompilerRT; +} + +ToolChain::UnwindLibType +BareMetal::GetUnwindLibType(const llvm::opt::ArgList &Args) const { + if (getTriple().isRISCV()) +return ToolChain::UNW_None; + + return ToolChain::GetUnwindLibType(Args); +} + void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdinc)) diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h index 87f173342def2..580f5c6903c1f 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.h +++ b/clang/lib/Driver/ToolChains/BareMetal.h @@ -54,12 +54,11 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF { return UnwindTableLevel::None; } - RuntimeLibType GetDefaultRuntimeLibType() const override { -return ToolChain::RLT_CompilerRT; - } - CXXStdlibType GetDefaultCXXStdlibType() const overr
[llvm-branch-commits] [clang] [RISCV][Driver] Add riscv emulation mode to linker job of BareMetal toolchain (PR #134442)
https://github.com/quic-garvgupt updated https://github.com/llvm/llvm-project/pull/134442 >From 46b1136d0bab3cfc30029070597b76b4c2cbcbcf Mon Sep 17 00:00:00 2001 From: Garvit Gupta Date: Fri, 4 Apr 2025 12:51:19 -0700 Subject: [PATCH] [RISCV][Driver] Add riscv emulation mode to linker job of BareMetal toolchain Change-Id: Ifce8a3a7f1df9c12561d35ca3c923595e3619428 --- clang/lib/Driver/ToolChains/BareMetal.cpp | 15 - clang/lib/Driver/ToolChains/CommonArgs.cpp | 70 ++ clang/lib/Driver/ToolChains/CommonArgs.h | 2 + clang/lib/Driver/ToolChains/Gnu.cpp| 70 -- clang/test/Driver/baremetal.cpp| 28 - 5 files changed, 99 insertions(+), 86 deletions(-) diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index 691b6c7336c7a..ac85757442e18 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -550,8 +550,19 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-Bstatic"); - if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax)) -CmdArgs.push_back("--no-relax"); + if (Triple.isRISCV()) { +if (const char *LDMOption = getLDMOption(TC.getTriple(), Args)) { + CmdArgs.push_back("-m"); + CmdArgs.push_back(LDMOption); +} else { + D.Diag(diag::err_target_unknown_triple) << Triple.str(); + return; +} + +CmdArgs.push_back("-X"); +if (Args.hasArg(options::OPT_mno_relax)) + CmdArgs.push_back("--no-relax"); + } if (Triple.isARM() || Triple.isThumb()) { bool IsBigEndian = arm::isARMBigEndian(Triple, Args); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index ddeadff8f6dfb..292d52acdc002 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -535,6 +535,76 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs, } } +const char *tools::getLDMOption(const llvm::Triple &T, const ArgList &Args) { + switch (T.getArch()) { + case llvm::Triple::x86: +if (T.isOSIAMCU()) + return "elf_iamcu"; +return "elf_i386"; + case llvm::Triple::aarch64: +return "aarch64linux"; + case llvm::Triple::aarch64_be: +return "aarch64linuxb"; + case llvm::Triple::arm: + case llvm::Triple::thumb: + case llvm::Triple::armeb: + case llvm::Triple::thumbeb: +return tools::arm::isARMBigEndian(T, Args) ? 
"armelfb_linux_eabi" + : "armelf_linux_eabi"; + case llvm::Triple::m68k: +return "m68kelf"; + case llvm::Triple::ppc: +if (T.isOSLinux()) + return "elf32ppclinux"; +return "elf32ppc"; + case llvm::Triple::ppcle: +if (T.isOSLinux()) + return "elf32lppclinux"; +return "elf32lppc"; + case llvm::Triple::ppc64: +return "elf64ppc"; + case llvm::Triple::ppc64le: +return "elf64lppc"; + case llvm::Triple::riscv32: +return "elf32lriscv"; + case llvm::Triple::riscv64: +return "elf64lriscv"; + case llvm::Triple::sparc: + case llvm::Triple::sparcel: +return "elf32_sparc"; + case llvm::Triple::sparcv9: +return "elf64_sparc"; + case llvm::Triple::loongarch32: +return "elf32loongarch"; + case llvm::Triple::loongarch64: +return "elf64loongarch"; + case llvm::Triple::mips: +return "elf32btsmip"; + case llvm::Triple::mipsel: +return "elf32ltsmip"; + case llvm::Triple::mips64: +if (tools::mips::hasMipsAbiArg(Args, "n32") || T.isABIN32()) + return "elf32btsmipn32"; +return "elf64btsmip"; + case llvm::Triple::mips64el: +if (tools::mips::hasMipsAbiArg(Args, "n32") || T.isABIN32()) + return "elf32ltsmipn32"; +return "elf64ltsmip"; + case llvm::Triple::systemz: +return "elf64_s390"; + case llvm::Triple::x86_64: +if (T.isX32()) + return "elf32_x86_64"; +return "elf_x86_64"; + case llvm::Triple::ve: +return "elf64ve"; + case llvm::Triple::csky: +return "cskyelf_linux"; + default: +return nullptr; + } +} + void tools::addLinkerCompressDebugSectionsOption( const ToolChain &TC, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 96bc0619dcbc0..875354e969a2a 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -31,6 +31,8 @@ void AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const JobAction &JA); +const char *getLDMOption(const llvm::Triple &T, const llvm::opt::ArgList &Args); + void addLinkerCompressDebugSectionsOption(const ToolChain &TC, const llvm::opt::ArgList &Args,
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
Pierre-vh wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests

* **#141591**
* **#141590**
* **#141589** 👈 (this PR)
* **#141588**
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/141589
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/141589 None >From e5f24775ff988e5c6ac302f36b010fc0421eca34 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 11:16:16 +0200 Subject: [PATCH] [AMDGPU] Move S_BFE lowering into RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 14 +- .../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 51 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 125 -- 3 files changed, 119 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 9587fad1ecd63..94e1175b06b14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, + lower_uniform_sbfx, lower_uniform_ubfx]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index ee324a5e93f0f..2100900bb8eb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + bool lowerUniformBFX(MachineInstr &MI) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) + ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) + : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + // Ensure the high bits are clear to insert the offset. 
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. + auto ShiftAmt = B.buildConstant(S32, 16); + auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + MRI.setRegBank(OffsetMask.getReg(0), *RB); + MRI.setRegBank(ClampOffset.getReg(0), *RB); + MRI.setRegBank(ShiftAmt.getReg(0), *RB); + MRI.setRegBank(ShiftWidth.getReg(0), *RB); + MRI.setRegBank(MergedInputs.getReg(0), *RB); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) +llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo()->getMode(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index dd7aef8f0c583..0b7d64ee67c34 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/
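For readers following the packing logic in the patch above, here is a standalone sketch of how the offset and width are combined into the second S_BFE source operand: bits [5:0] hold the offset and bits [22:16] the width, as the patch's comments describe. The helper name and sample values are illustrative only:

```cpp
#include <cstdint>
#include <iostream>

// Illustrative model of the operand packing done before emitting
// S_BFE_{I|U}{32|64}: clamp the offset to 6 bits, place the width at bit 16,
// and OR the two fields together.
uint32_t packBFEOperand(uint32_t offset, uint32_t width) {
  uint32_t clampedOffset = offset & 0x3f; // keep only the low 6 bits
  uint32_t shiftedWidth = width << 16;    // width field starts at bit 16
  return clampedOffset | shiftedWidth;
}

int main() {
  // Hypothetical extract: 8 bits starting at bit 4.
  std::cout << std::hex << packBFEOperand(4, 8) << '\n'; // prints 80004
}
```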
[llvm-branch-commits] [llvm] AMDGPU: Fix tracking subreg defs when folding through reg_sequence (PR #140608)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140608 >From 6e7ab227c9d12cf82958ea0dd11461ec49bc4945 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 15 May 2025 10:51:39 +0200 Subject: [PATCH] AMDGPU: Fix tracking subreg defs when folding through reg_sequence We weren't fully respecting the type of a def of an immediate vs. the type at the use point. Refactor the folding logic to track the value to fold, as well as a subregister to apply to the underlying value. This is similar to how PeepholeOpt tracks subregisters (though only for pure copy-like instructions, no constants). Fixes #139317 --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 408 +++--- ...issue139317-bad-opsel-reg-sequence-fold.ll | 3 +- .../si-fold-operands-subreg-imm.gfx942.mir| 8 +- .../AMDGPU/si-fold-operands-subreg-imm.mir| 4 +- 4 files changed, 253 insertions(+), 170 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index cc18d6b4aba10..d7a4fa85e4034 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -25,52 +25,151 @@ using namespace llvm; namespace { -struct FoldCandidate { - MachineInstr *UseMI; +/// Track a value we may want to fold into downstream users, applying +/// subregister extracts along the way. +struct FoldableDef { union { -MachineOperand *OpToFold; +MachineOperand *OpToFold = nullptr; uint64_t ImmToFold; int FrameIndexToFold; }; - int ShrinkOpcode; - unsigned UseOpNo; + + /// Register class of the originally defined value. + const TargetRegisterClass *DefRC = nullptr; + + /// Track the original defining instruction for the value. + const MachineInstr *DefMI = nullptr; + + /// Subregister to apply to the value at the use point. + unsigned DefSubReg = AMDGPU::NoSubRegister; + + /// Kind of value stored in the union. MachineOperand::MachineOperandType Kind; - bool Commuted; - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, -bool Commuted_ = false, -int ShrinkOp = -1) : -UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo), -Kind(FoldOp->getType()), -Commuted(Commuted_) { -if (FoldOp->isImm()) { - ImmToFold = FoldOp->getImm(); -} else if (FoldOp->isFI()) { - FrameIndexToFold = FoldOp->getIndex(); + FoldableDef() = delete; + FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC, + unsigned DefSubReg = AMDGPU::NoSubRegister) + : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) { + +if (FoldOp.isImm()) { + ImmToFold = FoldOp.getImm(); +} else if (FoldOp.isFI()) { + FrameIndexToFold = FoldOp.getIndex(); } else { - assert(FoldOp->isReg() || FoldOp->isGlobal()); - OpToFold = FoldOp; + assert(FoldOp.isReg() || FoldOp.isGlobal()); + OpToFold = &FoldOp; } + +DefMI = FoldOp.getParent(); } - FoldCandidate(MachineInstr *MI, unsigned OpNo, int64_t FoldImm, -bool Commuted_ = false, int ShrinkOp = -1) - : UseMI(MI), ImmToFold(FoldImm), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo), -Kind(MachineOperand::MO_Immediate), Commuted(Commuted_) {} + FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC, + unsigned DefSubReg = AMDGPU::NoSubRegister) + : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg), +Kind(MachineOperand::MO_Immediate) {} + + /// Copy the current def and apply \p SubReg to the value. 
+ FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const { +FoldableDef Copy(*this); +Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg); +return Copy; + } + + bool isReg() const { return Kind == MachineOperand::MO_Register; } + + Register getReg() const { +assert(isReg()); +return OpToFold->getReg(); + } + + unsigned getSubReg() const { +assert(isReg()); +return OpToFold->getSubReg(); + } + + bool isImm() const { return Kind == MachineOperand::MO_Immediate; } bool isFI() const { return Kind == MachineOperand::MO_FrameIndex; } - bool isImm() const { -return Kind == MachineOperand::MO_Immediate; + int getFI() const { +assert(isFI()); +return FrameIndexToFold; } - bool isReg() const { -return Kind == MachineOperand::MO_Register; + bool isGlobal() const { return OpToFold->isGlobal(); } + + /// Return the effective immediate value defined by this instruction, after + /// application of any subregister extracts which may exist between the use + /// and def instruction. + std::optional getEffectiveImmVal() const { +assert(isImm()); +return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg); } - bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; } + /// Check if it is legal to fold this effective value into
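A rough standalone model of the idea behind the subregister-aware immediate tracking in this patch: reading a 64-bit constant through a 32-bit subregister yields its low or high half. The enum and helper below are illustrative stand-ins, not the actual SIInstrInfo::extractSubregFromImm implementation:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

// Illustrative stand-ins for AMDGPU subregister indices.
enum class SubReg { None, Sub0, Sub1 };

// Model of extracting the value seen through a 32-bit subregister of a
// 64-bit immediate: sub0 is the low half, sub1 the high half.
std::optional<int64_t> extractSubregFromImm(int64_t Imm, SubReg Idx) {
  switch (Idx) {
  case SubReg::None: return Imm;
  case SubReg::Sub0: return static_cast<int64_t>(static_cast<uint32_t>(Imm));
  case SubReg::Sub1: return static_cast<int64_t>(static_cast<uint32_t>(Imm >> 32));
  }
  return std::nullopt;
}

int main() {
  int64_t imm = 0x123456789abcdef0;
  std::cout << std::hex << *extractSubregFromImm(imm, SubReg::Sub0) << '\n'; // 9abcdef0
  std::cout << std::hex << *extractSubregFromImm(imm, SubReg::Sub1) << '\n'; // 12345678
}
```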
[llvm-branch-commits] [llvm] AMDGPU: Handle folding vector splats of inline split f64 inline immediates (PR #140878)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140878 >From 609dc72abf36343e62c4bb0bc149f9ba453f4236 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 19 May 2025 21:51:06 +0200 Subject: [PATCH] AMDGPU: Handle folding vector splats of inline split f64 inline immediates Recognize a reg_sequence with 32-bit elements that produce a 64-bit splat value. This enables folding f64 constants into mfma operands --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 103 -- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 41 +-- 2 files changed, 76 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index d7a4fa85e4034..7cf2549804bee 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -227,12 +227,12 @@ class SIFoldOperandsImpl { getRegSeqInit(SmallVectorImpl> &Defs, Register UseReg) const; - std::pair + std::pair isRegSeqSplat(MachineInstr &RegSeg) const; - MachineOperand *tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx, - MachineOperand *SplatVal, - const TargetRegisterClass *SplatRC) const; + bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx, + int64_t SplatVal, + const TargetRegisterClass *SplatRC) const; bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx, @@ -966,15 +966,15 @@ const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit( return getRegSeqInit(*Def, Defs); } -std::pair +std::pair SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const { SmallVector, 32> Defs; const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs); if (!SrcRC) return {}; - // TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize - // every other other element is 0 for 64-bit immediates) + bool TryToMatchSplat64 = false; + int64_t Imm; for (unsigned I = 0, E = Defs.size(); I != E; ++I) { const MachineOperand *Op = Defs[I].first; @@ -986,38 +986,75 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const { Imm = SubImm; continue; } -if (Imm != SubImm) + +if (Imm != SubImm) { + if (I == 1 && (E & 1) == 0) { +// If we have an even number of inputs, there's a chance this is a +// 64-bit element splat broken into 32-bit pieces. +TryToMatchSplat64 = true; +break; + } + return {}; // Can only fold splat constants +} + } + + if (!TryToMatchSplat64) +return {Defs[0].first->getImm(), SrcRC}; + + // Fallback to recognizing 64-bit splats broken into 32-bit pieces + // (i.e. recognize every other other element is 0 for 64-bit immediates) + int64_t SplatVal64; + for (unsigned I = 0, E = Defs.size(); I != E; I += 2) { +const MachineOperand *Op0 = Defs[I].first; +const MachineOperand *Op1 = Defs[I + 1].first; + +if (!Op0->isImm() || !Op1->isImm()) + return {}; + +unsigned SubReg0 = Defs[I].second; +unsigned SubReg1 = Defs[I + 1].second; + +// Assume we're going to generally encounter reg_sequences with sorted +// subreg indexes, so reject any that aren't consecutive. 
+if (TRI->getChannelFromSubReg(SubReg0) + 1 != +TRI->getChannelFromSubReg(SubReg1)) + return {}; + +int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm()); +if (I == 0) + SplatVal64 = MergedVal; +else if (SplatVal64 != MergedVal) + return {}; } - return {Defs[0].first, SrcRC}; + const TargetRegisterClass *RC64 = TRI->getSubRegisterClass( + MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1); + + return {SplatVal64, RC64}; } -MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat( -MachineInstr *UseMI, unsigned UseOpIdx, MachineOperand *SplatVal, +bool SIFoldOperandsImpl::tryFoldRegSeqSplat( +MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal, const TargetRegisterClass *SplatRC) const { const MCInstrDesc &Desc = UseMI->getDesc(); if (UseOpIdx >= Desc.getNumOperands()) -return nullptr; +return false; // Filter out unhandled pseudos. if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) -return nullptr; +return false; int16_t RCID = Desc.operands()[UseOpIdx].RegClass; if (RCID == -1) -return nullptr; +return false; + + const TargetRegisterClass *OpRC = TRI->getRegClass(RCID); // Special case 0/-1, since when interpreted as a 64-bit element both halves - // have the same bits. Effectively this code does not handle 64-bit element - // operands correctly, as the incoming 64-bit constants are already split into - // 32-bit sequence elements. - // - // TODO: We should try to figure out how to interpret the reg_sequence as a - /
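To make the commit message above concrete: the new matching path pairs consecutive 32-bit immediates of the reg_sequence into 64-bit values and only folds when every pair agrees. A hedged, standalone sketch of just that value check follows; the names are invented for illustration, and the real code additionally verifies that each pair's sub-register channels are consecutive and derives the 64-bit register class:

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Standalone sketch of the splat check, operating on the already-extracted
// 32-bit immediates of a reg_sequence (element I carries channel I).
static std::optional<int64_t> matchSplat64(const std::vector<uint32_t> &Elts) {
  if (Elts.size() < 2 || (Elts.size() & 1) != 0)
    return std::nullopt; // need an even number of 32-bit pieces

  auto Make64 = [](uint32_t Hi, uint32_t Lo) {
    return static_cast<int64_t>((static_cast<uint64_t>(Hi) << 32) | Lo);
  };

  int64_t Splat = Make64(Elts[1], Elts[0]);
  for (size_t I = 2; I < Elts.size(); I += 2)
    if (Make64(Elts[I + 1], Elts[I]) != Splat)
      return std::nullopt; // pieces do not form a uniform 64-bit value
  return Splat;
}

// Example: {0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000} matches and
// yields 0x3ff0000000000000, i.e. a splat of the f64 constant 1.0.
```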
[llvm-branch-commits] [llvm] AMDGPU: Remove redundant operand folding checks (PR #140587)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140587 >From c3f0ed4891b6cc34dc808e8673da5ff86a903df0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 19 May 2025 20:02:54 +0200 Subject: [PATCH 1/2] AMDGPU: Remove redundant operand folding checks This was pre-filtering out a specific situation from being added to the fold candidate list. The operand legality will ultimately be checked with isOperandLegal before the fold is performed, so I don't see the plus in pre-filtering this one case. --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 18 .../AMDGPU/fold-operands-frame-index.mir | 101 ++ 2 files changed, 101 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c45611582a53a..cc18d6b4aba10 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -777,24 +777,6 @@ bool SIFoldOperandsImpl::tryAddToFoldList( return true; } - // Check the case where we might introduce a second constant operand to a - // scalar instruction - if (TII->isSALU(MI->getOpcode())) { -const MCInstrDesc &InstDesc = MI->getDesc(); -const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; - -// Fine if the operand can be encoded as an inline constant -if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) { - // Otherwise check for another constant - for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) { -auto &Op = MI->getOperand(i); -if (OpNo != i && !Op.isReg() && -!TII->isInlineConstant(Op, InstDesc.operands()[i])) - return false; - } -} - } - appendFoldCandidate(FoldList, MI, OpNo, OpToFold); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir index 4417f205646ee..7fad2f466bc9f 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir @@ -573,3 +573,104 @@ body: | S_ENDPGM 0, implicit %2 ... + +--- +name:no_fold_multiple_fi_s_cselect_b32 +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } + - { id: 1, size: 32, alignment: 4 } +body: | + bb.0: +; CHECK-LABEL: name: no_fold_multiple_fi_s_cselect_b32 +; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1 +; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_MOV_B32_]], %stack.0, implicit undef $scc +; CHECK-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] +%0:sreg_32 = S_MOV_B32 %stack.0 +%1:sreg_32 = S_MOV_B32 %stack.1 +%2:sreg_32 = S_CSELECT_B32 killed %1, killed %0, implicit undef $scc +S_ENDPGM 0, implicit %2 + +... 
+ +--- +name:no_fold_multiple_fi_v_cndmask_b32_e64 +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } + - { id: 1, size: 32, alignment: 4 } +body: | + bb.0: +liveins: $sgpr8_sgpr9 +; GFX9-LABEL: name: no_fold_multiple_fi_v_cndmask_b32_e64 +; GFX9: liveins: $sgpr8_sgpr9 +; GFX9-NEXT: {{ $}} +; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr8_sgpr9 +; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec +; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.1, implicit $exec +; GFX9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_MOV_B32_e32_]], 0, killed [[V_MOV_B32_e32_1]], [[COPY]], implicit $exec +; GFX9-NEXT: S_ENDPGM 0, implicit [[V_CNDMASK_B32_e64_]] +; +; GFX10-LABEL: name: no_fold_multiple_fi_v_cndmask_b32_e64 +; GFX10: liveins: $sgpr8_sgpr9 +; GFX10-NEXT: {{ $}} +; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr8_sgpr9 +; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.1, implicit $exec +; GFX10-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, %stack.0, 0, killed [[V_MOV_B32_e32_]], [[COPY]], implicit $exec +; GFX10-NEXT: S_ENDPGM 0, implicit [[V_CNDMASK_B32_e64_]] +; +; GFX12-LABEL: name: no_fold_multiple_fi_v_cndmask_b32_e64 +; GFX12: liveins: $sgpr8_sgpr9 +; GFX12-NEXT: {{ $}} +; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr8_sgpr9 +; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.1, implicit $exec +; GFX12-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, %stack.0, 0, killed [[V_MOV_B32_e32_]], [[COPY]], implicit $exec +; GFX12-NEXT: S_ENDPGM 0, implicit [[V_CNDMASK_B32_e64_]] +%0:sreg_64_xexec = COPY $sgpr8_sgpr9 +%1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec +%2:vgpr_32 = V_MOV_B32_e32 %stack.1, implicit $exec +%3:vgpr_32 = V_CNDMASK_B32_e64 0, killed %1, 0, killed %2, %0, implicit $exec +
[llvm-branch-commits] [llvm] AMDGPU: Add baseline tests for #139317 (PR #140607)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140607 >From 8a6cb7bba02c0c6638a9b1789cf0feccd229f8b3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 14 May 2025 08:50:59 +0200 Subject: [PATCH] AMDGPU: Add baseline tests for #139317 --- .../CodeGen/AMDGPU/fold-imm-copy-agpr.mir | 135 + llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir| 513 ++ .../AMDGPU/fold-short-64-bit-literals.mir | 392 - ...issue139317-bad-opsel-reg-sequence-fold.ll | 66 +++ .../si-fold-operands-subreg-imm.gfx942.mir| 202 +++ .../AMDGPU/si-fold-operands-subreg-imm.mir| 26 + 6 files changed, 1329 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir create mode 100644 llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.gfx942.mir diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir new file mode 100644 index 0..a079ee1296f41 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir @@ -0,0 +1,135 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: v_mov_b64_pseudo_imm_0_copy_to_areg_64 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_imm_0_copy_to_areg_64 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[V_MOV_B]] +; GCN-NEXT: $agpr0_agpr1 = COPY [[COPY]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec +%1:areg_64_align2 = COPY %0 +$agpr0_agpr1 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_imm_neg1_copy_to_areg_64 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_imm_neg1_copy_to_areg_64 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -1, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[V_MOV_B]] +; GCN-NEXT: $agpr0_agpr1 = COPY [[COPY]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO -1, implicit $exec +%1:areg_64_align2 = COPY %0 +$agpr0_agpr1 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_literal_copy_to_areg_64 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_literal_copy_to_areg_64 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 999, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[V_MOV_B]] +; GCN-NEXT: $agpr0_agpr1 = COPY [[COPY]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 999, implicit $exec +%1:areg_64_align2 = COPY %0 +$agpr0_agpr1 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_imm_0_copy_sub0_to_agpr_32 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_imm_0_copy_sub0_to_agpr_32 +; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec +; GCN-NEXT: $agpr0 = COPY [[V_ACCVGPR_WRITE_B32_e64_]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec +%1:agpr_32 = COPY %0.sub0 +$agpr0 = COPY %1 +S_ENDPGM 0 + +... 
+ +--- +name: v_mov_b64_pseudo_imm_0_copy_sub1_to_agpr_32 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_imm_0_copy_sub1_to_agpr_32 +; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec +; GCN-NEXT: $agpr0 = COPY [[V_ACCVGPR_WRITE_B32_e64_]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec +%1:agpr_32 = COPY %0.sub1 +$agpr0 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_lit_copy_sub0_to_agpr_32 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_lit_copy_sub0_to_agpr_32 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B]].sub0 +; GCN-NEXT: $agpr0 = COPY [[COPY]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec +%1:agpr_32 = COPY %0.sub0 +$agpr0 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_lit_copy_sub1_to_agpr_32 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_lit_copy_sub1_to_agpr_32 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B]].sub1 +; GCN-NEXT: $agpr0 = COPY [[CO
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
https://github.com/Pierre-vh edited https://github.com/llvm/llvm-project/pull/141589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add BFX Formation Combines to RegBankCombiner (PR #141590)
Pierre-vh wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests * **#141591** * **#141590** 👈 (View in Graphite) * **#141589** * **#141588** * `main` This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/ https://github.com/llvm/llvm-project/pull/141590 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add BFX Formation Combines to RegBankCombiner (PR #141590)
https://github.com/Pierre-vh ready_for_review https://github.com/llvm/llvm-project/pull/141590 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Change inSectionBlame to return pair (FileIdx, LineNo). (PR #141540)
https://github.com/qinkunbao edited https://github.com/llvm/llvm-project/pull/141540 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] ELF: Add branch-to-branch optimization. (PR #138366)
@@ -975,6 +977,62 @@ void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { } } +static std::optional getControlTransferAddend(InputSection &is, +Relocation &r) { + // Identify a control transfer relocation for the branch-to-branch + // optimization. A "control transfer relocation" means a B or BL + // target but it also includes relative vtable relocations for example. + // + // We require the relocation type to be JUMP26, CALL26 or PLT32. With a + // relocation type of PLT32 the value may be assumed to be used for branching + // directly to the symbol and the addend is only used to produce the relocated + // value (hence the effective addend is always 0). This is because if a PLT is + // needed the addend will be added to the address of the PLT, and it doesn't + // make sense to branch into the middle of a PLT. For example, relative vtable + // relocations use PLT32 and 0 or a positive value as the addend but still are + // used to branch to the symbol. + // + // With JUMP26 or CALL26 the only reasonable interpretation of a non-zero + // addend is that we are branching to symbol+addend so that becomes the + // effective addend. + if (r.type == R_AARCH64_PLT32) +return 0; + if (r.type == R_AARCH64_JUMP26 || r.type == R_AARCH64_CALL26) +return r.addend; + return std::nullopt; +} + +static std::pair getBranchInfo(InputSection &is, + uint64_t offset) { + auto *i = std::lower_bound( + is.relocations.begin(), is.relocations.end(), offset, + [](Relocation &r, uint64_t offset) { return r.offset < offset; }); + if (i != is.relocations.end() && i->offset == offset && + i->type == R_AARCH64_JUMP26) { +return {i, i->addend}; + } smithp35 wrote: Agree that BTI instructions should be in a separate patch. It would require disassembling to find one so may result in longer link times. Skipping over BTI with direct branches could apply even when the target wasn't another branch. https://github.com/llvm/llvm-project/pull/138366 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
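As a rough mental model of the optimization under review (not lld's actual data structures or API, which the diff above shows), the rewrite step can be summarized as: if the relocated branch lands exactly on another direct branch, retarget the original relocation at that branch's destination. A simplified C++ sketch with placeholder types:

```cpp
#include <cstdint>

// Placeholder types loosely modelled on lld's Relocation/Symbol; the field
// and member names here are illustrative, not lld's actual API.
struct Symbol;

struct Relocation {
  uint64_t offset; // where the branch instruction lives
  int64_t addend;
  Symbol *sym;     // current branch target
};

struct Symbol {
  // If a direct branch instruction sits at this symbol (plus addend), return
  // its relocation; the real pass looks this up in the target section's
  // sorted relocation list (e.g. via std::lower_bound, as in getBranchInfo).
  const Relocation *branchAt(int64_t addend) const {
    (void)addend;
    return nullptr; // stub for the sketch
  }
};

// One branch-to-branch rewrite step: if our branch lands on another direct
// branch, retarget the original relocation at that branch's destination so
// control transfers in a single hop.
inline void redirectOnce(Relocation &r) {
  if (const Relocation *inner = r.sym->branchAt(r.addend)) {
    r.sym = inner->sym;       // skip the intermediate branch
    r.addend = inner->addend; // and adopt its effective addend
  }
}
```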
[llvm-branch-commits] [clang] [KeyInstr][Clang] Assign matrix element atom (PR #134650)
https://github.com/SLTozer approved this pull request. https://github.com/llvm/llvm-project/pull/134650 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] ELF: Add branch-to-branch optimization. (PR #138366)
smithp35 wrote: Thanks for the updates. I don't have any more comments. https://github.com/llvm/llvm-project/pull/138366 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/141591 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Pierre van Houtryve (Pierre-vh) Changes This --- Patch is 38.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141591.diff 8 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+2-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+30-29) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+21-40) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+22-41) - (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+13-17) - (modified) llvm/test/CodeGen/AMDGPU/itofp.i128.ll (+5-6) - (modified) llvm/test/CodeGen/AMDGPU/lround.ll (+9-9) - (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+2-14) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 96be17c487130..df867aaa204b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, + known_bits_simplifications]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..cc0f45681a3e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-LABEL: v_lshr_i65_33: ; GFX6: ; %bb.0: ; GFX6-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT:v_mov_b32_e32 v3, v1 -; GFX6-NEXT:v_mov_b32_e32 v0, 1 +; GFX6-NEXT:v_mov_b32_e32 v3, 1 +; GFX6-NEXT:v_mov_b32_e32 v4, 0 +; GFX6-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX6-NEXT:v_lshl_b64 v[2:3], v[3:4], 31 +; GFX6-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX6-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT:v_mov_b32_e32 v1, 0 -; GFX6-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX6-NEXT:v_lshl_b64 v[0:1], v[0:1], 31 -; GFX6-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT:v_mov_b32_e32 v2, 0 ; GFX6-NEXT:s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i65_33: ; GFX8: ; %bb.0: ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT:v_mov_b32_e32 v3, v1 -; GFX8-NEXT:v_mov_b32_e32 v0, 1 +; GFX8-NEXT:v_mov_b32_e32 v3, 1 +; GFX8-NEXT:v_mov_b32_e32 v4, 0 +; GFX8-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX8-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX8-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX8-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT:v_mov_b32_e32 v1, 0 -; GFX8-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX8-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT:v_mov_b32_e32 v2, 0 ; GFX8-NEXT:s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_i65_33: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_mov_b32_e32 v3, v1 -; GFX9-NEXT:v_mov_b32_e32 v0, 1 +; GFX9-NEXT:v_mov_b32_e32 v3, 1 +; GFX9-NEXT:v_mov_b32_e32 v4, 0 +; GFX9-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX9-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX9-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX9-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT:v_mov_b32_e32 v1, 0 -; GFX9-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; 
GFX9-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX9-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT:v_mov_b32_e32 v2, 0 ; GFX9-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i65_33: ; GFX10: ; %bb.0: ; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT:v_mov_b32_e32 v3, v1 -; GFX10-NEXT:v_mov_b32_e32 v0, 1 +; GFX10-NEXT:v_mov_b32_e32 v3, 1 +; GFX10-NEXT:v_mov_b32_e32 v4, 0 +; GFX10-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT:v_lshrrev_b32_e32 v0, 1, v1 ; GFX10-NEXT:v_mov_b32_e32 v1, 0 -; GFX10-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX10-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT:v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX10-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT:v_mov_b32_e32 v2, 0 ; GFX10-NEXT:s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT:v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2 -; GFX11-NEXT:v_lshrrev
[llvm-branch-commits] [llvm] [AMDGPU] Add BFX Formation Combines to RegBankCombiner (PR #141590)
llvmbot wrote: @llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes They're relatively safe to use there I believe. The only new registers they may create are the constants for the BFX. For those, borrow the RC from the source register. Fixes #140040 --- Patch is 153.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141590.diff 9 Files Affected: - (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+29) - (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+56-63) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+484-541) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+458-506) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll (+13-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+111-121) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+172-182) - (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+8-9) ``diff diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b1e851183de0d..8981b13dac7ed 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4629,10 +4629,17 @@ bool CombinerHelper::matchBitfieldExtractFromSExtInReg( if (ShiftImm < 0 || ShiftImm + Width > Ty.getScalarSizeInBits()) return false; + const RegisterBank *RB = getRegBank(ShiftSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto Cst1 = B.buildConstant(ExtractTy, ShiftImm); auto Cst2 = B.buildConstant(ExtractTy, Width); B.buildSbfx(Dst, ShiftSrc, Cst1, Cst2); + +if (RB) { + MRI.setRegBank(Cst1.getReg(0), *RB); + MRI.setRegBank(Cst2.getReg(0), *RB); +} }; return true; } @@ -4667,10 +4674,18 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(MachineInstr &MI, return false; uint64_t Width = APInt(Size, AndImm).countr_one(); + + const RegisterBank *RB = getRegBank(ShiftSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto LSBCst = B.buildConstant(ExtractTy, LSBImm); B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {ShiftSrc, LSBCst, WidthCst}); + +if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(LSBCst.getReg(0), *RB); +} }; return true; } @@ -4717,10 +4732,17 @@ bool CombinerHelper::matchBitfieldExtractFromShr( const int64_t Pos = ShrAmt - ShlAmt; const int64_t Width = Size - ShrAmt; + const RegisterBank *RB = getRegBank(ShlSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto PosCst = B.buildConstant(ExtractTy, Pos); B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst}); + +if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(PosCst.getReg(0), *RB); +} }; return true; } @@ -4775,10 +4797,17 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd( if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size) return false; + const RegisterBank *RB = getRegBank(AndSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto PosCst = B.buildConstant(ExtractTy, Pos); B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst}); + +if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(PosCst.getReg(0), *RB); +} }; return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 94e1175b06b14..96be17c487130 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,5 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index ff03cf1231d08..b0a239bef649e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -811,16 +811,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_lshr_b32 s2, s0, 16 -; GFX8-NEXT:s_sext_i32_i16 s0, s0 -; GFX8-NEXT:s_lshr_b32 s3, s1, 16 -; GFX8-NEXT:s_ashr_i32 s0, s0, s1 -; GFX8-NEXT:s_sext_i32_i16 s1, s2 -; GFX8-NEXT:s_ashr_i32 s1, s1, s3 -; GFX8-NEXT:s_and_b32 s1, 0x, s1 +; GFX8-NEXT:s_lshr_b32 s2, s1, 16 +; GFX8-NE
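For reference, the bitfield-extract patterns these combines form reduce to a small arithmetic identity. The sketch below is plain C++ showing the assumed reference semantics of UBFX/SBFX and how a shl-then-lshr pair maps to an offset and width; it is illustrative only and not taken from the combiner:

```cpp
#include <cstdint>

// Reference semantics of the unsigned/signed bitfield extracts that
// form_bitfield_extract produces; Offset and Width are the constants the
// combine materializes (and, per this patch, places in the same register
// bank as the source value).
static uint32_t ubfx32(uint32_t Src, unsigned Offset, unsigned Width) {
  uint32_t Mask = (Width >= 32) ? ~0u : ((1u << Width) - 1u);
  return (Src >> Offset) & Mask;
}

static int32_t sbfx32(uint32_t Src, unsigned Offset, unsigned Width) {
  // Move the field up against the sign bit, then arithmetic-shift it back
  // down so it comes out sign-extended.
  uint32_t Up = Src << (32u - Offset - Width);
  return static_cast<int32_t>(Up) >> (32u - Width);
}

// Example of the pattern matchBitfieldExtractFromShr rewrites on 32-bit
// values: (x << 8) >> 24 keeps original bits [23:16], i.e.
//   ubfx32(x, /*Offset=*/24 - 8, /*Width=*/32 - 24) == ubfx32(x, 16, 8).
```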
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes NFC --- Full diff: https://github.com/llvm/llvm-project/pull/141589.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+13-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp (+51) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+55-70) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 9587fad1ecd63..94e1175b06b14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, + lower_uniform_sbfx, lower_uniform_ubfx]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index ee324a5e93f0f..2100900bb8eb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + bool lowerUniformBFX(MachineInstr &MI) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) + ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) + : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + // Ensure the high bits are clear to insert the offset. + auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. 
+ auto ShiftAmt = B.buildConstant(S32, 16); + auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + MRI.setRegBank(OffsetMask.getReg(0), *RB); + MRI.setRegBank(ClampOffset.getReg(0), *RB); + MRI.setRegBank(ShiftAmt.getReg(0), *RB); + MRI.setRegBank(ShiftWidth.getReg(0), *RB); + MRI.setRegBank(MergedInputs.getReg(0), *RB); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) +llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo()->getMode(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index dd7aef8f0c583..0b7d64ee67c34 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1492,88 +1492,73 @@ bool AMDGPURegisterBankInfo::applyMappingB
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/Pierre-vh edited https://github.com/llvm/llvm-project/pull/141591 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
Pierre-vh wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests * **#141591** 👈 (View in Graphite) * **#141590** * **#141589** * **#141588** * `main` This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/ https://github.com/llvm/llvm-project/pull/141591 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Change inSectionBlame to return pair (FileIdx, LineNo). (PR #141540)
https://github.com/qinkunbao updated https://github.com/llvm/llvm-project/pull/141540 >From d5508cc217f413b3bbb7a301b2110cfc0c2c6cbc Mon Sep 17 00:00:00 2001 From: Qinkun Bao Date: Tue, 27 May 2025 03:24:26 + Subject: [PATCH 1/2] Format SpecialCaseList.h Created using spr 1.3.6 --- llvm/include/llvm/Support/SpecialCaseList.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index bce337f553a93..d54b242a9c501 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -17,8 +17,8 @@ #include "llvm/Support/GlobPattern.h" #include "llvm/Support/Regex.h" #include -#include #include +#include #include namespace llvm { >From b094fc2f5e3fe0d9b65f86a3f6eda04a6ab41e47 Mon Sep 17 00:00:00 2001 From: Qinkun Bao Date: Tue, 27 May 2025 14:01:19 + Subject: [PATCH 2/2] Remove irelevant format changes Created using spr 1.3.6 --- llvm/unittests/Support/SpecialCaseListTest.cpp | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/unittests/Support/SpecialCaseListTest.cpp b/llvm/unittests/Support/SpecialCaseListTest.cpp index da4c557e740e6..0fe6a427c0562 100644 --- a/llvm/unittests/Support/SpecialCaseListTest.cpp +++ b/llvm/unittests/Support/SpecialCaseListTest.cpp @@ -218,9 +218,8 @@ TEST_F(SpecialCaseListTest, NoTrigramsInARule) { } TEST_F(SpecialCaseListTest, RepetitiveRule) { - std::unique_ptr SCL = - makeSpecialCaseList("fun:*bar*bar*bar*bar*\n" - "fun:bar*\n"); + std::unique_ptr SCL = makeSpecialCaseList("fun:*bar*bar*bar*bar*\n" + "fun:bar*\n"); EXPECT_TRUE(SCL->inSection("", "fun", "bara")); EXPECT_FALSE(SCL->inSection("", "fun", "abara")); EXPECT_TRUE(SCL->inSection("", "fun", "barbarbarbar")); @@ -229,8 +228,7 @@ TEST_F(SpecialCaseListTest, RepetitiveRule) { } TEST_F(SpecialCaseListTest, SpecialSymbolRule) { - std::unique_ptr SCL = - makeSpecialCaseList("src:*c\\+\\+abi*\n"); + std::unique_ptr SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n"); EXPECT_TRUE(SCL->inSection("", "src", "c++abi")); EXPECT_FALSE(SCL->inSection("", "src", "c\\+\\+abi")); } @@ -246,9 +244,8 @@ TEST_F(SpecialCaseListTest, PopularTrigram) { } TEST_F(SpecialCaseListTest, EscapedSymbols) { - std::unique_ptr SCL = - makeSpecialCaseList("src:*c\\+\\+abi*\n" - "src:*helloworld*\n"); + std::unique_ptr SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n" + "src:*helloworld*\n"); EXPECT_TRUE(SCL->inSection("", "src", "dir/c++abi")); EXPECT_FALSE(SCL->inSection("", "src", "dir/c\\+\\+abi")); EXPECT_FALSE(SCL->inSection("", "src", "c\\+\\+abi")); ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] release/20.x: [libcxx] Provide locale conversions to tests through lit substitution (#105651) (PR #136449)
mstorsjo wrote: So, this backport in itself should be clean, but running the libcxx CI on the 20.x release branch is broken and would require a few other backports that are entangled. So it's basically up to @ldionne whether he thinks it's ok to merge despite the unrelated CI failures. (It is tricky because those failures _are_ in the same area; the updated macOS changed its locale definitions, which would require changes similar to these, plus more, to get a green run there. 95d23f58b6bc8e31a5a2f027338c1f6ecab1a0f1 on git main just marks those tests as unsupported...) https://github.com/llvm/llvm-project/pull/136449 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [llvm] release/20.x: [libcxx] Provide locale conversions to tests through lit substitution (#105651) (PR #139468)
mstorsjo wrote: Closing this one. Backporting the extra changes turned out to be a bit messy and entangled. (It's not impossible to do though, but it would require buy-in from libcxx maintainers that we do want to fix up the CI on the 20.x branch.) The original backport in #136449 should be fine on its own though (while the CI has unrelated failures). https://github.com/llvm/llvm-project/pull/139468 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [llvm] release/20.x: [libcxx] Provide locale conversions to tests through lit substitution (#105651) (PR #139468)
https://github.com/mstorsjo closed https://github.com/llvm/llvm-project/pull/139468 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Change SpecialCaseList::inSectionBlame to return pair (FileIdx, LineNo). (PR #141540)
https://github.com/qinkunbao edited https://github.com/llvm/llvm-project/pull/141540 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141591 >From 687bf11493d38ba323e90c1b40ae6919d48ed016 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 12:29:02 +0200 Subject: [PATCH] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 59 - .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 61 +++--- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 63 +++ llvm/test/CodeGen/AMDGPU/div_i128.ll | 30 - llvm/test/CodeGen/AMDGPU/itofp.i128.ll| 11 ++-- llvm/test/CodeGen/AMDGPU/lround.ll| 18 +++--- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 16 + 8 files changed, 104 insertions(+), 157 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 96be17c487130..df867aaa204b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, + known_bits_simplifications]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..cc0f45681a3e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-LABEL: v_lshr_i65_33: ; GFX6: ; %bb.0: ; GFX6-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT:v_mov_b32_e32 v3, v1 -; GFX6-NEXT:v_mov_b32_e32 v0, 1 +; GFX6-NEXT:v_mov_b32_e32 v3, 1 +; GFX6-NEXT:v_mov_b32_e32 v4, 0 +; GFX6-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX6-NEXT:v_lshl_b64 v[2:3], v[3:4], 31 +; GFX6-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX6-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT:v_mov_b32_e32 v1, 0 -; GFX6-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX6-NEXT:v_lshl_b64 v[0:1], v[0:1], 31 -; GFX6-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT:v_mov_b32_e32 v2, 0 ; GFX6-NEXT:s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i65_33: ; GFX8: ; %bb.0: ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT:v_mov_b32_e32 v3, v1 -; GFX8-NEXT:v_mov_b32_e32 v0, 1 +; GFX8-NEXT:v_mov_b32_e32 v3, 1 +; GFX8-NEXT:v_mov_b32_e32 v4, 0 +; GFX8-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX8-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX8-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX8-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT:v_mov_b32_e32 v1, 0 -; GFX8-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX8-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT:v_mov_b32_e32 v2, 0 ; GFX8-NEXT:s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_i65_33: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_mov_b32_e32 v3, v1 -; GFX9-NEXT:v_mov_b32_e32 v0, 1 +; GFX9-NEXT:v_mov_b32_e32 v3, 1 +; GFX9-NEXT:v_mov_b32_e32 v4, 0 +; GFX9-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX9-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX9-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX9-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT:v_mov_b32_e32 v1, 0 -; GFX9-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX9-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; 
GFX9-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT:v_mov_b32_e32 v2, 0 ; GFX9-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i65_33: ; GFX10: ; %bb.0: ; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT:v_mov_b32_e32 v3, v1 -; GFX10-NEXT:v_mov_b32_e32 v0, 1 +; GFX10-NEXT:v_mov_b32_e32 v3, 1 +; GFX10-NEXT:v_mov_b32_e32 v4, 0 +; GFX10-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT:v_lshrrev_b32_e32 v0, 1, v1 ; GFX10-NEXT:v_mov_b32_e32 v1, 0 -; GFX10-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX10-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT:v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX10-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT:v_mov_b32_e32 v2, 0 ; GFX10-NEXT:s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT:v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_and_b3
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141589 >From 150fe8c86c080a075fef344b20cd15b1097d3f29 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 11:16:16 +0200 Subject: [PATCH] [AMDGPU] Move S_BFE lowering into RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 14 +- .../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 51 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 125 -- 3 files changed, 119 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 9587fad1ecd63..94e1175b06b14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, + lower_uniform_sbfx, lower_uniform_ubfx]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index ee324a5e93f0f..2100900bb8eb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + bool lowerUniformBFX(MachineInstr &MI) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) + ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) + : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + // Ensure the high bits are clear to insert the offset. 
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. + auto ShiftAmt = B.buildConstant(S32, 16); + auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + MRI.setRegBank(OffsetMask.getReg(0), *RB); + MRI.setRegBank(ClampOffset.getReg(0), *RB); + MRI.setRegBank(ShiftAmt.getReg(0), *RB); + MRI.setRegBank(ShiftWidth.getReg(0), *RB); + MRI.setRegBank(MergedInputs.getReg(0), *RB); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) +llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo()->getMode(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index dd7aef8f0c583..0b7d64ee67c34 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Ta
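As a reading aid for the patch above (and for the review comments that follow), here is a standalone sketch of the operand packing that `lowerUniformBFX` builds with the AND/SHL/OR sequence. It is not part of the PR and the helper name is invented; it only mirrors the encoding the code comments describe, with the offset in bits [5:0] and the width in bits [22:16] of the second S_BFE source operand.

```cpp
#include <cstdint>

// Illustrative only: scalar equivalent of the MIR built by lowerUniformBFX.
uint32_t packSBFEOperand(uint32_t Offset, uint32_t Width) {
  uint32_t ClampedOffset = Offset & 0x3f; // maskTrailingOnes<uint32_t>(6)
  uint32_t ShiftedWidth = Width << 16;    // width lands in bits [22:16]
  // The shift already clears the low bits of the width term, so no extra
  // clamp is applied to Width (matching the comment in the patch).
  return ClampedOffset | ShiftedWidth;
}
```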
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
@@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; arsenm wrote: This needs more elaboration; needs to be clear that this can't be a mandatory lowering performed in a combiner https://github.com/llvm/llvm-project/pull/141589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
@@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) arsenm wrote: ```suggestion const unsigned Opc = Ty == S32 ``` https://github.com/llvm/llvm-project/pull/141589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [clang] Introduce CallGraphSection option (PR #117037)
https://github.com/Prabhuk updated https://github.com/llvm/llvm-project/pull/117037 >From 6a12be2c5b60a95a06875b0b2c4f14228d1fa882 Mon Sep 17 00:00:00 2001 From: prabhukr Date: Wed, 12 Mar 2025 23:30:01 + Subject: [PATCH] Fix EOF newlines. Created using spr 1.3.6-beta.1 --- clang/test/Driver/call-graph-section.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/call-graph-section.c b/clang/test/Driver/call-graph-section.c index 108446729d857..5832aa6754137 100644 --- a/clang/test/Driver/call-graph-section.c +++ b/clang/test/Driver/call-graph-section.c @@ -2,4 +2,4 @@ // RUN: %clang -### -S -fcall-graph-section -fno-call-graph-section %s 2>&1 | FileCheck --check-prefix=NO-CALL-GRAPH-SECTION %s // CALL-GRAPH-SECTION: "-fcall-graph-section" -// NO-CALL-GRAPH-SECTION-NOT: "-fcall-graph-section" \ No newline at end of file +// NO-CALL-GRAPH-SECTION-NOT: "-fcall-graph-section" ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [MachO] Improve bounds check (#141083) (PR #141461)
https://github.com/JDevlieghere approved this pull request. https://github.com/llvm/llvm-project/pull/141461 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [clang] callee_type metadata for indirect calls (PR #117036)
https://github.com/Prabhuk updated https://github.com/llvm/llvm-project/pull/117036 >From b7fbe09b32ff02d4f7c52d82fbf8b5cd28138852 Mon Sep 17 00:00:00 2001 From: prabhukr Date: Wed, 23 Apr 2025 04:05:47 + Subject: [PATCH] Address review comments. Created using spr 1.3.6-beta.1 --- clang/lib/CodeGen/CGCall.cpp| 8 clang/lib/CodeGen/CodeGenModule.cpp | 10 +- clang/lib/CodeGen/CodeGenModule.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 185ee1a970aac..d8ab7140f7943 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5780,19 +5780,19 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (callOrInvoke) { *callOrInvoke = CI; if (CGM.getCodeGenOpts().CallGraphSection) { - assert((TargetDecl && TargetDecl->getFunctionType() || - Callee.getAbstractInfo().getCalleeFunctionProtoType()) && - "cannot find callsite type"); QualType CST; if (TargetDecl && TargetDecl->getFunctionType()) CST = QualType(TargetDecl->getFunctionType(), 0); else if (const auto *FPT = Callee.getAbstractInfo().getCalleeFunctionProtoType()) CST = QualType(FPT, 0); + else +llvm_unreachable( +"Cannot find the callee type to generate callee_type metadata."); // Set type identifier metadata of indirect calls for call graph section. if (!CST.isNull()) -CGM.CreateCalleeTypeMetadataForIcall(CST, *callOrInvoke); +CGM.createCalleeTypeMetadataForIcall(CST, *callOrInvoke); } } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 43cd2405571cf..2fc99639a75cb 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2654,7 +2654,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, // Skip available_externally functions. They won't be codegen'ed in the // current module anyway. if (getContext().GetGVALinkageForFunction(FD) != GVA_AvailableExternally) -CreateFunctionTypeMetadataForIcall(FD, F); +createFunctionTypeMetadataForIcall(FD, F); } } @@ -2868,7 +2868,7 @@ static bool hasExistingGeneralizedTypeMD(llvm::Function *F) { return MD->hasGeneralizedMDString(); } -void CodeGenModule::CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, +void CodeGenModule::createFunctionTypeMetadataForIcall(const FunctionDecl *FD, llvm::Function *F) { if (CodeGenOpts.CallGraphSection && !hasExistingGeneralizedTypeMD(F) && (!F->hasLocalLinkage() || @@ -2898,7 +2898,7 @@ void CodeGenModule::CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, F->addTypeMetadata(0, llvm::ConstantAsMetadata::get(CrossDsoTypeId)); } -void CodeGenModule::CreateCalleeTypeMetadataForIcall(const QualType &QT, +void CodeGenModule::createCalleeTypeMetadataForIcall(const QualType &QT, llvm::CallBase *CB) { // Only if needed for call graph section and only for indirect calls. if (!CodeGenOpts.CallGraphSection || !CB->isIndirectCall()) @@ -2909,7 +2909,7 @@ void CodeGenModule::CreateCalleeTypeMetadataForIcall(const QualType &QT, getLLVMContext(), {llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( llvm::Type::getInt64Ty(getLLVMContext()), 0)), TypeIdMD}); - llvm::MDTuple *MDN = llvm::MDNode::get(getLLVMContext(), { TypeTuple }); + llvm::MDTuple *MDN = llvm::MDNode::get(getLLVMContext(), {TypeTuple}); CB->setMetadata(llvm::LLVMContext::MD_callee_type, MDN); } @@ -3041,7 +3041,7 @@ void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F, // jump table. 
if (!CodeGenOpts.SanitizeCfiCrossDso || !CodeGenOpts.SanitizeCfiCanonicalJumpTables) -CreateFunctionTypeMetadataForIcall(FD, F); +createFunctionTypeMetadataForIcall(FD, F); if (LangOpts.Sanitize.has(SanitizerKind::KCFI)) setKCFIType(FD, F); diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index dfbe4388349dd..4b53f0f241b52 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1619,11 +1619,11 @@ class CodeGenModule : public CodeGenTypeCache { llvm::Metadata *CreateMetadataIdentifierGeneralized(QualType T); /// Create and attach type metadata to the given function. - void CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, + void createFunctionTypeMetadataForIcall(const FunctionDecl *FD, llvm::Function *F); /// Create and attach type metadata to the given call. - void CreateCalleeTypeMetadataForIcall(const QualType &QT, llvm::CallBase *CB); + void createCa
[llvm-branch-commits] LowerTypeTests: Set small code model on imported globals. (PR #141324)
https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/141324 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] LowerTypeTests: Set small code model on imported globals. (PR #141324)
@@ -1019,8 +1019,14 @@ LowerTypeTestsModule::importTypeId(StringRef TypeId) { return C; }; - if (TIL.TheKind != TypeTestResolution::Unsat) -TIL.OffsetedGlobal = ImportGlobal("global_addr"); + if (TIL.TheKind != TypeTestResolution::Unsat) { +auto *GV = ImportGlobal("global_addr"); +// This is either a vtable (in .data.rel.ro) or a jump table (in .text). +// Either way it's expected to be in the low 2 GiB, so set the small code pcc wrote: Added comment with an explanation. https://github.com/llvm/llvm-project/pull/141324 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] LowerTypeTests: Set small code model on imported globals. (PR #141324)
https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/141324 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][EmbedBitcodePass] Prevent modifying the module with ThinLTO (PR #139999)
@@ -33,8 +34,11 @@ PreservedAnalyses EmbedBitcodePass::run(Module &M, ModuleAnalysisManager &AM) { std::string Data; raw_string_ostream OS(Data); + // Clone the module with with Thin LTO, since ThinLTOBitcodeWriterPass changes nikic wrote: ```suggestion // Clone the module with Thin LTO, since ThinLTOBitcodeWriterPass changes ``` https://github.com/llvm/llvm-project/pull/13 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][EmbedBitcodePass] Prevent modifying the module with ThinLTO (PR #139999)
https://github.com/nikic approved this pull request. Okay, let's go with this for now. Compile-time impact of cloning the module is about 0.2% when building clang with fat LTO: https://llvm-compile-time-tracker.com/compare.php?from=11a01e851a06188ae946ace1140f866d7a667221&to=46e037d763e7997a83ce78c9a602248fd67f0d44&stat=instructions:u https://github.com/llvm/llvm-project/pull/13 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
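For readers skimming the thread, a minimal sketch of the approach being approved here: clone the module before handing it to the ThinLTO bitcode writer so the original module is left unmodified. This is illustrative only — the function name and surrounding plumbing are invented, not the actual EmbedBitcodePass change — but it is the cloning step whose ~0.2% compile-time cost is quoted above.

```cpp
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <memory>
#include <string>

// Illustrative only: serialize ThinLTO bitcode without mutating M.
static std::string serializeThinLTOBitcodeCopy(const llvm::Module &M) {
  std::string Data;
  llvm::raw_string_ostream OS(Data);
  // ThinLTOBitcodeWriterPass may change the module it runs on, so run it
  // over a clone and keep M untouched.
  std::unique_ptr<llvm::Module> Cloned = llvm::CloneModule(M);
  // ... run ThinLTOBitcodeWriterPass(OS, /*ThinLinkOS=*/nullptr) on *Cloned ...
  return Data;
}
```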
[llvm-branch-commits] [llvm] [llvm][EmbedBitcodePass] Prevent modifying the module with ThinLTO (PR #139999)
https://github.com/nikic edited https://github.com/llvm/llvm-project/pull/13 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/141591 None >From d102621b16b8c893c4b56248d9c4cf59b3e1bf6e Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 12:29:02 +0200 Subject: [PATCH] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 59 - .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 61 +++--- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 63 +++ llvm/test/CodeGen/AMDGPU/div_i128.ll | 30 - llvm/test/CodeGen/AMDGPU/itofp.i128.ll| 11 ++-- llvm/test/CodeGen/AMDGPU/lround.ll| 18 +++--- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 16 + 8 files changed, 104 insertions(+), 157 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 96be17c487130..df867aaa204b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, + known_bits_simplifications]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..cc0f45681a3e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-LABEL: v_lshr_i65_33: ; GFX6: ; %bb.0: ; GFX6-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT:v_mov_b32_e32 v3, v1 -; GFX6-NEXT:v_mov_b32_e32 v0, 1 +; GFX6-NEXT:v_mov_b32_e32 v3, 1 +; GFX6-NEXT:v_mov_b32_e32 v4, 0 +; GFX6-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX6-NEXT:v_lshl_b64 v[2:3], v[3:4], 31 +; GFX6-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX6-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT:v_mov_b32_e32 v1, 0 -; GFX6-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX6-NEXT:v_lshl_b64 v[0:1], v[0:1], 31 -; GFX6-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT:v_mov_b32_e32 v2, 0 ; GFX6-NEXT:s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i65_33: ; GFX8: ; %bb.0: ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT:v_mov_b32_e32 v3, v1 -; GFX8-NEXT:v_mov_b32_e32 v0, 1 +; GFX8-NEXT:v_mov_b32_e32 v3, 1 +; GFX8-NEXT:v_mov_b32_e32 v4, 0 +; GFX8-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX8-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX8-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX8-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT:v_mov_b32_e32 v1, 0 -; GFX8-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX8-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT:v_mov_b32_e32 v2, 0 ; GFX8-NEXT:s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_i65_33: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_mov_b32_e32 v3, v1 -; GFX9-NEXT:v_mov_b32_e32 v0, 1 +; GFX9-NEXT:v_mov_b32_e32 v3, 1 +; GFX9-NEXT:v_mov_b32_e32 v4, 0 +; GFX9-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX9-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX9-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX9-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT:v_mov_b32_e32 v1, 0 -; GFX9-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX9-NEXT:v_lshrrev_b32_e32 v2, 1, 
v3 -; GFX9-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT:v_mov_b32_e32 v2, 0 ; GFX9-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i65_33: ; GFX10: ; %bb.0: ; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT:v_mov_b32_e32 v3, v1 -; GFX10-NEXT:v_mov_b32_e32 v0, 1 +; GFX10-NEXT:v_mov_b32_e32 v3, 1 +; GFX10-NEXT:v_mov_b32_e32 v4, 0 +; GFX10-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT:v_lshrrev_b32_e32 v0, 1, v1 ; GFX10-NEXT:v_mov_b32_e32 v1, 0 -; GFX10-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX10-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT:v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX10-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT:v_mov_b32_e32 v2, 0 ; GFX10-NEXT:s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT:v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_
[llvm-branch-commits] X86: Add X86TargetLowering::isProfitableToHoist hook for immediate operands. (PR #141326)
pcc wrote: I'm fixing the code generation for the test cases that I'm adding (inhibit-zext-constant-hoist.ll) which were all extracted from a build of a large internal program built with CFI. Previously f1 looked like this where align was hoisted: ``` f1: # @f1 .cfi_startproc # %bb.0: movl$__typeid__ZTS1S_align, %eax movzbl %al, %ecx movb$64, %al subb%cl, %al movzbl %al, %eax testl %edi, %edi je .LBB0_3 # %bb.1: movq(%rsi), %r8 movl$__typeid__ZTS1S_global_addr, %edx movq%r8, %rdi subq%rdx, %rdi movq%rdi, %rdx # kill: def $cl killed $cl killed $rcx shrq%cl, %rdx movl%eax, %ecx shlq%cl, %rdi orq %rdx, %rdi cmpq$__typeid__ZTS1S_size_m1@ABS8, %rdi jbe .LBB0_4 .LBB0_2: ud1l2(%eax), %eax .LBB0_3: movq(%rdx), %r8 movl$__typeid__ZTS1S_global_addr, %esi movq%r8, %rdi subq%rsi, %rdi movq%rdi, %rsi # kill: def $cl killed $cl killed $rcx shrq%cl, %rsi movl%eax, %ecx shlq%cl, %rdi orq %rsi, %rdi cmpq$__typeid__ZTS1S_size_m1@ABS8, %rdi movq%rdx, %rsi ja .LBB0_2 .LBB0_4: movq%rsi, %rdi jmpq*(%r8) # TAILCALL .Lfunc_end0: .size f1, .Lfunc_end0-f1 .cfi_endproc ``` Now f1 looks like this: ``` f1: # @f1 .cfi_startproc # %bb.0: testl %edi, %edi je .LBB0_3 # %bb.1: movq(%rsi), %rax movl$__typeid__ZTS1S_global_addr, %ecx movq%rax, %rdx subq%rcx, %rdx rorq$__typeid__ZTS1S_align, %rdx cmpq$__typeid__ZTS1S_size_m1@ABS8, %rdx jbe .LBB0_4 .LBB0_2: ud1l2(%eax), %eax .LBB0_3: movq(%rdx), %rax movl$__typeid__ZTS1S_global_addr, %ecx movq%rax, %rsi subq%rcx, %rsi rorq$__typeid__ZTS1S_align, %rsi cmpq$__typeid__ZTS1S_size_m1@ABS8, %rsi movq%rdx, %rsi ja .LBB0_2 .LBB0_4: movq%rsi, %rdi jmpq*(%rax) # TAILCALL .Lfunc_end0: .size f1, .Lfunc_end0-f1 .cfi_endproc ``` The other cases look similar before my change. The poor codegen issue was introduced (I believe) by https://github.com/llvm/llvm-project/pull/71040 which removed the zext ConstantExprs that LowerTypeTests was using to keep everything in the same basic block for matching. I think that because it's the zext being hoisted and not the shifts it wouldn't make a difference to use fshl/fshr but I can check. https://github.com/llvm/llvm-project/pull/141326 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
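As a reference for the assembly listings above, here is a scalar sketch of the range check the generated code performs. It is not code from the PR, just the computation both the "before" and "after" listings implement; in the "after" version the rotate maps onto a single `rorq` once the zext of the alignment constant is no longer hoisted into a separate shift pair.

```cpp
#include <cstdint>

// Illustrative only: (Ptr - GlobalAddr) rotated right by Align bits must be
// no larger than SizeM1 for the pointer to pass the CFI type check.
bool passesTypeCheck(uint64_t Ptr, uint64_t GlobalAddr, unsigned Align,
                     uint64_t SizeM1) {
  uint64_t Diff = Ptr - GlobalAddr;
  uint64_t Rotated = Align ? (Diff >> Align) | (Diff << (64 - Align)) : Diff;
  return Rotated <= SizeM1;
}
```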
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: account for BRK when searching for auth oracles (PR #137975)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/137975 >From ec1655b1fab18e3c2e13bc7b35ac1e151af4b615 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 30 Apr 2025 16:08:10 +0300 Subject: [PATCH] [BOLT] Gadget scanner: account for BRK when searching for auth oracles An authenticated pointer can be explicitly checked by the compiler via a sequence of instructions that executes BRK on failure. It is important to recognize such BRK instruction as checking every register (as it is expected to immediately trigger an abnormal program termination) to prevent false positive reports about authentication oracles: autia x2, x3 autia x0, x1 ; neither x0 nor x2 are checked at this point eor x16, x0, x0, lsl #1 tbz x16, #62, on_success ; marks x0 as checked ; end of BB: for x2 to be checked here, it must be checked in both ; successor basic blocks on_failure: brk 0xc470 on_success: ; x2 is checked ldr x1, [x2] ; marks x2 as checked --- bolt/include/bolt/Core/MCPlusBuilder.h| 14 ++ bolt/lib/Passes/PAuthGadgetScanner.cpp| 13 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 24 -- .../AArch64/gs-pauth-address-checks.s | 44 +-- .../AArch64/gs-pauth-authentication-oracles.s | 9 ++-- .../AArch64/gs-pauth-signing-oracles.s| 6 +-- 6 files changed, 75 insertions(+), 35 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index b233452985502..c8cbcaf33f4b5 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -707,6 +707,20 @@ class MCPlusBuilder { return false; } + /// Returns true if Inst is a trap instruction. + /// + /// Tests if Inst is an instruction that immediately causes an abnormal + /// program termination, for example when a security violation is detected + /// by a compiler-inserted check. + /// + /// @note An implementation of this method should likely return false for + /// calls to library functions like abort(), as it is possible that the + /// execution state is partially attacker-controlled at this point. + virtual bool isTrap(const MCInst &Inst) const { +llvm_unreachable("not implemented"); +return false; + } + virtual bool isBreakpoint(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index b539f1a211d4f..20d921728283e 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1053,6 +1053,15 @@ class DstSafetyAnalysis { dbgs() << ")\n"; }); +// If this instruction terminates the program immediately, no +// authentication oracles are possible past this point. +if (BC.MIB->isTrap(Point)) { + LLVM_DEBUG({ traceInst(BC, "Trap instruction found", Point); }); + DstState Next(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); + Next.CannotEscapeUnchecked.set(); + return Next; +} + // If this instruction is reachable by the analysis, a non-empty state will // be propagated to it sooner or later. Until then, skip computeNext(). if (Cur.empty()) { @@ -1160,8 +1169,8 @@ class DataflowDstSafetyAnalysis // // A basic block without any successors, on the other hand, can be // pessimistically initialized to everything-is-unsafe: this will naturally -// handle both return and tail call instructions and is harmless for -// internal indirect branch instructions (such as computed gotos). +// handle return, trap and tail call instructions. 
At the same time, it is +// harmless for internal indirect branch instructions, like computed gotos. if (BB.succ_empty()) return createUnsafeState(); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 9d5a578cfbdff..b669d32cc2032 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -386,10 +386,9 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // the list of successors of this basic block as appropriate. // Any of the above code sequences assume the fall-through basic block -// is a dead-end BRK instruction (any immediate operand is accepted). +// is a dead-end trap instruction. const BinaryBasicBlock *BreakBB = BB.getFallthrough(); -if (!BreakBB || BreakBB->empty() || -BreakBB->front().getOpcode() != AArch64::BRK) +if (!BreakBB || BreakBB->empty() || !isTrap(BreakBB->front())) return std::nullopt; // Iterate over the instructions of BB in reverse order, matching opcodes @@ -1751,6 +1750,25 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Inst.addOperand(MCOperand::createImm(0)); }
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect untrusted LR before tail call (PR #137224)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/137224 >From 11094a446c4b193d5b5e3023cdd01de0e619 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 22 Apr 2025 21:43:14 +0300 Subject: [PATCH 1/2] [BOLT] Gadget scanner: detect untrusted LR before tail call Implement the detection of tail calls performed with untrusted link register, which violates the assumption made on entry to every function. Unlike other pauth gadgets, this one involves some amount of guessing which branch instructions should be checked as tail calls. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 94 ++- .../AArch64/gs-pacret-autiasp.s | 31 +- .../AArch64/gs-pauth-debug-output.s | 30 +- .../AArch64/gs-pauth-tail-calls.s | 597 ++ 4 files changed, 706 insertions(+), 46 deletions(-) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-tail-calls.s diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index fc6120e922baa..b539f1a211d4f 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -737,19 +737,14 @@ template class CFGUnawareAnalysis { // // Then, a function can be split into a number of disjoint contiguous sequences // of instructions without labels in between. These sequences can be processed -// the same way basic blocks are processed by data-flow analysis, assuming -// pessimistically that all registers are unsafe at the start of each sequence. +// the same way basic blocks are processed by data-flow analysis, with the same +// pessimistic estimation of the initial state at the start of each sequence +// (except the first instruction of the function). class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, public CFGUnawareAnalysis { using SrcSafetyAnalysis::BC; BinaryFunction &BF; - /// Creates a state with all registers marked unsafe (not to be confused - /// with empty state). - SrcState createUnsafeState() const { -return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); - } - public: CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId, @@ -759,6 +754,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, } void run() override { +const SrcState DefaultState = computePessimisticState(BF); SrcState S = createEntryState(); for (auto &I : BF.instrs()) { MCInst &Inst = I.second; @@ -773,7 +769,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, LLVM_DEBUG({ traceInst(BC, "Due to label, resetting the state before", Inst); }); -S = createUnsafeState(); +S = DefaultState; } // Attach the state *before* this instruction executes. @@ -1302,6 +1298,83 @@ shouldReportReturnGadget(const BinaryContext &BC, const MCInstReference &Inst, return make_gadget_report(RetKind, Inst, *RetReg); } +/// While BOLT already marks some of the branch instructions as tail calls, +/// this function tries to improve the coverage by including less obvious cases +/// when it is possible to do without introducing too many false positives. +static bool shouldAnalyzeTailCallInst(const BinaryContext &BC, + const BinaryFunction &BF, + const MCInstReference &Inst) { + // Some BC.MIB->isXYZ(Inst) methods simply delegate to MCInstrDesc::isXYZ() + // (such as isBranch at the time of writing this comment), some don't (such + // as isCall). For that reason, call MCInstrDesc's methods explicitly when + // it is important. 
+ const MCInstrDesc &Desc = + BC.MII->get(static_cast(Inst).getOpcode()); + // Tail call should be a branch (but not necessarily an indirect one). + if (!Desc.isBranch()) +return false; + + // Always analyze the branches already marked as tail calls by BOLT. + if (BC.MIB->isTailCall(Inst)) +return true; + + // Try to also check the branches marked as "UNKNOWN CONTROL FLOW" - the + // below is a simplified condition from BinaryContext::printInstruction. + bool IsUnknownControlFlow = + BC.MIB->isIndirectBranch(Inst) && !BC.MIB->getJumpTable(Inst); + + if (BF.hasCFG() && IsUnknownControlFlow) +return true; + + return false; +} + +static std::optional> +shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF, + const MCInstReference &Inst, const SrcState &S) { + static const GadgetKind UntrustedLRKind( + "untrusted link register found before tail call"); + + if (!shouldAnalyzeTailCallInst(BC, BF, Inst)) +return std::nullopt; + + // Not only the set of registers returned by getTrustedLiveInRegs() can be + // seen as a reasonable target-independent _approximation_ of "the LR", these + // are *exactly* those regis
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: prevent false positives due to jump tables (PR #138884)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/138884 >From 65f42b50bc58d2e5b78946bf82be3db9f5d63230 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 6 May 2025 11:31:03 +0300 Subject: [PATCH] [BOLT] Gadget scanner: prevent false positives due to jump tables As part of PAuth hardening, AArch64 LLVM backend can use a special BR_JumpTable pseudo (enabled by -faarch64-jump-table-hardening Clang option) which is expanded in the AsmPrinter into a contiguous sequence without unsafe instructions in the middle. This commit adds another target-specific callback to MCPlusBuilder to make it possible to inhibit false positives for known-safe jump table dispatch sequences. Without special handling, the branch instruction is likely to be reported as a non-protected call (as its destination is not produced by an auth instruction, PC-relative address materialization, etc.) and possibly as a tail call being performed with unsafe link register (as the detection whether the branch instruction is a tail call is an heuristic). For now, only the specific instruction sequence used by the AArch64 LLVM backend is matched. --- bolt/include/bolt/Core/MCInstUtils.h | 9 + bolt/include/bolt/Core/MCPlusBuilder.h| 14 + bolt/lib/Core/MCInstUtils.cpp | 20 + bolt/lib/Passes/PAuthGadgetScanner.cpp| 10 + .../Target/AArch64/AArch64MCPlusBuilder.cpp | 73 ++ .../AArch64/gs-pauth-jump-table.s | 703 ++ 6 files changed, 829 insertions(+) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-jump-table.s diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 50b7d56470c99..33d36cccbcfff 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -154,6 +154,15 @@ class MCInstReference { return nullptr; } + /// Returns the only preceding instruction, or std::nullopt if multiple or no + /// predecessors are possible. + /// + /// If CFG information is available, basic block boundary can be crossed, + /// provided there is exactly one predecessor. If CFG is not available, the + /// preceding instruction in the offset order is returned, unless this is the + /// first instruction of the function. + std::optional getSinglePredecessor(); + raw_ostream &print(raw_ostream &OS) const; }; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index c8cbcaf33f4b5..3abf4d18e94da 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -14,6 +14,7 @@ #ifndef BOLT_CORE_MCPLUSBUILDER_H #define BOLT_CORE_MCPLUSBUILDER_H +#include "bolt/Core/MCInstUtils.h" #include "bolt/Core/MCPlus.h" #include "bolt/Core/Relocation.h" #include "llvm/ADT/ArrayRef.h" @@ -700,6 +701,19 @@ class MCPlusBuilder { return std::nullopt; } + /// Tests if BranchInst corresponds to an instruction sequence which is known + /// to be a safe dispatch via jump table. + /// + /// The target can decide which instruction sequences to consider "safe" from + /// the Pointer Authentication point of view, such as any jump table dispatch + /// sequence without function calls inside, any sequence which is contiguous, + /// or only some specific well-known sequences. 
+ virtual bool + isSafeJumpTableBranchForPtrAuth(MCInstReference BranchInst) const { +llvm_unreachable("not implemented"); +return false; + } + virtual bool isTerminator(const MCInst &Inst) const; virtual bool isNoop(const MCInst &Inst) const { diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 40f6edd59135c..b7c6d898988af 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -55,3 +55,23 @@ raw_ostream &MCInstReference::print(raw_ostream &OS) const { OS << ">"; return OS; } + +std::optional MCInstReference::getSinglePredecessor() { + if (const RefInBB *Ref = tryGetRefInBB()) { +if (Ref->It != Ref->BB->begin()) + return MCInstReference(Ref->BB, &*std::prev(Ref->It)); + +if (Ref->BB->pred_size() != 1) + return std::nullopt; + +BinaryBasicBlock *PredBB = *Ref->BB->pred_begin(); +assert(!PredBB->empty() && "Empty basic blocks are not supported yet"); +return MCInstReference(PredBB, &*PredBB->rbegin()); + } + + const RefInBF &Ref = getRefInBF(); + if (Ref.It == Ref.BF->instrs().begin()) +return std::nullopt; + + return MCInstReference(Ref.BF, std::prev(Ref.It)); +} diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index d8d14b50d9216..85c0f34af74a1 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1342,6 +1342,11 @@ shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF, return std::nullopt; } + if (BC.MIB->isSafeJumpTableBranchForPtrAuth(Inst)) { +LL
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: improve handling of unreachable basic blocks (PR #136183)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/136183 >From 1bd54289ecbb513831c9b94adfc1822abf3deb73 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Thu, 17 Apr 2025 20:51:16 +0300 Subject: [PATCH 1/3] [BOLT] Gadget scanner: improve handling of unreachable basic blocks Instead of refusing to analyze an instruction completely, when it is unreachable according to the CFG reconstructed by BOLT, pessimistically assume all registers to be unsafe at the start of basic blocks without any predecessors. Nevertheless, unreachable basic blocks found in optimized code likely means imprecise CFG reconstruction, thus report a warning once per basic block without predecessors. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 46 ++- .../AArch64/gs-pacret-autiasp.s | 7 ++- .../binary-analysis/AArch64/gs-pauth-calls.s | 57 +++ 3 files changed, 95 insertions(+), 15 deletions(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index c419ff74992a7..0b0f8fb77b2fd 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -342,6 +342,12 @@ class SrcSafetyAnalysis { return S; } + /// Creates a state with all registers marked unsafe (not to be confused + /// with empty state). + SrcState createUnsafeState() const { +return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); + } + BitVector getClobberedRegs(const MCInst &Point) const { BitVector Clobbered(NumRegs); // Assume a call can clobber all registers, including callee-saved @@ -585,6 +591,13 @@ class DataflowSrcSafetyAnalysis if (BB.isEntryPoint()) return createEntryState(); +// If a basic block without any predecessors is found in an optimized code, +// this likely means that some CFG edges were not detected. Pessimistically +// assume all registers to be unsafe before this basic block and warn about +// this fact in FunctionAnalysis::findUnsafeUses(). +if (BB.pred_empty()) + return createUnsafeState(); + return SrcState(); } @@ -689,12 +702,6 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, using SrcSafetyAnalysis::BC; BinaryFunction &BF; - /// Creates a state with all registers marked unsafe (not to be confused - /// with empty state). - SrcState createUnsafeState() const { -return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); - } - public: CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId, @@ -1355,19 +1362,30 @@ void FunctionAnalysisContext::findUnsafeUses( BF.dump(); }); + if (BF.hasCFG()) { +// Warn on basic blocks being unreachable according to BOLT, as this +// likely means CFG is imprecise. +for (BinaryBasicBlock &BB : BF) { + if (!BB.pred_empty() || BB.isEntryPoint()) +continue; + // Arbitrarily attach the report to the first instruction of BB. + MCInst *InstToReport = BB.getFirstNonPseudoInstr(); + if (!InstToReport) +continue; // BB has no real instructions + + Reports.push_back( + make_generic_report(MCInstReference::get(InstToReport, BF), + "Warning: no predecessor basic blocks detected " + "(possibly incomplete CFG)")); +} + } + iterateOverInstrs(BF, [&](MCInstReference Inst) { if (BC.MIB->isCFI(Inst)) return; const SrcState &S = Analysis->getStateBefore(Inst); - -// If non-empty state was never propagated from the entry basic block -// to Inst, assume it to be unreachable and report a warning. 
-if (S.empty()) { - Reports.push_back( - make_generic_report(Inst, "Warning: unreachable instruction found")); - return; -} +assert(!S.empty() && "Instruction has no associated state"); if (auto Report = shouldReportReturnGadget(BC, Inst, S)) Reports.push_back(*Report); diff --git a/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s b/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s index 284f0bea607a5..6559ba336e8de 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s +++ b/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s @@ -215,12 +215,17 @@ f_callclobbered_calleesaved: .globl f_unreachable_instruction .type f_unreachable_instruction,@function f_unreachable_instruction: -// CHECK-LABEL: GS-PAUTH: Warning: unreachable instruction found in function f_unreachable_instruction, basic block {{[0-9a-zA-Z.]+}}, at address +// CHECK-LABEL: GS-PAUTH: Warning: no predecessor basic blocks detected (possibly incomplete CFG) in function f_unreachable_instruction, basic block {{[0-9a-zA-Z.]+}}, at address // CHECK-NEXT:The instruction is {{[0-9a-f]+}}: add x0, x1, x2 // CHECK-NOT: instructions that write t
[llvm-branch-commits] [llvm] [BOLT] Introduce helpers to match `MCInst`s one at a time (NFC) (PR #138883)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/138883 >From daff3ce49a0272612af7d94d6be176a2c65e305c Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 7 May 2025 16:42:00 +0300 Subject: [PATCH] [BOLT] Introduce helpers to match `MCInst`s one at a time (NFC) Introduce matchInst helper function to capture and/or match the operands of MCInst. Unlike the existing `MCPlusBuilder::MCInstMatcher` machinery, matchInst is intended for the use cases when precise control over the instruction order is required. For example, when validating PtrAuth hardening, all registers are usually considered unsafe after a function call, even though callee-saved registers should preserve their old values *under normal operation*. --- bolt/include/bolt/Core/MCInstUtils.h | 128 ++ .../Target/AArch64/AArch64MCPlusBuilder.cpp | 90 +--- 2 files changed, 162 insertions(+), 56 deletions(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 69bf5e6159b74..50b7d56470c99 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -162,6 +162,134 @@ static inline raw_ostream &operator<<(raw_ostream &OS, return Ref.print(OS); } +/// Instruction-matching helpers operating on a single instruction at a time. +/// +/// Unlike MCPlusBuilder::MCInstMatcher, this matchInst() function focuses on +/// the cases where a precise control over the instruction order is important: +/// +/// // Bring the short names into the local scope: +/// using namespace MCInstMatcher; +/// // Declare the registers to capture: +/// Reg Xn, Xm; +/// // Capture the 0th and 1st operands, match the 2nd operand against the +/// // just captured Xm register, match the 3rd operand against literal 0: +/// if (!matchInst(MaybeAdd, AArch64::ADDXrs, Xm, Xn, Xm, Imm(0)) +/// return AArch64::NoRegister; +/// // Match the 0th operand against Xm: +/// if (!matchInst(MaybeBr, AArch64::BR, Xm)) +/// return AArch64::NoRegister; +/// // Return the matched register: +/// return Xm.get(); +namespace MCInstMatcher { + +// The base class to match an operand of type T. +// +// The subclasses of OpMatcher are intended to be allocated on the stack and +// to only be used by passing them to matchInst() and by calling their get() +// function, thus the peculiar `mutable` specifiers: to make the calling code +// compact and readable, the templated matchInst() function has to accept both +// long-lived Imm/Reg wrappers declared as local variables (intended to capture +// the first operand's value and match the subsequent operands, whether inside +// a single instruction or across multiple instructions), as well as temporary +// wrappers around literal values to match, f.e. Imm(42) or Reg(AArch64::XZR). +template class OpMatcher { + mutable std::optional Value; + mutable std::optional SavedValue; + + // Remember/restore the last Value - to be called by matchInst. + void remember() const { SavedValue = Value; } + void restore() const { Value = SavedValue; } + + template + friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...); + +protected: + OpMatcher(std::optional ValueToMatch) : Value(ValueToMatch) {} + + bool matchValue(T OpValue) const { +// Check that OpValue does not contradict the existing Value. +bool MatchResult = !Value || *Value == OpValue; +// If MatchResult is false, all matchers will be reset before returning from +// matchInst, including this one, thus no need to assign conditionally. 
+Value = OpValue; + +return MatchResult; + } + +public: + /// Returns the captured value. + T get() const { +assert(Value.has_value()); +return *Value; + } +}; + +class Reg : public OpMatcher { + bool matches(const MCOperand &Op) const { +if (!Op.isReg()) + return false; + +return matchValue(Op.getReg()); + } + + template + friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...); + +public: + Reg(std::optional RegToMatch = std::nullopt) + : OpMatcher(RegToMatch) {} +}; + +class Imm : public OpMatcher { + bool matches(const MCOperand &Op) const { +if (!Op.isImm()) + return false; + +return matchValue(Op.getImm()); + } + + template + friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...); + +public: + Imm(std::optional ImmToMatch = std::nullopt) + : OpMatcher(ImmToMatch) {} +}; + +/// Tries to match Inst and updates Ops on success. +/// +/// If Inst has the specified Opcode and its operand list prefix matches Ops, +/// this function returns true and updates Ops, otherwise false is returned and +/// values of Ops are kept as before matchInst was called. +/// +/// Please note that while Ops are technically passed by a const reference to +/// make invocations like `matchInst(MI, Opcode, Imm(42))` possible, all their +/// fields are marked mut
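For readers unfamiliar with the new helpers, here is a minimal sketch of the intended call pattern, assembled only from the doc comment above; the function name `getBranchTargetReg`, the surrounding context, and the omitted namespace qualification are hypothetical and not part of this patch:

```cpp
#include "bolt/Core/MCInstUtils.h"

// Hypothetical helper: recognize "add xD, xN, xD, lsl #0" followed by
// "br xD" and return the register being branched to, or NoRegister.
static MCPhysReg getBranchTargetReg(const MCInst &MaybeAdd,
                                    const MCInst &MaybeBr) {
  using namespace MCInstMatcher;
  // Declare the registers to capture; both start out unset.
  Reg Xm, Xn;
  // Capture operands 0 and 1, require operand 2 to equal the just-captured
  // Xm, and require operand 3 to be the immediate 0.
  if (!matchInst(MaybeAdd, AArch64::ADDXrs, Xm, Xn, Xm, Imm(0)))
    return AArch64::NoRegister;
  // The branch must use the register captured from the add.
  if (!matchInst(MaybeBr, AArch64::BR, Xm))
    return AArch64::NoRegister;
  // Return the captured register.
  return Xm.get();
}
```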
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect untrusted LR before tail call (PR #137224)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/137224 >From 11094a446c4b193d5b5e3023cdd01de0e619 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 22 Apr 2025 21:43:14 +0300 Subject: [PATCH 1/2] [BOLT] Gadget scanner: detect untrusted LR before tail call Implement the detection of tail calls performed with untrusted link register, which violates the assumption made on entry to every function. Unlike other pauth gadgets, this one involves some amount of guessing which branch instructions should be checked as tail calls. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 94 ++- .../AArch64/gs-pacret-autiasp.s | 31 +- .../AArch64/gs-pauth-debug-output.s | 30 +- .../AArch64/gs-pauth-tail-calls.s | 597 ++ 4 files changed, 706 insertions(+), 46 deletions(-) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-tail-calls.s diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index fc6120e922baa..b539f1a211d4f 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -737,19 +737,14 @@ template class CFGUnawareAnalysis { // // Then, a function can be split into a number of disjoint contiguous sequences // of instructions without labels in between. These sequences can be processed -// the same way basic blocks are processed by data-flow analysis, assuming -// pessimistically that all registers are unsafe at the start of each sequence. +// the same way basic blocks are processed by data-flow analysis, with the same +// pessimistic estimation of the initial state at the start of each sequence +// (except the first instruction of the function). class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, public CFGUnawareAnalysis { using SrcSafetyAnalysis::BC; BinaryFunction &BF; - /// Creates a state with all registers marked unsafe (not to be confused - /// with empty state). - SrcState createUnsafeState() const { -return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); - } - public: CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId, @@ -759,6 +754,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, } void run() override { +const SrcState DefaultState = computePessimisticState(BF); SrcState S = createEntryState(); for (auto &I : BF.instrs()) { MCInst &Inst = I.second; @@ -773,7 +769,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, LLVM_DEBUG({ traceInst(BC, "Due to label, resetting the state before", Inst); }); -S = createUnsafeState(); +S = DefaultState; } // Attach the state *before* this instruction executes. @@ -1302,6 +1298,83 @@ shouldReportReturnGadget(const BinaryContext &BC, const MCInstReference &Inst, return make_gadget_report(RetKind, Inst, *RetReg); } +/// While BOLT already marks some of the branch instructions as tail calls, +/// this function tries to improve the coverage by including less obvious cases +/// when it is possible to do without introducing too many false positives. +static bool shouldAnalyzeTailCallInst(const BinaryContext &BC, + const BinaryFunction &BF, + const MCInstReference &Inst) { + // Some BC.MIB->isXYZ(Inst) methods simply delegate to MCInstrDesc::isXYZ() + // (such as isBranch at the time of writing this comment), some don't (such + // as isCall). For that reason, call MCInstrDesc's methods explicitly when + // it is important. 
+ const MCInstrDesc &Desc = + BC.MII->get(static_cast(Inst).getOpcode()); + // Tail call should be a branch (but not necessarily an indirect one). + if (!Desc.isBranch()) +return false; + + // Always analyze the branches already marked as tail calls by BOLT. + if (BC.MIB->isTailCall(Inst)) +return true; + + // Try to also check the branches marked as "UNKNOWN CONTROL FLOW" - the + // below is a simplified condition from BinaryContext::printInstruction. + bool IsUnknownControlFlow = + BC.MIB->isIndirectBranch(Inst) && !BC.MIB->getJumpTable(Inst); + + if (BF.hasCFG() && IsUnknownControlFlow) +return true; + + return false; +} + +static std::optional> +shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF, + const MCInstReference &Inst, const SrcState &S) { + static const GadgetKind UntrustedLRKind( + "untrusted link register found before tail call"); + + if (!shouldAnalyzeTailCallInst(BC, BF, Inst)) +return std::nullopt; + + // Not only the set of registers returned by getTrustedLiveInRegs() can be + // seen as a reasonable target-independent _approximation_ of "the LR", these + // are *exactly* those regis
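The report itself (truncated above) boils down to checking whether every register that must be trusted on entry to the callee is still trusted right before the branch. A rough, hypothetical sketch of that predicate, assuming the `SrcState` bit vectors shown elsewhere in this series; this is an illustration, not code from the patch:

```cpp
// Illustrative only: a candidate tail call is reported when some register
// that the callee expects to be trusted on entry (LR on AArch64) is not
// trusted in the state computed just before the branch instruction.
static bool hasUntrustedLiveIn(const SrcState &S,
                               ArrayRef<MCPhysReg> TrustedLiveIns) {
  return llvm::any_of(TrustedLiveIns, [&](MCPhysReg Reg) {
    return !S.TrustedRegs[Reg];
  });
}
```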
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: optionally assume auth traps on failure (PR #139778)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/139778 >From d6536f26399b2c385821c65af779fc694e8eee72 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 13 May 2025 19:50:41 +0300 Subject: [PATCH] [BOLT] Gadget scanner: optionally assume auth traps on failure On AArch64 it is possible for an auth instruction to either return an invalid address value on failure (without FEAT_FPAC) or generate an error (with FEAT_FPAC). It thus may be possible to never emit explicit pointer checks, if the target CPU is known to support FEAT_FPAC. This commit implements an --auth-traps-on-failure command line option, which essentially makes "safe-to-dereference" and "trusted" register properties identical and disables scanning for authentication oracles completely. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 112 +++ .../binary-analysis/AArch64/cmdline-args.test | 1 + .../AArch64/gs-pauth-authentication-oracles.s | 6 +- .../binary-analysis/AArch64/gs-pauth-calls.s | 5 +- .../AArch64/gs-pauth-debug-output.s | 177 ++--- .../AArch64/gs-pauth-jump-table.s | 6 +- .../AArch64/gs-pauth-signing-oracles.s| 54 ++--- .../AArch64/gs-pauth-tail-calls.s | 184 +- 8 files changed, 318 insertions(+), 227 deletions(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 85c0f34af74a1..b7573c96d183e 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -14,6 +14,7 @@ #include "bolt/Passes/PAuthGadgetScanner.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/Passes/DataflowAnalysis.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/MC/MCInst.h" @@ -26,6 +27,11 @@ namespace llvm { namespace bolt { namespace PAuthGadgetScanner { +static cl::opt AuthTrapsOnFailure( +"auth-traps-on-failure", +cl::desc("Assume authentication instructions always trap on failure"), +cl::cat(opts::BinaryAnalysisCategory)); + [[maybe_unused]] static void traceInst(const BinaryContext &BC, StringRef Label, const MCInst &MI) { dbgs() << " " << Label << ": "; @@ -364,6 +370,34 @@ class SrcSafetyAnalysis { return Clobbered; } + std::optional getRegMadeTrustedByChecking(const MCInst &Inst, + SrcState Cur) const { +// This functions cannot return multiple registers. This is never the case +// on AArch64. +std::optional RegCheckedByInst = +BC.MIB->getAuthCheckedReg(Inst, /*MayOverwrite=*/false); +if (RegCheckedByInst && Cur.SafeToDerefRegs[*RegCheckedByInst]) + return *RegCheckedByInst; + +auto It = CheckerSequenceInfo.find(&Inst); +if (It == CheckerSequenceInfo.end()) + return std::nullopt; + +MCPhysReg RegCheckedBySequence = It->second.first; +const MCInst *FirstCheckerInst = It->second.second; + +// FirstCheckerInst should belong to the same basic block (see the +// assertion in DataflowSrcSafetyAnalysis::run()), meaning it was +// deterministically processed a few steps before this instruction. +const SrcState &StateBeforeChecker = getStateBefore(*FirstCheckerInst); + +// The sequence checks the register, but it should be authenticated before. +if (!StateBeforeChecker.SafeToDerefRegs[RegCheckedBySequence]) + return std::nullopt; + +return RegCheckedBySequence; + } + // Returns all registers that can be treated as if they are written by an // authentication instruction. 
SmallVector getRegsMadeSafeToDeref(const MCInst &Point, @@ -386,18 +420,38 @@ class SrcSafetyAnalysis { Regs.push_back(DstAndSrc->first); } +// Make sure explicit checker sequence keeps register safe-to-dereference +// when the register would be clobbered according to the regular rules: +// +//; LR is safe to dereference here +//mov x16, x30 ; start of the sequence, LR is s-t-d right before +//xpaclri ; clobbers LR, LR is not safe anymore +//cmp x30, x16 +//b.eq 1f; end of the sequence: LR is marked as trusted +//brk 0x1234 +// 1: +//; at this point LR would be marked as trusted, +//; but not safe-to-dereference +// +// or even just +// +//; X1 is safe to dereference here +//ldr x0, [x1, #8]! +//; X1 is trusted here, but it was clobbered due to address write-back +if (auto CheckedReg = getRegMadeTrustedByChecking(Point, Cur)) + Regs.push_back(*CheckedReg); + return Regs; } // Returns all registers made trusted by this instruction. SmallVector getRegsMadeTrusted(const MCInst &Point, const SrcState &Cur) const { +assert(!AuthTrapsOnFailure &&
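In other words (an illustrative sketch, not code from this patch), the new option collapses the two per-register properties that the scanner tracks:

```cpp
// Sketch: with --auth-traps-on-failure an authentication instruction either
// produces a valid pointer or traps, so any value that is safe to
// dereference is also trusted; without the option the two sets differ.
static bool isTrustedAt(const SrcState &S, MCPhysReg Reg,
                        bool AuthTrapsOnFailure) {
  return AuthTrapsOnFailure ? S.SafeToDerefRegs[Reg] : S.TrustedRegs[Reg];
}
```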
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: prevent false positives due to jump tables (PR #138884)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/138884 >From 65f42b50bc58d2e5b78946bf82be3db9f5d63230 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 6 May 2025 11:31:03 +0300 Subject: [PATCH] [BOLT] Gadget scanner: prevent false positives due to jump tables As part of PAuth hardening, AArch64 LLVM backend can use a special BR_JumpTable pseudo (enabled by -faarch64-jump-table-hardening Clang option) which is expanded in the AsmPrinter into a contiguous sequence without unsafe instructions in the middle. This commit adds another target-specific callback to MCPlusBuilder to make it possible to inhibit false positives for known-safe jump table dispatch sequences. Without special handling, the branch instruction is likely to be reported as a non-protected call (as its destination is not produced by an auth instruction, PC-relative address materialization, etc.) and possibly as a tail call being performed with unsafe link register (as the detection whether the branch instruction is a tail call is an heuristic). For now, only the specific instruction sequence used by the AArch64 LLVM backend is matched. --- bolt/include/bolt/Core/MCInstUtils.h | 9 + bolt/include/bolt/Core/MCPlusBuilder.h| 14 + bolt/lib/Core/MCInstUtils.cpp | 20 + bolt/lib/Passes/PAuthGadgetScanner.cpp| 10 + .../Target/AArch64/AArch64MCPlusBuilder.cpp | 73 ++ .../AArch64/gs-pauth-jump-table.s | 703 ++ 6 files changed, 829 insertions(+) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-jump-table.s diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 50b7d56470c99..33d36cccbcfff 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -154,6 +154,15 @@ class MCInstReference { return nullptr; } + /// Returns the only preceding instruction, or std::nullopt if multiple or no + /// predecessors are possible. + /// + /// If CFG information is available, basic block boundary can be crossed, + /// provided there is exactly one predecessor. If CFG is not available, the + /// preceding instruction in the offset order is returned, unless this is the + /// first instruction of the function. + std::optional getSinglePredecessor(); + raw_ostream &print(raw_ostream &OS) const; }; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index c8cbcaf33f4b5..3abf4d18e94da 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -14,6 +14,7 @@ #ifndef BOLT_CORE_MCPLUSBUILDER_H #define BOLT_CORE_MCPLUSBUILDER_H +#include "bolt/Core/MCInstUtils.h" #include "bolt/Core/MCPlus.h" #include "bolt/Core/Relocation.h" #include "llvm/ADT/ArrayRef.h" @@ -700,6 +701,19 @@ class MCPlusBuilder { return std::nullopt; } + /// Tests if BranchInst corresponds to an instruction sequence which is known + /// to be a safe dispatch via jump table. + /// + /// The target can decide which instruction sequences to consider "safe" from + /// the Pointer Authentication point of view, such as any jump table dispatch + /// sequence without function calls inside, any sequence which is contiguous, + /// or only some specific well-known sequences. 
+ virtual bool + isSafeJumpTableBranchForPtrAuth(MCInstReference BranchInst) const { +llvm_unreachable("not implemented"); +return false; + } + virtual bool isTerminator(const MCInst &Inst) const; virtual bool isNoop(const MCInst &Inst) const { diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 40f6edd59135c..b7c6d898988af 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -55,3 +55,23 @@ raw_ostream &MCInstReference::print(raw_ostream &OS) const { OS << ">"; return OS; } + +std::optional MCInstReference::getSinglePredecessor() { + if (const RefInBB *Ref = tryGetRefInBB()) { +if (Ref->It != Ref->BB->begin()) + return MCInstReference(Ref->BB, &*std::prev(Ref->It)); + +if (Ref->BB->pred_size() != 1) + return std::nullopt; + +BinaryBasicBlock *PredBB = *Ref->BB->pred_begin(); +assert(!PredBB->empty() && "Empty basic blocks are not supported yet"); +return MCInstReference(PredBB, &*PredBB->rbegin()); + } + + const RefInBF &Ref = getRefInBF(); + if (Ref.It == Ref.BF->instrs().begin()) +return std::nullopt; + + return MCInstReference(Ref.BF, std::prev(Ref.It)); +} diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index d8d14b50d9216..85c0f34af74a1 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1342,6 +1342,11 @@ shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF, return std::nullopt; } + if (BC.MIB->isSafeJumpTableBranchForPtrAuth(Inst)) { +LL
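For context, a simplified sketch of what a target-specific override could look like; the actual AArch64 implementation in this patch matches the exact contiguous sequence emitted for the BR_JumpTable pseudo, which is omitted here:

```cpp
// Simplified, hypothetical shape of the override. The real implementation
// walks the known-safe dispatch sequence instruction by instruction.
bool isSafeJumpTableBranchForPtrAuth(MCInstReference BranchInst) const override {
  // Step backwards only while the predecessor is unique; any label or CFG
  // merge point makes the sequence non-contiguous and thus not provably safe.
  std::optional<MCInstReference> Pred = BranchInst.getSinglePredecessor();
  if (!Pred)
    return false;
  // ... match each preceding instruction of the expected dispatch sequence
  // here (for example with the matchInst() helpers from PR #138883) ...
  return false; // conservative placeholder in this sketch
}
```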
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers (PR #141665)
https://github.com/atrosinenko created https://github.com/llvm/llvm-project/pull/141665 Perform trivial syntactical cleanups: * make use of structured binding declarations * use LLVM utility functions when appropriate * omit braces around single expression inside single-line LLVM_DEBUG() This patch is NFC aside from minor debug output changes. >From cbf7911e11d6d69dbab934a384004b8f886aad13 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 27 May 2025 21:06:03 +0300 Subject: [PATCH] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers Perform trivial syntactical cleanups: * make use of structured binding declarations * use LLVM utility functions when appropriate * omit braces around single expression inside single-line LLVM_DEBUG() This patch is NFC aside from minor debug output changes. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 67 +-- .../AArch64/gs-pauth-debug-output.s | 14 ++-- 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index b7573c96d183e..4d283482d9472 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -88,8 +88,8 @@ class TrackedRegisters { TrackedRegisters(ArrayRef RegsToTrack) : Registers(RegsToTrack), RegToIndexMapping(getMappingSize(RegsToTrack), NoIndex) { -for (unsigned I = 0; I < RegsToTrack.size(); ++I) - RegToIndexMapping[RegsToTrack[I]] = I; +for (auto [MappedIndex, Reg] : llvm::enumerate(RegsToTrack)) + RegToIndexMapping[Reg] = MappedIndex; } ArrayRef getRegisters() const { return Registers; } @@ -203,9 +203,9 @@ struct SrcState { SafeToDerefRegs &= StateIn.SafeToDerefRegs; TrustedRegs &= StateIn.TrustedRegs; -for (unsigned I = 0; I < LastInstWritingReg.size(); ++I) - for (const MCInst *J : StateIn.LastInstWritingReg[I]) -LastInstWritingReg[I].insert(J); +for (auto [ThisSet, OtherSet] : + llvm::zip_equal(LastInstWritingReg, StateIn.LastInstWritingReg)) + ThisSet.insert_range(OtherSet); return *this; } @@ -224,11 +224,9 @@ struct SrcState { static void printInstsShort(raw_ostream &OS, ArrayRef Insts) { OS << "Insts: "; - for (unsigned I = 0; I < Insts.size(); ++I) { -auto &Set = Insts[I]; + for (auto [I, PtrSet] : llvm::enumerate(Insts)) { OS << "[" << I << "]("; -for (const MCInst *MCInstP : Set) - OS << MCInstP << " "; +interleave(PtrSet, OS, " "); OS << ")"; } } @@ -416,8 +414,9 @@ class SrcSafetyAnalysis { // ... an address can be updated in a safe manner, producing the result // which is as trusted as the input address. if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Point)) { - if (Cur.SafeToDerefRegs[DstAndSrc->second]) -Regs.push_back(DstAndSrc->first); + auto [DstReg, SrcReg] = *DstAndSrc; + if (Cur.SafeToDerefRegs[SrcReg]) +Regs.push_back(DstReg); } // Make sure explicit checker sequence keeps register safe-to-dereference @@ -469,8 +468,9 @@ class SrcSafetyAnalysis { // ... an address can be updated in a safe manner, producing the result // which is as trusted as the input address. 
if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Point)) { - if (Cur.TrustedRegs[DstAndSrc->second]) -Regs.push_back(DstAndSrc->first); + auto [DstReg, SrcReg] = *DstAndSrc; + if (Cur.TrustedRegs[SrcReg]) +Regs.push_back(DstReg); } return Regs; @@ -845,9 +845,9 @@ struct DstState { return (*this = StateIn); CannotEscapeUnchecked &= StateIn.CannotEscapeUnchecked; -for (unsigned I = 0; I < FirstInstLeakingReg.size(); ++I) - for (const MCInst *J : StateIn.FirstInstLeakingReg[I]) -FirstInstLeakingReg[I].insert(J); +for (auto [ThisSet, OtherSet] : + llvm::zip_equal(FirstInstLeakingReg, StateIn.FirstInstLeakingReg)) + ThisSet.insert_range(OtherSet); return *this; } @@ -1012,8 +1012,7 @@ class DstSafetyAnalysis { // ... an address can be updated in a safe manner, or if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Inst)) { - MCPhysReg DstReg, SrcReg; - std::tie(DstReg, SrcReg) = *DstAndSrc; + auto [DstReg, SrcReg] = *DstAndSrc; // Note that *all* registers containing the derived values must be safe, // both source and destination ones. No temporaries are supported at now. if (Cur.CannotEscapeUnchecked[SrcReg] && @@ -1052,7 +1051,7 @@ class DstSafetyAnalysis { // If this instruction terminates the program immediately, no // authentication oracles are possible past this point. if (BC.MIB->isTrap(Point)) { - LLVM_DEBUG({ traceInst(BC, "Trap instruction found", Point); }); + LLVM_DEBUG(traceInst(BC, "Trap instruction found", Point));
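The ADT idioms this cleanup switches to are easy to demonstrate outside BOLT; the toy program below (not part of the patch) shows the three main replacements: llvm::enumerate for index loops, llvm::zip_equal for lockstep iteration, and llvm::interleave for separator-joined printing:

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

int main() {
  std::vector<int> Regs = {3, 1, 4};
  // llvm::enumerate replaces a manual "for (unsigned I = 0; ...)" index loop
  // and pairs naturally with structured bindings.
  for (auto [Index, Reg] : llvm::enumerate(Regs))
    llvm::outs() << "RegToIndexMapping[" << Reg << "] = " << Index << "\n";

  std::vector<int> A = {1, 2, 3}, B = {10, 20, 30};
  // llvm::zip_equal walks two equally sized ranges in lockstep (asserting
  // that the sizes match), replacing an index-based merge loop.
  for (auto [X, Y] : llvm::zip_equal(A, B))
    X += Y; // X references an element of A, so A becomes {11, 22, 33}

  // llvm::interleave writes elements separated by a delimiter, as in the
  // printInstsShort() change above.
  llvm::interleave(A, llvm::outs(), " ");
  llvm::outs() << "\n";
}
```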
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers (PR #141665)
atrosinenko wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/141665
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#141665** 👈 (View in Graphite)
* **#139778**
* **#138884**
* **#138883**
* **#138655**
* **#137975**
* **#137224**
* **#136183**
* **#136151**
* **#135663**
* **#136147**
* **#135662**
* **#135661**
* **#134146**
* **#133461**
* **#135073**
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/141665 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers (PR #141665)
https://github.com/atrosinenko ready_for_review https://github.com/llvm/llvm-project/pull/141665 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: do not crash on debug-printing CFI instructions (PR #136151)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/136151 >From 25fda06fe3c11cd52ee67e0bbd42b6f8dc44921d Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 15 Apr 2025 21:47:18 +0300 Subject: [PATCH] [BOLT] Gadget scanner: do not crash on debug-printing CFI instructions Some instruction-printing code used under LLVM_DEBUG does not handle CFI instructions well. While CFI instructions seem to be harmless for the correctness of the analysis results, they do not convey any useful information to the analysis either, so skip them early. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 16 ++ .../AArch64/gs-pauth-debug-output.s | 32 +++ 2 files changed, 48 insertions(+) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 6a130e71c842e..c419ff74992a7 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -430,6 +430,9 @@ class SrcSafetyAnalysis { } SrcState computeNext(const MCInst &Point, const SrcState &Cur) { +if (BC.MIB->isCFI(Point)) + return Cur; + SrcStatePrinter P(BC); LLVM_DEBUG({ dbgs() << " SrcSafetyAnalysis::ComputeNext("; @@ -704,6 +707,8 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, SrcState S = createEntryState(); for (auto &I : BF.instrs()) { MCInst &Inst = I.second; + if (BC.MIB->isCFI(Inst)) +continue; // If there is a label before this instruction, it is possible that it // can be jumped-to, thus conservatively resetting S. As an exception, @@ -985,6 +990,9 @@ class DstSafetyAnalysis { } DstState computeNext(const MCInst &Point, const DstState &Cur) { +if (BC.MIB->isCFI(Point)) + return Cur; + DstStatePrinter P(BC); LLVM_DEBUG({ dbgs() << " DstSafetyAnalysis::ComputeNext("; @@ -1156,6 +1164,8 @@ class CFGUnawareDstSafetyAnalysis : public DstSafetyAnalysis, DstState S = createUnsafeState(); for (auto &I : llvm::reverse(BF.instrs())) { MCInst &Inst = I.second; + if (BC.MIB->isCFI(Inst)) +continue; // If Inst can change the control flow, we cannot be sure that the next // instruction (to be executed in analyzed program) is the one processed @@ -1346,6 +1356,9 @@ void FunctionAnalysisContext::findUnsafeUses( }); iterateOverInstrs(BF, [&](MCInstReference Inst) { +if (BC.MIB->isCFI(Inst)) + return; + const SrcState &S = Analysis->getStateBefore(Inst); // If non-empty state was never propagated from the entry basic block @@ -1409,6 +1422,9 @@ void FunctionAnalysisContext::findUnsafeDefs( }); iterateOverInstrs(BF, [&](MCInstReference Inst) { +if (BC.MIB->isCFI(Inst)) + return; + const DstState &S = Analysis->getStateAfter(Inst); if (auto Report = shouldReportAuthOracle(BC, Inst, S)) diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s index 61aa84377b88e..5aec945621987 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s @@ -329,6 +329,38 @@ auth_oracle: // PAUTH-EMPTY: // PAUTH-NEXT: Attaching leakage info to: : autia x0, x1 # DataflowDstSafetyAnalysis: dst-state +// Gadget scanner should not crash on CFI instructions, including when debug-printing them. +// Note that the particular debug output is not checked, but BOLT should be +// compiled with assertions enabled to support -debug-only argument. 
+ +.globl cfi_inst_df +.type cfi_inst_df,@function +cfi_inst_df: +.cfi_startproc +sub sp, sp, #16 +.cfi_def_cfa_offset 16 +add sp, sp, #16 +.cfi_def_cfa_offset 0 +ret +.size cfi_inst_df, .-cfi_inst_df +.cfi_endproc + +.globl cfi_inst_nocfg +.type cfi_inst_nocfg,@function +cfi_inst_nocfg: +.cfi_startproc +sub sp, sp, #16 +.cfi_def_cfa_offset 16 + +adr x0, 1f +br x0 +1: +add sp, sp, #16 +.cfi_def_cfa_offset 0 +ret +.size cfi_inst_nocfg, .-cfi_inst_nocfg +.cfi_endproc + // CHECK-LABEL:Analyzing function main, AllocatorId = 1 .globl main .type main,@function ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [RISCV][Driver] Add riscv emulation mode to linker job of BareMetal toolchain (PR #134442)
@@ -534,8 +534,18 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-Bstatic"); - if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax)) -CmdArgs.push_back("--no-relax"); + if (Triple.isRISCV()) { +CmdArgs.push_back("-X"); +if (Args.hasArg(options::OPT_mno_relax)) + CmdArgs.push_back("--no-relax"); +if (const char *LDMOption = getLDMOption(TC.getTriple(), Args)) { + CmdArgs.push_back("-m"); + CmdArgs.push_back(LDMOption); +} else { + D.Diag(diag::err_target_unknown_triple) << Triple.str(); + return; +} quic-garvgupt wrote: Done in the latest patchset https://github.com/llvm/llvm-project/pull/134442 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
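For readers following the driver change: the `-m <emulation>` value comes from a `getLDMOption` helper. A rough sketch of the RISC-V part is below; it is an illustration only (simplified signature, and the helper in the patch also consults `Args`), with `elf32lriscv`/`elf64lriscv` being the GNU ld emulation names conventionally used for little-endian RISC-V:

```cpp
#include "llvm/TargetParser/Triple.h"

// Hypothetical, simplified lookup of the linker emulation for RISC-V
// baremetal targets; the real helper takes the driver Args as well.
static const char *getLDMOptionForRISCV(const llvm::Triple &T) {
  switch (T.getArch()) {
  case llvm::Triple::riscv32:
    return "elf32lriscv"; // 32-bit little-endian RISC-V emulation
  case llvm::Triple::riscv64:
    return "elf64lriscv"; // 64-bit little-endian RISC-V emulation
  default:
    return nullptr;       // caller reports err_target_unknown_triple
  }
}
```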
[llvm-branch-commits] [llvm] [X86] Add atomic vector tests for unaligned >1 sizes. (PR #120387)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120387 >From 7b4708f1ddcd76bd8ba94b0c85317e86bab36ef7 Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 18 Dec 2024 03:40:32 -0500 Subject: [PATCH] [X86] Add atomic vector tests for unaligned >1 sizes. Unaligned atomic vectors with size >1 are lowered to calls. Adding their tests separately here. commit-id:a06a5cc6 --- llvm/test/CodeGen/X86/atomic-load-store.ll | 588 + 1 file changed, 588 insertions(+) diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 9fab8b98b4af0..3e7b73a65fe07 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -270,6 +270,82 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind { ret <1 x i64> %ret } +define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_ptr: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:pushq %rax +; CHECK-O3-NEXT:movq %rdi, %rsi +; CHECK-O3-NEXT:movq %rsp, %rdx +; CHECK-O3-NEXT:movl $8, %edi +; CHECK-O3-NEXT:movl $2, %ecx +; CHECK-O3-NEXT:callq __atomic_load@PLT +; CHECK-O3-NEXT:movq (%rsp), %rax +; CHECK-O3-NEXT:popq %rcx +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_ptr: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:pushq %rax +; CHECK-SSE-O3-NEXT:movq %rdi, %rsi +; CHECK-SSE-O3-NEXT:movq %rsp, %rdx +; CHECK-SSE-O3-NEXT:movl $8, %edi +; CHECK-SSE-O3-NEXT:movl $2, %ecx +; CHECK-SSE-O3-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O3-NEXT:movq (%rsp), %rax +; CHECK-SSE-O3-NEXT:popq %rcx +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_ptr: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:pushq %rax +; CHECK-AVX-O3-NEXT:movq %rdi, %rsi +; CHECK-AVX-O3-NEXT:movq %rsp, %rdx +; CHECK-AVX-O3-NEXT:movl $8, %edi +; CHECK-AVX-O3-NEXT:movl $2, %ecx +; CHECK-AVX-O3-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O3-NEXT:movq (%rsp), %rax +; CHECK-AVX-O3-NEXT:popq %rcx +; CHECK-AVX-O3-NEXT:retq +; +; CHECK-O0-LABEL: atomic_vec1_ptr: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT:pushq %rax +; CHECK-O0-NEXT:movq %rdi, %rsi +; CHECK-O0-NEXT:movl $8, %edi +; CHECK-O0-NEXT:movq %rsp, %rdx +; CHECK-O0-NEXT:movl $2, %ecx +; CHECK-O0-NEXT:callq __atomic_load@PLT +; CHECK-O0-NEXT:movq (%rsp), %rax +; CHECK-O0-NEXT:popq %rcx +; CHECK-O0-NEXT:retq +; +; CHECK-SSE-O0-LABEL: atomic_vec1_ptr: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT:pushq %rax +; CHECK-SSE-O0-NEXT:movq %rdi, %rsi +; CHECK-SSE-O0-NEXT:movl $8, %edi +; CHECK-SSE-O0-NEXT:movq %rsp, %rdx +; CHECK-SSE-O0-NEXT:movl $2, %ecx +; CHECK-SSE-O0-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O0-NEXT:movq (%rsp), %rax +; CHECK-SSE-O0-NEXT:popq %rcx +; CHECK-SSE-O0-NEXT:retq +; +; CHECK-AVX-O0-LABEL: atomic_vec1_ptr: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT:pushq %rax +; CHECK-AVX-O0-NEXT:movq %rdi, %rsi +; CHECK-AVX-O0-NEXT:movl $8, %edi +; CHECK-AVX-O0-NEXT:movq %rsp, %rdx +; CHECK-AVX-O0-NEXT:movl $2, %ecx +; CHECK-AVX-O0-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O0-NEXT:movq (%rsp), %rax +; CHECK-AVX-O0-NEXT:popq %rcx +; CHECK-AVX-O0-NEXT:retq + %ret = load atomic <1 x ptr>, ptr %x acquire, align 4 + ret <1 x ptr> %ret +} + define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-O3-LABEL: atomic_vec1_half: ; CHECK-O3: # %bb.0: @@ -386,3 +462,515 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind { %ret = load atomic <1 x double>, ptr %x acquire, align 8 ret <1 x double> %ret } + +define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_i64: +; 
CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:pushq %rax +; CHECK-O3-NEXT:movq %rdi, %rsi +; CHECK-O3-NEXT:movq %rsp, %rdx +; CHECK-O3-NEXT:movl $8, %edi +; CHECK-O3-NEXT:movl $2, %ecx +; CHECK-O3-NEXT:callq __atomic_load@PLT +; CHECK-O3-NEXT:movq (%rsp), %rax +; CHECK-O3-NEXT:popq %rcx +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_i64: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:pushq %rax +; CHECK-SSE-O3-NEXT:movq %rdi, %rsi +; CHECK-SSE-O3-NEXT:movq %rsp, %rdx +; CHECK-SSE-O3-NEXT:movl $8, %edi +; CHECK-SSE-O3-NEXT:movl $2, %ecx +; CHECK-SSE-O3-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O3-NEXT:movq (%rsp), %rax +; CHECK-SSE-O3-NEXT:popq %rcx +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_i64: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:pushq %rax +; CHECK-AVX-O3-NEXT:movq %rdi, %rsi +; CHECK-AVX-O3-NEXT:movq %rsp, %rdx +; CHECK-AVX-O3-NEXT:movl $8, %edi +; CHECK-AVX-O3-NEXT:movl $2, %ecx +; CHECK-AVX-O3-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O3-NEXT:movq (%rsp), %rax +; CHECK-AV
[llvm-branch-commits] [llvm] [SelectionDAG][X86] Remove unused elements from atomic vector. (PR #125432)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/125432 >From 7ba3e69a759f59bf746cb14640ea8ea426fa09fd Mon Sep 17 00:00:00 2001 From: jofrn Date: Fri, 31 Jan 2025 13:12:56 -0500 Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic vector. After splitting, all elements are created. The two components must be found by looking at the upper and lower half of the value. This change extends EltsFromConsecutiveLoads to understand AtomicSDNode so that unused elements can be removed. commit-id:b83937a8 --- llvm/include/llvm/CodeGen/SelectionDAG.h | 2 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 61 +++ 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 87b6914f8a0ee..40550d96a5b3d 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1873,7 +1873,7 @@ class SelectionDAG { /// chain to the token factor. This ensures that the new memory node will have /// the same relative memory dependency position as the old load. Returns the /// new merged load chain. - SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp); + SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp); /// Topological-sort the AllNodes list and a /// assign a unique node id for each node in the DAG based on their diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e6a7d092b7b79..1c1445f9f44b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -12236,7 +12236,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain, return TokenFactor; } -SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, +SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp) { assert(isa(NewMemOp.getNode()) && "Expected a memop node"); SDValue OldChain = SDValue(OldLoad, 1); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1fc50a36fda72..5ce8d83feb0dd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, } // Recurse to find a LoadSDNode source and the accumulated ByteOffest. 
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { - if (ISD::isNON_EXTLoad(Elt.getNode())) { -auto *BaseLd = cast(Elt); -if (!BaseLd->isSimple()) - return false; +static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) { + if (auto *BaseLd = dyn_cast(Elt)) { Ld = BaseLd; ByteOffset = 0; return true; - } + } else if (auto *BaseLd = dyn_cast(Elt)) +if (ISD::isNON_EXTLoad(Elt.getNode())) { + if (!BaseLd->isSimple()) +return false; + Ld = BaseLd; + ByteOffset = 0; + return true; +} switch (Elt.getOpcode()) { case ISD::BITCAST: @@ -7254,7 +7258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, APInt ZeroMask = APInt::getZero(NumElems); APInt UndefMask = APInt::getZero(NumElems); - SmallVector Loads(NumElems, nullptr); + SmallVector Loads(NumElems, nullptr); SmallVector ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an @@ -7304,7 +7308,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, EVT EltBaseVT = EltBase.getValueType(); assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && "Register/Memory size mismatch"); - LoadSDNode *LDBase = Loads[FirstLoadedElt]; + MemSDNode *LDBase = Loads[FirstLoadedElt]; assert(LDBase && "Did not find base load for merging consecutive loads"); unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); unsigned BaseSizeInBytes = BaseSizeInBits / 8; @@ -7318,15 +7322,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, // Check to see if the element's load is consecutive to the base load // or offset from a previous (already checked) load. - auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { -LoadSDNode *Ld = Loads[EltIdx]; + auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) { +MemSDNode *Ld = Loads[EltIdx]; int64_t ByteOffset = ByteOffsets[EltIdx]; if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); } -return DAG.areNo
[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #120716)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120716 >From ac4069c8fa8e69173b203824f4db5fbd73ecb5a4 Mon Sep 17 00:00:00 2001 From: jofrn Date: Fri, 20 Dec 2024 06:14:28 -0500 Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector AtomicExpand fails for aligned `load atomic ` because it does not find a compatible library call. This change adds appropriate bitcasts so that the call can be lowered. It also adds support for 128 bit lowering in tablegen to support SSE/AVX. commit-id:f430c1af --- .../include/llvm/Target/TargetSelectionDAG.td | 14 + llvm/lib/CodeGen/AtomicExpandPass.cpp | 15 +- llvm/lib/Target/X86/X86InstrCompiler.td | 5 + llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 llvm/test/CodeGen/X86/atomic-load-store.ll| 94 +++ .../X86/expand-atomic-non-integer.ll | 263 -- 6 files changed, 360 insertions(+), 82 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 406baa4f5fdaa..3b8a34ca0eb51 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1904,6 +1904,20 @@ def atomic_load_64 : let MemoryVT = i64; } +def atomic_load_128_v2i64 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v2i64; +} + +def atomic_load_128_v4i32 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v4i32; +} + def atomic_load_nonext_8 : PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> { let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic? diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index c376de877ac7d..70f59eafc6ecb 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -2066,9 +2066,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( I->replaceAllUsesWith(V); } else if (HasResult) { Value *V; -if (UseSizedLibcall) - V = Builder.CreateBitOrPointerCast(Result, I->getType()); -else { +if (UseSizedLibcall) { + // Add bitcasts from Result's scalar type to I's vector type + auto *PtrTy = dyn_cast(I->getType()->getScalarType()); + auto *VTy = dyn_cast(I->getType()); + if (VTy && PtrTy && !Result->getType()->isVectorTy()) { +unsigned AS = PtrTy->getAddressSpace(); +Value *BC = Builder.CreateBitCast( +Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS))); +V = Builder.CreateIntToPtr(BC, I->getType()); + } else +V = Builder.CreateBitOrPointerCast(Result, I->getType()); +} else { V = Builder.CreateAlignedLoad(I->getType(), AllocaResult, AllocaAlignment); Builder.CreateLifetimeEnd(AllocaResult, SizeVal64); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 26b76dd1ca83a..3143015b7ec66 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1211,6 +1211,11 @@ def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src, def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src, (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float> +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (VMOVAPDrm addr:$src)>; // load atomic <2 x i64> +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (VMOVAPDrm addr:$src)>; // load atomic <4 x i32> + // Floating point loads/stores. 
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst), (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index 560dfde356c29..eaa2ffd9b2731 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { store atomic double %val1, ptr %ptr seq_cst, align 8 ret void } + +define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 { +; ARM-LABEL: atomic_vec1_ptr: +; ARM: @ %bb.0: +; ARM-NEXT:ldr r0, [r0] +; ARM-NEXT:dmb ish +; ARM-NEXT:bx lr +; +; ARMOPTNONE-LABEL: atomic_vec1_ptr: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT:ldr r0, [r0] +; ARMOPTNONE-NEXT:dmb ish +; ARMOPTNONE-NEXT:bx lr +; +; THUMBTWO-LABEL: atomic_vec1_ptr: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT:ldr r0, [r0] +; THUMBTWO-NEXT:dmb ish +; THUMBTWO-NEXT:bx lr +; +; THUMBONE-LABEL: atomic_vec1_ptr: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT:push {r7, lr} +; THUMBONE-NEXT:movs r1, #0 +; THUMBONE-NEXT:mov r2, r1 +; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4 +; THUMBONE-NEXT:
[llvm-branch-commits] [llvm] [X86] Remove extra MOV after widening atomic load (PR #138635)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/138635 >From 109bc6009d93645b42e0be8fbf858368770f49e7 Mon Sep 17 00:00:00 2001 From: jofernau_amdeng Date: Tue, 6 May 2025 01:48:11 -0400 Subject: [PATCH] [X86] Remove extra MOV after widening atomic load This change adds patterns to optimize out an extra MOV present after widening the atomic load. It also casts floats to ints in an atomic load during AtomicExpand to support 128 bit vectors in SSE/AVX. commit-id:45989503 --- llvm/lib/Target/X86/X86ISelLowering.cpp| 7 + llvm/lib/Target/X86/X86ISelLowering.h | 2 + llvm/lib/Target/X86/X86InstrCompiler.td| 7 + llvm/test/CodeGen/X86/atomic-load-store.ll | 249 +++-- 4 files changed, 102 insertions(+), 163 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a203c84630fe1..1fc50a36fda72 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32070,6 +32070,13 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { } } +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const { + if (LI->getType()->getScalarType()->isFloatingPointTy()) +return AtomicExpansionKind::CastToInteger; + return AtomicExpansionKind::None; +} + LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 359f24768b3da..14b79e8d726f6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1838,6 +1838,8 @@ namespace llvm { shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const; +TargetLoweringBase::AtomicExpansionKind +shouldCastAtomicLoadInIR(LoadInst *LI) const override; void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 927b2c8b22f05..26b76dd1ca83a 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1204,6 +1204,13 @@ def : Pat<(i16 (atomic_load_nonext_16 addr:$src)), (MOV16rm addr:$src)>; def : Pat<(i32 (atomic_load_nonext_32 addr:$src)), (MOV32rm addr:$src)>; def : Pat<(i64 (atomic_load_nonext_64 addr:$src)), (MOV64rm addr:$src)>; +def : Pat<(v4i32 (scalar_to_vector (i32 (zext (i16 (atomic_load_16 addr:$src)), + (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i8> +def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src, + (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i16> +def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src, + (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float> + // Floating point loads/stores. 
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst), (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index ff5391f44bbe3..535a87316e162 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -207,19 +207,19 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-O3-LABEL: atomic_vec1_bfloat: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT:movzwl (%rdi), %eax -; CHECK-O3-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-O3-NEXT:movd %eax, %xmm0 ; CHECK-O3-NEXT:retq ; ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax -; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT:movd %eax, %xmm0 ; CHECK-SSE-O3-NEXT:retq ; ; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax -; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0 ; CHECK-AVX-O3-NEXT:retq ; ; CHECK-O0-LABEL: atomic_vec1_bfloat: @@ -227,8 +227,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-O0-NEXT:movw (%rdi), %cx ; CHECK-O0-NEXT:# implicit-def: $eax ; CHECK-O0-NEXT:movw %cx, %ax -; CHECK-O0-NEXT:# implicit-def: $xmm0 -; CHECK-O0-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-O0-NEXT:movd %eax, %xmm0 ; CHECK-O0-NEXT:retq ; ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat: @@ -236,8 +235,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-SSE-O0-NEXT:movw (%rdi), %cx ; CHECK-SSE-O0-NEXT:# implicit-def: $eax ; CHECK-SSE-O0-NEXT:movw %cx, %ax -; CHECK-SSE-O0-NEXT:# implicit-def: $xm
[llvm-branch-commits] [llvm] [SelectionDAG][X86] Remove unused elements from atomic vector. (PR #125432)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/125432 >From 7ba3e69a759f59bf746cb14640ea8ea426fa09fd Mon Sep 17 00:00:00 2001 From: jofrn Date: Fri, 31 Jan 2025 13:12:56 -0500 Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic vector. After splitting, all elements are created. The two components must be found by looking at the upper and lower half of the value. This change extends EltsFromConsecutiveLoads to understand AtomicSDNode so that unused elements can be removed. commit-id:b83937a8 --- llvm/include/llvm/CodeGen/SelectionDAG.h | 2 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 61 +++ 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 87b6914f8a0ee..40550d96a5b3d 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1873,7 +1873,7 @@ class SelectionDAG { /// chain to the token factor. This ensures that the new memory node will have /// the same relative memory dependency position as the old load. Returns the /// new merged load chain. - SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp); + SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp); /// Topological-sort the AllNodes list and a /// assign a unique node id for each node in the DAG based on their diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e6a7d092b7b79..1c1445f9f44b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -12236,7 +12236,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain, return TokenFactor; } -SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, +SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp) { assert(isa(NewMemOp.getNode()) && "Expected a memop node"); SDValue OldChain = SDValue(OldLoad, 1); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1fc50a36fda72..5ce8d83feb0dd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, } // Recurse to find a LoadSDNode source and the accumulated ByteOffest. 
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { - if (ISD::isNON_EXTLoad(Elt.getNode())) { -auto *BaseLd = cast(Elt); -if (!BaseLd->isSimple()) - return false; +static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) { + if (auto *BaseLd = dyn_cast(Elt)) { Ld = BaseLd; ByteOffset = 0; return true; - } + } else if (auto *BaseLd = dyn_cast(Elt)) +if (ISD::isNON_EXTLoad(Elt.getNode())) { + if (!BaseLd->isSimple()) +return false; + Ld = BaseLd; + ByteOffset = 0; + return true; +} switch (Elt.getOpcode()) { case ISD::BITCAST: @@ -7254,7 +7258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, APInt ZeroMask = APInt::getZero(NumElems); APInt UndefMask = APInt::getZero(NumElems); - SmallVector Loads(NumElems, nullptr); + SmallVector Loads(NumElems, nullptr); SmallVector ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an @@ -7304,7 +7308,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, EVT EltBaseVT = EltBase.getValueType(); assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && "Register/Memory size mismatch"); - LoadSDNode *LDBase = Loads[FirstLoadedElt]; + MemSDNode *LDBase = Loads[FirstLoadedElt]; assert(LDBase && "Did not find base load for merging consecutive loads"); unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); unsigned BaseSizeInBytes = BaseSizeInBits / 8; @@ -7318,15 +7322,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, // Check to see if the element's load is consecutive to the base load // or offset from a previous (already checked) load. - auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { -LoadSDNode *Ld = Loads[EltIdx]; + auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) { +MemSDNode *Ld = Loads[EltIdx]; int64_t ByteOffset = ByteOffsets[EltIdx]; if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); } -return DAG.areNo
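To sketch the situation this combine targets (a hypothetical example, not one of the tests in the patch), consider an atomic vector load where only one element is actually used. With `EltsFromConsecutiveLoads` now accepting `AtomicSDNode` via `MemSDNode`, the unused element no longer has to be materialized after the load is split:

```llvm
; Only element 0 of the atomically loaded pair is used; with this change the
; unused upper element is a candidate for removal during DAG combining.
define float @atomic_vec2_float_elt0(ptr %p) {
  %v = load atomic <2 x float>, ptr %p acquire, align 8
  %e = extractelement <2 x float> %v, i64 0
  ret float %e
}
```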
[llvm-branch-commits] [llvm] [SelectionDAG] Widen <2 x T> vector types for atomic load (PR #120598)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120598 >From 39f039e23b95204affb61d1ac004c562f08222c5 Mon Sep 17 00:00:00 2001 From: jofrn Date: Thu, 19 Dec 2024 11:19:39 -0500 Subject: [PATCH] [SelectionDAG] Widen <2 x T> vector types for atomic load Vector types of 2 elements must be widened. This change does this for vector types of atomic load in SelectionDAG so that it can translate aligned vectors of >1 size. commit-id:2894ccd1 --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 97 -- llvm/test/CodeGen/X86/atomic-load-store.ll| 286 ++ 3 files changed, 361 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d24b4517a460d..b6e018ba0e454 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -1068,6 +1068,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N); SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); + SDValue WidenVecRes_ATOMIC_LOAD(AtomicSDNode *N); SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index d6cbf2211f053..42763aab5bb55 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4622,6 +4622,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; + case ISD::ATOMIC_LOAD: +Res = WidenVecRes_ATOMIC_LOAD(cast(N)); +break; case ISD::LOAD: Res = WidenVecRes_LOAD(N); break; case ISD::STEP_VECTOR: case ISD::SPLAT_VECTOR: @@ -6003,6 +6006,74 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { N->getOperand(1), N->getOperand(2)); } +/// Either return the same load or provide appropriate casts +/// from the load and return that. 
+static SDValue coerceLoadedValue(SDValue LdOp, EVT FirstVT, EVT WidenVT, + TypeSize LdWidth, TypeSize FirstVTWidth, + SDLoc dl, SelectionDAG &DAG) { + assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth)); + TypeSize WidenWidth = WidenVT.getSizeInBits(); + if (!FirstVT.isVector()) { +unsigned NumElts = +WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue(); +EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), FirstVT, NumElts); +SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp); +return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp); + } + assert(FirstVT == WidenVT); + return LdOp; +} + +static std::optional findMemType(SelectionDAG &DAG, + const TargetLowering &TLI, unsigned Width, + EVT WidenVT, unsigned Align, + unsigned WidenEx); + +SDValue DAGTypeLegalizer::WidenVecRes_ATOMIC_LOAD(AtomicSDNode *LD) { + EVT WidenVT = + TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0)); + EVT LdVT = LD->getMemoryVT(); + SDLoc dl(LD); + assert(LdVT.isVector() && WidenVT.isVector() && "Expected vectors"); + assert(LdVT.isScalableVector() == WidenVT.isScalableVector() && + "Must be scalable"); + assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType() && + "Expected equivalent element types"); + + // Load information + SDValue Chain = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); + AAMDNodes AAInfo = LD->getAAInfo(); + + TypeSize LdWidth = LdVT.getSizeInBits(); + TypeSize WidenWidth = WidenVT.getSizeInBits(); + TypeSize WidthDiff = WidenWidth - LdWidth; + + // Find the vector type that can load from. + std::optional FirstVT = + findMemType(DAG, TLI, LdWidth.getKnownMinValue(), WidenVT, /*LdAlign=*/0, + WidthDiff.getKnownMinValue()); + + if (!FirstVT) +return SDValue(); + + SmallVector MemVTs; + TypeSize FirstVTWidth = FirstVT->getSizeInBits(); + + SDValue LdOp = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, *FirstVT, *FirstVT, + Chain, BasePtr, LD->getMemOperand()); + + // Load the element with one instruction. + SDValue Result = coerceLoadedValue(LdOp, *FirstVT, WidenVT, LdWidth, + FirstVTWidth, dl, DAG); + + // Modified the chain - switch anything that used the old chain to use + // the new one. +
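For reference, the shape of IR this widening path handles looks like the following (the function name is invented; the series' own tests use the same pattern): a 2-element atomic load whose result type is widened, loaded with one suitably sized scalar access, and coerced back to the widened vector type by `coerceLoadedValue`.

```llvm
; A <2 x i32> atomic load: the result type widens to v4i32, the 64 bits are
; loaded atomically in one instruction, and the value is bitcast back to the
; widened vector type.
define <2 x i32> @atomic_vec2_i32_sketch(ptr %p) {
  %v = load atomic <2 x i32>, ptr %p acquire, align 8
  ret <2 x i32> %v
}
```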
[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #120640)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120640 >From f91634795e4dbfd6e081f0b096b872411051ff0f Mon Sep 17 00:00:00 2001 From: jofrn Date: Thu, 19 Dec 2024 16:25:55 -0500 Subject: [PATCH] [SelectionDAG] Split vector types for atomic load Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half. commit-id:3a045357 --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 37 llvm/test/CodeGen/X86/atomic-load-store.ll| 86 +++ 3 files changed, 124 insertions(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index b6e018ba0e454..b15f70cbec1cd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -965,6 +965,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 42763aab5bb55..af48a5e216803 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1161,6 +1161,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_STEP_VECTOR(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; + case ISD::ATOMIC_LOAD: +SplitVecRes_ATOMIC_LOAD(cast(N), Lo, Hi); +break; case ISD::LOAD: SplitVecRes_LOAD(cast(N), Lo, Hi); break; @@ -1414,6 +1417,40 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SetSplitVector(SDValue(N, ResNo), Lo, Hi); } +void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, + SDValue &Hi) { + assert(LD->getExtensionType() == ISD::NON_EXTLOAD && + "Extended load during type legalization!"); + SDLoc dl(LD); + EVT VT = LD->getValueType(0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + + SDValue Ch = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + EVT MemIntVT = + EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits()); + SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch, + Ptr, LD->getMemOperand()); + + EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits()); + EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits()); + SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD); + SDValue ExtractHi = + DAG.getNode(ISD::SRL, dl, IntVT, ALD, + DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl)); + ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi); + + Lo = DAG.getBitcast(LoVT, ExtractLo); + Hi = DAG.getBitcast(HiVT, ExtractHi); + + // Legalize the chain result - switch anything that used the old chain 
to + // use the new one. + ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1)); +} + void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT, MachinePointerInfo &MPI, SDValue &Ptr, uint64_t *ScaledOffset) { diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 535a87316e162..039edcbf83544 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -374,6 +374,74 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) { ret <2 x float> %ret } +define <2 x half> @atomic_vec2_half(ptr %x) { +; CHECK-O3-LABEL: atomic_vec2_half: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec2_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O3-NEXT:retq +; +
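The new `atomic_vec2_half` test in this hunk exercises exactly the split path described above. A standalone sketch of that case (function name changed to avoid clashing with the test):

```llvm
; A <2 x half> atomic load: the value is loaded as one 32-bit integer
; ATOMIC_LOAD, the two halves are recovered with TRUNCATE and SRL, and each
; half is bitcast to its split subvector type.
define <2 x half> @atomic_vec2_half_example(ptr %p) {
  %v = load atomic <2 x half>, ptr %p acquire, align 4
  ret <2 x half> %v
}
```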
[llvm-branch-commits] [llvm] [SelectionDAG] Legalize <1 x T> vector types for atomic load (PR #120385)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120385 >From 85e5dc5e42cfe0f2f4875cb4db990f92b68295ed Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 18 Dec 2024 03:37:17 -0500 Subject: [PATCH] [SelectionDAG] Legalize <1 x T> vector types for atomic load `load atomic <1 x T>` is not valid. This change legalizes vector types of atomic load via scalarization in SelectionDAG so that it can, for example, translate from `v1i32` to `i32`. commit-id:5c36cc8c --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 15 ++ llvm/test/CodeGen/X86/atomic-load-store.ll| 250 +- 3 files changed, 257 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index dd9af47da5287..d24b4517a460d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -879,6 +879,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ScalarizeVecRes_UnaryOpWithExtraInput(SDNode *N); SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); + SDValue ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 4d844f0036a75..d6cbf2211f053 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -65,6 +65,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { R = ScalarizeVecRes_UnaryOpWithExtraInput(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; + case ISD::ATOMIC_LOAD: +R = ScalarizeVecRes_ATOMIC_LOAD(cast(N)); +break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; @@ -455,6 +458,18 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { return Op; } +SDValue DAGTypeLegalizer::ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N) { + SDValue Result = DAG.getAtomicLoad( + ISD::NON_EXTLOAD, SDLoc(N), N->getMemoryVT().getVectorElementType(), + N->getValueType(0).getVectorElementType(), N->getChain(), N->getBasePtr(), + N->getMemOperand()); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. 
+ ReplaceValueWith(SDValue(N, 1), Result.getValue(1)); + return Result; +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { assert(N->isUnindexed() && "Indexed vector load?"); diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 45277ce3d26c4..4f5cb5a4e9247 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64| FileCheck %s --check-prefixes=CHECK,CHECK-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64| FileCheck %s --check-prefixes=CHECK,CHECK-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-6
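A minimal sketch of the scalarization case this patch handles (same shape as the `atomic_vec1_*` tests it updates; the function name here is made up):

```llvm
; A <1 x i32> atomic load is scalarized: the legalizer emits a plain i32 atomic
; load, rewires the chain to the new node, and the result is selected like any
; scalar acquire load.
define <1 x i32> @atomic_vec1_i32_sketch(ptr %p) {
  %v = load atomic <1 x i32>, ptr %p acquire, align 4
  ret <1 x i32> %v
}
```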
[llvm-branch-commits] [llvm] [X86] Manage atomic load of fp -> int promotion in DAG (PR #120386)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120386 >From 87d478c96fa8e64c4a5035c467cc800d0c55df2c Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 18 Dec 2024 03:38:23 -0500 Subject: [PATCH] [X86] Manage atomic load of fp -> int promotion in DAG When lowering atomic <1 x T> vector types with floats, selection can fail since this pattern is unsupported. To support this, floats can be casted to an integer type of the same size. commit-id:f9d761c5 --- llvm/lib/Target/X86/X86ISelLowering.cpp| 4 + llvm/test/CodeGen/X86/atomic-load-store.ll | 117 + 2 files changed, 121 insertions(+) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 99a82cab384aa..a203c84630fe1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2653,6 +2653,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f32, Promote); } + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16); + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32); + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64); + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, ISD::SCALAR_TO_VECTOR, diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 4f5cb5a4e9247..9fab8b98b4af0 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -269,3 +269,120 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind { %ret = load atomic <1 x i64>, ptr %x acquire, align 8 ret <1 x i64> %ret } + +define <1 x half> @atomic_vec1_half(ptr %x) { +; CHECK-O3-LABEL: atomic_vec1_half: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:movzwl (%rdi), %eax +; CHECK-O3-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax +; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax +; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O3-NEXT:retq +; +; CHECK-O0-LABEL: atomic_vec1_half: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT:movw (%rdi), %cx +; CHECK-O0-NEXT:# implicit-def: $eax +; CHECK-O0-NEXT:movw %cx, %ax +; CHECK-O0-NEXT:# implicit-def: $xmm0 +; CHECK-O0-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-O0-NEXT:retq +; +; CHECK-SSE-O0-LABEL: atomic_vec1_half: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT:movw (%rdi), %cx +; CHECK-SSE-O0-NEXT:# implicit-def: $eax +; CHECK-SSE-O0-NEXT:movw %cx, %ax +; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0 +; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT:retq +; +; CHECK-AVX-O0-LABEL: atomic_vec1_half: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT:movw (%rdi), %cx +; CHECK-AVX-O0-NEXT:# implicit-def: $eax +; CHECK-AVX-O0-NEXT:movw %cx, %ax +; CHECK-AVX-O0-NEXT:# implicit-def: $xmm0 +; CHECK-AVX-O0-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O0-NEXT:retq + %ret = load atomic <1 x half>, ptr %x acquire, align 2 + ret <1 x half> %ret +} + +define <1 x float> @atomic_vec1_float(ptr %x) { +; CHECK-O3-LABEL: atomic_vec1_float: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_float: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:movss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_float: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O3-NEXT:retq +; +; CHECK-O0-LABEL: atomic_vec1_float: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-O0-NEXT:retq +; +; CHECK-SSE-O0-LABEL: atomic_vec1_float: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-O0-NEXT:retq +; +; CHECK-AVX-O0-LABEL: atomic_vec1_float: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O0-NEXT:retq + %ret = load atomic <1 x float>, ptr %x acquire, align 4 + ret <1 x float> %ret +} + +define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_double_align: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_double_align: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:movsd {{.*#+}} xmm0 = m
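A minimal sketch of what the promotion enables (it mirrors the `atomic_vec1_float` test added above, with the function renamed): with `ATOMIC_LOAD` of `f32` promoted to `i32`, the scalarized load can be selected instead of failing in instruction selection.

```llvm
; The <1 x float> load is scalarized to f32, the f32 ATOMIC_LOAD is promoted to
; i32, and selection succeeds (the CHECK lines above show a plain movss).
define <1 x float> @atomic_vec1_float_sketch(ptr %p) {
  %v = load atomic <1 x float>, ptr %p acquire, align 4
  ret <1 x float> %v
}
```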
[llvm-branch-commits] [clang] [UBSan] Support src:*=sanitize for multiple ignorelists. (PR #141640)
https://github.com/qinkunbao updated https://github.com/llvm/llvm-project/pull/141640 >From cddba024f55d52e30d9c74369b3707b5fce64a20 Mon Sep 17 00:00:00 2001 From: Qinkun Bao Date: Tue, 27 May 2025 17:34:51 + Subject: [PATCH] Add some comments. Created using spr 1.3.6 --- clang/lib/Basic/NoSanitizeList.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/lib/Basic/NoSanitizeList.cpp b/clang/lib/Basic/NoSanitizeList.cpp index c58a67971dfb6..549bbda55e459 100644 --- a/clang/lib/Basic/NoSanitizeList.cpp +++ b/clang/lib/Basic/NoSanitizeList.cpp @@ -53,6 +53,8 @@ bool NoSanitizeList::containsFile(SanitizerMask Mask, StringRef FileName, // If we have two cases such as `src:a.cpp=sanitize` and `src:a.cpp`, the // current entry overrides the previous entry. if (SanLine > 0) +// std::pair uses lexicographic comparison. It will compare the file index +// first and then compare the line number. return std::make_pair(NoSanFileIdx, NoSanLine) > std::make_pair(SanFileIdx, SanLine); return true; ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [UBSan] Support src:*=sanitize for multiple ignorelists. (PR #141640)
https://github.com/qinkunbao edited https://github.com/llvm/llvm-project/pull/141640 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [X86] Add atomic vector tests for unaligned >1 sizes. (PR #120387)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120387 >From 7b4708f1ddcd76bd8ba94b0c85317e86bab36ef7 Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 18 Dec 2024 03:40:32 -0500 Subject: [PATCH] [X86] Add atomic vector tests for unaligned >1 sizes. Unaligned atomic vectors with size >1 are lowered to calls. Adding their tests separately here. commit-id:a06a5cc6 --- llvm/test/CodeGen/X86/atomic-load-store.ll | 588 + 1 file changed, 588 insertions(+) diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 9fab8b98b4af0..3e7b73a65fe07 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -270,6 +270,82 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind { ret <1 x i64> %ret } +define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_ptr: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:pushq %rax +; CHECK-O3-NEXT:movq %rdi, %rsi +; CHECK-O3-NEXT:movq %rsp, %rdx +; CHECK-O3-NEXT:movl $8, %edi +; CHECK-O3-NEXT:movl $2, %ecx +; CHECK-O3-NEXT:callq __atomic_load@PLT +; CHECK-O3-NEXT:movq (%rsp), %rax +; CHECK-O3-NEXT:popq %rcx +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_ptr: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:pushq %rax +; CHECK-SSE-O3-NEXT:movq %rdi, %rsi +; CHECK-SSE-O3-NEXT:movq %rsp, %rdx +; CHECK-SSE-O3-NEXT:movl $8, %edi +; CHECK-SSE-O3-NEXT:movl $2, %ecx +; CHECK-SSE-O3-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O3-NEXT:movq (%rsp), %rax +; CHECK-SSE-O3-NEXT:popq %rcx +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_ptr: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:pushq %rax +; CHECK-AVX-O3-NEXT:movq %rdi, %rsi +; CHECK-AVX-O3-NEXT:movq %rsp, %rdx +; CHECK-AVX-O3-NEXT:movl $8, %edi +; CHECK-AVX-O3-NEXT:movl $2, %ecx +; CHECK-AVX-O3-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O3-NEXT:movq (%rsp), %rax +; CHECK-AVX-O3-NEXT:popq %rcx +; CHECK-AVX-O3-NEXT:retq +; +; CHECK-O0-LABEL: atomic_vec1_ptr: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT:pushq %rax +; CHECK-O0-NEXT:movq %rdi, %rsi +; CHECK-O0-NEXT:movl $8, %edi +; CHECK-O0-NEXT:movq %rsp, %rdx +; CHECK-O0-NEXT:movl $2, %ecx +; CHECK-O0-NEXT:callq __atomic_load@PLT +; CHECK-O0-NEXT:movq (%rsp), %rax +; CHECK-O0-NEXT:popq %rcx +; CHECK-O0-NEXT:retq +; +; CHECK-SSE-O0-LABEL: atomic_vec1_ptr: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT:pushq %rax +; CHECK-SSE-O0-NEXT:movq %rdi, %rsi +; CHECK-SSE-O0-NEXT:movl $8, %edi +; CHECK-SSE-O0-NEXT:movq %rsp, %rdx +; CHECK-SSE-O0-NEXT:movl $2, %ecx +; CHECK-SSE-O0-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O0-NEXT:movq (%rsp), %rax +; CHECK-SSE-O0-NEXT:popq %rcx +; CHECK-SSE-O0-NEXT:retq +; +; CHECK-AVX-O0-LABEL: atomic_vec1_ptr: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT:pushq %rax +; CHECK-AVX-O0-NEXT:movq %rdi, %rsi +; CHECK-AVX-O0-NEXT:movl $8, %edi +; CHECK-AVX-O0-NEXT:movq %rsp, %rdx +; CHECK-AVX-O0-NEXT:movl $2, %ecx +; CHECK-AVX-O0-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O0-NEXT:movq (%rsp), %rax +; CHECK-AVX-O0-NEXT:popq %rcx +; CHECK-AVX-O0-NEXT:retq + %ret = load atomic <1 x ptr>, ptr %x acquire, align 4 + ret <1 x ptr> %ret +} + define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-O3-LABEL: atomic_vec1_half: ; CHECK-O3: # %bb.0: @@ -386,3 +462,515 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind { %ret = load atomic <1 x double>, ptr %x acquire, align 8 ret <1 x double> %ret } + +define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_i64: +; 
CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:pushq %rax +; CHECK-O3-NEXT:movq %rdi, %rsi +; CHECK-O3-NEXT:movq %rsp, %rdx +; CHECK-O3-NEXT:movl $8, %edi +; CHECK-O3-NEXT:movl $2, %ecx +; CHECK-O3-NEXT:callq __atomic_load@PLT +; CHECK-O3-NEXT:movq (%rsp), %rax +; CHECK-O3-NEXT:popq %rcx +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_i64: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:pushq %rax +; CHECK-SSE-O3-NEXT:movq %rdi, %rsi +; CHECK-SSE-O3-NEXT:movq %rsp, %rdx +; CHECK-SSE-O3-NEXT:movl $8, %edi +; CHECK-SSE-O3-NEXT:movl $2, %ecx +; CHECK-SSE-O3-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O3-NEXT:movq (%rsp), %rax +; CHECK-SSE-O3-NEXT:popq %rcx +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_i64: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:pushq %rax +; CHECK-AVX-O3-NEXT:movq %rdi, %rsi +; CHECK-AVX-O3-NEXT:movq %rsp, %rdx +; CHECK-AVX-O3-NEXT:movl $8, %edi +; CHECK-AVX-O3-NEXT:movl $2, %ecx +; CHECK-AVX-O3-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O3-NEXT:movq (%rsp), %rax +; CHECK-AV
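A minimal sketch of the under-aligned case these tests cover (hypothetical function name; the `atomic_vec1_i64` and `atomic_vec1_ptr` tests above use the same pattern): an 8-byte atomic load with only 4-byte alignment cannot be lowered inline, so it is expanded to a `__atomic_load` libcall, matching the CHECK lines above.

```llvm
; Under-aligned 8-byte atomic load: lowered to a __atomic_load call rather than
; a single instruction.
define <1 x i64> @atomic_vec1_i64_underaligned(ptr %p) {
  %v = load atomic <1 x i64>, ptr %p acquire, align 4
  ret <1 x i64> %v
}
```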
[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #120716)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120716 >From ac4069c8fa8e69173b203824f4db5fbd73ecb5a4 Mon Sep 17 00:00:00 2001 From: jofrn Date: Fri, 20 Dec 2024 06:14:28 -0500 Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector AtomicExpand fails for aligned `load atomic ` because it does not find a compatible library call. This change adds appropriate bitcasts so that the call can be lowered. It also adds support for 128 bit lowering in tablegen to support SSE/AVX. commit-id:f430c1af --- .../include/llvm/Target/TargetSelectionDAG.td | 14 + llvm/lib/CodeGen/AtomicExpandPass.cpp | 15 +- llvm/lib/Target/X86/X86InstrCompiler.td | 5 + llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 llvm/test/CodeGen/X86/atomic-load-store.ll| 94 +++ .../X86/expand-atomic-non-integer.ll | 263 -- 6 files changed, 360 insertions(+), 82 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 406baa4f5fdaa..3b8a34ca0eb51 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1904,6 +1904,20 @@ def atomic_load_64 : let MemoryVT = i64; } +def atomic_load_128_v2i64 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v2i64; +} + +def atomic_load_128_v4i32 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v4i32; +} + def atomic_load_nonext_8 : PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> { let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic? diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index c376de877ac7d..70f59eafc6ecb 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -2066,9 +2066,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( I->replaceAllUsesWith(V); } else if (HasResult) { Value *V; -if (UseSizedLibcall) - V = Builder.CreateBitOrPointerCast(Result, I->getType()); -else { +if (UseSizedLibcall) { + // Add bitcasts from Result's scalar type to I's vector type + auto *PtrTy = dyn_cast(I->getType()->getScalarType()); + auto *VTy = dyn_cast(I->getType()); + if (VTy && PtrTy && !Result->getType()->isVectorTy()) { +unsigned AS = PtrTy->getAddressSpace(); +Value *BC = Builder.CreateBitCast( +Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS))); +V = Builder.CreateIntToPtr(BC, I->getType()); + } else +V = Builder.CreateBitOrPointerCast(Result, I->getType()); +} else { V = Builder.CreateAlignedLoad(I->getType(), AllocaResult, AllocaAlignment); Builder.CreateLifetimeEnd(AllocaResult, SizeVal64); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 26b76dd1ca83a..3143015b7ec66 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1211,6 +1211,11 @@ def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src, def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src, (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float> +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (VMOVAPDrm addr:$src)>; // load atomic <2 x i64> +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (VMOVAPDrm addr:$src)>; // load atomic <4 x i32> + // Floating point loads/stores. 
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst), (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index 560dfde356c29..eaa2ffd9b2731 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { store atomic double %val1, ptr %ptr seq_cst, align 8 ret void } + +define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 { +; ARM-LABEL: atomic_vec1_ptr: +; ARM: @ %bb.0: +; ARM-NEXT:ldr r0, [r0] +; ARM-NEXT:dmb ish +; ARM-NEXT:bx lr +; +; ARMOPTNONE-LABEL: atomic_vec1_ptr: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT:ldr r0, [r0] +; ARMOPTNONE-NEXT:dmb ish +; ARMOPTNONE-NEXT:bx lr +; +; THUMBTWO-LABEL: atomic_vec1_ptr: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT:ldr r0, [r0] +; THUMBTWO-NEXT:dmb ish +; THUMBTWO-NEXT:bx lr +; +; THUMBONE-LABEL: atomic_vec1_ptr: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT:push {r7, lr} +; THUMBONE-NEXT:movs r1, #0 +; THUMBONE-NEXT:mov r2, r1 +; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4 +; THUMBONE-NEXT:
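As a rough sketch of what this patch aims at (my reading of it; the function name is invented and this is not one of its tests): a naturally aligned 16-byte atomic vector load can now be selected directly through the new `atomic_load_128_v2i64` pattern, while cases that still go through the sized `__atomic_load_16` libcall get the extra bitcasts in AtomicExpand so the integer result can be converted back to the vector (or vector-of-pointer) type.

```llvm
; Aligned 16-byte atomic vector load; with the new 128-bit patterns this can map
; to a single vmovapd on targets where that load is atomic.
define <2 x i64> @atomic_vec2_i64_aligned(ptr %p) {
  %v = load atomic <2 x i64>, ptr %p acquire, align 16
  ret <2 x i64> %v
}
```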