[llvm-branch-commits] [mlir] [mlir][Transforms] Merge 1:1 and 1:N type converters (PR #113032)
@@ -409,32 +419,50 @@ class TypeConverter { /// callback. /// /// With callback of form: - /// `Value(OpBuilder &, T, ValueRange, Location, Type)` + /// - Value(OpBuilder &, T, ValueRange, Location, Type) + /// - SmallVector(OpBuilder &, TypeRange, ValueRange, Location, Type) template std::enable_if_t< std::is_invocable_v, TargetMaterializationCallbackFn> wrapTargetMaterialization(FnT &&callback) const { return [callback = std::forward(callback)]( - OpBuilder &builder, Type resultType, ValueRange inputs, - Location loc, Type originalType) -> Value { - if (T derivedType = dyn_cast(resultType)) -return callback(builder, derivedType, inputs, loc, originalType); - return Value(); + OpBuilder &builder, TypeRange resultTypes, ValueRange inputs, + Location loc, Type originalType) -> SmallVector { + SmallVector result; + if constexpr (std::is_same::value) { +// This is a 1:N target materialization. Return the produces values +// directly. +result = callback(builder, resultTypes, inputs, loc, originalType); + } else { +// This is a 1:1 target materialization. Invoke it only if the result +// type class of the callback matches the requested result type. +if (T derivedType = dyn_cast(resultTypes.front())) { + // 1:1 materializations produce single values, but we store 1:N + // target materialization functions in the type converter. Wrap the + // result value in a SmallVector. + std::optional val = + callback(builder, derivedType, inputs, loc, originalType); + if (val.has_value() && *val) +result.push_back(*val); zero9178 wrote: ```suggestion Value val = callback(builder, derivedType, inputs, loc, originalType); if (val) result.push_back(val); ``` https://github.com/llvm/llvm-project/pull/113032 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [mlir][Transforms] Merge 1:1 and 1:N type converters (PR #113032)
@@ -409,32 +419,50 @@ class TypeConverter { /// callback. /// /// With callback of form: - /// `Value(OpBuilder &, T, ValueRange, Location, Type)` + /// - Value(OpBuilder &, T, ValueRange, Location, Type) + /// - SmallVector(OpBuilder &, TypeRange, ValueRange, Location, Type) template std::enable_if_t< std::is_invocable_v, TargetMaterializationCallbackFn> wrapTargetMaterialization(FnT &&callback) const { return [callback = std::forward(callback)]( - OpBuilder &builder, Type resultType, ValueRange inputs, - Location loc, Type originalType) -> Value { - if (T derivedType = dyn_cast(resultType)) -return callback(builder, derivedType, inputs, loc, originalType); - return Value(); + OpBuilder &builder, TypeRange resultTypes, ValueRange inputs, + Location loc, Type originalType) -> SmallVector { + SmallVector result; + if constexpr (std::is_same::value) { +// This is a 1:N target materialization. Return the produces values +// directly. +result = callback(builder, resultTypes, inputs, loc, originalType); + } else { +// This is a 1:1 target materialization. Invoke it only if the result +// type class of the callback matches the requested result type. +if (T derivedType = dyn_cast(resultTypes.front())) { + // 1:1 materializations produce single values, but we store 1:N + // target materialization functions in the type converter. Wrap the + // result value in a SmallVector. + std::optional val = + callback(builder, derivedType, inputs, loc, originalType); + if (val.has_value() && *val) +result.push_back(*val); +} zero9178 wrote: Should this additionally check whether `resultTypes` is of size 1? My thought was that a 1:1 target materialization might otherwise accidently be called for what was meant to be a 1:N target materialization. Skipping a 1:1 target materialization for 1:N target materializations makes the most sense to me. https://github.com/llvm/llvm-project/pull/113032 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
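A minimal sketch of the shape these two review comments point at, assuming the plain `Value` return from the earlier suggestion; this is illustrative only, not the final patch:

```cpp
// Inside the 1:1 branch of wrapTargetMaterialization (sketch): only invoke a
// registered 1:1 callback when exactly one result type was requested, so it
// is skipped for requests that are genuinely 1:N.
if (resultTypes.size() == 1) {
  if (T derivedType = dyn_cast<T>(resultTypes.front())) {
    if (Value val = callback(builder, derivedType, inputs, loc, originalType))
      result.push_back(val);
  }
}
```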
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/112638 >From 6225d74229d41068c57109a24b063f6fcba13985 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Wed, 16 Oct 2024 17:09:07 -0700 Subject: [PATCH 1/3] [StructuralHash] Support Differences This comutes a structural hash while allowing for selective ignoring of certain operands based on a custom function that is provided. Instead of a single hash value, it now returns FunctionHashInfo which includes a hash value, an instruction mapping, and a map to track the operand location and its corresponding hash value that is ignored. --- llvm/include/llvm/IR/StructuralHash.h| 46 ++ llvm/lib/IR/StructuralHash.cpp | 188 +-- llvm/unittests/IR/StructuralHashTest.cpp | 55 +++ 3 files changed, 275 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h index aa292bc3446799..bc82c204c4d1f6 100644 --- a/llvm/include/llvm/IR/StructuralHash.h +++ b/llvm/include/llvm/IR/StructuralHash.h @@ -14,7 +14,9 @@ #ifndef LLVM_IR_STRUCTURALHASH_H #define LLVM_IR_STRUCTURALHASH_H +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StableHashing.h" +#include "llvm/IR/Instruction.h" #include namespace llvm { @@ -23,6 +25,7 @@ class Function; class Module; using IRHash = stable_hash; +using OpndHash = stable_hash; /// Returns a hash of the function \p F. /// \param F The function to hash. @@ -37,6 +40,49 @@ IRHash StructuralHash(const Function &F, bool DetailedHash = false); /// composed the module hash. IRHash StructuralHash(const Module &M, bool DetailedHash = false); +/// The pair of an instruction index and a operand index. +using IndexPair = std::pair; + +/// A map from an instruction index to an instruction pointer. +using IndexInstrMap = MapVector; + +/// A map from an IndexPair to an OpndHash. +using IndexOperandHashMapType = DenseMap; + +/// A function that takes an instruction and an operand index and returns true +/// if the operand should be ignored in the function hash computation. +using IgnoreOperandFunc = std::function; + +struct FunctionHashInfo { + /// A hash value representing the structural content of the function + IRHash FunctionHash; + /// A mapping from instruction indices to instruction pointers + std::unique_ptr IndexInstruction; + /// A mapping from pairs of instruction indices and operand indices + /// to the hashes of the operands. This can be used to analyze or + /// reconstruct the differences in ignored operands + std::unique_ptr IndexOperandHashMap; + + FunctionHashInfo(IRHash FuntionHash, + std::unique_ptr IndexInstruction, + std::unique_ptr IndexOperandHashMap) + : FunctionHash(FuntionHash), +IndexInstruction(std::move(IndexInstruction)), +IndexOperandHashMap(std::move(IndexOperandHashMap)) {} +}; + +/// Computes a structural hash of a given function, considering the structure +/// and content of the function's instructions while allowing for selective +/// ignoring of certain operands based on custom criteria. This hash can be used +/// to identify functions that are structurally similar or identical, which is +/// useful in optimizations, deduplication, or analysis tasks. +/// \param F The function to hash. +/// \param IgnoreOp A callable that takes an instruction and an operand index, +/// and returns true if the operand should be ignored in the hash computation. 
+/// \return A FunctionHashInfo structure +FunctionHashInfo StructuralHashWithDifferences(const Function &F, + IgnoreOperandFunc IgnoreOp); + } // end namespace llvm #endif diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp index a1fabab77d52b2..6e0af666010a05 100644 --- a/llvm/lib/IR/StructuralHash.cpp +++ b/llvm/lib/IR/StructuralHash.cpp @@ -28,6 +28,19 @@ class StructuralHashImpl { bool DetailedHash; + /// IgnoreOp is a function that returns true if the operand should be ignored. + IgnoreOperandFunc IgnoreOp = nullptr; + /// A mapping from instruction indices to instruction pointers. + /// The index represents the position of an instruction based on the order in + /// which it is first encountered. + std::unique_ptr IndexInstruction = nullptr; + /// A mapping from pairs of instruction indices and operand indices + /// to the hashes of the operands. + std::unique_ptr IndexOperandHashMap = nullptr; + + /// Assign a unique ID to each Value in the order they are first seen. + DenseMap ValueToId; + // This will produce different values on 32-bit and 64-bit systens as // hash_combine returns a size_t. However, this is only used for // detailed hashing which, in-tree, only needs to distinguish between @@ -47,24 +60,140 @@ class StructuralHashImpl { public: StructuralHashImpl() = delete; - explicit StructuralHashImpl(bool DetailedHash) : DetailedHas
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
https://github.com/shiltian approved this pull request. https://github.com/llvm/llvm-project/pull/113038 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
@@ -896,5 +896,6 @@ void test_set_fpenv(unsigned long env) {
   __builtin_amdgcn_set_fpenv(env);
 }
+// CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0}

shiltian wrote:

Oh, I see. Range metadata does allow wrapped ranges, so `!{i32 1, i32 0}` covers every value except 0, i.e. the grid size is known to be nonzero.

https://github.com/llvm/llvm-project/pull/113038
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo *CSEInfo = nullptr; + GISelObserverWrapper Observer; + + if (TPC.isGISelCSEEnabled()) { +MIRBuilder = std::make_unique(); +CSEInfo = &Wrapper.get(TPC.getCSEConfig()); +MIRBuilder->setCSEInfo(CSEInfo); +Observer.addObserver(CSEInfo); +MIRBuilder->setChangeObserver(Observer); + } else { +MIRBuilder = std::make_unique(); + } + MIRBuilder->setMF(MF); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (auto &MBB : MF) { +for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); +} + } + + for (auto &MI : AllInst) { +if (!MI->isPreISelOpcode()) + continue; + +unsigned Opc = MI->getOpcode(); + +// Insert point for use operands needs some calculation. +if (Opc == G_PHI) { + RBLegalizeHelper.applyMappingPHI(*MI); + continue; +} + +// Opcodes that support pretty much all combinations of reg banks and LLTs +// (except S1). There is no point in writing rules for them. +if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES || +Opc == G_MERGE_VALUES) { + RBLegalizeHelper.applyMappingTrivial(*MI); + continue; +} + +// Opcodes that also support S1. S1 rules are in RegBankLegalizeRules. +// Remaining reg bank and LLT combinations are trivially accepted. +if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) && +!isS1(MI->getOperand(0).getReg(), MRI)) { + assert(isSgprRB(MI->getOperand(0).getReg(), MRI)); + continue; +} + +if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) { + MI->dump(); + llvm_unreachable("failed to match any of the rules"); +} + } + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + + // SGPR S1 clean up combines: + // - SGPR S1(S32) to SGPR S1(S32) Copy: anyext + trunc combine. + // In RBLegalize 'S1 Dst' are legalized into S32 as'S1Dst = Trunc S32Dst' + // and 'S1 Src' into 'S32Src = Anyext S1Src'. + // S1 Truncs and Anyexts that come from legalizer will also be cleaned up. + // Note: they can have non-S32 types e.g. S16 = Anyext S1 or S1 = Trunc S64. + // - Sgpr S1(S32) to VCC Copy: G_COPY_VCC_SCC combine. 
+ // Divergent instruction uses Sgpr S1 as input that should be lane mask(VCC) + // Legalizing this use creates Sgpr S1(S32) to VCC Copy. + + // Note: Remaining S1 copies, S1s are either SGPR S1(S32) or VCC S1: + // - VCC to VCC Copy: nothing to do here, just a regular copy. + // - VCC to SGPR S1 Copy: Should not exist in a form of COPY instruction(*). + // Note: For 'uniform-in-VCC to SGPR-S1 copy' G_COPY_SCC_VCC is used + // instead. When only available instruction creates VCC result, use of + // UniformInVcc results in creating G_COPY_SCC_VCC. + + // (*)Explanation for 'SGPR S1(uniform) = COPY VCC(divergent)': + // Copy from divergent to uniform register indicates an error in either: + // - Uniformity analysis: Uniform instruction has divergent input. If one of + // the inputs is divergent, instruction should be divergent! + // - RBLegalizer not executing in waterfall loop (missing implementation) + + using namespace MIPatternMatch; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + for (auto &MBB : MF) { +for (auto &MI : make_early_inc_range(MBB)) { + + if (MI.getOpcode() == G_TRUNC && isTriviallyDead(MI, MRI)) { +
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
@@ -236,6 +328,127 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { return GRules.at(GRulesAlias.at(Opc)); } +// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. +class Predicate { +public: + struct Elt { +// Save formula composed of Pred, '&&', '||' and '!' as a jump table. +// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C +// Sequences of && and || will be represented by jumps, for example: +// (A && B && ... X) or (A && B && ... X) || Y +// A == true jump to B +// A == false jump to end or Y, result is A(false) or Y +// (A || B || ... X) or (A || B || ... X) && Y +// A == true jump to end or Y, result is B(true) or Y +// A == false jump B +// Notice that when negating expression, we apply simply flip Neg on each +// Pred and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&). +std::function Pred; +bool Neg; // Neg of Pred is calculated before jump +unsigned TJumpOffset; +unsigned FJumpOffset; + }; + + SmallVector Expression; + + Predicate(std::function Pred) { +Expression.push_back({Pred, false, 1, 1}); + }; + + Predicate(SmallVectorImpl &Expr) { Expression.swap(Expr); }; + + bool operator()(const MachineInstr &MI) const { +unsigned Idx = 0; +unsigned ResultIdx = Expression.size(); +bool Result; +do { + Result = Expression[Idx].Pred(MI); + Result = Expression[Idx].Neg ? !Result : Result; + if (Result) { +Idx += Expression[Idx].TJumpOffset; + } else { +Idx += Expression[Idx].FJumpOffset; + } +} while ((Idx != ResultIdx)); + +return Result; + }; + + Predicate operator!() { +SmallVector NegExpression; +for (unsigned i = 0; i < Expression.size(); ++i) { arsenm wrote: Range loop https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
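For reference, a sketch of the range-based form being asked for. The original loop body is cut off in the quoted hunk, so the body below only follows the in-code comment (flip `Neg`, swap the two jump offsets) and is an assumption:

```cpp
Predicate operator!() {
  SmallVector<Elt> NegExpression;
  // Negation flips Neg on each Pred and swaps the true/false jump offsets
  // (&& becomes ||, || becomes &&), as described in the comment above.
  for (const Elt &E : Expression)
    NegExpression.push_back({E.Pred, !E.Neg, E.FJumpOffset, E.TJumpOffset});
  return Predicate(NegExpression);
}
```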
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
@@ -119,6 +210,53 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
     MI.eraseFromParent();
     break;
   }
+  case SplitLoad: {
+    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+    LLT V8S16 = LLT::fixed_vector(8, S16);
+    LLT V4S32 = LLT::fixed_vector(4, S32);
+    LLT V2S64 = LLT::fixed_vector(2, S64);
+
+    if (DstTy == LLT::fixed_vector(8, S32))

arsenm wrote:

Can you rework this into a function that returns the type to use for the load?

https://github.com/llvm/llvm-project/pull/112882
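A rough shape for such a helper; the name and the concrete type mapping below are assumptions, since the quoted hunk ends right after the first `if`:

```cpp
// Hypothetical helper: map the load's destination type to the part type used
// for splitting, so lower() only has to loop over the parts.
static LLT getTyToBreakLoadInto(LLT DstTy) {
  LLT S32 = LLT::scalar(32);
  if (DstTy == LLT::fixed_vector(8, S32))
    return LLT::fixed_vector(4, S32); // assumed split; see the original patch
  // ... remaining DstTy -> part-type cases from the original case statement ...
  return DstTy; // no split needed
}
```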
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
@@ -37,6 +37,97 @@ bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { return true; } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(BasePtrReg); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(BasePtrReg); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePtrPlusOffsetReg; +if (ByteOffset == 0) { + BasePtrPlusOffsetReg = BasePtrReg; +} else { + BasePtrPlusOffsetReg = MRI.createVirtualRegister({PtrRB, PtrTy}); + Register OffsetReg = MRI.createVirtualRegister({PtrRB, OffsetTy}); + B.buildConstant(OffsetReg, ByteOffset); + B.buildPtrAdd(BasePtrPlusOffsetReg, BasePtrReg, OffsetReg); arsenm wrote: Would be nice if we could fold createVirtualRegister with bank into the build* methods https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
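On the `createVirtualRegister` point, this is roughly what the call site could collapse to if such overloads existed. Both the `DstOp`-with-bank form and the folded constant operand are hypothetical, not current `MachineIRBuilder` API:

```cpp
// Hypothetical: let the build* methods create the bank-annotated vreg
// themselves instead of requiring a separate createVirtualRegister call.
auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
Register BasePtrPlusOffsetReg =
    B.buildPtrAdd({PtrRB, PtrTy}, BasePtrReg, Offset).getReg(0);
```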
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
@@ -293,7 +506,87 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}}) .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}}); - addRulesForGOpcs({G_LOAD}).Any({{DivS32, DivP1}, {{Vgpr32}, {VgprP1}}}); + bool hasUnAlignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; + bool hasSMRDSmall = ST->hasScalarSubwordLoads(); + + Predicate isAlign16([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->getAlign() >= Align(16); + }); + + Predicate isAlign4([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->getAlign() >= Align(4); + }); + + Predicate isAtomicMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->isAtomic(); + }); + + Predicate isUniMMO([](const MachineInstr &MI) -> bool { +return AMDGPUInstrInfo::isUniformMMO(*MI.memoperands_begin()); + }); + + Predicate isConst([](const MachineInstr &MI) -> bool { +// This is wierd. Can AS in MMO be different then AS on pointer? arsenm wrote: Typo wierd. Yes, the verifier doesn't enforce this (although maybe it should). In the cases where the addrspace comes from an underlying IR value, it definitely can mismatch https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
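For the address-space question, a small sketch of the two values being compared; per the reply they can legitimately differ, so asserting equality here would be wrong (variable names follow the surrounding patch):

```cpp
// Address space recorded on the memory operand vs. the one carried by the
// pointer's LLT; these may mismatch when the MMO comes from an IR value.
const MachineMemOperand *MMO = *MI.memoperands_begin();
unsigned MMOAddrSpace = MMO->getAddrSpace();
unsigned PtrAddrSpace =
    MRI.getType(MI.getOperand(1).getReg()).getAddressSpace();
```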
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
@@ -47,24 +60,140 @@ class StructuralHashImpl { public: StructuralHashImpl() = delete; - explicit StructuralHashImpl(bool DetailedHash) : DetailedHash(DetailedHash) {} + explicit StructuralHashImpl(bool DetailedHash, + IgnoreOperandFunc IgnoreOp = nullptr) + : DetailedHash(DetailedHash), IgnoreOp(IgnoreOp) { +if (IgnoreOp) { + IndexInstruction = std::make_unique(); + IndexOperandHashMap = std::make_unique(); +} + } - stable_hash hashConstant(Constant *C) { + stable_hash hashAPInt(const APInt &I) { SmallVector Hashes; -// TODO: hashArbitaryType() is not stable. -if (ConstantInt *ConstInt = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstInt->getValue())); -} else if (ConstantFP *ConstFP = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstFP->getValue())); -} else if (Function *Func = dyn_cast(C)) - // Hashing the name will be deterministic as LLVM's hashing infrastructure - // has explicit support for hashing strings and will not simply hash - // the pointer. - Hashes.emplace_back(hashArbitaryType(Func->getName())); +Hashes.emplace_back(I.getBitWidth()); +for (unsigned J = 0; J < I.getNumWords(); ++J) + Hashes.emplace_back((I.getRawData())[J]); +return stable_hash_combine(Hashes); + } + stable_hash hashAPFloat(const APFloat &F) { +SmallVector Hashes; +const fltSemantics &S = F.getSemantics(); +Hashes.emplace_back(APFloat::semanticsPrecision(S)); +Hashes.emplace_back(APFloat::semanticsMaxExponent(S)); +Hashes.emplace_back(APFloat::semanticsMinExponent(S)); +Hashes.emplace_back(APFloat::semanticsSizeInBits(S)); +Hashes.emplace_back(hashAPInt(F.bitcastToAPInt())); return stable_hash_combine(Hashes); } + stable_hash hashGlobalValue(const GlobalValue *GV) { +if (!GV->hasName()) + return 0; +return stable_hash_name(GV->getName()); + } + + // Compute a hash for a Constant. This function is logically similar to + // FunctionComparator::cmpConstants() in FunctionComparator.cpp, but here + // we're interested in computing a hash rather than comparing two Constants. + // Some of the logic is simplified, e.g, we don't expand GEPOperator. + stable_hash hashConstant(Constant *C) { +SmallVector Hashes; + +Type *Ty = C->getType(); +Hashes.emplace_back(hashType(Ty)); + +if (C->isNullValue()) { + Hashes.emplace_back(static_cast('N')); + return stable_hash_combine(Hashes); +} + +auto *G = dyn_cast(C); +if (G) { + Hashes.emplace_back(hashGlobalValue(G)); + return stable_hash_combine(Hashes); +} + +if (const auto *Seq = dyn_cast(C)) { + Hashes.emplace_back(xxh3_64bits(Seq->getRawDataValues())); + return stable_hash_combine(Hashes); +} + +switch (C->getValueID()) { +case Value::UndefValueVal: +case Value::PoisonValueVal: +case Value::ConstantTokenNoneVal: { + return stable_hash_combine(Hashes); +} +case Value::ConstantIntVal: { + const APInt &Int = cast(C)->getValue(); + Hashes.emplace_back(hashAPInt(Int)); + return stable_hash_combine(Hashes); +} +case Value::ConstantFPVal: { + const APFloat &APF = cast(C)->getValueAPF(); + Hashes.emplace_back(hashAPFloat(APF)); + return stable_hash_combine(Hashes); +} +case Value::ConstantArrayVal: { + const ConstantArray *A = cast(C); + uint64_t NumElements = cast(Ty)->getNumElements(); + Hashes.emplace_back(NumElements); kyulee-com wrote: Yeah. We could remove the count. https://github.com/llvm/llvm-project/pull/112638 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
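What dropping the count would look like for the array case, as a sketch of the simplification being agreed on (the struct/vector/expr cases would change the same way):

```cpp
case Value::ConstantArrayVal: {
  // The per-element hashes, together with the type hash added above, make the
  // explicit element count redundant, so it is no longer mixed in.
  for (auto &Op : cast<ConstantArray>(C)->operands())
    Hashes.emplace_back(hashConstant(cast<Constant>(Op)));
  return stable_hash_combine(Hashes);
}
```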
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
@@ -47,24 +60,140 @@ class StructuralHashImpl { public: StructuralHashImpl() = delete; - explicit StructuralHashImpl(bool DetailedHash) : DetailedHash(DetailedHash) {} + explicit StructuralHashImpl(bool DetailedHash, + IgnoreOperandFunc IgnoreOp = nullptr) + : DetailedHash(DetailedHash), IgnoreOp(IgnoreOp) { +if (IgnoreOp) { + IndexInstruction = std::make_unique(); + IndexOperandHashMap = std::make_unique(); +} + } - stable_hash hashConstant(Constant *C) { + stable_hash hashAPInt(const APInt &I) { SmallVector Hashes; -// TODO: hashArbitaryType() is not stable. -if (ConstantInt *ConstInt = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstInt->getValue())); -} else if (ConstantFP *ConstFP = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstFP->getValue())); -} else if (Function *Func = dyn_cast(C)) - // Hashing the name will be deterministic as LLVM's hashing infrastructure - // has explicit support for hashing strings and will not simply hash - // the pointer. - Hashes.emplace_back(hashArbitaryType(Func->getName())); +Hashes.emplace_back(I.getBitWidth()); +for (unsigned J = 0; J < I.getNumWords(); ++J) + Hashes.emplace_back((I.getRawData())[J]); +return stable_hash_combine(Hashes); + } + stable_hash hashAPFloat(const APFloat &F) { +SmallVector Hashes; +const fltSemantics &S = F.getSemantics(); +Hashes.emplace_back(APFloat::semanticsPrecision(S)); +Hashes.emplace_back(APFloat::semanticsMaxExponent(S)); +Hashes.emplace_back(APFloat::semanticsMinExponent(S)); +Hashes.emplace_back(APFloat::semanticsSizeInBits(S)); +Hashes.emplace_back(hashAPInt(F.bitcastToAPInt())); return stable_hash_combine(Hashes); } + stable_hash hashGlobalValue(const GlobalValue *GV) { +if (!GV->hasName()) + return 0; +return stable_hash_name(GV->getName()); kyulee-com wrote: `stable_hash_name` itself already handles it by calling `get_stable_name`. https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/ADT/StableHashing.h#L55-L74 https://github.com/llvm/llvm-project/pull/112638 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) +return 0; + + return Reg; +} + +bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { + MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + MachineIRBuilder B(MF); + + // Assign register banks to ALL def registers on G_ instructions. + // Same for copies if they have no register bank or class on def. + for (MachineBasicBlock &MBB : MF) { +for (MachineInstr &MI : MBB) { + if (!shouldRBSelect(MI)) +continue; + + for (MachineOperand &DefOP : MI.defs()) { +Register DefReg = getVReg(DefOP); +if (!DefReg) + continue; + +// Copies can have register class on def registers. +if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) { + continue; +} + +if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) { + setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::SGPRRegBankID)); +} else { + if (MRI.getType(DefReg) == LLT::scalar(1)) +setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::VCCRegBankID)); arsenm wrote: Can you directly use the pointer to th
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
@@ -47,24 +60,140 @@ class StructuralHashImpl { public: StructuralHashImpl() = delete; - explicit StructuralHashImpl(bool DetailedHash) : DetailedHash(DetailedHash) {} + explicit StructuralHashImpl(bool DetailedHash, + IgnoreOperandFunc IgnoreOp = nullptr) + : DetailedHash(DetailedHash), IgnoreOp(IgnoreOp) { +if (IgnoreOp) { + IndexInstruction = std::make_unique(); + IndexOperandHashMap = std::make_unique(); +} + } - stable_hash hashConstant(Constant *C) { + stable_hash hashAPInt(const APInt &I) { SmallVector Hashes; -// TODO: hashArbitaryType() is not stable. -if (ConstantInt *ConstInt = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstInt->getValue())); -} else if (ConstantFP *ConstFP = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstFP->getValue())); -} else if (Function *Func = dyn_cast(C)) - // Hashing the name will be deterministic as LLVM's hashing infrastructure - // has explicit support for hashing strings and will not simply hash - // the pointer. - Hashes.emplace_back(hashArbitaryType(Func->getName())); +Hashes.emplace_back(I.getBitWidth()); +for (unsigned J = 0; J < I.getNumWords(); ++J) + Hashes.emplace_back((I.getRawData())[J]); +return stable_hash_combine(Hashes); + } + stable_hash hashAPFloat(const APFloat &F) { +SmallVector Hashes; +const fltSemantics &S = F.getSemantics(); +Hashes.emplace_back(APFloat::semanticsPrecision(S)); +Hashes.emplace_back(APFloat::semanticsMaxExponent(S)); +Hashes.emplace_back(APFloat::semanticsMinExponent(S)); +Hashes.emplace_back(APFloat::semanticsSizeInBits(S)); +Hashes.emplace_back(hashAPInt(F.bitcastToAPInt())); return stable_hash_combine(Hashes); } + stable_hash hashGlobalValue(const GlobalValue *GV) { +if (!GV->hasName()) + return 0; +return stable_hash_name(GV->getName()); + } + + // Compute a hash for a Constant. This function is logically similar to + // FunctionComparator::cmpConstants() in FunctionComparator.cpp, but here + // we're interested in computing a hash rather than comparing two Constants. + // Some of the logic is simplified, e.g, we don't expand GEPOperator. 
+ stable_hash hashConstant(Constant *C) { +SmallVector Hashes; + +Type *Ty = C->getType(); +Hashes.emplace_back(hashType(Ty)); + +if (C->isNullValue()) { + Hashes.emplace_back(static_cast('N')); + return stable_hash_combine(Hashes); +} + +auto *G = dyn_cast(C); +if (G) { + Hashes.emplace_back(hashGlobalValue(G)); + return stable_hash_combine(Hashes); +} + +if (const auto *Seq = dyn_cast(C)) { + Hashes.emplace_back(xxh3_64bits(Seq->getRawDataValues())); + return stable_hash_combine(Hashes); +} + +switch (C->getValueID()) { +case Value::UndefValueVal: +case Value::PoisonValueVal: +case Value::ConstantTokenNoneVal: { + return stable_hash_combine(Hashes); +} +case Value::ConstantIntVal: { + const APInt &Int = cast(C)->getValue(); + Hashes.emplace_back(hashAPInt(Int)); + return stable_hash_combine(Hashes); +} +case Value::ConstantFPVal: { + const APFloat &APF = cast(C)->getValueAPF(); + Hashes.emplace_back(hashAPFloat(APF)); + return stable_hash_combine(Hashes); +} +case Value::ConstantArrayVal: { + const ConstantArray *A = cast(C); + uint64_t NumElements = cast(Ty)->getNumElements(); + Hashes.emplace_back(NumElements); + for (auto &Op : A->operands()) { +auto H = hashConstant(cast(Op)); +Hashes.emplace_back(H); + } + return stable_hash_combine(Hashes); +} +case Value::ConstantStructVal: { + const ConstantStruct *S = cast(C); + unsigned NumElements = cast(Ty)->getNumElements(); + Hashes.emplace_back(NumElements); + for (auto &Op : S->operands()) { +auto H = hashConstant(cast(Op)); +Hashes.emplace_back(H); + } + return stable_hash_combine(Hashes); +} +case Value::ConstantVectorVal: { + const ConstantVector *V = cast(C); + unsigned NumElements = cast(Ty)->getNumElements(); + Hashes.emplace_back(NumElements); + for (auto &Op : V->operands()) { +auto H = hashConstant(cast(Op)); +Hashes.emplace_back(H); + } + return stable_hash_combine(Hashes); +} +case Value::ConstantExprVal: { + const ConstantExpr *E = cast(C); + unsigned NumOperands = E->getNumOperands(); + Hashes.emplace_back(NumOperands); + for (auto &Op : E->operands()) { +auto H = hashConstant(cast(Op)); +Hashes.emplace_back(H); + } + return stable_hash_combine(Hashes); +} +case Value::BlockAddressVal: { + const BlockAddress *BA = cast(C); + auto H = hashGlobalValue(BA->getFunction()); + Hashes.emplace_back(H); + return stable_hash_co
[llvm-branch-commits] [clang] release/19.x: [clang] Make LazyOffsetPtr more portable (#112927) (PR #113052)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/113052 Backport 76196998e25b98d81abc437708622261810782ca Requested by: @mgorny >From f062f8c06bb237ee79c098414eafe5eb075aa9c8 Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Fri, 18 Oct 2024 21:49:23 +0100 Subject: [PATCH] [clang] Make LazyOffsetPtr more portable (#112927) LazyOffsetPtr currently relies on uint64_t being able to store a pointer and, unless sizeof(uint64_t) == sizeof(void *), little endianness, since getAddressOfPointer reinterprets the memory as a pointer. This also doesn't properly respect the C++ object model. As removing getAddressOfPointer would have wide-reaching implications, improve the implementation to account for these problems by using placement new and a suitably sized-and-aligned buffer, "right"-aligning the objects on big-endian platforms so the LSBs are in the same place for use as the discriminator. Fixes: bc73ef0031b50f7443615fef614fb4ecaaa4bd11 Fixes: https://github.com/llvm/llvm-project/issues/111993 (cherry picked from commit 76196998e25b98d81abc437708622261810782ca) --- clang/include/clang/AST/ExternalASTSource.h | 48 +++-- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h index 385c32edbae0fd..582ed7c65f58ca 100644 --- a/clang/include/clang/AST/ExternalASTSource.h +++ b/clang/include/clang/AST/ExternalASTSource.h @@ -25,10 +25,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/Support/PointerLikeTypeTraits.h" +#include #include #include #include #include +#include #include #include @@ -326,29 +328,49 @@ struct LazyOffsetPtr { /// /// If the low bit is clear, a pointer to the AST node. If the low /// bit is set, the upper 63 bits are the offset. - mutable uint64_t Ptr = 0; + static constexpr size_t DataSize = std::max(sizeof(uint64_t), sizeof(T *)); + alignas(uint64_t) alignas(T *) mutable unsigned char Data[DataSize] = {}; + + unsigned char GetLSB() const { +return Data[llvm::sys::IsBigEndianHost ? DataSize - 1 : 0]; + } + + template U &As(bool New) const { +unsigned char *Obj = +Data + (llvm::sys::IsBigEndianHost ? DataSize - sizeof(U) : 0); +if (New) + return *new (Obj) U; +return *std::launder(reinterpret_cast(Obj)); + } + + T *&GetPtr() const { return As(false); } + uint64_t &GetU64() const { return As(false); } + void SetPtr(T *Ptr) const { As(true) = Ptr; } + void SetU64(uint64_t U64) const { As(true) = U64; } public: LazyOffsetPtr() = default; - explicit LazyOffsetPtr(T *Ptr) : Ptr(reinterpret_cast(Ptr)) {} + explicit LazyOffsetPtr(T *Ptr) : Data() { SetPtr(Ptr); } - explicit LazyOffsetPtr(uint64_t Offset) : Ptr((Offset << 1) | 0x01) { + explicit LazyOffsetPtr(uint64_t Offset) : Data() { assert((Offset << 1 >> 1) == Offset && "Offsets must require < 63 bits"); if (Offset == 0) - Ptr = 0; + SetPtr(nullptr); +else + SetU64((Offset << 1) | 0x01); } LazyOffsetPtr &operator=(T *Ptr) { -this->Ptr = reinterpret_cast(Ptr); +SetPtr(Ptr); return *this; } LazyOffsetPtr &operator=(uint64_t Offset) { assert((Offset << 1 >> 1) == Offset && "Offsets must require < 63 bits"); if (Offset == 0) - Ptr = 0; + SetPtr(nullptr); else - Ptr = (Offset << 1) | 0x01; + SetU64((Offset << 1) | 0x01); return *this; } @@ -356,15 +378,15 @@ struct LazyOffsetPtr { /// Whether this pointer is non-NULL. /// /// This operation does not require the AST node to be deserialized. 
- explicit operator bool() const { return Ptr != 0; } + explicit operator bool() const { return isOffset() || GetPtr() != nullptr; } /// Whether this pointer is non-NULL. /// /// This operation does not require the AST node to be deserialized. - bool isValid() const { return Ptr != 0; } + bool isValid() const { return isOffset() || GetPtr() != nullptr; } /// Whether this pointer is currently stored as an offset. - bool isOffset() const { return Ptr & 0x01; } + bool isOffset() const { return GetLSB() & 0x01; } /// Retrieve the pointer to the AST node that this lazy pointer points to. /// @@ -375,9 +397,9 @@ struct LazyOffsetPtr { if (isOffset()) { assert(Source && "Cannot deserialize a lazy pointer without an AST source"); - Ptr = reinterpret_cast((Source->*Get)(OffsT(Ptr >> 1))); + SetPtr((Source->*Get)(OffsT(GetU64() >> 1))); } -return reinterpret_cast(Ptr); +return GetPtr(); } /// Retrieve the address of the AST node pointer. Deserializes the pointee if @@ -385,7 +407,7 @@ struct LazyOffsetPtr { T **getAddressOfPointer(ExternalASTSource *Source) const { // Ensure the integer is in pointer form. (void)get(Source); -return reinterpret_cast(&Ptr); +return &GetPtr(); } }; _
[llvm-branch-commits] [clang] release/19.x: [clang] Make LazyOffsetPtr more portable (#112927) (PR #113052)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/113052 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang] Introduce custom loop nest generation for loops in workshare construct (PR #101445)
https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/101445 >From 6f114e0501f1759eab34dc8ddfc3030c03037cd4 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Thu, 22 Aug 2024 18:07:05 +0900 Subject: [PATCH 1/2] [flang] Introduce ws loop nest generation for HLFIR lowering Emit loop nests in a custom wrapper Only emit unordered loops as omp loops Fix uninitialized memory bug in genLoopNest --- .../flang/Optimizer/Builder/HLFIRTools.h | 12 +++-- flang/lib/Lower/ConvertCall.cpp | 2 +- flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 4 +- flang/lib/Optimizer/Builder/HLFIRTools.cpp| 52 ++- .../HLFIR/Transforms/BufferizeHLFIR.cpp | 3 +- .../LowerHLFIROrderedAssignments.cpp | 33 ++-- .../Transforms/OptimizedBufferization.cpp | 6 +-- 7 files changed, 69 insertions(+), 43 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index 6b41025eea0780..f073f494b3fb21 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -357,8 +357,8 @@ hlfir::ElementalOp genElementalOp( /// Structure to describe a loop nest. struct LoopNest { - fir::DoLoopOp outerLoop; - fir::DoLoopOp innerLoop; + mlir::Operation *outerOp = nullptr; + mlir::Block *body = nullptr; llvm::SmallVector oneBasedIndices; }; @@ -366,11 +366,13 @@ struct LoopNest { /// \p isUnordered specifies whether the loops in the loop nest /// are unordered. LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange extents, bool isUnordered = false); + mlir::ValueRange extents, bool isUnordered = false, + bool emitWorkshareLoop = false); inline LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, -mlir::Value shape, bool isUnordered = false) { +mlir::Value shape, bool isUnordered = false, +bool emitWorkshareLoop = false) { return genLoopNest(loc, builder, getIndexExtents(loc, builder, shape), - isUnordered); + isUnordered, emitWorkshareLoop); } /// Inline the body of an hlfir.elemental at the current insertion point diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 9f5b58590fb79e..e84e7afbe82e09 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -2135,7 +2135,7 @@ class ElementalCallBuilder { hlfir::genLoopNest(loc, builder, shape, !mustBeOrdered); mlir::ValueRange oneBasedIndices = loopNest.oneBasedIndices; auto insPt = builder.saveInsertionPoint(); - builder.setInsertionPointToStart(loopNest.innerLoop.getBody()); + builder.setInsertionPointToStart(loopNest.body); callContext.stmtCtx.pushScope(); for (auto &preparedActual : loweredActuals) if (preparedActual) diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index 6b98ea3d0615b6..736de2ee511bef 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -374,7 +374,7 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, // know this won't miss any opportuinties for clever elemental inlining hlfir::LoopNest nest = hlfir::genLoopNest( loc, builder, shapeShift.getExtents(), /*isUnordered=*/true); - builder.setInsertionPointToStart(nest.innerLoop.getBody()); + builder.setInsertionPointToStart(nest.body); mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy()); auto lhsEleAddr = builder.create( loc, refTy, lhs, shapeShift, /*slice=*/mlir::Value{}, @@ -388,7 +388,7 @@ static void 
genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, builder, loc, redId, refTy, lhsEle, rhsEle); builder.create(loc, scalarReduction, lhsEleAddr); - builder.setInsertionPointAfter(nest.outerLoop); + builder.setInsertionPointAfter(nest.outerOp); builder.create(loc, lhsAddr); } diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 8d0ae2f195178c..31378841ed 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -20,6 +20,7 @@ #include "mlir/IR/IRMapping.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/TypeSwitch.h" +#include #include // Return explicit extents. If the base is a fir.box, this won't read it to @@ -855,26 +856,51 @@ mlir::Value hlfir::inlineElementalOp( hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange extents, bool isUnordered) { +
[llvm-branch-commits] [flang] [flang][omp] Emit omp.workshare in frontend (PR #101444)
https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/101444 >From bf363883787e9b4989dd858f8573579688f7044b Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Wed, 31 Jul 2024 14:11:47 +0900 Subject: [PATCH 1/2] [flang][omp] Emit omp.workshare in frontend Fix lower test for workshare --- flang/lib/Lower/OpenMP/OpenMP.cpp | 30 +++ flang/test/Lower/OpenMP/workshare.f90 | 6 +++--- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index cf469003b7298d..22f6d5bd09cd65 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1330,6 +1330,15 @@ static void genTaskwaitClauses(lower::AbstractConverter &converter, loc, llvm::omp::Directive::OMPD_taskwait); } +static void genWorkshareClauses(lower::AbstractConverter &converter, +semantics::SemanticsContext &semaCtx, +lower::StatementContext &stmtCtx, +const List &clauses, mlir::Location loc, +mlir::omp::WorkshareOperands &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processNowait(clauseOps); +} + static void genTeamsClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, @@ -1923,6 +1932,22 @@ genTaskyieldOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return converter.getFirOpBuilder().create(loc); } +static mlir::omp::WorkshareOp +genWorkshareOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, + mlir::Location loc, const ConstructQueue &queue, + ConstructQueue::iterator item) { + lower::StatementContext stmtCtx; + mlir::omp::WorkshareOperands clauseOps; + genWorkshareClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, +llvm::omp::Directive::OMPD_workshare) + .setClauses(&item->clauses), + queue, item, clauseOps); +} + static mlir::omp::TeamsOp genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, @@ -2515,10 +2540,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter, llvm::omp::getOpenMPDirectiveName(dir) + ")"); // case llvm::omp::Directive::OMPD_workdistribute: case llvm::omp::Directive::OMPD_workshare: -// FIXME: Workshare is not a commonly used OpenMP construct, an -// implementation for this feature will come later. For the codes -// that use this construct, add a single construct for now. 
-genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item); +genWorkshareOp(converter, symTable, semaCtx, eval, loc, queue, item); break; default: // Combined and composite constructs should have been split into a sequence diff --git a/flang/test/Lower/OpenMP/workshare.f90 b/flang/test/Lower/OpenMP/workshare.f90 index 1e11677a15e1f0..8e771952f5b6da 100644 --- a/flang/test/Lower/OpenMP/workshare.f90 +++ b/flang/test/Lower/OpenMP/workshare.f90 @@ -6,7 +6,7 @@ subroutine sb1(arr) integer :: arr(:) !CHECK: omp.parallel { !$omp parallel -!CHECK: omp.single { +!CHECK: omp.workshare { !$omp workshare arr = 0 !$omp end workshare @@ -20,7 +20,7 @@ subroutine sb2(arr) integer :: arr(:) !CHECK: omp.parallel { !$omp parallel -!CHECK: omp.single nowait { +!CHECK: omp.workshare nowait { !$omp workshare arr = 0 !$omp end workshare nowait @@ -33,7 +33,7 @@ subroutine sb2(arr) subroutine sb3(arr) integer :: arr(:) !CHECK: omp.parallel { -!CHECK: omp.single { +!CHECK: omp.workshare { !$omp parallel workshare arr = 0 !$omp end parallel workshare >From e23cf320ed37cb73971bed74cf260e524210a187 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Thu, 22 Aug 2024 17:01:43 +0900 Subject: [PATCH 2/2] Fix function signature --- flang/lib/Lower/OpenMP/OpenMP.cpp | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 22f6d5bd09cd65..daeb928e53d061 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1934,12 +1934,14 @@ genTaskyieldOp(lower::AbstractConverter &converter, lower::SymMap &symTable, static mlir::omp::WorkshareOp genWorkshareOp(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - mlir::Location loc, const ConstructQueue &queue, - ConstructQueue::
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; arsenm wrote: Use explicit Register() https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
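For reference, the spelling being asked for (sketch):

```cpp
Register getVReg(MachineOperand &Op) {
  if (!Op.isReg())
    return Register(); // explicit null Register instead of the literal 0
  Register Reg = Op.getReg();
  if (!Reg.isVirtual())
    return Register();
  return Reg;
}
```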
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); } } + +MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B, +const DstOp &SgprDst, +const SrcOp &VgprSrc, +const RegisterBankInfo &RBI) { + auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc}); + Register Dst = RFL->getOperand(0).getReg(); + Register Src = RFL->getOperand(1).getReg(); + MachineRegisterInfo &MRI = *B.getMRI(); + if (!MRI.getRegBankOrNull(Dst)) +MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID)); + if (!MRI.getRegBankOrNull(Src)) +MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID)); + return RFL; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, LLT B32Ty, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { +SgprDstParts.push_back( +buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0)); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(S64, VgprSrc); + + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { +MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID)); arsenm wrote: Use the direct VGPRRegBank pointer or pull this out of the loop https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
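A sketch of the hoisting half of the suggestion, resolving the bank once before the loop (whether to use the generated bank object directly instead of `getRegBank` is left to the patch author):

```cpp
const RegisterBank &VgprRB = RBI.getRegBank(AMDGPU::VGPRRegBankID);
for (unsigned I = 0; I < Unmerge->getNumOperands() - 1; ++I) {
  // Bank lookup is now outside the loop; only the per-part assignment remains.
  MRI.setRegBank(Unmerge.getReg(I), VgprRB);
  // ... rest of the loop body as in the patch ...
}
```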
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo *CSEInfo = nullptr; + GISelObserverWrapper Observer; + + if (TPC.isGISelCSEEnabled()) { +MIRBuilder = std::make_unique(); +CSEInfo = &Wrapper.get(TPC.getCSEConfig()); +MIRBuilder->setCSEInfo(CSEInfo); +Observer.addObserver(CSEInfo); +MIRBuilder->setChangeObserver(Observer); + } else { +MIRBuilder = std::make_unique(); + } + MIRBuilder->setMF(MF); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (auto &MBB : MF) { +for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); +} + } + + for (auto &MI : AllInst) { +if (!MI->isPreISelOpcode()) + continue; + +unsigned Opc = MI->getOpcode(); + +// Insert point for use operands needs some calculation. +if (Opc == G_PHI) { arsenm wrote: Missing namespace on all the opcodes is a bit jarring https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
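For example, spelling the opcode check with an explicit namespace rather than leaning on the `using namespace AMDGPU;` at the top of the file (sketch):

```cpp
if (Opc == AMDGPU::G_PHI) {
  RBLegalizeHelper.applyMappingPHI(*MI);
  continue;
}
```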
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); arsenm wrote: Use the usual llvm::call_once hack for this? https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
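For reference, the `llvm::call_once` idiom arsenm alludes to is sketched below. It is illustrative only: the patch keys its cache on `ST.getGeneration()` and calls `refreshRefs(ST, MRI)` on cache hits, so a plain `call_once` would not be a drop-in replacement, and the function name here is made up.

```cpp
#include "llvm/Support/Threading.h"
#include <memory>

// Hypothetical single-rule-set variant that ignores the per-generation cache.
static const RegBankLegalizeRules &getRulesOnce(const GCNSubtarget &ST,
                                                MachineRegisterInfo &MRI) {
  static std::unique_ptr<RegBankLegalizeRules> Rules;
  static llvm::once_flag Flag;
  llvm::call_once(
      Flag, [&] { Rules = std::make_unique<RegBankLegalizeRules>(ST, MRI); });
  return *Rules;
}
```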
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo *CSEInfo = nullptr; + GISelObserverWrapper Observer; + + if (TPC.isGISelCSEEnabled()) { +MIRBuilder = std::make_unique(); +CSEInfo = &Wrapper.get(TPC.getCSEConfig()); +MIRBuilder->setCSEInfo(CSEInfo); +Observer.addObserver(CSEInfo); +MIRBuilder->setChangeObserver(Observer); + } else { +MIRBuilder = std::make_unique(); + } + MIRBuilder->setMF(MF); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (auto &MBB : MF) { +for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); +} + } + + for (auto &MI : AllInst) { +if (!MI->isPreISelOpcode()) + continue; + +unsigned Opc = MI->getOpcode(); + +// Insert point for use operands needs some calculation. +if (Opc == G_PHI) { + RBLegalizeHelper.applyMappingPHI(*MI); + continue; +} + +// Opcodes that support pretty much all combinations of reg banks and LLTs +// (except S1). There is no point in writing rules for them. +if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES || +Opc == G_MERGE_VALUES) { + RBLegalizeHelper.applyMappingTrivial(*MI); + continue; +} + +// Opcodes that also support S1. S1 rules are in RegBankLegalizeRules. +// Remaining reg bank and LLT combinations are trivially accepted. +if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) && +!isS1(MI->getOperand(0).getReg(), MRI)) { + assert(isSgprRB(MI->getOperand(0).getReg(), MRI)); + continue; +} + +if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) { + MI->dump(); + llvm_unreachable("failed to match any of the rules"); +} + } + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + + // SGPR S1 clean up combines: + // - SGPR S1(S32) to SGPR S1(S32) Copy: anyext + trunc combine. + // In RBLegalize 'S1 Dst' are legalized into S32 as'S1Dst = Trunc S32Dst' + // and 'S1 Src' into 'S32Src = Anyext S1Src'. + // S1 Truncs and Anyexts that come from legalizer will also be cleaned up. + // Note: they can have non-S32 types e.g. S16 = Anyext S1 or S1 = Trunc S64. + // - Sgpr S1(S32) to VCC Copy: G_COPY_VCC_SCC combine. 
+ // Divergent instruction uses Sgpr S1 as input that should be lane mask(VCC) + // Legalizing this use creates Sgpr S1(S32) to VCC Copy. + + // Note: Remaining S1 copies, S1s are either SGPR S1(S32) or VCC S1: + // - VCC to VCC Copy: nothing to do here, just a regular copy. + // - VCC to SGPR S1 Copy: Should not exist in a form of COPY instruction(*). + // Note: For 'uniform-in-VCC to SGPR-S1 copy' G_COPY_SCC_VCC is used + // instead. When only available instruction creates VCC result, use of + // UniformInVcc results in creating G_COPY_SCC_VCC. + + // (*)Explanation for 'SGPR S1(uniform) = COPY VCC(divergent)': + // Copy from divergent to uniform register indicates an error in either: + // - Uniformity analysis: Uniform instruction has divergent input. If one of + // the inputs is divergent, instruction should be divergent! + // - RBLegalizer not executing in waterfall loop (missing implementation) + + using namespace MIPatternMatch; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + for (auto &MBB : MF) { +for (auto &MI : make_early_inc_range(MBB)) { + + if (MI.getOpcode() == G_TRUNC && isTriviallyDead(MI, MRI)) { +
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); } } + +MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B, +const DstOp &SgprDst, +const SrcOp &VgprSrc, +const RegisterBankInfo &RBI) { + auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc}); + Register Dst = RFL->getOperand(0).getReg(); + Register Src = RFL->getOperand(1).getReg(); + MachineRegisterInfo &MRI = *B.getMRI(); + if (!MRI.getRegBankOrNull(Dst)) +MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID)); + if (!MRI.getRegBankOrNull(Src)) +MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID)); + return RFL; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, LLT B32Ty, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { +SgprDstParts.push_back( +buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0)); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(S64, VgprSrc); + + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { +MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID)); +auto Unmerge64 = B.buildUnmerge(S32, Unmerge.getReg(i)); +SmallVector Unmerge64Parts; +Unmerge64Parts.push_back( +buildReadAnyLaneB32(B, S32, Unmerge64.getReg(0), RBI).getReg(0)); +Unmerge64Parts.push_back( +buildReadAnyLaneB32(B, S32, Unmerge64.getReg(1), RBI).getReg(0)); +Register MergeReg = B.buildMergeLikeInstr(S64, Unmerge64Parts).getReg(0); +MRI.setRegBank(MergeReg, RBI.getRegBank(AMDGPU::SGPRRegBankID)); +SgprDstParts.push_back(MergeReg); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, arsenm wrote: SrcOp / DstOp are for MachineIRBuilder, and other code probably shouldn't be using them https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
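One way to read the comment above is to expose the helpers on plain `Register`s and construct `DstOp`/`SrcOp` only at the `MachineIRBuilder` call sites. The declaration below is a hypothetical alternative signature, not what the patch adds.

```cpp
// Hypothetical Register-based interface; the result LLT can be recovered
// inside via B.getMRI()->getType(SgprDst).
MachineInstrBuilder buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
                                     Register VgprSrc,
                                     const RegisterBankInfo &RBI);
```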
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -0,0 +1,118 @@ +//===- AMDGPURBLegalizeHelper *- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZEHELPER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZEHELPER_H + +#include "AMDGPURBLegalizeRules.h" +#include "AMDGPURegisterBankInfo.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" + +namespace llvm { +namespace AMDGPU { + +// Receives list of RegBankLLTMapingApplyID and applies register banks on all +// operands. It is user's responsibility to provide RegBankLLTMapingApplyIDs for +// all register operands, there is no need to specify NonReg for trailing imm +// operands. This finishes selection of register banks if there is no need to +// replace instruction. In other case InstApplyMethod will create new +// instruction(s). +class RegBankLegalizeHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + const MachineUniformityInfo &MUI; + const RegisterBankInfo &RBI; + const RegBankLegalizeRules &RBLRules; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + LLT V2S16 = LLT::fixed_vector(2, 16); + LLT V2S32 = LLT::fixed_vector(2, 32); + LLT V3S32 = LLT::fixed_vector(3, 32); + LLT V4S32 = LLT::fixed_vector(4, 32); + LLT V6S32 = LLT::fixed_vector(6, 32); + LLT V7S32 = LLT::fixed_vector(7, 32); + LLT V8S32 = LLT::fixed_vector(8, 32); + + LLT V3S64 = LLT::fixed_vector(3, 64); + LLT V4S64 = LLT::fixed_vector(4, 64); + LLT V16S64 = LLT::fixed_vector(16, 64); + + LLT P1 = LLT::pointer(1, 64); + LLT P4 = LLT::pointer(4, 64); + LLT P6 = LLT::pointer(6, 32); + +public: + RegBankLegalizeHelper(MachineIRBuilder &B, MachineRegisterInfo &MRI, +const MachineUniformityInfo &MUI, +const RegisterBankInfo &RBI, +const RegBankLegalizeRules &RBLRules) + : B(B), MRI(MRI), MUI(MUI), RBI(RBI), RBLRules(RBLRules), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; + + bool findRuleAndApplyMapping(MachineInstr &MI); + + // Manual apply helpers. + void applyMappingPHI(MachineInstr &MI); + void applyMappingTrivial(MachineInstr &MI); + +private: + Register createVgpr(LLT Ty) { +return MRI.createVirtualRegister({VgprRB, Ty}); + } + Register createSgpr(LLT Ty) { +return MRI.createVirtualRegister({SgprRB, Ty}); + } + Register createVcc() { return MRI.createVirtualRegister({VccRB, S1}); } + + const RegisterBank *getRegBank(Register Reg) { +const RegisterBank *RB = MRI.getRegBankOrNull(Reg); +// This assert is not guaranteed by default. RB-select ensures that all +// instructions that we want to RB-legalize have reg banks on all registers. +// There might be a few exceptions. Workaround for them is to not write +// 'mapping' for register operand that is expected to have reg class. +assert(RB); +return RB; arsenm wrote: Should just introduce an asserting variant directly in MRI. What about the assigned register class case? https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
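A sketch of the asserting accessor being asked for, written as a free function because `MachineRegisterInfo` currently only offers `getRegBankOrNull`; the name and where it would live are assumptions. The follow-up question about registers that carry a register class rather than a bank is left as an explicit gap in the comment.

```cpp
// Hypothetical asserting variant; a real version would presumably be a
// member of MachineRegisterInfo.
static const RegisterBank &getRegBankOrAssert(const MachineRegisterInfo &MRI,
                                              Register Reg) {
  // Note: does not handle registers that have an assigned register class
  // instead of a bank (MRI.getRegClassOrNull(Reg)), which is the open
  // question raised in the review.
  const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
  assert(RB && "expected RB-select to have assigned a register bank");
  return *RB;
}
```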
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo *CSEInfo = nullptr; + GISelObserverWrapper Observer; + + if (TPC.isGISelCSEEnabled()) { +MIRBuilder = std::make_unique(); +CSEInfo = &Wrapper.get(TPC.getCSEConfig()); +MIRBuilder->setCSEInfo(CSEInfo); +Observer.addObserver(CSEInfo); +MIRBuilder->setChangeObserver(Observer); + } else { +MIRBuilder = std::make_unique(); + } + MIRBuilder->setMF(MF); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (auto &MBB : MF) { +for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); +} + } + + for (auto &MI : AllInst) { arsenm wrote: no auto https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
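For concreteness, the loops with the types spelled out, as the "no auto" note asks; `AllInst` is assumed to hold `MachineInstr *`.

```cpp
for (MachineBasicBlock &MBB : MF)
  for (MachineInstr &MI : MBB)
    AllInst.push_back(&MI);

for (MachineInstr *MI : AllInst) {
  if (!MI->isPreISelOpcode())
    continue;
  // ... rest of the per-instruction handling as in the patch.
}
```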
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -0,0 +1,334 @@ +//===-- AMDGPURBLegalizeRules.cpp -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +/// Definitions of RBLegalize Rules for all opcodes. +/// Implementation of container for all the Rules and search. +/// Fast search for most common case when Rule.Predicate checks LLT and +/// uniformity of register in operand 0. +// +//===--===// + +#include "AMDGPURBLegalizeRules.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +using namespace llvm; +using namespace AMDGPU; + +RegBankLLTMapping::RegBankLLTMapping( +std::initializer_list DstOpMappingList, +std::initializer_list SrcOpMappingList, +LoweringMethodID LoweringMethod) +: DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList), + LoweringMethod(LoweringMethod) {} + +PredicateMapping::PredicateMapping( +std::initializer_list OpList, +std::function TestFunc) +: OpUniformityAndTypes(OpList), TestFunc(TestFunc) {} + +bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) { + switch (UniID) { + case S1: +return MRI.getType(Reg) == LLT::scalar(1); + case S16: +return MRI.getType(Reg) == LLT::scalar(16); + case S32: +return MRI.getType(Reg) == LLT::scalar(32); + case S64: +return MRI.getType(Reg) == LLT::scalar(64); + case P1: +return MRI.getType(Reg) == LLT::pointer(1, 64); + + case UniS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg); + case UniS16: +return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg); + case UniS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); + case UniS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + + case DivS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); + case DivS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); + case DivS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivP1: +return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + + case _: +return true; + default: +llvm_unreachable("missing matchUniformityAndLLT\n"); + } +} + +bool PredicateMapping::match(const MachineInstr &MI, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) const { + // Check LLT signature. + for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) { +if (OpUniformityAndTypes[i] == _) { + if (MI.getOperand(i).isReg() && + MI.getOperand(i).getReg() != AMDGPU::NoRegister) +return false; + continue; +} + +// Remaining IDs check registers. +if (!MI.getOperand(i).isReg()) + return false; + +if (!matchUniformityAndLLT(MI.getOperand(i).getReg(), + OpUniformityAndTypes[i], MUI, MRI)) + return false; + } + + // More complex check. 
+ if (TestFunc) +return TestFunc(MI); + + return true; +} + +SetOfRulesForOpcode::SetOfRulesForOpcode() {} + +SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) +: FastTypes(FastTypes) {} + +UniformityLLTOpPredicateID LLTToId(LLT Ty) { + if (Ty == LLT::scalar(16)) +return S16; + if (Ty == LLT::scalar(32)) +return S32; + if (Ty == LLT::scalar(64)) +return S64; + if (Ty == LLT::fixed_vector(2, 16)) +return V2S16; + if (Ty == LLT::fixed_vector(2, 32)) +return V2S32; + if (Ty == LLT::fixed_vector(3, 32)) +return V3S32; + if (Ty == LLT::fixed_vector(4, 32)) +return V4S32; + return _; +} + +const RegBankLLTMapping & +SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const MachineUniformityInfo &MUI) const { + // Search in "Fast Rules". + // Note: if fast rules are enabled, RegBankLLTMapping must be added in each + // slot that could "match fast Predicate". If not, Invalid Mapping is + // returned which results in failure, does not search "Slow Rules". + if (FastTypes != No) { +Register Reg = MI.getOperand(0).getReg(); +int Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg))); +if (Slot != -1) { + if (MUI.isUniform(Reg)) +return Uni[Slot]; + else arsenm wrote: No else after return https://github.com/llvm/llvm-project/pull/112864 __
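The early-return form being requested would look roughly like this; that the divergent branch returns `Div[Slot]` is inferred from the header quoted elsewhere in this thread, not visible in the snippet above.

```cpp
if (Slot != -1) {
  if (MUI.isUniform(Reg))
    return Uni[Slot];
  return Div[Slot]; // no 'else' needed after the return above
}
```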
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -0,0 +1,258 @@ +//===- AMDGPURBLegalizeRules -*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZERULES_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZERULES_H + +#include "llvm/CodeGen/MachineUniformityAnalysis.h" + +namespace llvm { + +class GCNSubtarget; + +namespace AMDGPU { + +// IDs used to build predicate for RBSRule. Predicate can have one or more IDs +// and each represents a check for 'uniform or divergent' + LLT or just LLT on +// register operand. +// Most often checking one operand is enough to decide which RegBankLLTMapping +// to apply (see Fast Rules), IDs are useful when two or more operands need to +// be checked. +enum UniformityLLTOpPredicateID { + _, + // scalars + S1, + S16, + S32, + S64, + + UniS1, + UniS16, + UniS32, + UniS64, + + DivS1, + DivS32, + DivS64, + + // pointers + P1, + + DivP1, + + // vectors + V2S16, + V2S32, + V3S32, + V4S32, +}; + +// How to apply register bank on register operand. +// In most cases, this serves as a LLT and register bank assert. +// Can change operands and insert copies, extends, truncs, and readfirstlanes. +// Anything more complicated requires LoweringMethod. +enum RegBankLLTMapingApplyID { + Invalid, + None, + IntrId, + Imm, + Vcc, + + // sgpr scalars, pointers, vectors and B-types + Sgpr16, + Sgpr32, + Sgpr64, + SgprV4S32, + + // vgpr scalars, pointers, vectors and B-types + Vgpr32, + Vgpr64, + VgprP1, + VgprV4S32, + + // Dst only modifiers: read-any-lane and truncs + UniInVcc, + UniInVgprS32, + UniInVgprV4S32, + + Sgpr32Trunc, + + // Src only modifiers: waterfalls, extends + Sgpr32AExt, + Sgpr32AExtBoolInReg, + Sgpr32SExt, +}; + +// Instruction needs to be replaced with sequence of instructions. Lowering was +// not done by legalizer since instructions is available in either SGPR or VGPR. +// For example S64 AND is available on SGPR, for that reason S64 AND is legal in +// context of Legalizer that only checks LLT. But S64 AND is not available on +// VGPR. Lower it to two S32 VGPR ANDs. +enum LoweringMethodID { + DoNotLower, + UniExtToSel, + VgprToVccCopy, + SplitTo32, + Ext32To64, + UniCstExt, +}; + +enum FastRulesTypes { + No, + Standard, // S16, S32, S64, V2S16 + Vector, // S32, V2S32, V3S32, V4S32 +}; + +struct RegBankLLTMapping { + SmallVector DstOpMapping; + SmallVector SrcOpMapping; + LoweringMethodID LoweringMethod; + RegBankLLTMapping( + std::initializer_list DstOpMappingList, + std::initializer_list SrcOpMappingList, + LoweringMethodID LoweringMethod = DoNotLower); +}; + +struct PredicateMapping { + SmallVector OpUniformityAndTypes; + std::function TestFunc; + PredicateMapping( + std::initializer_list OpList, + std::function TestFunc = nullptr); + + bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) const; +}; + +struct RBSRule { + PredicateMapping Predicate; + RegBankLLTMapping OperandMapping; +}; + +class SetOfRulesForOpcode { + // "Slow Rules". More complex 'Rules[i].Predicate', check them one by one. + SmallVector Rules; + + // "Fast Rules" + // Instead of testing each 'Rules[i].Predicate' we do direct access to + // RegBankLLTMapping using getFastPredicateSlot. 
For example if: + // - FastTypes == Standard Uni[0] holds Mapping in case Op 0 is uniform S32 + // - FastTypes == Vector Div[3] holds Mapping in case Op 0 is divergent V4S32 + FastRulesTypes FastTypes = No; +#define InvMapping RegBankLLTMapping({Invalid}, {Invalid}) + RegBankLLTMapping Uni[4] = {InvMapping, InvMapping, InvMapping, InvMapping}; + RegBankLLTMapping Div[4] = {InvMapping, InvMapping, InvMapping, InvMapping}; + +public: + SetOfRulesForOpcode(); + SetOfRulesForOpcode(FastRulesTypes FastTypes); + + const RegBankLLTMapping & + findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const MachineUniformityInfo &MUI) const; + + void addRule(RBSRule Rule); + + void addFastRuleDivergent(UniformityLLTOpPredicateID Ty, +RegBankLLTMapping RuleApplyIDs); + void addFastRuleUniform(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs); + +private: + int getFastPredicateSlot(UniformityLLTOpPredicateID Ty) const; +}; + +// Essentially 'map' but a +// little more efficient. +class RegBankLegalizeRules { + const GCNSubtarget *ST; + MachineRegisterInfo *MRI; + // Separate maps for G-opcodes and instrinsics since they are in differents + // enums. Multiple opcodes can share same set of rules. + // RulesAlias = map + // Rules = map + SmallDens
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -0,0 +1,334 @@ +//===-- AMDGPURBLegalizeRules.cpp -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +/// Definitions of RBLegalize Rules for all opcodes. +/// Implementation of container for all the Rules and search. +/// Fast search for most common case when Rule.Predicate checks LLT and +/// uniformity of register in operand 0. +// +//===--===// + +#include "AMDGPURBLegalizeRules.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +using namespace llvm; +using namespace AMDGPU; + +RegBankLLTMapping::RegBankLLTMapping( +std::initializer_list DstOpMappingList, +std::initializer_list SrcOpMappingList, +LoweringMethodID LoweringMethod) +: DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList), + LoweringMethod(LoweringMethod) {} + +PredicateMapping::PredicateMapping( +std::initializer_list OpList, +std::function TestFunc) +: OpUniformityAndTypes(OpList), TestFunc(TestFunc) {} + +bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) { + switch (UniID) { + case S1: +return MRI.getType(Reg) == LLT::scalar(1); + case S16: +return MRI.getType(Reg) == LLT::scalar(16); + case S32: +return MRI.getType(Reg) == LLT::scalar(32); + case S64: +return MRI.getType(Reg) == LLT::scalar(64); + case P1: +return MRI.getType(Reg) == LLT::pointer(1, 64); + + case UniS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg); + case UniS16: +return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg); + case UniS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); + case UniS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + + case DivS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); + case DivS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); + case DivS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivP1: +return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + + case _: +return true; + default: +llvm_unreachable("missing matchUniformityAndLLT\n"); + } +} + +bool PredicateMapping::match(const MachineInstr &MI, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) const { + // Check LLT signature. + for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) { +if (OpUniformityAndTypes[i] == _) { + if (MI.getOperand(i).isReg() && + MI.getOperand(i).getReg() != AMDGPU::NoRegister) +return false; + continue; +} + +// Remaining IDs check registers. +if (!MI.getOperand(i).isReg()) + return false; + +if (!matchUniformityAndLLT(MI.getOperand(i).getReg(), + OpUniformityAndTypes[i], MUI, MRI)) + return false; + } + + // More complex check. 
+ if (TestFunc) +return TestFunc(MI); + + return true; +} + +SetOfRulesForOpcode::SetOfRulesForOpcode() {} + +SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) +: FastTypes(FastTypes) {} + +UniformityLLTOpPredicateID LLTToId(LLT Ty) { + if (Ty == LLT::scalar(16)) +return S16; + if (Ty == LLT::scalar(32)) +return S32; + if (Ty == LLT::scalar(64)) +return S64; + if (Ty == LLT::fixed_vector(2, 16)) +return V2S16; + if (Ty == LLT::fixed_vector(2, 32)) +return V2S32; + if (Ty == LLT::fixed_vector(3, 32)) +return V3S32; + if (Ty == LLT::fixed_vector(4, 32)) +return V4S32; + return _; +} + +const RegBankLLTMapping & +SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const MachineUniformityInfo &MUI) const { + // Search in "Fast Rules". + // Note: if fast rules are enabled, RegBankLLTMapping must be added in each + // slot that could "match fast Predicate". If not, Invalid Mapping is + // returned which results in failure, does not search "Slow Rules". + if (FastTypes != No) { arsenm wrote: "No"? https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
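The `"No"?` remark presumably questions the terse enumerator name used in `FastTypes != No`; if so, a more descriptive spelling could look like the following (purely hypothetical rename, only the first enumerator changes).

```cpp
enum FastRulesTypes {
  NoFastRules, // previously "No"
  Standard,    // S16, S32, S64, V2S16
  Vector,      // S32, V2S32, V3S32, V4S32
};
// The guard in findMappingForMI would then read: if (FastTypes != NoFastRules)
```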
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); } } + +MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B, +const DstOp &SgprDst, +const SrcOp &VgprSrc, +const RegisterBankInfo &RBI) { + auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc}); + Register Dst = RFL->getOperand(0).getReg(); + Register Src = RFL->getOperand(1).getReg(); + MachineRegisterInfo &MRI = *B.getMRI(); + if (!MRI.getRegBankOrNull(Dst)) +MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID)); + if (!MRI.getRegBankOrNull(Src)) +MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID)); + return RFL; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, LLT B32Ty, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { +SgprDstParts.push_back( +buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0)); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(S64, VgprSrc); + + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { +MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID)); +auto Unmerge64 = B.buildUnmerge(S32, Unmerge.getReg(i)); +SmallVector Unmerge64Parts; +Unmerge64Parts.push_back( +buildReadAnyLaneB32(B, S32, Unmerge64.getReg(0), RBI).getReg(0)); +Unmerge64Parts.push_back( +buildReadAnyLaneB32(B, S32, Unmerge64.getReg(1), RBI).getReg(0)); +Register MergeReg = B.buildMergeLikeInstr(S64, Unmerge64Parts).getReg(0); +MRI.setRegBank(MergeReg, RBI.getRegBank(AMDGPU::SGPRRegBankID)); +SgprDstParts.push_back(MergeReg); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + LLT S256 = LLT::scalar(256); + LLT V2S16 = LLT::fixed_vector(2, 16); + LLT Ty = SgprDst.getLLTTy(MRI); + + if (Ty == S16) { +return B.buildTrunc( +SgprDst, buildReadAnyLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc), RBI)); + } + + if (Ty == S32 || Ty == V2S16 || + (Ty.isPointer() && Ty.getSizeInBits() == 32)) { +return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI); + } + + if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) || + (Ty.isVector() && Ty.getElementType() == S32)) { +return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI); + } + + if (Ty.isVector() && Ty.getElementType() == S16) { +return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI); + } + + if (Ty.isVector() && Ty.getElementType() == S64) { +return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI); + } + + llvm_unreachable("Type not supported"); +} + 
+void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstBank = MRI.getRegBankOrNull(Dst); + if (DstBank != &RBI.getRegBank(AMDGPU::SGPRRegBankID)) +return; + + Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst)); + MRI.setRegBank(VgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID)); + + MI.getOperand(0).setReg(VgprDst); + MachineBasicBlock *MBB = MI.getParent(); + B.setInsertPt(*MBB, std::next(MI.getIterator())); + // readAnyLane VgprDst into Dst after MI. + buildReadAnyLane(B, Dst, VgprDst, RBI); + return; +} + +bool AMDGPU::isLaneMask(Register Reg, MachineRegisterInfo &MRI, +const SIReg
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo *CSEInfo = nullptr; + GISelObserverWrapper Observer; + + if (TPC.isGISelCSEEnabled()) { +MIRBuilder = std::make_unique(); +CSEInfo = &Wrapper.get(TPC.getCSEConfig()); +MIRBuilder->setCSEInfo(CSEInfo); +Observer.addObserver(CSEInfo); +MIRBuilder->setChangeObserver(Observer); + } else { +MIRBuilder = std::make_unique(); + } + MIRBuilder->setMF(MF); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (auto &MBB : MF) { +for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); +} + } + + for (auto &MI : AllInst) { +if (!MI->isPreISelOpcode()) + continue; + +unsigned Opc = MI->getOpcode(); + +// Insert point for use operands needs some calculation. +if (Opc == G_PHI) { + RBLegalizeHelper.applyMappingPHI(*MI); + continue; +} + +// Opcodes that support pretty much all combinations of reg banks and LLTs +// (except S1). There is no point in writing rules for them. +if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES || +Opc == G_MERGE_VALUES) { + RBLegalizeHelper.applyMappingTrivial(*MI); + continue; +} + +// Opcodes that also support S1. S1 rules are in RegBankLegalizeRules. +// Remaining reg bank and LLT combinations are trivially accepted. +if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) && +!isS1(MI->getOperand(0).getReg(), MRI)) { + assert(isSgprRB(MI->getOperand(0).getReg(), MRI)); + continue; +} + +if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) { + MI->dump(); + llvm_unreachable("failed to match any of the rules"); +} + } + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + + // SGPR S1 clean up combines: + // - SGPR S1(S32) to SGPR S1(S32) Copy: anyext + trunc combine. + // In RBLegalize 'S1 Dst' are legalized into S32 as'S1Dst = Trunc S32Dst' + // and 'S1 Src' into 'S32Src = Anyext S1Src'. + // S1 Truncs and Anyexts that come from legalizer will also be cleaned up. + // Note: they can have non-S32 types e.g. S16 = Anyext S1 or S1 = Trunc S64. + // - Sgpr S1(S32) to VCC Copy: G_COPY_VCC_SCC combine. 
+ // Divergent instruction uses Sgpr S1 as input that should be lane mask(VCC) + // Legalizing this use creates Sgpr S1(S32) to VCC Copy. + + // Note: Remaining S1 copies, S1s are either SGPR S1(S32) or VCC S1: + // - VCC to VCC Copy: nothing to do here, just a regular copy. + // - VCC to SGPR S1 Copy: Should not exist in a form of COPY instruction(*). + // Note: For 'uniform-in-VCC to SGPR-S1 copy' G_COPY_SCC_VCC is used + // instead. When only available instruction creates VCC result, use of + // UniformInVcc results in creating G_COPY_SCC_VCC. + + // (*)Explanation for 'SGPR S1(uniform) = COPY VCC(divergent)': + // Copy from divergent to uniform register indicates an error in either: + // - Uniformity analysis: Uniform instruction has divergent input. If one of + // the inputs is divergent, instruction should be divergent! + // - RBLegalizer not executing in waterfall loop (missing implementation) + + using namespace MIPatternMatch; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + for (auto &MBB : MF) { +for (auto &MI : make_early_inc_range(MBB)) { + + if (MI.getOpcode() == G_TRUNC && isTriviallyDead(MI, MRI)) { +
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); } } + +MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B, +const DstOp &SgprDst, +const SrcOp &VgprSrc, +const RegisterBankInfo &RBI) { + auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc}); + Register Dst = RFL->getOperand(0).getReg(); + Register Src = RFL->getOperand(1).getReg(); + MachineRegisterInfo &MRI = *B.getMRI(); + if (!MRI.getRegBankOrNull(Dst)) +MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID)); + if (!MRI.getRegBankOrNull(Src)) +MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID)); + return RFL; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, LLT B32Ty, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { +SgprDstParts.push_back( +buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0)); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(S64, VgprSrc); + + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { +MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID)); +auto Unmerge64 = B.buildUnmerge(S32, Unmerge.getReg(i)); +SmallVector Unmerge64Parts; +Unmerge64Parts.push_back( +buildReadAnyLaneB32(B, S32, Unmerge64.getReg(0), RBI).getReg(0)); +Unmerge64Parts.push_back( +buildReadAnyLaneB32(B, S32, Unmerge64.getReg(1), RBI).getReg(0)); +Register MergeReg = B.buildMergeLikeInstr(S64, Unmerge64Parts).getReg(0); +MRI.setRegBank(MergeReg, RBI.getRegBank(AMDGPU::SGPRRegBankID)); +SgprDstParts.push_back(MergeReg); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + LLT S256 = LLT::scalar(256); + LLT V2S16 = LLT::fixed_vector(2, 16); + LLT Ty = SgprDst.getLLTTy(MRI); + + if (Ty == S16) { +return B.buildTrunc( +SgprDst, buildReadAnyLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc), RBI)); + } + + if (Ty == S32 || Ty == V2S16 || + (Ty.isPointer() && Ty.getSizeInBits() == 32)) { +return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI); + } + + if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) || + (Ty.isVector() && Ty.getElementType() == S32)) { +return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI); + } + + if (Ty.isVector() && Ty.getElementType() == S16) { +return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI); + } + + if (Ty.isVector() && Ty.getElementType() == S64) { +return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI); + } + + llvm_unreachable("Type not supported"); +} + 
+void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstBank = MRI.getRegBankOrNull(Dst); + if (DstBank != &RBI.getRegBank(AMDGPU::SGPRRegBankID)) +return; + + Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst)); + MRI.setRegBank(VgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID)); + + MI.getOperand(0).setReg(VgprDst); + MachineBasicBlock *MBB = MI.getParent(); + B.setInsertPt(*MBB, std::next(MI.getIterator())); + // readAnyLane VgprDst into Dst after MI. + buildReadAnyLane(B, Dst, VgprDst, RBI); + return; +} + +bool AMDGPU::isLaneMask(Register Reg, MachineRegisterInfo &MRI, +const SIReg
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -69,3 +72,38 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, return std::pair(Reg, 0); } + +IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF) +: MRI(MF.getRegInfo()) { + initLaneMaskIntrinsics(MF); +} + +bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) { + return S32S64LaneMask.contains(Reg); +} + +void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) { + for (auto &MBB : MF) { +for (auto &MI : MBB) { + if (MI.getOpcode() == AMDGPU::G_INTRINSIC && + MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID() == + Intrinsic::amdgcn_if_break) { +S32S64LaneMask.insert(MI.getOperand(3).getReg()); +findLCSSAPhi(MI.getOperand(0).getReg()); + } + + if (MI.getOpcode() == AMDGPU::SI_IF || + MI.getOpcode() == AMDGPU::SI_ELSE) { +findLCSSAPhi(MI.getOperand(0).getReg()); + } +} + } +} + +void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { + S32S64LaneMask.insert(Reg); + for (auto &LCSSAPhi : MRI.use_instructions(Reg)) { arsenm wrote: ```suggestion for (const MachineInstr &LCSSAPhi : MRI.use_instructions(Reg)) { ``` https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang] Lower omp.workshare to other omp constructs (PR #101446)
https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/101446 >From cc9096e80fc62ba9c5a7d511ee7b8fd18750cb44 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Sun, 4 Aug 2024 22:06:55 +0900 Subject: [PATCH 01/14] [flang] Lower omp.workshare to other omp constructs Change to workshare loop wrapper op Move single op declaration Schedule pass properly Correctly handle nested nested loop nests to be parallelized by workshare Leave comments for shouldUseWorkshareLowering Use copyprivate to scatter val from omp.single TODO still need to implement copy function TODO transitive check for usage outside of omp.single not imiplemented yet Transitively check for users outisde of single op TODO need to implement copy func TODO need to hoist allocas outside of single regions Add tests Hoist allocas More tests Emit body for copy func Test the tmp storing logic Clean up trivially dead ops Only handle single-block regions for now Fix tests for custom assembly for loop wrapper Only run the lower workshare pass if openmp is enabled Implement some missing functionality Fix tests Fix test Iterate backwards to find all trivially dead ops Add expalanation comment for createCopyFun Update test --- flang/include/flang/Optimizer/OpenMP/Passes.h | 5 + .../include/flang/Optimizer/OpenMP/Passes.td | 5 + flang/include/flang/Tools/CrossToolHelpers.h | 1 + flang/lib/Frontend/FrontendActions.cpp| 10 +- flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 + flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 446 ++ flang/lib/Optimizer/Passes/Pipelines.cpp | 6 +- flang/test/Fir/basic-program.fir | 1 + .../Transforms/OpenMP/lower-workshare.mlir| 189 .../Transforms/OpenMP/lower-workshare2.mlir | 23 + .../Transforms/OpenMP/lower-workshare3.mlir | 74 +++ .../Transforms/OpenMP/lower-workshare4.mlir | 59 +++ .../Transforms/OpenMP/lower-workshare5.mlir | 42 ++ .../Transforms/OpenMP/lower-workshare6.mlir | 51 ++ flang/tools/bbc/bbc.cpp | 5 +- flang/tools/tco/tco.cpp | 1 + 16 files changed, 915 insertions(+), 4 deletions(-) create mode 100644 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp create mode 100644 flang/test/Transforms/OpenMP/lower-workshare.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare2.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare3.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare4.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare5.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare6.mlir diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h index 403d79667bf448..feb395f1a12dbd 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.h +++ b/flang/include/flang/Optimizer/OpenMP/Passes.h @@ -25,6 +25,11 @@ namespace flangomp { #define GEN_PASS_REGISTRATION #include "flang/Optimizer/OpenMP/Passes.h.inc" +/// Impelements the logic specified in the 2.8.3 workshare Construct section of +/// the OpenMP standard which specifies what statements or constructs shall be +/// divided into units of work. 
+bool shouldUseWorkshareLowering(mlir::Operation *op); + } // namespace flangomp #endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 1c0ce08f5b4838..dc1956bea9fb29 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -37,4 +37,9 @@ def FunctionFilteringPass : Pass<"omp-function-filtering"> { ]; } +// Needs to be scheduled on Module as we create functions in it +def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> { + let summary = "Lower workshare construct"; +} + #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h index df4b21ada058fe..d936b739e58157 100644 --- a/flang/include/flang/Tools/CrossToolHelpers.h +++ b/flang/include/flang/Tools/CrossToolHelpers.h @@ -123,6 +123,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { false; ///< Set no-signed-zeros-fp-math attribute for functions. bool UnsafeFPMath = false; ///< Set unsafe-fp-math attribute for functions. bool NSWOnLoopVarInc = false; ///< Add nsw flag to loop variable increments. + bool EnableOpenMP = false; ///< Enable OpenMP lowering. }; struct OffloadModuleOpts { diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index f2e460fc53a67f..8c21fe18e67b4d 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -715,7 +715,11 @@ void CodeGenAction::lowerHLFIRToFIR() { pm.enableVerifier(/*verifyPasses=*/true); // Create
[llvm-branch-commits] [flang] [WIP][flang] Introduce HLFIR lowerings to omp.workshare_loop_nest (PR #104748)
https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/104748 >From 5aca24559fc6f64a06f66a6d7e35f1edc82995a5 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Sun, 4 Aug 2024 17:33:52 +0900 Subject: [PATCH 1/8] Add workshare loop wrapper lowerings Bufferize test Bufferize test Bufferize test Add test for should use workshare lowering --- .../HLFIR/Transforms/BufferizeHLFIR.cpp | 4 +- .../Transforms/OptimizedBufferization.cpp | 10 +- flang/test/HLFIR/bufferize-workshare.fir | 58 .../OpenMP/should-use-workshare-lowering.mlir | 140 ++ 4 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 flang/test/HLFIR/bufferize-workshare.fir create mode 100644 flang/test/Transforms/OpenMP/should-use-workshare-lowering.mlir diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp index 07794828fce267..1848dbe2c7a2c2 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp @@ -26,6 +26,7 @@ #include "flang/Optimizer/HLFIR/HLFIRDialect.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" @@ -792,7 +793,8 @@ struct ElementalOpConversion // Generate a loop nest looping around the fir.elemental shape and clone // fir.elemental region inside the inner loop. hlfir::LoopNest loopNest = -hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered()); +hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered(), + flangomp::shouldUseWorkshareLowering(elemental)); auto insPt = builder.saveInsertionPoint(); builder.setInsertionPointToStart(loopNest.body); auto yield = hlfir::inlineElementalOp(loc, builder, elemental, diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp index 3a0a98dc594463..f014724861e333 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp @@ -20,6 +20,7 @@ #include "flang/Optimizer/HLFIR/HLFIRDialect.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/Transforms/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Dominance.h" @@ -482,7 +483,8 @@ llvm::LogicalResult ElementalAssignBufferization::matchAndRewrite( // Generate a loop nest looping around the hlfir.elemental shape and clone // hlfir.elemental region inside the inner loop hlfir::LoopNest loopNest = - hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered()); + hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered(), + flangomp::shouldUseWorkshareLowering(elemental)); builder.setInsertionPointToStart(loopNest.body); auto yield = hlfir::inlineElementalOp(loc, builder, elemental, loopNest.oneBasedIndices); @@ -553,7 +555,8 @@ llvm::LogicalResult BroadcastAssignBufferization::matchAndRewrite( llvm::SmallVector extents = hlfir::getIndexExtents(loc, builder, shape); hlfir::LoopNest loopNest = - hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true); + hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true, + flangomp::shouldUseWorkshareLowering(assign)); builder.setInsertionPointToStart(loopNest.body); auto arrayElement = 
hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); @@ -648,7 +651,8 @@ llvm::LogicalResult VariableAssignBufferization::matchAndRewrite( llvm::SmallVector extents = hlfir::getIndexExtents(loc, builder, shape); hlfir::LoopNest loopNest = - hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true); + hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true, + flangomp::shouldUseWorkshareLowering(assign)); builder.setInsertionPointToStart(loopNest.body); auto rhsArrayElement = hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); diff --git a/flang/test/HLFIR/bufferize-workshare.fir b/flang/test/HLFIR/bufferize-workshare.fir new file mode 100644 index 00..9b7341ae43398a --- /dev/null +++ b/flang/test/HLFIR/bufferize-workshare.fir @@ -0,0 +1,58 @@ +// RUN: fir-opt --bufferize-hlfir %s | FileCheck %s + +// CHECK-LABEL: func.func @simple( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { +// CHECK: omp.parallel { +// CHECK: omp.workshare { +// CHECK: %[[VAL_1:.*]] = arith.c
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
kyulee-com wrote: > IIRC we have several lit tests that cover structural hash, shouldn't we have > a new test there that uses the new functionality? Extended the existing `StructuralHashPrinterPass` with `Options`, and updated the corresponding lit test accordingly. https://github.com/llvm/llvm-project/pull/112638 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
@@ -47,24 +60,140 @@ class StructuralHashImpl { public: StructuralHashImpl() = delete; - explicit StructuralHashImpl(bool DetailedHash) : DetailedHash(DetailedHash) {} + explicit StructuralHashImpl(bool DetailedHash, + IgnoreOperandFunc IgnoreOp = nullptr) + : DetailedHash(DetailedHash), IgnoreOp(IgnoreOp) { +if (IgnoreOp) { + IndexInstruction = std::make_unique(); + IndexOperandHashMap = std::make_unique(); +} + } - stable_hash hashConstant(Constant *C) { + stable_hash hashAPInt(const APInt &I) { SmallVector Hashes; -// TODO: hashArbitaryType() is not stable. -if (ConstantInt *ConstInt = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstInt->getValue())); -} else if (ConstantFP *ConstFP = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstFP->getValue())); -} else if (Function *Func = dyn_cast(C)) - // Hashing the name will be deterministic as LLVM's hashing infrastructure - // has explicit support for hashing strings and will not simply hash - // the pointer. - Hashes.emplace_back(hashArbitaryType(Func->getName())); +Hashes.emplace_back(I.getBitWidth()); +for (unsigned J = 0; J < I.getNumWords(); ++J) + Hashes.emplace_back((I.getRawData())[J]); +return stable_hash_combine(Hashes); + } + stable_hash hashAPFloat(const APFloat &F) { +SmallVector Hashes; +const fltSemantics &S = F.getSemantics(); +Hashes.emplace_back(APFloat::semanticsPrecision(S)); +Hashes.emplace_back(APFloat::semanticsMaxExponent(S)); +Hashes.emplace_back(APFloat::semanticsMinExponent(S)); +Hashes.emplace_back(APFloat::semanticsSizeInBits(S)); +Hashes.emplace_back(hashAPInt(F.bitcastToAPInt())); kyulee-com wrote: yeah. we could simplify it. https://github.com/llvm/llvm-project/pull/112638 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
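A guess at the simplification being agreed to: `bitcastToAPInt()` already captures the float's full bit pattern, and `hashAPInt` folds in the bit width, so the explicit semantics fields may be redundant. Which fields are safe to drop is an assumption here; this is not the committed form.

```cpp
stable_hash hashAPFloat(const APFloat &F) {
  // Hash only the raw bit pattern; hashAPInt already mixes in the width.
  return hashAPInt(F.bitcastToAPInt());
}
```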
[llvm-branch-commits] [flang] [flang] Introduce custom loop nest generation for loops in workshare construct (PR #101445)
https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/101445 >From 159a2f46bf3a01322cb24539ede289ea089e62c6 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Thu, 22 Aug 2024 18:07:05 +0900 Subject: [PATCH 1/2] [flang] Introduce ws loop nest generation for HLFIR lowering Emit loop nests in a custom wrapper Only emit unordered loops as omp loops Fix uninitialized memory bug in genLoopNest --- .../flang/Optimizer/Builder/HLFIRTools.h | 12 +++-- flang/lib/Lower/ConvertCall.cpp | 2 +- flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 4 +- flang/lib/Optimizer/Builder/HLFIRTools.cpp| 52 ++- .../HLFIR/Transforms/BufferizeHLFIR.cpp | 3 +- .../LowerHLFIROrderedAssignments.cpp | 33 ++-- .../Transforms/OptimizedBufferization.cpp | 6 +-- 7 files changed, 69 insertions(+), 43 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index 6b41025eea0780..f073f494b3fb21 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -357,8 +357,8 @@ hlfir::ElementalOp genElementalOp( /// Structure to describe a loop nest. struct LoopNest { - fir::DoLoopOp outerLoop; - fir::DoLoopOp innerLoop; + mlir::Operation *outerOp = nullptr; + mlir::Block *body = nullptr; llvm::SmallVector oneBasedIndices; }; @@ -366,11 +366,13 @@ struct LoopNest { /// \p isUnordered specifies whether the loops in the loop nest /// are unordered. LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange extents, bool isUnordered = false); + mlir::ValueRange extents, bool isUnordered = false, + bool emitWorkshareLoop = false); inline LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, -mlir::Value shape, bool isUnordered = false) { +mlir::Value shape, bool isUnordered = false, +bool emitWorkshareLoop = false) { return genLoopNest(loc, builder, getIndexExtents(loc, builder, shape), - isUnordered); + isUnordered, emitWorkshareLoop); } /// Inline the body of an hlfir.elemental at the current insertion point diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 9f5b58590fb79e..e84e7afbe82e09 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -2135,7 +2135,7 @@ class ElementalCallBuilder { hlfir::genLoopNest(loc, builder, shape, !mustBeOrdered); mlir::ValueRange oneBasedIndices = loopNest.oneBasedIndices; auto insPt = builder.saveInsertionPoint(); - builder.setInsertionPointToStart(loopNest.innerLoop.getBody()); + builder.setInsertionPointToStart(loopNest.body); callContext.stmtCtx.pushScope(); for (auto &preparedActual : loweredActuals) if (preparedActual) diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index 6b98ea3d0615b6..736de2ee511bef 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -374,7 +374,7 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, // know this won't miss any opportuinties for clever elemental inlining hlfir::LoopNest nest = hlfir::genLoopNest( loc, builder, shapeShift.getExtents(), /*isUnordered=*/true); - builder.setInsertionPointToStart(nest.innerLoop.getBody()); + builder.setInsertionPointToStart(nest.body); mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy()); auto lhsEleAddr = builder.create( loc, refTy, lhs, shapeShift, /*slice=*/mlir::Value{}, @@ -388,7 +388,7 @@ static void 
genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, builder, loc, redId, refTy, lhsEle, rhsEle); builder.create(loc, scalarReduction, lhsEleAddr); - builder.setInsertionPointAfter(nest.outerLoop); + builder.setInsertionPointAfter(nest.outerOp); builder.create(loc, lhsAddr); } diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 8d0ae2f195178c..31378841ed 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -20,6 +20,7 @@ #include "mlir/IR/IRMapping.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/TypeSwitch.h" +#include #include // Return explicit extents. If the base is a fir.box, this won't read it to @@ -855,26 +856,51 @@ mlir::Value hlfir::inlineElementalOp( hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange extents, bool isUnordered) { +
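For readers skimming the patch above, here is a minimal sketch of how a caller adapts to the reworked `hlfir::genLoopNest` interface. It is illustrative only: `loc`, `builder`, and `shape` are assumed to be in scope, and the element-wise body is elided. The point is that the nest is now addressed through the generic `body` block and `outerOp` operation, so the same caller code works whether a `fir.do_loop` nest or an omp workshare-loop wrapper was emitted.

```cpp
// Sketch only (not part of the patch); assumes loc, builder, and shape exist.
hlfir::LoopNest nest = hlfir::genLoopNest(loc, builder, shape,
                                          /*isUnordered=*/true,
                                          /*emitWorkshareLoop=*/true);
// Emit the element-wise work inside the innermost block, indexing with the
// one-based induction values the helper created.
builder.setInsertionPointToStart(nest.body);
// ... build the loop body here, using nest.oneBasedIndices ...
// Continue emitting code after the whole nest, whatever kind of op wraps it.
builder.setInsertionPointAfter(nest.outerOp);
```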
[llvm-branch-commits] [flang] [WIP][flang] Introduce HLFIR lowerings to omp.workshare_loop_nest (PR #104748)
https://github.com/ivanradanov ready_for_review https://github.com/llvm/llvm-project/pull/104748 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][omp] Emit omp.workshare in frontend (PR #101444)
https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/101444 >From 31ddd5c8bf59c4f6b386415c89bd87f80bb83409 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Wed, 31 Jul 2024 14:11:47 +0900 Subject: [PATCH 1/2] [flang][omp] Emit omp.workshare in frontend Fix lower test for workshare --- flang/lib/Lower/OpenMP/OpenMP.cpp | 30 +++ flang/test/Lower/OpenMP/workshare.f90 | 6 +++--- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index cf469003b7298d..22f6d5bd09cd65 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1330,6 +1330,15 @@ static void genTaskwaitClauses(lower::AbstractConverter &converter, loc, llvm::omp::Directive::OMPD_taskwait); } +static void genWorkshareClauses(lower::AbstractConverter &converter, +semantics::SemanticsContext &semaCtx, +lower::StatementContext &stmtCtx, +const List &clauses, mlir::Location loc, +mlir::omp::WorkshareOperands &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processNowait(clauseOps); +} + static void genTeamsClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, @@ -1923,6 +1932,22 @@ genTaskyieldOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return converter.getFirOpBuilder().create(loc); } +static mlir::omp::WorkshareOp +genWorkshareOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, + mlir::Location loc, const ConstructQueue &queue, + ConstructQueue::iterator item) { + lower::StatementContext stmtCtx; + mlir::omp::WorkshareOperands clauseOps; + genWorkshareClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, +llvm::omp::Directive::OMPD_workshare) + .setClauses(&item->clauses), + queue, item, clauseOps); +} + static mlir::omp::TeamsOp genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, @@ -2515,10 +2540,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter, llvm::omp::getOpenMPDirectiveName(dir) + ")"); // case llvm::omp::Directive::OMPD_workdistribute: case llvm::omp::Directive::OMPD_workshare: -// FIXME: Workshare is not a commonly used OpenMP construct, an -// implementation for this feature will come later. For the codes -// that use this construct, add a single construct for now. 
-genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item); +genWorkshareOp(converter, symTable, semaCtx, eval, loc, queue, item); break; default: // Combined and composite constructs should have been split into a sequence diff --git a/flang/test/Lower/OpenMP/workshare.f90 b/flang/test/Lower/OpenMP/workshare.f90 index 1e11677a15e1f0..8e771952f5b6da 100644 --- a/flang/test/Lower/OpenMP/workshare.f90 +++ b/flang/test/Lower/OpenMP/workshare.f90 @@ -6,7 +6,7 @@ subroutine sb1(arr) integer :: arr(:) !CHECK: omp.parallel { !$omp parallel -!CHECK: omp.single { +!CHECK: omp.workshare { !$omp workshare arr = 0 !$omp end workshare @@ -20,7 +20,7 @@ subroutine sb2(arr) integer :: arr(:) !CHECK: omp.parallel { !$omp parallel -!CHECK: omp.single nowait { +!CHECK: omp.workshare nowait { !$omp workshare arr = 0 !$omp end workshare nowait @@ -33,7 +33,7 @@ subroutine sb2(arr) subroutine sb3(arr) integer :: arr(:) !CHECK: omp.parallel { -!CHECK: omp.single { +!CHECK: omp.workshare { !$omp parallel workshare arr = 0 !$omp end parallel workshare >From 2ff1ac16aff53775a7ed450a68eb46f23f342139 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Thu, 22 Aug 2024 17:01:43 +0900 Subject: [PATCH 2/2] Fix function signature --- flang/lib/Lower/OpenMP/OpenMP.cpp | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 22f6d5bd09cd65..daeb928e53d061 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1934,12 +1934,14 @@ genTaskyieldOp(lower::AbstractConverter &converter, lower::SymMap &symTable, static mlir::omp::WorkshareOp genWorkshareOp(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - mlir::Location loc, const ConstructQueue &queue, - ConstructQueue::
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
@@ -47,24 +60,140 @@ class StructuralHashImpl { public: StructuralHashImpl() = delete; - explicit StructuralHashImpl(bool DetailedHash) : DetailedHash(DetailedHash) {} + explicit StructuralHashImpl(bool DetailedHash, + IgnoreOperandFunc IgnoreOp = nullptr) + : DetailedHash(DetailedHash), IgnoreOp(IgnoreOp) { +if (IgnoreOp) { + IndexInstruction = std::make_unique(); + IndexOperandHashMap = std::make_unique(); +} + } - stable_hash hashConstant(Constant *C) { + stable_hash hashAPInt(const APInt &I) { SmallVector Hashes; -// TODO: hashArbitaryType() is not stable. -if (ConstantInt *ConstInt = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstInt->getValue())); -} else if (ConstantFP *ConstFP = dyn_cast(C)) { - Hashes.emplace_back(hashArbitaryType(ConstFP->getValue())); -} else if (Function *Func = dyn_cast(C)) - // Hashing the name will be deterministic as LLVM's hashing infrastructure - // has explicit support for hashing strings and will not simply hash - // the pointer. - Hashes.emplace_back(hashArbitaryType(Func->getName())); +Hashes.emplace_back(I.getBitWidth()); +for (unsigned J = 0; J < I.getNumWords(); ++J) + Hashes.emplace_back((I.getRawData())[J]); +return stable_hash_combine(Hashes); + } + stable_hash hashAPFloat(const APFloat &F) { +SmallVector Hashes; +const fltSemantics &S = F.getSemantics(); +Hashes.emplace_back(APFloat::semanticsPrecision(S)); +Hashes.emplace_back(APFloat::semanticsMaxExponent(S)); +Hashes.emplace_back(APFloat::semanticsMinExponent(S)); +Hashes.emplace_back(APFloat::semanticsSizeInBits(S)); +Hashes.emplace_back(hashAPInt(F.bitcastToAPInt())); return stable_hash_combine(Hashes); } + stable_hash hashGlobalValue(const GlobalValue *GV) { +if (!GV->hasName()) + return 0; +return stable_hash_name(GV->getName()); + } + + // Compute a hash for a Constant. This function is logically similar to + // FunctionComparator::cmpConstants() in FunctionComparator.cpp, but here + // we're interested in computing a hash rather than comparing two Constants. + // Some of the logic is simplified, e.g, we don't expand GEPOperator. 
+ stable_hash hashConstant(Constant *C) { +SmallVector Hashes; + +Type *Ty = C->getType(); +Hashes.emplace_back(hashType(Ty)); + +if (C->isNullValue()) { + Hashes.emplace_back(static_cast('N')); + return stable_hash_combine(Hashes); +} + +auto *G = dyn_cast(C); +if (G) { + Hashes.emplace_back(hashGlobalValue(G)); + return stable_hash_combine(Hashes); +} + +if (const auto *Seq = dyn_cast(C)) { + Hashes.emplace_back(xxh3_64bits(Seq->getRawDataValues())); + return stable_hash_combine(Hashes); +} + +switch (C->getValueID()) { +case Value::UndefValueVal: +case Value::PoisonValueVal: +case Value::ConstantTokenNoneVal: { + return stable_hash_combine(Hashes); +} +case Value::ConstantIntVal: { + const APInt &Int = cast(C)->getValue(); + Hashes.emplace_back(hashAPInt(Int)); + return stable_hash_combine(Hashes); +} +case Value::ConstantFPVal: { + const APFloat &APF = cast(C)->getValueAPF(); + Hashes.emplace_back(hashAPFloat(APF)); + return stable_hash_combine(Hashes); +} +case Value::ConstantArrayVal: { + const ConstantArray *A = cast(C); + uint64_t NumElements = cast(Ty)->getNumElements(); + Hashes.emplace_back(NumElements); + for (auto &Op : A->operands()) { +auto H = hashConstant(cast(Op)); +Hashes.emplace_back(H); + } + return stable_hash_combine(Hashes); +} +case Value::ConstantStructVal: { + const ConstantStruct *S = cast(C); + unsigned NumElements = cast(Ty)->getNumElements(); + Hashes.emplace_back(NumElements); + for (auto &Op : S->operands()) { +auto H = hashConstant(cast(Op)); +Hashes.emplace_back(H); + } + return stable_hash_combine(Hashes); kyulee-com wrote: Most cases are simply grouped. https://github.com/llvm/llvm-project/pull/112638 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
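To make the new `IgnoreOp` hook more concrete, here is a hedged sketch of a callback a client might pass to `StructuralHashImpl`. The quoted diff elides the `IgnoreOperandFunc` template arguments, so the `(const Instruction *, unsigned operand index)` shape below is an assumption; the intent it illustrates is that ignored operands are skipped during hashing and recorded via `IndexInstruction`/`IndexOperandHashMap` for a later diff.

```cpp
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"

// Hypothetical callback: skip callee operands so that two functions that
// differ only in which function they call still hash identically, while the
// skipped operands stay diffable through the recorded maps.
static bool ignoreCalleeOperand(const llvm::Instruction *I, unsigned OpIdx) {
  if (const auto *CB = llvm::dyn_cast<llvm::CallBase>(I))
    return CB->isCallee(&CB->getOperandUse(OpIdx));
  return false;
}
```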
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
@@ -100,8 +233,20 @@ class StructuralHashImpl { if (const auto *ComparisonInstruction = dyn_cast(&Inst)) Hashes.emplace_back(ComparisonInstruction->getPredicate()); -for (const auto &Op : Inst.operands()) - Hashes.emplace_back(hashOperand(Op)); +unsigned InstIdx = 0; +if (IndexInstruction) { + InstIdx = IndexInstruction->size(); + IndexInstruction->insert({InstIdx, const_cast(&Inst)}); kyulee-com wrote: An instruction is inserted only once by design in this pass. In fact, this map `IndexInstruction` itself cannot catch duplication, since the key is the `index`, not the `Instruction *`. Anyhow, I replaced `insert` with `try_emplace` for efficiency. https://github.com/llvm/llvm-project/pull/112638 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
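For reference, a minimal sketch of the `try_emplace` form mentioned above, under the assumption that `IndexInstruction` is a DenseMap-style container keyed by the running instruction index (the quoted diff elides its exact type):

```cpp
// try_emplace constructs the mapped value in place and is a no-op if the key
// already exists, avoiding the temporary pair that insert({...}) builds.
unsigned InstIdx = IndexInstruction->size();
IndexInstruction->try_emplace(InstIdx, const_cast<Instruction *>(&Inst));
```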
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; tschuett wrote: `std::nullopt` https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). +// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { tschuett wrote: std::optional https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
https://github.com/tschuett edited https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
kyulee-com wrote: The test failure `TableGen/x86-fold-tables.td` seems unrelated. https://github.com/llvm/llvm-project/pull/112638 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/19.x: [clang] Make LazyOffsetPtr more portable (#112927) (PR #113052)
llvmbot wrote: @llvm/pr-subscribers-clang Author: None (llvmbot) Changes Backport 76196998e25b98d81abc437708622261810782ca Requested by: @mgorny --- Full diff: https://github.com/llvm/llvm-project/pull/113052.diff 1 Files Affected: - (modified) clang/include/clang/AST/ExternalASTSource.h (+35-13) ``diff diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h index 385c32edbae0fd..582ed7c65f58ca 100644 --- a/clang/include/clang/AST/ExternalASTSource.h +++ b/clang/include/clang/AST/ExternalASTSource.h @@ -25,10 +25,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/Support/PointerLikeTypeTraits.h" +#include #include #include #include #include +#include #include #include @@ -326,29 +328,49 @@ struct LazyOffsetPtr { /// /// If the low bit is clear, a pointer to the AST node. If the low /// bit is set, the upper 63 bits are the offset. - mutable uint64_t Ptr = 0; + static constexpr size_t DataSize = std::max(sizeof(uint64_t), sizeof(T *)); + alignas(uint64_t) alignas(T *) mutable unsigned char Data[DataSize] = {}; + + unsigned char GetLSB() const { +return Data[llvm::sys::IsBigEndianHost ? DataSize - 1 : 0]; + } + + template U &As(bool New) const { +unsigned char *Obj = +Data + (llvm::sys::IsBigEndianHost ? DataSize - sizeof(U) : 0); +if (New) + return *new (Obj) U; +return *std::launder(reinterpret_cast(Obj)); + } + + T *&GetPtr() const { return As(false); } + uint64_t &GetU64() const { return As(false); } + void SetPtr(T *Ptr) const { As(true) = Ptr; } + void SetU64(uint64_t U64) const { As(true) = U64; } public: LazyOffsetPtr() = default; - explicit LazyOffsetPtr(T *Ptr) : Ptr(reinterpret_cast(Ptr)) {} + explicit LazyOffsetPtr(T *Ptr) : Data() { SetPtr(Ptr); } - explicit LazyOffsetPtr(uint64_t Offset) : Ptr((Offset << 1) | 0x01) { + explicit LazyOffsetPtr(uint64_t Offset) : Data() { assert((Offset << 1 >> 1) == Offset && "Offsets must require < 63 bits"); if (Offset == 0) - Ptr = 0; + SetPtr(nullptr); +else + SetU64((Offset << 1) | 0x01); } LazyOffsetPtr &operator=(T *Ptr) { -this->Ptr = reinterpret_cast(Ptr); +SetPtr(Ptr); return *this; } LazyOffsetPtr &operator=(uint64_t Offset) { assert((Offset << 1 >> 1) == Offset && "Offsets must require < 63 bits"); if (Offset == 0) - Ptr = 0; + SetPtr(nullptr); else - Ptr = (Offset << 1) | 0x01; + SetU64((Offset << 1) | 0x01); return *this; } @@ -356,15 +378,15 @@ struct LazyOffsetPtr { /// Whether this pointer is non-NULL. /// /// This operation does not require the AST node to be deserialized. - explicit operator bool() const { return Ptr != 0; } + explicit operator bool() const { return isOffset() || GetPtr() != nullptr; } /// Whether this pointer is non-NULL. /// /// This operation does not require the AST node to be deserialized. - bool isValid() const { return Ptr != 0; } + bool isValid() const { return isOffset() || GetPtr() != nullptr; } /// Whether this pointer is currently stored as an offset. - bool isOffset() const { return Ptr & 0x01; } + bool isOffset() const { return GetLSB() & 0x01; } /// Retrieve the pointer to the AST node that this lazy pointer points to. /// @@ -375,9 +397,9 @@ struct LazyOffsetPtr { if (isOffset()) { assert(Source && "Cannot deserialize a lazy pointer without an AST source"); - Ptr = reinterpret_cast((Source->*Get)(OffsT(Ptr >> 1))); + SetPtr((Source->*Get)(OffsT(GetU64() >> 1))); } -return reinterpret_cast(Ptr); +return GetPtr(); } /// Retrieve the address of the AST node pointer. 
Deserializes the pointee if @@ -385,7 +407,7 @@ struct LazyOffsetPtr { T **getAddressOfPointer(ExternalASTSource *Source) const { // Ensure the integer is in pointer form. (void)get(Source); -return reinterpret_cast(&Ptr); +return &GetPtr(); } }; `` https://github.com/llvm/llvm-project/pull/113052 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
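The interesting part of this backport is the endian-aware storage trick: either a pointer or a 64-bit tagged offset lives in one byte buffer that is big enough and suitably aligned for both, and the tag bit is always read from the byte holding the least significant bits of whichever object is stored. Below is a standalone, simplified sketch of the same idea. It is not the clang code: it detects endianness with the compiler's `__BYTE_ORDER__` macro instead of `llvm::sys::IsBigEndianHost`, drops the lazy-deserialization plumbing, and uses `void *` instead of the template parameter.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <new>

// Simplified sketch of the storage scheme used by LazyOffsetPtr above.
struct PtrOrOffset {
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  static constexpr bool BigEndian = true;
#else
  static constexpr bool BigEndian = false;
#endif
  static constexpr std::size_t DataSize =
      std::max(sizeof(std::uint64_t), sizeof(void *));
  // One buffer, aligned for both representations.
  alignas(std::uint64_t) alignas(void *) unsigned char Data[DataSize] = {};

  // On big-endian hosts the stored object sits at the end of the buffer, so
  // its least significant byte is always at a known position.
  unsigned char lsb() const { return Data[BigEndian ? DataSize - 1 : 0]; }

  template <typename U> U &as(bool fresh) {
    unsigned char *Obj = Data + (BigEndian ? DataSize - sizeof(U) : 0);
    if (fresh)
      return *new (Obj) U; // start the lifetime of a U inside the buffer
    return *std::launder(reinterpret_cast<U *>(Obj));
  }

  void setPointer(void *P) { as<void *>(true) = P; }
  void setOffset(std::uint64_t Off) { as<std::uint64_t>(true) = (Off << 1) | 1; }
  bool isOffset() const { return lsb() & 1; }
};
```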
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { tschuett wrote: Why free-standing functions, when it is **your** register bank select pass? https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
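A rough sketch of the shape this comment appears to be asking for. Everything below is an assumption extrapolated from the quoted diff, not the actual follow-up change: the free functions become private members, so shared state such as the `MachineRegisterInfo` and the `MachineIRBuilder` no longer has to be threaded through every call.

```cpp
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBank.h"

namespace {
class AMDGPURBSelect : public llvm::MachineFunctionPass {
public:
  static char ID;
  AMDGPURBSelect() : MachineFunctionPass(ID) {}
  bool runOnMachineFunction(llvm::MachineFunction &MF) override;

private:
  // Former free functions, now members with access to the pass state below.
  bool shouldRBSelect(llvm::MachineInstr &MI) const;
  void setRB(llvm::MachineInstr &MI, llvm::MachineOperand &DefOP,
             const llvm::RegisterBank &RB);
  void setRBUse(llvm::MachineInstr &MI, llvm::MachineOperand &UseOP,
                const llvm::RegisterBank &RB);
  bool isTemporalDivergenceCopy(llvm::Register Reg) const;

  llvm::MachineRegisterInfo *MRI = nullptr; // set in runOnMachineFunction
  llvm::MachineIRBuilder B;
};
} // end anonymous namespace
```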
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). +// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { tschuett wrote: And probably something like `tryGetVReg`. It is fallible. 
https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
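Sketching out one possible reading of this suggestion; the name `tryGetVReg` comes from the comment itself, and the rest is an assumption rather than the committed code:

```cpp
#include <optional>

#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Register.h"

// Fallible lookup: returns the virtual register behind the operand, or
// std::nullopt if the operand is not a register or not virtual, instead of
// overloading Register 0 as a sentinel.
static std::optional<llvm::Register> tryGetVReg(const llvm::MachineOperand &Op) {
  if (!Op.isReg())
    return std::nullopt;
  llvm::Register Reg = Op.getReg();
  if (!Reg.isVirtual())
    return std::nullopt;
  return Reg;
}
```

A caller would then write `if (std::optional<llvm::Register> Reg = tryGetVReg(DefOP)) { ... }`, which makes the "no usable virtual register" case explicit.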
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) +return 0; + + return Reg; +} + +bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { + MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + MachineIRBuilder B(MF); + + // Assign register banks to ALL def registers on G_ instructions. + // Same for copies if they have no register bank or class on def. + for (MachineBasicBlock &MBB : MF) { +for (MachineInstr &MI : MBB) { + if (!shouldRBSelect(MI)) +continue; + + for (MachineOperand &DefOP : MI.defs()) { +Register DefReg = getVReg(DefOP); +if (!DefReg) + continue; + +// Copies can have register class on def registers. +if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) { + continue; +} + +if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) { + setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::SGPRRegBankID)); +} else { + if (MRI.getType(DefReg) == LLT::scalar(1)) +setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::VCCRegBankID)); + else +setRB(MI, DefOP, B, MRI, RBI.getRegBank(A
[llvm-branch-commits] [llvm] [StructuralHash] Support Differences (PR #112638)
boomanaiden154 wrote: > IIRC we have several lit tests that cover structural hash, shouldn't we have > a new test there that uses the new functionality? The lit tests for structural hashing are pretty limited and mostly designed to test the structural hash printer pass that I needed for other things, rather than to exercise StructuralHash itself. Adding new lit tests definitely wouldn't hurt, but unit testing typically makes a lot more sense for what StructuralHash is supposed to guarantee. > The test failure TableGen/x86-fold-tables.td seems unrelated. Yes. https://github.com/llvm/llvm-project/issues/112961 seems to have more information. https://github.com/llvm/llvm-project/pull/112638 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
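For what it's worth, here is a hedged sketch of the kind of unit test being discussed, assuming the pre-existing `StructuralHash(const Function &, bool DetailedHash)` entry point and the usual gtest plus IR-parsing setup of the existing unit tests; it is not taken from the PR.

```cpp
#include "gtest/gtest.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/StructuralHash.h"
#include "llvm/Support/SourceMgr.h"

using namespace llvm;

TEST(StructuralHashSketch, DetailedHashSeesConstants) {
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseAssemblyString(
      "define i64 @f(i64 %a) {\n  ret i64 1\n}\n"
      "define i64 @g(i64 %a) {\n  ret i64 2\n}\n",
      Err, Ctx);
  ASSERT_TRUE(M);
  const Function &F = *M->getFunction("f");
  const Function &G = *M->getFunction("g");
  // Bodies that differ only in an immediate must hash differently once
  // constants are folded into the detailed hash.
  EXPECT_NE(StructuralHash(F, /*DetailedHash=*/true),
            StructuralHash(G, /*DetailedHash=*/true));
}
```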
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). +// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { tschuett wrote: member function and use instance variables, i.e., MRI. https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, tschuett wrote: member function https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [mlir][Transforms] Merge 1:1 and 1:N type converters (PR #113032)
https://github.com/matthias-springer updated https://github.com/llvm/llvm-project/pull/113032 >From 7ac8aa2ee4ee5634d8dd6f2d64d0e10b800e2d70 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sat, 19 Oct 2024 12:05:13 +0200 Subject: [PATCH] [mlir][Transforms] Merge 1:1 and 1:N type converters --- .../Dialect/SparseTensor/Transforms/Passes.h | 2 +- .../mlir/Transforms/DialectConversion.h | 56 ++- .../mlir/Transforms/OneToNTypeConversion.h| 45 +-- .../ArmSME/Transforms/VectorLegalization.cpp | 2 +- .../Transforms/Utils/DialectConversion.cpp| 24 ++-- .../Transforms/Utils/OneToNTypeConversion.cpp | 44 +-- .../TestOneToNTypeConversionPass.cpp | 18 -- 7 files changed, 93 insertions(+), 98 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index 6ccbc40bdd6034..2e9c297f20182a 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -150,7 +150,7 @@ std::unique_ptr createLowerForeachToSCFPass(); //===--===// /// Type converter for iter_space and iterator. -struct SparseIterationTypeConverter : public OneToNTypeConverter { +struct SparseIterationTypeConverter : public TypeConverter { SparseIterationTypeConverter(); }; diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5ff36160dd6162..37da03bbe386e9 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -173,7 +173,9 @@ class TypeConverter { /// conversion has finished. /// /// Note: Target materializations may optionally accept an additional Type - /// parameter, which is the original type of the SSA value. + /// parameter, which is the original type of the SSA value. Furthermore, `T` + /// can be a TypeRange; in that case, the function must return a + /// SmallVector. /// This method registers a materialization that will be called when /// converting (potentially multiple) block arguments that were the result of @@ -210,6 +212,9 @@ class TypeConverter { /// will be invoked with: outputType = "t3", inputs = "v2", // originalType = "t1". Note that the original type "t1" cannot be recovered /// from just "t3" and "v2"; that's why the originalType parameter exists. + /// + /// Note: During a 1:N conversion, the result types can be a TypeRange. In + /// that case the materialization produces a SmallVector. template >::template arg_t<1>> void addTargetMaterialization(FnT &&callback) { @@ -316,6 +321,11 @@ class TypeConverter { Value materializeTargetConversion(OpBuilder &builder, Location loc, Type resultType, ValueRange inputs, Type originalType = {}) const; + SmallVector materializeTargetConversion(OpBuilder &builder, + Location loc, + TypeRange resultType, + ValueRange inputs, + Type originalType = {}) const; /// Convert an attribute present `attr` from within the type `type` using /// the registered conversion functions. If no applicable conversion has been @@ -340,9 +350,9 @@ class TypeConverter { /// The signature of the callback used to materialize a target conversion. 
/// - /// Arguments: builder, result type, inputs, location, original type - using TargetMaterializationCallbackFn = - std::function; + /// Arguments: builder, result types, inputs, location, original type + using TargetMaterializationCallbackFn = std::function( + OpBuilder &, TypeRange, ValueRange, Location, Type)>; /// The signature of the callback used to convert a type attribute. using TypeAttributeConversionCallbackFn = @@ -409,22 +419,40 @@ class TypeConverter { /// callback. /// /// With callback of form: - /// `Value(OpBuilder &, T, ValueRange, Location, Type)` + /// - Value(OpBuilder &, T, ValueRange, Location, Type) + /// - SmallVector(OpBuilder &, TypeRange, ValueRange, Location, Type) template std::enable_if_t< std::is_invocable_v, TargetMaterializationCallbackFn> wrapTargetMaterialization(FnT &&callback) const { return [callback = std::forward(callback)]( - OpBuilder &builder, Type resultType, ValueRange inputs, - Location loc, Type originalType) -> Value { - if (T derivedType = dyn_cast(resultType)) -return callback(builder, derivedType, inputs, loc, originalType); - return Value(); + OpBuilder &builder, TypeRange resultTypes, ValueRange inputs, + Locatio
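As a hedged illustration of the user-facing side of this change (not code taken from the patch): registering a 1:N target materialization on a plain `TypeConverter` now looks like the snippet below. The callback receives all requested result types at once and returns one `Value` per type; the `unrealized_conversion_cast` bridge is just a placeholder choice for the example, and the snippet assumes it sits inside a function with the `mlir` namespace in scope.

```cpp
// Sketch: a 1:N target materialization registered through the merged API.
TypeConverter converter;
converter.addTargetMaterialization(
    [](OpBuilder &builder, TypeRange resultTypes, ValueRange inputs,
       Location loc, Type originalType) -> SmallVector<Value> {
      // Bridge the converted inputs to the requested result types with a
      // placeholder cast; a real converter would build dialect-specific ops.
      auto castOp = builder.create<UnrealizedConversionCastOp>(loc, resultTypes,
                                                               inputs);
      return llvm::to_vector(castOp.getResults());
    });
```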
[llvm-branch-commits] [mlir] [mlir][Transforms] Merge 1:1 and 1:N type converters (PR #113032)
llvmbot wrote: @llvm/pr-subscribers-mlir-core Author: Matthias Springer (matthias-springer) Changes The 1:N type converter derived from the 1:1 type converter and extends it with 1:N target materializations. This commit merges the two type converters and stores 1:N target materializations in the 1:1 type converter. This is in preparation of merging the 1:1 and 1:N dialect conversion infrastructures. 1:1 target materializations (producing a single `Value`) will remain valid. An additional API is added to the type converter to register 1:N target materializations (producing a `SmallVector`). Internally, all target materializations are stored as 1:N materializations. The 1:N type converter is removed. Note for LLVM integration: If you are using the `OneToNTypeConverter`, simply switch all occurrences to `TypeConverter`. Depends on #113031. --- Full diff: https://github.com/llvm/llvm-project/pull/113032.diff 7 Files Affected: - (modified) mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h (+1-1) - (modified) mlir/include/mlir/Transforms/DialectConversion.h (+41-13) - (modified) mlir/include/mlir/Transforms/OneToNTypeConversion.h (+1-44) - (modified) mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp (+1-1) - (modified) mlir/lib/Transforms/Utils/DialectConversion.cpp (+20-4) - (modified) mlir/lib/Transforms/Utils/OneToNTypeConversion.cpp (+14-30) - (modified) mlir/test/lib/Conversion/OneToNTypeConversion/TestOneToNTypeConversionPass.cpp (+14-4) ``diff diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index 6ccbc40bdd6034..2e9c297f20182a 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -150,7 +150,7 @@ std::unique_ptr createLowerForeachToSCFPass(); //===--===// /// Type converter for iter_space and iterator. -struct SparseIterationTypeConverter : public OneToNTypeConverter { +struct SparseIterationTypeConverter : public TypeConverter { SparseIterationTypeConverter(); }; diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5ff36160dd6162..eb7da67c1bb995 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -173,7 +173,9 @@ class TypeConverter { /// conversion has finished. /// /// Note: Target materializations may optionally accept an additional Type - /// parameter, which is the original type of the SSA value. + /// parameter, which is the original type of the SSA value. Furthermore `T` + /// can be a TypeRange; in that case, the function must return a + /// SmallVector. /// This method registers a materialization that will be called when /// converting (potentially multiple) block arguments that were the result of @@ -210,6 +212,9 @@ class TypeConverter { /// will be invoked with: outputType = "t3", inputs = "v2", // originalType = "t1". Note that the original type "t1" cannot be recovered /// from just "t3" and "v2"; that's why the originalType parameter exists. + /// + /// Note: During a 1:N conversion, the result types can be a TypeRange. In + /// that case the materialization produces a SmallVector. 
template >::template arg_t<1>> void addTargetMaterialization(FnT &&callback) { @@ -316,6 +321,11 @@ class TypeConverter { Value materializeTargetConversion(OpBuilder &builder, Location loc, Type resultType, ValueRange inputs, Type originalType = {}) const; + SmallVector materializeTargetConversion(OpBuilder &builder, + Location loc, + TypeRange resultType, + ValueRange inputs, + Type originalType = {}) const; /// Convert an attribute present `attr` from within the type `type` using /// the registered conversion functions. If no applicable conversion has been @@ -341,8 +351,8 @@ class TypeConverter { /// The signature of the callback used to materialize a target conversion. /// /// Arguments: builder, result type, inputs, location, original type - using TargetMaterializationCallbackFn = - std::function; + using TargetMaterializationCallbackFn = std::function( + OpBuilder &, TypeRange, ValueRange, Location, Type)>; /// The signature of the callback used to convert a type attribute. using TypeAttributeConversionCallbackFn = @@ -409,22 +419,40 @@ class TypeConverter { /// callback. /// /// With callback of form: - /// `Value(OpBuilder &, T, ValueRange, Location, Type)` + /// - Value(OpBuilder &, T, ValueRange, Location,
[llvm-branch-commits] [mlir] [mlir][Transforms] Merge 1:1 and 1:N type converters (PR #113032)
llvmbot wrote: @llvm/pr-subscribers-mlir Author: Matthias Springer (matthias-springer) Changes The 1:N type converter derived from the 1:1 type converter and extends it with 1:N target materializations. This commit merges the two type converters and stores 1:N target materializations in the 1:1 type converter. This is in preparation of merging the 1:1 and 1:N dialect conversion infrastructures. 1:1 target materializations (producing a single `Value`) will remain valid. An additional API is added to the type converter to register 1:N target materializations (producing a `SmallVector`). Internally, all target materializations are stored as 1:N materializations. The 1:N type converter is removed. Note for LLVM integration: If you are using the `OneToNTypeConverter`, simply switch all occurrences to `TypeConverter`. Depends on #113031. --- Full diff: https://github.com/llvm/llvm-project/pull/113032.diff 7 Files Affected: - (modified) mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h (+1-1) - (modified) mlir/include/mlir/Transforms/DialectConversion.h (+41-13) - (modified) mlir/include/mlir/Transforms/OneToNTypeConversion.h (+1-44) - (modified) mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp (+1-1) - (modified) mlir/lib/Transforms/Utils/DialectConversion.cpp (+20-4) - (modified) mlir/lib/Transforms/Utils/OneToNTypeConversion.cpp (+14-30) - (modified) mlir/test/lib/Conversion/OneToNTypeConversion/TestOneToNTypeConversionPass.cpp (+14-4) ``diff diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index 6ccbc40bdd6034..2e9c297f20182a 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -150,7 +150,7 @@ std::unique_ptr createLowerForeachToSCFPass(); //===--===// /// Type converter for iter_space and iterator. -struct SparseIterationTypeConverter : public OneToNTypeConverter { +struct SparseIterationTypeConverter : public TypeConverter { SparseIterationTypeConverter(); }; diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5ff36160dd6162..eb7da67c1bb995 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -173,7 +173,9 @@ class TypeConverter { /// conversion has finished. /// /// Note: Target materializations may optionally accept an additional Type - /// parameter, which is the original type of the SSA value. + /// parameter, which is the original type of the SSA value. Furthermore `T` + /// can be a TypeRange; in that case, the function must return a + /// SmallVector. /// This method registers a materialization that will be called when /// converting (potentially multiple) block arguments that were the result of @@ -210,6 +212,9 @@ class TypeConverter { /// will be invoked with: outputType = "t3", inputs = "v2", // originalType = "t1". Note that the original type "t1" cannot be recovered /// from just "t3" and "v2"; that's why the originalType parameter exists. + /// + /// Note: During a 1:N conversion, the result types can be a TypeRange. In + /// that case the materialization produces a SmallVector. 
template >::template arg_t<1>> void addTargetMaterialization(FnT &&callback) { @@ -316,6 +321,11 @@ class TypeConverter { Value materializeTargetConversion(OpBuilder &builder, Location loc, Type resultType, ValueRange inputs, Type originalType = {}) const; + SmallVector materializeTargetConversion(OpBuilder &builder, + Location loc, + TypeRange resultType, + ValueRange inputs, + Type originalType = {}) const; /// Convert an attribute present `attr` from within the type `type` using /// the registered conversion functions. If no applicable conversion has been @@ -341,8 +351,8 @@ class TypeConverter { /// The signature of the callback used to materialize a target conversion. /// /// Arguments: builder, result type, inputs, location, original type - using TargetMaterializationCallbackFn = - std::function; + using TargetMaterializationCallbackFn = std::function( + OpBuilder &, TypeRange, ValueRange, Location, Type)>; /// The signature of the callback used to convert a type attribute. using TypeAttributeConversionCallbackFn = @@ -409,22 +419,40 @@ class TypeConverter { /// callback. /// /// With callback of form: - /// `Value(OpBuilder &, T, ValueRange, Location, Type)` + /// - Value(OpBuilder &, T, ValueRange, Location, Type)
[llvm-branch-commits] [mlir] [mlir][Transforms] Merge 1:1 and 1:N type converters (PR #113032)
https://github.com/matthias-springer created https://github.com/llvm/llvm-project/pull/113032 The 1:N type converter derived from the 1:1 type converter and extends it with 1:N target materializations. This commit merges the two type converters and stores 1:N target materializations in the 1:1 type converter. This is in preparation of merging the 1:1 and 1:N dialect conversion infrastructures. 1:1 target materializations (producing a single `Value`) will remain valid. An additional API is added to the type converter to register 1:N target materializations (producing a `SmallVector`). Internally, all target materializations are stored as 1:N materializations. The 1:N type converter is removed. Note for LLVM integration: If you are using the `OneToNTypeConverter`, simply switch all occurrences to `TypeConverter`. Depends on #113031. >From 8fc814758e01196ba01c7a89452bf65dd00e452b Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sat, 19 Oct 2024 12:05:13 +0200 Subject: [PATCH] [mlir][Transforms] Merge 1:1 and 1:N type converters --- .../Dialect/SparseTensor/Transforms/Passes.h | 2 +- .../mlir/Transforms/DialectConversion.h | 54 ++- .../mlir/Transforms/OneToNTypeConversion.h| 45 +--- .../ArmSME/Transforms/VectorLegalization.cpp | 2 +- .../Transforms/Utils/DialectConversion.cpp| 24 +++-- .../Transforms/Utils/OneToNTypeConversion.cpp | 44 +-- .../TestOneToNTypeConversionPass.cpp | 18 +-- 7 files changed, 92 insertions(+), 97 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index 6ccbc40bdd6034..2e9c297f20182a 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -150,7 +150,7 @@ std::unique_ptr createLowerForeachToSCFPass(); //===--===// /// Type converter for iter_space and iterator. -struct SparseIterationTypeConverter : public OneToNTypeConverter { +struct SparseIterationTypeConverter : public TypeConverter { SparseIterationTypeConverter(); }; diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5ff36160dd6162..eb7da67c1bb995 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -173,7 +173,9 @@ class TypeConverter { /// conversion has finished. /// /// Note: Target materializations may optionally accept an additional Type - /// parameter, which is the original type of the SSA value. + /// parameter, which is the original type of the SSA value. Furthermore `T` + /// can be a TypeRange; in that case, the function must return a + /// SmallVector. /// This method registers a materialization that will be called when /// converting (potentially multiple) block arguments that were the result of @@ -210,6 +212,9 @@ class TypeConverter { /// will be invoked with: outputType = "t3", inputs = "v2", // originalType = "t1". Note that the original type "t1" cannot be recovered /// from just "t3" and "v2"; that's why the originalType parameter exists. + /// + /// Note: During a 1:N conversion, the result types can be a TypeRange. In + /// that case the materialization produces a SmallVector. 
template >::template arg_t<1>> void addTargetMaterialization(FnT &&callback) { @@ -316,6 +321,11 @@ class TypeConverter { Value materializeTargetConversion(OpBuilder &builder, Location loc, Type resultType, ValueRange inputs, Type originalType = {}) const; + SmallVector materializeTargetConversion(OpBuilder &builder, + Location loc, + TypeRange resultType, + ValueRange inputs, + Type originalType = {}) const; /// Convert an attribute present `attr` from within the type `type` using /// the registered conversion functions. If no applicable conversion has been @@ -341,8 +351,8 @@ class TypeConverter { /// The signature of the callback used to materialize a target conversion. /// /// Arguments: builder, result type, inputs, location, original type - using TargetMaterializationCallbackFn = - std::function; + using TargetMaterializationCallbackFn = std::function( + OpBuilder &, TypeRange, ValueRange, Location, Type)>; /// The signature of the callback used to convert a type attribute. using TypeAttributeConversionCallbackFn = @@ -409,22 +419,40 @@ class TypeConverter { /// callback. /// /// With callback of form: - /// `Value(OpBuilder &, T, ValueRange, Location, Type)` + /// - Value(OpBuilder &,
[llvm-branch-commits] [mlir] [mlir][Transforms] Merge 1:1 and 1:N type converters (PR #113032)
llvmbot wrote: @llvm/pr-subscribers-mlir-sparse Author: Matthias Springer (matthias-springer) Changes The 1:N type converter derived from the 1:1 type converter and extends it with 1:N target materializations. This commit merges the two type converters and stores 1:N target materializations in the 1:1 type converter. This is in preparation of merging the 1:1 and 1:N dialect conversion infrastructures. 1:1 target materializations (producing a single `Value`) will remain valid. An additional API is added to the type converter to register 1:N target materializations (producing a `SmallVector`). Internally, all target materializations are stored as 1:N materializations. The 1:N type converter is removed. Note for LLVM integration: If you are using the `OneToNTypeConverter`, simply switch all occurrences to `TypeConverter`. Depends on #113031. --- Full diff: https://github.com/llvm/llvm-project/pull/113032.diff 7 Files Affected: - (modified) mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h (+1-1) - (modified) mlir/include/mlir/Transforms/DialectConversion.h (+41-13) - (modified) mlir/include/mlir/Transforms/OneToNTypeConversion.h (+1-44) - (modified) mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp (+1-1) - (modified) mlir/lib/Transforms/Utils/DialectConversion.cpp (+20-4) - (modified) mlir/lib/Transforms/Utils/OneToNTypeConversion.cpp (+14-30) - (modified) mlir/test/lib/Conversion/OneToNTypeConversion/TestOneToNTypeConversionPass.cpp (+14-4) ``diff diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index 6ccbc40bdd6034..2e9c297f20182a 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -150,7 +150,7 @@ std::unique_ptr createLowerForeachToSCFPass(); //===--===// /// Type converter for iter_space and iterator. -struct SparseIterationTypeConverter : public OneToNTypeConverter { +struct SparseIterationTypeConverter : public TypeConverter { SparseIterationTypeConverter(); }; diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5ff36160dd6162..eb7da67c1bb995 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -173,7 +173,9 @@ class TypeConverter { /// conversion has finished. /// /// Note: Target materializations may optionally accept an additional Type - /// parameter, which is the original type of the SSA value. + /// parameter, which is the original type of the SSA value. Furthermore `T` + /// can be a TypeRange; in that case, the function must return a + /// SmallVector. /// This method registers a materialization that will be called when /// converting (potentially multiple) block arguments that were the result of @@ -210,6 +212,9 @@ class TypeConverter { /// will be invoked with: outputType = "t3", inputs = "v2", // originalType = "t1". Note that the original type "t1" cannot be recovered /// from just "t3" and "v2"; that's why the originalType parameter exists. + /// + /// Note: During a 1:N conversion, the result types can be a TypeRange. In + /// that case the materialization produces a SmallVector. 
template >::template arg_t<1>> void addTargetMaterialization(FnT &&callback) { @@ -316,6 +321,11 @@ class TypeConverter { Value materializeTargetConversion(OpBuilder &builder, Location loc, Type resultType, ValueRange inputs, Type originalType = {}) const; + SmallVector materializeTargetConversion(OpBuilder &builder, + Location loc, + TypeRange resultType, + ValueRange inputs, + Type originalType = {}) const; /// Convert an attribute present `attr` from within the type `type` using /// the registered conversion functions. If no applicable conversion has been @@ -341,8 +351,8 @@ class TypeConverter { /// The signature of the callback used to materialize a target conversion. /// /// Arguments: builder, result type, inputs, location, original type - using TargetMaterializationCallbackFn = - std::function; + using TargetMaterializationCallbackFn = std::function( + OpBuilder &, TypeRange, ValueRange, Location, Type)>; /// The signature of the callback used to convert a type attribute. using TypeAttributeConversionCallbackFn = @@ -409,22 +419,40 @@ class TypeConverter { /// callback. /// /// With callback of form: - /// `Value(OpBuilder &, T, ValueRange, Location, Type)` + /// - Value(OpBuilder &, T, ValueRange, Location
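For illustration, registering a 1:N target materialization on the merged TypeConverter would look roughly like the sketch below. The helper name, the lambda body, and the use of builtin.unrealized_conversion_cast are assumptions made for this example, not code taken from the patch:

```cpp
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Transforms/DialectConversion.h"

using namespace mlir;

// Sketch only: register a callback whose second parameter is a TypeRange and
// whose return type is SmallVector<Value>, which the converter stores as a
// 1:N target materialization.
static void addExampleOneToNMaterialization(TypeConverter &converter) {
  converter.addTargetMaterialization(
      [](OpBuilder &builder, TypeRange resultTypes, ValueRange inputs,
         Location loc, Type originalType) -> SmallVector<Value> {
        // Bridge the type mismatch with an unrealized_conversion_cast; a real
        // materialization would build dialect-specific ops instead.
        auto cast = builder.create<UnrealizedConversionCastOp>(
            loc, resultTypes, inputs);
        SmallVector<Value> results(cast.getResults().begin(),
                                   cast.getResults().end());
        return results;
      });
}
```

Existing 1:1 callbacks, whose second parameter is a single type class and which return a `Value`, keep working unchanged, as the PR description states.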
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/113038).
* **#113038** 👈
* **#113019**
* **#113018**
* `main`
This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/ https://github.com/llvm/llvm-project/pull/113038 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/113038 These cannot be 0. >From 708215d0a144caafe7a9ebfbce5f8617c8215c49 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 19 Oct 2024 02:39:06 +0400 Subject: [PATCH] clang/AMDGPU: Emit grid size builtins with range metadata These cannot be 0. --- clang/lib/CodeGen/CGBuiltin.cpp | 6 ++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 28f28c70b5ae52..69a7dfc2433ae8 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18538,6 +18538,12 @@ Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) { auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset); auto *LD = CGF.Builder.CreateLoad( Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4))); + + llvm::MDBuilder MDB(CGF.getLLVMContext()); + + // Known non-zero. + LD->setMetadata(llvm::LLVMContext::MD_range, + MDB.createRange(APInt(32, 1), APInt::getZero(32))); LD->setMetadata(llvm::LLVMContext::MD_invariant_load, llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt)); return LD; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index bf5f2971cf118c..be6cee5e9217bf 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -639,7 +639,7 @@ void test_get_workgroup_size(int d, global int *out) // CHECK-LABEL: @test_get_grid_size( // CHECK: {{.*}}call align 4 dereferenceable(64){{.*}} ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 %.sink -// CHECK: load i32, ptr addrspace(4) %{{.*}}, align 4, !invariant.load +// CHECK: load i32, ptr addrspace(4) %{{.*}}, align 4, !range [[$GRID_RANGE:![0-9]+]], !invariant.load void test_get_grid_size(int d, global int *out) { switch (d) { @@ -896,5 +896,6 @@ void test_set_fpenv(unsigned long env) { __builtin_amdgcn_set_fpenv(env); } +// CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0} // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} // CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/113038 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes These cannot be 0. --- Full diff: https://github.com/llvm/llvm-project/pull/113038.diff 2 Files Affected: - (modified) clang/lib/CodeGen/CGBuiltin.cpp (+6) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn.cl (+2-1) ``diff diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 28f28c70b5ae52..69a7dfc2433ae8 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18538,6 +18538,12 @@ Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) { auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset); auto *LD = CGF.Builder.CreateLoad( Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4))); + + llvm::MDBuilder MDB(CGF.getLLVMContext()); + + // Known non-zero. + LD->setMetadata(llvm::LLVMContext::MD_range, + MDB.createRange(APInt(32, 1), APInt::getZero(32))); LD->setMetadata(llvm::LLVMContext::MD_invariant_load, llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt)); return LD; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index bf5f2971cf118c..be6cee5e9217bf 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -639,7 +639,7 @@ void test_get_workgroup_size(int d, global int *out) // CHECK-LABEL: @test_get_grid_size( // CHECK: {{.*}}call align 4 dereferenceable(64){{.*}} ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 %.sink -// CHECK: load i32, ptr addrspace(4) %{{.*}}, align 4, !invariant.load +// CHECK: load i32, ptr addrspace(4) %{{.*}}, align 4, !range [[$GRID_RANGE:![0-9]+]], !invariant.load void test_get_grid_size(int d, global int *out) { switch (d) { @@ -896,5 +896,6 @@ void test_set_fpenv(unsigned long env) { __builtin_amdgcn_set_fpenv(env); } +// CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0} // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} // CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) } `` https://github.com/llvm/llvm-project/pull/113038 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
https://github.com/jhuber6 approved this pull request. https://github.com/llvm/llvm-project/pull/113038 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [loongarch][DAG][FREEZE] Fix crash when FREEZE a half(f16) type on loongarch (#107791) (PR #109093)
heiher wrote: Update: https://github.com/llvm/llvm-project/pull/109368#issuecomment-2423879356 I suggest continuing with this PR to ensure that fp16 support is functional on the release/19.x branch. https://github.com/llvm/llvm-project/pull/109093 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [loongarch][DAG][FREEZE] Fix crash when FREEZE a half(f16) type on loongarch (#107791) (PR #109093)
https://github.com/heiher reopened https://github.com/llvm/llvm-project/pull/109093 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/19.x: [loongarch][DAG][FREEZE] Fix crash when FREEZE a half(f16) type on loongarch (#107791) (PR #109093)
https://github.com/heiher updated https://github.com/llvm/llvm-project/pull/109093 >From bd494f3735df409fbe7360e98ff4d5cb55d4bf98 Mon Sep 17 00:00:00 2001 From: YANG Xudong Date: Fri, 13 Sep 2024 08:49:54 +0800 Subject: [PATCH] [loongarch][DAG][FREEZE] Fix crash when FREEZE a half(f16) type on loongarch (#107791) For zig with LLVM 19.1.0rc4, we are seeing the following error when bootstrapping a `loongarch64-linux-musl` target. https://github.com/ziglang/zig-bootstrap/issues/164#issuecomment-2332357069 It seems that this issue is caused by `PromoteFloatResult` is not handling FREEZE OP on loongarch. Here is the reproduction of the error: https://godbolt.org/z/PPfvWjjG5 ~~This patch adds the FREEZE OP handling with `PromoteFloatRes_UnaryOp` and adds a test case.~~ This patch changes loongarch's way of floating point promotion to soft promotion to avoid this problem. See: loongarch's handling of `half`: - https://github.com/llvm/llvm-project/issues/93894 - https://github.com/llvm/llvm-project/pull/94456 Also see: other float promotion FREEZE handling - https://github.com/llvm/llvm-project/commit/0019c2f194a5e1f4cd65c5284e204328cc40ab3d (cherry picked from commit 13280d99aec5b4f383a2f3d5c10ecb148a07384e) --- .../Target/LoongArch/LoongArchISelLowering.h | 2 + llvm/test/CodeGen/LoongArch/fp16-promote.ll | 198 +++--- 2 files changed, 128 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index fc5b36c2124e01..267837add575dc 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -332,6 +332,8 @@ class LoongArchTargetLowering : public TargetLowering { bool isEligibleForTailCallOptimization( CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, const SmallVectorImpl &ArgLocs) const; + + bool softPromoteHalfType() const override { return true; } }; } // end namespace llvm diff --git a/llvm/test/CodeGen/LoongArch/fp16-promote.ll b/llvm/test/CodeGen/LoongArch/fp16-promote.ll index 75f920b43a06ce..03965ac81f3763 100644 --- a/llvm/test/CodeGen/LoongArch/fp16-promote.ll +++ b/llvm/test/CodeGen/LoongArch/fp16-promote.ll @@ -126,42 +126,40 @@ define void @test_fptrunc_double(double %d, ptr %p) nounwind { define half @test_fadd_reg(half %a, half %b) nounwind { ; LA32-LABEL: test_fadd_reg: ; LA32: # %bb.0: -; LA32-NEXT:addi.w $sp, $sp, -32 -; LA32-NEXT:st.w $ra, $sp, 28 # 4-byte Folded Spill -; LA32-NEXT:fst.d $fs0, $sp, 16 # 8-byte Folded Spill -; LA32-NEXT:fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT:addi.w $sp, $sp, -16 +; LA32-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT:st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT:fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; LA32-NEXT:move $fp, $a0 +; LA32-NEXT:move $a0, $a1 +; LA32-NEXT:bl %plt(__gnu_h2f_ieee) ; LA32-NEXT:fmov.s $fs0, $fa0 -; LA32-NEXT:fmov.s $fa0, $fa1 -; LA32-NEXT:bl %plt(__gnu_f2h_ieee) +; LA32-NEXT:move $a0, $fp ; LA32-NEXT:bl %plt(__gnu_h2f_ieee) -; LA32-NEXT:fmov.s $fs1, $fa0 -; LA32-NEXT:fmov.s $fa0, $fs0 +; LA32-NEXT:fadd.s $fa0, $fa0, $fs0 ; LA32-NEXT:bl %plt(__gnu_f2h_ieee) -; LA32-NEXT:bl %plt(__gnu_h2f_ieee) -; LA32-NEXT:fadd.s $fa0, $fa0, $fs1 -; LA32-NEXT:fld.d $fs1, $sp, 8 # 8-byte Folded Reload -; LA32-NEXT:fld.d $fs0, $sp, 16 # 8-byte Folded Reload -; LA32-NEXT:ld.w $ra, $sp, 28 # 4-byte Folded Reload -; LA32-NEXT:addi.w $sp, $sp, 32 +; LA32-NEXT:fld.d $fs0, $sp, 0 # 8-byte Folded Reload +; LA32-NEXT:ld.w $fp, $sp, 8 # 4-byte Folded Reload +; 
LA32-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT:addi.w $sp, $sp, 16 ; LA32-NEXT:ret ; ; LA64-LABEL: test_fadd_reg: ; LA64: # %bb.0: ; LA64-NEXT:addi.d $sp, $sp, -32 ; LA64-NEXT:st.d $ra, $sp, 24 # 8-byte Folded Spill -; LA64-NEXT:fst.d $fs0, $sp, 16 # 8-byte Folded Spill -; LA64-NEXT:fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT:st.d $fp, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT:fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT:move $fp, $a0 +; LA64-NEXT:move $a0, $a1 +; LA64-NEXT:bl %plt(__gnu_h2f_ieee) ; LA64-NEXT:fmov.s $fs0, $fa0 -; LA64-NEXT:fmov.s $fa0, $fa1 -; LA64-NEXT:bl %plt(__gnu_f2h_ieee) +; LA64-NEXT:move $a0, $fp ; LA64-NEXT:bl %plt(__gnu_h2f_ieee) -; LA64-NEXT:fmov.s $fs1, $fa0 -; LA64-NEXT:fmov.s $fa0, $fs0 +; LA64-NEXT:fadd.s $fa0, $fa0, $fs0 ; LA64-NEXT:bl %plt(__gnu_f2h_ieee) -; LA64-NEXT:bl %plt(__gnu_h2f_ieee) -; LA64-NEXT:fadd.s $fa0, $fa0, $fs1 -; LA64-NEXT:fld.d $fs1, $sp, 8 # 8-byte Folded Reload -; LA64-NEXT:fld.d $fs0, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT:fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT:ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64
[llvm-branch-commits] [flang] [flang] Lower omp.workshare to other omp constructs (PR #101446)
https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/101446 >From e56dbd6a0625890fd9a3d6a62675e864ca94a8f5 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Sun, 4 Aug 2024 22:06:55 +0900 Subject: [PATCH 01/13] [flang] Lower omp.workshare to other omp constructs Change to workshare loop wrapper op Move single op declaration Schedule pass properly Correctly handle nested nested loop nests to be parallelized by workshare Leave comments for shouldUseWorkshareLowering Use copyprivate to scatter val from omp.single TODO still need to implement copy function TODO transitive check for usage outside of omp.single not imiplemented yet Transitively check for users outisde of single op TODO need to implement copy func TODO need to hoist allocas outside of single regions Add tests Hoist allocas More tests Emit body for copy func Test the tmp storing logic Clean up trivially dead ops Only handle single-block regions for now Fix tests for custom assembly for loop wrapper Only run the lower workshare pass if openmp is enabled Implement some missing functionality Fix tests Fix test Iterate backwards to find all trivially dead ops Add expalanation comment for createCopyFun Update test --- flang/include/flang/Optimizer/OpenMP/Passes.h | 5 + .../include/flang/Optimizer/OpenMP/Passes.td | 5 + flang/include/flang/Tools/CLOptions.inc | 6 +- flang/include/flang/Tools/CrossToolHelpers.h | 1 + flang/lib/Frontend/FrontendActions.cpp| 10 +- flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 + flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 446 ++ flang/test/Fir/basic-program.fir | 1 + .../Transforms/OpenMP/lower-workshare.mlir| 189 .../Transforms/OpenMP/lower-workshare2.mlir | 23 + .../Transforms/OpenMP/lower-workshare3.mlir | 74 +++ .../Transforms/OpenMP/lower-workshare4.mlir | 59 +++ .../Transforms/OpenMP/lower-workshare5.mlir | 42 ++ .../Transforms/OpenMP/lower-workshare6.mlir | 51 ++ flang/tools/bbc/bbc.cpp | 5 +- flang/tools/tco/tco.cpp | 1 + 16 files changed, 915 insertions(+), 4 deletions(-) create mode 100644 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp create mode 100644 flang/test/Transforms/OpenMP/lower-workshare.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare2.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare3.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare4.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare5.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workshare6.mlir diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h index 403d79667bf448..feb395f1a12dbd 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.h +++ b/flang/include/flang/Optimizer/OpenMP/Passes.h @@ -25,6 +25,11 @@ namespace flangomp { #define GEN_PASS_REGISTRATION #include "flang/Optimizer/OpenMP/Passes.h.inc" +/// Impelements the logic specified in the 2.8.3 workshare Construct section of +/// the OpenMP standard which specifies what statements or constructs shall be +/// divided into units of work. 
+bool shouldUseWorkshareLowering(mlir::Operation *op); + } // namespace flangomp #endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 395178e26a5762..041240cad12eb3 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -37,4 +37,9 @@ def FunctionFiltering : Pass<"omp-function-filtering"> { ]; } +// Needs to be scheduled on Module as we create functions in it +def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> { + let summary = "Lower workshare construct"; +} + #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 1881e23b00045a..bb00e079008a0b 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -337,7 +337,7 @@ inline void createDefaultFIROptimizerPassPipeline( /// \param optLevel - optimization level used for creating FIR optimization /// passes pipeline inline void createHLFIRToFIRPassPipeline( -mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel) { +mlir::PassManager &pm, bool enableOpenMP, llvm::OptimizationLevel optLevel = defaultOptLevel) { if (optLevel.isOptimizingForSpeed()) { addCanonicalizerPassWithoutRegionSimplification(pm); addNestedPassToAllTopLevelOperations( @@ -354,6 +354,8 @@ inline void createHLFIRToFIRPassPipeline( pm.addPass(hlfir::createLowerHLFIRIntrinsics()); pm.addPass(hlfir::createBufferizeHLFIR()); pm.addPass(hlfir::createConvertHLFIRtoFIR()); + if (enableOpenMP) +pm.a
[llvm-branch-commits] [flang] [WIP][flang] Introduce HLFIR lowerings to omp.workshare_loop_nest (PR #104748)
https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/104748 >From 4257950e7df8d7eaf92a1a7b02f89422007ffe6a Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Sat, 19 Oct 2024 23:32:27 +0900 Subject: [PATCH 1/7] Do not emit empty omp.single's --- flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 50 --- .../OpenMP/lower-workshare-no-single.mlir | 20 2 files changed, 52 insertions(+), 18 deletions(-) create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp index aa4371b3af6f7d..225c585a02d913 100644 --- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp +++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp @@ -239,11 +239,12 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion, return alloc; }; - auto moveToSingle = [&](SingleRegion sr, OpBuilder allocaBuilder, - OpBuilder singleBuilder, - OpBuilder parallelBuilder) -> SmallVector { + auto moveToSingle = + [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder, + OpBuilder parallelBuilder) -> std::pair> { IRMapping singleMapping = rootMapping; SmallVector copyPrivate; +bool allParallelized = true; for (Operation &op : llvm::make_range(sr.begin, sr.end)) { if (isSafeToParallelize(&op)) { @@ -267,6 +268,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion, assert(llvm::all_of(op.getResults(), [&](Value v) { return !isTransitivelyUsedOutside(v, sr); })); + allParallelized = false; } } else if (auto alloca = dyn_cast(&op)) { auto hoisted = @@ -274,6 +276,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion, rootMapping.map(&*alloca, &*hoisted); rootMapping.map(alloca.getResult(), hoisted.getResult()); copyPrivate.push_back(hoisted); +allParallelized = false; } else { singleBuilder.clone(op, singleMapping); // Prepare reloaded values for results of operations that cannot be @@ -286,10 +289,11 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion, copyPrivate.push_back(alloc); } } +allParallelized = false; } } singleBuilder.create(loc); -return copyPrivate; +return {allParallelized, copyPrivate}; }; for (Block &block : sourceRegion) { @@ -343,25 +347,35 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion, Block *parallelBlock = new Block(); parallelBuilder.setInsertionPointToStart(parallelBlock); -omp::SingleOperands singleOperands; -if (isLast) - singleOperands.nowait = rootBuilder.getUnitAttr(); -singleOperands.copyprivateVars = +auto [allParallelized, copyprivateVars] = moveToSingle(std::get(opOrSingle), allocaBuilder, singleBuilder, parallelBuilder); -cleanupBlock(singleBlock); -for (auto var : singleOperands.copyprivateVars) { - mlir::func::FuncOp funcOp = - createCopyFunc(loc, var.getType(), firCopyFuncBuilder); - singleOperands.copyprivateSyms.push_back(SymbolRefAttr::get(funcOp)); +if (allParallelized) { + // The single region was not required as all operations were safe to + // parallelize + assert(copyprivateVars.empty()); + assert(allocaBlock->empty()); + delete singleBlock; +} else { + omp::SingleOperands singleOperands; + if (isLast) +singleOperands.nowait = rootBuilder.getUnitAttr(); + singleOperands.copyprivateVars = copyprivateVars; + cleanupBlock(singleBlock); + for (auto var : singleOperands.copyprivateVars) { +mlir::func::FuncOp funcOp = +createCopyFunc(loc, var.getType(), firCopyFuncBuilder); +singleOperands.copyprivateSyms.push_back( 
+SymbolRefAttr::get(funcOp)); + } + omp::SingleOp singleOp = + rootBuilder.create(loc, singleOperands); + singleOp.getRegion().push_back(singleBlock); + targetRegion.front().getOperations().splice( + singleOp->getIterator(), allocaBlock->getOperations()); } -omp::SingleOp singleOp = -rootBuilder.create(loc, singleOperands); -singleOp.getRegion().push_back(singleBlock); rootBuilder.getInsertionBlock()->getOperations().splice( rootBuilder.getInsertionPoint(), parallelBlock->getOperations()); -targetRegion.front().getOperations().splice( -singleOp->getIterator(), allocaBlock->getOperations()); delete allocaBlock; de
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
@@ -896,5 +896,6 @@ void test_set_fpenv(unsigned long env) { __builtin_amdgcn_set_fpenv(env); } +// CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0} shiltian wrote: the upper bound is smaller than the lower bound? https://github.com/llvm/llvm-project/pull/113038 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
https://github.com/arsenm commented: Don't forget about AGPRs https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Mark grid size loads with range metadata (PR #113019)
https://github.com/shiltian approved this pull request. https://github.com/llvm/llvm-project/pull/113019 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) +return 0; + + return Reg; +} + +bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { + MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); arsenm wrote: getSubtarget and use the target specific instance https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
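A minimal sketch of what this suggestion amounts to, assuming the usual AMDGPU subtarget accessors; this is illustrative, not the final patch:

```cpp
// Query the target-specific subtarget once; it already owns the AMDGPU
// register bank info, so there is no need to go through the generic
// MachineFunction interface.
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const RegisterBankInfo &RBI = *ST.getRegBankInfo();
```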
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) arsenm wrote: These should have failed isPreISelOpcode https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, arsenm wrote: static. Also don't pass MachineIRBuilder by value https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) +return 0; + + return Reg; +} + +bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { + MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + MachineIRBuilder B(MF); + + // Assign register banks to ALL def registers on G_ instructions. + // Same for copies if they have no register bank or class on def. + for (MachineBasicBlock &MBB : MF) { +for (MachineInstr &MI : MBB) { + if (!shouldRBSelect(MI)) +continue; + + for (MachineOperand &DefOP : MI.defs()) { +Register DefReg = getVReg(DefOP); +if (!DefReg) + continue; + +// Copies can have register class on def registers. +if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) { + continue; +} + +if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) { + setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::SGPRRegBankID)); +} else { + if (MRI.getType(DefReg) == LLT::scalar(1)) +setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::VCCRegBankID)); + else +setRB(MI, DefOP, B, MRI, RBI.getRegBank(A
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; arsenm wrote: These should have failed isPreISelOpcode https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). +// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { arsenm wrote: What about wave32 https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
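A hedged sketch of a wave-size-aware check, assuming the function can reach the MachineFunction and its GCNSubtarget; the names follow common AMDGPU code, not this patch:

```cpp
// Compare against the lane-mask register that matches the wave size, so the
// wave32 case (EXEC_LO) is handled as well as wave64 (EXEC).
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
for (const MachineOperand &Op : MI->implicit_operands())
  if (Op.isReg() && Op.getReg() == ExecReg)
    return true;
return false;
```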
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). +// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { arsenm wrote: Early exit on non-copy and reduce indent https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
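For reference, the early-exit shape being asked for would look roughly like this; it is the same function as in the quoted patch, shown only as a sketch:

```cpp
static bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) {
  MachineInstr *MI = MRI.getVRegDef(Reg);
  // Bail out early on anything that is not a COPY; the loop below then stays
  // at a single indentation level.
  if (MI->getOpcode() != AMDGPU::COPY)
    return false;

  for (const MachineOperand &Op : MI->implicit_operands())
    if (Op.isReg() && Op.getReg() == AMDGPU::EXEC)
      return true;
  return false;
}
```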
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/113038 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] clang/AMDGPU: Emit grid size builtins with range metadata (PR #113038)
@@ -896,5 +896,6 @@ void test_set_fpenv(unsigned long env) { __builtin_amdgcn_set_fpenv(env); } +// CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0} arsenm wrote: Yes, this is how you are supposed to represent the wrapped set where the 0 value isn't allowed but the uint32_max is https://github.com/llvm/llvm-project/pull/113038 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
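To spell the semantics out: `!{i32 1, i32 0}` denotes the wrapping range [1, 0), which contains every 32-bit value except zero, including UINT32_MAX. A small standalone sketch using LLVM's ConstantRange shows the same interpretation:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include <cassert>

int main() {
  using namespace llvm;
  // The wrapped range [1, 0) over i32 excludes only 0 and still contains the
  // maximum 32-bit value.
  ConstantRange NonZero(APInt(32, 1), APInt::getZero(32));
  assert(!NonZero.contains(APInt(32, 0)));
  assert(NonZero.contains(APInt::getMaxValue(32)));
  return 0;
}
```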
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) +return 0; arsenm wrote: Use explicit Register() https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -69,3 +72,38 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, return std::pair(Reg, 0); } + +IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF) +: MRI(MF.getRegInfo()) { + initLaneMaskIntrinsics(MF); +} + +bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) { + return S32S64LaneMask.contains(Reg); +} + +void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) { + for (auto &MBB : MF) { +for (auto &MI : MBB) { + if (MI.getOpcode() == AMDGPU::G_INTRINSIC && + MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID() == + Intrinsic::amdgcn_if_break) { arsenm wrote: `cast(MI).getIntrinsicID() == Intrinsic::amdgcn_if_break` https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) +return 0; + + return Reg; +} + +bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { + MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + MachineIRBuilder B(MF); arsenm wrote: Should initialize the CSE info and other analyses https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
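A rough sketch of what initializing the builder with CSE info could look like, following the pattern other GlobalISel passes use. The analysis wiring (GISelCSEAnalysisWrapperPass in the pass's analysis usage, and the headers llvm/CodeGen/GlobalISel/CSEInfo.h and CSEMIRBuilder.h) is an assumption here, not code from the patch:

```cpp
// Inside runOnMachineFunction: fetch the CSE analysis and hand it to a
// CSE-aware builder before creating any instructions.
GISelCSEAnalysisWrapper &Wrapper =
    getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
GISelCSEInfo &CSEInfo = Wrapper.get(std::make_unique<CSEConfigFull>());
CSEMIRBuilder B(MF);
B.setCSEInfo(&CSEInfo);
```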
[llvm-branch-commits] [flang] [flang] Lower omp.workshare to other omp constructs (PR #101446)
ivanradanov wrote: I have rebased this on the latest main and also marked the follow-up https://github.com/llvm/llvm-project/pull/104748 as ready for review. That follow-up PR contains code and tests that are needed to fully exercise this implementation. I think this stack is currently in a good state to be merged. PRs 1/4 https://github.com/llvm/llvm-project/pull/101443, 2/4 https://github.com/llvm/llvm-project/pull/101444, and 3/4 https://github.com/llvm/llvm-project/pull/101445 are already approved and good to go, but 2/4 https://github.com/llvm/llvm-project/pull/101444 must be merged together with this PR, because otherwise it will result in compilation failures for `omp workshare`. Thus, it would be great if this PR could be reviewed as well so we can proceed with merging if it looks good. (The build failures are Windows-only, come from the main branch, and are not introduced by this change.) https://github.com/llvm/llvm-project/pull/101446 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [Coverage] Make additional counters available for BranchRegion. NFC. (PR #112730)
https://github.com/chapuni updated https://github.com/llvm/llvm-project/pull/112730 >From 5e460594c8a2550c38c759b2e6f1c5dc4152f820 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Thu, 17 Oct 2024 22:15:12 +0900 Subject: [PATCH 1/2] [Coverage] Make additional counters available for BranchRegion. NFC. `getBranchCounterPair()` allocates an additional Counter to SkipPath in `SingleByteCoverage`. `IsCounterEqual()` calculates the comparison with rewinding counter replacements. `NumRegionCounters` is updated to take additional counters in account. `incrementProfileCounter()` has a few additiona arguments. - `UseSkipPath=true`, to specify setting counters for SkipPath. It assumes `UseSkipPath=false` is used together. - `UseBoth` may be specified for marking another path. It introduces the same effect as issueing `markStmtAsUsed(!SkipPath, S)`. `llvm-cov` discovers counters in `FalseCount` to allocate `MaxCounterID` for empty profile data. --- clang/lib/CodeGen/CodeGenFunction.h | 8 - clang/lib/CodeGen/CodeGenPGO.cpp | 31 +-- clang/lib/CodeGen/CodeGenPGO.h| 1 + clang/lib/CodeGen/CoverageMappingGen.cpp | 31 ++- .../ProfileData/Coverage/CoverageMapping.cpp | 4 +++ 5 files changed, 65 insertions(+), 10 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 89ac3b342d0a7c..cb1192bf6e11fe 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -1629,11 +1629,17 @@ class CodeGenFunction : public CodeGenTypeCache { /// Increment the profiler's counter for the given statement by \p StepV. /// If \p StepV is null, the default increment is 1. void incrementProfileCounter(const Stmt *S, llvm::Value *StepV = nullptr) { +incrementProfileCounter(false, S, false, StepV); + } + + void incrementProfileCounter(bool UseSkipPath, const Stmt *S, + bool UseBoth = false, + llvm::Value *StepV = nullptr) { if (CGM.getCodeGenOpts().hasProfileClangInstr() && !CurFn->hasFnAttribute(llvm::Attribute::NoProfile) && !CurFn->hasFnAttribute(llvm::Attribute::SkipProfile)) { auto AL = ApplyDebugLocation::CreateArtificial(*this); - PGO.emitCounterSetOrIncrement(Builder, S, StepV); + PGO.emitCounterSetOrIncrement(Builder, S, UseSkipPath, UseBoth, StepV); } PGO.setCurrentStmt(S); } diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 069469e3de856b..aefd53e12088b4 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1138,6 +1138,19 @@ void CodeGenPGO::emitCounterRegionMapping(const Decl *D) { if (CoverageMapping.empty()) return; + // Scan max(FalseCnt) and update NumRegionCounters. 
+ unsigned MaxNumCounters = NumRegionCounters; + for (const auto [_, V] : *RegionCounterMap) { +auto HasCounters = V.getIsCounterPair(); +assert((!HasCounters.first || +MaxNumCounters > (V.first & CounterPair::Mask)) && + "TrueCnt should not be reassigned"); +if (HasCounters.second) + MaxNumCounters = + std::max(MaxNumCounters, (V.second & CounterPair::Mask) + 1); + } + NumRegionCounters = MaxNumCounters; + CGM.getCoverageMapping()->addFunctionMappingRecord( FuncNameVar, FuncName, FunctionHash, CoverageMapping); } @@ -1193,11 +1206,25 @@ std::pair CodeGenPGO::getIsCounterPair(const Stmt *S) const { } void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S, + bool UseSkipPath, bool UseBoth, llvm::Value *StepV) { - if (!RegionCounterMap || !Builder.GetInsertBlock()) + if (!RegionCounterMap) return; - unsigned Counter = (*RegionCounterMap)[S].first; + unsigned Counter; + auto &TheMap = (*RegionCounterMap)[S]; + auto IsCounter = TheMap.getIsCounterPair(); + if (!UseSkipPath) { +assert(IsCounter.first); +Counter = (TheMap.first & CounterPair::Mask); + } else { +if (!IsCounter.second) + return; +Counter = (TheMap.second & CounterPair::Mask); + } + + if (!Builder.GetInsertBlock()) +return; // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling diff --git a/clang/lib/CodeGen/CodeGenPGO.h b/clang/lib/CodeGen/CodeGenPGO.h index 83f35785e5327d..8b769dd88d7f1e 100644 --- a/clang/lib/CodeGen/CodeGenPGO.h +++ b/clang/lib/CodeGen/CodeGenPGO.h @@ -112,6 +112,7 @@ class CodeGenPGO { public: std::pair getIsCounterPair(const Stmt *S) const; void emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S, + bool UseFalsePath, bool UseBoth, llvm::Value *StepV); void emitMCDCTestVectorBitmapUpdate(CGBuilderTy &Builder, const Expr *S,
[llvm-branch-commits] [clang] release/19.x: [clang] Make LazyOffsetPtr more portable (#112927) (PR #113052)
llvmbot wrote: @zygoloid What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/113052 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits