[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -306,4 +307,254 @@ TEST_F(ParseHLSLRootSignatureTest, InvalidLexRegNumberTest) { ASSERT_FALSE(Consumer->IsSatisfied()); } +// Valid Parser Tests + +TEST_F(ParseHLSLRootSignatureTest, ValidParseEmptyTest) { + const llvm::StringLiteral Source = R"cc()cc"; + + TrivialModuleLoader ModLoader; + auto PP = CreatePP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc, *PP); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, Diags); + + // Test no diagnostics produced + Consumer->SetNoDiag(); + + ASSERT_FALSE(Parser.Parse()); + ASSERT_EQ((int)Elements.size(), 0); bogner wrote: Usually better to convert the literal rather than the value under test in these things. Something like `ASSERT_EQ(Elements.size(), 0u)` https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -93,6 +96,108 @@ class RootSignatureLexer { } }; +class RootSignatureParser { +public: + RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, DiagnosticsEngine &Diags); + + /// Iterates over the provided tokens and constructs the in-memory + /// representations of the RootElements. + /// + /// The return value denotes if there was a failure and the method will + /// return on the first encountered failure, or, return false if it + /// can sucessfully reach the end of the tokens. + bool Parse(); + +private: + // Root Element helpers + bool ParseRootElement(); + bool ParseDescriptorTable(); + bool ParseDescriptorTableClause(); + + /// Helper dispatch method + /// + /// These will switch on the Variant kind to dispatch to the respective Parse + /// method and store the parsed value back into Ref. + /// + /// It is helpful to have a generalized dispatch method so that when we need + /// to parse multiple optional parameters in any order, we can invoke this + /// method + bool ParseParam(llvm::hlsl::rootsig::ParamType Ref); + + // Parse as many optional parameters as possible in any order + bool ParseOptionalParams( + llvm::SmallDenseMap &RefMap); + + // Common parsing helpers + bool ParseRegister(llvm::hlsl::rootsig::Register *Reg); + bool ParseUInt(uint32_t *X); + bool + ParseDescriptorRangeOffset(llvm::hlsl::rootsig::DescriptorRangeOffset *X); + + // Various flags/enum parsing helpers + template + bool ParseEnum(llvm::SmallDenseMap &EnumMap, + EnumType *Enum); + template + bool ParseFlags(llvm::SmallDenseMap &EnumMap, + FlagType *Enum); + bool + ParseDescriptorRangeFlags(llvm::hlsl::rootsig::DescriptorRangeFlags *Enum); + bool ParseShaderVisibility(llvm::hlsl::rootsig::ShaderVisibility *Enum); + + /// Invoke the lexer to consume a token and update CurToken with the result + /// + /// Return value denotes if we were already at the last token. 
+ /// + /// This is used to avoid having to constantly access the Lexer's CurToken + bool ConsumeNextToken() { +if (Lexer.ConsumeToken()) + return true; // Report lexer err bogner wrote: This comment is confusing. Are you saying the caller should report an error, or that we should modify this function to do so? https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -93,6 +96,108 @@ class RootSignatureLexer { } }; +class RootSignatureParser { +public: + RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, DiagnosticsEngine &Diags); + + /// Iterates over the provided tokens and constructs the in-memory + /// representations of the RootElements. + /// + /// The return value denotes if there was a failure and the method will + /// return on the first encountered failure, or, return false if it + /// can sucessfully reach the end of the tokens. + bool Parse(); + +private: + // Root Element helpers + bool ParseRootElement(); + bool ParseDescriptorTable(); + bool ParseDescriptorTableClause(); + + /// Helper dispatch method + /// + /// These will switch on the Variant kind to dispatch to the respective Parse + /// method and store the parsed value back into Ref. + /// + /// It is helpful to have a generalized dispatch method so that when we need + /// to parse multiple optional parameters in any order, we can invoke this + /// method + bool ParseParam(llvm::hlsl::rootsig::ParamType Ref); + + // Parse as many optional parameters as possible in any order + bool ParseOptionalParams( + llvm::SmallDenseMap &RefMap); + + // Common parsing helpers bogner wrote: Doxygen has some syntax to group things together - look for the `///@{` syntax in https://www.doxygen.nl/manual/grouping.html - these might be worth playing with for trying to make comments like this one more useful. https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -93,6 +96,108 @@ class RootSignatureLexer { } }; +class RootSignatureParser { +public: + RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, DiagnosticsEngine &Diags); + + /// Iterates over the provided tokens and constructs the in-memory + /// representations of the RootElements. + /// + /// The return value denotes if there was a failure and the method will + /// return on the first encountered failure, or, return false if it + /// can sucessfully reach the end of the tokens. + bool Parse(); + +private: + // Root Element helpers + bool ParseRootElement(); + bool ParseDescriptorTable(); + bool ParseDescriptorTableClause(); + + /// Helper dispatch method + /// + /// These will switch on the Variant kind to dispatch to the respective Parse + /// method and store the parsed value back into Ref. + /// + /// It is helpful to have a generalized dispatch method so that when we need + /// to parse multiple optional parameters in any order, we can invoke this + /// method + bool ParseParam(llvm::hlsl::rootsig::ParamType Ref); + + // Parse as many optional parameters as possible in any order + bool ParseOptionalParams( + llvm::SmallDenseMap &RefMap); + + // Common parsing helpers + bool ParseRegister(llvm::hlsl::rootsig::Register *Reg); + bool ParseUInt(uint32_t *X); + bool + ParseDescriptorRangeOffset(llvm::hlsl::rootsig::DescriptorRangeOffset *X); + + // Various flags/enum parsing helpers + template + bool ParseEnum(llvm::SmallDenseMap &EnumMap, + EnumType *Enum); + template + bool ParseFlags(llvm::SmallDenseMap &EnumMap, + FlagType *Enum); + bool + ParseDescriptorRangeFlags(llvm::hlsl::rootsig::DescriptorRangeFlags *Enum); + bool ParseShaderVisibility(llvm::hlsl::rootsig::ShaderVisibility *Enum); + + /// Invoke the lexer to consume a token and update CurToken with the result + /// + /// Return value denotes if we were already at the last token. 
bogner wrote: "returns true if X" is generally more helpful documentation than "return value denotes such and such" https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -93,6 +96,108 @@ class RootSignatureLexer { } }; +class RootSignatureParser { +public: + RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, DiagnosticsEngine &Diags); + + /// Iterates over the provided tokens and constructs the in-memory + /// representations of the RootElements. + /// + /// The return value denotes if there was a failure and the method will + /// return on the first encountered failure, or, return false if it + /// can sucessfully reach the end of the tokens. + bool Parse(); + +private: + // Root Element helpers + bool ParseRootElement(); + bool ParseDescriptorTable(); + bool ParseDescriptorTableClause(); + + /// Helper dispatch method + /// + /// These will switch on the Variant kind to dispatch to the respective Parse + /// method and store the parsed value back into Ref. + /// + /// It is helpful to have a generalized dispatch method so that when we need + /// to parse multiple optional parameters in any order, we can invoke this + /// method + bool ParseParam(llvm::hlsl::rootsig::ParamType Ref); + + // Parse as many optional parameters as possible in any order + bool ParseOptionalParams( + llvm::SmallDenseMap &RefMap); + + // Common parsing helpers + bool ParseRegister(llvm::hlsl::rootsig::Register *Reg); + bool ParseUInt(uint32_t *X); + bool + ParseDescriptorRangeOffset(llvm::hlsl::rootsig::DescriptorRangeOffset *X); + + // Various flags/enum parsing helpers + template + bool ParseEnum(llvm::SmallDenseMap &EnumMap, + EnumType *Enum); + template + bool ParseFlags(llvm::SmallDenseMap &EnumMap, + FlagType *Enum); + bool + ParseDescriptorRangeFlags(llvm::hlsl::rootsig::DescriptorRangeFlags *Enum); + bool ParseShaderVisibility(llvm::hlsl::rootsig::ShaderVisibility *Enum); + + /// Invoke the lexer to consume a token and update CurToken with the result + /// + /// Return value denotes if we were already at the last token. 
+ /// + /// This is used to avoid having to constantly access the Lexer's CurToken bogner wrote: This comment doesn't seem terribly necessary - the use of the function is fairly clear from the explanation of what it does. https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -1,8 +1,65 @@ #include "clang/Parse/ParseHLSLRootSignature.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm::hlsl::rootsig; + namespace clang { namespace hlsl { +// Helper definitions bogner wrote: This comment isn't very helpful. https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -162,5 +219,371 @@ std::optional RootSignatureLexer::PeekNextToken() { return Result; } +// Parser Definitions + +RootSignatureParser::RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, + DiagnosticsEngine &Diags) +: Elements(Elements), Lexer(Lexer), Diags(Diags) {} + +bool RootSignatureParser::Parse() { + // Handle edge-case of empty RootSignature() bogner wrote: "A completely empty root signature is invalid" https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -162,5 +219,371 @@ std::optional RootSignatureLexer::PeekNextToken() { return Result; } +// Parser Definitions + +RootSignatureParser::RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, + DiagnosticsEngine &Diags) +: Elements(Elements), Lexer(Lexer), Diags(Diags) {} + +bool RootSignatureParser::Parse() { + // Handle edge-case of empty RootSignature() + if (Lexer.EndOfBuffer()) +return false; + + // Iterate as many RootElements as possible + while (!ParseRootElement()) { +if (Lexer.EndOfBuffer()) + return false; +if (ConsumeExpectedToken(TokenKind::pu_comma)) + return true; + } + + return true; +} + +bool RootSignatureParser::ParseRootElement() { + if (ConsumeExpectedToken(TokenKind::kw_DescriptorTable)) +return true; + + // Dispatch onto the correct parse method + switch (CurToken.Kind) { + case TokenKind::kw_DescriptorTable: +return ParseDescriptorTable(); + default: +llvm_unreachable("Switch for an expected token was not provided"); + } + return true; bogner wrote: This is actually unreachable - we either return or call `llvm_unreachable` before this. Probably better to use `llvm_unreachable` rather than return a value. https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -162,5 +219,371 @@ std::optional RootSignatureLexer::PeekNextToken() { return Result; } +// Parser Definitions + +RootSignatureParser::RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, + DiagnosticsEngine &Diags) +: Elements(Elements), Lexer(Lexer), Diags(Diags) {} + +bool RootSignatureParser::Parse() { + // Handle edge-case of empty RootSignature() + if (Lexer.EndOfBuffer()) +return false; + + // Iterate as many RootElements as possible + while (!ParseRootElement()) { +if (Lexer.EndOfBuffer()) + return false; +if (ConsumeExpectedToken(TokenKind::pu_comma)) + return true; + } + + return true; +} + +bool RootSignatureParser::ParseRootElement() { + if (ConsumeExpectedToken(TokenKind::kw_DescriptorTable)) +return true; + + // Dispatch onto the correct parse method + switch (CurToken.Kind) { + case TokenKind::kw_DescriptorTable: +return ParseDescriptorTable(); + default: +llvm_unreachable("Switch for an expected token was not provided"); bogner wrote: An `llvm_unreachable` under a `default:` label always makes me nervous. Ideally, we could have a fully covered switch here, which would mean that if we add anything to `TokenKind` we would get warnings if we forget to update this. Is this something we could do here? https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [HLSL][RootSignature] Implement Parsing of Descriptor Tables (PR #122982)
@@ -93,6 +96,108 @@ class RootSignatureLexer { } }; +class RootSignatureParser { +public: + RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, DiagnosticsEngine &Diags); + + /// Iterates over the provided tokens and constructs the in-memory + /// representations of the RootElements. + /// + /// The return value denotes if there was a failure and the method will + /// return on the first encountered failure, or, return false if it + /// can sucessfully reach the end of the tokens. + bool Parse(); + +private: + // Root Element helpers + bool ParseRootElement(); + bool ParseDescriptorTable(); + bool ParseDescriptorTableClause(); + + /// Helper dispatch method + /// + /// These will switch on the Variant kind to dispatch to the respective Parse + /// method and store the parsed value back into Ref. + /// + /// It is helpful to have a generalized dispatch method so that when we need + /// to parse multiple optional parameters in any order, we can invoke this + /// method bogner wrote: Use more verbs. This comment has a lot of text that explains why we might want to use an overloaded method here, but it doesn't really say what this method *does*. https://github.com/llvm/llvm-project/pull/122982 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/20.x: [clang-format] Fix a crash on parsing requires clause (#125021) (PR #126843)
https://github.com/llvmbot updated https://github.com/llvm/llvm-project/pull/126843 >From a7f00c8ed8c20a006fd7e592e99f3ca7b725b07e Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 30 Jan 2025 18:03:04 -0800 Subject: [PATCH] [clang-format] Fix a crash on parsing requires clause (#125021) Fixes #124921. (cherry picked from commit 14178deab0334d9ce095ae7adce408868659faee) --- clang/lib/Format/UnwrappedLineParser.cpp | 2 +- clang/unittests/Format/FormatTest.cpp| 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 906fc11a07d5e..120922d271aab 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -3632,7 +3632,7 @@ void UnwrappedLineParser::parseRequiresClause(FormatToken *RequiresToken) { // It could be inlined into here. parseConstraintExpression(); - if (!InRequiresExpression) + if (!InRequiresExpression && FormatTok->Previous) FormatTok->Previous->ClosesRequiresClause = true; } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 57f12221cdc7e..30f3533ac73f7 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -26478,6 +26478,9 @@ TEST_F(FormatTest, RequiresClauses) { "foo();\n" "#endif\n" "bar(requires);"); + + verifyNoCrash("template \n" +"requires(requires { std::declval()"); } TEST_F(FormatTest, RequiresExpressionIndentation) { ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] a7f00c8 - [clang-format] Fix a crash on parsing requires clause (#125021)
Author: Owen Pan Date: 2025-02-13T18:38:24-08:00 New Revision: a7f00c8ed8c20a006fd7e592e99f3ca7b725b07e URL: https://github.com/llvm/llvm-project/commit/a7f00c8ed8c20a006fd7e592e99f3ca7b725b07e DIFF: https://github.com/llvm/llvm-project/commit/a7f00c8ed8c20a006fd7e592e99f3ca7b725b07e.diff LOG: [clang-format] Fix a crash on parsing requires clause (#125021) Fixes #124921. (cherry picked from commit 14178deab0334d9ce095ae7adce408868659faee) Added: Modified: clang/lib/Format/UnwrappedLineParser.cpp clang/unittests/Format/FormatTest.cpp Removed: diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 906fc11a07d5e..120922d271aab 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -3632,7 +3632,7 @@ void UnwrappedLineParser::parseRequiresClause(FormatToken *RequiresToken) { // It could be inlined into here. parseConstraintExpression(); - if (!InRequiresExpression) + if (!InRequiresExpression && FormatTok->Previous) FormatTok->Previous->ClosesRequiresClause = true; } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 57f12221cdc7e..30f3533ac73f7 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -26478,6 +26478,9 @@ TEST_F(FormatTest, RequiresClauses) { "foo();\n" "#endif\n" "bar(requires);"); + + verifyNoCrash("template \n" +"requires(requires { std::declval()"); } TEST_F(FormatTest, RequiresExpressionIndentation) { ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/20.x: [clang-format] Fix a crash on parsing requires clause (#125021) (PR #126843)
github-actions[bot] wrote: @owenca (or anyone else): if you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/126843 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/20.x: [clang-format] Fix a crash on parsing requires clause (#125021) (PR #126843)
https://github.com/tstellar closed https://github.com/llvm/llvm-project/pull/126843 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port RegAllocPriorityAdvisor analysis to NPM (PR #118462)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/118462 >From ed461217c0323fa1d172b42e81e20900653758c9 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Tue, 3 Dec 2024 10:12:36 + Subject: [PATCH 1/9] [CodeGen][NewPM] Port RegAllocPriorityAdvisor analysis to NPM --- .../llvm}/CodeGen/RegAllocPriorityAdvisor.h | 78 +++- llvm/include/llvm/InitializePasses.h | 2 +- .../llvm/Passes/MachinePassRegistry.def | 1 + llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 6 +- .../lib/CodeGen/MLRegAllocPriorityAdvisor.cpp | 184 +++--- llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp | 2 +- llvm/lib/CodeGen/RegAllocGreedy.cpp | 9 +- llvm/lib/CodeGen/RegAllocGreedy.h | 2 +- llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp | 155 +++ llvm/lib/Passes/PassBuilder.cpp | 1 + 10 files changed, 320 insertions(+), 120 deletions(-) rename llvm/{lib => include/llvm}/CodeGen/RegAllocPriorityAdvisor.h (57%) diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h b/llvm/include/llvm/CodeGen/RegAllocPriorityAdvisor.h similarity index 57% rename from llvm/lib/CodeGen/RegAllocPriorityAdvisor.h rename to llvm/include/llvm/CodeGen/RegAllocPriorityAdvisor.h index 0758743c2b140..a53739fdc3fc4 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h +++ b/llvm/include/llvm/CodeGen/RegAllocPriorityAdvisor.h @@ -9,8 +9,10 @@ #ifndef LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H #define LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/IR/PassManager.h" #include "llvm/Pass.h" namespace llvm { @@ -68,12 +70,72 @@ class DummyPriorityAdvisor : public RegAllocPriorityAdvisor { unsigned getPriority(const LiveInterval &LI) const override; }; -class RegAllocPriorityAdvisorAnalysis : public ImmutablePass { +/// Common provider for getting the priority advisor and logging rewards. +/// Legacy analysis forwards all calls to this provider. 
+/// New analysis serves the provider as the analysis result. +/// Expensive setup is done in the constructor, so that the advisor can be +/// created quickly for every machine function. +/// TODO: Remove once legacy PM support is dropped. +class RegAllocPriorityAdvisorProvider { public: enum class AdvisorMode : int { Default, Release, Development, Dummy }; - RegAllocPriorityAdvisorAnalysis(AdvisorMode Mode) - : ImmutablePass(ID), Mode(Mode){}; + RegAllocPriorityAdvisorProvider(AdvisorMode Mode) : Mode(Mode) {} + + virtual ~RegAllocPriorityAdvisorProvider() = default; + + virtual void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) {}; + + virtual std::unique_ptr + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0; + + void setAnalyses(SlotIndexes *SI) { this->SI = SI; } + + AdvisorMode getAdvisorMode() const { return Mode; } + +protected: + SlotIndexes *SI; + +private: + const AdvisorMode Mode; +}; + +RegAllocPriorityAdvisorProvider *createReleaseModePriorityAdvisorProvider(); + +RegAllocPriorityAdvisorProvider * +createDevelopmentModePriorityAdvisorProvider(LLVMContext &Ctx); + +class RegAllocPriorityAdvisorAnalysis +: public AnalysisInfoMixin { + static AnalysisKey Key; + friend AnalysisInfoMixin; + +public: + struct Result { +// Owned by this analysis. 
+RegAllocPriorityAdvisorProvider *Provider; + +bool invalidate(MachineFunction &MF, const PreservedAnalyses &PA, +MachineFunctionAnalysisManager::Invalidator &Inv) { + auto PAC = PA.getChecker(); + return !PAC.preservedWhenStateless() || + Inv.invalidate(MF, PA); +} + }; + + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); + +private: + void initializeProvider(LLVMContext &Ctx); + std::unique_ptr Provider; +}; + +class RegAllocPriorityAdvisorAnalysisLegacy : public ImmutablePass { +public: + using AdvisorMode = RegAllocPriorityAdvisorProvider::AdvisorMode; + RegAllocPriorityAdvisorAnalysisLegacy(AdvisorMode Mode) + : ImmutablePass(ID), Mode(Mode) {}; static char ID; /// Get an advisor for the given context (i.e. machine function, etc) @@ -81,7 +143,7 @@ class RegAllocPriorityAdvisorAnalysis : public ImmutablePass { getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0; AdvisorMode getAdvisorMode() const { return Mode; } virtual void logRewardIfNeeded(const MachineFunction &MF, - llvm::function_ref GetReward){}; + llvm::function_ref GetReward) {}; protected: // This analysis preserves everything, and subclasses may have additional @@ -97,11 +159,13 @@ class RegAllocPriorityAdvisorAnalysis : public ImmutablePass { /// Specialization for the API used by the analysis infrastructure to create /// an instance
[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port RegAllocGreedy to NPM (PR #119540)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/119540 >From b407c8640b5936bcd6824bd0ab550243ae6a7f3b Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Wed, 11 Dec 2024 08:51:55 + Subject: [PATCH 1/7] [CodeGen][NewPM] Port RegAllocGreedy to NPM --- llvm/include/llvm/CodeGen/MachineFunction.h | 1 + llvm/include/llvm/CodeGen/Passes.h| 2 +- llvm/include/llvm/InitializePasses.h | 2 +- .../llvm/Passes/MachinePassRegistry.def | 9 + llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/RegAllocGreedy.cpp | 185 ++ llvm/lib/CodeGen/RegAllocGreedy.h | 57 +++--- llvm/lib/Passes/PassBuilder.cpp | 1 + 8 files changed, 196 insertions(+), 63 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index f1e595cde54e3..7fd0994883fe8 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -927,6 +927,7 @@ class LLVM_ABI MachineFunction { /// Run the current MachineFunction through the machine code verifier, useful /// for debugger use. + /// TODO: Add the param LiveStks /// \returns true if no problems were found. bool verify(LiveIntervals *LiveInts, SlotIndexes *Indexes, const char *Banner = nullptr, raw_ostream *OS = nullptr, diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index b5d2a7e6bf035..0182f21bee5f5 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -171,7 +171,7 @@ namespace llvm { extern char &LiveRangeShrinkID; /// Greedy register allocator. - extern char &RAGreedyID; + extern char &RAGreedyLegacyID; /// Basic register allocator. 
extern char &RABasicID; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 55a0051eb13fb..c0e839b3467d1 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -250,7 +250,7 @@ void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &); void initializePromoteLegacyPassPass(PassRegistry &); void initializeRABasicPass(PassRegistry &); void initializePseudoProbeInserterPass(PassRegistry &); -void initializeRAGreedyPass(PassRegistry &); +void initializeRAGreedyLegacyPass(PassRegistry &); void initializeReachingDefAnalysisPass(PassRegistry &); void initializeReassociateLegacyPassPass(PassRegistry &); void initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 123e5ae92bc23..6535d4a2c0438 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -192,6 +192,15 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS( return parseRegAllocFastPassOptions(*PB, Params); }, "filter=reg-filter;no-clear-vregs") + +MACHINE_FUNCTION_PASS_WITH_PARAMS( +"regallocgreedy", "RAGreedy", +[](RegAllocFilterFunc F) { return RAGreedyPass(F); }, +[PB = this](StringRef Params) { + // TODO: parseRegAllocFilter(*PB, Params); + return Expected(nullptr); +}, "" +) #undef MACHINE_FUNCTION_PASS_WITH_PARAMS // After a pass is converted to new pass manager, its entry should be moved from diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index d69a24f00871e..a34285ff3e935 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -112,7 +112,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeProcessImplicitDefsPass(Registry); initializeRABasicPass(Registry); - initializeRAGreedyPass(Registry); + 
initializeRAGreedyLegacyPass(Registry); initializeRegAllocFastPass(Registry); initializeRegUsageInfoCollectorLegacyPass(Registry); initializeRegUsageInfoPropagationLegacyPass(Registry); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index ac9b1795e1ec9..f5afbcd39f1fa 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -43,8 +43,10 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegAllocEvictionAdvisor.h" +#include "llvm/CodeGen/RegAllocGreedyPass.h" #include "llvm/CodeGen/RegAllocPriorityAdvisor.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" @@ -55,6 +57,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Debug
[llvm-branch-commits] [llvm] [RegAlloc][NewPM] Plug Greedy RA in codegen pipeline (PR #120557)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/120557 >From 6b013984ddb49a7c60a6eb60db33aa73f9e55c4d Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Tue, 11 Feb 2025 12:36:40 + Subject: [PATCH 1/2] [CodeGen][NewPM] Plug greedy RA in codegen pipeline --- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 51 ++- .../llvm/Passes/MachinePassRegistry.def | 4 +- .../include/llvm/Target/CGPassBuilderOption.h | 4 +- llvm/lib/Passes/PassBuilder.cpp | 14 + ...plicit-def-remat-requires-impdef-check.mir | 1 + ...implicit-def-with-impdef-greedy-assert.mir | 1 + llvm/test/CodeGen/AArch64/pr51516.mir | 1 + llvm/test/CodeGen/AArch64/spill-fold.mir | 2 + llvm/test/CodeGen/MIR/Generic/runPass.mir | 1 + .../SystemZ/clear-liverange-spillreg.mir | 1 + llvm/test/CodeGen/Thumb/high-reg-clobber.mir | 1 + llvm/test/CodeGen/X86/limit-split-cost.mir| 1 + llvm/tools/llc/NewPMDriver.cpp| 15 -- 13 files changed, 75 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index a1e81f4b7375a..31120b0b43485 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -1061,7 +1061,9 @@ void CodeGenPassBuilder::addMachineSSAOptimization( /// /// A target that uses the standard regalloc pass order for fast or optimized /// allocation may still override this for per-target regalloc -/// selection. But -regalloc=... always takes precedence. +/// selection. But -regalloc-npm=... always takes precedence. +/// If a target does not want to allow users to set -regalloc-npm=... at all, +/// check if Opt.RegAlloc == RegAllocType::Unset. template void CodeGenPassBuilder::addTargetRegisterAllocator( AddMachinePass &addPass, bool Optimized) const { @@ -1074,10 +1076,29 @@ void CodeGenPassBuilder::addTargetRegisterAllocator( /// Find and instantiate the register allocation pass requested by this target /// at the current optimization level. 
Different register allocators are /// defined as separate passes because they may require different analysis. +/// +/// This helper ensures that the -regalloc-npm= option is always available, +/// even for targets that override the default allocator. template void CodeGenPassBuilder::addRegAllocPass( AddMachinePass &addPass, bool Optimized) const { - // TODO: Parse Opt.RegAlloc to add register allocator. + // Use the specified -regalloc-npm={basic|greedy|fast|pbqp} + if (Opt.RegAlloc > RegAllocType::Default) { +switch (Opt.RegAlloc) { +case RegAllocType::Fast: + addPass(RegAllocFastPass()); + break; +case RegAllocType::Greedy: + addPass(RAGreedyPass()); + break; +default: + report_fatal_error("register allocator not supported yet.", false); +} +return; + } + // -regalloc=default or unspecified, so pick based on the optimization level + // or ask the target for the regalloc pass. + derived().addTargetRegisterAllocator(addPass, Optimized); } template @@ -1148,20 +1169,22 @@ void CodeGenPassBuilder::addOptimizedRegAlloc( // PreRA instruction scheduling. addPass(MachineSchedulerPass()); - if (derived().addRegAssignmentOptimized(addPass)) { -// Allow targets to expand pseudo instructions depending on the choice of -// registers before MachineCopyPropagation. -derived().addPostRewrite(addPass); + if (auto E = derived().addRegAssignmentOptimized(addPass)) { +// addRegAssignmentOptimized did not add a reg alloc pass, so do nothing. +return; + } + // Allow targets to expand pseudo instructions depending on the choice of + // registers before MachineCopyPropagation. + derived().addPostRewrite(addPass); -// Copy propagate to forward register uses and try to eliminate COPYs that -// were not coalesced. -addPass(MachineCopyPropagationPass()); + // Copy propagate to forward register uses and try to eliminate COPYs that + // were not coalesced. + addPass(MachineCopyPropagationPass()); -// Run post-ra machine LICM to hoist reloads / remats. 
-// -// FIXME: can this move into MachineLateOptimization? -addPass(MachineLICMPass()); - } + // Run post-ra machine LICM to hoist reloads / remats. + // + // FIXME: can this move into MachineLateOptimization? + addPass(MachineLICMPass()); } //===-===// diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 9e2d9e071e85d..3656e5db9fb83 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -193,12 +193,12 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS( }, "filter=reg-filter;no-clear-vregs") +// 'all' is the default filter MACHINE_FUNCTION_PASS_WITH_PARAMS(
[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port RegAllocGreedy to NPM (PR #119540)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/119540 >From b407c8640b5936bcd6824bd0ab550243ae6a7f3b Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Wed, 11 Dec 2024 08:51:55 + Subject: [PATCH 1/7] [CodeGen][NewPM] Port RegAllocGreedy to NPM --- llvm/include/llvm/CodeGen/MachineFunction.h | 1 + llvm/include/llvm/CodeGen/Passes.h| 2 +- llvm/include/llvm/InitializePasses.h | 2 +- .../llvm/Passes/MachinePassRegistry.def | 9 + llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/RegAllocGreedy.cpp | 185 ++ llvm/lib/CodeGen/RegAllocGreedy.h | 57 +++--- llvm/lib/Passes/PassBuilder.cpp | 1 + 8 files changed, 196 insertions(+), 63 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index f1e595cde54e3..7fd0994883fe8 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -927,6 +927,7 @@ class LLVM_ABI MachineFunction { /// Run the current MachineFunction through the machine code verifier, useful /// for debugger use. + /// TODO: Add the param LiveStks /// \returns true if no problems were found. bool verify(LiveIntervals *LiveInts, SlotIndexes *Indexes, const char *Banner = nullptr, raw_ostream *OS = nullptr, diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index b5d2a7e6bf035..0182f21bee5f5 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -171,7 +171,7 @@ namespace llvm { extern char &LiveRangeShrinkID; /// Greedy register allocator. - extern char &RAGreedyID; + extern char &RAGreedyLegacyID; /// Basic register allocator. 
extern char &RABasicID; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 55a0051eb13fb..c0e839b3467d1 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -250,7 +250,7 @@ void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &); void initializePromoteLegacyPassPass(PassRegistry &); void initializeRABasicPass(PassRegistry &); void initializePseudoProbeInserterPass(PassRegistry &); -void initializeRAGreedyPass(PassRegistry &); +void initializeRAGreedyLegacyPass(PassRegistry &); void initializeReachingDefAnalysisPass(PassRegistry &); void initializeReassociateLegacyPassPass(PassRegistry &); void initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 123e5ae92bc23..6535d4a2c0438 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -192,6 +192,15 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS( return parseRegAllocFastPassOptions(*PB, Params); }, "filter=reg-filter;no-clear-vregs") + +MACHINE_FUNCTION_PASS_WITH_PARAMS( +"regallocgreedy", "RAGreedy", +[](RegAllocFilterFunc F) { return RAGreedyPass(F); }, +[PB = this](StringRef Params) { + // TODO: parseRegAllocFilter(*PB, Params); + return Expected(nullptr); +}, "" +) #undef MACHINE_FUNCTION_PASS_WITH_PARAMS // After a pass is converted to new pass manager, its entry should be moved from diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index d69a24f00871e..a34285ff3e935 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -112,7 +112,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeProcessImplicitDefsPass(Registry); initializeRABasicPass(Registry); - initializeRAGreedyPass(Registry); + 
initializeRAGreedyLegacyPass(Registry); initializeRegAllocFastPass(Registry); initializeRegUsageInfoCollectorLegacyPass(Registry); initializeRegUsageInfoPropagationLegacyPass(Registry); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index ac9b1795e1ec9..f5afbcd39f1fa 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -43,8 +43,10 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegAllocEvictionAdvisor.h" +#include "llvm/CodeGen/RegAllocGreedyPass.h" #include "llvm/CodeGen/RegAllocPriorityAdvisor.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" @@ -55,6 +57,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Debug
[llvm-branch-commits] [llvm] [AMDGPU][NewPM] Port SIOptimizeExecMaskingPreRA to NPM (PR #125351)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/125351 >From 4bf4fe28a25a1ac7e216b4dcd66da210114a5482 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Sat, 1 Feb 2025 18:21:24 + Subject: [PATCH 1/2] [AMDGPU][NewPM] Port SIOptimizeExecMaskingPreRA to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 2 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +- .../AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 42 ++- .../AMDGPU/SIOptimizeExecMaskingPreRA.h | 24 +++ .../CodeGen/AMDGPU/collapse-endcf-broken.mir | 1 + ...ask-pre-ra-non-empty-but-used-interval.mir | 1 + 7 files changed, 62 insertions(+), 13 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 67bad5884c260..59839fd54d014 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -368,7 +368,7 @@ struct AMDGPUUnifyMetadataPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; -void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&); +void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &); extern char &SIOptimizeExecMaskingPreRAID; void initializeSIOptimizeVGPRLiveRangeLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index d9d97928062f5..520f1a4282cc8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -110,6 +110,7 @@ MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) MACHINE_FUNCTION_PASS("si-lower-wwm-copies", SILowerWWMCopiesPass()) MACHINE_FUNCTION_PASS("si-opt-vgpr-liverange", SIOptimizeVGPRLiveRangePass()) MACHINE_FUNCTION_PASS("si-optimize-exec-masking", SIOptimizeExecMaskingPass()) +MACHINE_FUNCTION_PASS("si-optimize-exec-masking-pre-ra", SIOptimizeExecMaskingPreRAPass()) 
MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass()) MACHINE_FUNCTION_PASS("si-pre-allocate-wwm-regs", SIPreAllocateWWMRegsPass()) MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass()) @@ -128,7 +129,6 @@ DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-memory-legalizer", SIMemoryLegalizerPass()) DUMMY_MACHINE_FUNCTION_PASS("si-mode-register", SIModeRegisterPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-optimize-exec-masking-pre-ra", SIOptimizeExecMaskingPreRAPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it // already exists. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 8b387a191efba..855c522ca4de7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -48,6 +48,7 @@ #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "SIOptimizeExecMasking.h" +#include "SIOptimizeExecMaskingPreRA.h" #include "SIOptimizeVGPRLiveRange.h" #include "SIPeepholeSDWA.h" #include "SIPreAllocateWWMRegs.h" @@ -497,7 +498,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIFoldOperandsLegacyPass(*PR); initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); - initializeSIOptimizeExecMaskingPreRAPass(*PR); + initializeSIOptimizeExecMaskingPreRALegacyPass(*PR); initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR); initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 31f65d82a4d2b..2a8a398d7429d 100644 --- 
a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -12,6 +12,7 @@ /// //===--===// +#include "SIOptimizeExecMaskingPreRA.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -25,7 +26,7 @@ using namespace llvm; namespace { -class SIOptimizeExecMaskingPreRA : public MachineFunctionPass { +class SIOptimizeExecMaskingPreRA { private: const SIRegisterInfo *TRI; const SIInstrInfo *TII; @@ -42,11 +43,18 @@ class SIOptimizeExecMaskingPreRA : public MachineFunctionPass { bool optimizeVcndVcmpPair(MachineBasicBlock &MBB); bool optimizeElseBranch(MachineBasicBlock &MBB); +public: + SIOptimizeExecMaskingPreRA(LiveIntervals *LIS) : LIS(LIS) {} + bool run(MachineFunction &MF); +}; + +class SIOptimi
[llvm-branch-commits] [llvm] [RegAlloc][NewPM] Plug Greedy RA in codegen pipeline (PR #120557)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/120557 >From 6b013984ddb49a7c60a6eb60db33aa73f9e55c4d Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Tue, 11 Feb 2025 12:36:40 + Subject: [PATCH 1/2] [CodeGen][NewPM] Plug greedy RA in codegen pipeline --- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 51 ++- .../llvm/Passes/MachinePassRegistry.def | 4 +- .../include/llvm/Target/CGPassBuilderOption.h | 4 +- llvm/lib/Passes/PassBuilder.cpp | 14 + ...plicit-def-remat-requires-impdef-check.mir | 1 + ...implicit-def-with-impdef-greedy-assert.mir | 1 + llvm/test/CodeGen/AArch64/pr51516.mir | 1 + llvm/test/CodeGen/AArch64/spill-fold.mir | 2 + llvm/test/CodeGen/MIR/Generic/runPass.mir | 1 + .../SystemZ/clear-liverange-spillreg.mir | 1 + llvm/test/CodeGen/Thumb/high-reg-clobber.mir | 1 + llvm/test/CodeGen/X86/limit-split-cost.mir| 1 + llvm/tools/llc/NewPMDriver.cpp| 15 -- 13 files changed, 75 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index a1e81f4b7375a..31120b0b43485 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -1061,7 +1061,9 @@ void CodeGenPassBuilder::addMachineSSAOptimization( /// /// A target that uses the standard regalloc pass order for fast or optimized /// allocation may still override this for per-target regalloc -/// selection. But -regalloc=... always takes precedence. +/// selection. But -regalloc-npm=... always takes precedence. +/// If a target does not want to allow users to set -regalloc-npm=... at all, +/// check if Opt.RegAlloc == RegAllocType::Unset. template void CodeGenPassBuilder::addTargetRegisterAllocator( AddMachinePass &addPass, bool Optimized) const { @@ -1074,10 +1076,29 @@ void CodeGenPassBuilder::addTargetRegisterAllocator( /// Find and instantiate the register allocation pass requested by this target /// at the current optimization level. 
Different register allocators are /// defined as separate passes because they may require different analysis. +/// +/// This helper ensures that the -regalloc-npm= option is always available, +/// even for targets that override the default allocator. template void CodeGenPassBuilder::addRegAllocPass( AddMachinePass &addPass, bool Optimized) const { - // TODO: Parse Opt.RegAlloc to add register allocator. + // Use the specified -regalloc-npm={basic|greedy|fast|pbqp} + if (Opt.RegAlloc > RegAllocType::Default) { +switch (Opt.RegAlloc) { +case RegAllocType::Fast: + addPass(RegAllocFastPass()); + break; +case RegAllocType::Greedy: + addPass(RAGreedyPass()); + break; +default: + report_fatal_error("register allocator not supported yet.", false); +} +return; + } + // -regalloc=default or unspecified, so pick based on the optimization level + // or ask the target for the regalloc pass. + derived().addTargetRegisterAllocator(addPass, Optimized); } template @@ -1148,20 +1169,22 @@ void CodeGenPassBuilder::addOptimizedRegAlloc( // PreRA instruction scheduling. addPass(MachineSchedulerPass()); - if (derived().addRegAssignmentOptimized(addPass)) { -// Allow targets to expand pseudo instructions depending on the choice of -// registers before MachineCopyPropagation. -derived().addPostRewrite(addPass); + if (auto E = derived().addRegAssignmentOptimized(addPass)) { +// addRegAssignmentOptimized did not add a reg alloc pass, so do nothing. +return; + } + // Allow targets to expand pseudo instructions depending on the choice of + // registers before MachineCopyPropagation. + derived().addPostRewrite(addPass); -// Copy propagate to forward register uses and try to eliminate COPYs that -// were not coalesced. -addPass(MachineCopyPropagationPass()); + // Copy propagate to forward register uses and try to eliminate COPYs that + // were not coalesced. + addPass(MachineCopyPropagationPass()); -// Run post-ra machine LICM to hoist reloads / remats. 
-// -// FIXME: can this move into MachineLateOptimization? -addPass(MachineLICMPass()); - } + // Run post-ra machine LICM to hoist reloads / remats. + // + // FIXME: can this move into MachineLateOptimization? + addPass(MachineLICMPass()); } //===-===// diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 9e2d9e071e85d..3656e5db9fb83 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -193,12 +193,12 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS( }, "filter=reg-filter;no-clear-vregs") +// 'all' is the default filter MACHINE_FUNCTION_PASS_WITH_PARAMS(
[llvm-branch-commits] [llvm] [AMDGPU][NewPM] Port SIOptimizeExecMaskingPreRA to NPM (PR #125351)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/125351 >From 4bf4fe28a25a1ac7e216b4dcd66da210114a5482 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Sat, 1 Feb 2025 18:21:24 + Subject: [PATCH 1/2] [AMDGPU][NewPM] Port SIOptimizeExecMaskingPreRA to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 2 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +- .../AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 42 ++- .../AMDGPU/SIOptimizeExecMaskingPreRA.h | 24 +++ .../CodeGen/AMDGPU/collapse-endcf-broken.mir | 1 + ...ask-pre-ra-non-empty-but-used-interval.mir | 1 + 7 files changed, 62 insertions(+), 13 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 67bad5884c260..59839fd54d014 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -368,7 +368,7 @@ struct AMDGPUUnifyMetadataPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; -void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&); +void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &); extern char &SIOptimizeExecMaskingPreRAID; void initializeSIOptimizeVGPRLiveRangeLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index d9d97928062f5..520f1a4282cc8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -110,6 +110,7 @@ MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) MACHINE_FUNCTION_PASS("si-lower-wwm-copies", SILowerWWMCopiesPass()) MACHINE_FUNCTION_PASS("si-opt-vgpr-liverange", SIOptimizeVGPRLiveRangePass()) MACHINE_FUNCTION_PASS("si-optimize-exec-masking", SIOptimizeExecMaskingPass()) +MACHINE_FUNCTION_PASS("si-optimize-exec-masking-pre-ra", SIOptimizeExecMaskingPreRAPass()) 
MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass()) MACHINE_FUNCTION_PASS("si-pre-allocate-wwm-regs", SIPreAllocateWWMRegsPass()) MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass()) @@ -128,7 +129,6 @@ DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-memory-legalizer", SIMemoryLegalizerPass()) DUMMY_MACHINE_FUNCTION_PASS("si-mode-register", SIModeRegisterPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-optimize-exec-masking-pre-ra", SIOptimizeExecMaskingPreRAPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it // already exists. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 8b387a191efba..855c522ca4de7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -48,6 +48,7 @@ #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "SIOptimizeExecMasking.h" +#include "SIOptimizeExecMaskingPreRA.h" #include "SIOptimizeVGPRLiveRange.h" #include "SIPeepholeSDWA.h" #include "SIPreAllocateWWMRegs.h" @@ -497,7 +498,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIFoldOperandsLegacyPass(*PR); initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); - initializeSIOptimizeExecMaskingPreRAPass(*PR); + initializeSIOptimizeExecMaskingPreRALegacyPass(*PR); initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR); initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 31f65d82a4d2b..2a8a398d7429d 100644 --- 
a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -12,6 +12,7 @@ /// //===--===// +#include "SIOptimizeExecMaskingPreRA.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -25,7 +26,7 @@ using namespace llvm; namespace { -class SIOptimizeExecMaskingPreRA : public MachineFunctionPass { +class SIOptimizeExecMaskingPreRA { private: const SIRegisterInfo *TRI; const SIInstrInfo *TII; @@ -42,11 +43,18 @@ class SIOptimizeExecMaskingPreRA : public MachineFunctionPass { bool optimizeVcndVcmpPair(MachineBasicBlock &MBB); bool optimizeElseBranch(MachineBasicBlock &MBB); +public: + SIOptimizeExecMaskingPreRA(LiveIntervals *LIS) : LIS(LIS) {} + bool run(MachineFunction &MF); +}; + +class SIOptimi
[llvm-branch-commits] [llvm] [CodeGen][NewPM] Port RegAllocPriorityAdvisor analysis to NPM (PR #118462)
https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/118462 >From ed461217c0323fa1d172b42e81e20900653758c9 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Tue, 3 Dec 2024 10:12:36 + Subject: [PATCH 1/9] [CodeGen][NewPM] Port RegAllocPriorityAdvisor analysis to NPM --- .../llvm}/CodeGen/RegAllocPriorityAdvisor.h | 78 +++- llvm/include/llvm/InitializePasses.h | 2 +- .../llvm/Passes/MachinePassRegistry.def | 1 + llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 6 +- .../lib/CodeGen/MLRegAllocPriorityAdvisor.cpp | 184 +++--- llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp | 2 +- llvm/lib/CodeGen/RegAllocGreedy.cpp | 9 +- llvm/lib/CodeGen/RegAllocGreedy.h | 2 +- llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp | 155 +++ llvm/lib/Passes/PassBuilder.cpp | 1 + 10 files changed, 320 insertions(+), 120 deletions(-) rename llvm/{lib => include/llvm}/CodeGen/RegAllocPriorityAdvisor.h (57%) diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h b/llvm/include/llvm/CodeGen/RegAllocPriorityAdvisor.h similarity index 57% rename from llvm/lib/CodeGen/RegAllocPriorityAdvisor.h rename to llvm/include/llvm/CodeGen/RegAllocPriorityAdvisor.h index 0758743c2b140..a53739fdc3fc4 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h +++ b/llvm/include/llvm/CodeGen/RegAllocPriorityAdvisor.h @@ -9,8 +9,10 @@ #ifndef LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H #define LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/IR/PassManager.h" #include "llvm/Pass.h" namespace llvm { @@ -68,12 +70,72 @@ class DummyPriorityAdvisor : public RegAllocPriorityAdvisor { unsigned getPriority(const LiveInterval &LI) const override; }; -class RegAllocPriorityAdvisorAnalysis : public ImmutablePass { +/// Common provider for getting the priority advisor and logging rewards. +/// Legacy analysis forwards all calls to this provider. 
+/// New analysis serves the provider as the analysis result. +/// Expensive setup is done in the constructor, so that the advisor can be +/// created quickly for every machine function. +/// TODO: Remove once legacy PM support is dropped. +class RegAllocPriorityAdvisorProvider { public: enum class AdvisorMode : int { Default, Release, Development, Dummy }; - RegAllocPriorityAdvisorAnalysis(AdvisorMode Mode) - : ImmutablePass(ID), Mode(Mode){}; + RegAllocPriorityAdvisorProvider(AdvisorMode Mode) : Mode(Mode) {} + + virtual ~RegAllocPriorityAdvisorProvider() = default; + + virtual void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) {}; + + virtual std::unique_ptr + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0; + + void setAnalyses(SlotIndexes *SI) { this->SI = SI; } + + AdvisorMode getAdvisorMode() const { return Mode; } + +protected: + SlotIndexes *SI; + +private: + const AdvisorMode Mode; +}; + +RegAllocPriorityAdvisorProvider *createReleaseModePriorityAdvisorProvider(); + +RegAllocPriorityAdvisorProvider * +createDevelopmentModePriorityAdvisorProvider(LLVMContext &Ctx); + +class RegAllocPriorityAdvisorAnalysis +: public AnalysisInfoMixin { + static AnalysisKey Key; + friend AnalysisInfoMixin; + +public: + struct Result { +// Owned by this analysis. 
+RegAllocPriorityAdvisorProvider *Provider; + +bool invalidate(MachineFunction &MF, const PreservedAnalyses &PA, +MachineFunctionAnalysisManager::Invalidator &Inv) { + auto PAC = PA.getChecker(); + return !PAC.preservedWhenStateless() || + Inv.invalidate(MF, PA); +} + }; + + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); + +private: + void initializeProvider(LLVMContext &Ctx); + std::unique_ptr Provider; +}; + +class RegAllocPriorityAdvisorAnalysisLegacy : public ImmutablePass { +public: + using AdvisorMode = RegAllocPriorityAdvisorProvider::AdvisorMode; + RegAllocPriorityAdvisorAnalysisLegacy(AdvisorMode Mode) + : ImmutablePass(ID), Mode(Mode) {}; static char ID; /// Get an advisor for the given context (i.e. machine function, etc) @@ -81,7 +143,7 @@ class RegAllocPriorityAdvisorAnalysis : public ImmutablePass { getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0; AdvisorMode getAdvisorMode() const { return Mode; } virtual void logRewardIfNeeded(const MachineFunction &MF, - llvm::function_ref GetReward){}; + llvm::function_ref GetReward) {}; protected: // This analysis preserves everything, and subclasses may have additional @@ -97,11 +159,13 @@ class RegAllocPriorityAdvisorAnalysis : public ImmutablePass { /// Specialization for the API used by the analysis infrastructure to create /// an instance
[llvm-branch-commits] [lld] [lld][LoongArch] Relax TLS LE/GD/LD (PR #123600)
https://github.com/ylzsx updated https://github.com/llvm/llvm-project/pull/123600 >From 7993434e2973437b010034051003f8c03d8eff71 Mon Sep 17 00:00:00 2001 From: yangzhaoxin Date: Fri, 27 Dec 2024 19:29:32 +0800 Subject: [PATCH 1/6] Relax TLS LE/GD/LD. In local-exec form, the code sequence is converted as follows: ``` From: lu12i.w $rd, %le_hi20_r(sym) R_LARCH_TLS_LE_HI20_R, R_LARCH_RELAX add.w/d $rd, $rd, $tp, %le_add_r(sym) R_LARCH_TLS_LE_ADD_R, R_LARCH_RELAX addi/ld/st.w/d $rd, $rd, %le_lo12_r(sym) R_LARCH_TLS_LE_LO12_R, R_LARCH_RELAX To: addi/ld/st.w/d $rd, $tp, %le_lo12_r(sym) R_LARCH_TLS_LE_LO12_R ``` In global-dynamic or local-dynamic, the code sequence is converted as follows: ``` From: pcalau12i $a0, %ld_pc_hi20(sym) | %gd_pc_hi20(sym) R_LARCH_TLS_GD_PC_HI20 | R_LARCH_TLS_LD_PC_HI20, R_LARCH_RELAX addi.w/d $a0, $a0, %got_pc_lo12(sym) | %got_pc_lo12(sym) R_LARCH_GOT_PC_LO12, R_LARCH_RELAX To: pcaddi$a0, %got_pc_lo12(sym) | %got_pc_lo12(sym) R_LARCH_TLS_GD_PCREL20_S2 | R_LARCH_TLS_LD_PCREL20_S2 ``` Note: For initial-exec form, since it involves the conversion from IE to LE, we will implement it in a future patch. --- lld/ELF/Arch/LoongArch.cpp | 68 +++--- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index 0aa0cf5b657a0..2d6d86d2ca63b 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -761,10 +761,10 @@ static bool isPairRelaxable(ArrayRef relocs, size_t i) { // Relax code sequence. 
// From: -// pcalau12i $a0, %pc_hi20(sym) -// addi.w/d $a0, $a0, %pc_lo12(sym) +// pcalau12i $a0, %pc_hi20(sym) | %ld_pc_hi20(sym) | %gd_pc_hi20(sym) +// addi.w/d $a0, $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym) // To: -// pcaddi $a0, %pc_lo12(sym) +// pcaddi $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym) // // From: // pcalau12i $a0, %got_pc_hi20(sym_got) @@ -778,6 +778,10 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, if (!((rHi20.type == R_LARCH_PCALA_HI20 && rLo12.type == R_LARCH_PCALA_LO12) || (rHi20.type == R_LARCH_GOT_PC_HI20 && + rLo12.type == R_LARCH_GOT_PC_LO12) || +(rHi20.type == R_LARCH_TLS_GD_PC_HI20 && + rLo12.type == R_LARCH_GOT_PC_LO12) || +(rHi20.type == R_LARCH_TLS_LD_PC_HI20 && rLo12.type == R_LARCH_GOT_PC_LO12))) return; @@ -798,6 +802,8 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, else if (rHi20.expr == RE_LOONGARCH_PAGE_PC || rHi20.expr == RE_LOONGARCH_GOT_PAGE_PC) symBase = rHi20.sym->getVA(ctx); + else if (rHi20.expr == RE_LOONGARCH_TLSGD_PAGE_PC) +symBase = ctx.in.got->getGlobalDynAddr(*rHi20.sym); else { Err(ctx) << getErrorLoc(ctx, (const uint8_t *)loc) << "unknown expr (" << rHi20.expr << ") against symbol " << rHi20.sym @@ -827,7 +833,12 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, return; sec.relaxAux->relocTypes[i] = R_LARCH_RELAX; - sec.relaxAux->relocTypes[i + 2] = R_LARCH_PCREL20_S2; + if (rHi20.type == R_LARCH_TLS_GD_PC_HI20) +sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_GD_PCREL20_S2; + else if (rHi20.type == R_LARCH_TLS_LD_PC_HI20) +sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_LD_PCREL20_S2; + else +sec.relaxAux->relocTypes[i + 2] = R_LARCH_PCREL20_S2; sec.relaxAux->writes.push_back(insn(PCADDI, getD5(nextInsn), 0, 0)); remove = 4; } @@ -863,6 +874,35 @@ static void relaxCall36(Ctx &ctx, const InputSection &sec, size_t i, } } +// Relax code sequence. 
+// From: +// lu12i.w $rd, %le_hi20_r(sym) +// add.w/d $rd, $rd, $tp, %le_add_r(sym) +// addi/ld/st.w/d $rd, $rd, %le_lo12_r(sym) +// To: +// addi/ld/st.w/d $rd, $tp, %le_lo12_r(sym) +static void relaxTlsLe(Ctx &ctx, const InputSection &sec, size_t i, + uint64_t loc, Relocation &r, uint32_t &remove) { + uint64_t val = r.sym->getVA(ctx, r.addend); + // Check if the val exceeds the range of addi/ld/st. + if (!isInt<12>(val)) +return; + uint32_t currInsn = read32le(sec.content().data() + r.offset); + switch (r.type) { + case R_LARCH_TLS_LE_HI20_R: + case R_LARCH_TLS_LE_ADD_R: +sec.relaxAux->relocTypes[i] = R_LARCH_RELAX; +remove = 4; +break; + case R_LARCH_TLS_LE_LO12_R: +currInsn = +insn(extractBits(currInsn, 31, 22) << 22, getD5(currInsn), R_TP, 0); +sec.relaxAux->writes.push_back(currInsn); +sec.relaxAux->relocTypes[i] = R_LARCH_TLS_LE_LO12_R; +break; + } +} + static bool relax(Ctx &ctx, InputSection &sec) { const uint64_t secAddr = sec.getVA(); const MutableArrayRef relocs = sec.relocs(); @@ -903,6 +943,8 @@ static bool relax(Ctx &ctx, InputSection &sec) { } case R_LARCH_PCALA_HI20: case R_LARCH_GOT_PC_HI20: +case R_LARCH_TLS_GD_PC_HI20: +case R_LARCH_TLS_LD_PC_HI20:
[llvm-branch-commits] [lld] [lld][LoongArch] Relax call36/tail36: R_LARCH_CALL36 (PR #123576)
https://github.com/ylzsx updated https://github.com/llvm/llvm-project/pull/123576 >From f1f995b5fc8e90126b5825d52b9c75cd45d27cfc Mon Sep 17 00:00:00 2001 From: yangzhaoxin Date: Thu, 26 Dec 2024 11:32:33 +0800 Subject: [PATCH 1/6] Relax call36/tail36. Instructions with relocation `R_LARCH_CALL36` may be relaxed as follows: ``` From: pcaddu18i $dest, %call36(foo) R_LARCH_CALL36, R_LARCH_RELAX jirl $r, $dest, 0 To: b/bl foo # bl if r=$ra, b if r=$zero R_LARCH_B26 ``` --- lld/ELF/Arch/LoongArch.cpp | 41 ++ 1 file changed, 41 insertions(+) diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index b999e7fd27ae9..0aa0cf5b657a0 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -58,6 +58,8 @@ enum Op { LD_W = 0x2880, LD_D = 0x28c0, JIRL = 0x4c00, + B = 0x5000, + BL = 0x5400, }; enum Reg { @@ -830,6 +832,37 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, remove = 4; } +// Relax code sequence. +// From: +// pcaddu18i $ra, %call36(foo) +// jirl $ra, $ra, 0 +// To: +// b/bl foo +static void relaxCall36(Ctx &ctx, const InputSection &sec, size_t i, +uint64_t loc, Relocation &r, uint32_t &remove) { + const uint64_t symLocal = + (r.expr == R_PLT_PC ? r.sym->getPltVA(ctx) : r.sym->getVA(ctx)) + + r.addend; + + const int64_t distance = symLocal - loc; + // Check if the distance aligns 4 bytes or exceeds the range of b[l]. 
+ if ((distance & 0x3) != 0 || !isInt<28>(distance)) +return; + + const uint32_t nextInsn = read32le(sec.content().data() + r.offset + 4); + if (getD5(nextInsn) == R_RA) { +// convert jirl to bl +sec.relaxAux->relocTypes[i] = R_LARCH_B26; +sec.relaxAux->writes.push_back(insn(BL, 0, 0, 0)); +remove = 4; + } else if (getD5(nextInsn) == R_ZERO) { +// convert jirl to b +sec.relaxAux->relocTypes[i] = R_LARCH_B26; +sec.relaxAux->writes.push_back(insn(B, 0, 0, 0)); +remove = 4; + } +} + static bool relax(Ctx &ctx, InputSection &sec) { const uint64_t secAddr = sec.getVA(); const MutableArrayRef relocs = sec.relocs(); @@ -874,6 +907,10 @@ static bool relax(Ctx &ctx, InputSection &sec) { if (isPairRelaxable(relocs, i)) relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove); break; +case R_LARCH_CALL36: + if (relaxable(relocs, i)) +relaxCall36(ctx, sec, i, loc, r, remove); + break; } // For all anchors whose offsets are <= r.offset, they are preceded by @@ -977,6 +1014,10 @@ void LoongArch::finalizeRelax(int passes) const { // RelExpr is needed for relocating. r.expr = r.sym->hasFlag(NEEDS_PLT) ? R_PLT_PC : R_PC; break; + case R_LARCH_B26: +skip = 4; +write32le(p, aux.writes[writesIdx++]); +break; default: llvm_unreachable("unsupported type"); } >From f227ae532236e20148a872c811721a8de4e16318 Mon Sep 17 00:00:00 2001 From: yangzhaoxin Date: Fri, 27 Dec 2024 14:37:40 +0800 Subject: [PATCH 2/6] modify test for call36/tail36. 
--- lld/test/ELF/loongarch-relax-call36-2.s | 63 + lld/test/ELF/loongarch-relax-call36.s| 135 +++ lld/test/ELF/loongarch-relax-emit-relocs-2.s | 61 + 3 files changed, 259 insertions(+) create mode 100644 lld/test/ELF/loongarch-relax-call36-2.s create mode 100644 lld/test/ELF/loongarch-relax-call36.s create mode 100644 lld/test/ELF/loongarch-relax-emit-relocs-2.s diff --git a/lld/test/ELF/loongarch-relax-call36-2.s b/lld/test/ELF/loongarch-relax-call36-2.s new file mode 100644 index 0..1c216a9bdc35e --- /dev/null +++ b/lld/test/ELF/loongarch-relax-call36-2.s @@ -0,0 +1,63 @@ +# REQUIRES: loongarch +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=loongarch64 -mattr=+relax a.s -o a.o + +# RUN: ld.lld -T lds a.o -o a +# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck %s --check-prefixes=RELAX,RELAX-MID + +## Unsure whether this needs a diagnostic. GNU ld allows this. +# RUN: ld.lld -T lds -pie a.o -o a.pie +# RUN: llvm-objdump -d --no-show-raw-insn a.pie | FileCheck %s --check-prefixes=RELAX,RELAX-MID + +# RUN: ld.lld -T lds -pie -z notext -z ifunc-noplt a.o -o a.ifunc-noplt +# RUN: llvm-objdump -d --no-show-raw-insn a.ifunc-noplt | FileCheck %s --check-prefixes=RELAX,NORELAX-MID + +# RELAX-LABEL: <_start>: +## offset = 0x1000 - 0x800 = 0x800(134217728), hi=512, lo18=0 +# RELAX-NEXT:800: pcaddu18i $ra, 512 +# RELAX-NEXT: jirl $ra, $ra, 0 +# RELAX-NEXT: bl 134217720 +# RELAX-NEXT: bl -134217728 +## offset = 12 - 0x810 = -0x804(-134217732), hi=512, lo18=-4 +# RELAX-NEXT:810: pcaddu18i $ra, -512 +# RELAX-NEXT: jirl $ra, $ra, -4 +# RELAX-EMPTY: + +# RELAX-MID
[llvm-branch-commits] [lld] [lld][LoongArch] Relax TLS LE/GD/LD (PR #123600)
https://github.com/ylzsx updated https://github.com/llvm/llvm-project/pull/123600 >From 7993434e2973437b010034051003f8c03d8eff71 Mon Sep 17 00:00:00 2001 From: yangzhaoxin Date: Fri, 27 Dec 2024 19:29:32 +0800 Subject: [PATCH 1/7] Relax TLS LE/GD/LD. In local-exec form, the code sequence is converted as follows: ``` From: lu12i.w $rd, %le_hi20_r(sym) R_LARCH_TLS_LE_HI20_R, R_LARCH_RELAX add.w/d $rd, $rd, $tp, %le_add_r(sym) R_LARCH_TLS_LE_ADD_R, R_LARCH_RELAX addi/ld/st.w/d $rd, $rd, %le_lo12_r(sym) R_LARCH_TLS_LE_LO12_R, R_LARCH_RELAX To: addi/ld/st.w/d $rd, $tp, %le_lo12_r(sym) R_LARCH_TLS_LE_LO12_R ``` In global-dynamic or local-dynamic, the code sequence is converted as follows: ``` From: pcalau12i $a0, %ld_pc_hi20(sym) | %gd_pc_hi20(sym) R_LARCH_TLS_GD_PC_HI20 | R_LARCH_TLS_LD_PC_HI20, R_LARCH_RELAX addi.w/d $a0, $a0, %got_pc_lo12(sym) | %got_pc_lo12(sym) R_LARCH_GOT_PC_LO12, R_LARCH_RELAX To: pcaddi$a0, %got_pc_lo12(sym) | %got_pc_lo12(sym) R_LARCH_TLS_GD_PCREL20_S2 | R_LARCH_TLS_LD_PCREL20_S2 ``` Note: For initial-exec form, since it involves the conversion from IE to LE, we will implement it in a future patch. --- lld/ELF/Arch/LoongArch.cpp | 68 +++--- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index 0aa0cf5b657a0..2d6d86d2ca63b 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -761,10 +761,10 @@ static bool isPairRelaxable(ArrayRef relocs, size_t i) { // Relax code sequence. 
// From: -// pcalau12i $a0, %pc_hi20(sym) -// addi.w/d $a0, $a0, %pc_lo12(sym) +// pcalau12i $a0, %pc_hi20(sym) | %ld_pc_hi20(sym) | %gd_pc_hi20(sym) +// addi.w/d $a0, $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym) // To: -// pcaddi $a0, %pc_lo12(sym) +// pcaddi $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym) // // From: // pcalau12i $a0, %got_pc_hi20(sym_got) @@ -778,6 +778,10 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, if (!((rHi20.type == R_LARCH_PCALA_HI20 && rLo12.type == R_LARCH_PCALA_LO12) || (rHi20.type == R_LARCH_GOT_PC_HI20 && + rLo12.type == R_LARCH_GOT_PC_LO12) || +(rHi20.type == R_LARCH_TLS_GD_PC_HI20 && + rLo12.type == R_LARCH_GOT_PC_LO12) || +(rHi20.type == R_LARCH_TLS_LD_PC_HI20 && rLo12.type == R_LARCH_GOT_PC_LO12))) return; @@ -798,6 +802,8 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, else if (rHi20.expr == RE_LOONGARCH_PAGE_PC || rHi20.expr == RE_LOONGARCH_GOT_PAGE_PC) symBase = rHi20.sym->getVA(ctx); + else if (rHi20.expr == RE_LOONGARCH_TLSGD_PAGE_PC) +symBase = ctx.in.got->getGlobalDynAddr(*rHi20.sym); else { Err(ctx) << getErrorLoc(ctx, (const uint8_t *)loc) << "unknown expr (" << rHi20.expr << ") against symbol " << rHi20.sym @@ -827,7 +833,12 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, return; sec.relaxAux->relocTypes[i] = R_LARCH_RELAX; - sec.relaxAux->relocTypes[i + 2] = R_LARCH_PCREL20_S2; + if (rHi20.type == R_LARCH_TLS_GD_PC_HI20) +sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_GD_PCREL20_S2; + else if (rHi20.type == R_LARCH_TLS_LD_PC_HI20) +sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_LD_PCREL20_S2; + else +sec.relaxAux->relocTypes[i + 2] = R_LARCH_PCREL20_S2; sec.relaxAux->writes.push_back(insn(PCADDI, getD5(nextInsn), 0, 0)); remove = 4; } @@ -863,6 +874,35 @@ static void relaxCall36(Ctx &ctx, const InputSection &sec, size_t i, } } +// Relax code sequence. 
+// From: +// lu12i.w $rd, %le_hi20_r(sym) +// add.w/d $rd, $rd, $tp, %le_add_r(sym) +// addi/ld/st.w/d $rd, $rd, %le_lo12_r(sym) +// To: +// addi/ld/st.w/d $rd, $tp, %le_lo12_r(sym) +static void relaxTlsLe(Ctx &ctx, const InputSection &sec, size_t i, + uint64_t loc, Relocation &r, uint32_t &remove) { + uint64_t val = r.sym->getVA(ctx, r.addend); + // Check if the val exceeds the range of addi/ld/st. + if (!isInt<12>(val)) +return; + uint32_t currInsn = read32le(sec.content().data() + r.offset); + switch (r.type) { + case R_LARCH_TLS_LE_HI20_R: + case R_LARCH_TLS_LE_ADD_R: +sec.relaxAux->relocTypes[i] = R_LARCH_RELAX; +remove = 4; +break; + case R_LARCH_TLS_LE_LO12_R: +currInsn = +insn(extractBits(currInsn, 31, 22) << 22, getD5(currInsn), R_TP, 0); +sec.relaxAux->writes.push_back(currInsn); +sec.relaxAux->relocTypes[i] = R_LARCH_TLS_LE_LO12_R; +break; + } +} + static bool relax(Ctx &ctx, InputSection &sec) { const uint64_t secAddr = sec.getVA(); const MutableArrayRef relocs = sec.relocs(); @@ -903,6 +943,8 @@ static bool relax(Ctx &ctx, InputSection &sec) { } case R_LARCH_PCALA_HI20: case R_LARCH_GOT_PC_HI20: +case R_LARCH_TLS_GD_PC_HI20: +case R_LARCH_TLS_LD_PC_HI20:
[llvm-branch-commits] [llvm] PeepholeOpt: Allow introducing subregister uses on reg_sequence (PR #127052)
https://github.com/qcolombet approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/127052 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/20.x: [Clang] Remove the PackExpansion restrictions for rewrite substitution (PR #127174)
https://github.com/hokein approved this pull request. https://github.com/llvm/llvm-project/pull/127174 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RegAlloc][NewPM] Plug Greedy RA in codegen pipeline (PR #120557)
@@ -1412,6 +1412,20 @@ parseBoundsCheckingOptions(StringRef Params) { return Options; } +Expected +parseRegAllocGreedyFilterFunc(PassBuilder &PB, StringRef Params) { + if (Params.empty() || Params == "all") { +return RAGreedyPass::Options(); + } + std::optional Filter = PB.parseRegAllocFilter(Params); + if (Filter) { cdevadas wrote: Ditto. https://github.com/llvm/llvm-project/pull/120557 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RegAlloc][NewPM] Plug Greedy RA in codegen pipeline (PR #120557)
@@ -1412,6 +1412,20 @@ parseBoundsCheckingOptions(StringRef Params) { return Options; } +Expected +parseRegAllocGreedyFilterFunc(PassBuilder &PB, StringRef Params) { + if (Params.empty() || Params == "all") { +return RAGreedyPass::Options(); + } cdevadas wrote: This suggestion still has not been integrated. Please remove the braces. https://github.com/llvm/llvm-project/pull/120557 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Handle subregister compose when looking through reg_sequence (PR #127051)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/127051 Previously this would give up on folding subregister copies through a reg_sequence if the input operand already had a subregister index. d246cc618adc52fdbd69d44a2a375c8af97b6106 stopped introducing these subregister uses, and this is the first step to lifting that restriction. I was expecting to be able to implement this only purely with compose / reverse compose, but I wasn't able to make it work so relies on testing the lanemasks for whether the copy reads a subset of the input. >From a009eec4bc6159c26a909cbbb19debdc6206ffe7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 4 Feb 2025 17:05:35 +0700 Subject: [PATCH] PeepholeOpt: Handle subregister compose when looking through reg_sequence Previously this would give up on folding subregister copies through a reg_sequence if the input operand already had a subregister index. d246cc618adc52fdbd69d44a2a375c8af97b6106 stopped introducing these subregister uses, and this is the first step to lifting that restriction. I was expecting to be able to implement this only purely with compose / reverse compose, but I wasn't able to make it work so relies on testing the lanemasks for whether the copy reads a subset of the input. --- llvm/lib/CodeGen/PeepholeOptimizer.cpp| 33 +- .../AMDGPU/GlobalISel/extractelement.ll | 10 ++-- ...e92561-restore-undef-scc-verifier-error.ll | 60 ++- .../peephole-opt-fold-reg-sequence-subreg.mir | 52 4 files changed, 94 insertions(+), 61 deletions(-) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 745c0d4b36a62..24bd9938bc45c 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1984,12 +1984,43 @@ ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() { // We are looking at: // Def = REG_SEQUENCE v0, sub0, v1, sub1, ... - // Check if one of the operand defines the subreg we are interested in. 
+ // + // Check if one of the operands exactly defines the subreg we are interested + // in. for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) { if (RegSeqInput.SubIdx == DefSubReg) return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg); } + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + + // If we did not find an exact match, see if we can do a composition to + // extract a sub-subregister. + for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) { +// We don't check if the resulting class supports the subregister index +// yet. This will occur before any rewrite when looking for an eligible +// source. + +LaneBitmask DefMask = TRI->getSubRegIndexLaneMask(DefSubReg); +LaneBitmask ThisOpRegMask = TRI->getSubRegIndexLaneMask(RegSeqInput.SubIdx); + +// Check that this extract reads a subset of this single reg_sequence input. +// +// FIXME: We should be able to filter this in terms of the indexes directly +// without checking the lanemasks. +if ((DefMask & ThisOpRegMask) != DefMask) + continue; + +unsigned ReverseDefCompose = +TRI->reverseComposeSubRegIndices(RegSeqInput.SubIdx, DefSubReg); +if (!ReverseDefCompose) + continue; + +unsigned ComposedDefInSrcReg1 = +TRI->composeSubRegIndices(RegSeqInput.SubReg, ReverseDefCompose); +return ValueTrackerResult(RegSeqInput.Reg, ComposedDefInSrcReg1); + } + // If the subreg we are tracking is super-defined by another subreg, // we could follow this value. However, this would require to compose // the subreg and we do not do that for now. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index f2a4332bcb8ba..c136028f2de43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2872,8 +2872,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GCN-NEXT:v_cndmask_b32_e32 v0, v0, v12, vcc ; GCN-NEXT:v_cndmask_b32_e32 v1, v1, v13, vcc ; GCN-NEXT:v_cmp_eq_u32_e32 vcc, 7, v14 -; GCN-NEXT:v_cndmask_b32_e32 v0, v0, v14, vcc -; GCN-NEXT:v_cndmask_b32_e32 v1, v1, v15, vcc +; GCN-NEXT:v_cndmask_b32_e32 v0, v0, v0, vcc +; GCN-NEXT:v_cndmask_b32_e32 v1, v1, v1, vcc ; GCN-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: dyn_extract_v7f64_v_v: @@ -2898,8 +2898,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GFX10-NEXT:v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT:v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, 7, v14 -; GFX10-NEXT:v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10-NEXT:v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT:v_cndmask_b32_e32 v0, v0, v0, vcc_lo +; GFX10-NEXT:v_cndmask_b32_e32
[llvm-branch-commits] [llvm] PeepholeOpt: Handle subregister compose when looking through reg_sequence (PR #127051)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127051?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#127052** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127052?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#127051** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127051?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127051?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#127050** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127050?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/127051 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Allow introducing subregister uses on reg_sequence (PR #127052)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127052?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#127052** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127052?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127052?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#127051** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127051?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#127050** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/127050?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/127052 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Handle subregister compose when looking through reg_sequence (PR #127051)
llvmbot wrote: @llvm/pr-subscribers-llvm-regalloc Author: Matt Arsenault (arsenm) Changes Previously this would give up on folding subregister copies through a reg_sequence if the input operand already had a subregister index. d246cc618adc52fdbd69d44a2a375c8af97b6106 stopped introducing these subregister uses, and this is the first step to lifting that restriction. I was expecting to be able to implement this only purely with compose / reverse compose, but I wasn't able to make it work so relies on testing the lanemasks for whether the copy reads a subset of the input. --- Patch is 21.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127051.diff 4 Files Affected: - (modified) llvm/lib/CodeGen/PeepholeOptimizer.cpp (+32-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+5-5) - (modified) llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll (+31-29) - (modified) llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir (+26-26) ``diff diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 745c0d4b36a62..24bd9938bc45c 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1984,12 +1984,43 @@ ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() { // We are looking at: // Def = REG_SEQUENCE v0, sub0, v1, sub1, ... - // Check if one of the operand defines the subreg we are interested in. + // + // Check if one of the operands exactly defines the subreg we are interested + // in. for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) { if (RegSeqInput.SubIdx == DefSubReg) return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg); } + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + + // If we did not find an exact match, see if we can do a composition to + // extract a sub-subregister. 
+ for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) { +// We don't check if the resulting class supports the subregister index +// yet. This will occur before any rewrite when looking for an eligible +// source. + +LaneBitmask DefMask = TRI->getSubRegIndexLaneMask(DefSubReg); +LaneBitmask ThisOpRegMask = TRI->getSubRegIndexLaneMask(RegSeqInput.SubIdx); + +// Check that this extract reads a subset of this single reg_sequence input. +// +// FIXME: We should be able to filter this in terms of the indexes directly +// without checking the lanemasks. +if ((DefMask & ThisOpRegMask) != DefMask) + continue; + +unsigned ReverseDefCompose = +TRI->reverseComposeSubRegIndices(RegSeqInput.SubIdx, DefSubReg); +if (!ReverseDefCompose) + continue; + +unsigned ComposedDefInSrcReg1 = +TRI->composeSubRegIndices(RegSeqInput.SubReg, ReverseDefCompose); +return ValueTrackerResult(RegSeqInput.Reg, ComposedDefInSrcReg1); + } + // If the subreg we are tracking is super-defined by another subreg, // we could follow this value. However, this would require to compose // the subreg and we do not do that for now. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index f2a4332bcb8ba..c136028f2de43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2872,8 +2872,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GCN-NEXT:v_cndmask_b32_e32 v0, v0, v12, vcc ; GCN-NEXT:v_cndmask_b32_e32 v1, v1, v13, vcc ; GCN-NEXT:v_cmp_eq_u32_e32 vcc, 7, v14 -; GCN-NEXT:v_cndmask_b32_e32 v0, v0, v14, vcc -; GCN-NEXT:v_cndmask_b32_e32 v1, v1, v15, vcc +; GCN-NEXT:v_cndmask_b32_e32 v0, v0, v0, vcc +; GCN-NEXT:v_cndmask_b32_e32 v1, v1, v1, vcc ; GCN-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: dyn_extract_v7f64_v_v: @@ -2898,8 +2898,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GFX10-NEXT:v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT:v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, 7, v14 -; GFX10-NEXT:v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10-NEXT:v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT:v_cndmask_b32_e32 v0, v0, v0, vcc_lo +; GFX10-NEXT:v_cndmask_b32_e32 v1, v1, v1, vcc_lo ; GFX10-NEXT:s_setpc_b64 s[30:31] ; ; GFX11-LABEL: dyn_extract_v7f64_v_v: @@ -2918,7 +2918,7 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GFX11-NEXT:v_cmp_eq_u32_e32 vcc_lo, 6, v14 ; GFX11-NEXT:v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 ; GFX11-NEXT:v_cmp_eq_u32_e32 vcc_lo, 7, v14 -; GFX11-NEXT:v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 +; GFX11-NEXT:v_dual_cndmask_b32 v0, v0, v0 :: v_dual_cndmask_b32 v
[llvm-branch-commits] [llvm] PeepholeOpt: Allow introducing subregister uses on reg_sequence (PR #127052)
llvmbot wrote: @llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-tablegen Author: Matt Arsenault (arsenm) Changes This reverts d246cc618adc52fdbd69d44a2a375c8af97b6106. We now handle composing subregister extracts through reg_sequence. --- Patch is 576.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127052.diff 61 Files Affected: - (modified) llvm/lib/CodeGen/PeepholeOptimizer.cpp (-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+112-112) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+28-28) - (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+24-39) - (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+110-117) - (modified) llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll (+36-36) - (modified) llvm/test/CodeGen/AMDGPU/ctpop64.ll (+18-18) - (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+10-10) - (modified) llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll (+5-8) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+31-27) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+33-29) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+33-29) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+61-53) - (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+140-124) - (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+178-178) - (modified) llvm/test/CodeGen/AMDGPU/fptrunc.ll (+37-37) - (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+36-55) - (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+171-298) - (modified) llvm/test/CodeGen/AMDGPU/idot4s.ll (+62-62) - (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+155-155) - (modified) llvm/test/CodeGen/AMDGPU/idot8u.ll (+18-18) - (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+44-44) - (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+150-148) - (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+3-6) - 
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll (+440-440) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+81-81) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+16-16) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+16-16) - (modified) llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (+52-53) - (modified) llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll (+2-4) - (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+62-62) - (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+16-16) - (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+29-29) - (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+156-210) - (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll (+10-9) - (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+25-25) - (modified) llvm/test/CodeGen/AMDGPU/mul_int24.ll (+15-18) - (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+38-38) - (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+11-11) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll (+304-304) - (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+109-118) - (modified) llvm/test/CodeGen/AMDGPU/spill-vgpr.ll (+8-8) - (modified) llvm/test/CodeGen/AMDGPU/sra.ll (+11-11) - (modified) llvm/test/CodeGen/AMDGPU/srl.ll (+11-11) - (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+2-4) - (modified) llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll (+21-21) - (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+1-9) - (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+59-59) - (modified) llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll (+66-81) - (modified) llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll (+26-32) - (modified) llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll (+27-27) - (modified) llvm/test/CodeGen/Thumb2/mve-shuffle.ll (+53-90) - (modified) llvm/test/CodeGen/Thumb2/mve-vabdus.ll (+25-25) - (modified) llvm/test/CodeGen/Thumb2/mve-vld2.ll (+9-9) - (modified) 
llvm/test/CodeGen/Thumb2/mve-vld3.ll (+70-66) - (modified) llvm/test/CodeGen/Thumb2/mve-vld4.ll (+33-33) - (modified) llvm/test/CodeGen/Thumb2/mve-vldst4.ll (+55-55) - (modified) llvm/test/CodeGen/Thumb2/mve-vst2.ll (+18-18) - (modified) llvm/test/CodeGen/Thumb2/mve-vst3.ll (+214-209) - (modified) llvm/test/CodeGen/Thumb2/mve-vst4-post.ll (+20-20) - (modified) llvm/test/CodeGen/Thumb2/mve-vst4.ll (+69-69) ``diff diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 24bd9938bc45c..5416cdd39aaf3 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -421,12 +421,6 @@ class RegSequenceRewriter : public Rewriter { } bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { -// Do not introduce new subregister uses in a reg_sequence. Until composing -// subregister indices is supported while folding, we
[llvm-branch-commits] [llvm] PeepholeOpt: Allow introducing subregister uses on reg_sequence (PR #127052)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/127052 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] PeepholeOpt: Handle subregister compose when looking through reg_sequence (PR #127051)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/127051 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang] Remove FLANG_INCLUDE_RUNTIME (PR #124126)
https://github.com/Meinersbur updated https://github.com/llvm/llvm-project/pull/124126 >From c515d13f0ad684763e6d76a87a610801482c15f4 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Fri, 24 Jan 2025 16:52:46 +0100 Subject: [PATCH 1/2] [Flang] Remove FLANG_INCLUDE_RUNTIME --- flang/CMakeLists.txt | 25 +- .../modules/AddFlangOffloadRuntime.cmake | 146 flang/runtime/CMakeLists.txt | 350 -- flang/runtime/CUDA/CMakeLists.txt | 41 -- flang/runtime/Float128Math/CMakeLists.txt | 133 --- flang/test/CMakeLists.txt | 10 - flang/test/lit.cfg.py | 3 - flang/test/lit.site.cfg.py.in | 1 - flang/tools/f18/CMakeLists.txt| 17 +- flang/unittests/CMakeLists.txt| 43 +-- flang/unittests/Evaluate/CMakeLists.txt | 16 - 11 files changed, 5 insertions(+), 780 deletions(-) delete mode 100644 flang/cmake/modules/AddFlangOffloadRuntime.cmake delete mode 100644 flang/runtime/CMakeLists.txt delete mode 100644 flang/runtime/CUDA/CMakeLists.txt delete mode 100644 flang/runtime/Float128Math/CMakeLists.txt diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 38004c149b783..aceb2d09c5438 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -23,7 +23,6 @@ if (LLVM_ENABLE_EH) endif() set(FLANG_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -set(FLANG_RT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../flang-rt") if (CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND NOT MSVC_IDE) message(FATAL_ERROR "In-source builds are not allowed. \ @@ -237,24 +236,8 @@ else() include_directories(SYSTEM ${MLIR_TABLEGEN_OUTPUT_DIR}) endif() -set(FLANG_INCLUDE_RUNTIME_default ON) -if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) - set(FLANG_INCLUDE_RUNTIME_default OFF) -endif () -option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" FLANG_INCLUDE_RUNTIME_default) -if (FLANG_INCLUDE_RUNTIME) - if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) -message(WARNING "Building Flang-RT using LLVM_ENABLE_RUNTIMES. 
FLANG_INCLUDE_RUNTIME=${FLANG_INCLUDE_RUNTIME} ignored.") -set(FLANG_INCLUDE_RUNTIME OFF) - else () - message(STATUS "Building flang_rt in-tree") - endif () -else () - if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) -message(STATUS "Building Flang-RT using LLVM_ENABLE_RUNTIMES.") - else () -message(STATUS "Not building Flang-RT. For a usable Fortran toolchain, compile a standalone Flang-RT") - endif () +if (NOT "flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) + message(STATUS "Not building Flang-RT. For a usable Fortran toolchain, compile a standalone Flang-RT") endif () set(FLANG_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH @@ -484,10 +467,6 @@ if (FLANG_CUF_RUNTIME) find_package(CUDAToolkit REQUIRED) endif() -if (FLANG_INCLUDE_RUNTIME) - add_subdirectory(runtime) -endif () - if (LLVM_INCLUDE_EXAMPLES) add_subdirectory(examples) endif() diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake deleted file mode 100644 index 8e4f47d18535d..0 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ /dev/null @@ -1,146 +0,0 @@ -option(FLANG_EXPERIMENTAL_CUDA_RUNTIME - "Compile Fortran runtime as CUDA sources (experimental)" OFF - ) - -option(FLANG_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS - "Do not compile global variables' definitions when producing PTX library" OFF - ) - -set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation") - -set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING - "Compile Fortran runtime as OpenMP target offload sources (experimental). Valid options are 'off', 'host_device', 'nohost'") - -set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING - "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 
'gfx1103;sm_90')") - -macro(enable_cuda_compilation name files) - if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) -if (BUILD_SHARED_LIBS) - message(FATAL_ERROR -"BUILD_SHARED_LIBS is not supported for CUDA build of Fortran runtime" -) -endif() - -enable_language(CUDA) - -# TODO: figure out how to make target property CUDA_SEPARABLE_COMPILATION -# work, and avoid setting CMAKE_CUDA_SEPARABLE_COMPILATION. -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - -# Treat all supported sources as CUDA files. -set_source_files_properties(${files} PROPERTIES LANGUAGE CUDA) -set(CUDA_COMPILE_OPTIONS) -if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "Clang") - # Allow varargs. - set(CUDA_COMPILE_OPTIONS --Xclang -fcuda-allow-variadic-functions -) -endif() -if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA") - set(CUDA_COMPILE_OPTIONS ---expt-relaxed-constexpr -# Disable these warnings: -#
[llvm-branch-commits] [llvm] [Flang-RT] Build libflang_rt.so (PR #121782)
https://github.com/Meinersbur updated https://github.com/llvm/llvm-project/pull/121782 >From b05c9a033158aea459d51ff34b8ec47e72f85740 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Fri, 24 Jan 2025 16:51:27 +0100 Subject: [PATCH 1/2] [Flang-RT] Build libflang_rt.so --- flang-rt/CMakeLists.txt | 30 ++ flang-rt/cmake/modules/AddFlangRT.cmake | 324 -- .../cmake/modules/AddFlangRTOffload.cmake | 18 +- flang-rt/lib/flang_rt/CMakeLists.txt | 9 +- flang-rt/lib/flang_rt/CUDA/CMakeLists.txt | 26 +- flang-rt/test/CMakeLists.txt | 2 +- flang-rt/test/lit.cfg.py | 2 +- 7 files changed, 283 insertions(+), 128 deletions(-) diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt index 655d0a55b4004..0b91b6ae7eea7 100644 --- a/flang-rt/CMakeLists.txt +++ b/flang-rt/CMakeLists.txt @@ -115,6 +115,15 @@ endif () extend_path(FLANG_RT_INSTALL_RESOURCE_LIB_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH}" "${toolchain_lib_subdir}") cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_DIR) cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_PATH) +# FIXME: For the libflang_rt.so, the toolchain resource lib dir is not a good +#destination because it is not a ld.so default search path. +#The machine where the executable is eventually executed may not be the +#machine where the Flang compiler and its resource dir is installed, so +#setting RPath by the driver is not an solution. It should belong into +#/usr/lib//libflang_rt.so, like e.g. libgcc_s.so. +#But the linker as invoked by the Flang driver also requires +#libflang_rt.so to be found when linking and the resource lib dir is +#the only reliable location. cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_LIB_DIR) cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH) @@ -129,6 +138,27 @@ cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH) option(FLANG_RT_INCLUDE_TESTS "Generate build targets for the flang-rt unit and regression-tests." "${LLVM_INCLUDE_TESTS}") +option(FLANG_RT_ENABLE_STATIC "Build Flang-RT as a static library." 
ON) +if (WIN32) + # Windows DLL currently not implemented. + set(FLANG_RT_ENABLE_SHARED OFF) +else () + # TODO: Enable by default to increase test coverage, and which version of the + # library should be the user's choice anyway. + # Currently, the Flang driver adds `-L"libdir" -lflang_rt` as linker + # argument, which leaves the choice which library to use to the linker. + # Since most linkers prefer the shared library, this would constitute a + # breaking change unless the driver is changed. + option(FLANG_RT_ENABLE_SHARED "Build Flang-RT as a shared library." OFF) +endif () +if (NOT FLANG_RT_ENABLE_STATIC AND NOT FLANG_RT_ENABLE_SHARED) + message(FATAL_ERROR " + Must build at least one type of library + (FLANG_RT_ENABLE_STATIC=ON, FLANG_RT_ENABLE_SHARED=ON, or both) +") +endif () + + set(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT "" CACHE STRING "Compile Flang-RT with GPU support (CUDA or OpenMP)") set_property(CACHE FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT PROPERTY STRINGS "" diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake index aa8adedf61752..87ec58b2e854e 100644 --- a/flang-rt/cmake/modules/AddFlangRT.cmake +++ b/flang-rt/cmake/modules/AddFlangRT.cmake @@ -16,7 +16,8 @@ # STATIC # Build a static (.a/.lib) library # OBJECT -# Create only object files without static/dynamic library +# Always create an object library. +# Without SHARED/STATIC, build only the object library. # INSTALL_WITH_TOOLCHAIN # Install library into Clang's resource directory so it can be found by the # Flang driver during compilation, including tests @@ -48,17 +49,73 @@ function (add_flangrt_library name) ") endif () - # Forward libtype to add_library - set(extra_args "") - if (ARG_SHARED) -list(APPEND extra_args SHARED) + # Internal names of libraries. If called with just single type option, use + # the default name for it. Name of targets must only depend on function + # arguments to be predictable for callers. 
+ set(name_static "${name}.static") + set(name_shared "${name}.shared") + set(name_object "obj.${name}") + if (ARG_STATIC AND NOT ARG_SHARED) +set(name_static "${name}") + elseif (NOT ARG_STATIC AND ARG_SHARED) +set(name_shared "${name}") + elseif (NOT ARG_STATIC AND NOT ARG_SHARED AND ARG_OBJECT) +set(name_object "${name}") + elseif (NOT ARG_STATIC AND NOT ARG_SHARED AND NOT ARG_OBJECT) +# Only one of them will actually be built. +set(name_static "${name}") +set(name_shared "${name}") endif () - if (ARG_STATIC) -list(APPEND extra_args STATIC) + + # Determine what to build. If not explicitly specified, honor + # BUILD_SHARED_LIBS (e.g. for unittest libraries). If can build s
[llvm-branch-commits] [clang] [flang] [lld] [llvm] [Flang] LLVM_ENABLE_RUNTIMES=flang-rt (PR #110217)
@@ -248,7 +237,25 @@ else() include_directories(SYSTEM ${MLIR_TABLEGEN_OUTPUT_DIR}) endif() -option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" ON) +set(FLANG_INCLUDE_RUNTIME_default ON) +if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) + set(FLANG_INCLUDE_RUNTIME_default OFF) +endif () +option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" FLANG_INCLUDE_RUNTIME_default) Meinersbur wrote: Thanks, I think I didn't notice because I am always passing `-DLLVM_ENABLE_RUNTIMES=...`. https://github.com/llvm/llvm-project/pull/110217 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Set inst_pref_size to maximum (PR #126981)
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } + +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { + if (!CodeSizeInBytes.has_value()) { +const GCNSubtarget &STM = MF.getSubtarget(); +const SIInstrInfo *TII = STM.getInstrInfo(); + +uint64_t CodeSize = 0; + +for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { arsenm wrote: This is just one of the ways this function is broken: it does not account for the block alignments, which will eventually be padded with nops. https://github.com/llvm/llvm-project/pull/126981 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Set inst_pref_size to maximum (PR #126981)
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } + +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { + if (!CodeSizeInBytes.has_value()) { arsenm wrote: This should return early when CodeSizeInBytes already has a value (i.e. invert this condition), which reduces the indentation of the rest of the function. https://github.com/llvm/llvm-project/pull/126981 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Add release note for Armv9.6 updates (PR #126513)
https://github.com/CarolineConcatto updated https://github.com/llvm/llvm-project/pull/126513 >From b8f15433ed7ac0e63731cae7ab30672d5fe0703e Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Mon, 10 Feb 2025 13:14:13 + Subject: [PATCH] Add release note for Armv9.6 updates --- llvm/docs/ReleaseNotes.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 44a0b17d6a07b..3f9c1981972cb 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -151,6 +151,9 @@ Changes to the AArch64 Backend * Added support for the FUJITSU-MONAKA CPU. +* Updated feature dependency in Armv9.6 for FEAT_FAMINMAX, FEAT_LUT and + FEAT_FP8, now they depend only on FEAT_NEON. + Changes to the AMDGPU Backend - ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][docs][NFC] Replace gfx940 with gfx942 in the gfx940 ISA doc (PR #126906)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126906 >From 3be3e8dcc88ed73e52864e57e835d9cd9f31cdc7 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 08:04:43 -0500 Subject: [PATCH] [AMDGPU][docs][NFC] Replace gfx940 with gfx942 in the gfx940 ISA doc gfx940 and gfx941 are no longer supported. This is the last one of a series of PRs to remove them from the code base. The ISA documentation still contains a lot of links and file names with the "gfx940" identifier. Changing them to "gfx942" is probably not worth the cost of breaking all URLs to these pages that users might have saved in the past. For SWDEV-512631 --- llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst index 90797499dec22..7603bcc95383b 100644 --- a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst +++ b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst @@ -6,7 +6,7 @@ ** -Syntax of gfx940 Instructions +Syntax of gfx942 Instructions .. contents:: @@ -15,7 +15,7 @@ Syntax of gfx940 Instructions Introduction -This document describes the syntax of gfx940 instructions. +This document describes the syntax of gfx942 instructions. Notation ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [libc] [libclc] [llvm] [AMDGPU] Replace gfx940 and gfx941 with gfx942 in offload and libclc (PR #125826)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125826 >From e8d38e53f175c5d481c6774b330f0da15d234322 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 04:19:00 -0500 Subject: [PATCH] [AMDGPU] Replace gfx940 and gfx941 with gfx942 in offload and libclc gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631 and SWDEV-512633 --- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 2 +- libc/docs/gpu/using.rst | 2 +- libclc/CMakeLists.txt| 2 +- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 6 -- offload/test/lit.cfg | 4 +--- 5 files changed, 4 insertions(+), 12 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index 8e4f47d18535d..f1f6eb57c5d6c 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,7 +98,7 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" -"gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030" +"gfx908;gfx90a;gfx90c;gfx942;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" "gfx1152;gfx1153" diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst index 1c1f9c9bfb0c6..f17f6287be313 100644 --- a/libc/docs/gpu/using.rst +++ b/libc/docs/gpu/using.rst @@ -44,7 +44,7 @@ this shouldn't be necessary. $> clang openmp.c -fopenmp --offload-arch=gfx90a -Xoffload-linker -lc $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc - $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc + $> clang hip.hip --offload-arch=gfx942 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc This will automatically link in the needed function definitions if they were required by the user's application. 
Normally using the ``-fgpu-rdc`` option diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index c88ea9700d100..cddff9d0ec31a 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -211,7 +211,7 @@ set( cayman_aliases aruba ) set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii mullins tonga tongapro iceland carrizo fiji stoney polaris10 polaris11 gfx602 gfx705 gfx805 - gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx940 gfx941 gfx942 + gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx942 gfx1010 gfx1011 gfx1012 gfx1013 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103 diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 92184ba796dbd..e83d38a14f77f 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2854,12 +2854,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Error checkIfAPU() { // TODO: replace with ROCr API once it becomes available. llvm::StringRef StrGfxName(ComputeUnitKind); -IsAPU = llvm::StringSwitch(StrGfxName) -.Case("gfx940", true) -.Default(false); -if (IsAPU) - return Plugin::success(); - bool MayBeAPU = llvm::StringSwitch(StrGfxName) .Case("gfx942", true) .Default(false); diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 1e265d2c30904..f017bca85dd4f 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -134,12 +134,10 @@ elif config.libomptarget_current_target.startswith('amdgcn'): # amdgpu_test_arch contains a list of AMD GPUs in the system # only check the first one assuming that we will run the test on it. 
if not (config.amdgpu_test_arch.startswith("gfx90a") or -config.amdgpu_test_arch.startswith("gfx940") or config.amdgpu_test_arch.startswith("gfx942")): supports_unified_shared_memory = False # check if AMD architecture is an APU: -if (config.amdgpu_test_arch.startswith("gfx940") or -(config.amdgpu_test_arch.startswith("gfx942") and +if ((config.amdgpu_test_arch.startswith("gfx942") and evaluate_bool_env(config.environment['IS_APU']))): supports_apu = True if supports_unified_shared_memory: ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake (PR #125827)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125827 >From 55e82835c1fbf0259881c34dc55d7fd15cfd98a2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 04:45:26 -0500 Subject: [PATCH] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake --- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index f1f6eb57c5d6c..eb0e964559ed5 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,10 +98,10 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" -"gfx908;gfx90a;gfx90c;gfx942;gfx1010;gfx1030" +"gfx908;gfx90a;gfx90c;gfx942;gfx950;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" -"gfx1152;gfx1153" +"gfx1152;gfx1153;gfx1200;gfx1201" ) set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang (PR #126762)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126762 >From 7398a7f3a441fca9fbcddcc2b5db1e6a57c52165 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 11 Feb 2025 08:52:55 -0500 Subject: [PATCH] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all occurrences of gfx940/gfx941 from clang that can be removed without changes in the llvm directory. The target-invalid-cpu-note/amdgcn.c test is not included here since it tests a list of targets that is defined in llvm/lib/TargetParser/TargetParser.cpp. For SWDEV-512631 --- clang/include/clang/Basic/Cuda.h | 2 - clang/lib/Basic/Cuda.cpp | 2 - clang/lib/Basic/Targets/NVPTX.cpp | 2 - clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 - clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu | 2 +- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 - .../test/CodeGenOpenCL/builtins-amdgcn-fp8.cl | 2 +- ...cn-gfx940.cl => builtins-amdgcn-gfx942.cl} | 2 +- .../builtins-amdgcn-gfx950-err.cl | 2 +- .../builtins-amdgcn-gws-insts.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-mfma.cl | 110 +- ...fx940.cl => builtins-fp-atomics-gfx942.cl} | 34 +++--- clang/test/Driver/amdgpu-macros.cl| 2 - clang/test/Driver/amdgpu-mcpu.cl | 4 - clang/test/Driver/cuda-bad-arch.cu| 2 +- clang/test/Driver/hip-macros.hip | 10 +- .../test/Misc/target-invalid-cpu-note/nvptx.c | 2 - ... 
=> builtins-amdgcn-error-gfx942-param.cl} | 2 +- .../builtins-amdgcn-error-gfx950.cl | 2 +- ...0-err.cl => builtins-amdgcn-gfx942-err.cl} | 14 +-- 20 files changed, 91 insertions(+), 113 deletions(-) rename clang/test/CodeGenOpenCL/{builtins-amdgcn-gfx940.cl => builtins-amdgcn-gfx942.cl} (98%) rename clang/test/CodeGenOpenCL/{builtins-fp-atomics-gfx940.cl => builtins-fp-atomics-gfx942.cl} (84%) rename clang/test/SemaOpenCL/{builtins-amdgcn-error-gfx940-param.cl => builtins-amdgcn-error-gfx942-param.cl} (99%) rename clang/test/SemaOpenCL/{builtins-amdgcn-gfx940-err.cl => builtins-amdgcn-gfx942-err.cl} (81%) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index f33ba46233a7a..793cab1f4e84a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -106,8 +106,6 @@ enum class OffloadArch { GFX90a, GFX90c, GFX9_4_GENERIC, - GFX940, - GFX941, GFX942, GFX950, GFX10_1_GENERIC, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1bfec0b37c5ee..f45fb0eca3714 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -124,8 +124,6 @@ static const OffloadArchToStringMap arch_names[] = { GFX(90a), // gfx90a GFX(90c), // gfx90c {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"}, -GFX(940), // gfx940 -GFX(941), // gfx941 GFX(942), // gfx942 GFX(950), // gfx950 {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 7d13c1f145440..547cf3dfa2be7 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -211,8 +211,6 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git 
a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index c13928f61a748..826ec4da8ea28 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2302,8 +2302,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 47fa3967fe237..37fca614c3111 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -11,7 +11,7 @@ // RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefix=SAFE %s // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -target-cpu gfx940 -fnative-half-type \ +// RUN: -fcuda-is-device -target-cpu gfx942 -fnative-half-type \ // RUN: -fnative-half-arguments-and-retur
[llvm-branch-commits] [llvm] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs (PR #126887)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126887 >From 6bc2689cf31303be4451b7c8c79546465a1c1016 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 05:45:01 -0500 Subject: [PATCH] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all documentation occurrences of gfx940/gfx941 except for the gfx940 ISA description, which will be the subject of a separate PR. For SWDEV-512631 --- llvm/docs/AMDGPUOperandSyntax.rst | 4 +- llvm/docs/AMDGPUUsage.rst | 97 ++- 2 files changed, 34 insertions(+), 67 deletions(-) diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst index ff6ec6cf71ff2..e8a76322fe76a 100644 --- a/llvm/docs/AMDGPUOperandSyntax.rst +++ b/llvm/docs/AMDGPUOperandSyntax.rst @@ -63,7 +63,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *vector* registers must be even-aligned (first register must be even). @@ -183,7 +183,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *accumulator* registers must be even-aligned (first register must be even). diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index d4742bb1eaf09..3b57ea91282ec 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -323,7 +323,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. 
- **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX940-GFX942-CDNA3]_ + **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX942-CDNA3]_ --- ``gfx900`` ``amdgcn`` dGPU - xnack - Absolute - *rocm-amdhsa* - Radeon Vega flat - *pal-amdhsa*Frontier Edition @@ -378,20 +378,6 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - Ryzen 3 Pro 4350G - Ryzen 3 Pro 4350GE - ``gfx940`` ``amdgcn`` dGPU - sramecc - Architected *TBA* -- tgsplit flat -- xnack scratch .. TODO:: -- kernarg preload - Packed - work-item Add product -IDs names. - - ``gfx941`` ``amdgcn`` dGPU - sramecc - Architected *TBA* -- tgsplit flat -- xnack scratch .. TODO:: -- kernarg preload - Packed - work-item Add product -IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X - tgsplit flat - AMD Instinct MI300A - xnack scratch @@ -583,10 +569,10 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor
[llvm-branch-commits] [mlir] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR (PR #125836)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125836 >From d8557b87c87c4cf404f591e4f82e3c24928536fe Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 05:50:12 -0500 Subject: [PATCH 1/2] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631 --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 8 +++ .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 22 +-- .../ArithToAMDGPU/ArithToAMDGPU.cpp | 2 +- .../AMDGPU/Transforms/EmulateAtomics.cpp | 8 +-- .../AMDGPUToROCDL/8-bit-floats.mlir | 2 +- mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir | 2 +- .../ArithToAMDGPU/8-bit-float-saturation.mlir | 2 +- .../ArithToAMDGPU/8-bit-floats.mlir | 2 +- .../Dialect/AMDGPU/AMDGPUUtilsTest.cpp| 20 +++-- 10 files changed, 30 insertions(+), 40 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 69745addfd748..24f541587cba8 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -602,7 +602,7 @@ def AMDGPU_MFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). The negateA, negateB, and negateC flags are only supported for double-precision -operations on gfx940+. +operations on gfx942+. 
}]; let assemblyFormat = [{ $sourceA `*` $sourceB `+` $destC diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 7efa4ffa2aa6f..77401bd6de4bd 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k"> def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">; def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">; def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">; -// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a -// NEG bitfield. See IntrinsicsAMDGPU.td for more info. +// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as +// a NEG bitfield. See IntrinsicsAMDGPU.td for more info. def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">; def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">; -// New in gfx940. +// New in gfx942. 
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">; def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">; def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">; @@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">; def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>; def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>; -// 2:4 Sparsity ops (GFX940) +// 2:4 Sparsity ops (GFX942) def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">; def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">; def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">; diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 9fb51f0bc1f1e..18fd0fc3f0381 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -80,7 +80,7 @@ namespace { // Define commonly used chipsets versions for convenience. 
constexpr Chipset kGfx908 = Chipset(9, 0, 8); constexpr Chipset kGfx90a = Chipset(9, 0, 0xa); -constexpr Chipset kGfx940 = Chipset(9, 4, 0); +constexpr Chipset kGfx942 = Chipset(9, 4, 2); /// Define lowering patterns for raw buffer ops template @@ -483,7 +483,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, destElem = destType.getElementType(); if (sourceElem.isF32() && destElem.isF32()) { -if (mfma.getReducePrecision() && chipset >= kGfx940) { +if (mfma.getReducePrecision() && chipset >= kGfx942) { if (m == 32 && n == 32 && k == 4 && b == 1) return ROCDL::mfma_f32_32x32x4_xf32::getOperationName(); if (m == 16 && n == 16 && k == 8 && b == 1) @@ -551,9 +551,9 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, return ROCDL::mfma_i32_32x32x8i8::getOperationName(); if (m == 16 && n == 16 && k == 16 && b == 1) return ROCDL::mfma_i32_16x16x16i8::getOperationName(); -if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940) +
[llvm-branch-commits] [clang] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang (PR #126762)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126762 >From 7398a7f3a441fca9fbcddcc2b5db1e6a57c52165 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 11 Feb 2025 08:52:55 -0500 Subject: [PATCH] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all occurrences of gfx940/gfx941 from clang that can be removed without changes in the llvm directory. The target-invalid-cpu-note/amdgcn.c test is not included here since it tests a list of targets that is defined in llvm/lib/TargetParser/TargetParser.cpp. For SWDEV-512631 --- clang/include/clang/Basic/Cuda.h | 2 - clang/lib/Basic/Cuda.cpp | 2 - clang/lib/Basic/Targets/NVPTX.cpp | 2 - clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 - clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu | 2 +- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 - .../test/CodeGenOpenCL/builtins-amdgcn-fp8.cl | 2 +- ...cn-gfx940.cl => builtins-amdgcn-gfx942.cl} | 2 +- .../builtins-amdgcn-gfx950-err.cl | 2 +- .../builtins-amdgcn-gws-insts.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-mfma.cl | 110 +- ...fx940.cl => builtins-fp-atomics-gfx942.cl} | 34 +++--- clang/test/Driver/amdgpu-macros.cl| 2 - clang/test/Driver/amdgpu-mcpu.cl | 4 - clang/test/Driver/cuda-bad-arch.cu| 2 +- clang/test/Driver/hip-macros.hip | 10 +- .../test/Misc/target-invalid-cpu-note/nvptx.c | 2 - ... 
=> builtins-amdgcn-error-gfx942-param.cl} | 2 +- .../builtins-amdgcn-error-gfx950.cl | 2 +- ...0-err.cl => builtins-amdgcn-gfx942-err.cl} | 14 +-- 20 files changed, 91 insertions(+), 113 deletions(-) rename clang/test/CodeGenOpenCL/{builtins-amdgcn-gfx940.cl => builtins-amdgcn-gfx942.cl} (98%) rename clang/test/CodeGenOpenCL/{builtins-fp-atomics-gfx940.cl => builtins-fp-atomics-gfx942.cl} (84%) rename clang/test/SemaOpenCL/{builtins-amdgcn-error-gfx940-param.cl => builtins-amdgcn-error-gfx942-param.cl} (99%) rename clang/test/SemaOpenCL/{builtins-amdgcn-gfx940-err.cl => builtins-amdgcn-gfx942-err.cl} (81%) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index f33ba46233a7a..793cab1f4e84a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -106,8 +106,6 @@ enum class OffloadArch { GFX90a, GFX90c, GFX9_4_GENERIC, - GFX940, - GFX941, GFX942, GFX950, GFX10_1_GENERIC, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1bfec0b37c5ee..f45fb0eca3714 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -124,8 +124,6 @@ static const OffloadArchToStringMap arch_names[] = { GFX(90a), // gfx90a GFX(90c), // gfx90c {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"}, -GFX(940), // gfx940 -GFX(941), // gfx941 GFX(942), // gfx942 GFX(950), // gfx950 {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 7d13c1f145440..547cf3dfa2be7 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -211,8 +211,6 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git 
a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index c13928f61a748..826ec4da8ea28 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2302,8 +2302,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 47fa3967fe237..37fca614c3111 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -11,7 +11,7 @@ // RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefix=SAFE %s // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -target-cpu gfx940 -fnative-half-type \ +// RUN: -fcuda-is-device -target-cpu gfx942 -fnative-half-type \ // RUN: -fnative-half-arguments-and-retur
[llvm-branch-commits] [llvm] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs (PR #126887)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126887 >From 6bc2689cf31303be4451b7c8c79546465a1c1016 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 05:45:01 -0500 Subject: [PATCH] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all documentation occurrences of gfx940/gfx941 except for the gfx940 ISA description, which will be the subject of a separate PR. For SWDEV-512631 --- llvm/docs/AMDGPUOperandSyntax.rst | 4 +- llvm/docs/AMDGPUUsage.rst | 97 ++- 2 files changed, 34 insertions(+), 67 deletions(-) diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst index ff6ec6cf71ff2..e8a76322fe76a 100644 --- a/llvm/docs/AMDGPUOperandSyntax.rst +++ b/llvm/docs/AMDGPUOperandSyntax.rst @@ -63,7 +63,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *vector* registers must be even-aligned (first register must be even). @@ -183,7 +183,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *accumulator* registers must be even-aligned (first register must be even). diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index d4742bb1eaf09..3b57ea91282ec 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -323,7 +323,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. 
- **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX940-GFX942-CDNA3]_ + **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX942-CDNA3]_ --- ``gfx900`` ``amdgcn`` dGPU - xnack - Absolute - *rocm-amdhsa* - Radeon Vega flat - *pal-amdhsa*Frontier Edition @@ -378,20 +378,6 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - Ryzen 3 Pro 4350G - Ryzen 3 Pro 4350GE - ``gfx940`` ``amdgcn`` dGPU - sramecc - Architected *TBA* -- tgsplit flat -- xnack scratch .. TODO:: -- kernarg preload - Packed - work-item Add product -IDs names. - - ``gfx941`` ``amdgcn`` dGPU - sramecc - Architected *TBA* -- tgsplit flat -- xnack scratch .. TODO:: -- kernarg preload - Packed - work-item Add product -IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X - tgsplit flat - AMD Instinct MI300A - xnack scratch @@ -583,10 +569,10 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor
[llvm-branch-commits] [mlir] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR (PR #125836)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125836 >From d8557b87c87c4cf404f591e4f82e3c24928536fe Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 05:50:12 -0500 Subject: [PATCH 1/2] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631 --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 8 +++ .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 22 +-- .../ArithToAMDGPU/ArithToAMDGPU.cpp | 2 +- .../AMDGPU/Transforms/EmulateAtomics.cpp | 8 +-- .../AMDGPUToROCDL/8-bit-floats.mlir | 2 +- mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir | 2 +- .../ArithToAMDGPU/8-bit-float-saturation.mlir | 2 +- .../ArithToAMDGPU/8-bit-floats.mlir | 2 +- .../Dialect/AMDGPU/AMDGPUUtilsTest.cpp| 20 +++-- 10 files changed, 30 insertions(+), 40 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 69745addfd748..24f541587cba8 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -602,7 +602,7 @@ def AMDGPU_MFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). The negateA, negateB, and negateC flags are only supported for double-precision -operations on gfx940+. +operations on gfx942+. 
}]; let assemblyFormat = [{ $sourceA `*` $sourceB `+` $destC diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 7efa4ffa2aa6f..77401bd6de4bd 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k"> def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">; def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">; def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">; -// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a -// NEG bitfield. See IntrinsicsAMDGPU.td for more info. +// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as +// a NEG bitfield. See IntrinsicsAMDGPU.td for more info. def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">; def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">; -// New in gfx940. +// New in gfx942. 
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">; def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">; def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">; @@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">; def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>; def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>; -// 2:4 Sparsity ops (GFX940) +// 2:4 Sparsity ops (GFX942) def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">; def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">; def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">; diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 9fb51f0bc1f1e..18fd0fc3f0381 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -80,7 +80,7 @@ namespace { // Define commonly used chipsets versions for convenience. 
constexpr Chipset kGfx908 = Chipset(9, 0, 8); constexpr Chipset kGfx90a = Chipset(9, 0, 0xa); -constexpr Chipset kGfx940 = Chipset(9, 4, 0); +constexpr Chipset kGfx942 = Chipset(9, 4, 2); /// Define lowering patterns for raw buffer ops template @@ -483,7 +483,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, destElem = destType.getElementType(); if (sourceElem.isF32() && destElem.isF32()) { -if (mfma.getReducePrecision() && chipset >= kGfx940) { +if (mfma.getReducePrecision() && chipset >= kGfx942) { if (m == 32 && n == 32 && k == 4 && b == 1) return ROCDL::mfma_f32_32x32x4_xf32::getOperationName(); if (m == 16 && n == 16 && k == 8 && b == 1) @@ -551,9 +551,9 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, return ROCDL::mfma_i32_32x32x8i8::getOperationName(); if (m == 16 && n == 16 && k == 16 && b == 1) return ROCDL::mfma_i32_16x16x16i8::getOperationName(); -if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940) +
[llvm-branch-commits] [flang] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake (PR #125827)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125827 >From 55e82835c1fbf0259881c34dc55d7fd15cfd98a2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 04:45:26 -0500 Subject: [PATCH] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake --- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index f1f6eb57c5d6c..eb0e964559ed5 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,10 +98,10 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" -"gfx908;gfx90a;gfx90c;gfx942;gfx1010;gfx1030" +"gfx908;gfx90a;gfx90c;gfx942;gfx950;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" -"gfx1152;gfx1153" +"gfx1152;gfx1153;gfx1200;gfx1201" ) set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Remove FeatureForceStoreSC0SC1 (PR #126878)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126878 >From baa8adda226fea78f2fd595edadf57af72d7d049 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 04:13:18 -0500 Subject: [PATCH] [AMDGPU] Remove FeatureForceStoreSC0SC1 This was only used for gfx940 and gfx941, which have since been removed. For SWDEV-512631 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 -- llvm/lib/Target/AMDGPU/GCNSubtarget.h| 3 --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 20 3 files changed, 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3aabca49b249e..effc8d2ed6b49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1040,12 +1040,6 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", "Hazard when TRANS instructions are closely followed by a use of the result" >; -def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", - "HasForceStoreSC0SC1", - "true", - "Has SC0 and SC1 on stores" ->; - def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "HasSALUFloatInsts", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f7c5c472c93a5..6664a70572ded 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -246,7 +246,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; bool HasVALUTransUseHazard = false; - bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; bool HasVmemWriteVgprInOrder = false; bool HasAshrPkInsts = false; @@ -1264,8 +1263,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } - bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } - bool requiresCodeObjectV6() const { return RequiresCOV6; } bool hasVALUMaskWriteHazard() const { return 
getGeneration() == GFX11; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 79fb36acc0ea7..34953f9c08db7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -359,11 +359,6 @@ class SICacheControl { /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; - - virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const { -return false; - } }; class SIGfx6CacheControl : public SICacheControl { @@ -517,20 +512,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; - - bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const override { -bool Changed = false; -if (ST.hasForceStoreSC0SC1() && -(MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | -SIAtomicAddrSpace::GLOBAL | -SIAtomicAddrSpace::OTHER)) != - SIAtomicAddrSpace::NONE) { - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); -} -return Changed; - } }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -2820,7 +2801,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { Changed |= expandLoad(*MOI, MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); -Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(*MOI, MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Set inst_pref_size to maximum (PR #126981)
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } + +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { rampitec wrote: I wanted to look at this separately. Right now the problem is that AsmPrinter emits the end-of-function label in the wrong place, actually into a kernel descriptor in .rodata. This is even the wrong section. Fixing that will take more work and is really a separate thing, but once it is fixed I could replace this with an MCExpr. I.e., I can emit a separate end label, but this is also a hack. https://github.com/llvm/llvm-project/pull/126981 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [flang] [lld] [llvm] [Flang] LLVM_ENABLE_RUNTIMES=flang-rt (PR #110217)
https://github.com/jeanPerier approved this pull request. Thanks for addressing all my concerns! Let's roll! https://github.com/llvm/llvm-project/pull/110217 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [Flang-RT] Build libflang_rt.so (PR #121782)
https://github.com/jeanPerier approved this pull request. Thanks! https://github.com/llvm/llvm-project/pull/121782 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [flang] [lld] [llvm] [Flang] LLVM_ENABLE_RUNTIMES=flang-rt (PR #110217)
https://github.com/jhuber6 approved this pull request. https://github.com/llvm/llvm-project/pull/110217 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR (PR #125836)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125836 >From 411774e0d3d6007130fb16be18fe9a37da4e3bfe Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 05:50:12 -0500 Subject: [PATCH 1/2] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631 --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 8 +++ .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 22 +-- .../ArithToAMDGPU/ArithToAMDGPU.cpp | 2 +- .../AMDGPU/Transforms/EmulateAtomics.cpp | 8 +-- .../AMDGPUToROCDL/8-bit-floats.mlir | 2 +- mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir | 2 +- .../ArithToAMDGPU/8-bit-float-saturation.mlir | 2 +- .../ArithToAMDGPU/8-bit-floats.mlir | 2 +- .../Dialect/AMDGPU/AMDGPUUtilsTest.cpp| 20 +++-- 10 files changed, 30 insertions(+), 40 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 69745addfd748..24f541587cba8 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -602,7 +602,7 @@ def AMDGPU_MFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). The negateA, negateB, and negateC flags are only supported for double-precision -operations on gfx940+. +operations on gfx942+. 
}]; let assemblyFormat = [{ $sourceA `*` $sourceB `+` $destC diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 7efa4ffa2aa6f..77401bd6de4bd 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k"> def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">; def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">; def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">; -// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a -// NEG bitfield. See IntrinsicsAMDGPU.td for more info. +// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as +// a NEG bitfield. See IntrinsicsAMDGPU.td for more info. def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">; def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">; -// New in gfx940. +// New in gfx942. 
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">; def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">; def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">; @@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">; def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>; def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>; -// 2:4 Sparsity ops (GFX940) +// 2:4 Sparsity ops (GFX942) def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">; def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">; def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">; diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 9fb51f0bc1f1e..18fd0fc3f0381 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -80,7 +80,7 @@ namespace { // Define commonly used chipsets versions for convenience. 
constexpr Chipset kGfx908 = Chipset(9, 0, 8); constexpr Chipset kGfx90a = Chipset(9, 0, 0xa); -constexpr Chipset kGfx940 = Chipset(9, 4, 0); +constexpr Chipset kGfx942 = Chipset(9, 4, 2); /// Define lowering patterns for raw buffer ops template @@ -483,7 +483,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, destElem = destType.getElementType(); if (sourceElem.isF32() && destElem.isF32()) { -if (mfma.getReducePrecision() && chipset >= kGfx940) { +if (mfma.getReducePrecision() && chipset >= kGfx942) { if (m == 32 && n == 32 && k == 4 && b == 1) return ROCDL::mfma_f32_32x32x4_xf32::getOperationName(); if (m == 16 && n == 16 && k == 8 && b == 1) @@ -551,9 +551,9 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, return ROCDL::mfma_i32_32x32x8i8::getOperationName(); if (m == 16 && n == 16 && k == 16 && b == 1) return ROCDL::mfma_i32_16x16x16i8::getOperationName(); -if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940) +
[llvm-branch-commits] [clang] [flang] [lld] [llvm] [Flang] LLVM_ENABLE_RUNTIMES=flang-rt (PR #110217)
Meinersbur wrote: The pre-merge tests are failing. AFAICS, these are issues with the Buildkite bots, not with this PR. * buildkite/libcxx-ci/aarch64(-fno-exceptions): timeouts * buildkite/github-pull-requests/linux-linux-x64 and /windows-windows-x64: Linux reports failing to create the directory `flang/include`. The Windows error is `ninja: error: FindFirstFileExA(include/flang): The directory name is invalid.`. This seems to be an instance of https://github.com/ninja-build/ninja/issues/2159 and hints that in both cases, "include/flang" is a file (instead of a directory as expected). I don't see `include/flang` being a file in my local test build directory (it contains the `.mod` files); I am assuming this is a leftover from some previous build on the buildbot. Plus, this PR doesn't change anything about how the module files are being built. https://github.com/llvm/llvm-project/pull/110217 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang (PR #126762)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126762 >From a80db3a9fe9654bc91b9377f8d38e6270380f9a2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 11 Feb 2025 08:52:55 -0500 Subject: [PATCH] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all occurrences of gfx940/gfx941 from clang that can be removed without changes in the llvm directory. The target-invalid-cpu-note/amdgcn.c test is not included here since it tests a list of targets that is defined in llvm/lib/TargetParser/TargetParser.cpp. For SWDEV-512631 --- clang/include/clang/Basic/Cuda.h | 2 - clang/lib/Basic/Cuda.cpp | 2 - clang/lib/Basic/Targets/NVPTX.cpp | 2 - clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 - clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu | 2 +- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 - .../test/CodeGenOpenCL/builtins-amdgcn-fp8.cl | 2 +- ...cn-gfx940.cl => builtins-amdgcn-gfx942.cl} | 2 +- .../builtins-amdgcn-gfx950-err.cl | 2 +- .../builtins-amdgcn-gws-insts.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-mfma.cl | 110 +- ...fx940.cl => builtins-fp-atomics-gfx942.cl} | 34 +++--- clang/test/Driver/amdgpu-macros.cl| 2 - clang/test/Driver/amdgpu-mcpu.cl | 4 - clang/test/Driver/cuda-bad-arch.cu| 2 +- clang/test/Driver/hip-macros.hip | 10 +- .../test/Misc/target-invalid-cpu-note/nvptx.c | 2 - ... 
=> builtins-amdgcn-error-gfx942-param.cl} | 2 +- .../builtins-amdgcn-error-gfx950.cl | 2 +- ...0-err.cl => builtins-amdgcn-gfx942-err.cl} | 14 +-- 20 files changed, 91 insertions(+), 113 deletions(-) rename clang/test/CodeGenOpenCL/{builtins-amdgcn-gfx940.cl => builtins-amdgcn-gfx942.cl} (98%) rename clang/test/CodeGenOpenCL/{builtins-fp-atomics-gfx940.cl => builtins-fp-atomics-gfx942.cl} (84%) rename clang/test/SemaOpenCL/{builtins-amdgcn-error-gfx940-param.cl => builtins-amdgcn-error-gfx942-param.cl} (99%) rename clang/test/SemaOpenCL/{builtins-amdgcn-gfx940-err.cl => builtins-amdgcn-gfx942-err.cl} (81%) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index f33ba46233a7a..793cab1f4e84a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -106,8 +106,6 @@ enum class OffloadArch { GFX90a, GFX90c, GFX9_4_GENERIC, - GFX940, - GFX941, GFX942, GFX950, GFX10_1_GENERIC, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1bfec0b37c5ee..f45fb0eca3714 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -124,8 +124,6 @@ static const OffloadArchToStringMap arch_names[] = { GFX(90a), // gfx90a GFX(90c), // gfx90c {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"}, -GFX(940), // gfx940 -GFX(941), // gfx941 GFX(942), // gfx942 GFX(950), // gfx950 {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 7d13c1f145440..547cf3dfa2be7 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -211,8 +211,6 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git 
a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index c13928f61a748..826ec4da8ea28 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2302,8 +2302,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 47fa3967fe237..37fca614c3111 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -11,7 +11,7 @@ // RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefix=SAFE %s // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -target-cpu gfx940 -fnative-half-type \ +// RUN: -fcuda-is-device -target-cpu gfx942 -fnative-half-type \ // RUN: -fnative-half-arguments-and-retur
[llvm-branch-commits] [llvm] [AMDGPU] Remove FeatureForceStoreSC0SC1 (PR #126878)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126878 >From 731f633e66cf3118e5764ed4d0acbd794d2ca2d2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 04:13:18 -0500 Subject: [PATCH] [AMDGPU] Remove FeatureForceStoreSC0SC1 This was only used for gfx940 and gfx941, which have since been removed. For SWDEV-512631 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 -- llvm/lib/Target/AMDGPU/GCNSubtarget.h| 3 --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 20 3 files changed, 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3aabca49b249e..effc8d2ed6b49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1040,12 +1040,6 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", "Hazard when TRANS instructions are closely followed by a use of the result" >; -def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", - "HasForceStoreSC0SC1", - "true", - "Has SC0 and SC1 on stores" ->; - def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "HasSALUFloatInsts", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f7c5c472c93a5..6664a70572ded 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -246,7 +246,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; bool HasVALUTransUseHazard = false; - bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; bool HasVmemWriteVgprInOrder = false; bool HasAshrPkInsts = false; @@ -1264,8 +1263,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } - bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } - bool requiresCodeObjectV6() const { return RequiresCOV6; } bool hasVALUMaskWriteHazard() const { return 
getGeneration() == GFX11; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 79fb36acc0ea7..34953f9c08db7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -359,11 +359,6 @@ class SICacheControl { /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; - - virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const { -return false; - } }; class SIGfx6CacheControl : public SICacheControl { @@ -517,20 +512,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; - - bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const override { -bool Changed = false; -if (ST.hasForceStoreSC0SC1() && -(MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | -SIAtomicAddrSpace::GLOBAL | -SIAtomicAddrSpace::OTHER)) != - SIAtomicAddrSpace::NONE) { - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); -} -return Changed; - } }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -2820,7 +2801,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { Changed |= expandLoad(*MOI, MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); -Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(*MOI, MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR (PR #125836)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125836 >From 411774e0d3d6007130fb16be18fe9a37da4e3bfe Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 05:50:12 -0500 Subject: [PATCH 1/2] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631 --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 8 +++ .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 22 +-- .../ArithToAMDGPU/ArithToAMDGPU.cpp | 2 +- .../AMDGPU/Transforms/EmulateAtomics.cpp | 8 +-- .../AMDGPUToROCDL/8-bit-floats.mlir | 2 +- mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir | 2 +- .../ArithToAMDGPU/8-bit-float-saturation.mlir | 2 +- .../ArithToAMDGPU/8-bit-floats.mlir | 2 +- .../Dialect/AMDGPU/AMDGPUUtilsTest.cpp| 20 +++-- 10 files changed, 30 insertions(+), 40 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 69745addfd748..24f541587cba8 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -602,7 +602,7 @@ def AMDGPU_MFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). The negateA, negateB, and negateC flags are only supported for double-precision -operations on gfx940+. +operations on gfx942+. 
}]; let assemblyFormat = [{ $sourceA `*` $sourceB `+` $destC diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 7efa4ffa2aa6f..77401bd6de4bd 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k"> def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">; def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">; def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">; -// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a -// NEG bitfield. See IntrinsicsAMDGPU.td for more info. +// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as +// a NEG bitfield. See IntrinsicsAMDGPU.td for more info. def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">; def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">; -// New in gfx940. +// New in gfx942. 
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">; def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">; def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">; @@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">; def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>; def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>; -// 2:4 Sparsity ops (GFX940) +// 2:4 Sparsity ops (GFX942) def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">; def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">; def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">; diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 9fb51f0bc1f1e..18fd0fc3f0381 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -80,7 +80,7 @@ namespace { // Define commonly used chipsets versions for convenience. 
constexpr Chipset kGfx908 = Chipset(9, 0, 8); constexpr Chipset kGfx90a = Chipset(9, 0, 0xa); -constexpr Chipset kGfx940 = Chipset(9, 4, 0); +constexpr Chipset kGfx942 = Chipset(9, 4, 2); /// Define lowering patterns for raw buffer ops template @@ -483,7 +483,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, destElem = destType.getElementType(); if (sourceElem.isF32() && destElem.isF32()) { -if (mfma.getReducePrecision() && chipset >= kGfx940) { +if (mfma.getReducePrecision() && chipset >= kGfx942) { if (m == 32 && n == 32 && k == 4 && b == 1) return ROCDL::mfma_f32_32x32x4_xf32::getOperationName(); if (m == 16 && n == 16 && k == 8 && b == 1) @@ -551,9 +551,9 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, return ROCDL::mfma_i32_32x32x8i8::getOperationName(); if (m == 16 && n == 16 && k == 16 && b == 1) return ROCDL::mfma_i32_16x16x16i8::getOperationName(); -if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940) +
[llvm-branch-commits] [flang] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake (PR #125827)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125827 >From 51b377df31cc5dcd9e1fd7e5f8ff69a1e4a25cbe Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 04:45:26 -0500 Subject: [PATCH] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake --- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index f1f6eb57c5d6c..eb0e964559ed5 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,10 +98,10 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" -"gfx908;gfx90a;gfx90c;gfx942;gfx1010;gfx1030" +"gfx908;gfx90a;gfx90c;gfx942;gfx950;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" -"gfx1152;gfx1153" +"gfx1152;gfx1153;gfx1200;gfx1201" ) set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Remove FeatureForceStoreSC0SC1 (PR #126878)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126878 >From 731f633e66cf3118e5764ed4d0acbd794d2ca2d2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 04:13:18 -0500 Subject: [PATCH] [AMDGPU] Remove FeatureForceStoreSC0SC1 This was only used for gfx940 and gfx941, which have since been removed. For SWDEV-512631 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 -- llvm/lib/Target/AMDGPU/GCNSubtarget.h| 3 --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 20 3 files changed, 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3aabca49b249e..effc8d2ed6b49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1040,12 +1040,6 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", "Hazard when TRANS instructions are closely followed by a use of the result" >; -def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", - "HasForceStoreSC0SC1", - "true", - "Has SC0 and SC1 on stores" ->; - def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "HasSALUFloatInsts", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f7c5c472c93a5..6664a70572ded 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -246,7 +246,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; bool HasVALUTransUseHazard = false; - bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; bool HasVmemWriteVgprInOrder = false; bool HasAshrPkInsts = false; @@ -1264,8 +1263,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } - bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } - bool requiresCodeObjectV6() const { return RequiresCOV6; } bool hasVALUMaskWriteHazard() const { return 
getGeneration() == GFX11; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 79fb36acc0ea7..34953f9c08db7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -359,11 +359,6 @@ class SICacheControl { /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; - - virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const { -return false; - } }; class SIGfx6CacheControl : public SICacheControl { @@ -517,20 +512,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; - - bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const override { -bool Changed = false; -if (ST.hasForceStoreSC0SC1() && -(MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | -SIAtomicAddrSpace::GLOBAL | -SIAtomicAddrSpace::OTHER)) != - SIAtomicAddrSpace::NONE) { - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); -} -return Changed; - } }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -2820,7 +2801,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { Changed |= expandLoad(*MOI, MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); -Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(*MOI, MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang (PR #126762)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126762 >From a80db3a9fe9654bc91b9377f8d38e6270380f9a2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 11 Feb 2025 08:52:55 -0500 Subject: [PATCH] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all occurrences of gfx940/gfx941 from clang that can be removed without changes in the llvm directory. The target-invalid-cpu-note/amdgcn.c test is not included here since it tests a list of targets that is defined in llvm/lib/TargetParser/TargetParser.cpp. For SWDEV-512631 --- clang/include/clang/Basic/Cuda.h | 2 - clang/lib/Basic/Cuda.cpp | 2 - clang/lib/Basic/Targets/NVPTX.cpp | 2 - clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 - clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu | 2 +- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 - .../test/CodeGenOpenCL/builtins-amdgcn-fp8.cl | 2 +- ...cn-gfx940.cl => builtins-amdgcn-gfx942.cl} | 2 +- .../builtins-amdgcn-gfx950-err.cl | 2 +- .../builtins-amdgcn-gws-insts.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-mfma.cl | 110 +- ...fx940.cl => builtins-fp-atomics-gfx942.cl} | 34 +++--- clang/test/Driver/amdgpu-macros.cl| 2 - clang/test/Driver/amdgpu-mcpu.cl | 4 - clang/test/Driver/cuda-bad-arch.cu| 2 +- clang/test/Driver/hip-macros.hip | 10 +- .../test/Misc/target-invalid-cpu-note/nvptx.c | 2 - ... 
=> builtins-amdgcn-error-gfx942-param.cl} | 2 +- .../builtins-amdgcn-error-gfx950.cl | 2 +- ...0-err.cl => builtins-amdgcn-gfx942-err.cl} | 14 +-- 20 files changed, 91 insertions(+), 113 deletions(-) rename clang/test/CodeGenOpenCL/{builtins-amdgcn-gfx940.cl => builtins-amdgcn-gfx942.cl} (98%) rename clang/test/CodeGenOpenCL/{builtins-fp-atomics-gfx940.cl => builtins-fp-atomics-gfx942.cl} (84%) rename clang/test/SemaOpenCL/{builtins-amdgcn-error-gfx940-param.cl => builtins-amdgcn-error-gfx942-param.cl} (99%) rename clang/test/SemaOpenCL/{builtins-amdgcn-gfx940-err.cl => builtins-amdgcn-gfx942-err.cl} (81%) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index f33ba46233a7a..793cab1f4e84a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -106,8 +106,6 @@ enum class OffloadArch { GFX90a, GFX90c, GFX9_4_GENERIC, - GFX940, - GFX941, GFX942, GFX950, GFX10_1_GENERIC, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1bfec0b37c5ee..f45fb0eca3714 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -124,8 +124,6 @@ static const OffloadArchToStringMap arch_names[] = { GFX(90a), // gfx90a GFX(90c), // gfx90c {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"}, -GFX(940), // gfx940 -GFX(941), // gfx941 GFX(942), // gfx942 GFX(950), // gfx950 {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 7d13c1f145440..547cf3dfa2be7 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -211,8 +211,6 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git 
a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index c13928f61a748..826ec4da8ea28 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2302,8 +2302,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 47fa3967fe237..37fca614c3111 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -11,7 +11,7 @@ // RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefix=SAFE %s // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -target-cpu gfx940 -fnative-half-type \ +// RUN: -fcuda-is-device -target-cpu gfx942 -fnative-half-type \ // RUN: -fnative-half-arguments-and-retur
[llvm-branch-commits] [llvm] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs (PR #126887)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126887 >From a5d359bb1a9b271316617fd84e10c64912e3c6a4 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 05:45:01 -0500 Subject: [PATCH] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all documentation occurrences of gfx940/gfx941 except for the gfx940 ISA description, which will be the subject of a separate PR. For SWDEV-512631 --- llvm/docs/AMDGPUOperandSyntax.rst | 4 +- llvm/docs/AMDGPUUsage.rst | 97 ++- 2 files changed, 34 insertions(+), 67 deletions(-) diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst index ff6ec6cf71ff2..e8a76322fe76a 100644 --- a/llvm/docs/AMDGPUOperandSyntax.rst +++ b/llvm/docs/AMDGPUOperandSyntax.rst @@ -63,7 +63,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *vector* registers must be even-aligned (first register must be even). @@ -183,7 +183,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *accumulator* registers must be even-aligned (first register must be even). diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index d4742bb1eaf09..3b57ea91282ec 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -323,7 +323,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. 
- **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX940-GFX942-CDNA3]_ + **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX942-CDNA3]_ --- ``gfx900`` ``amdgcn`` dGPU - xnack - Absolute - *rocm-amdhsa* - Radeon Vega flat - *pal-amdhsa*Frontier Edition @@ -378,20 +378,6 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - Ryzen 3 Pro 4350G - Ryzen 3 Pro 4350GE - ``gfx940`` ``amdgcn`` dGPU - sramecc - Architected *TBA* -- tgsplit flat -- xnack scratch .. TODO:: -- kernarg preload - Packed - work-item Add product -IDs names. - - ``gfx941`` ``amdgcn`` dGPU - sramecc - Architected *TBA* -- tgsplit flat -- xnack scratch .. TODO:: -- kernarg preload - Packed - work-item Add product -IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X - tgsplit flat - AMD Instinct MI300A - xnack scratch @@ -583,10 +569,10 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor
[llvm-branch-commits] [llvm] [AMDGPU][docs][NFC] Replace gfx940 with gfx942 in the gfx940 ISA doc (PR #126906)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126906 >From 5bfa81dbc27204e4a7b8b8cafc57bfbab56467d4 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 08:04:43 -0500 Subject: [PATCH] [AMDGPU][docs][NFC] Replace gfx940 with gfx942 in the gfx940 ISA doc gfx940 and gfx941 are no longer supported. This is the last one of a series of PRs to remove them from the code base. The ISA documentation still contains a lot of links and file names with the "gfx940" identifier. Changing them to "gfx942" is probably not worth the cost of breaking all URLs to these pages that users might have saved in the past. For SWDEV-512631 --- llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst index 90797499dec22..7603bcc95383b 100644 --- a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst +++ b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst @@ -6,7 +6,7 @@ ** -Syntax of gfx940 Instructions +Syntax of gfx942 Instructions .. contents:: @@ -15,7 +15,7 @@ Syntax of gfx940 Instructions Introduction -This document describes the syntax of gfx940 instructions. +This document describes the syntax of gfx942 instructions. Notation ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake (PR #125827)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125827 >From 51b377df31cc5dcd9e1fd7e5f8ff69a1e4a25cbe Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 04:45:26 -0500 Subject: [PATCH] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake --- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index f1f6eb57c5d6c..eb0e964559ed5 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,10 +98,10 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" -"gfx908;gfx90a;gfx90c;gfx942;gfx1010;gfx1030" +"gfx908;gfx90a;gfx90c;gfx942;gfx950;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" -"gfx1152;gfx1153" +"gfx1152;gfx1153;gfx1200;gfx1201" ) set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs (PR #126887)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126887 >From a5d359bb1a9b271316617fd84e10c64912e3c6a4 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 05:45:01 -0500 Subject: [PATCH] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all documentation occurrences of gfx940/gfx941 except for the gfx940 ISA description, which will be the subject of a separate PR. For SWDEV-512631 --- llvm/docs/AMDGPUOperandSyntax.rst | 4 +- llvm/docs/AMDGPUUsage.rst | 97 ++- 2 files changed, 34 insertions(+), 67 deletions(-) diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst index ff6ec6cf71ff2..e8a76322fe76a 100644 --- a/llvm/docs/AMDGPUOperandSyntax.rst +++ b/llvm/docs/AMDGPUOperandSyntax.rst @@ -63,7 +63,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *vector* registers must be even-aligned (first register must be even). @@ -183,7 +183,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *accumulator* registers must be even-aligned (first register must be even). diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index d4742bb1eaf09..3b57ea91282ec 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -323,7 +323,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. 
- **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX940-GFX942-CDNA3]_ + **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX942-CDNA3]_ --- ``gfx900`` ``amdgcn`` dGPU - xnack - Absolute - *rocm-amdhsa* - Radeon Vega flat - *pal-amdhsa*Frontier Edition @@ -378,20 +378,6 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - Ryzen 3 Pro 4350G - Ryzen 3 Pro 4350GE - ``gfx940`` ``amdgcn`` dGPU - sramecc - Architected *TBA* -- tgsplit flat -- xnack scratch .. TODO:: -- kernarg preload - Packed - work-item Add product -IDs names. - - ``gfx941`` ``amdgcn`` dGPU - sramecc - Architected *TBA* -- tgsplit flat -- xnack scratch .. TODO:: -- kernarg preload - Packed - work-item Add product -IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X - tgsplit flat - AMD Instinct MI300A - xnack scratch @@ -583,10 +569,10 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor
[llvm-branch-commits] [libc] release/20.x: [libc] Move __LLVM_LIBC__ define to __llvm-libc-common.h (#126877) (PR #126960)
https://github.com/lntue approved this pull request. https://github.com/llvm/llvm-project/pull/126960 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Set inst_pref_size to maximum (PR #126981)
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } + +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { + if (!CodeSizeInBytes.has_value()) { +const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); +const SIInstrInfo *TII = STM.getInstrInfo(); + +uint64_t CodeSize = 0; + +for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { +// TODO: CodeSize should account for multiple functions. + +// TODO: Should we count size of debug info? +if (MI.isDebugInstr()) rampitec wrote: That said, the function was simply moved as is; the only added functionality is caching. And yes, it is incorrect and always was, at least because it does not correctly handle inline asm. https://github.com/llvm/llvm-project/pull/126981 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Remove FeatureForceStoreSC0SC1 (PR #126878)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/126878 >From baa8adda226fea78f2fd595edadf57af72d7d049 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 12 Feb 2025 04:13:18 -0500 Subject: [PATCH] [AMDGPU] Remove FeatureForceStoreSC0SC1 This was only used for gfx940 and gfx941, which have since been removed. For SWDEV-512631 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 -- llvm/lib/Target/AMDGPU/GCNSubtarget.h| 3 --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 20 3 files changed, 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3aabca49b249e..effc8d2ed6b49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1040,12 +1040,6 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", "Hazard when TRANS instructions are closely followed by a use of the result" >; -def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", - "HasForceStoreSC0SC1", - "true", - "Has SC0 and SC1 on stores" ->; - def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "HasSALUFloatInsts", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f7c5c472c93a5..6664a70572ded 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -246,7 +246,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; bool HasVALUTransUseHazard = false; - bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; bool HasVmemWriteVgprInOrder = false; bool HasAshrPkInsts = false; @@ -1264,8 +1263,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } - bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } - bool requiresCodeObjectV6() const { return RequiresCOV6; } bool hasVALUMaskWriteHazard() const { return 
getGeneration() == GFX11; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 79fb36acc0ea7..34953f9c08db7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -359,11 +359,6 @@ class SICacheControl { /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; - - virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const { -return false; - } }; class SIGfx6CacheControl : public SICacheControl { @@ -517,20 +512,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; - - bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const override { -bool Changed = false; -if (ST.hasForceStoreSC0SC1() && -(MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | -SIAtomicAddrSpace::GLOBAL | -SIAtomicAddrSpace::OTHER)) != - SIAtomicAddrSpace::NONE) { - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); -} -return Changed; - } }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -2820,7 +2801,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { Changed |= expandLoad(*MOI, MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); -Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(*MOI, MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [flang] [lld] [llvm] [Flang] LLVM_ENABLE_RUNTIMES=flang-rt (PR #110217)
@@ -248,7 +237,25 @@ else() include_directories(SYSTEM ${MLIR_TABLEGEN_OUTPUT_DIR}) endif() -option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" ON) +set(FLANG_INCLUDE_RUNTIME_default ON) +if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) + set(FLANG_INCLUDE_RUNTIME_default OFF) +endif () +option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" FLANG_INCLUDE_RUNTIME_default) jeanPerier wrote: `FLANG_INCLUDE_RUNTIME_default` -> `${FLANG_INCLUDE_RUNTIME_default}`. Otherwise, this always evaluates to OFF for me, and I do not get the runtime to be built by default. https://github.com/llvm/llvm-project/pull/110217 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [libc] [libclc] [llvm] [AMDGPU] Replace gfx940 and gfx941 with gfx942 in offload and libclc (PR #125826)
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125826 >From e8d38e53f175c5d481c6774b330f0da15d234322 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 5 Feb 2025 04:19:00 -0500 Subject: [PATCH] [AMDGPU] Replace gfx940 and gfx941 with gfx942 in offload and libclc gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631 and SWDEV-512633 --- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 2 +- libc/docs/gpu/using.rst | 2 +- libclc/CMakeLists.txt| 2 +- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 6 -- offload/test/lit.cfg | 4 +--- 5 files changed, 4 insertions(+), 12 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index 8e4f47d18535d..f1f6eb57c5d6c 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,7 +98,7 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" -"gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030" +"gfx908;gfx90a;gfx90c;gfx942;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" "gfx1152;gfx1153" diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst index 1c1f9c9bfb0c6..f17f6287be313 100644 --- a/libc/docs/gpu/using.rst +++ b/libc/docs/gpu/using.rst @@ -44,7 +44,7 @@ this shouldn't be necessary. $> clang openmp.c -fopenmp --offload-arch=gfx90a -Xoffload-linker -lc $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc - $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc + $> clang hip.hip --offload-arch=gfx942 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc This will automatically link in the needed function definitions if they were required by the user's application. 
Normally using the ``-fgpu-rdc`` option diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index c88ea9700d100..cddff9d0ec31a 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -211,7 +211,7 @@ set( cayman_aliases aruba ) set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii mullins tonga tongapro iceland carrizo fiji stoney polaris10 polaris11 gfx602 gfx705 gfx805 - gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx940 gfx941 gfx942 + gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx942 gfx1010 gfx1011 gfx1012 gfx1013 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103 diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 92184ba796dbd..e83d38a14f77f 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2854,12 +2854,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Error checkIfAPU() { // TODO: replace with ROCr API once it becomes available. llvm::StringRef StrGfxName(ComputeUnitKind); -IsAPU = llvm::StringSwitch<bool>(StrGfxName) -.Case("gfx940", true) -.Default(false); -if (IsAPU) - return Plugin::success(); - bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName) .Case("gfx942", true) .Default(false); diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 1e265d2c30904..f017bca85dd4f 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -134,12 +134,10 @@ elif config.libomptarget_current_target.startswith('amdgcn'): # amdgpu_test_arch contains a list of AMD GPUs in the system # only check the first one assuming that we will run the test on it. 
if not (config.amdgpu_test_arch.startswith("gfx90a") or -config.amdgpu_test_arch.startswith("gfx940") or config.amdgpu_test_arch.startswith("gfx942")): supports_unified_shared_memory = False # check if AMD architecture is an APU: -if (config.amdgpu_test_arch.startswith("gfx940") or -(config.amdgpu_test_arch.startswith("gfx942") and +if ((config.amdgpu_test_arch.startswith("gfx942") and evaluate_bool_env(config.environment['IS_APU']))): supports_apu = True if supports_unified_shared_memory: ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Set inst_pref_size to maximum (PR #126981)
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } + +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { + if (!CodeSizeInBytes.has_value()) { +const GCNSubtarget &STM = MF.getSubtarget(); +const SIInstrInfo *TII = STM.getInstrInfo(); + +uint64_t CodeSize = 0; + +for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { +// TODO: CodeSize should account for multiple functions. + +// TODO: Should we count size of debug info? +if (MI.isDebugInstr()) rampitec wrote: Since these are really somewhat unrelated changes, I have split them into a separate PR, https://github.com/llvm/llvm-project/pull/127111, which is just a move of the code, and will create yet another PR to address the functional comments. https://github.com/llvm/llvm-project/pull/126981 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Set inst_pref_size to maximum (PR #126981)
https://github.com/rampitec edited https://github.com/llvm/llvm-project/pull/126981 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libc] release/20.x: libc/cmake: don't fail if LLVM_VERSION_SUFFIX isn't defined (#126359) (PR #127099)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/127099 Backport c81139f417a209dbd2a4e06465483d4b0951a9ac Requested by: @jhuber6 >From 5d310a324700bd34ecc369d3b0b66b3763a4d27a Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Thu, 13 Feb 2025 18:42:28 +0100 Subject: [PATCH] libc/cmake: don't fail if LLVM_VERSION_SUFFIX isn't defined (#126359) Closes: #126358 cc @samvangysegem - Co-authored-by: Joseph Huber (cherry picked from commit c81139f417a209dbd2a4e06465483d4b0951a9ac) --- libc/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 1c4c0cd5aa22b..47708c2267818 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -51,7 +51,7 @@ set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel header # Defining a global namespace to enclose all libc functions. set(default_namespace "__llvm_libc") if(LLVM_VERSION_MAJOR) - string(REPLACE "-" "" NS_LLVM_VERSION_SUFFIX ${LLVM_VERSION_SUFFIX}) + string(REPLACE "-" "" NS_LLVM_VERSION_SUFFIX "${LLVM_VERSION_SUFFIX}") set(default_namespace "__llvm_libc_${LLVM_VERSION_MAJOR}_${LLVM_VERSION_MINOR}_${LLVM_VERSION_PATCH}_${NS_LLVM_VERSION_SUFFIX}") endif() set(LIBC_NAMESPACE ${default_namespace} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] [lld][LoongArch] Relax call36/tail36: R_LARCH_CALL36 (PR #123576)
@@ -830,6 +832,37 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, remove = 4; } +// Relax code sequence. +// From: +// pcaddu18i $ra, %call36(foo) +// jirl $ra, $ra, 0 +// To: +// b/bl foo +static void relaxCall36(Ctx &ctx, const InputSection &sec, size_t i, +uint64_t loc, Relocation &r, uint32_t &remove) { + const uint64_t symLocal = + (r.expr == R_PLT_PC ? r.sym->getPltVA(ctx) : r.sym->getVA(ctx)) + + r.addend; + + const int64_t distance = symLocal - loc; MaskRay wrote: and this `displace` https://github.com/llvm/llvm-project/pull/123576 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] [lld][LoongArch] Relax call36/tail36: R_LARCH_CALL36 (PR #123576)
@@ -830,6 +832,37 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, remove = 4; } +// Relax code sequence. +// From: +// pcaddu18i $ra, %call36(foo) +// jirl $ra, $ra, 0 +// To: +// b/bl foo +static void relaxCall36(Ctx &ctx, const InputSection &sec, size_t i, +uint64_t loc, Relocation &r, uint32_t &remove) { + const uint64_t symLocal = MaskRay wrote: Local in symLocal is misleading. In RISCV we name this `dest` https://github.com/llvm/llvm-project/pull/123576 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] [lld][LoongArch] Relax call36/tail36: R_LARCH_CALL36 (PR #123576)
@@ -0,0 +1,61 @@ +# REQUIRES: loongarch MaskRay wrote: omit `-2` from the name https://github.com/llvm/llvm-project/pull/123576 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Early bail in getFunctionCodeSize for meta inst. NFC. (PR #127129)
rampitec wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack [on Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/127129?utm_source=stack-comment-downstack-mergeability-warning). > [Learn more](https://graphite.dev/docs/merge-pull-requests) * **#127129** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/127129?utm_source=stack-comment-icon) 👈 [(View in Graphite)](https://app.graphite.dev/github/pr/llvm/llvm-project/127129?utm_source=stack-comment-view-in-graphite) * **#127111** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/127111?utm_source=stack-comment-icon): 1 other dependent PR ([#126981](https://github.com/llvm/llvm-project/pull/126981) [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/126981?utm_source=stack-comment-icon)) * `main` This stack of pull requests is managed by [Graphite](https://graphite.dev?utm-source=stack-comment). Learn more about [stacking](https://stacking.dev/?utm_source=stack-comment). https://github.com/llvm/llvm-project/pull/127129 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Early bail in getFunctionCodeSize for meta inst. NFC. (PR #127129)
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/127129 It does not change the estimate because getInstSizeInBytes() already returns 0 for meta instructions, but added a test and early bail. >From c0489545755c98dc2f87ffcd83af929816643074 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 13 Feb 2025 13:19:26 -0800 Subject: [PATCH] [AMDGPU] Early bail in getFunctionCodeSize for meta inst. NFC. It does not change the estimate because getInstSizeInBytes() already returns 0 for meta instructions, but added a test and early bail. --- llvm/lib/Target/AMDGPU/SIProgramInfo.cpp| 2 +- llvm/test/CodeGen/AMDGPU/code-size-estimate.mir | 13 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 5179288084010..b995687e71780 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -216,7 +216,7 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { // TODO: CodeSize should account for multiple functions. // TODO: Should we count size of debug info? - if (MI.isDebugInstr()) + if (MI.isDebugInstr() || MI.isMetaInstruction()) continue; CodeSize += TII->getInstSizeInBytes(MI); diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir index 9e46c58b6b5a9..76eaf350301e4 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir @@ -18,3 +18,16 @@ body: | $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) ... 
+ +# CHECK: meta: ; @meta +# CHECK: ; wave barrier +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: ; codeLenInByte = 4 +--- +name:meta +tracksRegLiveness: true +body: | + bb.0: + + WAVE_BARRIER +... ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Early bail in getFunctionCodeSize for meta inst. NFC. (PR #127129)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) Changes It does not change the estimate because getInstSizeInBytes() already returns 0 for meta instructions, but added a test and early bail. --- Full diff: https://github.com/llvm/llvm-project/pull/127129.diff 2 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIProgramInfo.cpp (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/code-size-estimate.mir (+13) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 5179288084010..b995687e71780 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -216,7 +216,7 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { // TODO: CodeSize should account for multiple functions. // TODO: Should we count size of debug info? - if (MI.isDebugInstr()) + if (MI.isDebugInstr() || MI.isMetaInstruction()) continue; CodeSize += TII->getInstSizeInBytes(MI); diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir index 9e46c58b6b5a9..76eaf350301e4 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir @@ -18,3 +18,16 @@ body: | $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) ... + +# CHECK: meta: ; @meta +# CHECK: ; wave barrier +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: ; codeLenInByte = 4 +--- +name:meta +tracksRegLiveness: true +body: | + bb.0: + + WAVE_BARRIER +... 
`` https://github.com/llvm/llvm-project/pull/127129 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [ORC][unittests] Remove hard coded 16k page size (#127115) (PR #127144)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/127144 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [ORC][unittests] Remove hard coded 16k page size (#127115) (PR #127144)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/127144 Backport 415607e Requested by: @RossComputerGuy >From 7d0590cc577b4b6ff243fb733c8c9ac0cc6dc221 Mon Sep 17 00:00:00 2001 From: Tristan Ross Date: Thu, 13 Feb 2025 14:56:55 -0800 Subject: [PATCH] [ORC][unittests] Remove hard coded 16k page size (#127115) Fixes a couple hard coded 16k values which is being used as the page size. Replaces the hard coded value with the system's page size. This fixes #116753 on an Ampere Altra Q64-22 CC @lhames (cherry picked from commit 415607e10b56d0e6c4661ff1ec5b9b46bf433cba) --- .../Orc/JITLinkRedirectionManagerTest.cpp | 10 -- .../ExecutionEngine/Orc/ReOptimizeLayerTest.cpp| 9 - 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp index 4b8a3efe680f1..a57241b8a3da6 100644 --- a/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp @@ -48,12 +48,18 @@ class JITLinkRedirectionManagerTest : public testing::Test { if (Triple.isPPC()) GTEST_SKIP(); +auto PageSize = sys::Process::getPageSize(); +if (!PageSize) { + consumeError(PageSize.takeError()); + GTEST_SKIP(); +} + ES = std::make_unique( std::make_unique( -nullptr, nullptr, JTMB->getTargetTriple().getTriple())); +nullptr, nullptr, JTMB->getTargetTriple().getTriple(), *PageSize)); JD = &ES->createBareJITDylib("main"); ObjLinkingLayer = std::make_unique( -*ES, std::make_unique(16384)); +*ES, std::make_unique(*PageSize)); DL = std::make_unique(std::move(*DLOrErr)); } JITDylib *JD{nullptr}; diff --git a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp index 083a924ce9aa1..991b12def55fa 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp +++ 
b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp @@ -66,10 +66,17 @@ class ReOptimizeLayerTest : public testing::Test { consumeError(DLOrErr.takeError()); GTEST_SKIP(); } + +auto PageSize = sys::Process::getPageSize(); +if (!PageSize) { + consumeError(PageSize.takeError()); + GTEST_SKIP(); +} + ES = std::make_unique(std::move(*EPC)); JD = &ES->createBareJITDylib("main"); ObjLinkingLayer = std::make_unique( -*ES, std::make_unique(16384)); +*ES, std::make_unique(*PageSize)); DL = std::make_unique(std::move(*DLOrErr)); auto TM = JTMB->createTargetMachine(); ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [ORC][unittests] Remove hard coded 16k page size (#127115) (PR #127144)
llvmbot wrote: @lhames What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/127144 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Respect MBB alignment in the getFunctionCodeSize() (PR #127142)
@@ -212,6 +212,8 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { uint64_t CodeSize = 0; for (const MachineBasicBlock &MBB : MF) { +CodeSize = alignTo(CodeSize, MBB.getAlignment()); rampitec wrote: A pessimistic overestimate is actually worse for some applications of this function. For what I am doing now it may result in prefetching memory far beyond the program. I believe our estimates should be correct except for inline asm... https://github.com/llvm/llvm-project/pull/127142 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [ORC][unittests] Remove hard coded 16k page size (#127115) (PR #127144)
https://github.com/lhames approved this pull request. https://github.com/llvm/llvm-project/pull/127144 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Respect MBB alignment in the getFunctionCodeSize() (PR #127142)
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/127142 None >From d01d16815ade61a599b94bb18bc292e326767f15 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 13 Feb 2025 14:46:37 -0800 Subject: [PATCH] [AMDGPU] Respect MBB alignment in the getFunctionCodeSize() --- llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 2 + .../CodeGen/AMDGPU/code-size-estimate.mir | 89 +++ 2 files changed, 91 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index b995687e71780..9d9b4c83ac388 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -212,6 +212,8 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { uint64_t CodeSize = 0; for (const MachineBasicBlock &MBB : MF) { +CodeSize = alignTo(CodeSize, MBB.getAlignment()); + for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir index 76eaf350301e4..9ae536af6f0e9 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir @@ -31,3 +31,92 @@ body: | WAVE_BARRIER ... + +# CHECK: align4: ; @align4 +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: s_cbranch_scc1 .LBB{{[0-9_]+}} ; encoding: [A,A,0x85,0xbf] +# CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] +# CHECK: .p2align2 +# CHECK: s_endpgm; encoding: [0x00,0x00,0x81,0xbf] +# CHECK: ; codeLenInByte = 16 + +--- +name:align4 +tracksRegLiveness: true +body: | + bb.0: +$scc = IMPLICIT_DEF +S_CBRANCH_SCC1 %bb.2, implicit $scc + + bb.1: +S_BARRIER + + bb.2 (align 4): +S_ENDPGM 0 +... 
+ +# CHECK: align8: ; @align8 +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: s_cbranch_scc1 .LBB{{[0-9_]+}} ; encoding: [A,A,0x85,0xbf] +# CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] +# CHECK: .p2align3 +# CHECK: s_endpgm; encoding: [0x00,0x00,0x81,0xbf] +# CHECK: ; codeLenInByte = 20 +--- +name:align8 +tracksRegLiveness: true +body: | + bb.0: +$scc = IMPLICIT_DEF +S_CBRANCH_SCC1 %bb.2, implicit $scc + + bb.1: +S_BARRIER + + bb.2 (align 8): +S_ENDPGM 0 +... + +# CHECK: align16:; @align16 +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: s_cbranch_scc1 .LBB{{[0-9_]+}} ; encoding: [A,A,0x85,0xbf] +# CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] +# CHECK: .p2align4 +# CHECK: s_endpgm; encoding: [0x00,0x00,0x81,0xbf] +# CHECK: ; codeLenInByte = 20 +--- +name:align16 +tracksRegLiveness: true +body: | + bb.0: +$scc = IMPLICIT_DEF +S_CBRANCH_SCC1 %bb.2, implicit $scc + + bb.1: +S_BARRIER + + bb.2 (align 16): +S_ENDPGM 0 +... + +# CHECK: align32:; @align32 +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: s_cbranch_scc1 .LBB{{[0-9_]+}} ; encoding: [A,A,0x85,0xbf] +# CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] +# CHECK: .p2align5 +# CHECK: s_endpgm; encoding: [0x00,0x00,0x81,0xbf] +# CHECK: ; codeLenInByte = 36 +--- +name:align32 +tracksRegLiveness: true +body: | + bb.0: +$scc = IMPLICIT_DEF +S_CBRANCH_SCC1 %bb.2, implicit $scc + + bb.1: +S_BARRIER + + bb.2 (align 32): +S_ENDPGM 0 +... ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Respect MBB alignment in the getFunctionCodeSize() (PR #127142)
rampitec wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack [on Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/127142?utm_source=stack-comment-downstack-mergeability-warning). > [Learn more](https://graphite.dev/docs/merge-pull-requests) * **#127142** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/127142?utm_source=stack-comment-icon) 👈 [(View in Graphite)](https://app.graphite.dev/github/pr/llvm/llvm-project/127142?utm_source=stack-comment-view-in-graphite) * **#127129** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/127129?utm_source=stack-comment-icon) * **#127111** [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/127111?utm_source=stack-comment-icon): 1 other dependent PR ([#126981](https://github.com/llvm/llvm-project/pull/126981) [![Graphite](https://static.graphite.dev/graphite-32x32-black.png)](https://app.graphite.dev/github/pr/llvm/llvm-project/126981?utm_source=stack-comment-icon)) * `main` This stack of pull requests is managed by [Graphite](https://graphite.dev?utm-source=stack-comment). Learn more about [stacking](https://stacking.dev/?utm_source=stack-comment). https://github.com/llvm/llvm-project/pull/127142 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Respect MBB alignment in the getFunctionCodeSize() (PR #127142)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) Changes --- Full diff: https://github.com/llvm/llvm-project/pull/127142.diff 2 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIProgramInfo.cpp (+2) - (modified) llvm/test/CodeGen/AMDGPU/code-size-estimate.mir (+89) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index b995687e71780..9d9b4c83ac388 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -212,6 +212,8 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { uint64_t CodeSize = 0; for (const MachineBasicBlock &MBB : MF) { +CodeSize = alignTo(CodeSize, MBB.getAlignment()); + for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir index 76eaf350301e4..9ae536af6f0e9 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir @@ -31,3 +31,92 @@ body: | WAVE_BARRIER ... + +# CHECK: align4: ; @align4 +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: s_cbranch_scc1 .LBB{{[0-9_]+}} ; encoding: [A,A,0x85,0xbf] +# CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] +# CHECK: .p2align2 +# CHECK: s_endpgm; encoding: [0x00,0x00,0x81,0xbf] +# CHECK: ; codeLenInByte = 16 + +--- +name:align4 +tracksRegLiveness: true +body: | + bb.0: +$scc = IMPLICIT_DEF +S_CBRANCH_SCC1 %bb.2, implicit $scc + + bb.1: +S_BARRIER + + bb.2 (align 4): +S_ENDPGM 0 +... 
+ +# CHECK: align8: ; @align8 +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: s_cbranch_scc1 .LBB{{[0-9_]+}} ; encoding: [A,A,0x85,0xbf] +# CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] +# CHECK: .p2align3 +# CHECK: s_endpgm; encoding: [0x00,0x00,0x81,0xbf] +# CHECK: ; codeLenInByte = 20 +--- +name:align8 +tracksRegLiveness: true +body: | + bb.0: +$scc = IMPLICIT_DEF +S_CBRANCH_SCC1 %bb.2, implicit $scc + + bb.1: +S_BARRIER + + bb.2 (align 8): +S_ENDPGM 0 +... + +# CHECK: align16:; @align16 +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: s_cbranch_scc1 .LBB{{[0-9_]+}} ; encoding: [A,A,0x85,0xbf] +# CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] +# CHECK: .p2align4 +# CHECK: s_endpgm; encoding: [0x00,0x00,0x81,0xbf] +# CHECK: ; codeLenInByte = 20 +--- +name:align16 +tracksRegLiveness: true +body: | + bb.0: +$scc = IMPLICIT_DEF +S_CBRANCH_SCC1 %bb.2, implicit $scc + + bb.1: +S_BARRIER + + bb.2 (align 16): +S_ENDPGM 0 +... + +# CHECK: align32:; @align32 +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: s_cbranch_scc1 .LBB{{[0-9_]+}} ; encoding: [A,A,0x85,0xbf] +# CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] +# CHECK: .p2align5 +# CHECK: s_endpgm; encoding: [0x00,0x00,0x81,0xbf] +# CHECK: ; codeLenInByte = 36 +--- +name:align32 +tracksRegLiveness: true +body: | + bb.0: +$scc = IMPLICIT_DEF +S_CBRANCH_SCC1 %bb.2, implicit $scc + + bb.1: +S_BARRIER + + bb.2 (align 32): +S_ENDPGM 0 +... `` https://github.com/llvm/llvm-project/pull/127142 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Set inst_pref_size to maximum (PR #126981)
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } + +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { + if (!CodeSizeInBytes.has_value()) { +const GCNSubtarget &STM = MF.getSubtarget(); +const SIInstrInfo *TII = STM.getInstrInfo(); + +uint64_t CodeSize = 0; + +for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { rampitec wrote: https://github.com/llvm/llvm-project/pull/127142 https://github.com/llvm/llvm-project/pull/126981 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Respect MBB alignment in the getFunctionCodeSize() (PR #127142)
https://github.com/rampitec ready_for_review https://github.com/llvm/llvm-project/pull/127142 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [Flang] Remove FLANG_INCLUDE_RUNTIME (PR #124126)
https://github.com/Meinersbur updated https://github.com/llvm/llvm-project/pull/124126 >From c515d13f0ad684763e6d76a87a610801482c15f4 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Fri, 24 Jan 2025 16:52:46 +0100 Subject: [PATCH 1/2] [Flang] Remove FLANG_INCLUDE_RUNTIME --- flang/CMakeLists.txt | 25 +- .../modules/AddFlangOffloadRuntime.cmake | 146 flang/runtime/CMakeLists.txt | 350 -- flang/runtime/CUDA/CMakeLists.txt | 41 -- flang/runtime/Float128Math/CMakeLists.txt | 133 --- flang/test/CMakeLists.txt | 10 - flang/test/lit.cfg.py | 3 - flang/test/lit.site.cfg.py.in | 1 - flang/tools/f18/CMakeLists.txt| 17 +- flang/unittests/CMakeLists.txt| 43 +-- flang/unittests/Evaluate/CMakeLists.txt | 16 - 11 files changed, 5 insertions(+), 780 deletions(-) delete mode 100644 flang/cmake/modules/AddFlangOffloadRuntime.cmake delete mode 100644 flang/runtime/CMakeLists.txt delete mode 100644 flang/runtime/CUDA/CMakeLists.txt delete mode 100644 flang/runtime/Float128Math/CMakeLists.txt diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 38004c149b783..aceb2d09c5438 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -23,7 +23,6 @@ if (LLVM_ENABLE_EH) endif() set(FLANG_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -set(FLANG_RT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../flang-rt") if (CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND NOT MSVC_IDE) message(FATAL_ERROR "In-source builds are not allowed. \ @@ -237,24 +236,8 @@ else() include_directories(SYSTEM ${MLIR_TABLEGEN_OUTPUT_DIR}) endif() -set(FLANG_INCLUDE_RUNTIME_default ON) -if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) - set(FLANG_INCLUDE_RUNTIME_default OFF) -endif () -option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" FLANG_INCLUDE_RUNTIME_default) -if (FLANG_INCLUDE_RUNTIME) - if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) -message(WARNING "Building Flang-RT using LLVM_ENABLE_RUNTIMES. 
FLANG_INCLUDE_RUNTIME=${FLANG_INCLUDE_RUNTIME} ignored.") -set(FLANG_INCLUDE_RUNTIME OFF) - else () - message(STATUS "Building flang_rt in-tree") - endif () -else () - if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) -message(STATUS "Building Flang-RT using LLVM_ENABLE_RUNTIMES.") - else () -message(STATUS "Not building Flang-RT. For a usable Fortran toolchain, compile a standalone Flang-RT") - endif () +if (NOT "flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES) + message(STATUS "Not building Flang-RT. For a usable Fortran toolchain, compile a standalone Flang-RT") endif () set(FLANG_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH @@ -484,10 +467,6 @@ if (FLANG_CUF_RUNTIME) find_package(CUDAToolkit REQUIRED) endif() -if (FLANG_INCLUDE_RUNTIME) - add_subdirectory(runtime) -endif () - if (LLVM_INCLUDE_EXAMPLES) add_subdirectory(examples) endif() diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake deleted file mode 100644 index 8e4f47d18535d..0 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ /dev/null @@ -1,146 +0,0 @@ -option(FLANG_EXPERIMENTAL_CUDA_RUNTIME - "Compile Fortran runtime as CUDA sources (experimental)" OFF - ) - -option(FLANG_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS - "Do not compile global variables' definitions when producing PTX library" OFF - ) - -set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation") - -set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING - "Compile Fortran runtime as OpenMP target offload sources (experimental). Valid options are 'off', 'host_device', 'nohost'") - -set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING - "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 
'gfx1103;sm_90')") - -macro(enable_cuda_compilation name files) - if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) -if (BUILD_SHARED_LIBS) - message(FATAL_ERROR -"BUILD_SHARED_LIBS is not supported for CUDA build of Fortran runtime" -) -endif() - -enable_language(CUDA) - -# TODO: figure out how to make target property CUDA_SEPARABLE_COMPILATION -# work, and avoid setting CMAKE_CUDA_SEPARABLE_COMPILATION. -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - -# Treat all supported sources as CUDA files. -set_source_files_properties(${files} PROPERTIES LANGUAGE CUDA) -set(CUDA_COMPILE_OPTIONS) -if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "Clang") - # Allow varargs. - set(CUDA_COMPILE_OPTIONS --Xclang -fcuda-allow-variadic-functions -) -endif() -if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA") - set(CUDA_COMPILE_OPTIONS ---expt-relaxed-constexpr -# Disable these warnings: -#
[llvm-branch-commits] [llvm] [Flang-RT] Build libflang_rt.so (PR #121782)
https://github.com/Meinersbur updated https://github.com/llvm/llvm-project/pull/121782 >From b05c9a033158aea459d51ff34b8ec47e72f85740 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Fri, 24 Jan 2025 16:51:27 +0100 Subject: [PATCH 1/2] [Flang-RT] Build libflang_rt.so --- flang-rt/CMakeLists.txt | 30 ++ flang-rt/cmake/modules/AddFlangRT.cmake | 324 -- .../cmake/modules/AddFlangRTOffload.cmake | 18 +- flang-rt/lib/flang_rt/CMakeLists.txt | 9 +- flang-rt/lib/flang_rt/CUDA/CMakeLists.txt | 26 +- flang-rt/test/CMakeLists.txt | 2 +- flang-rt/test/lit.cfg.py | 2 +- 7 files changed, 283 insertions(+), 128 deletions(-) diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt index 655d0a55b4004..0b91b6ae7eea7 100644 --- a/flang-rt/CMakeLists.txt +++ b/flang-rt/CMakeLists.txt @@ -115,6 +115,15 @@ endif () extend_path(FLANG_RT_INSTALL_RESOURCE_LIB_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH}" "${toolchain_lib_subdir}") cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_DIR) cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_PATH) +# FIXME: For the libflang_rt.so, the toolchain resource lib dir is not a good +#destination because it is not a ld.so default search path. +#The machine where the executable is eventually executed may not be the +#machine where the Flang compiler and its resource dir is installed, so +#setting RPath by the driver is not an solution. It should belong into +#/usr/lib//libflang_rt.so, like e.g. libgcc_s.so. +#But the linker as invoked by the Flang driver also requires +#libflang_rt.so to be found when linking and the resource lib dir is +#the only reliable location. cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_LIB_DIR) cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH) @@ -129,6 +138,27 @@ cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH) option(FLANG_RT_INCLUDE_TESTS "Generate build targets for the flang-rt unit and regression-tests." "${LLVM_INCLUDE_TESTS}") +option(FLANG_RT_ENABLE_STATIC "Build Flang-RT as a static library." 
ON) +if (WIN32) + # Windows DLL currently not implemented. + set(FLANG_RT_ENABLE_SHARED OFF) +else () + # TODO: Enable by default to increase test coverage, and which version of the + # library should be the user's choice anyway. + # Currently, the Flang driver adds `-L"libdir" -lflang_rt` as linker + # argument, which leaves the choice which library to use to the linker. + # Since most linkers prefer the shared library, this would constitute a + # breaking change unless the driver is changed. + option(FLANG_RT_ENABLE_SHARED "Build Flang-RT as a shared library." OFF) +endif () +if (NOT FLANG_RT_ENABLE_STATIC AND NOT FLANG_RT_ENABLE_SHARED) + message(FATAL_ERROR " + Must build at least one type of library + (FLANG_RT_ENABLE_STATIC=ON, FLANG_RT_ENABLE_SHARED=ON, or both) +") +endif () + + set(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT "" CACHE STRING "Compile Flang-RT with GPU support (CUDA or OpenMP)") set_property(CACHE FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT PROPERTY STRINGS "" diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake index aa8adedf61752..87ec58b2e854e 100644 --- a/flang-rt/cmake/modules/AddFlangRT.cmake +++ b/flang-rt/cmake/modules/AddFlangRT.cmake @@ -16,7 +16,8 @@ # STATIC # Build a static (.a/.lib) library # OBJECT -# Create only object files without static/dynamic library +# Always create an object library. +# Without SHARED/STATIC, build only the object library. # INSTALL_WITH_TOOLCHAIN # Install library into Clang's resource directory so it can be found by the # Flang driver during compilation, including tests @@ -48,17 +49,73 @@ function (add_flangrt_library name) ") endif () - # Forward libtype to add_library - set(extra_args "") - if (ARG_SHARED) -list(APPEND extra_args SHARED) + # Internal names of libraries. If called with just single type option, use + # the default name for it. Name of targets must only depend on function + # arguments to be predictable for callers. 
+ set(name_static "${name}.static") + set(name_shared "${name}.shared") + set(name_object "obj.${name}") + if (ARG_STATIC AND NOT ARG_SHARED) +set(name_static "${name}") + elseif (NOT ARG_STATIC AND ARG_SHARED) +set(name_shared "${name}") + elseif (NOT ARG_STATIC AND NOT ARG_SHARED AND ARG_OBJECT) +set(name_object "${name}") + elseif (NOT ARG_STATIC AND NOT ARG_SHARED AND NOT ARG_OBJECT) +# Only one of them will actually be built. +set(name_static "${name}") +set(name_shared "${name}") endif () - if (ARG_STATIC) -list(APPEND extra_args STATIC) + + # Determine what to build. If not explicitly specified, honor + # BUILD_SHARED_LIBS (e.g. for unittest libraries). If can build s
[llvm-branch-commits] [llvm] [AMDGPU] Respect MBB alignment in the getFunctionCodeSize() (PR #127142)
@@ -212,6 +212,8 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { uint64_t CodeSize = 0; for (const MachineBasicBlock &MBB : MF) { +CodeSize = alignTo(CodeSize, MBB.getAlignment()); efriedma-quic wrote: This is not the right way to do this kind of math; you don't know how much padding the assembler is actually going to insert. Use a conservative estimate like `CodeSize += MBB.getAlignment() - 1`. See https://reviews.llvm.org/D150009 for further discussion. https://github.com/llvm/llvm-project/pull/127142 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] release/20.x: [reland][DebugInfo] Update DIBuilder insertion to take InsertPosition (#126967) (PR #127124)
llvmbot wrote: @llvm/pr-subscribers-clang Author: None (llvmbot) Changes Backport 1083ec647f16314bcc9af8c4d6b11f50d288bca6 Requested by: @hvdijk --- Patch is 32.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127124.diff 11 Files Affected: - (modified) clang/lib/CodeGen/CGDebugInfo.cpp (+3-3) - (modified) llvm/include/llvm/IR/DIBuilder.h (+8-44) - (modified) llvm/lib/IR/DIBuilder.cpp (+31-80) - (modified) llvm/lib/IR/DebugInfo.cpp (+28-19) - (modified) llvm/lib/Transforms/Coroutines/CoroFrame.cpp (+2-2) - (modified) llvm/lib/Transforms/Scalar/SROA.cpp (+1-1) - (modified) llvm/lib/Transforms/Utils/Debugify.cpp (+8-6) - (modified) llvm/lib/Transforms/Utils/Local.cpp (+6-17) - (modified) llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp (+2-1) - (modified) llvm/unittests/IR/IRBuilderTest.cpp (+6-4) - (modified) llvm/unittests/Transforms/Utils/CloningTest.cpp (+3-2) ``diff diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index f0abfaa7324fc..d28ef60f84e92 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -5119,7 +5119,7 @@ void CGDebugInfo::EmitLabel(const LabelDecl *D, CGBuilderTy &Builder) { DBuilder.insertLabel(L, llvm::DILocation::get(CGM.getLLVMContext(), Line, Column, Scope, CurInlinedAt), - Builder.GetInsertBlock()); + Builder.GetInsertBlock()->end()); } llvm::DIType *CGDebugInfo::CreateSelfType(const QualType &QualTy, @@ -5197,7 +5197,7 @@ void CGDebugInfo::EmitDeclareOfBlockDeclRefVariable( LexicalBlockStack.back(), CurInlinedAt); auto *Expr = DBuilder.createExpression(addr); if (InsertPoint) -DBuilder.insertDeclare(Storage, D, Expr, DL, InsertPoint); +DBuilder.insertDeclare(Storage, D, Expr, DL, InsertPoint->getIterator()); else DBuilder.insertDeclare(Storage, D, Expr, DL, Builder.GetInsertBlock()); } @@ -5862,7 +5862,7 @@ void CGDebugInfo::EmitPseudoVariable(CGBuilderTy &Builder, if (auto InsertPoint = 
Value->getInsertionPointAfterDef()) { DBuilder.insertDbgValueIntrinsic(Value, D, DBuilder.createExpression(), DIL, - &**InsertPoint); + *InsertPoint); } } diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index 6c479415b9ed2..f777206d0d735 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -92,33 +92,15 @@ namespace llvm { /// Create an \a temporary node and track it in \a UnresolvedNodes. void trackIfUnresolved(MDNode *N); -/// Internal helper for insertDeclare. -DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo, - DIExpression *Expr, const DILocation *DL, - BasicBlock *InsertBB, Instruction *InsertBefore); - -/// Internal helper for insertLabel. -DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL, - BasicBlock *InsertBB, Instruction *InsertBefore); - /// Internal helper. Track metadata if untracked and insert \p DVR. -void insertDbgVariableRecord(DbgVariableRecord *DVR, BasicBlock *InsertBB, - Instruction *InsertBefore, - bool InsertAtHead = false); +void insertDbgVariableRecord(DbgVariableRecord *DVR, + InsertPosition InsertPt); /// Internal helper with common code used by insertDbg{Value,Addr}Intrinsic. Instruction *insertDbgIntrinsic(llvm::Function *Intrinsic, llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr, const DILocation *DL, -BasicBlock *InsertBB, -Instruction *InsertBefore); - -/// Internal helper for insertDbgValueIntrinsic. -DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val, - DILocalVariable *VarInfo, - DIExpression *Expr, const DILocation *DL, - BasicBlock *InsertBB, - Instruction *InsertBefore); +InsertPosition InsertPt); public: /// Construct a builder for a module. @@ -993,46 +975,28 @@ namespace llvm { /// \param VarInfo Variable's debug info descriptor. /// \param Expr A complex location expression. /// \param DL Debug info location. -/// \param InsertBefore Location for the new intrinsic. 
+/// \param InsertPt Location for the new intrinsic. DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo
[llvm-branch-commits] [llvm] [AMDGPU] Push amdgpu-preload-kern-arg-prolog after livedebugvalues (PR #126148)
slinder1 wrote: I do have an llvm-reduce'd test case for downstream, just waiting on things to flow down first. I can't actually add the test at all without a fix for the duplicated metadata, so it didn't seem worth posting the review yet. https://github.com/llvm/llvm-project/pull/126148 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Early bail in getFunctionCodeSize for meta inst. NFC. (PR #127129)
https://github.com/rampitec ready_for_review https://github.com/llvm/llvm-project/pull/127129 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libc] release/20.x: [libc] Move __LLVM_LIBC__ define to __llvm-libc-common.h (#126877) (PR #126960)
https://github.com/petrhosek updated https://github.com/llvm/llvm-project/pull/126960 >From 87587f925442d6369a7d511426a55ad50f3141ea Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Wed, 12 Feb 2025 11:37:07 -0800 Subject: [PATCH] [libc] Move __LLVM_LIBC__ define to __llvm-libc-common.h (#126877) Relying on features.h is problematic since codebases are free to have such a header on their search path, which breaks compilation. libc should instead provide a more standard way of getting __LLVM_LIBC__. Since __llvm-libc-common.h is included from all libc headers, defining __LLVM_LIBC__ there ensures that this define is available whenever any of the standard headers is included. (cherry picked from commit b0d782080529cf5d422847e1f91f29bd7c62f691) --- libc/include/__llvm-libc-common.h | 2 ++ libc/include/llvm-libc-macros/features-macros.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h index b5a23c5765f4d..9112d1ebe302c 100644 --- a/libc/include/__llvm-libc-common.h +++ b/libc/include/__llvm-libc-common.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_COMMON_H #define LLVM_LIBC_COMMON_H +#define __LLVM_LIBC__ 1 + #ifdef __cplusplus #undef __BEGIN_C_DECLS diff --git a/libc/include/llvm-libc-macros/features-macros.h b/libc/include/llvm-libc-macros/features-macros.h index 5bc87a68fc0ba..f87ae4ad12408 100644 --- a/libc/include/llvm-libc-macros/features-macros.h +++ b/libc/include/llvm-libc-macros/features-macros.h @@ -9,6 +9,4 @@ #ifndef LLVM_LIBC_MACROS_FEATURES_MACROS_H #define LLVM_LIBC_MACROS_FEATURES_MACROS_H -#define __LLVM_LIBC__ 1 - #endif // LLVM_LIBC_MACROS_FEATURES_MACROS_H ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits