[clang] 8bd7e41 - Replace separator in OpenMP variant name mangling.
Author: Lukas Sommer Date: 2020-06-03T16:36:32-04:00 New Revision: 8bd7e4188a096b063065aac70ce39129c479f124 URL: https://github.com/llvm/llvm-project/commit/8bd7e4188a096b063065aac70ce39129c479f124 DIFF: https://github.com/llvm/llvm-project/commit/8bd7e4188a096b063065aac70ce39129c479f124.diff LOG: Replace separator in OpenMP variant name mangling. Summary: Nvidia PTX does not allow `.` to appear in identifiers, so OpenMP variant mangling now uses `$` to separate segments of the mangled name for variants of functions declared via `declare variant`. Reviewers: jdoerfert, Hahnfeld Reviewed By: jdoerfert Subscribers: yaxunl, guansong, sstefan1, cfe-commits Tags: #openmp, #clang Differential Revision: https://reviews.llvm.org/D80439 Added: clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp Modified: clang/include/clang/AST/Decl.h clang/lib/AST/OpenMPClause.cpp Removed: diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 185ba2f4b4c1..6c39f6aab1b9 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -4560,7 +4560,7 @@ inline bool IsEnumDeclScoped(EnumDecl *ED) { /// The new name looks likes this: /// + OpenMPVariantManglingSeparatorStr + static constexpr StringRef getOpenMPVariantManglingSeparatorStr() { - return ".ompvariant"; + return "$ompvariant"; } } // namespace clang diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index fa1c80fc6bbf..bcbe916820dc 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -2167,22 +2167,21 @@ std::string OMPTraitInfo::getMangledName() const { std::string MangledName; llvm::raw_string_ostream OS(MangledName); for (const OMPTraitSet &Set : Sets) { -OS << '.' 
<< 'S' << unsigned(Set.Kind); +OS << '$' << 'S' << unsigned(Set.Kind); for (const OMPTraitSelector &Selector : Set.Selectors) { bool AllowsTraitScore = false; bool RequiresProperty = false; isValidTraitSelectorForTraitSet( Selector.Kind, Set.Kind, AllowsTraitScore, RequiresProperty); - OS << '.' << 's' << unsigned(Selector.Kind); + OS << '$' << 's' << unsigned(Selector.Kind); if (!RequiresProperty || Selector.Kind == TraitSelector::user_condition) continue; for (const OMPTraitProperty &Property : Selector.Properties) -OS << '.' << 'P' - << getOpenMPContextTraitPropertyName(Property.Kind); +OS << '$' << 'P' << getOpenMPContextTraitPropertyName(Property.Kind); } } return OS.str(); @@ -2191,7 +2190,7 @@ std::string OMPTraitInfo::getMangledName() const { OMPTraitInfo::OMPTraitInfo(StringRef MangledName) { unsigned long U; do { -if (!MangledName.consume_front(".S")) +if (!MangledName.consume_front("$S")) break; if (MangledName.consumeInteger(10, U)) break; @@ -2199,7 +2198,7 @@ OMPTraitInfo::OMPTraitInfo(StringRef MangledName) { OMPTraitSet &Set = Sets.back(); Set.Kind = TraitSet(U); do { - if (!MangledName.consume_front(".s")) + if (!MangledName.consume_front("$s")) break; if (MangledName.consumeInteger(10, U)) break; @@ -2207,11 +2206,11 @@ OMPTraitInfo::OMPTraitInfo(StringRef MangledName) { OMPTraitSelector &Selector = Set.Selectors.back(); Selector.Kind = TraitSelector(U); do { -if (!MangledName.consume_front(".P")) +if (!MangledName.consume_front("$P")) break; Selector.Properties.push_back(OMPTraitProperty()); OMPTraitProperty &Property = Selector.Properties.back(); -std::pair PropRestPair = MangledName.split('.'); +std::pair PropRestPair = MangledName.split('$'); Property.Kind = getOpenMPContextTraitPropertyKind(Set.Kind, PropRestPair.first); MangledName = PropRestPair.second; diff --git a/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp b/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp new file mode 100644 index ..6a9ce799d01e --- /dev/null +++ 
b/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=50 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=50 | FileCheck %s --implicit-check-not='call i32 {@_Z3bazv|@_Z3barv}' +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=50 +// RU
[clang] [Clang][OpenMP]: fixed crash due to invalid binary expression in checking atomic semantics (PR #71480)
shiltian wrote: This doesn't look like the right place to fix this issue to me. @alexey-bataev might have better suggestion. https://github.com/llvm/llvm-project/pull/71480 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] fixed crash due to invalid binary expression in checking atomic semantics (PR #71480)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/71480 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] fixed crash due to invalid binary expression in checking atomic semantics (PR #71480)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/71480 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] fixed crash due to invalid binary expression in checking atomic semantics (PR #71480)
shiltian wrote: I have merged it given our front end expert @alexey-bataev has approved it. https://github.com/llvm/llvm-project/pull/71480 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Return empty QualType when a negative array was created (PR #71552)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/71552 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [openmp] [OpenMP] Add support for Solaris/x86_64 (PR #70593)
@@ -70,6 +72,15 @@ struct kmp_sys_timer { struct timespec start; }; +#ifdef KMP_OS_SOLARIS +// Convert timeval to timespec. +#define TIMEVAL_TO_TIMESPEC(tv, ts) \ shiltian wrote: It looks like this macro is not guarded correctly somewhere, so the compiler complains on a non-Solaris system. ``` [19/128] Building CXX object runtime/src/CMakeFiles/omp.dir/z_Linux_util.cpp.o /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/runtime/src/z_Linux_util.cpp:77:9: warning: 'TIMEVAL_TO_TIMESPEC' macro redefined [-Wmacro-redefined] 77 | #define TIMEVAL_TO_TIMESPEC(tv, ts) \ | ^ /usr/include/sys/time.h:38:10: note: previous definition is here 38 | # define TIMEVAL_TO_TIMESPEC(tv, ts) { \ | ^ ``` https://github.com/llvm/llvm-project/pull/70593 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Return empty QualType when a negative array was created (PR #71552)
shiltian wrote: The changes look good to me. @alexey-bataev WDYT? https://github.com/llvm/llvm-project/pull/71552 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Return empty QualType when a negative array was created (PR #71552)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/71552 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)
shiltian wrote: Gentle ping https://github.com/llvm/llvm-project/pull/68373 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [openmp] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)
shiltian wrote: Gentle ping https://github.com/llvm/llvm-project/pull/70612 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)
@@ -14633,6 +14633,26 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, } setFunctionHasBranchProtectedScope(); + bool HasBareClause = false; + bool HasThreadLimitClause = false; + bool HasNumTeamsClause = false; + OMPClause *BareClause = nullptr; + + for (auto *C : Clauses) { +if (isa(C)) { + BareClause = C; + HasBareClause = true; shiltian wrote: It is not strictly necessary, but I find it looks nicer to check three boolean variables instead of two booleans plus a pointer. Just a code preference. Nothing technical. https://github.com/llvm/llvm-project/pull/68373 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Fix private variables registration in `simd` (PR #74105)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/74105 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/68373 >From ef3d7309feb1fcedb27f10bd11273eeb08ebf7aa Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 11 Dec 2023 19:25:01 -0500 Subject: [PATCH] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel This patch makes `num_teams` and `thread_limit` mandatory for bare kernels, similar to a reguar kernel language that when launching a kernel, the grid size has to be set explicitly. --- .../clang/Basic/DiagnosticSemaKinds.td| 2 + clang/lib/Sema/SemaOpenMP.cpp | 13 ++ clang/test/OpenMP/ompx_bare_messages.c| 7 +- clang/test/OpenMP/target_teams_codegen.cpp| 206 +- 4 files changed, 127 insertions(+), 101 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 94e97a891baed..8b2f5d2746f7f 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11368,6 +11368,8 @@ def err_openmp_vla_in_task_untied : Error< def warn_omp_unterminated_declare_target : Warning< "expected '#pragma omp end declare target' at end of file to match '#pragma omp %0'">, InGroup; +def err_ompx_bare_no_grid : Error< + "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses">; } // end of OpenMP category let CategoryName = "Related Result Type Issue" in { diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index e400f248d15aa..3826994ef2126 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -14658,6 +14658,19 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, } setFunctionHasBranchProtectedScope(); + const OMPClause *BareClause = nullptr; + bool HasThreadLimitAndNumTeamsClause = hasClauses(Clauses, OMPC_num_teams) && + hasClauses(Clauses, OMPC_thread_limit); + bool HasBareClause = llvm::any_of(Clauses, [&](const OMPClause *C) { +BareClause = C; +return 
C->getClauseKind() == OMPC_ompx_bare; + }); + + if (HasBareClause && !HasThreadLimitAndNumTeamsClause) { +Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid); +return StmtError(); + } + return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); } diff --git a/clang/test/OpenMP/ompx_bare_messages.c b/clang/test/OpenMP/ompx_bare_messages.c index a1b3c38028528..19ceee5625fee 100644 --- a/clang/test/OpenMP/ompx_bare_messages.c +++ b/clang/test/OpenMP/ompx_bare_messages.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s void foo() { } @@ -18,4 +18,7 @@ void bar() { #pragma omp target #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}} foo(); + +#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses}} + foo(); } diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp index b7c7add229c14..8790a0fc87cbb 100644 --- a/clang/test/OpenMP/target_teams_codegen.cpp +++ b/clang/test/OpenMP/target_teams_codegen.cpp @@ -121,7 +121,7 @@ int foo(int n) { aa += 1; } - #pragma omp target teams ompx_bare + #pragma omp target teams ompx_bare num_teams(1) thread_limit(1) { a += 1; aa += 1; @@ -588,12 +588,12 @@ int bar(int n){ // CHECK1-NEXT:[[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9 // CHECK1-NEXT:store i64 0, ptr 
[[TMP116]], align 8 // CHECK1-NEXT:[[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10 -// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4 +// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP117]], align 4 // CHECK1-NEXT:[[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11 -// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4 +// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP118]], align 4 // CHECK1-NEXT:[[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 12 // CHECK
[libc] [clang-tools-extra] [libcxx] [clang] [lld] [lldb] [flang] [llvm] [OpenACC] Initial commits to support OpenACC (PR #70234)
shiltian wrote: We will want to have OpenACC label(s) such that people who are interested can be notified. https://github.com/llvm/llvm-project/pull/70234 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [libcxx] [flang] [lldb] [llvm] [libc] [lld] [OpenACC] Initial commits to support OpenACC (PR #70234)
shiltian wrote: You might also want to update the team "pr-subscribers-openacc" because currently it only reflects Flang. https://github.com/llvm/llvm-project/pull/70234 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Unify the min/max thread/teams pathways (PR #70273)
https://github.com/shiltian approved this pull request. LG https://github.com/llvm/llvm-project/pull/70273 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Add support for Solaris (PR #70593)
shiltian wrote: I'm not familiar with Solaris, but does it need a dedicated implementation of the function invocation written in ASM? https://github.com/llvm/llvm-project/pull/70593 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/68373 >From d0d8bcc9fa002304ef79ca48d736853ca621c0bd Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sun, 29 Oct 2023 19:17:19 -0400 Subject: [PATCH] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel This patch makes `num_teams` and `thread_limit` mandatory for bare kernels, similar to a reguar kernel language that when launching a kernel, the grid size has to be set explicitly. --- .../clang/Basic/DiagnosticSemaKinds.td| 2 + clang/lib/Sema/SemaOpenMP.cpp | 20 ++ clang/test/OpenMP/ompx_bare_messages.c| 7 +- clang/test/OpenMP/target_teams_codegen.cpp| 230 +- 4 files changed, 146 insertions(+), 113 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 453bd8a9a340425..d1398763b49c701 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11333,6 +11333,8 @@ def err_openmp_vla_in_task_untied : Error< def warn_omp_unterminated_declare_target : Warning< "expected '#pragma omp end declare target' at end of file to match '#pragma omp %0'">, InGroup; +def err_ompx_bare_no_grid : Error< + "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses">; } // end of OpenMP category let CategoryName = "Related Result Type Issue" in { diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 75f9e152dca9297..90a0d1f70f268f1 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -14633,6 +14633,26 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, } setFunctionHasBranchProtectedScope(); + bool HasBareClause = false; + bool HasThreadLimitClause = false; + bool HasNumTeamsClause = false; + OMPClause *BareClause = nullptr; + + for (auto *C : Clauses) { +if (isa(C)) { + BareClause = C; + HasBareClause = true; +} else if (isa(C)) + 
HasNumTeamsClause = true; +else if (isa(C)) + HasThreadLimitClause = true; + } + + if (HasBareClause && !(HasNumTeamsClause && HasThreadLimitClause)) { +Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid); +return StmtError(); + } + return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); } diff --git a/clang/test/OpenMP/ompx_bare_messages.c b/clang/test/OpenMP/ompx_bare_messages.c index a1b3c380285287d..19ceee5625feecc 100644 --- a/clang/test/OpenMP/ompx_bare_messages.c +++ b/clang/test/OpenMP/ompx_bare_messages.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s void foo() { } @@ -18,4 +18,7 @@ void bar() { #pragma omp target #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}} foo(); + +#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses}} + foo(); } diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp index 9ee1f74e8fdc468..95182c668c5e822 100644 --- a/clang/test/OpenMP/target_teams_codegen.cpp +++ b/clang/test/OpenMP/target_teams_codegen.cpp @@ -121,7 +121,7 @@ int foo(int n) { aa += 1; } - #pragma omp target teams ompx_bare + #pragma omp target teams ompx_bare num_teams(1) thread_limit(1) { a += 1; aa += 1; @@ -588,12 +588,12 @@ int bar(int n){ // CHECK1-NEXT:[[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS21]], i32 0, i32 9 // CHECK1-NEXT:store i64 0, ptr [[TMP116]], align 8 // CHECK1-NEXT:[[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10 -// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4 +// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP117]], align 4 // CHECK1-NEXT:[[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11 -// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4 +// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP118]], align 4 // CHECK1-NEXT:[[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL
[clang] no default grid size (PR #70612)
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/70612 - [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel - [OpenMP] Directly use user's grid and block size in kernel language mode >From 7b0eaa1606ad2e557105fed9509c135f857db375 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sun, 29 Oct 2023 19:18:49 -0400 Subject: [PATCH 1/2] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel This patch makes `num_teams` and `thread_limit` mandatory for bare kernels, similar to a reguar kernel language that when launching a kernel, the grid size has to be set explicitly. --- .../clang/Basic/DiagnosticSemaKinds.td| 2 + clang/lib/Sema/SemaOpenMP.cpp | 20 ++ clang/test/OpenMP/ompx_bare_messages.c| 7 +- clang/test/OpenMP/target_teams_codegen.cpp| 230 +- 4 files changed, 146 insertions(+), 113 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 453bd8a9a340425..d1398763b49c701 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11333,6 +11333,8 @@ def err_openmp_vla_in_task_untied : Error< def warn_omp_unterminated_declare_target : Warning< "expected '#pragma omp end declare target' at end of file to match '#pragma omp %0'">, InGroup; +def err_ompx_bare_no_grid : Error< + "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses">; } // end of OpenMP category let CategoryName = "Related Result Type Issue" in { diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 75f9e152dca9297..90a0d1f70f268f1 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -14633,6 +14633,26 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, } setFunctionHasBranchProtectedScope(); + bool HasBareClause = false; + bool HasThreadLimitClause = false; + bool HasNumTeamsClause = 
false; + OMPClause *BareClause = nullptr; + + for (auto *C : Clauses) { +if (isa(C)) { + BareClause = C; + HasBareClause = true; +} else if (isa(C)) + HasNumTeamsClause = true; +else if (isa(C)) + HasThreadLimitClause = true; + } + + if (HasBareClause && !(HasNumTeamsClause && HasThreadLimitClause)) { +Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid); +return StmtError(); + } + return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); } diff --git a/clang/test/OpenMP/ompx_bare_messages.c b/clang/test/OpenMP/ompx_bare_messages.c index a1b3c380285287d..19ceee5625feecc 100644 --- a/clang/test/OpenMP/ompx_bare_messages.c +++ b/clang/test/OpenMP/ompx_bare_messages.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s void foo() { } @@ -18,4 +18,7 @@ void bar() { #pragma omp target #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}} foo(); + +#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses}} + foo(); } diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp index 9ee1f74e8fdc468..95182c668c5e822 100644 --- a/clang/test/OpenMP/target_teams_codegen.cpp +++ b/clang/test/OpenMP/target_teams_codegen.cpp @@ -121,7 +121,7 @@ int foo(int n) { aa += 1; } - #pragma omp target teams ompx_bare + #pragma omp target teams ompx_bare num_teams(1) thread_limit(1) { a += 1; 
aa += 1; @@ -588,12 +588,12 @@ int bar(int n){ // CHECK1-NEXT:[[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9 // CHECK1-NEXT:store i64 0, ptr [[TMP116]], align 8 // CHECK1-NEXT:[[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10 -// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4 +// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP117]], align 4 // CHECK1-NEXT:[[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11 -// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4 +// CHECK1-NEXT:store [3 x i3
[clang] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/70612 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/70612 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/70612 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/70612 >From 7b0eaa1606ad2e557105fed9509c135f857db375 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sun, 29 Oct 2023 19:18:49 -0400 Subject: [PATCH 1/2] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel This patch makes `num_teams` and `thread_limit` mandatory for bare kernels, similar to a reguar kernel language that when launching a kernel, the grid size has to be set explicitly. --- .../clang/Basic/DiagnosticSemaKinds.td| 2 + clang/lib/Sema/SemaOpenMP.cpp | 20 ++ clang/test/OpenMP/ompx_bare_messages.c| 7 +- clang/test/OpenMP/target_teams_codegen.cpp| 230 +- 4 files changed, 146 insertions(+), 113 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 453bd8a9a340425..d1398763b49c701 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11333,6 +11333,8 @@ def err_openmp_vla_in_task_untied : Error< def warn_omp_unterminated_declare_target : Warning< "expected '#pragma omp end declare target' at end of file to match '#pragma omp %0'">, InGroup; +def err_ompx_bare_no_grid : Error< + "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses">; } // end of OpenMP category let CategoryName = "Related Result Type Issue" in { diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 75f9e152dca9297..90a0d1f70f268f1 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -14633,6 +14633,26 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, } setFunctionHasBranchProtectedScope(); + bool HasBareClause = false; + bool HasThreadLimitClause = false; + bool HasNumTeamsClause = false; + OMPClause *BareClause = nullptr; + + for (auto *C : Clauses) { +if (isa(C)) { + BareClause = C; + HasBareClause = true; +} else if (isa(C)) + 
HasNumTeamsClause = true; +else if (isa(C)) + HasThreadLimitClause = true; + } + + if (HasBareClause && !(HasNumTeamsClause && HasThreadLimitClause)) { +Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid); +return StmtError(); + } + return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); } diff --git a/clang/test/OpenMP/ompx_bare_messages.c b/clang/test/OpenMP/ompx_bare_messages.c index a1b3c380285287d..19ceee5625feecc 100644 --- a/clang/test/OpenMP/ompx_bare_messages.c +++ b/clang/test/OpenMP/ompx_bare_messages.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s void foo() { } @@ -18,4 +18,7 @@ void bar() { #pragma omp target #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}} foo(); + +#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses}} + foo(); } diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp index 9ee1f74e8fdc468..95182c668c5e822 100644 --- a/clang/test/OpenMP/target_teams_codegen.cpp +++ b/clang/test/OpenMP/target_teams_codegen.cpp @@ -121,7 +121,7 @@ int foo(int n) { aa += 1; } - #pragma omp target teams ompx_bare + #pragma omp target teams ompx_bare num_teams(1) thread_limit(1) { a += 1; aa += 1; @@ -588,12 +588,12 @@ int bar(int n){ // CHECK1-NEXT:[[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS21]], i32 0, i32 9 // CHECK1-NEXT:store i64 0, ptr [[TMP116]], align 8 // CHECK1-NEXT:[[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10 -// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4 +// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP117]], align 4 // CHECK1-NEXT:[[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11 -// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4 +// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP118]], align 4 // CHECK1-NEXT:[[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KE
[clang] [openmp] Add memory diff dump for kernel record-replay (PR #70667)
@@ -274,7 +317,7 @@ struct RecordReplayTy { void saveKernelOutputInfo(const char *Name) { SmallString<128> OutputFilename = { Name, (isRecording() ? ".original.output" : ".replay.output")}; -dumpDeviceMemory(OutputFilename); +dumpDeviceMemory(OutputFilename, true); shiltian wrote: ```suggestion dumpDeviceMemory(OutputFilename, /*saveDiff*/true); ``` https://github.com/llvm/llvm-project/pull/70667 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [openmp] Add memory diff dump for kernel record-replay (PR #70667)
@@ -274,7 +317,7 @@ struct RecordReplayTy { void saveKernelOutputInfo(const char *Name) { SmallString<128> OutputFilename = { Name, (isRecording() ? ".original.output" : ".replay.output")}; -dumpDeviceMemory(OutputFilename); +dumpDeviceMemory(OutputFilename, true); shiltian wrote: ```suggestion dumpDeviceMemory(OutputFilename, /*saveDiff*/true); ``` https://github.com/llvm/llvm-project/pull/70667 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Add support for Solaris/x86_64 (PR #70593)
shiltian wrote: > > I'm not familiar with Solaris but does it need dedicated implementation of > > the function invocation written in ASM? > > Can you point out what you're referring to? Looking at the patches for adding > support for OpenBSD and other OS's I don't see any ASM additions. I was thinking about `openmp/runtime/src/z_Linux_asm.S` but it looks like we don't need extra stuff there. https://github.com/llvm/llvm-project/pull/70593 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [openmp] Add memory diff dump for kernel record-replay (PR #70667)
shiltian wrote: Could you fix the existing format issue in a separate RFC patch? https://github.com/llvm/llvm-project/pull/70667 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [openmp] Add memory diff dump for kernel record-replay (PR #70667)
shiltian wrote: Could you fix the existing format issue in a separate RFC patch? https://github.com/llvm/llvm-project/pull/70667 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [StackProtector] Do not emit the stack protector on GPU architectures (PR #70799)
https://github.com/shiltian approved this pull request. I think the changes make sense. https://github.com/llvm/llvm-project/pull/70799 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [openmp] [clang] [OpenMP] Introduce the KernelLaunchEnvironment as implicit argument (PR #70401)
shiltian wrote: Tests in `mlir` have to be updated as well. https://github.com/llvm/llvm-project/pull/70401 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [openmp] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (PR #70752)
https://github.com/shiltian approved this pull request. LG with some nits https://github.com/llvm/llvm-project/pull/70752 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [openmp] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (PR #70752)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/70752 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [openmp] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (PR #70752)
@@ -194,6 +191,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2( ThreadId = 0; } + uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt; + uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt; shiltian wrote: These two variables do not seem to be used anywhere else. https://github.com/llvm/llvm-project/pull/70752 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[openmp] [clang] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (PR #70752)
@@ -387,7 +387,7 @@ struct GenericKernelTy { KernelEnvironmentTy KernelEnvironment; /// The prototype kernel launch environment. - KernelLaunchEnvironmentTy KernelLaunchEnvironment; + KernelLaunchEnvironmentTy KernelLaunchEnvironment = {0, 0}; shiltian wrote: ```suggestion KernelLaunchEnvironmentTy KernelLaunchEnvironment = {0, 0, nullptr}; ``` https://github.com/llvm/llvm-project/pull/70752 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[openmp] [clang] [OpenMP] Add support for Solaris/x86_64 (PR #70593)
https://github.com/shiltian approved this pull request. LG https://github.com/llvm/llvm-project/pull/70593 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/68373 >From 380046a1117cef08e40f9bcdce2c51c3fe73a26f Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 12 Dec 2023 00:11:13 -0500 Subject: [PATCH] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel This patch makes `num_teams` and `thread_limit` mandatory for bare kernels, similar to a regular kernel language, where the grid size has to be set explicitly when launching a kernel. --- .../clang/Basic/DiagnosticSemaKinds.td| 2 + clang/lib/Sema/SemaOpenMP.cpp | 13 ++ .../nvptx_target_teams_ompx_bare_codegen.cpp | 2 +- clang/test/OpenMP/ompx_bare_messages.c| 7 +- clang/test/OpenMP/target_teams_ast_print.cpp | 4 +- clang/test/OpenMP/target_teams_codegen.cpp| 206 +- 6 files changed, 130 insertions(+), 104 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 94e97a891baedc..8b2f5d2746f7fc 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11368,6 +11368,8 @@ def err_openmp_vla_in_task_untied : Error< def warn_omp_unterminated_declare_target : Warning< "expected '#pragma omp end declare target' at end of file to match '#pragma omp %0'">, InGroup; +def err_ompx_bare_no_grid : Error< + "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses">; } // end of OpenMP category let CategoryName = "Related Result Type Issue" in { diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index e400f248d15aa3..3826994ef2126c 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -14658,6 +14658,19 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, } setFunctionHasBranchProtectedScope(); + const OMPClause *BareClause = nullptr; + bool HasThreadLimitAndNumTeamsClause = hasClauses(Clauses, OMPC_num_teams) && + hasClauses(Clauses, 
OMPC_thread_limit); + bool HasBareClause = llvm::any_of(Clauses, [&](const OMPClause *C) { +BareClause = C; +return C->getClauseKind() == OMPC_ompx_bare; + }); + + if (HasBareClause && !HasThreadLimitAndNumTeamsClause) { +Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid); +return StmtError(); + } + return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); } diff --git a/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp index 9f8046acb09703..2e6f0a9ce0169f 100644 --- a/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp @@ -10,7 +10,7 @@ template tx ftemplate(int n) { tx a = 0; - #pragma omp target teams ompx_bare + #pragma omp target teams ompx_bare num_teams(1) thread_limit(32) { a = 2; } diff --git a/clang/test/OpenMP/ompx_bare_messages.c b/clang/test/OpenMP/ompx_bare_messages.c index a1b3c380285287..19ceee5625feec 100644 --- a/clang/test/OpenMP/ompx_bare_messages.c +++ b/clang/test/OpenMP/ompx_bare_messages.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s - // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s +// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s void foo() { } @@ -18,4 +18,7 @@ void bar() { #pragma omp target #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}} foo(); + +#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses requires explicit grid size via 'num_teams' and 'thread_limit' clauses}} + foo(); } diff --git 
a/clang/test/OpenMP/target_teams_ast_print.cpp b/clang/test/OpenMP/target_teams_ast_print.cpp index 5f1040be01a25f..8eaf4cbf249331 100644 --- a/clang/test/OpenMP/target_teams_ast_print.cpp +++ b/clang/test/OpenMP/target_teams_ast_print.cpp @@ -111,8 +111,8 @@ int main (int argc, char **argv) { // CHECK-NEXT: #pragma omp target teams a=2; // CHECK-NEXT: a = 2; -#pragma omp target teams ompx_bare -// CHECK-NEXT: #pragma omp target teams ompx_bare +#pragma omp target teams ompx_bare num_teams(1) thread_limit(32) +// CHECK-NEXT: #pragma omp target teams ompx_bare num_teams(1) thread_limit(32) a=3; // CHECK-NEXT: a = 3; #pragma omp target teams default(none), private(argc,b) num_teams(f) firstprivate(argv) reduction(| : c, d) reduction(* : e) thre
[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)
shiltian wrote: gentle ping https://github.com/llvm/llvm-project/pull/68373 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[lld] [flang] [clang-tools-extra] [lldb] [compiler-rt] [libcxx] [clang] [libc] [llvm] [openmp] Gcc 75 libomptarget type convert (PR #75562)
shiltian wrote: FYI: #75419 https://github.com/llvm/llvm-project/pull/75562 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/68373 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [openmp] [Clang][OpenMP] Fix mapping of structs to device (PR #75642)
shiltian wrote: The newly added test `offloading/struct_mapping_with_pointers.cpp` fails on NVIDIA GPUs as well. ``` TEST 'libomptarget :: nvptx64-nvidia-cuda :: offloading/struct_mapping_with_pointers.cpp' FAILED Exit Code: 1 Command Output (stdout): -- # RUN: at line 2 /gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/bin/clang++ -fopenmp -pthread -I /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test -I /gpfs/jlse -fs0/users/ac.shilei.tian/build/openmp/release/runtime/src -L /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget -L /gpfs/jlse-fs0/users/ac.shilei.tian/build/ll vm/release/./lib -L /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src -Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget -Wl,-rpa th,/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src -Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/./lib -Wl,-rpath,/soft/compilers/cuda/cud a-11.8.0/targets/x86_64-linux/lib --libomptarget-nvptx-bc-path=/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/DeviceRTL -fopenmp-targets=nvptx64-nvidia-cuda /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp -o /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/releas e/libomptarget/test/nvptx64-nvidia-cuda/offloading/Output/struct_mapping_with_pointers.cpp.tmp /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/libomptarget.d evicertl.a && env LIBOMPTARGET_DEBUG=1 /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/test/nvptx64-nvidia-cuda/offloading/Output/struct_mapping_with_pointer s.cpp.tmp 2>&1 | /gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/bin/FileCheck /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct _mapping_with_pointers.cpp # executed command: 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/bin/clang++ -fopenmp -pthread -I /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/ test -I /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src -L /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget -L /gpfs/jlse-fs0/users/ac.sh ilei.tian/build/llvm/release/./lib -L /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src -Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libo mptarget -Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src -Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/./lib -Wl,-rpath,/soft/c ompilers/cuda/cuda-11.8.0/targets/x86_64-linux/lib --libomptarget-nvptx-bc-path=/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/DeviceRTL -fopenmp-targets=nv ptx64-nvidia-cuda /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp -o /gpfs/jlse-fs0/users/ac.shilei.tian/bu ild/openmp/release/libomptarget/test/nvptx64-nvidia-cuda/offloading/Output/struct_mapping_with_pointers.cpp.tmp /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarg et/libomptarget.devicertl.a # executed command: env LIBOMPTARGET_DEBUG=1 /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/test/nvptx64-nvidia-cuda/offloading/Output/struct_mapping_with_p ointers.cpp.tmp # executed command: /gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/bin/FileCheck /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/str uct_mapping_with_pointers.cpp # .---command stderr # | /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp:106:12: error: CHECK: expected string not found in inpu t # | // CHECK: dat.datum[dat.arr[0][0]] = 0 # |^ # | :124:24: note: scanning from here # | dat.val_more_datum = 18 # |^ # | 
:125:1: note: possible intended match here # | dat.datum[dat.arr[0][0]] = 32542 # | ^ # | # | Input file: # | Check file: /home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp # | # | -dump-input=help explains the following input dump. # | # | Input was: # | << # | . # | . # | . # |119: omptarget --> Done unregistering library! # |120: omptarget --> Deinit offload library! # |121: TARGET CUDA RTL --> Missing 2 resources to be returned # |122: dat.xi = 4 # |123: dat.val_datum = 8 # |124: dat.val_more_datum = 18 # | check:106'0X error: no match found # |125: dat.datum[dat.arr[0][0]] = 32542 # | check:106'0 ~ # | check:106'1 ?
[clang] [Clang][OpenMP] Emit unsupported directive error (PR #70233)
https://github.com/shiltian approved this pull request. LG with a nit https://github.com/llvm/llvm-project/pull/70233 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Emit unsupported directive error (PR #70233)
@@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -emit-llvm -fopenmp -disable-llvm-passes %s -verify=expected + +// expected-error@+2 {{cannot compile this OpenMP dispatch directive yet}} +void a(){ +#pragma omp dispatch +a(); +} shiltian wrote: Leave an empty line at the end of the file https://github.com/llvm/llvm-project/pull/70233 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Emit unsupported directive error (PR #70233)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/70233 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Emit unsupported directive error (PR #70233)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/70233 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 114df24 - [Clang][Doc] Update the release note for clang
Author: Shilei Tian Date: 2022-07-26T15:39:21-04:00 New Revision: 114df244ec50ce0145702974335965c3aa2c3dcc URL: https://github.com/llvm/llvm-project/commit/114df244ec50ce0145702974335965c3aa2c3dcc DIFF: https://github.com/llvm/llvm-project/commit/114df244ec50ce0145702974335965c3aa2c3dcc.diff LOG: [Clang][Doc] Update the release note for clang Add the support for `atomic compare` and `atomic compare capture` in the release note of clang. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D129211 Added: Modified: clang/docs/ReleaseNotes.rst Removed: diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 74505dd30baa..84c74335ea5d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -592,6 +592,8 @@ ABI Changes in Clang OpenMP Support in Clang --- +* Added the support for ``atomic compare`` and ``atomic compare capture`` + (``-fopenmp-version=51`` is required). ... ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] e21202d - [Clang][OpenMP] Fix the issue that `llvm.lifetime.end` is emitted too early for variables captured in linear clause
Author: Shilei Tian Date: 2022-08-06T16:50:37-04:00 New Revision: e21202dac18ed7f718d26a0e131f96b399b4891c URL: https://github.com/llvm/llvm-project/commit/e21202dac18ed7f718d26a0e131f96b399b4891c DIFF: https://github.com/llvm/llvm-project/commit/e21202dac18ed7f718d26a0e131f96b399b4891c.diff LOG: [Clang][OpenMP] Fix the issue that `llvm.lifetime.end` is emitted too early for variables captured in linear clause Currently if an OpenMP program uses `linear` clause, and is compiled with optimization, `llvm.lifetime.end` for variables listed in `linear` clause are emitted too early such that there could still be uses after that. Let's take the following code as example: ``` // loop.c int j; int *u; void loop(int n) { int i; for (i = 0; i < n; ++i) { ++j; u = &j; } } ``` We compile using the command: ``` clang -cc1 -fopenmp-simd -O3 -x c -triple x86_64-apple-darwin10 -emit-llvm loop.c -o loop.ll ``` The following IR (simplified) will be generated: ``` @j = local_unnamed_addr global i32 0, align 4 @u = local_unnamed_addr global ptr null, align 8 define void @loop(i32 noundef %n) local_unnamed_addr { entry: %j = alloca i32, align 4 %cmp = icmp sgt i32 %n, 0 br i1 %cmp, label %simd.if.then, label %simd.if.end simd.if.then: ; preds = %entry call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %j) store ptr %j, ptr @u, align 8 call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %j) %0 = load i32, ptr %j, align 4 store i32 %0, ptr @j, align 4 br label %simd.if.end simd.if.end: ; preds = %simd.if.then, %entry ret void } ``` The most important part is: ``` call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %j) %0 = load i32, ptr %j, align 4 store i32 %0, ptr @j, align 4 ``` `%j` is still loaded after `@llvm.lifetime.end.p0(i64 4, ptr nonnull %j)`. This could cause the backend incorrectly optimizes the code and further generates incorrect code. 
The root cause is, when we emit a construct that could have `linear` clause, it usually has the following pattern: ``` EmitOMPLinearClauseInit(S) { OMPPrivateScope LoopScope(*this); ... EmitOMPLinearClause(S, LoopScope); ... (void)LoopScope.Privatize(); ... } EmitOMPLinearClauseFinal(S, [](CodeGenFunction &) { return nullptr; }); ``` Variables that need to be privatized are added into `LoopScope`, which also serves as a RAII object. When `LoopScope` is destructed and if optimization is enabled, a `@llvm.lifetime.end` is also emitted for each privatized variable. However, the writing back to original variables in `linear` clause happens after the scope in `EmitOMPLinearClauseFinal`, causing the issue we see above. A quick "fix" seems to be, moving `EmitOMPLinearClauseFinal` inside the scope. However, it doesn't work. That's because the local variable map has been updated by `LoopScope` such that a variable declaration is mapped to the privatized variable, instead of the actual one. In that way, the following code will be generated: ``` %0 = load i32, ptr %j, align 4 store i32 %0, ptr %j, align 4 call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %j) ``` Well, now the lifetime is correct, but apparently the writing back is broken. In this patch, a new function `OMPPrivateScope::restoreMap` is added and called before calling `EmitOMPLinearClauseFinal`. This can make sure that `EmitOMPLinearClauseFinal` can find the original variables to write back. Fixes #56913. 
Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D131272 Added: clang/test/OpenMP/bug56913.c Modified: clang/lib/CodeGen/CGStmtOpenMP.cpp clang/lib/CodeGen/CodeGenFunction.h clang/test/OpenMP/for_linear_codegen.cpp clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp Removed: diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 962620f43a393..5219b6e39f4e2 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -2582,8 +2582,9 @@ static void emitOMPSimdRegion(CodeGenFunction &CGF, const OMPLoopDirective &S, CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_simd); emitPostUpdateForReductionClause(CGF, S, [](CodeGenFunction &) { return nullptr; }); +LoopScope.restoreMap(); +CGF.EmitOMPLinearClauseFinal(S, [](CodeGenFunction &) { return nullptr; }); } - CGF.EmitOMPLinearClauseFinal(S, [](CodeGenFunction &) { return nullptr; }); // Emit: if (PreCond) - end. if (ContBlock) { CGF.EmitBranch(ContBlock); @@ -3428,11 +3429,12 @@ bool CodeGenFunction::EmitOMPWorksharingLoop( EmitOMPLastprivateClauseFinal( S, isOpenMPSimdDirective(S.getDirectiveKind()), Builder.CreateIsNotNull(EmitLoadOfScalar(IL, S.getBeginLoc(; + LoopScope.restoreMap(); +
[clang] 3274cdc - [Clang][OpenMP] Remove the mandatory flush for capture for OpenMP 5.1
Author: Shilei Tian Date: 2021-07-26T11:00:44-04:00 New Revision: 3274cdc83ecdf2af569ad4f564d55d0e43b1072e URL: https://github.com/llvm/llvm-project/commit/3274cdc83ecdf2af569ad4f564d55d0e43b1072e DIFF: https://github.com/llvm/llvm-project/commit/3274cdc83ecdf2af569ad4f564d55d0e43b1072e.diff LOG: [Clang][OpenMP] Remove the mandatory flush for capture for OpenMP 5.1 In OpenMP 5.1: > If the `write` or `update` clause is specified, the atomic operation is not > an atomic conditional update for which the comparison fails, and the > effective memory ordering is `release`, `acq_rel`, or `seq_cst`, the strong > flush on entry to the atomic operation is also a release flush. If the `read` > or `update` clause is specified and the effective memory ordering is > `acquire`, `acq_rel`, or `seq_cst` then the strong flush on exit from the > atomic operation is also an acquire flush. In OpenMP 5.0: > If the `write`, `update`, or **`capture`** clause is specified and the > `release`, `acq_rel`, or `seq_cst` clause is specified then the strong flush > on entry to the atomic operation is also a release flush. If the `read` or > `capture` clause is specified and the `acquire`, `acq_rel`, or `seq_cst` > clause is specified then the strong flush on exit from the atomic operation > is also an acquire flush. From my understanding, in OpenMP 5.1, `capture` is removed from the requirement for flush, therefore we don't have to enforce it. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D100768 Added: Modified: clang/lib/CodeGen/CGStmtOpenMP.cpp clang/test/OpenMP/atomic_capture_codegen.cpp Removed: diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 486b48bca0a62..1f913590339f8 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -5725,32 +5725,35 @@ static void emitOMPAtomicCaptureExpr(CodeGenFunction &CGF, // Emit post-update store to 'v' of old/new 'x' value. 
CGF.emitOMPSimpleStore(VLValue, NewVVal, NewVValType, Loc); CGF.CGM.getOpenMPRuntime().checkAndEmitLastprivateConditional(CGF, V); - // OpenMP, 2.17.7, atomic Construct - // If the write, update, or capture clause is specified and the release, - // acq_rel, or seq_cst clause is specified then the strong flush on entry to - // the atomic operation is also a release flush. - // If the read or capture clause is specified and the acquire, acq_rel, or - // seq_cst clause is specified then the strong flush on exit from the atomic - // operation is also an acquire flush. - switch (AO) { - case llvm::AtomicOrdering::Release: -CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc, - llvm::AtomicOrdering::Release); -break; - case llvm::AtomicOrdering::Acquire: -CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc, - llvm::AtomicOrdering::Acquire); -break; - case llvm::AtomicOrdering::AcquireRelease: - case llvm::AtomicOrdering::SequentiallyConsistent: -CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc, - llvm::AtomicOrdering::AcquireRelease); -break; - case llvm::AtomicOrdering::Monotonic: -break; - case llvm::AtomicOrdering::NotAtomic: - case llvm::AtomicOrdering::Unordered: -llvm_unreachable("Unexpected ordering."); + // OpenMP 5.1 removes the required flush for capture clause. + if (CGF.CGM.getLangOpts().OpenMP < 51) { +// OpenMP, 2.17.7, atomic Construct +// If the write, update, or capture clause is specified and the release, +// acq_rel, or seq_cst clause is specified then the strong flush on entry to +// the atomic operation is also a release flush. +// If the read or capture clause is specified and the acquire, acq_rel, or +// seq_cst clause is specified then the strong flush on exit from the atomic +// operation is also an acquire flush. 
+switch (AO) { +case llvm::AtomicOrdering::Release: + CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc, + llvm::AtomicOrdering::Release); + break; +case llvm::AtomicOrdering::Acquire: + CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc, + llvm::AtomicOrdering::Acquire); + break; +case llvm::AtomicOrdering::AcquireRelease: +case llvm::AtomicOrdering::SequentiallyConsistent: + CGF.CGM.getOpenMPRuntime().emitFlush( + CGF, llvm::None, Loc, llvm::AtomicOrdering::AcquireRelease); + break; +case llvm::AtomicOrdering::Monotonic: + break; +case llvm::AtomicOrdering::NotAtomic: +case llvm::AtomicOrdering::Unordered: + llvm_unreachable("Unexpected ordering."); +} } } diff --git a/clang/test/OpenMP/atomic_capture
[clang] 52e6a27 - Clean up `OMPAtomicDirective::Create`
Author: Shilei Tian Date: 2022-04-15T11:41:26-04:00 New Revision: 52e6a27690ca8e5f07cc646716c3736475b7746b URL: https://github.com/llvm/llvm-project/commit/52e6a27690ca8e5f07cc646716c3736475b7746b DIFF: https://github.com/llvm/llvm-project/commit/52e6a27690ca8e5f07cc646716c3736475b7746b.diff LOG: Clean up `OMPAtomicDirective::Create` Added: Modified: clang/include/clang/AST/StmtOpenMP.h clang/lib/AST/StmtOpenMP.cpp clang/lib/Sema/SemaOpenMP.cpp Removed: diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index 28b3567b36556..0aa318d84a93f 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -2889,6 +2889,27 @@ class OMPAtomicDirective : public OMPExecutableDirective { void setCond(Expr *C) { Data->getChildren()[DataPositionTy::POS_Cond] = C; } public: + struct Expressions { +/// 'x' part of the associated expression/statement. +Expr *X = nullptr; +/// 'v' part of the associated expression/statement. +Expr *V = nullptr; +/// 'expr' part of the associated expression/statement. +Expr *E = nullptr; +/// UE Helper expression of the form: +/// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or +/// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'. +Expr *UE = nullptr; +/// 'd' part of the associated expression/statement. +Expr *D = nullptr; +/// Conditional expression in `atomic compare` construct. +Expr *Cond = nullptr; +/// True if UE has the first form and false if the second. +bool IsXLHSInRHSPart; +/// True if original value of 'x' must be stored in 'v', not an updated one. +bool IsPostfixUpdate; + }; + /// Creates directive with a list of \a Clauses and 'x', 'v' and 'expr' /// parts of the atomic construct (see Section 2.12.6, atomic Construct, for /// detailed description of 'x', 'v' and 'expr'). @@ -2898,23 +2919,12 @@ class OMPAtomicDirective : public OMPExecutableDirective { /// \param EndLoc Ending Location of the directive. /// \param Clauses List of clauses. 
/// \param AssociatedStmt Statement, associated with the directive. - /// \param X 'x' part of the associated expression/statement. - /// \param V 'v' part of the associated expression/statement. - /// \param E 'expr' part of the associated expression/statement. - /// \param UE Helper expression of the form - /// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or - /// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'. - /// \param D 'd' part of the associated expression/statement. - /// \param Cond Conditional expression in `atomic compare` construct. - /// \param IsXLHSInRHSPart true if \a UE has the first form and false if the - /// second. - /// \param IsPostfixUpdate true if original value of 'x' must be stored in - /// 'v', not an updated one. - static OMPAtomicDirective * - Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, - ArrayRef Clauses, Stmt *AssociatedStmt, Expr *X, Expr *V, - Expr *E, Expr *UE, Expr *D, Expr *Cond, bool IsXLHSInRHSPart, - bool IsPostfixUpdate); + /// \param Exprs Associated expressions or statements. + static OMPAtomicDirective *Create(const ASTContext &C, +SourceLocation StartLoc, +SourceLocation EndLoc, +ArrayRef Clauses, +Stmt *AssociatedStmt, Expressions Exprs); /// Creates an empty directive with the place for \a NumClauses /// clauses. 
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index 84a4de00328a8..15e13da27dd84 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -866,19 +866,17 @@ OMPOrderedDirective *OMPOrderedDirective::CreateEmpty(const ASTContext &C, OMPAtomicDirective * OMPAtomicDirective::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, ArrayRef Clauses, - Stmt *AssociatedStmt, Expr *X, Expr *V, Expr *E, - Expr *UE, Expr *D, Expr *Cond, bool IsXLHSInRHSPart, - bool IsPostfixUpdate) { + Stmt *AssociatedStmt, Expressions Exprs) { auto *Dir = createDirective( C, Clauses, AssociatedStmt, /*NumChildren=*/6, StartLoc, EndLoc); - Dir->setX(X); - Dir->setV(V); - Dir->setExpr(E); - Dir->setUpdateExpr(UE); - Dir->setD(D); - Dir->setCond(Cond); - Dir->IsXLHSInRHSPart = IsXLHSInRHSPart; - Dir->IsPostfixUpdate = IsPostfixUpdate; + Dir->setX(Exprs.X); + Dir->setV(Exprs.V); + Dir->setExpr(Exprs.E); + Dir->setUpdateExpr(Exprs.UE); + Dir->setD(Exprs.D); + Dir->setCond(Exprs.Cond); + Dir->IsXLHSInRHSPart = Exprs.IsXLHSInRHSPart; + Dir->IsPostfixUpdate = Exprs.IsPostf
[clang] e8760b5 - [Clang][OpenMP] Use bitfields for flags in `OMPAtomicDirective`
Author: Shilei Tian Date: 2022-04-15T21:34:28-04:00 New Revision: e8760b51ee0f972587cb0af922a3f828ab6926d6 URL: https://github.com/llvm/llvm-project/commit/e8760b51ee0f972587cb0af922a3f828ab6926d6 DIFF: https://github.com/llvm/llvm-project/commit/e8760b51ee0f972587cb0af922a3f828ab6926d6.diff LOG: [Clang][OpenMP] Use bitfields for flags in `OMPAtomicDirective` As suggested in D120290. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D123862 Added: Modified: clang/include/clang/AST/StmtOpenMP.h clang/lib/AST/StmtOpenMP.cpp clang/lib/Serialization/ASTReaderStmt.cpp Removed: diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index 0aa318d84a93f..dfaf8b5a77385 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -2827,25 +2827,28 @@ class OMPOrderedDirective : public OMPExecutableDirective { class OMPAtomicDirective : public OMPExecutableDirective { friend class ASTStmtReader; friend class OMPExecutableDirective; - /// Used for 'atomic update' or 'atomic capture' constructs. They may - /// have atomic expressions of forms - /// \code - /// x = x binop expr; - /// x = expr binop x; - /// \endcode - /// This field is true for the first form of the expression and false for the - /// second. Required for correct codegen of non-associative operations (like - /// << or >>). - bool IsXLHSInRHSPart = false; - /// Used for 'atomic update' or 'atomic capture' constructs. They may - /// have atomic expressions of forms - /// \code - /// v = x; ; - /// ; v = x; - /// \endcode - /// This field is true for the first(postfix) form of the expression and false - /// otherwise. - bool IsPostfixUpdate = false; + + struct FlagTy { +/// Used for 'atomic update' or 'atomic capture' constructs. 
They may +/// have atomic expressions of forms: +/// \code +/// x = x binop expr; +/// x = expr binop x; +/// \endcode +/// This field is 1 for the first form of the expression and 0 for the +/// second. Required for correct codegen of non-associative operations (like +/// << or >>). +uint8_t IsXLHSInRHSPart : 1; +/// Used for 'atomic update' or 'atomic capture' constructs. They may +/// have atomic expressions of forms: +/// \code +/// v = x; ; +/// ; v = x; +/// \endcode +/// This field is 1 for the first(postfix) form of the expression and 0 +/// otherwise. +uint8_t IsPostfixUpdate : 1; + } Flags; /// Build directive with the given start and end location. /// @@ -2956,10 +2959,10 @@ class OMPAtomicDirective : public OMPExecutableDirective { /// Return true if helper update expression has form /// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' and false if it has form /// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'. - bool isXLHSInRHSPart() const { return IsXLHSInRHSPart; } + bool isXLHSInRHSPart() const { return Flags.IsXLHSInRHSPart; } /// Return true if 'v' expression must be updated to original value of /// 'x', false if 'v' must be updated to the new value of 'x'. - bool isPostfixUpdate() const { return IsPostfixUpdate; } + bool isPostfixUpdate() const { return Flags.IsPostfixUpdate; } /// Get 'v' part of the associated expression/statement. Expr *getV() { return cast_or_null(Data->getChildren()[DataPositionTy::POS_V]); diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index 15e13da27dd84..3535b0620ee50 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -875,8 +875,8 @@ OMPAtomicDirective::Create(const ASTContext &C, SourceLocation StartLoc, Dir->setUpdateExpr(Exprs.UE); Dir->setD(Exprs.D); Dir->setCond(Exprs.Cond); - Dir->IsXLHSInRHSPart = Exprs.IsXLHSInRHSPart; - Dir->IsPostfixUpdate = Exprs.IsPostfixUpdate; + Dir->Flags.IsXLHSInRHSPart = Exprs.IsXLHSInRHSPart ? 
1 : 0; + Dir->Flags.IsPostfixUpdate = Exprs.IsPostfixUpdate ? 1 : 0; return Dir; } diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index ed9f1d2b34289..281385ad9e7d9 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2449,8 +2449,8 @@ void ASTStmtReader::VisitOMPOrderedDirective(OMPOrderedDirective *D) { void ASTStmtReader::VisitOMPAtomicDirective(OMPAtomicDirective *D) { VisitStmt(D); VisitOMPExecutableDirective(D); - D->IsXLHSInRHSPart = Record.readBool(); - D->IsPostfixUpdate = Record.readBool(); + D->Flags.IsXLHSInRHSPart = Record.readBool() ? 1 : 0; + D->Flags.IsPostfixUpdate = Record.readBool() ? 1 : 0; } void ASTStmtReader::VisitOMPTargetDirective(OMPTargetDirective *D) { ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cg
[clang] b35be6f - [Clang][Sema][OpenMP] Sema support for `atomic compare`
Author: Shilei Tian Date: 2022-02-04T12:30:56-05:00 New Revision: b35be6fe98e30b2373e8fdf024ef8c13a32121d7 URL: https://github.com/llvm/llvm-project/commit/b35be6fe98e30b2373e8fdf024ef8c13a32121d7 DIFF: https://github.com/llvm/llvm-project/commit/b35be6fe98e30b2373e8fdf024ef8c13a32121d7.diff LOG: [Clang][Sema][OpenMP] Sema support for `atomic compare` This patch adds the Sema support for `atomic compare`. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D116637 Added: Modified: clang/include/clang/Basic/DiagnosticSemaKinds.td clang/lib/CodeGen/CGStmtOpenMP.cpp clang/lib/Sema/SemaOpenMP.cpp clang/test/OpenMP/atomic_ast_print.cpp clang/test/OpenMP/atomic_messages.c clang/test/OpenMP/atomic_messages.cpp Removed: diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index d5e653a7fa192..7f73b9b285e37 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10516,6 +10516,15 @@ def err_omp_atomic_capture_not_compound_statement : Error< " where x is an lvalue expression with scalar type">; def note_omp_atomic_capture: Note< "%select{expected assignment expression|expected compound statement|expected exactly two expression statements|expected in right hand side of the first expression}0">; +def err_omp_atomic_compare : Error< + "the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}'," + " '{x = x == e ? d : x;}', '{x = e == x ? 
d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}'," + " 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type," + " and 'ordop' is one of '<' or '>'.">; +def note_omp_atomic_compare: Note< + "%select{expected compound statement|expected exactly one expression statement|expected assignment statement|expected conditional operator|expect result value to be at false expression|" + "expect binary operator in conditional expression|expect '<', '>' or '==' as order operator|expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'|" + "expect lvalue for result value|expect scalar value|expect integer value}0">; def err_omp_atomic_several_clauses : Error< "directive '#pragma omp atomic' cannot contain more than one 'read', 'write', 'update', 'capture', or 'compare' clause">; def err_omp_several_mem_order_clauses : Error< diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 39dd4c00765d2..596cef8ce60c1 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -6031,9 +6031,13 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, emitOMPAtomicCaptureExpr(CGF, AO, IsPostfixUpdate, V, X, E, UE, IsXLHSInRHSPart, Loc); break; - case OMPC_compare: -// Do nothing here as we already emit an error. + case OMPC_compare: { +// Emit an error here. 
+unsigned DiagID = CGF.CGM.getDiags().getCustomDiagID( +DiagnosticsEngine::Error, "'atomic compare' is not supported for now"); +CGF.CGM.getDiags().Report(DiagID); break; + } case OMPC_if: case OMPC_final: case OMPC_num_threads: diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index a4092f0d2b543..1fafd58ba3edd 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -10925,6 +10925,357 @@ bool OpenMPAtomicUpdateChecker::checkStatement(Stmt *S, unsigned DiagId, } return ErrorFound != NoError; } + +/// Get the node id of the fixed point of an expression \a S. +llvm::FoldingSetNodeID getNodeId(ASTContext &Context, const Expr *S) { + llvm::FoldingSetNodeID Id; + S->IgnoreParenImpCasts()->Profile(Id, Context, true); + return Id; +} + +/// Check if two expressions are same. +bool checkIfTwoExprsAreSame(ASTContext &Context, const Expr *LHS, +const Expr *RHS) { + return getNodeId(Context, LHS) == getNodeId(Context, RHS); +} + +class OpenMPAtomicCompareChecker { +public: + /// All kinds of errors that can occur in `atomic compare` + enum ErrorTy { +/// Empty compound statement. +NoStmt = 0, +/// More than one statement in a compound statement. +MoreThanOneStmt, +/// Not an assignment binary operator. +NotAnAssignment, +/// Not a conditional operator. +NotCondOp, +/// Wrong false expr. According to the spec, 'x' should be at the false +/// expression of a conditional expression. +WrongFalseExpr, +/// The condition of a conditional expression is not a binary operator. +NotABinaryOp, +/// Invalid binary operator (not <, >,
[clang] b8ec430 - [Clang][Sema][OpenMP] Fix uninitialized variable Op
Author: Shilei Tian Date: 2022-02-04T15:00:43-05:00 New Revision: b8ec430de71766d9a35a6b737c8a789c0c7cf812 URL: https://github.com/llvm/llvm-project/commit/b8ec430de71766d9a35a6b737c8a789c0c7cf812 DIFF: https://github.com/llvm/llvm-project/commit/b8ec430de71766d9a35a6b737c8a789c0c7cf812.diff LOG: [Clang][Sema][OpenMP] Fix uninitialized variable Op This can fix the case atomic_messages Added: Modified: clang/lib/Sema/SemaOpenMP.cpp Removed: diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 4143e070a8738..7486e1389172c 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -11063,8 +11063,18 @@ bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getCond()->getSourceRange(); return false; } - if (Cond->getOpcode() != BO_EQ && Cond->getOpcode() != BO_LT && - Cond->getOpcode() != BO_GT) { + + switch (Cond->getOpcode()) { + case BO_EQ: +Op = OMPAtomicCompareOp::EQ; +break; + case BO_LT: +Op = OMPAtomicCompareOp::MIN; +break; + case BO_GT: +Op = OMPAtomicCompareOp::MAX; +break; + default: ErrorInfo.Error = ErrorTy::InvalidBinaryOp; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); @@ -11148,8 +11158,17 @@ bool OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S, return false; } - if (Cond->getOpcode() != BO_EQ && Cond->getOpcode() != BO_LT && - Cond->getOpcode() != BO_GT) { + switch (Cond->getOpcode()) { + case BO_EQ: +Op = OMPAtomicCompareOp::EQ; +break; + case BO_LT: +Op = OMPAtomicCompareOp::MIN; +break; + case BO_GT: +Op = OMPAtomicCompareOp::MAX; +break; + default: ErrorInfo.Error = ErrorTy::InvalidBinaryOp; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 20a9fb9 - [Clang][OpenMP] Fix the issue that temp cubin files are not removed after compilation when using new OpenMP driver
Author: Shilei Tian Date: 2022-04-22T18:07:28-04:00 New Revision: 20a9fb953e46b1d97aaee7b182b0f3d48f340bd1 URL: https://github.com/llvm/llvm-project/commit/20a9fb953e46b1d97aaee7b182b0f3d48f340bd1 DIFF: https://github.com/llvm/llvm-project/commit/20a9fb953e46b1d97aaee7b182b0f3d48f340bd1.diff LOG: [Clang][OpenMP] Fix the issue that temp cubin files are not removed after compilation when using new OpenMP driver The root cause of this is that, in `NVPTX::Assembler::ConstructJob`, the output file name might not match the `Output`'s file name passed into the function because `CudaToolChain::getInputFilename` is a specialized version. That means the real output file is not added to the temp files list, all of which are removed in the destructor of `Compilation`. In order to "fix" it, in the function `NVPTX::OpenMPLinker::ConstructJob`, before calling `clang-nvlink-wrapper`, the function calls `getToolChain().getInputFilename(II)` to get the right output file name for each input and adds it to the temp files, so that they can be removed without any issue. However, this whole logic doesn't work when using the new OpenMP driver because `NVPTX::OpenMPLinker::ConstructJob` is not called at all, which causes the cubin file generated in each single-unit compilation to go untracked. In this patch, we add the real output file into temp files if its name doesn't match `Output`. We add it when the file is an output instead of doing it when it is an input, like what we did in `NVPTX::OpenMPLinker::ConstructJob`, which makes more sense. 
Reviewed By: jhuber6 Differential Revision: https://reviews.llvm.org/D124253 Added: Modified: clang/lib/Driver/ToolChains/Cuda.cpp Removed: diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index f8a06a2e09ab7..6103c42bf7547 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -447,7 +447,10 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--gpu-name"); CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch))); CmdArgs.push_back("--output-file"); - CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output))); + const char *OutputFileName = Args.MakeArgString(TC.getInputFilename(Output)); + if (std::string(OutputFileName) != std::string(Output.getFilename())) +C.addTempFile(OutputFileName); + CmdArgs.push_back(OutputFileName); for (const auto& II : Inputs) CmdArgs.push_back(Args.MakeArgString(II.getFilename())); @@ -606,8 +609,8 @@ void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA, if (!II.isFilename()) continue; -const char *CubinF = C.addTempFile( -C.getArgs().MakeArgString(getToolChain().getInputFilename(II))); +const char *CubinF = +C.getArgs().MakeArgString(getToolChain().getInputFilename(II)); CmdArgs.push_back(CubinF); } ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 9c1085c - [Clang][OpenMP] Add the support for floating-point variables for specific atomic clauses
Author: Shilei Tian Date: 2022-05-03T11:30:54-04:00 New Revision: 9c1085c7e20bdd7c4a487f50313ebeeb2b6683b8 URL: https://github.com/llvm/llvm-project/commit/9c1085c7e20bdd7c4a487f50313ebeeb2b6683b8 DIFF: https://github.com/llvm/llvm-project/commit/9c1085c7e20bdd7c4a487f50313ebeeb2b6683b8.diff LOG: [Clang][OpenMP] Add the support for floating-point variables for specific atomic clauses Currently when using `atomic update` with floating-point variables, if the operation is add or sub, `cmpxchg`, instead of `atomicrmw` is emitted, as shown in [1]. In fact, about three years ago, llvm-svn: 351850 added the support for FP operations. This patch adds the support in OpenMP as well. [1] https://godbolt.org/z/M7b4ba9na Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D124724 Added: Modified: clang/lib/CodeGen/CGStmtOpenMP.cpp clang/test/OpenMP/atomic_capture_codegen.cpp clang/test/OpenMP/atomic_update_codegen.cpp clang/test/OpenMP/for_reduction_codegen.cpp clang/test/OpenMP/parallel_reduction_codegen.cpp clang/test/OpenMP/reduction_implicit_map.cpp clang/test/OpenMP/sections_reduction_codegen.cpp Removed: diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index fc8156a78d6a7..5e75e8884bb49 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -5838,25 +5838,38 @@ static std::pair emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X, // Allow atomicrmw only if 'x' and 'update' are integer values, lvalue for 'x' // expression is simple and atomic is allowed for the given type for the // target platform. 
- if (BO == BO_Comma || !Update.isScalar() || - !Update.getScalarVal()->getType()->isIntegerTy() || !X.isSimple() || + if (BO == BO_Comma || !Update.isScalar() || !X.isSimple() || (!isa(Update.getScalarVal()) && (Update.getScalarVal()->getType() != X.getAddress(CGF).getElementType())) || - !X.getAddress(CGF).getElementType()->isIntegerTy() || !Context.getTargetInfo().hasBuiltinAtomic( Context.getTypeSize(X.getType()), Context.toBits(X.getAlignment( return std::make_pair(false, RValue::get(nullptr)); + auto &&CheckAtomicSupport = [&CGF](llvm::Type *T, BinaryOperatorKind BO) { +if (T->isIntegerTy()) + return true; + +if (T->isFloatingPointTy() && (BO == BO_Add || BO == BO_Sub)) + return llvm::isPowerOf2_64(CGF.CGM.getDataLayout().getTypeStoreSize(T)); + +return false; + }; + + if (!CheckAtomicSupport(Update.getScalarVal()->getType(), BO) || + !CheckAtomicSupport(X.getAddress(CGF).getElementType(), BO)) +return std::make_pair(false, RValue::get(nullptr)); + + bool IsInteger = X.getAddress(CGF).getElementType()->isIntegerTy(); llvm::AtomicRMWInst::BinOp RMWOp; switch (BO) { case BO_Add: -RMWOp = llvm::AtomicRMWInst::Add; +RMWOp = IsInteger ? llvm::AtomicRMWInst::Add : llvm::AtomicRMWInst::FAdd; break; case BO_Sub: if (!IsXLHSInRHSPart) return std::make_pair(false, RValue::get(nullptr)); -RMWOp = llvm::AtomicRMWInst::Sub; +RMWOp = IsInteger ? 
llvm::AtomicRMWInst::Sub : llvm::AtomicRMWInst::FSub; break; case BO_And: RMWOp = llvm::AtomicRMWInst::And; @@ -5914,9 +5927,13 @@ static std::pair emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X, } llvm::Value *UpdateVal = Update.getScalarVal(); if (auto *IC = dyn_cast(UpdateVal)) { -UpdateVal = CGF.Builder.CreateIntCast( -IC, X.getAddress(CGF).getElementType(), -X.getType()->hasSignedIntegerRepresentation()); +if (IsInteger) + UpdateVal = CGF.Builder.CreateIntCast( + IC, X.getAddress(CGF).getElementType(), + X.getType()->hasSignedIntegerRepresentation()); +else + UpdateVal = CGF.Builder.CreateCast(llvm::Instruction::CastOps::UIToFP, IC, + X.getAddress(CGF).getElementType()); } llvm::Value *Res = CGF.Builder.CreateAtomicRMW(RMWOp, X.getPointer(CGF), UpdateVal, AO); diff --git a/clang/test/OpenMP/atomic_capture_codegen.cpp b/clang/test/OpenMP/atomic_capture_codegen.cpp index c5f45a39232c0..95509df9ba935 100644 --- a/clang/test/OpenMP/atomic_capture_codegen.cpp +++ b/clang/test/OpenMP/atomic_capture_codegen.cpp @@ -216,20 +216,8 @@ int main(void) { #pragma omp atomic capture llv = ullx |= ullv; // CHECK: [[EXPR:%.+]] = load float, float* @{{.+}}, -// CHECK: [[X:%.+]] = load atomic i32, i32* bitcast (float* [[X_ADDR:@.+]] to i32*) monotonic, align 4 -// CHECK: br label %[[CONT:.+]] -// CHECK: [[CONT]] -// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], %[[CONT]] ] -// CHECK: [[TEMP_I:%.+]] = bitcast float* [[TEMP:%.+]] to i32* -// CHECK: [[OLD:%.+]] = bitcast i32 [[EXPECTED]] to float +// CHECK: [[OLD:%.+]] = atomicrmw fadd float* @{{.+}}, float
[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)
@@ -432,13 +432,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") - -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64") shiltian wrote: Do we still want to keep the old builtins to maintain compatibility, though I doubt there is any legacy code using them? 
https://github.com/llvm/llvm-project/pull/86202 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)
shiltian wrote: > > > > Do you want to rename intrinsics as well? Because now intrinsic names > > > > do not match builtin names. > > > > > > > > > Do we have to match builtins with intrinsics? Renaming intrinsics here > > > means we will have to duplicate the intrinsics. > > > > > > Is that because of the mangling? > > Right. It was originally suggested to use a single intrinsic "load_lr". > > But eventually we use global_load_tr to indicate this is in global address > > space. If we want to rename intrinsics here, it should be > > global_load_tr_b64 and global_load_tr_b128. > > We should rename intrinsic if users can use intrinsics directly. I think > user-friendly is more important. I don't think intrinsics are meant for users. Builtins are the user-facing front. :-) https://github.com/llvm/llvm-project/pull/86202 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Allow dynamic `condition` selector in Metadirective (PR #86457)
shiltian wrote: I'm not familiar with that section of code. Maybe @jdoerfert could give you more insights. https://github.com/llvm/llvm-project/pull/86457 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Fix `!isNull() && "Cannot retrieve a NULL type pointer"' fail. (PR #81015)
@@ -21124,6 +21124,8 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, ExprTy = ATy->getElementType(); else ExprTy = BaseType->getPointeeType(); +if (ExprTy.isNull()) + continue; shiltian wrote: I'm not sure that using `continue` here is a good idea. You probably want `return nullptr;`. https://github.com/llvm/llvm-project/pull/81015 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Fix `!isNull() && "Cannot retrieve a NULL type pointer"' fail. (PR #81015)
https://github.com/shiltian approved this pull request. https://github.com/llvm/llvm-project/pull/81015 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From a535bf3e8cd9b10d87281f94fed68fc300f3e24c Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 7 Feb 2024 14:43:24 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 ++-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 66 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +++ .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 60 + llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 ++- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 7 ++ llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 14 ++-- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 204 insertions(+), 65 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 213311b96df74f..4fe236e8aca12d 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -246,8 +246,8 
@@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomi TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot10-insts") TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot9-insts") -TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot9-insts") -TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot9-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "yV2yV2yy", "nc", "dot9-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2yV2yfIb", "nc", "dot9-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts") diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..0f29653f1f5bec 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; @@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 : DefaultAttrsIntrinsic< [llvm_float_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b llvm_float_ty, // %c llvm_i1_ty // %clamp ], diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index dd38317c26bff6..a1c638d931b7f8 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool 
IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 225e781588668f..787217171721d8 100644 --- a/llvm/lib/Target/A
[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && shiltian wrote: I think bitcast can still be supported even if we don't know how to convert from bfloat to other types mentioned in #71470. If so, we might want a separate patch for this. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Add a NULL check (PR #77131)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/77131 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang][OpenMP] Add a NULL check (PR #77131)
shiltian wrote: This has been fixed by #81015. https://github.com/llvm/llvm-project/pull/77131 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)
@@ -0,0 +1,698 @@ +//===-- ExpandVariadicsPass.cpp *- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This is an optimisation pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The target-dependent parts are in namespace VariadicABIInfo. Enabling a new +// target means adding a case to VariadicABIInfo::create() along with tests. +// +// The module pass using that information is class ExpandVariadics. +// +// The strategy is: +// 1. Test whether a variadic function is sufficiently simple +// 2. If it was, calls to it can be replaced with calls to a different function +// 3. If it wasn't, try to split it into a simple function and a remainder +// 4. Optionally rewrite the varadic function calling convention as well +// +// This pass considers "sufficiently simple" to mean a variadic function that +// calls into a different function taking a va_list to do the real work. For +// example, libc might implement fprintf as a single basic block calling into +// vfprintf. This pass can then rewrite call to the variadic into some code +// to construct a target-specific value to use for the va_list and a call +// into the non-variadic implementation function. There's a test for that. +// +// Most other variadic functions whose definition is known can be converted into +// that form. Create a new internal function taking a va_list where the original +// took a ... parameter. Move the blocks across. Create a new block containing a +// va_start that calls into the new function. This is nearly target independent. +// +// Where this transform is consistent with the ABI, e.g. 
AMDGPU or NVPTX, or +// where the ABI can be chosen to align with this transform, the function +// interface can be rewritten along with calls to unknown variadic functions. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. Known calls to variadic functions become zero cost. +// +// This pass does define some target specific information which is partially +// redundant with other parts of the compiler. In particular, the call frame +// it builds must be the exact complement of the va_arg lowering performed +// by clang. The va_list construction is similar to work done by the backend +// for targets that lower variadics there, though distinct in that this pass +// constructs the pieces using alloca instead of relative to stack pointers. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. 
+// +//===--===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/TargetParser/Triple.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { +namespace VariadicABIInfo { + +// calling convention for passing as valist object, same as it would be in C +// aarch64 uses byval +enum class valistCC { value, pointer, /*byval*/ }; + +struct Interface { +protected: + Interface(uint32_t MinAlign, uint32_t MaxAlign) + : MinAlign(MinAlign), MaxAlign(MaxAlign) {} + +public: + virtual ~Interface() {} + const uint32_t MinAlign; + const uint32_t MaxAlign; + + // Most ABIs use a void* or char* for va_list, others can specialise + virtual Type *vaListType(LLVMContext &Ctx) { +return PointerType::getUnqual(Ctx); + } + + // How the vaListType is passed + virtual valistCC vaListCC() { return valistCC::value; } + + // The valist might need to be stack allocated. + virtual bool valistOnStack() { return false; } + + virtual void initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder, +AllocaInst * /*va_list*/, Value * /*buffer*/) { +// Function needs to be implemented if valist is on the stack +assert(!valistOnStack()); +__builtin_unreachable(); + } + + // All targets currently implemented use a ptr for the valist parameter + Type *vaListParameterType(LLVMContext &Ctx) { +return PointerType::getUnqual(Ctx); + } + + bool VAEndIsNop() { return
[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)
@@ -0,0 +1,698 @@ +//===-- ExpandVariadicsPass.cpp *- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This is an optimisation pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The target-dependent parts are in namespace VariadicABIInfo. Enabling a new +// target means adding a case to VariadicABIInfo::create() along with tests. +// +// The module pass using that information is class ExpandVariadics. +// +// The strategy is: +// 1. Test whether a variadic function is sufficiently simple +// 2. If it was, calls to it can be replaced with calls to a different function +// 3. If it wasn't, try to split it into a simple function and a remainder +// 4. Optionally rewrite the varadic function calling convention as well +// +// This pass considers "sufficiently simple" to mean a variadic function that +// calls into a different function taking a va_list to do the real work. For +// example, libc might implement fprintf as a single basic block calling into +// vfprintf. This pass can then rewrite call to the variadic into some code +// to construct a target-specific value to use for the va_list and a call +// into the non-variadic implementation function. There's a test for that. +// +// Most other variadic functions whose definition is known can be converted into +// that form. Create a new internal function taking a va_list where the original +// took a ... parameter. Move the blocks across. Create a new block containing a +// va_start that calls into the new function. This is nearly target independent. +// +// Where this transform is consistent with the ABI, e.g. 
AMDGPU or NVPTX, or +// where the ABI can be chosen to align with this transform, the function +// interface can be rewritten along with calls to unknown variadic functions. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. Known calls to variadic functions become zero cost. +// +// This pass does define some target specific information which is partially +// redundant with other parts of the compiler. In particular, the call frame +// it builds must be the exact complement of the va_arg lowering performed +// by clang. The va_list construction is similar to work done by the backend +// for targets that lower variadics there, though distinct in that this pass +// constructs the pieces using alloca instead of relative to stack pointers. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. 
+// +//===--===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/TargetParser/Triple.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { +namespace VariadicABIInfo { + +// calling convention for passing as valist object, same as it would be in C +// aarch64 uses byval +enum class valistCC { value, pointer, /*byval*/ }; + +struct Interface { +protected: + Interface(uint32_t MinAlign, uint32_t MaxAlign) + : MinAlign(MinAlign), MaxAlign(MaxAlign) {} + +public: + virtual ~Interface() {} + const uint32_t MinAlign; + const uint32_t MaxAlign; + + // Most ABIs use a void* or char* for va_list, others can specialise + virtual Type *vaListType(LLVMContext &Ctx) { +return PointerType::getUnqual(Ctx); + } + + // How the vaListType is passed + virtual valistCC vaListCC() { return valistCC::value; } + + // The valist might need to be stack allocated. + virtual bool valistOnStack() { return false; } + + virtual void initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder, +AllocaInst * /*va_list*/, Value * /*buffer*/) { +// Function needs to be implemented if valist is on the stack +assert(!valistOnStack()); +__builtin_unreachable(); shiltian wrote: Yeah, `llvm_unreachable` is usually used inside LLVM, so definitely better to use `llvm_unreachable` here. https://github.com/llvm/llvm-project/pull/81058 __
[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)
@@ -0,0 +1,17 @@ +#ifndef LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H shiltian wrote: LLVM copyright header as well as (brief) documentation of the pass https://github.com/llvm/llvm-project/pull/81058 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)
@@ -0,0 +1,698 @@ +//===-- ExpandVariadicsPass.cpp *- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This is an optimisation pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The target-dependent parts are in namespace VariadicABIInfo. Enabling a new +// target means adding a case to VariadicABIInfo::create() along with tests. +// +// The module pass using that information is class ExpandVariadics. +// +// The strategy is: +// 1. Test whether a variadic function is sufficiently simple +// 2. If it was, calls to it can be replaced with calls to a different function +// 3. If it wasn't, try to split it into a simple function and a remainder +// 4. Optionally rewrite the varadic function calling convention as well +// +// This pass considers "sufficiently simple" to mean a variadic function that +// calls into a different function taking a va_list to do the real work. For +// example, libc might implement fprintf as a single basic block calling into +// vfprintf. This pass can then rewrite call to the variadic into some code +// to construct a target-specific value to use for the va_list and a call +// into the non-variadic implementation function. There's a test for that. +// +// Most other variadic functions whose definition is known can be converted into +// that form. Create a new internal function taking a va_list where the original +// took a ... parameter. Move the blocks across. Create a new block containing a +// va_start that calls into the new function. This is nearly target independent. +// +// Where this transform is consistent with the ABI, e.g. 
AMDGPU or NVPTX, or +// where the ABI can be chosen to align with this transform, the function +// interface can be rewritten along with calls to unknown variadic functions. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. Known calls to variadic functions become zero cost. +// +// This pass does define some target specific information which is partially +// redundant with other parts of the compiler. In particular, the call frame +// it builds must be the exact complement of the va_arg lowering performed +// by clang. The va_list construction is similar to work done by the backend +// for targets that lower variadics there, though distinct in that this pass +// constructs the pieces using alloca instead of relative to stack pointers. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. 
+// +//===--===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/TargetParser/Triple.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { +namespace VariadicABIInfo { + +// calling convention for passing as valist object, same as it would be in C +// aarch64 uses byval +enum class valistCC { value, pointer, /*byval*/ }; + +struct Interface { +protected: + Interface(uint32_t MinAlign, uint32_t MaxAlign) + : MinAlign(MinAlign), MaxAlign(MaxAlign) {} + +public: + virtual ~Interface() {} + const uint32_t MinAlign; + const uint32_t MaxAlign; + + // Most ABIs use a void* or char* for va_list, others can specialise + virtual Type *vaListType(LLVMContext &Ctx) { +return PointerType::getUnqual(Ctx); + } + + // How the vaListType is passed + virtual valistCC vaListCC() { return valistCC::value; } + + // The valist might need to be stack allocated. + virtual bool valistOnStack() { return false; } + + virtual void initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder, +AllocaInst * /*va_list*/, Value * /*buffer*/) { +// Function needs to be implemented if valist is on the stack +assert(!valistOnStack()); +__builtin_unreachable(); + } + + // All targets currently implemented use a ptr for the valist parameter + Type *vaListParameterType(LLVMContext &Ctx) { +return PointerType::getUnqual(Ctx); + } + + bool VAEndIsNop() { return
[clang] [Clang] Fix a non-effective assertion (PR #81083)
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/81083 `PTy` here is literally `FTy->getParamType(i)`, which makes this assertion not work as expected. >From 076e6d3e1f5a88c4c54b0d2bf6932c9d9ae33678 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 7 Feb 2024 22:35:28 -0500 Subject: [PATCH] [Clang] Fix a non-effective assertion `PTy` here is literally `FTy->getParamType(i)`, which makes this assertion not work as expected. --- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e051cbc6486353..a7a410dab1a018 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,7 +5908,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) && +assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r shiltian wrote: Does it make sense if we also update both the Clang builtins and LLVM builtins here, and use IR auto upgrade to rewrite existing uses? https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 6a2bacee940d95abc53bcff2332b0d9aa0f1073f Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 7 Feb 2024 23:09:33 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +- .../builtins-amdgcn-dl-insts-err.cl | 9 ++- .../builtins-amdgcn-dl-insts-gfx11.cl | 13 ++-- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 ++-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 66 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +++ .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 60 + llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 ++- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 7 ++ llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 14 ++-- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 18 files changed, 217 insertions(+), 74 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 213311b96df74f..4fe236e8aca12d 100644 --- 
a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -246,8 +246,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomi TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot10-insts") TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot9-insts") -TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot9-insts") -TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot9-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "yV2yV2yy", "nc", "dot9-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2yV2yfIb", "nc", "dot9-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl index f5317683d0ff97..fa225c4962c90b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl @@ -5,6 +5,8 @@ typedef unsigned int uint; typedef half __attribute__((ext_vector_type(2))) half2; +typedef __bf16 bfloat; +typedef bfloat __attribute__((ext_vector_type(2))) bfloat2; typedef short __attribute__((ext_vector_type(2))) short2; typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; @@ -15,16 +17,17 @@ kernel void builtins_amdgcn_dl_insts_err( half2 v2hA, half2 v2hB, float fC, half hC, short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC, ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC, +bfloat2 v2bfsA, bfloat2 v2bfsB, bfloat bfC, int A, int B, int C) { fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot10-insts}} fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, 
fC, true); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot10-insts}} hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC); // expected-error {{'__builtin_amdgcn_fdot2_f16_f16' needs target feature dot9-insts}} - sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC); // expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature dot9-insts}} + sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2bfsA, v2bfsB, bfC); // expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature dot9-insts}} - fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot9-insts}} - fOut[4] = __builtin_a
[clang] [Clang] Fix a non-effective assertion (PR #81083)
@@ -5908,7 +5908,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) && +assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && shiltian wrote: `canLosslesslyBitCastTo` is supposed to be stricter than `castIsValid` but it looks too conservative. For example, it doesn't allow bitcast between `float` and `int`. https://github.com/llvm/llvm-project/pull/81083 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang] Fix a non-effective assertion (PR #81083)
https://github.com/shiltian closed https://github.com/llvm/llvm-project/pull/81083 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 672fd3cf584480eb4769ccdb5f86acbc03865ec9 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 8 Feb 2024 11:17:25 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 ++-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 66 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +++ .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 60 + llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 ++- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 7 ++ llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 14 ++-- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 202 insertions(+), 67 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a7a410dab1a018..daf651917f2a96 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..0f29653f1f5bec 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; @@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 : DefaultAttrsIntrinsic< [llvm_float_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b llvm_float_ty, // %c llvm_i1_ty // %clamp ], diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index c1d8e890a66edb..828229f3e569e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != 
TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 225e781588668f..787217171721d8 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPU
[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From d14668fdfeef603624af520d11f5b66aa19da7be Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 8 Feb 2024 12:12:48 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 66 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +++ .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 ++- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 7 ++ llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 14 files changed, 191 insertions(+), 56 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a7a410dab1a018..daf651917f2a96 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to 
losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index c1d8e890a66edb..828229f3e569e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a94da992b33859..d6d96c251f7e30 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MC
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian ready_for_review https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -0,0 +1,8 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck %s + +v_dot2_bf16_bf16 v5, v1, v2, 100.0 shiltian wrote: The two instructions are from #79369 https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && shiltian wrote: This change might need to go to a separate patch. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -4181,13 +4181,20 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: return AMDGPU::isInlinableLiteralV2I16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2BF16: shiltian wrote: Yeah, I made some mistakes here. Will take care of them. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [LLVM] Add `__builtin_readfixedtimer` intrinsic and builtin for realtime clocks (PR #81331)
https://github.com/shiltian commented: Generally looks good to me. Just not sure about the name. "fixed timer" sounds pretty confusing to me. Perhaps `readfixedfreqtimer`? https://github.com/llvm/llvm-project/pull/81331 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [LLVM] Add `__builtin_readfixedtimer` intrinsic and builtin for realtime clocks (PR #81331)
@@ -312,6 +312,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0)); break; } + case Intrinsic::readfixedtimer: { +errs() << "WARNING: this target does not support the llvm.readfixedtimer" + << " intrinsic. It is being lowered to a constant 0\n"; +CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0)); shiltian wrote: `Constant::getNullValue(Type::getInt64Ty(Context))`? https://github.com/llvm/llvm-project/pull/81331 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [LLVM] Add `__builtin_readfixedtimer` intrinsic and builtin for realtime clocks (PR #81331)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/81331 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT:v_mov_b32_e32 v2, s1 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2 +; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2 shiltian wrote: Yeah, but I tested the FP16 version `llvm.amdgcn.fdot2.f16.f16` (w/ trunk w/o my patch), it generates `v_dot2_f16_f16 v2, s0, 0x3c003c00, v2`. I think we generally have issues with showing inline literals. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From 4196e998349d663a9a9922937cc4bedbec95fe5f Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 12 Feb 2024 13:48:39 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 48 ++--- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 295 insertions(+), 65 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a7a410dab1a018..daf651917f2a96 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index c1d8e890a66edb..828229f3e569e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git 
a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a94da992b33859..65d6fb587c19ca 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const {
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2730,6 +2749,12 @@ std::optional getInlineEncodingV2I16(uint32_t Literal) { return getInlineEncodingV216(false, Literal); } +// Encoding of the literal as an inline constant for a V_PK_*_BF16 instruction +// or nullopt. +std::optional getInlineEncodingV2BF16(uint32_t Literal) { + return getInlineEncodingV216(true, Literal); shiltian wrote: This part is still WIP along with https://github.com/llvm/llvm-project/pull/81345. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2660,15 +2660,34 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { return true; uint16_t Val = static_cast(Literal); - return Val == 0x3C00 || // 1.0 - Val == 0xBC00 || // -1.0 - Val == 0x3800 || // 0.5 - Val == 0xB800 || // -0.5 - Val == 0x4000 || // 2.0 - Val == 0xC000 || // -2.0 - Val == 0x4400 || // 4.0 - Val == 0xC400 || // -4.0 - Val == 0x3118; // 1/2pi + + // FP16 + if (Val == 0x3C00 || // 1.0 shiltian wrote: This function might be removed eventually in https://github.com/llvm/llvm-project/pull/81345. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From df3dbb6b9c257157c4afb407e40447a25c27a2a8 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 12 Feb 2024 18:03:57 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 - llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 ++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 74 --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 - llvm/test/MC/AMDGPU/bf16_imm.s| 10 +++ 16 files changed, 323 insertions(+), 65 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a7a410dab1a018..daf651917f2a96 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index c1d8e890a66edb..828229f3e569e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git 
a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..09f25215beb9e5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const {
[clang] [llvm] [LLVM] Add `__builtin_readsteadycounter` intrinsic and builtin for realtime clocks (PR #81331)
https://github.com/shiltian approved this pull request. LG https://github.com/llvm/llvm-project/pull/81331 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From c556e40c13adb9d253ef7c5ebb2b46cb12969d46 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 15:30:51 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 292 insertions(+), 56 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 311dd9d9739a6d..3290262816ef0a 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (Opcode != TargetOpcode::G_BITCAST && + (U.getType()->getScalarType()->isBFloatTy() || + U.getOperand(0)->getType()->getScalarType()->isBFloatTy())) return false; Register Op = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); diff --git 
a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..09f25215beb9e5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { ret
[clang] [Clang][CodeGen] Loose the cast check when emitting builtins (PR #81669)
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/81669 This patch looses the cast check (`canLosslesslyBitCastTo`) and leaves it to the one inside `CreateBitCast`. It seems too conservative for the use case here. >From 813441fd3106a0069346aabd0dd828d8feb8ea53 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 16:39:25 -0500 Subject: [PATCH] [Clang][CodeGen] Loose the cast check when emitting builtins This patch looses the cast check (`canLosslesslyBitCastTo`) and leaves it to the one inside `CreateBitCast`. It seems too conservative for the use case here. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 1 file changed, 4 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
shiltian wrote: The patch is in good shape now. I have made two other prerequisite patches (#81674 and #81669). I'll rebase this one once they have landed. This patch only changes one bf16 instruction, along with the necessary infrastructure for the others. I'll update all of them once this patch has landed. However, I don't think `isInlinableLiteral16` works correctly, because the encoding of the floating-point inline literals is different for fp16 and bf16, and apparently for now it can only recognize the fp16 encoding. This patch at least makes the asm printer work properly. #81345 is trying to fix it correctly, but that is unrelated to this patch. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT:v_mov_b32_e32 v2, s1 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2 +; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2 shiltian wrote: FWIW, #81345 can solve the issue, but I'm struggling to get two test cases to pass. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From bfd3170dc5e4d6e53fb98b46b37f2bf3c3ebf86d Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 17:39:23 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 49 + llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 15 files changed, 289 insertions(+), 67 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 79ad6ddf7861fc..09f25215beb9e5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); } + bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); } + bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); } bool isSSrcV2F16() const { @@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTBF16() const { +return 
isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16); + } + bool isVCSrcTF16() const { return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16); } + bool isVCSrcTBF16_Lo128() const { +return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::bf16); + } + bool isVCSrcTF16_Lo128() const { return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::f16); } + bool isVCSrcFake16BF16_Lo128() const { +return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::bf16); + } + bool isVCSrcFake16F16_Lo128() const { return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT:
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11 shiltian wrote: This change is because of the discussion here (https://github.com/llvm/llvm-project/pull/80908/files#r1483394728). https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r shiltian wrote: The cast will be inserted automatically in `clang/lib/CodeGen/CGBuiltin.cpp` after removing the two assertions. I reverted my change to the test case by accident. Lol https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -488,6 +488,49 @@ static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, return true; } +static bool printImmediateBFloat16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { + if (Imm == 0x3F80) +O << "1.0"; + else if (Imm == 0xBF80) +O << "-1.0"; + else if (Imm == 0x3F00) +O << "0.5"; + else if (Imm == 0xBF00) +O << "-0.5"; + else if (Imm == 0x4000) +O << "2.0"; + else if (Imm == 0xC000) +O << "-2.0"; + else if (Imm == 0x4080) +O << "4.0"; + else if (Imm == 0xC080) +O << "-4.0"; + else if (Imm == 0x3E22 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) +O << "0.15915494"; + else +return false; + + return true; +} + +void AMDGPUInstPrinter::printImmediateBF16(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int16_t SImm = static_cast(Imm); + if (isInlinableIntLiteral(SImm)) { +O << SImm; +return; + } + + uint16_t HImm = static_cast(Imm); + if (printImmediateBFloat16(HImm, STI, O)) +return; + + uint64_t Imm16 = static_cast(Imm); shiltian wrote: Yeah, but it is to make the type promoted to `uint64_t` w/o any ambiguity. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
@@ -4185,9 +4185,17 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: return AMDGPU::isInlinableLiteralV2F16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16: +return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: + case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_BF16: shiltian wrote: Yes, `isInlinableLiteral16` can't because it can't tell `fp16` and `bf16` apart by just looking at the value. That's the reason I tried really hard to get rid of `isInlinableLiteral16` in #81345 and favor the explicit version. https://github.com/llvm/llvm-project/pull/80908 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/80908 >From d72bf8bb9d1091ba76e17bf09b0aad9073e18caa Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 13 Feb 2024 19:02:41 -0500 Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat Currently it looks like we generally use `i16` to represent `bf16` in those tablegen files. I'm not sure of the reason behind it. My wild guess is the type `bf16` was not available when we enabled the support. This patch is trying to use `bf16` directly in those tablegen files, aiming at fixing #79369. Of course for #79369 a workaround can be to treat all `INT16` variants as `BFloat` in `getOpFltSemantics`, but it doesn't look good IMHO. Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out where I don't understand correctly. --- clang/lib/CodeGen/CGBuiltin.cpp | 4 -- .../builtins-amdgcn-dl-insts-gfx11.cl | 5 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 71 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 +++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7 ++ llvm/lib/Target/AMDGPU/SIDefines.h| 7 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 21 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 llvm/lib/Target/AMDGPU/VOP3Instructions.td| 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 51 ++--- llvm/test/MC/AMDGPU/bf16_imm.s| 8 +++ 16 files changed, 293 insertions(+), 68 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ee0b7504769622..9bc60466d09be6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, } } -assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) && - "Must be able to losslessly bit cast to param"); // Cast vector type (e.g., v256i32) to x86_amx, this only happen // in amx intrinsics. if (PTy->isX86_AMXTy()) @@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } } - assert(V->getType()->canLosslesslyBitCastTo(RetTy) && - "Must be able to losslessly bit cast result type"); // Cast x86_amx to vector type (e.g., v256i32), this only happen // in amx intrinsics. if (V->getType()->isX86_AMXTy()) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index dc7069decaaa61..7688dfa55a78e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) -// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat> +// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> +// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat +// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) diff --git 
a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 202fa4e8f4ea81..6795fb7aa0edb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 : def int_amdgcn_fdot2_bf16_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, DefaultAttrsIntrinsic< -[llvm_i16_ty], // %r +[llvm_bfloat_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b - llvm_i16_ty// %c + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b + llvm_bfloat_ty// %c ], [IntrNoMem, IntrSpec