[clang] 8bd7e41 - Replace separator in OpenMP variant name mangling.

2020-06-03 Thread Shilei Tian via cfe-commits

Author: Lukas Sommer
Date: 2020-06-03T16:36:32-04:00
New Revision: 8bd7e4188a096b063065aac70ce39129c479f124

URL: 
https://github.com/llvm/llvm-project/commit/8bd7e4188a096b063065aac70ce39129c479f124
DIFF: 
https://github.com/llvm/llvm-project/commit/8bd7e4188a096b063065aac70ce39129c479f124.diff

LOG: Replace separator in OpenMP variant name mangling.

Summary:
Nvidia PTX does not allow `.` to appear in identifiers, so OpenMP variant 
mangling now uses `$` to separate segments of the mangled name for variants of 
functions declared via `declare variant`.

Reviewers: jdoerfert, Hahnfeld

Reviewed By: jdoerfert

Subscribers: yaxunl, guansong, sstefan1, cfe-commits

Tags: #openmp, #clang

Differential Revision: https://reviews.llvm.org/D80439

Added: 
clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp

Modified: 
clang/include/clang/AST/Decl.h
clang/lib/AST/OpenMPClause.cpp

Removed: 




diff  --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 185ba2f4b4c1..6c39f6aab1b9 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -4560,7 +4560,7 @@ inline bool IsEnumDeclScoped(EnumDecl *ED) {
 /// The new name looks likes this:
 ///   + OpenMPVariantManglingSeparatorStr + 
 static constexpr StringRef getOpenMPVariantManglingSeparatorStr() {
-  return ".ompvariant";
+  return "$ompvariant";
 }
 
 } // namespace clang

diff  --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index fa1c80fc6bbf..bcbe916820dc 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -2167,22 +2167,21 @@ std::string OMPTraitInfo::getMangledName() const {
   std::string MangledName;
   llvm::raw_string_ostream OS(MangledName);
   for (const OMPTraitSet &Set : Sets) {
-OS << '.' << 'S' << unsigned(Set.Kind);
+OS << '$' << 'S' << unsigned(Set.Kind);
 for (const OMPTraitSelector &Selector : Set.Selectors) {
 
   bool AllowsTraitScore = false;
   bool RequiresProperty = false;
   isValidTraitSelectorForTraitSet(
   Selector.Kind, Set.Kind, AllowsTraitScore, RequiresProperty);
-  OS << '.' << 's' << unsigned(Selector.Kind);
+  OS << '$' << 's' << unsigned(Selector.Kind);
 
   if (!RequiresProperty ||
   Selector.Kind == TraitSelector::user_condition)
 continue;
 
   for (const OMPTraitProperty &Property : Selector.Properties)
-OS << '.' << 'P'
-   << getOpenMPContextTraitPropertyName(Property.Kind);
+OS << '$' << 'P' << getOpenMPContextTraitPropertyName(Property.Kind);
 }
   }
   return OS.str();
@@ -2191,7 +2190,7 @@ std::string OMPTraitInfo::getMangledName() const {
 OMPTraitInfo::OMPTraitInfo(StringRef MangledName) {
   unsigned long U;
   do {
-if (!MangledName.consume_front(".S"))
+if (!MangledName.consume_front("$S"))
   break;
 if (MangledName.consumeInteger(10, U))
   break;
@@ -2199,7 +2198,7 @@ OMPTraitInfo::OMPTraitInfo(StringRef MangledName) {
 OMPTraitSet &Set = Sets.back();
 Set.Kind = TraitSet(U);
 do {
-  if (!MangledName.consume_front(".s"))
+  if (!MangledName.consume_front("$s"))
 break;
   if (MangledName.consumeInteger(10, U))
 break;
@@ -2207,11 +2206,11 @@ OMPTraitInfo::OMPTraitInfo(StringRef MangledName) {
   OMPTraitSelector &Selector = Set.Selectors.back();
   Selector.Kind = TraitSelector(U);
   do {
-if (!MangledName.consume_front(".P"))
+if (!MangledName.consume_front("$P"))
   break;
 Selector.Properties.push_back(OMPTraitProperty());
 OMPTraitProperty &Property = Selector.Properties.back();
-std::pair PropRestPair = MangledName.split('.');
+std::pair PropRestPair = MangledName.split('$');
 Property.Kind =
 getOpenMPContextTraitPropertyKind(Set.Kind, PropRestPair.first);
 MangledName = PropRestPair.second;

diff  --git a/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp 
b/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp
new file mode 100644
index ..6a9ce799d01e
--- /dev/null
+++ b/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown 
-fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc 
-fopenmp-version=50
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown 
-aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device 
-fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=50 | FileCheck 
%s --implicit-check-not='call i32 {@_Z3bazv|@_Z3barv}'
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown 
-aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-device 
-fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=50
+// RU

[clang] [Clang][OpenMP]: fixed crash due to invalid binary expression in checking atomic semantics (PR #71480)

2023-11-06 Thread Shilei Tian via cfe-commits

shiltian wrote:

This doesn't look like the right place to fix this issue to me. @alexey-bataev 
might have better suggestion.

https://github.com/llvm/llvm-project/pull/71480
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] fixed crash due to invalid binary expression in checking atomic semantics (PR #71480)

2023-11-06 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/71480
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] fixed crash due to invalid binary expression in checking atomic semantics (PR #71480)

2023-11-07 Thread Shilei Tian via cfe-commits

https://github.com/shiltian closed 
https://github.com/llvm/llvm-project/pull/71480
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] fixed crash due to invalid binary expression in checking atomic semantics (PR #71480)

2023-11-07 Thread Shilei Tian via cfe-commits

shiltian wrote:

I have merged it given our front end expert @alexey-bataev has approved it.

https://github.com/llvm/llvm-project/pull/71480
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Return empty QualType when a negative array was created (PR #71552)

2023-11-07 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/71552
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [openmp] [OpenMP] Add support for Solaris/x86_64 (PR #70593)

2023-11-08 Thread Shilei Tian via cfe-commits


@@ -70,6 +72,15 @@ struct kmp_sys_timer {
   struct timespec start;
 };
 
+#ifdef KMP_OS_SOLARIS
+// Convert timeval to timespec.
+#define TIMEVAL_TO_TIMESPEC(tv, ts)
\

shiltian wrote:

It looks like this macro is not guarded correct somewhere else such that the 
compiler complains on a non-Solaris system.
```
[19/128] Building CXX object runtime/src/CMakeFiles/omp.dir/z_Linux_util.cpp.o
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/runtime/src/z_Linux_util.cpp:77:9:
 warning: 'TIMEVAL_TO_TIMESPEC' macro redefined [-Wmacro-redefined]
   77 | #define TIMEVAL_TO_TIMESPEC(tv, ts) 
   \
  | ^
/usr/include/sys/time.h:38:10: note: previous definition is here
   38 | # define TIMEVAL_TO_TIMESPEC(tv, ts) {  
 \
  |  ^
```

https://github.com/llvm/llvm-project/pull/70593
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Return empty QualType when a negative array was created (PR #71552)

2023-11-09 Thread Shilei Tian via cfe-commits

shiltian wrote:

The changes look good to me. @alexey-bataev WDYT?

https://github.com/llvm/llvm-project/pull/71552
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Return empty QualType when a negative array was created (PR #71552)

2023-11-09 Thread Shilei Tian via cfe-commits

https://github.com/shiltian closed 
https://github.com/llvm/llvm-project/pull/71552
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)

2023-11-12 Thread Shilei Tian via cfe-commits

shiltian wrote:

Gentle ping

https://github.com/llvm/llvm-project/pull/68373
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [openmp] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)

2023-11-12 Thread Shilei Tian via cfe-commits

shiltian wrote:

Gentle ping

https://github.com/llvm/llvm-project/pull/70612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)

2023-11-13 Thread Shilei Tian via cfe-commits


@@ -14633,6 +14633,26 @@ StmtResult 
Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses,
   }
   setFunctionHasBranchProtectedScope();
 
+  bool HasBareClause = false;
+  bool HasThreadLimitClause = false;
+  bool HasNumTeamsClause = false;
+  OMPClause *BareClause = nullptr;
+
+  for (auto *C : Clauses) {
+if (isa(C)) {
+  BareClause = C;
+  HasBareClause = true;

shiltian wrote:

It is not very necessarily needed, but I found it look nicer to check three 
boolean variables instead of two as well as a pointer. Just code preference. 
Nothing technical.

https://github.com/llvm/llvm-project/pull/68373
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Fix private variables registration in `simd` (PR #74105)

2023-12-05 Thread Shilei Tian via cfe-commits

https://github.com/shiltian closed 
https://github.com/llvm/llvm-project/pull/74105
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)

2023-12-11 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/68373

>From ef3d7309feb1fcedb27f10bd11273eeb08ebf7aa Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Mon, 11 Dec 2023 19:25:01 -0500
Subject: [PATCH] [OpenMP][Clang] Force use of `num_teams` and `thread_limit`
 for bare kernel

This patch makes `num_teams` and `thread_limit` mandatory for bare kernels,
similar to a reguar kernel language that when launching a kernel, the grid size
has to be set explicitly.
---
 .../clang/Basic/DiagnosticSemaKinds.td|   2 +
 clang/lib/Sema/SemaOpenMP.cpp |  13 ++
 clang/test/OpenMP/ompx_bare_messages.c|   7 +-
 clang/test/OpenMP/target_teams_codegen.cpp| 206 +-
 4 files changed, 127 insertions(+), 101 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 94e97a891baed..8b2f5d2746f7f 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11368,6 +11368,8 @@ def err_openmp_vla_in_task_untied : Error<
 def warn_omp_unterminated_declare_target : Warning<
   "expected '#pragma omp end declare target' at end of file to match '#pragma 
omp %0'">,
   InGroup;
+def err_ompx_bare_no_grid : Error<
+  "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 
'thread_limit' clauses">;
 } // end of OpenMP category
 
 let CategoryName = "Related Result Type Issue" in {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index e400f248d15aa..3826994ef2126 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14658,6 +14658,19 @@ StmtResult 
Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses,
   }
   setFunctionHasBranchProtectedScope();
 
+  const OMPClause *BareClause = nullptr;
+  bool HasThreadLimitAndNumTeamsClause = hasClauses(Clauses, OMPC_num_teams) &&
+ hasClauses(Clauses, 
OMPC_thread_limit);
+  bool HasBareClause = llvm::any_of(Clauses, [&](const OMPClause *C) {
+BareClause = C;
+return C->getClauseKind() == OMPC_ompx_bare;
+  });
+
+  if (HasBareClause && !HasThreadLimitAndNumTeamsClause) {
+Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid);
+return StmtError();
+  }
+
   return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses,
  AStmt);
 }
diff --git a/clang/test/OpenMP/ompx_bare_messages.c 
b/clang/test/OpenMP/ompx_bare_messages.c
index a1b3c38028528..19ceee5625fee 100644
--- a/clang/test/OpenMP/ompx_bare_messages.c
+++ b/clang/test/OpenMP/ompx_bare_messages.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
+// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
 
 void foo() {
 }
@@ -18,4 +18,7 @@ void bar() {
 #pragma omp target
 #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 
'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension 
clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
   foo();
+
+#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses 
requires explicit grid size via 'num_teams' and 'thread_limit' clauses}}
+  foo();
 }
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp 
b/clang/test/OpenMP/target_teams_codegen.cpp
index b7c7add229c14..8790a0fc87cbb 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -121,7 +121,7 @@ int foo(int n) {
 aa += 1;
   }
 
-  #pragma omp target teams ompx_bare
+  #pragma omp target teams ompx_bare num_teams(1) thread_limit(1)
   {
 a += 1;
 aa += 1;
@@ -588,12 +588,12 @@ int bar(int n){
 // CHECK1-NEXT:[[TMP116:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
 // CHECK1-NEXT:store i64 0, ptr [[TMP116]], align 8
 // CHECK1-NEXT:[[TMP117:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
-// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4
+// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP117]], 
align 4
 // CHECK1-NEXT:[[TMP118:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
-// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4
+// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP118]], 
align 4
 // CHECK1-NEXT:[[TMP119:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 12
 // CHECK

[libc] [clang-tools-extra] [libcxx] [clang] [lld] [lldb] [flang] [llvm] [OpenACC] Initial commits to support OpenACC (PR #70234)

2023-11-17 Thread Shilei Tian via cfe-commits

shiltian wrote:

We will want to have OpenACC label(s) such that people who are interested can 
be notified.

https://github.com/llvm/llvm-project/pull/70234
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [libcxx] [flang] [lldb] [llvm] [libc] [lld] [OpenACC] Initial commits to support OpenACC (PR #70234)

2023-11-17 Thread Shilei Tian via cfe-commits

shiltian wrote:

You might also want to update the team "pr-subscribers-openacc" because 
currently it only reflects Flang.

https://github.com/llvm/llvm-project/pull/70234
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Unify the min/max thread/teams pathways (PR #70273)

2023-10-28 Thread Shilei Tian via cfe-commits

https://github.com/shiltian approved this pull request.

LG

https://github.com/llvm/llvm-project/pull/70273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Add support for Solaris (PR #70593)

2023-10-29 Thread Shilei Tian via cfe-commits

shiltian wrote:

I'm not familiar with Solaris but does it need dedicated implementation of the 
function invocation written in ASM?

https://github.com/llvm/llvm-project/pull/70593
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)

2023-10-29 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/68373

>From d0d8bcc9fa002304ef79ca48d736853ca621c0bd Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Sun, 29 Oct 2023 19:17:19 -0400
Subject: [PATCH] [OpenMP][Clang] Force use of `num_teams` and `thread_limit`
 for bare kernel

This patch makes `num_teams` and `thread_limit` mandatory for bare kernels,
similar to a reguar kernel language that when launching a kernel, the grid size
has to be set explicitly.
---
 .../clang/Basic/DiagnosticSemaKinds.td|   2 +
 clang/lib/Sema/SemaOpenMP.cpp |  20 ++
 clang/test/OpenMP/ompx_bare_messages.c|   7 +-
 clang/test/OpenMP/target_teams_codegen.cpp| 230 +-
 4 files changed, 146 insertions(+), 113 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 453bd8a9a340425..d1398763b49c701 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11333,6 +11333,8 @@ def err_openmp_vla_in_task_untied : Error<
 def warn_omp_unterminated_declare_target : Warning<
   "expected '#pragma omp end declare target' at end of file to match '#pragma 
omp %0'">,
   InGroup;
+def err_ompx_bare_no_grid : Error<
+  "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 
'thread_limit' clauses">;
 } // end of OpenMP category
 
 let CategoryName = "Related Result Type Issue" in {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 75f9e152dca9297..90a0d1f70f268f1 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14633,6 +14633,26 @@ StmtResult 
Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses,
   }
   setFunctionHasBranchProtectedScope();
 
+  bool HasBareClause = false;
+  bool HasThreadLimitClause = false;
+  bool HasNumTeamsClause = false;
+  OMPClause *BareClause = nullptr;
+
+  for (auto *C : Clauses) {
+if (isa(C)) {
+  BareClause = C;
+  HasBareClause = true;
+} else if (isa(C))
+  HasNumTeamsClause = true;
+else if (isa(C))
+  HasThreadLimitClause = true;
+  }
+
+  if (HasBareClause && !(HasNumTeamsClause && HasThreadLimitClause)) {
+Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid);
+return StmtError();
+  }
+
   return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses,
  AStmt);
 }
diff --git a/clang/test/OpenMP/ompx_bare_messages.c 
b/clang/test/OpenMP/ompx_bare_messages.c
index a1b3c380285287d..19ceee5625feecc 100644
--- a/clang/test/OpenMP/ompx_bare_messages.c
+++ b/clang/test/OpenMP/ompx_bare_messages.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
+// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
 
 void foo() {
 }
@@ -18,4 +18,7 @@ void bar() {
 #pragma omp target
 #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 
'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension 
clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
   foo();
+
+#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses 
requires explicit grid size via 'num_teams' and 'thread_limit' clauses}}
+  foo();
 }
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp 
b/clang/test/OpenMP/target_teams_codegen.cpp
index 9ee1f74e8fdc468..95182c668c5e822 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -121,7 +121,7 @@ int foo(int n) {
 aa += 1;
   }
 
-  #pragma omp target teams ompx_bare
+  #pragma omp target teams ompx_bare num_teams(1) thread_limit(1)
   {
 a += 1;
 aa += 1;
@@ -588,12 +588,12 @@ int bar(int n){
 // CHECK1-NEXT:[[TMP116:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
 // CHECK1-NEXT:store i64 0, ptr [[TMP116]], align 8
 // CHECK1-NEXT:[[TMP117:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
-// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4
+// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP117]], 
align 4
 // CHECK1-NEXT:[[TMP118:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
-// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4
+// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP118]], 
align 4
 // CHECK1-NEXT:[[TMP119:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL

[clang] no default grid size (PR #70612)

2023-10-29 Thread Shilei Tian via cfe-commits

https://github.com/shiltian created 
https://github.com/llvm/llvm-project/pull/70612

- [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel
- [OpenMP] Directly use user's grid and block size in kernel language mode


>From 7b0eaa1606ad2e557105fed9509c135f857db375 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Sun, 29 Oct 2023 19:18:49 -0400
Subject: [PATCH 1/2] [OpenMP][Clang] Force use of `num_teams` and
 `thread_limit` for bare kernel

This patch makes `num_teams` and `thread_limit` mandatory for bare kernels,
similar to a reguar kernel language that when launching a kernel, the grid size
has to be set explicitly.
---
 .../clang/Basic/DiagnosticSemaKinds.td|   2 +
 clang/lib/Sema/SemaOpenMP.cpp |  20 ++
 clang/test/OpenMP/ompx_bare_messages.c|   7 +-
 clang/test/OpenMP/target_teams_codegen.cpp| 230 +-
 4 files changed, 146 insertions(+), 113 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 453bd8a9a340425..d1398763b49c701 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11333,6 +11333,8 @@ def err_openmp_vla_in_task_untied : Error<
 def warn_omp_unterminated_declare_target : Warning<
   "expected '#pragma omp end declare target' at end of file to match '#pragma 
omp %0'">,
   InGroup;
+def err_ompx_bare_no_grid : Error<
+  "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 
'thread_limit' clauses">;
 } // end of OpenMP category
 
 let CategoryName = "Related Result Type Issue" in {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 75f9e152dca9297..90a0d1f70f268f1 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14633,6 +14633,26 @@ StmtResult 
Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses,
   }
   setFunctionHasBranchProtectedScope();
 
+  bool HasBareClause = false;
+  bool HasThreadLimitClause = false;
+  bool HasNumTeamsClause = false;
+  OMPClause *BareClause = nullptr;
+
+  for (auto *C : Clauses) {
+if (isa(C)) {
+  BareClause = C;
+  HasBareClause = true;
+} else if (isa(C))
+  HasNumTeamsClause = true;
+else if (isa(C))
+  HasThreadLimitClause = true;
+  }
+
+  if (HasBareClause && !(HasNumTeamsClause && HasThreadLimitClause)) {
+Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid);
+return StmtError();
+  }
+
   return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses,
  AStmt);
 }
diff --git a/clang/test/OpenMP/ompx_bare_messages.c 
b/clang/test/OpenMP/ompx_bare_messages.c
index a1b3c380285287d..19ceee5625feecc 100644
--- a/clang/test/OpenMP/ompx_bare_messages.c
+++ b/clang/test/OpenMP/ompx_bare_messages.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
+// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
 
 void foo() {
 }
@@ -18,4 +18,7 @@ void bar() {
 #pragma omp target
 #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 
'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension 
clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
   foo();
+
+#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses 
requires explicit grid size via 'num_teams' and 'thread_limit' clauses}}
+  foo();
 }
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp 
b/clang/test/OpenMP/target_teams_codegen.cpp
index 9ee1f74e8fdc468..95182c668c5e822 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -121,7 +121,7 @@ int foo(int n) {
 aa += 1;
   }
 
-  #pragma omp target teams ompx_bare
+  #pragma omp target teams ompx_bare num_teams(1) thread_limit(1)
   {
 a += 1;
 aa += 1;
@@ -588,12 +588,12 @@ int bar(int n){
 // CHECK1-NEXT:[[TMP116:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
 // CHECK1-NEXT:store i64 0, ptr [[TMP116]], align 8
 // CHECK1-NEXT:[[TMP117:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
-// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4
+// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP117]], 
align 4
 // CHECK1-NEXT:[[TMP118:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
-// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4
+// CHECK1-NEXT:store [3 x i3

[clang] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)

2023-10-29 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/70612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)

2023-10-29 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/70612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)

2023-10-29 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/70612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Directly use user's grid and block size in kernel language mode (PR #70612)

2023-10-29 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/70612

>From 7b0eaa1606ad2e557105fed9509c135f857db375 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Sun, 29 Oct 2023 19:18:49 -0400
Subject: [PATCH 1/2] [OpenMP][Clang] Force use of `num_teams` and
 `thread_limit` for bare kernel

This patch makes `num_teams` and `thread_limit` mandatory for bare kernels,
similar to a reguar kernel language that when launching a kernel, the grid size
has to be set explicitly.
---
 .../clang/Basic/DiagnosticSemaKinds.td|   2 +
 clang/lib/Sema/SemaOpenMP.cpp |  20 ++
 clang/test/OpenMP/ompx_bare_messages.c|   7 +-
 clang/test/OpenMP/target_teams_codegen.cpp| 230 +-
 4 files changed, 146 insertions(+), 113 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 453bd8a9a340425..d1398763b49c701 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11333,6 +11333,8 @@ def err_openmp_vla_in_task_untied : Error<
 def warn_omp_unterminated_declare_target : Warning<
   "expected '#pragma omp end declare target' at end of file to match '#pragma 
omp %0'">,
   InGroup;
+def err_ompx_bare_no_grid : Error<
+  "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 
'thread_limit' clauses">;
 } // end of OpenMP category
 
 let CategoryName = "Related Result Type Issue" in {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 75f9e152dca9297..90a0d1f70f268f1 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14633,6 +14633,26 @@ StmtResult 
Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses,
   }
   setFunctionHasBranchProtectedScope();
 
+  bool HasBareClause = false;
+  bool HasThreadLimitClause = false;
+  bool HasNumTeamsClause = false;
+  OMPClause *BareClause = nullptr;
+
+  for (auto *C : Clauses) {
+if (isa(C)) {
+  BareClause = C;
+  HasBareClause = true;
+} else if (isa(C))
+  HasNumTeamsClause = true;
+else if (isa(C))
+  HasThreadLimitClause = true;
+  }
+
+  if (HasBareClause && !(HasNumTeamsClause && HasThreadLimitClause)) {
+Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid);
+return StmtError();
+  }
+
   return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses,
  AStmt);
 }
diff --git a/clang/test/OpenMP/ompx_bare_messages.c 
b/clang/test/OpenMP/ompx_bare_messages.c
index a1b3c380285287d..19ceee5625feecc 100644
--- a/clang/test/OpenMP/ompx_bare_messages.c
+++ b/clang/test/OpenMP/ompx_bare_messages.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
+// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
 
 void foo() {
 }
@@ -18,4 +18,7 @@ void bar() {
 #pragma omp target
 #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 
'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension 
clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
   foo();
+
+#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses 
requires explicit grid size via 'num_teams' and 'thread_limit' clauses}}
+  foo();
 }
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp 
b/clang/test/OpenMP/target_teams_codegen.cpp
index 9ee1f74e8fdc468..95182c668c5e822 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -121,7 +121,7 @@ int foo(int n) {
 aa += 1;
   }
 
-  #pragma omp target teams ompx_bare
+  #pragma omp target teams ompx_bare num_teams(1) thread_limit(1)
   {
 a += 1;
 aa += 1;
@@ -588,12 +588,12 @@ int bar(int n){
 // CHECK1-NEXT:[[TMP116:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
 // CHECK1-NEXT:store i64 0, ptr [[TMP116]], align 8
 // CHECK1-NEXT:[[TMP117:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
-// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4
+// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP117]], 
align 4
 // CHECK1-NEXT:[[TMP118:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
-// CHECK1-NEXT:store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4
+// CHECK1-NEXT:store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP118]], 
align 4
 // CHECK1-NEXT:[[TMP119:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KE

[clang] [openmp] Add memory diff dump for kernel record-replay (PR #70667)

2023-10-30 Thread Shilei Tian via cfe-commits


@@ -274,7 +317,7 @@ struct RecordReplayTy {
   void saveKernelOutputInfo(const char *Name) {
 SmallString<128> OutputFilename = {
 Name, (isRecording() ? ".original.output" : ".replay.output")};
-dumpDeviceMemory(OutputFilename);
+dumpDeviceMemory(OutputFilename, true);

shiltian wrote:

```suggestion
dumpDeviceMemory(OutputFilename, /*saveDiff*/true);
```

https://github.com/llvm/llvm-project/pull/70667
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [openmp] Add memory diff dump for kernel record-replay (PR #70667)

2023-10-30 Thread Shilei Tian via cfe-commits


@@ -274,7 +317,7 @@ struct RecordReplayTy {
   void saveKernelOutputInfo(const char *Name) {
 SmallString<128> OutputFilename = {
 Name, (isRecording() ? ".original.output" : ".replay.output")};
-dumpDeviceMemory(OutputFilename);
+dumpDeviceMemory(OutputFilename, true);

shiltian wrote:

```suggestion
dumpDeviceMemory(OutputFilename, /*saveDiff*/true);
```

https://github.com/llvm/llvm-project/pull/70667
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Add support for Solaris/x86_64 (PR #70593)

2023-10-30 Thread Shilei Tian via cfe-commits

shiltian wrote:

> > I'm not familiar with Solaris but does it need dedicated implementation of 
> > the function invocation written in ASM?
> 
> Can you point out what you're referring to? Looking at the patches for adding 
> support for OpenBSD and other OS's I don't see any ASM additions.

I was thinking about `openmp/runtime/src/z_Linux_asm.S` but it looks like we 
don't need extra stuff there.

https://github.com/llvm/llvm-project/pull/70593
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [openmp] Add memory diff dump for kernel record-replay (PR #70667)

2023-10-30 Thread Shilei Tian via cfe-commits

shiltian wrote:

Could you fix the existing format issue in a separate RFC patch?

https://github.com/llvm/llvm-project/pull/70667
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [openmp] Add memory diff dump for kernel record-replay (PR #70667)

2023-10-30 Thread Shilei Tian via cfe-commits

shiltian wrote:

Could you fix the existing format issue in a separate RFC patch?

https://github.com/llvm/llvm-project/pull/70667
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [StackProtector] Do not emit the stack protector on GPU architectures (PR #70799)

2023-10-31 Thread Shilei Tian via cfe-commits

https://github.com/shiltian approved this pull request.

I think the changes make sense.

https://github.com/llvm/llvm-project/pull/70799
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [openmp] [clang] [OpenMP] Introduce the KernelLaunchEnvironment as implicit argument (PR #70401)

2023-10-31 Thread Shilei Tian via cfe-commits

shiltian wrote:

Tests in `mlir` have to be updated as well.

https://github.com/llvm/llvm-project/pull/70401
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [openmp] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (PR #70752)

2023-11-01 Thread Shilei Tian via cfe-commits

https://github.com/shiltian approved this pull request.

LG with some nits

https://github.com/llvm/llvm-project/pull/70752
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [openmp] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (PR #70752)

2023-11-01 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/70752
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [openmp] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (PR #70752)

2023-11-01 Thread Shilei Tian via cfe-commits


@@ -194,6 +191,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 ThreadId = 0;
   }
 
+  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
+  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

shiltian wrote:

These two variables seem not used anywhere else.

https://github.com/llvm/llvm-project/pull/70752
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[openmp] [clang] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (PR #70752)

2023-11-01 Thread Shilei Tian via cfe-commits


@@ -387,7 +387,7 @@ struct GenericKernelTy {
   KernelEnvironmentTy KernelEnvironment;
 
   /// The prototype kernel launch environment.
-  KernelLaunchEnvironmentTy KernelLaunchEnvironment;
+  KernelLaunchEnvironmentTy KernelLaunchEnvironment = {0, 0};

shiltian wrote:

```suggestion
  KernelLaunchEnvironmentTy KernelLaunchEnvironment = {0, 0, nullptr};
```

https://github.com/llvm/llvm-project/pull/70752
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[openmp] [clang] [OpenMP] Add support for Solaris/x86_64 (PR #70593)

2023-11-02 Thread Shilei Tian via cfe-commits

https://github.com/shiltian approved this pull request.

LG

https://github.com/llvm/llvm-project/pull/70593
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)

2023-12-11 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/68373

>From 380046a1117cef08e40f9bcdce2c51c3fe73a26f Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Tue, 12 Dec 2023 00:11:13 -0500
Subject: [PATCH] [OpenMP][Clang] Force use of `num_teams` and `thread_limit`
 for bare kernel

This patch makes `num_teams` and `thread_limit` mandatory for bare kernels,
similar to a reguar kernel language that when launching a kernel, the grid size
has to be set explicitly.
---
 .../clang/Basic/DiagnosticSemaKinds.td|   2 +
 clang/lib/Sema/SemaOpenMP.cpp |  13 ++
 .../nvptx_target_teams_ompx_bare_codegen.cpp  |   2 +-
 clang/test/OpenMP/ompx_bare_messages.c|   7 +-
 clang/test/OpenMP/target_teams_ast_print.cpp  |   4 +-
 clang/test/OpenMP/target_teams_codegen.cpp| 206 +-
 6 files changed, 130 insertions(+), 104 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 94e97a891baedc..8b2f5d2746f7fc 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11368,6 +11368,8 @@ def err_openmp_vla_in_task_untied : Error<
 def warn_omp_unterminated_declare_target : Warning<
   "expected '#pragma omp end declare target' at end of file to match '#pragma 
omp %0'">,
   InGroup;
+def err_ompx_bare_no_grid : Error<
+  "'ompx_bare' clauses requires explicit grid size via 'num_teams' and 
'thread_limit' clauses">;
 } // end of OpenMP category
 
 let CategoryName = "Related Result Type Issue" in {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index e400f248d15aa3..3826994ef2126c 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14658,6 +14658,19 @@ StmtResult 
Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses,
   }
   setFunctionHasBranchProtectedScope();
 
+  const OMPClause *BareClause = nullptr;
+  bool HasThreadLimitAndNumTeamsClause = hasClauses(Clauses, OMPC_num_teams) &&
+ hasClauses(Clauses, 
OMPC_thread_limit);
+  bool HasBareClause = llvm::any_of(Clauses, [&](const OMPClause *C) {
+BareClause = C;
+return C->getClauseKind() == OMPC_ompx_bare;
+  });
+
+  if (HasBareClause && !HasThreadLimitAndNumTeamsClause) {
+Diag(BareClause->getBeginLoc(), diag::err_ompx_bare_no_grid);
+return StmtError();
+  }
+
   return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses,
  AStmt);
 }
diff --git a/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp 
b/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp
index 9f8046acb09703..2e6f0a9ce0169f 100644
--- a/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp
@@ -10,7 +10,7 @@ template
 tx ftemplate(int n) {
   tx a = 0;
 
-  #pragma omp target teams ompx_bare
+  #pragma omp target teams ompx_bare num_teams(1) thread_limit(32)
   {
 a = 2;
   }
diff --git a/clang/test/OpenMP/ompx_bare_messages.c 
b/clang/test/OpenMP/ompx_bare_messages.c
index a1b3c380285287..19ceee5625feec 100644
--- a/clang/test/OpenMP/ompx_bare_messages.c
+++ b/clang/test/OpenMP/ompx_bare_messages.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
- // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
+// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown 
-fopenmp-targets=nvptx64 %s
 
 void foo() {
 }
@@ -18,4 +18,7 @@ void bar() {
 #pragma omp target
 #pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 
'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension 
clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
   foo();
+
+#pragma omp target teams ompx_bare // expected-error {{'ompx_bare' clauses 
requires explicit grid size via 'num_teams' and 'thread_limit' clauses}}
+  foo();
 }
diff --git a/clang/test/OpenMP/target_teams_ast_print.cpp 
b/clang/test/OpenMP/target_teams_ast_print.cpp
index 5f1040be01a25f..8eaf4cbf249331 100644
--- a/clang/test/OpenMP/target_teams_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_ast_print.cpp
@@ -111,8 +111,8 @@ int main (int argc, char **argv) {
 // CHECK-NEXT: #pragma omp target teams
   a=2;
 // CHECK-NEXT: a = 2;
-#pragma omp target teams ompx_bare
-// CHECK-NEXT: #pragma omp target teams ompx_bare
+#pragma omp target teams ompx_bare num_teams(1) thread_limit(32)
+// CHECK-NEXT: #pragma omp target teams ompx_bare num_teams(1) thread_limit(32)
   a=3;
 // CHECK-NEXT: a = 3;
 #pragma omp target teams default(none), private(argc,b) num_teams(f) 
firstprivate(argv) reduction(| : c, d) reduction(* : e) thre

[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)

2023-12-12 Thread Shilei Tian via cfe-commits

shiltian wrote:

gentle ping

https://github.com/llvm/llvm-project/pull/68373
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lld] [flang] [clang-tools-extra] [lldb] [compiler-rt] [libcxx] [clang] [libc] [llvm] [openmp] Gcc 75 libomptarget type convert (PR #75562)

2023-12-15 Thread Shilei Tian via cfe-commits

shiltian wrote:

FYI: #75419

https://github.com/llvm/llvm-project/pull/75562
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP][Clang] Force use of `num_teams` and `thread_limit` for bare kernel (PR #68373)

2023-12-18 Thread Shilei Tian via cfe-commits

https://github.com/shiltian closed 
https://github.com/llvm/llvm-project/pull/68373
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [openmp] [Clang][OpenMP] Fix mapping of structs to device (PR #75642)

2023-12-18 Thread Shilei Tian via cfe-commits

shiltian wrote:

The newly added test `offloading/struct_mapping_with_pointers.cpp` fails on 
NVIDIA GPUs as well.
```
 TEST 'libomptarget :: nvptx64-nvidia-cuda :: 
offloading/struct_mapping_with_pointers.cpp' FAILED 
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/bin/clang++ -fopenmp 
-pthread   -I 
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test -I 
/gpfs/jlse
-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src -L 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget -L 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/ll
vm/release/./lib -L 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src  
-Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget
 -Wl,-rpa
th,/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src 
-Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/./lib 
-Wl,-rpath,/soft/compilers/cuda/cud
a-11.8.0/targets/x86_64-linux/lib 
--libomptarget-nvptx-bc-path=/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/DeviceRTL
 -fopenmp-targets=nvptx64-nvidia-cuda
 
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
 -o /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/releas
e/libomptarget/test/nvptx64-nvidia-cuda/offloading/Output/struct_mapping_with_pointers.cpp.tmp
 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/libomptarget.d
evicertl.a && env LIBOMPTARGET_DEBUG=1 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/test/nvptx64-nvidia-cuda/offloading/Output/struct_mapping_with_pointer
s.cpp.tmp 2>&1 | 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/bin/FileCheck 
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct
_mapping_with_pointers.cpp
# executed command: 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/bin/clang++ -fopenmp 
-pthread -I 
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/
test -I /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src -L 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget -L 
/gpfs/jlse-fs0/users/ac.sh
ilei.tian/build/llvm/release/./lib -L 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src 
-Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libo
mptarget 
-Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/runtime/src 
-Wl,-rpath,/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/./lib 
-Wl,-rpath,/soft/c
ompilers/cuda/cuda-11.8.0/targets/x86_64-linux/lib 
--libomptarget-nvptx-bc-path=/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/DeviceRTL
 -fopenmp-targets=nv
ptx64-nvidia-cuda 
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
 -o /gpfs/jlse-fs0/users/ac.shilei.tian/bu
ild/openmp/release/libomptarget/test/nvptx64-nvidia-cuda/offloading/Output/struct_mapping_with_pointers.cpp.tmp
 /gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarg
et/libomptarget.devicertl.a
# executed command: env LIBOMPTARGET_DEBUG=1 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/openmp/release/libomptarget/test/nvptx64-nvidia-cuda/offloading/Output/struct_mapping_with_p
ointers.cpp.tmp
# executed command: 
/gpfs/jlse-fs0/users/ac.shilei.tian/build/llvm/release/bin/FileCheck 
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/str
uct_mapping_with_pointers.cpp
# .---command stderr
# | 
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp:106:12:
 error: CHECK: expected string not found in inpu
t
# |  // CHECK: dat.datum[dat.arr[0][0]] = 0
# |^
# | :124:24: note: scanning from here
# | dat.val_more_datum = 18
# |^
# | :125:1: note: possible intended match here
# | dat.datum[dat.arr[0][0]] = 32542
# | ^
# |
# | Input file: 
# | Check file: 
/home/ac.shilei.tian/Documents/vscode/llvm-project/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
# |
# | -dump-input=help explains the following input dump.
# |
# | Input was:
# | <<
# |  .
# |  .
# |  .
# |119: omptarget --> Done unregistering library!
# |120: omptarget --> Deinit offload library!
# |121: TARGET CUDA RTL --> Missing 2 resources to be returned
# |122: dat.xi = 4
# |123: dat.val_datum = 8
# |124: dat.val_more_datum = 18
# | check:106'0X error: no match found
# |125: dat.datum[dat.arr[0][0]] = 32542
# | check:106'0 ~
# | check:106'1 ?   

[clang] [Clang][OpenMP] Emit unsupported directive error (PR #70233)

2023-11-24 Thread Shilei Tian via cfe-commits

https://github.com/shiltian approved this pull request.

LG with a nit

https://github.com/llvm/llvm-project/pull/70233
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Emit unsupported directive error (PR #70233)

2023-11-24 Thread Shilei Tian via cfe-commits


@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -emit-llvm -fopenmp -disable-llvm-passes %s -verify=expected
+
+// expected-error@+2 {{cannot compile this OpenMP dispatch directive yet}} 
+void a(){
+#pragma omp dispatch
+a();
+}

shiltian wrote:

Leave an empty line at the end of the file

https://github.com/llvm/llvm-project/pull/70233
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Emit unsupported directive error (PR #70233)

2023-11-24 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/70233
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Emit unsupported directive error (PR #70233)

2023-11-24 Thread Shilei Tian via cfe-commits

https://github.com/shiltian closed 
https://github.com/llvm/llvm-project/pull/70233
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] 114df24 - [Clang][Doc] Update the release note for clang

2022-07-26 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2022-07-26T15:39:21-04:00
New Revision: 114df244ec50ce0145702974335965c3aa2c3dcc

URL: 
https://github.com/llvm/llvm-project/commit/114df244ec50ce0145702974335965c3aa2c3dcc
DIFF: 
https://github.com/llvm/llvm-project/commit/114df244ec50ce0145702974335965c3aa2c3dcc.diff

LOG: [Clang][Doc] Update the release note for clang

Add the support for `atomic compare` and `atomic compare capture` in the
release note of clang.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D129211

Added: 


Modified: 
clang/docs/ReleaseNotes.rst

Removed: 




diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 74505dd30baa..84c74335ea5d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -592,6 +592,8 @@ ABI Changes in Clang
 
 OpenMP Support in Clang
 ---
+* Added the support for ``atomic compare`` and ``atomic compare capture``
+  (``-fopenmp-version=51`` is required).
 
 ...
 



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] e21202d - [Clang][OpenMP] Fix the issue that `llvm.lifetime.end` is emitted too early for variables captured in linear clause

2022-08-06 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2022-08-06T16:50:37-04:00
New Revision: e21202dac18ed7f718d26a0e131f96b399b4891c

URL: 
https://github.com/llvm/llvm-project/commit/e21202dac18ed7f718d26a0e131f96b399b4891c
DIFF: 
https://github.com/llvm/llvm-project/commit/e21202dac18ed7f718d26a0e131f96b399b4891c.diff

LOG: [Clang][OpenMP] Fix the issue that `llvm.lifetime.end` is emitted too 
early for variables captured in linear clause

Currently if an OpenMP program uses `linear` clause, and is compiled with
optimization, `llvm.lifetime.end` for variables listed in `linear` clause are
emitted too early such that there could still be uses after that. Let's take the
following code as example:
```
// loop.c
int j;
int *u;

void loop(int n) {
  int i;
  for (i = 0; i < n; ++i) {
++j;
u = &j;
  }
}
```
We compile using the command:
```
clang -cc1 -fopenmp-simd -O3 -x c -triple x86_64-apple-darwin10 -emit-llvm 
loop.c -o loop.ll
```
The following IR (simplified) will be generated:
```
@j = local_unnamed_addr global i32 0, align 4
@u = local_unnamed_addr global ptr null, align 8

define void @loop(i32 noundef %n) local_unnamed_addr {
entry:
  %j = alloca i32, align 4
  %cmp = icmp sgt i32 %n, 0
  br i1 %cmp, label %simd.if.then, label %simd.if.end

simd.if.then: ; preds = %entry
  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %j)
  store ptr %j, ptr @u, align 8
  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %j)
  %0 = load i32, ptr %j, align 4
  store i32 %0, ptr @j, align 4
  br label %simd.if.end

simd.if.end:  ; preds = %simd.if.then, 
%entry
  ret void
}
```
The most important part is:
```
  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %j)
  %0 = load i32, ptr %j, align 4
  store i32 %0, ptr @j, align 4
```
`%j` is still loaded after `@llvm.lifetime.end.p0(i64 4, ptr nonnull %j)`. This
could cause the backend incorrectly optimizes the code and further generates
incorrect code. The root cause is, when we emit a construct that could have
`linear` clause, it usually has the following pattern:
```
EmitOMPLinearClauseInit(S)
{
  OMPPrivateScope LoopScope(*this);
  ...
  EmitOMPLinearClause(S, LoopScope);
  ...
  (void)LoopScope.Privatize();
  ...
}
EmitOMPLinearClauseFinal(S, [](CodeGenFunction &) { return nullptr; });
```
Variables that need to be privatized are added into `LoopScope`, which also
serves as a RAII object. When `LoopScope` is destructed and if optimization is
enabled, a `@llvm.lifetime.end` is also emitted for each privatized variable.
However, the writing back to original variables in `linear` clause happens after
the scope in `EmitOMPLinearClauseFinal`, causing the issue we see above.

A quick "fix" seems to be, moving `EmitOMPLinearClauseFinal` inside the scope.
However, it doesn't work. That's because the local variable map has been updated
by `LoopScope` such that a variable declaration is mapped to the privatized
variable, instead of the actual one. In that way, the following code will be
generated:
```
  %0 = load i32, ptr %j, align 4
  store i32 %0, ptr %j, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %j)
```
Well, now the life time is correct, but apparently the writing back is broken.

In this patch, a new function `OMPPrivateScope::restoreMap` is added and called
before calling `EmitOMPLinearClauseFinal`. This can make sure that
`EmitOMPLinearClauseFinal` can find the orignal varaibls to write back.

Fixes #56913.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D131272

Added: 
clang/test/OpenMP/bug56913.c

Modified: 
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/lib/CodeGen/CodeGenFunction.h
clang/test/OpenMP/for_linear_codegen.cpp
clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp

Removed: 




diff  --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp 
b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 962620f43a393..5219b6e39f4e2 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -2582,8 +2582,9 @@ static void emitOMPSimdRegion(CodeGenFunction &CGF, const 
OMPLoopDirective &S,
 CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_simd);
 emitPostUpdateForReductionClause(CGF, S,
  [](CodeGenFunction &) { return nullptr; 
});
+LoopScope.restoreMap();
+CGF.EmitOMPLinearClauseFinal(S, [](CodeGenFunction &) { return nullptr; });
   }
-  CGF.EmitOMPLinearClauseFinal(S, [](CodeGenFunction &) { return nullptr; });
   // Emit: if (PreCond) - end.
   if (ContBlock) {
 CGF.EmitBranch(ContBlock);
@@ -3428,11 +3429,12 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
 EmitOMPLastprivateClauseFinal(
 S, isOpenMPSimdDirective(S.getDirectiveKind()),
 Builder.CreateIsNotNull(EmitLoadOfScalar(IL, S.getBeginLoc(;
+  LoopScope.restoreMap();
+ 

[clang] 3274cdc - [Clang][OpenMP] Remove the mandatory flush for capture for OpenMP 5.1

2021-07-26 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2021-07-26T11:00:44-04:00
New Revision: 3274cdc83ecdf2af569ad4f564d55d0e43b1072e

URL: 
https://github.com/llvm/llvm-project/commit/3274cdc83ecdf2af569ad4f564d55d0e43b1072e
DIFF: 
https://github.com/llvm/llvm-project/commit/3274cdc83ecdf2af569ad4f564d55d0e43b1072e.diff

LOG: [Clang][OpenMP] Remove the mandatory flush for capture for OpenMP 5.1

In OpenMP 5.1:
> If the `write` or `update` clause is specifieded, the atomic operation is not 
> an atomic conditional update for which the comparison fails, and the 
> effective memory ordering is `release`, `acq_rel`, or `seq_cst`, the strong 
> flush on entry to the atomic operation is also a release flush. If the `read` 
> or `update` clause is specified and the effective memory ordering is 
> `acquire`, `acq_rel`, or `seq_cst` then the strong flush on exit from the 
> atomic operation is also an acquire flush.

In OpenMP 5.0:
> If the `write`, `update`, or **`capture`** clause is specified and the 
> `release`, `acq_rel`, or `seq_cst` clause is specified then the strong flush 
> on entry to the atomic operation is also a release flush. If the `read` or 
> `capture` clause is specified and the `acquire`, `acq_rel`, or `seq_cst` 
> clause is specified then the strong flush on exit from the atomic operation 
> is also an acquire flush.

>From my understanding, in OpenMP 5.1, `capture` is removed from the 
>requirement for flush, therefore we don't have to enforce it.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D100768

Added: 


Modified: 
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/test/OpenMP/atomic_capture_codegen.cpp

Removed: 




diff  --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp 
b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 486b48bca0a62..1f913590339f8 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5725,32 +5725,35 @@ static void emitOMPAtomicCaptureExpr(CodeGenFunction 
&CGF,
   // Emit post-update store to 'v' of old/new 'x' value.
   CGF.emitOMPSimpleStore(VLValue, NewVVal, NewVValType, Loc);
   CGF.CGM.getOpenMPRuntime().checkAndEmitLastprivateConditional(CGF, V);
-  // OpenMP, 2.17.7, atomic Construct
-  // If the write, update, or capture clause is specified and the release,
-  // acq_rel, or seq_cst clause is specified then the strong flush on entry to
-  // the atomic operation is also a release flush.
-  // If the read or capture clause is specified and the acquire, acq_rel, or
-  // seq_cst clause is specified then the strong flush on exit from the atomic
-  // operation is also an acquire flush.
-  switch (AO) {
-  case llvm::AtomicOrdering::Release:
-CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc,
- llvm::AtomicOrdering::Release);
-break;
-  case llvm::AtomicOrdering::Acquire:
-CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc,
- llvm::AtomicOrdering::Acquire);
-break;
-  case llvm::AtomicOrdering::AcquireRelease:
-  case llvm::AtomicOrdering::SequentiallyConsistent:
-CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc,
- llvm::AtomicOrdering::AcquireRelease);
-break;
-  case llvm::AtomicOrdering::Monotonic:
-break;
-  case llvm::AtomicOrdering::NotAtomic:
-  case llvm::AtomicOrdering::Unordered:
-llvm_unreachable("Unexpected ordering.");
+  // OpenMP 5.1 removes the required flush for capture clause.
+  if (CGF.CGM.getLangOpts().OpenMP < 51) {
+// OpenMP, 2.17.7, atomic Construct
+// If the write, update, or capture clause is specified and the release,
+// acq_rel, or seq_cst clause is specified then the strong flush on entry 
to
+// the atomic operation is also a release flush.
+// If the read or capture clause is specified and the acquire, acq_rel, or
+// seq_cst clause is specified then the strong flush on exit from the 
atomic
+// operation is also an acquire flush.
+switch (AO) {
+case llvm::AtomicOrdering::Release:
+  CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc,
+   llvm::AtomicOrdering::Release);
+  break;
+case llvm::AtomicOrdering::Acquire:
+  CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc,
+   llvm::AtomicOrdering::Acquire);
+  break;
+case llvm::AtomicOrdering::AcquireRelease:
+case llvm::AtomicOrdering::SequentiallyConsistent:
+  CGF.CGM.getOpenMPRuntime().emitFlush(
+  CGF, llvm::None, Loc, llvm::AtomicOrdering::AcquireRelease);
+  break;
+case llvm::AtomicOrdering::Monotonic:
+  break;
+case llvm::AtomicOrdering::NotAtomic:
+case llvm::AtomicOrdering::Unordered:
+  llvm_unreachable("Unexpected ordering.");
+}
   }
 }
 

diff  --git a/clang/test/OpenMP/atomic_capture

[clang] 52e6a27 - Clean up `OMPAtomicDirective::Create`

2022-04-15 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2022-04-15T11:41:26-04:00
New Revision: 52e6a27690ca8e5f07cc646716c3736475b7746b

URL: 
https://github.com/llvm/llvm-project/commit/52e6a27690ca8e5f07cc646716c3736475b7746b
DIFF: 
https://github.com/llvm/llvm-project/commit/52e6a27690ca8e5f07cc646716c3736475b7746b.diff

LOG: Clean up `OMPAtomicDirective::Create`

Added: 


Modified: 
clang/include/clang/AST/StmtOpenMP.h
clang/lib/AST/StmtOpenMP.cpp
clang/lib/Sema/SemaOpenMP.cpp

Removed: 




diff  --git a/clang/include/clang/AST/StmtOpenMP.h 
b/clang/include/clang/AST/StmtOpenMP.h
index 28b3567b36556..0aa318d84a93f 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -2889,6 +2889,27 @@ class OMPAtomicDirective : public OMPExecutableDirective 
{
   void setCond(Expr *C) { Data->getChildren()[DataPositionTy::POS_Cond] = C; }
 
 public:
+  struct Expressions {
+/// 'x' part of the associated expression/statement.
+Expr *X = nullptr;
+/// 'v' part of the associated expression/statement.
+Expr *V = nullptr;
+/// 'expr' part of the associated expression/statement.
+Expr *E = nullptr;
+/// UE Helper expression of the form:
+/// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or
+/// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'.
+Expr *UE = nullptr;
+/// 'd' part of the associated expression/statement.
+Expr *D = nullptr;
+/// Conditional expression in `atomic compare` construct.
+Expr *Cond = nullptr;
+/// True if UE has the first form and false if the second.
+bool IsXLHSInRHSPart;
+/// True if original value of 'x' must be stored in 'v', not an updated 
one.
+bool IsPostfixUpdate;
+  };
+
   /// Creates directive with a list of \a Clauses and 'x', 'v' and 'expr'
   /// parts of the atomic construct (see Section 2.12.6, atomic Construct, for
   /// detailed description of 'x', 'v' and 'expr').
@@ -2898,23 +2919,12 @@ class OMPAtomicDirective : public 
OMPExecutableDirective {
   /// \param EndLoc Ending Location of the directive.
   /// \param Clauses List of clauses.
   /// \param AssociatedStmt Statement, associated with the directive.
-  /// \param X 'x' part of the associated expression/statement.
-  /// \param V 'v' part of the associated expression/statement.
-  /// \param E 'expr' part of the associated expression/statement.
-  /// \param UE Helper expression of the form
-  /// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or
-  /// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'.
-  /// \param D 'd' part of the associated expression/statement.
-  /// \param Cond Conditional expression in `atomic compare` construct.
-  /// \param IsXLHSInRHSPart true if \a UE has the first form and false if the
-  /// second.
-  /// \param IsPostfixUpdate true if original value of 'x' must be stored in
-  /// 'v', not an updated one.
-  static OMPAtomicDirective *
-  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
- ArrayRef Clauses, Stmt *AssociatedStmt, Expr *X, Expr *V,
- Expr *E, Expr *UE, Expr *D, Expr *Cond, bool IsXLHSInRHSPart,
- bool IsPostfixUpdate);
+  /// \param Exprs Associated expressions or statements.
+  static OMPAtomicDirective *Create(const ASTContext &C,
+SourceLocation StartLoc,
+SourceLocation EndLoc,
+ArrayRef Clauses,
+Stmt *AssociatedStmt, Expressions Exprs);
 
   /// Creates an empty directive with the place for \a NumClauses
   /// clauses.

diff  --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 84a4de00328a8..15e13da27dd84 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -866,19 +866,17 @@ OMPOrderedDirective 
*OMPOrderedDirective::CreateEmpty(const ASTContext &C,
 OMPAtomicDirective *
 OMPAtomicDirective::Create(const ASTContext &C, SourceLocation StartLoc,
SourceLocation EndLoc, ArrayRef 
Clauses,
-   Stmt *AssociatedStmt, Expr *X, Expr *V, Expr *E,
-   Expr *UE, Expr *D, Expr *Cond, bool IsXLHSInRHSPart,
-   bool IsPostfixUpdate) {
+   Stmt *AssociatedStmt, Expressions Exprs) {
   auto *Dir = createDirective(
   C, Clauses, AssociatedStmt, /*NumChildren=*/6, StartLoc, EndLoc);
-  Dir->setX(X);
-  Dir->setV(V);
-  Dir->setExpr(E);
-  Dir->setUpdateExpr(UE);
-  Dir->setD(D);
-  Dir->setCond(Cond);
-  Dir->IsXLHSInRHSPart = IsXLHSInRHSPart;
-  Dir->IsPostfixUpdate = IsPostfixUpdate;
+  Dir->setX(Exprs.X);
+  Dir->setV(Exprs.V);
+  Dir->setExpr(Exprs.E);
+  Dir->setUpdateExpr(Exprs.UE);
+  Dir->setD(Exprs.D);
+  Dir->setCond(Exprs.Cond);
+  Dir->IsXLHSInRHSPart = Exprs.IsXLHSInRHSPart;
+  Dir->IsPostfixUpdate = Exprs.IsPostf

[clang] e8760b5 - [Clang][OpenMP] Use bitfields for flags in `OMPAtomicDirective`

2022-04-15 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2022-04-15T21:34:28-04:00
New Revision: e8760b51ee0f972587cb0af922a3f828ab6926d6

URL: 
https://github.com/llvm/llvm-project/commit/e8760b51ee0f972587cb0af922a3f828ab6926d6
DIFF: 
https://github.com/llvm/llvm-project/commit/e8760b51ee0f972587cb0af922a3f828ab6926d6.diff

LOG: [Clang][OpenMP] Use bitfields for flags in `OMPAtomicDirective`

As suggested in D120290.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D123862

Added: 


Modified: 
clang/include/clang/AST/StmtOpenMP.h
clang/lib/AST/StmtOpenMP.cpp
clang/lib/Serialization/ASTReaderStmt.cpp

Removed: 




diff  --git a/clang/include/clang/AST/StmtOpenMP.h 
b/clang/include/clang/AST/StmtOpenMP.h
index 0aa318d84a93f..dfaf8b5a77385 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -2827,25 +2827,28 @@ class OMPOrderedDirective : public 
OMPExecutableDirective {
 class OMPAtomicDirective : public OMPExecutableDirective {
   friend class ASTStmtReader;
   friend class OMPExecutableDirective;
-  /// Used for 'atomic update' or 'atomic capture' constructs. They may
-  /// have atomic expressions of forms
-  /// \code
-  /// x = x binop expr;
-  /// x = expr binop x;
-  /// \endcode
-  /// This field is true for the first form of the expression and false for the
-  /// second. Required for correct codegen of non-associative operations (like
-  /// << or >>).
-  bool IsXLHSInRHSPart = false;
-  /// Used for 'atomic update' or 'atomic capture' constructs. They may
-  /// have atomic expressions of forms
-  /// \code
-  /// v = x; ;
-  /// ; v = x;
-  /// \endcode
-  /// This field is true for the first(postfix) form of the expression and 
false
-  /// otherwise.
-  bool IsPostfixUpdate = false;
+
+  struct FlagTy {
+/// Used for 'atomic update' or 'atomic capture' constructs. They may
+/// have atomic expressions of forms:
+/// \code
+/// x = x binop expr;
+/// x = expr binop x;
+/// \endcode
+/// This field is 1 for the first form of the expression and 0 for the
+/// second. Required for correct codegen of non-associative operations 
(like
+/// << or >>).
+uint8_t IsXLHSInRHSPart : 1;
+/// Used for 'atomic update' or 'atomic capture' constructs. They may
+/// have atomic expressions of forms:
+/// \code
+/// v = x; ;
+/// ; v = x;
+/// \endcode
+/// This field is 1 for the first(postfix) form of the expression and 0
+/// otherwise.
+uint8_t IsPostfixUpdate : 1;
+  } Flags;
 
   /// Build directive with the given start and end location.
   ///
@@ -2956,10 +2959,10 @@ class OMPAtomicDirective : public 
OMPExecutableDirective {
   /// Return true if helper update expression has form
   /// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' and false if it has form
   /// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'.
-  bool isXLHSInRHSPart() const { return IsXLHSInRHSPart; }
+  bool isXLHSInRHSPart() const { return Flags.IsXLHSInRHSPart; }
   /// Return true if 'v' expression must be updated to original value of
   /// 'x', false if 'v' must be updated to the new value of 'x'.
-  bool isPostfixUpdate() const { return IsPostfixUpdate; }
+  bool isPostfixUpdate() const { return Flags.IsPostfixUpdate; }
   /// Get 'v' part of the associated expression/statement.
   Expr *getV() {
 return cast_or_null(Data->getChildren()[DataPositionTy::POS_V]);

diff  --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 15e13da27dd84..3535b0620ee50 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -875,8 +875,8 @@ OMPAtomicDirective::Create(const ASTContext &C, 
SourceLocation StartLoc,
   Dir->setUpdateExpr(Exprs.UE);
   Dir->setD(Exprs.D);
   Dir->setCond(Exprs.Cond);
-  Dir->IsXLHSInRHSPart = Exprs.IsXLHSInRHSPart;
-  Dir->IsPostfixUpdate = Exprs.IsPostfixUpdate;
+  Dir->Flags.IsXLHSInRHSPart = Exprs.IsXLHSInRHSPart ? 1 : 0;
+  Dir->Flags.IsPostfixUpdate = Exprs.IsPostfixUpdate ? 1 : 0;
   return Dir;
 }
 

diff  --git a/clang/lib/Serialization/ASTReaderStmt.cpp 
b/clang/lib/Serialization/ASTReaderStmt.cpp
index ed9f1d2b34289..281385ad9e7d9 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2449,8 +2449,8 @@ void 
ASTStmtReader::VisitOMPOrderedDirective(OMPOrderedDirective *D) {
 void ASTStmtReader::VisitOMPAtomicDirective(OMPAtomicDirective *D) {
   VisitStmt(D);
   VisitOMPExecutableDirective(D);
-  D->IsXLHSInRHSPart = Record.readBool();
-  D->IsPostfixUpdate = Record.readBool();
+  D->Flags.IsXLHSInRHSPart = Record.readBool() ? 1 : 0;
+  D->Flags.IsPostfixUpdate = Record.readBool() ? 1 : 0;
 }
 
 void ASTStmtReader::VisitOMPTargetDirective(OMPTargetDirective *D) {



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cg

[clang] b35be6f - [Clang][Sema][OpenMP] Sema support for `atomic compare`

2022-02-04 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2022-02-04T12:30:56-05:00
New Revision: b35be6fe98e30b2373e8fdf024ef8c13a32121d7

URL: 
https://github.com/llvm/llvm-project/commit/b35be6fe98e30b2373e8fdf024ef8c13a32121d7
DIFF: 
https://github.com/llvm/llvm-project/commit/b35be6fe98e30b2373e8fdf024ef8c13a32121d7.diff

LOG: [Clang][Sema][OpenMP] Sema support for `atomic compare`

This patch adds the Sema support for `atomic compare`.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D116637

Added: 


Modified: 
clang/include/clang/Basic/DiagnosticSemaKinds.td
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/lib/Sema/SemaOpenMP.cpp
clang/test/OpenMP/atomic_ast_print.cpp
clang/test/OpenMP/atomic_messages.c
clang/test/OpenMP/atomic_messages.cpp

Removed: 




diff  --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index d5e653a7fa192..7f73b9b285e37 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10516,6 +10516,15 @@ def err_omp_atomic_capture_not_compound_statement : 
Error<
   " where x is an lvalue expression with scalar type">;
 def note_omp_atomic_capture: Note<
   "%select{expected assignment expression|expected compound statement|expected 
exactly two expression statements|expected in right hand side of the first 
expression}0">;
+def err_omp_atomic_compare : Error<
+  "the statement for 'atomic compare' must be a compound statement of form '{x 
= expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}',"
+  " '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x 
= expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}',"
+  " 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 
'expr', 'e', and 'd' are expressions with scalar type,"
+  " and 'ordop' is one of '<' or '>'.">;
+def note_omp_atomic_compare: Note<
+  "%select{expected compound statement|expected exactly one expression 
statement|expected assignment statement|expected conditional operator|expect 
result value to be at false expression|"
+  "expect binary operator in conditional expression|expect '<', '>' or '==' as 
order operator|expect comparison in a form of 'x == e', 'e == x', 'x ordop 
expr', or 'expr ordop x'|"
+  "expect lvalue for result value|expect scalar value|expect integer value}0">;
 def err_omp_atomic_several_clauses : Error<
   "directive '#pragma omp atomic' cannot contain more than one 'read', 
'write', 'update', 'capture', or 'compare' clause">;
 def err_omp_several_mem_order_clauses : Error<

diff  --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp 
b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 39dd4c00765d2..596cef8ce60c1 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -6031,9 +6031,13 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, 
OpenMPClauseKind Kind,
 emitOMPAtomicCaptureExpr(CGF, AO, IsPostfixUpdate, V, X, E, UE,
  IsXLHSInRHSPart, Loc);
 break;
-  case OMPC_compare:
-// Do nothing here as we already emit an error.
+  case OMPC_compare: {
+// Emit an error here.
+unsigned DiagID = CGF.CGM.getDiags().getCustomDiagID(
+DiagnosticsEngine::Error, "'atomic compare' is not supported for now");
+CGF.CGM.getDiags().Report(DiagID);
 break;
+  }
   case OMPC_if:
   case OMPC_final:
   case OMPC_num_threads:

diff  --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index a4092f0d2b543..1fafd58ba3edd 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -10925,6 +10925,357 @@ bool OpenMPAtomicUpdateChecker::checkStatement(Stmt 
*S, unsigned DiagId,
   }
   return ErrorFound != NoError;
 }
+
+/// Get the node id of the fixed point of an expression \a S.
+llvm::FoldingSetNodeID getNodeId(ASTContext &Context, const Expr *S) {
+  llvm::FoldingSetNodeID Id;
+  S->IgnoreParenImpCasts()->Profile(Id, Context, true);
+  return Id;
+}
+
+/// Check if two expressions are same.
+bool checkIfTwoExprsAreSame(ASTContext &Context, const Expr *LHS,
+const Expr *RHS) {
+  return getNodeId(Context, LHS) == getNodeId(Context, RHS);
+}
+
+class OpenMPAtomicCompareChecker {
+public:
+  /// All kinds of errors that can occur in `atomic compare`
+  enum ErrorTy {
+/// Empty compound statement.
+NoStmt = 0,
+/// More than one statement in a compound statement.
+MoreThanOneStmt,
+/// Not an assignment binary operator.
+NotAnAssignment,
+/// Not a conditional operator.
+NotCondOp,
+/// Wrong false expr. According to the spec, 'x' should be at the false
+/// expression of a conditional expression.
+WrongFalseExpr,
+/// The condition of a conditional expression is not a binary operator.
+NotABinaryOp,
+/// Invalid binary operator (not <, >, 

[clang] b8ec430 - [Clang][Sema][OpenMP] Fix uninitialized variable Op

2022-02-04 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2022-02-04T15:00:43-05:00
New Revision: b8ec430de71766d9a35a6b737c8a789c0c7cf812

URL: 
https://github.com/llvm/llvm-project/commit/b8ec430de71766d9a35a6b737c8a789c0c7cf812
DIFF: 
https://github.com/llvm/llvm-project/commit/b8ec430de71766d9a35a6b737c8a789c0c7cf812.diff

LOG: [Clang][Sema][OpenMP] Fix uninitialized variable Op

This can fix the case atomic_messages

Added: 


Modified: 
clang/lib/Sema/SemaOpenMP.cpp

Removed: 




diff  --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 4143e070a8738..7486e1389172c 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -11063,8 +11063,18 @@ bool 
OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S,
 ErrorInfo.ErrorRange = ErrorInfo.NoteRange = 
S->getCond()->getSourceRange();
 return false;
   }
-  if (Cond->getOpcode() != BO_EQ && Cond->getOpcode() != BO_LT &&
-  Cond->getOpcode() != BO_GT) {
+
+  switch (Cond->getOpcode()) {
+  case BO_EQ:
+Op = OMPAtomicCompareOp::EQ;
+break;
+  case BO_LT:
+Op = OMPAtomicCompareOp::MIN;
+break;
+  case BO_GT:
+Op = OMPAtomicCompareOp::MAX;
+break;
+  default:
 ErrorInfo.Error = ErrorTy::InvalidBinaryOp;
 ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc();
 ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange();
@@ -11148,8 +11158,17 @@ bool 
OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S,
 return false;
   }
 
-  if (Cond->getOpcode() != BO_EQ && Cond->getOpcode() != BO_LT &&
-  Cond->getOpcode() != BO_GT) {
+  switch (Cond->getOpcode()) {
+  case BO_EQ:
+Op = OMPAtomicCompareOp::EQ;
+break;
+  case BO_LT:
+Op = OMPAtomicCompareOp::MIN;
+break;
+  case BO_GT:
+Op = OMPAtomicCompareOp::MAX;
+break;
+  default:
 ErrorInfo.Error = ErrorTy::InvalidBinaryOp;
 ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc();
 ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange();



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] 20a9fb9 - [Clang][OpenMP] Fix the issue that temp cubin files are not removed after compilation when using new OpenMP driver

2022-04-22 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2022-04-22T18:07:28-04:00
New Revision: 20a9fb953e46b1d97aaee7b182b0f3d48f340bd1

URL: 
https://github.com/llvm/llvm-project/commit/20a9fb953e46b1d97aaee7b182b0f3d48f340bd1
DIFF: 
https://github.com/llvm/llvm-project/commit/20a9fb953e46b1d97aaee7b182b0f3d48f340bd1.diff

LOG: [Clang][OpenMP] Fix the issue that temp cubin files are not removed after 
compilation when using new OpenMP driver

The root cause of this is, in `NVPTX::Assembler::ConstructJob`, the output file 
name might not match the `Output`'s file name passed into the function because 
`CudaToolChain::getInputFilename` is a specialized version. That means the real 
output file is not added to the temp files list, which will be all removed in 
the d'tor of `Compilation`. In order to "fix" it, in the function 
`NVPTX::OpenMPLinker::ConstructJob`, before calling `clang-nvlink-wrapper`, the 
function calls `getToolChain().getInputFilename(II)` to get the right output 
file name for each input, and add it to temp file, and then they can be removed 
w/o any issue. However, this whole logic doesn't work when using the new OpenMP 
driver because `NVPTX::OpenMPLinker::ConstructJob` is not called at all, which 
causing the issue that the cubin file generated in each single unit compilation 
is out of track.

In this patch, we add the real output file into temp files if its name doesn't 
match `Output`. We add it when the file is an output instead of doing it when 
it is an input, like what we did in `NVPTX::OpenMPLinker::ConstructJob`, which 
makes more sense.

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D124253

Added: 


Modified: 
clang/lib/Driver/ToolChains/Cuda.cpp

Removed: 




diff  --git a/clang/lib/Driver/ToolChains/Cuda.cpp 
b/clang/lib/Driver/ToolChains/Cuda.cpp
index f8a06a2e09ab7..6103c42bf7547 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -447,7 +447,10 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const 
JobAction &JA,
   CmdArgs.push_back("--gpu-name");
   CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
-  CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output)));
+  const char *OutputFileName = Args.MakeArgString(TC.getInputFilename(Output));
+  if (std::string(OutputFileName) != std::string(Output.getFilename()))
+C.addTempFile(OutputFileName);
+  CmdArgs.push_back(OutputFileName);
   for (const auto& II : Inputs)
 CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
 
@@ -606,8 +609,8 @@ void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, 
const JobAction &JA,
 if (!II.isFilename())
   continue;
 
-const char *CubinF = C.addTempFile(
-C.getArgs().MakeArgString(getToolChain().getInputFilename(II)));
+const char *CubinF =
+C.getArgs().MakeArgString(getToolChain().getInputFilename(II));
 
 CmdArgs.push_back(CubinF);
   }



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] 9c1085c - [Clang][OpenMP] Add the support for floating-point variables for specific atomic clauses

2022-05-03 Thread Shilei Tian via cfe-commits

Author: Shilei Tian
Date: 2022-05-03T11:30:54-04:00
New Revision: 9c1085c7e20bdd7c4a487f50313ebeeb2b6683b8

URL: 
https://github.com/llvm/llvm-project/commit/9c1085c7e20bdd7c4a487f50313ebeeb2b6683b8
DIFF: 
https://github.com/llvm/llvm-project/commit/9c1085c7e20bdd7c4a487f50313ebeeb2b6683b8.diff

LOG: [Clang][OpenMP] Add the support for floating-point variables for specific 
atomic clauses

Currently when using `atomic update` with floating-point variables, if
the operation is add or sub, `cmpxchg`, instead of `atomicrmw` is emitted, as
shown in [1].  In fact, about three years ago, llvm-svn: 351850 added the
support for FP operations. This patch adds the support in OpenMP as well.

[1] https://godbolt.org/z/M7b4ba9na

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D124724

Added: 


Modified: 
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/test/OpenMP/atomic_capture_codegen.cpp
clang/test/OpenMP/atomic_update_codegen.cpp
clang/test/OpenMP/for_reduction_codegen.cpp
clang/test/OpenMP/parallel_reduction_codegen.cpp
clang/test/OpenMP/reduction_implicit_map.cpp
clang/test/OpenMP/sections_reduction_codegen.cpp

Removed: 




diff  --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp 
b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index fc8156a78d6a7..5e75e8884bb49 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5838,25 +5838,38 @@ static std::pair 
emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X,
   // Allow atomicrmw only if 'x' and 'update' are integer values, lvalue for 
'x'
   // expression is simple and atomic is allowed for the given type for the
   // target platform.
-  if (BO == BO_Comma || !Update.isScalar() ||
-  !Update.getScalarVal()->getType()->isIntegerTy() || !X.isSimple() ||
+  if (BO == BO_Comma || !Update.isScalar() || !X.isSimple() ||
   (!isa(Update.getScalarVal()) &&
(Update.getScalarVal()->getType() !=
 X.getAddress(CGF).getElementType())) ||
-  !X.getAddress(CGF).getElementType()->isIntegerTy() ||
   !Context.getTargetInfo().hasBuiltinAtomic(
   Context.getTypeSize(X.getType()), Context.toBits(X.getAlignment(
 return std::make_pair(false, RValue::get(nullptr));
 
+  auto &&CheckAtomicSupport = [&CGF](llvm::Type *T, BinaryOperatorKind BO) {
+if (T->isIntegerTy())
+  return true;
+
+if (T->isFloatingPointTy() && (BO == BO_Add || BO == BO_Sub))
+  return llvm::isPowerOf2_64(CGF.CGM.getDataLayout().getTypeStoreSize(T));
+
+return false;
+  };
+
+  if (!CheckAtomicSupport(Update.getScalarVal()->getType(), BO) ||
+  !CheckAtomicSupport(X.getAddress(CGF).getElementType(), BO))
+return std::make_pair(false, RValue::get(nullptr));
+
+  bool IsInteger = X.getAddress(CGF).getElementType()->isIntegerTy();
   llvm::AtomicRMWInst::BinOp RMWOp;
   switch (BO) {
   case BO_Add:
-RMWOp = llvm::AtomicRMWInst::Add;
+RMWOp = IsInteger ? llvm::AtomicRMWInst::Add : llvm::AtomicRMWInst::FAdd;
 break;
   case BO_Sub:
 if (!IsXLHSInRHSPart)
   return std::make_pair(false, RValue::get(nullptr));
-RMWOp = llvm::AtomicRMWInst::Sub;
+RMWOp = IsInteger ? llvm::AtomicRMWInst::Sub : llvm::AtomicRMWInst::FSub;
 break;
   case BO_And:
 RMWOp = llvm::AtomicRMWInst::And;
@@ -5914,9 +5927,13 @@ static std::pair 
emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X,
   }
   llvm::Value *UpdateVal = Update.getScalarVal();
   if (auto *IC = dyn_cast(UpdateVal)) {
-UpdateVal = CGF.Builder.CreateIntCast(
-IC, X.getAddress(CGF).getElementType(),
-X.getType()->hasSignedIntegerRepresentation());
+if (IsInteger)
+  UpdateVal = CGF.Builder.CreateIntCast(
+  IC, X.getAddress(CGF).getElementType(),
+  X.getType()->hasSignedIntegerRepresentation());
+else
+  UpdateVal = CGF.Builder.CreateCast(llvm::Instruction::CastOps::UIToFP, 
IC,
+ X.getAddress(CGF).getElementType());
   }
   llvm::Value *Res =
   CGF.Builder.CreateAtomicRMW(RMWOp, X.getPointer(CGF), UpdateVal, AO);

diff  --git a/clang/test/OpenMP/atomic_capture_codegen.cpp 
b/clang/test/OpenMP/atomic_capture_codegen.cpp
index c5f45a39232c0..95509df9ba935 100644
--- a/clang/test/OpenMP/atomic_capture_codegen.cpp
+++ b/clang/test/OpenMP/atomic_capture_codegen.cpp
@@ -216,20 +216,8 @@ int main(void) {
 #pragma omp atomic capture
   llv = ullx |= ullv;
 // CHECK: [[EXPR:%.+]] = load float, float* @{{.+}},
-// CHECK: [[X:%.+]] = load atomic i32, i32*  bitcast (float* [[X_ADDR:@.+]] to 
i32*) monotonic, align 4
-// CHECK: br label %[[CONT:.+]]
-// CHECK: [[CONT]]
-// CHECK: [[EXPECTED:%.+]] = phi i32 [ [[X]], %{{.+}} ], [ [[OLD_X:%.+]], 
%[[CONT]] ]
-// CHECK: [[TEMP_I:%.+]] = bitcast float* [[TEMP:%.+]] to i32*
-// CHECK: [[OLD:%.+]] = bitcast i32 [[EXPECTED]] to float
+// CHECK: [[OLD:%.+]] = atomicrmw fadd float* @{{.+}}, float 

[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-21 Thread Shilei Tian via cfe-commits


@@ -432,13 +432,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", 
"n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
 
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", 
"gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", 
"gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", 
"gfx12-insts,wavefrontsize32")
-
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", 
"gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", 
"gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", 
"gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", 
"gfx12-insts,wavefrontsize64")

shiltian wrote:

Do we still want to keep the old builtins to maintain compatibility, though I 
doubt there is any legacy code using them?

https://github.com/llvm/llvm-project/pull/86202
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-21 Thread Shilei Tian via cfe-commits

shiltian wrote:

> > > > Do you want to rename intrinsics as well? Because now intrinsic names 
> > > > do not match builtin names.
> > > 
> > > 
> > > Do we have to match builtins with intrinsics? Renaming intrinsics here 
> > > means we will have to duplicate the intrinsics.
> > 
> > 
> > Is that because of the mangling?
> > Right.  It was originally suggested to use  a single instrinsic "load_lr".  
> > But eventually we use global_load_tr to indicate this is in global address 
> > space.  If we want to rename intrinsics here, it should be 
> > global_load_tr_b64 and global_load_tr_b128.
> 
> We should rename intrinsic if users can use intrinsics directly. I think 
> use-friendly is more important.

I don't think intrinsics are meant for users. Builtins are the user-facing 
front. :-)

https://github.com/llvm/llvm-project/pull/86202
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Allow dynamic `condition` selector in Metadirective (PR #86457)

2024-03-24 Thread Shilei Tian via cfe-commits

shiltian wrote:

I'm not familiar with that section of code. Maybe @jdoerfert could give you 
more insights.

https://github.com/llvm/llvm-project/pull/86457
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Fix `!isNull() && "Cannot retrieve a NULL type pointer"' fail. (PR #81015)

2024-02-07 Thread Shilei Tian via cfe-commits


@@ -21124,6 +21124,8 @@ Sema::ActOnOpenMPDependClause(const 
OMPDependClause::DependDataTy &Data,
   ExprTy = ATy->getElementType();
 else
   ExprTy = BaseType->getPointeeType();
+if (ExprTy.isNull())
+  continue;

shiltian wrote:

I'm not sure to `continue` here is a good idea. Probably you want `return 
nullptr;`.

https://github.com/llvm/llvm-project/pull/81015
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Fix `!isNull() && "Cannot retrieve a NULL type pointer"' fail. (PR #81015)

2024-02-07 Thread Shilei Tian via cfe-commits

https://github.com/shiltian approved this pull request.


https://github.com/llvm/llvm-project/pull/81015
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-07 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From a535bf3e8cd9b10d87281f94fed68fc300f3e24c Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Wed, 7 Feb 2024 14:43:24 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  4 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 12 ++--
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  5 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 66 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +++
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 60 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 22 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  7 ++
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll  | 14 ++--
 llvm/test/MC/AMDGPU/bf16_imm.s|  8 +++
 16 files changed, 204 insertions(+), 65 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 213311b96df74f..4fe236e8aca12d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -246,8 +246,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, 
"V2hV2h*3V2h", "t", "atomi
 
 TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot9-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", 
"dot9-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", 
"dot9-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "yV2yV2yy", "nc", 
"dot9-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2yV2yfIb", "nc", 
"dot9-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts")
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..0f29653f1f5bec 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
-  llvm_i16_ty// %c
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
+  llvm_bfloat_ty// %c
 ],
 [IntrNoMem, IntrSpeculatable]
   >;
@@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 :
   DefaultAttrsIntrinsic<
 [llvm_float_ty], // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
   llvm_float_ty, // %c
   llvm_i1_ty // %clamp
 ],
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp 
b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index dd38317c26bff6..a1c638d931b7f8 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U,
 
 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-  U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
+  if (Opcode != TargetOpcode::G_BITCAST &&
+  (U.getType()->getScalarType()->isBFloatTy() ||
+   U.getOperand(0)->getType()->getScalarType()->isBFloatTy()))
 return false;
   Register Op = getOrCreateVReg(*U.getOperand(0));
   Register Res = getOrCreateVReg(U);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 225e781588668f..787217171721d8 100644
--- a/llvm/lib/Target/A

[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-07 Thread Shilei Tian via cfe-commits


@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U,
 
 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-  U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
+  if (Opcode != TargetOpcode::G_BITCAST &&

shiltian wrote:

I think bitcast can still be supported even if we don't know how to convert 
from bfloat to other types mentioned in #71470. If so, we might want a separate 
patch for this.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Add a NULL check (PR #77131)

2024-02-07 Thread Shilei Tian via cfe-commits

https://github.com/shiltian closed 
https://github.com/llvm/llvm-project/pull/77131
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][OpenMP] Add a NULL check (PR #77131)

2024-02-07 Thread Shilei Tian via cfe-commits

shiltian wrote:

This has been fixed by #81015.

https://github.com/llvm/llvm-project/pull/77131
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)

2024-02-07 Thread Shilei Tian via cfe-commits


@@ -0,0 +1,698 @@
+//===-- ExpandVariadicsPass.cpp *- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This is an optimisation pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The target-dependent parts are in namespace VariadicABIInfo. Enabling a new
+// target means adding a case to VariadicABIInfo::create() along with tests.
+//
+// The module pass using that information is class ExpandVariadics.
+//
+// The strategy is:
+// 1. Test whether a variadic function is sufficiently simple
+// 2. If it was, calls to it can be replaced with calls to a different function
+// 3. If it wasn't, try to split it into a simple function and a remainder
+// 4. Optionally rewrite the varadic function calling convention as well
+//
+// This pass considers "sufficiently simple" to mean a variadic function that
+// calls into a different function taking a va_list to do the real work. For
+// example, libc might implement fprintf as a single basic block calling into
+// vfprintf. This pass can then rewrite call to the variadic into some code
+// to construct a target-specific value to use for the va_list and a call
+// into the non-variadic implementation function. There's a test for that.
+//
+// Most other variadic functions whose definition is known can be converted 
into
+// that form. Create a new internal function taking a va_list where the 
original
+// took a ... parameter. Move the blocks across. Create a new block containing 
a
+// va_start that calls into the new function. This is nearly target 
independent.
+//
+// Where this transform is consistent with the ABI, e.g. AMDGPU or NVPTX, or
+// where the ABI can be chosen to align with this transform, the function
+// interface can be rewritten along with calls to unknown variadic functions.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// This pass does define some target specific information which is partially
+// redundant with other parts of the compiler. In particular, the call frame
+// it builds must be the exact complement of the va_arg lowering performed
+// by clang. The va_list construction is similar to work done by the backend
+// for targets that lower variadics there, though distinct in that this pass
+// constructs the pieces using alloca instead of relative to stack pointers.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular 
target.
+//
+//===--===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/TargetParser/Triple.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+namespace VariadicABIInfo {
+
+// calling convention for passing as valist object, same as it would be in C
+// aarch64 uses byval
+enum class valistCC { value, pointer, /*byval*/ };
+
+struct Interface {
+protected:
+  Interface(uint32_t MinAlign, uint32_t MaxAlign)
+  : MinAlign(MinAlign), MaxAlign(MaxAlign) {}
+
+public:
+  virtual ~Interface() {}
+  const uint32_t MinAlign;
+  const uint32_t MaxAlign;
+
+  // Most ABIs use a void* or char* for va_list, others can specialise
+  virtual Type *vaListType(LLVMContext &Ctx) {
+return PointerType::getUnqual(Ctx);
+  }
+
+  // How the vaListType is passed
+  virtual valistCC vaListCC() { return valistCC::value; }
+
+  // The valist might need to be stack allocated.
+  virtual bool valistOnStack() { return false; }
+
+  virtual void initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+AllocaInst * /*va_list*/, Value * /*buffer*/) {
+// Function needs to be implemented if valist is on the stack
+assert(!valistOnStack());
+__builtin_unreachable();
+  }
+
+  // All targets currently implemented use a ptr for the valist parameter
+  Type *vaListParameterType(LLVMContext &Ctx) {
+return PointerType::getUnqual(Ctx);
+  }
+
+  bool VAEndIsNop() { return 

[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)

2024-02-07 Thread Shilei Tian via cfe-commits


@@ -0,0 +1,698 @@
+//===-- ExpandVariadicsPass.cpp *- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This is an optimisation pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The target-dependent parts are in namespace VariadicABIInfo. Enabling a new
+// target means adding a case to VariadicABIInfo::create() along with tests.
+//
+// The module pass using that information is class ExpandVariadics.
+//
+// The strategy is:
+// 1. Test whether a variadic function is sufficiently simple
+// 2. If it was, calls to it can be replaced with calls to a different function
+// 3. If it wasn't, try to split it into a simple function and a remainder
+// 4. Optionally rewrite the varadic function calling convention as well
+//
+// This pass considers "sufficiently simple" to mean a variadic function that
+// calls into a different function taking a va_list to do the real work. For
+// example, libc might implement fprintf as a single basic block calling into
+// vfprintf. This pass can then rewrite call to the variadic into some code
+// to construct a target-specific value to use for the va_list and a call
+// into the non-variadic implementation function. There's a test for that.
+//
+// Most other variadic functions whose definition is known can be converted 
into
+// that form. Create a new internal function taking a va_list where the 
original
+// took a ... parameter. Move the blocks across. Create a new block containing 
a
+// va_start that calls into the new function. This is nearly target 
independent.
+//
+// Where this transform is consistent with the ABI, e.g. AMDGPU or NVPTX, or
+// where the ABI can be chosen to align with this transform, the function
+// interface can be rewritten along with calls to unknown variadic functions.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// This pass does define some target specific information which is partially
+// redundant with other parts of the compiler. In particular, the call frame
+// it builds must be the exact complement of the va_arg lowering performed
+// by clang. The va_list construction is similar to work done by the backend
+// for targets that lower variadics there, though distinct in that this pass
+// constructs the pieces using alloca instead of relative to stack pointers.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular 
target.
+//
+//===--===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/TargetParser/Triple.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+namespace VariadicABIInfo {
+
+// calling convention for passing as valist object, same as it would be in C
+// aarch64 uses byval
+enum class valistCC { value, pointer, /*byval*/ };
+
+struct Interface {
+protected:
+  Interface(uint32_t MinAlign, uint32_t MaxAlign)
+  : MinAlign(MinAlign), MaxAlign(MaxAlign) {}
+
+public:
+  virtual ~Interface() {}
+  const uint32_t MinAlign;
+  const uint32_t MaxAlign;
+
+  // Most ABIs use a void* or char* for va_list, others can specialise
+  virtual Type *vaListType(LLVMContext &Ctx) {
+return PointerType::getUnqual(Ctx);
+  }
+
+  // How the vaListType is passed
+  virtual valistCC vaListCC() { return valistCC::value; }
+
+  // The valist might need to be stack allocated.
+  virtual bool valistOnStack() { return false; }
+
+  virtual void initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+AllocaInst * /*va_list*/, Value * /*buffer*/) {
+// Function needs to be implemented if valist is on the stack
+assert(!valistOnStack());
+__builtin_unreachable();

shiltian wrote:

Yeah, `llvm_unreachable` is usually used inside LLVM, so definitely better to 
use `llvm_unreachable` here.

https://github.com/llvm/llvm-project/pull/81058
__

[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)

2024-02-07 Thread Shilei Tian via cfe-commits


@@ -0,0 +1,17 @@
+#ifndef LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H

shiltian wrote:

LLVM copyright header as well as (brief) documentation of the pass

https://github.com/llvm/llvm-project/pull/81058
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)

2024-02-07 Thread Shilei Tian via cfe-commits


@@ -0,0 +1,698 @@
+//===-- ExpandVariadicsPass.cpp *- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This is an optimisation pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The target-dependent parts are in namespace VariadicABIInfo. Enabling a new
+// target means adding a case to VariadicABIInfo::create() along with tests.
+//
+// The module pass using that information is class ExpandVariadics.
+//
+// The strategy is:
+// 1. Test whether a variadic function is sufficiently simple
+// 2. If it was, calls to it can be replaced with calls to a different function
+// 3. If it wasn't, try to split it into a simple function and a remainder
+// 4. Optionally rewrite the varadic function calling convention as well
+//
+// This pass considers "sufficiently simple" to mean a variadic function that
+// calls into a different function taking a va_list to do the real work. For
+// example, libc might implement fprintf as a single basic block calling into
+// vfprintf. This pass can then rewrite call to the variadic into some code
+// to construct a target-specific value to use for the va_list and a call
+// into the non-variadic implementation function. There's a test for that.
+//
+// Most other variadic functions whose definition is known can be converted 
into
+// that form. Create a new internal function taking a va_list where the 
original
+// took a ... parameter. Move the blocks across. Create a new block containing 
a
+// va_start that calls into the new function. This is nearly target 
independent.
+//
+// Where this transform is consistent with the ABI, e.g. AMDGPU or NVPTX, or
+// where the ABI can be chosen to align with this transform, the function
+// interface can be rewritten along with calls to unknown variadic functions.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// This pass does define some target specific information which is partially
+// redundant with other parts of the compiler. In particular, the call frame
+// it builds must be the exact complement of the va_arg lowering performed
+// by clang. The va_list construction is similar to work done by the backend
+// for targets that lower variadics there, though distinct in that this pass
+// constructs the pieces using alloca instead of relative to stack pointers.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular 
target.
+//
+//===--===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/TargetParser/Triple.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+namespace VariadicABIInfo {
+
+// calling convention for passing as valist object, same as it would be in C
+// aarch64 uses byval
+enum class valistCC { value, pointer, /*byval*/ };
+
+struct Interface {
+protected:
+  Interface(uint32_t MinAlign, uint32_t MaxAlign)
+  : MinAlign(MinAlign), MaxAlign(MaxAlign) {}
+
+public:
+  virtual ~Interface() {}
+  const uint32_t MinAlign;
+  const uint32_t MaxAlign;
+
+  // Most ABIs use a void* or char* for va_list, others can specialise
+  virtual Type *vaListType(LLVMContext &Ctx) {
+return PointerType::getUnqual(Ctx);
+  }
+
+  // How the vaListType is passed
+  virtual valistCC vaListCC() { return valistCC::value; }
+
+  // The valist might need to be stack allocated.
+  virtual bool valistOnStack() { return false; }
+
+  virtual void initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+AllocaInst * /*va_list*/, Value * /*buffer*/) {
+// Function needs to be implemented if valist is on the stack
+assert(!valistOnStack());
+__builtin_unreachable();
+  }
+
+  // All targets currently implemented use a ptr for the valist parameter
+  Type *vaListParameterType(LLVMContext &Ctx) {
+return PointerType::getUnqual(Ctx);
+  }
+
+  bool VAEndIsNop() { return 

[clang] [Clang] Fix a non-effective assertion (PR #81083)

2024-02-07 Thread Shilei Tian via cfe-commits

https://github.com/shiltian created 
https://github.com/llvm/llvm-project/pull/81083

`PTy` here is literally `FTy->getParamType(i)`, which makes this assertion not
work as expected.


>From 076e6d3e1f5a88c4c54b0d2bf6932c9d9ae33678 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Wed, 7 Feb 2024 22:35:28 -0500
Subject: [PATCH] [Clang] Fix a non-effective assertion

`PTy` here is literally `FTy->getParamType(i)`, which makes this assertion not
work as expected.
---
 clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e051cbc6486353..a7a410dab1a018 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5908,7 +5908,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
+assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
"Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-07 Thread Shilei Tian via cfe-commits


@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r

shiltian wrote:

Does it make sense if we also update both the Clang builtins and LLVM builtins 
here, and use IR auto upgrade to rewrite existing uses?

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-07 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From 6a2bacee940d95abc53bcff2332b0d9aa0f1073f Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Wed, 7 Feb 2024 23:09:33 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  4 +-
 .../builtins-amdgcn-dl-insts-err.cl   |  9 ++-
 .../builtins-amdgcn-dl-insts-gfx11.cl | 13 ++--
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 12 ++--
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  5 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 66 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +++
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 60 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 22 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  7 ++
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll  | 14 ++--
 llvm/test/MC/AMDGPU/bf16_imm.s|  8 +++
 18 files changed, 217 insertions(+), 74 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 213311b96df74f..4fe236e8aca12d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -246,8 +246,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, 
"V2hV2h*3V2h", "t", "atomi
 
 TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot9-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", 
"dot9-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", 
"dot9-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "yV2yV2yy", "nc", 
"dot9-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2yV2yfIb", "nc", 
"dot9-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index f5317683d0ff97..fa225c4962c90b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -5,6 +5,8 @@
 
 typedef unsigned int uint;
 typedef half __attribute__((ext_vector_type(2))) half2;
+typedef __bf16 bfloat;
+typedef bfloat __attribute__((ext_vector_type(2))) bfloat2;
 typedef short __attribute__((ext_vector_type(2))) short2;
 typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
 
@@ -15,16 +17,17 @@ kernel void builtins_amdgcn_dl_insts_err(
 half2 v2hA, half2 v2hB, float fC, half hC,
 short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC,
 ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC,
+bfloat2 v2bfsA, bfloat2 v2bfsB, bfloat bfC,
 int A, int B, int C) {
   fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false);  // 
expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot10-insts}}
   fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);   // 
expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot10-insts}}
 
   hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC);   // 
expected-error {{'__builtin_amdgcn_fdot2_f16_f16' needs target feature 
dot9-insts}}
 
-  sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC);   // 
expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature 
dot9-insts}}
+  sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2bfsA, v2bfsB, bfC);   // 
expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature 
dot9-insts}}
 
-  fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // 
expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature 
dot9-insts}}
-  fOut[4] = __builtin_a

[clang] [Clang] Fix a non-effective assertion (PR #81083)

2024-02-08 Thread Shilei Tian via cfe-commits


@@ -5908,7 +5908,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
+assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&

shiltian wrote:

`canLosslesslyBitCastTo` is supposed to be stricter than `castIsValid` but it 
looks too conservative. For example, it doesn't allow bitcast between `float` 
and `int`.

https://github.com/llvm/llvm-project/pull/81083
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Fix a non-effective assertion (PR #81083)

2024-02-08 Thread Shilei Tian via cfe-commits

https://github.com/shiltian closed 
https://github.com/llvm/llvm-project/pull/81083
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-08 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From 672fd3cf584480eb4769ccdb5f86acbc03865ec9 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Thu, 8 Feb 2024 11:17:25 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/lib/CodeGen/CGBuiltin.cpp   |  4 --
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 12 ++--
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  5 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 66 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +++
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 60 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 22 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  7 ++
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll  | 14 ++--
 llvm/test/MC/AMDGPU/bf16_imm.s|  8 +++
 16 files changed, 202 insertions(+), 67 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a7a410dab1a018..daf651917f2a96 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-   "Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.
 if (PTy->isX86_AMXTy())
@@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
 }
   }
 
-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
- "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..0f29653f1f5bec 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
-  llvm_i16_ty// %c
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
+  llvm_bfloat_ty// %c
 ],
 [IntrNoMem, IntrSpeculatable]
   >;
@@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 :
   DefaultAttrsIntrinsic<
 [llvm_float_ty], // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
   llvm_float_ty, // %c
   llvm_i1_ty // %clamp
 ],
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp 
b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index c1d8e890a66edb..828229f3e569e3 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U,
 
 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-  U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
+  if (Opcode != TargetOpcode::G_BITCAST &&
+  (U.getType()->getScalarType()->isBFloatTy() ||
+   U.getOperand(0)->getType()->getScalarType()->isBFloatTy()))
 return false;
   Register Op = getOrCreateVReg(*U.getOperand(0));
   Register Res = getOrCreateVReg(U);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 225e781588668f..787217171721d8 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPU

[clang] [llvm] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-08 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From d14668fdfeef603624af520d11f5b66aa19da7be Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Thu, 8 Feb 2024 12:12:48 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/lib/CodeGen/CGBuiltin.cpp   |  4 --
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  8 +--
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  5 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 66 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +++
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 22 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  7 ++
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +-
 llvm/test/MC/AMDGPU/bf16_imm.s|  8 +++
 14 files changed, 191 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a7a410dab1a018..daf651917f2a96 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-   "Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.
 if (PTy->isX86_AMXTy())
@@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
 }
   }
 
-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
- "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..6795fb7aa0edb8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
-  llvm_i16_ty// %c
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
+  llvm_bfloat_ty// %c
 ],
 [IntrNoMem, IntrSpeculatable]
   >;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp 
b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index c1d8e890a66edb..828229f3e569e3 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U,
 
 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-  U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
+  if (Opcode != TargetOpcode::G_BITCAST &&
+  (U.getType()->getScalarType()->isBFloatTy() ||
+   U.getOperand(0)->getType()->getScalarType()->isBFloatTy()))
 return false;
   Register Op = getOrCreateVReg(*U.getOperand(0));
   Register Res = getOrCreateVReg(U);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index a94da992b33859..d6d96c251f7e30 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
 
   bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); }
 
+  bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); }
+
   bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); }
 
   bool isSSrcV2F16() const {
@@ -541,22 +543,40 @@ class AMDGPUOperand : public MC

[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-08 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-08 Thread Shilei Tian via cfe-commits

https://github.com/shiltian ready_for_review 
https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-08 Thread Shilei Tian via cfe-commits


@@ -0,0 +1,8 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck %s
+
+v_dot2_bf16_bf16 v5, v1, v2, 100.0

shiltian wrote:

The two instructions are from #79369 

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-08 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-08 Thread Shilei Tian via cfe-commits


@@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&

shiltian wrote:

This change might need to go to a separate patch.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-08 Thread Shilei Tian via cfe-commits


@@ -4181,13 +4181,20 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand 
&MO,
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
 return AMDGPU::isInlinableLiteralV2I16(Imm);
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:

shiltian wrote:

Yeah, I made some mistakes here. Will take care of them.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] LLVM] Add `__builtin_readfixedtimer` intrinsic and buiiltin for realtime clocks (PR #81331)

2024-02-09 Thread Shilei Tian via cfe-commits

https://github.com/shiltian commented:

Generally looks good to me. Just not sure about the name. "fixed timer" sounds 
pretty confusing to me. probably `readfixedfreqtimer`?

https://github.com/llvm/llvm-project/pull/81331
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] LLVM] Add `__builtin_readfixedtimer` intrinsic and buiiltin for realtime clocks (PR #81331)

2024-02-09 Thread Shilei Tian via cfe-commits


@@ -312,6 +312,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
 CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0));
 break;
   }
+  case Intrinsic::readfixedtimer: {
+errs() << "WARNING: this target does not support the llvm.readfixedtimer"
+   << " intrinsic.  It is being lowered to a constant 0\n";
+CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0));

shiltian wrote:

`Constant::getNullValue(Type::getInt64Ty(Context))`?

https://github.com/llvm/llvm-project/pull/81331
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] LLVM] Add `__builtin_readfixedtimer` intrinsic and buiiltin for realtime clocks (PR #81331)

2024-02-09 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/81331
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-12 Thread Shilei Tian via cfe-commits


@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
 ; GFX11:   ; %bb.0: ; %entry
 ; GFX11-NEXT:v_mov_b32_e32 v2, s1
 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2
+; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2

shiltian wrote:

Yeah, but I tested the FP16 version `llvm.amdgcn.fdot2.f16.f16` (w/ trunk w/o 
my patch), it generates `v_dot2_f16_f16 v2, s0, 0x3c003c00, v2`. I think we 
generally have issues with showing inline literals.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-12 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From 4196e998349d663a9a9922937cc4bedbec95fe5f Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Mon, 12 Feb 2024 13:48:39 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/lib/CodeGen/CGBuiltin.cpp   |  4 --
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  8 +--
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  5 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 71 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 +
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  8 +++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 ---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 22 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 48 ++---
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +-
 llvm/test/MC/AMDGPU/bf16_imm.s|  8 +++
 16 files changed, 295 insertions(+), 65 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a7a410dab1a018..daf651917f2a96 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-   "Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.
 if (PTy->isX86_AMXTy())
@@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
 }
   }
 
-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
- "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..6795fb7aa0edb8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
-  llvm_i16_ty// %c
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
+  llvm_bfloat_ty// %c
 ],
 [IntrNoMem, IntrSpeculatable]
   >;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp 
b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index c1d8e890a66edb..828229f3e569e3 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U,
 
 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-  U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
+  if (Opcode != TargetOpcode::G_BITCAST &&
+  (U.getType()->getScalarType()->isBFloatTy() ||
+   U.getOperand(0)->getType()->getScalarType()->isBFloatTy()))
 return false;
   Register Op = getOrCreateVReg(*U.getOperand(0));
   Register Res = getOrCreateVReg(U);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index a94da992b33859..65d6fb587c19ca 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
 
   bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); }
 
+  bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); }
+
   bool isSSrc_f16() const { 

[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-12 Thread Shilei Tian via cfe-commits


@@ -2730,6 +2749,12 @@ std::optional getInlineEncodingV2I16(uint32_t 
Literal) {
   return getInlineEncodingV216(false, Literal);
 }
 
+// Encoding of the literal as an inline constant for a V_PK_*_BF16 instruction
+// or nullopt.
+std::optional getInlineEncodingV2BF16(uint32_t Literal) {
+  return getInlineEncodingV216(true, Literal);

shiltian wrote:

This part is still WIP along with 
https://github.com/llvm/llvm-project/pull/81345.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-12 Thread Shilei Tian via cfe-commits


@@ -2660,15 +2660,34 @@ bool isInlinableLiteral16(int16_t Literal, bool 
HasInv2Pi) {
 return true;
 
   uint16_t Val = static_cast(Literal);
-  return Val == 0x3C00 || // 1.0
- Val == 0xBC00 || // -1.0
- Val == 0x3800 || // 0.5
- Val == 0xB800 || // -0.5
- Val == 0x4000 || // 2.0
- Val == 0xC000 || // -2.0
- Val == 0x4400 || // 4.0
- Val == 0xC400 || // -4.0
- Val == 0x3118;   // 1/2pi
+
+  // FP16
+  if (Val == 0x3C00 || // 1.0

shiltian wrote:

This function might be removed eventually in 
https://github.com/llvm/llvm-project/pull/81345.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-12 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From df3dbb6b9c257157c4afb407e40447a25c27a2a8 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Mon, 12 Feb 2024 18:03:57 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/lib/CodeGen/CGBuiltin.cpp   |  4 -
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  8 +-
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  5 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 71 ++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 +
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  8 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 ---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 22 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 74 ---
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 -
 llvm/test/MC/AMDGPU/bf16_imm.s| 10 +++
 16 files changed, 323 insertions(+), 65 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a7a410dab1a018..daf651917f2a96 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-   "Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.
 if (PTy->isX86_AMXTy())
@@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
 }
   }
 
-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
- "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..6795fb7aa0edb8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
-  llvm_i16_ty// %c
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
+  llvm_bfloat_ty// %c
 ],
 [IntrNoMem, IntrSpeculatable]
   >;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp 
b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index c1d8e890a66edb..828229f3e569e3 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U,
 
 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-  U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
+  if (Opcode != TargetOpcode::G_BITCAST &&
+  (U.getType()->getScalarType()->isBFloatTy() ||
+   U.getOperand(0)->getType()->getScalarType()->isBFloatTy()))
 return false;
   Register Op = getOrCreateVReg(*U.getOperand(0));
   Register Res = getOrCreateVReg(U);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 79ad6ddf7861fc..09f25215beb9e5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
 
   bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); }
 
+  bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); }
+
   bool isSSrc_f16() const {

[clang] [llvm] [LLVM] Add `__builtin_readsteadycounter` intrinsic and builtin for realtime clocks (PR #81331)

2024-02-13 Thread Shilei Tian via cfe-commits

https://github.com/shiltian approved this pull request.

LG

https://github.com/llvm/llvm-project/pull/81331
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From c556e40c13adb9d253ef7c5ebb2b46cb12969d46 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Tue, 13 Feb 2024 15:30:51 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/lib/CodeGen/CGBuiltin.cpp   |  4 --
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  8 +--
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  5 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 71 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 +
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  8 +++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 ---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 21 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 36 +-
 llvm/test/MC/AMDGPU/bf16_imm.s|  8 +++
 16 files changed, 292 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ee0b7504769622..9bc60466d09be6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-   "Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.
 if (PTy->isX86_AMXTy())
@@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
 }
   }
 
-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
- "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..6795fb7aa0edb8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
-  llvm_i16_ty// %c
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
+  llvm_bfloat_ty// %c
 ],
 [IntrNoMem, IntrSpeculatable]
   >;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp 
b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 311dd9d9739a6d..3290262816ef0a 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U,
 
 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-  U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
+  if (Opcode != TargetOpcode::G_BITCAST &&
+  (U.getType()->getScalarType()->isBFloatTy() ||
+   U.getOperand(0)->getType()->getScalarType()->isBFloatTy()))
 return false;
   Register Op = getOrCreateVReg(*U.getOperand(0));
   Register Res = getOrCreateVReg(U);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 79ad6ddf7861fc..09f25215beb9e5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
 
   bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); }
 
+  bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); }
+
   bool isSSrc_f16() const { ret

[clang] [Clang][CodeGen] Loose the cast check when emitting builtins (PR #81669)

2024-02-13 Thread Shilei Tian via cfe-commits

https://github.com/shiltian created 
https://github.com/llvm/llvm-project/pull/81669

This patch looses the cast check (`canLosslesslyBitCastTo`) and leaves it to the
one inside `CreateBitCast`. It seems too conservative for the use case here.


>From 813441fd3106a0069346aabd0dd828d8feb8ea53 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Tue, 13 Feb 2024 16:39:25 -0500
Subject: [PATCH] [Clang][CodeGen] Loose the cast check when emitting builtins

This patch looses the cast check (`canLosslesslyBitCastTo`) and leaves it to the
one inside `CreateBitCast`. It seems too conservative for the use case here.
---
 clang/lib/CodeGen/CGBuiltin.cpp | 4 
 1 file changed, 4 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ee0b7504769622..9bc60466d09be6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-   "Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.
 if (PTy->isX86_AMXTy())
@@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
 }
   }
 
-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
- "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits

shiltian wrote:

The patch is in a good shape now. I have made two other prime patches (#81674 
and #81669). I'll rebase this one once they are landed.

This patch only changes one bf16 instruction with the necessary infrastructure 
for others. I'll update all of them once this patch is landed.

However, I don't think `isInlinableLiteral16` works correctly because the 
encoding of the floating point inline literals are different for fp16 and bf16, 
but apparently for now it can only recognize fp16 encoding. This patch at least 
makes the asm printer work properly. #81345 is trying to fix it correctly, but 
that is unrelated to this patch.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits


@@ -79,17 +79,17 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
 ; GFX11:   ; %bb.0: ; %entry
 ; GFX11-NEXT:v_mov_b32_e32 v2, s1
 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x10001, v2
+; GFX11-NEXT:v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2

shiltian wrote:

FWIW, #81345 can solve the issue but I'm struggling with getting two test cases 
passed.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From bfd3170dc5e4d6e53fb98b46b37f2bf3c3ebf86d Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Tue, 13 Feb 2024 17:39:23 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/lib/CodeGen/CGBuiltin.cpp   |  4 --
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  8 +--
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 71 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 59 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 +
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  8 +++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 ---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 21 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 49 +
 llvm/test/MC/AMDGPU/bf16_imm.s|  8 +++
 15 files changed, 289 insertions(+), 67 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ee0b7504769622..9bc60466d09be6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-   "Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.
 if (PTy->isX86_AMXTy())
@@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
 }
   }
 
-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
- "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..6795fb7aa0edb8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
-  llvm_i16_ty// %c
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
+  llvm_bfloat_ty// %c
 ],
 [IntrNoMem, IntrSpeculatable]
   >;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 79ad6ddf7861fc..09f25215beb9e5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -475,6 +475,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
 
   bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); }
 
+  bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); }
+
   bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); }
 
   bool isSSrcV2F16() const {
@@ -541,22 +543,40 @@ class AMDGPUOperand : public MCParsedAsmOperand {
 return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64);
   }
 
+  bool isVCSrcTBF16() const {
+return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16);
+  }
+
   bool isVCSrcTF16() const {
 return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16);
   }
 
+  bool isVCSrcTBF16_Lo128() const {
+return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::bf16);
+  }
+
   bool isVCSrcTF16_Lo128() const {
 return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::f16);
   }
 
+  bool isVCSrcFake16BF16_Lo128() const {
+return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::bf16);
+  }
+
   bool isVCSrcFake16F16_Lo128() const {
 return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT:

[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits


@@ -1,8 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | 
FileCheck %s --check-prefixes=GFX11,SDAG-GFX11
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < 
%s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11

shiltian wrote:

This change is because of the discussion here 
(https://github.com/llvm/llvm-project/pull/80908/files#r1483394728).

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits

https://github.com/shiltian edited 
https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits


@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r

shiltian wrote:

The cast will be inserted automatically in `clang/lib/CodeGen/CGBuiltin.cpp` 
after removing the two assertions. I reverted my change to the test case by 
accident. Lol

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits


@@ -488,6 +488,49 @@ static bool printImmediateFloat16(uint32_t Imm, const 
MCSubtargetInfo &STI,
   return true;
 }
 
+static bool printImmediateBFloat16(uint32_t Imm, const MCSubtargetInfo &STI,
+   raw_ostream &O) {
+  if (Imm == 0x3F80)
+O << "1.0";
+  else if (Imm == 0xBF80)
+O << "-1.0";
+  else if (Imm == 0x3F00)
+O << "0.5";
+  else if (Imm == 0xBF00)
+O << "-0.5";
+  else if (Imm == 0x4000)
+O << "2.0";
+  else if (Imm == 0xC000)
+O << "-2.0";
+  else if (Imm == 0x4080)
+O << "4.0";
+  else if (Imm == 0xC080)
+O << "-4.0";
+  else if (Imm == 0x3E22 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
+O << "0.15915494";
+  else
+return false;
+
+  return true;
+}
+
+void AMDGPUInstPrinter::printImmediateBF16(uint32_t Imm,
+   const MCSubtargetInfo &STI,
+   raw_ostream &O) {
+  int16_t SImm = static_cast(Imm);
+  if (isInlinableIntLiteral(SImm)) {
+O << SImm;
+return;
+  }
+
+  uint16_t HImm = static_cast(Imm);
+  if (printImmediateBFloat16(HImm, STI, O))
+return;
+
+  uint64_t Imm16 = static_cast(Imm);

shiltian wrote:

Yeah, but it is to make the type promoted to `uint64_t` w/o any ambiguity.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits


@@ -4185,9 +4185,17 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand 
&MO,
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
 return AMDGPU::isInlinableLiteralV2F16(Imm);
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
+return AMDGPU::isInlinableLiteralV2BF16(Imm);
+  case AMDGPU::OPERAND_REG_IMM_BF16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_BF16:

shiltian wrote:

Yes, `isInlinableLiteral16` can't because it can't tell `fp16` and `bf16` apart 
by just looking at the value. That's the reason I tried really hard to get rid 
of `isInlinableLiteral16` in #81345 and favors the explicit version.

https://github.com/llvm/llvm-project/pull/80908
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [RFC][AMDGPU] Use `bf16` instead of `i16` for bfloat (PR #80908)

2024-02-13 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/80908

>From d72bf8bb9d1091ba76e17bf09b0aad9073e18caa Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Tue, 13 Feb 2024 19:02:41 -0500
Subject: [PATCH] [RFC][WIP][AMDGPU] Use `bf16` instead of `i16` for bfloat

Currently it looks like we generally use `i16` to represent `bf16` in those 
tablegen
files. I'm not sure of the reason behind it. My wild guess is the type `bf16` 
was
not available when we enabled the support. This patch is trying to use `bf16`
directly in those tablegen files, aiming at fixing #79369. Of course for #79369
a workaround can be to treat all `INT16` variants as `BFloat` in 
`getOpFltSemantics`,
but it doesn't look good IMHO.

Since I'm fairly new to AMDGPU backend, I'd appreciate it if you can point out
where I don't understand correctly.
---
 clang/lib/CodeGen/CGBuiltin.cpp   |  4 --
 .../builtins-amdgcn-dl-insts-gfx11.cl |  5 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  8 +--
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp  | 71 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 57 +++
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 +
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp  |  7 ++
 llvm/lib/Target/AMDGPU/SIDefines.h|  7 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  8 +++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td | 58 ---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 21 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 37 ++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 15 
 llvm/lib/Target/AMDGPU/VOP3Instructions.td|  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 51 ++---
 llvm/test/MC/AMDGPU/bf16_imm.s|  8 +++
 16 files changed, 293 insertions(+), 68 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/bf16_imm.s

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ee0b7504769622..9bc60466d09be6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5912,8 +5912,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   }
 }
 
-assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-   "Must be able to losslessly bit cast to param");
 // Cast vector type (e.g., v256i32) to x86_amx, this only happen
 // in amx intrinsics.
 if (PTy->isX86_AMXTy())
@@ -5943,8 +5941,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
 }
   }
 
-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
- "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
index dc7069decaaa61..7688dfa55a78e3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
@@ -11,7 +11,10 @@ typedef unsigned short __attribute__((ext_vector_type(2))) 
ushort2;
 // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, 
float %fC, i1 false)
 // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, 
float %fC, i1 true)
 // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> 
%v2hB, half %hC)
-// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, i16 %sC)
+// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat>
+// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat>
+// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat
+// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat 
@llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat 
[[s3]])
 // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 false)
 // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 true)
 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..6795fb7aa0edb8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-[llvm_i16_ty],   // %r
+[llvm_bfloat_ty],   // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
-  llvm_i16_ty// %c
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
+  llvm_bfloat_ty// %c
 ],
 [IntrNoMem, IntrSpec

  1   2   3   4   5   6   7   8   9   10   >