[llvm-branch-commits] [clang] [flang] [lld] [llvm] [Flang] LLVM_ENABLE_RUNTIMES=FortranRuntime (PR #110217)
jplehr wrote: I tested this locally and it appears that it requires a more modern CMake version than the one installed (`3.22`). According to the LLVM docs (https://releases.llvm.org/12.0.0/docs/GettingStarted.html#id8), CMake 3.20 is currently the minimum required version. https://github.com/llvm/llvm-project/pull/110217 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] d0422f6 - Revert "Extend `getBackwardSlice` to track values captured from above (#113478)"
Author: Mehdi Amini Date: 2024-10-31T18:28:41+01:00 New Revision: d0422f6d72d84dbf78d248fa9665d2aaf50dd1fa URL: https://github.com/llvm/llvm-project/commit/d0422f6d72d84dbf78d248fa9665d2aaf50dd1fa DIFF: https://github.com/llvm/llvm-project/commit/d0422f6d72d84dbf78d248fa9665d2aaf50dd1fa.diff LOG: Revert "Extend `getBackwardSlice` to track values captured from above (#113478)" This reverts commit 1bc58a258e2edb6221009a26d0f0037eda6c7c47. Added: Modified: mlir/include/mlir/Analysis/SliceAnalysis.h mlir/lib/Analysis/SliceAnalysis.cpp mlir/test/IR/slice.mlir mlir/test/lib/IR/TestSlicing.cpp Removed: diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h index a4f5d937cd51da..99279fdfe427c8 100644 --- a/mlir/include/mlir/Analysis/SliceAnalysis.h +++ b/mlir/include/mlir/Analysis/SliceAnalysis.h @@ -47,11 +47,6 @@ struct BackwardSliceOptions : public SliceOptions { /// backward slice computation traverses block arguments and asserts that the /// parent op has a single region with a single block. bool omitBlockArguments = false; - - /// When omitUsesFromAbove is true, the backward slice computation omits - /// traversing values that are captured from above. - /// TODO: this should default to `false` after users have been updated. - bool omitUsesFromAbove = true; }; using ForwardSliceOptions = SliceOptions; diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp index 7ec999fa0370f9..2b1cf411cb 100644 --- a/mlir/lib/Analysis/SliceAnalysis.cpp +++ b/mlir/lib/Analysis/SliceAnalysis.cpp @@ -16,8 +16,6 @@ #include "mlir/IR/Operation.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Support/LLVM.h" -#include "mlir/Transforms/RegionUtils.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -93,13 +91,14 @@ static void getBackwardSliceImpl(Operation *op, if (options.filter && !options.filter(op)) return; - auto processValue = [&](Value value) { -if (auto *definingOp = value.getDefiningOp()) { + for (const auto &en : llvm::enumerate(op->getOperands())) { +auto operand = en.value(); +if (auto *definingOp = operand.getDefiningOp()) { if (backwardSlice->count(definingOp) == 0) getBackwardSliceImpl(definingOp, backwardSlice, options); -} else if (auto blockArg = dyn_cast(value)) { +} else if (auto blockArg = dyn_cast(operand)) { if (options.omitBlockArguments) -return; +continue; Block *block = blockArg.getOwner(); Operation *parentOp = block->getParentOp(); @@ -114,14 +113,7 @@ static void getBackwardSliceImpl(Operation *op, } else { llvm_unreachable("No definingOp and not a block argument."); } - }; - - if (!options.omitUsesFromAbove) { -visitUsedValuesDefinedAbove(op->getRegions(), [&](OpOperand *operand) { - processValue(operand->get()); -}); } - llvm::for_each(op->getOperands(), processValue); backwardSlice->insert(op); } diff --git a/mlir/test/IR/slice.mlir b/mlir/test/IR/slice.mlir index 87d446c8f415af..0a32a0f231baf2 100644 --- a/mlir/test/IR/slice.mlir +++ b/mlir/test/IR/slice.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -slice-analysis-test -split-input-file %s | FileCheck %s +// RUN: mlir-opt -slice-analysis-test %s | FileCheck %s func.func @slicing_linalg_op(%arg0 : index, %arg1 : index, %arg2 : index) { %a = memref.alloc(%arg0, %arg2) : memref @@ -33,29 +33,3 @@ func.func @slicing_linalg_op(%arg0 : index, %arg1 : index, %arg2 : index) { // CHECK-DAG: %[[B:.+]] = memref.alloc(%[[ARG2]], %[[ARG1]]) : memref // CHECK-DAG: %[[C:.+]] = memref.alloc(%[[ARG0]], 
%[[ARG1]]) : memref // CHECK: return - -// - - -#map = affine_map<(d0, d1) -> (d0, d1)> -func.func @slice_use_from_above(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>) { - %0 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<5x5xf32>) outs(%arg1 : tensor<5x5xf32>) { - ^bb0(%in: f32, %out: f32): -%2 = arith.addf %in, %in : f32 -linalg.yield %2 : f32 - } -> tensor<5x5xf32> - %collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<5x5xf32> into tensor<25xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<5x5xf32>) outs(%arg1 : tensor<5x5xf32>) { - ^bb0(%in: f32, %out: f32): -%c2 = arith.constant 2 : index -%extracted = tensor.extract %collapsed[%c2] : tensor<25xf32> -%2 = arith.addf %extracted, %extracted : f32 -linalg.yield %2 : f32 - } -> tensor<5x5xf32> - return -} - -// CHECK-LABEL: func @slice_use_from_above__backward_slice__0 -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor -// CHECK: %[[A:.+]] = linalg.generic {{.*}} ins(%[[ARG0]] -//
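For context, a minimal usage sketch of the backward slice API this revert touches (an editor illustration, not code from the commit; `collectBackwardSlice` and `root` are placeholder names, and the sketch assumes the post-revert interface, where `omitUsesFromAbove` no longer exists):

```cpp
#include "mlir/Analysis/SliceAnalysis.h"
#include "llvm/ADT/SetVector.h"

using namespace mlir;

void collectBackwardSlice(Operation *root) {
  BackwardSliceOptions options;
  options.omitBlockArguments = false; // traverse block arguments as well
  SetVector<Operation *> slice;
  getBackwardSlice(root, &slice, options);
  // `slice` now holds the transitive defs of root's operands; after this
  // revert, values captured from above are no longer tracked.
  for (Operation *op : slice)
    op->dump();
}
```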
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From 66264a1254c322fc0d3aa464125370886ad7da7c Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 80 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 69 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 205673cdcc0e23..9a165d9be529e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -785,22 +798,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -885,29 +882,44 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { - - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; + +// If the attribute exists, simple honor it. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + TakeRange(*Attr); + return; } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. 
Since +// the calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a start point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and flat work group size either +// from attribute or default will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but it still allows us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + FlatWorkGroupSize)
[llvm-branch-commits] [llvm] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
@@ -404,21 +404,20 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { 
"amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-ld
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/114438 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,58 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static SmallVector ReservedHi = { sdesmalen-arm wrote: Without marking the registers as reserved, take the example below: ``` --- name:sv2i64 tracksRegLiveness: true body: | bb.0.entry: liveins: $q0, $q1 %0:fpr128 = COPY $q0 %1:fpr128 = COPY $q1 %35:gpr64 = COPY %0.dsub %36:gpr64 = COPY %1.dsub %9:gpr64 = SDIVXr %35, %36 %37:gpr64 = UMOVvi64 %0, 1 %38:gpr64 = UMOVvi64 %1, 1 %10:gpr64 = SDIVXr %37, %38 %19:fpr128 = INSvi64gpr undef %19, 0, %9 %19:fpr128 = INSvi64gpr %19, 1, %10 %39:gpr64 = COPY %19.dsub %24:gpr64 = MADDXrrr %39, %36, $xzr %41:gpr64 = UMOVvi64 %19, 1 %25:gpr64 = MADDXrrr %41, %38, $xzr %34:fpr128 = INSvi64gpr undef %34, 0, %24 %34:fpr128 = INSvi64gpr %34, 1, %25 %2:fpr128 = SUBv2i64 %0, %34 $q0 = COPY %2 RET_ReallyLR implicit $q0 ... ``` When I run this with: ``` llc -global-isel -verify-machineinstrs -run-pass=machine-scheduler ``` it fails with: ``` Use of $xzr does not have a corresponding definition on every path: 216r %10:gpr64 = MADDXrrr %9:gpr64, %3:gpr64, $xzr LLVM ERROR: Use not jointly dominated by defs. PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. Stack dump: 0. Program arguments: ./bin/llc -global-isel -verify-machineinstrs -run-pass=machine-scheduler /tmp/t.mir -o - 1. Running pass 'Function Pass Manager' on module '/tmp/t.mir'. 2. Running pass 'Machine Instruction Scheduler' on function '@sv2i64' ... #8 0x80062b7c llvm::LiveRangeCalc::findReachingDefs(llvm::LiveRange&, llvm::MachineBasicBlock&, llvm::SlotIndex, unsigned int, llvm::ArrayRef) #9 0x80063e94 llvm::LiveRangeCalc::extend(llvm::LiveRange&, llvm::SlotIndex, unsigned int, llvm::ArrayRef) #10 0x80064a18 llvm::LiveIntervalCalc::extendToUses(llvm::LiveRange&, llvm::Register, llvm::LaneBitmask, llvm::LiveInterval*) #11 0x8003e82c llvm::LiveIntervals::computeRegUnitRange(llvm::LiveRange&, unsigned int) #12 0x80044cdc llvm::LiveIntervals::HMEditor::updateAllRanges(llvm::MachineInstr*) #13 0x8004848c llvm::LiveIntervals::handleMove(llvm::MachineInstr&, bool) #14 0x801f44ec llvm::ScheduleDAGMI::moveInstruction(llvm::MachineInstr*, llvm::MachineInstrBundleIterator) #15 0x801fdb58 llvm::ScheduleDAGMILive::scheduleMI(llvm::SUnit*, bool) #16 0x8020b214 llvm::ScheduleDAGMILive::schedule() #17 0x801f0934 (anonymous namespace)::MachineSchedulerBase::scheduleRegions(llvm::ScheduleDAGInstrs&, bool) (.isra.0) MachineScheduler.cpp:0:0 ``` https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/114438 None >From 687d29af2f79b07cdc8b8b0044a8c1f828745cfd Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 64 +-- .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 +++--- 2 files changed, 69 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 205673cdcc0e23..ed7cd1f53b41e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -885,29 +898,44 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { - - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; + +// If the attribute exists, simple honor it. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + TakeRange(*Attr); + return; } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. Since +// the calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a start point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and flat work group size either +// from attribute or default will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// no means, but that could still allow us to propagate it. 
+if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + FlatWorkGroupSize)); +} } ChangeStatus updateImpl(Attributor &A) override { @@ -956,8 +984,8 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { ChangeStatus manifest(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -unsigned Max = InfoCache.getMaxWavesPerEU(*F); -return emitAttributeIfNotDefault(A, 1, Max); +return emitAttributeIfNotDefaultAfterClamp( +A, {1, InfoCache.getMaxWavesPerEU(*F)}); } /// See AbstractAttribute::getName() diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 1272bf655e309d..e28bccfb99343b 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -117,7 +117,7 @@ define amdgpu_kernel void @kernel_2_9() #6 { define amdgpu_kernel void @kernel_9_9()
[llvm-branch-commits] [llvm] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) Changes --- Patch is 25.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114438.diff 2 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp (+46-18) - (modified) llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll (+23-24) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 205673cdcc0e23..ed7cd1f53b41e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -885,29 +898,44 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { - - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; + +// If the attribute exists, simple honor it. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + TakeRange(*Attr); + return; } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. Since +// the calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a start point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and flat work group size either +// from attribute or default will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// no means, but that could still allow us to propagate it. 
+if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + FlatWorkGroupSize)); +} } ChangeStatus updateImpl(Attributor &A) override { @@ -956,8 +984,8 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { ChangeStatus manifest(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -unsigned Max = InfoCache.getMaxWavesPerEU(*F); -return emitAttributeIfNotDefault(A, 1, Max); +return emitAttributeIfNotDefaultAfterClamp( +A, {1, InfoCache.getMaxWavesPerEU(*F)}); } /// See AbstractAttribute::getName() diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 1272bf655e309d..e28bccfb99343b 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -117,7 +117,7 @@ define amdgpu_kernel void @kernel_2_9() #6 { define amdgpu_kernel void @kernel_9_9() #7 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_9_9 -; CHECK-SAME: () #[[ATTR6]] { +; CHECK-SAME: ()
[llvm-branch-commits] [llvm] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
shiltian wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/114438). Learn more: https://graphite.dev/docs/merge-pull-requests * **#114438** 👈 (this PR) * **#114357** * `main` This stack of pull requests is managed by Graphite. https://github.com/llvm/llvm-project/pull/114438 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AtomicExpand: Copy metadata from atomicrmw to cmpxchg (PR #109409)
arsenm wrote: ### Merge activity * **Oct 31, 2:43 PM EDT**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/109409). https://github.com/llvm/llvm-project/pull/109409 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
@@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); shiltian wrote: In the parent PR https://github.com/llvm/llvm-project/pull/114438 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
@@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); arsenm wrote: I don't see where this is defined https://github.com/llvm/llvm-project/pull/114438 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
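`parseRangeAttribute` is added in the parent PR (#114357) and is not visible in this thread. Purely as a reading aid, here is a hypothetical sketch of the contract the caller above appears to rely on; the name, signature, and behavior are assumptions, not the actual definition:

```cpp
// Hypothetical sketch only: parse a "min,max" string such as
// "amdgpu-waves-per-eu"="2,4" into a pair, leaving a missing maximum as 0
// so the caller can substitute ST.getMaxWavesPerEU().
#include <optional>
#include <utility>
#include "llvm/ADT/StringRef.h"

static std::optional<std::pair<unsigned, unsigned>>
parseRangeAttributeSketch(llvm::StringRef Str) {
  auto [MinStr, MaxStr] = Str.split(',');
  unsigned Min = 0, Max = 0;
  if (MinStr.trim().getAsInteger(0, Min))
    return std::nullopt; // malformed minimum
  if (!MaxStr.empty() && MaxStr.trim().getAsInteger(0, Max))
    return std::nullopt; // malformed maximum; an absent maximum stays 0
  return std::make_pair(Min, Max);
}
```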
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,58 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static SmallVector ReservedHi = { arsenm wrote: What kind of failures? AMDGPU also has synthetic 16-bit high sub registers and they are not explicitly reserved. Are you adding these to an allocatable class? https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From a269e8da1b872b3f2390037a594757940cf8369b Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change the existing code for G_PHI to match what the LLVM-IR version does via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably, this improves the number of values that can be allocated to the sgpr register bank in AMDGPURegBankSelect. A common case here is phis that appear in structurize-cfg lowering for cycles with multiple exits: an undef incoming value comes from the block that reached the cycle exit condition; if the other incoming value is uniform, keep the phi uniform despite the fact that it joins values from a pair of blocks entered via a divergent conditional branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
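For reference, the IR-level predicate that the new G_PHI path mirrors is `PHINode::hasConstantOrUndefValue()`. A paraphrased sketch of its behavior, written as a free function for illustration (the real method lives on `llvm::PHINode` and may differ in detail):

```cpp
// Paraphrase of PHINode::hasConstantOrUndefValue(): every incoming value
// that is neither the phi itself nor undef must be one and the same value.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool hasConstantOrUndefValueSketch(const PHINode &Phi) {
  const Value *CommonValue = nullptr;
  for (const Value *Incoming : Phi.incoming_values()) {
    if (Incoming == &Phi || isa<UndefValue>(Incoming))
      continue; // self-references and undefs never disqualify the phi
    if (CommonValue && CommonValue != Incoming)
      return false; // two distinct non-undef incoming values
    CommonValue = Incoming;
  }
  return true;
}
```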
[llvm-branch-commits] [llvm] AMDGPU: Custom expand flat cmpxchg which may access private (PR #109410)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/109410 >From 834ff3b40bd82cb54bb33532a54ad36870ea2b24 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 14 Aug 2024 13:57:14 +0400 Subject: [PATCH 1/2] AMDGPU: Custom expand flat cmpxchg which may access private 64-bit flat cmpxchg instructions do not work correctly for scratch addresses, and need to be expanded as non-atomic. Allow custom expansion of cmpxchg in AtomicExpand, as is already the case for atomicrmw. --- llvm/include/llvm/CodeGen/TargetLowering.h|5 + .../llvm/Transforms/Utils/LowerAtomic.h |7 + llvm/lib/CodeGen/AtomicExpandPass.cpp |4 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 146 ++- llvm/lib/Target/AMDGPU/SIISelLowering.h |3 + llvm/lib/Transforms/Utils/LowerAtomic.cpp | 21 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 1019 +++-- ...expand-atomicrmw-flat-noalias-addrspace.ll |6 +- ...expand-atomicrmw-integer-ops-0-to-add-0.ll |6 +- .../expand-cmpxchg-flat-maybe-private.ll | 104 +- 10 files changed, 1157 insertions(+), 164 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 8e0cdc6f1a5e77..e0b638201a0474 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2204,6 +2204,11 @@ class TargetLoweringBase { "Generic atomicrmw expansion unimplemented on this target"); } + /// Perform a cmpxchg expansion using a target-specific method. + virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const { +llvm_unreachable("Generic cmpxchg expansion unimplemented on this target"); + } + /// Perform a bit test atomicrmw using a target-specific intrinsic. This /// represents the combined bit test intrinsic which will be lowered at a late /// stage by the backend. diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h index b25b281667f9cb..295c2bd2b4b47e 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h +++ b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h @@ -23,6 +23,13 @@ class IRBuilderBase; /// Convert the given Cmpxchg into primitive load and compare. bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI); +/// Emit IR to implement the given cmpxchg operation on values in registers, +/// returning the new value. +std::pair buildAtomicCmpXchgValue(IRBuilderBase &Builder, +Value *Ptr, Value *Cmp, +Value *Val, +Align Alignment); + /// Convert the given RMWI into primitive load and stores, /// assuming that doing so is legal. Return true if the lowering /// succeeds. 
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 0aff4f1f5cf1cb..1471e3d7cbc29d 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1674,6 +1674,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return true; case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicCmpXchgInst(CI); + case TargetLoweringBase::AtomicExpansionKind::Expand: { +TLI->emitExpandAtomicCmpXchg(CI); +return true; + } } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d66610ae0a160d..c8a46875bda408 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16577,9 +16577,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { - return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS - ? AtomicExpansionKind::NotAtomic - : AtomicExpansionKind::None; + unsigned AddrSpace = CmpX->getPointerAddressSpace(); + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) +return AtomicExpansionKind::NotAtomic; + + if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX)) +return AtomicExpansionKind::None; + + const DataLayout &DL = CmpX->getDataLayout(); + + Type *ValTy = CmpX->getNewValOperand()->getType(); + + // If a 64-bit flat atomic may alias private, we need to avoid using the + // atomic in the private case. + return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand + : AtomicExpansionKind::None; } const TargetRegisterClass * @@ -16745,40 +16757,8 @@ bool SITargetLowering::checkForPhysRegDependency( return false; } -void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { - AtomicRMWInst::BinOp Op = AI->getOperation(); - - if (Op == AtomicRMWInst::Sub || Op ==
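As a usage note on the hooks quoted above: a target opts in by returning `AtomicExpansionKind::Expand` from `shouldExpandAtomicCmpXchgInIR()` and overriding the new `emitExpandAtomicCmpXchg()`. A hypothetical sketch follows; `MyTargetLowering` and its predicate are assumptions for illustration, not code from this patch:

```cpp
// Sketch: opting a target into the custom cmpxchg expansion added here.
// Returning Expand makes AtomicExpandPass call emitExpandAtomicCmpXchg().
TargetLowering::AtomicExpansionKind
MyTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const {
  if (cmpxchgNeedsCustomLowering(CI)) // hypothetical target predicate
    return AtomicExpansionKind::Expand;
  return AtomicExpansionKind::None;
}

void MyTargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
  // Rewrite the cmpxchg in place, e.g. branch on the address space and use
  // buildAtomicCmpXchgValue() from LowerAtomic.h for the non-atomic path,
  // then erase CI once all uses are replaced.
}
```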
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,57 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static MCPhysReg ReservedHi[] = { arsenm wrote: missing const https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] Revert "[Flang][OpenMP] Disable lowering of omp.simd reductions in co… (PR #113683)
https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/113683 >From cf57ecd0984bcb2335e8ecfaffdca600a5c7cf1b Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 25 Oct 2024 12:07:22 +0100 Subject: [PATCH] Revert "[Flang][OpenMP] Disable lowering of omp.simd reductions in composites (#112686)" Lowering of reductions in composite operations can now be re-enabled, since previous commits in this PR stack fix the MLIR representation produced and it no longer triggers a compiler crash during translation to LLVM IR. This reverts commit c44860c8d2582abd88794267b4fa0fa953bbef80. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 20 ++-- flang/test/Lower/OpenMP/wsloop-simd.f90 | 21 + 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 329cbf3d7539f5..4f9e2347308aa1 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2246,12 +2246,6 @@ static void genCompositeDistributeParallelDoSimd( genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms); - // TODO: Remove this after omp.simd reductions on composite constructs are - // supported. - simdClauseOps.reductionVars.clear(); - simdClauseOps.reductionByref.clear(); - simdClauseOps.reductionSyms.clear(); - mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc, @@ -2273,7 +2267,9 @@ static void genCompositeDistributeParallelDoSimd( wsloopOp.setComposite(/*val=*/true); EntryBlockArgs simdArgs; - // TODO: Add private and reduction syms and vars. + // TODO: Add private syms and vars. + simdArgs.reduction.syms = simdReductionSyms; + simdArgs.reduction.vars = simdClauseOps.reductionVars; auto simdOp = genWrapperOp(converter, loc, simdClauseOps, simdArgs); simdOp.setComposite(/*val=*/true); @@ -2366,12 +2362,6 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms); - // TODO: Remove this after omp.simd reductions on composite constructs are - // supported. - simdClauseOps.reductionVars.clear(); - simdClauseOps.reductionByref.clear(); - simdClauseOps.reductionSyms.clear(); - // TODO: Support delayed privatization. DataSharingProcessor dsp(converter, semaCtx, simdItem->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, @@ -2395,7 +2385,9 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, wsloopOp.setComposite(/*val=*/true); EntryBlockArgs simdArgs; - // TODO: Add private and reduction syms and vars. + // TODO: Add private syms and vars. + simdArgs.reduction.syms = simdReductionSyms; + simdArgs.reduction.vars = simdClauseOps.reductionVars; auto simdOp = genWrapperOp(converter, loc, simdClauseOps, simdArgs); simdOp.setComposite(/*val=*/true); diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90 index 899ab59714f144..49a9a523e11fe7 100644 --- a/flang/test/Lower/OpenMP/wsloop-simd.f90 +++ b/flang/test/Lower/OpenMP/wsloop-simd.f90 @@ -45,3 +45,24 @@ subroutine do_simd_simdlen() end do !$omp end do simd end subroutine do_simd_simdlen + +! CHECK-LABEL: func.func @_QPdo_simd_reduction( +subroutine do_simd_reduction() + integer :: sum + sum = 0 + ! CHECK: omp.wsloop + ! CHECK-SAME: reduction(@[[RED_SYM:.*]] %{{.*}} -> %[[RED_OUTER:.*]] : !fir.ref) + ! CHECK-NEXT: omp.simd + ! 
CHECK-SAME: reduction(@[[RED_SYM]] %[[RED_OUTER]] -> %[[RED_INNER:.*]] : !fir.ref) + ! CHECK-NEXT: omp.loop_nest + ! CHECK: %[[RED_DECL:.*]]:2 = hlfir.declare %[[RED_INNER]] + ! CHECK: %[[RED:.*]] = fir.load %[[RED_DECL]]#0 : !fir.ref + ! CHECK: %[[RESULT:.*]] = arith.addi %[[RED]], %{{.*}} : i32 + ! CHECK: hlfir.assign %[[RESULT]] to %[[RED_DECL]]#0 : i32, !fir.ref + ! CHECK-NEXT: omp.yield + !$omp do simd reduction(+:sum) +do index_ = 1, 10 + sum = sum + 1 +end do + !$omp end do simd +end subroutine do_simd_reduction ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [TableGen] Fix calculation of Lanemask for RCs with artificial subregs. (PR #114392)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/114392 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [OpenMP][MLIR] Descriptor explicit member map lowering changes (PR #113556)
https://github.com/skatrak approved this pull request. Ah, I see. Then this LGTM, thanks for explaining! https://github.com/llvm/llvm-project/pull/113556 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Propagate amdgpu-max-num-workgroups attribute (PR #113018)
@@ -821,6 +826,152 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP, "AAAMDFlatWorkGroupSize is only valid for function position"); } +struct TupleDecIntegerRangeState : public AbstractState { + DecIntegerState X, Y, Z; + + bool isValidState() const override { +return X.isValidState() && Y.isValidState() && Z.isValidState(); + } + + bool isAtFixpoint() const override { +return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint(); + } + + ChangeStatus indicateOptimisticFixpoint() override { +return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() | + Z.indicateOptimisticFixpoint(); + } + + ChangeStatus indicatePessimisticFixpoint() override { +return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() | + Z.indicatePessimisticFixpoint(); + } + + TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) { +X ^= Other.X; +Y ^= Other.Y; +Z ^= Other.Z; +return *this; + } + + bool operator==(const TupleDecIntegerRangeState &Other) const { +return X == Other.X && Y == Other.Y && Z == Other.Z; + } + + TupleDecIntegerRangeState &getAssumed() { return *this; } + const TupleDecIntegerRangeState &getAssumed() const { return *this; } +}; + +using AAAMDMaxNumWorkgroupsState = +StateWrapper; + +/// Propagate amdgpu-max-num-workgroups attribute. +struct AAAMDMaxNumWorkgroups +: public StateWrapper { + using Base = StateWrapper; + + AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + void initialize(Attributor &A) override { +Function *F = getAssociatedFunction(); +auto &InfoCache = static_cast(A.getInfoCache()); + +SmallVector MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F); + +// FIXME: What is the interpretation of 0? +for (unsigned &Entry : MaxNumWorkgroups) { + if (Entry == 0) +Entry = std::numeric_limits::max(); +} + +X.takeKnownMinimum(MaxNumWorkgroups[0]); +Y.takeKnownMinimum(MaxNumWorkgroups[1]); +Z.takeKnownMinimum(MaxNumWorkgroups[2]); + +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); + } + + ChangeStatus updateImpl(Attributor &A) override { +ChangeStatus Change = ChangeStatus::UNCHANGED; + +auto CheckCallSite = [&](AbstractCallSite CS) { + Function *Caller = CS.getInstruction()->getFunction(); + LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName() +<< "->" << getAssociatedFunction()->getName() << '\n'); + + const auto *CallerInfo = A.getAAFor( + *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); + if (!CallerInfo || !CallerInfo->isValidState()) +return false; + + Change |= + clampStateAndIndicateChange(this->getState(), CallerInfo->getState()); + return true; +}; + +bool AllCallSitesKnown = true; +if (!A.checkForAllCallSites(CheckCallSite, *this, +/*RequireAllCallSites=*/true, +AllCallSitesKnown)) + return indicatePessimisticFixpoint(); + +return Change; + } + + /// Create an abstract attribute view for the position \p IRP. + static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP, + Attributor &A); + + ChangeStatus manifest(Attributor &A) override { +Function *F = getAssociatedFunction(); +// TODO: Skip adding if worst case? arsenm wrote: Yes, uint32_max x 3 https://github.com/llvm/llvm-project/pull/113018 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + + if (Op.getReg() == TRI.getExec()) { +return true; + } +} + +return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, +const RegisterBank *RB) { +Register Reg = DefOP.getReg(); +// Register that already has Register class got it during pre-inst selection +// of another instruction. Maybe cross bank copy was required so we insert a +// copy that can be removed later. This simplifies post regbanklegalize +// combiner and avoids need to special case some patterns. +if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); + + auto &MBB = *MI.getParent(); + B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator(; + B.buildCopy(Reg, NewReg); + + // The problem was discovered for uniform S1 that was used as both + // lane mask(vcc) and regular sgpr S1. + // - lane-mask(vcc) use was by si_if, this use is divergent and requires + // non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets + // sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. + // - the regular sgpr S1(uniform) instruction is now broken since + // it uses sreg_64_xexec(S1) which is divergent. + + // Replace virtual registers with register class on generic instructions + // uses with virtual registers with register bank. + for (auto &UseMI : MRI.use_instructions(Reg)) { +if (shouldRegBankSelect(UseMI)) { + for (MachineOperand &Op : UseMI.operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + } + +} else { + MRI.setRegBank(Reg, *RB); +} + } + + std::optional tryGetVReg(MachineOperand &Op) { +if (!Op.isReg()) + return std::nullopt; + +Register Reg = Op.getReg(); +if (!Reg.isVirtual()) + return std::nullopt; + +return Reg; + } + + void assignBanksOnDefs(MachineInstr &MI) { +if (!shouldRegBankSelect(MI)) + return; + +for (MachineOperand &DefOP : MI.defs()) { + auto MaybeDefReg = tryGetVReg(DefOP); + if (!MaybeDefReg) +continue; + Register DefReg = *MaybeDefReg; + + // Copies can have register class on def registers. 
+ if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) { +continue; + } + + if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) { +setRBDef(MI, DefOP, SgprRB); + } else { +if (MRI.getType(DefReg) == LLT::scalar(1)) + setRBDef(MI, DefOP, VccRB); +else + setRBDef(MI, DefOP, VgprRB); + } +} + } + + void constrainRBUse(MachineInstr &MI, MachineOperand &UseOP, + const RegisterBank *RB) { +Register Reg = UseOP.getReg(); + +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({RB, Ty}); +UseOP.setReg(NewReg); + +if (MI.isPHI()) { + auto DefMI = MRI.getVRegDef(Reg)->getIterator(); + MachineBasicBlock *DefMBB = DefMI->getParent(); + B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); +} else { + B.setInstr(MI); +} + +B.buildCopy(NewReg, Reg); + } + + void constrainBanksOnUses(MachineInstr &MI) { +if (!shouldRegBankSelect(
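For readers following the patch, the def-side policy above boils down to a small decision table: uniform defs (and S32/S64 lane masks) go to the sgpr bank, divergent 1-bit values go to vcc, and everything else divergent goes to vgpr. A minimal standalone C++ sketch of that decision; all types and names here are invented stand-ins for illustration, not the LLVM API:

#include <cassert>
#include <cstdio>

// Invented stand-ins for illustration only; not the LLVM API.
enum class Bank { Sgpr, Vgpr, Vcc };

struct DefInfo {
  bool IsUniform;        // result of machine uniformity analysis
  bool IsS32S64LaneMask; // result of IntrinsicLaneMaskAnalyzer
  unsigned SizeInBits;
};

// Mirrors the branch structure of assignBanksOnDefs() above (simplified).
Bank pickBank(const DefInfo &D) {
  if (D.IsUniform || D.IsS32S64LaneMask)
    return Bank::Sgpr;
  return D.SizeInBits == 1 ? Bank::Vcc : Bank::Vgpr; // divergent s1 -> vcc
}

int main() {
  assert(pickBank({true, false, 32}) == Bank::Sgpr);  // uniform value
  assert(pickBank({false, false, 1}) == Bank::Vcc);   // divergent lane mask
  assert(pickBank({false, false, 32}) == Bank::Vgpr); // divergent value
  std::puts("ok");
}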
[llvm-branch-commits] [flang] Revert "[Flang][OpenMP] Disable lowering of omp.simd reductions in co… (PR #113683)
https://github.com/NimishMishra approved this pull request. This looks okay to me, given the PR stack. There is still https://github.com/llvm/llvm-project/pull/113682 pending a merge; I'll take a look at that PR tomorrow. Thanks for the work on this. https://github.com/llvm/llvm-project/pull/113683 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,58 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static SmallVector ReservedHi = { arsenm wrote: But what are the actual failures (messages, location)? If the high half of the register isn't allocatable / addressable in the first place, it shouldn't just appear and cause issues. https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RISCV] Add initial support of memcmp expansion (PR #107548)
https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/107548 >From f21cfcfc90330ee3856746b6315a81a00313b0e0 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Fri, 6 Sep 2024 17:20:51 +0800 Subject: [PATCH 1/5] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.6-beta.1 --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 15 + .../Target/RISCV/RISCVTargetTransformInfo.h | 3 + llvm/test/CodeGen/RISCV/memcmp.ll | 932 ++ 3 files changed, 950 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/memcmp.ll diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index e809e15eacf696..ad532aadc83266 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2113,3 +2113,18 @@ bool RISCVTTIImpl::shouldConsiderAddressTypePromotion( } return Considerable; } + +RISCVTTIImpl::TTI::MemCmpExpansionOptions +RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + // FIXME: Vectors haven't been tested. + Options.AllowOverlappingLoads = ST->enableUnalignedScalarMem(); + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = Options.MaxNumLoads; + if (ST->is64Bit()) +Options.LoadSizes.push_back(8); + llvm::append_range(Options.LoadSizes, ArrayRef({4, 2, 1})); + Options.AllowedTailExpansions = {3, 5, 6}; + return Options; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 763b89bfec0a66..ee9bed09df97f3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -404,6 +404,9 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> { shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); std::optional<unsigned> getMinPageSize() const { return 4096; } + + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, +bool IsZeroCmp) const; }; } // end namespace llvm diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll new file mode 100644 index 00..652cd02e2c750a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -0,0 +1,932 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -O2 | FileCheck %s --check-prefix=CHECK-ALIGNED-RV32 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -O2 | FileCheck %s --check-prefix=CHECK-ALIGNED-RV64 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+unaligned-scalar-mem -O2 \ +; RUN: | FileCheck %s --check-prefix=CHECK-UNALIGNED-RV32 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+unaligned-scalar-mem -O2 \ +; RUN: | FileCheck %s --check-prefix=CHECK-UNALIGNED-RV64 + +declare i32 @bcmp(i8*, i8*, iXLen) nounwind readonly +declare i32 @memcmp(i8*, i8*, iXLen) nounwind readonly + +define i1 @bcmp_size_15(i8* %s1, i8* %s2) { +; CHECK-ALIGNED-RV32-LABEL: bcmp_size_15: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT:lbu a2, 1(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a5, 3(a0) +; CHECK-ALIGNED-RV32-NEXT:slli a2, a2, 8 +; CHECK-ALIGNED-RV32-NEXT:or
a2, a2, a3 +; CHECK-ALIGNED-RV32-NEXT:slli a4, a4, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a5, a5, 24 +; CHECK-ALIGNED-RV32-NEXT:or a4, a5, a4 +; CHECK-ALIGNED-RV32-NEXT:or a2, a4, a2 +; CHECK-ALIGNED-RV32-NEXT:lbu a3, 1(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 0(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a5, 2(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a6, 3(a1) +; CHECK-ALIGNED-RV32-NEXT:slli a3, a3, 8 +; CHECK-ALIGNED-RV32-NEXT:or a3, a3, a4 +; CHECK-ALIGNED-RV32-NEXT:slli a5, a5, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a6, a6, 24 +; CHECK-ALIGNED-RV32-NEXT:or a4, a6, a5 +; CHECK-ALIGNED-RV32-NEXT:or a3, a4, a3 +; CHECK-ALIGNED-RV32-NEXT:xor a2, a2, a3 +; CHECK-ALIGNED-RV32-NEXT:lbu a3, 5(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 4(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a5, 6(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a6, 7(a0) +; CHECK-ALIGNED-RV32-NEXT:slli a3, a3, 8 +; CHECK-ALIGNED-RV32-NEXT:or a3, a3, a4 +; CHECK-ALIGNED-RV32-NEXT:slli a5, a5, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a6, a6, 24 +; CHECK-ALIGNED-RV32-NEXT:or a4, a6, a5 +; CHECK-ALIGNED-RV32-NEXT:or a3, a4, a3 +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 5(a1) +; CHECK-ALIGNED-RV32-NEXT
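For context on what this expansion buys: a memcmp/bcmp call of known small size is replaced by wide loads and integer compares, which is what the long byte-load sequences in the checks above implement on targets without unaligned access. A minimal standalone C++ sketch of the transformed logic for a 4-byte equality check; this models the shape of the output only, it is not the codegen itself:

#include <cassert>
#include <cstdint>
#include <cstring>

// Models what the expanded code does for a 4-byte bcmp-style equality
// check: one word load per side, then xor + compare, instead of a libcall.
bool bcmpEq4(const void *A, const void *B) {
  uint32_t X, Y;
  std::memcpy(&X, A, 4); // a single 4-byte load (LoadSizes contains 4)
  std::memcpy(&Y, B, 4);
  return (X ^ Y) == 0;   // xor + branch replaces the call
}
// With AllowOverlappingLoads, a 7-byte compare can similarly be covered by
// two 4-byte loads at offsets 0 and 3 rather than a 4+2+1 load sequence.

int main() {
  char A[4] = {1, 2, 3, 4}, B[4] = {1, 2, 3, 4};
  assert(bcmpEq4(A, B));
  B[3] = 5;
  assert(!bcmpEq4(A, B));
}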
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
https://github.com/sdesmalen-arm edited https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [OpenMP][MLIR] Descriptor explicit member map lowering changes (PR #113556)
https://github.com/agozillon updated https://github.com/llvm/llvm-project/pull/113556 >From 70265b81b3e8ab7b6d04ed3d019861abd0b0e4aa Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 4 Oct 2024 13:03:22 -0500 Subject: [PATCH] [OpenMP][MLIR] Descriptor explicit member map lowering changes This is one of 3 PRs in a PR stack that aims to add support for explicit mapping of allocatable members in derived types. The primary changes in this PR are the OpenMPToLLVMIRTranslation.cpp changes, which are small and seek to alter the current member mapping to add an additional map insertion for pointers. Effectively, if the member is a pointer (currently indicated by having a varPtrPtr field) we add an additional map for the pointer and then alter the subsequent mapping of the member (the data) to utilise the member rather than the parent's base pointer. This appears to be necessary in certain cases when mapping pointer data within record types to avoid segfaulting on device (due to incorrect data mapping). In general this record type mapping may be simplifiable in the future. There are also additions of tests which should help to showcase the effect of the changes above. --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 2 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 58 +++-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 81 - mlir/test/Dialect/OpenMP/ops.mlir | 4 +- ...t-nested-ptr-record-type-mapping-host.mlir | 66 ++ ...arget-nested-record-type-mapping-host.mlir | 2 +- ...get-record-type-with-ptr-member-host.mlir} | 114 ++ 7 files changed, 197 insertions(+), 130 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-nested-ptr-record-type-mapping-host.mlir rename mlir/test/Target/LLVMIR/{omptarget-fortran-allocatable-types-host.mlir => omptarget-record-type-with-ptr-member-host.mlir} (58%) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 626539cb7bde42..348c1b9c2b8bdf 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -895,7 +895,7 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> { TypeAttr:$var_type, Optional:$var_ptr_ptr, Variadic:$members, - OptionalAttr<DenseIntElementsAttr>:$members_index, + OptionalAttr<ArrayAttr>:$members_index, Variadic:$bounds, /* rank-0 to rank-{n-1} */ OptionalAttr:$map_type, OptionalAttr:$map_capture_type, diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index e1df647d6a3c71..8d31cda3a33ee9 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1395,16 +1395,15 @@ static void printMapClause(OpAsmPrinter &p, Operation *op, } static ParseResult parseMembersIndex(OpAsmParser &parser, - DenseIntElementsAttr &membersIdx) { - SmallVector<APInt> values; - int64_t value; - int64_t shape[2] = {0, 0}; - unsigned shapeTmp = 0; + ArrayAttr &membersIdx) { + SmallVector<Attribute> values, memberIdxs; + auto parseIndices = [&]() -> ParseResult { +int64_t value; if (parser.parseInteger(value)) return failure(); -shapeTmp++; -values.push_back(APInt(32, value, /*isSigned=*/true)); +values.push_back(IntegerAttr::get(parser.getBuilder().getIntegerType(64), + APInt(64, value, /*isSigned=*/false))); return success(); }; @@ -1418,52 +1417,29 @@ static ParseResult parseMembersIndex(OpAsmParser &parser, if (failed(parser.parseRSquare())) return failure(); -// Only set once, if any indices are not the same size -// we error out in the next check as that's unsupported -if
(shape[1] == 0) - shape[1] = shapeTmp; - -// Verify that the recently parsed list is equal to the -// first one we parsed, they must be equal lengths to -// keep the rectangular shape DenseIntElementsAttr -// requires -if (shapeTmp != shape[1]) - return failure(); - -shapeTmp = 0; -shape[0]++; +memberIdxs.push_back(ArrayAttr::get(parser.getContext(), values)); +values.clear(); } while (succeeded(parser.parseOptionalComma())); - if (!values.empty()) { -ShapedType valueType = -VectorType::get(shape, IntegerType::get(parser.getContext(), 32)); -membersIdx = DenseIntElementsAttr::get(valueType, values); - } + if (!memberIdxs.empty()) +membersIdx = ArrayAttr::get(parser.getContext(), memberIdxs); return success(); } static void printMembersIndex(OpAsmPrinter &p, MapInfoOp op, - DenseIntElementsAttr membersIdx) { - llvm::ArrayRef shape = membersIdx.getShapedType
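The attribute change above is easier to see with a toy model: member indices are paths into a record type, and paths can have different lengths, which the old rectangular DenseIntElementsAttr encoding could not represent. A standalone C++ sketch of the idea; the types are invented stand-ins, not the MLIR API:

#include <cassert>
#include <cstdint>
#include <vector>

// Toy model: a members_index entry is a path of member positions into a
// record type. The old rectangular encoding forced all paths to one length;
// the new ArrayAttr-of-ArrayAttr encoding is ragged.
using MemberPath = std::vector<int64_t>;

int main() {
  std::vector<MemberPath> MembersIndex = {
      {2},    // the record's third member
      {2, 0}, // the first member nested inside that member
  };
  assert(MembersIndex[0].size() != MembersIndex[1].size()); // ragged is fine
}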
[llvm-branch-commits] [clang] ab28646 - Revert "[webkit.UncountedLambdaCapturesChecker] Ignore trivial functions and …"
Author: Ryosuke Niwa Date: 2024-10-31T00:27:46-07:00 New Revision: ab286462f15736a6e86f0113eab473fb859744be URL: https://github.com/llvm/llvm-project/commit/ab286462f15736a6e86f0113eab473fb859744be DIFF: https://github.com/llvm/llvm-project/commit/ab286462f15736a6e86f0113eab473fb859744be.diff LOG: Revert "[webkit.UncountedLambdaCapturesChecker] Ignore trivial functions and …" This reverts commit 287781c7c9dbd7674cf7cbab8a8fe8a49a4b9317. Added: Modified: clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp clang/test/Analysis/Checkers/WebKit/mock-types.h clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp Removed: diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h index 814015c311d61e..4b41ca96e1df1d 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h @@ -63,10 +63,6 @@ std::optional isUncounted(const clang::CXXRecordDecl* Class); /// class, false if not, std::nullopt if inconclusive. std::optional isUncountedPtr(const clang::QualType T); -/// \returns true if \p T is either a raw pointer or reference to an uncounted -/// or unchecked class, false if not, std::nullopt if inconclusive. -std::optional isUnsafePtr(const QualType T); - /// \returns true if \p T is a RefPtr, Ref, CheckedPtr, CheckedRef, or its /// variant, false if not. bool isSafePtrType(const clang::QualType T); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp index d3484d74a2e3eb..998bd4ccee07db 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp @@ -6,7 +6,6 @@ // //===--===// -#include "ASTUtils.h" #include "DiagOutputUtils.h" #include "PtrTypesSemantics.h" #include "clang/AST/CXXInheritance.h" @@ -27,7 +26,6 @@ class UncountedLambdaCapturesChecker BugType Bug{this, "Lambda capture of uncounted variable", "WebKit coding guidelines"}; mutable BugReporter *BR = nullptr; - TrivialFunctionAnalysis TFA; public: void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR, @@ -39,8 +37,6 @@ class UncountedLambdaCapturesChecker // want to visit those, so we make our own RecursiveASTVisitor. struct LocalVisitor : public RecursiveASTVisitor { const UncountedLambdaCapturesChecker *Checker; - llvm::DenseSet DeclRefExprsToIgnore; - explicit LocalVisitor(const UncountedLambdaCapturesChecker *Checker) : Checker(Checker) { assert(Checker); @@ -49,100 +45,32 @@ class UncountedLambdaCapturesChecker bool shouldVisitTemplateInstantiations() const { return true; } bool shouldVisitImplicitCode() const { return false; } - bool VisitDeclRefExpr(DeclRefExpr *DRE) { -if (DeclRefExprsToIgnore.contains(DRE)) - return true; -auto *VD = dyn_cast_or_null(DRE->getDecl()); -if (!VD) - return true; -auto *Init = VD->getInit()->IgnoreParenCasts(); -auto *L = dyn_cast_or_null(Init); -if (!L) - return true; + bool VisitLambdaExpr(LambdaExpr *L) { Checker->visitLambdaExpr(L); return true; } - - // WTF::switchOn(T, F... f) is a variadic template function and couldn't - // be annotated with NOESCAPE. We hard code it here to workaround that. 
- bool shouldTreatAllArgAsNoEscape(FunctionDecl *Decl) { -auto *NsDecl = Decl->getParent(); -if (!NsDecl || !isa(NsDecl)) - return false; -return safeGetName(NsDecl) == "WTF" && safeGetName(Decl) == "switchOn"; - } - - bool VisitCallExpr(CallExpr *CE) { -checkCalleeLambda(CE); -if (auto *Callee = CE->getDirectCallee()) { - bool TreatAllArgsAsNoEscape = shouldTreatAllArgAsNoEscape(Callee); - unsigned ArgIndex = 0; - for (auto *Param : Callee->parameters()) { -if (ArgIndex >= CE->getNumArgs()) - break; -auto *Arg = CE->getArg(ArgIndex)->IgnoreParenCasts(); -if (!Param->hasAttr() && !TreatAllArgsAsNoEscape) { - if (auto *L = dyn_cast_or_null(Arg)) -Checker->visitLambdaExpr(L); -} -++ArgIndex; - } -} -return true; - } - - void checkCalleeLambda(CallExpr *CE) { -auto *Callee = CE->getCallee(); -if (!Callee) - return; -auto *DRE = dyn_cast(Callee->IgnoreParenCasts
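For readers unfamiliar with the checker being reverted, the pattern it targets looks roughly like the following standalone C++ sketch; Widget and scheduleLater are invented stand-ins for WebKit's ref-counted types and deferred-work APIs:

#include <functional>
#include <vector>

// Invented stand-ins: WebKit's real types are RefCounted/Ref/RefPtr.
struct Widget {
  int RefCount = 1;
  void ref() { ++RefCount; }
  void deref() {
    if (--RefCount == 0)
      delete this;
  }
};

std::vector<std::function<void()>> Queue; // stand-in for deferred work
void scheduleLater(std::function<void()> F) { Queue.push_back(std::move(F)); }

void example(Widget *W) {
  // This capture is what the checker warns about: W is a raw pointer to a
  // ref-counted object, and nothing guarantees it is still alive when the
  // lambda finally runs. The fix is to capture a Ref/RefPtr instead.
  scheduleLater([W] { W->ref(); W->deref(); });
}

int main() {
  auto *W = new Widget;
  example(W);
  for (auto &F : Queue)
    F();      // safe only because W happens to still be alive here
  W->deref(); // drops the last reference
}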
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From e6285ef8415e03337a080fa13456a2495023a8e6 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 287 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 929 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 15ccf1a38af9a5..19d8d466e3b12e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -36,6 +36,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -114,6 +191,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT:
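The "Predicate" wrapper described in the commit message can be modeled in a few lines of standalone C++. This is a simplified sketch over a plain bit-width, not the actual AMDGPURegBankLegalizeRules types:

#include <cassert>
#include <functional>

// Simplified sketch of the "Predicate" wrapper from the commit message,
// reduced to predicates over a plain bit-width.
struct Predicate {
  std::function<bool(unsigned)> Fn;

  // Note: overloaded &&/|| do not short-circuit, which is fine for
  // side-effect-free predicates like these.
  Predicate operator&&(const Predicate &RHS) const {
    auto L = Fn, R = RHS.Fn;
    return {[L, R](unsigned Size) { return L(Size) && R(Size); }};
  }
  Predicate operator||(const Predicate &RHS) const {
    auto L = Fn, R = RHS.Fn;
    return {[L, R](unsigned Size) { return L(Size) || R(Size); }};
  }
  Predicate operator!() const {
    auto L = Fn;
    return {[L](unsigned Size) { return !L(Size); }};
  }
};

int main() {
  Predicate IsB32{[](unsigned Size) { return Size == 32; }};
  Predicate IsB64{[](unsigned Size) { return Size == 64; }};
  Predicate IsB32OrB64 = IsB32 || IsB64; // rules read like this in the pass
  assert(IsB32OrB64.Fn(64));
  assert(!(IsB32 && IsB64).Fn(32)); // no single size is both 32 and 64 bits
}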
[llvm-branch-commits] [clang] [llvm] [LLVM] [Clang] Backport "Support for Gentoo `*t64` triples (64-bit time_t ABIs)" (PR #112364)
@@ -294,7 +294,11 @@ class Triple { PAuthTest, -LastEnvironmentType = PAuthTest +GNUT64, +GNUEABIT64, +GNUEABIHFT64, + +LastEnvironmentType = GNUEABIHFT64 tru wrote: Let's continue the discussion and the next steps on Discourse; I posted a new thread here: https://discourse.llvm.org/t/potential-abi-break-in-19-1-3/82865 https://github.com/llvm/llvm-project/pull/112364 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [LLVM] [Clang] Backport "Support for Gentoo `*t64` triples (64-bit time_t ABIs)" (PR #112364)
@@ -294,7 +294,11 @@ class Triple { PAuthTest, -LastEnvironmentType = PAuthTest +GNUT64, +GNUEABIT64, +GNUEABIHFT64, + +LastEnvironmentType = GNUEABIHFT64 tstellar wrote: > This patch doesn't break ABI/API compatibility. The Zig check is overly > restrictive and unnecessary. Zig should be fixed instead. It does technically break ABI compatibility, because it changes the value of an enum. This is something we usually try to avoid. https://github.com/llvm/llvm-project/pull/112364 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
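The point can be reproduced in miniature: appending enumerators moves any trailing sentinel, so a binary compiled against the old header disagrees with one compiled against the new header about the sentinel's value. The names below are simplified stand-ins for the Triple enumeration:

// Simplified stand-ins for the Triple::EnvironmentType discussion above.
enum OldEnv { OldPAuthTest, OldLastEnvironmentType = OldPAuthTest };
enum NewEnv { NewPAuthTest, NewGNUT64, NewLastEnvironmentType = NewGNUT64 };

// Existing enumerators keep their values, but the sentinel moves, so any
// ABI surface that exposed LastEnvironmentType (or range checks against it)
// observes a different value.
static_assert(OldLastEnvironmentType == 0, "old sentinel value");
static_assert(NewLastEnvironmentType == 1, "new sentinel value");

int main() {}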
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 6ec049db2a5572c4cb0514b9ca44c7ff215b461f Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change the existing code for G_PHI to match what the LLVM-IR version does via PHINode::hasConstantOrUndefValue. This is not safe for a regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves the number of values that can be allocated to the sgpr register bank in AMDGPURegBankSelect. The common case here is phis that appear in structurize-cfg lowering for cycles with multiple exits: the undef incoming value comes from the block that reached the cycle exit condition; if the other incoming value is uniform, keep the phi uniform despite the fact that it joins values from a pair of blocks entered via a divergent conditional branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes a PHI may appear with an undef operand, where getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do the equivalent of PHINode::hasConstantOrUndefValue().
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
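The new G_PHI handling follows the same rule as PHINode::hasConstantOrUndefValue(): ignore self-references and undef incoming values, then require the remaining incoming values to agree. A standalone C++ toy model of that rule; the types are invented, not the MachineIR API:

#include <cassert>
#include <optional>
#include <vector>

// Toy model of the check above: a phi is "constant or undef" if all incoming
// values that are neither the phi itself nor undef agree on a single value.
struct Incoming {
  int Value;    // stand-in for a Register
  bool IsUndef; // stand-in for a G_IMPLICIT_DEF / IMPLICIT_DEF def
};

bool hasConstantOrUndefValue(int PhiReg, const std::vector<Incoming> &Ins) {
  std::optional<int> Unique;
  for (const Incoming &In : Ins) {
    if (In.Value == PhiReg || In.IsUndef)
      continue; // self-references and undefs do not count
    if (Unique && *Unique != In.Value)
      return false; // two distinct real incoming values
    Unique = In.Value;
  }
  return true;
}

int main() {
  // phi(%x, undef) is still "constant": only one real incoming value.
  assert(hasConstantOrUndefValue(/*PhiReg=*/1, {{2, false}, {3, true}}));
  // phi(%x, %y) with distinct %x and %y is not.
  assert(!hasConstantOrUndefValue(/*PhiReg=*/1, {{2, false}, {4, false}}));
}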
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + + if (Op.getReg() == TRI.getExec()) { +return true; + } +} + +return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, +const RegisterBank *RB) { +Register Reg = DefOP.getReg(); +// Register that already has Register class got it during pre-inst selection +// of another instruction. Maybe cross bank copy was required so we insert a +// copy that can be removed later. This simplifies post regbanklegalize +// combiner and avoids need to special case some patterns. +if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); + + auto &MBB = *MI.getParent(); + B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator(; + B.buildCopy(Reg, NewReg); + + // The problem was discovered for uniform S1 that was used as both + // lane mask(vcc) and regular sgpr S1. + // - lane-mask(vcc) use was by si_if, this use is divergent and requires + // non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets + // sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. + // - the regular sgpr S1(uniform) instruction is now broken since + // it uses sreg_64_xexec(S1) which is divergent. + + // Replace virtual registers with register class on generic instructions + // uses with virtual registers with register bank. + for (auto &UseMI : MRI.use_instructions(Reg)) { +if (shouldRegBankSelect(UseMI)) { + for (MachineOperand &Op : UseMI.operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); petar-avramovic wrote: Yes, this was a bug. make_early_inc_range also works but it might be assuming how MRI keeps track of use instructions internally https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
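The underlying issue in this exchange is mutating a use-list while iterating it. A generic standalone C++ sketch of the collect-then-rewrite pattern; the types are invented, and MRI's real use-lists are intrusive lists rather than vectors:

#include <cassert>
#include <vector>

// Invented type for illustration.
struct Use { int Reg; };

// Collect matching uses first, then rewrite them, so the mutation cannot
// invalidate or re-visit the iteration (the alternative is an
// early-increment-style walk such as make_early_inc_range).
void replaceUses(std::vector<Use> &Uses, int OldReg, int NewReg) {
  std::vector<Use *> Worklist;
  for (Use &U : Uses)
    if (U.Reg == OldReg)
      Worklist.push_back(&U); // Uses is not resized, so pointers stay valid
  for (Use *U : Worklist)
    U->Reg = NewReg;
}

int main() {
  std::vector<Use> Uses = {{1}, {2}, {1}};
  replaceUses(Uses, /*OldReg=*/1, /*NewReg=*/7);
  assert(Uses[0].Reg == 7 && Uses[1].Reg == 2 && Uses[2].Reg == 7);
}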
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + + if (Op.getReg() == TRI.getExec()) { +return true; + } +} + +return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, +const RegisterBank *RB) { +Register Reg = DefOP.getReg(); +// Register that already has Register class got it during pre-inst selection +// of another instruction. Maybe cross bank copy was required so we insert a +// copy that can be removed later. This simplifies post regbanklegalize +// combiner and avoids need to special case some patterns. +if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); petar-avramovic wrote: Why? I intend for new regbankselect be simple and not use observers. https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] GlobalISel: Fix combine duplicating atomic loads (PR #111730)
arsenm wrote: ### Merge activity * **Oct 31, 10:38 AM EDT**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/111730). https://github.com/llvm/llvm-project/pull/111730 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,58 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static SmallVector ReservedHi = { arsenm wrote: This smells like an unrelated bug; this is not the kind of error I expected. https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From d0ec41bcb8f0594b86336e45028d490dd4ebf6c4 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 79 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 69 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 182f4fcc88a79a..a462e88a6e745d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -198,6 +198,17 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", + /*OnlyFirstRequired=*/true); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -768,22 +779,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -873,29 +868,47 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; + ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); +std::pair MaxWavesPerEURange{ +1U, InfoCache.getMaxWavesPerEU(*F)}; - ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); +// If the attribute exists, we will honor it if it is not the default. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + if (*Attr != MaxWavesPerEURange) { +TakeRange(*Attr); +return; + } } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. 
Since the +// calculation of waves per EU involves the flat work group size, we can't +// simply use an assumed flat work group size as a starting point, because the +// update of the flat work group size goes in the inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and the flat work group size, either +// from the attribute or the default, will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but that could still allow us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair<unsigned, unsigned> FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU
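Conceptually, honoring the attribute here is a range intersection, as in the ConstantRange logic above. A standalone C++ toy model; the helper is invented and the concrete bounds are assumed values, not the LLVM API:

#include <algorithm>
#include <cassert>

// Invented helper: honoring an explicit "amdgpu-waves-per-eu" attribute
// amounts to intersecting the requested range with the valid
// [1, MaxWavesPerEU] range, as the ConstantRange intersection above does.
struct Range { unsigned Lo, Hi; }; // inclusive bounds

Range intersect(Range A, Range B) {
  return {std::max(A.Lo, B.Lo), std::min(A.Hi, B.Hi)};
}

int main() {
  Range Valid{1, 10}; // 1 .. subtarget's max waves per EU (value assumed)
  Range Attr{4, 8};   // what the function attribute requested
  Range Eff = intersect(Valid, Attr);
  assert(Eff.Lo == 4 && Eff.Hi == 8); // the attribute is honored as-is
}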
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From b7f1c2bd5d33a060ab2a8ee942874d208d42cac9 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 77 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 66 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 642b278db70437..8b9e3f37dc507c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -198,6 +198,16 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu"); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -768,22 +778,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -868,29 +862,44 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { - - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; + +// If the attribute exists, simple honor it. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + TakeRange(*Attr); + return; } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. Since +// the calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a start point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. 
However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and the flat work group size, either +// from the attribute or the default, will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but that could still allow us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair<unsigned, unsigned> MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair<unsigned, unsigned> FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + FlatWorkGroupSize)); +} } ChangeStatus updateImpl(Attributor &A) override { @@ -939,8 +948,8 @@ struct AAAMDWavesPerEU :
[llvm-branch-commits] [llvm] [RISCV] Add initial support of memcmp expansion (PR #107548)
wangpc-pp wrote: Ping. Any comments on the current scalar part? I'm working on the vector expansion and will post it in a few days. https://github.com/llvm/llvm-project/pull/107548 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From 25b1ec0e80072c70628da9d72be8969fd6bb3d87 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 78 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 68 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 182f4fcc88a79a..99d5ca8403dc21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -198,6 +198,16 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu"); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -768,22 +778,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -873,29 +867,47 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; + ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); +std::pair MaxWavesPerEURange{ +1U, InfoCache.getMaxWavesPerEU(*F)}; - ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); +// If the attribute exists, we will honor it if it is not the default. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + if (*Attr != MaxWavesPerEURange) { +TakeRange(*Attr); +return; + } } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. 
Since +// the calculation of waves per EU involves the flat work group size, we can't +// simply use an assumed flat work group size as a starting point, because the +// update of the flat work group size goes in the inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and the flat work group size, either +// from the attribute or the default, will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but that could still allow us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair<unsigned, unsigned> FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, +
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From b48566210212165429e6a29665a4fefdf2695e61 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 78 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 68 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 182f4fcc88a79a..03a15639aa6bcb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -198,6 +198,16 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu"); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -768,22 +778,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -873,29 +867,47 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; + ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); +std::pair MaxWavesPerEURange{ +1U, InfoCache.getMaxWavesPerEU(*F)}; - ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); +// If the attribute exists, we will honor it if it is not the default. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + if (*Attr != MaxWavesPerEURange) { +TakeRange(*Attr); +return; + } } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. 
Since the +// calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a starting point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and flat work group size either +// from attribute or default will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but that could still allow us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + F
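The new `getWavesPerEUAttr` helper also normalizes the single-value spelling of the attribute. A small sketch of that corner (hypothetical function name, real attribute syntax): since `getIntegerPairAttribute` reports a missing second element as 0, a lone minimum is widened to the subtarget maximum.

define amdgpu_kernel void @only_min() #1 {
entry:
  ret void
}

; getWavesPerEUAttr sees the pair (2, 0) here and rewrites it to
; (2, ST.getMaxWavesPerEU()) before the attributor consumes it.
attributes #1 = { "amdgpu-waves-per-eu"="2" }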
[llvm-branch-commits] [llvm] 8b1b475 - Revert "[ConstantFold] Fold `tgamma` and `tgammaf` when the input parameter i…"
Author: c8ef Date: 2024-11-01T09:25:43+08:00 New Revision: 8b1b4753ac16cba5a153536171a243d76300e4bb URL: https://github.com/llvm/llvm-project/commit/8b1b4753ac16cba5a153536171a243d76300e4bb DIFF: https://github.com/llvm/llvm-project/commit/8b1b4753ac16cba5a153536171a243d76300e4bb.diff LOG: Revert "[ConstantFold] Fold `tgamma` and `tgammaf` when the input parameter i…" This reverts commit 1f07f995cc994dfb46b65fe97986efca15cf304b. Added: Modified: llvm/lib/Analysis/ConstantFolding.cpp Removed: llvm/test/Transforms/InstCombine/tgamma.ll diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index a96c3bebba790e..c5a2c2f52f8dc2 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -1699,9 +1698,9 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { Name == "sinh" || Name == "sinhf" || Name == "sqrt" || Name == "sqrtf"; case 't': -return Name == "tan" || Name == "tanf" || Name == "tanh" || - Name == "tanhf" || Name == "trunc" || Name == "truncf" || - Name == "tgamma" || Name == "tgammaf"; +return Name == "tan" || Name == "tanf" || + Name == "tanh" || Name == "tanhf" || + Name == "trunc" || Name == "truncf"; case '_': // Check for various function names that get used for the math functions // when the header files are preprocessed with the macro @@ -2418,14 +2417,6 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (TLI->has(Func)) return ConstantFoldFP(erf, APF, Ty); break; -case LibFunc_tgamma: -case LibFunc_tgammaf: - // NOTE: These boundaries are somewhat conservative. - if (TLI->has(Func) && - (Ty->isDoubleTy() && APF > APFloat(DBL_MIN) && APF < APFloat(171.0) || - Ty->isFloatTy() && APF > APFloat(FLT_MIN) && APF < APFloat(35.0f))) -return ConstantFoldFP(tgamma, APF, Ty); - break; case LibFunc_nearbyint: case LibFunc_nearbyintf: case LibFunc_rint: @@ -3638,10 +3629,6 @@ bool llvm::isMathLibCallNoop(const CallBase *Call, case LibFunc_sqrtf: return Op.isNaN() || Op.isZero() || !Op.isNegative(); - case LibFunc_tgamma: - case LibFunc_tgammaf: -return true; - // FIXME: Add more functions: sqrt_finite, atanh, expm1, log1p, // maybe others? 
default: diff --git a/llvm/test/Transforms/InstCombine/tgamma.ll b/llvm/test/Transforms/InstCombine/tgamma.ll deleted file mode 100644 index dd74617fee83e5..00 --- a/llvm/test/Transforms/InstCombine/tgamma.ll +++ /dev/null @@ -1,255 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -define float @tgammaf_in_range() { -; CHECK-LABEL: define float @tgammaf_in_range() { -; CHECK-NEXT:ret float 0x479A21628000 -; - %r = call float @tgammaf(float 34.0) - ret float %r -} - -define double @tgamma_in_range() { -; CHECK-LABEL: define double @tgamma_in_range() { -; CHECK-NEXT:ret double 0x605166C698CF183B -; - %r = call double @tgamma(double 100.0) - ret double %r -} - -define float @tgammaf_const_left_range() { -; CHECK-LABEL: define float @tgammaf_const_left_range() { -; CHECK-NEXT:[[R:%.*]] = call float @tgammaf(float 0x3810) -; CHECK-NEXT:ret float [[R]] -; - %r = call float @tgammaf(float 0x3810) - ret float %r -} - -define double @tgamma_const_left_range() { -; CHECK-LABEL: define double @tgamma_const_left_range() { -; CHECK-NEXT:[[R:%.*]] = call double @tgamma(double 0x10) -; CHECK-NEXT:ret double [[R]] -; - %r = call double @tgamma(double 0x0010) - ret double %r -} - -define float @tgammaf_const_right_range() { -; CHECK-LABEL: define float @tgammaf_const_right_range() { -; CHECK-NEXT:[[R:%.*]] = call float @tgammaf(float 3.60e+01) -; CHECK-NEXT:ret float [[R]] -; - %r = call float @tgammaf(float 36.0) - ret float %r -} - -define double @tgamma_const_right_range() { -; CHECK-LABEL: define double @tgamma_const_right_range() { -; CHECK-NEXT:[[R:%.*]] = call double @tgamma(double 1.72e+02) -; CHECK-NEXT:ret double [[R]] -; - %r = call double @tgamma(double 172.0) - ret double %r -} - -define float @tgammaf_minus_one() { -; CHECK-LABEL: define float @tgammaf_minus_one() { -; CHECK-NEXT:[[R:%.*]] = call float @tgammaf(float -1.00e+00) -; CHECK-NEXT:ret float [[R]] -; - %r = call float @tgammaf(float -1.00e+00) - ret float %r -} - -define double @tgamma_minus_one() { -; CHECK-LABEL: define double @tgamma_minus_one() { -; CHECK-NEXT:[[R:%.*]] = call double @tgamma(double -1.00e+00) -; CHE
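As an aside on the reverted bounds themselves: they track where tgamma overflows, since Γ(x+1) = x!. Γ(35) = 34! ≈ 2.95e38 still fits in a float (FLT_MAX ≈ 3.40e38) while Γ(36) = 35! ≈ 1.03e40 does not, and Γ(171) = 170! ≈ 7.26e306 fits in a double (DBL_MAX ≈ 1.80e308) while Γ(172) = 171! ≈ 1.24e309 does not; hence the `APF < APFloat(35.0f)` and `APF < APFloat(171.0)` guards, which the removed tests probed from both sides of each boundary.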
[llvm-branch-commits] [llvm] [RISCV] Add initial support of memcmp expansion (PR #107548)
@@ -1144,42 +2872,116 @@ entry: define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-LABEL: memcmp_size_4: ; CHECK-ALIGNED-RV32: # %bb.0: # %entry -; CHECK-ALIGNED-RV32-NEXT:addi sp, sp, -16 -; CHECK-ALIGNED-RV32-NEXT:sw ra, 12(sp) # 4-byte Folded Spill -; CHECK-ALIGNED-RV32-NEXT:li a2, 4 -; CHECK-ALIGNED-RV32-NEXT:call memcmp -; CHECK-ALIGNED-RV32-NEXT:lw ra, 12(sp) # 4-byte Folded Reload -; CHECK-ALIGNED-RV32-NEXT:addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT:lbu a2, 0(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a3, 1(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 3(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a0, 2(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a5, 0(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a6, 1(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a7, 3(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a1, 2(a1) +; CHECK-ALIGNED-RV32-NEXT:slli a0, a0, 8 +; CHECK-ALIGNED-RV32-NEXT:or a0, a0, a4 +; CHECK-ALIGNED-RV32-NEXT:slli a3, a3, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a2, a2, 24 +; CHECK-ALIGNED-RV32-NEXT:or a2, a2, a3 +; CHECK-ALIGNED-RV32-NEXT:or a0, a2, a0 +; CHECK-ALIGNED-RV32-NEXT:slli a1, a1, 8 +; CHECK-ALIGNED-RV32-NEXT:or a1, a1, a7 +; CHECK-ALIGNED-RV32-NEXT:slli a6, a6, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a5, a5, 24 +; CHECK-ALIGNED-RV32-NEXT:or a2, a5, a6 +; CHECK-ALIGNED-RV32-NEXT:or a1, a2, a1 +; CHECK-ALIGNED-RV32-NEXT:sltu a2, a1, a0 +; CHECK-ALIGNED-RV32-NEXT:sltu a0, a0, a1 +; CHECK-ALIGNED-RV32-NEXT:sub a0, a2, a0 ; CHECK-ALIGNED-RV32-NEXT:ret ; ; CHECK-ALIGNED-RV64-LABEL: memcmp_size_4: ; CHECK-ALIGNED-RV64: # %bb.0: # %entry -; CHECK-ALIGNED-RV64-NEXT:addi sp, sp, -16 -; CHECK-ALIGNED-RV64-NEXT:sd ra, 8(sp) # 8-byte Folded Spill -; CHECK-ALIGNED-RV64-NEXT:li a2, 4 -; CHECK-ALIGNED-RV64-NEXT:call memcmp -; CHECK-ALIGNED-RV64-NEXT:ld ra, 8(sp) # 8-byte Folded Reload -; CHECK-ALIGNED-RV64-NEXT:addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT:lbu a2, 0(a0) +; CHECK-ALIGNED-RV64-NEXT:lbu a3, 1(a0) +; CHECK-ALIGNED-RV64-NEXT:lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-NEXT:lb a0, 3(a0) +; CHECK-ALIGNED-RV64-NEXT:lbu a5, 0(a1) +; CHECK-ALIGNED-RV64-NEXT:lbu a6, 1(a1) +; CHECK-ALIGNED-RV64-NEXT:lbu a7, 2(a1) +; CHECK-ALIGNED-RV64-NEXT:lb a1, 3(a1) +; CHECK-ALIGNED-RV64-NEXT:andi a0, a0, 255 +; CHECK-ALIGNED-RV64-NEXT:slli a4, a4, 8 +; CHECK-ALIGNED-RV64-NEXT:or a0, a4, a0 +; CHECK-ALIGNED-RV64-NEXT:slli a3, a3, 16 +; CHECK-ALIGNED-RV64-NEXT:slliw a2, a2, 24 +; CHECK-ALIGNED-RV64-NEXT:or a2, a2, a3 +; CHECK-ALIGNED-RV64-NEXT:or a0, a2, a0 +; CHECK-ALIGNED-RV64-NEXT:andi a1, a1, 255 +; CHECK-ALIGNED-RV64-NEXT:slli a7, a7, 8 +; CHECK-ALIGNED-RV64-NEXT:or a1, a7, a1 +; CHECK-ALIGNED-RV64-NEXT:slli a6, a6, 16 +; CHECK-ALIGNED-RV64-NEXT:slliw a2, a5, 24 +; CHECK-ALIGNED-RV64-NEXT:or a2, a2, a6 +; CHECK-ALIGNED-RV64-NEXT:or a1, a2, a1 +; CHECK-ALIGNED-RV64-NEXT:sltu a2, a1, a0 +; CHECK-ALIGNED-RV64-NEXT:sltu a0, a0, a1 +; CHECK-ALIGNED-RV64-NEXT:sub a0, a2, a0 ; CHECK-ALIGNED-RV64-NEXT:ret ; ; CHECK-UNALIGNED-RV32-LABEL: memcmp_size_4: ; CHECK-UNALIGNED-RV32: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-NEXT:addi sp, sp, -16 -; CHECK-UNALIGNED-RV32-NEXT:sw ra, 12(sp) # 4-byte Folded Spill -; CHECK-UNALIGNED-RV32-NEXT:li a2, 4 -; CHECK-UNALIGNED-RV32-NEXT:call memcmp -; CHECK-UNALIGNED-RV32-NEXT:lw ra, 12(sp) # 4-byte Folded Reload -; CHECK-UNALIGNED-RV32-NEXT:addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-NEXT:lw a0, 0(a0) topperc wrote: Can we test the non-Zbb config on qemu and get the instruction count? 
https://github.com/llvm/llvm-project/pull/107548 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
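For readers who only see the assembly in the quoted diff: the test input is presumably a thin wrapper around a fixed-size libcall, along the lines of the sketch below (reconstructed from the test name and the old `li a2, 4` / `call memcmp` sequence, not copied from the patch; the length type may be iXLen in the actual file). The PR teaches the backend to expand such calls inline into the byte loads, shifts, and `sltu` compares shown in the new CHECK lines.

define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
entry:
  ; Fixed, small length: eligible for inline expansion instead of a libcall.
  %ret = call i32 @memcmp(ptr %s1, ptr %s2, i64 4)
  ret i32 %ret
}

declare i32 @memcmp(ptr, ptr, i64)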
[llvm-branch-commits] [llvm] 4b61314 - Revert "[InstCombine] Fix FMF propagation in `foldSelectIntoOp` (#114356)"
Author: gulfemsavrun Date: 2024-10-31T13:19:56-07:00 New Revision: 4b61314fc83dfc9a8ec29dcc4c9ccfb0057b990a URL: https://github.com/llvm/llvm-project/commit/4b61314fc83dfc9a8ec29dcc4c9ccfb0057b990a DIFF: https://github.com/llvm/llvm-project/commit/4b61314fc83dfc9a8ec29dcc4c9ccfb0057b990a.diff LOG: Revert "[InstCombine] Fix FMF propagation in `foldSelectIntoOp` (#114356)" This reverts commit cf1963afad335cf74a9411f106d1f2fe80dbed2f. Added: Modified: llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll Removed: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 9e193e7faa8ac3..c5f39a4c381ed1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -529,6 +529,9 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, if (!OpToFold) return nullptr; +// TODO: We probably ought to revisit cases where the select and FP +// instructions have different flags and add tests to ensure the +// behaviour is correct. FastMathFlags FMF; if (isa(&SI)) FMF = SI.getFastMathFlags(); @@ -561,8 +564,6 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel); BO->copyIRFlags(TVI); -if (isa(&SI)) - BO->andIRFlags(NewSel); return BO; }; diff --git a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll index caf38c676e20d7..1c28b151825c12 100644 --- a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll +++ b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll @@ -468,7 +468,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT:ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -482,7 +482,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul ninf nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT:ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -496,7 +496,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT:ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@
-510,7 +510,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[X]], [[SCALED_X]] ; CHECK-NEXT:ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -559,7 +559,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x, float % ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[X]], [[
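To make the flag question concrete, here is a hand-written sketch of the pattern the quoted tests exercise (names shortened; flags as in the CHECK lines, the rest reconstructed from the test names). `foldSelectIntoOp` folds `x == 0.0 ? x * y : x` into an fmul of `x` and a new select, and the two revisions disagree only on which fast-math flags the new fmul keeps.

; Input: x * y when x == 0.0, otherwise x
  %cmp = fcmp oeq float %x, 0.0
  %mul = fmul nsz float %x, %y
  %sel = select nnan i1 %cmp, float %mul, float %x

; Folded form: x * select(x == 0.0, y, 1.0). With only BO->copyIRFlags(TVI)
; (the reverted state), the new fmul keeps the original fmul's nsz even
; though the select never guaranteed it; with the removed
; BO->andIRFlags(NewSel), the flags are intersected ({nsz} with {nnan} is
; empty) and nsz is dropped, which is exactly the CHECK-line diff above.
  %sel2 = select nnan i1 %cmp, float %y, float 1.0
  %res = fmul nsz float %x, %sel2   ; reverted behavior; plain fmul with the fix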