[llvm-branch-commits] [clang] [flang] [lld] [llvm] [Flang] LLVM_ENABLE_RUNTIMES=FortranRuntime (PR #110217)
jplehr wrote: I tested this locally and it appears that it requires a more modern CMake version than the one installed (`3.22`). According to the LLVM docs (https://releases.llvm.org/12.0.0/docs/GettingStarted.html#id8), CMake 3.20 is currently the minimum required version. https://github.com/llvm/llvm-project/pull/110217 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] d0422f6 - Revert "Extend `getBackwardSlice` to track values captured from above (#113478)"
Author: Mehdi Amini Date: 2024-10-31T18:28:41+01:00 New Revision: d0422f6d72d84dbf78d248fa9665d2aaf50dd1fa URL: https://github.com/llvm/llvm-project/commit/d0422f6d72d84dbf78d248fa9665d2aaf50dd1fa DIFF: https://github.com/llvm/llvm-project/commit/d0422f6d72d84dbf78d248fa9665d2aaf50dd1fa.diff LOG: Revert "Extend `getBackwardSlice` to track values captured from above (#113478)" This reverts commit 1bc58a258e2edb6221009a26d0f0037eda6c7c47. Added: Modified: mlir/include/mlir/Analysis/SliceAnalysis.h mlir/lib/Analysis/SliceAnalysis.cpp mlir/test/IR/slice.mlir mlir/test/lib/IR/TestSlicing.cpp Removed: diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h index a4f5d937cd51da..99279fdfe427c8 100644 --- a/mlir/include/mlir/Analysis/SliceAnalysis.h +++ b/mlir/include/mlir/Analysis/SliceAnalysis.h @@ -47,11 +47,6 @@ struct BackwardSliceOptions : public SliceOptions { /// backward slice computation traverses block arguments and asserts that the /// parent op has a single region with a single block. bool omitBlockArguments = false; - - /// When omitUsesFromAbove is true, the backward slice computation omits - /// traversing values that are captured from above. - /// TODO: this should default to `false` after users have been updated. - bool omitUsesFromAbove = true; }; using ForwardSliceOptions = SliceOptions; diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp index 7ec999fa0370f9..2b1cf411cb 100644 --- a/mlir/lib/Analysis/SliceAnalysis.cpp +++ b/mlir/lib/Analysis/SliceAnalysis.cpp @@ -16,8 +16,6 @@ #include "mlir/IR/Operation.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Support/LLVM.h" -#include "mlir/Transforms/RegionUtils.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -93,13 +91,14 @@ static void getBackwardSliceImpl(Operation *op, if (options.filter && !options.filter(op)) return; - auto processValue = [&](Value value) { -if (auto *definingOp = value.getDefiningOp()) { + for (const auto &en : llvm::enumerate(op->getOperands())) { +auto operand = en.value(); +if (auto *definingOp = operand.getDefiningOp()) { if (backwardSlice->count(definingOp) == 0) getBackwardSliceImpl(definingOp, backwardSlice, options); -} else if (auto blockArg = dyn_cast(value)) { +} else if (auto blockArg = dyn_cast(operand)) { if (options.omitBlockArguments) -return; +continue; Block *block = blockArg.getOwner(); Operation *parentOp = block->getParentOp(); @@ -114,14 +113,7 @@ static void getBackwardSliceImpl(Operation *op, } else { llvm_unreachable("No definingOp and not a block argument."); } - }; - - if (!options.omitUsesFromAbove) { -visitUsedValuesDefinedAbove(op->getRegions(), [&](OpOperand *operand) { - processValue(operand->get()); -}); } - llvm::for_each(op->getOperands(), processValue); backwardSlice->insert(op); } diff --git a/mlir/test/IR/slice.mlir b/mlir/test/IR/slice.mlir index 87d446c8f415af..0a32a0f231baf2 100644 --- a/mlir/test/IR/slice.mlir +++ b/mlir/test/IR/slice.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -slice-analysis-test -split-input-file %s | FileCheck %s +// RUN: mlir-opt -slice-analysis-test %s | FileCheck %s func.func @slicing_linalg_op(%arg0 : index, %arg1 : index, %arg2 : index) { %a = memref.alloc(%arg0, %arg2) : memref @@ -33,29 +33,3 @@ func.func @slicing_linalg_op(%arg0 : index, %arg1 : index, %arg2 : index) { // CHECK-DAG: %[[B:.+]] = memref.alloc(%[[ARG2]], %[[ARG1]]) : memref // CHECK-DAG: %[[C:.+]] = memref.alloc(%[[ARG0]], 
%[[ARG1]]) : memref // CHECK: return - -// - - -#map = affine_map<(d0, d1) -> (d0, d1)> -func.func @slice_use_from_above(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>) { - %0 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<5x5xf32>) outs(%arg1 : tensor<5x5xf32>) { - ^bb0(%in: f32, %out: f32): -%2 = arith.addf %in, %in : f32 -linalg.yield %2 : f32 - } -> tensor<5x5xf32> - %collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<5x5xf32> into tensor<25xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<5x5xf32>) outs(%arg1 : tensor<5x5xf32>) { - ^bb0(%in: f32, %out: f32): -%c2 = arith.constant 2 : index -%extracted = tensor.extract %collapsed[%c2] : tensor<25xf32> -%2 = arith.addf %extracted, %extracted : f32 -linalg.yield %2 : f32 - } -> tensor<5x5xf32> - return -} - -// CHECK-LABEL: func @slice_use_from_above__backward_slice__0 -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor -// CHECK: %[[A:.+]] = linalg.generic {{.*}} ins(%[[ARG0]] -//
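For context, a minimal usage sketch of the backward slice API this revert touches (an editor illustration, not code from the commit; `collectBackwardSlice` and `root` are placeholder names, and the sketch assumes the post-revert interface, where `omitUsesFromAbove` no longer exists):

```cpp
#include "mlir/Analysis/SliceAnalysis.h"
#include "llvm/ADT/SetVector.h"

using namespace mlir;

void collectBackwardSlice(Operation *root) {
  BackwardSliceOptions options;
  options.omitBlockArguments = false; // traverse block arguments as well
  SetVector<Operation *> slice;
  getBackwardSlice(root, &slice, options);
  // `slice` now holds the transitive defs of root's operands; after this
  // revert, values captured from above are no longer tracked.
  for (Operation *op : slice)
    op->dump();
}
```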
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From 66264a1254c322fc0d3aa464125370886ad7da7c Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 80 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 69 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 205673cdcc0e23..9a165d9be529e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -785,22 +798,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -885,29 +882,44 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { - - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; + +// If the attribute exists, simple honor it. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + TakeRange(*Attr); + return; } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. 
Since +// the calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a start point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and flat work group size either +// from attribute or default will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but it still allows us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + FlatWorkGroupSize)
[llvm-branch-commits] [llvm] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
@@ -404,21 +404,20 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { 
"amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-ld
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian edited https://github.com/llvm/llvm-project/pull/114438 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,58 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static SmallVector ReservedHi = { sdesmalen-arm wrote: Without marking the registers as reserved, take the example below: ``` --- name:sv2i64 tracksRegLiveness: true body: | bb.0.entry: liveins: $q0, $q1 %0:fpr128 = COPY $q0 %1:fpr128 = COPY $q1 %35:gpr64 = COPY %0.dsub %36:gpr64 = COPY %1.dsub %9:gpr64 = SDIVXr %35, %36 %37:gpr64 = UMOVvi64 %0, 1 %38:gpr64 = UMOVvi64 %1, 1 %10:gpr64 = SDIVXr %37, %38 %19:fpr128 = INSvi64gpr undef %19, 0, %9 %19:fpr128 = INSvi64gpr %19, 1, %10 %39:gpr64 = COPY %19.dsub %24:gpr64 = MADDXrrr %39, %36, $xzr %41:gpr64 = UMOVvi64 %19, 1 %25:gpr64 = MADDXrrr %41, %38, $xzr %34:fpr128 = INSvi64gpr undef %34, 0, %24 %34:fpr128 = INSvi64gpr %34, 1, %25 %2:fpr128 = SUBv2i64 %0, %34 $q0 = COPY %2 RET_ReallyLR implicit $q0 ... ``` When I run this with: ``` llc -global-isel -verify-machineinstrs -run-pass=machine-scheduler ``` it fails with: ``` Use of $xzr does not have a corresponding definition on every path: 216r %10:gpr64 = MADDXrrr %9:gpr64, %3:gpr64, $xzr LLVM ERROR: Use not jointly dominated by defs. PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. Stack dump: 0. Program arguments: ./bin/llc -global-isel -verify-machineinstrs -run-pass=machine-scheduler /tmp/t.mir -o - 1. Running pass 'Function Pass Manager' on module '/tmp/t.mir'. 2. Running pass 'Machine Instruction Scheduler' on function '@sv2i64' ... #8 0x80062b7c llvm::LiveRangeCalc::findReachingDefs(llvm::LiveRange&, llvm::MachineBasicBlock&, llvm::SlotIndex, unsigned int, llvm::ArrayRef) #9 0x80063e94 llvm::LiveRangeCalc::extend(llvm::LiveRange&, llvm::SlotIndex, unsigned int, llvm::ArrayRef) #10 0x80064a18 llvm::LiveIntervalCalc::extendToUses(llvm::LiveRange&, llvm::Register, llvm::LaneBitmask, llvm::LiveInterval*) #11 0x8003e82c llvm::LiveIntervals::computeRegUnitRange(llvm::LiveRange&, unsigned int) #12 0x80044cdc llvm::LiveIntervals::HMEditor::updateAllRanges(llvm::MachineInstr*) #13 0x8004848c llvm::LiveIntervals::handleMove(llvm::MachineInstr&, bool) #14 0x801f44ec llvm::ScheduleDAGMI::moveInstruction(llvm::MachineInstr*, llvm::MachineInstrBundleIterator) #15 0x801fdb58 llvm::ScheduleDAGMILive::scheduleMI(llvm::SUnit*, bool) #16 0x8020b214 llvm::ScheduleDAGMILive::schedule() #17 0x801f0934 (anonymous namespace)::MachineSchedulerBase::scheduleRegions(llvm::ScheduleDAGInstrs&, bool) (.isra.0) MachineScheduler.cpp:0:0 ``` https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/114438 None >From 687d29af2f79b07cdc8b8b0044a8c1f828745cfd Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 64 +-- .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 +++--- 2 files changed, 69 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 205673cdcc0e23..ed7cd1f53b41e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -885,29 +898,44 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { - - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; + +// If the attribute exists, simple honor it. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + TakeRange(*Attr); + return; } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. Since +// the calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a start point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and flat work group size either +// from attribute or default will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// no means, but that could still allow us to propagate it. 
+if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + FlatWorkGroupSize)); +} } ChangeStatus updateImpl(Attributor &A) override { @@ -956,8 +984,8 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { ChangeStatus manifest(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -unsigned Max = InfoCache.getMaxWavesPerEU(*F); -return emitAttributeIfNotDefault(A, 1, Max); +return emitAttributeIfNotDefaultAfterClamp( +A, {1, InfoCache.getMaxWavesPerEU(*F)}); } /// See AbstractAttribute::getName() diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 1272bf655e309d..e28bccfb99343b 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -117,7 +117,7 @@ define amdgpu_kernel void @kernel_2_9() #6 { define amdgpu_kernel void @kernel_9_9()
[llvm-branch-commits] [llvm] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) Changes --- Patch is 25.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114438.diff 2 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp (+46-18) - (modified) llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll (+23-24) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 205673cdcc0e23..ed7cd1f53b41e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -885,29 +898,44 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { - - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; + +// If the attribute exists, simple honor it. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + TakeRange(*Attr); + return; } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. Since +// the calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a start point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and flat work group size either +// from attribute or default will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// no means, but that could still allow us to propagate it. 
+if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + FlatWorkGroupSize)); +} } ChangeStatus updateImpl(Attributor &A) override { @@ -956,8 +984,8 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { ChangeStatus manifest(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -unsigned Max = InfoCache.getMaxWavesPerEU(*F); -return emitAttributeIfNotDefault(A, 1, Max); +return emitAttributeIfNotDefaultAfterClamp( +A, {1, InfoCache.getMaxWavesPerEU(*F)}); } /// See AbstractAttribute::getName() diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 1272bf655e309d..e28bccfb99343b 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -117,7 +117,7 @@ define amdgpu_kernel void @kernel_2_9() #6 { define amdgpu_kernel void @kernel_9_9() #7 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_9_9 -; CHECK-SAME: () #[[ATTR6]] { +; CHECK-SAME: ()
[llvm-branch-commits] [llvm] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
shiltian wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/114438). Learn more: https://graphite.dev/docs/merge-pull-requests * **#114438** 👈 (this PR) * **#114357** * `main` This stack of pull requests is managed by Graphite. https://github.com/llvm/llvm-project/pull/114438 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AtomicExpand: Copy metadata from atomicrmw to cmpxchg (PR #109409)
arsenm wrote: ### Merge activity * **Oct 31, 2:43 PM EDT**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/109409). https://github.com/llvm/llvm-project/pull/109409 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
@@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); shiltian wrote: In the parent PR https://github.com/llvm/llvm-project/pull/114438 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
@@ -215,6 +215,19 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +Attribute Attr = F.getFnAttribute("amdgpu-waves-per-eu"); +if (!Attr.isStringAttribute()) + return std::nullopt; +auto Val = parseRangeAttribute(Attr.getValueAsString()); arsenm wrote: I don't see where this is defined https://github.com/llvm/llvm-project/pull/114438 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
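`parseRangeAttribute` is added in the parent PR (#114357) and is not visible in this thread. Purely as a reading aid, here is a hypothetical sketch of the contract the caller above appears to rely on; the name, signature, and behavior are assumptions, not the actual definition:

```cpp
// Hypothetical sketch only: parse a "min,max" string such as
// "amdgpu-waves-per-eu"="2,4" into a pair, leaving a missing maximum as 0
// so the caller can substitute ST.getMaxWavesPerEU().
#include <optional>
#include <utility>
#include "llvm/ADT/StringRef.h"

static std::optional<std::pair<unsigned, unsigned>>
parseRangeAttributeSketch(llvm::StringRef Str) {
  auto [MinStr, MaxStr] = Str.split(',');
  unsigned Min = 0, Max = 0;
  if (MinStr.trim().getAsInteger(0, Min))
    return std::nullopt; // malformed minimum
  if (!MaxStr.empty() && MaxStr.trim().getAsInteger(0, Max))
    return std::nullopt; // malformed maximum; an absent maximum stays 0
  return std::make_pair(Min, Max);
}
```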
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,58 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static SmallVector ReservedHi = { arsenm wrote: What kind of failures? AMDGPU also has synthetic 16-bit high sub registers and they are not explicitly reserved. Are you adding these to an allocatable class? https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From a269e8da1b872b3f2390037a594757940cf8369b Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change the existing code for G_PHI to match what the LLVM-IR version does via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably, this improves the number of values that can be allocated to the sgpr register bank in AMDGPURegBankSelect. A common case here is phis that appear in structurize-cfg lowering for cycles with multiple exits: an undef incoming value comes from the block that reached the cycle exit condition; if the other incoming value is uniform, keep the phi uniform despite the fact that it joins values from a pair of blocks entered via a divergent conditional branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
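For reference, the IR-level predicate that the new G_PHI path mirrors is `PHINode::hasConstantOrUndefValue()`. A paraphrased sketch of its behavior, written as a free function for illustration (the real method lives on `llvm::PHINode` and may differ in detail):

```cpp
// Paraphrase of PHINode::hasConstantOrUndefValue(): every incoming value
// that is neither the phi itself nor undef must be one and the same value.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool hasConstantOrUndefValueSketch(const PHINode &Phi) {
  const Value *CommonValue = nullptr;
  for (const Value *Incoming : Phi.incoming_values()) {
    if (Incoming == &Phi || isa<UndefValue>(Incoming))
      continue; // self-references and undefs never disqualify the phi
    if (CommonValue && CommonValue != Incoming)
      return false; // two distinct non-undef incoming values
    CommonValue = Incoming;
  }
  return true;
}
```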
[llvm-branch-commits] [llvm] AMDGPU: Custom expand flat cmpxchg which may access private (PR #109410)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/109410 >From 834ff3b40bd82cb54bb33532a54ad36870ea2b24 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 14 Aug 2024 13:57:14 +0400 Subject: [PATCH 1/2] AMDGPU: Custom expand flat cmpxchg which may access private 64-bit flat cmpxchg instructions do not work correctly for scratch addresses, and need to be expanded as non-atomic. Allow custom expansion of cmpxchg in AtomicExpand, as is already the case for atomicrmw. --- llvm/include/llvm/CodeGen/TargetLowering.h|5 + .../llvm/Transforms/Utils/LowerAtomic.h |7 + llvm/lib/CodeGen/AtomicExpandPass.cpp |4 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 146 ++- llvm/lib/Target/AMDGPU/SIISelLowering.h |3 + llvm/lib/Transforms/Utils/LowerAtomic.cpp | 21 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 1019 +++-- ...expand-atomicrmw-flat-noalias-addrspace.ll |6 +- ...expand-atomicrmw-integer-ops-0-to-add-0.ll |6 +- .../expand-cmpxchg-flat-maybe-private.ll | 104 +- 10 files changed, 1157 insertions(+), 164 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 8e0cdc6f1a5e77..e0b638201a0474 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2204,6 +2204,11 @@ class TargetLoweringBase { "Generic atomicrmw expansion unimplemented on this target"); } + /// Perform a cmpxchg expansion using a target-specific method. + virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const { +llvm_unreachable("Generic cmpxchg expansion unimplemented on this target"); + } + /// Perform a bit test atomicrmw using a target-specific intrinsic. This /// represents the combined bit test intrinsic which will be lowered at a late /// stage by the backend. diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h index b25b281667f9cb..295c2bd2b4b47e 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h +++ b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h @@ -23,6 +23,13 @@ class IRBuilderBase; /// Convert the given Cmpxchg into primitive load and compare. bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI); +/// Emit IR to implement the given cmpxchg operation on values in registers, +/// returning the new value. +std::pair buildAtomicCmpXchgValue(IRBuilderBase &Builder, +Value *Ptr, Value *Cmp, +Value *Val, +Align Alignment); + /// Convert the given RMWI into primitive load and stores, /// assuming that doing so is legal. Return true if the lowering /// succeeds. 
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 0aff4f1f5cf1cb..1471e3d7cbc29d 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1674,6 +1674,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return true; case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicCmpXchgInst(CI); + case TargetLoweringBase::AtomicExpansionKind::Expand: { +TLI->emitExpandAtomicCmpXchg(CI); +return true; + } } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d66610ae0a160d..c8a46875bda408 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16577,9 +16577,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { - return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS - ? AtomicExpansionKind::NotAtomic - : AtomicExpansionKind::None; + unsigned AddrSpace = CmpX->getPointerAddressSpace(); + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) +return AtomicExpansionKind::NotAtomic; + + if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX)) +return AtomicExpansionKind::None; + + const DataLayout &DL = CmpX->getDataLayout(); + + Type *ValTy = CmpX->getNewValOperand()->getType(); + + // If a 64-bit flat atomic may alias private, we need to avoid using the + // atomic in the private case. + return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand + : AtomicExpansionKind::None; } const TargetRegisterClass * @@ -16745,40 +16757,8 @@ bool SITargetLowering::checkForPhysRegDependency( return false; } -void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { - AtomicRMWInst::BinOp Op = AI->getOperation(); - - if (Op == AtomicRMWInst::Sub || Op ==
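As a usage note on the hooks quoted above: a target opts in by returning `AtomicExpansionKind::Expand` from `shouldExpandAtomicCmpXchgInIR()` and overriding the new `emitExpandAtomicCmpXchg()`. A hypothetical sketch follows; `MyTargetLowering` and its predicate are assumptions for illustration, not code from this patch:

```cpp
// Sketch: opting a target into the custom cmpxchg expansion added here.
// Returning Expand makes AtomicExpandPass call emitExpandAtomicCmpXchg().
TargetLowering::AtomicExpansionKind
MyTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const {
  if (cmpxchgNeedsCustomLowering(CI)) // hypothetical target predicate
    return AtomicExpansionKind::Expand;
  return AtomicExpansionKind::None;
}

void MyTargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
  // Rewrite the cmpxchg in place, e.g. branch on the address space and use
  // buildAtomicCmpXchgValue() from LowerAtomic.h for the non-atomic path,
  // then erase CI once all uses are replaced.
}
```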
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,57 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static MCPhysReg ReservedHi[] = { arsenm wrote: missing const https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] Revert "[Flang][OpenMP] Disable lowering of omp.simd reductions in co… (PR #113683)
https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/113683 >From cf57ecd0984bcb2335e8ecfaffdca600a5c7cf1b Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 25 Oct 2024 12:07:22 +0100 Subject: [PATCH] Revert "[Flang][OpenMP] Disable lowering of omp.simd reductions in composites (#112686)" Lowering of reductions in composite operations can now be re-enabled, since previous commits in this PR stack fix the MLIR representation produced and it no longer triggers a compiler crash during translation to LLVM IR. This reverts commit c44860c8d2582abd88794267b4fa0fa953bbef80. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 20 ++-- flang/test/Lower/OpenMP/wsloop-simd.f90 | 21 + 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 329cbf3d7539f5..4f9e2347308aa1 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2246,12 +2246,6 @@ static void genCompositeDistributeParallelDoSimd( genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms); - // TODO: Remove this after omp.simd reductions on composite constructs are - // supported. - simdClauseOps.reductionVars.clear(); - simdClauseOps.reductionByref.clear(); - simdClauseOps.reductionSyms.clear(); - mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc, @@ -2273,7 +2267,9 @@ static void genCompositeDistributeParallelDoSimd( wsloopOp.setComposite(/*val=*/true); EntryBlockArgs simdArgs; - // TODO: Add private and reduction syms and vars. + // TODO: Add private syms and vars. + simdArgs.reduction.syms = simdReductionSyms; + simdArgs.reduction.vars = simdClauseOps.reductionVars; auto simdOp = genWrapperOp(converter, loc, simdClauseOps, simdArgs); simdOp.setComposite(/*val=*/true); @@ -2366,12 +2362,6 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms); - // TODO: Remove this after omp.simd reductions on composite constructs are - // supported. - simdClauseOps.reductionVars.clear(); - simdClauseOps.reductionByref.clear(); - simdClauseOps.reductionSyms.clear(); - // TODO: Support delayed privatization. DataSharingProcessor dsp(converter, semaCtx, simdItem->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, @@ -2395,7 +2385,9 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, wsloopOp.setComposite(/*val=*/true); EntryBlockArgs simdArgs; - // TODO: Add private and reduction syms and vars. + // TODO: Add private syms and vars. + simdArgs.reduction.syms = simdReductionSyms; + simdArgs.reduction.vars = simdClauseOps.reductionVars; auto simdOp = genWrapperOp(converter, loc, simdClauseOps, simdArgs); simdOp.setComposite(/*val=*/true); diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90 index 899ab59714f144..49a9a523e11fe7 100644 --- a/flang/test/Lower/OpenMP/wsloop-simd.f90 +++ b/flang/test/Lower/OpenMP/wsloop-simd.f90 @@ -45,3 +45,24 @@ subroutine do_simd_simdlen() end do !$omp end do simd end subroutine do_simd_simdlen + +! CHECK-LABEL: func.func @_QPdo_simd_reduction( +subroutine do_simd_reduction() + integer :: sum + sum = 0 + ! CHECK: omp.wsloop + ! CHECK-SAME: reduction(@[[RED_SYM:.*]] %{{.*}} -> %[[RED_OUTER:.*]] : !fir.ref) + ! CHECK-NEXT: omp.simd + ! 
CHECK-SAME: reduction(@[[RED_SYM]] %[[RED_OUTER]] -> %[[RED_INNER:.*]] : !fir.ref) + ! CHECK-NEXT: omp.loop_nest + ! CHECK: %[[RED_DECL:.*]]:2 = hlfir.declare %[[RED_INNER]] + ! CHECK: %[[RED:.*]] = fir.load %[[RED_DECL]]#0 : !fir.ref + ! CHECK: %[[RESULT:.*]] = arith.addi %[[RED]], %{{.*}} : i32 + ! CHECK: hlfir.assign %[[RESULT]] to %[[RED_DECL]]#0 : i32, !fir.ref + ! CHECK-NEXT: omp.yield + !$omp do simd reduction(+:sum) +do index_ = 1, 10 + sum = sum + 1 +end do + !$omp end do simd +end subroutine do_simd_reduction ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [TableGen] Fix calculation of Lanemask for RCs with artificial subregs. (PR #114392)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/114392 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [OpenMP][MLIR] Descriptor explicit member map lowering changes (PR #113556)
https://github.com/skatrak approved this pull request. Ah, I see. Then this LGTM, thanks for explaining! https://github.com/llvm/llvm-project/pull/113556 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Propagate amdgpu-max-num-workgroups attribute (PR #113018)
@@ -821,6 +826,152 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP, "AAAMDFlatWorkGroupSize is only valid for function position"); } +struct TupleDecIntegerRangeState : public AbstractState { + DecIntegerState X, Y, Z; + + bool isValidState() const override { +return X.isValidState() && Y.isValidState() && Z.isValidState(); + } + + bool isAtFixpoint() const override { +return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint(); + } + + ChangeStatus indicateOptimisticFixpoint() override { +return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() | + Z.indicateOptimisticFixpoint(); + } + + ChangeStatus indicatePessimisticFixpoint() override { +return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() | + Z.indicatePessimisticFixpoint(); + } + + TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) { +X ^= Other.X; +Y ^= Other.Y; +Z ^= Other.Z; +return *this; + } + + bool operator==(const TupleDecIntegerRangeState &Other) const { +return X == Other.X && Y == Other.Y && Z == Other.Z; + } + + TupleDecIntegerRangeState &getAssumed() { return *this; } + const TupleDecIntegerRangeState &getAssumed() const { return *this; } +}; + +using AAAMDMaxNumWorkgroupsState = +StateWrapper; + +/// Propagate amdgpu-max-num-workgroups attribute. +struct AAAMDMaxNumWorkgroups +: public StateWrapper { + using Base = StateWrapper; + + AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + void initialize(Attributor &A) override { +Function *F = getAssociatedFunction(); +auto &InfoCache = static_cast(A.getInfoCache()); + +SmallVector MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F); + +// FIXME: What is the interpretation of 0? +for (unsigned &Entry : MaxNumWorkgroups) { + if (Entry == 0) +Entry = std::numeric_limits::max(); +} + +X.takeKnownMinimum(MaxNumWorkgroups[0]); +Y.takeKnownMinimum(MaxNumWorkgroups[1]); +Z.takeKnownMinimum(MaxNumWorkgroups[2]); + +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); + } + + ChangeStatus updateImpl(Attributor &A) override { +ChangeStatus Change = ChangeStatus::UNCHANGED; + +auto CheckCallSite = [&](AbstractCallSite CS) { + Function *Caller = CS.getInstruction()->getFunction(); + LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName() +<< "->" << getAssociatedFunction()->getName() << '\n'); + + const auto *CallerInfo = A.getAAFor( + *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); + if (!CallerInfo || !CallerInfo->isValidState()) +return false; + + Change |= + clampStateAndIndicateChange(this->getState(), CallerInfo->getState()); + return true; +}; + +bool AllCallSitesKnown = true; +if (!A.checkForAllCallSites(CheckCallSite, *this, +/*RequireAllCallSites=*/true, +AllCallSitesKnown)) + return indicatePessimisticFixpoint(); + +return Change; + } + + /// Create an abstract attribute view for the position \p IRP. + static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP, + Attributor &A); + + ChangeStatus manifest(Attributor &A) override { +Function *F = getAssociatedFunction(); +// TODO: Skip adding if worst case? arsenm wrote: Yes, uint32_max x 3 https://github.com/llvm/llvm-project/pull/113018 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + + if (Op.getReg() == TRI.getExec()) { +return true; + } +} + +return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, +const RegisterBank *RB) { +Register Reg = DefOP.getReg(); +// Register that already has Register class got it during pre-inst selection +// of another instruction. Maybe cross bank copy was required so we insert a +// copy that can be removed later. This simplifies post regbanklegalize +// combiner and avoids need to special case some patterns. +if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); + + auto &MBB = *MI.getParent(); + B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator(; + B.buildCopy(Reg, NewReg); + + // The problem was discovered for uniform S1 that was used as both + // lane mask(vcc) and regular sgpr S1. + // - lane-mask(vcc) use was by si_if, this use is divergent and requires + // non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets + // sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. + // - the regular sgpr S1(uniform) instruction is now broken since + // it uses sreg_64_xexec(S1) which is divergent. + + // Replace virtual registers with register class on generic instructions + // uses with virtual registers with register bank. + for (auto &UseMI : MRI.use_instructions(Reg)) { +if (shouldRegBankSelect(UseMI)) { + for (MachineOperand &Op : UseMI.operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + } + +} else { + MRI.setRegBank(Reg, *RB); +} + } + + std::optional tryGetVReg(MachineOperand &Op) { +if (!Op.isReg()) + return std::nullopt; + +Register Reg = Op.getReg(); +if (!Reg.isVirtual()) + return std::nullopt; + +return Reg; + } + + void assignBanksOnDefs(MachineInstr &MI) { +if (!shouldRegBankSelect(MI)) + return; + +for (MachineOperand &DefOP : MI.defs()) { + auto MaybeDefReg = tryGetVReg(DefOP); + if (!MaybeDefReg) +continue; + Register DefReg = *MaybeDefReg; + + // Copies can have register class on def registers. 
+ if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) { +continue; + } + + if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) { +setRBDef(MI, DefOP, SgprRB); + } else { +if (MRI.getType(DefReg) == LLT::scalar(1)) + setRBDef(MI, DefOP, VccRB); +else + setRBDef(MI, DefOP, VgprRB); + } +} + } + + void constrainRBUse(MachineInstr &MI, MachineOperand &UseOP, + const RegisterBank *RB) { +Register Reg = UseOP.getReg(); + +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({RB, Ty}); +UseOP.setReg(NewReg); + +if (MI.isPHI()) { + auto DefMI = MRI.getVRegDef(Reg)->getIterator(); + MachineBasicBlock *DefMBB = DefMI->getParent(); + B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); +} else { + B.setInstr(MI); +} + +B.buildCopy(NewReg, Reg); + } + + void constrainBanksOnUses(MachineInstr &MI) { +if (!shouldRegBankSelect(
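For readers following the patch, the def-side policy above boils down to a small decision table: uniform defs (and S32/S64 lane masks) go to the sgpr bank, divergent 1-bit values go to vcc, and everything else divergent goes to vgpr. A minimal standalone C++ sketch of that decision; all types and names here are invented stand-ins for illustration, not the LLVM API:

#include <cassert>
#include <cstdio>

// Invented stand-ins for illustration only; not the LLVM API.
enum class Bank { Sgpr, Vgpr, Vcc };

struct DefInfo {
  bool IsUniform;        // result of machine uniformity analysis
  bool IsS32S64LaneMask; // result of IntrinsicLaneMaskAnalyzer
  unsigned SizeInBits;
};

// Mirrors the branch structure of assignBanksOnDefs() above (simplified).
Bank pickBank(const DefInfo &D) {
  if (D.IsUniform || D.IsS32S64LaneMask)
    return Bank::Sgpr;
  return D.SizeInBits == 1 ? Bank::Vcc : Bank::Vgpr; // divergent s1 -> vcc
}

int main() {
  assert(pickBank({true, false, 32}) == Bank::Sgpr);  // uniform value
  assert(pickBank({false, false, 1}) == Bank::Vcc);   // divergent lane mask
  assert(pickBank({false, false, 32}) == Bank::Vgpr); // divergent value
  std::puts("ok");
}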
[llvm-branch-commits] [flang] Revert "[Flang][OpenMP] Disable lowering of omp.simd reductions in co… (PR #113683)
https://github.com/NimishMishra approved this pull request. This looks okay to me, given the PR stack. There is still https://github.com/llvm/llvm-project/pull/113682 pending a merge; I'll take a look at that PR tomorrow. Thanks for the work on this. https://github.com/llvm/llvm-project/pull/113683 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,58 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static SmallVector ReservedHi = { arsenm wrote: But what are the actual failures (messages, location)? If the high half of the register isn't allocatable / addressable in the first place, it shouldn't just appear and cause issues. https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [RISCV] Add initial support of memcmp expansion (PR #107548)
https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/107548 >From f21cfcfc90330ee3856746b6315a81a00313b0e0 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Fri, 6 Sep 2024 17:20:51 +0800 Subject: [PATCH 1/5] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.6-beta.1 --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 15 + .../Target/RISCV/RISCVTargetTransformInfo.h | 3 + llvm/test/CodeGen/RISCV/memcmp.ll | 932 ++ 3 files changed, 950 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/memcmp.ll diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index e809e15eacf696..ad532aadc83266 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2113,3 +2113,18 @@ bool RISCVTTIImpl::shouldConsiderAddressTypePromotion( } return Considerable; } + +RISCVTTIImpl::TTI::MemCmpExpansionOptions +RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + // FIXME: Vectors haven't been tested. + Options.AllowOverlappingLoads = ST->enableUnalignedScalarMem(); + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = Options.MaxNumLoads; + if (ST->is64Bit()) +Options.LoadSizes.push_back(8); + llvm::append_range(Options.LoadSizes, ArrayRef({4, 2, 1})); + Options.AllowedTailExpansions = {3, 5, 6}; + return Options; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 763b89bfec0a66..ee9bed09df97f3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -404,6 +404,9 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> { shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); std::optional<unsigned> getMinPageSize() const { return 4096; } + + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, +bool IsZeroCmp) const; }; } // end namespace llvm diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll new file mode 100644 index 00..652cd02e2c750a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -0,0 +1,932 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -O2 | FileCheck %s --check-prefix=CHECK-ALIGNED-RV32 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -O2 | FileCheck %s --check-prefix=CHECK-ALIGNED-RV64 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+unaligned-scalar-mem -O2 \ +; RUN: | FileCheck %s --check-prefix=CHECK-UNALIGNED-RV32 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+unaligned-scalar-mem -O2 \ +; RUN: | FileCheck %s --check-prefix=CHECK-UNALIGNED-RV64 + +declare i32 @bcmp(i8*, i8*, iXLen) nounwind readonly +declare i32 @memcmp(i8*, i8*, iXLen) nounwind readonly + +define i1 @bcmp_size_15(i8* %s1, i8* %s2) { +; CHECK-ALIGNED-RV32-LABEL: bcmp_size_15: +; CHECK-ALIGNED-RV32: # %bb.0: # %entry +; CHECK-ALIGNED-RV32-NEXT:lbu a2, 1(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a5, 3(a0) +; CHECK-ALIGNED-RV32-NEXT:slli a2, a2, 8 +; CHECK-ALIGNED-RV32-NEXT:or
a2, a2, a3 +; CHECK-ALIGNED-RV32-NEXT:slli a4, a4, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a5, a5, 24 +; CHECK-ALIGNED-RV32-NEXT:or a4, a5, a4 +; CHECK-ALIGNED-RV32-NEXT:or a2, a4, a2 +; CHECK-ALIGNED-RV32-NEXT:lbu a3, 1(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 0(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a5, 2(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a6, 3(a1) +; CHECK-ALIGNED-RV32-NEXT:slli a3, a3, 8 +; CHECK-ALIGNED-RV32-NEXT:or a3, a3, a4 +; CHECK-ALIGNED-RV32-NEXT:slli a5, a5, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a6, a6, 24 +; CHECK-ALIGNED-RV32-NEXT:or a4, a6, a5 +; CHECK-ALIGNED-RV32-NEXT:or a3, a4, a3 +; CHECK-ALIGNED-RV32-NEXT:xor a2, a2, a3 +; CHECK-ALIGNED-RV32-NEXT:lbu a3, 5(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 4(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a5, 6(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a6, 7(a0) +; CHECK-ALIGNED-RV32-NEXT:slli a3, a3, 8 +; CHECK-ALIGNED-RV32-NEXT:or a3, a3, a4 +; CHECK-ALIGNED-RV32-NEXT:slli a5, a5, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a6, a6, 24 +; CHECK-ALIGNED-RV32-NEXT:or a4, a6, a5 +; CHECK-ALIGNED-RV32-NEXT:or a3, a4, a3 +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 5(a1) +; CHECK-ALIGNED-RV32-NEXT
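For context on what this expansion buys: a memcmp/bcmp call of known small size is replaced by wide loads and integer compares, which is what the long byte-load sequences in the checks above implement on targets without unaligned access. A minimal standalone C++ sketch of the transformed logic for a 4-byte equality check; this models the shape of the output only, it is not the codegen itself:

#include <cassert>
#include <cstdint>
#include <cstring>

// Models what the expanded code does for a 4-byte bcmp-style equality
// check: one word load per side, then xor + compare, instead of a libcall.
bool bcmpEq4(const void *A, const void *B) {
  uint32_t X, Y;
  std::memcpy(&X, A, 4); // a single 4-byte load (LoadSizes contains 4)
  std::memcpy(&Y, B, 4);
  return (X ^ Y) == 0;   // xor + branch replaces the call
}
// With AllowOverlappingLoads, a 7-byte compare can similarly be covered by
// two 4-byte loads at offsets 0 and 3 rather than a 4+2+1 load sequence.

int main() {
  char A[4] = {1, 2, 3, 4}, B[4] = {1, 2, 3, 4};
  assert(bcmpEq4(A, B));
  B[3] = 5;
  assert(!bcmpEq4(A, B));
}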
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
https://github.com/sdesmalen-arm edited https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [OpenMP][MLIR] Descriptor explicit member map lowering changes (PR #113556)
https://github.com/agozillon updated https://github.com/llvm/llvm-project/pull/113556 >From 70265b81b3e8ab7b6d04ed3d019861abd0b0e4aa Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 4 Oct 2024 13:03:22 -0500 Subject: [PATCH] [OpenMP][MLIR] Descriptor explicit member map lowering changes This is one of 3 PRs in a PR stack that aims to add support for explicit mapping of allocatable members in derived types. The primary changes in this PR are the OpenMPToLLVMIRTranslation.cpp changes, which are small and seek to alter the current member mapping to add an additional map insertion for pointers. Effectively, if the member is a pointer (currently indicated by having a varPtrPtr field) we add an additional map for the pointer and then alter the subsequent mapping of the member (the data) to utilise the member rather than the parent's base pointer. This appears to be necessary in certain cases when mapping pointer data within record types to avoid segfaulting on device (due to incorrect data mapping). In general this record type mapping may be simplifiable in the future. There are also additions of tests which should help to showcase the effect of the changes above. --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 2 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 58 +++-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 81 - mlir/test/Dialect/OpenMP/ops.mlir | 4 +- ...t-nested-ptr-record-type-mapping-host.mlir | 66 ++ ...arget-nested-record-type-mapping-host.mlir | 2 +- ...get-record-type-with-ptr-member-host.mlir} | 114 ++ 7 files changed, 197 insertions(+), 130 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-nested-ptr-record-type-mapping-host.mlir rename mlir/test/Target/LLVMIR/{omptarget-fortran-allocatable-types-host.mlir => omptarget-record-type-with-ptr-member-host.mlir} (58%) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 626539cb7bde42..348c1b9c2b8bdf 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -895,7 +895,7 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> { TypeAttr:$var_type, Optional:$var_ptr_ptr, Variadic:$members, - OptionalAttr<DenseIntElementsAttr>:$members_index, + OptionalAttr<ArrayAttr>:$members_index, Variadic:$bounds, /* rank-0 to rank-{n-1} */ OptionalAttr:$map_type, OptionalAttr:$map_capture_type, diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index e1df647d6a3c71..8d31cda3a33ee9 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1395,16 +1395,15 @@ static void printMapClause(OpAsmPrinter &p, Operation *op, } static ParseResult parseMembersIndex(OpAsmParser &parser, - DenseIntElementsAttr &membersIdx) { - SmallVector<APInt> values; - int64_t value; - int64_t shape[2] = {0, 0}; - unsigned shapeTmp = 0; + ArrayAttr &membersIdx) { + SmallVector<Attribute> values, memberIdxs; + auto parseIndices = [&]() -> ParseResult { +int64_t value; if (parser.parseInteger(value)) return failure(); -shapeTmp++; -values.push_back(APInt(32, value, /*isSigned=*/true)); +values.push_back(IntegerAttr::get(parser.getBuilder().getIntegerType(64), + APInt(64, value, /*isSigned=*/false))); return success(); }; @@ -1418,52 +1417,29 @@ static ParseResult parseMembersIndex(OpAsmParser &parser, if (failed(parser.parseRSquare())) return failure(); -// Only set once, if any indices are not the same size -// we error out in the next check as that's unsupported -if
(shape[1] == 0) - shape[1] = shapeTmp; - -// Verify that the recently parsed list is equal to the -// first one we parsed, they must be equal lengths to -// keep the rectangular shape DenseIntElementsAttr -// requires -if (shapeTmp != shape[1]) - return failure(); - -shapeTmp = 0; -shape[0]++; +memberIdxs.push_back(ArrayAttr::get(parser.getContext(), values)); +values.clear(); } while (succeeded(parser.parseOptionalComma())); - if (!values.empty()) { -ShapedType valueType = -VectorType::get(shape, IntegerType::get(parser.getContext(), 32)); -membersIdx = DenseIntElementsAttr::get(valueType, values); - } + if (!memberIdxs.empty()) +membersIdx = ArrayAttr::get(parser.getContext(), memberIdxs); return success(); } static void printMembersIndex(OpAsmPrinter &p, MapInfoOp op, - DenseIntElementsAttr membersIdx) { - llvm::ArrayRef shape = membersIdx.getShapedType
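The attribute change above is easier to see with a toy model: member indices are paths into a record type, and paths can have different lengths, which the old rectangular DenseIntElementsAttr encoding could not represent. A standalone C++ sketch of the idea; the types are invented stand-ins, not the MLIR API:

#include <cassert>
#include <cstdint>
#include <vector>

// Toy model: a members_index entry is a path of member positions into a
// record type. The old rectangular encoding forced all paths to one length;
// the new ArrayAttr-of-ArrayAttr encoding is ragged.
using MemberPath = std::vector<int64_t>;

int main() {
  std::vector<MemberPath> MembersIndex = {
      {2},    // the record's third member
      {2, 0}, // the first member nested inside that member
  };
  assert(MembersIndex[0].size() != MembersIndex[1].size()); // ragged is fine
}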
[llvm-branch-commits] [clang] ab28646 - Revert "[webkit.UncountedLambdaCapturesChecker] Ignore trivial functions and …"
Author: Ryosuke Niwa Date: 2024-10-31T00:27:46-07:00 New Revision: ab286462f15736a6e86f0113eab473fb859744be URL: https://github.com/llvm/llvm-project/commit/ab286462f15736a6e86f0113eab473fb859744be DIFF: https://github.com/llvm/llvm-project/commit/ab286462f15736a6e86f0113eab473fb859744be.diff LOG: Revert "[webkit.UncountedLambdaCapturesChecker] Ignore trivial functions and …" This reverts commit 287781c7c9dbd7674cf7cbab8a8fe8a49a4b9317. Added: Modified: clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp clang/test/Analysis/Checkers/WebKit/mock-types.h clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp Removed: diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h index 814015c311d61e..4b41ca96e1df1d 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h @@ -63,10 +63,6 @@ std::optional isUncounted(const clang::CXXRecordDecl* Class); /// class, false if not, std::nullopt if inconclusive. std::optional isUncountedPtr(const clang::QualType T); -/// \returns true if \p T is either a raw pointer or reference to an uncounted -/// or unchecked class, false if not, std::nullopt if inconclusive. -std::optional isUnsafePtr(const QualType T); - /// \returns true if \p T is a RefPtr, Ref, CheckedPtr, CheckedRef, or its /// variant, false if not. bool isSafePtrType(const clang::QualType T); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp index d3484d74a2e3eb..998bd4ccee07db 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp @@ -6,7 +6,6 @@ // //===--===// -#include "ASTUtils.h" #include "DiagOutputUtils.h" #include "PtrTypesSemantics.h" #include "clang/AST/CXXInheritance.h" @@ -27,7 +26,6 @@ class UncountedLambdaCapturesChecker BugType Bug{this, "Lambda capture of uncounted variable", "WebKit coding guidelines"}; mutable BugReporter *BR = nullptr; - TrivialFunctionAnalysis TFA; public: void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR, @@ -39,8 +37,6 @@ class UncountedLambdaCapturesChecker // want to visit those, so we make our own RecursiveASTVisitor. struct LocalVisitor : public RecursiveASTVisitor { const UncountedLambdaCapturesChecker *Checker; - llvm::DenseSet DeclRefExprsToIgnore; - explicit LocalVisitor(const UncountedLambdaCapturesChecker *Checker) : Checker(Checker) { assert(Checker); @@ -49,100 +45,32 @@ class UncountedLambdaCapturesChecker bool shouldVisitTemplateInstantiations() const { return true; } bool shouldVisitImplicitCode() const { return false; } - bool VisitDeclRefExpr(DeclRefExpr *DRE) { -if (DeclRefExprsToIgnore.contains(DRE)) - return true; -auto *VD = dyn_cast_or_null(DRE->getDecl()); -if (!VD) - return true; -auto *Init = VD->getInit()->IgnoreParenCasts(); -auto *L = dyn_cast_or_null(Init); -if (!L) - return true; + bool VisitLambdaExpr(LambdaExpr *L) { Checker->visitLambdaExpr(L); return true; } - - // WTF::switchOn(T, F... f) is a variadic template function and couldn't - // be annotated with NOESCAPE. We hard code it here to workaround that. 
- bool shouldTreatAllArgAsNoEscape(FunctionDecl *Decl) { -auto *NsDecl = Decl->getParent(); -if (!NsDecl || !isa(NsDecl)) - return false; -return safeGetName(NsDecl) == "WTF" && safeGetName(Decl) == "switchOn"; - } - - bool VisitCallExpr(CallExpr *CE) { -checkCalleeLambda(CE); -if (auto *Callee = CE->getDirectCallee()) { - bool TreatAllArgsAsNoEscape = shouldTreatAllArgAsNoEscape(Callee); - unsigned ArgIndex = 0; - for (auto *Param : Callee->parameters()) { -if (ArgIndex >= CE->getNumArgs()) - break; -auto *Arg = CE->getArg(ArgIndex)->IgnoreParenCasts(); -if (!Param->hasAttr() && !TreatAllArgsAsNoEscape) { - if (auto *L = dyn_cast_or_null(Arg)) -Checker->visitLambdaExpr(L); -} -++ArgIndex; - } -} -return true; - } - - void checkCalleeLambda(CallExpr *CE) { -auto *Callee = CE->getCallee(); -if (!Callee) - return; -auto *DRE = dyn_cast(Callee->IgnoreParenCasts
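For readers unfamiliar with the checker being reverted, the pattern it targets looks roughly like the following standalone C++ sketch; Widget and scheduleLater are invented stand-ins for WebKit's ref-counted types and deferred-work APIs:

#include <functional>
#include <vector>

// Invented stand-ins: WebKit's real types are RefCounted/Ref/RefPtr.
struct Widget {
  int RefCount = 1;
  void ref() { ++RefCount; }
  void deref() {
    if (--RefCount == 0)
      delete this;
  }
};

std::vector<std::function<void()>> Queue; // stand-in for deferred work
void scheduleLater(std::function<void()> F) { Queue.push_back(std::move(F)); }

void example(Widget *W) {
  // This capture is what the checker warns about: W is a raw pointer to a
  // ref-counted object, and nothing guarantees it is still alive when the
  // lambda finally runs. The fix is to capture a Ref/RefPtr instead.
  scheduleLater([W] { W->ref(); W->deref(); });
}

int main() {
  auto *W = new Widget;
  example(W);
  for (auto &F : Queue)
    F();      // safe only because W happens to still be alive here
  W->deref(); // drops the last reference
}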
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From e6285ef8415e03337a080fa13456a2495023a8e6 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 287 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 929 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 15ccf1a38af9a5..19d8d466e3b12e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -36,6 +36,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -114,6 +191,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT:
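The "Predicate" wrapper described in the commit message can be modeled in a few lines of standalone C++. This is a simplified sketch over a plain bit-width, not the actual AMDGPURegBankLegalizeRules types:

#include <cassert>
#include <functional>

// Simplified sketch of the "Predicate" wrapper from the commit message,
// reduced to predicates over a plain bit-width.
struct Predicate {
  std::function<bool(unsigned)> Fn;

  // Note: overloaded &&/|| do not short-circuit, which is fine for
  // side-effect-free predicates like these.
  Predicate operator&&(const Predicate &RHS) const {
    auto L = Fn, R = RHS.Fn;
    return {[L, R](unsigned Size) { return L(Size) && R(Size); }};
  }
  Predicate operator||(const Predicate &RHS) const {
    auto L = Fn, R = RHS.Fn;
    return {[L, R](unsigned Size) { return L(Size) || R(Size); }};
  }
  Predicate operator!() const {
    auto L = Fn;
    return {[L](unsigned Size) { return !L(Size); }};
  }
};

int main() {
  Predicate IsB32{[](unsigned Size) { return Size == 32; }};
  Predicate IsB64{[](unsigned Size) { return Size == 64; }};
  Predicate IsB32OrB64 = IsB32 || IsB64; // rules read like this in the pass
  assert(IsB32OrB64.Fn(64));
  assert(!(IsB32 && IsB64).Fn(32)); // no single size is both 32 and 64 bits
}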
[llvm-branch-commits] [clang] [llvm] [LLVM] [Clang] Backport "Support for Gentoo `*t64` triples (64-bit time_t ABIs)" (PR #112364)
@@ -294,7 +294,11 @@ class Triple { PAuthTest, -LastEnvironmentType = PAuthTest +GNUT64, +GNUEABIT64, +GNUEABIHFT64, + +LastEnvironmentType = GNUEABIHFT64 tru wrote: Let's continue the discussion and the next steps on Discourse; I posted a new thread here: https://discourse.llvm.org/t/potential-abi-break-in-19-1-3/82865 https://github.com/llvm/llvm-project/pull/112364 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [llvm] [LLVM] [Clang] Backport "Support for Gentoo `*t64` triples (64-bit time_t ABIs)" (PR #112364)
@@ -294,7 +294,11 @@ class Triple { PAuthTest, -LastEnvironmentType = PAuthTest +GNUT64, +GNUEABIT64, +GNUEABIHFT64, + +LastEnvironmentType = GNUEABIHFT64 tstellar wrote: > This patch doesn't break ABI/API compatibility. The Zig check is overly > restrictive and unnecessary. Zig should be fixed instead. It does technically break ABI compatibility, because it changes the value of an enum. This is something we usually try to avoid. https://github.com/llvm/llvm-project/pull/112364 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
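The point can be reproduced in miniature: appending enumerators moves any trailing sentinel, so a binary compiled against the old header disagrees with one compiled against the new header about the sentinel's value. The names below are simplified stand-ins for the Triple enumeration:

// Simplified stand-ins for the Triple::EnvironmentType discussion above.
enum OldEnv { OldPAuthTest, OldLastEnvironmentType = OldPAuthTest };
enum NewEnv { NewPAuthTest, NewGNUT64, NewLastEnvironmentType = NewGNUT64 };

// Existing enumerators keep their values, but the sentinel moves, so any
// ABI surface that exposed LastEnvironmentType (or range checks against it)
// observes a different value.
static_assert(OldLastEnvironmentType == 0, "old sentinel value");
static_assert(NewLastEnvironmentType == 1, "new sentinel value");

int main() {}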
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 6ec049db2a5572c4cb0514b9ca44c7ff215b461f Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change the existing code for G_PHI to match what the LLVM-IR version does via PHINode::hasConstantOrUndefValue. This is not safe for a regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves the number of values that can be allocated to the sgpr register bank in AMDGPURegBankSelect. The common case here is phis that appear in structurize-cfg lowering for cycles with multiple exits: the undef incoming value comes from the block that reached the cycle exit condition; if the other incoming value is uniform, keep the phi uniform despite the fact that it joins values from a pair of blocks entered via a divergent conditional branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes a PHI may appear with an undef operand, where getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do the equivalent of PHINode::hasConstantOrUndefValue().
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
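The new G_PHI handling follows the same rule as PHINode::hasConstantOrUndefValue(): ignore self-references and undef incoming values, then require the remaining incoming values to agree. A standalone C++ toy model of that rule; the types are invented, not the MachineIR API:

#include <cassert>
#include <optional>
#include <vector>

// Toy model of the check above: a phi is "constant or undef" if all incoming
// values that are neither the phi itself nor undef agree on a single value.
struct Incoming {
  int Value;    // stand-in for a Register
  bool IsUndef; // stand-in for a G_IMPLICIT_DEF / IMPLICIT_DEF def
};

bool hasConstantOrUndefValue(int PhiReg, const std::vector<Incoming> &Ins) {
  std::optional<int> Unique;
  for (const Incoming &In : Ins) {
    if (In.Value == PhiReg || In.IsUndef)
      continue; // self-references and undefs do not count
    if (Unique && *Unique != In.Value)
      return false; // two distinct real incoming values
    Unique = In.Value;
  }
  return true;
}

int main() {
  // phi(%x, undef) is still "constant": only one real incoming value.
  assert(hasConstantOrUndefValue(/*PhiReg=*/1, {{2, false}, {3, true}}));
  // phi(%x, %y) with distinct %x and %y is not.
  assert(!hasConstantOrUndefValue(/*PhiReg=*/1, {{2, false}, {4, false}}));
}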
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + + if (Op.getReg() == TRI.getExec()) { +return true; + } +} + +return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, +const RegisterBank *RB) { +Register Reg = DefOP.getReg(); +// Register that already has Register class got it during pre-inst selection +// of another instruction. Maybe cross bank copy was required so we insert a +// copy that can be removed later. This simplifies post regbanklegalize +// combiner and avoids need to special case some patterns. +if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); + + auto &MBB = *MI.getParent(); + B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator(; + B.buildCopy(Reg, NewReg); + + // The problem was discovered for uniform S1 that was used as both + // lane mask(vcc) and regular sgpr S1. + // - lane-mask(vcc) use was by si_if, this use is divergent and requires + // non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets + // sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. + // - the regular sgpr S1(uniform) instruction is now broken since + // it uses sreg_64_xexec(S1) which is divergent. + + // Replace virtual registers with register class on generic instructions + // uses with virtual registers with register bank. + for (auto &UseMI : MRI.use_instructions(Reg)) { +if (shouldRegBankSelect(UseMI)) { + for (MachineOperand &Op : UseMI.operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); petar-avramovic wrote: Yes, this was a bug. make_early_inc_range also works but it might be assuming how MRI keeps track of use instructions internally https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
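The underlying issue in this exchange is mutating a use-list while iterating it. A generic standalone C++ sketch of the collect-then-rewrite pattern; the types are invented, and MRI's real use-lists are intrusive lists rather than vectors:

#include <cassert>
#include <vector>

// Invented type for illustration.
struct Use { int Reg; };

// Collect matching uses first, then rewrite them, so the mutation cannot
// invalidate or re-visit the iteration (the alternative is an
// early-increment-style walk such as make_early_inc_range).
void replaceUses(std::vector<Use> &Uses, int OldReg, int NewReg) {
  std::vector<Use *> Worklist;
  for (Use &U : Uses)
    if (U.Reg == OldReg)
      Worklist.push_back(&U); // Uses is not resized, so pointers stay valid
  for (Use *U : Worklist)
    U->Reg = NewReg;
}

int main() {
  std::vector<Use> Uses = {{1}, {2}, {1}};
  replaceUses(Uses, /*OldReg=*/1, /*NewReg=*/7);
  assert(Uses[0].Reg == 7 && Uses[1].Reg == 2 && Uses[2].Reg == 7);
}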
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + + if (Op.getReg() == TRI.getExec()) { +return true; + } +} + +return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, +const RegisterBank *RB) { +Register Reg = DefOP.getReg(); +// Register that already has Register class got it during pre-inst selection +// of another instruction. Maybe cross bank copy was required so we insert a +// copy that can be removed later. This simplifies post regbanklegalize +// combiner and avoids need to special case some patterns. +if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); petar-avramovic wrote: Why? I intend for new regbankselect be simple and not use observers. https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] GlobalISel: Fix combine duplicating atomic loads (PR #111730)
arsenm wrote: ### Merge activity * **Oct 31, 10:38 AM EDT**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/111730). https://github.com/llvm/llvm-project/pull/111730 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Define high bits of FPR and GPR registers. (PR #114263)
@@ -424,6 +424,58 @@ AArch64RegisterInfo::explainReservedReg(const MachineFunction &MF, return {}; } +static SmallVector ReservedHi = { arsenm wrote: This smells like an unrelated bug; this is not the kind of error I expected. https://github.com/llvm/llvm-project/pull/114263 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From d0ec41bcb8f0594b86336e45028d490dd4ebf6c4 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 79 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 69 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 182f4fcc88a79a..a462e88a6e745d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -198,6 +198,17 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", + /*OnlyFirstRequired=*/true); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -768,22 +779,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -873,29 +868,47 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; + ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); +std::pair MaxWavesPerEURange{ +1U, InfoCache.getMaxWavesPerEU(*F)}; - ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); +// If the attribute exists, we will honor it if it is not the default. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + if (*Attr != MaxWavesPerEURange) { +TakeRange(*Attr); +return; + } } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. 
Since the +// calculation of waves per EU involves the flat work group size, we can't +// simply use an assumed flat work group size as a starting point, because the +// update of the flat work group size goes in the inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and the flat work group size, either +// from the attribute or the default, will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but that could still allow us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair<unsigned, unsigned> FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU
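Conceptually, honoring the attribute here is a range intersection, as in the ConstantRange logic above. A standalone C++ toy model; the helper is invented and the concrete bounds are assumed values, not the LLVM API:

#include <algorithm>
#include <cassert>

// Invented helper: honoring an explicit "amdgpu-waves-per-eu" attribute
// amounts to intersecting the requested range with the valid
// [1, MaxWavesPerEU] range, as the ConstantRange intersection above does.
struct Range { unsigned Lo, Hi; }; // inclusive bounds

Range intersect(Range A, Range B) {
  return {std::max(A.Lo, B.Lo), std::min(A.Hi, B.Hi)};
}

int main() {
  Range Valid{1, 10}; // 1 .. subtarget's max waves per EU (value assumed)
  Range Attr{4, 8};   // what the function attribute requested
  Range Eff = intersect(Valid, Attr);
  assert(Eff.Lo == 4 && Eff.Hi == 8); // the attribute is honored as-is
}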
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From b7f1c2bd5d33a060ab2a8ee942874d208d42cac9 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 77 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 66 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 642b278db70437..8b9e3f37dc507c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -198,6 +198,16 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu"); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -768,22 +778,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -868,29 +862,44 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { - - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; + +// If the attribute exists, simple honor it. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + TakeRange(*Attr); + return; } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. Since +// the calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a start point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. 
However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and the flat work group size, either +// from the attribute or the default, will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but that could still allow us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair<unsigned, unsigned> MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; + std::pair<unsigned, unsigned> FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + FlatWorkGroupSize)); +} } ChangeStatus updateImpl(Attributor &A) override { @@ -939,8 +948,8 @@ struct AAAMDWavesPerEU :
[llvm-branch-commits] [llvm] [RISCV] Add initial support of memcmp expansion (PR #107548)
wangpc-pp wrote: Ping. Any comments on the current scalar part? I'm working on the vector expansion and will post it in a few days. https://github.com/llvm/llvm-project/pull/107548 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From 25b1ec0e80072c70628da9d72be8969fd6bb3d87 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 78 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 68 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 182f4fcc88a79a..99d5ca8403dc21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -198,6 +198,16 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu"); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -768,22 +778,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -873,29 +867,47 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; + ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); +std::pair MaxWavesPerEURange{ +1U, InfoCache.getMaxWavesPerEU(*F)}; - ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); +// If the attribute exists, we will honor it if it is not the default. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + if (*Attr != MaxWavesPerEURange) { +TakeRange(*Attr); +return; + } } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// It's getting trickier here, different from AAAMDFlatWorkGroupSize. 
Since +// the calculation of waves per EU involves the flat work group size, we can't +// simply use an assumed flat work group size as a starting point, because the +// update of the flat work group size goes in the inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and the flat work group size, either +// from the attribute or the default, will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but that could still allow us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair<unsigned, unsigned> FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, +
[llvm-branch-commits] [llvm] [AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute (PR #114438)
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/114438 >From b48566210212165429e6a29665a4fefdf2695e61 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 31 Oct 2024 12:49:07 -0400 Subject: [PATCH] [WIP][AMDGPU][Attributor] Make `AAAMDWavesPerEU` honor existing attribute --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 78 +++ .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 47 ++- 2 files changed, 68 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 182f4fcc88a79a..03a15639aa6bcb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -198,6 +198,16 @@ class AMDGPUInformationCache : public InformationCache { return ST.getWavesPerEU(F, FlatWorkGroupSize); } + std::optional> + getWavesPerEUAttr(const Function &F) { +auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu"); +if (Val && Val->second == 0) { + const GCNSubtarget &ST = TM.getSubtarget(F); + Val->second = ST.getMaxWavesPerEU(); +} +return Val; + } + std::pair getEffectiveWavesPerEU(const Function &F, std::pair WavesPerEU, @@ -768,22 +778,6 @@ struct AAAMDSizeRangeAttribute /*ForceReplace=*/true); } - ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, - unsigned Max) { -// Don't add the attribute if it's the implied default. -if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) - return ChangeStatus::UNCHANGED; - -Function *F = getAssociatedFunction(); -LLVMContext &Ctx = F->getContext(); -SmallString<10> Buffer; -raw_svector_ostream OS(Buffer); -OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; -return A.manifestAttrs(getIRPosition(), - {Attribute::get(Ctx, AttrName, OS.str())}, - /*ForceReplace=*/true); - } - const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); @@ -873,29 +867,47 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} - bool isValidState() const override { -return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); - } - void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); -if (const auto *AssumedGroupSize = A.getAAFor( -*this, IRPosition::function(*F), DepClassTy::REQUIRED); -AssumedGroupSize->isValidState()) { +auto TakeRange = [&](std::pair R) { + auto [Min, Max] = R; + ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); + IntegerRangeState RangeState(Range); + clampStateAndIndicateChange(this->getState(), RangeState); + indicateOptimisticFixpoint(); +}; - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getWavesPerEU( - *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); +std::pair MaxWavesPerEURange{ +1U, InfoCache.getMaxWavesPerEU(*F)}; - ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - intersectKnown(Range); +// If the attribute exists, we will honor it if it is not the default. +if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + if (*Attr != MaxWavesPerEURange) { +TakeRange(*Attr); +return; + } } -if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); +// Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. 
Since the +// calculation of waves per EU involves flat work group size, we can't +// simply use an assumed flat work group size as a starting point, because the +// update of flat work group size is in an inverse direction of waves per +// EU. However, we can still do something if it is an entry function. Since +// an entry function is a terminal node, and flat work group size either +// from attribute or default will be used anyway, we can take that value and +// calculate the waves per EU based on it. This result can't be updated by +// any means, but that could still allow us to propagate it. +if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + std::pair FlatWorkGroupSize; + if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) +FlatWorkGroupSize = *Attr; + else +FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); + TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, + F
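The new `getWavesPerEUAttr` helper also normalizes the single-value spelling of the attribute. A small sketch of that corner (hypothetical function name, real attribute syntax): since `getIntegerPairAttribute` reports a missing second element as 0, a lone minimum is widened to the subtarget maximum.

define amdgpu_kernel void @only_min() #1 {
entry:
  ret void
}

; getWavesPerEUAttr sees the pair (2, 0) here and rewrites it to
; (2, ST.getMaxWavesPerEU()) before the attributor consumes it.
attributes #1 = { "amdgpu-waves-per-eu"="2" }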
[llvm-branch-commits] [llvm] 8b1b475 - Revert "[ConstantFold] Fold `tgamma` and `tgammaf` when the input parameter i…"
Author: c8ef Date: 2024-11-01T09:25:43+08:00 New Revision: 8b1b4753ac16cba5a153536171a243d76300e4bb URL: https://github.com/llvm/llvm-project/commit/8b1b4753ac16cba5a153536171a243d76300e4bb DIFF: https://github.com/llvm/llvm-project/commit/8b1b4753ac16cba5a153536171a243d76300e4bb.diff LOG: Revert "[ConstantFold] Fold `tgamma` and `tgammaf` when the input parameter i…" This reverts commit 1f07f995cc994dfb46b65fe97986efca15cf304b. Added: Modified: llvm/lib/Analysis/ConstantFolding.cpp Removed: llvm/test/Transforms/InstCombine/tgamma.ll diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index a96c3bebba790e..c5a2c2f52f8dc2 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -1699,9 +1698,9 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { Name == "sinh" || Name == "sinhf" || Name == "sqrt" || Name == "sqrtf"; case 't': -return Name == "tan" || Name == "tanf" || Name == "tanh" || - Name == "tanhf" || Name == "trunc" || Name == "truncf" || - Name == "tgamma" || Name == "tgammaf"; +return Name == "tan" || Name == "tanf" || + Name == "tanh" || Name == "tanhf" || + Name == "trunc" || Name == "truncf"; case '_': // Check for various function names that get used for the math functions // when the header files are preprocessed with the macro @@ -2418,14 +2417,6 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (TLI->has(Func)) return ConstantFoldFP(erf, APF, Ty); break; -case LibFunc_tgamma: -case LibFunc_tgammaf: - // NOTE: These boundaries are somewhat conservative. - if (TLI->has(Func) && - (Ty->isDoubleTy() && APF > APFloat(DBL_MIN) && APF < APFloat(171.0) || - Ty->isFloatTy() && APF > APFloat(FLT_MIN) && APF < APFloat(35.0f))) -return ConstantFoldFP(tgamma, APF, Ty); - break; case LibFunc_nearbyint: case LibFunc_nearbyintf: case LibFunc_rint: @@ -3638,10 +3629,6 @@ bool llvm::isMathLibCallNoop(const CallBase *Call, case LibFunc_sqrtf: return Op.isNaN() || Op.isZero() || !Op.isNegative(); - case LibFunc_tgamma: - case LibFunc_tgammaf: -return true; - // FIXME: Add more functions: sqrt_finite, atanh, expm1, log1p, // maybe others? 
default: diff --git a/llvm/test/Transforms/InstCombine/tgamma.ll b/llvm/test/Transforms/InstCombine/tgamma.ll deleted file mode 100644 index dd74617fee83e5..00 --- a/llvm/test/Transforms/InstCombine/tgamma.ll +++ /dev/null @@ -1,255 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -define float @tgammaf_in_range() { -; CHECK-LABEL: define float @tgammaf_in_range() { -; CHECK-NEXT:ret float 0x479A21628000 -; - %r = call float @tgammaf(float 34.0) - ret float %r -} - -define double @tgamma_in_range() { -; CHECK-LABEL: define double @tgamma_in_range() { -; CHECK-NEXT:ret double 0x605166C698CF183B -; - %r = call double @tgamma(double 100.0) - ret double %r -} - -define float @tgammaf_const_left_range() { -; CHECK-LABEL: define float @tgammaf_const_left_range() { -; CHECK-NEXT:[[R:%.*]] = call float @tgammaf(float 0x3810) -; CHECK-NEXT:ret float [[R]] -; - %r = call float @tgammaf(float 0x3810) - ret float %r -} - -define double @tgamma_const_left_range() { -; CHECK-LABEL: define double @tgamma_const_left_range() { -; CHECK-NEXT:[[R:%.*]] = call double @tgamma(double 0x10) -; CHECK-NEXT:ret double [[R]] -; - %r = call double @tgamma(double 0x0010) - ret double %r -} - -define float @tgammaf_const_right_range() { -; CHECK-LABEL: define float @tgammaf_const_right_range() { -; CHECK-NEXT:[[R:%.*]] = call float @tgammaf(float 3.60e+01) -; CHECK-NEXT:ret float [[R]] -; - %r = call float @tgammaf(float 36.0) - ret float %r -} - -define double @tgamma_const_right_range() { -; CHECK-LABEL: define double @tgamma_const_right_range() { -; CHECK-NEXT:[[R:%.*]] = call double @tgamma(double 1.72e+02) -; CHECK-NEXT:ret double [[R]] -; - %r = call double @tgamma(double 172.0) - ret double %r -} - -define float @tgammaf_minus_one() { -; CHECK-LABEL: define float @tgammaf_minus_one() { -; CHECK-NEXT:[[R:%.*]] = call float @tgammaf(float -1.00e+00) -; CHECK-NEXT:ret float [[R]] -; - %r = call float @tgammaf(float -1.00e+00) - ret float %r -} - -define double @tgamma_minus_one() { -; CHECK-LABEL: define double @tgamma_minus_one() { -; CHECK-NEXT:[[R:%.*]] = call double @tgamma(double -1.00e+00) -; CHE
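As an aside on the reverted bounds themselves: they track where tgamma overflows, since Γ(x+1) = x!. Γ(35) = 34! ≈ 2.95e38 still fits in a float (FLT_MAX ≈ 3.40e38) while Γ(36) = 35! ≈ 1.03e40 does not, and Γ(171) = 170! ≈ 7.26e306 fits in a double (DBL_MAX ≈ 1.80e308) while Γ(172) = 171! ≈ 1.24e309 does not; hence the `APF < APFloat(35.0f)` and `APF < APFloat(171.0)` guards, which the removed tests probed from both sides of each boundary.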
[llvm-branch-commits] [llvm] [RISCV] Add initial support of memcmp expansion (PR #107548)
@@ -1144,42 +2872,116 @@ entry: define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-LABEL: memcmp_size_4: ; CHECK-ALIGNED-RV32: # %bb.0: # %entry -; CHECK-ALIGNED-RV32-NEXT:addi sp, sp, -16 -; CHECK-ALIGNED-RV32-NEXT:sw ra, 12(sp) # 4-byte Folded Spill -; CHECK-ALIGNED-RV32-NEXT:li a2, 4 -; CHECK-ALIGNED-RV32-NEXT:call memcmp -; CHECK-ALIGNED-RV32-NEXT:lw ra, 12(sp) # 4-byte Folded Reload -; CHECK-ALIGNED-RV32-NEXT:addi sp, sp, 16 +; CHECK-ALIGNED-RV32-NEXT:lbu a2, 0(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a3, 1(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a4, 3(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a0, 2(a0) +; CHECK-ALIGNED-RV32-NEXT:lbu a5, 0(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a6, 1(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a7, 3(a1) +; CHECK-ALIGNED-RV32-NEXT:lbu a1, 2(a1) +; CHECK-ALIGNED-RV32-NEXT:slli a0, a0, 8 +; CHECK-ALIGNED-RV32-NEXT:or a0, a0, a4 +; CHECK-ALIGNED-RV32-NEXT:slli a3, a3, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a2, a2, 24 +; CHECK-ALIGNED-RV32-NEXT:or a2, a2, a3 +; CHECK-ALIGNED-RV32-NEXT:or a0, a2, a0 +; CHECK-ALIGNED-RV32-NEXT:slli a1, a1, 8 +; CHECK-ALIGNED-RV32-NEXT:or a1, a1, a7 +; CHECK-ALIGNED-RV32-NEXT:slli a6, a6, 16 +; CHECK-ALIGNED-RV32-NEXT:slli a5, a5, 24 +; CHECK-ALIGNED-RV32-NEXT:or a2, a5, a6 +; CHECK-ALIGNED-RV32-NEXT:or a1, a2, a1 +; CHECK-ALIGNED-RV32-NEXT:sltu a2, a1, a0 +; CHECK-ALIGNED-RV32-NEXT:sltu a0, a0, a1 +; CHECK-ALIGNED-RV32-NEXT:sub a0, a2, a0 ; CHECK-ALIGNED-RV32-NEXT:ret ; ; CHECK-ALIGNED-RV64-LABEL: memcmp_size_4: ; CHECK-ALIGNED-RV64: # %bb.0: # %entry -; CHECK-ALIGNED-RV64-NEXT:addi sp, sp, -16 -; CHECK-ALIGNED-RV64-NEXT:sd ra, 8(sp) # 8-byte Folded Spill -; CHECK-ALIGNED-RV64-NEXT:li a2, 4 -; CHECK-ALIGNED-RV64-NEXT:call memcmp -; CHECK-ALIGNED-RV64-NEXT:ld ra, 8(sp) # 8-byte Folded Reload -; CHECK-ALIGNED-RV64-NEXT:addi sp, sp, 16 +; CHECK-ALIGNED-RV64-NEXT:lbu a2, 0(a0) +; CHECK-ALIGNED-RV64-NEXT:lbu a3, 1(a0) +; CHECK-ALIGNED-RV64-NEXT:lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-NEXT:lb a0, 3(a0) +; CHECK-ALIGNED-RV64-NEXT:lbu a5, 0(a1) +; CHECK-ALIGNED-RV64-NEXT:lbu a6, 1(a1) +; CHECK-ALIGNED-RV64-NEXT:lbu a7, 2(a1) +; CHECK-ALIGNED-RV64-NEXT:lb a1, 3(a1) +; CHECK-ALIGNED-RV64-NEXT:andi a0, a0, 255 +; CHECK-ALIGNED-RV64-NEXT:slli a4, a4, 8 +; CHECK-ALIGNED-RV64-NEXT:or a0, a4, a0 +; CHECK-ALIGNED-RV64-NEXT:slli a3, a3, 16 +; CHECK-ALIGNED-RV64-NEXT:slliw a2, a2, 24 +; CHECK-ALIGNED-RV64-NEXT:or a2, a2, a3 +; CHECK-ALIGNED-RV64-NEXT:or a0, a2, a0 +; CHECK-ALIGNED-RV64-NEXT:andi a1, a1, 255 +; CHECK-ALIGNED-RV64-NEXT:slli a7, a7, 8 +; CHECK-ALIGNED-RV64-NEXT:or a1, a7, a1 +; CHECK-ALIGNED-RV64-NEXT:slli a6, a6, 16 +; CHECK-ALIGNED-RV64-NEXT:slliw a2, a5, 24 +; CHECK-ALIGNED-RV64-NEXT:or a2, a2, a6 +; CHECK-ALIGNED-RV64-NEXT:or a1, a2, a1 +; CHECK-ALIGNED-RV64-NEXT:sltu a2, a1, a0 +; CHECK-ALIGNED-RV64-NEXT:sltu a0, a0, a1 +; CHECK-ALIGNED-RV64-NEXT:sub a0, a2, a0 ; CHECK-ALIGNED-RV64-NEXT:ret ; ; CHECK-UNALIGNED-RV32-LABEL: memcmp_size_4: ; CHECK-UNALIGNED-RV32: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-NEXT:addi sp, sp, -16 -; CHECK-UNALIGNED-RV32-NEXT:sw ra, 12(sp) # 4-byte Folded Spill -; CHECK-UNALIGNED-RV32-NEXT:li a2, 4 -; CHECK-UNALIGNED-RV32-NEXT:call memcmp -; CHECK-UNALIGNED-RV32-NEXT:lw ra, 12(sp) # 4-byte Folded Reload -; CHECK-UNALIGNED-RV32-NEXT:addi sp, sp, 16 +; CHECK-UNALIGNED-RV32-NEXT:lw a0, 0(a0) topperc wrote: Can we test the non-Zbb config on qemu and get the instruction count? 
https://github.com/llvm/llvm-project/pull/107548 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
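For readers who only see the assembly in the quoted diff: the test input is presumably a thin wrapper around a fixed-size libcall, along the lines of the sketch below (reconstructed from the test name and the old `li a2, 4` / `call memcmp` sequence, not copied from the patch; the length type may be iXLen in the actual file). The PR teaches the backend to expand such calls inline into the byte loads, shifts, and `sltu` compares shown in the new CHECK lines.

define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
entry:
  ; Fixed, small length: eligible for inline expansion instead of a libcall.
  %ret = call i32 @memcmp(ptr %s1, ptr %s2, i64 4)
  ret i32 %ret
}

declare i32 @memcmp(ptr, ptr, i64)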
[llvm-branch-commits] [llvm] 4b61314 - Revert "[InstCombine] Fix FMF propagation in `foldSelectIntoOp` (#114356)"
Author: gulfemsavrun Date: 2024-10-31T13:19:56-07:00 New Revision: 4b61314fc83dfc9a8ec29dcc4c9ccfb0057b990a URL: https://github.com/llvm/llvm-project/commit/4b61314fc83dfc9a8ec29dcc4c9ccfb0057b990a DIFF: https://github.com/llvm/llvm-project/commit/4b61314fc83dfc9a8ec29dcc4c9ccfb0057b990a.diff LOG: Revert "[InstCombine] Fix FMF propagation in `foldSelectIntoOp` (#114356)" This reverts commit cf1963afad335cf74a9411f106d1f2fe80dbed2f. Added: Modified: llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll Removed: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 9e193e7faa8ac3..c5f39a4c381ed1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -529,6 +529,9 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, if (!OpToFold) return nullptr; +// TODO: We probably ought to revisit cases where the select and FP +// instructions have different flags and add tests to ensure the +// behaviour is correct. FastMathFlags FMF; if (isa(&SI)) FMF = SI.getFastMathFlags(); @@ -561,8 +564,6 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel); BO->copyIRFlags(TVI); -if (isa(&SI)) - BO->andIRFlags(NewSel); return BO; }; diff --git a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll index caf38c676e20d7..1c28b151825c12 100644 --- a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll +++ b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll @@ -468,7 +468,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT:ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -482,7 +482,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul ninf nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT:ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -496,7 +496,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT:ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@
-510,7 +510,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[X]], [[SCALED_X]] ; CHECK-NEXT:ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -559,7 +559,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x, float % ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz( ; CHECK-NEXT:[[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.00e+00 ; CHECK-NEXT:[[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.00e+00 -; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] +; CHECK-NEXT:[[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[X]], [[
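To make the flag question concrete, here is a hand-written sketch of the pattern the quoted tests exercise (names shortened; flags as in the CHECK lines, the rest reconstructed from the test names). `foldSelectIntoOp` folds `x == 0.0 ? x * y : x` into an fmul of `x` and a new select, and the two revisions disagree only on which fast-math flags the new fmul keeps.

; Input: x * y when x == 0.0, otherwise x
  %cmp = fcmp oeq float %x, 0.0
  %mul = fmul nsz float %x, %y
  %sel = select nnan i1 %cmp, float %mul, float %x

; Folded form: x * select(x == 0.0, y, 1.0). With only BO->copyIRFlags(TVI)
; (the reverted state), the new fmul keeps the original fmul's nsz even
; though the select never guaranteed it; with the removed
; BO->andIRFlags(NewSel), the flags are intersected ({nsz} with {nnan} is
; empty) and nsz is dropped, which is exactly the CHECK-line diff above.
  %sel2 = select nnan i1 %cmp, float %y, float 1.0
  %res = fmul nsz float %x, %sel2   ; reverted behavior; plain fmul with the fix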