Author: Pierre van Houtryve
Date: 2022-10-19T10:16:08Z
New Revision: edaf6a07a4aafd963ea958703890d03ab58ff2dd
URL: https://github.com/llvm/llvm-project/commit/edaf6a07a4aafd963ea958703890d03ab58ff2dd
DIFF: https://github.com/llvm/llvm-project/commit/edaf6a07a4aafd963ea958703890d03ab58ff2dd.diff
LOG: [AMDGPU][GISel] Combine G_INSERT_VECTOR_ELT to G_SHUFFLE_VECTOR
Depends on D134967
Differential Revision: https://reviews.llvm.org/D135145
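
In short, the combine rewrites the insert by building the inserted scalar into
a second vector of undefs and selecting it with a shuffle mask that keeps the
original lanes everywhere except at the insert index. A minimal sketch for a
<2 x s16> insert at index 1 (illustrative MIR, not taken from the commit's
test file):

  ; Before:
  %idx:_(s32) = G_CONSTANT i32 1
  %res:_(<2 x s16>) = G_INSERT_VECTOR_ELT %vec(<2 x s16>), %val(s16), %idx(s32)

  ; After: mask element NumElts (here 2) picks the inserted value from the
  ; second operand; every other lane comes from the original vector.
  %undef:_(s16) = G_IMPLICIT_DEF
  %other:_(<2 x s16>) = G_BUILD_VECTOR %val(s16), %undef(s16)
  %res:_(<2 x s16>) = G_SHUFFLE_VECTOR %vec(<2 x s16>), %other, shufflemask(0, 2)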
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
Removed:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 2415fdfecaae2..8b2ff164d3365 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -45,6 +45,12 @@ def cvt_f32_ubyteN : GICombineRule<
    [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
  (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
+def insert_vec_elt_to_shuffle : GICombineRule<
+  (defs root:$insertelt, unsigned_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_INSERT_VECTOR_ELT):$insertelt,
+    [{ return PreLegalizerHelper.matchInsertVectorEltToShuffle(*${insertelt}, ${matchinfo}); }]),
+  (apply [{ PreLegalizerHelper.applyInsertVectorEltToShuffle(*${insertelt}, ${matchinfo}); }])>;
+
def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
def clamp_i64_to_i16 : GICombineRule<
@@ -109,7 +115,7 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPreLegalizerCombinerHelper",
- [all_combines, clamp_i64_to_i16, foldable_fneg]> {
+ [all_combines, clamp_i64_to_i16, foldable_fneg, insert_vec_elt_to_shuffle]> {
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 6d6c69adaa658..08eefc6da4d31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -55,6 +55,9 @@ class AMDGPUPreLegalizerCombinerHelper {
  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
+
+ bool matchInsertVectorEltToShuffle(MachineInstr &MI, unsigned &Idx);
+ void applyInsertVectorEltToShuffle(MachineInstr &MI, unsigned &Idx);
};
bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
@@ -154,6 +157,73 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
MI.eraseFromParent();
}
+bool AMDGPUPreLegalizerCombinerHelper::matchInsertVectorEltToShuffle(
+    MachineInstr &MI, unsigned &Idx) {
+  // Transforms a G_INSERT_VECTOR_ELT into an equivalent G_SHUFFLE_VECTOR if:
+  //    - Scalar Pack insts are present (for <32 bits element types)
+  //    - The vector has <= 4 elements.
+  // as this is a preferred canonical form of the operation.
+  //
+  // Note that both restrictions are arbitrary. Currently, it's mostly targeted
+  // towards 2x16 vectors. Restrictions could be relaxed or entirely removed in
+  // the future if codegen can handle it without causing regressions.
+
+  LLT VecTy = MRI.getType(MI.getOperand(0).getReg());
+  const unsigned EltSize = VecTy.getElementType().getSizeInBits();
+  if (EltSize < 32 &&
+      !MI.getMF()->getSubtarget<GCNSubtarget>().hasScalarPackInsts())
+    return false;
+
+  if (VecTy.isScalable() || VecTy.getNumElements() > 4)
+    return false;
+
+  Optional<ValueAndVReg> MaybeIdxVal =
+      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+  if (!MaybeIdxVal)
+    return false;
+
+  Idx = MaybeIdxVal->Value.getZExtValue();
+  return true;
+}
+
+void AMDGPUPreLegalizerCombinerHelper::applyInsertVectorEltToShuffle(
+    MachineInstr &MI, unsigned &Idx) {
+  B.setInstrAndDebugLoc(MI);
+
+  Register Ins = MI.getOperand(2).getReg();
+  Register Vec = MI.getOperand(1).getReg();
+  Register Dst = MI.getOperand(0).getReg();
+
+  LLT VecTy = MRI.getType(Dst);
+  LLT EltTy = VecTy.getElementType();
+  const unsigned NumElts = VecTy.getNumElements();
+
+  const auto Undef = MRI.createGenericVirtualRegister(EltTy);
+  B.buildUndef(Undef);
+
+  const auto OtherVec = MRI.createGenericVirtualRegister(VecTy);
+
+  SmallVector<Register> Srcs;
+  Srcs.push_back(Ins);
+  for (unsigned K = 1; K < NumElts; ++K)
+    Srcs.push_back(Undef);
+
+  B.buildBuildVector(OtherVec, Srcs);
+
+  // NumElts == Ins in OtherVec
+  // 0...(NumElts-1) = Original elements
+  SmallVector<int> ShuffleMask;
+ for (unsig