On Tue, Nov 13, 2012 at 07:41:20AM -0800, Vincent Lejeune wrote: > > > > > ----- Mail original ----- > > De : Tom Stellard <t...@stellard.net> > > À : mesa-dev@lists.freedesktop.org > > Cc : Tom Stellard <thomas.stell...@amd.com> > > Envoyé le : Vendredi 9 novembre 2012 21h47 > > Objet : [Mesa-dev] [PATCH 4/4] R600: Fold immediates into ALU instructions > > when possible > > > > From: Tom Stellard <thomas.stell...@amd.com> > > > > --- > > lib/Target/AMDGPU/R600ISelLowering.cpp | 65 > > ++++++++++++++++++++++--- > > lib/Target/AMDGPU/R600InstrInfo.cpp | 40 +++++++++++++++ > > lib/Target/AMDGPU/R600InstrInfo.h | 12 +++++ > > test/CodeGen/R600/fcmp-cnd.ll | 4 +- > > test/CodeGen/R600/fcmp-cnde-int-args.ll | 2 +- > > test/CodeGen/R600/selectcc-icmp-select-float.ll | 2 +- > > test/CodeGen/R600/selectcc_cnde.ll | 2 +- > > test/CodeGen/R600/selectcc_cnde_int.ll | 2 +- > > 8 files changed, 117 insertions(+), 12 deletions(-) > > > > diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp > > b/lib/Target/AMDGPU/R600ISelLowering.cpp > > index 75a2a90..dc2247a 100644 > > --- a/lib/Target/AMDGPU/R600ISelLowering.cpp > > +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp > > @@ -131,15 +131,66 @@ MachineBasicBlock * > > R600TargetLowering::EmitInstrWithCustomInserter( > > } > > > > case AMDGPU::MOV_IMM_F32: > > - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), > > - MI->getOperand(1).getFPImm()->getValueAPF() > > - .bitcastToAPInt().getZExtValue()); > > - break; > > case AMDGPU::MOV_IMM_I32: > > - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), > > - MI->getOperand(1).getImm()); > > - break; > > + { > > + unsigned DstReg = MI->getOperand(0).getReg(); > > + bool CanReplace = true; > > + int64_t Imm; > > + unsigned ImmReg = AMDGPU::ALU_LITERAL_X; > > + bool IsInlineConstant = false; > > + > > + if (MI->getOpcode() == AMDGPU::MOV_IMM_F32) { > > + APFloat Value = MI->getOperand(1).getFPImm()->getValueAPF(); > > + float FloatValue = Value.convertToFloat(); > > + if (FloatValue == 
0.0f) { > > + ImmReg = AMDGPU::ZERO; > > + IsInlineConstant = true; > > + } else if (FloatValue == 1.0f) { > > + ImmReg = AMDGPU::ONE; > > + IsInlineConstant = true; > > + } > > + Imm = Value.bitcastToAPInt().getZExtValue(); > > + } else { > > + Imm = MI->getOperand(1).getImm(); > > + switch (Imm) { > > + case 0: > > + ImmReg = AMDGPU::ZERO; > > + IsInlineConstant = true; > > + break; > > + case 1: > > + ImmReg = AMDGPU::ONE_INT; > > + IsInlineConstant = true; > > + break; > > + } > > + } > > + > > + // Check all the uses of DstReg to make sure it is safe to fold the > > + // immediate. > > + for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(DstReg); > > + UI != MachineRegisterInfo::use_end(); > > ++UI) I think use_iterators are a bit tricky to use; I was told by people on #llvm that the > use_iterator has no guarantees on order, and is not a def-use chain per se. > The use iterator returns every machine operand marked as "use" or "implicit use" by a > machine instruction. For a virtual register that is defined only once, it matches the definition > of a def-use chain, but not for physical registers. For instance, in the > following block: > > vreg1 = MOV T2_X (1) > ... > T2_X = MOV vreg2 (2) > ... > vreg3 = MOV T2_X (3) > > the use iterator will return operands in instructions (1) and (3), even if the register > was redefined in (2). > This situation can occur in our backend in the InstrEmitterPass because we generate physical > register reads for some input loading intrinsics, and register writes for some > output storing intrinsics. >
Ok, so I guess use_iterator won't work for us here. > I think using a COPY instruction here would be better, because we'll let > the register coalescer fold literals for > us. > I think a COPY will work for AMDGPU::ZERO and AMDGPU::ONE, but probably not for when we use AMDGPU::ALU_LITERAL_X, because we can't add the literal to the copy instruction. I've thought about this more, and I think it might work to handle the constant folding in AMDGPUISelDAGToDAG::Select(), so I'm going to give this a try. This way we won't need to use the use_iterator and it should help the instruction scheduler do a better job of scheduling. -Tom > > { > > > > + if (!TII->canUseImmediate(UI.getOperand()) && > > + !(IsInlineConstant && > > TII->isALUInstr((*UI).getOpcode()))) { > > + CanReplace = false; > > + break; > > + } > > + } > > + > > + if (!CanReplace) { > > + TII->buildMovImm(*BB, I, DstReg, Imm); > > + } else { > > + if (!IsInlineConstant) { > > + // Add the immediate to the use instructions > > + for (MachineRegisterInfo::use_iterator UI = > > MRI.use_begin(DstReg); > > + UI != MachineRegisterInfo::use_end(); > > ++UI) { > > + TII->setImmOperand(&*UI, R600Operands::IMM, Imm); > > + } > > + } > > + // Update the use's register > > + MRI.replaceRegWith(DstReg, ImmReg); > > + } > > + break; > > + } > > > > case AMDGPU::RAT_WRITE_CACHELESS_32_eg: > > case AMDGPU::RAT_WRITE_CACHELESS_128_eg: > > diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp > > b/lib/Target/AMDGPU/R600InstrInfo.cpp > > index e437313..2ca5e45 100644 > > --- a/lib/Target/AMDGPU/R600InstrInfo.cpp > > +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp > > @@ -138,6 +138,15 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const > > { > > } > > } > > > > +bool R600InstrInfo::isALUInstr(unsigned Opcode) const > > +{ > > + unsigned TargetFlags = get(Opcode).TSFlags; > > + > > + return ((TargetFlags & R600_InstFlag::OP1) | > > + (TargetFlags & R600_InstFlag::OP2) | > > + (TargetFlags & R600_InstFlag::OP3)); > > +} > > + > > DFAPacketizer 
*R600InstrInfo::CreateTargetScheduleState(const TargetMachine > > *TM, > > const ScheduleDAG *DAG) const > > { > > @@ -582,6 +591,37 @@ void R600InstrInfo::setImmOperand(MachineInstr *MI, > > R600Operands::Ops Op, > > MI->getOperand(Idx).setImm(Imm); > > } > > > > +bool R600InstrInfo::operandUsesImmediate(const MachineInstr &MI, > > + R600Operands::Ops Op) const > > +{ > > + int OpIndex = getOperandIdx(MI, Op); > > + return OpIndex != -1 && > > + MI.getOperand(OpIndex).getReg() == AMDGPU::ALU_LITERAL_X; > > +} > > + > > +bool R600InstrInfo::canUseImmediate(const MachineOperand &MO) const > > +{ > > + const MachineInstr *MI = MO.getParent(); > > + > > + if (!isALUInstr(MI->getOpcode())) { > > + return false; > > + } > > + > > + // XXX: Handle cases when an instruction can reference an immediate from > > + // multiple operands: > > + // > > + // 1. Each operand uses the same immediate value > > + // 2. There is room in the instruction group (MIBundle) for another > > immediate. > > + // We need to start using bundles first. > > + if (operandUsesImmediate(*MI, R600Operands::SRC0) || > > + operandUsesImmediate(*MI, R600Operands::SRC1) || > > + operandUsesImmediate(*MI, R600Operands::SRC2)) { > > + return false; > > + } > > + > > + return true; > > +} > > + > > //===----------------------------------------------------------------------===// > > // Instruction flag getters/setters > > //===----------------------------------------------------------------------===// > > diff --git a/lib/Target/AMDGPU/R600InstrInfo.h > > b/lib/Target/AMDGPU/R600InstrInfo.h > > index cec1c3b..7c62a53 100644 > > --- a/lib/Target/AMDGPU/R600InstrInfo.h > > +++ b/lib/Target/AMDGPU/R600InstrInfo.h > > @@ -50,6 +50,9 @@ namespace llvm { > > bool isReductionOp(unsigned opcode) const; > > bool isCubeOp(unsigned opcode) const; > > > > + /// isALUInstr - Returns true if this Opcode represents an ALU > > instruction. 
> > + bool isALUInstr(unsigned Opcode) const; > > + > > /// isVector - Vector instructions are instructions that must fill all > > /// instruction slots within an instruction group. > > bool isVector(const MachineInstr &MI) const; > > @@ -133,6 +136,15 @@ namespace llvm { > > /// setImmOperand - Helper function for setting instruction flag values. > > void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) > > const; > > > > + /// operandUsesImmediate - Return true if it is legal for this > > instruction > > + /// to reference an immediate from one of its opreand and the given > > operand > > + /// is using an immediate. > > + bool operandUsesImmediate(const MachineInstr &MI, R600Operands::Ops Op) > > const; > > + > > + /// canUseImmediate - Return true if this operand can be replaced with an > > + /// immediate. > > + bool canUseImmediate(const MachineOperand &MO) const; > > + > > ///hasFlagOperand - Returns true if this instruction has an operand for > > /// storing target flags. > > bool hasFlagOperand(const MachineInstr &MI) const; > > diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll > > index c6b6236..a94cfb5 100644 > > --- a/test/CodeGen/R600/fcmp-cnd.ll > > +++ b/test/CodeGen/R600/fcmp-cnd.ll > > @@ -1,6 +1,8 @@ > > ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s > > > > -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], > > T[0-9]+\.[XYZW]}} > > +;Not checking arguments 2 and 3 to CNDE, because they may change between > > +;registers and literal.x depending on what the optimizer does. 
> > +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} > > > > define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { > > entry: > > diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll > > b/test/CodeGen/R600/fcmp-cnde-int-args.ll > > index 92f3b5f..5c981ef 100644 > > --- a/test/CodeGen/R600/fcmp-cnde-int-args.ll > > +++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll > > @@ -4,7 +4,7 @@ > > ; chance to optimize the fcmp + select instructions to CNDE was missed > > ; due to the fact that the operands to fcmp and select had different types > > > > -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], > > T[0-9]+\.[XYZW]}} > > +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}} > > > > define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { > > entry: > > diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll > > b/test/CodeGen/R600/selectcc-icmp-select-float.ll > > index f1f8ab1..f65a300 100644 > > --- a/test/CodeGen/R600/selectcc-icmp-select-float.ll > > +++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll > > @@ -2,7 +2,7 @@ > > > > ; Note additional optimizations may cause this SGT to be replaced with a > > ; CND* instruction. 
> > -; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], > > T[0-9]+\.[XYZW]}} > > +; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}} > > ; Test a selectcc with i32 LHS/RHS and float True/False > > > > define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { > > diff --git a/test/CodeGen/R600/selectcc_cnde.ll > > b/test/CodeGen/R600/selectcc_cnde.ll > > index e06a170..f0a0f51 100644 > > --- a/test/CodeGen/R600/selectcc_cnde.ll > > +++ b/test/CodeGen/R600/selectcc_cnde.ll > > @@ -1,7 +1,7 @@ > > ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s > > > > ;CHECK-NOT: SETE > > -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], > > T[0-9]+\.[XYZW]}} > > +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, > > [-0-9]+\(2.0}} > > define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { > > %1 = load float addrspace(1)* %in > > %2 = fcmp oeq float %1, 0.0 > > diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll > > b/test/CodeGen/R600/selectcc_cnde_int.ll > > index 03d000f..b38078e 100644 > > --- a/test/CodeGen/R600/selectcc_cnde_int.ll > > +++ b/test/CodeGen/R600/selectcc_cnde_int.ll > > @@ -1,7 +1,7 @@ > > ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s > > > > ;CHECK-NOT: SETE_INT > > -;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], > > T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} > > +;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}} > > define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { > > %1 = load i32 addrspace(1)* %in > > %2 = icmp eq i32 %1, 0 > > -- > > 1.7.11.4 > > > > _______________________________________________ > > mesa-dev mailing list > > mesa-dev@lists.freedesktop.org > > http://lists.freedesktop.org/mailman/listinfo/mesa-dev > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev