Hello Ian, Since I am working on a direct SPIR-V to NV50 IR translator, ultimately to be used for OpenCL kernels, I will still need the patch for that work. (I even wrote that patch because I needed it when handling 64-bit addresses. :-) ) But thanks for the heads-up!
Pierre On 02:07 pm - Oct 17 2016, Ian Romanick wrote: > I don't know if it will make this patch unnecessary, but I have a GLSL > IR-level lowering pass for 64-bit multiplication. I'm going to send > that out with the rest of the GL_ARB_gpu_shader_int64 series within the > next day or so. > > On 10/15/2016 03:24 PM, Pierre Moreau wrote: > > Hardware does not support 64-bit integer MAD and MUL operations, so we need > > to transform them into 32-bit operations. > > > > Signed-off-by: Pierre Moreau <[email protected]> > > --- > > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 > > +++++++++++++++++++++ > > 1 file changed, 121 insertions(+) > > > > Tested with (the GPU result was compared to the CPU result): > > * 0xfffffffffffffff3lu * 0xfffffffffffffff2lu + 0x8000000700000002lu > > * 0xfffffffffffffff3lu * 0x8000000700000002lu + 0x8000000700000002lu > > * 0x8000000100000003lu * 0xfffffffffffffff2lu + 0x8000000700000002lu > > * 0x8000000100000003lu * 0x8000000700000002lu + 0x8000000700000002lu > > > > * -523456791234l * 929835793793l + -100005793793l > > * 523456791234l * 929835793793l + -100005793793l > > * -523456791234l * -929835793793l + -100005793793l > > * 523456791234l * -929835793793l + -100005793793l > > > > v2: > > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin) > > * Move pass prior to Register Allocation, as some temporaries need to > > be created. 
> > > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > index d88bb34..a610eb5 100644 > > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i) > > > > // > > ============================================================================= > > > > +// Split 64-bit MUL and MAD > > +class Split64BitOpPreRA : public Pass > > +{ > > +private: > > + virtual bool visit(BasicBlock *); > > + void split64BitReg(Function *, Instruction *, Instruction *, > > + Instruction *, Value *, int); > > + void split64MulMad(Function *, Instruction *, DataType); > > + > > + BuildUtil bld; > > +}; > > + > > +bool > > +Split64BitOpPreRA::visit(BasicBlock *bb) > > +{ > > + Instruction *i, *next; > > + Modifier mod; > > + > > + for (i = bb->getEntry(); i; i = next) { > > + next = i->next; > > + > > + if (typeSizeof(i->dType) != 8) > > + continue; > > + > > + DataType hTy; > > + switch (i->dType) { > > + case TYPE_U64: hTy = TYPE_U32; break; > > + case TYPE_S64: hTy = TYPE_S32; break; > > + default: > > + continue; > > + } > > + > > + if (i->op == OP_MAD || i->op == OP_MUL) > > + split64MulMad(bb->getFunction(), i, hTy); > > + } > > + > > + return true; > > +} > > + > > +void > > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType > > hTy) > > +{ > > + assert(i->op == OP_MAD || i->op == OP_MUL); > > + if (isFloatType(i->dType) || isFloatType(i->sType)) > > + return; > > + > > + bld.setPosition(i, true); > > + > > + Value *zero = bld.mkImm(0u); > > + Value *carry = bld.getSSA(1, FILE_FLAGS); > > + > > + // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit > > + // values (a, b and c might be 32-bit values), using 32-bit operations. 
> > This > > + // gives the following operations: > > + // * `d.low = low(a.low * b.low) (+ c.low)?` > > + // * `d.high = low(a.high * b.low) + low(a.low * b.high) > > + // + high(a.low * b.low) (+ c.high)?` > > + // > > + // To compute the high bits, we can split in the following operations: > > + // * `tmp1 = low(a.high * b.low) (+ c.high)?` > > + // * `tmp2 = low(a.low * b.high) + tmp1` > > + // * `d.high = high(a.low * b.low) + tmp2` > > + // > > + // mkSplit put lower bits at index 0 and higher bits at index 1 > > + > > + Value *op1[2]; > > + if (i->getSrc(0)->reg.size == 8) > > + bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0)); > > + else { > > + op1[0] = i->getSrc(0); > > + op1[1] = zero; > > + } > > + Value *op2[2]; > > + if (i->getSrc(1)->reg.size == 8) > > + bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1)); > > + else { > > + op2[0] = i->getSrc(1); > > + op2[1] = zero; > > + } > > + > > + Value *op3[2] = { NULL, NULL }; > > + if (i->op == OP_MAD) { > > + if (i->getSrc(2)->reg.size == 8) > > + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2)); > > + else { > > + op3[0] = i->getSrc(2); > > + op3[1] = zero; > > + } > > + } > > + > > + Value *tmpRes1Hi = bld.getSSA(); > > + if (i->op == OP_MAD) > > + bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]); > > + else > > + bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]); > > + > > + Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], > > op2[1], tmpRes1Hi); > > + > > + Value *def[2] = { bld.getSSA(), bld.getSSA() }; > > + > > + // If it was a MAD, add the carry from the low bits > > + // It is not needed if it was a MUL, since we added high(a.low * b.low) > > to > > + // d.high > > + if (i->op == OP_MAD) > > + bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], > > op3[0])->setFlagsDef(1, carry); > > + else > > + bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]); > > + > > + Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], > > tmpRes2Hi); > > + hiPart3->subOp = 
NV50_IR_SUBOP_MUL_HIGH; > > + if (i->op == OP_MAD) > > + hiPart3->setFlagsSrc(3, carry); > > + > > + bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]); > > + > > + delete_Instruction(fn->getProgram(), i); > > +} > > + > > +// > > ============================================================================= > > + > > static inline void > > updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn) > > { > > @@ -3523,6 +3643,7 @@ Program::optimizeSSA(int level) > > RUN_PASS(2, ModifierFolding, run); // before load propagation -> less > > checks > > RUN_PASS(1, ConstantFolding, foldAll); > > RUN_PASS(2, LateAlgebraicOpt, run); > > + RUN_PASS(1, Split64BitOpPreRA, run); > > RUN_PASS(1, LoadPropagation, run); > > RUN_PASS(1, IndirectPropagation, run); > > RUN_PASS(2, MemoryOpt, run); > > >
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
