Oh, right. I think the NVIDIA blob executes those steps conditionally, only when the upper bits are not 0x7ff (i.e. the value is not infinity/NaN). I should do the same thing here. [FWIW, I was able to test the nv50 code last night, and that one's a total fail for rcp/rsq... I will need to port that over to my nvc0 and debug there.]
On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <srol...@vmware.com> wrote: > Does this give correct results for special floats (0, infs)? > We tried to improve (for single floats) x86 rcp in llvmpipe with > newton-raphson, but unfortunately not being able to give correct results > for these two cases (without even more additional code) meant it got all > disabled in the end (you can still see that code in the driver) since > the problems are at least as bad as those due to bad accuracy... > > Roland > > Am 23.02.2015 um 05:01 schrieb Ilia Mirkin: >> Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> >> --- >> >> Not sure how many steps are needed for the necessary accuracy. Just >> doing 2 because that seems like a reasonable number. >> >> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 >> ++++++++++++++++++++-- >> 1 file changed, 39 insertions(+), 3 deletions(-) >> >> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> index 87e75e1..9767566 100644 >> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) >> bld.setPosition(i, false); >> >> // 1. Take the source and it up. >> - Value *src[2], *dst[2], *def = i->getDef(0); >> - bld.mkSplit(src, 4, i->getSrc(0)); >> + Value *input = i->getSrc(0); >> + Value *src[2], *dst[2], *guess, *def = i->getDef(0); >> + bld.mkSplit(src, 4, input); >> >> // 2. We don't care about the low 32 bits of the destination. Stick a 0 >> in. >> dst[0] = bld.loadImm(NULL, 0); >> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) >> >> // 4. Recombine the two dst pieces back into the original destination. >> bld.setPosition(i, true); >> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); >> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]); >> + >> + // 5. 
Perform 2 Newton-Raphson steps >> + if (i->op == OP_RCP) { >> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 >> + Value *two = bld.getSSA(8); >> + >> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); >> + >> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, >> guess), >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, >> + bld.mkOp2v(OP_MUL, TYPE_F64, >> bld.getSSA(8), guess, guess))); >> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, >> guess), >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, >> + bld.mkOp2v(OP_MUL, TYPE_F64, >> bld.getSSA(8), guess, guess))); >> + } else { >> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2) >> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); >> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, >> -0.5f)); >> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, >> 1.5f)); >> + >> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, >> input); >> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) >> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, >> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), >> half_input, >> + bld.mkOp2v(OP_MUL, TYPE_F64, >> bld.getSSA(8), guess, guess), >> + three_half)); >> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, >> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), >> half_input, >> + bld.mkOp2v(OP_MUL, TYPE_F64, >> bld.getSSA(8), guess, guess), >> + three_half)); >> + } >> + >> + bld.mkMov(def, guess); >> } >> >> bool >> > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev