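Replacing the low 32-bit imul/imad multiplies in the gm107 integer division built-ins with xmad sequences seems to be about twice as fast, at the cost of larger code, using and clobbering one more register ($r4), and being a little more difficult to read.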
Signed-off-by: Rhys Perry <pendingchao...@gmail.com> --- src/gallium/drivers/nouveau/codegen/lib/gm107.asm | 92 ++++++++++++++------- .../drivers/nouveau/codegen/lib/gm107.asm.h | 94 +++++++++++++++------- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 2 +- 3 files changed, 128 insertions(+), 60 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm index 7ee5f8fc65..d7db3ee7fb 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm @@ -7,36 +7,56 @@ // // INPUT: $r0: dividend, $r1: divisor // OUTPUT: $r0: result, $r1: modulus -// CLOBBER: $r2 - $r3, $p0 - $p1 -// SIZE: 22 / 14 * 8 bytes +// CLOBBER: $r2 - $r4, $p0 - $p1 +// +// xmad $r4 s1 s0 s2 +// xmad mrg d0 s1 h1 s0 $r255 +// xmad psl cbcc d0 h1 s1 h1 d0 $r4 +// is d0 = s0 * s1 + s2 (clobbering $r4) // gm107_div_u32: sched (st 0xd wr 0x0 wt 0x3f) (st 0x1 wt 0x1) (st 0x6) flo u32 $r2 $r1 lop xor 1 $r2 $r2 0x1f mov $r3 0x1 0xf - sched (st 0x1) (st 0xf wr 0x0) (st 0x6 wr 0x0 wt 0x1) + sched (st 0x1) (st 0xf wr 0x0) (st 0x1 wt 0x1) shl $r2 $r3 $r2 i2i u32 u32 $r1 neg $r1 - imul u32 u32 $r3 $r1 $r2 - sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) + xmad $r4 $r2 $r1 $r255 + sched (st 0x6) (st 0x6) (st 0x6 wr 0x0) + xmad mrg $r3 $r2 h1 $r1 $r255 + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 imad u32 u32 hi $r2 $r2 $r3 $r2 - imul u32 u32 $r3 $r1 $r2 + sched (st 0x1 wt 0x1) (st 0x6) (st 0x6) + xmad $r4 $r2 $r1 $r255 + xmad mrg $r3 $r2 h1 $r1 $r255 + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 + sched (st 0x6 wr 0x0) (st 0x1 wt 0x1) (st 0x6) imad u32 u32 hi $r2 $r2 $r3 $r2 - sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) - imul u32 u32 $r3 $r1 $r2 + xmad $r4 $r2 $r1 $r255 + xmad mrg $r3 $r2 h1 $r1 $r255 + sched (st 0x6) (st 0x6 wr 0x0 wt 0x1) (st 0x1 wt 0x1) + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 imad u32 u32 hi $r2 $r2 $r3 $r2 - imul u32 u32 $r3 
$r1 $r2 - sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 rd 0x1 wt 0x1) + xmad $r4 $r2 $r1 $r255 + sched (st 0x6) (st 0x6) (st 0x6 wr 0x0) + xmad mrg $r3 $r2 h1 $r1 $r255 + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 imad u32 u32 hi $r2 $r2 $r3 $r2 - imul u32 u32 $r3 $r1 $r2 + sched (st 0x1 wt 0x1) (st 0x6) (st 0x6) + xmad $r4 $r2 $r1 $r255 + xmad mrg $r3 $r2 h1 $r1 $r255 + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 + sched (st 0x6 wr 0x0 rd 0x1) (st 0x6 wt 0x2) (st 0x6 wr 0x0 rd 0x1 wt 0x1) imad u32 u32 hi $r2 $r2 $r3 $r2 - sched (st 0x6 wt 0x2) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2) mov $r3 $r0 0xf imul u32 u32 hi $r0 $r0 $r2 + sched (st 0xf wr 0x0 rd 0x1 wt 0x2) (st 0x1 wt 0x3) (st 0x6) i2i u32 u32 $r2 neg $r1 - sched (st 0x6 wr 0x0 wt 0x3) (st 0xd wt 0x1) (st 0x1) - imad u32 u32 $r1 $r1 $r0 $r3 + xmad $r4 $r0 $r1 $r3 + xmad mrg $r1 $r0 h1 $r1 $r255 + sched (st 0x6) (st 0xd) (st 0x1) + xmad psl cbcc $r1 h1 $r0 h1 $r1 $r4 isetp ge u32 and $p0 1 $r1 $r2 1 $p0 iadd $r1 $r1 neg $r2 sched (st 0x5) (st 0xd) (st 0x1) @@ -52,7 +72,7 @@ gm107_div_u32: // // INPUT: $r0: dividend, $r1: divisor // OUTPUT: $r0: result, $r1: modulus -// CLOBBER: $r2 - $r3, $p0 - $p3 +// CLOBBER: $r2 - $r4, $p0 - $p3 // gm107_div_s32: sched (st 0xd wt 0x3f) (st 0x1) (st 0x1 wr 0x0) @@ -63,35 +83,51 @@ gm107_div_s32: i2i s32 s32 $r1 abs $r1 flo u32 $r2 $r1 lop xor 1 $r2 $r2 0x1f - sched (st 0x6) (st 0x1) (st 0xf wr 0x1) + sched (st 0x6) (st 0x1) (st 0xf wr 0x0) mov $r3 0x1 0xf shl $r2 $r3 $r2 i2i u32 u32 $r1 neg $r1 - sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) - imul u32 u32 $r3 $r1 $r2 + sched (st 0x1 wt 0x1) (st 0x6) (st 0x6) + xmad $r4 $r2 $r1 $r255 + xmad mrg $r3 $r2 h1 $r1 $r255 + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 + sched (st 0x6 wr 0x0) (st 0x1 wt 0x1) (st 0x6) imad u32 u32 hi $r2 $r2 $r3 $r2 - imul u32 u32 $r3 $r1 $r2 - sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) + xmad $r4 $r2 $r1 $r255 + xmad mrg 
$r3 $r2 h1 $r1 $r255 + sched (st 0x6) (st 0x6 wr 0x0) (st 0x1 wt 0x1) + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 imad u32 u32 hi $r2 $r2 $r3 $r2 - imul u32 u32 $r3 $r1 $r2 + xmad $r4 $r2 $r1 $r255 + sched (st 0x6) (st 0x6) (st 0x6 wr 0x0 wt 0x1) + xmad mrg $r3 $r2 h1 $r1 $r255 + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 imad u32 u32 hi $r2 $r2 $r3 $r2 - sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) - imul u32 u32 $r3 $r1 $r2 + sched (st 0x1 wt 0x1) (st 0x6) (st 0x6) + xmad $r4 $r2 $r1 $r255 + xmad mrg $r3 $r2 h1 $r1 $r255 + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 + sched (st 0x6 wr 0x0) (st 0x1 wt 0x1) (st 0x6) imad u32 u32 hi $r2 $r2 $r3 $r2 - imul u32 u32 $r3 $r1 $r2 - sched (st 0x6 wr 0x1 rd 0x2 wt 0x2) (st 0x2 wt 0x5) (st 0x6 wr 0x0 rd 0x1 wt 0x2) + xmad $r4 $r2 $r1 $r255 + xmad mrg $r3 $r2 h1 $r1 $r255 + sched (st 0x6) (st 0x6 wr 0x0 rd 0x1) (st 0x6 wt 0x2) + xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4 imad u32 u32 hi $r2 $r2 $r3 $r2 mov $r3 $r0 0xf + sched (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2) (st 0x1 wt 0x3) imul u32 u32 hi $r0 $r0 $r2 - sched (st 0xf wr 0x1 rd 0x2 wt 0x2) (st 0x6 wr 0x0 wt 0x5) (st 0xd wt 0x3) i2i u32 u32 $r2 neg $r1 - imad u32 u32 $r1 $r1 $r0 $r3 + xmad $r4 $r0 $r1 $r3 + sched (st 0x6) (st 0x6) (st 0xd) + xmad mrg $r1 $r0 h1 $r1 $r255 + xmad psl cbcc $r1 h1 $r0 h1 $r1 $r4 isetp ge u32 and $p0 1 $r1 $r2 1 sched (st 0x1) (st 0x5) (st 0xd) $p0 iadd $r1 $r1 neg $r2 $p0 iadd $r0 $r0 0x1 $p0 isetp ge u32 and $p0 1 $r1 $r2 1 - sched (st 0x1) (st 0x2) (st 0xf wr 0x0) + sched (st 0x1) (st 0x1) (st 0xf wr 0x0) $p0 iadd $r1 $r1 neg $r2 $p0 iadd $r0 $r0 0x1 $p3 i2i s32 s32 $r0 neg $r0 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h index 65c93f7ae8..ec8316b72d 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h @@ -4,28 +4,44 @@ uint64_t gm107_builtin_code[] = { 0x5c30000000170002, 
0x3847040001f70202, 0x3898078000170003, - 0x003c1800e1e007e1, + 0x003f8400e1e007e1, 0x5c48000000270302, 0x5ce0200000170a01, - 0x5c38000000270103, - 0x003c1801e0c00f06, + 0x5b007f8000170204, + 0x001c1800fcc007e6, + 0x5b007fa800170203, + 0x5b30021800370203, 0x5a40010000370202, - 0x5c38000000270103, + 0x001f9800fcc00fe1, + 0x5b007f8000170204, + 0x5b007fa800170203, + 0x5b30021800370203, + 0x001f9801fc200706, 0x5a40010000370202, - 0x003c1801e0c00f06, - 0x5c38000000270103, + 0x5b007f8000170204, + 0x5b007fa800170203, + 0x003f8401e0c007e6, + 0x5b30021800370203, 0x5a40010000370202, - 0x5c38000000270103, - 0x00241801e0c00f06, + 0x5b007f8000170204, + 0x001c1800fcc007e6, + 0x5b007fa800170203, + 0x5b30021800370203, 0x5a40010000370202, - 0x5c38000000270103, + 0x001f9800fcc00fe1, + 0x5b007f8000170204, + 0x5b007fa800170203, + 0x5b30021800370203, + 0x00241802fcc00106, 0x5a40010000370202, - 0x00443c0120c017e6, 0x5c98078000070003, 0x5c38008000270000, + 0x001f9803fc20110f, 0x5ce0200000170a02, - 0x001f8401fda01f06, - 0x5a00018000070101, + 0x5b00018000170004, + 0x5b007fa800170001, + 0x001f8400fda007e6, + 0x5b30021800170001, 0x5b6c038000270107, 0x5c11000000200101, 0x001f8400fda007e5, @@ -36,7 +52,7 @@ uint64_t gm107_builtin_code[] = { 0x3810000000100000, 0xe32000000007000f, 0x50b0000000070f00, -/* 0x0120: gm107_div_s32 */ +/* 0x01a0: gm107_div_s32 */ 0x001c0400fc21ffed, 0x5b6303800ff70017, 0x5b6341000ff7011f, @@ -45,35 +61,51 @@ uint64_t gm107_builtin_code[] = { 0x5ce2000000173a01, 0x5c30000000170002, 0x3847040001f70202, - 0x001cbc00fc2007e6, + 0x001c3c00fc2007e6, 0x3898078000170003, 0x5c48000000270302, 0x5ce0200000170a01, - 0x005c9802e4c01726, - 0x5c38000000270103, + 0x001f9800fcc00fe1, + 0x5b007f8000170204, + 0x5b007fa800170203, + 0x5b30021800370203, + 0x001f9801fc200706, 0x5a40010000370202, - 0x5c38000000270103, - 0x005c9802e4c01726, + 0x5b007f8000170204, + 0x5b007fa800170203, + 0x003f8400e0c007e6, + 0x5b30021800370203, 0x5a40010000370202, - 0x5c38000000270103, + 0x5b007f8000170204, + 
0x003c1800fcc007e6, + 0x5b007fa800170203, + 0x5b30021800370203, 0x5a40010000370202, - 0x005c9802e4c01726, - 0x5c38000000270103, + 0x001f9800fcc00fe1, + 0x5b007f8000170204, + 0x5b007fa800170203, + 0x5b30021800370203, + 0x001f9801fc200706, 0x5a40010000370202, - 0x5c38000000270103, - 0x00441805fc401226, + 0x5b007f8000170204, + 0x5b007fa800170203, + 0x005f980020c007e6, + 0x5b30021800370203, 0x5a40010000370202, 0x5c98078000070003, + 0x007f840221e00906, 0x5c38008000270000, - 0x007fb405e0c0122f, 0x5ce0200000170a02, - 0x5a00018000070101, + 0x5b00018000170004, + 0x001fb400fcc007e6, + 0x5b007fa800170001, + 0x5b30021800170001, 0x5b6c038000270107, 0x001fb400fca007e1, 0x5c11000000200101, 0x3810000000100000, 0x5b6c038000200107, - 0x001c3c00fc4007e1, + 0x001c3c00fc2007e1, 0x5c11000000200101, 0x3810000000100000, 0x5ce0200000033a00, @@ -81,8 +113,8 @@ uint64_t gm107_builtin_code[] = { 0x5ce0200000123a01, 0xe32000000007000f, 0x50b0000000070f00, -/* 0x0280: gm107_rcp_f64 */ -/* 0x0280: gm107_rsq_f64 */ +/* 0x0380: gm107_rcp_f64 */ +/* 0x0380: gm107_rsq_f64 */ 0x001f8000fc0007e0, 0xe32000000007000f, 0x50b0000000070f00, @@ -91,7 +123,7 @@ uint64_t gm107_builtin_code[] = { uint64_t gm107_builtin_offsets[] = { 0x0000000000000000, - 0x0000000000000120, - 0x0000000000000280, - 0x0000000000000280, + 0x00000000000001a0, + 0x0000000000000380, + 0x0000000000000380, }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 597dcdffbe..d51ebbe258 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -74,7 +74,7 @@ NVC0LegalizeSSA::handleDIV(Instruction *i) } call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1); - bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2); + bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 
0x1e : 0x1d, 2); bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0); call->fixed = 1; -- 2.14.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev