Acked-by: Chris Forbes <[email protected]>
On Wed, Oct 2, 2013 at 8:38 AM, Paul Berry <[email protected]> wrote: > Previously, we computed dFdy() using the following instruction: > > add(8) dst<1>F src<4,4,0)F -src.2<4,4,0>F { align1 1Q } > > That had the disadvantage that it computed the same value for all 4 > pixels of a 2x2 subspan, which meant that it was less accurate than > dFdx(). This patch changes it to the following instruction when > c->key.high_quality_derivatives is set: > > add(8) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1Q } > > This gives it comparable accuracy to dFdx(). > > Unfortunately, for some reason the SIMD16 version of this instruction: > > add(16) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1H } > > Doesn't seem to work reliably (presumably the hardware designers never > validated the combination of align16 mode with compressed > instructions), so we unroll it to: > > add(8) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1Q } > add(8) (dst+1)<1>F (src+1)<4,4,1>.xyxyF -(src+1)<4,4,1>.zwzwF { align16 2Q } > > Fixes piglit test spec/glsl-1.10/execution/fs-dfdy-accuracy. > --- > > Note: this patch needs to be applied atop Chia-I Wu's "i965: compute > DDX in a subspan based only on top row" > (http://lists.freedesktop.org/archives/mesa-dev/2013-September/045544.html). > > src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 77 > +++++++++++++++++++------- > src/mesa/drivers/dri/i965/brw_reg.h | 1 + > 2 files changed, 58 insertions(+), 20 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > index 9eb5e17..4efedc57 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > @@ -560,10 +560,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg > dst, struct brw_reg src > * sample_d. On at least Haswell, sample_d instruction does some > * optimizations if the same LOD is used for all pixels in the subspan. > * > - * For DDY, it's harder, as we want to produce the pairs swizzled between > each > - * other. We could probably do it like ddx and swizzle the right order > later, > - * but bail for now and just produce > - * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) > + * For DDY, we need to use ALIGN16 mode since it's capable of doing the > + * appropriate swizzling. > */ > void > fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg > src) > @@ -604,22 +602,61 @@ void > fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg > src, > bool negate_value) > { > - struct brw_reg src0 = brw_reg(src.file, src.nr, 0, > - BRW_REGISTER_TYPE_F, > - BRW_VERTICAL_STRIDE_4, > - BRW_WIDTH_4, > - BRW_HORIZONTAL_STRIDE_0, > - BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); > - struct brw_reg src1 = brw_reg(src.file, src.nr, 2, > - BRW_REGISTER_TYPE_F, > - BRW_VERTICAL_STRIDE_4, > - BRW_WIDTH_4, > - BRW_HORIZONTAL_STRIDE_0, > - BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); > - if (negate_value) > - brw_ADD(p, dst, src1, negate(src0)); > - else > - brw_ADD(p, dst, src0, negate(src1)); > + if (c->key.high_quality_derivatives) { > + /* produce accurate derivatives */ > + struct brw_reg src0 = brw_reg(src.file, src.nr, 0, > + BRW_REGISTER_TYPE_F, > + BRW_VERTICAL_STRIDE_4, > + BRW_WIDTH_4, > + BRW_HORIZONTAL_STRIDE_1, > + BRW_SWIZZLE_XYXY, WRITEMASK_XYZW); > + struct brw_reg src1 = brw_reg(src.file, src.nr, 0, > + BRW_REGISTER_TYPE_F, > + BRW_VERTICAL_STRIDE_4, > + BRW_WIDTH_4, > + BRW_HORIZONTAL_STRIDE_1, > + BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW); > + brw_push_insn_state(p); > + brw_set_access_mode(p, BRW_ALIGN_16); > + brw_set_compression_control(p, BRW_COMPRESSION_NONE); > + if (negate_value) > + brw_ADD(p, dst, src1, negate(src0)); > + else > + brw_ADD(p, dst, src0, negate(src1)); > + if (dispatch_width == 16) { > + /* For some reason, instruction compression doesn't seem to work > + * properly with ALIGN16 mode, so when doing a 16-wide dispatch, > just > + * manually unroll to two instructions. > + */ > + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); > + src0 = sechalf(src0); > + src1 = sechalf(src1); > + dst = sechalf(dst); > + if (negate_value) > + brw_ADD(p, dst, src1, negate(src0)); > + else > + brw_ADD(p, dst, src0, negate(src1)); > + } > + brw_pop_insn_state(p); > + } else { > + /* replicate the derivative at the top-left pixel to other pixels */ > + struct brw_reg src0 = brw_reg(src.file, src.nr, 0, > + BRW_REGISTER_TYPE_F, > + BRW_VERTICAL_STRIDE_4, > + BRW_WIDTH_4, > + BRW_HORIZONTAL_STRIDE_0, > + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); > + struct brw_reg src1 = brw_reg(src.file, src.nr, 2, > + BRW_REGISTER_TYPE_F, > + BRW_VERTICAL_STRIDE_4, > + BRW_WIDTH_4, > + BRW_HORIZONTAL_STRIDE_0, > + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); > + if (negate_value) > + brw_ADD(p, dst, src1, negate(src0)); > + else > + brw_ADD(p, dst, src0, negate(src1)); > + } > } > > void > diff --git a/src/mesa/drivers/dri/i965/brw_reg.h > b/src/mesa/drivers/dri/i965/brw_reg.h > index 6df3366..3ee3543 100644 > --- a/src/mesa/drivers/dri/i965/brw_reg.h > +++ b/src/mesa/drivers/dri/i965/brw_reg.h > @@ -77,6 +77,7 @@ extern "C" { > #define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2) > #define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3) > #define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) > +#define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3) > > static inline bool > brw_is_single_value_swizzle(int swiz) > -- > 1.8.4 > > _______________________________________________ > mesa-dev mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/mesa-dev
