from:"Chia\-I Wu"

Re: [Mesa-dev] [PATCH 3/4] ilo: unref old fence

2015-07-09 Thread Chia-I Wu

On Thu, Jul 9, 2015 at 8:46 AM, Rob Clark  wrote:
> From: Rob Clark 
>
> Some, but not all, state trackers will explicitly unref (and set to
> NULL) the previous *fence before calling pipe->flush().  So driver
> should use fence_ref() which will unref the old fence if not NULL.
Looks good.  Thanks.
>
> Signed-off-by: Rob Clark 
> ---
>  src/gallium/drivers/ilo/ilo_context.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/src/gallium/drivers/ilo/ilo_context.c 
> b/src/gallium/drivers/ilo/ilo_context.c
> index 3d5c7b6..b9a16aa 100644
> --- a/src/gallium/drivers/ilo/ilo_context.c
> +++ b/src/gallium/drivers/ilo/ilo_context.c
> @@ -62,6 +62,8 @@ ilo_flush(struct pipe_context *pipe,
>   (flags & PIPE_FLUSH_END_OF_FRAME) ? "frame end" : "user request");
>
> if (f) {
> +  struct pipe_screen *screen = pipe->screen;
> +  screen->fence_reference(screen, f, NULL);
>*f = ilo_screen_fence_create(pipe->screen, ilo->cp->last_submitted_bo);
> }
>  }
> --
> 2.4.3
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 1/3] ilo: EOL unplumb unmaintained gallium drv from winsys

2017-02-02 Thread Chia-I Wu

All three are

Acked-by:Chia-I Wu 

On Thu, Feb 2, 2017 at 12:15 AM, Edward O'Callaghan
 wrote:
> This is no longer actively maintained and is just
> accumulating bitrot.
>
> Signed-off-by: Edward O'Callaghan 
> ---
>  .../auxiliary/pipe-loader/pipe_loader_drm.c|  5 ---
>  src/gallium/auxiliary/target-helpers/drm_helper.h  | 29 -
>  src/gallium/targets/dri/target.c   |  3 --
>  src/gallium/targets/pipe-loader/pipe_i965.c| 47 
> --
>  src/gallium/winsys/intel/drm/intel_drm_winsys.c|  1 -
>  5 files changed, 85 deletions(-)
>  delete mode 100644 src/gallium/targets/pipe-loader/pipe_i965.c
>
> diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c 
> b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
> index 6c89fe5..09549e5 100644
> --- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
> +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
> @@ -105,11 +105,6 @@ static const struct drm_driver_descriptor 
> driver_descriptors[] = {
>  },
>  #endif
>  {
> -.driver_name = "i965",
> -.create_screen = pipe_ilo_create_screen,
> -.configuration = configuration_query,
> -},
> -{
>  .driver_name = "nouveau",
>  .create_screen = pipe_nouveau_create_screen,
>  .configuration = configuration_query,
> diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h 
> b/src/gallium/auxiliary/target-helpers/drm_helper.h
> index f847b17..3159df6 100644
> --- a/src/gallium/auxiliary/target-helpers/drm_helper.h
> +++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
> @@ -34,35 +34,6 @@ pipe_i915_create_screen(int fd)
>
>  #endif
>
> -#ifdef GALLIUM_ILO
> -#include "intel/drm/intel_drm_public.h"
> -#include "ilo/ilo_public.h"
> -
> -struct pipe_screen *
> -pipe_ilo_create_screen(int fd)
> -{
> -   struct intel_winsys *iws;
> -   struct pipe_screen *screen;
> -
> -   iws = intel_winsys_create_for_fd(fd);
> -   if (!iws)
> -  return NULL;
> -
> -   screen = ilo_screen_create(iws);
> -   return screen ? debug_screen_wrap(screen) : NULL;
> -}
> -
> -#else
> -
> -struct pipe_screen *
> -pipe_ilo_create_screen(int fd)
> -{
> -   fprintf(stderr, "ilo: driver missing\n");
> -   return NULL;
> -}
> -
> -#endif
> -
>  #ifdef GALLIUM_NOUVEAU
>  #include "nouveau/drm/nouveau_drm_public.h"
>
> diff --git a/src/gallium/targets/dri/target.c 
> b/src/gallium/targets/dri/target.c
> index 441a27f..dba18cc 100644
> --- a/src/gallium/targets/dri/target.c
> +++ b/src/gallium/targets/dri/target.c
> @@ -151,9 +151,6 @@ const __DRIextension 
> **__driDriverGetExtensions_i965(void);
>   * i965 driver so that you can just make a directory with a link from
>   * i965_dri.so to the built vc4_dri.so, and point LIBGL_DRIVERS_PATH to that
>   * on your i965-using host to run the driver under simulation.
> - *
> - * This is, of course, incompatible with building with the ilo driver, but 
> you
> - * shouldn't be building that anyway.
>   */
>  PUBLIC const __DRIextension **__driDriverGetExtensions_i965(void)
>  {
> diff --git a/src/gallium/targets/pipe-loader/pipe_i965.c 
> b/src/gallium/targets/pipe-loader/pipe_i965.c
> deleted file mode 100644
> index a2d8deb..000
> --- a/src/gallium/targets/pipe-loader/pipe_i965.c
> +++ /dev/null
> @@ -1,47 +0,0 @@
> -#include "target-helpers/inline_debug_helper.h"
> -#include "state_tracker/drm_driver.h"
> -#include "intel/drm/intel_drm_public.h"
> -#include "ilo/ilo_public.h"
> -
> -static struct pipe_screen *
> -create_screen(int fd)
> -{
> -   struct intel_winsys *iws;
> -   struct pipe_screen *screen;
> -
> -   iws = intel_winsys_create_for_fd(fd);
> -   if (!iws)
> -  return NULL;
> -
> -   screen = ilo_screen_create(iws);
> -   if (!screen)
> -  return NULL;
> -
> -   screen = debug_screen_wrap(screen);
> -
> -   return screen;
> -}
> -static const struct drm_conf_ret throttle_ret = {
> -   .type = DRM_CONF_INT,
> -   .val.val_int = 2,
> -};
> -
> -static const struct drm_conf_ret share_fd_ret = {
> -   .type = DRM_CONF_BOOL,
> -   .val.val_int = true,
> -};
> -
> -static const struct drm_conf_ret *drm_configuration(enum drm_conf conf)
> -{
> -   switch (conf) {
> -   case DRM_CONF_THROTTLE:
> -  return &throttle_ret;
> -   case DRM_CONF_SHARE_FD:
> -  return &share_fd_ret;
> -   default:
> -  break;
> -   }
> -   return NULL;
> -}
> -PUBLIC
> -DRM_DRIVER_DESCRIPTOR("i965", create_screen, drm_configuration)
> diff --git a/src/gallium/winsys/intel/drm/intel_drm_winsys.c 
> b/src/gallium/winsys/intel/drm/intel_drm_winsys.c
> index d3bc430..63c5c8b 100644
> --- a/src/gallium/winsys/intel/drm/intel_drm_winsys.c
> +++ b/src/gallium/winsys/intel/drm/intel_drm_winsys.c
> @@ -41,7 +41,6 @@
>  #include "util/u_inlines.h"
>  #include "util/u_memory.h"
>  #include "util/u_debug.h"
> -#include "ilo/core/intel_winsys.h"
>  #include "intel_drm_public.h"
>
>  struct intel_winsys {
> --
> 2.9.3
>
___
mesa-d

Re: [Mesa-dev] Potentially EOL ilo gallium driver

2016-12-07 Thread Chia-I Wu

Hi all,

Sorry for the slow response.  I think it is fine to drop the driver :(

Not because the driver is currently unmaintained, which is very true
and is a very good reason, but because there is now an Intel Vulkan
driver.  Vulkan is about as low-level as Gallium is (or even
lower-level).  The driver has most things I like to see as well (low
CPU overhead, minimal/predictable heap allocation, generated register
descriptions, etc.).  Sorry for the confusion and burden it brings to
others, and thanks to the few individuals/groups who find it useful
for their needs at various times.


On Thu, Dec 8, 2016 at 8:33 AM, Edward O'Callaghan
 wrote:
>
>
> On 12/08/2016 11:28 AM, Roland Scheidegger wrote:
>> I haven't seen the driver author's opinion on this yet, so it's probably
>> fair to give him some more time to answer. It's not like this is really
>> urgent...
>
> Absolutely!
>
>>
>> Roland
>>
>> Am 08.12.2016 um 01:11 schrieb Edward O'Callaghan:
>>> Hi all,
>>>
>>> So I'll get right to the crux of this; In summary the consensus would
>>> then be to drop ilo?
>>>
>>> If so, I am not sure of this community's procedure? However, if it helps
>>> the patch is here:
>>> https://cgit.freedesktop.org/~funfunctor/mesa/log/?h=eol-ilo
>>>
>>> Kind Regards,
>>> Edward.
>>>
>>> On 12/07/2016 07:08 AM, Ilia Mirkin wrote:
>>>> On Tue, Dec 6, 2016 at 3:00 PM, Rob Clark  wrote:
>>>>> On Tue, Dec 6, 2016 at 2:11 PM, Jason Ekstrand  
>>>>> wrote:
>>>>>> On Tue, Dec 6, 2016 at 8:39 AM, Rob Clark  wrote:
>>>>>>>
>>>>>>> On Tue, Dec 6, 2016 at 8:42 AM, Emil Velikov 
>>>>>>> wrote:
>>>>>>>> On 6 December 2016 at 03:16, Edward O'Callaghan
>>>>>>>>  wrote:
>>>>>>>>> This patch is to potentially remove ourselves from the maintenance
>>>>>>>>> burden of the ilo driver that appears to now be essentially
>>>>>>>>> unmaintained?
>>>>>>>>>
>>>>>>>>> I am not sure of our policy here or if there are too many
>>>>>>>>> users so this patch is really only to gauge a response of
>>>>>>>>> how folks feel?
>>>>>>>>>
>>>>>>>> Surely you want to CC the core/sole developer of the driver when
>>>>>>>> considering its removal.
>>>>>>>> Maybe mailman was "nice" and hid his email in the header ;-)
>>>>>>>>
>>>>>>>> Either way adding Chia-I Wu to the list.
>>>>>>>>
>>>>>>>> -Emil
>>>>>>>> P.S. Not sure/sold how much of an actual burden the driver is, yet I
>>>>>>>> don't make serious gallium infra changes.
>>>>>>>
>>>>>>> really hasn't been a problem for me..
>>>>>>>
>>>>>>> That said, it would be nice if someday someone wired this up to use
>>>>>>> glsl_to_nir path in gallium and re-used i965's nir backend.  I think
>>>>>>> that would make ilo somewhat more interesting..
>>>>>>
>>>>>>
>>>>>> We had a bit of a chat about this on IRC and what I told Ilia there was 
>>>>>> that
>>>>>> the more interesting thing to do, if someone really wanted to do Intel on
>>>>>> gallium, would probably be to build a new driver based on ISL, blorp, the
>>>>>> i965 compiler, NIR, and genxml.  We've made a pretty good driver-building
>>>>>> toolbox.  Having an almost unmaintained driver that has its own
>>>>>> hand-rolled
>>>>>> and inferior compiler, surface layout, etc. isn't doing much good.
>>>>>>
>>>>>
>>>>> yeah, reusing the other bits would be nice too, and hopefully would be
>>>>> the long term goal if someone were to spend time on this.. I guess
>>>>> I'd prefer a more incremental approach of converting parts one by one
>>>>> if I were doing it myself.  It's kind of a moot point either way until
>>>>> someone has time/motivation to spend on it.
>>>>>
>>>>> But I've no real objection to dropping ilo until then if others feel
>>>>> strongly.. it's still there in git history so it can be resurrected if
>>>>> someone wants to convert to reuse other i965 bits incrementally rather
>>>>> than starting from scratch.
>>>>
>>>> As mentioned on IRC, I think the real use-case that ilo could cover
>>>> that i965/anv can't (easily) handle is acting as a gallium-nine
>>>> backend. (I know someone's working on DX9 over vulkan, but that's
>>>> hardly ready, and will never be available on gen6.)
>>>>
>>>> However at this time, it's not sufficiently functional to handle
>>>> gallium-nine, so I don't see any serious downside to dropping it.
>>>>
>>>>   -ilia
>>>>
>>>
>>>
>>>
>>> ___
>>> mesa-dev mailing list
>>> mesa-dev@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>>
>>
>
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH RFC 0/6] i965: emulate SIMD16 sample_d with dual SIMD8 ones

2013-09-30 Thread Chia-I Wu

From: Chia-I Wu 

Hi,

This series of patches implements the emulation of SIMD16 sample_d with dual
SIMD8 sample_d.  Before the changes, the compiler would fail to generate
SIMD16 code for fragment shaders that use textureGrad, and that hurts
performance.

The first four patches prepare the compiler for supporting SIMD8 sampler
messages in SIMD16 mode.  The last two patches implement the emulation.  For
some changes, there is more than one way to achieve the same goal.  That is
why this series is marked RFC.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH RFC 1/6] i965: make BRW_COMPRESSION_2NDHALF valid for brw_SAMPLE

2013-09-30 Thread Chia-I Wu

From: Chia-I Wu 

SIMD8 sampler messages are allowed in SIMD16 mode, and they could not work
without BRW_COMPRESSION_2NDHALF.  Later PRMs (gen5 and later) do not
explicitly state whether BRW_COMPRESSION_2NDHALF is allowed, but they do have
examples using send with SecHalf.  It should be safe to assume SecHalf is
valid.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_eu_emit.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c 
b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 7ed3df0..12515ec 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -2195,7 +2195,22 @@ void brw_SAMPLE(struct brw_compile *p,
 
insn = next_insn(p, BRW_OPCODE_SEND);
insn->header.predicate_control = 0; /* XXX */
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
+
+   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
+*
+*"Instruction compression is not allowed for this instruction (that
+* is, send). The hardware behavior is undefined if this instruction is
+* set as compressed. However, compress control can be set to "SecHalf"
+* to affect the EMask generation."
+*
+* No similar wording is found in later PRMs, but there are examples
+* utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
+* are allowed in SIMD16 mode and they could not work without SecHalf.  For
+* these reasons, we allow BRW_COMPRESSION_2NDHALF here.
+*/
+   if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
+  insn->header.compression_control = BRW_COMPRESSION_NONE;
+
if (brw->gen < 6)
   insn->header.destreg__conditionalmod = msg_reg_nr;
 
-- 
1.8.3.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH RFC 2/6] i965: allow SIMD8 sampler messages in SIMD16 mode

2013-09-30 Thread Chia-I Wu

From: Chia-I Wu 

When the instruction to send the sampler message is forced uncompressed or
sechalf, send a SIMD8 one even in SIMD16 mode.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 4475058..9406f7b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -381,7 +381,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
   break;
}
 
-   if (dispatch_width == 16)
+   if (dispatch_width == 16 &&
+  !inst->force_uncompressed && !inst->force_sechalf)
   simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 
if (brw->gen >= 5) {
-- 
1.8.3.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH RFC 3/6] i965: add FS_OPCODE_OVERWRITE_DST

2013-09-30 Thread Chia-I Wu

From: Chia-I Wu 

FS_OPCODE_OVERWRITE_DST is used to indicate that the destination register is
(completely) overwritten.  No code is emitted, but the liveness analysis can
use it as a hint to add the destination register to DEF bitset.  This is
needed because it is hard to figure out if some partial writes combined
constitute a complete write during liveness analysis, while it is easier for
the FS visitor to know if that is the case.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_defines.h | 1 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp  | 4 
 src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp | 5 +++--
 src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 3 ++-
 src/mesa/drivers/dri/i965/brw_shader.cpp| 3 +++
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index b14c346..2618180 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -789,6 +789,7 @@ enum opcode {
FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
FS_OPCODE_PLACEHOLDER_HALT,
+   FS_OPCODE_OVERWRITE_DST,
 
VS_OPCODE_URB_WRITE,
VS_OPCODE_SCRATCH_READ,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 9406f7b..2b179b6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1484,6 +1484,10 @@ fs_generator::generate_code(exec_list *instructions)
  patch_discard_jumps_to_fb_writes();
  break;
 
+  case FS_OPCODE_OVERWRITE_DST:
+ /* This is to help liveness analysis. */
+ break;
+
   default:
 if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(ctx, "Unsupported opcode `%s' in FS",
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index f5daab2..13891f8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -77,8 +77,9 @@ fs_live_variables::setup_def_use()
  * variable, and thus qualify for being in def[].
  */
 if (inst->dst.file == GRF &&
-inst->regs_written == v->virtual_grf_sizes[inst->dst.reg] &&
-!inst->is_partial_write()) {
+(inst->opcode == FS_OPCODE_OVERWRITE_DST ||
+ (inst->regs_written == v->virtual_grf_sizes[inst->dst.reg] &&
+  !inst->is_partial_write( {
int reg = inst->dst.reg;
 if (!BITSET_TEST(bd[b].use, reg))
BITSET_SET(bd[b].def, reg);
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp 
b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 5530683..4e59a10 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -562,7 +562,8 @@ fs_instruction_scheduler::calculate_deps()
   schedule_node *n = (schedule_node *)node;
   fs_inst *inst = (fs_inst *)n->inst;
 
-  if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
+  if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+  inst->opcode == FS_OPCODE_OVERWRITE_DST)
  add_barrier_deps(n);
 
   /* read-after-write deps. */
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
b/src/mesa/drivers/dri/i965/brw_shader.cpp
index a558d36..78029e2 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -485,6 +485,9 @@ brw_instruction_name(enum opcode op)
case FS_OPCODE_PLACEHOLDER_HALT:
   return "placeholder_halt";
 
+   case FS_OPCODE_OVERWRITE_DST:
+  return "overwrite_dst";
+
case VS_OPCODE_URB_WRITE:
   return "vs_urb_write";
case VS_OPCODE_SCRATCH_READ:
-- 
1.8.3.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH RFC 4/6] i965: keep SecHalf flag after register coalescing

2013-09-30 Thread Chia-I Wu

From: Chia-I Wu 

Copy sechalf to the new register, otherwise we would read wrong HW registers.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2ebadc8..8991ee8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2217,6 +2217,7 @@ fs_visitor::register_coalesce()
   new_src.abs = 1;
}
   new_src.negate ^= scan_inst->src[i].negate;
+  new_src.sechalf = scan_inst->src[i].sechalf;
   scan_inst->src[i] = new_src;
}
 }
-- 
1.8.3.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH RFC 5/6] i965: refactor texture instruction emission

2013-09-30 Thread Chia-I Wu

From: Chia-I Wu 

Add fs_visitor::emit_texture, which is used to emit the texture instruction
after the message payload has been set up.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs.h   |  10 ++-
 src/mesa/drivers/dri/i965/brw_fs_fp.cpp  |  13 ++-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 129 ---
 3 files changed, 70 insertions(+), 82 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index b2aa041..c161e7d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -327,13 +327,17 @@ public:
fs_reg rescale_texcoord(ir_texture *ir, fs_reg coordinate,
bool is_rect, int sampler, int texunit);
fs_inst *emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
- fs_reg shadow_comp, fs_reg lod, fs_reg lod2);
+ fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
+ int sampler);
fs_inst *emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-  fs_reg sample_index);
+  fs_reg sample_index, int sampler);
fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-  fs_reg sample_index);
+  fs_reg sample_index, int sampler);
+   fs_inst *emit_texture(ir_texture *ir, fs_reg dst, int base_mrf, int mlen,
+ bool header_present, int regs_written, int sampler);
+
fs_reg fix_math_operand(fs_reg src);
fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
index 0594948..46ff03d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
@@ -499,18 +499,17 @@ fs_visitor::emit_fragment_program_code()
fpi->TexSrcTarget == TEXTURE_RECT_INDEX,
fpi->TexSrcUnit, fpi->TexSrcUnit);
 
- fs_inst *inst;
  if (brw->gen >= 7) {
-inst = emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, dpdy, 
sample_index);
+emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, dpdy,
+  sample_index, fpi->TexSrcUnit);
  } else if (brw->gen >= 5) {
-inst = emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, dpdy, 
sample_index);
+emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, dpdy,
+  sample_index, fpi->TexSrcUnit);
  } else {
-inst = emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, dpdy);
+emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, dpdy,
+  fpi->TexSrcUnit);
  }
 
- inst->sampler = fpi->TexSrcUnit;
- inst->shadow_compare = fpi->TexShadow;
-
  /* Reuse the GLSL swizzle_result() handler. */
  swizzle_result(ir, dst, fpi->TexSrcUnit);
  dst = this->result;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 72c379a..6435a17 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -870,8 +870,46 @@ fs_visitor::visit(ir_assignment *ir)
 }
 
 fs_inst *
+fs_visitor::emit_texture(ir_texture *ir, fs_reg dst, int base_mrf, int mlen,
+ bool header_present, int regs_written, int sampler)
+{
+   fs_inst *inst;
+
+   switch (ir->op) {
+   case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
+   case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
+   case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
+   case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
+   case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
+   case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_MS, dst); break;
+   case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
+   case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst); break;
+   default: return NULL;
+   }
+
+   inst->base_mrf = base_mrf;
+   inst->mlen = mlen;
+   inst->header_present = header_present;
+   inst->regs_written = regs_written;
+
+   /* The header is set up by generate_tex() when necessary. */
+   inst->src[0] = reg_undef;
+
+   if (ir->offset != NULL && ir->op != ir_txf)
+  inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
+
+   inst->sampler = sampler;
+
+   if (ir->shadow_comparitor)
+  inst->shadow_compare =

[Mesa-dev] [PATCH RFC 6/6] i965/gen7: emulate SIMD16 sample_d with dual SIMD8 sample_d

2013-09-30 Thread Chia-I Wu

From: Chia-I Wu 

Add fs_visitor::emit_dual_texture_gen7 that emulates SIMD16 sample_d with dual
SIMD8 sample_d on gen7+.  Fix fs_generator::generate_tex to send SIMD8
messages when force_uncompressed or force_sechalf is set.

No piglit quick.tests regression on Ivy Bridge and Haswell.

With this change, I am seeing 6.76479% +/- 0.619064% (at 95.0% confidence)
improvement on Xonotic with Ultra effects.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs.h   |   3 +
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 137 ++-
 2 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index c161e7d..82a0a7d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -335,6 +335,9 @@ public:
fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
   fs_reg sample_index, int sampler);
+   void emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
+   fs_reg sample_index, int sampler);
fs_inst *emit_texture(ir_texture *ir, fs_reg dst, int base_mrf, int mlen,
  bool header_present, int regs_written, int sampler);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 6435a17..b9f97b6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1334,6 +1334,133 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg 
dst, fs_reg coordinate,
return emit_texture(ir, dst, base_mrf, mlen, header_present, 4, sampler);
 }
 
+/* Emulate a SIMD16 sampler message with dual SIMD8 sampler messages.  For
+ * now, and for practical reasons, only ir_txd is supported.
+ */
+void
+fs_visitor::emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg 
coordinate,
+   fs_reg shadow_c, fs_reg lod, fs_reg lod2,
+   fs_reg sample_index, int sampler)
+{
+   /* no need to emit dual SIMD8 messages */
+   if (dispatch_width != 16 || ir->op != ir_txd) {
+  emit_texture_gen7(ir, dst, coordinate, shadow_c,
+lod, lod2, sample_index, sampler);
+  return;
+   }
+
+   const int reg_width = 1;
+   int mlen = 0;
+   int base_mrf = 2;
+   bool header_present = false;
+   fs_reg temp = fs_reg(GRF, virtual_grf_alloc(4),
+ brw_type_for_base_type(ir->type));
+
+   emit(FS_OPCODE_OVERWRITE_DST, dst);
+   emit(FS_OPCODE_OVERWRITE_DST, temp);
+
+   for (int msg = 0; msg < 2; msg++) {
+  if (msg == 0)
+ push_force_uncompressed();
+  else
+ push_force_sechalf();
+
+  /* only txd is supported for now */
+  assert(ir->op == ir_txd);
+
+  if (ir->offset) {
+ /* The offsets set up by the ir_texture visitor are in the
+  * m1 header, so we can't go headerless.
+  */
+ header_present = true;
+ mlen++;
+ base_mrf--;
+  }
+
+  if (ir->shadow_comparitor) {
+ emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
+ mlen += reg_width;
+  }
+
+  /* Load dPdx and the coordinate together:
+   * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+   */
+  fs_reg coord = coordinate, ddx = lod, ddy = lod2;
+  for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+ emit(MOV(fs_reg(MRF, base_mrf + mlen), coord));
+ coord.reg_offset++;
+ mlen += reg_width;
+
+ /* For cube map array, the coordinate is (u,v,r,ai) but there are
+  * only derivatives for (u, v, r).
+  */
+ if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
+emit(MOV(fs_reg(MRF, base_mrf + mlen), ddx));
+ddx.reg_offset++;
+mlen += reg_width;
+
+emit(MOV(fs_reg(MRF, base_mrf + mlen), ddy));
+ddy.reg_offset++;
+mlen += reg_width;
+ }
+  }
+
+  if (mlen > 11) {
+ fail("Message length >11 disallowed by hardware\n");
+ break;
+  }
+
+  /* response length is 4, which are 2 vgrf */
+  emit_texture(ir, temp, base_mrf, mlen, header_present, 2, sampler);
+
+  if (msg == 0) {
+ /* move from temp to dst */
+ for (int i = 0; i < 4; i++) {
+fs_reg d = dst;
+d.reg_offset += i;
+
+fs_reg s = temp;
+s.reg_offset += i / 2;
+s.sechalf = (i % 2);
+
+emit(MOV(d, s));
+ }
+
+ pop_force_uncompressed();
+
+ /* use non-overlapping MRF range if possible */
+ if (base_mrf + mlen * 2

[Mesa-dev] [PATCH] i965: compute DDX in a subspan based only on top row

2013-09-30 Thread Chia-I Wu

From: Chia-I Wu 

Consider only the top-left and top-right pixels to approximate DDX in a 2x2
subspan, unless the application requests a more accurate approximation via
GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the
new driconf option disable_derivative_optimization.

This results in a less accurate approximation.  However, it improves the
performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 95.0%
confidence) on Haswell.  No noticeable image quality difference observed.

The improvement comes from faster sample_d.  It seems, on Haswell, some
optimizations are introduced to allow faster sample_d when all pixels in a
subspan have the same derivative.  I considered SAMPLE_STATE too, which allows
one to control the quality of sample_d on Haswell.  But it gave much worse
image quality without giving better performance compared to this change.

No piglit quick.tests regression on Haswell, except for the in-parameter-struct
and normal-parameter-struct tests, which appear to be noise.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_context.c|  2 ++
 src/mesa/drivers/dri/i965/brw_context.h|  1 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 +++---
 src/mesa/drivers/dri/i965/brw_wm.c | 11 +
 src/mesa/drivers/dri/i965/brw_wm.h |  1 +
 src/mesa/drivers/dri/i965/intel_screen.c   |  4 
 6 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c 
b/src/mesa/drivers/dri/i965/brw_context.c
index 5f58a29..18b8e57 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -478,6 +478,8 @@ brwCreateContext(int api,
brw_draw_init( brw );
 
brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
+   brw->disable_derivative_optimization =
+  driQueryOptionb(&brw->optionCache, "disable_derivative_optimization");
 
ctx->Const.ContextFlags = 0;
if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 0f88bad..0ec1218 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1005,6 +1005,7 @@ struct brw_context
bool always_flush_cache;
bool disable_throttling;
bool precompile;
+   bool disable_derivative_optimization;
 
driOptionCache optionCache;
/** @} */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 7ce42c4..9eb5e17 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
  *
  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
  *
- * and we're trying to produce:
+ * Ideally, we want to produce:
  *
  *   DDX DDY
  * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
@@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
  *
  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
- * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
- * between each other.  We could probably do it like ddx and swizzle the right
- * order later, but bail for now and just produce
+ * pair.  But the ideal approximation may impose a huge performance cost on
+ * sample_d.  On at least Haswell, sample_d instruction does some
+ * optimizations if the same LOD is used for all pixels in the subspan.
+ *
+ * For DDY, it's harder, as we want to produce the pairs swizzled between each
+ * other.  We could probably do it like ddx and swizzle the right order later,
+ * but bail for now and just produce
  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
  */
 void
 fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg 
src)
 {
+   unsigned vstride, width;
+
+   if (c->key.high_quality_derivatives) {
+  /* produce accurate derivatives */
+  vstride = BRW_VERTICAL_STRIDE_2;
+  width = BRW_WIDTH_2;
+   }
+   else {
+  /* replicate the derivative at the top-left pixel to other pixels */
+  vstride = BRW_VERTICAL_STRIDE_4;
+  width = BRW_WIDTH_4;
+   }
+
struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 BRW_REGISTER_TYPE_F,
-BRW_VERTICAL_STRIDE_2,
-BRW_WIDTH_2,
+vstride,
+width,
 BRW_HORIZONTAL_STRIDE_0,
 BRW_SWIZZLE_XYZW, WRITE

Re: [Mesa-dev] [PATCH] i965/hsw: approximate DDX with a uniform value across a subspan

2013-09-30 Thread Chia-I Wu

On Tue, Oct 1, 2013 at 12:16 PM, Kenneth Graunke  wrote:
> On 09/30/2013 07:16 PM, Ian Romanick wrote:
>> On 09/11/2013 10:00 PM, Chia-I Wu wrote:
>>> From: Chia-I Wu 
>>>
>>> Replicate the gradient of the top-left pixel to the other three pixels in 
>>> the
>>> subspan, as how DDY is implemented.  Before, different gradients were used 
>>> for
>>> pixels in the top row and pixels in the bottom row.
>>>
>>> This change results in a less accurate approximation.  However, it improves
>>> the performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at
>>> 95.0% confidence) on Haswell.  No noticeable image quality difference
>>> observed.
>>>
>>> No piglit gpu.tests regressions.
>>>
>>> I failed to come up with an explanation for the performance difference.  The
>>> change does not make a difference on Ivy Bridge either.  If anyone has the
>>> insight, please kindly enlighten me.  Performance differences may also be
>>> observed on other games that call textureGrad and dFdx.
>>
>> After all the experiments and discussions with the hardware guys, let's
>> go ahead and do this.  We should do a couple things, however.
>>
>> 1. Disable the optimization if the application explicitly sets
>> GL_FRAGMENT_SHADER_DERIVATIVE_HINT to GL_NICEST.
>
> Urgh...I always hate adding more state-dependent recompiles...
>
> To accomplish this, you'll have to:
> - Add a new high_quality_derivatives flag to brw_wm_prog_key.
> - In brw_wm_populate_key, add:
>   /* _NEW_HINT */
>   key->high_quality_derivatives =
>  ctx->Hint.FragmentShaderDerivative == GL_NICEST;
> - Add the _NEW_HINT dependency to brw_wm_prog's dirty flags.
>
>> 2. Add a driconf option, as suggested by Chris, to disable the optimization.
>
> ...which means changing the key setup to:
>
>   if (brw->disable_derivative_optimization) {
>  key->high_quality_derivatives =
> ctx->Hint.FragmentShaderDerivative != GL_FASTEST;
>   } else {
>  key->high_quality_derivatives =
> ctx->Hint.FragmentShaderDerivative == GL_NICEST;
>   }
>
> and, in brw_fs_precompile, setting
>
> key->high_quality_derivatives = brw->disable_derivative_optimization;
Thanks for the instructions.  I've sent an updated patch with all of
yours and Ian's comments incorporated.

>
> This all seems pretty awful to me...but I guess there's not really any
> getting around it.  If the register had worked out, we could've just
> added a Hint() driver hook that programmed it appropriately.  But alas.
>
>> 3. Use the same DDX / DDY calculation on all platforms.
>>
>> 4. Update the commit message and the comment in the code with the
>> explanation of the optimization (the HSW sample_d instruction does some
>> optimizations if the same LOD is used for all pixels, etc.).
>>
>>> Signed-off-by: Chia-I Wu 
>>> ---
>>>  src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 17 +
>>>  1 file changed, 13 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp 
>>> b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>> index bfb3d33..c0d24a0 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>> @@ -564,16 +564,25 @@ fs_generator::generate_tex(fs_inst *inst, struct 
>>> brw_reg dst, struct brw_reg src
>>>  void
>>>  fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct 
>>> brw_reg src)
>>>  {
>>> +   /* approximate with ((ss0.tr - ss0.tl)x4 (ss1.tr - ss1.tl)x4) on 
>>> Haswell,
>>> +* which gives much better performance when the result is used with
>>> +* sample_d
>>> +*/
>>> +   unsigned vstride = (brw->is_haswell) ? BRW_VERTICAL_STRIDE_4 :
>>> +  BRW_VERTICAL_STRIDE_2;
>>> +   unsigned width = (brw->is_haswell) ? BRW_WIDTH_4 :
>>> +BRW_WIDTH_2;
>>> +
>>> struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
>>>   BRW_REGISTER_TYPE_F,
>>> - BRW_VERTICAL_STRIDE_2,
>>> - BRW_WIDTH_2,
>>> + vstride,
>>> + width,
>>>   BRW_HORIZONTAL_STRIDE_0,
>>>

Re: [Mesa-dev] [PATCH] i965: compute DDX in a subspan based only on top row

2013-10-02 Thread Chia-I Wu

On Wed, Oct 2, 2013 at 2:14 AM, Chris Forbes  wrote:
> With those fixes:
>
> Reviewed-by: Chris Forbes 
Thanks, I will push it shortly.

With this change landed, the slowness of sample_d is no longer the
bottleneck.  Instead, the lack of native SIMD16 sample_d becomes the
problem.  I have posted my other series that emulates SIMD16 sample_d
with dual SIMD8 sample_d for review.

>
> On Wed, Oct 2, 2013 at 6:38 AM, Ian Romanick  wrote:
>> On 09/30/2013 10:54 PM, Chia-I Wu wrote:
>>> From: Chia-I Wu 
>>
>> I agree with both of Ken's comments.  With those fixed, this patch is
>>
>> Reviewed-by: Ian Romanick 
>>
>>> Consider only the top-left and top-right pixels to approximate DDX in a 2x2
>>> subspan, unless the application requests a more accurate approximation via
>>> GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the
>>> new driconf option disable_derivative_optimization.
>>>
>>> This results in a less accurate approximation.  However, it improves the
>>> performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 
>>> 95.0%
>>> confidence) on Haswell.  No noticeable image quality difference observed.
>>>
>>> The improvement comes from faster sample_d.  It seems, on Haswell, some
>>> optimizations are introduced to allow faster sample_d when all pixels in a
>>> subspan have the same derivative.  I considered SAMPLE_STATE too, which 
>>> allows
>>> one to control the quality of sample_d on Haswell.  But it gave much worse
>>> image quality without giving better performance compared to this change.
>>>
>>> No piglit quick.tests regression on Haswell, except with in-parameter-struct
>>> and normal-parameter-struct tests which appear to be noise.
>>>
>>> Signed-off-by: Chia-I Wu 
>>> ---
>>>  src/mesa/drivers/dri/i965/brw_context.c|  2 ++
>>>  src/mesa/drivers/dri/i965/brw_context.h|  1 +
>>>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 
>>> +++---
>>>  src/mesa/drivers/dri/i965/brw_wm.c | 11 +
>>>  src/mesa/drivers/dri/i965/brw_wm.h |  1 +
>>>  src/mesa/drivers/dri/i965/intel_screen.c   |  4 
>>>  6 files changed, 44 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_context.c 
>>> b/src/mesa/drivers/dri/i965/brw_context.c
>>> index 5f58a29..18b8e57 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_context.c
>>> +++ b/src/mesa/drivers/dri/i965/brw_context.c
>>> @@ -478,6 +478,8 @@ brwCreateContext(int api,
>>> brw_draw_init( brw );
>>>
>>> brw->precompile = driQueryOptionb(&brw->optionCache, 
>>> "shader_precompile");
>>> +   brw->disable_derivative_optimization =
>>> +  driQueryOptionb(&brw->optionCache, 
>>> "disable_derivative_optimization");
>>>
>>> ctx->Const.ContextFlags = 0;
>>> if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0)
>>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
>>> b/src/mesa/drivers/dri/i965/brw_context.h
>>> index 0f88bad..0ec1218 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>>> @@ -1005,6 +1005,7 @@ struct brw_context
>>> bool always_flush_cache;
>>> bool disable_throttling;
>>> bool precompile;
>>> +   bool disable_derivative_optimization;
>>>
>>> driOptionCache optionCache;
>>> /** @} */
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
>>> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>>> index 7ce42c4..9eb5e17 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>>> @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct 
>>> brw_reg dst, struct brw_reg src
>>>   *
>>>   * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
>>>   *
>>> - * and we're trying to produce:
>>> + * Ideally, we want to produce:
>>>   *
>>>   *   DDX DDY
>>>   * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
>>> @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct 
>>> brw_reg dst, struct brw_reg src
>>>   *
>>>   * For DDX, it ends up being easy: width = 2, horiz=0 gets us the

Re: [Mesa-dev] [PATCH RFC 3/6] i965: add FS_OPCODE_OVERWRITE_DST

2013-10-09 Thread Chia-I Wu

On Wed, Oct 9, 2013 at 3:35 AM, Eric Anholt  wrote:
> Chia-I Wu  writes:
>
>> From: Chia-I Wu 
>>
>> FS_OPCODE_OVERWRITE_DST is used to indicate that the destination register is
>> (completely) overwritten.  No code is emitted, but the liveness analysis can
>> use it as a hint to add the destination register to DEF bitset.  This is
>> needed because it is hard to figure out if some partial writes combined
>> constitute a complete write during liveness analysis, while it is easier for
>> the FS visitor to know if that is the case.
>
> I'm not a fan of this one (particularly the adding of scheduling
> barriers).  Any other ways you looked at to deal with this?
The goal is for liveness analysis to know that a register is not a
live-in.  It should be possible to make the liveness analysis smarter,
knowing that these partial writes or SENDs in the basic block make a
full write.  But that's quite some code to figure that out, while we
already know whether it is the case or not when visiting an
ir_texture.

An alternative would be to add a flag, force_def, to fs_inst.  The
liveness analysis can use that to help decide whether the dst should
be in the DEF or not.


>
> Patch 1, 2, 4 are:
>
> Reviewed-by: Eric Anholt 
>
> and I wouldn't mind seeing them land early.
Thanks.  As this series could not be applied cleanly since texture
gather support, I will commit patch 1, 2, 4 first and send v2 for the
rest.


-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 1/3] i965: add FS_OPCODE_OVERWRITE_DST

2013-10-09 Thread Chia-I Wu

From: Chia-I Wu 

FS_OPCODE_OVERWRITE_DST is used to indicate that the destination register is
(completely) overwritten.  No code is emitted, but the liveness analysis can
use it as a hint to add the destination register to DEF bitset.  This is
needed because it is hard to figure out if some partial writes combined
constitute a complete write during liveness analysis, while it is easier for
the FS visitor to know if that is the case.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_defines.h | 1 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp  | 4 
 src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp | 5 +++--
 src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 3 ++-
 src/mesa/drivers/dri/i965/brw_shader.cpp| 3 +++
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index c1e7f31..753a9ec 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -793,6 +793,7 @@ enum opcode {
FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
FS_OPCODE_PLACEHOLDER_HALT,
+   FS_OPCODE_OVERWRITE_DST,
 
VS_OPCODE_URB_WRITE,
VS_OPCODE_SCRATCH_READ,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index dbfbc11..4b37784 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1547,6 +1547,10 @@ fs_generator::generate_code(exec_list *instructions)
  patch_discard_jumps_to_fb_writes();
  break;
 
+  case FS_OPCODE_OVERWRITE_DST:
+ /* This is to help liveness analysis. */
+ break;
+
   default:
 if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(ctx, "Unsupported opcode `%s' in FS",
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index f5daab2..13891f8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -77,8 +77,9 @@ fs_live_variables::setup_def_use()
  * variable, and thus qualify for being in def[].
  */
 if (inst->dst.file == GRF &&
-inst->regs_written == v->virtual_grf_sizes[inst->dst.reg] &&
-!inst->is_partial_write()) {
+(inst->opcode == FS_OPCODE_OVERWRITE_DST ||
+ (inst->regs_written == v->virtual_grf_sizes[inst->dst.reg] &&
+  !inst->is_partial_write()))) {
int reg = inst->dst.reg;
 if (!BITSET_TEST(bd[b].use, reg))
BITSET_SET(bd[b].def, reg);
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp 
b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 5530683..4e59a10 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -562,7 +562,8 @@ fs_instruction_scheduler::calculate_deps()
   schedule_node *n = (schedule_node *)node;
   fs_inst *inst = (fs_inst *)n->inst;
 
-  if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
+  if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+  inst->opcode == FS_OPCODE_OVERWRITE_DST)
  add_barrier_deps(n);
 
   /* read-after-write deps. */
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 61c4bf5..e226c94 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -485,6 +485,9 @@ brw_instruction_name(enum opcode op)
case FS_OPCODE_PLACEHOLDER_HALT:
   return "placeholder_halt";
 
+   case FS_OPCODE_OVERWRITE_DST:
+  return "overwrite_dst";
+
case VS_OPCODE_URB_WRITE:
   return "vs_urb_write";
case VS_OPCODE_SCRATCH_READ:
-- 
1.8.3.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 2/3] i965: refactor texture instruction emission

2013-10-09 Thread Chia-I Wu

From: Chia-I Wu 

Add fs_visitor::emit_texture, which is used to emit the texture instruction
after the message payload has been set up.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs.h   |  10 +-
 src/mesa/drivers/dri/i965/brw_fs_fp.cpp  |  13 ++-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 144 ---
 3 files changed, 74 insertions(+), 93 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index 5c7089d..52f25fb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -331,13 +331,17 @@ public:
fs_reg rescale_texcoord(ir_texture *ir, fs_reg coordinate,
bool is_rect, int sampler, int texunit);
fs_inst *emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
- fs_reg shadow_comp, fs_reg lod, fs_reg lod2);
+ fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
+ int sampler);
fs_inst *emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-  fs_reg sample_index);
+  fs_reg sample_index, int sampler);
fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-  fs_reg sample_index);
+  fs_reg sample_index, int sampler);
+   fs_inst *emit_texture(ir_texture *ir, fs_reg dst, int base_mrf, int mlen,
+ bool header_present, int regs_written, int sampler);
+
fs_reg fix_math_operand(fs_reg src);
fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
index 0594948..46ff03d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
@@ -499,18 +499,17 @@ fs_visitor::emit_fragment_program_code()
fpi->TexSrcTarget == TEXTURE_RECT_INDEX,
fpi->TexSrcUnit, fpi->TexSrcUnit);
 
- fs_inst *inst;
  if (brw->gen >= 7) {
-inst = emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, dpdy, 
sample_index);
+emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, dpdy,
+  sample_index, fpi->TexSrcUnit);
  } else if (brw->gen >= 5) {
-inst = emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, dpdy, 
sample_index);
+emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, dpdy,
+  sample_index, fpi->TexSrcUnit);
  } else {
-inst = emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, dpdy);
+emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, dpdy,
+  fpi->TexSrcUnit);
  }
 
- inst->sampler = fpi->TexSrcUnit;
- inst->shadow_compare = fpi->TexShadow;
-
  /* Reuse the GLSL swizzle_result() handler. */
  swizzle_result(ir, dst, fpi->TexSrcUnit);
  dst = this->result;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 15cfaa7..1e71fd9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -900,8 +900,51 @@ fs_visitor::visit(ir_assignment *ir)
 }
 
 fs_inst *
+fs_visitor::emit_texture(ir_texture *ir, fs_reg dst, int base_mrf, int mlen,
+ bool header_present, int regs_written, int sampler)
+{
+   fs_inst *inst;
+
+   switch (ir->op) {
+   case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
+   case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
+   case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
+   case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
+   case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
+   case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_MS, dst); break;
+   case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
+   case ir_query_levels: inst = emit(SHADER_OPCODE_TXS, dst); break;
+   case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst); break;
+   case ir_tg4: inst = emit(SHADER_OPCODE_TG4, dst); break;
+   default: return NULL;
+   }
+
+   inst->base_mrf = base_mrf;
+   inst->mlen = mlen;
+   inst->header_present = header_present;
+   inst->regs_written = regs_written;
+
+   /* The header is set up by generate_tex() when necessary. */
+   inst->src[0] = reg_undef;
+
+   if (ir->offset != NULL && ir->op != ir_txf)
+  inst->texture_offset = brw_texture_offset(

[Mesa-dev] [PATCH v2 3/3] i965/gen7: emulate SIMD16 sample_d with dual SIMD8 sample_d

2013-10-09 Thread Chia-I Wu

From: Chia-I Wu 

Add fs_visitor::emit_dual_texture_gen7 that emulates SIMD16 sample_d with dual
SIMD8 sample_d on gen7+.

No piglit quick.tests regression on Ivy Bridge and Haswell.

Improved Xonotic with Ultra effects by 6.76479% +/- 0.619064% (at 95.0%
confidence) on Haswell.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs.h   |   3 +
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 135 ++-
 2 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index 52f25fb..eec9d5f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -339,6 +339,9 @@ public:
fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
   fs_reg sample_index, int sampler);
+   void emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
+   fs_reg sample_index, int sampler);
fs_inst *emit_texture(ir_texture *ir, fs_reg dst, int base_mrf, int mlen,
  bool header_present, int regs_written, int sampler);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 1e71fd9..d205b11 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1381,6 +1381,132 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg 
dst, fs_reg coordinate,
return emit_texture(ir, dst, base_mrf, mlen, header_present, 4, sampler);
 }
 
+/* Emulate a SIMD16 sampler message with dual SIMD8 sampler messages.  For
+ * now, and for practical reasons, only ir_txd is supported.
+ */
+void
+fs_visitor::emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg 
coordinate,
+   fs_reg shadow_c, fs_reg lod, fs_reg lod2,
+   fs_reg sample_index, int sampler)
+{
+   /* no need to emit dual SIMD8 messages */
+   if (dispatch_width != 16 || ir->op != ir_txd) {
+  emit_texture_gen7(ir, dst, coordinate, shadow_c,
+lod, lod2, sample_index, sampler);
+  return;
+   }
+
+   const int reg_width = 1;
+   int mlen = 0;
+   int base_mrf = 2;
+   bool header_present = false;
+   fs_reg temp = fs_reg(GRF, virtual_grf_alloc(4),
+ brw_type_for_base_type(ir->type));
+
+   emit(FS_OPCODE_OVERWRITE_DST, dst);
+   emit(FS_OPCODE_OVERWRITE_DST, temp);
+
+   for (int msg = 0; msg < 2; msg++) {
+  if (msg == 0)
+ push_force_uncompressed();
+  else
+ push_force_sechalf();
+
+  /* only txd is supported for now */
+  assert(ir->op == ir_txd);
+
+  if (ir->offset) {
+ /* The offsets set up by the ir_texture visitor are in the
+  * m1 header, so we can't go headerless.
+  */
+ header_present = true;
+ mlen++;
+ base_mrf--;
+  }
+
+  if (ir->shadow_comparitor) {
+ emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
+ mlen += reg_width;
+  }
+
+  /* Load dPdx and the coordinate together:
+   * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+   */
+  fs_reg coord = coordinate, ddx = lod, ddy = lod2;
+  for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+ emit(MOV(fs_reg(MRF, base_mrf + mlen), coord));
+ coord.reg_offset++;
+ mlen += reg_width;
+
+ /* For cube map array, the coordinate is (u,v,r,ai) but there are
+  * only derivatives for (u, v, r).
+  */
+ if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
+emit(MOV(fs_reg(MRF, base_mrf + mlen), ddx));
+ddx.reg_offset++;
+mlen += reg_width;
+
+emit(MOV(fs_reg(MRF, base_mrf + mlen), ddy));
+ddy.reg_offset++;
+mlen += reg_width;
+ }
+  }
+
+  if (mlen > 11) {
+ fail("Message length >11 disallowed by hardware\n");
+ break;
+  }
+
+  /* response length is 4, which are 2 vgrf */
+  emit_texture(ir, temp, base_mrf, mlen, header_present, 2, sampler);
+
+  if (msg == 0) {
+ /* move from temp to dst */
+ for (int i = 0; i < 4; i++) {
+fs_reg d = dst;
+d.reg_offset += i;
+
+fs_reg s = temp;
+s.reg_offset += i / 2;
+s.sechalf = (i % 2);
+
+emit(MOV(d, s));
+ }
+
+ pop_force_uncompressed();
+
+ /* use non-overlapping MRF range if possible */
+ if (base_mrf + mlen * 2 < BRW_MAX_MRF)
+base_mrf += mlen;
+
+ mlen = 0;
+
+ temp.reg_offset += 2;
+
+ coordinate.sec

Re: [Mesa-dev] Mesa (master): i965/fs: Convert gen7 to using GRFs for texture messages.

2013-10-13 Thread Chia-I Wu

On Sat, Oct 12, 2013 at 3:18 AM, Eric Anholt  wrote:
> Chia-I Wu  writes:
>
>> Hi Eric,
>> The frame rate of Unigine Tropics (with low shader quality) dropped
>> from 40.8 to 23.5 after this change.
>
> Thanks for the note.  I see the regression as well, and I see a shader
> that's started spilling.  It looks like we can drop the regs_written <=
> 1 check on gen7+'s pre-regalloc scheduling to fix the problem (the MRF
> setup thing is no longer an issue, and its presence is now making us
> pessimize instead of optimize in general in the pre-regalloc
> scheduling).  I'll want to run a few more tests to make sure that this
> doesn't regress something else.
>
> This shader is also in bad shape now that we don't have the redundant
> MRF move optimization, and we need to look into grf_size > 1 CSE.  That
> would probably also have avoided the problem on this shader, though the
> scheduling problem is more general than this one shader.
The last shader_time output[1] for the fragment shader in question gives

BEFORE fs8   glsl   465:959.12 Gcycles   1.1%
AFTER  fs8   glsl   465:  13336.14 Gcycles   9.6%

Comparing with the total cycles, those extra cycles should bring down
the fps to ~35.  What's odd is this shader

BEFORE vsglsl   264:  16127.47 Gcycles  17.7%
AFTER  vsglsl   264:  56543.36 Gcycles  40.8%

The generated code for this vertex shader is not affected by the
commit.  But it runs significantly slower, bringing the fps down to
~24.  I suspect it is context-switched away frequently and for a good
while, but I do not have a theory as to why.  Do you have a better
idea?

[1]  Formatted for this mail.  Also, the demo is time-based.  To make
sure the same frames are rendered, I have to use apitrace.

-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v2 1/3] i965: add FS_OPCODE_OVERWRITE_DST

2013-10-16 Thread Chia-I Wu

On Wed, Oct 16, 2013 at 2:08 AM, Ian Romanick  wrote:
> On 10/09/2013 12:06 AM, Chia-I Wu wrote:
>> From: Chia-I Wu 
>>
>> FS_OPCODE_OVERWRITE_DST is used to indicate that the destination register is
>> (completely) overwritten.  No code is emitted, but the liveness analysis can
>> use it as a hint to add the destination register to DEF bitset.  This is
>> needed because it is hard to figure out if some partial writes combined
>> constitute a complete write during liveness analysis, while it is easier for
>> the FS visitor to know if that is the case.
>
> What changed from v1 of the series?  Just reordering (since some other
> patches from the original series landed) and rebasing on current master?
Yes, as said in my reply to Eric.  I will send v3 shortly, and add
notes as to what was changed.

>
>> Signed-off-by: Chia-I Wu 
>> ---
>>  src/mesa/drivers/dri/i965/brw_defines.h | 1 +
>>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp  | 4 
>>  src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp | 5 +++--
>>  src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 3 ++-
>>  src/mesa/drivers/dri/i965/brw_shader.cpp| 3 +++
>>  5 files changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
>> b/src/mesa/drivers/dri/i965/brw_defines.h
>> index c1e7f31..753a9ec 100644
>> --- a/src/mesa/drivers/dri/i965/brw_defines.h
>> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
>> @@ -793,6 +793,7 @@ enum opcode {
>> FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
>> FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
>> FS_OPCODE_PLACEHOLDER_HALT,
>> +   FS_OPCODE_OVERWRITE_DST,
>>
>> VS_OPCODE_URB_WRITE,
>> VS_OPCODE_SCRATCH_READ,
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> index dbfbc11..4b37784 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> @@ -1547,6 +1547,10 @@ fs_generator::generate_code(exec_list *instructions)
>>   patch_discard_jumps_to_fb_writes();
>>   break;
>>
>> +  case FS_OPCODE_OVERWRITE_DST:
>> + /* This is to help liveness analysis. */
>> + break;
>> +
>>default:
>>if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
>>   _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
>> index f5daab2..13891f8 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
>> @@ -77,8 +77,9 @@ fs_live_variables::setup_def_use()
>> * variable, and thus qualify for being in def[].
>> */
>>if (inst->dst.file == GRF &&
>> -  inst->regs_written == v->virtual_grf_sizes[inst->dst.reg] &&
>> -  !inst->is_partial_write()) {
>> +  (inst->opcode == FS_OPCODE_OVERWRITE_DST ||
>> +   (inst->regs_written == v->virtual_grf_sizes[inst->dst.reg] &&
>> +!inst->is_partial_write()))) {
>>   int reg = inst->dst.reg;
>>  if (!BITSET_TEST(bd[b].use, reg))
>> BITSET_SET(bd[b].def, reg);
>> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp 
>> b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
>> index 5530683..4e59a10 100644
>> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
>> @@ -562,7 +562,8 @@ fs_instruction_scheduler::calculate_deps()
>>schedule_node *n = (schedule_node *)node;
>>fs_inst *inst = (fs_inst *)n->inst;
>>
>> -  if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
>> +  if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
>> +  inst->opcode == FS_OPCODE_OVERWRITE_DST)
>>   add_barrier_deps(n);
>>
>>/* read-after-write deps. */
>> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
>> b/src/mesa/drivers/dri/i965/brw_shader.cpp
>> index 61c4bf5..e226c94 100644
>> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
>> @@ -485,6 +485,9 @@ brw_instruction_name(enum opcode op)
>> case FS_OPCODE_PLACEHOLDER_HALT:
>>return "placeholder_halt";
>>
>> +   case FS_OPCODE_OVERWRITE_DST:
>> +  return "overwrite_dst";
>> +
>> case VS_OPCODE_URB_WRITE:
>>return "vs_urb_write";
>> case VS_OPCODE_SCRATCH_READ:
>>
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCHv3 1/3] i965/fs: improve liveness analysis for partial writes

2013-10-16 Thread Chia-I Wu

From: Chia-I Wu 

When two partial writes write the first and second halves of a variable
respectively before the variable is used, the variable can be added to the def
bitset.

v2: no change
v3: no longer rely on hints from the visitor

Signed-off-by: Chia-I Wu 
---
 .../drivers/dri/i965/brw_fs_live_variables.cpp | 26 +-
 src/mesa/drivers/dri/i965/brw_fs_live_variables.h  |  5 +++--
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index b3026c2..cceff42 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -97,7 +97,8 @@ fs_live_variables::setup_one_read(bblock_t *block, fs_inst 
*inst,
 * channel) without having completely defined that variable within the
 * block.
 */
-   if (!BITSET_TEST(bd[block->block_num].def, var))
+   if (!BITSET_TEST(bd[block->block_num].def, var) &&
+   !BITSET_TEST(bd[block->block_num].def_sechalf, var))
   BITSET_SET(bd[block->block_num].use, var);
 }
 
@@ -113,9 +114,22 @@ fs_live_variables::setup_one_write(bblock_t *block, 
fs_inst *inst,
/* The def[] bitset marks when an initialization in a block completely
 * screens off previous updates of that variable (VGRF channel).
 */
-   if (inst->dst.file == GRF && !inst->is_partial_write()) {
-  if (!BITSET_TEST(bd[block->block_num].use, var))
- BITSET_SET(bd[block->block_num].def, var);
+   if (inst->dst.file == GRF) {
+  /* We do not want to call inst->is_partial_write() here as we track the
+   * two halves separately.
+   */
+  bool predicated = (inst->predicate && inst->opcode != BRW_OPCODE_SEL);
+
+  if (!predicated && !BITSET_TEST(bd[block->block_num].use, var)) {
+ if (inst->force_sechalf) {
+BITSET_SET(bd[block->block_num].def_sechalf, var);
+ } else {
+BITSET_SET(bd[block->block_num].def, var);
+
+if (!inst->force_uncompressed)
+   BITSET_SET(bd[block->block_num].def_sechalf, var);
+ }
+  }
}
 }
 
@@ -188,8 +202,9 @@ fs_live_variables::compute_live_variables()
   for (int b = 0; b < cfg->num_blocks; b++) {
 /* Update livein */
 for (int i = 0; i < bitset_words; i++) {
+BITSET_WORD def = (bd[b].def[i] & bd[b].def_sechalf[i]);
 BITSET_WORD new_livein = (bd[b].use[i] |
-  (bd[b].liveout[i] & ~bd[b].def[i]));
+  (bd[b].liveout[i] & ~def));
if (new_livein & ~bd[b].livein[i]) {
bd[b].livein[i] |= new_livein;
cont = true;
@@ -275,6 +290,7 @@ fs_live_variables::fs_live_variables(fs_visitor *v, cfg_t 
*cfg)
bitset_words = BITSET_WORDS(num_vars);
for (int i = 0; i < cfg->num_blocks; i++) {
   bd[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+  bd[i].def_sechalf = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
   bd[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
   bd[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
   bd[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h 
b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
index 82575d8..d7f0bba 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
@@ -37,9 +37,10 @@ struct block_data {
 * Which variables are defined before being used in the block.
 *
 * Note that for our purposes, "defined" means unconditionally, completely
-* defined.
+* defined.  As an instruction may write to only the first or the second
+* half of a variable, we need two bitsets.
 */
-   BITSET_WORD *def;
+   BITSET_WORD *def, *def_sechalf;
 
/**
 * Which variables are used before being defined in the block.
-- 
1.8.3.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCHv3 2/3] i965: refactor texture instruction emission

2013-10-16 Thread Chia-I Wu

From: Chia-I Wu 

Add fs_visitor::emit_texture, which is used to emit the texture instruction
after the message payload has been set up.

v2: rebased because of texture gather changes
v3: rebased because of texture-from-GRF changes

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs.h   |  10 +-
 src/mesa/drivers/dri/i965/brw_fs_fp.cpp  |  13 ++-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 154 +--
 3 files changed, 85 insertions(+), 92 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index b5aed23..c2ba351 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -339,13 +339,17 @@ public:
fs_reg rescale_texcoord(ir_texture *ir, fs_reg coordinate,
bool is_rect, int sampler, int texunit);
fs_inst *emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
- fs_reg shadow_comp, fs_reg lod, fs_reg lod2);
+ fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
+ int sampler);
fs_inst *emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-  fs_reg sample_index);
+  fs_reg sample_index, int sampler);
fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-  fs_reg sample_index);
+  fs_reg sample_index, int sampler);
+   fs_inst *emit_texture(ir_texture *ir, fs_reg dst, fs_reg payload, int mlen,
+ bool header_present, int regs_written, int sampler);
+
fs_reg fix_math_operand(fs_reg src);
fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
index 1ebaa4f..da9e99f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
@@ -499,18 +499,17 @@ fs_visitor::emit_fragment_program_code()
fpi->TexSrcTarget == TEXTURE_RECT_INDEX,
fpi->TexSrcUnit, fpi->TexSrcUnit);
 
- fs_inst *inst;
  if (brw->gen >= 7) {
-inst = emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, dpdy, 
sample_index);
+emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, dpdy,
+  sample_index, fpi->TexSrcUnit);
  } else if (brw->gen >= 5) {
-inst = emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, dpdy, 
sample_index);
+emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, dpdy,
+  sample_index, fpi->TexSrcUnit);
  } else {
-inst = emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, dpdy);
+emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, dpdy,
+  fpi->TexSrcUnit);
  }
 
- inst->sampler = fpi->TexSrcUnit;
- inst->shadow_compare = fpi->TexShadow;
-
  /* Reuse the GLSL swizzle_result() handler. */
  swizzle_result(ir, dst, fpi->TexSrcUnit);
  dst = this->result;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 9f37013..d164b04 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -901,8 +901,56 @@ fs_visitor::visit(ir_assignment *ir)
 }
 
 fs_inst *
+fs_visitor::emit_texture(ir_texture *ir, fs_reg dst, fs_reg payload, int mlen,
+ bool header_present, int regs_written, int sampler)
+{
+   int base_mrf;
+   fs_inst *inst;
+
+   if (payload.file == MRF) {
+  base_mrf = payload.reg;
+  payload = reg_undef;
+   } else {
+  base_mrf = -1;
+   }
+
+   switch (ir->op) {
+   case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst, payload); break;
+   case ir_txb: inst = emit(FS_OPCODE_TXB, dst, payload); break;
+   case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst, payload); break;
+   case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst, payload); break;
+   case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst, payload); break;
+   case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_MS, dst, payload); break;
+   case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
+   case ir_query_levels: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
+   case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst, payload); break;
+   case ir_tg4: inst = emit(SHADER_OPCODE_TG4, dst, payload); break;
+   default: return NULL;
+   }
+
+   inst->base_mrf = bas

[Mesa-dev] [PATCHv3 3/3] i965/gen7: emulate SIMD16 sample_d with dual SIMD8 sample_d

2013-10-16 Thread Chia-I Wu

From: Chia-I Wu 

Add fs_visitor::emit_dual_texture_gen7, which emulates SIMD16 sample_d with
dual SIMD8 sample_d on gen7+.

No piglit quick.tests regression on Ivy Bridge and Haswell.

Improved Xonotic with Ultra effects by 6.0209% +/- 0.396586% (N=11) on
Haswell.

v2: no change
v3: reworked because of texture-from-GRF changes

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs.h   |   3 +
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 117 ++-
 2 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index c2ba351..05bf39e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -347,6 +347,9 @@ public:
fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
   fs_reg sample_index, int sampler);
+   void emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+   fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
+   fs_reg sample_index, int sampler);
fs_inst *emit_texture(ir_texture *ir, fs_reg dst, fs_reg payload, int mlen,
  bool header_present, int regs_written, int sampler);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d164b04..19e3f1e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1394,6 +1394,114 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg 
dst, fs_reg coordinate,
return emit_texture(ir, dst, payload, mlen, header_present, 4, sampler);
 }
 
+/* Emulate a SIMD16 sampler message with dual SIMD8 sampler messages.  For
+ * now, and for practical reasons, only ir_txd is supported.
+ */
+void
+fs_visitor::emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg 
coordinate,
+   fs_reg shadow_c, fs_reg lod, fs_reg lod2,
+   fs_reg sample_index, int sampler)
+{
+   /* no need to emit dual SIMD8 messages */
+   if (dispatch_width != 16 || ir->op != ir_txd) {
+  emit_texture_gen7(ir, dst, coordinate, shadow_c,
+lod, lod2, sample_index, sampler);
+  return;
+   }
+
+   fs_reg simd8_dst = fs_reg(GRF, virtual_grf_alloc(4),
+ brw_type_for_base_type(ir->type));
+
+#define ADVANCE_HALF(reg) \
+   do { reg.reg_offset += reg.sechalf; reg.sechalf = !reg.sechalf; } while (0)
+
+   for (int msg = 0; msg < 2; msg++) {
+  bool header_present = false;
+  fs_reg payload = fs_reg(this, glsl_type::float_type);
+  fs_reg next = payload;
+
+  if (msg == 0)
+ push_force_uncompressed();
+  else
+ push_force_sechalf();
+
+  /* only txd is supported for now */
+  assert(ir->op == ir_txd);
+
+  if (ir->offset) {
+ /* Need the header to put texture offsets in */
+ header_present = true;
+ ADVANCE_HALF(next);
+  }
+
+  if (ir->shadow_comparitor) {
+ emit(MOV(next, shadow_c));
+ ADVANCE_HALF(next);
+  }
+
+  /* Load dPdx and the coordinate together:
+   * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+   */
+  fs_reg coord = coordinate, ddx = lod, ddy = lod2;
+  for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+ emit(MOV(next, coord));
+ coord.reg_offset++;
+ ADVANCE_HALF(next);
+
+ /* For cube map array, the coordinate is (u,v,r,ai) but there are
+  * only derivatives for (u, v, r).
+  */
+ if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
+emit(MOV(next, ddx));
+ddx.reg_offset++;
+ADVANCE_HALF(next);
+
+emit(MOV(next, ddy));
+ddy.reg_offset++;
+ADVANCE_HALF(next);
+ }
+  }
+
+  int mlen = next.reg_offset * 2 + next.sechalf;
+  if (mlen > 11) {
+ fail("Message length >11 disallowed by hardware\n");
+ break;
+  }
+
+  /* Message length is mlen and response length is 4.  In vgrf, that means
+   * (mlen + 1) / 2 registers for payload and 2 registers for writeback.
+   */
+  virtual_grf_sizes[payload.reg] = (mlen + 1) / 2;
+  emit_texture(ir, simd8_dst, payload, mlen, header_present, 2, sampler);
+
+  fs_reg d = dst, s = simd8_dst;
+  d.sechalf = (msg == 1);
+
+  /* swizzle the result to match SIMD16 writeback */
+  for (int i = 0; i < 4; i++) {
+ emit(MOV(d, s));
+ d.reg_offset++;
+ ADVANCE_HALF(s);
+  }
+
+  if (msg == 0) {
+ pop_force_uncompressed();
+
+ /* prepare for the second message */
+ simd8_dst.reg_offset += 2;
+

Re: [Mesa-dev] [PATCH v2 1/3] i965: add FS_OPCODE_OVERWRITE_DST

2013-10-16 Thread Chia-I Wu

On Wed, Oct 16, 2013 at 6:26 AM, Eric Anholt  wrote:
> Chia-I Wu  writes:
>
>> From: Chia-I Wu 
>>
>> FS_OPCODE_OVERWRITE_DST is used to indicate that the destination register is
>> (completely) overwritten.  No code is emitted, but the liveness analysis can
>> use it as a hint to add the destination register to DEF bitset.  This is
>> needed because it is hard to figure out if some partial writes combined
>> constitute a complete write during liveness analysis, while it is easier for
>> the FS visitor to know if that is the case.
>
> Note that FS_OPCODE_OVERWRITE_DST should be avoidable now that
> texture-grf is landed -- the live intervals calculation is a lot
> smarter, and shouldn't take much more work to handle the setup of the
> two halves in 16-wide.
Thanks for the hint!

I've posted v3 for review.  FS_OPCODE_OVERWRITE_DST is gone, and the
payloads for dual sample_d are constructed directly in GRF.

-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] Mesa (master): i965/fs: Convert gen7 to using GRFs for texture messages.

2013-10-16 Thread Chia-I Wu

Hi Eric,

On Sat, Oct 12, 2013 at 3:18 AM, Eric Anholt  wrote:
> Chia-I Wu  writes:
>
>> Hi Eric,
>> The frame rate of Unigine Tropics (with low shader quality) dropped
>> from 40.8 to 23.5 after this change.
>
> Thanks for the note.  I see the regression as well, and I see a shader
> that's started spilling.  It looks like we can drop the regs_written <=
> 1 check on gen7+'s pre-regalloc scheduling to fix the problem (the MRF
> setup thing is no longer an issue, and its presence is now making us
> pessimize instead of optimize in general in the pre-regalloc
> scheduling).  I'll want to run a few more tests to make sure that this
> doesn't regress something else.
Are you looking at this issue?  The change you suggested does not
avoid spilling.

I think the problem can be demonstrated with this snippet:

  vec4 val = vec4(0.0);

  vec4 tmp_001 = texture(tex, texcoord * 0.01);
  val += tmp_001;
  vec4 tmp_002 = texture(tex, texcoord * 0.02);
  val += tmp_002;
  vec4 tmp_003 = texture(tex, texcoord * 0.03);
  val += tmp_003;
  ...
  vec4 tmp_099 = texture(tex, texcoord * 0.99);
  val += tmp_099;
  vec4 tmp_100 = texture(tex, texcoord * 1.00);
  val += tmp_100;

  gl_FragColor = val;

Before the change, the scheduler saw a dependency between any two
texture() calls (because of the use of MRF).  It was inclined to keep
the accumulation of tmp_xxx between texture() calls even though the
accumulation also had a dependency on the last texture() call.

After the change, the dependencies between texture()s are gone.  The
scheduler sees a chance to move all the high latency texture()
together and generate something like this:

  vec4 tmp_001 = texture(tex, texcoord * 0.01);
  vec4 tmp_002 = texture(tex, texcoord * 0.02);
  vec4 tmp_003 = texture(tex, texcoord * 0.03);
  ...
  vec4 tmp_099 = texture(tex, texcoord * 0.99);
  vec4 tmp_100 = texture(tex, texcoord * 1.00);

  val += tmp_001;
  val += tmp_002;
  val += tmp_003;
  ...
  val += tmp_099;
  val += tmp_100;

Since there are not enough registers to hold all tmp_xxx, the register
allocation starts spilling.

>
> This shader is also in bad shape now that we don't have the redundant
> MRF move optimization, and we need to look into grf_size > 1 CSE.  That
> would probably also have avoided the problem on this shader, though the
> scheduling problem is more general than this one shader.



-- 
o...@lunarg.com
  val = texture(tex, texcoord * 1.0);
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] Mesa (master): i965/fs: Convert gen7 to using GRFs for texture messages.

2013-10-17 Thread Chia-I Wu

On Thu, Oct 17, 2013 at 1:53 PM, Chia-I Wu  wrote:
> Hi Eric,
>
> On Sat, Oct 12, 2013 at 3:18 AM, Eric Anholt  wrote:
>> Chia-I Wu  writes:
>>
>>> Hi Eric,
>>> The frame rate of Unigine Tropics (with low shader quality) dropped
>>> from 40.8 to 23.5 after this change.
>>
>> Thanks for the note.  I see the regression as well, and I see a shader
>> that's started spilling.  It looks like we can drop the regs_written <=
>> 1 check on gen7+'s pre-regalloc scheduling to fix the problem (the MRF
>> setup thing is no longer an issue, and its presence is now making us
>> pessimize instead of optimize in general in the pre-regalloc
>> scheduling).  I'll want to run a few more tests to make sure that this
>> doesn't regress something else.
> Are you looking at this issue?  The change you suggested does not
> avoid spilling.
>
> I think the problem can be demonstrated with this snippet:
>
>   vec4 val = vec4(0.0);
>
>   vec4 tmp_001 = texture(tex, texcoord * 0.01);
>   val += tmp_001;
>   vec4 tmp_002 = texture(tex, texcoord * 0.02);
>   val += tmp_002;
>   vec4 tmp_003 = texture(tex, texcoord * 0.03);
>   val += tmp_003;
>   ...
>   vec4 tmp_099 = texture(tex, texcoord * 0.99);
>   val += tmp_099;
>   vec4 tmp_100 = texture(tex, texcoord * 1.00);
>   val += tmp_100;
>
>   gl_FragColor = val;
>
> Before the change, the scheduler saw a dependency between any two
> texture() calls (because of the use of MRF).  It was inclined to keep
> the accumulation of tmp_xxx between texture() calls even though the
> accumulation also had a dependency on the last texture() call.
>
> After the change, the dependencies between texture()s are gone.  The
> scheduler sees a chance to move all the high latency texture()
> together and generate something like this:
Ah, I started looking at post-reg-alloc scheduling midway through...
My reasoning was wrong.  The correct explanation is:

It worked before this change because there were dependencies between
texture() calls, and those texture() calls must thus be scheduled in
that order.  Accumulations were scheduled as soon as they were
available, and thus were intermixed with texture() calls.

It does not work now because the dependencies between texture() calls
are gone.  Since the scheduler schedules in FILO order, texture()
calls are scheduled in reverse order.  Accumulations are thus
available only after all texture() calls are scheduled.

This remains true with the fix suggested (it is still desirable, only
that it is a partial fix).  The problem can be demonstrated with the
attached fragment shader.

>   vec4 tmp_003 = texture(tex, texcoord * 0.03);
>   ...
>   vec4 tmp_099 = texture(tex, texcoord * 0.99);
>   vec4 tmp_100 = texture(tex, texcoord * 1.00);
>
>   val += tmp_001;
>   val += tmp_002;
>   val += tmp_003;
>   ...
>   val += tmp_099;
>   val += tmp_100;
>
> Since there are not enough registers to hold all tmp_xxx, the register
> allocation starts spilling.
>
>>
>> This shader is also in bad shape now that we don't have the redundant
>> MRF move optimization, and we need to look into grf_size > 1 CSE.  That
>> would probably also have avoided the problem on this shader, though the
>> scheduling problem is more general than this one shader.
>
>
>
> --
> o...@lunarg.com
>   val = texture(tex, texcoord * 1.0);



-- 
o...@lunarg.com


465.frag
Description: Binary data
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] Mesa (master): i965/fs: Convert gen7 to using GRFs for texture messages.

2013-10-17 Thread Chia-I Wu

On Thu, Oct 17, 2013 at 3:29 PM, Chia-I Wu  wrote:
> On Thu, Oct 17, 2013 at 1:53 PM, Chia-I Wu  wrote:
>> Hi Eric,
>>
>> On Sat, Oct 12, 2013 at 3:18 AM, Eric Anholt  wrote:
>>> Chia-I Wu  writes:
>>>
>>>> Hi Eric,
>>>> The frame rate of Unigine Tropics (with low shader quality) dropped
>>>> from 40.8 to 23.5 after this change.
>>>
>>> Thanks for the note.  I see the regression as well, and I see a shader
>>> that's started spilling.  It looks like we can drop the regs_written <=
>>> 1 check on gen7+'s pre-regalloc scheduling to fix the problem (the MRF
>>> setup thing is no longer an issue, and its presence is now making us
>>> pessimize instead of optimize in general in the pre-regalloc
>>> scheduling).  I'll want to run a few more tests to make sure that this
>>> doesn't regress something else.
>> Are you looking at this issue?  The change you suggested does not
>> avoid spilling.
>>
>> I think the problem can be demonstrated with this snippet:
>>
>>   vec4 val = vec4(0.0);
>>
>>   vec4 tmp_001 = texture(tex, texcoord * 0.01);
>>   val += tmp_001;
>>   vec4 tmp_002 = texture(tex, texcoord * 0.02);
>>   val += tmp_002;
>>   vec4 tmp_003 = texture(tex, texcoord * 0.03);
>>   val += tmp_003;
>>   ...
>>   vec4 tmp_099 = texture(tex, texcoord * 0.99);
>>   val += tmp_099;
>>   vec4 tmp_100 = texture(tex, texcoord * 1.00);
>>   val += tmp_100;
>>
>>   gl_FragColor = val;
>>
>> Before the change, the scheduler saw a dependency between any two
>> texture() calls (because of the use of MRF).  It was inclined to keep
>> the accumulation of tmp_xxx between texture() calls even though the
>> accumulation also had a dependency on the last texture() call.
>>
>> After the change, the dependencies between texture()s are gone.  The
>> scheduler sees a chance to move all the high latency texture()
>> together and generate something like this:
> Ah, I started looking at post-reg-alloc scheduling in the middle
> way...  My reasoning was wrong.  The correct one is:
>
> It worked before this change because there were dependencies between
> texture() calls, and those texture() calls must thus be scheduled in
> that order.  Accumulations were scheduled as soon as they were
> available, and thus were intermixed with texture() calls.
>
> It does not work now because the dependencies between texture() calls
> are gone.  Since the scheduler schedules in FILO order, texture()
> calls are scheduled in reversed order.  Accumulations are thus
> available only after all texture() calls are scheduled.
Prior to register allocation, choose_instruction_to_schedule() chooses
from the available instructions in reverse order.  The attached change
fixes the order, while still doing depth-first search.

It fixes the problem I saw with my example shader, but does not
prevent the shader from Unigine Tropics from spilling.  Just want to
check with you about the idea.  I don't quite follow the comment for
the (inst->regs_written <= 1) check.  It seems to me you want to
schedule texturing last (among the newly available instructions),
but the comment is not clear to me.

>
> This remains true with the fix suggested (it is still desirable, only
> that it is a partial fix).  The problem can be demonstrated with the
> attached fragment shader.
>
>>   vec4 tmp_003 = texture(tex, texcoord * 0.03);
>>   ...
>>   vec4 tmp_099 = texture(tex, texcoord * 0.99);
>>   vec4 tmp_100 = texture(tex, texcoord * 1.00);
>>
>>   val += tmp_001;
>>   val += tmp_002;
>>   val += tmp_003;
>>   ...
>>   val += tmp_099;
>>   val += tmp_100;
>>
>> Since there are not enough registers to hold all tmp_xxx, the register
>> allocation starts spilling.
>>
>>>
>>> This shader is also in bad shape now that we don't have the redundant
>>> MRF move optimization, and we need to look into grf_size > 1 CSE.  That
>>> would probably also have avoided the problem on this shader, though the
>>> scheduling problem is more general than this one shader.
>>
>>
>>
>> --
>> o...@lunarg.com
>>   val = texture(tex, texcoord * 1.0);
>
>
>
> --
> o...@lunarg.com



-- 
o...@lunarg.com


0001-i965.patch
Description: Binary data
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/fs: In the pre-regalloc schedule, try harder at reducing reg pressure.

2013-10-18 Thread Chia-I Wu

On Thu, Oct 17, 2013 at 3:24 AM, Matt Turner  wrote:
> On Mon, Oct 14, 2013 at 4:14 PM, Eric Anholt  wrote:
>> Previously, the best thing we had was to schedule the things unblocked by
>> the current instruction, on the hope that it would be consuming two values
>> at the end of their live intervals while only producing one new value.
>> Sometimes that wasn't the case.
>>
>> Now, when an instruction is the first user of a GRF we schedule (i.e. it
>> will probably be the virtual_grf_def[] instruction after computing live
>> intervals again), penalize it by how many regs it would take up.  When an
>> instruction is the last user of a GRF we have to schedule (when it will
>> probably be the virtual_grf_end[] instruction), give it a boost by how
>> many regs it would free.
>>
>> The new functions are made virtual (only 1 of 2 really needs to be
>> virtual) because I expect we'll soon lift the pre-regalloc scheduling
>> heuristic over to the vec4 backend.
>>
>> shader-db:
>> total instructions in shared programs: 1512756 -> 1511604 (-0.08%)
>> instructions in affected programs: 10292 -> 9140 (-11.19%)
>> GAINED:121
>> LOST:  38
>>
>> Improves tropics performance at my current settings by 4.50602% +/-
>> 2.60694% (n=5).  No difference on Lightsmark (n=5).  No difference on
>> GLB2.7 (n=11).
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70445
>> ---
>
> I think we're on the right track by considering register pressure when
> scheduling, but one aspect we're not considering is simply how many
> registers we think we're using.
>
> If I understand correctly, the pre-register allocation wants to
> shorten live intervals as much as possible which reduces register
> pressure but at the cost of larger stalls and less instruction level
> parallelism. We end up scheduling things like
>
> produce result 4
> produce result 3
> produce result 2
> produce result 1
> use result 1
> use result 2
> use result 3
> use result 4
>
> (this is why the MRF writes for the FB write are always done in the
> reverse order)
In this example, it will actually be

 produce result 4
 use result 4
 produce result 3
 use result 3
 produce result 2
 use result 2
 produce result 1
 use result 1

and post-regalloc will schedule again to something like

 produce result 4
 produce result 3
 produce result 2
 produce result 1
 use result 4
 use result 3
 use result 2
 use result 1

The pre-regalloc scheduling attempts to consume the results as soon as
they are available.

FB write is done in reverse order because, when a result is available,
its consumers are scheduled in reverse order.  The epilog of fragment
shaders is usually like this:

 placeholder_halt
 mov m1, g1
 mov m2, g2
 mov m3, g3
 mov m4, g4
 send

MOVs depend on placeholder_halt, and send depends on MOVs.  The
scheduler will schedule it as follows:

 placeholder_halt
 mov m4, g4
 mov m3, g3
 mov m2, g2
 mov m1, g1
 send

The order can be corrected with the change proposed here

  http://lists.freedesktop.org/archives/mesa-dev/2013-October/046570.html

But there is no point in making the change if the current heuristic for
pre-regalloc is to be reworked.

>
> Take the main shader from FillTestC24Z16 in GLB2.5 or 2.7 as an
> example. Before texture-grf we serialized the eight texture sends.
> After that branch landed, we scheduled the code much better, leading
> to a performance improvement.
>
> This patch causes us again to serialize the 8 texture ops in
> GLB25_FillTestC24Z16, like we did before texture-from-grf. It reduces
> performance from 7.0 billion texels/sec to ~6.5 on IVB.
>
> The shader in question is structured, prior to scheduling as
>
> 16 PLNs to interpolate the texture coordinates
>  - 10 registers consumed, 16 results produced
> 8 TEX
>  - 16 registers consumed, 32 results produced
> 28 ADDs to sum the texture results into gl_FragColor.
>  - 32 registers consumed, 4 results produced
> FB write.
>  - 4 registers consumed
>
> Even doubling these numbers for SIMD16 we don't spill. There's no need
> to reduce live ranges and therefore ILP for this shader.
>
> Can we accurately track the number of registers in use and decide what
> to do based on that?
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/fs: In the pre-regalloc schedule, try harder at reducing reg pressure.

2013-10-18 Thread Chia-I Wu

On Tue, Oct 15, 2013 at 7:14 AM, Eric Anholt  wrote:
> Previously, the best thing we had was to schedule the things unblocked by
> the current instruction, on the hope that it would be consuming two values
> at the end of their live intervals while only producing one new value.
> Sometimes that wasn't the case.
>
> Now, when an instruction is the first user of a GRF we schedule (i.e. it
> will probably be the virtual_grf_def[] instruction after computing live
> intervals again), penalize it by how many regs it would take up.  When an
> instruction is the last user of a GRF we have to schedule (when it will
> probably be the virtual_grf_end[] instruction), give it a boost by how
> many regs it would free.
texture2D() takes up 4 regs and at best frees 2 regs.  It will always
be scheduled last.  When there are more than ~60 texture2D() calls (it
could happen because of loop unrolling), the message payloads could
take up all available registers.

I wonder if it would help to take into consideration how long an
instruction has been in the available queue.  After a couple of
texture2D()s are scheduled, the instructions that use their results may
become available and free those registers up.

>
> The new functions are made virtual (only 1 of 2 really needs to be
> virtual) because I expect we'll soon lift the pre-regalloc scheduling
> heuristic over to the vec4 backend.
>
> shader-db:
> total instructions in shared programs: 1512756 -> 1511604 (-0.08%)
> instructions in affected programs: 10292 -> 9140 (-11.19%)
> GAINED:121
> LOST:  38
>
> Improves tropics performance at my current settings by 4.50602% +/-
> 2.60694% (n=5).  No difference on Lightsmark (n=5).  No difference on
> GLB2.7 (n=11).
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70445
> ---
>  .../drivers/dri/i965/brw_schedule_instructions.cpp | 125 
> ++---
>  1 file changed, 111 insertions(+), 14 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp 
> b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> index b24c38c..7cb0265 100644
> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> @@ -353,6 +353,13 @@ public:
>this->instructions_to_schedule = 0;
>this->post_reg_alloc = post_reg_alloc;
>this->time = 0;
> +  if (!post_reg_alloc) {
> + this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
> + this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
> +  } else {
> + this->remaining_grf_uses = NULL;
> + this->grf_active = NULL;
> +  }
> }
>
> ~instruction_scheduler()
> @@ -377,6 +384,9 @@ public:
>  */
> virtual int issue_time(backend_instruction *inst) = 0;
>
> +   virtual void mod_remaining_grf_uses(backend_instruction *inst, int mod) = 
> 0;
> +   virtual int get_grf_pressure_benefit(backend_instruction *inst) = 0;
> +
> void schedule_instructions(backend_instruction *next_block_header);
>
> void *mem_ctx;
> @@ -387,6 +397,17 @@ public:
> int time;
> exec_list instructions;
> backend_visitor *bv;
> +
> +   /** Number of instructions left to schedule that reference each vgrf. */
> +   int *remaining_grf_uses;
> +
> +   /**
> +* Tracks whether each VGRF has had an instruction scheduled that uses it.
> +*
> +* This is used to estimate whether scheduling a new instruction will
> +* increase register pressure.
> +*/
> +   bool *grf_active;
>  };
>
>  class fs_instruction_scheduler : public instruction_scheduler
> @@ -398,6 +419,9 @@ public:
> schedule_node *choose_instruction_to_schedule();
> int issue_time(backend_instruction *inst);
> fs_visitor *v;
> +
> +   void mod_remaining_grf_uses(backend_instruction *inst, int mod);
> +   int get_grf_pressure_benefit(backend_instruction *inst);
>  };
>
>  fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
> @@ -408,6 +432,57 @@ 
> fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
>  {
>  }
>
> +void
> +fs_instruction_scheduler::mod_remaining_grf_uses(backend_instruction *be,
> + int mod)
> +{
> +   fs_inst *inst = (fs_inst *)be;
> +
> +   if (!remaining_grf_uses)
> +  return;
> +
> +   if (inst->dst.file == GRF) {
> +  remaining_grf_uses[inst->dst.reg] += mod;
> +  if (mod < 0 && !grf_active[inst->dst.reg])
> + grf_active[inst->dst.reg] = true;
> +   }
> +
> +   for (int i = 0; i < 3; i++) {
> +  if (inst->src[i].file != GRF)
> + continue;
> +
> +  remaining_grf_uses[inst->src[i].reg] += mod;
> +  if (mod < 0 && !grf_active[inst->src[i].reg])
> + grf_active[inst->src[i].reg] = true;
> +   }
> +}
> +
> +int
> +fs_instruction_scheduler::get_grf_pressure_benefit(backend_instruction *be)
> +{
> +   fs_inst *inst = (fs_inst *)be;
> +   int benefit = 0;

Re: [Mesa-dev] [PATCH] i965/fs: In the pre-regalloc schedule, try harder at reducing reg pressure.

2013-10-21 Thread Chia-I Wu

On Tue, Oct 22, 2013 at 3:05 AM, Eric Anholt  wrote:
> Chia-I Wu  writes:
>
>> On Thu, Oct 17, 2013 at 3:24 AM, Matt Turner  wrote:
>>> On Mon, Oct 14, 2013 at 4:14 PM, Eric Anholt  wrote:
>>>> Previously, the best thing we had was to schedule the things unblocked by
>>>> the current instruction, on the hope that it would be consuming two values
>>>> at the end of their live intervals while only producing one new value.
>>>> Sometimes that wasn't the case.
>>>>
>>>> Now, when an instruction is the first user of a GRF we schedule (i.e. it
>>>> will probably be the virtual_grf_def[] instruction after computing live
>>>> intervals again), penalize it by how many regs it would take up.  When an
>>>> instruction is the last user of a GRF we have to schedule (when it will
>>>> probably be the virtual_grf_end[] instruction), give it a boost by how
>>>> many regs it would free.
>>>>
>>>> The new functions are made virtual (only 1 of 2 really needs to be
>>>> virtual) because I expect we'll soon lift the pre-regalloc scheduling
>>>> heuristic over to the vec4 backend.
>>>>
>>>> shader-db:
>>>> total instructions in shared programs: 1512756 -> 1511604 (-0.08%)
>>>> instructions in affected programs: 10292 -> 9140 (-11.19%)
>>>> GAINED:121
>>>> LOST:  38
>>>>
>>>> Improves tropics performance at my current settings by 4.50602% +/-
>>>> 2.60694% (n=5).  No difference on Lightsmark (n=5).  No difference on
>>>> GLB2.7 (n=11).
>>>>
>>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70445
>>>> ---
>>>
>>> I think we're on the right track by considering register pressure when
>>> scheduling, but one aspect we're not considering is simply how many
>>> registers we think we're using.
>>>
>>> If I understand correctly, the pre-register allocation wants to
>>> shorten live intervals as much as possible which reduces register
>>> pressure but at the cost of larger stalls and less instruction level
>>> parallelism. We end up scheduling things like
>>>
>>> produce result 4
>>> produce result 3
>>> produce result 2
>>> produce result 1
>>> use result 1
>>> use result 2
>>> use result 3
>>> use result 4
>>>
>>> (this is why the MRF writes for the FB write are always done in the
>>> reverse order)
>> In this example, it will actually be
>>
>>  produce result 4
>>  use result 4
>>  produce result 3
>>  use result 3
>>  produce result 2
>>  use result 2
>>  produce result 1
>>  use result 1
>>
>> and post-regalloc will schedule again to something like
>>
>>  produce result 4
>>  produce result 3
>>  produce result 2
>>  produce result 1
>>  use result 4
>>  use result 3
>>  use result 2
>>  use result 1
>>
>> The pre-regalloc scheduling attempts to consume the results as soon as
>> they are available.
>>
>> FB write is done in reverse order because, when a result is available,
>> its consumers are scheduled in reverse order.  The epilog of fragment
>> shaders is usually like this:
>>
>>  placeholder_halt
>>  mov m1, g1
>>  mov m2, g2
>>  mov m3, g3
>>  mov m4, g4
>>  send
>>
>> MOVs depend on placeholder_halt, and send depends on MOVs.  The
>> scheduler will schedule it as follows:
>>
>>  placeholder_halt
>>  mov m4, g4
>>  mov m3, g3
>>  mov m2, g2
>>  mov m1, g1
>>  send
>>
>> The order can be corrected with the change proposed here
>>
>>   http://lists.freedesktop.org/archives/mesa-dev/2013-October/046570.html
>>
>> But there is no point in making the change if the current heuristic for
>> pre-regalloc is to be reworked.
>
> Flipping the order in which we prefer ties (on betterthanlifo-2):
>
> commit 11a511576e465f02875f39c452561775a97416a1
> Author: Eric Anholt 
> Date:   Mon Oct 21 11:45:53 2013 -0700
>
> otherway
>
> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp 
> b/src/mesa/
> index 9a480b4..b123015 100644
> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> @@ -1049,9 +1049,9 @@ 
> fs_instruction_scheduler::choose_instruction_to

Re: [Mesa-dev] Static/shared pipe-drivers (was megadriver/pipe-loader-to-all)

2014-06-18 Thread Chia-I Wu

Hi Emil,

On Fri, Jun 13, 2014 at 3:56 AM, Emil Velikov  wrote:
> Hi all,
>
> These patches add support for building (grouping) the various targets per
> API, meaning that only one library will be created  for e.g. vdpau
> (libvdpau_gallium) with individual ones (libvdpau_r600) being a hardlink
> to it.
>
> This allows us to have substantial space savings as the API(state-tracker)
> is available only once. Additionally it adds support for shared
> pipe-drivers via a _unstable_ interface, which saves us the duplication
> across X APIs.
>
> The former method has been used by the egl-static while the latter by
> opencl and gbm targets since they were introduced.
>
> By default we build with "static pipe-drivers".
>
> Some numbers + extra info [1]
>
> [Static]
> dri:(r600|radeonsi|nouveau)_dri.so   -> 6.5 MiB
> vdpau:  libvdpau_(r600|radeonsi|nouveau).so  -> 3.5 MiB
>
> Total: 10MiB
>
> [Shared]
> Libraries:
> dri:(r600|radeonsi|nouveau)_dri.so   -> 3.9 MiB
> vdpau:  libvdpau_(r600|radeonsi|nouveau).so  -> 633 KiB
> gallium-pipe:   pipe_(r600|radeonsi|nouveau).so  -> 5.3 MiB
>
> Total: 9.8MiB
>
> [Current]
> dri:(r600|radeonsi|nouveau)_dri.so   -> 5.0+4.5+5.3 = 14.8 MiB
> vdpau:  libvdpau_(r600|radeonsi|nouveau).so  -> 1.9+1.2+2.3 = 5.4 MiB
>
> Total: 20.2MiB
>
>
> The previous series can be found here [2]
> Changes since then
>  - Convert targets individually.
>  - OMX targets now work, and the final library is now libomx-mesa.so
>  - Dropped the DRI targets for now
>  - A handful of typos, thinkos and bugs fixed.
>
>
> My plan is to have these pushed in ~4 stages, with two stages per week.
> This way I will be able to crack on with the remaining bits and have all
> of it tested well before we branch the next release.
>
> Series is available at
> https://github.com/evelikov/Mesa/tree/static-or-shared-pipe-drivers
>
> As always comments and suggestions are greatly appreciated.
Thanks for working on this.  This is a tough issue to tackle.  I have
a few questions/comments, which I am fine to see resolved either
before or after landing your series.

I see this work as defining an internal API to manage pipe drivers.
The lack of such API previously led us to "targets", where each target
knows how to load a specific driver.  With your changes, state
trackers that need to work with pipe drivers have a way to do so.  As
a result, files such as

  dri/target.c,
  xa/target.c,
  xvmc/target.c,
  vdpau/target.c, and
  omx/target.c

become quite dummy and redundant.  Do you see a way to get rid of
"targets" entirely?

In the same view, and noticing that all users of the API have this snippet

#if GALLIUM_STATIC_TARGETS
   scrn->base.pscreen = dd_create_screen(fd);
#else
   if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd, true))
  scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev,
PIPE_SEARCH_DIR);
#endif // GALLIUM_STATIC_TARGETS

I think it makes sense to hide this difference behind the API.  Another
thing I noted is that the non-static path allows the user to auth the
fd while the static path doesn't.  It is not clear to me how come the
static path works.

On the other hand, the implementation of the API extends itself a bit
when DRI_TARGET is defined.  That is ugly from the API's point of
view.  Could that be abstracted somehow so that it can be used
elsewhere or at least looks nicer?

Finally, the API is limited to C API (i.e., inline_{drm,sw}_helper.h).
At the build system level, we also like a way to easily manage a pipe
driver.  For example, we can see this snippet in many of the
Makefile.am

if HAVE_GALLIUM_RADEONSI
STATIC_TARGET_CPPFLAGS += -DGALLIUM_RADEONSI
STATIC_TARGET_LIB_DEPS += \
$(top_builddir)/src/gallium/drivers/radeonsi/libradeonsi.la \
$(RADEON_LIBS)
endif

If pipe drivers can be accompanied by some Makefile rules specifying
how it should be built, in addition to how it can be loaded, we can
get rid of those duplicated rules.


>
> Cheers,
> -Emil
>
> [1] http://lists.freedesktop.org/archives/mesa-dev/2014-May/059806.html
> [2] http://lists.freedesktop.org/archives/mesa-dev/2014-May/059628.html
>
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] Static/shared pipe-drivers (was megadriver/pipe-loader-to-all)

2014-06-18 Thread Chia-I Wu

On Wed, Jun 18, 2014 at 8:14 PM, Emil Velikov  wrote:
> On 18 June 2014 08:21, Chia-I Wu  wrote:
>> Hi Emil,
>>
>> On Fri, Jun 13, 2014 at 3:56 AM, Emil Velikov  
>> wrote:
>>> Hi all,
>>>
>>> These patches add support for building (grouping) the various targets per
>>> API, meaning that only one library will be created  for e.g. vdpau
>>> (libvdpau_gallium) with individual ones (libvdpau_r600) being a hardlink
>>> to it.
>>>
>>> This allows us to have substantial space savings as the API(state-tracker)
>>> is available only once. Additionally it adds support for shared
>>> pipe-drivers via a _unstable_ interface, which saves us the duplication
>>> across X APIs.
>>>
>>> The former method has been used by the egl-static while the latter by
>>> opencl and gbm targets since they were introduced.
>>>
>>> By default we build with "static pipe-drivers".
>>>
>>> Some numbers + extra info [1]
>>>
>>> [Static]
>>> dri:(r600|radeonsi|nouveau)_dri.so   -> 6.5 MiB
>>> vdpau:  libvdpau_(r600|radeonsi|nouveau).so  -> 3.5 MiB
>>>
>>> Total: 10MiB
>>>
>>> [Shared]
>>> Libraries:
>>> dri:(r600|radeonsi|nouveau)_dri.so   -> 3.9 MiB
>>> vdpau:  libvdpau_(r600|radeonsi|nouveau).so  -> 633 KiB
>>> gallium-pipe:   pipe_(r600|radeonsi|nouveau).so  -> 5.3 MiB
>>>
>>> Total: 9.8MiB
>>>
>>> [Current]
>>> dri:(r600|radeonsi|nouveau)_dri.so   -> 5.0+4.5+5.3 = 14.8 
>>> MiB
>>> vdpau:  libvdpau_(r600|radeonsi|nouveau).so  -> 1.9+1.2+2.3 = 5.4 
>>> MiB
>>>
>>> Total: 20.2MiB
>>>
>>>
>>> The previous series can be found here [2]
>>> Changes since then
>>>  - Convert targets individually.
>>>  - OMX targets now work, and the final library is now libomx-mesa.so
>>>  - Dropped the DRI targets for now
>>>  - A handful of typos, thinkos and bugs fixed.
>>>
>>>
>>> My plan is to have these pushed in ~4 stages, with two stages per week.
>>> This way I will be able to crack on with the remaining bits and have all
>>> of it tested well before we branch the next release.
>>>
>>> Series is available at
>>> https://github.com/evelikov/Mesa/tree/static-or-shared-pipe-drivers
>>>
>>> As always comments and suggestions are greatly appreciated.
>> Thanks for working on this.  This is a tough issue to tackle.  I have
>> a few questions/comments, which I am fine to see resolved either
>> before or after landing your series.
>>
>> I see this work as defining an internal API to manage pipe drivers.
> Interesting, I do not see this as an attempt to define an API, but to
> cleanup all the mayhem that our targets currently are:
>  * Cleanup the build system - drop symlinks, including the same source
> files from different locations.
>  * Make targets less error prone by using static pipe-drivers by
> default. Shared ones we lack versioning and ... are a big can of
> worms.
>  * Minimize all the target.c duplication across each target. Makefiles
> are in similar boat.
The reason that the state trackers can manage statically linked pipe
drivers, or the duplications in target.c can be killed is because of
the introduction of an API (inline_drm_helper.h), or if you prefer,
helper functions.

Either way, a set of functions are defined to help manage statically
linked pipe drivers.  State trackers tend to do

#ifdef GALLIUM_STATIC_TARGETS
/* use inline_drm_helper.h */
...
#else
/* use pipe_loader.h */
...
#endif

IMHO, we should be able to define a single API, or a single set of
helper functions, to manage pipe drivers, no matter whether they are
statically linked or dynamically loaded.  Note that
inline_drm_helper.h is not stateless: dd_create_screen must be called
first to initialize a static variable.  It may even be possible to
extend pipe loader for the statically linked case.

>  * Allow people to use the unstable pipe-drivers if they are really
> short on size and know what they are doing.
>
>> The lack of such API previously led us to "targets", where each target
>> knows how to load a specific driver.  With your changes, state
>> trackers that need to work with pipe drivers have a way to do so.  As
>> a result, files such as
>>
>>   dri/target.c,
>>   xa/target.c,
>>   xvmc/target.c,
>>   vdpau/target.c, and
>>   omx/target.c
>>
>> become quite dummy and redundant.

[Mesa-dev] [PATCHv2 00/13] multithread GLSL compiler

2014-07-09 Thread Chia-I Wu

Hi list,

This is my second try to add multithread support for the GLSL compiler.
Changes since v1 are

 - glLinkProgram can now be threaded
 - added ctx->Const.DeferCompileShader and ctx->Const.DeferLinkProgram to
   allow drivers to control what gets threaded
 - minimal changes to i965 to enable DeferLinkProgram

The most interesting patch should be patch 8, which makes the necessary
changes to core mesa.  Patch 13 touches i965 and is more a RFC patch.

v2 still requires applications to "Do the Right Thing" to benefit from
multithread compiler.  But that can be changed as noted in patch 8.  I am open
to suggestions.

A summary of the patches:

Patch 1 makes our GL_KHR_debug implementation thread-safe.

Patch 2~5 make the GLSL compiler thread-safe.  These are mostly the same as v1,
except for how locale_t is initialized.

Patch 6~7 add a singleton thread pool to the compiler.

Patch 8 adds the infrastructure to core mesa to enable multithread
compiling/linking, and patch 9 adds a dri option to enable DeferCompileShader
for i965.

Patch 10~12 refactor some code in i965 and patch 13 adds the necessary changes
to enable DeferLinkProgram.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCHv2 07/13] glsl: add a singleton GLSL thread pool

2014-07-09 Thread Chia-I Wu

This thread pool will be used by contexts to queue compilation tasks.

Signed-off-by: Chia-I Wu 
---
 src/glsl/glsl_parser_extras.cpp |  4 +++
 src/glsl/threadpool.c   | 72 +
 src/glsl/threadpool.h   |  9 ++
 3 files changed, 85 insertions(+)

diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index ad31469..cb7d59e 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -37,6 +37,7 @@ extern "C" {
 #include "glsl_parser.h"
 #include "ir_optimization.h"
 #include "loop_analysis.h"
+#include "threadpool.h"
 
 /**
  * Format a short human-readable description of the given GLSL version.
@@ -1599,6 +1600,8 @@ extern "C" {
 void
 _mesa_destroy_shader_compiler(void)
 {
+   _mesa_glsl_destroy_threadpool();
+
_mesa_destroy_shader_compiler_caches();
 
_mesa_glsl_release_types();
@@ -1612,6 +1615,7 @@ _mesa_destroy_shader_compiler(void)
 void
 _mesa_destroy_shader_compiler_caches(void)
 {
+   _mesa_glsl_wait_threadpool();
_mesa_glsl_release_builtin_functions();
 }
 
diff --git a/src/glsl/threadpool.c b/src/glsl/threadpool.c
index c069fd3..d6ed8c1 100644
--- a/src/glsl/threadpool.c
+++ b/src/glsl/threadpool.c
@@ -55,6 +55,7 @@ struct _mesa_threadpool_task {
 struct _mesa_threadpool {
mtx_t mutex;
int refcnt;
+   bool shutdown;
 
enum _mesa_threadpool_control thread_control;
thrd_t *threads;
@@ -168,6 +169,12 @@ _mesa_threadpool_queue_task(struct _mesa_threadpool *pool,
 
mtx_lock(&pool->mutex);
 
+   if (unlikely(pool->shutdown)) {
+  mtx_unlock(&pool->mutex);
+  free(task);
+  return NULL;
+   }
+
/* someone is joining with the threads */
while (unlikely(pool->thread_control != MESA_THREADPOOL_NORMAL))
   cnd_wait(&pool->thread_joined, &pool->mutex);
@@ -379,6 +386,17 @@ _mesa_threadpool_join(struct _mesa_threadpool *pool, bool 
graceful)
 }
 
 /**
+ * After this call, no task can be queued.
+ */
+static void
+_mesa_threadpool_set_shutdown(struct _mesa_threadpool *pool)
+{
+   mtx_lock(&pool->mutex);
+   pool->shutdown = true;
+   mtx_unlock(&pool->mutex);
+}
+
+/**
  * Decrease the reference count.  Destroy \p pool when the reference count
  * reaches zero.
  */
@@ -474,3 +492,57 @@ _mesa_threadpool_create(int max_threads)
 
return pool;
 }
+
+static mtx_t threadpool_lock = _MTX_INITIALIZER_NP;
+static struct _mesa_threadpool *threadpool;
+
+/**
+ * Get the singleton GLSL thread pool.  \p max_threads is honored only by the
+ * first call to this function.
+ */
+struct _mesa_threadpool *
+_mesa_glsl_get_threadpool(int max_threads)
+{
+   mtx_lock(&threadpool_lock);
+   if (!threadpool)
+  threadpool = _mesa_threadpool_create(max_threads);
+   if (threadpool)
+  _mesa_threadpool_ref(threadpool);
+   mtx_unlock(&threadpool_lock);
+
+   return threadpool;
+}
+
+/**
+ * Wait until all tasks are completed and threads are joined.
+ */
+void
+_mesa_glsl_wait_threadpool(void)
+{
+   mtx_lock(&threadpool_lock);
+   if (threadpool)
+  _mesa_threadpool_join(threadpool, true);
+   mtx_unlock(&threadpool_lock);
+}
+
+/**
+ * Destroy the GLSL thread pool.
+ */
+void
+_mesa_glsl_destroy_threadpool(void)
+{
+   mtx_lock(&threadpool_lock);
+   if (threadpool) {
+  /*
+   * This is called from _mesa_destroy_shader_compiler().  No new task is
+   * allowed since this point.  But contexts, who also own references to
+   * the pool, can still complete tasks that have been queued.
+   */
+  _mesa_threadpool_set_shutdown(threadpool);
+
+  _mesa_threadpool_join(threadpool, false);
+  _mesa_threadpool_unref(threadpool);
+  threadpool = NULL;
+   }
+   mtx_unlock(&threadpool_lock);
+}
diff --git a/src/glsl/threadpool.h b/src/glsl/threadpool.h
index 48e4a47..b1a8ea8 100644
--- a/src/glsl/threadpool.h
+++ b/src/glsl/threadpool.h
@@ -60,6 +60,15 @@ bool
 _mesa_threadpool_complete_task(struct _mesa_threadpool *pool,
struct _mesa_threadpool_task *task);
 
+struct _mesa_threadpool *
+_mesa_glsl_get_threadpool(int max_threads);
+
+void
+_mesa_glsl_wait_threadpool(void);
+
+void
+_mesa_glsl_destroy_threadpool(void);
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.0.0.rc2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCHv2 10/13] i965: refactor do_vs_prog

2014-07-09 Thread Chia-I Wu

Split do_vs_prog into

  brw_vs_init_compile
  brw_vs_do_compile
  brw_vs_upload_compile
  brw_vs_clear_compile

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_vec4.h |   6 ++
 src/mesa/drivers/dri/i965/brw_vs.c   | 117 ++-
 src/mesa/drivers/dri/i965/brw_vs.h   |   1 +
 3 files changed, 81 insertions(+), 43 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h 
b/src/mesa/drivers/dri/i965/brw_vec4.h
index 87247ea..dd66b6c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -48,6 +48,12 @@ extern "C" {
 
 struct brw_vec4_compile {
GLuint last_scratch; /**< measured in 32-byte (register size) units */
+
+   struct gl_shader_program *shader_prog;
+
+   void *mem_ctx;
+   const unsigned *program;
+   unsigned program_size;
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c 
b/src/mesa/drivers/dri/i965/brw_vs.c
index e0f32b3..22bc473 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -187,31 +187,31 @@ brw_vs_prog_data_compare(const void *in_a, const void 
*in_b)
return true;
 }
 
-static bool
-do_vs_prog(struct brw_context *brw,
-  struct gl_shader_program *prog,
-  struct brw_vertex_program *vp,
-  struct brw_vs_prog_key *key)
+static void
+brw_vs_init_compile(struct brw_context *brw,
+   struct gl_shader_program *prog,
+   struct brw_vertex_program *vp,
+   const struct brw_vs_prog_key *key,
+   struct brw_vs_compile *c)
 {
-   GLuint program_size;
-   const GLuint *program;
-   struct brw_vs_compile c;
-   struct brw_vs_prog_data prog_data;
-   struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base;
-   void *mem_ctx;
-   int i;
-   struct gl_shader *vs = NULL;
-
-   if (prog)
-  vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];
+   memset(c, 0, sizeof(*c));
 
-   memset(&c, 0, sizeof(c));
-   memcpy(&c.key, key, sizeof(*key));
-   memset(&prog_data, 0, sizeof(prog_data));
+   memcpy(&c->key, key, sizeof(*key));
+   c->vp = vp;
+   c->base.shader_prog = prog;
+   c->base.mem_ctx = ralloc_context(NULL);
+}
 
-   mem_ctx = ralloc_context(NULL);
+static bool
+brw_vs_do_compile(struct brw_context *brw,
+ struct brw_vs_compile *c)
+{
+   struct brw_stage_prog_data *stage_prog_data = &c->prog_data.base.base;
+   struct gl_shader *vs = NULL;
+   int i;
 
-   c.vp = vp;
+   if (c->base.shader_prog)
+  vs = c->base.shader_prog->_LinkedShaders[MESA_SHADER_VERTEX];
 
/* Allocate the references to the uniforms that will end up in the
 * prog_data associated with the compiled program, and which will be freed
@@ -226,12 +226,12 @@ do_vs_prog(struct brw_context *brw,
   param_count = vs->num_uniform_components * 4;
 
} else {
-  param_count = vp->program.Base.Parameters->NumParameters * 4;
+  param_count = c->vp->program.Base.Parameters->NumParameters * 4;
}
/* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
 * planes as uniforms.
 */
-   param_count += c.key.base.nr_userclip_plane_consts * 4;
+   param_count += c->key.base.nr_userclip_plane_consts * 4;
 
stage_prog_data->param = rzalloc_array(NULL, const float *, param_count);
stage_prog_data->pull_param = rzalloc_array(NULL, const float *, 
param_count);
@@ -245,12 +245,12 @@ do_vs_prog(struct brw_context *brw,
   stage_prog_data->nr_params += vs->num_samplers;
}
 
-   GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
-   prog_data.inputs_read = vp->program.Base.InputsRead;
+   GLbitfield64 outputs_written = c->vp->program.Base.OutputsWritten;
+   c->prog_data.inputs_read = c->vp->program.Base.InputsRead;
 
-   if (c.key.copy_edgeflag) {
+   if (c->key.copy_edgeflag) {
   outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
-  prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
+  c->prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
}
 
if (brw->gen < 6) {
@@ -261,7 +261,7 @@ do_vs_prog(struct brw_context *brw,
* coords, which would be a pain to handle.
*/
   for (i = 0; i < 8; i++) {
- if (c.key.point_coord_replace & (1 << i))
+ if (c->key.point_coord_replace & (1 << i))
 outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
   }
 
@@ -276,45 +276,76 @@ do_vs_prog(struct brw_context *brw,
 * distance varying slots whenever clipping is enabled, even if the vertex
 * shader doesn't write to gl_ClipDistance.
 */
-   if (c.key.base.userclip_active) {
+   if (c->key.base.userclip_active) {
   outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
   outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
}
 
-   brw_compute_vue_map(brw, &prog_data

[Mesa-dev] [PATCHv2 06/13] glsl: add a generic thread pool data structure

2014-07-09 Thread Chia-I Wu

It can be used to implement, for example, threaded glCompileShader and
glLinkProgram.

v2: allow tasks to "complete" other tasks

Signed-off-by: Chia-I Wu 
---
 src/glsl/Makefile.am   |  12 +-
 src/glsl/Makefile.sources  |   3 +-
 src/glsl/tests/threadpool_test.cpp | 137 +++
 src/glsl/threadpool.c  | 476 +
 src/glsl/threadpool.h  |  67 ++
 5 files changed, 693 insertions(+), 2 deletions(-)
 create mode 100644 src/glsl/tests/threadpool_test.cpp
 create mode 100644 src/glsl/threadpool.c
 create mode 100644 src/glsl/threadpool.h

diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 00261fd..3d07af3 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -35,6 +35,7 @@ TESTS = glcpp/tests/glcpp-test
\
tests/general-ir-test   \
tests/optimization-test \
tests/ralloc-test   \
+   tests/threadpool-test   \
tests/sampler-types-test\
tests/uniform-initializer-test
 
@@ -48,6 +49,7 @@ check_PROGRAMS =  \
glsl_test   \
tests/general-ir-test   \
tests/ralloc-test   \
+   tests/threadpool-test   \
tests/sampler-types-test\
tests/uniform-initializer-test
 
@@ -95,6 +97,14 @@ tests_ralloc_test_LDADD =\
$(top_builddir)/src/gtest/libgtest.la   \
$(PTHREAD_LIBS)
 
+tests_threadpool_test_SOURCES =\
+   tests/threadpool_test.cpp   \
+   $(top_builddir)/src/glsl/threadpool.c
+tests_threadpool_test_CFLAGS = $(PTHREAD_CFLAGS)
+tests_threadpool_test_LDADD =  \
+   $(top_builddir)/src/gtest/libgtest.la   \
+   $(PTHREAD_LIBS)
+
 tests_sampler_types_test_SOURCES = \
$(top_srcdir)/src/mesa/program/prog_hash_table.c\
$(top_srcdir)/src/mesa/program/symbol_table.c   \
@@ -120,7 +130,7 @@ glcpp_glcpp_LDADD = \
libglcpp.la \
-lm
 
-libglsl_la_LIBADD = libglcpp.la
+libglsl_la_LIBADD = libglcpp.la $(PTHREAD_LIBS)
 libglsl_la_SOURCES =   \
glsl_lexer.cpp  \
glsl_parser.cpp \
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 6fc94d6..bab2358 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -103,7 +103,8 @@ LIBGLSL_FILES = \
$(GLSL_SRCDIR)/opt_tree_grafting.cpp \
$(GLSL_SRCDIR)/opt_vectorize.cpp \
$(GLSL_SRCDIR)/s_expression.cpp \
-   $(GLSL_SRCDIR)/strtod.cpp
+   $(GLSL_SRCDIR)/strtod.cpp \
+   $(GLSL_SRCDIR)/threadpool.c
 
 # glsl_compiler
 
diff --git a/src/glsl/tests/threadpool_test.cpp 
b/src/glsl/tests/threadpool_test.cpp
new file mode 100644
index 000..63f55c5
--- /dev/null
+++ b/src/glsl/tests/threadpool_test.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright © 2014 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "c11/threads.h"
+
+#include "threadpool.h"
+
+#define NUM_THREADS 10
+#define OPS_PER_THREAD 100
+#define MAX_TASKS 10
+
+static void
+race_cb(void *data)
+{
+   usleep(1000 * 5);
+}
+
+static int
+race_random_op(void *data)
+{
+   struct _mesa_threadpool *pool = (struct _mesa_threadpool *) data;
+   struct _mesa_threadpool_task *tasks[MAX_TASKS];
+

[Mesa-dev] [PATCHv2 02/13] glsl: rename strtod.c to strtod.cpp

2014-07-09 Thread Chia-I Wu

We want to add a static object to initialize locale_t in the following commit.

Signed-off-by: Chia-I Wu 
---
 src/glsl/Makefile.sources |  2 +-
 src/glsl/strtod.c | 79 ---
 src/glsl/strtod.cpp   | 79 +++
 3 files changed, 80 insertions(+), 80 deletions(-)
 delete mode 100644 src/glsl/strtod.c
 create mode 100644 src/glsl/strtod.cpp

diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index b54eae7..6fc94d6 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -103,7 +103,7 @@ LIBGLSL_FILES = \
$(GLSL_SRCDIR)/opt_tree_grafting.cpp \
$(GLSL_SRCDIR)/opt_vectorize.cpp \
$(GLSL_SRCDIR)/s_expression.cpp \
-   $(GLSL_SRCDIR)/strtod.c
+   $(GLSL_SRCDIR)/strtod.cpp
 
 # glsl_compiler
 
diff --git a/src/glsl/strtod.c b/src/glsl/strtod.c
deleted file mode 100644
index 5d4346b..000
--- a/src/glsl/strtod.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright 2010 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-#include 
-
-#ifdef _GNU_SOURCE
-#include 
-#ifdef __APPLE__
-#include 
-#endif
-#endif
-
-#include "strtod.h"
-
-
-
-/**
- * Wrapper around strtod which uses the "C" locale so the decimal
- * point is always '.'
- */
-double
-glsl_strtod(const char *s, char **end)
-{
-#if defined(_GNU_SOURCE) && !defined(__CYGWIN__) && !defined(__FreeBSD__) && \
-   !defined(__HAIKU__) && !defined(__UCLIBC__)
-   static locale_t loc = NULL;
-   if (!loc) {
-  loc = newlocale(LC_CTYPE_MASK, "C", NULL);
-   }
-   return strtod_l(s, end, loc);
-#else
-   return strtod(s, end);
-#endif
-}
-
-
-/**
- * Wrapper around strtof which uses the "C" locale so the decimal
- * point is always '.'
- */
-float
-glsl_strtof(const char *s, char **end)
-{
-#if defined(_GNU_SOURCE) && !defined(__CYGWIN__) && !defined(__FreeBSD__) && \
-   !defined(__HAIKU__) && !defined(__UCLIBC__)
-   static locale_t loc = NULL;
-   if (!loc) {
-  loc = newlocale(LC_CTYPE_MASK, "C", NULL);
-   }
-   return strtof_l(s, end, loc);
-#elif _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE
-   return strtof(s, end);
-#else
-   return (float) strtod(s, end);
-#endif
-}
diff --git a/src/glsl/strtod.cpp b/src/glsl/strtod.cpp
new file mode 100644
index 000..5d4346b
--- /dev/null
+++ b/src/glsl/strtod.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include 
+
+#ifdef _GNU_SOURCE
+#include 
+#ifdef __APPLE__
+#include 
+#endif
+#endif
+
+#include "strtod.h"
+
+
+
+/

[Mesa-dev] [PATCHv2 03/13] glsl: initialize locale_t with a static object

2014-07-09 Thread Chia-I Wu

The compiler may be used by multiple contexts simultaneously and needs to be
thread-safe.

Signed-off-by: Chia-I Wu 
---
 src/glsl/strtod.cpp | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/glsl/strtod.cpp b/src/glsl/strtod.cpp
index 5d4346b..1ac29ec 100644
--- a/src/glsl/strtod.cpp
+++ b/src/glsl/strtod.cpp
@@ -35,6 +35,17 @@
 
 #include "strtod.h"
 
+#if defined(_GNU_SOURCE) && !defined(__CYGWIN__) && !defined(__FreeBSD__) && \
+   !defined(__HAIKU__) && !defined(__UCLIBC__)
+#define GLSL_HAVE_LOCALE_T
+#endif
+
+#ifdef GLSL_HAVE_LOCALE_T
+static struct locale_initializer {
+   locale_initializer() { loc = newlocale(LC_CTYPE_MASK, "C", NULL); }
+   locale_t loc;
+} loc_init;
+#endif
 
 
 /**
@@ -44,13 +55,8 @@
 double
 glsl_strtod(const char *s, char **end)
 {
-#if defined(_GNU_SOURCE) && !defined(__CYGWIN__) && !defined(__FreeBSD__) && \
-   !defined(__HAIKU__) && !defined(__UCLIBC__)
-   static locale_t loc = NULL;
-   if (!loc) {
-  loc = newlocale(LC_CTYPE_MASK, "C", NULL);
-   }
-   return strtod_l(s, end, loc);
+#ifdef GLSL_HAVE_LOCALE_T
+   return strtod_l(s, end, loc_init.loc);
 #else
return strtod(s, end);
 #endif
@@ -64,13 +70,8 @@ glsl_strtod(const char *s, char **end)
 float
 glsl_strtof(const char *s, char **end)
 {
-#if defined(_GNU_SOURCE) && !defined(__CYGWIN__) && !defined(__FreeBSD__) && \
-   !defined(__HAIKU__) && !defined(__UCLIBC__)
-   static locale_t loc = NULL;
-   if (!loc) {
-  loc = newlocale(LC_CTYPE_MASK, "C", NULL);
-   }
-   return strtof_l(s, end, loc);
+#ifdef GLSL_HAVE_LOCALE_T
+   return strtof_l(s, end, loc_init.loc);
 #elif _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE
return strtof(s, end);
 #else
-- 
2.0.0.rc2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCHv2 01/13] mesa: protect the debug state with a mutex

2014-07-09 Thread Chia-I Wu

We are about to change mesa to spawn threads for deferred glCompileShader and
glLinkProgram, and we need to make sure those threads can send compiler
warnings/errors to the debug output safely.

Signed-off-by: Chia-I Wu 
---
 src/mesa/main/errors.c | 172 +++--
 src/mesa/main/mtypes.h |   1 +
 2 files changed, 126 insertions(+), 47 deletions(-)

diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index aa0ff50..156eb0d 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -676,22 +676,41 @@ debug_pop_group(struct gl_debug_state *debug)
 
 
 /**
- * Return debug state for the context.  The debug state will be allocated
- * and initialized upon the first call.
+ * Lock and return debug state for the context.  The debug state will be
+ * allocated and initialized upon the first call.  When NULL is returned, the
+ * debug state is not locked.
  */
 static struct gl_debug_state *
-_mesa_get_debug_state(struct gl_context *ctx)
+_mesa_lock_debug_state(struct gl_context *ctx)
 {
+   mtx_lock(&ctx->DebugMutex);
+
if (!ctx->Debug) {
   ctx->Debug = debug_create();
   if (!ctx->Debug) {
- _mesa_error(ctx, GL_OUT_OF_MEMORY, "allocating debug state");
+ GET_CURRENT_CONTEXT(cur);
+ mtx_unlock(&ctx->DebugMutex);
+
+ /*
+  * This function may be called from other threads.  When that is the
+  * case, we cannot record this OOM error.
+  */
+ if (ctx == cur)
+_mesa_error(ctx, GL_OUT_OF_MEMORY, "allocating debug state");
+
+ return NULL;
   }
}
 
return ctx->Debug;
 }
 
+static void
+_mesa_unlock_debug_state(struct gl_context *ctx)
+{
+   mtx_unlock(&ctx->DebugMutex);
+}
+
 /**
  * Set the integer debug state specified by \p pname.  This can be called from
  * _mesa_set_enable for example.
@@ -699,7 +718,7 @@ _mesa_get_debug_state(struct gl_context *ctx)
 bool
 _mesa_set_debug_state_int(struct gl_context *ctx, GLenum pname, GLint val)
 {
-   struct gl_debug_state *debug = _mesa_get_debug_state(ctx);
+   struct gl_debug_state *debug = _mesa_lock_debug_state(ctx);
 
if (!debug)
   return false;
@@ -716,6 +735,8 @@ _mesa_set_debug_state_int(struct gl_context *ctx, GLenum 
pname, GLint val)
   break;
}
 
+   _mesa_unlock_debug_state(ctx);
+
return true;
 }
 
@@ -729,9 +750,12 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum 
pname)
struct gl_debug_state *debug;
GLint val;
 
+   mtx_lock(&ctx->DebugMutex);
debug = ctx->Debug;
-   if (!debug)
+   if (!debug) {
+  mtx_unlock(&ctx->DebugMutex);
   return 0;
+   }
 
switch (pname) {
case GL_DEBUG_OUTPUT:
@@ -756,6 +780,8 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum 
pname)
   break;
}
 
+   mtx_unlock(&ctx->DebugMutex);
+
return val;
 }
 
@@ -769,9 +795,12 @@ _mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum 
pname)
struct gl_debug_state *debug;
void *val;
 
+   mtx_lock(&ctx->DebugMutex);
debug = ctx->Debug;
-   if (!debug)
+   if (!debug) {
+  mtx_unlock(&ctx->DebugMutex);
   return NULL;
+   }
 
switch (pname) {
case GL_DEBUG_CALLBACK_FUNCTION_ARB:
@@ -786,9 +815,49 @@ _mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum 
pname)
   break;
}
 
+   mtx_unlock(&ctx->DebugMutex);
+
return val;
 }
 
+/**
+ * Insert a debug message.  The mutex is assumed to be locked, and will be
+ * unlocked by this call.
+ */
+static void
+log_msg_locked_and_unlock(struct gl_context *ctx,
+  enum mesa_debug_source source,
+  enum mesa_debug_type type, GLuint id,
+  enum mesa_debug_severity severity,
+  GLint len, const char *buf)
+{
+   struct gl_debug_state *debug = ctx->Debug;
+
+   if (!debug_is_message_enabled(debug, source, type, id, severity)) {
+  _mesa_unlock_debug_state(ctx);
+  return;
+   }
+
+   if (ctx->Debug->Callback) {
+  GLenum gl_source = debug_source_enums[source];
+  GLenum gl_type = debug_type_enums[type];
+  GLenum gl_severity = debug_severity_enums[severity];
+  GLDEBUGPROC callback = ctx->Debug->Callback;
+  const void *data = ctx->Debug->CallbackData;
+
+  /*
+   * When ctx->Debug->SyncOutput is GL_FALSE, the client is prepared for
+   * unsynchronous calls.  When it is GL_TRUE, we will not spawn threads.
+   * In either case, we can call the callback unlocked.
+   */
+  _mesa_unlock_debug_state(ctx);
+  callback(gl_source, gl_type, id, gl_severity, len, buf, data);
+   }
+   else {
+  debug_log_message(ctx->Debug, source, type, id, severity, len, buf);
+  _mesa_unlock_debug_state(ctx);
+   }
+}
 
 /**
  * Log a client or driver debug message.
@@ -798,24 +867,12 @@ log_msg(struc

[Mesa-dev] [PATCHv2 11/13] i965: refactor do_gs_prog

2014-07-09 Thread Chia-I Wu

Split do_gs_prog into

  brw_gs_init_compile
  brw_gs_do_compile
  brw_gs_upload_compile
  brw_gs_clear_compile

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_vec4_gs.c | 153 
 1 file changed, 96 insertions(+), 57 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs.c 
b/src/mesa/drivers/dri/i965/brw_vec4_gs.c
index 6428291..39ee507 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs.c
@@ -33,22 +33,29 @@
 #include "brw_state.h"
 
 
-static bool
-do_gs_prog(struct brw_context *brw,
-   struct gl_shader_program *prog,
-   struct brw_geometry_program *gp,
-   struct brw_gs_prog_key *key)
+static void
+brw_gs_init_compile(struct brw_context *brw,
+struct gl_shader_program *prog,
+struct brw_geometry_program *gp,
+const struct brw_gs_prog_key *key,
+struct brw_gs_compile *c)
 {
-   struct brw_stage_state *stage_state = &brw->gs.base;
-   struct brw_gs_compile c;
-   memset(&c, 0, sizeof(c));
-   c.key = *key;
-   c.gp = gp;
+   memset(c, 0, sizeof(*c));
 
-   c.prog_data.include_primitive_id =
-  (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;
+   c->key = *key;
+   c->gp = gp;
+   c->base.shader_prog = prog;
+   c->base.mem_ctx = ralloc_context(NULL);
+}
 
-   c.prog_data.invocations = gp->program.Invocations;
+static bool
+brw_gs_do_compile(struct brw_context *brw,
+  struct brw_gs_compile *c)
+{
+   c->prog_data.include_primitive_id =
+  (c->gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;
+
+   c->prog_data.invocations = c->gp->program.Invocations;
 
/* Allocate the references to the uniforms that will end up in the
 * prog_data associated with the compiled program, and which will be freed
@@ -58,34 +65,35 @@ do_gs_prog(struct brw_context *brw,
 * padding around uniform values below vec4 size, so the worst case is that
 * every uniform is a float which gets padded to the size of a vec4.
 */
-   struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
+   struct gl_shader *gs =
+  c->base.shader_prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
int param_count = gs->num_uniform_components * 4;
 
/* We also upload clip plane data as uniforms */
param_count += MAX_CLIP_PLANES * 4;
 
-   c.prog_data.base.base.param =
+   c->prog_data.base.base.param =
   rzalloc_array(NULL, const float *, param_count);
-   c.prog_data.base.base.pull_param =
+   c->prog_data.base.base.pull_param =
   rzalloc_array(NULL, const float *, param_count);
/* Setting nr_params here NOT to the size of the param and pull_param
 * arrays, but to the number of uniform components vec4_visitor
 * needs. vec4_visitor::setup_uniforms() will set it back to a proper value.
 */
-   c.prog_data.base.base.nr_params = ALIGN(param_count, 4) / 4 + 
gs->num_samplers;
+   c->prog_data.base.base.nr_params = ALIGN(param_count, 4) / 4 + 
gs->num_samplers;
 
-   if (gp->program.OutputType == GL_POINTS) {
+   if (c->gp->program.OutputType == GL_POINTS) {
   /* When the output type is points, the geometry shader may output data
* to multiple streams, and EndPrimitive() has no effect.  So we
* configure the hardware to interpret the control data as stream ID.
*/
-  c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
+  c->prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
 
   /* We only have to emit control bits if we are using streams */
-  if (prog->Geom.UsesStreams)
- c.control_data_bits_per_vertex = 2;
+  if (c->base.shader_prog->Geom.UsesStreams)
+ c->control_data_bits_per_vertex = 2;
   else
- c.control_data_bits_per_vertex = 0;
+ c->control_data_bits_per_vertex = 0;
} else {
   /* When the output type is triangle_strip or line_strip, EndPrimitive()
* may be used to terminate the current strip and start a new one
@@ -93,32 +101,33 @@ do_gs_prog(struct brw_context *brw,
* streams is not supported.  So we configure the hardware to interpret
* the control data as EndPrimitive information (a.k.a. "cut bits").
*/
-  c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
+  c->prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
 
   /* We only need to output control data if the shader actually calls
* EndPrimitive().
*/
-  c.control_data_bits_per_vertex = gp->program.UsesEndPrimitive ? 1 : 0;
+  c->control_data_bits_per_vertex =
+ c->gp->program.UsesEndPrimitive ? 1 : 0;
}
-   c.control_data_header_size_bits =
-  gp->program.VerticesOut * c.c

[Mesa-dev] [PATCHv2 08/13] mesa: add infrastructure for threaded shader compilation

2014-07-09 Thread Chia-I Wu

Add _mesa_enable_glsl_threadpool to enable the thread pool for a context, and
add ctx->Const.DeferCompileShader and ctx->Const.DeferLinkProgram to
fine-control what gets threaded.

Setting DeferCompileShader to true will make _mesa_glsl_compile_shader be
executed in a worker thread.  The function is thread-safe so there is no
restriction on DeferCompileShader.

Setting DeferLinkProgram to true will make _mesa_glsl_link_shader be executed
in a worker thread.  The function is thread-safe only when certain driver
functions (as documented in struct gl_constants) are thread-safe.  It is
drivers' responsibility to fix those driver functions before setting
DeferLinkProgram.

When DeferLinkProgram is set, drivers are not supposed to inspect the context
in their LinkShader callbacks.  Instead, NotifyLinkShader is added.  Drivers
should inspect the context in NotifyLinkShader and save what they need for
LinkShader in gl_shader_program.

As a final note, most applications will not benefit from threaded shader
compilation because they check GL_COMPILE_STATUS/GL_LINK_STATUS immediately,
giving the worker threads no time to do their jobs.  A possible improvement is
to split LinkShader into two parts: the first part links and error checks
while the second part optimizes and generates the machine code.  With the
split, we can always defer the second part to the thread pool.

Signed-off-by: Chia-I Wu 
---
 src/mesa/main/context.c |  29 +++
 src/mesa/main/context.h |   3 ++
 src/mesa/main/dd.h  |   8 +++
 src/mesa/main/mtypes.h  |  34 
 src/mesa/main/pipelineobj.c |  18 +++
 src/mesa/main/shaderapi.c   | 122 +++-
 src/mesa/main/shaderobj.c   |  74 +--
 src/mesa/main/shaderobj.h   |  55 ++--
 8 files changed, 322 insertions(+), 21 deletions(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index b082159..e27450c 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -112,6 +112,7 @@
 #include "points.h"
 #include "polygon.h"
 #include "queryobj.h"
+#include "shaderapi.h"
 #include "syncobj.h"
 #include "rastpos.h"
 #include "remap.h"
@@ -139,6 +140,7 @@
 #endif
 
 #include "glsl_parser_extras.h"
+#include "threadpool.h"
 #include 
 
 
@@ -1187,6 +1189,27 @@ _mesa_create_context(gl_api api,
}
 }
 
+void
+_mesa_enable_glsl_threadpool(struct gl_context *ctx, int max_threads)
+{
+   if (!ctx->ThreadPool)
+  ctx->ThreadPool = _mesa_glsl_get_threadpool(max_threads);
+}
+
+static void
+wait_shader_object_cb(GLuint id, void *data, void *userData)
+{
+   struct gl_context *ctx = (struct gl_context *) userData;
+   struct gl_shader *sh = (struct gl_shader *) data;
+
+   if (_mesa_validate_shader_target(ctx, sh->Type)) {
+  _mesa_wait_shaders(ctx, &sh, 1);
+   }
+   else {
+  struct gl_shader_program *shProg = (struct gl_shader_program *) data;
+  _mesa_wait_shader_program(ctx, shProg);
+   }
+}
 
 /**
  * Free the data associated with the given context.
@@ -1205,6 +1228,12 @@ _mesa_free_context_data( struct gl_context *ctx )
   _mesa_make_current(ctx, NULL, NULL);
}
 
+   if (ctx->ThreadPool) {
+  _mesa_HashWalk(ctx->Shared->ShaderObjects, wait_shader_object_cb, ctx);
+  _mesa_threadpool_unref(ctx->ThreadPool);
+  ctx->ThreadPool = NULL;
+   }
+
/* unreference WinSysDraw/Read buffers */
_mesa_reference_framebuffer(&ctx->WinSysDrawBuffer, NULL);
_mesa_reference_framebuffer(&ctx->WinSysReadBuffer, NULL);
diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h
index 792ab4c..b23f9fa 100644
--- a/src/mesa/main/context.h
+++ b/src/mesa/main/context.h
@@ -118,6 +118,9 @@ _mesa_create_context(gl_api api,
  const struct dd_function_table *driverFunctions);
 
 extern void
+_mesa_enable_glsl_threadpool(struct gl_context *ctx, int max_threads);
+
+extern void
 _mesa_free_context_data( struct gl_context *ctx );
 
 extern void
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 633ea2c..38f8c68 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -447,6 +447,14 @@ struct dd_function_table {
 */
/*@{*/
/**
+* Called when a shader program is to be linked.
+*
+* This is optional and gives drivers an opportunity to inspect the context
+* and prepare for LinkShader, which may be deferred to another thread.
+*/
+   void (*NotifyLinkShader)(struct gl_context *ctx,
+struct gl_shader_program *shader);
+   /**
 * Called when a shader program is linked.
 *
 * This gives drivers an opportunity to clone the IR and make their
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 5964576..316da23 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -71,6 +71,8 @@ typedef GLuint

[Mesa-dev] [PATCHv2 13/13] i965: enable threaded precompile

2014-07-09 Thread Chia-I Wu

Inherit gl_shader_program and add save/restore functions to save precompile
results in the shader programs.  When DeferLinkProgram is set, we will save
the precompile results instead of uploading them immediately because we may be
on a different thread.

A few other modifications are also needed.  brw_shader_program_precompile_key
is introduced and initialized in NotifyLinkShader because we cannot inspect
the context during precompiling.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_context.c  |   4 +-
 src/mesa/drivers/dri/i965/brw_fs.cpp |  33 --
 src/mesa/drivers/dri/i965/brw_program.c  |   1 +
 src/mesa/drivers/dri/i965/brw_shader.cpp | 177 ++-
 src/mesa/drivers/dri/i965/brw_shader.h   |  44 
 src/mesa/drivers/dri/i965/brw_vec4_gs.c  |  37 +--
 src/mesa/drivers/dri/i965/brw_vs.c   |  36 +--
 src/mesa/drivers/dri/i965/brw_wm.c   |  23 ++--
 src/mesa/drivers/dri/i965/brw_wm.h   |   5 +-
 9 files changed, 310 insertions(+), 50 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c 
b/src/mesa/drivers/dri/i965/brw_context.c
index bd13ebf..4a28766 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -784,8 +784,8 @@ brwCreateContext(gl_api api,
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
   brw_init_shader_time(brw);
 
-   /* brw_shader_precompile is not thread-safe */
-   if (brw->precompile)
+   /* brw_shader_precompile is not thread-safe when debug flags are set */
+   if (brw->precompile && (INTEL_DEBUG || brw->perf_debug))
   ctx->Const.DeferLinkProgram = GL_FALSE;
 
_mesa_compute_version(ctx);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index a3ad375..61a0dff 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3288,6 +3288,8 @@ bool
 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
 {
struct brw_context *brw = brw_context(ctx);
+   const struct brw_shader_program_precompile_key *pre_key =
+  brw_shader_program_get_precompile_key(prog);
struct brw_wm_prog_key key;
 
if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
@@ -3329,7 +3331,7 @@ brw_fs_precompile(struct gl_context *ctx, struct 
gl_shader_program *prog)
}
 
if (fp->Base.InputsRead & VARYING_BIT_POS) {
-  key.drawable_height = ctx->DrawBuffer->Height;
+  key.drawable_height = pre_key->fbo_height;
}
 
key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
@@ -3337,7 +3339,7 @@ brw_fs_precompile(struct gl_context *ctx, struct 
gl_shader_program *prog)
  BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
 
if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
-  key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
+  key.render_to_fbo = pre_key->is_user_fbo ||
   key.nr_color_regions > 1;
}
 
@@ -3349,13 +3351,28 @@ brw_fs_precompile(struct gl_context *ctx, struct 
gl_shader_program *prog)
 
key.program_string_id = bfp->id;
 
-   uint32_t old_prog_offset = brw->wm.base.prog_offset;
-   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
+   struct brw_wm_compile c;
 
-   bool success = do_wm_prog(brw, prog, bfp, &key);
+   brw_wm_init_compile(brw, prog, bfp, &key, &c);
+   if (!brw_wm_do_compile(brw, &c)) {
+  brw_wm_clear_compile(brw, &c);
+  return false;
+   }
+
+   if (brw->ctx.Const.DeferLinkProgram) {
+  brw_shader_program_save_wm_compile(prog, &c);
+   }
+   else {
+  uint32_t old_prog_offset = brw->wm.base.prog_offset;
+  struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
 
-   brw->wm.base.prog_offset = old_prog_offset;
-   brw->wm.prog_data = old_prog_data;
+  brw_wm_upload_compile(brw, &c);
 
-   return success;
+  brw->wm.base.prog_offset = old_prog_offset;
+  brw->wm.prog_data = old_prog_data;
+   }
+
+   brw_wm_clear_compile(brw, &c);
+
+   return true;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_program.c 
b/src/mesa/drivers/dri/i965/brw_program.c
index cff1188..2194640 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -259,6 +259,7 @@ void brwInitFragProgFuncs( struct dd_function_table 
*functions )
functions->NewShader = brw_new_shader;
functions->NewShaderProgram = brw_new_shader_program;
functions->LinkShader = brw_link_shader;
+   functions->NotifyLinkShader = brw_notify_link_shader;
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 318802b..3cf1f15 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -25,14 +25,52 @@ extern "C" {
 #include "main/macros.h"
 #include "

[Mesa-dev] [PATCHv2 09/13] i965: add drirc option multithread_glsl_compiler

2014-07-09 Thread Chia-I Wu

Setting it to a non-zero value N will cause shader compilation to be deferred
to a thread pool.  When N is greater than 1, it indicates the maximum number
of threads in the pool.  When N is 1, the number of threads is up to the
driver (two for i965).

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/common/xmlpool/t_options.h |  4 
 src/mesa/drivers/dri/i965/brw_context.c | 15 +++
 src/mesa/drivers/dri/i965/intel_screen.c|  2 ++
 3 files changed, 21 insertions(+)

diff --git a/src/mesa/drivers/dri/common/xmlpool/t_options.h 
b/src/mesa/drivers/dri/common/xmlpool/t_options.h
index fc9e104..80abaf9 100644
--- a/src/mesa/drivers/dri/common/xmlpool/t_options.h
+++ b/src/mesa/drivers/dri/common/xmlpool/t_options.h
@@ -293,6 +293,10 @@ DRI_CONF_OPT_BEGIN_V(texture_heaps,enum,def,"0:2") \
DRI_CONF_DESC_END \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_MULTITHREAD_GLSL_COMPILER(def) \
+DRI_CONF_OPT_BEGIN(multithread_glsl_compiler, int, def) \
+DRI_CONF_DESC(en,gettext("Enable multithreading in the GLSL 
compiler")) \
+DRI_CONF_OPT_END
 
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_context.c 
b/src/mesa/drivers/dri/i965/brw_context.c
index c47ad36..bd13ebf 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -570,6 +570,17 @@ brw_process_driconf_options(struct brw_context *brw)
 
ctx->Const.DisableGLSLLineContinuations =
   driQueryOptionb(options, "disable_glsl_line_continuations");
+
+   const int multithread_glsl_compiler =
+  driQueryOptioni(options, "multithread_glsl_compiler");
+   if (multithread_glsl_compiler > 0) {
+  const int max_threads = (multithread_glsl_compiler > 1) ?
+ multithread_glsl_compiler : 2;
+
+  _mesa_enable_glsl_threadpool(ctx, max_threads);
+  ctx->Const.DeferCompileShader = GL_TRUE;
+  ctx->Const.DeferLinkProgram = GL_TRUE;
+   }
 }
 
 GLboolean
@@ -773,6 +784,10 @@ brwCreateContext(gl_api api,
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
   brw_init_shader_time(brw);
 
+   /* brw_shader_precompile is not thread-safe */
+   if (brw->precompile)
+  ctx->Const.DeferLinkProgram = GL_FALSE;
+
_mesa_compute_version(ctx);
 
_mesa_initialize_dispatch_tables(ctx);
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c 
b/src/mesa/drivers/dri/i965/intel_screen.c
index 5b0cbf3..b91d1b1 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -48,6 +48,8 @@ static const __DRIconfigOptionsExtension brw_config_options = 
{
 DRI_CONF_BEGIN
DRI_CONF_SECTION_PERFORMANCE
   DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_ALWAYS_SYNC)
+  DRI_CONF_MULTITHREAD_GLSL_COMPILER(0)
+
   /* Options correspond to DRI_CONF_BO_REUSE_DISABLED,
* DRI_CONF_BO_REUSE_ALL
*/
-- 
2.0.0.rc2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCHv2 05/13] glsl: protect glsl_type with a mutex

2014-07-09 Thread Chia-I Wu

glsl_type has several static hash tables and a static ralloc context.  They
need to be protected by a mutex as they are not thread-safe.

Signed-off-by: Chia-I Wu 
---
 src/glsl/glsl_types.cpp | 57 +++--
 src/glsl/glsl_types.h   | 15 +
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index f9cd258..a9261e4 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -29,6 +29,7 @@ extern "C" {
 #include "program/hash_table.h"
 }
 
+mtx_t glsl_type::mutex = _MTX_INITIALIZER_NP;
 hash_table *glsl_type::array_types = NULL;
 hash_table *glsl_type::record_types = NULL;
 hash_table *glsl_type::interface_types = NULL;
@@ -53,9 +54,14 @@ glsl_type::glsl_type(GLenum gl_type,
vector_elements(vector_elements), matrix_columns(matrix_columns),
length(0)
 {
+   mtx_lock(&glsl_type::mutex);
+
init_ralloc_type_ctx();
assert(name != NULL);
this->name = ralloc_strdup(this->mem_ctx, name);
+
+   mtx_unlock(&glsl_type::mutex);
+
/* Neither dimension is zero or both dimensions are zero.
 */
assert((vector_elements == 0) == (matrix_columns == 0));
@@ -71,9 +77,14 @@ glsl_type::glsl_type(GLenum gl_type, glsl_base_type 
base_type,
sampler_array(array), sampler_type(type), interface_packing(0),
length(0)
 {
+   mtx_lock(&glsl_type::mutex);
+
init_ralloc_type_ctx();
assert(name != NULL);
this->name = ralloc_strdup(this->mem_ctx, name);
+
+   mtx_unlock(&glsl_type::mutex);
+
memset(& fields, 0, sizeof(fields));
 
if (base_type == GLSL_TYPE_SAMPLER) {
@@ -95,11 +106,14 @@ glsl_type::glsl_type(const glsl_struct_field *fields, 
unsigned num_fields,
 {
unsigned int i;
 
+   mtx_lock(&glsl_type::mutex);
+
init_ralloc_type_ctx();
assert(name != NULL);
this->name = ralloc_strdup(this->mem_ctx, name);
this->fields.structure = ralloc_array(this->mem_ctx,
 glsl_struct_field, length);
+
for (i = 0; i < length; i++) {
   this->fields.structure[i].type = fields[i].type;
   this->fields.structure[i].name = ralloc_strdup(this->fields.structure,
@@ -110,6 +124,8 @@ glsl_type::glsl_type(const glsl_struct_field *fields, 
unsigned num_fields,
   this->fields.structure[i].sample = fields[i].sample;
   this->fields.structure[i].row_major = fields[i].row_major;
}
+
+   mtx_unlock(&glsl_type::mutex);
 }
 
 glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
@@ -123,6 +139,8 @@ glsl_type::glsl_type(const glsl_struct_field *fields, 
unsigned num_fields,
 {
unsigned int i;
 
+   mtx_lock(&glsl_type::mutex);
+
init_ralloc_type_ctx();
assert(name != NULL);
this->name = ralloc_strdup(this->mem_ctx, name);
@@ -138,6 +156,8 @@ glsl_type::glsl_type(const glsl_struct_field *fields, 
unsigned num_fields,
   this->fields.structure[i].sample = fields[i].sample;
   this->fields.structure[i].row_major = fields[i].row_major;
}
+
+   mtx_unlock(&glsl_type::mutex);
 }
 
 
@@ -285,6 +305,8 @@ const glsl_type *glsl_type::get_scalar_type() const
 void
 _mesa_glsl_release_types(void)
 {
+   mtx_lock(&glsl_type::mutex);
+
if (glsl_type::array_types != NULL) {
   hash_table_dtor(glsl_type::array_types);
   glsl_type::array_types = NULL;
@@ -294,6 +316,8 @@ _mesa_glsl_release_types(void)
   hash_table_dtor(glsl_type::record_types);
   glsl_type::record_types = NULL;
}
+
+   mtx_unlock(&glsl_type::mutex);
 }
 
 
@@ -316,7 +340,10 @@ glsl_type::glsl_type(const glsl_type *array, unsigned 
length) :
 * NUL.
 */
const unsigned name_length = strlen(array->name) + 10 + 3;
+
+   mtx_lock(&glsl_type::mutex);
char *const n = (char *) ralloc_size(this->mem_ctx, name_length);
+   mtx_unlock(&glsl_type::mutex);
 
if (length == 0)
   snprintf(n, name_length, "%s[]", array->name);
@@ -452,12 +479,6 @@ glsl_type::get_instance(unsigned base_type, unsigned rows, 
unsigned columns)
 const glsl_type *
 glsl_type::get_array_instance(const glsl_type *base, unsigned array_size)
 {
-
-   if (array_types == NULL) {
-  array_types = hash_table_ctor(64, hash_table_string_hash,
-   hash_table_string_compare);
-   }
-
/* Generate a name using the base type pointer in the key.  This is
 * done because the name of the base type may not be unique across
 * shaders.  For example, two shaders may have different record types
@@ -466,9 +487,19 @@ glsl_type::get_array_instance(const glsl_type *base, 
unsigned array_size)
char key[128];
snprintf(key, sizeof(key), "%p[%u]", (void *) base, array_size);
 
+   mtx_lock(&glsl_type::mutex);
+
+   if (array_types == NULL) {
+  array_types = hash_table_ctor(64, hash_table_string_hash,
+

[Mesa-dev] [PATCHv2 12/13] i965: refactor do_wm_prog

2014-07-09 Thread Chia-I Wu

Split do_wm_prog into

  brw_wm_init_compile
  brw_wm_do_compile
  brw_wm_upload_compile
  brw_wm_clear_compile

Add struct brw_wm_compile to be passed around them.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_wm.c | 119 -
 src/mesa/drivers/dri/i965/brw_wm.h |  30 ++
 2 files changed, 107 insertions(+), 42 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_wm.c 
b/src/mesa/drivers/dri/i965/brw_wm.c
index d716e6f..6849963 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -135,27 +135,30 @@ brw_wm_prog_data_compare(const void *in_a, const void 
*in_b)
return true;
 }
 
-/**
- * All Mesa program -> GPU code generation goes through this function.
- * Depending on the instructions used (i.e. flow control instructions)
- * we'll use one of two code generators.
- */
-bool do_wm_prog(struct brw_context *brw,
-   struct gl_shader_program *prog,
-   struct brw_fragment_program *fp,
-   struct brw_wm_prog_key *key)
+void
+brw_wm_init_compile(struct brw_context *brw,
+   struct gl_shader_program *prog,
+   struct brw_fragment_program *fp,
+   const struct brw_wm_prog_key *key,
+   struct brw_wm_compile *c)
+{
+   memset(c, 0, sizeof(*c));
+
+   c->shader_prog = prog;
+   c->fp = fp;
+   c->key = key;
+   c->mem_ctx = ralloc_context(NULL);
+}
+
+bool
+brw_wm_do_compile(struct brw_context *brw,
+  struct brw_wm_compile *c)
 {
struct gl_context *ctx = &brw->ctx;
-   void *mem_ctx = ralloc_context(NULL);
-   struct brw_wm_prog_data prog_data;
-   const GLuint *program;
struct gl_shader *fs = NULL;
-   GLuint program_size;
 
-   if (prog)
-  fs = prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
-
-   memset(&prog_data, 0, sizeof(prog_data));
+   if (c->shader_prog)
+  fs = c->shader_prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
 
/* Allocate the references to the uniforms that will end up in the
 * prog_data associated with the compiled program, and which will be freed
@@ -165,42 +168,74 @@ bool do_wm_prog(struct brw_context *brw,
if (fs) {
   param_count = fs->num_uniform_components;
} else {
-  param_count = fp->program.Base.Parameters->NumParameters * 4;
+  param_count = c->fp->program.Base.Parameters->NumParameters * 4;
}
/* The backend also sometimes adds params for texture size. */
param_count += 2 * 
ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits;
-   prog_data.base.param = rzalloc_array(NULL, const float *, param_count);
-   prog_data.base.pull_param =
-   rzalloc_array(NULL, const float *, param_count);
-   prog_data.base.nr_params = param_count;
-
-   prog_data.barycentric_interp_modes =
-  brw_compute_barycentric_interp_modes(brw, key->flat_shade,
-   key->persample_shading,
-   &fp->program);
-
-   program = brw_wm_fs_emit(brw, mem_ctx, key, &prog_data,
-&fp->program, prog, &program_size);
-   if (program == NULL) {
-  ralloc_free(mem_ctx);
+   c->prog_data.base.param = rzalloc_array(NULL, const float *, param_count);
+   c->prog_data.base.pull_param =
+  rzalloc_array(NULL, const float *, param_count);
+   c->prog_data.base.nr_params = param_count;
+
+   c->prog_data.barycentric_interp_modes =
+  brw_compute_barycentric_interp_modes(brw, c->key->flat_shade,
+   c->key->persample_shading,
+   &c->fp->program);
+
+   c->program = brw_wm_fs_emit(brw, c->mem_ctx, c->key, &c->prog_data,
+ &c->fp->program, c->shader_prog, &c->program_size);
+   if (c->program == NULL)
   return false;
-   }
-
-   if (prog_data.total_scratch) {
-  brw_get_scratch_bo(brw, &brw->wm.base.scratch_bo,
-prog_data.total_scratch * brw->max_wm_threads);
-   }
 
if (unlikely(INTEL_DEBUG & DEBUG_WM))
   fprintf(stderr, "\n");
 
+   return true;
+}
+
+void
+brw_wm_upload_compile(struct brw_context *brw,
+  const struct brw_wm_compile *c)
+{
+   if (c->prog_data.total_scratch) {
+  brw_get_scratch_bo(brw, &brw->wm.base.scratch_bo,
+c->prog_data.total_scratch * brw->max_wm_threads);
+   }
+
brw_upload_cache(&brw->cache, BRW_WM_PROG,
-   key, sizeof(struct brw_wm_prog_key),
-   program, program_size,
-   &prog_data, sizeof(prog_data),
+   c->key, sizeof(struct brw_wm_prog_key),
+   c->program, c->program_size,
+   &c->prog

[Mesa-dev] [PATCHv2 04/13] glsl: protect anonymous struct id with a mutex

2014-07-09 Thread Chia-I Wu

There may be two contexts compiling shaders at the same time, and we want the
anonymous struct id to be globally unique.

Signed-off-by: Chia-I Wu 
---
 src/glsl/glsl_parser_extras.cpp | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index b327c2b..ad31469 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -1347,9 +1347,15 @@ ast_struct_specifier::ast_struct_specifier(const char 
*identifier,
   ast_declarator_list *declarator_list)
 {
if (identifier == NULL) {
+  static mtx_t mutex = _MTX_INITIALIZER_NP;
   static unsigned anon_count = 1;
-  identifier = ralloc_asprintf(this, "#anon_struct_%04x", anon_count);
-  anon_count++;
+  unsigned count;
+
+  mtx_lock(&mutex);
+  count = anon_count++;
+  mtx_unlock(&mutex);
+
+  identifier = ralloc_asprintf(this, "#anon_struct_%04x", count);
}
name = identifier;
this->declarations.push_degenerate_list_at_head(&declarator_list->link);
-- 
2.0.0.rc2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCHv2 06/13] glsl: add a generic thread pool data structure

2014-07-09 Thread Chia-I Wu

On Wed, Jul 9, 2014 at 10:42 PM, Brian Paul  wrote:
> On 07/09/2014 01:47 AM, Chia-I Wu wrote:
>>
>> It can be used to implement, for example, threaded glCompileShader and
>> glLinkProgram.
>>
>> v2: allow tasks to "complete" other tasks
>>
>> Signed-off-by: Chia-I Wu 
>> ---
>>   src/glsl/Makefile.am   |  12 +-
>>   src/glsl/Makefile.sources  |   3 +-
>>   src/glsl/tests/threadpool_test.cpp | 137 +++
>>   src/glsl/threadpool.c  | 476
>> +
>>   src/glsl/threadpool.h  |  67 ++
>
>
> Does the threadpool code have anything GLSL-specific in it?  If not, maybe
> these files should go in src/mesa/main/
No, there is no GLSL-specific code here.  The following commit (patch
7) adds the singleton of the pool to GLSL.  The code is here because
the singleton is the only user of it, and it does not make much sense
to have more than one pool.

I do not have a preference where these files should reside.  I will
move them if you or others have a preference.




>
>
>
>>   5 files changed, 693 insertions(+), 2 deletions(-)
>>   create mode 100644 src/glsl/tests/threadpool_test.cpp
>>   create mode 100644 src/glsl/threadpool.c
>>   create mode 100644 src/glsl/threadpool.h
>>
>> diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
>> index 00261fd..3d07af3 100644
>> --- a/src/glsl/Makefile.am
>> +++ b/src/glsl/Makefile.am
>> @@ -35,6 +35,7 @@ TESTS = glcpp/tests/glcpp-test
>> \
>> tests/general-ir-test   \
>> tests/optimization-test \
>> tests/ralloc-test   \
>> +   tests/threadpool-test   \
>> tests/sampler-types-test\
>> tests/uniform-initializer-test
>>
>> @@ -48,6 +49,7 @@ check_PROGRAMS =  \
>> glsl_test   \
>> tests/general-ir-test   \
>> tests/ralloc-test   \
>> +   tests/threadpool-test   \
>> tests/sampler-types-test\
>> tests/uniform-initializer-test
>>
>> @@ -95,6 +97,14 @@ tests_ralloc_test_LDADD =\
>> $(top_builddir)/src/gtest/libgtest.la   \
>> $(PTHREAD_LIBS)
>>
>> +tests_threadpool_test_SOURCES =\
>> +   tests/threadpool_test.cpp   \
>> +   $(top_builddir)/src/glsl/threadpool.c
>> +tests_threadpool_test_CFLAGS = $(PTHREAD_CFLAGS)
>> +tests_threadpool_test_LDADD =  \
>> +   $(top_builddir)/src/gtest/libgtest.la   \
>> +   $(PTHREAD_LIBS)
>> +
>>   tests_sampler_types_test_SOURCES =\
>> $(top_srcdir)/src/mesa/program/prog_hash_table.c\
>> $(top_srcdir)/src/mesa/program/symbol_table.c   \
>> @@ -120,7 +130,7 @@ glcpp_glcpp_LDADD = \
>> libglcpp.la \
>> -lm
>>
>> -libglsl_la_LIBADD = libglcpp.la
>> +libglsl_la_LIBADD = libglcpp.la $(PTHREAD_LIBS)
>>   libglsl_la_SOURCES =  \
>> glsl_lexer.cpp  \
>> glsl_parser.cpp \
>> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
>> index 6fc94d6..bab2358 100644
>> --- a/src/glsl/Makefile.sources
>> +++ b/src/glsl/Makefile.sources
>> @@ -103,7 +103,8 @@ LIBGLSL_FILES = \
>> $(GLSL_SRCDIR)/opt_tree_grafting.cpp \
>> $(GLSL_SRCDIR)/opt_vectorize.cpp \
>> $(GLSL_SRCDIR)/s_expression.cpp \
>> -   $(GLSL_SRCDIR)/strtod.cpp
>> +   $(GLSL_SRCDIR)/strtod.cpp \
>> +   $(GLSL_SRCDIR)/threadpool.c
>>
>>   # glsl_compiler
>>
>> diff --git a/src/glsl/tests/threadpool_test.cpp
>> b/src/glsl/tests/threadpool_test.cpp
>> new file mode 100644
>> index 000..63f55c5
>> --- /dev/null
>> +++ b/src/glsl/tests/threadpool_test.cpp
>> @@ -0,0 +1,137 @@
>> +/*
>> + * Copyright © 2014 LunarG, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining
>> a
>> + * copy of this software and associated documentation files (the
>> "Software"),
>> + * to deal in the Software with

Re: [Mesa-dev] [PATCHv2 08/13] mesa: add infrastructure for threaded shader compilation

2014-07-09 Thread Chia-I Wu

On Wed, Jul 9, 2014 at 10:42 PM, Brian Paul  wrote:
> On 07/09/2014 01:47 AM, Chia-I Wu wrote:
>>
>> Add _mesa_enable_glsl_threadpool to enable the thread pool for a context,
>> and
>> add ctx->Const.DeferCompileShader and ctx->Const.DeferLinkProgram to
>> fine-control what gets threaded.
>>
>> Setting DeferCompileShader to true will make _mesa_glsl_compile_shader be
>> executed in a worker thread.  The function is thread-safe so there is no
>> restriction on DeferCompileShader.
>>
>> Setting DeferLinkProgram to true will make _mesa_glsl_link_shader be
>> executed
>> in a worker thread.  The function is thread-safe only when certain driver
>> functions (as documented in struct gl_constants) are thread-safe.  It is
>> drivers' responsibility to fix those driver functions before setting
>> DeferLinkProgram.
>>
>> When DeferLinkProgram is set, drivers are not supposed to inspect the
>> context
>> in their LinkShader callbacks.  Instead, NotifyLinkShader is added.
>> Drivers
>> should inspect the context in NotifyLinkShader and save what they need for
>> LinkShader in gl_shader_program.
>>
>> As a final note, most applications will not benefit from threaded shader
>> compilation because they check GL_COMPILE_STATUS/GL_LINK_STATUS
>> immediately,
>> giving the worker threads no time to do their jobs.  A possible
>> improvement is
>> to split LinkShader into two parts: the first part links and error checks
>> while the second part optimizes and generates the machine code.  With the
>> split, we can always defer the second part to the thread pool.
>>
>> Signed-off-by: Chia-I Wu 
>> ---
>>   src/mesa/main/context.c |  29 +++
>>   src/mesa/main/context.h |   3 ++
>>   src/mesa/main/dd.h  |   8 +++
>>   src/mesa/main/mtypes.h  |  34 
>>   src/mesa/main/pipelineobj.c |  18 +++
>>   src/mesa/main/shaderapi.c   | 122
>> +++-
>>   src/mesa/main/shaderobj.c   |  74 +--
>>   src/mesa/main/shaderobj.h   |  55 ++--
>>   8 files changed, 322 insertions(+), 21 deletions(-)
>>
>> diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
>> index b082159..e27450c 100644
>> --- a/src/mesa/main/context.c
>> +++ b/src/mesa/main/context.c
>> @@ -112,6 +112,7 @@
>>   #include "points.h"
>>   #include "polygon.h"
>>   #include "queryobj.h"
>> +#include "shaderapi.h"
>>   #include "syncobj.h"
>>   #include "rastpos.h"
>>   #include "remap.h"
>> @@ -139,6 +140,7 @@
>>   #endif
>>
>>   #include "glsl_parser_extras.h"
>> +#include "threadpool.h"
>>   #include 
>>
>>
>> @@ -1187,6 +1189,27 @@ _mesa_create_context(gl_api api,
>>  }
>>   }
>>
>> +void
>> +_mesa_enable_glsl_threadpool(struct gl_context *ctx, int max_threads)
>> +{
>> +   if (!ctx->ThreadPool)
>> +  ctx->ThreadPool = _mesa_glsl_get_threadpool(max_threads);
>> +}
>> +
>> +static void
>> +wait_shader_object_cb(GLuint id, void *data, void *userData)
>> +{
>> +   struct gl_context *ctx = (struct gl_context *) userData;
>> +   struct gl_shader *sh = (struct gl_shader *) data;
>> +
>> +   if (_mesa_validate_shader_target(ctx, sh->Type)) {
>> +  _mesa_wait_shaders(ctx, &sh, 1);
>> +   }
>> +   else {
>> +  struct gl_shader_program *shProg = (struct gl_shader_program *)
>> data;
>> +  _mesa_wait_shader_program(ctx, shProg);
>> +   }
>> +}
>>
>>   /**
>>* Free the data associated with the given context.
>> @@ -1205,6 +1228,12 @@ _mesa_free_context_data( struct gl_context *ctx )
>> _mesa_make_current(ctx, NULL, NULL);
>>  }
>>
>> +   if (ctx->ThreadPool) {
>> +  _mesa_HashWalk(ctx->Shared->ShaderObjects, wait_shader_object_cb,
>> ctx);
>> +  _mesa_threadpool_unref(ctx->ThreadPool);
>> +  ctx->ThreadPool = NULL;
>> +   }
>> +
>>  /* unreference WinSysDraw/Read buffers */
>>  _mesa_reference_framebuffer(&ctx->WinSysDrawBuffer, NULL);
>>  _mesa_reference_framebuffer(&ctx->WinSysReadBuffer, NULL);
>> diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h
>> index 792ab4c..b23f9fa 100644
>> --- a/src/mesa/main/context.h
>> +++ b/src/mesa/main/context.h
>> @@ -118,6 +118,9 @@ _mesa

Re: [Mesa-dev] RFC/intel: Separate batch buffers from dynamic state

2013-07-16 Thread Chia-I Wu

On Tue, May 7, 2013 at 4:24 AM, Eric Anholt  wrote:
> Paul Berry  writes:
>
>> Currently the i965 driver uses a single buffer object to hold both batch
>> buffer commands and dynamic state data structures (which are pointed to by
>> batch buffer commands).  We use a "stack and heap model", where the former
>> are allocated from the front end of the bo and the latter are allocated
>> from the back end.
>>
>> I'd like us to consider splitting dynamic state to its own separate buffer
>> object.  It seems to me that it would carry advantages both in performance
>> and code simplicity:
>>
>> - The dynamic state bo would need recycling much less frequently, because
>> (a) it could be made much larger than the batch buffer bo, and (b) a batch
>> buffer flush would not necessitate re-emitting dynamic state.  Since a lot
>> of our CPU time is spent emitting state, this could potentially improve
>> performance.
>>
>> - When we did need to recycle the dynamic state bo, we could do so without
>> needlessly flushing the batch buffer.
>>
>> - On systems that support hardware contexts, when we flush the batch
>> buffer, we wouldn't need to re-emit as many batch buffer commands (since
>> dynamic state wouldn't have moved relative to the dynamic state base
>> address), saving further CPU time.
>>
>> - On systems that support hardware contexts, we wouldn't need to use a
>> "space available" heuristic to ensure that enough batch buffer space was
>> available before starting to emit the state for a draw call.  Instead we
>> could just emit commands until the batch buffer is full, flush it, start
>> the next batch off with a STATE_BASE_ADDRESS command, and pick up where we
>> left off.  This would make the state emission logic less fragile (since we
>> wouldn't need to remember to update the heuristic when adding features to
>> the driver).
>>
>> What do you think?
>
> It would need some sort of plan for how to deal with making
> drm_intel_bufmgr_check_aperture work correctly and efficiently when a
> reloced-to BO gets new relocs, which is the reason everything is in one
> BO currently.
I was looking into the same thing today for GEN6 and later.  It seems
the only states that require relocs are SURFACE_STATEs, and they can
still live in the batch bo while the other dynamic states are moved to
a separate state bo.

>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>



--
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 1/4] ilo: implement new float comparison instructions

2013-08-14 Thread Chia-I Wu

On Wed, Aug 14, 2013 at 1:04 AM,   wrote:
> From: Roland Scheidegger 
>
> untested.
Looks good to me.
> ---
>  src/gallium/drivers/ilo/shader/toy_tgsi.c |   20 
>  1 file changed, 12 insertions(+), 8 deletions(-)
>
> diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c 
> b/src/gallium/drivers/ilo/shader/toy_tgsi.c
> index d5a3f2f..830aa57 100644
> --- a/src/gallium/drivers/ilo/shader/toy_tgsi.c
> +++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c
> @@ -209,15 +209,18 @@ aos_set_on_cond(struct toy_compiler *tc,
> case TGSI_OPCODE_SLT:
> case TGSI_OPCODE_ISLT:
> case TGSI_OPCODE_USLT:
> +   case TGSI_OPCODE_FSLT:
>cond = BRW_CONDITIONAL_L;
>break;
> case TGSI_OPCODE_SGE:
> case TGSI_OPCODE_ISGE:
> case TGSI_OPCODE_USGE:
> +   case TGSI_OPCODE_FSGE:
>cond = BRW_CONDITIONAL_GE;
>break;
> case TGSI_OPCODE_SEQ:
> case TGSI_OPCODE_USEQ:
> +   case TGSI_OPCODE_FSEQ:
>cond = BRW_CONDITIONAL_EQ;
>break;
> case TGSI_OPCODE_SGT:
> @@ -228,6 +231,7 @@ aos_set_on_cond(struct toy_compiler *tc,
>break;
> case TGSI_OPCODE_SNE:
> case TGSI_OPCODE_USNE:
> +   case TGSI_OPCODE_FSNE:
>cond = BRW_CONDITIONAL_NEQ;
>break;
> default:
> @@ -935,10 +939,10 @@ static const toy_tgsi_translate 
> aos_translate_table[TGSI_OPCODE_LAST] = {
> [105]  = aos_unsupported,
> [106]  = aos_unsupported,
> [TGSI_OPCODE_NOP]  = aos_simple,
> -   [108]  = aos_unsupported,
> -   [109]  = aos_unsupported,
> -   [110]  = aos_unsupported,
> -   [111]  = aos_unsupported,
> +   [TGSI_OPCODE_FSEQ] = aos_set_on_cond,
> +   [TGSI_OPCODE_FSGE] = aos_set_on_cond,
> +   [TGSI_OPCODE_FSLT] = aos_set_on_cond,
> +   [TGSI_OPCODE_FSNE] = aos_set_on_cond,
> [TGSI_OPCODE_NRM4] = aos_NRM4,
> [TGSI_OPCODE_CALLNZ]   = aos_unsupported,
> [TGSI_OPCODE_BREAKC]   = aos_unsupported,
> @@ -1551,10 +1555,10 @@ static const toy_tgsi_translate 
> soa_translate_table[TGSI_OPCODE_LAST] = {
> [105]  = soa_unsupported,
> [106]  = soa_unsupported,
> [TGSI_OPCODE_NOP]  = soa_passthrough,
> -   [108]  = soa_unsupported,
> -   [109]  = soa_unsupported,
> -   [110]  = soa_unsupported,
> -   [111]  = soa_unsupported,
> +   [TGSI_OPCODE_FSEQ] = soa_per_channel,
> +   [TGSI_OPCODE_FSGE] = soa_per_channel,
> +   [TGSI_OPCODE_FSLT] = soa_per_channel,
> +   [TGSI_OPCODE_FSNE] = soa_per_channel,
> [TGSI_OPCODE_NRM4] = soa_NRM4,
> [TGSI_OPCODE_CALLNZ]   = soa_unsupported,
> [TGSI_OPCODE_BREAKC]   = soa_unsupported,
> --
> 1.7.9.5
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] Removing egl_glx?

2013-08-14 Thread Chia-I Wu

On Thu, Aug 15, 2013 at 10:03 AM, Kenneth Graunke  wrote:
> On 08/08/2013 03:13 PM, Chad Versace wrote:
> [snip]
>>
>> By the way, I talked to krh today, and he suggested that we delete egl_glx
>> rather than allow it to bitrot.
>
>
> I'm in favor, but I don't know who uses that.
>
> GLX on EGL would be an interesting experiment.  EGL on GLX is not useful,
> IMHO.
That sounds good to me.

> --Ken
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] gallium/dri-targets: hide all symbols except for __driDriverExtensions

2013-08-14 Thread Chia-I Wu

On Sat, Aug 10, 2013 at 2:56 AM, Marek Olšák  wrote:
> Most importantly, this hides all LLVM symbols. They shouldn't clash
> with a different LLVM version used by apps (at least in theory).
>
> $ nm -g --defined-only radeonsi_dri.so
> 01148f30 D __driDriverExtensions
I am not familiar with issues regarding LLVM symbols so I am fine with
the change if this is what needs to be done (except maybe use
-export-symbols-regex __driDriverExtensions to avoid the version
script?)

But I ran the nm command on ilo_dri.so, and almost all of the exported
symbols are from libdricommon or st/dri.  I think those two components
need VISIBILITY_CFLAGS in their AM_CFLAGS and __driDriverExtensions
needs to be marked as PUBLIC.  This way other gallium targets can
benefit.

> We could do something similar for the other targets.
> ---
>  src/gallium/targets/dri-freedreno/Makefile.am | 5 -
>  src/gallium/targets/dri-i915/Makefile.am  | 5 -
>  src/gallium/targets/dri-ilo/Makefile.am   | 5 -
>  src/gallium/targets/dri-nouveau/Makefile.am   | 5 -
>  src/gallium/targets/dri-r300/Makefile.am  | 5 -
>  src/gallium/targets/dri-r600/Makefile.am  | 5 -
>  src/gallium/targets/dri-radeonsi/Makefile.am  | 5 -
>  src/gallium/targets/dri-swrast/Makefile.am| 5 -
>  src/gallium/targets/dri-vmwgfx/Makefile.am| 5 -
>  src/gallium/targets/dri.version   | 5 +
>  10 files changed, 41 insertions(+), 9 deletions(-)
>  create mode 100644 src/gallium/targets/dri.version
>
> diff --git a/src/gallium/targets/dri-freedreno/Makefile.am 
> b/src/gallium/targets/dri-freedreno/Makefile.am
> index cfa1f37..79aa3db 100644
> --- a/src/gallium/targets/dri-freedreno/Makefile.am
> +++ b/src/gallium/targets/dri-freedreno/Makefile.am
> @@ -46,7 +46,10 @@ kgsl_dri_la_SOURCES = \
> $(top_srcdir)/src/mesa/drivers/dri/common/dri_util.c \
> $(top_srcdir)/src/mesa/drivers/dri/common/xmlconfig.c
>
> -kgsl_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined
> +kgsl_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined \
> +   -Wl,--version-script=../dri.version
> +
> +kgsl_dri_la_DEPENDENCIES = ../dri.version
>
>  kgsl_dri_la_LIBADD = \
> $(top_builddir)/src/mesa/libmesagallium.la \
> diff --git a/src/gallium/targets/dri-i915/Makefile.am 
> b/src/gallium/targets/dri-i915/Makefile.am
> index ce6be78..f13a10b 100644
> --- a/src/gallium/targets/dri-i915/Makefile.am
> +++ b/src/gallium/targets/dri-i915/Makefile.am
> @@ -46,7 +46,10 @@ i915_dri_la_SOURCES = \
> $(top_srcdir)/src/mesa/drivers/dri/common/dri_util.c \
> $(top_srcdir)/src/mesa/drivers/dri/common/xmlconfig.c
>
> -i915_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined
> +i915_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined \
> +   -Wl,--version-script=../dri.version
> +
> +i915_dri_la_DEPENDENCIES = ../dri.version
>
>  i915_dri_la_LIBADD = \
> $(top_builddir)/src/mesa/libmesagallium.la \
> diff --git a/src/gallium/targets/dri-ilo/Makefile.am 
> b/src/gallium/targets/dri-ilo/Makefile.am
> index 7761f33..9864e5c 100644
> --- a/src/gallium/targets/dri-ilo/Makefile.am
> +++ b/src/gallium/targets/dri-ilo/Makefile.am
> @@ -47,7 +47,10 @@ ilo_dri_la_SOURCES = \
>
>  # need -rpath to create a noinst shared library
>  ilo_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined \
> --rpath $(abs_builddir)
> +-rpath $(abs_builddir) \
> +-Wl,--version-script=../dri.version
> +
> +ilo_dri_la_DEPENDENCIES = ../dri.version
>
>  ilo_dri_la_LIBADD = \
> $(top_builddir)/src/mesa/libmesagallium.la \
> diff --git a/src/gallium/targets/dri-nouveau/Makefile.am 
> b/src/gallium/targets/dri-nouveau/Makefile.am
> index 69ccf32..26ed682 100644
> --- a/src/gallium/targets/dri-nouveau/Makefile.am
> +++ b/src/gallium/targets/dri-nouveau/Makefile.am
> @@ -45,7 +45,10 @@ nouveau_dri_la_SOURCES = \
> $(top_srcdir)/src/mesa/drivers/dri/common/dri_util.c \
> $(top_srcdir)/src/mesa/drivers/dri/common/xmlconfig.c
>
> -nouveau_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined
> +nouveau_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined \
> +   -Wl,--version-script=../dri.version
> +
> +nouveau_dri_la_DEPENDENCIES = ../dri.version
>
>  nouveau_dri_la_LIBADD = \
> $(top_builddir)/src/mesa/libmesagallium.la \
> diff --git a/src/gallium/targets/dri-r300/Makefile.am 
> b/src/gallium/targets/dri-r300/Makefile.am
> index 8c0215d..956e0b5 100644
> --- a/src/gallium/targets/dri-r300/Makefile.am
> +++ b/src/gallium/targets/dri-r300/Makefile.am
> @@ -46,7 +46,10 @@ r300_dri_la_SOURCES = \
> $(top_srcdir)/src/mesa/drivers/dri/common/dri_util.c \
> $(top_srcdir)/src/mesa/drivers/dri/common/xmlconfig.c
>
> -r300_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined
> +r300_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined \
> +   -Wl,--

Re: [Mesa-dev] [PATCH] gallium/dri-targets: hide all symbols except for __driDriverExtensions

2013-08-14 Thread Chia-I Wu

On Thu, Aug 15, 2013 at 1:26 PM, Chia-I Wu  wrote:
> On Sat, Aug 10, 2013 at 2:56 AM, Marek Olšák  wrote:
>> Most importantly, this hides all LLVM symbols. They shouldn't clash
>> with a different LLVM version used by apps (at least in theory).
>>
>> $ nm -g --defined-only radeonsi_dri.so
>> 01148f30 D __driDriverExtensions
> I am not familiar with issues regarding LLVM symbols so I am fine with
> the change if this is what needs to be done (except maybe use
> -export-symbols-regex __driDriverExtensions to avoid the version
> script?)
>
> But I ran the nm command on ilo_dri.so, and almost all of the exported
> symbols are from libdricommon or st/dri.  I think those two components
> need VISIBILITY_CFLAGS in their AM_CFLAGS and __driDriverExtensions
> needs to be marked as PUBLIC.  This way other gallium targets can
> benefit.
There is no other gallium target that uses st/dri :)

Anyway, in addition to controlling exported symbols using symbol
files, I would still like to see VISIBILITY_CFLAGS added to st/dri and
the dri targets, which directly list source files from libdricommon in
their SOURCES.  Besides, it seems __driConfigOptions and
__dri2ConfigOptions are also marked PUBLIC.  Do they need to be
exported?


>> We could do something similar for the other targets.
>> ---
>>  src/gallium/targets/dri-freedreno/Makefile.am | 5 -
>>  src/gallium/targets/dri-i915/Makefile.am  | 5 -
>>  src/gallium/targets/dri-ilo/Makefile.am   | 5 -
>>  src/gallium/targets/dri-nouveau/Makefile.am   | 5 -
>>  src/gallium/targets/dri-r300/Makefile.am  | 5 -
>>  src/gallium/targets/dri-r600/Makefile.am  | 5 -
>>  src/gallium/targets/dri-radeonsi/Makefile.am  | 5 -
>>  src/gallium/targets/dri-swrast/Makefile.am| 5 -
>>  src/gallium/targets/dri-vmwgfx/Makefile.am| 5 -
>>  src/gallium/targets/dri.version   | 5 +
>>  10 files changed, 41 insertions(+), 9 deletions(-)
>>  create mode 100644 src/gallium/targets/dri.version
>>
>> diff --git a/src/gallium/targets/dri-freedreno/Makefile.am 
>> b/src/gallium/targets/dri-freedreno/Makefile.am
>> index cfa1f37..79aa3db 100644
>> --- a/src/gallium/targets/dri-freedreno/Makefile.am
>> +++ b/src/gallium/targets/dri-freedreno/Makefile.am
>> @@ -46,7 +46,10 @@ kgsl_dri_la_SOURCES = \
>> $(top_srcdir)/src/mesa/drivers/dri/common/dri_util.c \
>> $(top_srcdir)/src/mesa/drivers/dri/common/xmlconfig.c
>>
>> -kgsl_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined
>> +kgsl_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined \
>> +   -Wl,--version-script=../dri.version
>> +
>> +kgsl_dri_la_DEPENDENCIES = ../dri.version
>>
>>  kgsl_dri_la_LIBADD = \
>> $(top_builddir)/src/mesa/libmesagallium.la \
>> diff --git a/src/gallium/targets/dri-i915/Makefile.am 
>> b/src/gallium/targets/dri-i915/Makefile.am
>> index ce6be78..f13a10b 100644
>> --- a/src/gallium/targets/dri-i915/Makefile.am
>> +++ b/src/gallium/targets/dri-i915/Makefile.am
>> @@ -46,7 +46,10 @@ i915_dri_la_SOURCES = \
>> $(top_srcdir)/src/mesa/drivers/dri/common/dri_util.c \
>> $(top_srcdir)/src/mesa/drivers/dri/common/xmlconfig.c
>>
>> -i915_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined
>> +i915_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined \
>> +   -Wl,--version-script=../dri.version
>> +
>> +i915_dri_la_DEPENDENCIES = ../dri.version
>>
>>  i915_dri_la_LIBADD = \
>> $(top_builddir)/src/mesa/libmesagallium.la \
>> diff --git a/src/gallium/targets/dri-ilo/Makefile.am 
>> b/src/gallium/targets/dri-ilo/Makefile.am
>> index 7761f33..9864e5c 100644
>> --- a/src/gallium/targets/dri-ilo/Makefile.am
>> +++ b/src/gallium/targets/dri-ilo/Makefile.am
>> @@ -47,7 +47,10 @@ ilo_dri_la_SOURCES = \
>>
>>  # need -rpath to create a noinst shared library
>>  ilo_dri_la_LDFLAGS = -module -avoid-version -shared -no-undefined \
>> --rpath $(abs_builddir)
>> +-rpath $(abs_builddir) \
>> +-Wl,--version-script=../dri.version
>> +
>> +ilo_dri_la_DEPENDENCIES = ../dri.version
>>
>>  ilo_dri_la_LIBADD = \
>> $(top_builddir)/src/mesa/libmesagallium.la \
>> diff --git a/src/gallium/targets/dri-nouveau/Makefile.am 
>> b/src/gallium/targets/dri-nouveau/Makefile.am
>> index 69ccf32..26ed682 100644
>> --- a/src/gallium/targets/dri-nouveau/Makefile.am
>> +++ b/src/gallium/targets/dri-nouveau/Makefile.am
>> @@ -45,7 +45,10 @@ nouveau_dri_la_S

[Mesa-dev] [PATCH] glx: make the interval of LIBGL_SHOW_FPS adjustable

2013-08-27 Thread Chia-I Wu

LIBGL_SHOW_FPS=1 makes GLX print FPS every second while other values do
nothing.  Extend it so that LIBGL_SHOW_FPS=N will print the FPS every N
seconds.
---
 src/glx/dri2_glx.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/glx/dri2_glx.c b/src/glx/dri2_glx.c
index c54edac..54fc21c 100644
--- a/src/glx/dri2_glx.c
+++ b/src/glx/dri2_glx.c
@@ -95,7 +95,7 @@ struct dri2_screen {
void *driver;
int fd;
 
-   Bool show_fps;
+   int show_fps_interval;
 };
 
 struct dri2_context
@@ -764,6 +764,8 @@ unsigned dri2GetSwapEventType(Display* dpy, XID drawable)
 
 static void show_fps(struct dri2_drawable *draw)
 {
+   const int interval =
+  ((struct dri2_screen *) draw->base.psc)->show_fps_interval;
struct timeval tv;
uint64_t current_time;
 
@@ -772,7 +774,7 @@ static void show_fps(struct dri2_drawable *draw)
 
draw->frames++;
 
-   if (draw->previous_time + 100 <= current_time) {
+   if (draw->previous_time + interval * 100 <= current_time) {
   if (draw->previous_time) {
  fprintf(stderr, "libGL: FPS = %.1f\n",
  ((uint64_t)draw->frames * 100) /
@@ -859,7 +861,7 @@ dri2SwapBuffers(__GLXDRIdrawable *pdraw, int64_t 
target_msc, int64_t divisor,
 target_msc, divisor, remainder);
 }
 
-if (psc->show_fps) {
+if (psc->show_fps_interval) {
show_fps(priv);
 }
 
@@ -1283,7 +1285,9 @@ dri2CreateScreen(int screen, struct glx_display * priv)
free(deviceName);
 
tmp = getenv("LIBGL_SHOW_FPS");
-   psc->show_fps = tmp && strcmp(tmp, "1") == 0;
+   psc->show_fps_interval = (tmp) ? atoi(tmp) : 0;
+   if (psc->show_fps_interval < 0)
+  psc->show_fps_interval = 0;
 
return &psc->base;
 
-- 
1.8.4.rc3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-05 Thread Chia-I Wu

sample_d is slower than the lowered version on gen7.  For gen7, this improves
Xonotic benchmark with Ultimate effects by as much as 25%:

 before the change:  40.06 fps
 after the change:   51.10 fps
 after the change with INTEL_DEBUG=no16: 44.46 fps

As sample_d is not allowed in SIMD16 mode, I first thought the difference
was from SIMD8 versus SIMD16.  If that was the case, we would want to apply
brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.

But, as the numbers show, there is still 10% improvement when SIMD16 is forced
off after the change.  Thus textureGrad() is lowered unconditionally for now.
Because of this, and because I haven't tried it on Haswell, this is still an RFC.

No piglit regressions.

Signed-off-by: Chia-I Wu 
---
 .../dri/i965/brw_lower_texture_gradients.cpp   | 54 ++
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
index 1589a20..f3fcb56 100644
--- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
+++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
@@ -34,8 +34,8 @@ using namespace ir_builder;
 
 class lower_texture_grad_visitor : public ir_hierarchical_visitor {
 public:
-   lower_texture_grad_visitor(bool has_sample_d_c)
-  : has_sample_d_c(has_sample_d_c)
+   lower_texture_grad_visitor(bool has_sample_d, bool has_sample_d_c)
+  : has_sample_d(has_sample_d), has_sample_d_c(has_sample_d_c)
{
   progress = false;
}
@@ -44,6 +44,7 @@ public:
 
 
bool progress;
+   bool has_sample_d;
bool has_sample_d_c;
 
 private:
@@ -90,22 +91,33 @@ txs_type(const glsl_type *type)
 ir_visitor_status
 lower_texture_grad_visitor::visit_leave(ir_texture *ir)
 {
-   /* Only lower textureGrad with shadow samplers */
-   if (ir->op != ir_txd || !ir->shadow_comparitor)
+   if (ir->op != ir_txd)
   return visit_continue;
 
-   /* Lower textureGrad() with samplerCubeShadow even if we have the sample_d_c
-* message.  GLSL provides gradients for the 'r' coordinate.  Unfortunately:
-*
-* From the Ivybridge PRM, Volume 4, Part 1, sample_d message description:
-* "The r coordinate contains the faceid, and the r gradients are ignored
-*  by hardware."
-*
-* We likely need to do a similar treatment for samplerCube and
-* samplerCubeArray, but we have insufficient testing for that at the 
moment.
-*/
-   bool need_lowering = !has_sample_d_c ||
-  ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE;
+   bool need_lowering = false;
+
+   if (ir->shadow_comparitor) {
+  /* Lower textureGrad() with samplerCubeShadow even if we have the
+   * sample_d_c message.  GLSL provides gradients for the 'r' coordinate.
+   * Unfortunately:
+   *
+   * From the Ivybridge PRM, Volume 4, Part 1, sample_d message
+   * description: "The r coordinate contains the faceid, and the r
+   * gradients are ignored by hardware."
+   */
+  if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE)
+ need_lowering = true;
+  else if (!has_sample_d_c)
+ need_lowering = true;
+   }
+   else {
+  /* We likely need to do a similar treatment for samplerCube and
+   * samplerCubeArray, but we have insufficient testing for that at the
+   * moment.
+   */
+  if (!has_sample_d)
+ need_lowering = true;
+   }
 
if (!need_lowering)
   return visit_continue;
@@ -154,7 +166,9 @@ lower_texture_grad_visitor::visit_leave(ir_texture *ir)
   expr(ir_unop_sqrt, dot(dPdy, dPdy)));
}
 
-   /* lambda_base = log2(rho).  We're ignoring GL state biases for now. */
+   /* lambda_base = log2(rho).  It will be biased and clamped by values
+* defined in SAMPLER_STATE to get the final lambda.
+*/
ir->op = ir_txl;
ir->lod_info.lod = expr(ir_unop_log2, rho);
 
@@ -168,8 +182,12 @@ bool
 brw_lower_texture_gradients(struct brw_context *brw,
 struct exec_list *instructions)
 {
+   /* sample_d is slower than the lowered version on gen7, and is not allowed
+* in SIMD16 mode.  Treating it as unsupported improves the performance.
+*/
+   bool has_sample_d = brw->gen != 7;
bool has_sample_d_c = brw->gen >= 8 || brw->is_haswell;
-   lower_texture_grad_visitor v(has_sample_d_c);
+   lower_texture_grad_visitor v(has_sample_d, has_sample_d_c);
 
visit_list_elements(&v, instructions);
 
-- 
1.8.3.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-05 Thread Chia-I Wu

On Thu, Sep 5, 2013 at 5:12 PM, Chris Forbes  wrote:
> A possible explanation for the perf change is that Xonotic uses
> anisotropic filtering at this quality level. Lowering to txl defeats
> it.
I had a look at that.  gl_sampler->MaxAnisotropy is never greater than
1.0 in gen7_update_sampler_state() so there is no anisotropic
filtering in this case.

It makes sense to me that avoiding punting to SIMD8 helps the
performance.  But it is not clear to me why >10% performance change
can still be observed when INTEL_DEBUG=no16 is specified.  A
reasonable explanation is that the image quality is degraded in some
way, which is why I am still nervous about the change.

An alternative approach to avoid punting seems to be to emulate SIMD16
sample_d with two SIMD8 sample_d.  It will take longer to implement
given my familiarity with the code, and may be less performant.  But
that would allow things like anisotropic filtering to be honored.


> It would be worth doing an image quality comparison before and after the 
> change.
Yeah, that is worth doing.  I will do that.

>
> -- Chris
>
> On Thu, Sep 5, 2013 at 8:35 PM, Chia-I Wu  wrote:
>> sample_d is slower than the lowered version on gen7.  For gen7, this improves
>> Xonotic benchmark with Ultimate effects by as much as 25%:
>>
>>  before the change:  40.06 fps
>>  after the change:   51.10 fps
>>  after the change with INTEL_DEBUG=no16: 44.46 fps
>>
>> As sample_d is not allowed in SIMD16 mode, I firstly thought the difference
>> was from SIMD8 versus SIMD16.  If that was the case, we would want to apply
>> brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.
>>
>> But, as the numbers show, there is still 10% improvement when SIMD16 is 
>> forced
>> off after the change.  Thus textureGrad() is lowered unconditionally for now.
>> Due to this and that I haven't tried it on Haswell, this is still RFC.
>>
>> No piglit regressions.
>>
>> Signed-off-by: Chia-I Wu 
>> ---
>>  .../dri/i965/brw_lower_texture_gradients.cpp   | 54 
>> ++
>>  1 file changed, 36 insertions(+), 18 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
>> b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> index 1589a20..f3fcb56 100644
>> --- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> @@ -34,8 +34,8 @@ using namespace ir_builder;
>>
>>  class lower_texture_grad_visitor : public ir_hierarchical_visitor {
>>  public:
>> -   lower_texture_grad_visitor(bool has_sample_d_c)
>> -  : has_sample_d_c(has_sample_d_c)
>> +   lower_texture_grad_visitor(bool has_sample_d, bool has_sample_d_c)
>> +  : has_sample_d(has_sample_d), has_sample_d_c(has_sample_d_c)
>> {
>>progress = false;
>> }
>> @@ -44,6 +44,7 @@ public:
>>
>>
>> bool progress;
>> +   bool has_sample_d;
>> bool has_sample_d_c;
>>
>>  private:
>> @@ -90,22 +91,33 @@ txs_type(const glsl_type *type)
>>  ir_visitor_status
>>  lower_texture_grad_visitor::visit_leave(ir_texture *ir)
>>  {
>> -   /* Only lower textureGrad with shadow samplers */
>> -   if (ir->op != ir_txd || !ir->shadow_comparitor)
>> +   if (ir->op != ir_txd)
>>return visit_continue;
>>
>> -   /* Lower textureGrad() with samplerCubeShadow even if we have the 
>> sample_d_c
>> -* message.  GLSL provides gradients for the 'r' coordinate.  
>> Unfortunately:
>> -*
>> -* From the Ivybridge PRM, Volume 4, Part 1, sample_d message 
>> description:
>> -* "The r coordinate contains the faceid, and the r gradients are ignored
>> -*  by hardware."
>> -*
>> -* We likely need to do a similar treatment for samplerCube and
>> -* samplerCubeArray, but we have insufficient testing for that at the 
>> moment.
>> -*/
>> -   bool need_lowering = !has_sample_d_c ||
>> -  ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE;
>> +   bool need_lowering = false;
>> +
>> +   if (ir->shadow_comparitor) {
>> +  /* Lower textureGrad() with samplerCubeShadow even if we have the
>> +   * sample_d_c message.  GLSL provides gradients for the 'r' 
>> coordinate.
>> +   * Unfortunately:
>> +   *
>> +   * From the Ivybridge PRM, Volume 4, Part 1, sample_d message
>> +   * description: "The r coordinate contain

Re: [Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-06 Thread Chia-I Wu

On Thu, Sep 5, 2013 at 11:18 PM, Roland Scheidegger  wrote:
> Hmm I don't think the math works out here actually, which may explain
> why it's faster.
> I believe the derivatives need to be transformed to cube coord system
> and I don't see that being done here (this is actually something I
> haven't figured out the math yet how to do with reasonable effort for
> llvmpipe).
When the sampler is samplerCube?  From my reading of the GLSL spec, it seems
log2(2 * length-of-major-axis) needs to be subtracted from lambda.  There is
another pass in i965 that divides all coordinates by the length of the
major axis already.  If things are arranged carefully, all it takes
could be to subtract 1 from lambda when the target is a cube map.
> OTOH you could actually simplify the rho calculation a bit, since you
> could do the sqrt easily after the max hence only needing one instead of
> two sqrt (though if your hw has blazing fast sqrt it won't matter...).
Nice.
>
> Roland
>
>
> Am 05.09.2013 10:35, schrieb Chia-I Wu:
>> sample_d is slower than the lowered version on gen7.  For gen7, this improves
>> Xonotic benchmark with Ultimate effects by as much as 25%:
>>
>>  before the change:  40.06 fps
>>  after the change:   51.10 fps
>>  after the change with INTEL_DEBUG=no16: 44.46 fps
>>
>> As sample_d is not allowed in SIMD16 mode, I firstly thought the difference
>> was from SIMD8 versus SIMD16.  If that was the case, we would want to apply
>> brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.
>>
>> But, as the numbers show, there is still 10% improvement when SIMD16 is 
>> forced
>> off after the change.  Thus textureGrad() is lowered unconditionally for now.
>> Due to this and that I haven't tried it on Haswell, this is still RFC.
>>
>> No piglit regressions.
>>
>> Signed-off-by: Chia-I Wu 
>> ---
>>  .../dri/i965/brw_lower_texture_gradients.cpp   | 54 
>> ++
>>  1 file changed, 36 insertions(+), 18 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
>> b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> index 1589a20..f3fcb56 100644
>> --- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> @@ -34,8 +34,8 @@ using namespace ir_builder;
>>
>>  class lower_texture_grad_visitor : public ir_hierarchical_visitor {
>>  public:
>> -   lower_texture_grad_visitor(bool has_sample_d_c)
>> -  : has_sample_d_c(has_sample_d_c)
>> +   lower_texture_grad_visitor(bool has_sample_d, bool has_sample_d_c)
>> +  : has_sample_d(has_sample_d), has_sample_d_c(has_sample_d_c)
>> {
>>progress = false;
>> }
>> @@ -44,6 +44,7 @@ public:
>>
>>
>> bool progress;
>> +   bool has_sample_d;
>> bool has_sample_d_c;
>>
>>  private:
>> @@ -90,22 +91,33 @@ txs_type(const glsl_type *type)
>>  ir_visitor_status
>>  lower_texture_grad_visitor::visit_leave(ir_texture *ir)
>>  {
>> -   /* Only lower textureGrad with shadow samplers */
>> -   if (ir->op != ir_txd || !ir->shadow_comparitor)
>> +   if (ir->op != ir_txd)
>>return visit_continue;
>>
>> -   /* Lower textureGrad() with samplerCubeShadow even if we have the 
>> sample_d_c
>> -* message.  GLSL provides gradients for the 'r' coordinate.  
>> Unfortunately:
>> -*
>> -* From the Ivybridge PRM, Volume 4, Part 1, sample_d message 
>> description:
>> -* "The r coordinate contains the faceid, and the r gradients are ignored
>> -*  by hardware."
>> -*
>> -* We likely need to do a similar treatment for samplerCube and
>> -* samplerCubeArray, but we have insufficient testing for that at the 
>> moment.
>> -*/
>> -   bool need_lowering = !has_sample_d_c ||
>> -  ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE;
>> +   bool need_lowering = false;
>> +
>> +   if (ir->shadow_comparitor) {
>> +  /* Lower textureGrad() with samplerCubeShadow even if we have the
>> +   * sample_d_c message.  GLSL provides gradients for the 'r' 
>> coordinate.
>> +   * Unfortunately:
>> +   *
>> +   * From the Ivybridge PRM, Volume 4, Part 1, sample_d message
>> +   * description: "The r coordinate contains the faceid, and the r
>> +   * gradients are ignored by hardware.&

Re: [Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-06 Thread Chia-I Wu

On Thu, Sep 5, 2013 at 9:57 PM, Chia-I Wu  wrote:
> On Thu, Sep 5, 2013 at 5:12 PM, Chris Forbes  wrote:
>> A possible explanation for the perf change is that Xonotic uses
>> anisotropic filtering at this quality level. Lowering to txl defeats
>> it.
> I had a look at that.  gl_sampler->MaxAnisotropy is never greater than
> 1.0 in gen7_update_sampler_state() so there is no anisotropic
> filtering in this case.
>
> It makes sense to me that avoiding punting to SIMD8 helps the
> performance.  But it is not clear to me why >10% performance change
> can still be observed when INTEL_DEBUG=no16 is specified.  A
> reasonable explanation is that the image quality is degraded in some
> way, which is why I am still nervous about the change.
With INTEL_DEBUG=no16 set, the same trick hurts the performance on
Haswell by about 5%.  That is, sample_d on Haswell is faster than the
one emulated with sample_l.

But since the trick makes SIMD16 possible, it gains 5% more fps when
INTEL_DEBUG=no16 is not set.

> An alternative approach to avoid punting seems to emulate SIMD16
> sample_d with two SIMD8 sample_d.  It will take longer to implement
> given my familiarity with the code, and may be less performant.  But
> that would allow things like anisotropic filtering to be honored.
>
>
>> It would be worth doing an image quality comparison before and after the 
>> change.
> Yeah, that is worth doing.  I will do that.
>
>>
>> -- Chris
>>
>> On Thu, Sep 5, 2013 at 8:35 PM, Chia-I Wu  wrote:
>>> sample_d is slower than the lowered version on gen7.  For gen7, this 
>>> improves
>>> Xonotic benchmark with Ultimate effects by as much as 25%:
>>>
>>>  before the change:  40.06 fps
>>>  after the change:   51.10 fps
>>>  after the change with INTEL_DEBUG=no16: 44.46 fps
>>>
>>> As sample_d is not allowed in SIMD16 mode, I firstly thought the difference
>>> was from SIMD8 versus SIMD16.  If that was the case, we would want to apply
>>> brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.
>>>
>>> But, as the numbers show, there is still 10% improvement when SIMD16 is 
>>> forced
>>> off after the change.  Thus textureGrad() is lowered unconditionally for 
>>> now.
>>> Due to this and that I haven't tried it on Haswell, this is still RFC.
>>>
>>> No piglit regressions.
>>>
>>> Signed-off-by: Chia-I Wu 
>>> ---
>>>  .../dri/i965/brw_lower_texture_gradients.cpp   | 54 
>>> ++
>>>  1 file changed, 36 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
>>> b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>> index 1589a20..f3fcb56 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>> @@ -34,8 +34,8 @@ using namespace ir_builder;
>>>
>>>  class lower_texture_grad_visitor : public ir_hierarchical_visitor {
>>>  public:
>>> -   lower_texture_grad_visitor(bool has_sample_d_c)
>>> -  : has_sample_d_c(has_sample_d_c)
>>> +   lower_texture_grad_visitor(bool has_sample_d, bool has_sample_d_c)
>>> +  : has_sample_d(has_sample_d), has_sample_d_c(has_sample_d_c)
>>> {
>>>progress = false;
>>> }
>>> @@ -44,6 +44,7 @@ public:
>>>
>>>
>>> bool progress;
>>> +   bool has_sample_d;
>>> bool has_sample_d_c;
>>>
>>>  private:
>>> @@ -90,22 +91,33 @@ txs_type(const glsl_type *type)
>>>  ir_visitor_status
>>>  lower_texture_grad_visitor::visit_leave(ir_texture *ir)
>>>  {
>>> -   /* Only lower textureGrad with shadow samplers */
>>> -   if (ir->op != ir_txd || !ir->shadow_comparitor)
>>> +   if (ir->op != ir_txd)
>>>return visit_continue;
>>>
>>> -   /* Lower textureGrad() with samplerCubeShadow even if we have the 
>>> sample_d_c
>>> -* message.  GLSL provides gradients for the 'r' coordinate.  
>>> Unfortunately:
>>> -*
>>> -* From the Ivybridge PRM, Volume 4, Part 1, sample_d message 
>>> description:
>>> -* "The r coordinate contains the faceid, and the r gradients are 
>>> ignored
>>> -*  by hardware."
>>> -*
>>> -* We likely need to do a similar treatment for samplerCube and

Re: [Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-09 Thread Chia-I Wu

On Tue, Sep 10, 2013 at 3:48 AM, Ian Romanick  wrote:
> On 09/05/2013 08:57 AM, Chia-I Wu wrote:
>> On Thu, Sep 5, 2013 at 5:12 PM, Chris Forbes  wrote:
>>> A possible explanation for the perf change is that Xonotic uses
>>> anisotropic filtering at this quality level. Lowering to txl defeats
>>> it.
>> I had a look at that.  gl_sampler->MaxAnisotropy is never greater than
>> 1.0 in gen7_update_sampler_state() so there is no anisotropic
>> filtering in this case.
>>
>> It makes sense to me that avoiding punting to SIMD8 helps the
>> performance.  But it is not clear to me why >10% performance change
>> can still be observed when INTEL_DEBUG=no16 is specified.  A
>> reasonable explanation is that the image quality is degraded in some
>> way, which is why I am still nervous about the change.
>>
>> An alternative approach to avoid punting seems to emulate SIMD16
>> sample_d with two SIMD8 sample_d.  It will take longer to implement
>> given my familiarity with the code, and may be less performant.  But
>> that would allow things like anisotropic filtering to be honored.
>>
>>
>>> It would be worth doing an image quality comparison before and after the 
>>> change.
>> Yeah, that is worth doing.  I will do that.
>
> Any results?  Still waiting...
There is no difference in image quality as far as I can tell.  Here
are the screenshots of every 100 frames before and after the change

  https://www.dropbox.com/s/mdqh0e42sf0xfro/compare-textureGrad-lowering.tar.gz
(297MB)

They are taken with the game's built-in mechanism and effects such as
bullets or explosions are off a bit between runs.

>>> -- Chris
>>>
>>> On Thu, Sep 5, 2013 at 8:35 PM, Chia-I Wu  wrote:
>>>> sample_d is slower than the lowered version on gen7.  For gen7, this 
>>>> improves
>>>> Xonotic benchmark with Ultimate effects by as much as 25%:
>>>>
>>>>  before the change:  40.06 fps
>>>>  after the change:   51.10 fps
>>>>  after the change with INTEL_DEBUG=no16: 44.46 fps
>>>>
>>>> As sample_d is not allowed in SIMD16 mode, I firstly thought the difference
>>>> was from SIMD8 versus SIMD16.  If that was the case, we would want to apply
>>>> brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.
>>>>
>>>> But, as the numbers show, there is still 10% improvement when SIMD16 is 
>>>> forced
>>>> off after the change.  Thus textureGrad() is lowered unconditionally for 
>>>> now.
>>>> Due to this and that I haven't tried it on Haswell, this is still RFC.
>>>>
>>>> No piglit regressions.
>>>>
>>>> Signed-off-by: Chia-I Wu 
>>>> ---
>>>>  .../dri/i965/brw_lower_texture_gradients.cpp   | 54 
>>>> ++
>>>>  1 file changed, 36 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
>>>> b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>>> index 1589a20..f3fcb56 100644
>>>> --- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>>> +++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>>> @@ -34,8 +34,8 @@ using namespace ir_builder;
>>>>
>>>>  class lower_texture_grad_visitor : public ir_hierarchical_visitor {
>>>>  public:
>>>> -   lower_texture_grad_visitor(bool has_sample_d_c)
>>>> -  : has_sample_d_c(has_sample_d_c)
>>>> +   lower_texture_grad_visitor(bool has_sample_d, bool has_sample_d_c)
>>>> +  : has_sample_d(has_sample_d), has_sample_d_c(has_sample_d_c)
>>>> {
>>>>progress = false;
>>>> }
>>>> @@ -44,6 +44,7 @@ public:
>>>>
>>>>
>>>> bool progress;
>>>> +   bool has_sample_d;
>>>> bool has_sample_d_c;
>>>>
>>>>  private:
>>>> @@ -90,22 +91,33 @@ txs_type(const glsl_type *type)
>>>>  ir_visitor_status
>>>>  lower_texture_grad_visitor::visit_leave(ir_texture *ir)
>>>>  {
>>>> -   /* Only lower textureGrad with shadow samplers */
>>>> -   if (ir->op != ir_txd || !ir->shadow_comparitor)
>>>> +   if (ir->op != ir_txd)
>>>>return visit_continue;
>>>>
>>>> -   /* Lower textureGrad() with samplerCubeShadow even if we

Re: [Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-09 Thread Chia-I Wu

On Tue, Sep 10, 2013 at 4:01 AM, Ian Romanick  wrote:
> On 09/06/2013 05:05 AM, Chia-I Wu wrote:
>> On Thu, Sep 5, 2013 at 9:57 PM, Chia-I Wu  wrote:
>>> On Thu, Sep 5, 2013 at 5:12 PM, Chris Forbes  wrote:
>>>> A possible explanation for the perf change is that Xonotic uses
>>>> anisotropic filtering at this quality level. Lowering to txl defeats
>>>> it.
>>> I had a look at that.  gl_sampler->MaxAnisotropy is never greater than
>>> 1.0 in gen7_update_sampler_state() so there is no anisotropic
>>> filtering in this case.
>>>
>>> It makes sense to me that avoiding punting to SIMD8 helps the
>>> performance.  But it is not clear to me why >10% performance change
>>> can still be observed when INTEL_DEBUG=no16 is specified.  A
>>> reasonable explanation is that the image quality is degraded in some
>>> way, which is why I am still nervous about the change.
>> With INTEL_DEBUG=no16 set, the same trick hurts the performance on
>> Haswell by about 5%.  That is, sample_d on Haswell is faster than the
>> one emulated with sample_l.
>
> What is the delta if sample_d is used for just SIMD8 shaders on HSW?
> Even when the shader can go SIMD16, some fragments will use the SIMD8 path.
brw_lower_texture_gradients applies on the IR so it is hard to
selectively apply it only for SIMD16 fs.  I will see if I can work
something out here to get the numbers you need.


>> But since the trick makes SIMD16 possible, it gains 5% more fps when
>> INTEL_DEBUG=no16 is not set.
>>
>>> An alternative approach to avoid punting seems to emulate SIMD16
>>> sample_d with two SIMD8 sample_d.  It will take longer to implement
>>> given my familiarity with the code, and may be less performant.  But
>>> that would allow things like anisotropic filtering to be honored.
>>>
>>>
>>>> It would be worth doing an image quality comparison before and after the 
>>>> change.
>>> Yeah, that is worth doing.  I will do that.
>>>
>>>>
>>>> -- Chris
>>>>
>>>> On Thu, Sep 5, 2013 at 8:35 PM, Chia-I Wu  wrote:
>>>>> sample_d is slower than the lowered version on gen7.  For gen7, this 
>>>>> improves
>>>>> Xonotic benchmark with Ultimate effects by as much as 25%:
>>>>>
>>>>>  before the change:  40.06 fps
>>>>>  after the change:   51.10 fps
>>>>>  after the change with INTEL_DEBUG=no16: 44.46 fps
>>>>>
>>>>> As sample_d is not allowed in SIMD16 mode, I firstly thought the 
>>>>> difference
>>>>> was from SIMD8 versus SIMD16.  If that was the case, we would want to 
>>>>> apply
>>>>> brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.
>>>>>
>>>>> But, as the numbers show, there is still 10% improvement when SIMD16 is 
>>>>> forced
>>>>> off after the change.  Thus textureGrad() is lowered unconditionally for 
>>>>> now.
>>>>> Due to this and that I haven't tried it on Haswell, this is still RFC.
>>>>>
>>>>> No piglit regressions.
>>>>>
>>>>> Signed-off-by: Chia-I Wu 
>>>>> ---
>>>>>  .../dri/i965/brw_lower_texture_gradients.cpp   | 54 
>>>>> ++
>>>>>  1 file changed, 36 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
>>>>> b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>>>> index 1589a20..f3fcb56 100644
>>>>> --- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>>>> +++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>>>> @@ -34,8 +34,8 @@ using namespace ir_builder;
>>>>>
>>>>>  class lower_texture_grad_visitor : public ir_hierarchical_visitor {
>>>>>  public:
>>>>> -   lower_texture_grad_visitor(bool has_sample_d_c)
>>>>> -  : has_sample_d_c(has_sample_d_c)
>>>>> +   lower_texture_grad_visitor(bool has_sample_d, bool has_sample_d_c)
>>>>> +  : has_sample_d(has_sample_d), has_sample_d_c(has_sample_d_c)
>>>>> {
>>>>>progress = false;
>>>>> }
>>>>> @@ -44,6 +44,7 @@ public:
>>>>>
>>>>>
>>>>> bool progre

Re: [Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-09 Thread Chia-I Wu

On Tue, Sep 10, 2013 at 4:05 AM, Ian Romanick  wrote:
> On 09/05/2013 03:35 AM, Chia-I Wu wrote:
>> sample_d is slower than the lowered version on gen7.  For gen7, this improves
>> Xonotic benchmark with Ultimate effects by as much as 25%:
>>
>>  before the change:  40.06 fps
>>  after the change:   51.10 fps
>>  after the change with INTEL_DEBUG=no16: 44.46 fps
>>
>> As sample_d is not allowed in SIMD16 mode, I firstly thought the difference
>> was from SIMD8 versus SIMD16.  If that was the case, we would want to apply
>> brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.
>>
>> But, as the numbers show, there is still 10% improvement when SIMD16 is 
>> forced
>> off after the change.  Thus textureGrad() is lowered unconditionally for now.
>> Due to this and that I haven't tried it on Haswell, this is still RFC.
>
> A lot of this code depends on the texture targets being used.  What
> texture targets is Xonotic using with textureGrad?
Only sampler2D.
>
>> No piglit regressions.
>>
>> Signed-off-by: Chia-I Wu 
>> ---
>>  .../dri/i965/brw_lower_texture_gradients.cpp   | 54 
>> ++
>>  1 file changed, 36 insertions(+), 18 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
>> b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> index 1589a20..f3fcb56 100644
>> --- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>> @@ -34,8 +34,8 @@ using namespace ir_builder;
>>
>>  class lower_texture_grad_visitor : public ir_hierarchical_visitor {
>>  public:
>> -   lower_texture_grad_visitor(bool has_sample_d_c)
>> -  : has_sample_d_c(has_sample_d_c)
>> +   lower_texture_grad_visitor(bool has_sample_d, bool has_sample_d_c)
>> +  : has_sample_d(has_sample_d), has_sample_d_c(has_sample_d_c)
>> {
>>progress = false;
>> }
>> @@ -44,6 +44,7 @@ public:
>>
>>
>> bool progress;
>> +   bool has_sample_d;
>> bool has_sample_d_c;
>>
>>  private:
>> @@ -90,22 +91,33 @@ txs_type(const glsl_type *type)
>>  ir_visitor_status
>>  lower_texture_grad_visitor::visit_leave(ir_texture *ir)
>>  {
>> -   /* Only lower textureGrad with shadow samplers */
>> -   if (ir->op != ir_txd || !ir->shadow_comparitor)
>> +   if (ir->op != ir_txd)
>>return visit_continue;
>>
>> -   /* Lower textureGrad() with samplerCubeShadow even if we have the 
>> sample_d_c
>> -* message.  GLSL provides gradients for the 'r' coordinate.  
>> Unfortunately:
>> -*
>> -* From the Ivybridge PRM, Volume 4, Part 1, sample_d message 
>> description:
>> -* "The r coordinate contains the faceid, and the r gradients are ignored
>> -*  by hardware."
>> -*
>> -* We likely need to do a similar treatment for samplerCube and
>> -* samplerCubeArray, but we have insufficient testing for that at the 
>> moment.
>> -*/
>> -   bool need_lowering = !has_sample_d_c ||
>> -  ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE;
>> +   bool need_lowering = false;
>> +
>> +   if (ir->shadow_comparitor) {
>> +  /* Lower textureGrad() with samplerCubeShadow even if we have the
>> +   * sample_d_c message.  GLSL provides gradients for the 'r' 
>> coordinate.
>> +   * Unfortunately:
>> +   *
>> +   * From the Ivybridge PRM, Volume 4, Part 1, sample_d message
>> +   * description: "The r coordinate contains the faceid, and the r
>> +   * gradients are ignored by hardware."
>> +   */
>> +  if (ir->sampler->type->sampler_dimensionality == 
>> GLSL_SAMPLER_DIM_CUBE)
>> + need_lowering = true;
>> +  else if (!has_sample_d_c)
>> + need_lowering = true;
>
> This should look like the old code:
>
> need_lowering = !has_sample_d_c ||
>ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE;
Sure.  I moved it so that it is clear the comments are for the first if-block.
>> +   }
>> +   else {
>> +  /* We likely need to do a similar treatment for samplerCube and
>> +   * samplerCubeArray, but we have insufficient testing for that at the
>> +   * moment.
>> +   */
>> +  if (!has_sample_d)
>> +

Re: [Mesa-dev] [PATCH 01/21] ilo: Fix out-of-tree build.

2013-09-11 Thread Chia-I Wu

On Thu, Sep 12, 2013 at 6:32 AM, Johannes Obermayr
 wrote:
> ---
>  src/gallium/drivers/ilo/Makefile.am | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/ilo/Makefile.am 
> b/src/gallium/drivers/ilo/Makefile.am
> index 10b3da3..33f2045 100644
> --- a/src/gallium/drivers/ilo/Makefile.am
> +++ b/src/gallium/drivers/ilo/Makefile.am
> @@ -27,7 +27,7 @@ include $(top_srcdir)/src/gallium/Automake.inc
>  noinst_LTLIBRARIES = libilo.la
>
>  AM_CPPFLAGS = \
> -   -Iinclude \
> +   -I$(top_srcdir)/src/gallium/drivers/ilo/include \
Is -I$(srcdir)/include better?  I am not familiar with automake enough
to know which is preferred.  Either way, the patch looks good to me.

> -I$(top_srcdir)/src/gallium/winsys/intel \
> $(GALLIUM_CFLAGS)
>
> --
> 1.8.1.4
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965/hsw: approximate DDX with a uniform value across a subspan

2013-09-11 Thread Chia-I Wu

From: Chia-I Wu 

Replicate the gradient of the top-left pixel to the other three pixels in the
subspan, which is how DDY is implemented.  Before, different gradients were
used for pixels in the top row and pixels in the bottom row.

This change results in a less accurate approximation.  However, it improves
the performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at
95.0% confidence) on Haswell.  No noticeable image quality difference was
observed.

No piglit gpu.tests regressions.

I failed to come up with an explanation for the performance difference.  The
change does not make a difference on Ivy Bridge either.  If anyone has the
insight, please kindly enlighten me.  Performance differences may also be
observed on other games that call textureGrad and dFdx.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index bfb3d33..c0d24a0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -564,16 +564,25 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
 void
 fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg 
src)
 {
+   /* approximate with ((ss0.tr - ss0.tl)x4 (ss1.tr - ss1.tl)x4) on Haswell,
+* which gives much better performance when the result is used with
+* sample_d
+*/
+   unsigned vstride = (brw->is_haswell) ? BRW_VERTICAL_STRIDE_4 :
+  BRW_VERTICAL_STRIDE_2;
+   unsigned width = (brw->is_haswell) ? BRW_WIDTH_4 :
+BRW_WIDTH_2;
+
struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 BRW_REGISTER_TYPE_F,
-BRW_VERTICAL_STRIDE_2,
-BRW_WIDTH_2,
+vstride,
+width,
 BRW_HORIZONTAL_STRIDE_0,
 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 BRW_REGISTER_TYPE_F,
-BRW_VERTICAL_STRIDE_2,
-BRW_WIDTH_2,
+vstride,
+width,
 BRW_HORIZONTAL_STRIDE_0,
 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
brw_ADD(p, dst, src0, negate(src1));
-- 
1.8.3.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-11 Thread Chia-I Wu

On Tue, Sep 10, 2013 at 2:01 PM, Chia-I Wu  wrote:
> On Tue, Sep 10, 2013 at 4:05 AM, Ian Romanick  wrote:
>> On 09/05/2013 03:35 AM, Chia-I Wu wrote:
>>> sample_d is slower than the lowered version on gen7.  For gen7, this 
>>> improves
>>> Xonotic benchmark with Ultimate effects by as much as 25%:
>>>
>>>  before the change:  40.06 fps
>>>  after the change:   51.10 fps
>>>  after the change with INTEL_DEBUG=no16: 44.46 fps
>>>
>>> As sample_d is not allowed in SIMD16 mode, I firstly thought the difference
>>> was from SIMD8 versus SIMD16.  If that was the case, we would want to apply
>>> brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.
>>>
>>> But, as the numbers show, there is still 10% improvement when SIMD16 is 
>>> forced
>>> off after the change.  Thus textureGrad() is lowered unconditionally for 
>>> now.
>>> Due to this and that I haven't tried it on Haswell, this is still RFC.
>>
>> A lot of this code depends on the texture targets being used.  What
>> texture targets is Xonotic using with textureGrad?
> Only sampler2D.
>>
>>> No piglit regressions.
>>>
>>> Signed-off-by: Chia-I Wu 
>>> ---
>>>  .../dri/i965/brw_lower_texture_gradients.cpp   | 54 
>>> ++
>>>  1 file changed, 36 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
>>> b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>> index 1589a20..f3fcb56 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>> @@ -34,8 +34,8 @@ using namespace ir_builder;
>>>
>>>  class lower_texture_grad_visitor : public ir_hierarchical_visitor {
>>>  public:
>>> -   lower_texture_grad_visitor(bool has_sample_d_c)
>>> -  : has_sample_d_c(has_sample_d_c)
>>> +   lower_texture_grad_visitor(bool has_sample_d, bool has_sample_d_c)
>>> +  : has_sample_d(has_sample_d), has_sample_d_c(has_sample_d_c)
>>> {
>>>progress = false;
>>> }
>>> @@ -44,6 +44,7 @@ public:
>>>
>>>
>>> bool progress;
>>> +   bool has_sample_d;
>>> bool has_sample_d_c;
>>>
>>>  private:
>>> @@ -90,22 +91,33 @@ txs_type(const glsl_type *type)
>>>  ir_visitor_status
>>>  lower_texture_grad_visitor::visit_leave(ir_texture *ir)
>>>  {
>>> -   /* Only lower textureGrad with shadow samplers */
>>> -   if (ir->op != ir_txd || !ir->shadow_comparitor)
>>> +   if (ir->op != ir_txd)
>>>return visit_continue;
>>>
>>> -   /* Lower textureGrad() with samplerCubeShadow even if we have the 
>>> sample_d_c
>>> -* message.  GLSL provides gradients for the 'r' coordinate.  
>>> Unfortunately:
>>> -*
>>> -* From the Ivybridge PRM, Volume 4, Part 1, sample_d message 
>>> description:
>>> -* "The r coordinate contains the faceid, and the r gradients are 
>>> ignored
>>> -*  by hardware."
>>> -*
>>> -* We likely need to do a similar treatment for samplerCube and
>>> -* samplerCubeArray, but we have insufficient testing for that at the 
>>> moment.
>>> -*/
>>> -   bool need_lowering = !has_sample_d_c ||
>>> -  ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE;
>>> +   bool need_lowering = false;
>>> +
>>> +   if (ir->shadow_comparitor) {
>>> +  /* Lower textureGrad() with samplerCubeShadow even if we have the
>>> +   * sample_d_c message.  GLSL provides gradients for the 'r' 
>>> coordinate.
>>> +   * Unfortunately:
>>> +   *
>>> +   * From the Ivybridge PRM, Volume 4, Part 1, sample_d message
>>> +   * description: "The r coordinate contains the faceid, and the r
>>> +   * gradients are ignored by hardware."
>>> +   */
>>> +  if (ir->sampler->type->sampler_dimensionality == 
>>> GLSL_SAMPLER_DIM_CUBE)
>>> + need_lowering = true;
>>> +  else if (!has_sample_d_c)
>>> + need_lowering = true;
>>
>> This should look like the old code:
>>
>> need_lowering = !has_sample

Re: [Mesa-dev] [PATCH] i965/gen7: always lower textureGrad() on gen7

2013-09-11 Thread Chia-I Wu

On Tue, Sep 10, 2013 at 1:37 PM, Chia-I Wu  wrote:
> On Tue, Sep 10, 2013 at 4:01 AM, Ian Romanick  wrote:
>> On 09/06/2013 05:05 AM, Chia-I Wu wrote:
>>> On Thu, Sep 5, 2013 at 9:57 PM, Chia-I Wu  wrote:
>>>> On Thu, Sep 5, 2013 at 5:12 PM, Chris Forbes  wrote:
>>>>> A possible explanation for the perf change is that Xonotic uses
>>>>> anisotropic filtering at this quality level. Lowering to txl defeats
>>>>> it.
>>>> I had a look at that.  gl_sampler->MaxAnisotropy is never greater than
>>>> 1.0 in gen7_update_sampler_state() so there is no anisotropic
>>>> filtering in this case.
>>>>
>>>> It makes sense to me that avoiding punting to SIMD8 helps the
>>>> performance.  But it is not clear to me why >10% performance change
>>>> can still be observed when INTEL_DEBUG=no16 is specified.  A
>>>> reasonable explanation is that the image quality is degraded in some
>>>> way, which is why I am still nervous about the change.
>>> With INTEL_DEBUG=no16 set, the same trick hurts the performance on
>>> Haswell by about 5%.  That is, sample_d on Haswell is faster than the
>>> one emulated with sample_l.
>>
>> What is the delta if sample_d is used for just SIMD8 shaders on HSW?
>> Even when the shader can go SIMD16, some fragments will use the SIMD8 path.
> brw_lower_texture_gradients applies on the IR so it is hard to
> selectively apply it only for SIMD16 fs.  I will see if I can work
> something out here to get the numbers you need.
I could clone the original IR list, run all but
brw_lower_texture_gradients passes on it, and use the cloned list to
generate SIMD8 code.  This is to get the numbers, not for the final
code.

But I sent another patch that should speed up sample_d.  With it, we
do not want to lower sample_d to sample_l at all.  I will see how the
patch goes first.

>
>
>>> But since the trick makes SIMD16 possible, it gains 5% more fps when
>>> INTEL_DEBUG=no16 is not set.
>>>
>>>> An alternative approach to avoid punting seems to emulate SIMD16
>>>> sample_d with two SIMD8 sample_d.  It will take longer to implement
>>>> given my familiarity with the code, and may be less performant.  But
>>>> that would allow things like anisotropic filtering to be honored.
And we will need to do this to enable SIMD16.
>>>>
>>>>
>>>>> It would be worth doing an image quality comparison before and after the 
>>>>> change.
>>>> Yeah, that is worth doing.  I will do that.
>>>>
>>>>>
>>>>> -- Chris
>>>>>
>>>>> On Thu, Sep 5, 2013 at 8:35 PM, Chia-I Wu  wrote:
>>>>>> sample_d is slower than the lowered version on gen7.  For gen7, this 
>>>>>> improves
>>>>>> Xonotic benchmark with Ultimate effects by as much as 25%:
>>>>>>
>>>>>>  before the change:  40.06 fps
>>>>>>  after the change:   51.10 fps
>>>>>>  after the change with INTEL_DEBUG=no16: 44.46 fps
>>>>>>
>>>>>> As sample_d is not allowed in SIMD16 mode, I firstly thought the 
>>>>>> difference
>>>>>> was from SIMD8 versus SIMD16.  If that was the case, we would want to 
>>>>>> apply
>>>>>> brw_lower_texture_gradients() only on fragment shaders in SIMD16 mode.
>>>>>>
>>>>>> But, as the numbers show, there is still 10% improvement when SIMD16 is 
>>>>>> forced
>>>>>> off after the change.  Thus textureGrad() is lowered unconditionally for 
>>>>>> now.
>>>>>> Due to this and that I haven't tried it on Haswell, this is still RFC.
>>>>>>
>>>>>> No piglit regressions.
>>>>>>
>>>>>> Signed-off-by: Chia-I Wu 
>>>>>> ---
>>>>>>  .../dri/i965/brw_lower_texture_gradients.cpp   | 54 
>>>>>> ++
>>>>>>  1 file changed, 36 insertions(+), 18 deletions(-)
>>>>>>
>>>>>> diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp 
>>>>>> b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>>>>> index 1589a20..f3fcb56 100644
>>>>>> --- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
>>>>>> +++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients

Re: [Mesa-dev] [PATCH] i965/hsw: approximate DDX with a uniform value across a subspan

2013-09-12 Thread Chia-I Wu

On Thu, Sep 12, 2013 at 2:06 PM, Chris Forbes  wrote:
> Can we make this approximation conditional on an image-quality control
> in driconf [or somewhere else]?
Sure.  What would be the default behavior?

> On Thu, Sep 12, 2013 at 5:00 PM, Chia-I Wu  wrote:
>> From: Chia-I Wu 
>>
>> Replicate the gradient of the top-left pixel to the other three pixels in the
>> subspan, as how DDY is implemented.  Before, different gradients were used 
>> for
>> pixels in the top row and pixels in the bottom row.
>>
>> This change results in a less accurate approximation.  However, it improves
>> the performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at
>> 95.0% confidence) on Haswell.  No noticeable image quality difference
>> observed.
>>
>> No piglit gpu.tests regressions.
>>
>> I failed to come up with an explanation for the performance difference.  The
>> change does not make a difference on Ivy Bridge either.  If anyone has the
>> insight, please kindly enlighten me.  Performance differences may also be
>> observed on other games that call textureGrad and dFdx.
>>
>> Signed-off-by: Chia-I Wu 
>> ---
>>  src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 17 +
>>  1 file changed, 13 insertions(+), 4 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>> index bfb3d33..c0d24a0 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>> @@ -564,16 +564,25 @@ fs_generator::generate_tex(fs_inst *inst, struct 
>> brw_reg dst, struct brw_reg src
>>  void
>>  fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct 
>> brw_reg src)
>>  {
>> +   /* approximate with ((ss0.tr - ss0.tl)x4 (ss1.tr - ss1.tl)x4) on Haswell,
>> +* which gives much better performance when the result is used with
>> +* sample_d
>> +*/
>> +   unsigned vstride = (brw->is_haswell) ? BRW_VERTICAL_STRIDE_4 :
>> +  BRW_VERTICAL_STRIDE_2;
>> +   unsigned width = (brw->is_haswell) ? BRW_WIDTH_4 :
>> +BRW_WIDTH_2;
>> +
>> struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
>>  BRW_REGISTER_TYPE_F,
>> -BRW_VERTICAL_STRIDE_2,
>> -BRW_WIDTH_2,
>> +vstride,
>> +width,
>>  BRW_HORIZONTAL_STRIDE_0,
>>  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>> struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
>>  BRW_REGISTER_TYPE_F,
>> -BRW_VERTICAL_STRIDE_2,
>> -BRW_WIDTH_2,
>> +vstride,
>> +width,
>>  BRW_HORIZONTAL_STRIDE_0,
>>  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>> brw_ADD(p, dst, src0, negate(src1));
>> --
>> 1.8.3.1
>>
>> ___
>> mesa-dev mailing list
>> mesa-dev@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/hsw: approximate DDX with a uniform value across a subspan

2013-09-12 Thread Chia-I Wu

On Thu, Sep 12, 2013 at 5:27 PM, Chris Forbes  wrote:
> I guess fast-by-default. I imagine that more apps care about
> performance than care about the granularity of their derivatives.
That is my preference too.  My concern is that the performance gain is
only observed on Haswell so far.  Why is that and is there a way to
speed up sample_d on Ivy Bridge and Sandy Bridge?

> After a bit more thought -- In HLSL shader model 5 there's both
> ddx_coarse() and ddx_fine() which gives the shader author the choice
> between roughly these options. In a *very* quick look I haven't found
> anything equivalent -- but I might just be being blind.
>
> CC'ing Ian -- any opinion? Is there any conformance issue here?
>
> -- Chris
>
> On Thu, Sep 12, 2013 at 8:41 PM, Chia-I Wu  wrote:
>> On Thu, Sep 12, 2013 at 2:06 PM, Chris Forbes  wrote:
>>> Can we make this approximation conditional on an image-quality control
>>> in driconf [or somewhere else]?
>> Sure.  What would be the default behavior?
>>
>>> On Thu, Sep 12, 2013 at 5:00 PM, Chia-I Wu  wrote:
>>>> From: Chia-I Wu 
>>>>
>>>> Replicate the gradient of the top-left pixel to the other three pixels in 
>>>> the
>>>> subspan, as how DDY is implemented.  Before, different gradients were used 
>>>> for
>>>> pixels in the top row and pixels in the bottom row.
>>>>
>>>> This change results in a less accurate approximation.  However, it improves
>>>> the performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% 
>>>> (at
>>>> 95.0% confidence) on Haswell.  No noticeable image quality difference
>>>> observed.
>>>>
>>>> No piglit gpu.tests regressions.
>>>>
>>>> I failed to come up with an explanation for the performance difference.  
>>>> The
>>>> change does not make a difference on Ivy Bridge either.  If anyone has the
>>>> insight, please kindly enlighten me.  Performance differences may also be
>>>> observed on other games that call textureGrad and dFdx.
>>>>
>>>> Signed-off-by: Chia-I Wu 
>>>> ---
>>>>  src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 17 +
>>>>  1 file changed, 13 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp 
>>>> b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>>> index bfb3d33..c0d24a0 100644
>>>> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>>> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>>> @@ -564,16 +564,25 @@ fs_generator::generate_tex(fs_inst *inst, struct 
>>>> brw_reg dst, struct brw_reg src
>>>>  void
>>>>  fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct 
>>>> brw_reg src)
>>>>  {
>>>> +   /* approximate with ((ss0.tr - ss0.tl)x4 (ss1.tr - ss1.tl)x4) on 
>>>> Haswell,
>>>> +* which gives much better performance when the result is used with
>>>> +* sample_d
>>>> +*/
>>>> +   unsigned vstride = (brw->is_haswell) ? BRW_VERTICAL_STRIDE_4 :
>>>> +  BRW_VERTICAL_STRIDE_2;
>>>> +   unsigned width = (brw->is_haswell) ? BRW_WIDTH_4 :
>>>> +BRW_WIDTH_2;
>>>> +
>>>> struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
>>>>  BRW_REGISTER_TYPE_F,
>>>> -BRW_VERTICAL_STRIDE_2,
>>>> -BRW_WIDTH_2,
>>>> +vstride,
>>>> +width,
>>>>  BRW_HORIZONTAL_STRIDE_0,
>>>>  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>>>> struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
>>>>  BRW_REGISTER_TYPE_F,
>>>> -BRW_VERTICAL_STRIDE_2,
>>>> -BRW_WIDTH_2,
>>>> +vstride,
>>>> +width,
>>>>  BRW_HORIZONTAL_STRIDE_0,
>>>>  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>>>> brw_ADD(p, dst, src0, negate(src1));
>>>> --
>>>> 1.8.3.1
>>>>
>>>> ___
>>>> mesa-dev mailing list
>>>> mesa-dev@lists.freedesktop.org
>>>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
>>
>>
>> --
>> o...@lunarg.com



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] gallium-bind-sampler-states branch

2013-09-12 Thread Chia-I Wu

Hi Brian,

On Fri, Sep 13, 2013 at 8:46 AM, Brian Paul  wrote:
>
> I just pushed a gallium-bind-sampler-states branch to my git repo at
> git://people.freedesktop.org/~brianp/mesa
>
> It replaces the four
> pipe_context::bind_fragment/vertex/geometry/compute_sampler_states()
> functions with a single bind_sampler_states() function:
>
>  void (*bind_sampler_states)(struct pipe_context *,
>  unsigned shader, unsigned start_slot,
>  unsigned num_samplers, void **samplers);
>
> At this point start_slot is always zero (at least for non-compute shaders).
> And as the updated gallium docs explain, at some point calls to
> bind_sampler_states() will be used to update sub-ranges, but that never
> happens currently.
>
> I've updated all the drivers, state trackers, utils, etc.
>
> I've tested the svga, llvmpipe and softpipe drivers.  'make check' and a
> texture subset of piglit pass w/out regressions.  I'd appreciate it if other
> driver developers would test their favorite driver.
For ilo, the new code does not follow the doc and unbinds samplers not in range.

Is it fine if I implement the new bind_sampler_states as a helper
function on master branch, so that you hook it up to
pipe_context::bind_sampler_states in your branch and remove the old
ones?

>
> Next, I'd like to consolidate the
> set_vertex/geometry/fragment/compute_sampler_views() functions with a single
> function.  But I have no idea when I'll get around to that.
>
> -Brian
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/hsw: approximate DDX with a uniform value across a subspan

2013-09-12 Thread Chia-I Wu

On Thu, Sep 12, 2013 at 10:48 PM, Ian Romanick  wrote:
> On 09/12/2013 01:06 AM, Chris Forbes wrote:
>> Can we make this approximation conditional on an image-quality control
>> in driconf [or somewhere else]?
>
> There's already a control that applications can use:
> GL_FRAGMENT_SHADER_DERIVATIVE_HINT.  I don't know whether or not /any/
> app has ever used it.  The default setting is GL_DONT_CARE, so,
> technically speaking, we could do this optimization whenever the hint
> isn't GL_NICEST.  Though, we may want a driconf override anyway.  Hmm...
How about, in generate_ddx():

  if (brw->ctx.Hint.FragmentShaderDerivative == GL_NICEST ||
  brw->accurate_ddx) {
 // current code
  }
  else {
 // new code
  }

That is, when the app doesn't care, we treat it as GL_FASTEST.  If the
user cares, he can set the new drirc option, accurate_ddx, to true to
override.  accurate_ddx is false by default.

>> On Thu, Sep 12, 2013 at 5:00 PM, Chia-I Wu  wrote:
>>> From: Chia-I Wu 
>>>
>>> Replicate the gradient of the top-left pixel to the other three pixels in 
>>> the
>>> subspan, as how DDY is implemented.  Before, different gradients were used 
>>> for
>>> pixels in the top row and pixels in the bottom row.
>>>
>>> This change results in a less accurate approximation.  However, it improves
>>> the performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at
>>> 95.0% confidence) on Haswell.  No noticeable image quality difference
>>> observed.
>>>
>>> No piglit gpu.tests regressions.
>>>
>>> I failed to come up with an explanation for the performance difference.  The
>>> change does not make a difference on Ivy Bridge either.  If anyone has the
>>> insight, please kindly enlighten me.  Performance differences may also be
>>> observed on other games that call textureGrad and dFdx.
>>>
>>> Signed-off-by: Chia-I Wu 
>>> ---
>>>  src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 17 +
>>>  1 file changed, 13 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp 
>>> b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>> index bfb3d33..c0d24a0 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
>>> @@ -564,16 +564,25 @@ fs_generator::generate_tex(fs_inst *inst, struct 
>>> brw_reg dst, struct brw_reg src
>>>  void
>>>  fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct 
>>> brw_reg src)
>>>  {
>>> +   /* approximate with ((ss0.tr - ss0.tl)x4 (ss1.tr - ss1.tl)x4) on 
>>> Haswell,
>>> +* which gives much better performance when the result is used with
>>> +* sample_d
>>> +*/
>>> +   unsigned vstride = (brw->is_haswell) ? BRW_VERTICAL_STRIDE_4 :
>>> +  BRW_VERTICAL_STRIDE_2;
>>> +   unsigned width = (brw->is_haswell) ? BRW_WIDTH_4 :
>>> +BRW_WIDTH_2;
>>> +
>>> struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
>>>  BRW_REGISTER_TYPE_F,
>>> -BRW_VERTICAL_STRIDE_2,
>>> -BRW_WIDTH_2,
>>> +vstride,
>>> +width,
>>>  BRW_HORIZONTAL_STRIDE_0,
>>>  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>>> struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
>>>  BRW_REGISTER_TYPE_F,
>>> -BRW_VERTICAL_STRIDE_2,
>>> -BRW_WIDTH_2,
>>> +vstride,
>>> +width,
>>>  BRW_HORIZONTAL_STRIDE_0,
>>>  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>>> brw_ADD(p, dst, src0, negate(src1));
>>> --
>>> 1.8.3.1
>>>
>>> ___
>>> mesa-dev mailing list
>>> mesa-dev@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>> ___
>> mesa-dev mailing list
>> mesa-dev@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

2013-09-12 Thread Chia-I Wu

From: Chia-I Wu 

Consider only the top-left and top-right pixels to approximate DDX in a 2x2
subspan, unless the application or the user requests a more accurate
approximation.  This results in a less accurate approximation.  However, it
improves the performance of Xonotic with Ultra settings by 24.3879% +/-
0.832202% (at 95.0% confidence) on Haswell.  No noticeable image quality
difference observed.

No piglit gpu.tests regressions (tested with v1)

I failed to come up with an explanation for the performance difference, as the
change does not affect Ivy Bridge.  If anyone has the insight, please kindly
enlighten me.  Performance differences may also be observed on other games
that call textureGrad and dFdx.

v2: Honor GL_FRAGMENT_SHADER_DERIVATIVE_HINT and add a drirc option.  Update
comments.

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_context.c   |  1 +
 src/mesa/drivers/dri/i965/brw_context.h   |  1 +
 src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 40 ---
 src/mesa/drivers/dri/i965/intel_screen.c  |  4 
 4 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c 
b/src/mesa/drivers/dri/i965/brw_context.c
index 4fcc9fb..1cdfb9d 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -470,6 +470,7 @@ brwCreateContext(int api,
brw_draw_init( brw );
 
brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
+   brw->accurate_derivative = driQueryOptionb(&brw->optionCache, 
"accurate_derivative");
 
ctx->Const.ContextFlags = 0;
if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index c566bba..8bfc54a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -964,6 +964,7 @@ struct brw_context
bool always_flush_cache;
bool disable_throttling;
bool precompile;
+   bool accurate_derivative;
 
driOptionCache optionCache;
/** @} */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index bfb3d33..69aeab1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
  *
  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
  *
- * and we're trying to produce:
+ * Ideally, we want to produce:
  *
  *   DDX DDY
  * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
@@ -556,24 +556,48 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
  *
  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
- * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
- * between each other.  We could probably do it like ddx and swizzle the right
- * order later, but bail for now and just produce
+ * pair.  But the ideal approximation of DDX may impose a huge performance
+ * cost on sample_d.  As such, we favor ((ss0.tr - ss0.tl)x4 (ss1.tr -
+ * ss1.tl)x4) unless the app or the user requests otherwise.
+ *
+ * For DDY, it's harder, as we want to produce the pairs swizzled between each
+ * other.  We could probably do it like ddx and swizzle the right order later,
+ * but bail for now and just produce
  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
  */
 void
 fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg 
src)
 {
+   unsigned vstride, width;
+
+   /* Produce accurate result only when requested.  We emit only one
+* instruction for either case, but the problem is the result may affect
+* how fast sample_d executes.
+*
+* Since the performance difference is only observed on Haswell, ignore the
+* hints on other GENs for now.
+*/
+   if (!brw->is_haswell ||
+   brw->ctx.Hint.FragmentShaderDerivative == GL_NICEST ||
+   brw->accurate_derivative) {
+  vstride = BRW_VERTICAL_STRIDE_2;
+  width = BRW_WIDTH_2;
+   }
+   else {
+  vstride = BRW_VERTICAL_STRIDE_4;
+  width = BRW_WIDTH_4;
+   }
+
struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 BRW_REGISTER_TYPE_F,
-BRW_VERTICAL_STRIDE_2,
-BRW_WIDTH_2,
+vstride,
+width,
 BRW_HORIZONTAL_STRIDE_0,
 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 BRW_REGISTER_TYPE_F,
-BRW_VERTICAL_STRIDE_2,
-

Re: [Mesa-dev] gallium-bind-sampler-states branch

2013-09-15 Thread Chia-I Wu

On Sun, Sep 15, 2013 at 12:24 AM, Brian Paul  wrote:
> On 09/12/2013 09:06 PM, Chia-I Wu wrote:
>>
>> Hi Brian,
>>
>> On Fri, Sep 13, 2013 at 8:46 AM, Brian Paul  wrote:
>>>
>>>
>>> I just pushed a gallium-bind-sampler-states branch to my git repo at
>>> git://people.freedesktop.org/~brianp/mesa
>>>
>>> It replaces the four
>>> pipe_context::bind_fragment/vertex/geometry/compute_sampler_states()
>>> functions with a single bind_sampler_states() function:
>>>
>>>   void (*bind_sampler_states)(struct pipe_context *,
>>>   unsigned shader, unsigned start_slot,
>>>   unsigned num_samplers, void **samplers);
>>>
>>> At this point start_slot is always zero (at least for non-compute
>>> shaders).
>>> And as the updated gallium docs explain, at some point calls to
>>> bind_sampler_states() will be used to updated sub-ranges, but that never
>>> happens currently.
>>>
>>> I've updated all the drivers, state trackers, utils, etc.
>>>
>>> I've tested the svga, llvmpipe and softpipe drivers.  'make check' and a
>>> texture subset of piglit pass w/out regressions.  I'd appreciate it if
>>> other
>>> driver developers would test their favorite driver.
>>
>> For ilo, the new code does not follow the doc and unbinds samplers not in
>> range.
>
>
> I think that's OK.  The CSO module (used by the state tracker) currently
> always calls pipe_context::bind_sampler_states() with start=0 and count such
> that it sets/replaces all samplers, never a sub-range.  That could/should
> change in the future.
>
> See single_sampler_done() in cso_context.c.
>
>
>
>> Is it fine if I implement the new bind_sampler_states as a helper
>> function on master branch, so that you hook it up to
>> pipe_context::bind_sampler_states in your branch and remove the old
>> ones?
>
>
> I'm not quite sure that I understand what you mean.  Can you elaborate?
There is already ilo_bind_sampler_states that does what
pipe_context::bind_sampler_states expects, except that the function
returns a bool.  I can make it return void so that, in your branch,
you can initialize pipe_context::bind_sampler_states to it instead of
adding ilo_bind_sampler_states2.

>
> -Brian
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

2013-09-16 Thread Chia-I Wu

On Sat, Sep 14, 2013 at 5:15 AM, Paul Berry  wrote:
> On 12 September 2013 22:06, Chia-I Wu  wrote:
>>
>> From: Chia-I Wu 
>>
>> Consider only the top-left and top-right pixels to approximate DDX in a
>> 2x2
>> subspan, unless the application or the user requests a more accurate
>> approximation.  This results in a less accurate approximation.  However,
>> it
>> improves the performance of Xonotic with Ultra settings by 24.3879% +/-
>> 0.832202% (at 95.0% confidence) on Haswell.  No noticeable image quality
>> difference observed.
>>
>> No piglit gpu.tests regressions (tested with v1)
>>
>> I failed to come up with an explanation for the performance difference, as
>> the
>> change does not affect Ivy Bridge.  If anyone has the insight, please
>> kindly
>> enlighten me.  Performance differences may also be observed on other games
>> that call textureGrad and dFdx.
>>
>> v2: Honor GL_FRAGMENT_SHADER_DERIVATIVE_HINT and add a drirc option.
>> Update
>> comments.
>
>
> I'm not entirely comfortable making a change that has a known negative
> impact on computational accuracy (even one that leads to such an impressive
> performance improvement) when we don't have any theories as to why the
> performance improvement happens, or why the improvement doesn't apply to Ivy
> Bridge.  In my experience, making changes to the codebase without
> understanding why they improve things almost always leads to improvements
> that are brittle, since it's likely that the true source of the improvement
> is a coincidence that will be wiped out by some future change (or won't be
> relevant to client programs other than this particular benchmark).  Having a
> theory as to why the performance improvement happens would help us be
> confident that we're applying the right fix under the right circumstances.
That is how I feel as I've mentioned.  I am really glad to have the
discussion.  I have done some experiments actually.  It is just that
those experiments only tell me what theories are likely to be wrong.
They could not tell me if a theory is right.

So I have a micro benchmark that draws a 256x256 texture to a 512x512
window, using texture2D() or textureGrad().  It can also set up the
vertex buffer such that the texture is rotated around the z-axis by 15
degrees.

On Haswell, when the texture is not rotated, rendering with
textureGrad() is less than 1% slower than rendering with texture2D().
The slowdown could be from the additional instructions to calculate
DDX/DDY or the extra message payload.  Whether this patch is applied
or not does not make any difference.

When the texture is rotated, rendering with textureGrad() is ~3%
slower than rendering with texture2D() without the patch.  With the
patch, the difference is down to less than 1% again.

Computing LOD in the shader results in a ~17% slowdown compared to textureGrad().

As a better way to control the result of DDX, I also hacked the driver
so that DDX produced the values I specified for each pixel.  When not
all pixels in the subspan have the same gradient, rendering is ~6%
slower compared to when all pixels in the subspan have the same
gradient.

The weird thing is, in SIMD8 mode, two subspans are processed at the
same time.  When all pixels in one of the subspans have the same
gradient, whether the pixels in the other subspan have the same
gradient or not does not matter.

As for Ivy Bridge, rendering with textureGrad() is always
significantly slower than rendering with texture2D().  Computing LOD
in the shader results in another ~4% slowdown compared to
textureGrad().

>
> For example, here's one theory as to why we might be seeing an improvement:
> perhaps Haswell's sample_d processing is smart enough to realize that when
> all the gradient values within a sub-span are the same, that means that all
> of the sampling for the sub-span will come from the same LOD, and that
> allows it to short-cut some expensive step in the LOD calculation.  Perhaps
> the same improvement isn't seen on Ivy Bridge because Ivy Bridge's sample_d
> processing logic is less sophisticated, so it's unable to perform the
> optimization.  If this is the case, then conditioning the optimization on
> brw->is_haswell (as you've done) makes sense.
This is also the theory I have and my experiments could not rule it
out.  The question I have is, if the LODs of all pixels were calculated
in parallel, could the shortcut help this much?  I don't have enough
knowledge of the hardware to know the answer, or even to know whether
this is a valid question.

>
> Another possible explanation for the Haswell vs Ivy Bridge difference is
> that perhaps Ivy Bridge, being a lower-performing chip, has other
> bottlenecks that make

Re: [Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

2013-09-16 Thread Chia-I Wu

On Mon, Sep 16, 2013 at 3:50 AM, Mark Mueller  wrote:
>
>
>
> On Fri, Sep 13, 2013 at 2:15 PM, Paul Berry  wrote:
>>
>> On 12 September 2013 22:06, Chia-I Wu  wrote:
>>>
>>> From: Chia-I Wu 
>>>
>>> Consider only the top-left and top-right pixels to approximate DDX in a
>>> 2x2
>>> subspan, unless the application or the user requests a more accurate
>>> approximation.  This results in a less accurate approximation.  However,
>>> it
>>> improves the performance of Xonotic with Ultra settings by 24.3879% +/-
>>> 0.832202% (at 95.0% confidence) on Haswell.  No noticeable image quality
>>> difference observed.
>>>
>>> No piglit gpu.tests regressions (tested with v1)
>>>
>>> I failed to come up with an explanation for the performance difference,
>>> as the
>>> change does not affect Ivy Bridge.  If anyone has the insight, please
>>> kindly
>>> enlighten me.  Performance differences may also be observed on other
>>> games
>>> that call textureGrad and dFdx.
>>>
>>> v2: Honor GL_FRAGMENT_SHADER_DERIVATIVE_HINT and add a drirc option.
>>> Update
>>> comments.
>>
>>
>> I'm not entirely comfortable making a change that has a known negative
>> impact on computational accuracy (even one that leads to such an impressive
>> performance improvement) when we don't have any theories as to why the
>> performance improvement happens, or why the improvement doesn't apply to Ivy
>> Bridge.  In my experience, making changes to the codebase without
>> understanding why they improve things almost always leads to improvements
>> that are brittle, since it's likely that the true source of the improvement
>> is a coincidence that will be wiped out by some future change (or won't be
>> relevant to client programs other than this particular benchmark).  Having a
>> theory as to why the performance improvement happens would help us be
>> confident that we're applying the right fix under the right circumstances.
> There's another angle to approach this and that is to develop a simple test
> case that will show the different results across a range of computational
> accuracy and run the test on proprietary drivers for the same hardware to
> determine what settings they are using.
Yes, I have a little test.  On Windows, rendering with texture2D() or
textureGrad() does not have a noticeable impact on the performance.
But I am not sure how to change dFdx() accuracy from the shaders.  On
Linux, I did that by modifying the driver.

>
>>
>>
>> For example, here's one theory as to why we might be seeing an
>> improvement: perhaps Haswell's sample_d processing is smart enough to
>> realize that when all the gradient values within a sub-span are the same,
>> that means that all of the sampling for the sub-span will come from the same
>> LOD, and that allows it to short-cut some expensive step in the LOD
>> calculation.  Perhaps the same improvement isn't seen on Ivy Bridge because
>> Ivy Bridge's sample_d processing logic is less sophisticated, so it's unable
>> to perform the optimization.  If this is the case, then conditioning the
>> optimization on brw->is_haswell (as you've done) makes sense.
>>
>> Another possible explanation for the Haswell vs Ivy Bridge difference is
>> that perhaps Ivy Bridge, being a lower-performing chip, has other
>> bottlenecks that make the optimization irrelevant for this particular
>> benchmark, but potentially still useful for other benchmarks.  For instance,
>> maybe when this benchmark executes on Ivy Bridge, the texture that's being
>> sampled from is located in sufficiently distant memory that optimizing the
>> sample_d's memory accesses makes no difference, since the bottleneck is the
>> speed with which the texture can be read into cache, rather than the speed
>> of operation of sample_d.  If this explanation is correct, then it might be
>> worth applying the optimization to both Ivy Bridge and Haswell (and perhaps
>> Sandy Bridge as well), since it might conceivably benefit those other chips
>> when running applications that place less cache pressure on the chip.
>
>
> This scenario is where I'd place my bets, especially given that the numbers
> are based on Xonotic. I benchmarked this patch using Xonotic on Bay Trail as
> is and by replacing !brw->is_haswell with !brw->is_baytrail. With ultra and
> ultimate levels at medium and high resolutions, the results were all
> essentially the same at comparable resolutions

Re: [Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

2013-09-16 Thread Chia-I Wu

On Mon, Sep 16, 2013 at 4:12 PM, Chia-I Wu  wrote:
> On Mon, Sep 16, 2013 at 3:50 AM, Mark Mueller  wrote:
>>
>>
>>
>> On Fri, Sep 13, 2013 at 2:15 PM, Paul Berry  wrote:
>>>
>>> On 12 September 2013 22:06, Chia-I Wu  wrote:
>>>>
>>>> From: Chia-I Wu 
>>>>
>>>> Consider only the top-left and top-right pixels to approximate DDX in a
>>>> 2x2
>>>> subspan, unless the application or the user requests a more accurate
>>>> approximation.  This results in a less accurate approximation.  However,
>>>> it
>>>> improves the performance of Xonotic with Ultra settings by 24.3879% +/-
>>>> 0.832202% (at 95.0% confidence) on Haswell.  No noticeable image quality
>>>> difference observed.
>>>>
>>>> No piglit gpu.tests regressions (tested with v1)
>>>>
>>>> I failed to come up with an explanation for the performance difference,
>>>> as the
>>>> change does not affect Ivy Bridge.  If anyone has the insight, please
>>>> kindly
>>>> enlighten me.  Performance differences may also be observed on other
>>>> games
>>>> that call textureGrad and dFdx.
>>>>
>>>> v2: Honor GL_FRAGMENT_SHADER_DERIVATIVE_HINT and add a drirc option.
>>>> Update
>>>> comments.
>>>
>>>
>>> I'm not entirely comfortable making a change that has a known negative
>>> impact on computational accuracy (even one that leads to such an impressive
>>> performance improvement) when we don't have any theories as to why the
>>> performance improvement happens, or why the improvement doesn't apply to Ivy
>>> Bridge.  In my experience, making changes to the codebase without
>>> understanding why they improve things almost always leads to improvements
>>> that are brittle, since it's likely that the true source of the improvement
>>> is a coincidence that will be wiped out by some future change (or won't be
>>> relevant to client programs other than this particular benchmark).  Having a
>>> theory as to why the performance improvement happens would help us be
>>> confident that we're applying the right fix under the right circumstances.
>> There's another angle to approach this and that is to develop a simple test
>> case that will show the different results across a range of computational
>> accuracy and run the test on proprietary drivers for the same hardware to
>> determine what settings they are using.
> Yes, I have a little test.  On Windows, rendering with texture2D() or
> textureGrad() does not have a noticeable impact on the performance.
> But I am not sure how to change dFdx() accuracy from the shaders.  On
> Linux, I did that by modifying the driver.
>
>>
>>>
>>>
>>> For example, here's one theory as to why we might be seeing an
>>> improvement: perhaps Haswell's sample_d processing is smart enough to
>>> realize that when all the gradient values within a sub-span are the same,
>>> that means that all of the sampling for the sub-span will come from the same
>>> LOD, and that allows it to short-cut some expensive step in the LOD
>>> calculation.  Perhaps the same improvement isn't seen on Ivy Bridge because
>>> Ivy Bridge's sample_d processing logic is less sophisticated, so it's unable
>>> to perform the optimization.  If this is the case, then conditioning the
>>> optimization on brw->is_haswell (as you've done) makes sense.
>>>
>>> Another possible explanation for the Haswell vs Ivy Bridge difference is
>>> that perhaps Ivy Bridge, being a lower-performing chip, has other
>>> bottlenecks that make the optimization irrelevant for this particular
>>> benchmark, but potentially still useful for other benchmarks.  For instance,
>>> maybe when this benchmark executes on Ivy Bridge, the texture that's being
>>> sampled from is located in sufficiently distant memory that optimizing the
>>> sample_d's memory accesses makes no difference, since the bottleneck is the
>>> speed with which the texture can be read into cache, rather than the speed
>>> of operation of sample_d.  If this explanation is correct, then it might be
>>> worth applying the optimization to both Ivy Bridge and Haswell (and perhaps
>>> Sandy Bridge as well), since it might conceivably benefit those other chips
>>> when running applications that place less cache pressure on the chip.
>>
>

Re: [Mesa-dev] gallium-bind-sampler-states branch

2013-09-16 Thread Chia-I Wu

On Tue, Sep 17, 2013 at 12:09 AM, Brian Paul  wrote:
> On 09/15/2013 09:31 AM, Chia-I Wu wrote:
>>
>> On Sun, Sep 15, 2013 at 12:24 AM, Brian Paul  wrote:
>>>
>>> On 09/12/2013 09:06 PM, Chia-I Wu wrote:
>>>>
>>>>
>>>> Hi Brian,
>>>>
>>>> On Fri, Sep 13, 2013 at 8:46 AM, Brian Paul  wrote:
>>>>>
>>>>>
>>>>>
>>>>> I just pushed a gallium-bind-sampler-states branch to my git repo at
>>>>> git://people.freedesktop.org/~brianp/mesa
>>>>>
>>>>> It replaces the four
>>>>> pipe_context::bind_fragment/vertex/geometry/compute_sampler_states()
>>>>> functions with a single bind_sampler_states() function:
>>>>>
>>>>>void (*bind_sampler_states)(struct pipe_context *,
>>>>>unsigned shader, unsigned start_slot,
>>>>>unsigned num_samplers, void **samplers);
>>>>>
>>>>> At this point start_slot is always zero (at least for non-compute
>>>>> shaders).
>>>>> And as the updated gallium docs explain, at some point calls to
>>>>> bind_sampler_states() will be used to updated sub-ranges, but that
>>>>> never
>>>>> happens currently.
>>>>>
>>>>> I've updated all the drivers, state trackers, utils, etc.
>>>>>
>>>>> I've tested the svga, llvmpipe and softpipe drivers.  'make check' and
>>>>> a
>>>>> texture subset of piglit pass w/out regressions.  I'd appreciate it if
>>>>> other
>>>>> driver developers would test their favorite driver.
>>>>
>>>>
>>>> For ilo, the new code does not follow the doc and unbinds samplers not
>>>> in
>>>> range.
>>>
>>>
>>>
>>> I think that's OK.  The CSO module (used by the state tracker) currently
>>> always calls pipe_context::bind_sampler_states() with start=0 and count
>>> such
>>> that it sets/replaces all samplers, never a sub-range.  That could/should
>>> change in the future.
>>>
>>> See single_sampler_done() in cso_context.c.
>>>
>>>
>>>
>>>> Is it fine if I implement the new bind_sampler_states as a helper
>>>> function on master branch, so that you hook it up to
>>>> pipe_context::bind_sampler_states in your branch and remove the old
>>>> ones?
>>>
>>>
>>>
>>> I'm not quite sure that I understand what you mean.  Can you elaborate?
>>
>> There is already ilo_bind_sampler_states that does what
>> pipe_context::bind_sampler_states expects, except that the function
>> returns a bool.  I can make it return void so that, in your branch,
>> you can initialize pipe_context::bind_sampler_states to it instead of
>> adding ilo_bind_sampler_states2.
>
>
> Sure, feel free to refactor as you see fit.  I didn't do that in the first
> place because the existing ilo_bind_X_sampler_states() functions are calling
> ilo_bind_sampler_states() twice and setting dirty flags.  I didn't feel
> comfortable rewriting all that and likely breaking it so I just wrapped
> things up with ilo_bind_sampler_states2().
I've pushed the change.  It is called twice because
ilo_bind_X_sampler_states() needs to bind the samplers in the range,
and unbind the samplers that are not.

It's great to see the interface change.  Thanks for the work.  When
you get to work on consolidating set_X_sampler_views, I believe there
is also ilo_set_sampler_views that you can hook up to :P


>
> -Brian
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

2013-09-17 Thread Chia-I Wu

On Wed, Sep 18, 2013 at 6:27 AM, Mark Mueller  wrote:
> On Mon, Sep 16, 2013 at 1:31 AM, Chia-I Wu  wrote:
>>
>> On Mon, Sep 16, 2013 at 4:12 PM, Chia-I Wu  wrote:
>> > On Mon, Sep 16, 2013 at 3:50 AM, Mark Mueller 
>> > wrote:
>> >>
>> >>
>> >>
>> >> On Fri, Sep 13, 2013 at 2:15 PM, Paul Berry 
>> >> wrote:
>> >>>
>> >>> On 12 September 2013 22:06, Chia-I Wu  wrote:
>> >>>>
>> >>>> From: Chia-I Wu 
>>
>> >>
>> >>
>> >> This scenario is where I'd place my bets, especially given that the
>> >> numbers
>> >> are based on Xonotic. I benchmarked this patch using Xonotic on Bay
>> >> Trail as
>> >> is and by replacing !brw->is_haswell with !brw->is_baytrail. With ultra
>> >> and
>> >> ultimate levels at medium and high resolutions, the results were all
>> >> essentially the same at comparable resolutions and quality levels.
>> > Isn't Bay Trail based on Ivy Bridge?
>> For Bay Trail, this might help you
>>
>>
>> http://lists.freedesktop.org/archives/mesa-dev/2013-September/044288.html
>>
>> if you are interested.
>
>
> Testing with Bay Trail shows no performance improvement with this patch.
> Most likely there are one or more CPU bottlenecks on Bay Tail that hide a
> majority of the performance gains of this change.
And no performance loss?  It could also be that

 - the gain from SIMD16 was evened out by the math ops
 - the lowering did not kick in because of one of the conditional checks
 - the game did not run in Ultra or Ultimate mode

I think the discussion belongs to that other thread.

>
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

2013-09-17 Thread Chia-I Wu

Hi Paul,

On Mon, Sep 16, 2013 at 3:46 PM, Chia-I Wu  wrote:
> On Sat, Sep 14, 2013 at 5:15 AM, Paul Berry  wrote:
>> On 12 September 2013 22:06, Chia-I Wu  wrote:
>>>
>>> From: Chia-I Wu 
>>>
>>> Consider only the top-left and top-right pixels to approximate DDX in a
>>> 2x2
>>> subspan, unless the application or the user requests a more accurate
>>> approximation.  This results in a less accurate approximation.  However,
>>> it
>>> improves the performance of Xonotic with Ultra settings by 24.3879% +/-
>>> 0.832202% (at 95.0% confidence) on Haswell.  No noticeable image quality
>>> difference observed.
>>>
>>> No piglit gpu.tests regressions (tested with v1)
>>>
>>> I failed to come up with an explanation for the performance difference, as
>>> the
>>> change does not affect Ivy Bridge.  If anyone has the insight, please
>>> kindly
>>> enlighten me.  Performance differences may also be observed on other games
>>> that call textureGrad and dFdx.
>>>
>>> v2: Honor GL_FRAGMENT_SHADER_DERIVATIVE_HINT and add a drirc option.
>>> Update
>>> comments.
>>
>>
>> I'm not entirely comfortable making a change that has a known negative
>> impact on computational accuracy (even one that leads to such an impressive
>> performance improvement) when we don't have any theories as to why the
>> performance improvement happens, or why the improvement doesn't apply to Ivy
>> Bridge.  In my experience, making changes to the codebase without
>> understanding why they improve things almost always leads to improvements
>> that are brittle, since it's likely that the true source of the improvement
>> is a coincidence that will be wiped out by some future change (or won't be
>> relevant to client programs other than this particular benchmark).  Having a
>> theory as to why the performance improvement happens would help us be
>> confident that we're applying the right fix under the right circumstances.
> That is how I feel as I've mentioned.  I am really glad to have the
> discussion.  I have done some experiments actually.  It is just that
> those experiments only tell me what theories are likely to be wrong.
> They could not tell me if a theory is right.
Do the experiments make sense to you?  What other experiments do you
want to see conducted?

It could be hard to get direct proof without knowing the internal workings.

>
> So I have a micro benchmark that draws a 256x256 texture to a 512x512
> window, using texture2D() or textureGrad().  It can also set up the
> vertex buffer such that the texture is rotated around the z-axis by 15
> degrees.
>
> On Haswell, when the texture is not rotated, rendering with
> textureGrad() is less than 1% slower than rendering with texture2D().
> The slowdown could be from the additional instructions to calculate
> DDX/DDY or the extra message payload.  Whether this patch is applied
> or not does not make any difference.
>
> When the texture is rotated, rendering with textureGrad() is ~3%
> slower than rendering with texture2D() without the patch.  With the
> patch, the difference is down to less than 1% again.
>
> Computing LOD in the shader results in ~17% slowdown comparing to 
> textureGrad().
>
> As a better way to control the result of DDX, I also hacked the driver
> so that DDX produced the values I specified for each pixel.  When not
> all pixels in the subspan have the same gradient, rendering is ~6%
> slower comparing to when all pixels in the subspan have the same
> gradient.
>
> The weird thing is, in SIMD8 mode, two subspans are processed at the
> same time.  When all pixels in one of the subspan have the same
> gradient, whether the pixels in the other subspan have the same
> gradient or not does not matter.
>
> As for Ivy Bridge, rendering with textureGrad() is always
> significantly slower than rendering with texture2D().  Computing LOD
> in the shader results in another ~4% slowdown comparing to
> textureGrad().
>
>>
>> For example, here's one theory as to why we might be seeing an improvement:
>> perhaps Haswell's sample_d processing is smart enough to realize that when
>> all the gradient values within a sub-span are the same, that means that all
>> of the sampling for the sub-span will come from the same LOD, and that
>> allows it to short-cut some expensive step in the LOD calculation.  Perhaps
>> the same improvement isn't seen on Ivy Bridge because Ivy Bridge's sample_d
>> processing logic is less sophisticated, so it's unable to perform the
>> o

Re: [Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

2013-09-22 Thread Chia-I Wu

On Fri, Sep 20, 2013 at 10:50 PM, Paul Berry  wrote:
> On 17 September 2013 19:54, Chia-I Wu  wrote:
>>
>> Hi Paul,
>>
>> On Mon, Sep 16, 2013 at 3:46 PM, Chia-I Wu  wrote:
>> > On Sat, Sep 14, 2013 at 5:15 AM, Paul Berry 
>> > wrote:
>> >> On 12 September 2013 22:06, Chia-I Wu  wrote:
>> >>>
>> >>> From: Chia-I Wu 
>> >>>
>> >>> Consider only the top-left and top-right pixels to approximate DDX in
>> >>> a
>> >>> 2x2
>> >>> subspan, unless the application or the user requests a more accurate
>> >>> approximation.  This results in a less accurate approximation.
>> >>> However,
>> >>> it
>> >>> improves the performance of Xonotic with Ultra settings by 24.3879%
>> >>> +/-
>> >>> 0.832202% (at 95.0% confidence) on Haswell.  No noticeable image
>> >>> quality
>> >>> difference observed.
>> >>>
>> >>> No piglit gpu.tests regressions (tested with v1)
>> >>>
>> >>> I failed to come up with an explanation for the performance
>> >>> difference, as
>> >>> the
>> >>> change does not affect Ivy Bridge.  If anyone has the insight, please
>> >>> kindly
>> >>> enlighten me.  Performance differences may also be observed on other
>> >>> games
>> >>> that call textureGrad and dFdx.
>> >>>
>> >>> v2: Honor GL_FRAGMENT_SHADER_DERIVATIVE_HINT and add a drirc option.
>> >>> Update
>> >>> comments.
>> >>
>> >>
>> >> I'm not entirely comfortable making a change that has a known negative
>> >> impact on computational accuracy (even one that leads to such an
>> >> impressive
>> >> performance improvement) when we don't have any theories as to why the
>> >> performance improvement happens, or why the improvement doesn't apply
>> >> to Ivy
>> >> Bridge.  In my experience, making changes to the codebase without
>> >> understanding why they improve things almost always leads to
>> >> improvements
>> >> that are brittle, since it's likely that the true source of the
>> >> improvement
>> >> is a coincidence that will be wiped out by some future change (or won't
>> >> be
>> >> relevant to client programs other than this particular benchmark).
>> >> Having a
>> >> theory as to why the performance improvement happens would help us be
>> >> confident that we're applying the right fix under the right
>> >> circumstances.
>> > That is how I feel as I've mentioned.  I am really glad to have the
>> > discussion.  I have done some experiments actually.  It is just that
>> > those experiments only tell me what theories are likely to be wrong.
>> > They could not tell me if a theory is right.
>> Do the experiments make sense to you?  What other experiments do you
>> want to see conducted?
>>
>> It could be hard to get direct proof without knowing the internal
>> working..
>
>
> Sorry for the slow reply.  We had some internal discussions with the
> hardware architects about this, and it appears that the first theory is
> correct: Haswell has an optimization in its sample_d processing which allows
> it to assume that all pixels in a 2x2 subspan will resolve to the same LOD
> provided that all the gradients in the 2x2 subspan are sufficiently similar
> to each other.  There's a register called SAMPLER_MODE which determines how
> similar the gradients have to be in order to trigger the optimization.  It
> can be set to values between 0 and 0x1f, where 0 (the default) means "only
> trigger the optimization if the gradients are exactly equal" and 0x1f means
> "trigger the optimization as frequently as possible".  Obviously triggering
> the optimization more often reduces the quality of the rendered output
> slightly, because it forces all pixels within a 2x2 subspan to sample from
> the same LOD.
>
> We believe that setting this register to 0x1f should produce an equivalent
> speed-up to your patch, without sacrificing the quality of d/dx when it is
> used for other (non-sample_d) purposes.  This approach would have the
> additional advantage that the benefit would apply to any shader that uses
> the sample_d message, regardless of whether or not that shader uses d/dx and
> d/dy to compute its gradients.
>
> Would you mi

Re: [Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

2013-09-22 Thread Chia-I Wu

On Mon, Sep 23, 2013 at 12:09 PM, Chia-I Wu  wrote:
> On Fri, Sep 20, 2013 at 10:50 PM, Paul Berry  wrote:
>> On 17 September 2013 19:54, Chia-I Wu  wrote:
>>>
>>> Hi Paul,
>>>
>>> On Mon, Sep 16, 2013 at 3:46 PM, Chia-I Wu  wrote:
>>> > On Sat, Sep 14, 2013 at 5:15 AM, Paul Berry 
>>> > wrote:
>>> >> On 12 September 2013 22:06, Chia-I Wu  wrote:
>>> >>>
>>> >>> From: Chia-I Wu 
>>> >>>
>>> >>> Consider only the top-left and top-right pixels to approximate DDX in
>>> >>> a
>>> >>> 2x2
>>> >>> subspan, unless the application or the user requests a more accurate
>>> >>> approximation.  This results in a less accurate approximation.
>>> >>> However,
>>> >>> it
>>> >>> improves the performance of Xonotic with Ultra settings by 24.3879%
>>> >>> +/-
>>> >>> 0.832202% (at 95.0% confidence) on Haswell.  No noticeable image
>>> >>> quality
>>> >>> difference observed.
>>> >>>
>>> >>> No piglit gpu.tests regressions (tested with v1)
>>> >>>
>>> >>> I failed to come up with an explanation for the performance
>>> >>> difference, as
>>> >>> the
>>> >>> change does not affect Ivy Bridge.  If anyone has the insight, please
>>> >>> kindly
>>> >>> enlighten me.  Performance differences may also be observed on other
>>> >>> games
>>> >>> that call textureGrad and dFdx.
>>> >>>
>>> >>> v2: Honor GL_FRAGMENT_SHADER_DERIVATIVE_HINT and add a drirc option.
>>> >>> Update
>>> >>> comments.
>>> >>
>>> >>
>>> >> I'm not entirely comfortable making a change that has a known negative
>>> >> impact on computational accuracy (even one that leads to such an
>>> >> impressive
>>> >> performance improvement) when we don't have any theories as to why the
>>> >> performance improvement happens, or why the improvement doesn't apply
>>> >> to Ivy
>>> >> Bridge.  In my experience, making changes to the codebase without
>>> >> understanding why they improve things almost always leads to
>>> >> improvements
>>> >> that are brittle, since it's likely that the true source of the
>>> >> improvement
>>> >> is a coincidence that will be wiped out by some future change (or won't
>>> >> be
>>> >> relevant to client programs other than this particular benchmark).
>>> >> Having a
>>> >> theory as to why the performance improvement happens would help us be
>>> >> confident that we're applying the right fix under the right
>>> >> circumstances.
>>> > That is how I feel as I've mentioned.  I am really glad to have the
>>> > discussion.  I have done some experiments actually.  It is just that
>>> > those experiments only tell me what theories are likely to be wrong.
>>> > They could not tell me if a theory is right.
>>> Do the experiments make sense to you?  What other experiments do you
>>> want to see conducted?
>>>
>>> It could be hard to get direct proof without knowing the internal
>>> working..
>>
>>
>> Sorry for the slow reply.  We had some internal discussions with the
>> hardware architects about this, and it appears that the first theory is
>> correct: Haswell has an optimization in its sample_d processing which allows
>> it to assume that all pixels in a 2x2 subspan will resolve to the same LOD
>> provided that all the gradients in the 2x2 subspan are sufficiently similar
>> to each other.  There's a register called SAMPLER_MODE which determines how
>> similar the gradients have to be in order to trigger the optimization.  It
>> can be set to values between 0 and 0x1f, where 0 (the default) means "only
>> trigger the optimization if the gradients are exactly equal" and 0x1f means
>> "trigger the optimization as frequently as possible".  Obviously triggering
>> the optimization more often reduces the quality of the rendered output
>> slightly, because it forces all pixels within a 2x2 subspan to sample from
>> the same LOD.
>>
>> We believe that setting this register to 0x1f should

[Mesa-dev] [PATCH] i965: fix SURFACE_STATE dumping

2013-04-10 Thread Chia-I Wu

Wrong fields were used when dumping width and height.
---
 src/mesa/drivers/dri/i965/brw_state_dump.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c 
b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 9ea3fac..6fc8837 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -184,8 +184,8 @@ static void dump_surface_state(struct brw_context *brw, 
uint32_t offset)
 get_965_surface_format(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)));
batch_out(brw, name, offset, 1, "offset\n");
batch_out(brw, name, offset, 2, "%dx%d size, %d mips\n",
-GET_FIELD(surf[2], GEN7_SURFACE_WIDTH) + 1,
-GET_FIELD(surf[2], GEN7_SURFACE_HEIGHT) + 1,
+GET_FIELD(surf[2], BRW_SURFACE_WIDTH) + 1,
+GET_FIELD(surf[2], BRW_SURFACE_HEIGHT) + 1,
 GET_FIELD(surf[2], BRW_SURFACE_LOD));
batch_out(brw, name, offset, 3, "pitch %d, %s tiled\n",
 GET_FIELD(surf[3], BRW_SURFACE_PITCH) + 1,
@@ -208,8 +208,8 @@ static void dump_gen7_surface_state(struct brw_context 
*brw, uint32_t offset)
  get_965_surface_format(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)));
batch_out(brw, name, offset, 1, "offset\n");
batch_out(brw, name, offset, 2, "%dx%d size, %d mips\n",
- GET_FIELD(surf[2], BRW_SURFACE_WIDTH) + 1,
- GET_FIELD(surf[2], BRW_SURFACE_HEIGHT) + 1,
+ GET_FIELD(surf[2], GEN7_SURFACE_WIDTH) + 1,
+ GET_FIELD(surf[2], GEN7_SURFACE_HEIGHT) + 1,
  surf[5] & INTEL_MASK(3, 0));
batch_out(brw, name, offset, 3, "pitch %d, %stiled\n",
 (surf[3] & INTEL_MASK(17, 0)) + 1,
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965/gen7: fix 3DSTATE_LINE_STIPPLE_PATTERN

2013-04-10 Thread Chia-I Wu

The inverse repeat count should take up bits 31:15 and is in U1.16.  Demos
from mesa/demos seem to render correctly with this change, but piglit
"linestipple" test still fails.
---
 src/mesa/drivers/dri/i965/brw_misc_state.c |   17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c 
b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 25672eb..ff19987 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -946,9 +946,20 @@ static void upload_line_stipple(struct brw_context *brw)
BEGIN_BATCH(3);
OUT_BATCH(_3DSTATE_LINE_STIPPLE_PATTERN << 16 | (3 - 2));
OUT_BATCH(ctx->Line.StipplePattern);
-   tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
-   tmpi = tmp * (1<<13);
-   OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor);
+
+   if (intel->gen >= 7) {
+  /* in U1.16 */
+  tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
+  tmpi = tmp * (1<<16);
+  OUT_BATCH(tmpi << 15 | ctx->Line.StippleFactor);
+   }
+   else {
+  /* in U1.13 */
+  tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
+  tmpi = tmp * (1<<13);
+  OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor);
+   }
+
CACHED_BATCH();
 }
 
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965/gen7: fix (huge) entry count for BRW_SURFACE_BUFFER

2013-04-10 Thread Chia-I Wu

Unlike GEN6, the bits of entry count are distributed like this

  width  = (entry_count & 0x007f);   /* bits [6:0] */
  height = (entry_count & 0x001fff80) >> 7;  /* bits [20:7] */
  depth  = (entry_count & 0x7fe00000) >> 21; /* bits [30:21] */

The maximum entry count is still limited to 2^27.

This was noted while going over the PRM.  No test is done to verify.
---
 src/mesa/drivers/dri/i965/gen7_wm_surface_state.c |   16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c 
b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
index 2c12be3..761ceb0 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
@@ -269,9 +269,11 @@ gen7_update_buffer_texture_surface(struct gl_context *ctx,
 
   int texel_size = _mesa_get_format_bytes(format);
   int w = intel_obj->Base.Size / texel_size;
+
+  /* note that these differ from GEN6 */
   surf[2] = SET_FIELD(w & 0x7f, GEN7_SURFACE_WIDTH) | /* bits 6:0 of size 
*/
-SET_FIELD((w >> 7) & 0x1fff, GEN7_SURFACE_HEIGHT); /* 19:7 */
-  surf[3] = SET_FIELD((w >> 20) & 0x7f, BRW_SURFACE_DEPTH) | /* bits 26:20 
*/
+SET_FIELD((w >> 7) & 0x3fff, GEN7_SURFACE_HEIGHT); /* 20:7 */
+  surf[3] = SET_FIELD((w >> 21) & 0x3f, BRW_SURFACE_DEPTH) | /* bits 26:21 
*/
 (texel_size - 1);
}
 
@@ -403,9 +405,10 @@ gen7_create_constant_surface(struct brw_context *brw,
assert(bo);
surf[1] = bo->offset + offset; /* reloc */
 
+   /* note that these differ from GEN6 */
surf[2] = SET_FIELD(w & 0x7f, GEN7_SURFACE_WIDTH) |
- SET_FIELD((w >> 7) & 0x1fff, GEN7_SURFACE_HEIGHT);
-   surf[3] = SET_FIELD((w >> 20) & 0x7f, BRW_SURFACE_DEPTH) |
+ SET_FIELD((w >> 7) & 0x3fff, GEN7_SURFACE_HEIGHT);
+   surf[3] = SET_FIELD((w >> 21) & 0x3f, BRW_SURFACE_DEPTH) |
  (stride - 1);
 
if (intel->is_haswell) {
@@ -446,9 +449,10 @@ gen7_create_shader_time_surface(struct brw_context *brw, 
uint32_t *out_offset)
 
surf[1] = brw->shader_time.bo->offset; /* reloc */
 
+   /* note that these differ from GEN6 */
surf[2] = SET_FIELD(w & 0x7f, GEN7_SURFACE_WIDTH) |
- SET_FIELD((w >> 7) & 0x1fff, GEN7_SURFACE_HEIGHT);
-   surf[3] = SET_FIELD((w >> 20) & 0x7f, BRW_SURFACE_DEPTH);
+ SET_FIELD((w >> 7) & 0x3fff, GEN7_SURFACE_HEIGHT);
+   surf[3] = SET_FIELD((w >> 21) & 0x3f, BRW_SURFACE_DEPTH);
 
/* Unlike texture or renderbuffer surfaces, we only do untyped operations
 * on the shader_time surface, so there's no need to set HSW channel
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] new i965g pipe driver for Intel GEN6 (and later)

2013-04-16 Thread Chia-I Wu

Hi list,

On Thu, Dec 13, 2012 at 6:41 AM, Chia-I Wu  wrote:

> Hi list,
>
> I've been working on i965g, a new pipe driver for Intel GEN6 (and
> later), for a while now.  I would like to know if there is any
> interest in it and if it can be merged upstream.  The code is
> currently available here
>
>   https://github.com/olvaffe/mesa/tree/i965g
>
> The project was started for my own fun and for self-learning.  It was
> later sponsored by LunarG.  While it is still new, it does work for
> many of mesa-demos.  Right now it passes 6884 of 7547 piglit
> quick-driver.tests.  I also tried it with gnome-shell, OpenArena, and
> Nexuiz, and they all seem to work.
>
It has been a while since the announcement.  At first, I was waiting for my
account to be re-opened.  Then I got an Ivy Bridge laptop and was busy
enabling GEN7 on the driver until now.

I force-updated the branch for the latest code, and cleaned the history up
today.  The branch has 24 commits on top of master

Chia-I Wu (24):
  winsys/intel: new winsys for intel
  i965g: new pipe driver for Intel GEN6+
  i965g: add debug flags settable through I965_DEBUG
  i965g: hook up pipe_screen param and fence functions
  i965g: hook up pipe screen format functions
  i965g: hook up pipe screen resource functions
  i965g: add command parser
  i965g: hook up pipe context flush function
  i965g: add functions to manage shaders
  i965g: hook up pipe context state functions
  i965g: hook up pipe context blit functions
  i965g: hook up pipe context transfer functions
  i965g: hook up pipe context query functions
  i965g: add GEN6 GPE
  i965g: add GEN7 GPE
  i965g: add 3D pipeline for GEN6
  i965g: add GEN7 support for 3D pipeline
  i965g: hook up pipe context 3D functions
  i965g: add support for time/occlusion/primitive queries
  i965g: hook up pipe context video functions
  i965g: hook up pipe context GPGPU functions
  i965g: add a toy shader compiler
  i965g: compile VS/GS/FS with the toy compiler
  i965g: add the driver to the build system

If you are interested in the complete history, you can take a look at
i965g-next branch.

Changes since the announcement are:
 - GEN7 support (stencil buffer support is still missing)
 - fixed tons of bugs, with piglit passing 7999 of 8428 tests

The state of the TGSI->GEN compiler remains the same: messy, missing some
features, and non-optimizing.  As my focus is still on features, the
performance should remain about the same.  I will see if I can improve the
situations over the next few months.

If there is no objection, I'd like to merge it in a day or two.


> The driver is written from scratch.  However, it follows classic i965
> driver for many of the design decisions.  It comes with its own toy
> compiler to translate TGSI tokens to GEN instructions.  The compiler
> still lacks several functions (register spilling and most TGSI
> indirections), but more importantly, almost no optimization is
> performed.  It thus generates much worse code comparing to that
> generated by classic i965.
>
> I rebased the code tonight and cleaned up the history.  The branch now
> has 24  new commits on top of master
>
>   winsys/intel: new winsys for intel
>   i965g: new pipe driver for Intel GEN6+
>   i965g: add debug flags settable through I965_DEBUG
>   i965g: hook up pipe_screen param and fence functions
>   i965g: add functions to translate pipe enums to HW enums
>   i965g: hook up pipe screen format functions
>   i965g: hook up pipe screen resource functions
>   i965g: add command parser
>   i965g: hook up pipe context flush function
>   i965g: add functions to manage shaders
>   i965g: hook up pipe context state functions
>   i965g: hook up pipe context blit functions
>   i965g: hook up pipe context transfer functions
>   i965g: hook up pipe context query functions
>   i965g: add GEN6 GPE
>   i965g: add GEN6 3D context
>   i965g: hook up pipe context 3D functions
>   i965g: add support for timer/occlusion/primitive queries
>   i965g: hook up pipe context video functions
>   i965g: hook up pipe context GPGPU functions
>   i965g: add a toy shader compiler
>   i965g: compile VS and FS with the toy compiler
>   i965g: support the new driver in various targets
>   i965g: add to --with-gallium-drivers
>
> It is quite self-contained.  If preferred, I can send the patches to the
> list.
>
> Oh, and my account on fdo is disabled because of my own mistake[1].  I
> contacted some of the developers in the thread but did not get any
> response.  Could anyone help me with that, or how do I have it
> re-enabled?
>
> [1] http://lists.freedesktop.org/ar

Re: [Mesa-dev] new i965g pipe driver for Intel GEN6 (and later)

2013-04-16 Thread Chia-I Wu

Hi Matt,

On Wed, Apr 17, 2013 at 12:58 AM, Matt Turner  wrote:

> On Tue, Apr 16, 2013 at 9:45 AM, Chia-I Wu  wrote:
> > If there is no objection, I'd like to merge it in a day or two.
>
> My only objection is over adding a driver that is explicitly a toy,
> the confusion it will cause users, and the developer time it will
> waste. It wasn't uncommon for a user to waste a nontrivial amount of
> someone's time in #intel-gfx only to discover that they were trying to
> use the (old) i965g driver that no one maintained.
>
I think there are two concerns here: the driver is a toy and the driver can
be confusing.  And I agree with both.

But being a toy has its advantages.  For example, if someone sends me a
patch to incorporate beignet backend and hook the driver up with clover
OpenCL state tracker, I will be happy to take the patch.  It might have
loose ends or known issues, but as long as the developer is committed to
fix them, they will be resolved over time.  Same if someone wants to work
on video decoding and etc.

While being a toy, the driver is developed seriously and is always kept in
a usable state.  This is mainly thanks to piglit, to which your team
contribute a lot.  After making changes, I run piglit to make sure there is
no regression.  I even have a spreadsheet documenting why each of the
current 429 failures fails.  There were like ~650 failures last December.
I've fixed quite some, and I plan to keep going.  That is why I haven't
started looking at the performance.

As you can see, I have to reorganize the commits to have a clean history
before attempting to merge it to master.  If I had to wait until the driver
is no longer a toy to merge it, it would lose more of its history by the
time it is finally merged.

I started the driver because I wanted to play with gallium and I did not
have any radeon/nvidia card.  There could be other potential developers who
are in the same situation.  I believe having the driver in master allows
them to get started more easily.

If you could agree with me, then, technically, we need to make the new
driver less confusing.  The driver is disabled by default, of course.  It
can be enabled with --with-gallium-drivers=i965 now.  Would it help if I
change it to something like --enable-gallium-drivers=i965-UNOFFICIAL?  Or
do you have any suggestion, given your experiences dealing with i915g or
the old i965g bug reports?

> I think everything Marek said was correct. If you could extend Gallium
> to consume GLSL IR it might actually be an interesting project.
>
Yes, it is.  I do want to make pipe drivers be able to express the
preferred IR and make the mesa state tracker generate it.  I had LLVM IR in
mind, but GLSL IR could be a much less intrusive choice.  I will check that
out.

-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] new i965g pipe driver for Intel GEN6 (and later)

2013-04-16 Thread Chia-I Wu

Hi Ken,

On Wed, Apr 17, 2013 at 1:18 AM, Kenneth Graunke wrote:

> On 04/16/2013 09:58 AM, Matt Turner wrote:
>
>> On Tue, Apr 16, 2013 at 9:45 AM, Chia-I Wu  wrote:
>>
>>> If there is no objection, I'd like to merge it in a day or two.
>>>
>>
>> My only objection is over adding a driver that is explicitly a toy,
>> the confusion it will cause users, and the developer time it will
>> waste. It wasn't uncommon for a user to waste a nontrivial amount of
>> someone's time in #intel-gfx only to discover that they were trying to
>> use the (old) i965g driver that no one maintained.
>>
>
> That's very true.
>
> I wonder, should i965g be built by default?  Or should you have to
> explicitly request it via --with-gallium-drivers=i965?  The thinking is
> that the default should be to build the drivers most people want to use.
>
> We could also make ./configure print out a warning message saying
> something like: "The Gallium i965 driver is highly experimental and not
> supported by Intel.  Intel recommends using the classic driver
> (--with-dri-drivers=i965)."
>
> Those are just ideas.  I'm open to discussion.

The driver is disabled by default and needs to be enabled via
--with-gallium-drivers=i965.

I explained why I want the driver merged in my reply to Matt.  It looks
like the other concern for your team is that the new driver could be
confusing.  I can imagine that.  I wonder if it can be resolved
technically.  Maybe the same rule can be applied to i915g, if you still get
bug reports for it.

>
>
>  I think everything Marek said was correct. If you could extend Gallium
>> to consume GLSL IR it might actually be an interesting project.
>>
>
> I agree, that would be interesting.  It'd definitely make for a more
> compelling classic vs. gallium comparison.
>
> --Ken
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] new i965g pipe driver for Intel GEN6 (and later)

2013-04-16 Thread Chia-I Wu

On Wed, Apr 17, 2013 at 5:25 AM, Dave Airlie  wrote:

> >> Those are just ideas.  I'm open to discussion.
> >
> > The driver is disabled by default and needs to be enabled via
> > --with-gallium-drivers=i965.
>
> I think a warning + maybe something like
> --with-gallium-drivers=i965g-unofficial might work,
>
Would it be better if I rename i965g to igen (Intel GEN)?  The driver does
not support 965 anyway.

It will be enabled with --with-gallium-drivers=igen, and the DRI driver
will be named igen_dri.so.  Those wanting to give it a try must manually
rename it to i965_dri.so.  Or even better, I can add LIBGL_FORCE_DRIVER to
GLX so that a driver name can be specified.

> the thing is distros should probably be using i915g at this point over
> i915, though it might warrant piglit fixing up, though the chromeos
> guys did a fair bit already.
>
> Dave.
>

-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] new i965g pipe driver for Intel GEN6 (and later)

2013-04-16 Thread Chia-I Wu

On Wed, Apr 17, 2013 at 8:09 AM, Marek Olšák  wrote:
> On Tue, Apr 16, 2013 at 9:58 PM, Chia-I Wu  wrote:
>>
>> On Wed, Apr 17, 2013 at 12:58 AM, Matt Turner  wrote:
>>>
>>> I think everything Marek said was correct. If you could extend Gallium
>>> to consume GLSL IR it might actually be an interesting project.
>>
>> Yes, it is.  I do want to make pipe drivers be able to express the preferred 
>> IR and make the mesa state tracker generate it.  I had LLVM IR in mind, but 
>> GLSL IR could be a much less intrusive choice.  I will check that out.
>
> Actually, there is even a better option. You don't have to fork the
> compiler, you can add source files from src/mesa/drivers/dri/i965 to
> your gallium driver. We have done this before. r300g did not have its
> own compiler for a couple of *years*. The compiler lived in r300c in
> src/mesa/drivers/dri/r300/compiler and it was more or less a separate
> library (it did not depend on gl_context). If r300g was enabled, the
> build system also compiled files in the r300 compiler directory and
> the linker included them in the .so file. r300g also used (and still
> uses) the register allocator from src/mesa/program, which is also used
> by i965.
One thing I want to do next in i965g is to make shader compilation and
compiled shaders opaque for the rest of the driver.  That would make
the code cleaner, and make it easy to switch to different compilers.
It seems the compiler in i965 makes some use of brw_context, but that
is mainly to get hardware capabilities.  It should be possible to make
it more library-like or to fork it for i965g.

How hard would it be to make GLSL IR an alternative IR for pipe
drivers?  On the driver side, it seems I only need to add
PIPE_SHADER_IR_GLSL for the preferred IR and change the type of tokens
to void *.  On the st/mesa side, could I simply skip GLSL-to-TGSI
pass?  Maybe also disable API_OPENGL_COMPAT?

>
> Feel free to use the same approach. It was a big win for us.
>
> Marek



--
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] new i965g pipe driver for Intel GEN6 (and later)

2013-04-17 Thread Chia-I Wu

On Wed, Apr 17, 2013 at 10:26 PM, Brian Paul  wrote:
> On 04/17/2013 02:31 AM, Jose Fonseca wrote:
[snipped]
>>> How hard would it be to make GLSL IR an alternative IR for pipe
>>> drivers?  On the driver side, it seems I only need to add
>>> PIPE_SHADER_IR_GLSL for the preferred IR and change the type of tokens
>>> to void *.  On the st/mesa side, could I simply skip GLSL-to-TGSI
>>> pass?  Maybe also disable API_OPENGL_COMPAT?
>>
>>
>> Fully replacing TGSI would also imply providing an alternative to
>> src/gallium/auxiliary/util/u_simple_shaders.h , for clears and blits.  One
>> can easily imagining one day these helpers taking an additional
>> PIPE_SHADER_IR_* parameter. I think this would be the ideal.
>
>
> Olv, I presume you're not using the 'draw' module.  There's lots of TGSI
> dependencies in there too.
No, I don't need the module.
>> In the short term, you could workaround that need either by keeping enough
>> TGSI support just for those cases, or maybe by tweaking the mesa state
>> tracker to use the src/mesa/drivers/common/meta.c for clears.
>
>
> Unfortunately, the meta code has a few dependencies on swrast, which isn't
> used with any gallium configuration.
>
>
>
>> Brian can give a better assessment, but my understanding is that meta
>> module doesn't have as tight control over the driver interface as the mesa
>> gallium state tracker has, so bugs due to non-default/tampered state tend to
>> creep up now and then.
>
>
> Can you keep the TGSI support in place alongside GLSL IR?  That is, keep the
> TGSI support as-is.  Provide GLSL IR where possible.  In pipe_shader_state,
> add (a) new field(s) to carry GLSL IR.  When pipe_shader_state has GLSL IR,
> use it.  Else, fall back to using TGSI.
>
> Ideally, everything would continue to run throughout the process but you'd
> gradually transition everything to GLSL IR.  When you get to 100% of the
> later, remove the TGSI support.
Yes, that should be a better plan.  At first, I will try to figure out
where in the state tracker depends on TGSI and try to move the code
together to some new files.  Then we know clearly what needs to be
replaced to support an alternative IR, and we can have both IRs live
together for a while.

> But in addition to the GLSL IR code itself, there's other stuff that
> interfaces with it, like constant buffer layout.  I'm really not too aware
> of how that's handled at the moment.
>
> -Brian



--
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] renaming i965g

2013-04-17 Thread Chia-I Wu

Hi list,

Per the discussion, i965g is confusing and misleading.  Instead of
preventing the confusions via

  --with-gallium-drivers=i965g-unofficial or
  --with-gallium-drivers=experimental-i965g,

which kind of makes i965g a second-class citizen, I think it is better
to rename it.  Then whenever one sees i965, be it on mailing list or
in a commit message, we know it refers to the official driver that
Intel supports.

Let's say the new name is iFoo.  The gallium driver will be enabled
via --with-gallium-drivers=iFoo, and the DRI driver will be named
iFoo_dri.so.  Users will need to rename it to use it, and I hope this
extra step will avoid the confusions.

I'd like to call it igen, so that I do not need to reflow the code
after running sed -i 's/i965/igen/g'.  But I am pretty bad at naming.
If this approach or the name is frowned upon, please let me know.

--
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] renaming i965g

2013-04-18 Thread Chia-I Wu

On Thu, Apr 18, 2013 at 11:36 PM, Eric Anholt  wrote:
> Chia-I Wu  writes:
>
>> Hi list,
>>
>> Per the discussion, i965g is confusing and misleading.  Instead of
>> preventing the confusions via
>>
>>   --with-gallium-drivers=i965g-unofficial or
>>   --with-gallium-drivers=experimental-i965g,
>>
>> which kind of makes i965g a second-class citizen, I think it is better
>> to rename it.  Then whenever one sees i965, be it on mailing list or
>> in a commit message, we know it refers to the official driver that
>> Intel supports.
>>
>> Let's say the new name is iFoo.  The gallium driver will be enabled
>> via --with-gallium-drivers=iFoo, and the DRI driver will be named
>> iFoo_dri.so.  Users will need to rename it to use it, and I hope this
>> extra step will avoid the confusions.
>>
>> I'd like to call it igen, so that I do not need to reflow the code
>> after running sed -i 's/i965/igen/g'.  But I am pretty bad at naming.
>> If this approach or the name is frowned upon, please let me know.
> But what you're building isn't an Intel gen graphics driver, right?
> You're just targeting gen7.  Are you planning on extending back to gen2?
>
> I think you should explicitly mark it experimental/unoffficial in the
> configure switch.  Otherwise build-it-themselves users see both options,
> and having heard about gallium, assume the gallium one is what they
> should be using, and then they suffer.
I should come up with another name then, one that does not imply if
the driver supports some GPUs.

In the scenario you described, a user knows he/she can choose from
either the gallium one or the classic one only because the drivers
have the same name.  If the user has to choose between
--with-dri-drivers=i965 and --with-gallium-drivers=ironman for his
Intel GPU, he sure will choose i965.  If he does choose ironman
without knowing what it is, firstly, the driver will not be loaded
because it is called ironman_dri.so.  Secondly, the keyword ironman
will appear in the bug reports and you will know they are not your
bugs.

--
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965: Avoid extraneous fast depth clears

2013-12-10 Thread Chia-I Wu

When the depth buffer is already cleared, skip GEN6_HIZ_OP_DEPTH_CLEAR.  This
is made possible by tracking which slices have been cleared in
"struct intel_mipmap_level".  The hiz_cleared flag is unset when the depth
buffer is rendered to or when a HiZ resolve is needed.

For Unigine Tropics, the FPS improvement is 1.32134% +/- 0.161878% (n=13).
---
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp  |  1 +
 src/mesa/drivers/dri/i965/brw_clear.c | 58 +--
 src/mesa/drivers/dri/i965/brw_draw.c  | 16 +++-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 37 +
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h | 21 ++
 5 files changed, 119 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp 
b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 51a3bef..d9ec3e9 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -173,6 +173,7 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
brw_blorp_exec(brw, &params);
 
intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_layer);
+   intel_miptree_slice_set_hiz_cleared(dst_mt, dst_level, dst_layer, false);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c 
b/src/mesa/drivers/dri/i965/brw_clear.c
index 1cac996..9dfb94a 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -164,34 +164,66 @@ brw_fast_clear_depth(struct gl_context *ctx)
   break;
}
 
+   unsigned num_layers_cleared = 0;
+   bool clear_all_layers = false;
+
/* If we're clearing to a new clear value, then we need to resolve any clear
 * flags out of the HiZ buffer into the real depth buffer.
 */
if (mt->depth_clear_value != depth_clear_value) {
   intel_miptree_all_slices_resolve_depth(brw, mt);
   mt->depth_clear_value = depth_clear_value;
-   }
 
-   /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
-*
-* "If other rendering operations have preceded this clear, a
-*  PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
-*  must be issued before the rectangle primitive used for the depth
-*  buffer clear operation.
-*/
-   intel_batchbuffer_emit_mi_flush(brw);
+  clear_all_layers = true;
+   }
 
if (fb->NumLayers > 0) {
   assert(fb->NumLayers == depth_irb->mt->level[depth_irb->mt_level].depth);
   for (unsigned layer = 0; layer < fb->NumLayers; layer++) {
- intel_hiz_exec(brw, mt, depth_irb->mt_level, layer,
-GEN6_HIZ_OP_DEPTH_CLEAR);
+ if (clear_all_layers ||
+ !intel_miptree_slice_get_hiz_cleared(mt,
+  depth_irb->mt_level,
+  layer)) {
+/* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+ *
+ * "If other rendering operations have preceded this clear, a
+ *  PIPE_CONTROL with write cache flush enabled and Z-inhibit
+ *  disabled must be issued before the rectangle primitive used
+ *  for the depth buffer clear operation.
+ */
+if (!num_layers_cleared)
+   intel_batchbuffer_emit_mi_flush(brw);
+
+intel_hiz_exec(brw, mt, depth_irb->mt_level, layer,
+   GEN6_HIZ_OP_DEPTH_CLEAR);
+
+intel_miptree_slice_set_hiz_cleared(mt,
+depth_irb->mt_level,
+layer,
+true);
+num_layers_cleared++;
+ }
   }
} else {
-  intel_hiz_exec(brw, mt, depth_irb->mt_level, depth_irb->mt_layer,
- GEN6_HIZ_OP_DEPTH_CLEAR);
+  if (clear_all_layers ||
+  !intel_miptree_slice_get_hiz_cleared(mt,
+   depth_irb->mt_level,
+   depth_irb->mt_layer)) {
+ intel_batchbuffer_emit_mi_flush(brw);
+ intel_hiz_exec(brw, mt, depth_irb->mt_level, depth_irb->mt_layer,
+GEN6_HIZ_OP_DEPTH_CLEAR);
+
+ intel_miptree_slice_set_hiz_cleared(mt,
+ depth_irb->mt_level,
+ depth_irb->mt_layer,
+ true);
+ num_layers_cleared = 1;
+  }
}
 
+   if (!num_layers_cleared)
+  return true;
+
if (brw->gen == 6) {
   /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
*
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c 
b/src/mesa/drivers/dri/i965/brw_draw.c
index b898cd3..4ebfe44 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -363,8 +363,22 @@ static void brw_postdra

Re: [Mesa-dev] [PATCH] ilo: build pipe-loader driver]

2014-01-02 Thread Chia-I Wu

On Thu, Jan 2, 2014 at 10:39 PM, Steven Newbury  wrote:
> Forgot to add signed-off-by...
>
> In trying to get gallium-nine working with the ilo Gallium driver I
> noticed there's no ilo pipe-loader driver being built.
>
> This patch simply puts in place the missing pieces.
>
> The driver descriptor is named "ilo", rather than "i965" as the ilo DRI
> driver currently names itself, this is necessary as otherwise the
> pipe-loader refuses to load as it has a sanity check verifying the name
> matches.  A follow-up patch renames the ilo DRI driver descriptor to
> match.
>
> Signed-off-by: Steven Newbury 
> ---
>  include/pci_ids/pci_id_driver_map.h |  4 +++-
>  src/gallium/targets/pipe-loader/Makefile.am | 17 +
>  src/gallium/targets/pipe-loader/pipe_ilo.c  | 27 +++
>  3 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/include/pci_ids/pci_id_driver_map.h 
> b/include/pci_ids/pci_id_driver_map.h
> index 8a97c6f..1fb0467 100644
> --- a/include/pci_ids/pci_id_driver_map.h
> +++ b/include/pci_ids/pci_id_driver_map.h
> @@ -64,10 +64,12 @@ static const struct {
> int num_chips_ids;
>  } driver_map[] = {
> { 0x8086, "i915", i915_chip_ids, ARRAY_SIZE(i915_chip_ids) },
> -   { 0x8086, "i965", i965_chip_ids, ARRAY_SIZE(i965_chip_ids) },
>  #ifndef DRIVER_MAP_GALLIUM_ONLY
> +   { 0x8086, "i965", i965_chip_ids, ARRAY_SIZE(i965_chip_ids) },
> { 0x1002, "radeon", r100_chip_ids, ARRAY_SIZE(r100_chip_ids) },
> { 0x1002, "r200", r200_chip_ids, ARRAY_SIZE(r200_chip_ids) },
> +#else
> +   { 0x8086, "ilo", i965_chip_ids, ARRAY_SIZE(i965_chip_ids) },
>  #endif
Moving "i965" into the #ifndef looks correct to me, but having "ilo"
in the #else looks hacky.  For in this map, "ilo" should be always
defined by definition, and supports a subset of i965_chip_ids.

I am actually in favor of an environment variable that overrides the
auto-detection of the driver in the pipe loader, thus skipping this
map.

> { 0x1002, "r300", r300_chip_ids, ARRAY_SIZE(r300_chip_ids) },
> { 0x1002, "r600", r600_chip_ids, ARRAY_SIZE(r600_chip_ids) },
> diff --git a/src/gallium/targets/pipe-loader/Makefile.am 
> b/src/gallium/targets/pipe-loader/Makefile.am
> index 6875453..8fa3873 100644
> --- a/src/gallium/targets/pipe-loader/Makefile.am
> +++ b/src/gallium/targets/pipe-loader/Makefile.am
> @@ -47,6 +47,23 @@ PIPE_LIBS = \
> -lpthread \
> -lm
>
> +if HAVE_GALLIUM_ILO
> +pipe_LTLIBRARIES += pipe_ilo.la
> +pipe_ilo_la_SOURCES = pipe_ilo.c
> +pipe_ilo_la_LIBADD = \
> +   $(PIPE_LIBS) \
> +   $(top_builddir)/src/gallium/winsys/intel/drm/libintelwinsys.la \
> +   $(top_builddir)/src/gallium/drivers/ilo/libilo.la \
> +   $(LIBDRM_LIBS) \
> +   $(INTEL_LIBS)
> +pipe_ilo_la_LDFLAGS = -no-undefined -avoid-version -module
> +if HAVE_MESA_LLVM
> +nodist_EXTRA_pipe_ilo_la_SOURCES = dummy.cpp
> +pipe_ilo_la_LIBADD += $(LLVM_LIBS)
> +pipe_ilo_la_LDFLAGS += $(LLVM_LDFLAGS)
> +endif
> +endif
> +
>  if HAVE_GALLIUM_I915
>  pipe_LTLIBRARIES += pipe_i915.la
>  pipe_i915_la_SOURCES = pipe_i915.c
> diff --git a/src/gallium/targets/pipe-loader/pipe_ilo.c 
> b/src/gallium/targets/pipe-loader/pipe_ilo.c
> new file mode 100644
> index 0000000..11be2d1
> --- /dev/null
> +++ b/src/gallium/targets/pipe-loader/pipe_ilo.c
> @@ -0,0 +1,27 @@
> +
> +#include "target-helpers/inline_debug_helper.h"
> +#include "state_tracker/drm_driver.h"
> +#include "intel/intel_winsys.h"
> +#include "ilo/ilo_public.h"
> +
> +static struct pipe_screen *
> +create_screen(int fd)
> +{
> +   struct intel_winsys *iws;
> +   struct pipe_screen *screen;
> +
> +   iws = intel_winsys_create_for_fd(fd);
> +   if (!iws)
> +  return NULL;
> +
> +   screen = ilo_screen_create(iws);
> +   if (!screen)
> +  return NULL;
> +
> +   screen = debug_screen_wrap(screen);
> +
> +   return screen;
> +}
> +
> +PUBLIC
> +DRM_DRIVER_DESCRIPTOR("ilo", "i915", create_screen, NULL)
>
>
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCHv2] i965: Avoid extraneous fast depth clears

2014-01-02 Thread Chia-I Wu

When the depth buffer is already cleared, skip GEN6_HIZ_OP_DEPTH_CLEAR.  This
is made possible by tracking which slices have been cleared in
"struct intel_mipmap_level".  The hiz_cleared flag is unset when the depth
buffer is rendered to or when a HiZ resolve is needed.

For Unigine Tropics, the FPS improvement is 1.32134% +/- 0.161878% (n=13).

v2:
- unset hiz_cleared automatically in intel_miptree_slice_set_needs_hiz_resolve
- set/unset hiz_cleared with intel_renderbuffer_att_set_needs_depth_resolve

Signed-off-by: Chia-I Wu 
---
 src/mesa/drivers/dri/i965/brw_clear.c | 54 +++
 src/mesa/drivers/dri/i965/brw_draw.c  |  2 +-
 src/mesa/drivers/dri/i965/intel_fbo.c | 18 -
 src/mesa/drivers/dri/i965/intel_fbo.h |  4 +-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 39 +++
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h | 21 +++
 6 files changed, 118 insertions(+), 20 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_clear.c 
b/src/mesa/drivers/dri/i965/brw_clear.c
index 1cac996..8622584 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -164,34 +164,58 @@ brw_fast_clear_depth(struct gl_context *ctx)
   break;
}
 
+   unsigned num_layers_cleared = 0;
+   bool clear_all_layers = false;
+
/* If we're clearing to a new clear value, then we need to resolve any clear
 * flags out of the HiZ buffer into the real depth buffer.
 */
if (mt->depth_clear_value != depth_clear_value) {
   intel_miptree_all_slices_resolve_depth(brw, mt);
   mt->depth_clear_value = depth_clear_value;
-   }
 
-   /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
-*
-* "If other rendering operations have preceded this clear, a
-*  PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
-*  must be issued before the rectangle primitive used for the depth
-*  buffer clear operation.
-*/
-   intel_batchbuffer_emit_mi_flush(brw);
+  clear_all_layers = true;
+   }
 
if (fb->NumLayers > 0) {
   assert(fb->NumLayers == depth_irb->mt->level[depth_irb->mt_level].depth);
   for (unsigned layer = 0; layer < fb->NumLayers; layer++) {
- intel_hiz_exec(brw, mt, depth_irb->mt_level, layer,
-GEN6_HIZ_OP_DEPTH_CLEAR);
+ if (clear_all_layers ||
+ !intel_miptree_slice_get_hiz_cleared(mt,
+  depth_irb->mt_level,
+  layer)) {
+/* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+ *
+ * "If other rendering operations have preceded this clear, a
+ *  PIPE_CONTROL with write cache flush enabled and Z-inhibit
+ *  disabled must be issued before the rectangle primitive used
+ *  for the depth buffer clear operation.
+ */
+if (num_layers_cleared == 0)
+   intel_batchbuffer_emit_mi_flush(brw);
+
+intel_hiz_exec(brw, mt, depth_irb->mt_level, layer,
+   GEN6_HIZ_OP_DEPTH_CLEAR);
+
+num_layers_cleared++;
+ }
   }
} else {
-  intel_hiz_exec(brw, mt, depth_irb->mt_level, depth_irb->mt_layer,
- GEN6_HIZ_OP_DEPTH_CLEAR);
+  if (clear_all_layers ||
+  !intel_miptree_slice_get_hiz_cleared(mt,
+   depth_irb->mt_level,
+   depth_irb->mt_layer)) {
+ intel_batchbuffer_emit_mi_flush(brw);
+ intel_hiz_exec(brw, mt, depth_irb->mt_level, depth_irb->mt_layer,
+GEN6_HIZ_OP_DEPTH_CLEAR);
+
+ num_layers_cleared = 1;
+  }
}
 
+   if (num_layers_cleared == 0)
+  return true;
+
if (brw->gen == 6) {
   /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
*
@@ -203,9 +227,9 @@ brw_fast_clear_depth(struct gl_context *ctx)
}
 
/* Now, the HiZ buffer contains data that needs to be resolved to the depth
-* buffer.
+* buffer.  And set its cleared state to avoid unnecessary clears.
 */
-   intel_renderbuffer_att_set_needs_depth_resolve(depth_att);
+   intel_renderbuffer_att_set_needs_depth_resolve(depth_att, true);
 
return true;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c 
b/src/mesa/drivers/dri/i965/brw_draw.c
index b898cd3..2138174 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -364,7 +364,7 @@ static void brw_postdraw_set_buffers_need_resolve(struct 
brw_context *brw)
if (back_irb)
   intel_renderbuffer_set_needs_downsample(back_irb);
if (depth_irb && ctx->Depth.Mask)
-  intel_renderbuffer_att_set_needs_depth_resolve(depth_att

Re: [Mesa-dev] [PATCH] i965: Avoid extraneous fast depth clears

2014-01-02 Thread Chia-I Wu

On Fri, Dec 27, 2013 at 7:25 AM, Chad Versace
 wrote:
> On 12/10/2013 09:54 PM, Chia-I Wu wrote:
>>
>> When the depth buffer is already cleared, skip GEN6_HIZ_OP_DEPTH_CLEAR.
>> This
>> is made possible by tracking which slices have been cleared in
>> "struct intel_mipmap_level".  The hiz_cleared flag is unset when the depth
>> buffer is rendered to or when a HiZ resolve is needed.
>>
>> For Unigine Tropics, the FPS improvement is 1.32134% +/- 0.161878% (n=13).
>
>
> The code looks correct to me, and the perf improvement is nice. I have
> comments below that should improve the maintainability of the affected
> codepaths.
>
>
>> ---
>>   src/mesa/drivers/dri/i965/brw_blorp_blit.cpp  |  1 +
>>   src/mesa/drivers/dri/i965/brw_clear.c | 58
>> +--
>>   src/mesa/drivers/dri/i965/brw_draw.c  | 16 +++-
>>   src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 37 +
>>   src/mesa/drivers/dri/i965/intel_mipmap_tree.h | 21 ++
>>   5 files changed, 119 insertions(+), 14 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
>> b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
>> index 51a3bef..d9ec3e9 100644
>> --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
>> @@ -173,6 +173,7 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
>>  brw_blorp_exec(brw, &params);
>>
>>  intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level,
>> dst_layer);
>> +   intel_miptree_slice_set_hiz_cleared(dst_mt, dst_level, dst_layer,
>> false);
>
>
> If the miptree slice needs a hiz resolve, then the hiz buffer is not
> cleared. You captured
> this invariant by appending ``intel_miptree_slice_set_hiz_cleared(false)``
> to each occurrence
> ``intel_miptree_slice_set_needs_hiz_resolve()``.
>
> In effect, this patch introduces the requirement that all calls to
> ``intel_miptree_slice_set_needs_hiz_resolve()``
> be followed by ``intel_miptree_slice_set_hiz_cleared(false)``. Rather than
> introducing an implicit
> requirement, ``intel_miptree_slice_set_needs_hiz_resolve()`` should
> automatically set ``hiz_cleared = false``.
>
>
>>   }
>>
>>   static void
>> diff --git a/src/mesa/drivers/dri/i965/brw_clear.c
>> b/src/mesa/drivers/dri/i965/brw_clear.c
>> index 1cac996..9dfb94a 100644
>> --- a/src/mesa/drivers/dri/i965/brw_clear.c
>> +++ b/src/mesa/drivers/dri/i965/brw_clear.c
>> @@ -164,34 +164,66 @@ brw_fast_clear_depth(struct gl_context *ctx)
>> break;
>>  }
>>
>> +   unsigned num_layers_cleared = 0;
>> +   bool clear_all_layers = false;
>> +
>>  /* If we're clearing to a new clear value, then we need to resolve
>> any clear
>>   * flags out of the HiZ buffer into the real depth buffer.
>>   */
>>  if (mt->depth_clear_value != depth_clear_value) {
>> intel_miptree_all_slices_resolve_depth(brw, mt);
>> mt->depth_clear_value = depth_clear_value;
>> -   }
>>
>> -   /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
>> -*
>> -* "If other rendering operations have preceded this clear, a
>> -*  PIPE_CONTROL with write cache flush enabled and Z-inhibit
>> disabled
>> -*  must be issued before the rectangle primitive used for the
>> depth
>> -*  buffer clear operation.
>> -*/
>> -   intel_batchbuffer_emit_mi_flush(brw);
>> +  clear_all_layers = true;
>> +   }
>>
>>  if (fb->NumLayers > 0) {
>> assert(fb->NumLayers ==
>> depth_irb->mt->level[depth_irb->mt_level].depth);
>> for (unsigned layer = 0; layer < fb->NumLayers; layer++) {
>> - intel_hiz_exec(brw, mt, depth_irb->mt_level, layer,
>> -GEN6_HIZ_OP_DEPTH_CLEAR);
>> + if (clear_all_layers ||
>> + !intel_miptree_slice_get_hiz_cleared(mt,
>> +  depth_irb->mt_level,
>> +  layer)) {
>> +/* From the Sandy Bridge PRM, volume 2 part 1, page 313:
>> + *
>> + * "If other rendering operations have preceded this
>> clear, a
>> + *  PIPE_CONTROL with write cache flush enabled and
>> Z-inhibit
>> + *  disabled must be issued before the rectangle
>> primitive used
>> +

Re: [Mesa-dev] [PATCH] ilo: build pipe-loader driver]

2014-01-08 Thread Chia-I Wu

On Wed, Jan 8, 2014 at 8:23 PM, Steven Newbury  wrote:
> On Fri, 2014-01-03 at 13:14 +0800, Chia-I Wu wrote:
>> On Thu, Jan 2, 2014 at 10:39 PM, Steven Newbury  
>> wrote:
>> > Forgot to add signed-off-by...
>> >
>> > In trying to get gallium-nine working with the ilo Gallium driver I
>> > noticed there's no ilo pipe-loader driver being built.
>> >
>> > This patch simply puts in place the missing pieces.
>> >
>> > The driver descriptor is named "ilo", rather than "i965" as the ilo DRI
>> > driver currently names itself, this is necessary as otherwise the
>> > pipe-loader refuses to load as it has a sanity check verifying the name
>> > matches.  A follow-up patch renames the ilo DRI driver descriptor to
>> > match.
>> >
>> > Signed-off-by: Steven Newbury 
>> > ---
>> >  include/pci_ids/pci_id_driver_map.h |  4 +++-
>> >  src/gallium/targets/pipe-loader/Makefile.am | 17 +
>> >  src/gallium/targets/pipe-loader/pipe_ilo.c  | 27 
>> > +++
>> >  3 files changed, 47 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/include/pci_ids/pci_id_driver_map.h 
>> > b/include/pci_ids/pci_id_driver_map.h
>> > index 8a97c6f..1fb0467 100644
>> > --- a/include/pci_ids/pci_id_driver_map.h
>> > +++ b/include/pci_ids/pci_id_driver_map.h
>> > @@ -64,10 +64,12 @@ static const struct {
>> > int num_chips_ids;
>> >  } driver_map[] = {
>> > { 0x8086, "i915", i915_chip_ids, ARRAY_SIZE(i915_chip_ids) },
>> > -   { 0x8086, "i965", i965_chip_ids, ARRAY_SIZE(i965_chip_ids) },
>> >  #ifndef DRIVER_MAP_GALLIUM_ONLY
>> > +   { 0x8086, "i965", i965_chip_ids, ARRAY_SIZE(i965_chip_ids) },
>> > { 0x1002, "radeon", r100_chip_ids, ARRAY_SIZE(r100_chip_ids) },
>> > { 0x1002, "r200", r200_chip_ids, ARRAY_SIZE(r200_chip_ids) },
>> > +#else
>> > +   { 0x8086, "ilo", i965_chip_ids, ARRAY_SIZE(i965_chip_ids) },
>> >  #endif
>> Moving "i965" into the #ifndef looks correct to me, but having "ilo"
>> in the #else looks hacky.  For in this map, "ilo" should be always
>> defined by definition, and supports a subset of i965_chip_ids.
>>
> I guess it is hacky in the sense that without DRIVER_MAP_GALLIUM_ONLY it
> should return all drivers, but there is no provision (nor use) for
> returning two drivers for an opened drm device.  You're absolutely right
> though, I should check which devices are actually supported and create a
> new array for "ilo_chip_ids".
There is no rule as to what to do when two drivers support the same
devices.  In practice, if you list ilo after i965, and move i965 into
the #ifdef, things may work for your need.  But it should not be
relied on.

IMO, pipe loader uses the driver map to auto-detect the driver.
Having an environment variable to skip auto-detection makes sense
(e.g., to load the driver from a different path, or to force loading
the driver for an unknown device, etc.).


>
>> I am actually in favor of an environment variable that overrides the
>> auto-detection of the driver in the pipe loader, thus skipping this
>> map.
> This does remind me of related, perhaps overlapping issue.  From the
> point of view of the DRI extension there's already a specified (or
> default) DRI driver which gets returned in DRI2Connect().  Perhaps this
> is the place to read the driver from except as far as I can tell it
> provides no override from a running Xserver.
>
> The ability to be able to use an alternative driver is really useful for
> ilo.  An environment variable override is definitely a good idea, I'm
> not sure about skipping the map entirely and not having a default
> though.
>
>>
>> > { 0x1002, "r300", r300_chip_ids, ARRAY_SIZE(r300_chip_ids) },
>> > { 0x1002, "r600", r600_chip_ids, ARRAY_SIZE(r600_chip_ids) },
>> > diff --git a/src/gallium/targets/pipe-loader/Makefile.am 
>> > b/src/gallium/targets/pipe-loader/Makefile.am
>> > index 6875453..8fa3873 100644
>> > --- a/src/gallium/targets/pipe-loader/Makefile.am
>> > +++ b/src/gallium/targets/pipe-loader/Makefile.am
>> > @@ -47,6 +47,23 @@ PIPE_LIBS = \
>> > -lpthread \
>> > -lm
>> >
>> > +if HAVE_GALLIUM_ILO
>> > +pipe_LTLIBRARIES += pipe_ilo.la
>> > +pipe_ilo_la_SOURCES = pipe_ilo.c
>> > +pipe_ilo_la_LIBADD = \
>> > +   $(PIPE_LIBS) \
>

Re: [Mesa-dev] [PATCH 08/16] ilo: Set PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT to 64

2014-01-23 Thread Chia-I Wu

On Thu, Jan 23, 2014 at 12:48 PM, Ian Romanick  wrote:
> From: Siavash Eliasi 
>
> Ian manually ran the map_buffer_range* tests and the
> arb_map_buffer_alignment-* tests, but he did not do a full piglit run.
Thanks for going ahead and testing.  The patch is

 Reviewed-by: Chia-I Wu 

> v2 (idr): Use 64 instead of 4096
>
> Tested-by: Ian Romanick 
> Cc: Chia-I Wu 
> ---
>  src/gallium/drivers/ilo/ilo_screen.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/ilo/ilo_screen.c 
> b/src/gallium/drivers/ilo/ilo_screen.c
> index 13a0be5..1443ba3 100644
> --- a/src/gallium/drivers/ilo/ilo_screen.c
> +++ b/src/gallium/drivers/ilo/ilo_screen.c
> @@ -408,7 +408,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap 
> param)
> case PIPE_CAP_TEXTURE_MULTISAMPLE:
>return false; /* TODO */
> case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
> -  return 0;
> +  return 64;
> case PIPE_CAP_CUBE_MAP_ARRAY:
> case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
>return true;
> --
> 1.8.1.4
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965/vec4: fix record clearing in copy propagation

2014-04-06 Thread Chia-I Wu

From: Chia-I Wu 

Given

  mov vgrf7, vgrf9.xyxz
  add vgrf9.xyz, vgrf4.xyzw, vgrf5.xyzw
  add vgrf10.x, vgrf6.xyzw, vgrf7.wwww

the last instruction would be wrongly changed to

  add vgrf10.x, vgrf6.xyzw, vgrf9.zzzz

during copy propagation.

The issue is that when deciding if a record should be cleared, the old code
checked for

  inst->dst.writemask & (1 << ch)

instead of

  inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch))

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=76749
Signed-off-by: Chia-I Wu 
Cc: Jordan Justen 
Cc: Matt Turner 
---
 .../drivers/dri/i965/brw_vec4_copy_propagation.cpp  | 21 -
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
index 3d68f0e..83cf191 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
@@ -58,6 +58,21 @@ is_dominated_by_previous_instruction(vec4_instruction *inst)
 }
 
 static bool
+is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
+{
+   const src_reg *src = values[ch];
+
+   /* consider GRF only */
+   assert(inst->dst.file == GRF);
+   if (!src || src->file != GRF)
+  return false;
+
+   return (src->reg == inst->dst.reg &&
+  src->reg_offset == inst->dst.reg_offset &&
+  inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch)));
+}
+
+static bool
 try_constant_propagation(vec4_instruction *inst, int arg, src_reg *values[4])
 {
/* For constant propagation, we only handle the same constant
@@ -357,11 +372,7 @@ vec4_visitor::opt_copy_propagation()
 else {
for (int i = 0; i < virtual_grf_reg_count; i++) {
   for (int j = 0; j < 4; j++) {
- if (inst->dst.writemask & (1 << j) &&
- cur_value[i][j] &&
- cur_value[i][j]->file == GRF &&
- cur_value[i][j]->reg == inst->dst.reg &&
- cur_value[i][j]->reg_offset == inst->dst.reg_offset) {
+ if (is_channel_updated(inst, cur_value[i], j)){
 cur_value[i][j] = NULL;
  }
   }
-- 
1.8.5.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] i965/vec4: fix record clearing in copy propagation

2014-04-08 Thread Chia-I Wu

On Tue, Apr 8, 2014 at 2:52 AM, Eric Anholt  wrote:
> Chia-I Wu  writes:
>
>> From: Chia-I Wu 
>>
>> Given
>>
>>   mov vgrf7, vgrf9.xyxz
>>   add vgrf9.xyz, vgrf4.xyzw, vgrf5.xyzw
>>   add vgrf10.x, vgrf6.xyzw, vgrf7.wwww
>>
>> the last instruction would be wrongly changed to
>>
>>   add vgrf10.x, vgrf6.xyzw, vgrf9.zzzz
>>
>> during copy propagation.
>>
>> The issue is that when deciding if a record should be cleared, the old code
>> checked for
>>
>>   inst->dst.writemask & (1 << ch)
>>
>> instead of
>>
>>   inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch))
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=76749
>> Signed-off-by: Chia-I Wu 
>> Cc: Jordan Justen 
>> Cc: Matt Turner 
>
> Reviewed-by: Eric Anholt 
>
> I've just sent a patch for adding a unit test of copy propagation -- it
> would be nice to see an additional test on top of that for this bug.
Nice.  I will do that maybe tomorrow.


-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 2/2] i965: Disable Z16 in all APIs.

2014-04-14 Thread Chia-I Wu

On Mon, Apr 14, 2014 at 1:04 PM, Kenneth Graunke  wrote:
> We originally thought that GL 3.0 required GL_DEPTH_COMPONENT16 to map
> exactly to Z16.  However, we misread the specification, thanks in part
> to LaTeX reordering the tables in the PDF.
>
> Page 180 of the GL 3.0 specification (glspec30.20080923.pdf) says:
> "[...] memory allocation per texture component is assigned by the GL to
> match the allocations listed in tables 3.16-3.18 as closely as possible.
> [...]
>
> Required Texture Formats
> [...]
> In addition, implementations are required to support the following sized
> internal formats.  Requesting one of these internal formats for any
> texture type will allocate exactly the internal component sizes and
> types shown for that format in tables 3.16-3.17:"
>
> Notably, however, GL_DEPTH_COMPONENT16 does /not/ appear in table 3.16
> or table 3.17.  It appears in table 3.18, where the "exact" rule doesn't
> apply, and it falls back to the "closely as possible" rule.
>
> The confusing part is that the ordering of the tables in the PDF is:
>
> Table 3.16 (pages 182-184)
> Table 3.18 (bottom of page 184 to top of 185)
> Table 3.17 (page 185)
>
> Presumably, people saw table 3.16, then saw the table immediately
> following with DEPTH_COMPONENT* formats, and assumed it was 3.17.
>
> Based on a batch by Chia-I Wu, but without the driconf option to force
s/batch/patch/

Both patches look good to me.  Unless I overlooked your patch for
piglit, this is needed

http://lists.freedesktop.org/archives/piglit/2014-February/009650.html

to avoid a false regression.  It would be great if you could review
and commit the piglit fix, as I do not have commit access.

> Z16 to be used.  It's not required, and there's apparently no benefit
> to actually using it.
>
> Signed-off-by: Kenneth Graunke 
> ---
>  src/mesa/drivers/dri/i965/brw_surface_formats.c | 6 --
>  1 file changed, 6 deletions(-)
>
> Sorry if this is a duplicate of an earlier patch...the only one I could
> find in my inbox was the one with the driconf option.
>
> diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c 
> b/src/mesa/drivers/dri/i965/brw_surface_formats.c
> index 196f139..5907dd9 100644
> --- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
> +++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
> @@ -608,7 +608,6 @@ brw_init_surface_formats(struct brw_context *brw)
> brw->format_supported_as_render_target[MESA_FORMAT_Z24_UNORM_S8_UINT] = 
> true;
> brw->format_supported_as_render_target[MESA_FORMAT_Z24_UNORM_X8_UINT] = 
> true;
> brw->format_supported_as_render_target[MESA_FORMAT_S_UINT8] = true;
> -   brw->format_supported_as_render_target[MESA_FORMAT_Z_UNORM16] = true;
> brw->format_supported_as_render_target[MESA_FORMAT_Z_FLOAT32] = true;
> brw->format_supported_as_render_target[MESA_FORMAT_Z32_FLOAT_S8X24_UINT] 
> = true;
>
> @@ -630,12 +629,7 @@ brw_init_surface_formats(struct brw_context *brw)
>  *
>  * Other speculation is that we may be hitting increased fragment shader
>  * execution from GL_LEQUAL/GL_EQUAL depth tests at reduced precision.
> -*
> -* However, desktop GL 3.0+ require that you get exactly 16 bits when
> -* asking for DEPTH_COMPONENT16, so we have to respect that.
>  */
> -   if (_mesa_is_desktop_gl(ctx))
> -  ctx->TextureFormatSupported[MESA_FORMAT_Z_UNORM16] = true;
>
> /* On hardware that lacks support for ETC1, we map ETC1 to RGBX
>  * during glCompressedTexImage2D(). See intel_mipmap_tree::wraps_etc1.
> --
> 1.9.2
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965/vec4: unit test for copy propagation and writemask

2014-04-14 Thread Chia-I Wu

This unit test demonstrates a subtle bug fixed by
4ddf51db6af36736d5d42c1043eeea86e47459ce.

Signed-off-by: Chia-I Wu 
Cc: Eric Anholt 
---
 .../dri/i965/test_vec4_copy_propagation.cpp| 30 ++
 1 file changed, 30 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp 
b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index cb70096..fd517f8 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -154,3 +154,33 @@ TEST_F(copy_propagation_test, test_swizzle_swizzle)
 SWIZZLE_X,
 SWIZZLE_Y));
 }
+
+TEST_F(copy_propagation_test, test_swizzle_writemask)
+{
+   dst_reg a = dst_reg(v, glsl_type::vec4_type);
+   dst_reg b = dst_reg(v, glsl_type::vec4_type);
+   dst_reg c = dst_reg(v, glsl_type::vec4_type);
+
+   v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(SWIZZLE_X,
+  SWIZZLE_Y,
+  SWIZZLE_X,
+  SWIZZLE_Z))));
+
+   v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), src_reg(1.0f)));
+
+   vec4_instruction *test_mov =
+  v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(SWIZZLE_W,
+ SWIZZLE_W,
+ SWIZZLE_W,
+ SWIZZLE_W)));
+   v->emit(test_mov);
+
+   copy_propagation(v);
+
+   /* should not copy propagate */
+   EXPECT_EQ(test_mov->src[0].reg, b.reg);
+   EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_W,
+SWIZZLE_W,
+SWIZZLE_W,
+SWIZZLE_W));
+}
-- 
1.8.5.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [Mesa-stable] [PATCH] i965: Fix buffer overruns in MSAA MCS buffer clearing.

2014-04-15 Thread Chia-I Wu

On Wed, Apr 16, 2014 at 3:18 AM, Eric Anholt  wrote:
> Kenneth Graunke  writes:
>
>> On 04/14/2014 05:33 PM, Eric Anholt wrote:
>>> This manifested as rendering failures or sometimes GPU hangs in
>>> compositors when they accidentally got MSAA visuals due to a bug in the X
>>> Server.  Today we decided that the problem in compositors was equivalent
>>> to a corruption bug we'd noticed recently in resizing MSAA-visual
>>> glxgears, and debugging got a lot easier.
>>>
>>> When we allocate our MCS MT, libdrm takes the size we request, aligns it
>>> to Y tile size (blowing it up from 300x300=90000 bytes to 384*320=122880
>>> bytes, 30 pages), then puts it into a power-of-two-sized BO (131072 bytes,
>>> 32 pages).  Because it's Y tiled, we attach a 384-byte-stride fence to it.
>>> When we memset by the BO size in Mesa, between bytes 122880 and 131072 the
>>> data gets stored to the first 20 or so scanlines of each of the 3 tiled
>>> pages in that row, even though only 2 of those pages were allocated by
>>> libdrm.
>>
>> What?
>>
>> I get that drm_intel_bo_alloc/drm_intel_bo_alloc_tiled might return a
>> drm_intel_bo where bo->size is larger than what you asked for, due to
>> the BO cache.  But...what you're saying is, it doesn't actually allocate
>> enough pages to back the whole bo->size it gives you?  So, if you write
>> bytes 0..(bo->size - 1), you'll randomly clobber memory in a way that's
>> really difficult to detect?
>
> You have that many pages, really.  But you've attached a fence to it, so
> your allocated pages are structured as:
>
> +---+---+---+
> |   |   |   |
> +---+---+---+
> |   |   |   |
> +---+---+---+
> |   |   |   |
> +---+---+---+
> |   |   |
> +---+---+
>
> (except taller in this specific example).  If you hit the pixels in
> those quads, you'll be fine.
>
>>
>> There are other places where we memset an entire BO using bo->size.  For
>> example, your INTEL_DEBUG=shader_time code does exactly that (though it
>> isn't tiled).
>>
>> Could we change libdrm to set bo->size to the actual usable size of the
>> buffer, rather than the bucket size?
>
> The pages containing pixels you asked for go to 122880, and the BO is
> 131072, but the pixels you asked for have a maximum linear address of
> 384*320=115200.  Which value are you thinking is the "actual usable
> size"?  We certainly shouldn't have been memsetting more pixels than
> 115200.
384*320 is 122880.  It feels like bo->size could be 122880, and
131072 could be stored elsewhere in bo_gem.

With that change assumed, do you think it makes sense to add

if (tiling_mode != I915_TILING_NONE && bo->size % stride)
   fprintf(stderr, "bo size is not a multiple of stride\n");

to drm_intel_gem_bo_set_tiling_internal?  That is, emit a warning when

  drm_intel_gem_bo_map_gtt(bo);
  memset(bo->virtual, 0, bo->size);

is known to explode.


>
> ___
> mesa-stable mailing list
> mesa-sta...@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-stable
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

1 2 3 4 5 6 7 8 >

1 - 100 of 713 matches

Mail list logo