From: Marek Olšák <marek.ol...@amd.com>

The patch ID instancing workaround (forcing num_patches to 1) causes a
massive performance decrease on 1-SE parts (Cape Verde, Hainan, Oland).

The performance regression is already present in 17.0 and 17.1.

v2: check tess_uses_prim_id

Cc: 17.0 17.1 <mesa-sta...@lists.freedesktop.org>
---
 src/gallium/drivers/radeonsi/si_pipe.h       |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c | 35 ++++++++++++++++------------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 108929c..5559946 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -364,20 +364,21 @@ struct si_context {
        unsigned                spi_tmpring_size;
 
        struct r600_resource    *compute_scratch_buffer;
 
        /* Emitted derived tessellation state. */
        /* Local shader (VS), or HS if LS-HS are merged. */
        struct si_shader        *last_ls;
        struct si_shader_selector *last_tcs;
        int                     last_num_tcs_input_cp;
        int                     last_tes_sh_base;
+       bool                    last_tess_uses_primid;
        unsigned                last_num_patches;
 
        /* Debug state. */
        bool                    is_debug;
        struct radeon_saved_cs  last_gfx;
        struct r600_resource    *last_trace_buf;
        struct r600_resource    *trace_buf;
        unsigned                trace_id;
        uint64_t                dmesg_timestamp;
        unsigned                apitrace_call_number;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cd069e3..8508259 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -95,20 +95,23 @@ static void si_emit_derived_tess_state(struct si_context 
*sctx,
                                       const struct pipe_draw_info *info,
                                       unsigned *num_patches)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        struct si_shader *ls_current;
        struct si_shader_selector *ls;
        /* The TES pointer will only be used for sctx->last_tcs.
         * It would be wrong to think that TCS = TES. */
        struct si_shader_selector *tcs =
                sctx->tcs_shader.cso ? sctx->tcs_shader.cso : 
sctx->tes_shader.cso;
+       unsigned tess_uses_primid = 
sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
+       bool has_primid_instancing_bug = sctx->b.chip_class == SI &&
+                                        sctx->b.screen->info.max_se == 1;
        unsigned tes_sh_base = 
sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
        unsigned num_tcs_input_cp = info->vertices_per_patch;
        unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
        unsigned num_tcs_patch_outputs;
        unsigned input_vertex_size, output_vertex_size, 
pervertex_output_patch_size;
        unsigned input_patch_size, output_patch_size, output_patch0_offset;
        unsigned perpatch_output_offset, lds_size;
        unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
        unsigned offchip_layout, hardware_lds_size, ls_hs_config;
 
@@ -121,29 +124,32 @@ static void si_emit_derived_tess_state(struct si_context 
*sctx,
 
                ls = ls_current->key.part.tcs.ls;
        } else {
                ls_current = sctx->vs_shader.current;
                ls = sctx->vs_shader.cso;
        }
 
        if (sctx->last_ls == ls_current &&
            sctx->last_tcs == tcs &&
            sctx->last_tes_sh_base == tes_sh_base &&
-           sctx->last_num_tcs_input_cp == num_tcs_input_cp) {
+           sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
+           (!has_primid_instancing_bug ||
+            (sctx->last_tess_uses_primid == tess_uses_primid))) {
                *num_patches = sctx->last_num_patches;
                return;
        }
 
        sctx->last_ls = ls_current;
        sctx->last_tcs = tcs;
        sctx->last_tes_sh_base = tes_sh_base;
        sctx->last_num_tcs_input_cp = num_tcs_input_cp;
+       sctx->last_tess_uses_primid = tess_uses_primid;
 
        /* This calculates how shader inputs and outputs among VS, TCS, and TES
         * are laid out in LDS. */
        num_tcs_inputs = util_last_bit64(ls->outputs_written);
 
        if (sctx->tcs_shader.cso) {
                num_tcs_outputs = util_last_bit64(tcs->outputs_written);
                num_tcs_output_cp = 
tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
                num_tcs_patch_outputs = 
util_last_bit64(tcs->patch_outputs_written);
        } else {
@@ -187,36 +193,35 @@ static void si_emit_derived_tess_state(struct si_context 
*sctx,
         * specific value is taken from the proprietary driver.
         */
        *num_patches = MIN2(*num_patches, 40);
 
        if (sctx->b.chip_class == SI) {
                /* SI bug workaround, related to power management. Limit LS-HS
                 * threadgroups to only one wave.
                 */
                unsigned one_wave = 64 / MAX2(num_tcs_input_cp, 
num_tcs_output_cp);
                *num_patches = MIN2(*num_patches, one_wave);
-
-               if (sctx->screen->b.info.max_se == 1) {
-                       /* The VGT HS block increments the patch ID 
unconditionally
-                        * within a single threadgroup. This results in 
incorrect
-                        * patch IDs when instanced draws are used.
-                        *
-                        * The intended solution is to restrict threadgroups to
-                        * a single instance by setting SWITCH_ON_EOI, which
-                        * should cause IA to split instances up. However, this
-                        * doesn't work correctly on SI when there is no other
-                        * SE to switch to.
-                        */
-                       *num_patches = 1;
-               }
        }
 
+       /* The VGT HS block increments the patch ID unconditionally
+        * within a single threadgroup. This results in incorrect
+        * patch IDs when instanced draws are used.
+        *
+        * The intended solution is to restrict threadgroups to
+        * a single instance by setting SWITCH_ON_EOI, which
+        * should cause IA to split instances up. However, this
+        * doesn't work correctly on SI when there is no other
+        * SE to switch to.
+        */
+       if (has_primid_instancing_bug)
+               *num_patches = 1;
+
        sctx->last_num_patches = *num_patches;
 
        output_patch0_offset = input_patch_size * *num_patches;
        perpatch_output_offset = output_patch0_offset + 
pervertex_output_patch_size;
 
        /* Compute userdata SGPRs. */
        assert(((input_vertex_size / 4) & ~0xff) == 0);
        assert(((output_vertex_size / 4) & ~0xff) == 0);
        assert(((input_patch_size / 4) & ~0x1fff) == 0);
        assert(((output_patch_size / 4) & ~0x1fff) == 0);
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to