Module: Mesa
Branch: main
Commit: 1a99f50c7f27793f86a089a028c678cf5cf3142c
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=1a99f50c7f27793f86a089a028c678cf5cf3142c

Author: Pierre-Eric Pelloux-Prayer <[email protected]>
Date:   Thu Nov 30 10:15:40 2023 +0100

radeonsi: use a compute shader to convert unsupported indices format

This commit replace the CPU-conversion of ubyte to ushort by a compute shader.
The benefits are:
* we don't need to sync anymore
* we can allocate the index buffer in VRAM (no need to CPU map it)

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10195
Reviewed-by: Marek Olšák <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26416>

---

 src/gallium/drivers/radeonsi/si_compute_blit.c  | 35 +++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.c          |  2 ++
 src/gallium/drivers/radeonsi/si_pipe.h          |  4 +++
 src/gallium/drivers/radeonsi/si_shaderlib_nir.c | 31 ++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_state_draw.cpp  | 15 +++++------
 5 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c 
b/src/gallium/drivers/radeonsi/si_compute_blit.c
index cddca6e6fc5..fe7ffe8b6ab 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -526,6 +526,41 @@ void si_copy_buffer(struct si_context *sctx, struct 
pipe_resource *dst, struct p
    }
 }
 
+void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct 
pipe_resource *dst, struct pipe_resource *src,
+                                     uint64_t dst_offset, uint64_t src_offset, 
unsigned size, unsigned flags)
+{
+   if (!size)
+      return;
+
+   if (!sctx->cs_ubyte_to_ushort)
+      sctx->cs_ubyte_to_ushort = 
si_create_ubyte_to_ushort_compute_shader(sctx);
+
+   enum si_coherency coher = SI_COHERENCY_SHADER;
+
+   si_improve_sync_flags(sctx, dst, src, &flags);
+
+   struct pipe_grid_info info = {};
+   info.block[0] = si_determine_wave_size(sctx->screen, NULL);
+   info.block[1] = 1;
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(size, info.block[0]);
+   info.grid[1] = 1;
+   info.grid[2] = 1;
+   info.last_block[0] = size % info.block[0];
+
+   struct pipe_shader_buffer sb[2] = {};
+   sb[0].buffer = dst;
+   sb[0].buffer_offset = dst_offset;
+   sb[0].buffer_size = dst->width0;
+
+   sb[1].buffer = src;
+   sb[1].buffer_offset = src_offset;
+   sb[1].buffer_size = src->width0;
+
+   si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_ubyte_to_ushort, flags, 
coher,
+                                 2, sb, 0x1);
+}
+
 static unsigned
 set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, 
unsigned block_z,
               unsigned work_x, unsigned work_y, unsigned work_z)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 264474f4e0f..a65f8b7d307 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -276,6 +276,8 @@ static void si_destroy_context(struct pipe_context *context)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
    if (sctx->cs_copy_buffer)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
+   if (sctx->cs_ubyte_to_ushort)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_ubyte_to_ushort);
    for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) {
       for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) {
          for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 5c3b31842f2..6277c448de2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1025,6 +1025,7 @@ struct si_context {
    void *cs_clear_buffer;
    void *cs_clear_buffer_rmw;
    void *cs_copy_buffer;
+   void *cs_ubyte_to_ushort;
    void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */
    void *cs_clear_render_target;
    void *cs_clear_render_target_1d_array;
@@ -1491,6 +1492,8 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, 
struct pipe_resource *
                                  unsigned flags, enum si_coherency coher);
 void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct 
pipe_resource *src,
                     uint64_t dst_offset, uint64_t src_offset, unsigned size, 
unsigned flags);
+void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct 
pipe_resource *dst, struct pipe_resource *src,
+                                     uint64_t dst_offset, uint64_t src_offset, 
unsigned size, unsigned flags);
 bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, 
unsigned dst_level,
                            struct pipe_resource *src, unsigned src_level, 
unsigned dstx,
                            unsigned dsty, unsigned dstz, const struct pipe_box 
*src_box,
@@ -1654,6 +1657,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum 
blitter_attrib_type type,
                         unsigned num_layers);
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned 
num_dwords_per_thread,
                                    bool dst_stream_cache_policy, bool is_copy);
+void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum 
pipe_texture_target type);
 void *si_clear_12bytes_buffer_shader(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c 
b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
index ee9bf2fc159..a689460066a 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -624,6 +624,37 @@ void *si_clear_12bytes_buffer_shader(struct si_context 
*sctx)
    return create_shader_state(sctx, b.shader);
 }
 
+void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
+{
+   const nir_shader_compiler_options *options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, 
PIPE_SHADER_COMPUTE);
+
+   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
+
+   /* Don't cache loads, because there is no reuse. */
+   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
+
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, 
"ubyte_to_ushort");
+
+   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
+
+   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[1] = 1;
+   b.shader->info.workgroup_size[2] = 1;
+   b.shader->info.num_ssbos = 2;
+
+   nir_def *load_address = get_global_ids(&b, 1);
+   nir_def *store_address = nir_imul_imm(&b, load_address, 2);
+
+   nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
+                                        load_address, .access = 
load_qualifier);
+   nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
+                  store_address, .access = store_qualifier);
+
+   return create_shader_state(sctx, b.shader);
+}
+
 /* Create a compute shader implementing clear_buffer or copy_buffer. */
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned 
num_dwords_per_thread,
                                    bool dst_stream_cache_policy, bool is_copy)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp 
b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 737f873f703..95e1478c5cc 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -2044,23 +2044,22 @@ static void si_draw(struct pipe_context *ctx,
       /* Translate or upload, if needed. */
       /* 8-bit indices are supported on GFX8. */
       if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) {
-         unsigned start, count, start_offset, size, offset;
-         void *ptr;
+         unsigned start, count, start_offset, size;
 
          si_get_draw_start_count(sctx, info, indirect, draws, num_draws, 
&start, &count);
          start_offset = start * 2;
          size = count * 2;
 
-         indexbuf = NULL;
-         u_upload_alloc(ctx->stream_uploader, start_offset, size,
-                        si_optimal_tcc_alignment(sctx, size), &offset, 
&indexbuf, &ptr);
+         /* Don't use u_upload_alloc because we don't need to map the buffer 
for CPU access. */
+         indexbuf = pipe_buffer_create(&sctx->screen->b, 0, 
PIPE_USAGE_IMMUTABLE, start_offset + size);
          if (unlikely(!indexbuf))
             return;
 
-         util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset 
+ start, count, ptr);
+         si_compute_shorten_ubyte_buffer(sctx, indexbuf, info->index.resource,
+                                         start_offset, index_offset + start, 
count,
+                                         SI_OP_SYNC_AFTER);
 
-         /* info->start will be added by the drawing code */
-         index_offset = offset - start_offset;
+         index_offset = 0;
          index_size = 2;
       } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) {
          unsigned start_offset;

Reply via email to