Module: Mesa
Branch: main
Commit: f0abdaea9f75ab483460c137c1b65863d2694a0f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f0abdaea9f75ab483460c137c1b65863d2694a0f

Author: Samuel Pitoiset <[email protected]>
Date:   Tue Sep 19 14:14:07 2023 +0200

amd/llvm,aco,radv: implement NGG streamout with GDS_STRMOUT registers on GFX11

According to RadeonSI, this is required for preemption and user queues.
It also means we only have to wait for VS after streamout, which should
be more performant.

Signed-off-by: Samuel Pitoiset <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25284>

---

 src/amd/compiler/aco_instruction_selection.cpp |  4 +-
 src/amd/llvm/ac_nir_to_llvm.c                  |  8 +---
 src/amd/vulkan/radv_cmd_buffer.c               | 59 ++++++++++++++++++++------
 3 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp 
b/src/amd/compiler/aco_instruction_selection.cpp
index 22fc17f71c3..357d566e07e 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -8985,7 +8985,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* 
instr)
          aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 
1)};
       unsigned write_mask = nir_intrinsic_write_mask(instr);
 
-      bool use_gds_registers = ctx->options->gfx_level >= GFX11 && 
ctx->options->is_opengl;
+      const bool use_gds_registers = ctx->options->gfx_level >= GFX11;
 
       for (unsigned i = 0; i < instr->num_components; i++) {
          if (write_mask & (1 << i)) {
@@ -9022,7 +9022,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* 
instr)
       break;
    }
    case nir_intrinsic_xfb_counter_sub_amd: {
-      bool use_gds_registers = ctx->options->gfx_level >= GFX11 && 
ctx->options->is_opengl;
+      const bool use_gds_registers = ctx->options->gfx_level >= GFX11;
 
       unsigned write_mask = nir_intrinsic_write_mask(instr);
       Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index b0a335fb881..c2fdb9f88ce 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -3654,9 +3654,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, 
nir_intrinsic_instr *ins
    }
    case nir_intrinsic_ordered_xfb_counter_add_amd: {
       /* must be called in a single lane of a workgroup. */
-      /* TODO: Add RADV support. */
-      bool use_gds_registers = ctx->ac.gfx_level >= GFX11 &&
-                               ctx->ac.float_mode == 
AC_FLOAT_MODE_DEFAULT_OPENGL;
+      const bool use_gds_registers = ctx->ac.gfx_level >= GFX11;
       LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
       LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, 
gdsptr, "");
 
@@ -3736,9 +3734,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, 
nir_intrinsic_instr *ins
    }
    case nir_intrinsic_xfb_counter_sub_amd: {
       /* must be called in a single lane of a workgroup. */
-      /* TODO: Add RADV support. */
-      bool use_gds_registers = ctx->ac.gfx_level >= GFX11 &&
-                               ctx->ac.float_mode == 
AC_FLOAT_MODE_DEFAULT_OPENGL;
+      const bool use_gds_registers = ctx->ac.gfx_level >= GFX11;
       LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
       LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, 
gdsptr, "");
       LLVMValueRef sub_vec = get_src(ctx, instr->src[0]);
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 652f145e033..9323998b5d1 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -10983,14 +10983,30 @@ radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer 
commandBuffer, uint32_t firstC
       }
 
       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
-         radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-         radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : 
V_411_DATA) | S_411_DST_SEL(V_411_GDS) |
-                            S_411_CP_SYNC(i == last_target));
-         radeon_emit(cs, va);
-         radeon_emit(cs, va >> 32);
-         radeon_emit(cs, 4 * i); /* destination in GDS */
-         radeon_emit(cs, 0);
-         radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | 
S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
+         if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) 
{
+            if (append) {
+               radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+               radeon_emit(
+                  cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | 
COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
+               radeon_emit(cs, va);
+               radeon_emit(cs, va >> 32);
+               radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + 
i);
+               radeon_emit(cs, 0);
+            } else {
+               /* The PKT3 CAM bit workaround seems needed for initializing 
this GDS register to zero. */
+               
radeon_set_perfctr_reg(cmd_buffer->device->physical_device->rad_info.gfx_level, 
cmd_buffer->qf, cs,
+                                      R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + 
i * 4, 0);
+            }
+         } else {
+            radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+            radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : 
V_411_DATA) | S_411_DST_SEL(V_411_GDS) |
+                               S_411_CP_SYNC(i == last_target));
+            radeon_emit(cs, va);
+            radeon_emit(cs, va >> 32);
+            radeon_emit(cs, 4 * i); /* destination in GDS */
+            radeon_emit(cs, 0);
+            radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | 
S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
+         }
       } else {
          /* AMD GCN binds streamout buffers as shader resources.
           * VGT only counts primitives and tells the shader through
@@ -11038,8 +11054,13 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer 
commandBuffer, uint32_t firstCou
 
    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
 
-   if (!cmd_buffer->device->physical_device->use_ngg_streamout)
+   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
+      /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
+      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
+      si_emit_cache_flush(cmd_buffer);
+   } else {
       radv_flush_vgt_streamout(cmd_buffer);
+   }
 
    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, 
cmd_buffer->cs, MAX_SO_BUFFERS * 12);
 
@@ -11065,10 +11086,22 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer 
commandBuffer, uint32_t firstCou
       }
 
       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
-         if (append) {
-            si_cs_emit_write_event_eop(cs, 
cmd_buffer->device->physical_device->rad_info.gfx_level,
-                                       radv_cmd_buffer_uses_mec(cmd_buffer), 
V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
-                                       EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 
1), 0);
+         if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) 
{
+            if (append) {
+               radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+               radeon_emit(
+                  cs, COPY_DATA_SRC_SEL(COPY_DATA_REG) | 
COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
+               radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + 
i);
+               radeon_emit(cs, 0);
+               radeon_emit(cs, va);
+               radeon_emit(cs, va >> 32);
+            }
+         } else {
+            if (append) {
+               si_cs_emit_write_event_eop(cs, 
cmd_buffer->device->physical_device->rad_info.gfx_level,
+                                          
radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
+                                          EOP_DATA_SEL_GDS, va, 
EOP_DATA_GDS(i, 1), 0);
+            }
          }
       } else {
          if (append) {

Reply via email to