Module: Mesa Branch: main Commit: f0abdaea9f75ab483460c137c1b65863d2694a0f URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=f0abdaea9f75ab483460c137c1b65863d2694a0f
Author: Samuel Pitoiset <[email protected]> Date: Tue Sep 19 14:14:07 2023 +0200 amd/llvm,aco,radv: implement NGG streamout with GDS_STRMOUT registers on GFX11 According to RadeonSI, this is required for preemption, user queues, and we only have to wait for VS after streamout which should be more performant. Signed-off-by: Samuel Pitoiset <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25284> --- src/amd/compiler/aco_instruction_selection.cpp | 4 +- src/amd/llvm/ac_nir_to_llvm.c | 8 +--- src/amd/vulkan/radv_cmd_buffer.c | 59 ++++++++++++++++++++------ 3 files changed, 50 insertions(+), 21 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 22fc17f71c3..357d566e07e 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8985,7 +8985,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)}; unsigned write_mask = nir_intrinsic_write_mask(instr); - bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl; + const bool use_gds_registers = ctx->options->gfx_level >= GFX11; for (unsigned i = 0; i < instr->num_components; i++) { if (write_mask & (1 << i)) { @@ -9022,7 +9022,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) break; } case nir_intrinsic_xfb_counter_sub_amd: { - bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl; + const bool use_gds_registers = ctx->options->gfx_level >= GFX11; unsigned write_mask = nir_intrinsic_write_mask(instr); Temp counter = get_ssa_temp(ctx, instr->src[0].ssa); diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index b0a335fb881..c2fdb9f88ce 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -3654,9 +3654,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins } case nir_intrinsic_ordered_xfb_counter_add_amd: { /* must be called in a single lane of a workgroup. */ - /* TODO: Add RADV support. */ - bool use_gds_registers = ctx->ac.gfx_level >= GFX11 && - ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL; + const bool use_gds_registers = ctx->ac.gfx_level >= GFX11; LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, ""); @@ -3736,9 +3734,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins } case nir_intrinsic_xfb_counter_sub_amd: { /* must be called in a single lane of a workgroup. */ - /* TODO: Add RADV support. */ - bool use_gds_registers = ctx->ac.gfx_level >= GFX11 && - ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL; + const bool use_gds_registers = ctx->ac.gfx_level >= GFX11; LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, ""); LLVMValueRef sub_vec = get_src(ctx, instr->src[0]); diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 652f145e033..9323998b5d1 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -10983,14 +10983,30 @@ radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstC } if (cmd_buffer->device->physical_device->use_ngg_streamout) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | S_411_DST_SEL(V_411_GDS) | - S_411_CP_SYNC(i == last_target)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, 4 * i); /* destination in GDS */ - radeon_emit(cs, 0); - radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target)); + if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { + if (append) { + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit( + cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i); + radeon_emit(cs, 0); + } else { + /* The PKT3 CAM bit workaround seems needed for initializing this GDS register to zero. */ + radeon_set_perfctr_reg(cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf, cs, + R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0); + } + } else { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | S_411_DST_SEL(V_411_GDS) | + S_411_CP_SYNC(i == last_target)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 4 * i); /* destination in GDS */ + radeon_emit(cs, 0); + radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target)); + } } else { /* AMD GCN binds streamout buffers as shader resources. * VGT only counts primitives and tells the shader through @@ -11038,8 +11054,13 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCou assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); - if (!cmd_buffer->device->physical_device->use_ngg_streamout) + if (cmd_buffer->device->physical_device->use_ngg_streamout) { + /* Wait for streamout to finish before reading GDS_STRMOUT registers. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH; + si_emit_cache_flush(cmd_buffer); + } else { radv_flush_vgt_streamout(cmd_buffer); + } ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 12); @@ -11065,10 +11086,22 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCou } if (cmd_buffer->device->physical_device->use_ngg_streamout) { - if (append) { - si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, - radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2, - EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0); + if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { + if (append) { + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit( + cs, COPY_DATA_SRC_SEL(COPY_DATA_REG) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i); + radeon_emit(cs, 0); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + } + } else { + if (append) { + si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, + radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2, + EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0); + } } } else { if (append) {
