Signed-off-by: Samuel Pitoiset <[email protected]>
---
 src/amd/vulkan/Makefile.sources  |   2 +
 src/amd/vulkan/radv_cmd_buffer.c | 239 +---------------------------------
 src/amd/vulkan/radv_compute.c    | 275 +++++++++++++++++++++++++++++++++++++++
 src/amd/vulkan/radv_compute.h    |  69 ++++++++++
 src/amd/vulkan/radv_meta.h       |   1 +
 src/amd/vulkan/radv_private.h    |  35 +++--
 src/amd/vulkan/si_cmd_buffer.c   |  50 +------
 7 files changed, 381 insertions(+), 290 deletions(-)
 create mode 100644 src/amd/vulkan/radv_compute.c
 create mode 100644 src/amd/vulkan/radv_compute.h

diff --git a/src/amd/vulkan/Makefile.sources b/src/amd/vulkan/Makefile.sources
index 9489219f5b..7cef56b43d 100644
--- a/src/amd/vulkan/Makefile.sources
+++ b/src/amd/vulkan/Makefile.sources
@@ -32,6 +32,8 @@ RADV_WS_AMDGPU_FILES := \
 
 VULKAN_FILES := \
        radv_cmd_buffer.c \
+       radv_compute.c \
+       radv_compute.h \
        radv_cs.h \
        radv_debug.c \
        radv_debug.h \
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 10a071c3d6..af9f8210bf 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -34,6 +34,7 @@
 #include "vk_format.h"
 #include "radv_debug.h"
 #include "radv_meta.h"
+#include "radv_compute.h"
 
 #include "ac_debug.h"
 
@@ -366,7 +367,7 @@ void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
        radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
 }
 
-static void
+void
 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer)
 {
        if (cmd_buffer->device->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
@@ -386,7 +387,7 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer)
        radv_cmd_buffer_trace_emit(cmd_buffer);
 }
 
-static void
+void
 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
                   struct radv_pipeline *pipeline, enum ring_type ring)
 {
@@ -601,14 +602,6 @@ radv_emit_graphics_raster_state(struct radv_cmd_buffer *cmd_buffer,
                               raster->pa_su_sc_mode_cntl);
 }
 
-static inline void
-radv_emit_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
-                  unsigned size)
-{
-       if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK)
-               si_cp_dma_prefetch(cmd_buffer, va, size);
-}
-
 static void
 radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer,
                struct radv_pipeline *pipeline,
@@ -1577,7 +1570,7 @@ radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer)
                                           AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
 }
 
-static void
+void
 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
                       VkShaderStageFlags stages)
 {
@@ -1615,7 +1608,7 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
        assert(cmd_buffer->cs->cdw <= cdw_max);
 }
 
-static void
+void
 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
                     struct radv_pipeline *pipeline,
                     VkShaderStageFlags stages)
@@ -2108,7 +2101,8 @@ VkResult radv_BeginCommandBuffer(
                        radv_set_db_count_control(cmd_buffer);
                        break;
                case RADV_QUEUE_COMPUTE:
-                       si_init_compute(cmd_buffer);
+                       radv_init_compute(cmd_buffer->device->physical_device,
+                                         cmd_buffer->cs);
                        break;
                case RADV_QUEUE_TRANSFER:
                default:
@@ -2378,58 +2372,6 @@ VkResult radv_EndCommandBuffer(
        return cmd_buffer->record_result;
 }
 
-static void
-radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
-{
-       struct radeon_winsys *ws = cmd_buffer->device->ws;
-       struct radv_shader_variant *compute_shader;
-       struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-       uint64_t va;
-
-       if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
-               return;
-
-       cmd_buffer->state.emitted_compute_pipeline = pipeline;
-
-       compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
-       va = ws->buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
-
-       ws->cs_add_buffer(cmd_buffer->cs, compute_shader->bo, 8);
-       radv_emit_prefetch(cmd_buffer, va, compute_shader->code_size);
-
-       MAYBE_UNUSED unsigned cdw_max = 
radeon_check_space(cmd_buffer->device->ws,
-                                                          cmd_buffer->cs, 16);
-
-       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B830_COMPUTE_PGM_LO, 2);
-       radeon_emit(cmd_buffer->cs, va >> 8);
-       radeon_emit(cmd_buffer->cs, va >> 40);
-
-       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
-       radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
-       radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
-
-
-       cmd_buffer->compute_scratch_size_needed =
-                                 MAX2(cmd_buffer->compute_scratch_size_needed,
-                                      pipeline->max_waves * 
pipeline->scratch_bytes_per_wave);
-
-       /* change these once we have scratch support */
-       radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
-                         S_00B860_WAVES(pipeline->max_waves) |
-                         S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 
10));
-
-       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-       radeon_emit(cmd_buffer->cs,
-                   
S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
-       radeon_emit(cmd_buffer->cs,
-                   
S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
-       radeon_emit(cmd_buffer->cs,
-                   
S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
-
-       assert(cmd_buffer->cs->cdw <= cdw_max);
-       radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
-}
-
 static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer)
 {
        for (unsigned i = 0; i < MAX_SETS; i++) {
@@ -3124,157 +3066,6 @@ void radv_CmdDrawIndexedIndirectCountAMD(
                                             maxDrawCount, stride);
 }
 
-struct radv_dispatch_info {
-       /**
-        * Determine the layout of the grid (in block units) to be used.
-        */
-       uint32_t blocks[3];
-
-       /**
-        * Whether it's an unaligned compute dispatch.
-        */
-       bool unaligned;
-
-       /**
-        * Indirect compute parameters resource.
-        */
-       struct radv_buffer *indirect;
-       uint64_t indirect_offset;
-};
-
-static void
-radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
-                          const struct radv_dispatch_info *info)
-{
-       struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-       struct radv_shader_variant *compute_shader = 
pipeline->shaders[MESA_SHADER_COMPUTE];
-       struct radeon_winsys *ws = cmd_buffer->device->ws;
-       struct radeon_winsys_cs *cs = cmd_buffer->cs;
-       struct ac_userdata_info *loc;
-       uint8_t grid_used;
-
-       grid_used = compute_shader->info.info.cs.grid_components_used;
-
-       loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
-                                   AC_UD_CS_GRID_SIZE);
-
-       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25);
-
-       if (info->indirect) {
-               uint64_t va = ws->buffer_get_va(info->indirect->bo);
-
-               va += info->indirect->offset + info->indirect_offset;
-
-               ws->cs_add_buffer(cs, info->indirect->bo, 8);
-
-               if (loc->sgpr_idx != -1) {
-                       for (unsigned i = 0; i < grid_used; ++i) {
-                               radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-                               radeon_emit(cs, 
COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
-                                               
COPY_DATA_DST_SEL(COPY_DATA_REG));
-                               radeon_emit(cs, (va +  4 * i));
-                               radeon_emit(cs, (va + 4 * i) >> 32);
-                               radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
-                                                + loc->sgpr_idx * 4) >> 2) + 
i);
-                               radeon_emit(cs, 0);
-                       }
-               }
-
-               if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
-                       radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
-                                       PKT3_SHADER_TYPE_S(1));
-                       radeon_emit(cs, va);
-                       radeon_emit(cs, va >> 32);
-                       radeon_emit(cs, 1);
-               } else {
-                       radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
-                                       PKT3_SHADER_TYPE_S(1));
-                       radeon_emit(cs, 1);
-                       radeon_emit(cs, va);
-                       radeon_emit(cs, va >> 32);
-
-                       radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
-                                       PKT3_SHADER_TYPE_S(1));
-                       radeon_emit(cs, 0);
-                       radeon_emit(cs, 1);
-               }
-       } else {
-               unsigned blocks[3] = { info->blocks[0], info->blocks[1], 
info->blocks[2] };
-               unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
-
-               if (info->unaligned) {
-                       unsigned *cs_block_size = 
compute_shader->info.cs.block_size;
-                       unsigned remainder[3];
-
-                       /* If aligned, these should be an entire block size,
-                        * not 0.
-                        */
-                       remainder[0] = blocks[0] + cs_block_size[0] -
-                                      align_u32_npot(blocks[0], 
cs_block_size[0]);
-                       remainder[1] = blocks[1] + cs_block_size[1] -
-                                      align_u32_npot(blocks[1], 
cs_block_size[1]);
-                       remainder[2] = blocks[2] + cs_block_size[2] -
-                                      align_u32_npot(blocks[2], 
cs_block_size[2]);
-
-                       blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
-                       blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
-                       blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
-
-                       radeon_set_sh_reg_seq(cs, 
R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-                       radeon_emit(cs,
-                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
-                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
-                       radeon_emit(cs,
-                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
-                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
-                       radeon_emit(cs,
-                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
-                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
-
-                       dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
-               }
-
-               if (loc->sgpr_idx != -1) {
-                       assert(!loc->indirect);
-                       assert(loc->num_sgprs == grid_used);
-
-                       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
-                                                 loc->sgpr_idx * 4, grid_used);
-                       radeon_emit(cs, blocks[0]);
-                       if (grid_used > 1)
-                               radeon_emit(cs, blocks[1]);
-                       if (grid_used > 2)
-                               radeon_emit(cs, blocks[2]);
-               }
-
-               radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
-                               PKT3_SHADER_TYPE_S(1));
-               radeon_emit(cs, blocks[0]);
-               radeon_emit(cs, blocks[1]);
-               radeon_emit(cs, blocks[2]);
-               radeon_emit(cs, dispatch_initiator);
-       }
-
-       assert(cmd_buffer->cs->cdw <= cdw_max);
-}
-
-static void
-radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
-             const struct radv_dispatch_info *info)
-{
-       radv_emit_compute_pipeline(cmd_buffer);
-
-       radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
-       radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline,
-                            VK_SHADER_STAGE_COMPUTE_BIT);
-
-       si_emit_cache_flush(cmd_buffer);
-
-       radv_emit_dispatch_packets(cmd_buffer, info);
-
-       radv_cmd_buffer_after_draw(cmd_buffer);
-}
-
 void radv_CmdDispatch(
        VkCommandBuffer                             commandBuffer,
        uint32_t                                    x,
@@ -3306,22 +3097,6 @@ void radv_CmdDispatchIndirect(
        radv_dispatch(cmd_buffer, &info);
 }
 
-void radv_unaligned_dispatch(
-       struct radv_cmd_buffer                      *cmd_buffer,
-       uint32_t                                    x,
-       uint32_t                                    y,
-       uint32_t                                    z)
-{
-       struct radv_dispatch_info info = {};
-
-       info.blocks[0] = x;
-       info.blocks[1] = y;
-       info.blocks[2] = z;
-       info.unaligned = 1;
-
-       radv_dispatch(cmd_buffer, &info);
-}
-
 void radv_CmdEndRenderPass(
        VkCommandBuffer                             commandBuffer)
 {
diff --git a/src/amd/vulkan/radv_compute.c b/src/amd/vulkan/radv_compute.c
new file mode 100644
index 0000000000..b230686223
--- /dev/null
+++ b/src/amd/vulkan/radv_compute.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright © 2016 Red Hat.
+ * Copyright © 2016 Bas Nieuwenhuizen
+ *
+ * based in part on anv driver which is:
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "radv_private.h"
+#include "radv_radeon_winsys.h"
+#include "radv_shader.h"
+#include "radv_cs.h"
+#include "sid.h"
+#include "gfx9d.h"
+#include "vk_format.h"
+#include "radv_debug.h"
+#include "radv_meta.h"
+#include "radv_compute.h"
+#include "ac_debug.h"
+
+void
+radv_init_compute(struct radv_physical_device *physical_device,
+                 struct radeon_winsys_cs *cs)
+{
+       radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
+       radeon_emit(cs, 0);
+       radeon_emit(cs, 0);
+       radeon_emit(cs, 0);
+
+       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
+       radeon_emit(cs, 0);
+       /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
+       radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | 
S_00B858_SH1_CU_EN(0xffff));
+       radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | 
S_00B85C_SH1_CU_EN(0xffff));
+
+       if (physical_device->rad_info.chip_class >= CIK) {
+               /* Also set R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
+               radeon_set_sh_reg_seq(cs,
+                                     R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 
2);
+               radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) |
+                           S_00B864_SH1_CU_EN(0xffff));
+               radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) |
+                           S_00B868_SH1_CU_EN(0xffff));
+       }
+
+       /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
+        * and is now per pipe, so it should be handled in the
+        * kernel if we want to use something other than the default value,
+        * which is now 0x22f.
+        */
+       if (physical_device->rad_info.chip_class <= SI) {
+               /* XXX: This should be:
+                * (number of compute units) * 4 * (waves per simd) - 1 */
+
+               radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
+                                 0x190 /* Default value */);
+       }
+}
+
+static void
+radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
+{
+       struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+       struct radeon_winsys *ws = cmd_buffer->device->ws;
+       struct radeon_winsys_cs *cs = cmd_buffer->cs;
+       struct radv_shader_variant *compute_shader;
+       uint64_t va;
+
+       if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
+               return;
+
+       cmd_buffer->state.emitted_compute_pipeline = pipeline;
+
+       compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
+       va = ws->buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
+
+       ws->cs_add_buffer(cs, compute_shader->bo, 8);
+       radv_emit_prefetch(cmd_buffer, va, compute_shader->code_size);
+
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 16);
+
+       radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
+       radeon_emit(cs, va >> 8);
+       radeon_emit(cs, va >> 40);
+
+       radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+       radeon_emit(cs, compute_shader->rsrc1);
+       radeon_emit(cs, compute_shader->rsrc2);
+
+       cmd_buffer->compute_scratch_size_needed =
+               MAX2(cmd_buffer->compute_scratch_size_needed,
+                    pipeline->max_waves * pipeline->scratch_bytes_per_wave);
+
+       /* change these once we have scratch support */
+       radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+                             S_00B860_WAVES(pipeline->max_waves) |
+                             
S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
+
+       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+       radeon_emit(cs,
+                   
S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
+       radeon_emit(cs,
+                   
S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
+       radeon_emit(cs,
+                   
S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
+
+       assert(cmd_buffer->cs->cdw <= cdw_max);
+       radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
+}
+
+static void
+radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
+                          const struct radv_dispatch_info *info)
+{
+       struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+       struct radv_shader_variant *compute_shader = 
pipeline->shaders[MESA_SHADER_COMPUTE];
+       struct radeon_winsys *ws = cmd_buffer->device->ws;
+       struct radeon_winsys_cs *cs = cmd_buffer->cs;
+       struct ac_userdata_info *loc;
+       uint8_t grid_used;
+
+       grid_used = compute_shader->info.info.cs.grid_components_used;
+
+       loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
+                                   AC_UD_CS_GRID_SIZE);
+
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25);
+
+       if (info->indirect) {
+               uint64_t va = ws->buffer_get_va(info->indirect->bo);
+
+               va += info->indirect->offset + info->indirect_offset;
+
+               ws->cs_add_buffer(cs, info->indirect->bo, 8);
+
+               if (loc->sgpr_idx != -1) {
+                       for (unsigned i = 0; i < grid_used; ++i) {
+                               radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+                               radeon_emit(cs, 
COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+                                               
COPY_DATA_DST_SEL(COPY_DATA_REG));
+                               radeon_emit(cs, (va +  4 * i));
+                               radeon_emit(cs, (va + 4 * i) >> 32);
+                               radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
+                                                + loc->sgpr_idx * 4) >> 2) + 
i);
+                               radeon_emit(cs, 0);
+                       }
+               }
+
+               if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
+                       radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+                       radeon_emit(cs, va);
+                       radeon_emit(cs, va >> 32);
+                       radeon_emit(cs, 1);
+               } else {
+                       radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+                       radeon_emit(cs, 1);
+                       radeon_emit(cs, va);
+                       radeon_emit(cs, va >> 32);
+
+                       radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+                       radeon_emit(cs, 0);
+                       radeon_emit(cs, 1);
+               }
+       } else {
+               unsigned blocks[3] = { info->blocks[0], info->blocks[1], 
info->blocks[2] };
+               unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
+
+               if (info->unaligned) {
+                       unsigned *cs_block_size = 
compute_shader->info.cs.block_size;
+                       unsigned remainder[3];
+
+                       /* If aligned, these should be an entire block size,
+                        * not 0.
+                        */
+                       remainder[0] = blocks[0] + cs_block_size[0] -
+                                      align_u32_npot(blocks[0], 
cs_block_size[0]);
+                       remainder[1] = blocks[1] + cs_block_size[1] -
+                                      align_u32_npot(blocks[1], 
cs_block_size[1]);
+                       remainder[2] = blocks[2] + cs_block_size[2] -
+                                      align_u32_npot(blocks[2], 
cs_block_size[2]);
+
+                       blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
+                       blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
+                       blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
+
+                       radeon_set_sh_reg_seq(cs, 
R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+                       radeon_emit(cs,
+                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
+                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
+                       radeon_emit(cs,
+                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
+                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
+                       radeon_emit(cs,
+                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
+                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
+
+                       dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
+               }
+
+               if (loc->sgpr_idx != -1) {
+                       assert(!loc->indirect);
+                       assert(loc->num_sgprs == grid_used);
+
+                       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+                                                 loc->sgpr_idx * 4, grid_used);
+                       radeon_emit(cs, blocks[0]);
+                       if (grid_used > 1)
+                               radeon_emit(cs, blocks[1]);
+                       if (grid_used > 2)
+                               radeon_emit(cs, blocks[2]);
+               }
+
+               radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
+                               PKT3_SHADER_TYPE_S(1));
+               radeon_emit(cs, blocks[0]);
+               radeon_emit(cs, blocks[1]);
+               radeon_emit(cs, blocks[2]);
+               radeon_emit(cs, dispatch_initiator);
+       }
+
+       assert(cmd_buffer->cs->cdw <= cdw_max);
+}
+
+void
+radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
+             const struct radv_dispatch_info *info)
+{
+       radv_emit_compute_pipeline(cmd_buffer);
+
+       radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
+       radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline,
+                            VK_SHADER_STAGE_COMPUTE_BIT);
+
+       si_emit_cache_flush(cmd_buffer);
+
+       radv_emit_dispatch_packets(cmd_buffer, info);
+
+       radv_cmd_buffer_after_draw(cmd_buffer);
+}
+
+void
+radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer,
+                       uint32_t x, uint32_t y, uint32_t z)
+{
+       struct radv_dispatch_info info = {};
+
+       info.blocks[0] = x;
+       info.blocks[1] = y;
+       info.blocks[2] = z;
+       info.unaligned = 1;
+
+       radv_dispatch(cmd_buffer, &info);
+}
diff --git a/src/amd/vulkan/radv_compute.h b/src/amd/vulkan/radv_compute.h
new file mode 100644
index 0000000000..fc9770b772
--- /dev/null
+++ b/src/amd/vulkan/radv_compute.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright © 2016 Red Hat.
+ * Copyright © 2016 Bas Nieuwenhuizen
+ *
+ * based in part on anv driver which is:
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef RADV_COMPUTE_H
+#define RADV_COMPUTE_H
+
+#include "radv_private.h"
+
+struct radv_dispatch_info {
+       /**
+        * Determine the layout of the grid (in block units) to be used.
+        */
+       uint32_t blocks[3];
+
+       /**
+        * Whether it's an unaligned compute dispatch.
+        */
+       bool unaligned;
+
+       /**
+        * Indirect compute parameters resource.
+        */
+       struct radv_buffer *indirect;
+       uint64_t indirect_offset;
+};
+
+void
+radv_init_compute(struct radv_physical_device *physical_device,
+                 struct radeon_winsys_cs *cs);
+
+void
+radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
+             const struct radv_dispatch_info *info);
+
+/**
+ * Takes x,y,z as exact numbers of invocations, instead of blocks.
+ *
+ * Limitations: Can't call normal dispatch functions without binding or rebinding
+ *              the compute pipeline.
+ */
+void
+radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer,
+                       uint32_t x, uint32_t y, uint32_t z);
+
+#endif
diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
index 5d28cc5f0f..a35dfee184 100644
--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -27,6 +27,7 @@
 #define RADV_META_H
 
 #include "radv_private.h"
+#include "radv_compute.h"
 #include "radv_shader.h"
 
 #ifdef __cplusplus
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index e5092a8923..9cb4570100 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -937,17 +937,6 @@ void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer);
 bool radv_get_memory_fd(struct radv_device *device,
                        struct radv_device_memory *memory,
                        int *pFD);
-/*
- * Takes x,y,z as exact numbers of invocations, instead of blocks.
- *
- * Limitations: Can't call normal dispatch functions without binding or rebinding
- *              the compute pipeline.
- */
-void radv_unaligned_dispatch(
-       struct radv_cmd_buffer                      *cmd_buffer,
-       uint32_t                                    x,
-       uint32_t                                    y,
-       uint32_t                                    z);
 
 struct radv_event {
        struct radeon_winsys_bo *bo;
@@ -1521,6 +1510,30 @@ void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer,
 void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer,
                         struct radv_image *image, uint32_t value);
 
+static inline void
+radv_emit_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
+                  unsigned size)
+{
+       if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK)
+               si_cp_dma_prefetch(cmd_buffer, va, size);
+}
+
+void
+radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
+                  struct radv_pipeline *pipeline, enum ring_type ring);
+
+void
+radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
+                      VkShaderStageFlags stages);
+
+void
+radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
+                    struct radv_pipeline *pipeline,
+                    VkShaderStageFlags stages);
+
+void
+radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer);
+
 struct radv_fence {
        struct radeon_winsys_fence *fence;
        bool submitted;
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 9f8d881d27..a63a57de06 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -30,6 +30,7 @@
 #include "radv_private.h"
 #include "radv_shader.h"
 #include "radv_cs.h"
+#include "radv_compute.h"
 #include "sid.h"
 #include "gfx9d.h"
 #include "radv_util.h"
@@ -170,52 +171,6 @@ si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
                                       S_030800_INSTANCE_BROADCAST_WRITES(1));
 }
 
-static void
-si_emit_compute(struct radv_physical_device *physical_device,
-                struct radeon_winsys_cs *cs)
-{
-       radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
-       radeon_emit(cs, 0);
-       radeon_emit(cs, 0);
-       radeon_emit(cs, 0);
-
-       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
-       radeon_emit(cs, 0);
-       /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
-       radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | 
S_00B858_SH1_CU_EN(0xffff));
-       radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | 
S_00B85C_SH1_CU_EN(0xffff));
-
-       if (physical_device->rad_info.chip_class >= CIK) {
-               /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
-               radeon_set_sh_reg_seq(cs,
-                                     R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 
2);
-               radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) |
-                           S_00B864_SH1_CU_EN(0xffff));
-               radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) |
-                           S_00B868_SH1_CU_EN(0xffff));
-       }
-
-       /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
-        * and is now per pipe, so it should be handled in the
-        * kernel if we want to use something other than the default value,
-        * which is now 0x22f.
-        */
-       if (physical_device->rad_info.chip_class <= SI) {
-               /* XXX: This should be:
-                * (number of compute units) * 4 * (waves per simd) - 1 */
-
-               radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
-                                 0x190 /* Default value */);
-       }
-}
-
-void
-si_init_compute(struct radv_cmd_buffer *cmd_buffer)
-{
-       struct radv_physical_device *physical_device = 
cmd_buffer->device->physical_device;
-       si_emit_compute(physical_device, cmd_buffer->cs);
-}
-
 static void
 si_emit_config(struct radv_physical_device *physical_device,
               struct radeon_winsys_cs *cs)
@@ -486,7 +441,8 @@ si_emit_config(struct radv_physical_device *physical_device,
                                       S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
                radeon_set_uconfig_reg(cs, R_030968_VGT_INSTANCE_BASE_ID, 0);
        }
-       si_emit_compute(physical_device, cs);
+
+       radv_init_compute(physical_device, cs);
 }
 
 void si_init_config(struct radv_cmd_buffer *cmd_buffer)
-- 
2.14.1

_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to