Module: Mesa
Branch: main
Commit: 3cd6bb3e5d727596432622fc56c1efc7e3d521fb
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3cd6bb3e5d727596432622fc56c1efc7e3d521fb

Author: Danylo Piliaiev <[email protected]>
Date:   Fri Nov 10 18:25:01 2023 +0100

tu: Add a725 workaround dispatch at the start of each cmdbuf

The blob executes a special compute dispatch at the start of each
command buffer. We copy this dispatch as is. At this point
we don't know what this workaround is for.

Signed-off-by: Danylo Piliaiev <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25888>

---

 src/freedreno/common/freedreno_dev_info.h     |   5 ++
 src/freedreno/common/freedreno_devices.py     |   4 +-
 src/freedreno/registers/adreno/adreno_pm4.xml |   1 +
 src/freedreno/vulkan/tu_cmd_buffer.cc         |   9 ++
 src/freedreno/vulkan/tu_device.cc             | 124 ++++++++++++++++++++++++++
 src/freedreno/vulkan/tu_device.h              |   3 +
 6 files changed, 145 insertions(+), 1 deletion(-)

diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index 6726ca43f0a..c61274f8cba 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -201,6 +201,11 @@ struct fd_dev_info {
 
       /* Whether there is CP_EVENT_WRITE7::WRITE_SAMPLE_COUNT */
       bool has_event_write_sample_count;
+
+      /* Blob executes a special compute dispatch at the start of each
+       * command buffer. We copy this dispatch as is.
+       */
+      bool cmdbuf_start_a725_quirk;
    } a7xx;
 };
 
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index 3ec46c1c32b..2bb2ad6f029 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -704,7 +704,9 @@ add_gpus([
         )
     ))
 
-a7xx_725 = A7XXProps()
+a7xx_725 = A7XXProps(
+        cmdbuf_start_a725_quirk = True,
+)
 
 a7xx_730 = A7XXProps()
 
diff --git a/src/freedreno/registers/adreno/adreno_pm4.xml b/src/freedreno/registers/adreno/adreno_pm4.xml
index 42311ad6ff0..1b687eed5a7 100644
--- a/src/freedreno/registers/adreno/adreno_pm4.xml
+++ b/src/freedreno/registers/adreno/adreno_pm4.xml
@@ -1942,6 +1942,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords)
 
                <bitfield name="BV" pos="25" variants="THREAD_MODE" type="boolean"/>
                <bitfield name="BR" pos="26" variants="THREAD_MODE" type="boolean"/>
+               <bitfield name="LPAC" pos="27" variants="THREAD_MODE" type="boolean"/>
 
                <bitfield name="MODE" low="28" high="31" type="compare_mode" addvariant="yes"/>
        </reg32>
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 23a1e4ae53f..acbf5218db8 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -1274,6 +1274,15 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
                       A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_4(0x3f0243f0), );
    }
 
+   if (phys_dev->info->a7xx.cmdbuf_start_a725_quirk) {
+      tu_cs_reserve(cs, 3 + 4);
+      tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
+      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
+                     CP_COND_REG_EXEC_0_BR | CP_COND_REG_EXEC_0_LPAC);
+      tu_cs_emit(cs, RENDER_MODE_CP_COND_REG_EXEC_1_DWORDS(4));
+      tu_cs_emit_ib(cs, dev->cmdbuf_start_a725_quirk_entry);
+   }
+
    tu_cs_sanity_check(cs);
 }
 
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 71ed243c170..943bc4bfd82 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -2016,6 +2016,112 @@ tu_init_dbg_reg_stomper(struct tu_device *device)
    device->dbg_renderpass_stomp_cs = rp_cs;
 }
 
+/* It is unknown what this workaround is for and what it fixes. */
+static VkResult
+tu_init_cmdbuf_start_a725_quirk(struct tu_device *device)
+{
+   struct tu_cs *cs;
+
+   if (!(device->cmdbuf_start_a725_quirk_cs =
+            (struct tu_cs *) calloc(1, sizeof(struct tu_cs)))) {
+      return vk_startup_errorf(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY,
+                               "OOM");
+   }
+
+   if (!(device->cmdbuf_start_a725_quirk_entry =
+            (struct tu_cs_entry *) calloc(1, sizeof(struct tu_cs_entry)))) {
+      free(device->cmdbuf_start_a725_quirk_cs);
+      return vk_startup_errorf(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY,
+                               "OOM");
+   }
+
+   cs = device->cmdbuf_start_a725_quirk_cs;
+   tu_cs_init(cs, device, TU_CS_MODE_SUB_STREAM, 57, "a725 workaround cs");
+
+   struct tu_cs shader_cs;
+   tu_cs_begin_sub_stream(cs, 10, &shader_cs);
+
+   uint32_t raw_shader[] = {
+      0x00040000, 0x40600000, // mul.f hr0.x, hr0.x, hr1.x
+      0x00050001, 0x40600001, // mul.f hr0.y, hr0.y, hr1.y
+      0x00060002, 0x40600002, // mul.f hr0.z, hr0.z, hr1.z
+      0x00070003, 0x40600003, // mul.f hr0.w, hr0.w, hr1.w
+      0x00000000, 0x03000000, // end
+   };
+
+   tu_cs_emit_array(&shader_cs, raw_shader, ARRAY_SIZE(raw_shader));
+   struct tu_cs_entry shader_entry = tu_cs_end_sub_stream(cs, &shader_cs);
+   uint64_t shader_iova = shader_entry.bo->iova + shader_entry.offset;
+
+   struct tu_cs sub_cs;
+   tu_cs_begin_sub_stream(cs, 47, &sub_cs);
+
+   tu_cs_emit_regs(&sub_cs, HLSQ_INVALIDATE_CMD(A7XX,
+            .vs_state = true, .hs_state = true, .ds_state = true,
+            .gs_state = true, .fs_state = true, .gfx_ibo = true,
+            .cs_bindless = 0xff, .gfx_bindless = 0xff));
+   tu_cs_emit_regs(&sub_cs, HLSQ_CS_CNTL(A7XX,
+            .constlen = 4,
+            .enabled = true));
+   tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_CONFIG(.enabled = true));
+   tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_CTRL_REG0(
+            .threadmode = MULTI,
+            .threadsize = THREAD128,
+            .mergedregs = true));
+   tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_UNKNOWN_A9B1(.shared_size = 1));
+   tu_cs_emit_regs(&sub_cs, HLSQ_CS_KERNEL_GROUP_X(A7XX, 1),
+                     HLSQ_CS_KERNEL_GROUP_Y(A7XX, 1),
+                     HLSQ_CS_KERNEL_GROUP_Z(A7XX, 1));
+   tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_INSTRLEN(.sp_cs_instrlen = 1));
+   tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_TEX_COUNT(0));
+   tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_IBO_COUNT(0));
+   tu_cs_emit_regs(&sub_cs, A7XX_HLSQ_CS_CNTL_1(
+            .linearlocalidregid = regid(63, 0),
+            .threadsize = THREAD128,
+            .unk11 = true,
+            .unk22 = true,
+            .yalign = CS_YALIGN_1));
+   tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_CNTL_0(
+            .wgidconstid = regid(51, 3),
+            .wgsizeconstid = regid(48, 0),
+            .wgoffsetconstid = regid(63, 0),
+            .localidregid = regid(63, 0)));
+   tu_cs_emit_regs(&sub_cs, SP_CS_CNTL_1(A7XX,
+            .linearlocalidregid = regid(63, 0),
+            .threadsize = THREAD128,
+            .unk15 = true));
+   tu_cs_emit_regs(&sub_cs, A7XX_SP_CS_UNKNOWN_A9BE(0));
+
+   tu_cs_emit_regs(&sub_cs,
+                  HLSQ_CS_NDRANGE_0(A7XX, .kerneldim = 3,
+                                          .localsizex = 255,
+                                          .localsizey = 1,
+                                          .localsizez = 1),
+                  HLSQ_CS_NDRANGE_1(A7XX, .globalsize_x = 3072),
+                  HLSQ_CS_NDRANGE_2(A7XX, .globaloff_x = 0),
+                  HLSQ_CS_NDRANGE_3(A7XX, .globalsize_y = 1),
+                  HLSQ_CS_NDRANGE_4(A7XX, .globaloff_y = 0),
+                  HLSQ_CS_NDRANGE_5(A7XX, .globalsize_z = 1),
+                  HLSQ_CS_NDRANGE_6(A7XX, .globaloff_z = 0));
+   tu_cs_emit_regs(&sub_cs, A7XX_HLSQ_CS_LOCAL_SIZE(
+            .localsizex = 255,
+            .localsizey = 0,
+            .localsizez = 0));
+   tu_cs_emit_pkt4(&sub_cs, REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET, 3);
+   tu_cs_emit(&sub_cs, 0);
+   tu_cs_emit_qw(&sub_cs, shader_iova);
+
+   tu_cs_emit_pkt7(&sub_cs, CP_EXEC_CS, 4);
+   tu_cs_emit(&sub_cs, 0x00000000);
+   tu_cs_emit(&sub_cs, CP_EXEC_CS_1_NGROUPS_X(12));
+   tu_cs_emit(&sub_cs, CP_EXEC_CS_2_NGROUPS_Y(1));
+   tu_cs_emit(&sub_cs, CP_EXEC_CS_3_NGROUPS_Z(1));
+
+   *device->cmdbuf_start_a725_quirk_entry = tu_cs_end_sub_stream(cs, &sub_cs);
+
+   return VK_SUCCESS;
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_CreateDevice(VkPhysicalDevice physicalDevice,
                 const VkDeviceCreateInfo *pCreateInfo,
@@ -2315,6 +2421,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       }
    }
 
+   if (physical_device->info->a7xx.cmdbuf_start_a725_quirk) {
+         result = tu_init_cmdbuf_start_a725_quirk(device);
+         if (result != VK_SUCCESS)
+            goto fail_a725_workaround;
+   }
+
    tu_init_dbg_reg_stomper(device);
 
    /* Initialize a condition variable for timeline semaphore */
@@ -2376,6 +2488,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    return VK_SUCCESS;
 
 fail_timeline_cond:
+   if (device->cmdbuf_start_a725_quirk_entry) {
+      free(device->cmdbuf_start_a725_quirk_entry);
+      tu_cs_finish(device->cmdbuf_start_a725_quirk_cs);
+      free(device->cmdbuf_start_a725_quirk_cs);
+   }
+fail_a725_workaround:
 fail_prepare_perfcntrs_pass_cs:
    free(device->perfcntrs_pass_cs_entries);
    tu_cs_finish(device->perfcntrs_pass_cs);
@@ -2462,6 +2580,12 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
       free(device->dbg_renderpass_stomp_cs);
    }
 
+   if (device->cmdbuf_start_a725_quirk_entry) {
+      free(device->cmdbuf_start_a725_quirk_entry);
+      tu_cs_finish(device->cmdbuf_start_a725_quirk_cs);
+      free(device->cmdbuf_start_a725_quirk_cs);
+   }
+
    tu_autotune_fini(&device->autotune, device);
 
    tu_bo_suballocator_finish(&device->pipeline_suballoc);
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index cfd0b4c78ed..074cdf769cb 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -355,6 +355,9 @@ struct tu_device
    struct tu_cs *perfcntrs_pass_cs;
    struct tu_cs_entry *perfcntrs_pass_cs_entries;
 
+   struct tu_cs *cmdbuf_start_a725_quirk_cs;
+   struct tu_cs_entry *cmdbuf_start_a725_quirk_entry;
+
    struct util_dynarray dynamic_rendering_pending;
    VkCommandPool dynamic_rendering_pool;
    uint32_t dynamic_rendering_fence;

Reply via email to