Module: Mesa Branch: main Commit: 3cd6bb3e5d727596432622fc56c1efc7e3d521fb URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=3cd6bb3e5d727596432622fc56c1efc7e3d521fb
Author: Danylo Piliaiev <[email protected]> Date: Fri Nov 10 18:25:01 2023 +0100 tu: Add a725 workaround dispatch at the start of each cmdbuf Blob executes a special compute dispatch at the start of each command buffer. We copy this dispatch as is. At this point we don't know what this workaround is for. Signed-off-by: Danylo Piliaiev <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25888> --- src/freedreno/common/freedreno_dev_info.h | 5 ++ src/freedreno/common/freedreno_devices.py | 4 +- src/freedreno/registers/adreno/adreno_pm4.xml | 1 + src/freedreno/vulkan/tu_cmd_buffer.cc | 9 ++ src/freedreno/vulkan/tu_device.cc | 124 ++++++++++++++++++++++++++ src/freedreno/vulkan/tu_device.h | 3 + 6 files changed, 145 insertions(+), 1 deletion(-) diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index 6726ca43f0a..c61274f8cba 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -201,6 +201,11 @@ struct fd_dev_info { /* Whether there is CP_EVENT_WRITE7::WRITE_SAMPLE_COUNT */ bool has_event_write_sample_count; + + /* Blob executes a special compute dispatch at the start of each + * command buffers. We copy this dispatch as is. 
+ */ + bool cmdbuf_start_a725_quirk; } a7xx; }; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index 3ec46c1c32b..2bb2ad6f029 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -704,7 +704,9 @@ add_gpus([ ) )) -a7xx_725 = A7XXProps() +a7xx_725 = A7XXProps( + cmdbuf_start_a725_quirk = True, +) a7xx_730 = A7XXProps() diff --git a/src/freedreno/registers/adreno/adreno_pm4.xml b/src/freedreno/registers/adreno/adreno_pm4.xml index 42311ad6ff0..1b687eed5a7 100644 --- a/src/freedreno/registers/adreno/adreno_pm4.xml +++ b/src/freedreno/registers/adreno/adreno_pm4.xml @@ -1942,6 +1942,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) <bitfield name="BV" pos="25" variants="THREAD_MODE" type="boolean"/> <bitfield name="BR" pos="26" variants="THREAD_MODE" type="boolean"/> + <bitfield name="LPAC" pos="27" variants="THREAD_MODE" type="boolean"/> <bitfield name="MODE" low="28" high="31" type="compare_mode" addvariant="yes"/> </reg32> diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 23a1e4ae53f..acbf5218db8 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -1274,6 +1274,15 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_4(0x3f0243f0), ); } + if (phys_dev->info->a7xx.cmdbuf_start_a725_quirk) { + tu_cs_reserve(cs, 3 + 4); + tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); + tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) | + CP_COND_REG_EXEC_0_BR | CP_COND_REG_EXEC_0_LPAC); + tu_cs_emit(cs, RENDER_MODE_CP_COND_REG_EXEC_1_DWORDS(4)); + tu_cs_emit_ib(cs, dev->cmdbuf_start_a725_quirk_entry); + } + tu_cs_sanity_check(cs); } diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 71ed243c170..943bc4bfd82 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -2016,6 +2016,112 @@ 
tu_init_dbg_reg_stomper(struct tu_device *device) device->dbg_renderpass_stomp_cs = rp_cs; } +/* It is unknown what this workaround is for and what it fixes. */ +static VkResult +tu_init_cmdbuf_start_a725_quirk(struct tu_device *device) +{ + struct tu_cs *cs; + + if (!(device->cmdbuf_start_a725_quirk_cs = + (struct tu_cs *) calloc(1, sizeof(struct tu_cs)))) { + return vk_startup_errorf(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY, + "OOM"); + } + + if (!(device->cmdbuf_start_a725_quirk_entry = + (struct tu_cs_entry *) calloc(1, sizeof(struct tu_cs_entry)))) { + free(device->cmdbuf_start_a725_quirk_cs); + return vk_startup_errorf(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY, + "OOM"); + } + + cs = device->cmdbuf_start_a725_quirk_cs; + tu_cs_init(cs, device, TU_CS_MODE_SUB_STREAM, 57, "a725 workaround cs"); + + struct tu_cs shader_cs; + tu_cs_begin_sub_stream(cs, 10, &shader_cs); + + uint32_t raw_shader[] = { + 0x00040000, 0x40600000, // mul.f hr0.x, hr0.x, hr1.x + 0x00050001, 0x40600001, // mul.f hr0.y, hr0.y, hr1.y + 0x00060002, 0x40600002, // mul.f hr0.z, hr0.z, hr1.z + 0x00070003, 0x40600003, // mul.f hr0.w, hr0.w, hr1.w + 0x00000000, 0x03000000, // end + }; + + tu_cs_emit_array(&shader_cs, raw_shader, ARRAY_SIZE(raw_shader)); + struct tu_cs_entry shader_entry = tu_cs_end_sub_stream(cs, &shader_cs); + uint64_t shader_iova = shader_entry.bo->iova + shader_entry.offset; + + struct tu_cs sub_cs; + tu_cs_begin_sub_stream(cs, 47, &sub_cs); + + tu_cs_emit_regs(&sub_cs, HLSQ_INVALIDATE_CMD(A7XX, + .vs_state = true, .hs_state = true, .ds_state = true, + .gs_state = true, .fs_state = true, .gfx_ibo = true, + .cs_bindless = 0xff, .gfx_bindless = 0xff)); + tu_cs_emit_regs(&sub_cs, HLSQ_CS_CNTL(A7XX, + .constlen = 4, + .enabled = true)); + tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_CONFIG(.enabled = true)); + tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_CTRL_REG0( + .threadmode = MULTI, + .threadsize = THREAD128, + .mergedregs = true)); + tu_cs_emit_regs(&sub_cs, 
A6XX_SP_CS_UNKNOWN_A9B1(.shared_size = 1)); + tu_cs_emit_regs(&sub_cs, HLSQ_CS_KERNEL_GROUP_X(A7XX, 1), + HLSQ_CS_KERNEL_GROUP_Y(A7XX, 1), + HLSQ_CS_KERNEL_GROUP_Z(A7XX, 1)); + tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_INSTRLEN(.sp_cs_instrlen = 1)); + tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_TEX_COUNT(0)); + tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_IBO_COUNT(0)); + tu_cs_emit_regs(&sub_cs, A7XX_HLSQ_CS_CNTL_1( + .linearlocalidregid = regid(63, 0), + .threadsize = THREAD128, + .unk11 = true, + .unk22 = true, + .yalign = CS_YALIGN_1)); + tu_cs_emit_regs(&sub_cs, A6XX_SP_CS_CNTL_0( + .wgidconstid = regid(51, 3), + .wgsizeconstid = regid(48, 0), + .wgoffsetconstid = regid(63, 0), + .localidregid = regid(63, 0))); + tu_cs_emit_regs(&sub_cs, SP_CS_CNTL_1(A7XX, + .linearlocalidregid = regid(63, 0), + .threadsize = THREAD128, + .unk15 = true)); + tu_cs_emit_regs(&sub_cs, A7XX_SP_CS_UNKNOWN_A9BE(0)); + + tu_cs_emit_regs(&sub_cs, + HLSQ_CS_NDRANGE_0(A7XX, .kerneldim = 3, + .localsizex = 255, + .localsizey = 1, + .localsizez = 1), + HLSQ_CS_NDRANGE_1(A7XX, .globalsize_x = 3072), + HLSQ_CS_NDRANGE_2(A7XX, .globaloff_x = 0), + HLSQ_CS_NDRANGE_3(A7XX, .globalsize_y = 1), + HLSQ_CS_NDRANGE_4(A7XX, .globaloff_y = 0), + HLSQ_CS_NDRANGE_5(A7XX, .globalsize_z = 1), + HLSQ_CS_NDRANGE_6(A7XX, .globaloff_z = 0)); + tu_cs_emit_regs(&sub_cs, A7XX_HLSQ_CS_LOCAL_SIZE( + .localsizex = 255, + .localsizey = 0, + .localsizez = 0)); + tu_cs_emit_pkt4(&sub_cs, REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET, 3); + tu_cs_emit(&sub_cs, 0); + tu_cs_emit_qw(&sub_cs, shader_iova); + + tu_cs_emit_pkt7(&sub_cs, CP_EXEC_CS, 4); + tu_cs_emit(&sub_cs, 0x00000000); + tu_cs_emit(&sub_cs, CP_EXEC_CS_1_NGROUPS_X(12)); + tu_cs_emit(&sub_cs, CP_EXEC_CS_2_NGROUPS_Y(1)); + tu_cs_emit(&sub_cs, CP_EXEC_CS_3_NGROUPS_Z(1)); + + *device->cmdbuf_start_a725_quirk_entry = tu_cs_end_sub_stream(cs, &sub_cs); + + return VK_SUCCESS; +} + VKAPI_ATTR VkResult VKAPI_CALL tu_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo 
*pCreateInfo, @@ -2315,6 +2421,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, } } + if (physical_device->info->a7xx.cmdbuf_start_a725_quirk) { + result = tu_init_cmdbuf_start_a725_quirk(device); + if (result != VK_SUCCESS) + goto fail_a725_workaround; + } + tu_init_dbg_reg_stomper(device); /* Initialize a condition variable for timeline semaphore */ @@ -2376,6 +2488,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, return VK_SUCCESS; fail_timeline_cond: + if (device->cmdbuf_start_a725_quirk_entry) { + free(device->cmdbuf_start_a725_quirk_entry); + tu_cs_finish(device->cmdbuf_start_a725_quirk_cs); + free(device->cmdbuf_start_a725_quirk_cs); + } +fail_a725_workaround: fail_prepare_perfcntrs_pass_cs: free(device->perfcntrs_pass_cs_entries); tu_cs_finish(device->perfcntrs_pass_cs); @@ -2462,6 +2580,12 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) free(device->dbg_renderpass_stomp_cs); } + if (device->cmdbuf_start_a725_quirk_entry) { + free(device->cmdbuf_start_a725_quirk_entry); + tu_cs_finish(device->cmdbuf_start_a725_quirk_cs); + free(device->cmdbuf_start_a725_quirk_cs); + } + tu_autotune_fini(&device->autotune, device); tu_bo_suballocator_finish(&device->pipeline_suballoc); diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index cfd0b4c78ed..074cdf769cb 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -355,6 +355,9 @@ struct tu_device struct tu_cs *perfcntrs_pass_cs; struct tu_cs_entry *perfcntrs_pass_cs_entries; + struct tu_cs *cmdbuf_start_a725_quirk_cs; + struct tu_cs_entry *cmdbuf_start_a725_quirk_entry; + struct util_dynarray dynamic_rendering_pending; VkCommandPool dynamic_rendering_pool; uint32_t dynamic_rendering_fence;
