On Mon, Jan 30, 2017, at 07:02, Dave Airlie wrote: > From: Dave Airlie <airl...@redhat.com> > > This uses the scratch infrastructure to handle the esgs > and gsvs rings. > > (this replaces the old code that did this with patching). > > Signed-off-by: Dave Airlie <airl...@redhat.com> > --- > src/amd/vulkan/radv_cmd_buffer.c | 28 ++++++ > src/amd/vulkan/radv_device.c | 199 > ++++++++++++++++++++++++++++++++++++--- > src/amd/vulkan/radv_private.h | 8 ++ > 3 files changed, 224 insertions(+), 11 deletions(-) > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > b/src/amd/vulkan/radv_cmd_buffer.c > index c351636..b1b125f 100644 > --- a/src/amd/vulkan/radv_cmd_buffer.c > +++ b/src/amd/vulkan/radv_cmd_buffer.c > @@ -1458,6 +1458,8 @@ static void radv_reset_cmd_buffer(struct > radv_cmd_buffer *cmd_buffer) > cmd_buffer->upload.offset = 0; > > cmd_buffer->record_fail = false; > + > + cmd_buffer->ring_offsets_idx = -1; > } > > VkResult radv_ResetCommandBuffer( > @@ -1644,6 +1646,7 @@ VkResult radv_EndCommandBuffer( > > if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) > si_emit_cache_flush(cmd_buffer); > + > if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) || > cmd_buffer->record_fail) > return VK_ERROR_OUT_OF_DEVICE_MEMORY; > @@ -1730,6 +1733,20 @@ void radv_CmdBindPipeline( > radv_dynamic_state_copy(&cmd_buffer->state.dynamic, > &pipeline->dynamic_state, > pipeline->dynamic_state_mask); > + > + if (pipeline->graphics.esgs_ring_size > > cmd_buffer->esgs_ring_size_needed) > + cmd_buffer->esgs_ring_size_needed = > pipeline->graphics.esgs_ring_size; > + if (pipeline->graphics.gsvs_ring_size > > cmd_buffer->gsvs_ring_size_needed) > + cmd_buffer->gsvs_ring_size_needed = > pipeline->graphics.gsvs_ring_size; > + > + if (radv_pipeline_has_gs(pipeline)) { > + struct ac_userdata_info *loc = > radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY, > + > AC_UD_SCRATCH_RING_OFFSETS); > + if (cmd_buffer->ring_offsets_idx == -1) > + 
cmd_buffer->ring_offsets_idx = > loc->sgpr_idx; > + else if (loc->sgpr_idx != -1) > + assert(loc->sgpr_idx != > cmd_buffer->ring_offsets_idx); > + } > break; > default: > assert(!"invalid bind point"); > @@ -1882,6 +1899,17 @@ void radv_CmdExecuteCommands( > primary->compute_scratch_size_needed = > MAX2(primary->compute_scratch_size_needed, > > secondary->compute_scratch_size_needed); > > + if (secondary->esgs_ring_size_needed > > primary->esgs_ring_size_needed) > + primary->esgs_ring_size_needed = > secondary->esgs_ring_size_needed; > + if (secondary->gsvs_ring_size_needed > > primary->gsvs_ring_size_needed) > + primary->gsvs_ring_size_needed = > secondary->gsvs_ring_size_needed; > + > + if (secondary->ring_offsets_idx != -1) { > + if (primary->ring_offsets_idx == -1) > + primary->ring_offsets_idx = > secondary->ring_offsets_idx; > + else > + assert(secondary->ring_offsets_idx == > primary->ring_offsets_idx); > + } > primary->device->ws->cs_execute_secondary(primary->cs, > secondary->cs); > } > > diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c > index af16c89..fd4e7f5 100644 > --- a/src/amd/vulkan/radv_device.c > +++ b/src/amd/vulkan/radv_device.c > @@ -760,6 +760,10 @@ radv_queue_finish(struct radv_queue *queue) > queue->device->ws->buffer_destroy(queue->descriptor_bo); > if (queue->scratch_bo) > queue->device->ws->buffer_destroy(queue->scratch_bo); > + if (queue->esgs_ring_bo) > + queue->device->ws->buffer_destroy(queue->esgs_ring_bo); > + if (queue->gsvs_ring_bo) > + queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); > if (queue->compute_scratch_bo) > queue->device->ws->buffer_destroy(queue->compute_scratch_bo); > } > @@ -1042,24 +1046,118 @@ static void radv_dump_trace(struct radv_device > *device, > fclose(f); > } > > +static void > +fill_geom_rings(struct radv_queue *queue, > + uint32_t *map, > + uint32_t esgs_ring_size, > + struct radeon_winsys_bo *esgs_ring_bo, > + uint32_t gsvs_ring_size, > + struct radeon_winsys_bo 
*gsvs_ring_bo) > +{ > + uint64_t esgs_va, gsvs_va; > + esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo); > + gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo); > + uint32_t *desc = &map[4]; > + > + /* stride 0, num records - size, add tid, swizzle, elsize4, > + index stride 64 */ > + desc[0] = esgs_va; > + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) | > + S_008F04_STRIDE(0) | > + S_008F04_SWIZZLE_ENABLE(true); > + desc[2] = esgs_ring_size; > + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | > + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | > + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | > + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | > + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | > + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | > + S_008F0C_ELEMENT_SIZE(1) | > + S_008F0C_INDEX_STRIDE(3) | > + S_008F0C_ADD_TID_ENABLE(true); > + > + desc += 4; > + /* GS entry for ES->GS ring */ > + /* stride 0, num records - size, elsize0, > + index stride 0 */ > + desc[0] = esgs_va; > + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32)| > + S_008F04_STRIDE(0) | > + S_008F04_SWIZZLE_ENABLE(false); > + desc[2] = esgs_ring_size; > + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | > + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | > + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | > + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | > + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | > + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | > + S_008F0C_ELEMENT_SIZE(0) | > + S_008F0C_INDEX_STRIDE(0) | > + S_008F0C_ADD_TID_ENABLE(false); > + > + desc += 4; > + /* VS entry for GS->VS ring */ > + /* stride 0, num records - size, elsize0, > + index stride 0 */ > + desc[0] = gsvs_va; > + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)| > + S_008F04_STRIDE(0) | > + S_008F04_SWIZZLE_ENABLE(false); > + desc[2] = gsvs_ring_size; > + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | > + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | > + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | > + 
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | > + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | > + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | > + S_008F0C_ELEMENT_SIZE(0) | > + S_008F0C_INDEX_STRIDE(0) | > + S_008F0C_ADD_TID_ENABLE(false); > + desc += 4; > + > + /* stride gsvs_itemsize, num records 64 > + elsize 4, index stride 16 */ > + /* shader will patch stride and desc[2] */ > + desc[0] = gsvs_va; > + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)| > + S_008F04_STRIDE(0) | > + S_008F04_SWIZZLE_ENABLE(true); > + desc[2] = 0; > + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | > + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | > + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | > + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | > + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | > + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | > + S_008F0C_ELEMENT_SIZE(1) | > + S_008F0C_INDEX_STRIDE(1) | > + S_008F0C_ADD_TID_ENABLE(true); > +} > + > static VkResult > radv_get_preamble_cs(struct radv_queue *queue, > uint32_t scratch_size, > uint32_t compute_scratch_size, > + uint32_t esgs_ring_size, > + uint32_t gsvs_ring_size, > struct radeon_winsys_cs **preamble_cs) > { > struct radeon_winsys_bo *scratch_bo = NULL; > struct radeon_winsys_bo *descriptor_bo = NULL; > struct radeon_winsys_bo *compute_scratch_bo = NULL; > + struct radeon_winsys_bo *esgs_ring_bo = NULL; > + struct radeon_winsys_bo *gsvs_ring_bo = NULL; > struct radeon_winsys_cs *cs = NULL; > > - if (!scratch_size && !compute_scratch_size) { > + if (!scratch_size && !compute_scratch_size && !esgs_ring_size && > !gsvs_ring_size) { > *preamble_cs = NULL; > return VK_SUCCESS; > } > > if (scratch_size <= queue->scratch_size && > - compute_scratch_size <= queue->compute_scratch_size) { > + compute_scratch_size <= queue->compute_scratch_size && > + esgs_ring_size <= queue->esgs_ring_size && > + gsvs_ring_size <= queue->gsvs_ring_size) { > *preamble_cs = queue->preamble_cs; > return VK_SUCCESS; > } > @@ -1087,9 
+1185,40 @@ radv_get_preamble_cs(struct radv_queue *queue, > } else > compute_scratch_bo = queue->compute_scratch_bo; > > - if (scratch_bo != queue->scratch_bo) { > + if (esgs_ring_size > queue->esgs_ring_size) { > + esgs_ring_bo = > queue->device->ws->buffer_create(queue->device->ws, > + > esgs_ring_size, > + 4096, > + > RADEON_DOMAIN_VRAM, > + > RADEON_FLAG_NO_CPU_ACCESS); > + if (!esgs_ring_bo) > + goto fail; > + } else > + esgs_ring_bo = queue->esgs_ring_bo; > + > + > + if (gsvs_ring_size > queue->gsvs_ring_size) { > + gsvs_ring_bo = > queue->device->ws->buffer_create(queue->device->ws, > + > gsvs_ring_size, > + 4096, > + > RADEON_DOMAIN_VRAM, > + > RADEON_FLAG_NO_CPU_ACCESS); > + if (!gsvs_ring_bo) > + goto fail; > + } else > + gsvs_ring_bo = queue->gsvs_ring_bo; > + > + if (scratch_bo != queue->scratch_bo || > + esgs_ring_bo != queue->esgs_ring_bo || > + gsvs_ring_bo != queue->gsvs_ring_bo) { > + uint32_t size = 0; > + if (gsvs_ring_bo || esgs_ring_bo) > + size = 80; /* 2 dword + 2 padding + 4 dword * 4 > */ > + else if (scratch_bo) > + size = 8; /* 2 dword */ > + > descriptor_bo = > queue->device->ws->buffer_create(queue->device->ws, > - 8, > + size, > 4096, > > RADEON_DOMAIN_VRAM, > > RADEON_FLAG_CPU_ACCESS); > @@ -1107,22 +1236,49 @@ radv_get_preamble_cs(struct radv_queue *queue, > if (scratch_bo) > queue->device->ws->cs_add_buffer(cs, scratch_bo, 8); > > + if (esgs_ring_bo) > + queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8); > + > + if (gsvs_ring_bo) > + queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8); > + > if (descriptor_bo) > queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8); > > if (descriptor_bo != queue->descriptor_bo) { > - uint64_t scratch_va = > queue->device->ws->buffer_get_va(scratch_bo); > - uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> > 32) | > - S_008F04_SWIZZLE_ENABLE(1); > - > uint32_t *map = > (uint32_t*)queue->device->ws->buffer_map(descriptor_bo); > > - map[0] = scratch_va; > - map[1] = rsrc1; > + if 
(scratch_bo) { > + uint64_t scratch_va = > queue->device->ws->buffer_get_va(scratch_bo); > + uint32_t rsrc1 = > S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | > + S_008F04_SWIZZLE_ENABLE(1); > + map[0] = scratch_va; > + map[1] = rsrc1; > + } > + > + if (esgs_ring_bo || gsvs_ring_bo) > + fill_geom_rings(queue, map, esgs_ring_size, > esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
You need to MAX2 these sizes with the sizes in the queue; otherwise you can end up using a smaller size when the current cmd_buffer needs, e.g., a larger scratch buffer but only small (or no) GS ring buffers. > > queue->device->ws->buffer_unmap(descriptor_bo); > } > > + if (esgs_ring_bo || gsvs_ring_bo) { > + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | > EVENT_INDEX(4)); > + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | > EVENT_INDEX(0)); > + > + if (queue->device->physical_device->rad_info.chip_class > >= CIK) { > + radeon_set_uconfig_reg_seq(cs, > R_030900_VGT_ESGS_RING_SIZE, 2); > + radeon_emit(cs, esgs_ring_size >> 8); > + radeon_emit(cs, gsvs_ring_size >> 8); > + } else { > + radeon_set_config_reg_seq(cs, > R_0088C8_VGT_ESGS_RING_SIZE, 2); > + radeon_emit(cs, esgs_ring_size >> 8); > + radeon_emit(cs, gsvs_ring_size >> 8); > + } > + } > + > if (descriptor_bo) { > uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0, > R_00B130_SPI_SHADER_USER_DATA_VS_0, > @@ -1174,6 +1330,20 @@ radv_get_preamble_cs(struct radv_queue *queue, > queue->compute_scratch_size = compute_scratch_size; > } > > + if (esgs_ring_bo != queue->esgs_ring_bo) { > + if (queue->esgs_ring_bo) > + > queue->device->ws->buffer_destroy(queue->esgs_ring_bo); > + queue->esgs_ring_bo = esgs_ring_bo; > + queue->esgs_ring_size = esgs_ring_size; > + } > + > + if (gsvs_ring_bo != queue->gsvs_ring_bo) { > + if (queue->gsvs_ring_bo) > + > queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); > + queue->gsvs_ring_bo = gsvs_ring_bo; > + queue->gsvs_ring_size = gsvs_ring_size; > + } > + > if (descriptor_bo != queue->descriptor_bo) { > if (queue->descriptor_bo) > queue->device->ws->buffer_destroy(queue->descriptor_bo); > @@ -1192,6 +1362,10 @@ fail: > queue->device->ws->buffer_destroy(scratch_bo); > if (compute_scratch_bo && compute_scratch_bo != > queue->compute_scratch_bo) >
queue->device->ws->buffer_destroy(compute_scratch_bo); > + if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo) > + queue->device->ws->buffer_destroy(esgs_ring_bo); > + if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo) > + queue->device->ws->buffer_destroy(gsvs_ring_bo); > return VK_ERROR_OUT_OF_DEVICE_MEMORY; > } > > @@ -1209,6 +1383,7 @@ VkResult radv_QueueSubmit( > uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX; > uint32_t scratch_size = 0; > uint32_t compute_scratch_size = 0; > + uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; > struct radeon_winsys_cs *preamble_cs = NULL; > VkResult result; > > @@ -1222,10 +1397,12 @@ VkResult radv_QueueSubmit( > scratch_size = MAX2(scratch_size, > cmd_buffer->scratch_size_needed); > compute_scratch_size = MAX2(compute_scratch_size, > > cmd_buffer->compute_scratch_size_needed); > + esgs_ring_size = MAX2(esgs_ring_size, > cmd_buffer->esgs_ring_size_needed); > + gsvs_ring_size = MAX2(gsvs_ring_size, > cmd_buffer->gsvs_ring_size_needed); > } > } > > - result = radv_get_preamble_cs(queue, scratch_size, > compute_scratch_size, &preamble_cs); > + result = radv_get_preamble_cs(queue, scratch_size, > compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs); > if (result != VK_SUCCESS) > return result; > > diff --git a/src/amd/vulkan/radv_private.h > b/src/amd/vulkan/radv_private.h > index 3a0318b..57aa9ea 100644 > --- a/src/amd/vulkan/radv_private.h > +++ b/src/amd/vulkan/radv_private.h > @@ -470,10 +470,14 @@ struct radv_queue { > > uint32_t scratch_size; > uint32_t compute_scratch_size; > + uint32_t esgs_ring_size; > + uint32_t gsvs_ring_size; > > struct radeon_winsys_bo *scratch_bo; > struct radeon_winsys_bo *descriptor_bo; > struct radeon_winsys_bo *compute_scratch_bo; > + struct radeon_winsys_bo *esgs_ring_bo; > + struct radeon_winsys_bo *gsvs_ring_bo; > struct radeon_winsys_cs *preamble_cs; > }; > > @@ -742,6 +746,10 @@ struct radv_cmd_buffer { > > uint32_t scratch_size_needed; > 
uint32_t compute_scratch_size_needed; > + uint32_t esgs_ring_size_needed; > + uint32_t gsvs_ring_size_needed; > + These need to be reset on command buffer reset. With these two comments fixed, patches 1-20 and 22-30 (with 30 moved before 29) are Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl> I still find it strange that patch 21 is needed, and I am running it through CTS now. Bas > + int ring_offsets_idx; /* just used for verification */ > }; > > struct radv_image; > -- > 2.9.3 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev