On Fri, Jul 21, 2017 at 03:56:05PM -0700, Jason Ekstrand wrote: > On Wed, Jul 19, 2017 at 2:22 PM, Nanley Chery <nanleych...@gmail.com> wrote: > > > Image layouts only let us know that an image *may* be fast-cleared. For > > this reason we can end up with redundant resolves. Testing has shown > > that such resolves can measurably hurt performance and that predicating > > them can avoid the penalty. > > > > v2: > > - Introduce additional resolve state management function (Jason Ekstrand). > > - Enable easy retrieval of fast clear state fields. > > > > Signed-off-by: Nanley Chery <nanley.g.ch...@intel.com> > > --- > > src/intel/vulkan/anv_blorp.c | 3 +- > > src/intel/vulkan/anv_private.h | 13 ++-- > > src/intel/vulkan/genX_cmd_buffer.c | 120 ++++++++++++++++++++++++++++++ > > +++---- > > 3 files changed, 120 insertions(+), 16 deletions(-) > > > > diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c > > index e9b2ccbbdf..ba34cec0bd 100644 > > --- a/src/intel/vulkan/anv_blorp.c > > +++ b/src/intel/vulkan/anv_blorp.c > > @@ -1628,7 +1628,8 @@ anv_ccs_resolve(struct anv_cmd_buffer * const > > cmd_buffer, > > return; > > > > struct blorp_batch batch; > > - blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0); > > + blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, > > + BLORP_BATCH_PREDICATE_ENABLE); > > > > struct blorp_surf surf; > > get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT, > > diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_ > > private.h > > index e7b47ead36..588bf732df 100644 > > --- a/src/intel/vulkan/anv_private.h > > +++ b/src/intel/vulkan/anv_private.h > > @@ -2090,11 +2090,16 @@ anv_fast_clear_state_entry_size(const struct > > anv_device *device) > > { > > assert(device); > > /* Entry contents: > > - * +----------------------+ > > - * | clear value dword(s) | > > - * +----------------------+ > > + * +--------------------------------------------+ > > + * | clear value dword(s) | needs resolve dword | > > + * +--------------------------------------------+ > > */ > > - return device->isl_dev.ss.clear_value_size; > > + > > + /* Ensure that the needs resolve dword is in fact dword-aligned to > > enable > > + * GPU memcpy operations. > > + */ > > + assert(device->isl_dev.ss.clear_value_size % 4 == 0); > > + return device->isl_dev.ss.clear_value_size + 4; > > } > > > > /* Returns true if a HiZ-enabled depth buffer can be sampled from. */ > > diff --git a/src/intel/vulkan/genX_cmd_buffer.c > > b/src/intel/vulkan/genX_cmd_buffer.c > > index 611e77bddb..c4d67fe8c1 100644 > > --- a/src/intel/vulkan/genX_cmd_buffer.c > > +++ b/src/intel/vulkan/genX_cmd_buffer.c > > @@ -407,21 +407,92 @@ transition_depth_buffer(struct anv_cmd_buffer > > *cmd_buffer, > > anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op); > > } > > > > +enum fast_clear_state_field { > > + FAST_CLEAR_STATE_FIELD_CLEAR, > > + FAST_CLEAR_STATE_FIELD_RESOLVE, > > > > Mind calling these CLEAR_COLOR and NEEDS_RESOLVE? Otherwise, it's not > clear what you're fetching. > >
I don't mind. > > +}; > > + > > static inline uint32_t > > -get_fast_clear_state_entry_offset(const struct anv_device *device, > > - const struct anv_image *image, > > - unsigned level) > > +get_fast_clear_state_offset(const struct anv_device *device, > > + const struct anv_image *image, > > + unsigned level, enum fast_clear_state_field > > field) > > { > > assert(device && image); > > assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); > > assert(level < anv_image_aux_levels(image)); > > - const uint32_t offset = image->offset + image->aux_surface.offset + > > - image->aux_surface.isl.size + > > - anv_fast_clear_state_entry_size(device) * > > level; > > + uint32_t offset = image->offset + image->aux_surface.offset + > > + image->aux_surface.isl.size + > > + anv_fast_clear_state_entry_size(device) * level; > > + > > + switch (field) { > > + case FAST_CLEAR_STATE_FIELD_RESOLVE: > > + offset += device->isl_dev.ss.clear_value_size; > > + /* Fall-through */ > > + case FAST_CLEAR_STATE_FIELD_CLEAR: > > + break; > > + } > > + > > assert(offset < image->offset + image->size); > > return offset; > > } > > > > +#define MI_PREDICATE_SRC0 0x2400 > > +#define MI_PREDICATE_SRC1 0x2408 > > + > > +/* Manages the state of an color image subresource to ensure resolves are > > + * performed properly. > > + */ > > +static void > > +genX(set_image_needs_resolve)(struct anv_cmd_buffer *cmd_buffer, > > + const struct anv_image *image, > > + unsigned level, bool needs_resolve) > > +{ > > + assert(cmd_buffer && image); > > + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); > > + assert(level < anv_image_aux_levels(image)); > > + > > + const uint32_t resolve_flag_offset = > > + get_fast_clear_state_offset(cmd_buffer->device, image, level, > > + FAST_CLEAR_STATE_FIELD_RESOLVE); > > + > > + /* The HW docs say that there is no way to guarantee the completion of > > + * the following command. We use it nevertheless because it shows no > > + * issues in testing is currently being used in the GL driver. > > + */ > > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { > > + sdi.Address = (struct anv_address) { image->bo, resolve_flag_offset > > }; > > + sdi.ImmediateData = needs_resolve; > > + } > > +} > > + > > +static void > > +genX(load_needs_resolve_predicate)(struct anv_cmd_buffer *cmd_buffer, > > + const struct anv_image *image, > > + unsigned level) > > +{ > > + assert(cmd_buffer && image); > > + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); > > + assert(level < anv_image_aux_levels(image)); > > + > > + const uint32_t resolve_flag_offset = > > + get_fast_clear_state_offset(cmd_buffer->device, image, level, > > + FAST_CLEAR_STATE_FIELD_RESOLVE); > > + > > + /* Make the pending predicated resolve a no-op if one is not needed. > > + * predicate = do_resolve = resolve_flag != 0; > > + */ > > + emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 , 0); > > + emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0); > > + emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 , 0); > > + emit_lrm(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, > > + image->bo, resolve_flag_offset); > > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { > > + mip.LoadOperation = LOAD_LOADINV; > > + mip.CombineOperation = COMBINE_SET; > > + mip.CompareOperation = COMPARE_SRCS_EQUAL; > > + } > > +} > > > > Thanks for splitting these up. Much easier to read! > > No problem! > > + > > static void > > init_fast_clear_state_entry(struct anv_cmd_buffer *cmd_buffer, > > const struct anv_image *image, > > @@ -431,6 +502,15 @@ init_fast_clear_state_entry(struct anv_cmd_buffer > > *cmd_buffer, > > assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); > > assert(level < anv_image_aux_levels(image)); > > > > + /* The resolve flag should updated to signify that > > fast-clear/compression > > + * data needs to be removed when leaving the undefined layout. Such > > data > > + * may need to be removed if it would cause accesses to the color > > buffer > > + * to return incorrect data. The fast clear data in CCS_D buffers > > should > > + * be removed because CCS_D isn't enabled all the time. > > + */ > > + genX(set_image_needs_resolve)(cmd_buffer, image, level, > > + image->aux_usage == ISL_AUX_USAGE_NONE); > > + > > /* The fast clear value dword(s) will be copied into a surface state > > object. > > * Ensure that the restrictions of the fields in the dword(s) are > > followed. > > * > > @@ -446,7 +526,8 @@ init_fast_clear_state_entry(struct anv_cmd_buffer > > *cmd_buffer, > > for (; i < cmd_buffer->device->isl_dev.ss.clear_value_size; i += 4) { > > anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { > > const uint32_t entry_offset = > > - get_fast_clear_state_entry_offset(cmd_buffer->device, image, > > level); > > + get_fast_clear_state_offset(cmd_buffer->device, image, level, > > + FAST_CLEAR_STATE_FIELD_CLEAR); > > sdi.Address = (struct anv_address) { image->bo, entry_offset + i > > }; > > > > if (GEN_GEN >= 9) { > > @@ -493,7 +574,8 @@ genX(copy_fast_clear_dwords)(struct anv_cmd_buffer > > *cmd_buffer, > > uint32_t ss_clear_offset = surface_state.offset + > > cmd_buffer->device->isl_dev.ss.clear_value_offset; > > uint32_t entry_offset = > > - get_fast_clear_state_entry_offset(cmd_buffer->device, image, > > level); > > + get_fast_clear_state_offset(cmd_buffer->device, image, level, > > + FAST_CLEAR_STATE_FIELD_CLEAR); > > unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; > > > > if (copy_from_surface_state) { > > @@ -670,6 +752,8 @@ transition_color_buffer(struct anv_cmd_buffer > > *cmd_buffer, > > layer_count = MIN2(layer_count, anv_image_aux_layers(image, > > level)); > > } > > > > + genX(load_needs_resolve_predicate)(cmd_buffer, image, level); > > + > > /* Create a surface state with the right clear color and perform the > > * resolve. > > */ > > @@ -701,6 +785,8 @@ transition_color_buffer(struct anv_cmd_buffer > > *cmd_buffer, > > image->aux_usage == ISL_AUX_USAGE_CCS_E ? > > BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL : > > BLORP_FAST_CLEAR_OP_RESOLVE_FULL); > > + > > + genX(set_image_needs_resolve)(cmd_buffer, image, level, false); > > } > > > > cmd_buffer->state.pending_pipe_bits |= > > @@ -2450,9 +2536,6 @@ void genX(CmdDispatch)( > > #define GPGPU_DISPATCHDIMY 0x2504 > > #define GPGPU_DISPATCHDIMZ 0x2508 > > > > -#define MI_PREDICATE_SRC0 0x2400 > > -#define MI_PREDICATE_SRC1 0x2408 > > - > > void genX(CmdDispatchIndirect)( > > VkCommandBuffer commandBuffer, > > VkBuffer _buffer, > > @@ -2859,6 +2942,21 @@ cmd_buffer_subpass_sync_fast_clear_values(struct > > anv_cmd_buffer *cmd_buffer) > > genX(copy_fast_clear_dwords)(cmd_buffer, > > att_state->color_rt_state, > > iview->image, iview->isl.base_level, > > true /* copy from ss */); > > + > > + /* Fast-clears impact whether or not a resolve will be > > necessary. */ > > + if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E && > > + att_state->clear_color_is_zero) { > > + /* This image always has the auxiliary buffer enabled. We can > > mark > > + * the subresource as not needing a resolve because the clear > > color > > + * will match what's in every RENDER_SURFACE_STATE object > > when it's > > + * being used for sampling. > > + */ > > + genX(set_image_needs_resolve)(cmd_buffer, iview->image, > > + iview->isl.base_level, false); > > + } else { > > + genX(set_image_needs_resolve)(cmd_buffer, iview->image, > > + iview->isl.base_level, true); > > + } > > } else if (rp_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { > > /* The attachment may have been fast-cleared in a previous render > > * pass and the value is needed now. Update the surface state(s). > > -- > > 2.13.3 > > > > _______________________________________________ > > mesa-dev mailing list > > mesa-dev@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev