On Wed, Aug 14, 2013 at 11:25 PM, Marek Olšák <[email protected]> wrote: > (This should be applied before MSAA, which will need to be rebased.) > > It moves all sampler view descriptors to a buffer. > It supports partial resource updates and it can also unbind resources > (required for FMASK texturing). > > The buffer contains all sampler view descriptors for one shader stage, > represented as an array. On top of that, there are N arrays in the buffer, > which are used to emulate context registers as implemented by the previous > ASICs (each array is a context). > > This uses the RCU synchronization approach to avoid read-after-write hazards > as discussed in the thread: > "radeonsi: add FMASK texture binding slots and resource setup" > > CP DMA is used to clear the descriptors at context initialization and to copy > the descriptors from one context to the next. > > IMPORTANT: > 128 resource contexts are needed, 64 doesn't work. If I set > SH_KCACHE_ACTION_ENA before every draw call, only 2 contexts are needed. > I don't have an explanation for this. 
> --- > src/gallium/drivers/radeonsi/Makefile.sources | 1 + > src/gallium/drivers/radeonsi/r600_blit.c | 12 +- > src/gallium/drivers/radeonsi/r600_hw_context.c | 14 ++ > src/gallium/drivers/radeonsi/radeonsi_pipe.c | 7 +- > src/gallium/drivers/radeonsi/radeonsi_pipe.h | 19 +- > src/gallium/drivers/radeonsi/si_descriptors.c | 335 > +++++++++++++++++++++++++ > src/gallium/drivers/radeonsi/si_state.c | 47 ++-- > src/gallium/drivers/radeonsi/si_state.h | 56 +++++ > src/gallium/drivers/radeonsi/si_state_draw.c | 18 +- > src/gallium/drivers/radeonsi/sid.h | 43 ++++ > 10 files changed, 500 insertions(+), 52 deletions(-) > create mode 100644 src/gallium/drivers/radeonsi/si_descriptors.c > > diff --git a/src/gallium/drivers/radeonsi/Makefile.sources > b/src/gallium/drivers/radeonsi/Makefile.sources > index b3ffa72..68c8282 100644 > --- a/src/gallium/drivers/radeonsi/Makefile.sources > +++ b/src/gallium/drivers/radeonsi/Makefile.sources > @@ -10,6 +10,7 @@ C_SOURCES := \ > r600_translate.c \ > radeonsi_pm4.c \ > radeonsi_compute.c \ > + si_descriptors.c \ > si_state.c \ > si_state_streamout.c \ > si_state_draw.c \ > diff --git a/src/gallium/drivers/radeonsi/r600_blit.c > b/src/gallium/drivers/radeonsi/r600_blit.c > index bab108e..5bd1a62 100644 > --- a/src/gallium/drivers/radeonsi/r600_blit.c > +++ b/src/gallium/drivers/radeonsi/r600_blit.c > @@ -70,12 +70,12 @@ static void r600_blitter_begin(struct pipe_context *ctx, > enum r600_blitter_op op > > if (op & R600_SAVE_TEXTURES) { > util_blitter_save_fragment_sampler_states( > - rctx->blitter, rctx->ps_samplers.n_samplers, > - (void**)rctx->ps_samplers.samplers); > + rctx->blitter, > rctx->samplers[PIPE_SHADER_FRAGMENT].n_samplers, > + > (void**)rctx->samplers[PIPE_SHADER_FRAGMENT].samplers); > > - util_blitter_save_fragment_sampler_views( > - rctx->blitter, rctx->ps_samplers.n_views, > - (struct pipe_sampler_view**)rctx->ps_samplers.views); > + util_blitter_save_fragment_sampler_views(rctx->blitter, > + > 
util_bitcount(rctx->samplers[PIPE_SHADER_FRAGMENT].views.desc.enabled_mask), > + rctx->samplers[PIPE_SHADER_FRAGMENT].views.views); > } > > if ((op & R600_DISABLE_RENDER_COND) && rctx->current_render_cond) { > @@ -224,7 +224,7 @@ void si_flush_depth_textures(struct r600_context *rctx, > struct pipe_sampler_view *view; > struct r600_texture *tex; > > - view = &textures->views[i]->base; > + view = textures->views.views[i]; > if (!view) continue; > > tex = (struct r600_texture *)view->texture; > diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c > b/src/gallium/drivers/radeonsi/r600_hw_context.c > index 25c972b..cf43089 100644 > --- a/src/gallium/drivers/radeonsi/r600_hw_context.c > +++ b/src/gallium/drivers/radeonsi/r600_hw_context.c > @@ -114,9 +114,17 @@ err: > void si_need_cs_space(struct r600_context *ctx, unsigned num_dw, > boolean count_draw_in) > { > + int i; > + > /* The number of dwords we already used in the CS so far. */ > num_dw += ctx->cs->cdw; > > + for (i = 0; i < SI_NUM_ATOMS(ctx); i++) { > + if (ctx->atoms.array[i]->dirty) { > + num_dw += ctx->atoms.array[i]->num_dw; > + } > + } > + > if (count_draw_in) { > /* The number of dwords all the dirty states would take. */ > num_dw += ctx->pm4_dirty_cdwords; > @@ -254,6 +262,10 @@ void si_context_flush(struct r600_context *ctx, unsigned > flags) > ctx->pm4_dirty_cdwords = 0; > ctx->flags = 0; > > + /* The CS initialization should be emitted before everything else. 
*/ > + si_pm4_emit(ctx, ctx->queued.named.init); > + ctx->emitted.named.init = ctx->queued.named.init; > + > #if 0 > if (streamout_suspended) { > ctx->streamout_start = TRUE; > @@ -270,6 +282,8 @@ void si_context_flush(struct r600_context *ctx, unsigned > flags) > * next draw command > */ > si_pm4_reset_emitted(ctx); > + > + si_all_descriptors_begin_new_cs(ctx); > } > > void si_context_emit_fence(struct r600_context *ctx, struct si_resource > *fence_bo, unsigned offset, unsigned value) > diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c > b/src/gallium/drivers/radeonsi/radeonsi_pipe.c > index b4a1ca9..9afc7f2 100644 > --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c > +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c > @@ -178,6 +178,8 @@ static void r600_destroy_context(struct pipe_context > *context) > { > struct r600_context *rctx = (struct r600_context *)context; > > + si_release_all_descriptors(rctx); > + > si_resource_reference(&rctx->border_color_table, NULL); > > if (rctx->dummy_pixel_shader) { > @@ -231,12 +233,15 @@ static struct pipe_context *r600_create_context(struct > pipe_screen *screen, void > rctx->context.create_video_buffer = vl_video_buffer_create; > } > > + rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX, NULL); > + > + si_init_all_descriptors(rctx); > + > switch (rctx->chip_class) { > case SI: > case CIK: > si_init_state_functions(rctx); > LIST_INITHEAD(&rctx->active_query_list); > - rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX, NULL); > rctx->max_db = 8; > si_init_config(rctx); > break; > diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h > b/src/gallium/drivers/radeonsi/radeonsi_pipe.h > index 6fbe653..674c630 100644 > --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h > +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h > @@ -94,11 +94,8 @@ struct si_cs_shader_state { > struct si_pipe_compute *program; > }; > > -/* needed for blitter save */ > -#define NUM_TEX_UNITS 16 > - > struct r600_textures_info { > - struct 
si_pipe_sampler_view *views[NUM_TEX_UNITS]; > + struct si_sampler_views views; > struct si_pipe_sampler_state *samplers[NUM_TEX_UNITS]; > unsigned n_views; > uint32_t depth_texture_mask; /* which textures > are depth */ > @@ -131,6 +128,9 @@ struct r600_constbuf_state > uint32_t dirty_mask; > }; > > +#define SI_NUM_ATOMS(rctx) > (sizeof((rctx)->atoms)/sizeof((rctx)->atoms.array[0])) > +#define SI_NUM_SHADERS (PIPE_SHADER_FRAGMENT+1) > + > struct r600_context { > struct pipe_context context; > struct blitter_context *blitter; > @@ -142,6 +142,14 @@ struct r600_context { > void *custom_dsa_flush_inplace; > struct r600_screen *screen; > struct radeon_winsys *ws; > + > + union { > + struct { > + struct si_atom *sampler_views[SI_NUM_SHADERS]; > + }; > + struct si_atom *array[0]; > + } atoms; > + > struct si_vertex_element *vertex_elements; > struct pipe_framebuffer_state framebuffer; > unsigned pa_sc_line_stipple; > @@ -161,8 +169,7 @@ struct r600_context { > unsigned sprite_coord_enable; > unsigned export_16bpc; > struct r600_constbuf_state constbuf_state[PIPE_SHADER_TYPES]; > - struct r600_textures_info vs_samplers; > - struct r600_textures_info ps_samplers; > + struct r600_textures_info samplers[SI_NUM_SHADERS]; > struct si_resource *border_color_table; > unsigned border_color_offset; > > diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c > b/src/gallium/drivers/radeonsi/si_descriptors.c > new file mode 100644 > index 0000000..98cd789 > --- /dev/null > +++ b/src/gallium/drivers/radeonsi/si_descriptors.c > @@ -0,0 +1,335 @@ > +/* > + * Copyright 2013 Advanced Micro Devices, Inc. 
> + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * on the rights to use, copy, modify, merge, publish, distribute, sub > + * license, and/or sell copies of the Software, and to permit persons to whom > + * the Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, > + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR > + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE > + * USE OR OTHER DEALINGS IN THE SOFTWARE. > + * > + * Authors: > + * Marek Olšák <[email protected]> > + */ > + > +#include "radeonsi_pipe.h" > +#include "radeonsi_resource.h" > +#include "radeonsi_shader.h" > +#include "r600_hw_context_priv.h" > + > +#include "util/u_memory.h" > + > +#define SI_NUM_CONTEXTS 128 > + > +static const uint32_t null_desc[8]; /* zeros */ > + > +/* Emit a CP DMA packet to do a copy from one buffer to another. > + * The size must fit in bits [20:0]. Notes: > + * > + * 1) Set sync to true if you want the 3D engine to wait until CP DMA is > done. > + * > + * 2) Set raw_hazard_wait to true if the source data was used as a > destination > + * in a previous CP DMA packet. It's for preventing a read-after-write > hazard > + * between two CP DMA packets. 
> + */ > +static void si_emit_cp_dma_copy_buffer(struct r600_context *rctx, > + uint64_t dst_va, uint64_t src_va, > + unsigned size, > + bool sync, bool raw_hazard_wait) > +{ > + struct radeon_winsys_cs *cs = rctx->cs; > + uint32_t sync_flag = sync ? PKT3_CP_DMA_CP_SYNC : 0; > + uint32_t raw_wait = raw_hazard_wait ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; > + > + assert(size); > + assert((size & ((1<<21)-1)) == size); > + > + cs->buf[cs->cdw++] = PKT3(PKT3_CP_DMA, 4, 0); > + cs->buf[cs->cdw++] = src_va; /* SRC_ADDR_LO [31:0] > */ > + cs->buf[cs->cdw++] = sync_flag | ((src_va >> 32) & 0xff); /* CP_SYNC > [31] | SRC_ADDR_HI [7:0] */ > + cs->buf[cs->cdw++] = dst_va; /* DST_ADDR_LO [31:0] > */ > + cs->buf[cs->cdw++] = (dst_va >> 32) & 0xff; /* DST_ADDR_HI [7:0] > */ > + cs->buf[cs->cdw++] = size | raw_wait; /* COMMAND [29:22] | > BYTE_COUNT [20:0] */ > +} > + > +/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. > */ > +static void si_emit_cp_dma_clear_buffer(struct r600_context *rctx, > + uint64_t dst_va, unsigned size, > + uint32_t clear_value, > + bool sync, bool raw_hazard_wait) > +{ > + struct radeon_winsys_cs *cs = rctx->cs; > + uint32_t sync_flag = sync ? PKT3_CP_DMA_CP_SYNC : 0; > + uint32_t raw_wait = raw_hazard_wait ? PKT3_CP_DMA_CMD_RAW_WAIT : 0; > + > + assert(size); > + assert((size & ((1<<21)-1)) == size); > + > + cs->buf[cs->cdw++] = PKT3(PKT3_CP_DMA, 4, 0); > + cs->buf[cs->cdw++] = clear_value; /* DATA [31:0] */ > + cs->buf[cs->cdw++] = sync_flag | PKT3_CP_DMA_SRC_SEL(2); /* CP_SYNC > [31] | SRC_SEL[30:29] */ > + cs->buf[cs->cdw++] = dst_va; /* DST_ADDR_LO [31:0] > */ > + cs->buf[cs->cdw++] = (dst_va >> 32) & 0xff; /* DST_ADDR_HI [7:0] > */ > + cs->buf[cs->cdw++] = size | raw_wait; /* COMMAND [29:22] | > BYTE_COUNT [20:0] */
CIK parts use the new DMA_DATA packet. Current Bonaire microcode may still support the old CP_DMA packet, but we shouldn't rely on it. Alex _______________________________________________ mesa-dev mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/mesa-dev
