For the series:

Reviewed-by: Marek Olšák <[email protected]>

Marek

On Sat, Mar 25, 2017 at 1:06 AM, Nicolai Hähnle <[email protected]> wrote:
> From: Nicolai Hähnle <[email protected]>
>
> Only a small tail needs to be uploaded manually.
>
> This is only partly a performance measure (apps are expected to use
> aligned access). Mostly it is preparation for sparse buffers, which the
> old code would incorrectly have attempted to map directly.
> ---
>  src/gallium/drivers/radeonsi/si_cp_dma.c | 46
> +++++++++++++++++++--------------
>  1 file changed, 27 insertions(+), 19 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 0cf7b3b..812fcbc 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -178,87 +178,95 @@ static void si_cp_dma_prepare(struct si_context *sctx,
> struct pipe_resource *dst
>
>  static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource
> *dst,
>  			    uint64_t offset, uint64_t size, unsigned value,
>  			    enum r600_coherency coher)
>  {
>  	struct si_context *sctx = (struct si_context*)ctx;
>  	struct radeon_winsys *ws = sctx->b.ws;
>  	struct r600_resource *rdst = r600_resource(dst);
>  	unsigned tc_l2_flag = get_tc_l2_flag(sctx, coher);
>  	unsigned flush_flags = get_flush_flags(sctx, coher);
> +	uint64_t dma_clear_size;
>  	bool is_first = true;
>
>  	if (!size)
>  		return;
>
> +	dma_clear_size = size & ~3llu;
> +
>  	/* Mark the buffer range of destination as valid (initialized),
>  	 * so that transfer_map knows it should wait for the GPU when mapping
>  	 * that range. */
>  	util_range_add(&rdst->valid_buffer_range, offset,
> -		       offset + size);
> -
> -	/* Fallback for unaligned clears. */
> -	if (size % 4 != 0) {
> -		uint8_t *map = r600_buffer_map_sync_with_rings(&sctx->b, rdst,
> -
> PIPE_TRANSFER_WRITE);
> -		map += offset;
> -		for (uint64_t i = 0; i < size; i++) {
> -			unsigned byte_within_dword = (offset + i) % 4;
> -			*map++ = (value >> (byte_within_dword * 8)) & 0xff;
> -		}
> -		return;
> -	}
> +		       offset + dma_clear_size);
>
>  	/* dma_clear_buffer can use clear_buffer on failure. Make sure that
>  	 * doesn't happen. We don't want an infinite recursion: */
>  	if (sctx->b.dma.cs &&
>  	    (offset % 4 == 0) &&
>  	    /* CP DMA is very slow. Always use SDMA for big clears. This
>  	     * alone improves DeusEx:MD performance by 70%. */
>  	    (size > 128 * 1024 ||
>  	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
>  	      * This happens to move most buffer clears to SDMA, including
>  	      * DCC and CMASK clears, because pipe->clear clears them before
>  	      * si_emit_framebuffer_state (in a draw call) adds them.
>  	      * For example, DeusEx:MD has 21 buffer clears per frame and all
>  	      * of them are moved to SDMA thanks to this. */
>  	     !ws->cs_is_buffer_referenced(sctx->b.gfx.cs, rdst->buf,
>  					  RADEON_USAGE_READWRITE))) {
> -		sctx->b.dma_clear_buffer(ctx, dst, offset, size, value);
> -	} else {
> +		sctx->b.dma_clear_buffer(ctx, dst, offset, dma_clear_size,
> value);
> +
> +		offset += dma_clear_size;
> +		size -= dma_clear_size;
> +	} else if (dma_clear_size >= 4) {
>  		uint64_t va = rdst->gpu_address + offset;
>
> +		offset += dma_clear_size;
> +		size -= dma_clear_size;
> +
>  		/* Flush the caches. */
>  		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>  				 SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>
> -		while (size) {
> -			unsigned byte_count = MIN2(size,
> CP_DMA_MAX_BYTE_COUNT);
> +		while (dma_clear_size) {
> +			unsigned byte_count = MIN2(dma_clear_size,
> CP_DMA_MAX_BYTE_COUNT);
>  			unsigned dma_flags = tc_l2_flag | CP_DMA_CLEAR;
>
> -			si_cp_dma_prepare(sctx, dst, NULL, byte_count, size,
> 0,
> +			si_cp_dma_prepare(sctx, dst, NULL, byte_count,
> dma_clear_size, 0,
>  					  &is_first, &dma_flags);
>
>  			/* Emit the clear packet. */
>  			si_emit_cp_dma(sctx, va, value, byte_count,
> dma_flags, coher);
>
> -			size -= byte_count;
> +			dma_clear_size -= byte_count;
>  			va += byte_count;
>  		}
>
>  		if (tc_l2_flag)
>  			rdst->TC_L2_dirty = true;
>
>  		/* If it's not a framebuffer fast clear... */
>  		if (coher == R600_COHERENCY_SHADER)
>  			sctx->b.num_cp_dma_calls++;
>  	}
> +
> +	if (size) {
> +		/* Handle non-dword alignment.
> +		 *
> +		 * This function is called for embedded texture metadata
> clears,
> +		 * but those should always be properly aligned. */
> +		assert(dst->target == PIPE_BUFFER);
> +		assert(size < 4);
> +
> +		pipe_buffer_write(ctx, dst, offset, size, &value);
> +	}
>  }
>
>  /**
>   * Realign the CP DMA engine. This must be done after a copy with an
> unaligned
>   * size.
>   *
>   * \param size Remaining size to the CP DMA alignment.
>   */
>  static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
>  				     unsigned user_flags, bool *is_first)
> --
> 2.9.3
>
> _______________________________________________
> mesa-dev mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
