Reviewed-by: Marek Olšák <[email protected]>

Marek

On Mon, Nov 6, 2017 at 11:23 AM, Nicolai Hähnle <[email protected]> wrote: > From: Nicolai Hähnle <[email protected]> > > v2: use uncached system memory for the fence, and use the CPU to > clear it so we never read garbage when checking the fence > --- > src/gallium/drivers/radeonsi/si_fence.c | 89 > ++++++++++++++++++++++++++++++++- > 1 file changed, 88 insertions(+), 1 deletion(-) > > diff --git a/src/gallium/drivers/radeonsi/si_fence.c > b/src/gallium/drivers/radeonsi/si_fence.c > index 81007192994..fa80f4fd87a 100644 > --- a/src/gallium/drivers/radeonsi/si_fence.c > +++ b/src/gallium/drivers/radeonsi/si_fence.c > @@ -20,35 +20,44 @@ > * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > THE > * SOFTWARE. > * > */ > > #include <libsync.h> > > #include "util/os_time.h" > #include "util/u_memory.h" > #include "util/u_queue.h" > +#include "util/u_upload_mgr.h" > > #include "si_pipe.h" > +#include "radeon/r600_cs.h" > + > +struct si_fine_fence { > + struct r600_resource *buf; > + unsigned offset; > +}; > > struct si_multi_fence { > struct pipe_reference reference; > struct pipe_fence_handle *gfx; > struct pipe_fence_handle *sdma; > struct tc_unflushed_batch_token *tc_token; > struct util_queue_fence ready; > > /* If the context wasn't flushed at fence creation, this is non-NULL. 
> */ > struct { > struct r600_common_context *ctx; > unsigned ib_index; > } gfx_unflushed; > + > + struct si_fine_fence fine; > }; > > static void si_add_fence_dependency(struct r600_common_context *rctx, > struct pipe_fence_handle *fence) > { > struct radeon_winsys *ws = rctx->ws; > > if (rctx->dma.cs) > ws->cs_add_fence_dependency(rctx->dma.cs, fence); > ws->cs_add_fence_dependency(rctx->gfx.cs, fence); > @@ -59,20 +68,21 @@ static void si_fence_reference(struct pipe_screen *screen, > struct pipe_fence_handle *src) > { > struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws; > struct si_multi_fence **rdst = (struct si_multi_fence **)dst; > struct si_multi_fence *rsrc = (struct si_multi_fence *)src; > > if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { > ws->fence_reference(&(*rdst)->gfx, NULL); > ws->fence_reference(&(*rdst)->sdma, NULL); > tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL); > + r600_resource_reference(&(*rdst)->fine.buf, NULL); > FREE(*rdst); > } > *rdst = rsrc; > } > > static struct si_multi_fence *si_create_multi_fence() > { > struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence); > if (!fence) > return NULL; > @@ -113,20 +123,71 @@ static void si_fence_server_sync(struct pipe_context > *ctx, > * this fence dependency is signalled. > * > * Should we flush the context to allow more GPU parallelism? 
> */ > if (rfence->sdma) > si_add_fence_dependency(rctx, rfence->sdma); > if (rfence->gfx) > si_add_fence_dependency(rctx, rfence->gfx); > } > > +static bool si_fine_fence_signaled(struct radeon_winsys *rws, > + const struct si_fine_fence *fine) > +{ > + char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ | > + > PIPE_TRANSFER_UNSYNCHRONIZED); > + if (!map) > + return false; > + > + uint32_t *fence = (uint32_t*)(map + fine->offset); > + return *fence != 0; > +} > + > +static void si_fine_fence_set(struct si_context *ctx, > + struct si_fine_fence *fine, > + unsigned flags) > +{ > + uint32_t *fence_ptr; > + > + assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | > PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1); > + > + /* Use uncached system memory for the fence. */ > + u_upload_alloc(ctx->b.b.stream_uploader, 0, 4, 4, > + &fine->offset, (struct pipe_resource **)&fine->buf, > (void **)&fence_ptr); > + if (!fine->buf) > + return; > + > + *fence_ptr = 0; > + > + uint64_t fence_va = fine->buf->gpu_address + fine->offset; > + > + radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, fine->buf, > + RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); > + if (flags & PIPE_FLUSH_TOP_OF_PIPE) { > + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; > + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); > + radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) | > + S_370_WR_CONFIRM(1) | > + S_370_ENGINE_SEL(V_370_PFP)); > + radeon_emit(cs, fence_va); > + radeon_emit(cs, fence_va >> 32); > + radeon_emit(cs, 0x80000000); > + } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) { > + si_gfx_write_event_eop(&ctx->b, V_028A90_BOTTOM_OF_PIPE_TS, 0, > + EOP_DATA_SEL_VALUE_32BIT, > + NULL, fence_va, 0x80000000, > + PIPE_QUERY_GPU_FINISHED); > + } else { > + assert(false); > + } > +} > + > static boolean si_fence_finish(struct pipe_screen *screen, > struct pipe_context *ctx, > struct pipe_fence_handle *fence, > uint64_t timeout) > { > struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; > struct 
si_multi_fence *rfence = (struct si_multi_fence *)fence; > struct r600_common_context *rctx; > int64_t abs_timeout = os_time_get_absolute_timeout(timeout); > > @@ -164,20 +225,27 @@ static boolean si_fence_finish(struct pipe_screen > *screen, > /* Recompute the timeout after waiting. */ > if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { > int64_t time = os_time_get_nano(); > timeout = abs_timeout > time ? abs_timeout - time : 0; > } > } > > if (!rfence->gfx) > return true; > > + if (rfence->fine.buf && > + si_fine_fence_signaled(rws, &rfence->fine)) { > + rws->fence_reference(&rfence->gfx, NULL); > + r600_resource_reference(&rfence->fine.buf, NULL); > + return true; > + } > + > /* Flush the gfx IB if it hasn't been flushed yet. */ > if (rctx && > rfence->gfx_unflushed.ctx == rctx && > rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) { > /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile) > * spec says: > * > * "If the sync object being blocked upon will not be > * signaled in finite time (for example, by an associated > * fence command issued previously, but not yet flushed to > @@ -203,21 +271,30 @@ static boolean si_fence_finish(struct pipe_screen > *screen, > if (!timeout) > return false; > > /* Recompute the timeout after all that. */ > if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { > int64_t time = os_time_get_nano(); > timeout = abs_timeout > time ? abs_timeout - time : 0; > } > } > > - return rws->fence_wait(rws, rfence->gfx, timeout); > + if (rws->fence_wait(rws, rfence->gfx, timeout)) > + return true; > + > + /* Re-check in case the GPU is slow or hangs, but the commands before > + * the fine-grained fence have completed. 
*/ > + if (rfence->fine.buf && > + si_fine_fence_signaled(rws, &rfence->fine)) > + return true; > + > + return false; > } > > static void si_create_fence_fd(struct pipe_context *ctx, > struct pipe_fence_handle **pfence, int fd) > { > struct r600_common_screen *rscreen = (struct > r600_common_screen*)ctx->screen; > struct radeon_winsys *ws = rscreen->ws; > struct si_multi_fence *rfence; > > *pfence = NULL; > @@ -286,25 +363,33 @@ static int si_fence_get_fd(struct pipe_screen *screen, > static void si_flush_from_st(struct pipe_context *ctx, > struct pipe_fence_handle **fence, > unsigned flags) > { > struct pipe_screen *screen = ctx->screen; > struct r600_common_context *rctx = (struct r600_common_context *)ctx; > struct radeon_winsys *ws = rctx->ws; > struct pipe_fence_handle *gfx_fence = NULL; > struct pipe_fence_handle *sdma_fence = NULL; > bool deferred_fence = false; > + struct si_fine_fence fine = {}; > unsigned rflags = RADEON_FLUSH_ASYNC; > > if (flags & PIPE_FLUSH_END_OF_FRAME) > rflags |= RADEON_FLUSH_END_OF_FRAME; > > + if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) { > + assert(flags & PIPE_FLUSH_DEFERRED); > + assert(fence); > + > + si_fine_fence_set((struct si_context *)rctx, &fine, flags); > + } > + > /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. > */ > if (rctx->dma.cs) > rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL); > > if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) { > if (fence) > ws->fence_reference(&gfx_fence, rctx->last_gfx_fence); > if (!(flags & PIPE_FLUSH_DEFERRED)) > ws->cs_sync_flush(rctx->gfx.cs); > } else { > @@ -345,20 +430,22 @@ static void si_flush_from_st(struct pipe_context *ctx, > > /* If both fences are NULL, fence_finish will always return > true. 
*/ > multi_fence->gfx = gfx_fence; > multi_fence->sdma = sdma_fence; > > if (deferred_fence) { > multi_fence->gfx_unflushed.ctx = rctx; > multi_fence->gfx_unflushed.ib_index = > rctx->num_gfx_cs_flushes; > } > > + multi_fence->fine = fine; > + > if (flags & TC_FLUSH_ASYNC) { > util_queue_fence_signal(&multi_fence->ready); > > tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL); > } > } > finish: > if (!(flags & PIPE_FLUSH_DEFERRED)) { > if (rctx->dma.cs) > ws->cs_sync_flush(rctx->dma.cs); > ws->cs_sync_flush(rctx->gfx.cs); > -- > 2.11.0 > > _______________________________________________ > mesa-dev mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
