FYI, this breaks: piglit/bin/bufferstorage-persistent read -auto and a bunch of others.
Marek On Mon, Nov 6, 2017 at 11:23 AM, Nicolai Hähnle <[email protected]> wrote: > From: Nicolai Hähnle <[email protected]> > > The idea is to fix the following interleaving of operations > that can arise from deferred fences: > > Thread 1 / Context 1 Thread 2 / Context 2 > -------------------- -------------------- > f = deferred flush > <------- application-side synchronization -------> > fence_server_sync(f) > ... > flush() > flush() > > We will now stall in fence_server_sync until the flush of context 1 > has completed. > > This scenario was unlikely to occur previously, because applications > seem to be doing > > Thread 1 / Context 1 Thread 2 / Context 2 > -------------------- -------------------- > f = glFenceSync() > glFlush() > <------- application-side synchronization -------> > glWaitSync(f) > > ... and indeed they probably *have* to use this ordering to avoid > deadlocks in the GLX model, where all GL operations conceptually > go through a single connection to the X server. However, it's less > clear whether applications have to do this with other WSI (i.e. EGL). > Besides, even this sequence of GL commands can be translated into > the Gallium-level sequence outlined above when Gallium threading > and asynchronous flushes are used. So it makes sense to be more > robust. > > As a side effect, we no longer busy-wait on submission_in_progress. > > We won't enable asynchronous flushes on radeon, but add a > cs_add_fence_dependency stub anyway to document the potential > issue. > --- > src/gallium/drivers/radeon/radeon_winsys.h | 4 +++- > src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 21 +++++++++++++-------- > src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 9 ++++++--- > src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 19 +++++++++++++++++++ > 4 files changed, 41 insertions(+), 12 deletions(-) > > diff --git a/src/gallium/drivers/radeon/radeon_winsys.h > b/src/gallium/drivers/radeon/radeon_winsys.h > index 2d3f646dc65..e8c486cb7f4 100644 > --- a/src/gallium/drivers/radeon/radeon_winsys.h > +++ b/src/gallium/drivers/radeon/radeon_winsys.h > @@ -536,21 +536,23 @@ struct radeon_winsys { > * \return Negative POSIX error code or 0 for success. > * Asynchronous submissions never return an error. > */ > int (*cs_flush)(struct radeon_winsys_cs *cs, > unsigned flags, > struct pipe_fence_handle **fence); > > /** > * Create a fence before the CS is flushed. > * The user must flush manually to complete the initializaton of the > fence. > - * The fence must not be used before the flush. > + * > + * The fence must not be used for anything except \ref > cs_add_fence_dependency > + * before the flush. > */ > struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs > *cs); > > /** > * Return true if a buffer is referenced by a command stream. > * > * \param cs A command stream. > * \param buf A winsys buffer. > */ > bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs, > diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c > b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c > index 0450ccc3596..0628e547351 100644 > --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c > +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c > @@ -43,21 +43,22 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned > ip_type, > { > struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence); > > fence->reference.count = 1; > fence->ws = ctx->ws; > fence->ctx = ctx; > fence->fence.context = ctx->ctx; > fence->fence.ip_type = ip_type; > fence->fence.ip_instance = ip_instance; > fence->fence.ring = ring; > - fence->submission_in_progress = true; > + util_queue_fence_init(&fence->submitted); > + util_queue_fence_reset(&fence->submitted); > p_atomic_inc(&ctx->refcount); > return (struct pipe_fence_handle *)fence; > } > > static struct pipe_fence_handle * > amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd) > { > struct amdgpu_winsys *ws = amdgpu_winsys(rws); > struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence); > > @@ -74,66 +75,69 @@ amdgpu_fence_import_sync_file(struct radeon_winsys *rws, > int fd) > FREE(fence); > return NULL; > } > > r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd); > if (r) { > amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj); > FREE(fence); > return NULL; > } > + > + util_queue_fence_init(&fence->submitted); > + > return (struct pipe_fence_handle*)fence; > } > > static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws, > struct pipe_fence_handle *pfence) > { > struct amdgpu_winsys *ws = amdgpu_winsys(rws); > struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence; > > if (amdgpu_fence_is_syncobj(fence)) { > int fd, r; > > /* Convert syncobj into sync_file. */ > r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd); > return r ? -1 : fd; > } > > - os_wait_until_zero(&fence->submission_in_progress, PIPE_TIMEOUT_INFINITE); > + util_queue_fence_wait(&fence->submitted); > > /* Convert the amdgpu fence into a fence FD. */ > int fd; > if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence, > AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD, > (uint32_t*)&fd)) > return -1; > > return fd; > } > > static void amdgpu_fence_submitted(struct pipe_fence_handle *fence, > uint64_t seq_no, > uint64_t *user_fence_cpu_address) > { > struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; > > rfence->fence.fence = seq_no; > rfence->user_fence_cpu_address = user_fence_cpu_address; > - rfence->submission_in_progress = false; > + util_queue_fence_signal(&rfence->submitted); > } > > static void amdgpu_fence_signalled(struct pipe_fence_handle *fence) > { > struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; > > rfence->signalled = true; > - rfence->submission_in_progress = false; > + util_queue_fence_signal(&rfence->submitted); > } > > bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, > bool absolute) > { > struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; > uint32_t expired; > int64_t abs_timeout; > uint64_t *user_fence_cpu; > int r; > @@ -157,22 +161,21 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, > uint64_t timeout, > } > > if (absolute) > abs_timeout = timeout; > else > abs_timeout = os_time_get_absolute_timeout(timeout); > > /* The fence might not have a number assigned if its IB is being > * submitted in the other thread right now. Wait until the submission > * is done. */ > - if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress, > - abs_timeout)) > + if (!util_queue_fence_wait_timeout(&rfence->submitted, abs_timeout)) > return false; > > user_fence_cpu = rfence->user_fence_cpu_address; > if (user_fence_cpu) { > if (*user_fence_cpu >= rfence->fence.fence) { > rfence->signalled = true; > return true; > } > > /* No timeout, just query: no need for the ioctl. */ > @@ -1022,20 +1025,22 @@ static bool is_noop_fence_dependency(struct amdgpu_cs > *acs, > return amdgpu_fence_wait((void *)fence, 0, false); > } > > static void amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs *rws, > struct pipe_fence_handle *pfence) > { > struct amdgpu_cs *acs = amdgpu_cs(rws); > struct amdgpu_cs_context *cs = acs->csc; > struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence; > > + util_queue_fence_wait(&fence->submitted); > + > if (is_noop_fence_dependency(acs, fence)) > return; > > unsigned idx = add_fence_dependency_entry(cs); > amdgpu_fence_reference(&cs->fence_dependencies[idx], > (struct pipe_fence_handle*)fence); > } > > static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs, > struct amdgpu_cs_buffer *buffer) > @@ -1297,21 +1302,21 @@ bo_list_error: > > for (unsigned i = 0; i < num_dependencies; i++) { > struct amdgpu_fence *fence = > (struct amdgpu_fence*)cs->fence_dependencies[i]; > > if (amdgpu_fence_is_syncobj(fence)) { > num_syncobj_dependencies++; > continue; > } > > - assert(!fence->submission_in_progress); > + assert(util_queue_fence_is_signalled(&fence->submitted)); > amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[num++]); > } > > chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES; > chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num; > chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk; > num_chunks++; > } > > /* Syncobj dependencies. */ > @@ -1320,21 +1325,21 @@ bo_list_error: > alloca(num_syncobj_dependencies * sizeof(sem_chunk[0])); > unsigned num = 0; > > for (unsigned i = 0; i < num_dependencies; i++) { > struct amdgpu_fence *fence = > (struct amdgpu_fence*)cs->fence_dependencies[i]; > > if (!amdgpu_fence_is_syncobj(fence)) > continue; > > - assert(!fence->submission_in_progress); > + assert(util_queue_fence_is_signalled(&fence->submitted)); > sem_chunk[num++].handle = fence->syncobj; > } > > chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN; > chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num; > chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk; > num_chunks++; > } > > assert(num_chunks <= ARRAY_SIZE(chunks)); > diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h > b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h > index b7bc9a20bbf..fbf44b36610 100644 > --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h > +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h > @@ -137,23 +137,25 @@ struct amdgpu_cs { > struct amdgpu_fence { > struct pipe_reference reference; > /* If ctx == NULL, this fence is syncobj-based. */ > uint32_t syncobj; > > struct amdgpu_winsys *ws; > struct amdgpu_ctx *ctx; /* submission context */ > struct amdgpu_cs_fence fence; > uint64_t *user_fence_cpu_address; > > - /* If the fence is unknown due to an IB still being submitted > - * in the other thread. */ > - volatile int submission_in_progress; /* bool (int for atomicity) */ > + /* If the fence has been submitted. This is unsignalled for deferred > fences > + * (cs->next_fence) and while an IB is still being submitted in the submit > + * thread. */ > + struct util_queue_fence submitted; > + > volatile int signalled; /* bool (int for atomicity) */ > }; > > static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence) > { > return fence->ctx == NULL; > } > > static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx) > { > @@ -171,20 +173,21 @@ static inline void amdgpu_fence_reference(struct > pipe_fence_handle **dst, > struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src; > > if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { > struct amdgpu_fence *fence = *rdst; > > if (amdgpu_fence_is_syncobj(fence)) > amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj); > else > amdgpu_ctx_unref(fence->ctx); > > + util_queue_fence_destroy(&fence->submitted); > FREE(fence); > } > *rdst = rsrc; > } > > int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct > amdgpu_winsys_bo *bo); > > static inline struct amdgpu_ib * > amdgpu_ib(struct radeon_winsys_cs *base) > { > diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c > b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c > index 7220f3a0240..add88f80aae 100644 > --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c > +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c > @@ -779,28 +779,47 @@ radeon_drm_cs_get_next_fence(struct radeon_winsys_cs > *rcs) > } > > fence = radeon_cs_create_fence(rcs); > if (!fence) > return NULL; > > radeon_fence_reference(&cs->next_fence, fence); > return fence; > } > > +static void > +radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs, > + struct pipe_fence_handle *fence) > +{ > + /* TODO: Handle the following unlikely multi-threaded scenario: > + * > + * Thread 1 / Context 1 Thread 2 / Context 2 > + * -------------------- -------------------- > + * f = cs_get_next_fence() > + * cs_add_fence_dependency(f) > + * cs_flush() > + * cs_flush() > + * > + * We currently assume that this does not happen because we don't support > + * asynchronous flushes on Radeon. > + */ > +} > + > void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws) > { > ws->base.ctx_create = radeon_drm_ctx_create; > ws->base.ctx_destroy = radeon_drm_ctx_destroy; > ws->base.cs_create = radeon_drm_cs_create; > ws->base.cs_destroy = radeon_drm_cs_destroy; > ws->base.cs_add_buffer = radeon_drm_cs_add_buffer; > ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer; > ws->base.cs_validate = radeon_drm_cs_validate; > ws->base.cs_check_space = radeon_drm_cs_check_space; > ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list; > ws->base.cs_flush = radeon_drm_cs_flush; > ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence; > ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced; > ws->base.cs_sync_flush = radeon_drm_cs_sync_flush; > + ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency; > ws->base.fence_wait = radeon_fence_wait; > ws->base.fence_reference = radeon_fence_reference; > } > -- > 2.11.0 > > _______________________________________________ > mesa-dev mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
