On Thu, Oct 21, 2021 at 04:40:44PM -0700, [email protected] wrote:
> From: John Harrison <[email protected]>
> 
> When GuC submission is enabled, GuC itself manages hang detection and
> recovery. Therefore, any test that relies on being able to trigger an
> engine reset in the driver will fail. Full GT resets can still be
> triggered by the driver. However, in that situation detecting the
> specific context that caused a hang is not possible as the driver has
> no information about what is actually running on the hardware at any
> given time. Plus of course, there was no context that caused the hang
> because the hang was triggered manually, so it's basically a bogus
> mechanism in the first place!
> 
> Update the capture test to cause a reset via a the hangcheck mechanism
> by submitting a hanging batch and waiting. That way it is guaranteed to
> be testing the correct reset code paths for the current platform,
> whether that is GuC enabled or not.
> 
> Signed-off-by: John Harrison <[email protected]>

Reviewed-by: Matthew Brost <[email protected]>

> ---
>  tests/i915/gem_exec_capture.c | 65 ++++++++++++++++++++++++++++-------
>  1 file changed, 53 insertions(+), 12 deletions(-)
> 
> diff --git a/tests/i915/gem_exec_capture.c b/tests/i915/gem_exec_capture.c
> index 8997125ee..dda6e6a8f 100644
> --- a/tests/i915/gem_exec_capture.c
> +++ b/tests/i915/gem_exec_capture.c
> @@ -23,6 +23,7 @@
>  
>  #include <sys/poll.h>
>  #include <zlib.h>
> +#include <sched.h>
>  
>  #include "i915/gem.h"
>  #include "i915/gem_create.h"
> @@ -31,6 +32,8 @@
>  #include "igt_rand.h"
>  #include "igt_sysfs.h"
>  
> +#define MAX_RESET_TIME       600
> +
>  IGT_TEST_DESCRIPTION("Check that we capture the user specified objects on a 
> hang");
>  
>  struct offset {
> @@ -213,7 +216,29 @@ static void configure_hangs(int fd, const struct 
> intel_execution_engine2 *e, int
>       gem_engine_property_printf(fd, e->name, "heartbeat_interval_ms", "%d", 
> 500);
>  
>       /* Allow engine based resets and disable banning */
> -     igt_allow_hang(fd, ctxt_id, HANG_ALLOW_CAPTURE);
> +     igt_allow_hang(fd, ctxt_id, HANG_ALLOW_CAPTURE | 
> HANG_WANT_ENGINE_RESET);
> +}
> +
> +static bool fence_busy(int fence)
> +{
> +     return poll(&(struct pollfd){fence, POLLIN}, 1, 0) == 0;
> +}
> +
> +static void wait_to_die(int fence_out)
> +{
> +     struct timeval before, after, delta;
> +
> +     /* Wait for a reset to occur */
> +     gettimeofday(&before, NULL);
> +     while (fence_busy(fence_out)) {
> +             gettimeofday(&after, NULL);
> +             timersub(&after, &before, &delta);
> +             igt_assert(delta.tv_sec < MAX_RESET_TIME);
> +             sched_yield();
> +     }
> +     gettimeofday(&after, NULL);
> +     timersub(&after, &before, &delta);
> +     igt_info("Target died after %ld.%06lds\n", delta.tv_sec, delta.tv_usec);
>  }
>  
>  static void __capture1(int fd, int dir, uint64_t ahnd, const intel_ctx_t 
> *ctx,
> @@ -230,7 +255,7 @@ static void __capture1(int fd, int dir, uint64_t ahnd, 
> const intel_ctx_t *ctx,
>       struct drm_i915_gem_execbuffer2 execbuf;
>       uint32_t *batch, *seqno;
>       struct offset offset;
> -     int i;
> +     int i, fence_out;
>  
>       configure_hangs(fd, e, ctx->id);
>  
> @@ -315,18 +340,25 @@ static void __capture1(int fd, int dir, uint64_t ahnd, 
> const intel_ctx_t *ctx,
>       execbuf.flags = e->flags;
>       if (gen > 3 && gen < 6)
>               execbuf.flags |= I915_EXEC_SECURE;
> +     execbuf.flags |= I915_EXEC_FENCE_OUT;
>       execbuf.rsvd1 = ctx->id;
> +     execbuf.rsvd2 = ~0UL;
>  
>       igt_assert(!READ_ONCE(*seqno));
> -     gem_execbuf(fd, &execbuf);
> +     gem_execbuf_wr(fd, &execbuf);
> +
> +     fence_out = execbuf.rsvd2 >> 32;
> +     igt_assert(fence_out >= 0);
>  
>       /* Wait for the request to start */
>       while (READ_ONCE(*seqno) != 0xc0ffee)
>               igt_assert(gem_bo_busy(fd, obj[SCRATCH].handle));
>       munmap(seqno, 4096);
>  
> +     /* Wait for a reset to occur */
> +     wait_to_die(fence_out);
> +
>       /* Check that only the buffer we marked is reported in the error */
> -     igt_force_gpu_reset(fd);
>       memset(&offset, 0, sizeof(offset));
>       offset.addr = obj[CAPTURE].offset;
>       igt_assert_eq(check_error_state(dir, &offset, 1, target_size, false), 
> 1);
> @@ -373,7 +405,8 @@ static int cmp(const void *A, const void *B)
>  static struct offset *
>  __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
>          const struct intel_execution_engine2 *e,
> -        unsigned int size, int count, unsigned int flags)
> +        unsigned int size, int count,
> +        unsigned int flags, int *_fence_out)
>  #define INCREMENTAL 0x1
>  #define ASYNC 0x2
>  {
> @@ -383,7 +416,7 @@ __captureN(int fd, int dir, uint64_t ahnd, const 
> intel_ctx_t *ctx,
>       struct drm_i915_gem_execbuffer2 execbuf;
>       uint32_t *batch, *seqno;
>       struct offset *offsets;
> -     int i;
> +     int i, fence_out;
>  
>       configure_hangs(fd, e, ctx->id);
>  
> @@ -491,10 +524,17 @@ __captureN(int fd, int dir, uint64_t ahnd, const 
> intel_ctx_t *ctx,
>       execbuf.flags = e->flags;
>       if (gen > 3 && gen < 6)
>               execbuf.flags |= I915_EXEC_SECURE;
> +     execbuf.flags |= I915_EXEC_FENCE_OUT;
>       execbuf.rsvd1 = ctx->id;
> +     execbuf.rsvd2 = ~0UL;
>  
>       igt_assert(!READ_ONCE(*seqno));
> -     gem_execbuf(fd, &execbuf);
> +     gem_execbuf_wr(fd, &execbuf);
> +
> +     fence_out = execbuf.rsvd2 >> 32;
> +     igt_assert(fence_out >= 0);
> +     if (_fence_out)
> +             *_fence_out = fence_out;
>  
>       /* Wait for the request to start */
>       while (READ_ONCE(*seqno) != 0xc0ffee)
> @@ -502,7 +542,7 @@ __captureN(int fd, int dir, uint64_t ahnd, const 
> intel_ctx_t *ctx,
>       munmap(seqno, 4096);
>  
>       if (!(flags & ASYNC)) {
> -             igt_force_gpu_reset(fd);
> +             wait_to_die(fence_out);
>               gem_sync(fd, obj[count + 1].handle);
>       }
>  
> @@ -549,7 +589,7 @@ static void many(int fd, int dir, uint64_t size, unsigned 
> int flags)
>       intel_require_memory(count, size, CHECK_RAM);
>       ahnd = get_reloc_ahnd(fd, ctx->id);
>  
> -     offsets = __captureN(fd, dir, ahnd, ctx, e, size, count, flags);
> +     offsets = __captureN(fd, dir, ahnd, ctx, e, size, count, flags, NULL);
>  
>       blobs = check_error_state(dir, offsets, count, size, !!(flags & 
> INCREMENTAL));
>       igt_info("Captured %lu %"PRId64"-blobs out of a total of %lu\n",
> @@ -602,6 +642,7 @@ static void prioinv(int fd, int dir, const intel_ctx_t 
> *ctx,
>       igt_assert(pipe(link) == 0);
>       igt_fork(child, 1) {
>               const intel_ctx_t *ctx2;
> +             int fence_out;
>               fd = gem_reopen_driver(fd);
>               igt_debug("Submitting large capture [%ld x %dMiB objects]\n",
>                         count, (int)(size >> 20));
> @@ -613,11 +654,11 @@ static void prioinv(int fd, int dir, const intel_ctx_t 
> *ctx,
>               /* Reopen the allocator in the new process. */
>               ahnd = get_reloc_ahnd(fd, ctx2->id);
>  
> -             free(__captureN(fd, dir, ahnd, ctx2, e, size, count, ASYNC));
> +             free(__captureN(fd, dir, ahnd, ctx2, e, size, count, ASYNC, 
> &fence_out));
>               put_ahnd(ahnd);
>  
>               write(link[1], &fd, sizeof(fd)); /* wake the parent up */
> -             igt_force_gpu_reset(fd);
> +             wait_to_die(fence_out);
>               write(link[1], &fd, sizeof(fd)); /* wake the parent up */
>       }
>       read(link[0], &dummy, sizeof(dummy));
> @@ -714,7 +755,7 @@ igt_main
>               gem_require_mmap_wc(fd);
>               igt_require(has_capture(fd));
>               ctx = intel_ctx_create_all_physical(fd);
> -             igt_allow_hang(fd, ctx->id, HANG_ALLOW_CAPTURE);
> +             igt_allow_hang(fd, ctx->id, HANG_ALLOW_CAPTURE | 
> HANG_WANT_ENGINE_RESET);
>  
>               dir = igt_sysfs_open(fd);
>               igt_require(igt_sysfs_set(dir, "error", "Begone!"));
> -- 
> 2.25.1
> 

Reply via email to