Chris Wilson <[email protected]> writes:
> We call i915_gem_reset_prepare_engine() during reset and then upon
> wedging if the reset fails. Unfortunately, kthread_park and similar do
> not support being called recursively and so we must count the number of
> times we prepare for reset and only actually prepare on the outermost
> layer. (Similarly for finish on unwinding the onion.)
>
> [ 87.705581] WARNING: CPU: 2 PID: 1377 at kernel/kthread.c:505
> kthread_park+0x55/0x60
> [ 87.705583] Modules linked in: snd_hda_codec_hdmi snd_hda_codec_realtek
> snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp
> crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_hda_codec
> snd_hwdep snd_hda_core snd_pcm broadcom bcm_phy_lib tg3 mei_me prime_numbers
> mei lpc_ich
> [ 87.705618] CPU: 2 PID: 1377 Comm: gem_eio Tainted: G U
> 4.17.0-rc5-CI-CI_DRM_4177+ #1
> [ 87.705620] Hardware name: Dell Inc. XPS 8300 /0Y2MRG, BIOS A06 10/17/2011
> [ 87.705622] RIP: 0010:kthread_park+0x55/0x60
> [ 87.705624] RSP: 0018:ffffc9000051bac0 EFLAGS: 00010202
> [ 87.705627] RAX: 0000000000000004 RBX: ffff88021ca13de8 RCX:
> 0000000000000001
> [ 87.705629] RDX: 0000000080000001 RSI: ffffffff821228a9 RDI:
> ffff88020e8f0040
> [ 87.705630] RBP: ffff880215937670 R08: 00000000bae32d65 R09:
> 0000000000000000
> [ 87.705632] R10: 0000000000000000 R11: 0000000000000000 R12:
> ffff8802159376b0
> [ 87.705634] R13: ffff880215937670 R14: ffff880215930000 R15:
> ffffffffa01c8d60
> [ 87.705636] FS: 00007f0c32061980(0000) GS:ffff88022fa80000(0000)
> knlGS:0000000000000000
> [ 87.705637] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 87.705639] CR2: 00007f0c32094000 CR3: 000000021a0d4004 CR4:
> 00000000000606e0
> [ 87.705641] Call Trace:
> [ 87.705668] i915_gem_reset_prepare_engine+0x1d/0xa0 [i915]
> [ 87.705694] i915_gem_set_wedged+0x7b/0x1e0 [i915]
> [ 87.705699] ? __drm_printfn_info+0x20/0x20
> [ 87.705722] i915_reset+0x14a/0x290 [i915]
> [ 87.705743] i915_reset_device+0x1fb/0x290 [i915]
> [ 87.705767] ? __intel_get_crtc_scanline+0x1c0/0x1c0 [i915]
> [ 87.705772] ? work_on_cpu_safe+0x50/0x50
> [ 87.705798] i915_handle_error+0x207/0x4a0 [i915]
> [ 87.705810] ? __might_fault+0x39/0x90
> [ 87.705835] i915_wedged_set+0x7f/0xc0 [i915]
> [ 87.705841] simple_attr_write+0xb0/0xd0
> [ 87.705847] full_proxy_write+0x51/0x80
> [ 87.705852] __vfs_write+0x31/0x160
> [ 87.705857] ? rcu_read_lock_sched_held+0x6f/0x80
> [ 87.705860] ? rcu_sync_lockdep_assert+0x29/0x50
> [ 87.705862] ? __sb_start_write+0x152/0x1f0
> [ 87.705864] ? __sb_start_write+0x168/0x1f0
> [ 87.705868] vfs_write+0xbd/0x1a0
> [ 87.705872] ksys_write+0x50/0xc0
> [ 87.705877] do_syscall_64+0x55/0x190
> [ 87.705880] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [ 87.705882] RIP: 0033:0x7f0c315df281
> [ 87.705884] RSP: 002b:00007ffc9c990328 EFLAGS: 00000246 ORIG_RAX:
> 0000000000000001
> [ 87.705887] RAX: ffffffffffffffda RBX: 0000000000000000 RCX:
> 00007f0c315df281
> [ 87.705889] RDX: 0000000000000002 RSI: 000055a5e23ef276 RDI:
> 0000000000000047
> [ 87.705890] RBP: 00007ffc9c990350 R08: 0000000000000000 R09:
> 0000000000000034
> [ 87.705892] R10: 0000000000000000 R11: 0000000000000246 R12:
> 000055a5e23ebc50
> [ 87.705894] R13: 00007ffc9c990dc0 R14: 0000000000000000 R15:
> 0000000000000000
> [ 87.705902] Code: 00 31 ed 48 39 c7 74 0e e8 79 db 00 00 48 8d 7b 18 e8 a0
> 05 88 00 89 e8 5b 5d c3 0f 0b bd da ff ff ff 89 e8 5b 5d c3 0f 0b eb b7 <0f>
> 0b bd f0 ff ff ff eb e2 66 90 41 57 41 56 49 c7 c6 f4 ff ff
>
> References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme()
> completion issue")
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Mika Kuoppala <[email protected]>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 52 ++++++++++++++++++-------
> drivers/gpu/drm/i915/intel_engine_cs.c | 1 +
> drivers/gpu/drm/i915/intel_ringbuffer.h | 3 ++
> 3 files changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 0a2070112b66..b169b630bf78 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2997,14 +2997,11 @@ i915_gem_find_active_request(struct intel_engine_cs
> *engine)
> return active;
> }
>
> -/*
> - * Ensure irq handler finishes, and not run again.
> - * Also return the active request so that we only search for it once.
> - */
> -struct i915_request *
> -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> +static void __i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> {
> - struct i915_request *request = NULL;
> + mutex_lock(&engine->reset_lock);
> + if (engine->reset_depth++)
> + goto unlock;
>
> /*
> * During the reset sequence, we must prevent the engine from
> @@ -3057,6 +3054,38 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs
> *engine)
> if (engine->i915->guc.preempt_wq)
> flush_workqueue(engine->i915->guc.preempt_wq);
>
> +unlock:
> + mutex_unlock(&engine->reset_lock);
> +}
> +
> +static void __i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> +{
> + mutex_lock(&engine->reset_lock);
> +
> + GEM_BUG_ON(engine->reset_depth);
!engine->reset_depth
-Mika
> + if (--engine->reset_depth)
> + goto unlock;
> +
> + tasklet_enable(&engine->execlists.tasklet);
> + kthread_unpark(engine->breadcrumbs.signaler);
> +
> + intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> +
> +unlock:
> + mutex_unlock(&engine->reset_lock);
> +}
> +
> +/*
> + * Ensure irq handler finishes, and not run again.
> + * Also return the active request so that we only search for it once.
> + */
> +struct i915_request *
> +i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> +{
> + struct i915_request *request = NULL;
> +
> + __i915_gem_reset_prepare_engine(engine);
> +
> if (engine->irq_seqno_barrier)
> engine->irq_seqno_barrier(engine);
>
> @@ -3265,10 +3294,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv,
>
> void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> {
> - tasklet_enable(&engine->execlists.tasklet);
> - kthread_unpark(engine->breadcrumbs.signaler);
> -
> - intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> + __i915_gem_reset_finish_engine(engine);
> }
>
> void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> @@ -3332,7 +3358,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
> * for which we haven't set the fence error to EIO yet).
> */
> for_each_engine(engine, i915, id) {
> - i915_gem_reset_prepare_engine(engine);
> + __i915_gem_reset_prepare_engine(engine);
>
> engine->submit_request = nop_submit_request;
> engine->schedule = NULL;
> @@ -3380,7 +3406,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>
> intel_engine_last_submit(engine));
> spin_unlock_irqrestore(&engine->timeline.lock, flags);
>
> - i915_gem_reset_finish_engine(engine);
> + __i915_gem_reset_finish_engine(engine);
> }
>
> GEM_TRACE("end\n");
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c
> b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 6bfd7e3ed152..0de489da514e 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -306,6 +306,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
> /* Nothing to do here, execute in order of dependencies */
> engine->schedule = NULL;
>
> + mutex_init(&engine->reset_lock);
> seqlock_init(&engine->stats.lock);
>
> ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 010750e8ee44..ec2b359f3e8b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -566,6 +566,9 @@ struct intel_engine_cs {
>
> struct intel_engine_hangcheck hangcheck;
>
> + struct mutex reset_lock;
> + unsigned int reset_depth;
> +
> #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
> #define I915_ENGINE_SUPPORTS_STATS BIT(1)
> #define I915_ENGINE_HAS_PREEMPTION BIT(2)
> --
> 2.17.0
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx