Commit 789659f4 authored by Chris Wilson's avatar Chris Wilson
Browse files

drm/i915: Drop fake breadcrumb irq



Missed breadcrumb detection is defunct due to the tight coupling with
dma_fence signaling and the myriad ways we may signal fences from
everywhere but from an interrupt, i.e. we frequently signal a fence
before we even see its interrupt. This means that even if we miss an
interrupt for a fence, it still is signaled before our breadcrumb
hangcheck fires, so simplify the breadcrumb hangchecking by moving it
into the GPU hangcheck and forgo fake interrupts.

Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190129205230.19056-3-chris@chris-wilson.co.uk
parent 52c0fdb2
Loading
Loading
Loading
Loading
+0 −93
Original line number Diff line number Diff line
@@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
			   intel_engine_last_submit(engine),
			   jiffies_to_msecs(jiffies -
					    engine->hangcheck.action_timestamp));
		seq_printf(m, "\tfake irq active? %s\n",
			   yesno(test_bit(engine->id,
					  &dev_priv->gpu_error.missed_irq_rings)));

		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
			   (long long)engine->hangcheck.acthd,
@@ -3899,94 +3896,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
			i915_wedged_get, i915_wedged_set,
			"%llu\n");

static int
fault_irq_set(struct drm_i915_private *i915,
	      unsigned long *irq,
	      unsigned long val)
{
	int err;

	err = mutex_lock_interruptible(&i915->drm.struct_mutex);
	if (err)
		return err;

	err = i915_gem_wait_for_idle(i915,
				     I915_WAIT_LOCKED |
				     I915_WAIT_INTERRUPTIBLE,
				     MAX_SCHEDULE_TIMEOUT);
	if (err)
		goto err_unlock;

	*irq = val;
	mutex_unlock(&i915->drm.struct_mutex);

	/* Flush idle worker to disarm irq */
	drain_delayed_work(&i915->gt.idle_work);

	return 0;

err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static int
i915_ring_missed_irq_get(void *data, u64 *val)
{
	struct drm_i915_private *dev_priv = data;

	*val = dev_priv->gpu_error.missed_irq_rings;
	return 0;
}

static int
i915_ring_missed_irq_set(void *data, u64 val)
{
	struct drm_i915_private *i915 = data;

	return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
}

DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
			"0x%08llx\n");

static int
i915_ring_test_irq_get(void *data, u64 *val)
{
	struct drm_i915_private *dev_priv = data;

	*val = dev_priv->gpu_error.test_irq_rings;

	return 0;
}

static int
i915_ring_test_irq_set(void *data, u64 val)
{
	struct drm_i915_private *i915 = data;

	/* GuC keeps the user interrupt permanently enabled for submission */
	if (USES_GUC_SUBMISSION(i915))
		return -ENODEV;

	/*
	 * From icl, we can no longer individually mask interrupt generation
	 * from each engine.
	 */
	if (INTEL_GEN(i915) >= 11)
		return -ENODEV;

	val &= INTEL_INFO(i915)->ring_mask;
	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);

	return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
}

DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
			i915_ring_test_irq_get, i915_ring_test_irq_set,
			"0x%08llx\n");

#define DROP_UNBOUND	BIT(0)
#define DROP_BOUND	BIT(1)
#define DROP_RETIRE	BIT(2)
@@ -4750,8 +4659,6 @@ static const struct i915_debugfs_files {
} i915_debugfs_files[] = {
	{"i915_wedged", &i915_wedged_fops},
	{"i915_cache_sharing", &i915_cache_sharing_fops},
	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
	{"i915_gem_drop_caches", &i915_drop_caches_fops},
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
	{"i915_error_state", &i915_error_state_fops},
+0 −2
Original line number Diff line number Diff line
@@ -723,8 +723,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
	err_printf(m, "CCID: 0x%08x\n", error->ccid);
	err_printf(m, "Missed interrupts: 0x%08lx\n",
		   m->i915->gpu_error.missed_irq_rings);

	for (i = 0; i < error->nfence; i++)
		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
+0 −5
Original line number Diff line number Diff line
@@ -204,8 +204,6 @@ struct i915_gpu_error {

	atomic_t pending_fb_pin;

	unsigned long missed_irq_rings;

	/**
	 * State variable controlling the reset flow and count
	 *
@@ -274,9 +272,6 @@ struct i915_gpu_error {
	 */
	wait_queue_head_t reset_queue;

	/* For missed irq/seqno simulation. */
	unsigned long test_irq_rings;

	struct i915_gpu_restart *restart;
};

+3 −144
Original line number Diff line number Diff line
@@ -91,7 +91,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine)

	spin_lock(&b->irq_lock);

	b->irq_fired = true;
	if (b->irq_armed && list_empty(&b->signalers))
		__intel_breadcrumbs_disarm_irq(b);

@@ -172,86 +171,6 @@ static void signal_irq_work(struct irq_work *work)
	intel_engine_breadcrumbs_irq(engine);
}

static unsigned long wait_timeout(void)
{
	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
}

static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
{
	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_debug_printer(__func__);

		intel_engine_dump(engine, &p,
				  "%s missed breadcrumb at %pS\n",
				  engine->name, __builtin_return_address(0));
	}

	set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
}

static void intel_breadcrumbs_hangcheck(struct timer_list *t)
{
	struct intel_engine_cs *engine =
		from_timer(engine, t, breadcrumbs.hangcheck);
	struct intel_breadcrumbs *b = &engine->breadcrumbs;

	if (!b->irq_armed)
		return;

	if (b->irq_fired)
		goto rearm;

	/*
	 * We keep the hangcheck timer alive until we disarm the irq, even
	 * if there are no waiters at present.
	 *
	 * If the waiter was currently running, assume it hasn't had a chance
	 * to process the pending interrupt (e.g, low priority task on a loaded
	 * system) and wait until it sleeps before declaring a missed interrupt.
	 *
	 * If the waiter was asleep (and not even pending a wakeup), then we
	 * must have missed an interrupt as the GPU has stopped advancing
	 * but we still have a waiter. Assuming all batches complete within
	 * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
	 */
	synchronize_hardirq(engine->i915->drm.irq);
	if (intel_engine_signal_breadcrumbs(engine)) {
		missed_breadcrumb(engine);
		mod_timer(&b->fake_irq, jiffies + 1);
	} else {
rearm:
		b->irq_fired = false;
		mod_timer(&b->hangcheck, wait_timeout());
	}
}

static void intel_breadcrumbs_fake_irq(struct timer_list *t)
{
	struct intel_engine_cs *engine =
		from_timer(engine, t, breadcrumbs.fake_irq);
	struct intel_breadcrumbs *b = &engine->breadcrumbs;

	/*
	 * The timer persists in case we cannot enable interrupts,
	 * or if we have previously seen seqno/interrupt incoherency
	 * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
	 * Here the worker will wake up every jiffie in order to kick the
	 * oldest waiter to do the coherent seqno check.
	 */

	if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
		return;

	/* If the user has disabled the fake-irq, restore the hangchecking */
	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
		mod_timer(&b->hangcheck, wait_timeout());
		return;
	}

	mod_timer(&b->fake_irq, jiffies + 1);
}

void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
{
	struct intel_breadcrumbs *b = &engine->breadcrumbs;
@@ -274,43 +193,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine)
	spin_unlock_irq(&b->irq_lock);
}

static bool use_fake_irq(const struct intel_breadcrumbs *b)
{
	const struct intel_engine_cs *engine =
		container_of(b, struct intel_engine_cs, breadcrumbs);

	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
		return false;

	/*
	 * Only start with the heavy weight fake irq timer if we have not
	 * seen any interrupts since enabling it the first time. If the
	 * interrupts are still arriving, it means we made a mistake in our
	 * engine->seqno_barrier(), a timing error that should be transient
	 * and unlikely to reoccur.
	 */
	return !b->irq_fired;
}

static void enable_fake_irq(struct intel_breadcrumbs *b)
{
	/* Ensure we never sleep indefinitely */
	if (!b->irq_enabled || use_fake_irq(b))
		mod_timer(&b->fake_irq, jiffies + 1);
	else
		mod_timer(&b->hangcheck, wait_timeout());
}

static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
{
	struct intel_engine_cs *engine =
		container_of(b, struct intel_engine_cs, breadcrumbs);
	struct drm_i915_private *i915 = engine->i915;
	bool enabled;

	lockdep_assert_held(&b->irq_lock);
	if (b->irq_armed)
		return false;
		return;

	/*
	 * The breadcrumb irq will be disarmed on the interrupt after the
@@ -328,16 +218,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
	 * the driver is idle) we disarm the breadcrumbs.
	 */

	/* No interrupts? Kick the waiter every jiffie! */
	enabled = false;
	if (!b->irq_enabled++ &&
	    !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
	if (!b->irq_enabled++)
		irq_enable(engine);
		enabled = true;
	}

	enable_fake_irq(b);
	return enabled;
}

void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
@@ -348,18 +230,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
	INIT_LIST_HEAD(&b->signalers);

	init_irq_work(&b->irq_work, signal_irq_work);

	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
}

static void cancel_fake_irq(struct intel_engine_cs *engine)
{
	struct intel_breadcrumbs *b = &engine->breadcrumbs;

	del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
	del_timer_sync(&b->hangcheck);
	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
}

void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
@@ -369,13 +239,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)

	spin_lock_irqsave(&b->irq_lock, flags);

	/*
	 * Leave the fake_irq timer enabled (if it is running), but clear the
	 * bit so that it turns itself off on its next wake up and goes back
	 * to the long hangcheck interval if still required.
	 */
	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);

	if (b->irq_enabled)
		irq_enable(engine);
	else
@@ -386,7 +249,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)

void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
{
	cancel_fake_irq(engine);
}

bool i915_request_enable_breadcrumb(struct i915_request *rq)
@@ -482,7 +344,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
		}
	}
	spin_unlock_irq(&b->irq_lock);

	if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
		drm_printf(p, "Fake irq active\n");
}
+2 −0
Original line number Diff line number Diff line
@@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
	for_each_engine(engine, dev_priv, id) {
		struct hangcheck hc;

		intel_engine_signal_breadcrumbs(engine);

		hangcheck_load_sample(engine, &hc);
		hangcheck_accumulate_sample(engine, &hc);
		hangcheck_store_sample(engine, &hc);
Loading