Commit 70a76a9b authored by Chris Wilson
Browse files

drm/i915/gt: Hook up CS_MASTER_ERROR_INTERRUPT



Now that we have offline error capture and can reset an engine from
inside an atomic context while also preserving the GPU state for
post-mortem analysis, it is time to handle error interrupts thrown by
the command parser.

This provides a much, much faster mechanism for us to detect known
problems than using heartbeats/hangchecks, and also provides a mechanism
for when those are disabled. However, it is limited to problems the HW
can detect in the CS and so not a complete solution for detecting lockups.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200128204318.4182039-2-chris@chris-wilson.co.uk
parent 8a574698
Loading
Loading
Loading
Loading
+7 −1
Original line number Diff line number Diff line
@@ -1293,8 +1293,14 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine,
	}

	if (INTEL_GEN(dev_priv) >= 6) {
		drm_printf(m, "\tRING_IMR: %08x\n",
		drm_printf(m, "\tRING_IMR:   0x%08x\n",
			   ENGINE_READ(engine, RING_IMR));
		drm_printf(m, "\tRING_ESR:   0x%08x\n",
			   ENGINE_READ(engine, RING_ESR));
		drm_printf(m, "\tRING_EMR:   0x%08x\n",
			   ENGINE_READ(engine, RING_EMR));
		drm_printf(m, "\tRING_EIR:   0x%08x\n",
			   ENGINE_READ(engine, RING_EIR));
	}

	addr = intel_engine_get_active_head(engine);
+10 −0
Original line number Diff line number Diff line
@@ -156,6 +156,16 @@ struct intel_engine_execlists {
	 */
	struct i915_priolist default_priolist;

	/**
	 * @error_interrupt: CS Master EIR
	 *
	 * The CS generates an interrupt when it detects an error. We capture
	 * the first error interrupt, record the EIR and schedule the tasklet.
	 * In the tasklet, we process the pending CS events to ensure we have
	 * the guilty request, and then reset the engine.
	 */
	u32 error_interrupt;

	/**
	 * @no_priolist: priority lists disabled
	 */
+5 −0
Original line number Diff line number Diff line
@@ -455,6 +455,11 @@ err_rq:
		if (!rq)
			continue;

		if (rq->fence.error) {
			err = -EIO;
			goto out;
		}

		GEM_BUG_ON(!test_bit(CONTEXT_ALLOC_BIT, &rq->context->flags));
		state = rq->context->state;
		if (!state)
+24 −3
Original line number Diff line number Diff line
@@ -24,6 +24,21 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
{
	bool tasklet = false;

	if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
		u32 eir;

		eir = ENGINE_READ(engine, RING_EIR);
		ENGINE_TRACE(engine, "CS error: %x\n", eir);

		/* Disable the error interrupt until after the reset */
		if (likely(eir)) {
			ENGINE_WRITE(engine, RING_EMR, ~0u);
			ENGINE_WRITE(engine, RING_EIR, eir);
			WRITE_ONCE(engine->execlists.error_interrupt, eir);
			tasklet = true;
		}
	}

	if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
		tasklet = true;

@@ -210,7 +225,10 @@ void gen11_gt_irq_reset(struct intel_gt *gt)

void gen11_gt_irq_postinstall(struct intel_gt *gt)
{
	const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
	const u32 irqs =
		GT_CS_MASTER_ERROR_INTERRUPT |
		GT_RENDER_USER_INTERRUPT |
		GT_CONTEXT_SWITCH_INTERRUPT;
	struct intel_uncore *uncore = gt->uncore;
	const u32 dmask = irqs << 16 | irqs;
	const u32 smask = irqs << 16;
@@ -279,7 +297,7 @@ void gen6_gt_irq_handler(struct intel_gt *gt, u32 gt_iir)

	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
		      GT_BSD_CS_ERROR_INTERRUPT |
		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT))
		      GT_CS_MASTER_ERROR_INTERRUPT))
		DRM_DEBUG("Command parser error, gt_iir 0x%08x\n", gt_iir);

	if (gt_iir & GT_PARITY_ERROR(gt->i915))
@@ -345,7 +363,10 @@ void gen8_gt_irq_reset(struct intel_gt *gt)
void gen8_gt_irq_postinstall(struct intel_gt *gt)
{
	/* These are interrupts we'll toggle with the ring mask register */
	const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
	const u32 irqs =
		GT_CS_MASTER_ERROR_INTERRUPT |
		GT_RENDER_USER_INTERRUPT |
		GT_CONTEXT_SWITCH_INTERRUPT;
	const u32 gt_interrupts[] = {
		irqs << GEN8_RCS_IRQ_SHIFT | irqs << GEN8_BCS_IRQ_SHIFT,
		irqs << GEN8_VCS0_IRQ_SHIFT | irqs << GEN8_VCS1_IRQ_SHIFT,
+66 −15
Original line number Diff line number Diff line
@@ -2613,13 +2613,13 @@ static bool execlists_capture(struct intel_engine_cs *engine)
	if (!cap)
		return true;

	spin_lock_irq(&engine->active.lock);
	cap->rq = execlists_active(&engine->execlists);
	GEM_BUG_ON(!cap->rq);

	rcu_read_lock();
	if (cap->rq) {
		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
		cap->rq = i915_request_get_rcu(cap->rq);
	rcu_read_unlock();
	}
	spin_unlock_irq(&engine->active.lock);
	if (!cap->rq)
		goto err_free;

@@ -2658,27 +2658,25 @@ err_free:
	return false;
}

static noinline void preempt_reset(struct intel_engine_cs *engine)
static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
{
	const unsigned int bit = I915_RESET_ENGINE + engine->id;
	unsigned long *lock = &engine->gt->reset.flags;

	if (i915_modparams.reset < 3)
	if (!intel_has_reset_engine(engine->gt))
		return;

	if (test_and_set_bit(bit, lock))
		return;

	ENGINE_TRACE(engine, "reset for %s\n", msg);

	/* Mark this tasklet as disabled to avoid waiting for it to complete */
	tasklet_disable_nosync(&engine->execlists.tasklet);

	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
		     READ_ONCE(engine->props.preempt_timeout_ms),
		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));

	ring_set_paused(engine, 1); /* Freeze the current request in place */
	if (execlists_capture(engine))
		intel_engine_reset(engine, "preemption time out");
		intel_engine_reset(engine, msg);
	else
		ring_set_paused(engine, 0);

@@ -2709,6 +2707,13 @@ static void execlists_submission_tasklet(unsigned long data)
	bool timeout = preempt_timeout(engine);

	process_csb(engine);

	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
		engine->execlists.error_interrupt = 0;
		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
			execlists_reset(engine, "CS error");
	}

	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
		unsigned long flags;

@@ -2717,8 +2722,8 @@ static void execlists_submission_tasklet(unsigned long data)
		spin_unlock_irqrestore(&engine->active.lock, flags);

		/* Recheck after serialising with direct-submission */
		if (timeout && preempt_timeout(engine))
			preempt_reset(engine);
		if (unlikely(timeout && preempt_timeout(engine)))
			execlists_reset(engine, "preemption time out");
	}
}

@@ -3335,6 +3340,49 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
	return ret;
}

/*
 * enable_error_interrupt - reset and re-arm the CS error reporting registers
 * @engine: engine whose RING_EMR/RING_EIR/RING_ESR registers are programmed
 *
 * The sequence below is order sensitive:
 *
 *  1. forget any EIR value previously recorded in
 *     engine->execlists.error_interrupt;
 *  2. write all-ones to RING_EMR so every error source is masked while the
 *     state is being cleaned up;
 *  3. write all-ones to RING_EIR to clear whatever errors are already latched;
 *  4. read RING_ESR and, if it is still non-zero, log the value and call
 *     __intel_gt_reset() for this engine's mask;
 *  5. finally write RING_EMR so that only I915_ERROR_INSTRUCTION is left
 *     unmasked.
 *
 * NOTE(review): any result of __intel_gt_reset() is not examined here and
 * RING_ESR is not re-read after the reset - confirm that is intentional.
 */
static void enable_error_interrupt(struct intel_engine_cs *engine)
{
	u32 status;

	/* Start from a clean slate: no error recorded, all sources masked */
	engine->execlists.error_interrupt = 0;
	ENGINE_WRITE(engine, RING_EMR, ~0u);
	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */

	/* A non-zero ESR after clearing EIR is reported and answered with a reset */
	status = ENGINE_READ(engine, RING_ESR);
	if (unlikely(status)) {
		dev_err(engine->i915->drm.dev,
			"engine '%s' resumed still in error: %08x\n",
			engine->name, status);
		__intel_gt_reset(engine->gt, engine->mask);
	}

	/*
	 * On current gen8+, we have 2 signals to play with
	 *
	 * - I915_ERROR_INSTRUCTION (bit 0)
	 *
	 *    Generate an error if the command parser encounters an invalid
	 *    instruction
	 *
	 *    This is a fatal error.
	 *
	 * - CP_PRIV (bit 2)
	 *
	 *    Generate an error on privilege violation (where the CP replaces
	 *    the instruction with a no-op). This also fires for writes into
	 *    read-only scratch pages.
	 *
	 *    This is a non-fatal error, parsing continues.
	 *
	 * * there are a few others defined for odd HW that we do not use
	 *
	 * Since CP_PRIV fires for cases where we have chosen to ignore the
	 * error (as the HW is validating and suppressing the mistakes), we
	 * only unmask the instruction error bit.
	 */
	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
}

static void enable_execlists(struct intel_engine_cs *engine)
{
	u32 mode;
@@ -3356,6 +3404,8 @@ static void enable_execlists(struct intel_engine_cs *engine)
			i915_ggtt_offset(engine->status_page.vma));
	ENGINE_POSTING_READ(engine, RING_HWS_PGA);

	enable_error_interrupt(engine);

	engine->context_tag = 0;
}

@@ -4282,6 +4332,7 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)

	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
}

static void rcs_submission_override(struct intel_engine_cs *engine)
Loading