Commit 6b332fa2 authored by Arun Siluvery's avatar Arun Siluvery Committed by Tvrtko Ursulin
Browse files

drm/i915/guc: reset GuC and retry on firmware load failure



Due to timing issues in the HW, some of the status bits required for GuC
authentication occasionally don't get set; when that happens, the GuC
cannot be initialized and we will be left with a wedged GPU. The W/A
suggested is to perform a soft reset of the GuC and attempt to reload
the F/W again for few times before giving up.

As the failure is dependent on timing, tests performed by triggering
manual full gpu reset (i915_wedged) showed that we could sometimes hit
this after several thousand iterations, but sometimes tests ran even
longer without any issues. Reset and reload mechanism proved helpful
when we indeed hit f/w load failure, so it is better to include this
to improve driver stability.

This change implements the following WAs,

	WaEnableuKernelHeaderValidFix:skl,bxt
	WaEnableGuCBootHashCheckNotSet:skl,bxt

Signed-off-by: default avatarArun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: default avatarDave Gordon <david.s.gordon@intel.com>
Reviewed-by: default avatarAlex Dai <yu.dai@intel.com>
Signed-off-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
parent 168cf367
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -2747,6 +2747,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
extern int intel_gpu_reset(struct drm_device *dev, u32 engine_mask);
extern bool intel_has_gpu_reset(struct drm_device *dev);
extern int i915_reset(struct drm_device *dev);
extern int intel_guc_reset(struct drm_i915_private *dev_priv);
extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
+1 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@
/* Definitions of GuC H/W registers, bits, etc */

#define GUC_STATUS			_MMIO(0xc000)
#define   GS_MIA_IN_RESET		(1 << 0)
#define   GS_BOOTROM_SHIFT		1
#define   GS_BOOTROM_MASK		  (0x7F << GS_BOOTROM_SHIFT)
#define   GS_BOOTROM_RSA_FAILED		  (0x50 << GS_BOOTROM_SHIFT)
+1 −0
Original line number Diff line number Diff line
@@ -165,6 +165,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define  GEN6_GRDOM_MEDIA		(1 << 2)
#define  GEN6_GRDOM_BLT			(1 << 3)
#define  GEN6_GRDOM_VECS		(1 << 4)
#define  GEN9_GRDOM_GUC			(1 << 5)
#define  GEN8_GRDOM_MEDIA2		(1 << 7)

#define RING_PP_DIR_BASE(ring)		_MMIO((ring)->mmio_base+0x228)
+47 −2
Original line number Diff line number Diff line
@@ -353,6 +353,24 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
	return ret;
}

static int i915_reset_guc(struct drm_i915_private *dev_priv)
{
	int ret;
	u32 guc_status;

	ret = intel_guc_reset(dev_priv);
	if (ret) {
		DRM_ERROR("GuC reset failed, ret = %d\n", ret);
		return ret;
	}

	guc_status = I915_READ(GUC_STATUS);
	WARN(!(guc_status & GS_MIA_IN_RESET),
	     "GuC status: 0x%x, MIA core expected to be in reset\n", guc_status);

	return ret;
}

/**
 * intel_guc_ucode_load() - load GuC uCode into the device
 * @dev:	drm device
@@ -417,9 +435,36 @@ int intel_guc_ucode_load(struct drm_device *dev)
	if (err)
		goto fail;

	/*
	 * WaEnableuKernelHeaderValidFix:skl,bxt
	 * For BXT, this is only upto B0 but below WA is required for later
	 * steppings also so this is extended as well.
	 */
	/* WaEnableGuCBootHashCheckNotSet:skl,bxt */
	err = guc_ucode_xfer(dev_priv);
	if (err) {
		int retries = 3;

		DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err);

		while (retries--) {
			err = i915_reset_guc(dev_priv);
			if (err)
				break;

			err = guc_ucode_xfer(dev_priv);
			if (!err) {
				DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n");
				break;
			}
			DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries);
		}

		if (err) {
			DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err);
			goto fail;
		}
	}

	guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS;

+19 −0
Original line number Diff line number Diff line
@@ -1673,6 +1673,25 @@ bool intel_has_gpu_reset(struct drm_device *dev)
	return intel_get_gpu_reset(dev) != NULL;
}

int intel_guc_reset(struct drm_i915_private *dev_priv)
{
	int ret;
	unsigned long irqflags;

	if (!i915.enable_guc_submission)
		return -EINVAL;

	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
	spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);

	ret = gen6_hw_domain_reset(dev_priv, GEN9_GRDOM_GUC);

	spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);

	return ret;
}

bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
{
	return check_for_unclaimed_mmio(dev_priv);