Commit b82e65a9 authored by Guchun Chen's avatar Guchun Chen Committed by Alex Deucher
Browse files

drm/amdgpu: break driver init process when it's bad GPU(v5)



When retrieving bad gpu tag from eeprom, GPU init should
fail as the GPU needs to be retired for further check.

v2: Fix spelling typo, correct the condition to detect
    bad gpu tag and refine error message.

v3: Refine function argument name.

v4: Fix missing check of returning value of i2c
    initialization error case.

v5: Use dev_err to print PCI information in dmesg instead
    of DRM_ERROR.

Signed-off-by: default avatarGuchun Chen <guchun.chen@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 1d6a9d12
Loading
Loading
Loading
Loading
+9 −3
Original line number Diff line number Diff line
@@ -2055,13 +2055,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
	 * it should be called after amdgpu_device_ip_hw_init_phase2  since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which only true at this point.
	 * recovery_init may fail, but it can free all resources allocated by
	 * itself and its failure should not stop amdgpu init process.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
	 * failure from bad gpu situation and stop amdgpu init process
	 * accordingly. For other failed cases, it will still release all
	 * the resource and print error message, rather than returning one
	 * negative value to upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	amdgpu_ras_recovery_init(adev);
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);
+16 −2
Original line number Diff line number Diff line
@@ -1821,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
	uint32_t max_eeprom_records_len = 0;
	bool exc_err_limit = false;
	int ret;

	if (con)
@@ -1842,8 +1843,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
	max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
	amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);

	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
	if (ret)
	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
	/*
	 * This calling fails when exc_err_limit is true or
	 * ret != 0.
	 */
	if (exc_err_limit || ret)
		goto free;

	if (con->eeprom_control.num_recs) {
@@ -1867,6 +1872,15 @@ free:
out:
	dev_warn(adev->dev, "Failed to initialize ras recovery!\n");

	/*
	 * Except error threshold exceeding case, other failure cases in this
	 * function would not fail amdgpu driver init.
	 */
	if (!exc_err_limit)
		ret = 0;
	else
		ret = -EINVAL;

	return ret;
}

+9 −1
Original line number Diff line number Diff line
@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)

}

int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
			bool *exceed_err_limit)
{
	int ret = 0;
	struct amdgpu_device *adev = to_amdgpu_device(control);
@@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
			.buf	= buff,
	};

	*exceed_err_limit = false;

	/* Verify i2c adapter is initialized */
	if (!adev->pm.smu_i2c.algo)
		return -ENOENT;
@@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
		DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
				 control->num_recs);

	} else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
			(amdgpu_bad_page_threshold != 0)) {
		*exceed_err_limit = true;
		dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, "
				"disabling the GPU.\n");
	} else {
		DRM_INFO("Creating new EEPROM table");

+2 −1
Original line number Diff line number Diff line
@@ -76,7 +76,8 @@ struct eeprom_table_record {
	unsigned char mcumc_id;
}__attribute__((__packed__));

int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
			bool *exceed_err_limit);
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);

int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,