drm/amdgpu: schedule ras recovery when reaching bad page threshold(v2) (9c06f91f) · Commits · 戴 / test

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

+36 −1

Original line number	Diff line number	Diff line
		@@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
		int i, ret = 0;
		struct i2c_msg msgs, msg;
		unsigned char buffs, buff;
		bool sched_ras_recovery = false;
		struct eeprom_table_record *record;
		struct amdgpu_device *adev = to_amdgpu_device(control);
		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

		if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS)
		return 0;
		@@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
		goto free_buff;
		}

		/*
		* If saved bad pages number exceeds the bad page threshold for
		* the whole VRAM, update table header to mark the BAD GPU tag
		* and schedule one ras recovery after eeprom write is done,
		* this can avoid the missing for latest records.
		*
		* This new header will be picked up and checked in the bootup
		* by ras recovery, which may break bootup process to notify
		* user this GPU is in bad state and to retire such GPU for
		* further check.
		*/
		if (write && (amdgpu_bad_page_threshold != 0) &&
		((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
		dev_warn(adev->dev,
		"Saved bad pages(%d) reaches threshold value(%d).\n",
		control->num_recs + num, ras->bad_page_cnt_threshold);
		control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
		sched_ras_recovery = true;
		}

		/* In case of overflow just start from beginning to not lose newest records */
		if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES))
		control->next_addr = EEPROM_RECORD_START;


		/*
		* TODO Currently makes EEPROM writes for each record, this creates
		* internal fragmentation. Optimized the code to do full page write of
		@@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
		__update_tbl_checksum(control, records, num, old_hdr_byte_sum);

		__update_table_header(control, buffs);

		if (sched_ras_recovery) {
		/*
		* Before scheduling ras recovery, assert the related
		* flag first, which shall bypass common bad page
		* reservation execution in amdgpu_ras_reset_gpu.
		*/
		amdgpu_ras_get_context(adev)->flags \|=
		AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;

		dev_warn(adev->dev, "Conduct ras recovery due to bad "
		"page threshold reached.\n");
		amdgpu_ras_reset_gpu(adev);
		}
		} else if (!__validate_tbl_checksum(control, records, num)) {
		DRM_WARN("EEPROM Table checksum mismatch!");
		/* TODO Uncomment when EEPROM read/write is relliable */

Admin message