Commit 18f36157 authored by Hawking Zhang, committed by Alex Deucher
Browse files

drm/amdgpu: add helper funcs to detect PCS error



Starting with Vega20, the hardware supports run-time detection
and reporting of XGMI/WAFL PCS RAS errors. Add helper functions
to walk through every type of RAS error and report any that are
found.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 15a1fbdc
Loading
Loading
Loading
Loading
+173 −0
Original line number Diff line number Diff line
@@ -26,7 +26,12 @@
#include "amdgpu_xgmi.h"
#include "amdgpu_smu.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

static DEFINE_MUTEX(xgmi_mutex);

@@ -36,6 +41,94 @@ static DEFINE_MUTEX(xgmi_mutex);
static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
static unsigned hive_count = 0;

/*
 * XGMI PCS error status registers read through RREG32_PCIE() when
 * collecting the XGMI/WAFL RAS error count.  The second entry is the
 * first one displaced by 0x100000 (presumably the next XGMI PCS
 * instance -- TODO confirm against the register spec).
 */
static const int xgmi_pcs_err_status_reg_vg20[] = {
	[0] = smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	[1] = smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

/*
 * WAFL PCS error status registers read through RREG32_PCIE() when
 * collecting the XGMI/WAFL RAS error count.  The second entry is the
 * first one displaced by 0x100000 (presumably the next WAFL PCS
 * instance -- TODO confirm against the register spec).
 */
static const int wafl_pcs_err_status_reg_vg20[] = {
	[0] = smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	[1] = smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

/*
 * Error fields decoded from an XGMI PCS error status register value.
 *
 * Each entry pairs the name that is logged when the field is non-zero
 * with the field's mask and shift.  The mask/shift come from
 * SOC15_REG_FIELD(), which is defined outside this file (presumably it
 * expands to "<mask>, <shift>", matching the member order of
 * struct amdgpu_pcs_ras_field -- verify before reordering that struct).
 *
 * amdgpu_xgmi_query_pcs_error_status() walks the whole table and counts
 * every non-zero field as an uncorrectable error.
 */
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

/*
 * Error fields decoded from a WAFL PCS error status register value.
 *
 * Each entry pairs the name that is logged when the field is non-zero
 * with the field's mask and shift.  The mask/shift come from
 * SOC15_REG_FIELD(), which is defined outside this file (presumably it
 * expands to "<mask>, <shift>", matching the member order of
 * struct amdgpu_pcs_ras_field -- verify before reordering that struct).
 *
 * amdgpu_xgmi_query_pcs_error_status() walks the whole table and counts
 * every non-zero field as an uncorrectable error.
 */
static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
{
	return &hive->device_list;
@@ -560,3 +653,83 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,

	return addr + dram_base_addr;
}

/*
 * Decode one PCS error status register value and account the errors.
 *
 * @adev:        device the "<name> detected" messages are logged against
 * @value:       raw content of the PCS error status register
 * @ue_count:    incremented by the value of every non-zero error field
 * @ce_count:    never written; only uncorrectable errors are reported
 * @is_xgmi_pcs: true decodes with xgmi_pcs_ras_fields, false with
 *               wafl_pcs_ras_fields
 *
 * Always returns 0.
 */
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
					      uint32_t value,
					      uint32_t *ue_count,
					      uint32_t *ce_count,
					      bool is_xgmi_pcs)
{
	int idx;
	int field_val;

	if (!is_xgmi_pcs) {
		/* WAFL PCS: every reported field is an uncorrectable error */
		for (idx = 0; idx < ARRAY_SIZE(wafl_pcs_ras_fields); idx++) {
			field_val = (value &
				     wafl_pcs_ras_fields[idx].pcs_err_mask) >>
				     wafl_pcs_ras_fields[idx].pcs_err_shift;
			if (!field_val)
				continue;

			dev_info(adev->dev, "%s detected\n",
				 wafl_pcs_ras_fields[idx].err_name);
			*ue_count += field_val;
		}

		return 0;
	}

	/* XGMI PCS: every reported field is an uncorrectable error */
	for (idx = 0; idx < ARRAY_SIZE(xgmi_pcs_ras_fields); idx++) {
		field_val = (value &
			     xgmi_pcs_ras_fields[idx].pcs_err_mask) >>
			     xgmi_pcs_ras_fields[idx].pcs_err_shift;
		if (!field_val)
			continue;

		dev_info(adev->dev, "%s detected\n",
			 xgmi_pcs_ras_fields[idx].err_name);
		*ue_count += field_val;
	}

	return 0;
}

/*
 * Collect the XGMI and WAFL PCS error counts for a device.
 *
 * @adev:             device whose PCS error status registers are read
 * @ras_error_status: points to a struct ras_err_data; its ue_count and
 *                    ce_count are reset and then filled in
 *
 * Every register listed in xgmi_pcs_err_status_reg_vg20 and
 * wafl_pcs_err_status_reg_vg20 is read through RREG32_PCIE(); non-zero
 * values are decoded by amdgpu_xgmi_query_pcs_error_status().  All ASIC
 * types currently share the Vega20 register lists because the default
 * label falls on the same code path.
 *
 * Returns -EINVAL when amdgpu_ras_is_supported() rejects the XGMI_WAFL
 * block, 0 otherwise.
 */
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
				      void *ras_error_status)
{
	struct ras_err_data *err_data = ras_error_status;
	uint32_t ue_total = 0, ce_total = 0;
	uint32_t status;
	int idx;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
		return -EINVAL;

	/* start from a clean slate before accumulating below */
	err_data->ue_count = 0;
	err_data->ce_count = 0;

	switch (adev->asic_type) {
	case CHIP_VEGA20:
	default:
		/* xgmi pcs error status registers */
		for (idx = 0; idx < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); idx++) {
			status = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[idx]);
			if (!status)
				continue;

			amdgpu_xgmi_query_pcs_error_status(adev, status,
							   &ue_total, &ce_total,
							   true);
		}

		/* wafl pcs error status registers */
		for (idx = 0; idx < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); idx++) {
			status = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[idx]);
			if (!status)
				continue;

			amdgpu_xgmi_query_pcs_error_status(adev, status,
							   &ue_total, &ce_total,
							   false);
		}
		break;
	}

	err_data->ue_count += ue_total;
	err_data->ce_count += ce_total;

	return 0;
}
+8 −0
Original line number Diff line number Diff line
@@ -37,6 +37,12 @@ struct amdgpu_hive_info {
	struct task_barrier tb;
};

/*
 * Describes one error field inside a PCS error status register.
 *
 * NOTE: tables of this struct are filled with positional initializers,
 * so the member order below must not change.
 */
struct amdgpu_pcs_ras_field {
	/* name printed in the "<name> detected" log message */
	const char *err_name;
	/* bit mask selecting the field in the raw register value */
	uint32_t pcs_err_mask;
	/* right shift applied after masking to obtain the field value */
	uint32_t pcs_err_shift;
};

struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
@@ -48,6 +54,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev);
void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev);
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr);
/*
 * Fill the struct ras_err_data passed via @ras_error_status with the
 * XGMI/WAFL PCS error counts.  Returns -EINVAL if RAS is not supported
 * for the XGMI_WAFL block, 0 otherwise.
 */
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
				      void *ras_error_status);

static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
		struct amdgpu_device *bo_adev)