Commit 86b6ba17 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'net-qed-qede-critical-hw-error-handling'

Igor Russkikh says:

====================
net: qed/qede: critical hw error handling

FastLinQ devices as a complex systems may observe various hardware
level error conditions, both severe and recoverable.

Driver is able to detect and report this, but so far it only did
trace/dmesg based reporting.

Here we implement an extended hw error detection, service task
handler captures a dump for the later analysis.

I also resubmit a patch from Denis Bolotin on tx timeout handler,
addressing David's comment regarding recovery procedure as an extra
reaction on this event.

v2:

Removing the patch with ethtool dump and udev magic. Its quite isolated,
I'm working on devlink based logic for this separately.

v1:

https://patchwork.ozlabs.org/project/netdev/cover/cover.1588758463.git.irusskikh@marvell.com/


====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents c8a867a3 8f76812e
Loading
Loading
Loading
Loading
+8 −8
Original line number Diff line number Diff line
@@ -740,12 +740,6 @@ struct qed_dbg_feature {
	u32 dumped_dwords;
};

struct qed_dbg_params {
	struct qed_dbg_feature features[DBG_FEATURE_NUM];
	u8 engine_for_debug;
	bool print_data;
};

struct qed_dev {
	u32	dp_module;
	u8	dp_level;
@@ -844,6 +838,9 @@ struct qed_dev {
	/* Recovery */
	bool recov_in_prog;

	/* Indicates whether should prevent attentions from being reasserted */
	bool attn_clr_en;

	/* LLH info */
	u8 ppfid_bitmap;
	struct qed_llh_info *p_llh_info;
@@ -872,17 +869,18 @@ struct qed_dev {
	} protocol_ops;
	void				*ops_cookie;

	struct qed_dbg_params		dbg_params;

#ifdef CONFIG_QED_LL2
	struct qed_cb_ll2_info		*ll2;
	u8				ll2_mac_address[ETH_ALEN];
#endif
	struct qed_dbg_feature dbg_features[DBG_FEATURE_NUM];
	u8 engine_for_debug;
	bool disable_ilt_dump;
	DECLARE_HASHTABLE(connections, 10);
	const struct firmware		*firmware;

	bool print_dbg_data;

	u32 rdma_max_sge;
	u32 rdma_max_inline;
	u32 rdma_max_srq_sge;
@@ -1020,6 +1018,8 @@ u32 qed_unzip_data(struct qed_hwfn *p_hwfn,
		   u32 input_len, u8 *input_buf,
		   u32 max_size, u8 *unzip_buf);
void qed_schedule_recovery_handler(struct qed_hwfn *p_hwfn);
void qed_hw_error_occurred(struct qed_hwfn *p_hwfn,
			   enum qed_hw_err_type err_type);
void qed_get_protocol_stats(struct qed_dev *cdev,
			    enum qed_mcp_protocol_type type,
			    union qed_mcp_protocol_stats *stats);
+13 −13
Original line number Diff line number Diff line
@@ -7453,7 +7453,7 @@ static enum dbg_status format_feature(struct qed_hwfn *p_hwfn,
				      enum qed_dbg_features feature_idx)
{
	struct qed_dbg_feature *feature =
	    &p_hwfn->cdev->dbg_params.features[feature_idx];
	    &p_hwfn->cdev->dbg_features[feature_idx];
	u32 text_size_bytes, null_char_pos, i;
	enum dbg_status rc;
	char *text_buf;
@@ -7502,7 +7502,7 @@ static enum dbg_status format_feature(struct qed_hwfn *p_hwfn,
		text_buf[i] = '\n';

	/* Dump printable feature to log */
	if (p_hwfn->cdev->dbg_params.print_data)
	if (p_hwfn->cdev->print_dbg_data)
		qed_dbg_print_feature(text_buf, text_size_bytes);

	/* Free the old dump_buf and point the dump_buf to the newly allocagted
@@ -7523,7 +7523,7 @@ static enum dbg_status qed_dbg_dump(struct qed_hwfn *p_hwfn,
				    enum qed_dbg_features feature_idx)
{
	struct qed_dbg_feature *feature =
	    &p_hwfn->cdev->dbg_params.features[feature_idx];
	    &p_hwfn->cdev->dbg_features[feature_idx];
	u32 buf_size_dwords;
	enum dbg_status rc;

@@ -7648,7 +7648,7 @@ static int qed_dbg_nvm_image(struct qed_dev *cdev, void *buffer,
			     enum qed_nvm_images image_id)
{
	struct qed_hwfn *p_hwfn =
		&cdev->hwfns[cdev->dbg_params.engine_for_debug];
		&cdev->hwfns[cdev->engine_for_debug];
	u32 len_rounded, i;
	__be32 val;
	int rc;
@@ -7780,7 +7780,7 @@ int qed_dbg_all_data(struct qed_dev *cdev, void *buffer)
{
	u8 cur_engine, omit_engine = 0, org_engine;
	struct qed_hwfn *p_hwfn =
		&cdev->hwfns[cdev->dbg_params.engine_for_debug];
		&cdev->hwfns[cdev->engine_for_debug];
	struct dbg_tools_data *dev_data = &p_hwfn->dbg_info;
	int grc_params[MAX_DBG_GRC_PARAMS], i;
	u32 offset = 0, feature_size;
@@ -8000,7 +8000,7 @@ int qed_dbg_all_data(struct qed_dev *cdev, void *buffer)
int qed_dbg_all_data_size(struct qed_dev *cdev)
{
	struct qed_hwfn *p_hwfn =
		&cdev->hwfns[cdev->dbg_params.engine_for_debug];
		&cdev->hwfns[cdev->engine_for_debug];
	u32 regs_len = 0, image_len = 0, ilt_len = 0, total_ilt_len = 0;
	u8 cur_engine, org_engine;

@@ -8059,9 +8059,9 @@ int qed_dbg_feature(struct qed_dev *cdev, void *buffer,
		    enum qed_dbg_features feature, u32 *num_dumped_bytes)
{
	struct qed_hwfn *p_hwfn =
		&cdev->hwfns[cdev->dbg_params.engine_for_debug];
		&cdev->hwfns[cdev->engine_for_debug];
	struct qed_dbg_feature *qed_feature =
		&cdev->dbg_params.features[feature];
		&cdev->dbg_features[feature];
	enum dbg_status dbg_rc;
	struct qed_ptt *p_ptt;
	int rc = 0;
@@ -8084,7 +8084,7 @@ int qed_dbg_feature(struct qed_dev *cdev, void *buffer,
	DP_VERBOSE(cdev, QED_MSG_DEBUG,
		   "copying debugfs feature to external buffer\n");
	memcpy(buffer, qed_feature->dump_buf, qed_feature->buf_size);
	*num_dumped_bytes = cdev->dbg_params.features[feature].dumped_dwords *
	*num_dumped_bytes = cdev->dbg_features[feature].dumped_dwords *
			    4;

out:
@@ -8095,7 +8095,7 @@ out:
int qed_dbg_feature_size(struct qed_dev *cdev, enum qed_dbg_features feature)
{
	struct qed_hwfn *p_hwfn =
		&cdev->hwfns[cdev->dbg_params.engine_for_debug];
		&cdev->hwfns[cdev->engine_for_debug];
	struct qed_dbg_feature *qed_feature = &cdev->dbg_features[feature];
	struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn);
	u32 buf_size_dwords;
@@ -8120,14 +8120,14 @@ int qed_dbg_feature_size(struct qed_dev *cdev, enum qed_dbg_features feature)

u8 qed_get_debug_engine(struct qed_dev *cdev)
{
	return cdev->dbg_params.engine_for_debug;
	return cdev->engine_for_debug;
}

void qed_set_debug_engine(struct qed_dev *cdev, int engine_number)
{
	DP_VERBOSE(cdev, QED_MSG_DEBUG, "set debug engine to %d\n",
		   engine_number);
	cdev->dbg_params.engine_for_debug = engine_number;
	cdev->engine_for_debug = engine_number;
}

void qed_dbg_pf_init(struct qed_dev *cdev)
@@ -8146,7 +8146,7 @@ void qed_dbg_pf_init(struct qed_dev *cdev)
	}

	/* Set the hwfn to be 0 as default */
	cdev->dbg_params.engine_for_debug = 0;
	cdev->engine_for_debug = 0;
}

void qed_dbg_pf_exit(struct qed_dev *cdev)
+3 −1
Original line number Diff line number Diff line
@@ -3085,7 +3085,9 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
			rc = qed_final_cleanup(p_hwfn, p_hwfn->p_main_ptt,
					       p_hwfn->rel_pf_id, false);
			if (rc) {
				DP_NOTICE(p_hwfn, "Final cleanup failed\n");
				qed_hw_err_notify(p_hwfn, p_hwfn->p_main_ptt,
						  QED_HW_ERR_RAMROD_FAIL,
						  "Final cleanup failed\n");
				goto load_err;
			}
		}
+47 −2
Original line number Diff line number Diff line
@@ -12400,6 +12400,13 @@ struct load_rsp_stc {
#define LOAD_RSP_FLAGS0_DRV_EXISTS      (0x1 << 0)
};

struct mdump_retain_data_stc {
	u32 valid;
	u32 epoch;
	u32 pf;
	u32 status;
};

union drv_union_data {
	u32 ver_str[MCP_DRV_VER_STR_SIZE_DWORD];
	struct mcp_mac wol_mac;
@@ -12488,10 +12495,14 @@ struct public_drv_mb {
#define DRV_MSG_CODE_BIST_TEST			0x001e0000
#define DRV_MSG_CODE_SET_LED_MODE		0x00200000
#define DRV_MSG_CODE_RESOURCE_CMD		0x00230000
/* Send crash dump commands with param[3:0] - opcode */
#define DRV_MSG_CODE_MDUMP_CMD			0x00250000
#define DRV_MSG_CODE_GET_TLV_DONE		0x002f0000
#define DRV_MSG_CODE_GET_ENGINE_CONFIG		0x00370000
#define DRV_MSG_CODE_GET_PPFID_BITMAP		0x43000000

#define DRV_MSG_CODE_DEBUG_DATA_SEND		0xc0040000

#define RESOURCE_CMD_REQ_RESC_MASK		0x0000001F
#define RESOURCE_CMD_REQ_RESC_SHIFT		0
#define RESOURCE_CMD_REQ_OPCODE_MASK		0x000000E0
@@ -12517,6 +12528,21 @@ struct public_drv_mb {

#define RESOURCE_DUMP				0

/* DRV_MSG_CODE_MDUMP_CMD parameters */
#define MDUMP_DRV_PARAM_OPCODE_MASK             0x0000000f
#define DRV_MSG_CODE_MDUMP_ACK                  0x01
#define DRV_MSG_CODE_MDUMP_SET_VALUES           0x02
#define DRV_MSG_CODE_MDUMP_TRIGGER              0x03
#define DRV_MSG_CODE_MDUMP_GET_CONFIG           0x04
#define DRV_MSG_CODE_MDUMP_SET_ENABLE           0x05
#define DRV_MSG_CODE_MDUMP_CLEAR_LOGS           0x06
#define DRV_MSG_CODE_MDUMP_GET_RETAIN           0x07
#define DRV_MSG_CODE_MDUMP_CLR_RETAIN           0x08

#define DRV_MSG_CODE_HW_DUMP_TRIGGER            0x0a
#define DRV_MSG_CODE_MDUMP_GEN_MDUMP2           0x0b
#define DRV_MSG_CODE_MDUMP_FREE_MDUMP2          0x0c

#define DRV_MSG_CODE_GET_PF_RDMA_PROTOCOL	0x002b0000
#define DRV_MSG_CODE_OS_WOL			0x002e0000

@@ -12626,6 +12652,17 @@ struct public_drv_mb {
#define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_EEE		0x00000002
#define DRV_MB_PARAM_FEATURE_SUPPORT_FUNC_VLINK		0x00010000

/* DRV_MSG_CODE_DEBUG_DATA_SEND parameters */
#define DRV_MSG_CODE_DEBUG_DATA_SEND_SIZE_OFFSET	0
#define DRV_MSG_CODE_DEBUG_DATA_SEND_SIZE_MASK		0xFF

/* Driver attributes params */
#define DRV_MB_PARAM_ATTRIBUTE_KEY_OFFSET		0
#define DRV_MB_PARAM_ATTRIBUTE_KEY_MASK			0x00FFFFFF
#define DRV_MB_PARAM_ATTRIBUTE_CMD_OFFSET		24
#define DRV_MB_PARAM_ATTRIBUTE_CMD_MASK			0xFF000000

#define DRV_MB_PARAM_NVM_CFG_OPTION_ID_OFFSET		0
#define DRV_MB_PARAM_NVM_CFG_OPTION_ID_SHIFT		0
#define DRV_MB_PARAM_NVM_CFG_OPTION_ID_MASK		0x0000FFFF
#define DRV_MB_PARAM_NVM_CFG_OPTION_ALL_SHIFT		16
@@ -12678,6 +12715,14 @@ struct public_drv_mb {
#define FW_MSG_CODE_DRV_CFG_PF_VFS_MSIX_DONE	0x00870000
#define FW_MSG_SEQ_NUMBER_MASK			0x0000ffff

#define FW_MSG_CODE_DEBUG_DATA_SEND_INV_ARG	0xb0070000
#define FW_MSG_CODE_DEBUG_DATA_SEND_BUF_FULL	0xb0080000
#define FW_MSG_CODE_DEBUG_DATA_SEND_NO_BUF	0xb0090000
#define FW_MSG_CODE_DEBUG_NOT_ENABLED		0xb00a0000
#define FW_MSG_CODE_DEBUG_DATA_SEND_OK		0xb00b0000

#define FW_MSG_CODE_MDUMP_INVALID_CMD		0x00030000

	u32 fw_mb_param;
#define FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_MASK	0xFFFF0000
#define FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT	16
@@ -12742,9 +12787,9 @@ enum MFW_DRV_MSG_TYPE {
	MFW_DRV_MSG_GET_FCOE_STATS,
	MFW_DRV_MSG_GET_ISCSI_STATS,
	MFW_DRV_MSG_GET_RDMA_STATS,
	MFW_DRV_MSG_BW_UPDATE10,
	MFW_DRV_MSG_FAILURE_DETECTED,
	MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE,
	MFW_DRV_MSG_BW_UPDATE11,
	MFW_DRV_MSG_CRITICAL_ERROR_OCCURRED,
	MFW_DRV_MSG_RESERVED,
	MFW_DRV_MSG_GET_TLV_REQ,
	MFW_DRV_MSG_OEM_CFG_UPDATE,
+39 −3
Original line number Diff line number Diff line
@@ -762,9 +762,10 @@ static int qed_dmae_execute_command(struct qed_hwfn *p_hwfn,
							    dst_type,
							    length_cur);
		if (qed_status) {
			DP_NOTICE(p_hwfn,
			qed_hw_err_notify(p_hwfn, p_ptt, QED_HW_ERR_DMAE_FAIL,
					  "qed_dmae_execute_sub_operation Failed with error 0x%x. source_addr 0x%llx, destination addr 0x%llx, size_in_dwords 0x%x\n",
				  qed_status, src_addr, dst_addr, length_cur);
					  qed_status, src_addr,
					  dst_addr, length_cur);
			break;
		}
	}
@@ -837,6 +838,41 @@ int qed_dmae_host2host(struct qed_hwfn *p_hwfn,
	return rc;
}

void qed_hw_err_notify(struct qed_hwfn *p_hwfn,
		       struct qed_ptt *p_ptt,
		       enum qed_hw_err_type err_type, char *fmt, ...)
{
	char buf[QED_HW_ERR_MAX_STR_SIZE];
	va_list vl;
	int len;

	if (fmt) {
		va_start(vl, fmt);
		len = vsnprintf(buf, QED_HW_ERR_MAX_STR_SIZE, fmt, vl);
		va_end(vl);

		if (len > QED_HW_ERR_MAX_STR_SIZE - 1)
			len = QED_HW_ERR_MAX_STR_SIZE - 1;

		DP_NOTICE(p_hwfn, "%s", buf);
	}

	/* Fan failure cannot be masked by handling of another HW error */
	if (p_hwfn->cdev->recov_in_prog &&
	    err_type != QED_HW_ERR_FAN_FAIL) {
		DP_VERBOSE(p_hwfn,
			   NETIF_MSG_DRV,
			   "Recovery is in progress. Avoid notifying about HW error %d.\n",
			   err_type);
		return;
	}

	qed_hw_error_occurred(p_hwfn, err_type);

	if (fmt)
		qed_mcp_send_raw_debug_data(p_hwfn, p_ptt, buf, len);
}

int qed_dmae_sanity(struct qed_hwfn *p_hwfn,
		    struct qed_ptt *p_ptt, const char *phase)
{
Loading