Commit a8736ea8 authored by Igor Russkikh, committed by David S. Miller
Browse files

net: qede: add hw err scheduled handler



qede (ethernet level driver) registers a callback handler.
This handler maintains eth dev state flags/bits to track error processing.

It implements an in-place processing part for non-sleeping context (WARN_ON
trigger), and a deferred (delayed work) part which triggers the recovery
process for recoverable errors.

In later patches this atomic handler will come with more meat.

We introduce err_flags on the ethdevice structure; it is used to record
error handling properties.

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
Signed-off-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent d639836a
Loading
Loading
Loading
Loading
+12 −1
Original line number Diff line number Diff line
@@ -278,6 +278,14 @@ struct qede_dev {
	struct qede_rdma_dev		rdma_info;

	struct bpf_prog *xdp_prog;

	unsigned long err_flags;
#define QEDE_ERR_IS_HANDLED	31
#define QEDE_ERR_ATTN_CLR_EN	0
#define QEDE_ERR_GET_DBG_INFO	1
#define QEDE_ERR_IS_RECOVERABLE	2
#define QEDE_ERR_WARN		3

	struct qede_dump_info		dump_info;
};

@@ -485,12 +493,15 @@ struct qede_fastpath {

/* Bit numbers used with the atomic bitops on edev->sp_flags to tell the
 * slow-path task which deferred jobs are pending. Each job needs its own
 * bit: QEDE_SP_HW_ERR owns bit 4, so QEDE_SP_ARFS_CONFIG lives at bit 5 and
 * must not be redefined to 4 under CONFIG_RFS_ACCEL.
 */
#define QEDE_SP_RECOVERY		0
#define QEDE_SP_RX_MODE			1
#define QEDE_SP_RSVD1                   2
#define QEDE_SP_RSVD2                   3
#define QEDE_SP_HW_ERR                  4
#define QEDE_SP_ARFS_CONFIG             5
#define QEDE_SP_AER			7

#ifdef CONFIG_RFS_ACCEL
int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
		       u16 rxq_index, u32 flow_id);
#define QEDE_SP_TASK_POLL_DELAY	(5 * HZ)
#endif

+94 −1
Original line number Diff line number Diff line
@@ -139,10 +139,12 @@ static void qede_shutdown(struct pci_dev *pdev);
/* Forward declarations. The link_update, schedule_* and get_*_tlv_data
 * functions are the callbacks plugged into the qede_ll_ops table below.
 */
static void qede_link_update(void *dev, struct qed_link_output *link);
static void qede_schedule_recovery_handler(void *dev);
static void qede_recovery_handler(struct qede_dev *edev);
/* Entry point for HW error notifications; @err_type selects the handling */
static void qede_schedule_hw_err_handler(void *dev,
					 enum qed_hw_err_type err_type);
static void qede_get_eth_tlv_data(void *edev, void *data);
static void qede_get_generic_tlv_data(void *edev,
				      struct qed_generic_tlvs *data);

/* Deferred part of HW error handling, invoked from qede_sp_task() */
static void qede_generic_hw_err_handler(struct qede_dev *edev);
#ifdef CONFIG_QED_SRIOV
static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos,
			    __be16 vlan_proto)
@@ -230,6 +232,7 @@ static struct qed_eth_cb_ops qede_ll_ops = {
#endif
		.link_update = qede_link_update,
		.schedule_recovery_handler = qede_schedule_recovery_handler,
		.schedule_hw_err_handler = qede_schedule_hw_err_handler,
		.get_generic_tlv_data = qede_get_generic_tlv_data,
		.get_protocol_tlv_data = qede_get_eth_tlv_data,
	},
@@ -1009,6 +1012,8 @@ static void qede_sp_task(struct work_struct *work)
			qede_process_arfs_filters(edev, false);
	}
#endif
	if (test_and_clear_bit(QEDE_SP_HW_ERR, &edev->sp_flags))
		qede_generic_hw_err_handler(edev);
	__qede_unlock(edev);

	if (test_and_clear_bit(QEDE_SP_AER, &edev->sp_flags)) {
@@ -2509,6 +2514,94 @@ err:
	qede_recovery_failed(edev);
}

/* In-place part of HW error handling. It is called directly from
 * qede_schedule_hw_err_handler(), i.e. in the reporting context, so it is
 * limited to work that does not sleep.
 */
static void qede_atomic_hw_err_handler(struct qede_dev *edev)
{
	bool want_trace;

	DP_NOTICE(edev,
		  "Generic non-sleepable HW error handling started - err_flags 0x%lx\n",
		  edev->err_flags);

	/* QEDE_ERR_WARN requests a call trace of the flow that hit the error */
	want_trace = test_bit(QEDE_ERR_WARN, &edev->err_flags);
	WARN_ON(want_trace);

	DP_NOTICE(edev, "Generic non-sleepable HW error handling is done\n");
}

/* Deferred part of HW error handling, run from the slow-path task when
 * QEDE_SP_HW_ERR is pending. Kicks off recovery for recoverable errors and
 * then releases QEDE_ERR_IS_HANDLED so that new errors can be scheduled.
 */
static void qede_generic_hw_err_handler(struct qede_dev *edev)
{
	bool recoverable;

	DP_NOTICE(edev,
		  "Generic sleepable HW error handling started - err_flags 0x%lx\n",
		  edev->err_flags);

	/* Recovery is started from the sleepable section only so that it is
	 * the very last step, after every other handling operation is done.
	 */
	recoverable = test_bit(QEDE_ERR_IS_RECOVERABLE, &edev->err_flags);
	if (recoverable)
		edev->ops->common->recovery_process(edev->cdev);

	/* Handling is over: allow the next HW error to be scheduled */
	clear_bit(QEDE_ERR_IS_HANDLED, &edev->err_flags);

	DP_NOTICE(edev, "Generic sleepable HW error handling is done\n");
}

/* Translate a HW error type into the QEDE_ERR_* handling properties and OR
 * them into edev->err_flags. Unknown types are logged and add no flags.
 */
static void qede_set_hw_err_flags(struct qede_dev *edev,
				  enum qed_hw_err_type err_type)
{
	unsigned long new_flags = 0;

	/* Only a DMAE failure additionally asks for a call trace */
	if (err_type == QED_HW_ERR_DMAE_FAIL)
		set_bit(QEDE_ERR_WARN, &new_flags);

	switch (err_type) {
	case QED_HW_ERR_DMAE_FAIL:
	case QED_HW_ERR_MFW_RESP_FAIL:
	case QED_HW_ERR_HW_ATTN:
	case QED_HW_ERR_RAMROD_FAIL:
	case QED_HW_ERR_FW_ASSERT:
		/* Properties shared by all of the known error types */
		set_bit(QEDE_ERR_ATTN_CLR_EN, &new_flags);
		set_bit(QEDE_ERR_GET_DBG_INFO, &new_flags);
		break;

	default:
		DP_NOTICE(edev, "Unexpected HW error [%d]\n", err_type);
		break;
	}

	edev->err_flags = edev->err_flags | new_flags;
}

/* HW error notification callback (registered as .schedule_hw_err_handler).
 *
 * @dev:      the qede_dev the error was reported for
 * @err_type: type of the HW error
 *
 * Claims QEDE_ERR_IS_HANDLED, records the handling properties of the error,
 * runs the non-sleepable part in place and defers the sleepable part to the
 * slow-path task, which releases QEDE_ERR_IS_HANDLED when it is done.
 * Errors reported while another one is being handled, or while the device
 * is in recovery, are dropped - except for fan failure.
 */
static void qede_schedule_hw_err_handler(void *dev,
					 enum qed_hw_err_type err_type)
{
	struct qede_dev *edev = dev;
	bool already_handling;

	already_handling = test_and_set_bit(QEDE_ERR_IS_HANDLED,
					    &edev->err_flags);

	/* Fan failure cannot be masked by handling of another HW error or by a
	 * concurrent recovery process.
	 */
	if ((already_handling || edev->state == QEDE_STATE_RECOVERY) &&
	    err_type != QED_HW_ERR_FAN_FAIL) {
		/* If this call is the one that just set the bit (we bail out
		 * only because of the recovery state), no handler is going to
		 * run and clear it. Release it here, otherwise every later
		 * HW error would be rejected as "already being handled".
		 */
		if (!already_handling)
			clear_bit(QEDE_ERR_IS_HANDLED, &edev->err_flags);

		DP_INFO(edev,
			"Avoid scheduling an error handling while another HW error is being handled\n");
		return;
	}

	if (err_type >= QED_HW_ERR_LAST) {
		DP_NOTICE(edev, "Unknown HW error [%d]\n", err_type);
		clear_bit(QEDE_ERR_IS_HANDLED, &edev->err_flags);
		return;
	}

	qede_set_hw_err_flags(edev, err_type);

	/* Non-sleepable part runs right here, in the reporting context */
	qede_atomic_hw_err_handler(edev);

	/* Sleepable part is picked up by qede_sp_task() via QEDE_SP_HW_ERR */
	set_bit(QEDE_SP_HW_ERR, &edev->sp_flags);
	schedule_delayed_work(&edev->sp_task, 0);

	DP_INFO(edev, "Scheduled a error handler [err_type %d]\n", err_type);
}

static bool qede_is_txq_full(struct qede_dev *edev, struct qede_tx_queue *txq)
{
	struct netdev_queue *netdev_txq;