Commit 4e84608c authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull RDMA fixes from Jason Gunthorpe:
 "Bug fixes for old bugs in the hns and hfi1 drivers:

   - Calculate various values in hns properly to avoid over/underflows
     in some cases

   - Fix an oops, PCI negotiation on Gen4 systems, and bugs related to
     retries"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  RDMA/hns: Correct the value of srq_desc_size
  RDMA/hns: Correct the value of HNS_ROCE_HEM_CHUNK_LEN
  IB/hfi1: TID RDMA WRITE should not return IB_WC_RNR_RETRY_EXC_ERR
  IB/hfi1: Calculate flow weight based on QP MTU for TID RDMA
  IB/hfi1: Ensure r_tid_ack is valid before building TID RDMA ACK packet
  IB/hfi1: Ensure full Gen3 speed in a Gen4 system
parents bf929479 411c1e67
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -1489,7 +1489,6 @@ static int __init hfi1_mod_init(void)
		goto bail_dev;
	}

	hfi1_compute_tid_rdma_flow_wt();
	/*
	 * These must be called before the driver is registered with
	 * the PCI subsystem.
+3 −1
Original line number Diff line number Diff line
@@ -319,7 +319,9 @@ int pcie_speeds(struct hfi1_devdata *dd)
	/*
	 * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
	 */
	if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
	if (parent &&
	    (dd->pcidev->bus->max_bus_speed == PCIE_SPEED_2_5GT ||
	     dd->pcidev->bus->max_bus_speed == PCIE_SPEED_5_0GT)) {
		dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
		dd->link_gen3_capable = 0;
	}
+8 −8
Original line number Diff line number Diff line
@@ -2209,15 +2209,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
		if (qp->s_flags & RVT_S_WAIT_RNR)
			goto bail_stop;
		rdi = ib_to_rvt(qp->ibqp.device);
		if (qp->s_rnr_retry == 0 &&
		    !((rdi->post_parms[wqe->wr.opcode].flags &
		      RVT_OPERATION_IGN_RNR_CNT) &&
		      qp->s_rnr_retry_cnt == 0)) {
		if (!(rdi->post_parms[wqe->wr.opcode].flags &
		       RVT_OPERATION_IGN_RNR_CNT)) {
			if (qp->s_rnr_retry == 0) {
				status = IB_WC_RNR_RETRY_EXC_ERR;
				goto class_b;
			}
			if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
				qp->s_rnr_retry--;
		}

		/*
		 * The last valid PSN is the previous PSN. For TID RDMA WRITE
+32 −25
Original line number Diff line number Diff line
@@ -107,8 +107,6 @@ static u32 mask_generation(u32 a)
 * C - Capcode
 */

static u32 tid_rdma_flow_wt;

static void tid_rdma_trigger_resume(struct work_struct *work);
static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
@@ -136,6 +134,26 @@ static void update_r_next_psn_fecn(struct hfi1_packet *packet,
				   struct tid_rdma_flow *flow,
				   bool fecn);

/*
 * Ensure priv->r_tid_ack holds a valid WQE index before it is used to
 * build a TID RDMA ACK: if it is still the HFI1_QP_WQE_INVALID sentinel,
 * seed it from r_tid_tail so the ACK refers to a real request slot.
 */
static void validate_r_tid_ack(struct hfi1_qp_priv *priv)
{
	if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
		priv->r_tid_ack = priv->r_tid_tail;
}

/*
 * Mark the QP as having an ACK pending (RVT_S_ACK_PENDING) and queue the
 * TID RDMA send engine so the ACK actually goes out.
 */
static void tid_rdma_schedule_ack(struct rvt_qp *qp)
{
	struct hfi1_qp_priv *priv = qp->priv;

	priv->s_flags |= RVT_S_ACK_PENDING;
	hfi1_schedule_tid_send(qp);
}

/*
 * Common helper for the NAK/RESYNC/data paths: make sure r_tid_ack is
 * valid (see validate_r_tid_ack()), then schedule the TID RDMA ACK.
 */
static void tid_rdma_trigger_ack(struct rvt_qp *qp)
{
	validate_r_tid_ack(qp->priv);
	tid_rdma_schedule_ack(qp);
}

static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
{
	return
@@ -3005,10 +3023,7 @@ nak_psn:
		qpriv->s_nak_state = IB_NAK_PSN_ERROR;
		/* We are NAK'ing the next expected PSN */
		qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
		qpriv->s_flags |= RVT_S_ACK_PENDING;
		if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
			qpriv->r_tid_ack = qpriv->r_tid_tail;
		hfi1_schedule_tid_send(qp);
		tid_rdma_trigger_ack(qp);
	}
	goto unlock;
}
@@ -3371,18 +3386,17 @@ u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
	return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
}

void hfi1_compute_tid_rdma_flow_wt(void)
static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp)
{
	/*
	 * Heuristic for computing the RNR timeout when waiting on the flow
	 * queue. Rather than a computationally expensive exact estimate of when
	 * a flow will be available, we assume that if a QP is at position N in
	 * the flow queue it has to wait approximately (N + 1) * (number of
	 * segments between two sync points), assuming PMTU of 4K. The rationale
	 * for this is that flows are released and recycled at each sync point.
	 * segments between two sync points). The rationale for this is that
	 * flows are released and recycled at each sync point.
	 */
	tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
		TID_RDMA_MAX_SEGMENT_SIZE;
	return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT;
}

static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
@@ -3505,7 +3519,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
		if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
			ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
			if (ret) {
				to_seg = tid_rdma_flow_wt *
				to_seg = hfi1_compute_tid_rdma_flow_wt(qp) *
					position_in_queue(qpriv,
							  &rcd->flow_queue);
				break;
@@ -3526,7 +3540,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
		/*
		 * If overtaking req->acked_tail, send an RNR NAK. Because the
		 * QP is not queued in this case, and the issue can only be
		 * caused due a delay in scheduling the second leg which we
		 * caused by a delay in scheduling the second leg which we
		 * cannot estimate, we use a rather arbitrary RNR timeout of
		 * (MAX_FLOWS / 2) segments
		 */
@@ -3534,8 +3548,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
				MAX_FLOWS)) {
			ret = -EAGAIN;
			to_seg = MAX_FLOWS >> 1;
			qpriv->s_flags |= RVT_S_ACK_PENDING;
			hfi1_schedule_tid_send(qp);
			tid_rdma_trigger_ack(qp);
			break;
		}

@@ -4335,8 +4348,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
	trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
					  req);
	trace_hfi1_tid_write_rsp_rcv_data(qp);
	if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
		priv->r_tid_ack = priv->r_tid_tail;
	validate_r_tid_ack(priv);

	if (opcode == TID_OP(WRITE_DATA_LAST)) {
		release_rdma_sge_mr(e);
@@ -4375,8 +4387,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
	}

done:
	priv->s_flags |= RVT_S_ACK_PENDING;
	hfi1_schedule_tid_send(qp);
	tid_rdma_schedule_ack(qp);
exit:
	priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
	if (fecn)
@@ -4388,10 +4399,7 @@ send_nak:
	if (!priv->s_nak_state) {
		priv->s_nak_state = IB_NAK_PSN_ERROR;
		priv->s_nak_psn = flow->flow_state.r_next_psn;
		priv->s_flags |= RVT_S_ACK_PENDING;
		if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
			priv->r_tid_ack = priv->r_tid_tail;
		hfi1_schedule_tid_send(qp);
		tid_rdma_trigger_ack(qp);
	}
	goto done;
}
@@ -4939,8 +4947,7 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
	qpriv->resync = true;
	/* RESYNC request always gets a TID RDMA ACK. */
	qpriv->s_nak_state = 0;
	qpriv->s_flags |= RVT_S_ACK_PENDING;
	hfi1_schedule_tid_send(qp);
	tid_rdma_trigger_ack(qp);
bail:
	if (fecn)
		qp->s_flags |= RVT_S_ECN;
+1 −2
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@
#define TID_RDMA_MIN_SEGMENT_SIZE       BIT(18)   /* 256 KiB (for now) */
#define TID_RDMA_MAX_SEGMENT_SIZE       BIT(18)   /* 256 KiB (for now) */
#define TID_RDMA_MAX_PAGES              (BIT(18) >> PAGE_SHIFT)
#define TID_RDMA_SEGMENT_SHIFT		18

/*
 * Bit definitions for priv->s_flags.
@@ -274,8 +275,6 @@ u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
				  struct ib_other_headers *ohdr,
				  u32 *bth1, u32 *bth2, u32 *len);

void hfi1_compute_tid_rdma_flow_wt(void);

void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet);

u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
Loading