Commit 416fbc1b authored by Doug Ledford's avatar Doug Ledford
Browse files

Merge branch 'hfi1-tid' into wip/dl-for-next



Omni-Path TID RDMA Feature

Intel Omni-Path (OPA) TID RDMA support is a feature that accelerates
data movement between two OPA nodes through the IB Verbs interface. It
improves RDMA READ/WRITE performance by delivering the data payload to a
user buffer directly without any software copying.

Architecture
=============
The TID RDMA protocol is implemented on the hfi1 driver level and is
therefore transparent to the ULPs. It is designed to facilitate the data
transactions for two specific RDMA requests:
  - RDMA READ;
  - RDMA WRITE.
Previously, when a verbs data packet is received at the destination
(requester side for RDMA READ and responder side for RDMA WRITE), the
data payload is copied to the user buffer by software, which slows down
the performance significantly for large requests.

Internally, hfi1 converts qualified RDMA READ/WRITE requests into TID
RDMA READ/WRITE requests when the requests are post sent to the hfi1
driver. Non-qualified RDMA requests are handled by normal RDMA protocol.

For TID RDMA requests, hardware resources (hardware flow and TID entries)
are allocated on the destination side (the requester side for TID RDMA
READ and the responder side for TID RDMA WRITE). The information for
these resources is conveyed to the data source side (the responder side
for TID RDMA READ and the requester side for TID RDMA WRITE) and embedded
in data packets. When data packets are received by the destination,
hardware will deliver the data payload to the destination buffer without
involving software and therefore improve the performance.

Details
=======
RDMA READ/WRITE requests are qualified by the following:
  - Total data length >= 256k;
  - Totoal data length is a multiple of 4K pages.

Additional qualifications are enforced for the destination buffers:
  For RDMA RAED:
    - Each destination sge buffer is 4K aligned;
    - Each destination sge buffer is a multiple of 4K pages.
  For RDMA WRITE:
    - The destination number is 4K aligned.

In addition, in an OPA fabric, some nodes may support TID RDMA while
others may not. As such, it is important for two transaction nodes to
exchange the information about the features they support. This discovery
mechanism is called OPA Feature Negotion (OPFN) and is described in
details in the patch series. Through OPFN, two nodes can find whether
they both support TID RDMA and subsequently convert RDMA requests into
TID RDMA requests.

* hfi1-tid: (46 commits)
  IB/hfi1: Prioritize the sending of ACK packets
  IB/hfi1: Add static trace for TID RDMA WRITE protocol
  IB/hfi1: Enable TID RDMA WRITE protocol
  IB/hfi1: Add interlock between TID RDMA WRITE and other requests
  IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs
  IB/hfi1: Add the dual leg code
  IB/hfi1: Add the TID second leg ACK packet builder
  IB/hfi1: Add the TID second leg send packet builder
  IB/hfi1: Resend the TID RDMA WRITE DATA packets
  IB/hfi1: Add a function to receive TID RDMA RESYNC packet
  IB/hfi1: Add a function to build TID RDMA RESYNC packet
  IB/hfi1: Add TID RDMA retry timer
  IB/hfi1: Add a function to receive TID RDMA ACK packet
  IB/hfi1: Add a function to build TID RDMA ACK packet
  IB/hfi1: Add a function to receive TID RDMA WRITE DATA packet
  IB/hfi1: Add a function to build TID RDMA WRITE DATA packet
  IB/hfi1: Add a function to receive TID RDMA WRITE response
  IB/hfi1: Add TID resource timer
  IB/hfi1: Add a function to build TID RDMA WRITE response
  IB/hfi1: Add functions to receive TID RDMA WRITE request
  ...

Signed-off-by: default avatarDoug Ledford <dledford@redhat.com>
parents db421a54 885c5807
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@ hfi1-y := \
	mad.o \
	mmu_rb.o \
	msix.o \
	opfn.o \
	pcie.o \
	pio.o \
	pio_copy.o \
+13 −0
Original line number Diff line number Diff line
@@ -4253,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
			    access_sw_pio_drain),
[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
			    access_sw_kmem_wait),
[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
			    hfi1_access_sw_tid_wait),
[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
			    access_sw_send_schedule),
[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
@@ -5222,6 +5224,17 @@ int is_bx(struct hfi1_devdata *dd)
	return (chip_rev_minor & 0xF0) == 0x10;
}

/* return true is kernel urg disabled for rcd */
bool is_urg_masked(struct hfi1_ctxtdata *rcd)
{
	u64 mask;
	u32 is = IS_RCVURGENT_START + rcd->ctxt;
	u8 bit = is % 64;

	mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
	return !(mask & BIT_ULL(bit));
}

/*
 * Append string s to buffer buf.  Arguments curp and len are the current
 * position and remaining length, respectively.
+3 −1
Original line number Diff line number Diff line
#ifndef _CHIP_H
#define _CHIP_H
/*
 * Copyright(c) 2015 - 2017 Intel Corporation.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
@@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd);
u32 hdrqempty(struct hfi1_ctxtdata *rcd);
int is_ax(struct hfi1_devdata *dd);
int is_bx(struct hfi1_devdata *dd);
bool is_urg_masked(struct hfi1_ctxtdata *rcd);
u32 read_physical_state(struct hfi1_devdata *dd);
u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
const char *opa_lstate_name(u32 lstate);
@@ -926,6 +927,7 @@ enum {
	C_SW_PIO_WAIT,
	C_SW_PIO_DRAIN,
	C_SW_KMEM_WAIT,
	C_SW_TID_WAIT,
	C_SW_SEND_SCHED,
	C_SDMA_DESC_FETCHED_CNT,
	C_SDMA_INT_CNT,
+4 −0
Original line number Diff line number Diff line
@@ -340,6 +340,10 @@ struct diag_pkt {

#define HFI1_PSM_IOC_BASE_SEQ 0x0

/* Number of BTH.PSN bits used for sequence number in expected rcvs */
#define HFI1_KDETH_BTH_SEQ_SHIFT 11
#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)

static inline __u64 rhf_to_cpu(const __le32 *rbuf)
{
	return __le64_to_cpu(*((__le64 *)rbuf));
+37 −21
Original line number Diff line number Diff line
@@ -1575,13 +1575,11 @@ drop:
	return -EINVAL;
}

void handle_eflags(struct hfi1_packet *packet)
static void show_eflags_errs(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	u32 rte = rhf_rcv_type_err(packet->rhf);

	rcv_hdrerr(rcd, rcd->ppd, packet);
	if (rhf_err_flags(packet->rhf))
	dd_dev_err(rcd->dd,
		   "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
		   rcd->ctxt, packet->rhf,
@@ -1596,6 +1594,15 @@ void handle_eflags(struct hfi1_packet *packet)
		   rte);
}

void handle_eflags(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;

	rcv_hdrerr(rcd, rcd->ppd, packet);
	if (rhf_err_flags(packet->rhf))
		show_eflags_errs(packet);
}

/*
 * The following functions are called by the interrupt handler. They are type
 * specific handlers for each packet type.
@@ -1699,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet)
	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
		return RHF_RCV_CONTINUE;

	if (unlikely(rhf_err_flags(packet->rhf)))
		handle_eflags(packet);
	if (unlikely(rhf_err_flags(packet->rhf))) {
		struct hfi1_ctxtdata *rcd = packet->rcd;

	dd_dev_err(packet->rcd->dd,
		   "Unhandled expected packet received. Dropping.\n");
		if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
			return RHF_RCV_CONTINUE;
	}

	hfi1_kdeth_expected_rcv(packet);
	return RHF_RCV_CONTINUE;
}

@@ -1712,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet)
	hfi1_setup_9B_packet(packet);
	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
		return RHF_RCV_CONTINUE;
	if (unlikely(rhf_err_flags(packet->rhf)))
		handle_eflags(packet);

	dd_dev_err(packet->rcd->dd,
		   "Unhandled eager packet received. Dropping.\n");
	trace_hfi1_rcvhdr(packet);
	if (unlikely(rhf_err_flags(packet->rhf))) {
		struct hfi1_ctxtdata *rcd = packet->rcd;

		show_eflags_errs(packet);
		if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
			return RHF_RCV_CONTINUE;
	}

	hfi1_kdeth_eager_rcv(packet);
	return RHF_RCV_CONTINUE;
}

Loading