RDMA/core: Introduce shared CQ pool API (c7ff819a) · Commits · 戴 / test

drivers/infiniband/core/core_priv.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -414,4 +414,7 @@ void rdma_umap_priv_init(struct rdma_umap_priv *priv,
		struct vm_area_struct *vma,
		struct rdma_user_mmap_entry *entry);

		/* Set up the shared CQ pool lock and the per-poll-context pool lists of @dev. */
		void ib_cq_pool_init(struct ib_device *dev);
		/* Release every CQ that is still held in the shared CQ pools of @dev. */
		void ib_cq_pool_destroy(struct ib_device *dev);

		#endif /* _CORE_PRIV_H */

drivers/infiniband/core/cq.c

+173 −0

Original line number	Diff line number	Diff line
		@@ -7,7 +7,11 @@
		#include <linux/slab.h>
		#include <rdma/ib_verbs.h>

		#include "core_priv.h"

		#include <trace/events/rdma_core.h>
		/* Max size for shared CQ, may require tuning */
		#define IB_MAX_SHARED_CQ_SZ 4096U

		/* # of WCs to poll for with a single call to ib_poll_cq */
		#define IB_POLL_BATCH 16
		@@ -218,6 +222,7 @@ struct ib_cq __ib_alloc_cq_user(struct ib_device dev, void *private,
		cq->cq_context = private;
		cq->poll_ctx = poll_ctx;
		atomic_set(&cq->usecnt, 0);
		cq->comp_vector = comp_vector;

		cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
		if (!cq->wc)
		@@ -309,6 +314,8 @@ void ib_free_cq_user(struct ib_cq cq, struct ib_udata udata)
		{
		if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
		return;
		if (WARN_ON_ONCE(cq->cqe_used))
		return;

		switch (cq->poll_ctx) {
		case IB_POLL_DIRECT:
		@@ -334,3 +341,169 @@ void ib_free_cq_user(struct ib_cq cq, struct ib_udata udata)
		kfree(cq);
		}
		EXPORT_SYMBOL(ib_free_cq_user);

		void ib_cq_pool_init(struct ib_device *dev)
		{
		unsigned int i;

		spin_lock_init(&dev->cq_pools_lock);
		for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++)
		INIT_LIST_HEAD(&dev->cq_pools[i]);
		}

		/*
		 * ib_cq_pool_destroy - tear down the shared CQ pools of a device
		 * @dev: device whose pools are being emptied
		 *
		 * Walks every per-poll-context pool, unlinks each CQ from it and hands
		 * the CQ to ib_free_cq(). A CQ that still has claimed entries
		 * (cqe_used != 0) triggers a warning.
		 */
		void ib_cq_pool_destroy(struct ib_device *dev)
		{
			struct ib_cq *cq, *n;
			unsigned int i;

			for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
				/* _safe variant: entries are removed while iterating */
				list_for_each_entry_safe(cq, n, &dev->cq_pools[i],
							 pool_entry) {
					WARN_ON(cq->cqe_used);
					/* Unlink so the pool list never points at a released CQ */
					list_del(&cq->pool_entry);
					/*
					 * NOTE(review): the flag is cleared before freeing,
					 * presumably because the free path treats shared CQs
					 * specially - confirm against ib_free_cq().
					 */
					cq->shared = false;
					ib_free_cq(cq);
				}
			}
		}

		/*
		 * ib_alloc_cqs - grow one shared CQ pool of a device
		 * @dev: device to allocate the CQs on
		 * @nr_cqes: minimum number of entries each new CQ should have
		 * @poll_ctx: polling context, which also selects the pool to grow
		 *
		 * Allocates one CQ per usable completion vector (bounded by the number
		 * of online CPUs). The CQs are collected on a private list first and
		 * are only spliced into the device pool, under cq_pools_lock, once all
		 * allocations have succeeded.
		 *
		 * Return: 0 on success, -EINVAL for a poll context that cannot be
		 * pooled, or the error reported by ib_alloc_cq(). On failure every CQ
		 * allocated by this call is released again.
		 */
		static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
					enum ib_poll_context poll_ctx)
		{
			LIST_HEAD(tmp_list);
			unsigned int nr_cqs, i;
			struct ib_cq *cq, *n;
			int ret;

			if (WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE))
				return -EINVAL;

			/*
			 * Allocate at least as many CQEs as requested, and otherwise
			 * a reasonable batch size so that we can share CQs between
			 * multiple users instead of allocating a larger number of CQs.
			 */
			nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
					max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
			nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
			for (i = 0; i < nr_cqs; i++) {
				/* The loop index doubles as the completion vector */
				cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
				if (IS_ERR(cq)) {
					ret = PTR_ERR(cq);
					goto out_free_cqs;
				}
				cq->shared = true;
				list_add_tail(&cq->pool_entry, &tmp_list);
			}

			spin_lock_irq(&dev->cq_pools_lock);
			list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
			spin_unlock_irq(&dev->cq_pools_lock);

			return 0;

		out_free_cqs:
			/*
			 * Each entry is released inside the loop body, so the next
			 * element must be fetched before the current one goes away:
			 * use the _safe iterator to avoid touching a freed CQ.
			 */
			list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) {
				cq->shared = false;
				ib_free_cq(cq);
			}
			return ret;
		}

		/**
		* ib_cq_pool_get() - Find the least used completion queue that matches
		* a given cpu hint (or least used for wild card affinity) and fits
		* nr_cqe.
		* @dev: rdma device
		* @nr_cqe: number of needed cqe entries
		* @comp_vector_hint: completion vector hint (-1) for the driver to assign
		* a comp vector based on internal counter
		* @poll_ctx: cq polling context
		*
		* Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and
		* claim entries in it for us. In case there is no available cq, allocate
		* a new cq with the requirements and add it to the device pool.
		* IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
		* for @poll_ctx.
		*/
		struct ib_cq ib_cq_pool_get(struct ib_device dev, unsigned int nr_cqe,
		int comp_vector_hint,
		enum ib_poll_context poll_ctx)
		{
		static unsigned int default_comp_vector;
		unsigned int vector, num_comp_vectors;
		struct ib_cq cq, found = NULL;
		int ret;

		if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
		WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
		return ERR_PTR(-EINVAL);
		}

		num_comp_vectors =
		min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
		/* Project the affinty to the device completion vector range */
		if (comp_vector_hint < 0) {
		comp_vector_hint =
		(READ_ONCE(default_comp_vector) + 1) % num_comp_vectors;
		WRITE_ONCE(default_comp_vector, comp_vector_hint);
		}
		vector = comp_vector_hint % num_comp_vectors;

		/*
		* Find the least used CQ with correct affinity and
		* enough free CQ entries
		*/
		while (!found) {
		spin_lock_irq(&dev->cq_pools_lock);
		list_for_each_entry(cq, &dev->cq_pools[poll_ctx],
		pool_entry) {
		/*
		* Check to see if we have found a CQ with the
		* correct completion vector
		*/
		if (vector != cq->comp_vector)
		continue;
		if (cq->cqe_used + nr_cqe > cq->cqe)
		continue;
		found = cq;
		break;
		}

		if (found) {
		found->cqe_used += nr_cqe;
		spin_unlock_irq(&dev->cq_pools_lock);

		return found;
		}
		spin_unlock_irq(&dev->cq_pools_lock);

		/*
		* Didn't find a match or ran out of CQs in the device
		* pool, allocate a new array of CQs.
		*/
		ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
		if (ret)
		return ERR_PTR(ret);
		}

		return found;
		}
		EXPORT_SYMBOL(ib_cq_pool_get);

		/**
		* ib_cq_pool_put - Return a CQ taken from a shared pool.
		* @cq: The CQ to return.
		* @nr_cqe: The max number of cqes that the user had requested.
		*/
		void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe)
		{
		if (WARN_ON_ONCE(nr_cqe > cq->cqe_used))
		return;

		spin_lock_irq(&cq->device->cq_pools_lock);
		cq->cqe_used -= nr_cqe;
		spin_unlock_irq(&cq->device->cq_pools_lock);
		}
		EXPORT_SYMBOL(ib_cq_pool_put);

drivers/infiniband/core/device.c

+2 −0

Original line number	Diff line number	Diff line
		@@ -1393,6 +1393,7 @@ int ib_register_device(struct ib_device device, const char name)
		goto dev_cleanup;
		}

		ib_cq_pool_init(device);
		ret = enable_device_and_get(device);
		dev_set_uevent_suppress(&device->dev, false);
		/* Mark for userspace that device is ready */
		@@ -1447,6 +1448,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
		goto out;

		disable_device(ib_dev);
		ib_cq_pool_destroy(ib_dev);

		/* Expedite removing unregistered pointers from the hash table */
		free_netdevs(ib_dev);

include/rdma/ib_verbs.h

+16 −1

Original line number	Diff line number	Diff line
		@@ -1588,10 +1588,12 @@ struct ib_ah {
		typedef void (ib_comp_handler)(struct ib_cq cq, void *cq_context);

		/*
		 * Contexts a CQ can be polled from.
		 *
		 * The values up to and including IB_POLL_LAST_POOL_TYPE index the
		 * per-device shared CQ pools; IB_POLL_DIRECT is deliberately placed
		 * after that marker so that it is rejected as a pool type.
		 */
		enum ib_poll_context {
			IB_POLL_SOFTIRQ,	   /* poll from softirq context */
			IB_POLL_WORKQUEUE,	   /* poll from workqueue */
			IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */
			IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE,

			IB_POLL_DIRECT,		   /* caller context, no hw completions */
		};

		struct ib_cq {
		@@ -1601,9 +1603,11 @@ struct ib_cq {
		void (event_handler)(struct ib_event , void *);
		void *cq_context;
		int cqe;
		unsigned int cqe_used;
		atomic_t usecnt; /* count number of work queues */
		enum ib_poll_context poll_ctx;
		struct ib_wc *wc;
		struct list_head pool_entry;
		union {
		struct irq_poll iop;
		struct work_struct work;
		@@ -1615,6 +1619,7 @@ struct ib_cq {
		ktime_t timestamp;
		u8 interrupt:1;
		u8 shared:1;
		unsigned int comp_vector;

		/*
		* Implementation details of the RDMA core, don't use in drivers:
		@@ -2734,6 +2739,10 @@ struct ib_device {
		#endif

		u32 index;

		spinlock_t cq_pools_lock;
		struct list_head cq_pools[IB_POLL_LAST_POOL_TYPE + 1];

		struct rdma_restrack_root *res;

		const struct uapi_definition *driver_def;
		@@ -4037,6 +4046,12 @@ static inline int ib_req_notify_cq(struct ib_cq *cq,
		return cq->device->ops.req_notify_cq(cq, flags);
		}

		/*
		 * Claim @nr_cqe entries on a shared CQ of @dev that matches
		 * @comp_vector_hint (negative: no preference) and @poll_ctx.
		 * Returns the CQ or an ERR_PTR-encoded error.
		 */
		struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
					     int comp_vector_hint,
					     enum ib_poll_context poll_ctx);

		/* Release @nr_cqe previously claimed entries on the shared CQ @cq. */
		void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe);

		/**
		* ib_req_ncomp_notif - Request completion notification when there are
		* at least the specified number of unreaped completions on the CQ.

Admin message