Commit c66fa19c authored by Matan Barak, committed by David S. Miller
Browse files

net/mlx4: Add EQ pool



Previously, mlx4_en allocated EQs and used them exclusively.
This affected RoCE performance, as event-sensitive applications
were limited to using only the legacy EQs.

Change that by introducing an EQ pool. This pool is managed
by mlx4_core. EQs are assigned to ports (when there is a limited
number of EQs, multiple ports may be assigned to the same EQs).

An exception to this rule is the ASYNC EQ which handles various events.

Legacy EQs are completely removed as all EQs could be shared.

When a consumer (mlx4_ib/mlx4_en) requests an EQ, it asks for
an EQ serving a specific port. The core driver calculates which
EQ should be assigned to that request.

Because IRQs are shared between IB and Ethernet modules, their
names only include the PCI device BDF address.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Ido Shamay <idos@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 48564135
Loading
Loading
Loading
Loading
+23 −48
Original line number Original line Diff line number Diff line
@@ -2041,77 +2041,52 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev)


static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
{
{
	char name[80];
	int i, j, eq = 0, total_eqs = 0;
	int eq_per_port = 0;
	int added_eqs = 0;
	int total_eqs = 0;
	int i, j, eq;

	/* Legacy mode or comp_pool is not large enough */
	if (dev->caps.comp_pool == 0 ||
	    dev->caps.num_ports > dev->caps.comp_pool)
		return;

	eq_per_port = dev->caps.comp_pool / dev->caps.num_ports;

	/* Init eq table */
	added_eqs = 0;
	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
		added_eqs += eq_per_port;

	total_eqs = dev->caps.num_comp_vectors + added_eqs;


	ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL);
	ibdev->eq_table = kcalloc(dev->caps.num_comp_vectors,
				  sizeof(ibdev->eq_table[0]), GFP_KERNEL);
	if (!ibdev->eq_table)
	if (!ibdev->eq_table)
		return;
		return;


	ibdev->eq_added = added_eqs;
	for (i = 1; i <= dev->caps.num_ports; i++) {

		for (j = 0; j < mlx4_get_eqs_per_port(dev, i);
	eq = 0;
		     j++, total_eqs++) {
	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) {
			if (i > 1 &&  mlx4_is_eq_shared(dev, total_eqs))
		for (j = 0; j < eq_per_port; j++) {
				continue;
			snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s",
			ibdev->eq_table[eq] = total_eqs;
				 i, j, dev->persist->pdev->bus->name);
			if (!mlx4_assign_eq(dev, i,
			/* Set IRQ for specific name (per ring) */
					    &ibdev->eq_table[eq]))
			if (mlx4_assign_eq(dev, name, NULL,
					   &ibdev->eq_table[eq])) {
				/* Use legacy (same as mlx4_en driver) */
				pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq);
				ibdev->eq_table[eq] =
					(eq % dev->caps.num_comp_vectors);
			}
				eq++;
				eq++;
			else
				ibdev->eq_table[eq] = -1;
		}
		}
	}
	}


	/* Fill the reset of the vector with legacy EQ */
	for (i = eq; i < dev->caps.num_comp_vectors;
	for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++)
	     ibdev->eq_table[i++] = -1)
		ibdev->eq_table[eq++] = i;
		;


	/* Advertise the new number of EQs to clients */
	/* Advertise the new number of EQs to clients */
	ibdev->ib_dev.num_comp_vectors = total_eqs;
	ibdev->ib_dev.num_comp_vectors = eq;
}
}


static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
{
{
	int i;
	int i;
	int total_eqs = ibdev->ib_dev.num_comp_vectors;


	/* no additional eqs were added */
	/* no eqs were allocated */
	if (!ibdev->eq_table)
	if (!ibdev->eq_table)
		return;
		return;


	/* Reset the advertised EQ number */
	/* Reset the advertised EQ number */
	ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
	ibdev->ib_dev.num_comp_vectors = 0;


	/* Free only the added eqs */
	for (i = 0; i < total_eqs; i++)
	for (i = 0; i < ibdev->eq_added; i++) {
		/* Don't free legacy eqs if used */
		if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors)
			continue;
		mlx4_release_eq(dev, ibdev->eq_table[i]);
		mlx4_release_eq(dev, ibdev->eq_table[i]);
	}


	kfree(ibdev->eq_table);
	kfree(ibdev->eq_table);
	ibdev->eq_table = NULL;
}
}


static void *mlx4_ib_add(struct mlx4_dev *dev)
static void *mlx4_ib_add(struct mlx4_dev *dev)
+0 −1
Original line number Original line Diff line number Diff line
@@ -523,7 +523,6 @@ struct mlx4_ib_dev {
	struct mlx4_ib_iboe	iboe;
	struct mlx4_ib_iboe	iboe;
	int			counters[MLX4_MAX_PORTS];
	int			counters[MLX4_MAX_PORTS];
	int		       *eq_table;
	int		       *eq_table;
	int			eq_added;
	struct kobject	       *iov_parent;
	struct kobject	       *iov_parent;
	struct kobject	       *ports_parent;
	struct kobject	       *ports_parent;
	struct kobject	       *dev_ports_parent[MLX4_MFUNC_MAX];
	struct kobject	       *dev_ports_parent[MLX4_MFUNC_MAX];
+5 −5
Original line number Original line Diff line number Diff line
@@ -292,7 +292,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
	u64 mtt_addr;
	u64 mtt_addr;
	int err;
	int err;


	if (vector > dev->caps.num_comp_vectors + dev->caps.comp_pool)
	if (vector >= dev->caps.num_comp_vectors)
		return -EINVAL;
		return -EINVAL;


	cq->vector = vector;
	cq->vector = vector;
@@ -319,7 +319,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
		cq_context->flags  |= cpu_to_be32(1 << 19);
		cq_context->flags  |= cpu_to_be32(1 << 19);


	cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
	cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
	cq_context->comp_eqn	    = priv->eq_table.eq[vector].eqn;
	cq_context->comp_eqn	    = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn;
	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;


	mtt_addr = mlx4_mtt_addr(dev, mtt);
	mtt_addr = mlx4_mtt_addr(dev, mtt);
@@ -339,11 +339,11 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
	init_completion(&cq->free);
	init_completion(&cq->free);
	cq->comp = mlx4_add_cq_to_tasklet;
	cq->comp = mlx4_add_cq_to_tasklet;
	cq->tasklet_ctx.priv =
	cq->tasklet_ctx.priv =
		&priv->eq_table.eq[cq->vector].tasklet_ctx;
		&priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].tasklet_ctx;
	INIT_LIST_HEAD(&cq->tasklet_ctx.list);
	INIT_LIST_HEAD(&cq->tasklet_ctx.list);




	cq->irq = priv->eq_table.eq[cq->vector].irq;
	cq->irq = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].irq;
	return 0;
	return 0;


err_radix:
err_radix:
@@ -368,7 +368,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
	if (err)
	if (err)
		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);


	synchronize_irq(priv->eq_table.eq[cq->vector].irq);
	synchronize_irq(priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq->vector)].irq);


	spin_lock_irq(&cq_table->lock);
	spin_lock_irq(&cq_table->lock);
	radix_tree_delete(&cq_table->tree, cq->cqn);
	radix_tree_delete(&cq_table->tree, cq->cqn);
+23 −25
Original line number Original line Diff line number Diff line
@@ -66,6 +66,7 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv,


	cq->ring = ring;
	cq->ring = ring;
	cq->is_tx = mode;
	cq->is_tx = mode;
	cq->vector = mdev->dev->caps.num_comp_vectors;


	/* Allocate HW buffers on provided NUMA node.
	/* Allocate HW buffers on provided NUMA node.
	 * dev->numa_node is used in mtt range allocation flow.
	 * dev->numa_node is used in mtt range allocation flow.
@@ -101,12 +102,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
	int err = 0;
	int err = 0;
	char name[25];
	char name[25];
	int timestamp_en = 0;
	int timestamp_en = 0;
	struct cpu_rmap *rmap =
	bool assigned_eq = false;
#ifdef CONFIG_RFS_ACCEL
		priv->dev->rx_cpu_rmap;
#else
		NULL;
#endif


	cq->dev = mdev->pndev[priv->port];
	cq->dev = mdev->pndev[priv->port];
	cq->mcq.set_ci_db  = cq->wqres.db.db;
	cq->mcq.set_ci_db  = cq->wqres.db.db;
@@ -116,23 +112,19 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
	memset(cq->buf, 0, cq->buf_size);
	memset(cq->buf, 0, cq->buf_size);


	if (cq->is_tx == RX) {
	if (cq->is_tx == RX) {
		if (mdev->dev->caps.comp_pool) {
		if (!mlx4_is_eq_vector_valid(mdev->dev, priv->port,
			if (!cq->vector) {
					     cq->vector)) {
				sprintf(name, "%s-%d", priv->dev->name,
			cq->vector = cq_idx;
					cq->ring);

				/* Set IRQ for specific name (per ring) */
			err = mlx4_assign_eq(mdev->dev, priv->port,
				if (mlx4_assign_eq(mdev->dev, name, rmap,
					     &cq->vector);
						   &cq->vector)) {
			if (err) {
					cq->vector = (cq->ring + 1 + priv->port)
				mlx4_err(mdev, "Failed assigning an EQ to %s\n",
					    % mdev->dev->caps.num_comp_vectors;
					mlx4_warn(mdev, "Failed assigning an EQ to %s, falling back to legacy EQ's\n",
					 name);
					 name);
				goto free_eq;
			}
			}


			}
			assigned_eq = true;
		} else {
			cq->vector = (cq->ring + 1 + priv->port) %
				mdev->dev->caps.num_comp_vectors;
		}
		}


		cq->irq_desc =
		cq->irq_desc =
@@ -159,7 +151,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
			    &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq,
			    &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq,
			    cq->vector, 0, timestamp_en);
			    cq->vector, 0, timestamp_en);
	if (err)
	if (err)
		return err;
		goto free_eq;


	cq->mcq.comp  = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq;
	cq->mcq.comp  = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq;
	cq->mcq.event = mlx4_en_cq_event;
	cq->mcq.event = mlx4_en_cq_event;
@@ -182,6 +174,12 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
	napi_enable(&cq->napi);
	napi_enable(&cq->napi);


	return 0;
	return 0;

free_eq:
	if (assigned_eq)
		mlx4_release_eq(mdev->dev, cq->vector);
	cq->vector = mdev->dev->caps.num_comp_vectors;
	return err;
}
}


void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
@@ -191,9 +189,9 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)


	mlx4_en_unmap_buffer(&cq->wqres.buf);
	mlx4_en_unmap_buffer(&cq->wqres.buf);
	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
	if (priv->mdev->dev->caps.comp_pool && cq->vector) {
	if (mlx4_is_eq_vector_valid(mdev->dev, priv->port, cq->vector) &&
	    cq->is_tx == RX)
		mlx4_release_eq(priv->mdev->dev, cq->vector);
		mlx4_release_eq(priv->mdev->dev, cq->vector);
	}
	cq->vector = 0;
	cq->vector = 0;
	cq->buf_size = 0;
	cq->buf_size = 0;
	cq->buf = NULL;
	cq->buf = NULL;
+1 −6
Original line number Original line Diff line number Diff line
@@ -1958,7 +1958,6 @@ void mlx4_en_free_resources(struct mlx4_en_priv *priv)
	int i;
	int i;


#ifdef CONFIG_RFS_ACCEL
#ifdef CONFIG_RFS_ACCEL
	free_irq_cpu_rmap(priv->dev->rx_cpu_rmap);
	priv->dev->rx_cpu_rmap = NULL;
	priv->dev->rx_cpu_rmap = NULL;
#endif
#endif


@@ -2016,11 +2015,7 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
	}
	}


#ifdef CONFIG_RFS_ACCEL
#ifdef CONFIG_RFS_ACCEL
	if (priv->mdev->dev->caps.comp_pool) {
	priv->dev->rx_cpu_rmap = mlx4_get_cpu_rmap(priv->mdev->dev, priv->port);
		priv->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->mdev->dev->caps.comp_pool);
		if (!priv->dev->rx_cpu_rmap)
			goto err;
	}
#endif
#endif


	return 0;
	return 0;
Loading