Commit 6c70f899 authored by Jens Axboe

Merge branch 'nvme-5.2-rc-next' of git://git.infradead.org/nvme into for-linus

Pull NVMe fixes from Sagi.

* 'nvme-5.2-rc-next' of git://git.infradead.org/nvme:
  nvme-rdma: use dynamic dma mapping per command
  nvme: Fix u32 overflow in the number of namespace list calculation
  nvmet: fix data_len to 0 for bdev-backed write_zeroes
  nvme-tcp: fix queue mapping when queue count is limited
  nvme-rdma: fix queue mapping when queue count is limited
parents 19e9da9e 62f99b62
drivers/nvme/host/core.c  +2 −1
@@ -3400,7 +3400,8 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
{
	struct nvme_ns *ns;
	__le32 *ns_list;
-	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
+	unsigned i, j, nsid, prev = 0;
+	unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024);
	int ret = 0;

	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
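
Why the old line overflows: DIV_ROUND_UP(n, d) expands to ((n) + (d) - 1) / (d), so with nn taken from the controller's 32-bit NN field, nn + 1023 wraps for values near U32_MAX and num_lists collapses to 0, so no list is ever fetched. Widening to u64 first avoids the wrap. A minimal userspace sketch (macros redefined locally, not the kernel's definitions):

	#include <stdint.h>
	#include <stdio.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
	#define DIV_ROUND_UP_ULL(n, d)	(((n) + (d) - 1) / (d))

	int main(void)
	{
		uint32_t nn = 0xffffffffu;	/* a controller reporting NN = U32_MAX */

		/* 32-bit math: nn + 1023 wraps to 1022, so this prints 0 */
		printf("num_lists (u32) = %u\n", DIV_ROUND_UP(nn, 1024u));
		/* widen to 64 bits first: prints the expected 4194304 */
		printf("num_lists (u64) = %llu\n",
		       (unsigned long long)DIV_ROUND_UP_ULL((uint64_t)nn, 1024));
		return 0;
	}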
drivers/nvme/host/rdma.c  +97 −55
@@ -213,6 +213,11 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
	if (!ring)
		return NULL;

+	/*
+	 * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
+	 * lifetime. It's safe, since any change in the underlying RDMA device
+	 * will issue error recovery and queue re-creation.
+	 */
	for (i = 0; i < ib_queue_size; i++) {
		if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
			goto out_free_ring;
@@ -274,14 +279,9 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
		struct request *rq, unsigned int hctx_idx)
{
-	struct nvme_rdma_ctrl *ctrl = set->driver_data;
	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
-	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
-	struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
-	struct nvme_rdma_device *dev = queue->device;

-	nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
-			DMA_TO_DEVICE);
+	kfree(req->sqe.data);
}

static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
@@ -292,15 +292,11 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
	struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
-	struct nvme_rdma_device *dev = queue->device;
-	struct ib_device *ibdev = dev->dev;
-	int ret;

	nvme_req(rq)->ctrl = &ctrl->ctrl;
-	ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
-			DMA_TO_DEVICE);
-	if (ret)
-		return ret;
+	req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
+	if (!req->sqe.data)
+		return -ENOMEM;

	req->queue = queue;

@@ -641,34 +637,16 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
{
	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
	struct ib_device *ibdev = ctrl->device->dev;
-	unsigned int nr_io_queues;
+	unsigned int nr_io_queues, nr_default_queues;
+	unsigned int nr_read_queues, nr_poll_queues;
	int i, ret;

-	nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
-
-	/*
-	 * we map queues according to the device irq vectors for
-	 * optimal locality so we don't need more queues than
-	 * completion vectors.
-	 */
-	nr_io_queues = min_t(unsigned int, nr_io_queues,
-				ibdev->num_comp_vectors);
-
-	if (opts->nr_write_queues) {
-		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
-				min(opts->nr_write_queues, nr_io_queues);
-		nr_io_queues += ctrl->io_queues[HCTX_TYPE_DEFAULT];
-	} else {
-		ctrl->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues;
-	}
-
-	ctrl->io_queues[HCTX_TYPE_READ] = nr_io_queues;
-
-	if (opts->nr_poll_queues) {
-		ctrl->io_queues[HCTX_TYPE_POLL] =
-			min(opts->nr_poll_queues, num_online_cpus());
-		nr_io_queues += ctrl->io_queues[HCTX_TYPE_POLL];
-	}
+	nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+				min(opts->nr_io_queues, num_online_cpus()));
+	nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+				min(opts->nr_write_queues, num_online_cpus()));
+	nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
+	nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;

	ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
	if (ret)
@@ -681,6 +659,34 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
	dev_info(ctrl->ctrl.device,
		"creating %d I/O queues.\n", nr_io_queues);

+	if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
+		/*
+		 * separate read/write queues
+		 * hand out dedicated default queues only after we have
+		 * sufficient read queues.
+		 */
+		ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+			min(nr_default_queues, nr_io_queues);
+		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+	} else {
+		/*
+		 * shared read/write queues
+		 * either no write queues were requested, or we don't have
+		 * sufficient queue count to have dedicated default queues.
+		 */
+		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+			min(nr_read_queues, nr_io_queues);
+		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+	}
+
+	if (opts->nr_poll_queues && nr_io_queues) {
+		/* map dedicated poll queues only if we have queues left */
+		ctrl->io_queues[HCTX_TYPE_POLL] =
+			min(nr_poll_queues, nr_io_queues);
+	}

	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
		ret = nvme_rdma_alloc_queue(ctrl, i,
				ctrl->ctrl.sqsize + 1);
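
A worked trace of the hand-out policy above, with illustrative numbers (a standalone C sketch, simplified from the kernel logic; the min_u() helper and the sample counts are mine):

	#include <stdio.h>

	static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }

	int main(void)
	{
		/* asked for 6 read, 4 default and 2 poll queues, but assume
		 * nvme_set_queue_count() was only granted 8 of the 12 */
		unsigned nr_read_queues = 6, nr_default_queues = 4, nr_poll_queues = 2;
		unsigned nr_io_queues = 8;
		unsigned def = 0, read = 0, poll = 0;

		if (nr_read_queues < nr_io_queues) {
			/* separate read/write: reads are handed out first */
			read = nr_read_queues;
			nr_io_queues -= read;
			def = min_u(nr_default_queues, nr_io_queues);
			nr_io_queues -= def;
		} else {
			/* shared read/write queues */
			def = min_u(nr_read_queues, nr_io_queues);
			nr_io_queues -= def;
		}
		poll = min_u(nr_poll_queues, nr_io_queues);

		/* prints "default/read/poll = 2/6/0" for these inputs */
		printf("default/read/poll = %u/%u/%u\n", def, read, poll);
		return 0;
	}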
@@ -769,6 +775,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,

	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);

+	/*
+	 * Bind the async event SQE DMA mapping to the admin queue lifetime.
+	 * It's safe, since any change in the underlying RDMA device will issue
+	 * error recovery and queue re-creation.
+	 */
	error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
			sizeof(struct nvme_command), DMA_TO_DEVICE);
	if (error)
@@ -1709,12 +1720,20 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
		return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);

	dev = queue->device->dev;

+	req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
+					 sizeof(struct nvme_command),
+					 DMA_TO_DEVICE);
+	err = ib_dma_mapping_error(dev, req->sqe.dma);
+	if (unlikely(err))
+		return BLK_STS_RESOURCE;

	ib_dma_sync_single_for_cpu(dev, sqe->dma,
			sizeof(struct nvme_command), DMA_TO_DEVICE);

	ret = nvme_setup_cmd(ns, rq, c);
	if (ret)
-		return ret;
+		goto unmap_qe;

	blk_mq_start_request(rq);

@@ -1739,10 +1758,16 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
	}

	return BLK_STS_OK;

err:
	if (err == -ENOMEM || err == -EAGAIN)
-		return BLK_STS_RESOURCE;
-	return BLK_STS_IOERR;
+		ret = BLK_STS_RESOURCE;
+	else
+		ret = BLK_STS_IOERR;
+unmap_qe:
+	ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
+			    DMA_TO_DEVICE);
+	return ret;
}
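
Taken together, these rdma.c hunks move the SQE DMA mapping from queue scope to command scope: the buffer is still allocated once per request, but the mapping now lives only as long as each command. Condensed lifecycle (kernel-style sketch of the flow above, not a compilable unit):

	init_request:  req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
	queue_rq:      req->sqe.dma  = ib_dma_map_single(dev, req->sqe.data, ...);
	               /* sync for CPU, build the command, post the send WR */
	complete_rq:   ib_dma_unmap_single(ibdev, req->sqe.dma, ...);
	exit_request:  kfree(req->sqe.data);

Since a mapping is created and torn down around each command, it can no longer outlive the RDMA device it was created on.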

static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
@@ -1755,25 +1780,36 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
static void nvme_rdma_complete_rq(struct request *rq)
{
	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_rdma_queue *queue = req->queue;
+	struct ib_device *ibdev = queue->device->dev;

-	nvme_rdma_unmap_data(req->queue, rq);
+	nvme_rdma_unmap_data(queue, rq);
+	ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
+			    DMA_TO_DEVICE);
	nvme_complete_rq(rq);
}

static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_rdma_ctrl *ctrl = set->driver_data;
+	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;

-	set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
+		/* separate read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-	set->map[HCTX_TYPE_READ].nr_queues = ctrl->io_queues[HCTX_TYPE_READ];
-	if (ctrl->ctrl.opts->nr_write_queues) {
-		/* separate read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+		set->map[HCTX_TYPE_READ].nr_queues =
+			ctrl->io_queues[HCTX_TYPE_READ];
		set->map[HCTX_TYPE_READ].queue_offset =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
	} else {
-		/* mixed read/write queues */
+		/* shared read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+			ctrl->io_queues[HCTX_TYPE_DEFAULT];
+		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+		set->map[HCTX_TYPE_READ].nr_queues =
+			ctrl->io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_READ].queue_offset = 0;
	}
	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
@@ -1781,16 +1817,22 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
			ctrl->device->dev, 0);

-	if (ctrl->ctrl.opts->nr_poll_queues) {
+	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+		/* map dedicated poll queues only if we have queues left */
		set->map[HCTX_TYPE_POLL].nr_queues =
				ctrl->io_queues[HCTX_TYPE_POLL];
		set->map[HCTX_TYPE_POLL].queue_offset =
-				ctrl->io_queues[HCTX_TYPE_DEFAULT];
-		if (ctrl->ctrl.opts->nr_write_queues)
-			set->map[HCTX_TYPE_POLL].queue_offset +=
+			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+			ctrl->io_queues[HCTX_TYPE_READ];
		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
	}

+	dev_info(ctrl->ctrl.device,
+		"mapped %d/%d/%d default/read/poll queues.\n",
+		ctrl->io_queues[HCTX_TYPE_DEFAULT],
+		ctrl->io_queues[HCTX_TYPE_READ],
+		ctrl->io_queues[HCTX_TYPE_POLL]);

	return 0;
}
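
Continuing the illustrative 2/6/0 split from the allocation example, the maps set up above carve the 8 granted I/O queues into disjoint ranges:

	HCTX_TYPE_DEFAULT: queue_offset 0, nr_queues 2   /* queues 0-1, writes */
	HCTX_TYPE_READ:    queue_offset 2, nr_queues 6   /* queues 2-7 */
	HCTX_TYPE_POLL:    not mapped                    /* no queues left */

In the shared branch, DEFAULT and READ both start at offset 0 with the same nr_queues, so reads and writes share one range instead of overrunning the granted count.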

drivers/nvme/host/tcp.c  +50 −7
@@ -111,6 +111,7 @@ struct nvme_tcp_ctrl {
	struct work_struct	err_work;
	struct delayed_work	connect_work;
	struct nvme_tcp_request async_req;
+	u32			io_queues[HCTX_MAX_TYPES];
};

static LIST_HEAD(nvme_tcp_ctrl_list);
@@ -1564,6 +1565,35 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
	return nr_io_queues;
}

+static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
+		unsigned int nr_io_queues)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct nvmf_ctrl_options *opts = nctrl->opts;
+
+	if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
+		/*
+		 * separate read/write queues
+		 * hand out dedicated default queues only after we have
+		 * sufficient read queues.
+		 */
+		ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
+		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+			min(opts->nr_write_queues, nr_io_queues);
+		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+	} else {
+		/*
+		 * shared read/write queues
+		 * either no write queues were requested, or we don't have
+		 * sufficient queue count to have dedicated default queues.
+		 */
+		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+			min(opts->nr_io_queues, nr_io_queues);
+		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+	}
+}
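
For example (illustrative numbers): with nr_io_queues=8 and nr_write_queues=4 but only 4 queues granted, opts->nr_io_queues is not below nr_io_queues, so the shared branch runs: HCTX_TYPE_DEFAULT gets min(8, 4) = 4 and HCTX_TYPE_READ stays 0. That zero is what the ctrl->io_queues[HCTX_TYPE_READ] guard in nvme_tcp_map_queues() below relies on to fall back to shared maps.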

static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
	unsigned int nr_io_queues;
@@ -1581,6 +1611,8 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
	dev_info(ctrl->device,
		"creating %d I/O queues.\n", nr_io_queues);

+	nvme_tcp_set_io_queues(ctrl, nr_io_queues);

	return __nvme_tcp_alloc_io_queues(ctrl);
}

@@ -2089,23 +2121,34 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_tcp_ctrl *ctrl = set->driver_data;
+	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;

-	set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
-	set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
-	if (ctrl->ctrl.opts->nr_write_queues) {
+	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
		/* separate read/write queues */
		set->map[HCTX_TYPE_DEFAULT].nr_queues =
-				ctrl->ctrl.opts->nr_write_queues;
+			ctrl->io_queues[HCTX_TYPE_DEFAULT];
+		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+		set->map[HCTX_TYPE_READ].nr_queues =
+			ctrl->io_queues[HCTX_TYPE_READ];
		set->map[HCTX_TYPE_READ].queue_offset =
-				ctrl->ctrl.opts->nr_write_queues;
+			ctrl->io_queues[HCTX_TYPE_DEFAULT];
	} else {
-		/* mixed read/write queues */
+		/* shared read/write queues */
		set->map[HCTX_TYPE_DEFAULT].nr_queues =
-				ctrl->ctrl.opts->nr_io_queues;
+			ctrl->io_queues[HCTX_TYPE_DEFAULT];
+		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+		set->map[HCTX_TYPE_READ].nr_queues =
+			ctrl->io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_READ].queue_offset = 0;
	}
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);

+	dev_info(ctrl->ctrl.device,
+		"mapped %d/%d default/read queues.\n",
+		ctrl->io_queues[HCTX_TYPE_DEFAULT],
+		ctrl->io_queues[HCTX_TYPE_READ]);

	return 0;
}
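
The new dev_info() makes the resulting split visible at connect time. With a reasonably recent nvme-cli, something like the following (option names assumed from its fabrics support; check your version):

	nvme connect -t tcp -a 192.168.1.10 -s 4420 -n <subsys-nqn> \
		--nr-io-queues=6 --nr-write-queues=4

should log "mapped 4/6 default/read queues." if the controller grants all 10 queues, and a smaller, read-favoring split when it grants fewer.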

drivers/nvme/target/io-cmd-bdev.c  +1 −0
@@ -293,6 +293,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
		return 0;
	case nvme_cmd_write_zeroes:
		req->execute = nvmet_bdev_execute_write_zeroes;
+		req->data_len = 0;
		return 0;
	default:
		pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
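
Context for the one-line fix above: the nvmet core rejects a request whose declared data_len does not match what the transport transferred, and Write Zeroes carries no data payload at all. Roughly what that check looked like in the v5.2-era target core (quoted from memory, treat as a sketch):

	void nvmet_req_execute(struct nvmet_req *req)
	{
		if (unlikely(req->data_len != req->transfer_len))
			nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
		else
			req->execute(req);
	}

With data_len never initialized for bdev-backed Write Zeroes, the comparison failed and the command completed with a protocol error even though it was well-formed.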