Commit 2d5ba0c7 authored by Jens Axboe's avatar Jens Axboe
Browse files

Merge branch 'nvme-5.4' of git://git.infradead.org/nvme into for-linus

Pull NVMe changes from Sagi:

"This set consists of various fixes and cleanups:
 - controller removal race fix from Balbir
 - quirk additions from Gabriel and Jian-Hong
 - nvme-pci power state save fix from Mario
 - Add 64bit user commands (for 64bit registers) from Marta
 - nvme-rdma/nvme-tcp fixes from Max, Mark and Me
 - Minor cleanups and nits from James, Dan and John"

* 'nvme-5.4' of git://git.infradead.org/nvme:
  nvme-rdma: fix possible use-after-free in connect timeout
  nvme: Move ctrl sqsize to generic space
  nvme: Add ctrl attributes for queue_count and sqsize
  nvme: allow 64-bit results in passthru commands
  nvme: Add quirk for Kingston NVME SSD running FW E8FK11.T
  nvmet-tcp: remove superflous check on request sgl
  Added QUIRKs for ADATA XPG SX8200 Pro 512GB
  nvme-rdma: Fix max_hw_sectors calculation
  nvme: fix an error code in nvme_init_subsystem()
  nvme-pci: Save PCI state before putting drive into deepest state
  nvme-tcp: fix wrong stop condition in io_work
  nvme-pci: Fix a race in controller removal
  nvmet: change ppl to lpp
parents 3154df26 67b483dd
Loading
Loading
Loading
Loading
+113 −19
Original line number Diff line number Diff line
@@ -102,10 +102,13 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
	 */
	if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
		return;
	revalidate_disk(ns->disk);
	blk_set_queue_dying(ns->queue);
	/* Forcibly unquiesce queues to avoid blocking dispatch */
	blk_mq_unquiesce_queue(ns->queue);
	/*
	 * Revalidate after unblocking dispatchers that may be holding bd_butex
	 */
	revalidate_disk(ns->disk);
}

static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@@ -847,7 +850,7 @@ out:
static int nvme_submit_user_cmd(struct request_queue *q,
		struct nvme_command *cmd, void __user *ubuffer,
		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
		u32 meta_seed, u32 *result, unsigned timeout)
		u32 meta_seed, u64 *result, unsigned timeout)
{
	bool write = nvme_is_write(cmd);
	struct nvme_ns *ns = q->queuedata;
@@ -888,7 +891,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
	else
		ret = nvme_req(req)->status;
	if (result)
		*result = le32_to_cpu(nvme_req(req)->result.u32);
		*result = le64_to_cpu(nvme_req(req)->result.u64);
	if (meta && !ret && !write) {
		if (copy_to_user(meta_buffer, meta, meta_len))
			ret = -EFAULT;
@@ -1335,6 +1338,54 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
	struct nvme_command c;
	unsigned timeout = 0;
	u32 effects;
	u64 result;
	int status;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
	c.common.cdw15 = cpu_to_le32(cmd.cdw15);

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
			(void __user *)(uintptr_t)cmd.metadata,
			cmd.metadata_len, 0, &result, timeout);
	nvme_passthru_end(ctrl, effects);

	if (status >= 0) {
		if (put_user(result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
			struct nvme_passthru_cmd64 __user *ucmd)
{
	struct nvme_passthru_cmd64 cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	u32 effects;
	int status;

	if (!capable(CAP_SYS_ADMIN))
@@ -1405,6 +1456,41 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
		srcu_read_unlock(&head->srcu, idx);
}

static bool is_ctrl_ioctl(unsigned int cmd)
{
	if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
		return true;
	if (is_sed_ioctl(cmd))
		return true;
	return false;
}

static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
				  void __user *argp,
				  struct nvme_ns_head *head,
				  int srcu_idx)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	int ret;

	nvme_get_ctrl(ns->ctrl);
	nvme_put_ns_from_disk(head, srcu_idx);

	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		ret = nvme_user_cmd(ctrl, NULL, argp);
		break;
	case NVME_IOCTL_ADMIN64_CMD:
		ret = nvme_user_cmd64(ctrl, NULL, argp);
		break;
	default:
		ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
		break;
	}
	nvme_put_ctrl(ctrl);
	return ret;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
		unsigned int cmd, unsigned long arg)
{
@@ -1422,20 +1508,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
	 * seperately and drop the ns SRCU reference early.  This avoids a
	 * deadlock when deleting namespaces using the passthrough interface.
	 */
	if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) {
		struct nvme_ctrl *ctrl = ns->ctrl;

		nvme_get_ctrl(ns->ctrl);
		nvme_put_ns_from_disk(head, srcu_idx);

		if (cmd == NVME_IOCTL_ADMIN_CMD)
			ret = nvme_user_cmd(ctrl, NULL, argp);
		else
			ret = sed_ioctl(ctrl->opal_dev, cmd, argp);

		nvme_put_ctrl(ctrl);
		return ret;
	}
	if (is_ctrl_ioctl(cmd))
		return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);

	switch (cmd) {
	case NVME_IOCTL_ID:
@@ -1448,6 +1522,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
	case NVME_IOCTL_SUBMIT_IO:
		ret = nvme_submit_io(ns, argp);
		break;
	case NVME_IOCTL_IO64_CMD:
		ret = nvme_user_cmd64(ns->ctrl, ns, argp);
		break;
	default:
		if (ns->ndev)
			ret = nvme_nvm_ioctl(ns, cmd, arg);
@@ -2289,6 +2366,16 @@ static const struct nvme_core_quirk_entry core_quirks[] = {
		.vid = 0x14a4,
		.fr = "22301111",
		.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
	},
	{
		/*
		 * This Kingston E8FK11.T firmware version has no interrupt
		 * after resume with actions related to suspend to idle
		 * https://bugzilla.kernel.org/show_bug.cgi?id=204887
		 */
		.vid = 0x2646,
		.fr = "E8FK11.T",
		.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
	}
};

@@ -2540,8 +2627,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
		list_add_tail(&subsys->entry, &nvme_subsystems);
	}

	if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
			dev_name(ctrl->device))) {
	ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
				dev_name(ctrl->device));
	if (ret) {
		dev_err(ctrl->device,
			"failed to create sysfs link from subsystem.\n");
		goto out_put_subsystem;
@@ -2838,6 +2926,8 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ctrl, NULL, argp);
	case NVME_IOCTL_ADMIN64_CMD:
		return nvme_user_cmd64(ctrl, NULL, argp);
	case NVME_IOCTL_IO_CMD:
		return nvme_dev_user_cmd(ctrl, argp);
	case NVME_IOCTL_RESET:
@@ -3045,6 +3135,8 @@ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);

nvme_show_int_function(cntlid);
nvme_show_int_function(numa_node);
nvme_show_int_function(queue_count);
nvme_show_int_function(sqsize);

static ssize_t nvme_sysfs_delete(struct device *dev,
				struct device_attribute *attr, const char *buf,
@@ -3125,6 +3217,8 @@ static struct attribute *nvme_dev_attrs[] = {
	&dev_attr_address.attr,
	&dev_attr_state.attr,
	&dev_attr_numa_node.attr,
	&dev_attr_queue_count.attr,
	&dev_attr_sqsize.attr,
	NULL
};

+1 −1
Original line number Diff line number Diff line
@@ -221,6 +221,7 @@ struct nvme_ctrl {
	u16 oacs;
	u16 nssa;
	u16 nr_streams;
	u16 sqsize;
	u32 max_namespaces;
	atomic_t abort_limit;
	u8 vwc;
@@ -269,7 +270,6 @@ struct nvme_ctrl {
	u16 hmmaxd;

	/* Fabrics only */
	u16 sqsize;
	u32 ioccsz;
	u32 iorcsz;
	u16 icdoff;
+13 −7
Original line number Diff line number Diff line
@@ -2946,11 +2946,21 @@ static int nvme_suspend(struct device *dev)
	if (ret < 0)
		goto unfreeze;

	/*
	 * A saved state prevents pci pm from generically controlling the
	 * device's power. If we're using protocol specific settings, we don't
	 * want pci interfering.
	 */
	pci_save_state(pdev);

	ret = nvme_set_power_state(ctrl, ctrl->npss);
	if (ret < 0)
		goto unfreeze;

	if (ret) {
		/* discard the saved state */
		pci_load_saved_state(pdev, NULL);

		/*
		 * Clearing npss forces a controller reset on resume. The
		 * correct value will be resdicovered then.
@@ -2958,14 +2968,7 @@ static int nvme_suspend(struct device *dev)
		nvme_dev_disable(ndev, true);
		ctrl->npss = 0;
		ret = 0;
		goto unfreeze;
	}
	/*
	 * A saved state prevents pci pm from generically controlling the
	 * device's power. If we're using protocol specific settings, we don't
	 * want pci interfering.
	 */
	pci_save_state(pdev);
unfreeze:
	nvme_unfreeze(ctrl);
	return ret;
@@ -3090,6 +3093,9 @@ static const struct pci_device_id nvme_id_table[] = {
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
	{ PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
+13 −6
Original line number Diff line number Diff line
@@ -427,7 +427,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
{
	return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
		     ibdev->attrs.max_fast_reg_page_list_len);
		     ibdev->attrs.max_fast_reg_page_list_len - 1);
}

static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
@@ -437,7 +437,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
	const int cq_factor = send_wr_factor + 1;	/* + RECV */
	int comp_vector, idx = nvme_rdma_queue_idx(queue);
	enum ib_poll_context poll_ctx;
	int ret;
	int ret, pages_per_mr;

	queue->device = nvme_rdma_find_get_device(queue->cm_id);
	if (!queue->device) {
@@ -479,10 +479,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
		goto out_destroy_qp;
	}

	/*
	 * Currently we don't use SG_GAPS MR's so if the first entry is
	 * misaligned we'll end up using two entries for a single data page,
	 * so one additional entry is required.
	 */
	pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1;
	ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
			      queue->queue_size,
			      IB_MR_TYPE_MEM_REG,
			      nvme_rdma_get_max_fr_pages(ibdev), 0);
			      pages_per_mr, 0);
	if (ret) {
		dev_err(queue->ctrl->ctrl.device,
			"failed to initialize MR pool sized %d for QID %d\n",
@@ -614,6 +620,7 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
	if (!ret) {
		set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
	} else {
		if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
			__nvme_rdma_stop_queue(queue);
		dev_info(ctrl->ctrl.device,
			"failed to connect queue: %d ret=%d\n", idx, ret);
@@ -820,8 +827,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
	if (error)
		goto out_stop_queue;

	ctrl->ctrl.max_hw_sectors =
		(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
	ctrl->ctrl.max_segments = ctrl->max_fr_pages;
	ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);

	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);

+2 −2
Original line number Diff line number Diff line
@@ -1042,7 +1042,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
{
	struct nvme_tcp_queue *queue =
		container_of(w, struct nvme_tcp_queue, io_work);
	unsigned long start = jiffies + msecs_to_jiffies(1);
	unsigned long deadline = jiffies + msecs_to_jiffies(1);

	do {
		bool pending = false;
@@ -1067,7 +1067,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
		if (!pending)
			return;

	} while (time_after(jiffies, start)); /* quota is exhausted */
	} while (!time_after(jiffies, deadline)); /* quota is exhausted */

	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
Loading