Commit fcc95f06 authored by Linus Torvalds
Browse files

Merge tag 'ceph-for-5.7-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The main items are:

   - support for asynchronous create and unlink (Jeff Layton).

     Creates and unlinks are satisfied locally, without waiting for a
     reply from the MDS, provided the client has been granted
     appropriate caps (new in v15.y.z ("Octopus") release). This can be
     a big help for metadata heavy workloads such as tar and rsync.
     Opt-in with the new nowsync mount option.

   - multiple blk-mq queues for rbd (Hannes Reinecke and myself).

     When the driver was converted to blk-mq, we settled on a single
     blk-mq queue because of a global lock in libceph and some other
     technical debt. These have since been addressed, so allocate a
     queue per CPU to enhance parallelism.

   - don't hold onto caps that aren't actually needed (Zheng Yan).

     This has been our long-standing behavior, but it causes issues with
     some active/standby applications (synchronous I/O, stalls if the
     standby goes down, etc).

   - .snap directory timestamps consistent with ceph-fuse (Luis
     Henriques)"

* tag 'ceph-for-5.7-rc1' of git://github.com/ceph/ceph-client: (49 commits)
  ceph: fix snapshot directory timestamps
  ceph: wait for async creating inode before requesting new max size
  ceph: don't skip updating wanted caps when cap is stale
  ceph: request new max size only when there is auth cap
  ceph: cleanup return error of try_get_cap_refs()
  ceph: return ceph_mdsc_do_request() errors from __get_parent()
  ceph: check all mds' caps after page writeback
  ceph: update i_requested_max_size only when sending cap msg to auth mds
  ceph: simplify calling of ceph_get_fmode()
  ceph: remove delay check logic from ceph_check_caps()
  ceph: consider inode's last read/write when calculating wanted caps
  ceph: always renew caps if mds_wanted is insufficient
  ceph: update dentry lease for async create
  ceph: attempt to do async create when possible
  ceph: cache layout in parent dir on first sync create
  ceph: add new MDS req field to hold delegated inode number
  ceph: decode interval_sets for delegated inos
  ceph: make ceph_fill_inode non-static
  ceph: perform asynchronous unlink if we have sufficient caps
  ceph: don't take refs to want mask unless we have all bits
  ...
parents c6b80eb8 ef915725
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -107,17 +107,17 @@ Mount Options
	address its connection to the monitor originates from.

  wsize=X
	Specify the maximum write size in bytes.  Default: 16 MB.
	Specify the maximum write size in bytes.  Default: 64 MB.

  rsize=X
	Specify the maximum read size in bytes.  Default: 16 MB.
	Specify the maximum read size in bytes.  Default: 64 MB.

  rasize=X
	Specify the maximum readahead size in bytes.  Default: 8 MB.

  mount_timeout=X
	Specify the timeout value for mount (in seconds), in the case
	of a non-responsive Ceph file system.  The default is 30
	of a non-responsive Ceph file system.  The default is 60
	seconds.

  caps_max=X
+79 −136
Original line number Diff line number Diff line
@@ -337,10 +337,7 @@ struct rbd_img_request {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
	struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */
@@ -349,7 +346,6 @@ struct rbd_img_request {
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
@@ -1320,15 +1316,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_destroy(struct kref *kref);
/*
 * Drop one reference on an image request.  When the last reference
 * goes away, kref_put() invokes rbd_img_request_destroy() to tear the
 * request down and free it.
 */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	/* kref_read() is a snapshot for debugging output only */
	dout("%s: img %p (was %d)\n", __func__, img_request,
		kref_read(&img_request->kref));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
@@ -1366,18 +1353,10 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req)
/* Mark an image request as layered (i.e. it may involve a parent image). */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	/* NOTE(review): presumably pairs with the smp_mb() in
	 * img_request_layered_test(); set_bit() itself is atomic, so
	 * confirm whether the full barrier is still required. */
	smp_mb();
}

/* Clear the layered flag on an image request. */
static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	/* NOTE(review): presumably pairs with the smp_mb() in
	 * img_request_layered_test() — confirm necessity. */
	smp_mb();
}

/* Return true if the image request was marked layered. */
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	/* NOTE(review): barrier presumably orders this test against the
	 * set/clear helpers above — confirm it is still needed. */
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

@@ -1619,10 +1598,8 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
	if (!rbd_dev->parent_spec)
		return false;

	down_read(&rbd_dev->header_rwsem);
	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	up_read(&rbd_dev->header_rwsem);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");
@@ -1630,62 +1607,53 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
	return counter > 0;
}

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
static void rbd_img_request_init(struct rbd_img_request *img_request,
				 struct rbd_device *rbd_dev,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
				 enum obj_operation_type op_type)
{
	struct rbd_img_request *img_request;

	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;
	memset(img_request, 0, sizeof(*img_request));

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;
	if (!rbd_img_is_write(img_request))
		img_request->snap_id = rbd_dev->spec->snap_id;
	else
		img_request->snapc = snapc;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);

	INIT_LIST_HEAD(&img_request->lock_item);
	INIT_LIST_HEAD(&img_request->object_extents);
	mutex_init(&img_request->state_mutex);
	kref_init(&img_request->kref);
}

static void rbd_img_capture_header(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	lockdep_assert_held(&rbd_dev->header_rwsem);

	return img_request;
	if (rbd_img_is_write(img_req))
		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
	else
		img_req->snap_id = rbd_dev->spec->snap_id;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_req);
}

static void rbd_img_request_destroy(struct kref *kref)
static void rbd_img_request_destroy(struct rbd_img_request *img_request)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	WARN_ON(!list_empty(&img_request->lock_item));
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request_layered_test(img_request)) {
		img_request_layered_clear(img_request);
	if (img_request_layered_test(img_request))
		rbd_dev_parent_put(img_request->rbd_dev);
	}

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
		kmem_cache_free(rbd_img_request_cache, img_request);
}

@@ -2849,17 +2817,22 @@ static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *parent = img_req->rbd_dev->parent;
	struct rbd_img_request *child_img_req;
	int ret;

	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
					       OBJ_OP_READ, NULL);
	child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
	if (!child_img_req)
		return -ENOMEM;

	rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
	child_img_req->obj_request = obj_req;

	down_read(&parent->header_rwsem);
	rbd_img_capture_header(child_img_req);
	up_read(&parent->header_rwsem);

	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
	     obj_req);

@@ -2888,7 +2861,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
					      obj_req->copyup_bvecs);
	}
	if (ret) {
		rbd_img_request_put(child_img_req);
		rbd_img_request_destroy(child_img_req);
		return ret;
	}

@@ -3647,15 +3620,15 @@ again:
	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		struct rbd_obj_request *obj_req = img_req->obj_request;

		rbd_img_request_put(img_req);
		rbd_img_request_destroy(img_req);
		if (__rbd_obj_handle_request(obj_req, &result)) {
			img_req = obj_req->img_request;
			goto again;
		}
	} else {
		struct request *rq = img_req->rq;
		struct request *rq = blk_mq_rq_from_pdu(img_req);

		rbd_img_request_put(img_req);
		rbd_img_request_destroy(img_req);
		blk_mq_end_request(rq, errno_to_blk_status(result));
	}
}
@@ -4707,84 +4680,36 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,

static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	struct rbd_img_request *img_request =
	    container_of(work, struct rbd_img_request, work);
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	enum obj_operation_type op_type = img_request->op_type;
	struct request *rq = blk_mq_rq_from_pdu(img_request);
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	if (op_type != OBJ_OP_READ) {
		if (rbd_is_ro(rbd_dev)) {
			rbd_warn(rbd_dev, "%s on read-only mapping",
				 obj_op_name(op_type));
			result = -EIO;
			goto err;
		}
		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
		goto err_img_request;
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	rbd_img_capture_header(img_request);
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_rq;
		goto err_img_request;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
	     img_request, obj_op_name(op_type), offset, length);
@@ -4801,23 +4726,51 @@ static void rbd_queue_workfn(struct work_struct *work)
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_rq:
	rbd_img_request_destroy(img_request);
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);
	struct rbd_device *rbd_dev = hctx->queue->queuedata;
	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
	enum obj_operation_type op_type;

	queue_work(rbd_wq, work);
	switch (req_op(bd->rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
		return BLK_STS_IOERR;
	}

	rbd_img_request_init(img_req, rbd_dev, op_type);

	if (rbd_img_is_write(img_req)) {
		if (rbd_is_ro(rbd_dev)) {
			rbd_warn(rbd_dev, "%s on read-only mapping",
				 obj_op_name(img_req->op_type));
			return BLK_STS_IOERR;
		}
		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	INIT_WORK(&img_req->work, rbd_queue_workfn);
	queue_work(rbd_wq, &img_req->work);
	return BLK_STS_OK;
}

@@ -4984,18 +4937,8 @@ out:
	return ret;
}

/*
 * blk-mq .init_request callback: each request's PDU is a work_struct
 * (tag_set.cmd_size), initialized here so rbd_queue_rq() can simply
 * queue it to rbd_wq.  Runs once per request at tag-set setup time.
 */
static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;	/* cannot fail */
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};

static int rbd_init_disk(struct rbd_device *rbd_dev)
@@ -5027,8 +4970,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	rbd_dev->tag_set.nr_hw_queues = 1;
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
	rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
+85 −5
Original line number Diff line number Diff line
@@ -159,8 +159,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
	if (!PagePrivate(page))
		return;

	ClearPageChecked(page);

	dout("%p invalidatepage %p idx %lu full dirty page\n",
	     inode, page, page->index);

@@ -182,6 +180,47 @@ static int ceph_releasepage(struct page *page, gfp_t g)
	return !PagePrivate(page);
}

/*
 * Issue one synchronous OSD read covering a run of contiguous pages.
 *
 * The read may be clipped at a stripe/object boundary, in which case
 * ceph_osdc_new_request() shortens *plen accordingly.  Returns the
 * number of bytes read, or a negative error.
 */
static int ceph_sync_readpages(struct ceph_fs_client *fsc,
			       struct ceph_vino vino,
			       struct ceph_file_layout *layout,
			       u64 off, u64 *plen,
			       u32 truncate_seq, u64 truncate_size,
			       struct page **pages, int num_pages,
			       int page_align)
{
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	struct ceph_osd_request *req;
	int ret;

	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
	     vino.snap, off, *plen);

	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, truncate_seq, truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* it may be a short read due to an object boundary */
	osd_req_op_extent_osd_data_pages(req, 0, pages, *plen, page_align,
					 false, false);
	dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
	     off, *plen, *plen, page_align);

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret == 0)
		ret = ceph_osdc_wait_request(osdc, req);
	ceph_osdc_put_request(req);

	dout("readpages result %d\n", ret);
	return ret;
}

/*
 * read a single page, without unlocking it.
 */
@@ -218,7 +257,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)

	dout("readpage inode %p file %p page %p index %lu\n",
	     inode, filp, page, page->index);
	err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
	err = ceph_sync_readpages(fsc, ceph_vino(inode),
				  &ci->i_layout, off, &len,
				  ci->i_truncate_seq, ci->i_truncate_size,
				  &page, 1, 0);
@@ -570,6 +609,47 @@ static u64 get_writepages_data_length(struct inode *inode,
	return end > start ? end - start : 0;
}

/*
 * Write a run of pages synchronously via a single OSD request.
 *
 * The write may be clipped at an object boundary (len is adjusted by
 * ceph_osdc_new_request()).  Returns the number of bytes written on
 * success, or a negative error.
 */
static int ceph_sync_writepages(struct ceph_fs_client *fsc,
				struct ceph_vino vino,
				struct ceph_file_layout *layout,
				struct ceph_snap_context *snapc,
				u64 off, u64 len,
				u32 truncate_seq, u64 truncate_size,
				struct timespec64 *mtime,
				struct page **pages, int num_pages)
{
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	int page_align = off & ~PAGE_MASK;
	struct ceph_osd_request *req;
	int ret;

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
				    snapc, truncate_seq, truncate_size,
				    true);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* it may be a short write due to an object boundary */
	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
					 false, false);
	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);

	req->r_mtime = *mtime;
	ret = ceph_osdc_start_request(osdc, req, true);
	if (ret == 0)
		ret = ceph_osdc_wait_request(osdc, req);
	ceph_osdc_put_request(req);

	if (ret == 0)
		ret = len;
	dout("writepages result %d\n", ret);
	return ret;
}

/*
 * Write a single page, but leave the page locked.
 *
@@ -628,7 +708,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
		set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);

	set_page_writeback(page);
	err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
	err = ceph_sync_writepages(fsc, ceph_vino(inode),
				   &ci->i_layout, snapc, page_off, len,
				   ceph_wbc.truncate_seq,
				   ceph_wbc.truncate_size,
@@ -1575,7 +1655,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
	do {
		lock_page(page);

		if ((off > size) || (page->mapping != inode->i_mapping)) {
		if (page_mkwrite_check_truncate(page, inode) < 0) {
			unlock_page(page);
			ret = VM_FAULT_NOPAGE;
			break;
+1 −1
Original line number Diff line number Diff line
@@ -32,7 +32,7 @@ struct ceph_fscache_entry {
	size_t uniq_len;
	/* The following members must be last */
	struct ceph_fsid fsid;
	char uniquifier[0];
	char uniquifier[];
};

static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
+292 −244

File changed.

Preview size limit exceeded, changes collapsed.

Loading