Commit 22e8bd51 authored by Ilya Dryomov's avatar Ilya Dryomov
Browse files

rbd: support for object-map and fast-diff



Speed up reads, discards and zeroouts through RBD_OBJ_FLAG_MAY_EXIST
and RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT based on object map.

Invalid object maps are not trusted, but still updated.  Note that we
never iterate, resize or invalidate object maps.  If object-map feature
is enabled but object map fails to load, we just fail the requester
(either "rbd map" or I/O, by way of post-acquire action).

Signed-off-by: default avatarIlya Dryomov <idryomov@gmail.com>
parent da5ef6be
Loading
Loading
Loading
Loading
+717 −3
Original line number Diff line number Diff line
@@ -115,6 +115,8 @@ static int atomic_dec_return_safe(atomic_t *v)
#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)
@@ -122,6 +124,8 @@ static int atomic_dec_return_safe(atomic_t *v)
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)
@@ -227,6 +231,8 @@ enum obj_operation_type {
#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
@@ -261,14 +267,18 @@ enum rbd_obj_read_state {
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};
@@ -419,6 +429,11 @@ struct rbd_device {
	int			acquire_err;
	struct completion	releasing_wait;

	spinlock_t		object_map_lock;
	u8			*object_map;
	u64			object_map_size;	/* in objects */
	u64			object_map_flags;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
@@ -620,6 +635,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
@@ -1768,6 +1784,466 @@ static void rbd_img_request_destroy(struct kref *kref)
	kmem_cache_free(rbd_img_request_cache, img_request);
}

#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)

static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
				   u64 *index, u8 *shift)
{
	u32 off;

	rbd_assert(objno < rbd_dev->object_map_size);
	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
}

static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
{
	u64 index;
	u8 shift;

	lockdep_assert_held(&rbd_dev->object_map_lock);
	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
}

static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
{
	u64 index;
	u8 shift;
	u8 *p;

	lockdep_assert_held(&rbd_dev->object_map_lock);
	rbd_assert(!(val & ~OBJ_MASK));

	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
	p = &rbd_dev->object_map[index];
	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
}

static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
{
	u8 state;

	spin_lock(&rbd_dev->object_map_lock);
	state = __rbd_object_map_get(rbd_dev, objno);
	spin_unlock(&rbd_dev->object_map_lock);
	return state;
}

static bool use_object_map(struct rbd_device *rbd_dev)
{
	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
}

static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
{
	u8 state;

	/* fall back to default logic if object map is disabled or invalid */
	if (!use_object_map(rbd_dev))
		return true;

	state = rbd_object_map_get(rbd_dev, objno);
	return state != OBJECT_NONEXISTENT;
}

static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
				struct ceph_object_id *oid)
{
	if (snap_id == CEPH_NOSNAP)
		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
				rbd_dev->spec->image_id);
	else
		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
				rbd_dev->spec->image_id, snap_id);
}

static int rbd_object_map_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	u8 lock_type;
	char *lock_tag;
	struct ceph_locker *lockers;
	u32 num_lockers;
	bool broke_lock = false;
	int ret;

	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

again:
	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
	if (ret != -EBUSY || broke_lock) {
		if (ret == -EEXIST)
			ret = 0; /* already locked by myself */
		if (ret)
			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
		return ret;
	}

	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
				 RBD_LOCK_NAME, &lock_type, &lock_tag,
				 &lockers, &num_lockers);
	if (ret) {
		if (ret == -ENOENT)
			goto again;

		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
		return ret;
	}

	kfree(lock_tag);
	if (num_lockers == 0)
		goto again;

	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
		 ENTITY_NAME(lockers[0].id.name));

	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
				  RBD_LOCK_NAME, lockers[0].id.cookie,
				  &lockers[0].id.name);
	ceph_free_lockers(lockers, num_lockers);
	if (ret) {
		if (ret == -ENOENT)
			goto again;

		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
		return ret;
	}

	broke_lock = true;
	goto again;
}

static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	int ret;

	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
			      "");
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
}

static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
{
	u8 struct_v;
	u32 struct_len;
	u32 header_len;
	void *header_end;
	int ret;

	ceph_decode_32_safe(p, end, header_len, e_inval);
	header_end = *p + header_len;

	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
				  &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, *object_map_size, e_inval);

	*p = header_end;
	return 0;

e_inval:
	return -EINVAL;
}

static int __rbd_object_map_load(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	struct page **pages;
	void *p, *end;
	size_t reply_len;
	u64 num_objects;
	u64 object_map_bytes;
	u64 object_map_size;
	int num_pages;
	int ret;

	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);

	num_objects = ceph_get_num_objects(&rbd_dev->layout,
					   rbd_dev->mapping.size);
	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
					    BITS_PER_BYTE);
	num_pages = calc_pages_for(0, object_map_bytes) + 1;
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	reply_len = num_pages * PAGE_SIZE;
	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
			     NULL, 0, pages, &reply_len);
	if (ret)
		goto out;

	p = page_address(pages[0]);
	end = p + min(reply_len, (size_t)PAGE_SIZE);
	ret = decode_object_map_header(&p, end, &object_map_size);
	if (ret)
		goto out;

	if (object_map_size != num_objects) {
		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
			 object_map_size, num_objects);
		ret = -EINVAL;
		goto out;
	}

	if (offset_in_page(p) + object_map_bytes > reply_len) {
		ret = -EINVAL;
		goto out;
	}

	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
	if (!rbd_dev->object_map) {
		ret = -ENOMEM;
		goto out;
	}

	rbd_dev->object_map_size = object_map_size;
	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
				   offset_in_page(p), object_map_bytes);

out:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}

static void rbd_object_map_free(struct rbd_device *rbd_dev)
{
	kvfree(rbd_dev->object_map);
	rbd_dev->object_map = NULL;
	rbd_dev->object_map_size = 0;
}

static int rbd_object_map_load(struct rbd_device *rbd_dev)
{
	int ret;

	ret = __rbd_object_map_load(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_v2_get_flags(rbd_dev);
	if (ret) {
		rbd_object_map_free(rbd_dev);
		return ret;
	}

	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
		rbd_warn(rbd_dev, "object map is invalid");

	return 0;
}

static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_object_map_lock(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_object_map_load(rbd_dev);
	if (ret) {
		rbd_object_map_unlock(rbd_dev);
		return ret;
	}

	return 0;
}

static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
	rbd_object_map_free(rbd_dev);
	rbd_object_map_unlock(rbd_dev);
}

/*
 * This function needs snap_id (or more precisely just something to
 * distinguish between HEAD and snapshot object maps), new_state and
 * current_state that were passed to rbd_object_map_update().
 *
 * To avoid allocating and stashing a context we piggyback on the OSD
 * request.  A HEAD update has two ops (assert_locked).  For new_state
 * and current_state we decode our own object_map_update op, encoded in
 * rbd_cls_object_map_update().
 */
static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
					struct ceph_osd_request *osd_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_data *osd_data;
	u64 objno;
	u8 state, new_state, current_state;
	bool has_current_state;
	void *p;

	if (osd_req->r_result)
		return osd_req->r_result;

	/*
	 * Nothing to do for a snapshot object map.
	 */
	if (osd_req->r_num_ops == 1)
		return 0;

	/*
	 * Update in-memory HEAD object map.
	 */
	rbd_assert(osd_req->r_num_ops == 2);
	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);

	p = page_address(osd_data->pages[0]);
	objno = ceph_decode_64(&p);
	rbd_assert(objno == obj_req->ex.oe_objno);
	rbd_assert(ceph_decode_64(&p) == objno + 1);
	new_state = ceph_decode_8(&p);
	has_current_state = ceph_decode_8(&p);
	if (has_current_state)
		current_state = ceph_decode_8(&p);

	spin_lock(&rbd_dev->object_map_lock);
	state = __rbd_object_map_get(rbd_dev, objno);
	if (!has_current_state || current_state == state ||
	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
		__rbd_object_map_set(rbd_dev, objno, new_state);
	spin_unlock(&rbd_dev->object_map_lock);

	return 0;
}

static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	result = rbd_object_map_update_finish(obj_req, osd_req);
	rbd_obj_handle_request(obj_req, result);
}

static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
{
	u8 state = rbd_object_map_get(rbd_dev, objno);

	if (state == new_state ||
	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
		return false;

	return true;
}

static int rbd_cls_object_map_update(struct ceph_osd_request *req,
				     int which, u64 objno, u8 new_state,
				     const u8 *current_state)
{
	struct page **pages;
	void *p, *start;
	int ret;

	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
	if (ret)
		return ret;

	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	p = start = page_address(pages[0]);
	ceph_encode_64(&p, objno);
	ceph_encode_64(&p, objno + 1);
	ceph_encode_8(&p, new_state);
	if (current_state) {
		ceph_encode_8(&p, 1);
		ceph_encode_8(&p, *current_state);
	} else {
		ceph_encode_8(&p, 0);
	}

	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
					  false, true);
	return 0;
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
				 u8 new_state, const u8 *current_state)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	int num_ops = 1;
	int which = 0;
	int ret;

	if (snap_id == CEPH_NOSNAP) {
		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
			return 1;

		num_ops++; /* assert_locked */
	}

	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_object_map_callback;
	req->r_priv = obj_req;

	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&req->r_mtime);

	if (snap_id == CEPH_NOSNAP) {
		/*
		 * Protect against possible race conditions during lock
		 * ownership transitions.
		 */
		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
		if (ret)
			return ret;
	}

	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
					new_state, current_state);
	if (ret)
		return ret;

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret)
		return ret;

	ceph_osdc_start_request(osdc, req, false);
	return 0;
}

static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
@@ -1975,6 +2451,7 @@ static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
	if (ret)
		return ret;

	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
		obj_req->flags |= RBD_OBJ_FLAG_DELETION;

@@ -2022,6 +2499,7 @@ static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
	if (!obj_req->num_img_extents) {
		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
		if (rbd_obj_is_entire(obj_req))
			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
	}
@@ -2407,6 +2885,20 @@ static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
	queue_work(rbd_wq, &img_req->work);
}

static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
		return true;
	}

	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
	     obj_req->ex.oe_objno);
	return false;
}

static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
@@ -2482,10 +2974,17 @@ static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

again:
	switch (obj_req->read_state) {
	case RBD_OBJ_READ_START:
		rbd_assert(!*result);

		if (!rbd_obj_may_exist(obj_req)) {
			*result = -ENOENT;
			obj_req->read_state = RBD_OBJ_READ_OBJECT;
			goto again;
		}

		ret = rbd_obj_read_object(obj_req);
		if (ret) {
			*result = ret;
@@ -2536,6 +3035,44 @@ static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
	}
}

static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;

	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
		dout("%s %p noop for nonexistent\n", __func__, obj_req);
		return true;
	}

	return false;
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u8 new_state;

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return 1;

	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
		new_state = OBJECT_PENDING;
	else
		new_state = OBJECT_EXISTS;

	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
}

static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
@@ -2706,6 +3243,41 @@ static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
	return rbd_obj_read_from_parent(obj_req);
}

static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
	u8 new_state;
	u32 i;
	int ret;

	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return;

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
		return;

	for (i = 0; i < snapc->num_snaps; i++) {
		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
		    i + 1 < snapc->num_snaps)
			new_state = OBJECT_EXISTS_CLEAN;
		else
			new_state = OBJECT_EXISTS;

		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
					    new_state, NULL);
		if (ret < 0) {
			obj_req->pending.result = ret;
			return;
		}

		rbd_assert(!ret);
		obj_req->pending.num_pending++;
	}
}

static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
{
	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
@@ -2749,6 +3321,7 @@ static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)

static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

again:
@@ -2776,6 +3349,25 @@ again:
			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
		}

		rbd_obj_copyup_object_maps(obj_req);
		if (!obj_req->pending.num_pending) {
			*result = obj_req->pending.result;
			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
			goto again;
		}
		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
		return false;
	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
		if (!pending_result_dec(&obj_req->pending, result))
			return false;
		/* fall through */
	case RBD_OBJ_COPYUP_OBJECT_MAPS:
		if (*result) {
			rbd_warn(rbd_dev, "snap object map update failed: %d",
				 *result);
			return true;
		}

		rbd_obj_copyup_write_object(obj_req);
		if (!obj_req->pending.num_pending) {
			*result = obj_req->pending.result;
@@ -2795,6 +3387,27 @@ again:
	}
}

/*
 * Return:
 *   0 - object map update sent
 *   1 - object map update isn't needed
 *  <0 - error
 */
static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u8 current_state = OBJECT_PENDING;

	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return 1;

	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
		return 1;

	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
				     &current_state);
}

static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
@@ -2805,6 +3418,24 @@ again:
	case RBD_OBJ_WRITE_START:
		rbd_assert(!*result);

		if (rbd_obj_write_is_noop(obj_req))
			return true;

		ret = rbd_obj_write_pre_object_map(obj_req);
		if (ret < 0) {
			*result = ret;
			return true;
		}
		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
		if (ret > 0)
			goto again;
		return false;
	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
		if (*result) {
			rbd_warn(rbd_dev, "pre object map update failed: %d",
				 *result);
			return true;
		}
		ret = rbd_obj_write_object(obj_req);
		if (ret) {
			*result = ret;
@@ -2837,9 +3468,24 @@ again:
			return false;
		/* fall through */
	case RBD_OBJ_WRITE_COPYUP:
		if (*result)
		if (*result) {
			rbd_warn(rbd_dev, "copyup failed: %d", *result);
			return true;
		}
		ret = rbd_obj_write_post_object_map(obj_req);
		if (ret < 0) {
			*result = ret;
			return true;
		}
		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
		if (ret > 0)
			goto again;
		return false;
	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
		if (*result)
			rbd_warn(rbd_dev, "post object map update failed: %d",
				 *result);
		return true;
	default:
		BUG();
	}
@@ -2892,7 +3538,8 @@ static bool need_exclusive_lock(struct rbd_img_request *img_req)
		return false;

	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
	if (rbd_dev->opts->lock_on_read)
	if (rbd_dev->opts->lock_on_read ||
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return true;

	return rbd_img_is_write(img_req);
@@ -3431,7 +4078,7 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
		if (ret)
			goto out; /* request lock or error */

		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blacklist_add(&client->monc,
@@ -3458,6 +4105,19 @@ out:
	return ret;
}

static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
{
	int ret;

	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
		ret = rbd_object_map_open(rbd_dev);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Return:
 *   0 - lock acquired
@@ -3501,6 +4161,17 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
	rbd_assert(list_empty(&rbd_dev->running_list));

	ret = rbd_post_acquire_action(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
		/*
		 * Can't stay in RBD_LOCK_STATE_LOCKED because
		 * rbd_lock_add_request() would let the request through,
		 * assuming that e.g. object map is locked and loaded.
		 */
		rbd_unlock(rbd_dev);
	}

out:
	wake_lock_waiters(rbd_dev, ret);
	up_write(&rbd_dev->lock_rwsem);
@@ -3574,10 +4245,17 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
	return true;
}

static void rbd_pre_release_action(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
		rbd_object_map_close(rbd_dev);
}

static void __rbd_release_lock(struct rbd_device *rbd_dev)
{
	rbd_assert(list_empty(&rbd_dev->running_list));

	rbd_pre_release_action(rbd_dev);
	rbd_unlock(rbd_dev);
}

@@ -4864,6 +5542,8 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
	init_completion(&rbd_dev->acquire_wait);
	init_completion(&rbd_dev->releasing_wait);

	spin_lock_init(&rbd_dev->object_map_lock);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
@@ -5045,6 +5725,32 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
						&rbd_dev->header.features);
}

/*
 * These are generic image flags, but since they are used only for
 * object map, store them in rbd_dev->object_map_flags.
 *
 * For the same reason, this function is called only on object map
 * (re)load and not on header refresh.
 */
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
{
	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
	__le64 flags;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_flags",
				  &snapid, sizeof(snapid),
				  &flags, sizeof(flags));
	if (ret < 0)
		return ret;
	if (ret < sizeof(flags))
		return -EBADMSG;

	rbd_dev->object_map_flags = le64_to_cpu(flags);
	return 0;
}

struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;
@@ -6018,6 +6724,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
	struct rbd_image_header	*header;

	rbd_dev_parent_put(rbd_dev);
	rbd_object_map_free(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */
@@ -6267,6 +6974,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
	if (ret)
		goto err_out_probe;

	if (rbd_dev->spec->snap_id != CEPH_NOSNAP &&
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
		ret = rbd_object_map_load(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
+10 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
/* For format version 2, rbd image 'foo' consists of objects
 *   rbd_id.foo		- id of image
 *   rbd_header.<id>	- image metadata
 *   rbd_object_map.<id> - optional image object map
 *   rbd_data.<id>.0000000000000000
 *   rbd_data.<id>.0000000000000001
 *   ...		- data
@@ -25,6 +26,7 @@
 */

#define RBD_HEADER_PREFIX      "rbd_header."
#define RBD_OBJECT_MAP_PREFIX  "rbd_object_map."
#define RBD_ID_PREFIX          "rbd_id."
#define RBD_V2_DATA_FORMAT     "%s.%016llx"

@@ -39,6 +41,14 @@ enum rbd_notify_op {
	RBD_NOTIFY_OP_HEADER_UPDATE      = 3,
};

#define OBJECT_NONEXISTENT	0
#define OBJECT_EXISTS		1
#define OBJECT_PENDING		2
#define OBJECT_EXISTS_CLEAN	3

#define RBD_FLAG_OBJECT_MAP_INVALID	(1ULL << 0)
#define RBD_FLAG_FAST_DIFF_INVALID	(1ULL << 1)

/*
 * For format version 1, rbd image 'foo' consists of objects
 *   foo.rbd		- image metadata
+3 −0
Original line number Diff line number Diff line
@@ -52,4 +52,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
		       char *lock_name, u8 *type, char **tag,
		       struct ceph_locker **lockers, u32 *num_lockers);

int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
			   char *lock_name, u8 type, char *cookie, char *tag);

#endif
+2 −0
Original line number Diff line number Diff line
@@ -66,4 +66,6 @@ int ceph_extent_to_file(struct ceph_file_layout *l,
			struct ceph_file_extent **file_extents,
			u32 *num_file_extents);

u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size);

#endif
+45 −0

File changed.

Preview size limit exceeded, changes collapsed.

Loading