Commit 0512a75b authored by Keith Busch's avatar Keith Busch Committed by Jens Axboe
Browse files

block: Introduce REQ_OP_ZONE_APPEND



Define REQ_OP_ZONE_APPEND to append-write sectors to a zone of a zoned
block device. This is a no-merge write operation.

A zone append write BIO must:
* Target a zoned block device
* Have a sector position indicating the start sector of the target zone
* The target zone must be a sequential write zone
* The BIO must not cross a zone boundary
* The BIO size must not be split to ensure that a single range of LBAs
  is written with a single command.

Implement these checks in generic_make_request_checks() using the
helper function blk_check_zone_append(). To avoid write append BIO
splitting, introduce the new max_zone_append_sectors queue limit
attribute and ensure that a BIO size is always lower than this limit.
Export this new limit through sysfs and check these limits in bio_full().

Also when a LLDD can't dispatch a request to a specific zone, it
will return BLK_STS_ZONE_RESOURCE indicating this request needs to
be delayed, e.g.  because the zone it will be dispatched to is still
write-locked. If this happens set the request aside in a local list
to continue trying dispatching requests such as READ requests or a
WRITE/ZONE_APPEND requests targetting other zones. This way we can
still keep a high queue depth without starving other requests even if
one request can't be served due to zone write-locking.

Finally, make sure that the bio sector position indicates the actual
write position as indicated by the device on completion.

Signed-off-by: default avatarKeith Busch <kbusch@kernel.org>
[ jth: added zone-append specific add_page and merge_page helpers ]
Signed-off-by: default avatarJohannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
Reviewed-by: default avatarHannes Reinecke <hare@suse.de>
Reviewed-by: default avatarMartin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent e4581105
Loading
Loading
Loading
Loading
+58 −4
Original line number Diff line number Diff line
@@ -1025,6 +1025,50 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
	return 0;
}

static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{
	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
	struct request_queue *q = bio->bi_disk->queue;
	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	struct page **pages = (struct page **)bv;
	ssize_t size, left;
	unsigned len, i;
	size_t offset;

	if (WARN_ON_ONCE(!max_append_sectors))
		return 0;

	/*
	 * Move page array up in the allocated memory for the bio vecs as far as
	 * possible so that we can start filling biovecs from the beginning
	 * without overwriting the temporary page array.
	 */
	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
	if (unlikely(size <= 0))
		return size ? size : -EFAULT;

	for (left = size, i = 0; left > 0; left -= len, i++) {
		struct page *page = pages[i];
		bool same_page = false;

		len = min_t(size_t, PAGE_SIZE - offset, left);
		if (bio_add_hw_page(q, bio, page, len, offset,
				max_append_sectors, &same_page) != len)
			return -EINVAL;
		if (same_page)
			put_page(page);
		offset = 0;
	}

	iov_iter_advance(iter, size);
	return 0;
}

/**
 * bio_iov_iter_get_pages - add user or kernel pages to a bio
 * @bio: bio to add pages to
@@ -1054,10 +1098,16 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
		return -EINVAL;

	do {
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			if (WARN_ON_ONCE(is_bvec))
				return -EINVAL;
			ret = __bio_iov_append_get_pages(bio, iter);
		} else {
			if (is_bvec)
				ret = __bio_iov_bvec_add_pages(bio, iter);
			else
				ret = __bio_iov_iter_get_pages(bio, iter);
		}
	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));

	if (is_bvec)
@@ -1460,6 +1510,10 @@ struct bio *bio_split(struct bio *bio, int sectors,
	BUG_ON(sectors <= 0);
	BUG_ON(sectors >= bio_sectors(bio));

	/* Zone append commands cannot be split */
	if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
		return NULL;

	split = bio_clone_fast(bio, gfp, bs);
	if (!split)
		return NULL;
+52 −0
Original line number Diff line number Diff line
@@ -135,6 +135,7 @@ static const char *const blk_op_name[] = {
	REQ_OP_NAME(ZONE_OPEN),
	REQ_OP_NAME(ZONE_CLOSE),
	REQ_OP_NAME(ZONE_FINISH),
	REQ_OP_NAME(ZONE_APPEND),
	REQ_OP_NAME(WRITE_SAME),
	REQ_OP_NAME(WRITE_ZEROES),
	REQ_OP_NAME(SCSI_IN),
@@ -240,6 +241,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio,

	bio_advance(bio, nbytes);

	if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
		/*
		 * Partial zone append completions cannot be supported as the
		 * BIO fragments may end up not being written sequentially.
		 */
		if (bio->bi_iter.bi_size)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_iter.bi_sector = rq->__sector;
	}

	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
		bio_endio(bio);
@@ -887,6 +899,41 @@ out:
	return ret;
}

/*
 * Check write append to a zoned block device.
 */
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
						 struct bio *bio)
{
	sector_t pos = bio->bi_iter.bi_sector;
	int nr_sectors = bio_sectors(bio);

	/* Only applicable to zoned block devices */
	if (!blk_queue_is_zoned(q))
		return BLK_STS_NOTSUPP;

	/* The bio sector must point to the start of a sequential zone */
	if (pos & (blk_queue_zone_sectors(q) - 1) ||
	    !blk_queue_zone_is_seq(q, pos))
		return BLK_STS_IOERR;

	/*
	 * Not allowed to cross zone boundaries. Otherwise, the BIO will be
	 * split and could result in non-contiguous sectors being written in
	 * different zones.
	 */
	if (nr_sectors > q->limits.chunk_sectors)
		return BLK_STS_IOERR;

	/* Make sure the BIO is small enough and will not get split */
	if (nr_sectors > q->limits.max_zone_append_sectors)
		return BLK_STS_IOERR;

	bio->bi_opf |= REQ_NOMERGE;

	return BLK_STS_OK;
}

static noinline_for_stack bool
generic_make_request_checks(struct bio *bio)
{
@@ -959,6 +1006,11 @@ generic_make_request_checks(struct bio *bio)
		if (!q->limits.max_write_same_sectors)
			goto not_supported;
		break;
	case REQ_OP_ZONE_APPEND:
		status = blk_check_zone_append(q, bio);
		if (status != BLK_STS_OK)
			goto end_io;
		break;
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
+27 −0
Original line number Diff line number Diff line
@@ -1183,6 +1183,19 @@ static void blk_mq_handle_dev_resource(struct request *rq,
	__blk_mq_requeue_request(rq);
}

static void blk_mq_handle_zone_resource(struct request *rq,
					struct list_head *zone_list)
{
	/*
	 * If we end up here it is because we cannot dispatch a request to a
	 * specific zone due to LLD level zone-write locking or other zone
	 * related resource not being available. In this case, set the request
	 * aside in zone_list for retrying it later.
	 */
	list_add(&rq->queuelist, zone_list);
	__blk_mq_requeue_request(rq);
}

/*
 * Returns true if we did some work AND can potentially do more.
 */
@@ -1195,6 +1208,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
	int errors, queued;
	blk_status_t ret = BLK_STS_OK;
	bool no_budget_avail = false;
	LIST_HEAD(zone_list);

	if (list_empty(list))
		return false;
@@ -1256,6 +1270,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
		if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
			blk_mq_handle_dev_resource(rq, list);
			break;
		} else if (ret == BLK_STS_ZONE_RESOURCE) {
			/*
			 * Move the request to zone_list and keep going through
			 * the dispatch list to find more requests the drive can
			 * accept.
			 */
			blk_mq_handle_zone_resource(rq, &zone_list);
			if (list_empty(list))
				break;
			continue;
		}

		if (unlikely(ret != BLK_STS_OK)) {
@@ -1267,6 +1291,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
		queued++;
	} while (!list_empty(list));

	if (!list_empty(&zone_list))
		list_splice_tail_init(&zone_list, list);

	hctx->dispatched[queued_to_index(queued)]++;

	/*
+31 −0
Original line number Diff line number Diff line
@@ -48,6 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim)
	lim->chunk_sectors = 0;
	lim->max_write_same_sectors = 0;
	lim->max_write_zeroes_sectors = 0;
	lim->max_zone_append_sectors = 0;
	lim->max_discard_sectors = 0;
	lim->max_hw_discard_sectors = 0;
	lim->discard_granularity = 0;
@@ -83,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
	lim->max_dev_sectors = UINT_MAX;
	lim->max_write_same_sectors = UINT_MAX;
	lim->max_write_zeroes_sectors = UINT_MAX;
	lim->max_zone_append_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);

@@ -221,6 +223,33 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
}
EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);

/**
 * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
 * @q:  the request queue for the device
 * @max_zone_append_sectors: maximum number of sectors to write per command
 **/
void blk_queue_max_zone_append_sectors(struct request_queue *q,
		unsigned int max_zone_append_sectors)
{
	unsigned int max_sectors;

	if (WARN_ON(!blk_queue_is_zoned(q)))
		return;

	max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
	max_sectors = min(q->limits.chunk_sectors, max_sectors);

	/*
	 * Signal eventual driver bugs resulting in the max_zone_append sectors limit
	 * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
	 * or the max_hw_sectors limit not set.
	 */
	WARN_ON(!max_sectors);

	q->limits.max_zone_append_sectors = max_sectors;
}
EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);

/**
 * blk_queue_max_segments - set max hw segments for a request for this queue
 * @q:  the request queue for the device
@@ -470,6 +499,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
					b->max_write_same_sectors);
	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
					b->max_write_zeroes_sectors);
	t->max_zone_append_sectors = min(t->max_zone_append_sectors,
					b->max_zone_append_sectors);
	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);

	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
+13 −0
Original line number Diff line number Diff line
@@ -218,6 +218,13 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
		(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
}

static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
{
	unsigned long long max_sectors = q->limits.max_zone_append_sectors;

	return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
}

static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
@@ -639,6 +646,11 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
	.show = queue_write_zeroes_max_show,
};

static struct queue_sysfs_entry queue_zone_append_max_entry = {
	.attr = {.name = "zone_append_max_bytes", .mode = 0444 },
	.show = queue_zone_append_max_show,
};

static struct queue_sysfs_entry queue_nonrot_entry = {
	.attr = {.name = "rotational", .mode = 0644 },
	.show = queue_show_nonrot,
@@ -749,6 +761,7 @@ static struct attribute *queue_attrs[] = {
	&queue_discard_zeroes_data_entry.attr,
	&queue_write_same_max_entry.attr,
	&queue_write_zeroes_max_entry.attr,
	&queue_zone_append_max_entry.attr,
	&queue_nonrot_entry.attr,
	&queue_zoned_entry.attr,
	&queue_nr_zones_entry.attr,
Loading