Commit 9feb1af9 authored by Linus Torvalds
Browse files

Merge tag 'for-linus-20191205' of git://git.kernel.dk/linux-block

Pull more block and io_uring updates from Jens Axboe:
 "I wasn't expecting this to be so big, and if I was, I would have used
  separate branches for this. Going forward I'll be doing separate
  branches for the current tree, just like for the next kernel version
  tree. In any case, this contains:

   - Series from Christoph that fixes an inherent race condition with
     zoned devices and revalidation.

   - null_blk zone size fix (Damien)

   - Fix for a regression in this merge window that caused busy spins by
     sending empty disk uevents (Eric)

   - Fix for a regression in this merge window for bfq stats (Hou)

   - Fix for io_uring creds allocation failure handling (me)

   - io_uring -ERESTARTSYS send/recvmsg fix (me)

   - Series that fixes the need for applications to retain state across
     async request punts for io_uring. This one is a bit larger than I
     would have hoped, but I think it's important we get this fixed for
     5.5.

   - connect(2) improvement for io_uring, handling EINPROGRESS instead
     of having applications needing to poll for it (me)

   - Have io_uring use a hash for poll requests instead of an rbtree.
     This turned out to work much better in practice, so I think we
     should make the switch now. For some workloads, even with a fair
     amount of cancellations, the insertion sort is just too expensive.
     (me)

   - Various little io_uring fixes (me, Jackie, Pavel, LimingWu)

   - Fix for brd unaligned IO, and a warning for the future (Ming)

   - Fix for a bio integrity data leak (Justin)

   - bvec_iter_advance() improvement (Pavel)

   - Xen blkback page unmap fix (SeongJae)

  The major items in here are all well tested, and on the liburing side
  we continue to add regression and feature test cases. We're up to 50
  topic cases now, each containing anywhere from 1 to more than 10
  individual tests"

* tag 'for-linus-20191205' of git://git.kernel.dk/linux-block: (33 commits)
  block: fix memleak of bio integrity data
  io_uring: fix a typo in a comment
  bfq-iosched: Ensure bio->bi_blkg is valid before using it
  io_uring: hook all linked requests via link_list
  io_uring: fix error handling in io_queue_link_head
  io_uring: use hash table for poll command lookups
  io-wq: clear node->next on list deletion
  io_uring: ensure deferred timeouts copy necessary data
  io_uring: allow IOSQE_* flags on IORING_OP_TIMEOUT
  null_blk: remove unused variable warning on !CONFIG_BLK_DEV_ZONED
  brd: warn on un-aligned buffer
  brd: remove max_hw_sectors queue limit
  xen/blkback: Avoid unmapping unmapped grant pages
  io_uring: handle connect -EINPROGRESS like -EAGAIN
  block: set the zone size in blk_revalidate_disk_zones atomically
  block: don't handle bio based drivers in blk_revalidate_disk_zones
  block: allocate the zone bitmaps lazily
  block: replace seq_zones_bitmap with conv_zones_bitmap
  block: simplify blkdev_nr_zones
  block: remove the empty line at the end of blk-zoned.c
  ...
parents 0aecba61 85394299
Loading
Loading
Loading
Loading
+3 −0
Original line number Original line Diff line number Diff line
@@ -351,6 +351,9 @@ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq)
{
{
	struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg);
	struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg);


	if (!bfqg)
		return;

	blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq));
	blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq));
	blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1);
	blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1);
}
}
+1 −1
Original line number Original line Diff line number Diff line
@@ -87,7 +87,7 @@ EXPORT_SYMBOL(bio_integrity_alloc);
 * Description: Used to free the integrity portion of a bio. Usually
 * Description: Used to free the integrity portion of a bio. Usually
 * called from bio_free().
 * called from bio_free().
 */
 */
static void bio_integrity_free(struct bio *bio)
void bio_integrity_free(struct bio *bio)
{
{
	struct bio_integrity_payload *bip = bio_integrity(bio);
	struct bio_integrity_payload *bip = bio_integrity(bio);
	struct bio_set *bs = bio->bi_pool;
	struct bio_set *bs = bio->bi_pool;
+3 −0
Original line number Original line Diff line number Diff line
@@ -233,6 +233,9 @@ fallback:
void bio_uninit(struct bio *bio)
void bio_uninit(struct bio *bio)
{
{
	bio_disassociate_blkg(bio);
	bio_disassociate_blkg(bio);

	if (bio_integrity(bio))
		bio_integrity_free(bio);
}
}
EXPORT_SYMBOL(bio_uninit);
EXPORT_SYMBOL(bio_uninit);


+69 −80
Original line number Original line Diff line number Diff line
@@ -70,30 +70,20 @@ void __blk_req_zone_write_unlock(struct request *rq)
}
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);


static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
					     sector_t nr_sectors)
{
	sector_t zone_sectors = blk_queue_zone_sectors(q);

	return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
}

/**
/**
 * blkdev_nr_zones - Get number of zones
 * blkdev_nr_zones - Get number of zones
 * @bdev:	Target block device
 * @disk:	Target gendisk
 *
 *
 * Description:
 * Return the total number of zones of a zoned block device.  For a block
 *    Return the total number of zones of a zoned block device.
 * device without zone capabilities, the number of zones is always 0.
 *    For a regular block device, the number of zones is always 0.
 */
 */
unsigned int blkdev_nr_zones(struct block_device *bdev)
unsigned int blkdev_nr_zones(struct gendisk *disk)
{
{
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);


	if (!blk_queue_is_zoned(q))
	if (!blk_queue_is_zoned(disk->queue))
		return 0;
		return 0;

	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
	return __blkdev_nr_zones(q, get_capacity(bdev->bd_disk));
}
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);
EXPORT_SYMBOL_GPL(blkdev_nr_zones);


@@ -342,16 +332,18 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,


void blk_queue_free_zone_bitmaps(struct request_queue *q)
void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
{
	kfree(q->seq_zones_bitmap);
	kfree(q->conv_zones_bitmap);
	q->seq_zones_bitmap = NULL;
	q->conv_zones_bitmap = NULL;
	kfree(q->seq_zones_wlock);
	kfree(q->seq_zones_wlock);
	q->seq_zones_wlock = NULL;
	q->seq_zones_wlock = NULL;
}
}


struct blk_revalidate_zone_args {
struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	struct gendisk	*disk;
	unsigned long	*seq_zones_bitmap;
	unsigned long	*conv_zones_bitmap;
	unsigned long	*seq_zones_wlock;
	unsigned long	*seq_zones_wlock;
	unsigned int	nr_zones;
	sector_t	zone_sectors;
	sector_t	sector;
	sector_t	sector;
};
};


@@ -364,26 +356,34 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
	struct blk_revalidate_zone_args *args = data;
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	struct gendisk *disk = args->disk;
	struct request_queue *q = disk->queue;
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = blk_queue_zone_sectors(q);
	sector_t capacity = get_capacity(disk);
	sector_t capacity = get_capacity(disk);


	/*
	/*
	 * All zones must have the same size, with the exception on an eventual
	 * All zones must have the same size, with the exception on an eventual
	 * smaller last zone.
	 * smaller last zone.
	 */
	 */
	if (zone->start + zone_sectors < capacity &&
	if (zone->start == 0) {
	    zone->len != zone_sectors) {
		if (zone->len == 0 || !is_power_of_2(zone->len)) {
			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
				disk->disk_name, zone->len);
			return -ENODEV;
		}

		args->zone_sectors = zone->len;
		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
	} else if (zone->start + args->zone_sectors < capacity) {
		if (zone->len != args->zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
				disk->disk_name);
		return false;
			return -ENODEV;
		}
		}

	} else {
	if (zone->start + zone->len >= capacity &&
		if (zone->len > args->zone_sectors) {
	    zone->len > zone_sectors) {
			pr_warn("%s: Invalid zoned device with larger last zone size\n",
			pr_warn("%s: Invalid zoned device with larger last zone size\n",
				disk->disk_name);
				disk->disk_name);
			return -ENODEV;
			return -ENODEV;
		}
		}
	}


	/* Check for holes in the zone report */
	/* Check for holes in the zone report */
	if (zone->start != args->sector) {
	if (zone->start != args->sector) {
@@ -395,8 +395,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
	/* Check zone type */
	/* Check zone type */
	switch (zone->type) {
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
	case BLK_ZONE_TYPE_CONVENTIONAL:
		if (!args->conv_zones_bitmap) {
			args->conv_zones_bitmap =
				blk_alloc_zone_bitmap(q->node, args->nr_zones);
			if (!args->conv_zones_bitmap)
				return -ENOMEM;
		}
		set_bit(idx, args->conv_zones_bitmap);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		if (!args->seq_zones_wlock) {
			args->seq_zones_wlock =
				blk_alloc_zone_bitmap(q->node, args->nr_zones);
			if (!args->seq_zones_wlock)
				return -ENOMEM;
		}
		break;
		break;
	default:
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
@@ -404,78 +418,54 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
		return -ENODEV;
		return -ENODEV;
	}
	}


	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
		set_bit(idx, args->seq_zones_bitmap);

	args->sector += zone->len;
	args->sector += zone->len;
	return 0;
	return 0;
}
}


static int blk_update_zone_info(struct gendisk *disk, unsigned int nr_zones,
				struct blk_revalidate_zone_args *args)
{
	/*
	 * Ensure that all memory allocations in this context are done as
	 * if GFP_NOIO was specified.
	 */
	unsigned int noio_flag = memalloc_noio_save();
	struct request_queue *q = disk->queue;
	int ret;

	args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones);
	if (!args->seq_zones_wlock)
		return -ENOMEM;
	args->seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones);
	if (!args->seq_zones_bitmap)
		return -ENOMEM;

	ret = disk->fops->report_zones(disk, 0, nr_zones,
				       blk_revalidate_zone_cb, args);
	memalloc_noio_restore(noio_flag);
	return ret;
}

/**
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 * @disk:	Target disk
 * @disk:	Target disk
 *
 *
 * Helper function for low-level device drivers to (re) allocate and initialize
 * Helper function for low-level device drivers to (re) allocate and initialize
 * a disk request queue zone bitmaps. This functions should normally be called
 * a disk request queue zone bitmaps. This functions should normally be called
 * within the disk ->revalidate method. For BIO based queues, no zone bitmap
 * within the disk ->revalidate method for blk-mq based drivers.  For BIO based
 * is allocated.
 * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
 * is correct.
 */
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
int blk_revalidate_disk_zones(struct gendisk *disk)
{
{
	struct request_queue *q = disk->queue;
	struct request_queue *q = disk->queue;
	unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk));
	struct blk_revalidate_zone_args args = {
	struct blk_revalidate_zone_args args = { .disk = disk };
		.disk		= disk,
	int ret = 0;
	};
	unsigned int noio_flag;
	int ret;


	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;
		return -EIO;
	if (WARN_ON_ONCE(!queue_is_mq(q)))
		return -EIO;


	/*
	/*
	 * BIO based queues do not use a scheduler so only q->nr_zones
	 * Ensure that all memory allocations in this context are done as if
	 * needs to be updated so that the sysfs exposed value is correct.
	 * GFP_NOIO was specified.
	 */
	 */
	if (!queue_is_mq(q)) {
	noio_flag = memalloc_noio_save();
		q->nr_zones = nr_zones;
	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
		return 0;
				       blk_revalidate_zone_cb, &args);
	}
	memalloc_noio_restore(noio_flag);

	if (nr_zones)
		ret = blk_update_zone_info(disk, nr_zones, &args);


	/*
	/*
	 * Install the new bitmaps, making sure the queue is stopped and
	 * Install the new bitmaps and update nr_zones only once the queue is
	 * all I/Os are completed (i.e. a scheduler is not referencing the
	 * stopped and all I/Os are completed (i.e. a scheduler is not
	 * bitmaps).
	 * referencing the bitmaps).
	 */
	 */
	blk_mq_freeze_queue(q);
	blk_mq_freeze_queue(q);
	if (ret >= 0) {
	if (ret >= 0) {
		q->nr_zones = nr_zones;
		blk_queue_chunk_sectors(q, args.zone_sectors);
		q->nr_zones = args.nr_zones;
		swap(q->seq_zones_wlock, args.seq_zones_wlock);
		swap(q->seq_zones_wlock, args.seq_zones_wlock);
		swap(q->seq_zones_bitmap, args.seq_zones_bitmap);
		swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
		ret = 0;
		ret = 0;
	} else {
	} else {
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
@@ -484,8 +474,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
	blk_mq_unfreeze_queue(q);
	blk_mq_unfreeze_queue(q);


	kfree(args.seq_zones_wlock);
	kfree(args.seq_zones_wlock);
	kfree(args.seq_zones_bitmap);
	kfree(args.conv_zones_bitmap);
	return ret;
	return ret;
}
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
+4 −0
Original line number Original line Diff line number Diff line
@@ -121,6 +121,7 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
#ifdef CONFIG_BLK_DEV_INTEGRITY
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
bool __bio_integrity_endio(struct bio *);
void bio_integrity_free(struct bio *bio);
static inline bool bio_integrity_endio(struct bio *bio)
static inline bool bio_integrity_endio(struct bio *bio)
{
{
	if (bio_integrity(bio))
	if (bio_integrity(bio))
@@ -166,6 +167,9 @@ static inline bool bio_integrity_endio(struct bio *bio)
{
{
	return true;
	return true;
}
}
static inline void bio_integrity_free(struct bio *bio)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
#endif /* CONFIG_BLK_DEV_INTEGRITY */


unsigned long blk_rq_timeout(unsigned long timeout);
unsigned long blk_rq_timeout(unsigned long timeout);
Loading