Commit ac8f7a02 authored by Jens Axboe

Merge branch 'for-5.10/block' into for-5.10/drivers

* for-5.10/block: (140 commits)
  bdi: replace BDI_CAP_NO_{WRITEBACK,ACCT_DIRTY} with a single flag
  bdi: invert BDI_CAP_NO_ACCT_WB
  bdi: replace BDI_CAP_STABLE_WRITES with a queue and a sb flag
  mm: use SWP_SYNCHRONOUS_IO more intelligently
  bdi: remove BDI_CAP_SYNCHRONOUS_IO
  bdi: remove BDI_CAP_CGROUP_WRITEBACK
  block: lift setting the readahead size into the block layer
  md: update the optimal I/O size on reshape
  bdi: initialize ->ra_pages and ->io_pages in bdi_init
  aoe: set an optimal I/O size
  bcache: inherit the optimal I/O size
  drbd: remove dead code in device_to_statistics
  fs: remove the unused SB_I_MULTIROOT flag
  block: mark blkdev_get static
  PM: mm: cleanup swsusp_swap_check
  mm: split swap_type_of
  PM: rewrite is_hibernate_resume_dev to not require an inode
  mm: cleanup claim_swapfile
  ocfs2: cleanup o2hb_region_dev_store
  dasd: cleanup dasd_scan_partitions
  ...
parents 805c6d3c f56753ac
+0 −3
@@ -488,9 +488,6 @@ getgeo: no
swap_slot_free_notify:	no	(see below)
======================= ===================

unlock_native_capacity and revalidate_disk are called only from
check_disk_change().

swap_slot_free_notify is called with swap_lock and sometimes the page lock
held.

+0 −2
@@ -161,8 +161,6 @@ config BLK_WBT_MQ
	depends on BLK_WBT
	help
	Enable writeback throttling by default on multiqueue devices.
	Multiqueue currently doesn't have support for IO scheduling,
	enabling this option is recommended.

config BLK_DEBUG_FS
	bool "Block layer debugging information in debugfs"
+7 −2
@@ -4640,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	if (!atomic_read(&hctx->elevator_queued))
		return false;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing
@@ -5554,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
		atomic_inc(&hctx->elevator_queued);
	}
}

@@ -5921,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq)

		bfq_completed_request(bfqq, bfqd);
		bfq_finish_requeue_request_body(bfqq);
		atomic_dec(&rq->mq_hctx->elevator_queued);

		spin_unlock_irqrestore(&bfqd->lock, flags);
	} else {
@@ -6360,8 +6365,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
	struct blk_mq_tags *tags = hctx->sched_tags;
	unsigned int min_shallow;

	min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
	min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
	sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
}

static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
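
The BFQ hunks above add a per-hctx elevator_queued counter: insert bumps it, request completion drops it, and bfq_has_work() can return false without taking bfqd->lock when it reads zero. A stale read is harmless; at worst it costs one dispatch call that finds nothing. A reduced sketch of that fast-path pattern, with sched_queues_nonempty() standing in as a hypothetical slow path:

	static bool sched_has_work(struct blk_mq_hw_ctx *hctx)
	{
		/* lock-free fast path: nothing was queued to this scheduler */
		if (!atomic_read(&hctx->elevator_queued))
			return false;

		/* hypothetical locked/slower check of the scheduler's queues */
		return sched_queues_nonempty(hctx);
	}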
+26 −6
@@ -119,6 +119,8 @@ static void blkg_async_bio_workfn(struct work_struct *work)
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock_bh(&blkg->async_bio_lock);
@@ -126,8 +128,15 @@ static void blkg_async_bio_workfn(struct work_struct *work)
	bio_list_init(&blkg->async_bios);
	spin_unlock_bh(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}
	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}

/**
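
The change to blkg_async_bio_workfn() above wraps the drain of the queued async bios in a blk_plug once there are at least two of them, so the block layer can merge and batch the submissions before the plug is flushed. The same pattern in isolation, as a hypothetical helper (my_submit_list is not part of this commit):

	static void my_submit_list(struct bio_list *bios)
	{
		struct blk_plug plug;
		struct bio *bio;

		blk_start_plug(&plug);
		while ((bio = bio_list_pop(bios)))
			submit_bio(bio);
		blk_finish_plug(&plug);	/* merged/batched bios are issued here */
	}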
@@ -1613,16 +1622,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	unsigned long pflags;
	bool clamp;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		if (atomic_read(&blkg->use_delay)) {
		int use_delay = atomic_read(&blkg->use_delay);

		if (use_delay) {
			u64 this_delay;

			blkcg_scale_delay(blkg, now);
			delay_nsec = max_t(u64, delay_nsec,
					   atomic64_read(&blkg->delay_nsec));
			this_delay = atomic64_read(&blkg->delay_nsec);
			if (this_delay > delay_nsec) {
				delay_nsec = this_delay;
				clamp = use_delay > 0;
			}
		}
		blkg = blkg->parent;
	}
@@ -1634,9 +1651,12 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
	 * Let's not sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate 10's of seconds worth of
	 * delay, and we want userspace to be able to do _something_ so cap the
	 * delays at 1 second.  If there's 10's of seconds worth of delay then
	 * the tasks will be delayed for 1 second for every syscall.
	 * delays at 0.25s. If there's 10's of seconds worth of delay then the
	 * tasks will be delayed for 0.25 second for every syscall. If
	 * blkcg_set_delay() was used as indicated by negative use_delay, the
	 * caller is responsible for regulating the range.
	 */
	if (clamp)
		delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	if (use_memdelay)
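
The blk-cgroup hunk above only applies the 0.25s-per-syscall cap when the delay was accumulated by the use_delay > 0 scaling path; a delay installed via blkcg_set_delay() (negative use_delay) is left for the caller to bound. A hypothetical helper expressing just that decision, not code from this commit:

	static u64 clamp_blkcg_delay(u64 delay_nsec, int use_delay)
	{
		/* io.latency-style scaled delay: cap at 250ms per syscall */
		if (use_delay > 0)
			return min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

		/* blkcg_set_delay() user: the caller regulates the range itself */
		return delay_nsec;
	}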
+63 −176
@@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->tag = -1;
	rq->internal_tag = -1;
	rq->tag = BLK_MQ_NO_TAG;
	rq->internal_tag = BLK_MQ_NO_TAG;
	rq->start_time_ns = ktime_get_ns();
	rq->part = NULL;
	refcount_set(&rq->ref, 1);
@@ -538,11 +538,10 @@ struct request_queue *blk_alloc_queue(int node_id)
	if (!q->stats)
		goto fail_stats;

	q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
	q->backing_dev_info->io_pages = VM_READAHEAD_PAGES;
	q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
	q->node = node_id;

	atomic_set(&q->nr_active_requests_shared_sbitmap, 0);

	timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
		    laptop_mode_timer_fn, 0);
	timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
@@ -643,162 +642,6 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);

static void blk_account_io_merge_bio(struct request *req)
{
	if (!blk_do_io_stat(req))
		return;

	part_stat_lock();
	part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
	part_stat_unlock();
}

bool bio_attempt_back_merge(struct request *req, struct bio *bio,
		unsigned int nr_segs)
{
	const int ff = bio->bi_opf & REQ_FAILFAST_MASK;

	if (!ll_back_merge_fn(req, bio, nr_segs))
		return false;

	trace_block_bio_backmerge(req->q, req, bio);
	rq_qos_merge(req->q, req, bio);

	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
		blk_rq_set_mixed_merge(req);

	req->biotail->bi_next = bio;
	req->biotail = bio;
	req->__data_len += bio->bi_iter.bi_size;

	bio_crypt_free_ctx(bio);

	blk_account_io_merge_bio(req);
	return true;
}

bool bio_attempt_front_merge(struct request *req, struct bio *bio,
		unsigned int nr_segs)
{
	const int ff = bio->bi_opf & REQ_FAILFAST_MASK;

	if (!ll_front_merge_fn(req, bio, nr_segs))
		return false;

	trace_block_bio_frontmerge(req->q, req, bio);
	rq_qos_merge(req->q, req, bio);

	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
		blk_rq_set_mixed_merge(req);

	bio->bi_next = req->bio;
	req->bio = bio;

	req->__sector = bio->bi_iter.bi_sector;
	req->__data_len += bio->bi_iter.bi_size;

	bio_crypt_do_front_merge(req, bio);

	blk_account_io_merge_bio(req);
	return true;
}

bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
		struct bio *bio)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);

	if (segments >= queue_max_discard_segments(q))
		goto no_merge;
	if (blk_rq_sectors(req) + bio_sectors(bio) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
		goto no_merge;

	rq_qos_merge(q, req, bio);

	req->biotail->bi_next = bio;
	req->biotail = bio;
	req->__data_len += bio->bi_iter.bi_size;
	req->nr_phys_segments = segments + 1;

	blk_account_io_merge_bio(req);
	return true;
no_merge:
	req_set_nomerge(q, req);
	return false;
}

/**
 * blk_attempt_plug_merge - try to merge with %current's plugged list
 * @q: request_queue new bio is being queued at
 * @bio: new bio being queued
 * @nr_segs: number of segments in @bio
 * @same_queue_rq: pointer to &struct request that gets filled in when
 * another request associated with @q is found on the plug list
 * (optional, may be %NULL)
 *
 * Determine whether @bio being queued on @q can be merged with a request
 * on %current's plugged list.  Returns %true if merge was successful,
 * otherwise %false.
 *
 * Plugging coalesces IOs from the same issuer for the same purpose without
 * going through @q->queue_lock.  As such it's more of an issuing mechanism
 * than scheduling, and the request, while may have elvpriv data, is not
 * added on the elevator at this point.  In addition, we don't have
 * reliable access to the elevator outside queue lock.  Only check basic
 * merging parameters without querying the elevator.
 *
 * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs, struct request **same_queue_rq)
{
	struct blk_plug *plug;
	struct request *rq;
	struct list_head *plug_list;

	plug = blk_mq_plug(q, bio);
	if (!plug)
		return false;

	plug_list = &plug->mq_list;

	list_for_each_entry_reverse(rq, plug_list, queuelist) {
		bool merged = false;

		if (rq->q == q && same_queue_rq) {
			/*
			 * Only blk-mq multiple hardware queues case checks the
			 * rq in the same queue, there should be only one such
			 * rq in a queue
			 **/
			*same_queue_rq = rq;
		}

		if (rq->q != q || !blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			merged = bio_attempt_back_merge(rq, bio, nr_segs);
			break;
		case ELEVATOR_FRONT_MERGE:
			merged = bio_attempt_front_merge(rq, bio, nr_segs);
			break;
		case ELEVATOR_DISCARD_MERGE:
			merged = bio_attempt_discard_merge(q, rq, bio);
			break;
		default:
			break;
		}

		if (merged)
			return true;
	}

	return false;
}

static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
	char b[BDEVNAME_SIZE];
@@ -1301,14 +1144,28 @@ EXPORT_SYMBOL(submit_bio);
 *    limits when retrying requests on other queues. Those requests need
 *    to be checked against the new queue limits again during dispatch.
 */
static int blk_cloned_rq_check_limits(struct request_queue *q,
static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
				      struct request *rq)
{
	if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
	unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));

	if (blk_rq_sectors(rq) > max_sectors) {
		/*
		 * SCSI device does not have a good way to return if
		 * Write Same/Zero is actually supported. If a device rejects
		 * a non-read/write command (discard, write same,etc.) the
		 * low-level device driver will set the relevant queue limit to
		 * 0 to prevent blk-lib from issuing more of the offending
		 * operations. Commands queued prior to the queue limit being
		 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
		 * errors being propagated to upper layers.
		 */
		if (max_sectors == 0)
			return BLK_STS_NOTSUPP;

		printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
			__func__, blk_rq_sectors(rq),
			blk_queue_get_max_sectors(q, req_op(rq)));
		return -EIO;
			__func__, blk_rq_sectors(rq), max_sectors);
		return BLK_STS_IOERR;
	}

	/*
@@ -1321,10 +1178,10 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
	if (rq->nr_phys_segments > queue_max_segments(q)) {
		printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
			__func__, rq->nr_phys_segments, queue_max_segments(q));
		return -EIO;
		return BLK_STS_IOERR;
	}

	return 0;
	return BLK_STS_OK;
}

/**
@@ -1334,8 +1191,11 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
 */
blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
{
	if (blk_cloned_rq_check_limits(q, rq))
		return BLK_STS_IOERR;
	blk_status_t ret;

	ret = blk_cloned_rq_check_limits(q, rq);
	if (ret != BLK_STS_OK)
		return ret;

	if (rq->rq_disk &&
	    should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
@@ -1461,10 +1321,9 @@ void blk_account_io_start(struct request *rq)
	part_stat_unlock();
}

unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
		unsigned int op)
static unsigned long __part_start_io_acct(struct hd_struct *part,
					  unsigned int sectors, unsigned int op)
{
	struct hd_struct *part = &disk->part0;
	const int sgrp = op_stat_group(op);
	unsigned long now = READ_ONCE(jiffies);

@@ -1477,12 +1336,26 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,

	return now;
}

unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
				 struct bio *bio)
{
	*part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);

	return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
}
EXPORT_SYMBOL_GPL(part_start_io_acct);

unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
				 unsigned int op)
{
	return __part_start_io_acct(&disk->part0, sectors, op);
}
EXPORT_SYMBOL(disk_start_io_acct);

void disk_end_io_acct(struct gendisk *disk, unsigned int op,
static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
			       unsigned long start_time)
{
	struct hd_struct *part = &disk->part0;
	const int sgrp = op_stat_group(op);
	unsigned long now = READ_ONCE(jiffies);
	unsigned long duration = now - start_time;
@@ -1493,6 +1366,20 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
	part_stat_local_dec(part, in_flight[op_is_write(op)]);
	part_stat_unlock();
}

void part_end_io_acct(struct hd_struct *part, struct bio *bio,
		      unsigned long start_time)
{
	__part_end_io_acct(part, bio_op(bio), start_time);
	hd_struct_put(part);
}
EXPORT_SYMBOL_GPL(part_end_io_acct);

void disk_end_io_acct(struct gendisk *disk, unsigned int op,
		      unsigned long start_time)
{
	__part_end_io_acct(&disk->part0, op, start_time);
}
EXPORT_SYMBOL(disk_end_io_acct);

/*
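
The last hunks export part_start_io_acct() and part_end_io_acct(), which let a bio-based driver account I/O against the partition a bio actually maps to rather than part0, and which take and drop the hd_struct reference internally. A hedged usage sketch for a 5.10-era bio-based driver; my_submit_bio() and my_do_io() are hypothetical:

	static blk_qc_t my_submit_bio(struct bio *bio)
	{
		struct hd_struct *part;
		unsigned long start;

		/* map the sector to its partition and record the start time */
		start = part_start_io_acct(bio->bi_disk, &part, bio);

		my_do_io(bio);			/* hypothetical: perform the actual I/O */

		/* close the accounting window and drop the hd_struct reference */
		part_end_io_acct(part, bio, start);

		bio_endio(bio);
		return BLK_QC_T_NONE;
	}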