Commit ac7ac461 authored by Linus Torvalds

Merge tag 'for-5.11/block-2020-12-14' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:
 "Another series of killing more code than what is being added, again
  thanks to Christoph's relentless cleanups and tech debt tackling.

  This contains:

   - blk-iocost improvements (Baolin Wang)

   - part0 iostat fix (Jeffle Xu)

   - Disable iopoll for split bios (Jeffle Xu)

   - block tracepoint cleanups (Christoph Hellwig)

   - Merging of struct block_device and hd_struct (Christoph Hellwig)

   - Rework/cleanup of how block device sizes are updated (Christoph
     Hellwig)

   - Simplification of gendisk lookup and removal of block device
     aliasing (Christoph Hellwig)

   - Block device ioctl cleanups (Christoph Hellwig)

   - Removal of bdget()/blkdev_get() as exported API (Christoph Hellwig)

   - Disk change rework, avoid ->revalidate_disk() (Christoph Hellwig)

   - sbitmap improvements (Pavel Begunkov)

   - Hybrid polling fix (Pavel Begunkov)

   - bvec iteration improvements (Pavel Begunkov)

   - Zone revalidation fixes (Damien Le Moal)

   - blk-throttle limit fix (Yu Kuai)

   - Various little fixes"

* tag 'for-5.11/block-2020-12-14' of git://git.kernel.dk/linux-block: (126 commits)
  blk-mq: fix msec comment from micro to milli seconds
  blk-mq: update arg in comment of blk_mq_map_queue
  blk-mq: add helper allocating tagset->tags
  Revert "block: Fix a lockdep complaint triggered by request queue flushing"
  nvme-loop: use blk_mq_hctx_set_fq_lock_class to set loop's lock class
  blk-mq: add new API of blk_mq_hctx_set_fq_lock_class
  block: disable iopoll for split bio
  block: Improve blk_revalidate_disk_zones() checks
  sbitmap: simplify wrap check
  sbitmap: replace CAS with atomic and
  sbitmap: remove swap_lock
  sbitmap: optimise sbitmap_deferred_clear()
  blk-mq: skip hybrid polling if iopoll doesn't spin
  blk-iocost: Factor out the base vrate change into a separate function
  blk-iocost: Factor out the active iocgs' state check into a separate function
  blk-iocost: Move the usage ratio calculation to the correct place
  blk-iocost: Remove unnecessary advance declaration
  blk-iocost: Fix some typos in comments
  blktrace: fix up a kerneldoc comment
  block: remove the request_queue to argument request based tracepoints
  ...
parents 48aba79b fa94ba8a
block/bio.c +5 −5
@@ -608,12 +608,12 @@ void bio_truncate(struct bio *bio, unsigned new_size)
void guard_bio_eod(struct bio *bio)
{
	sector_t maxsector;
-	struct hd_struct *part;
+	struct block_device *part;

	rcu_read_lock();
	part = __disk_get_part(bio->bi_disk, bio->bi_partno);
	if (part)
-		maxsector = part_nr_sects_read(part);
+		maxsector = bdev_nr_sectors(part);
	else
		maxsector = get_capacity(bio->bi_disk);
	rcu_read_unlock();
@@ -1212,8 +1212,8 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,

		flush_dcache_page(dst_bv.bv_page);

-		bio_advance_iter(src, src_iter, bytes);
-		bio_advance_iter(dst, dst_iter, bytes);
+		bio_advance_iter_single(src, src_iter, bytes);
+		bio_advance_iter_single(dst, dst_iter, bytes);
	}
}
EXPORT_SYMBOL(bio_copy_data_iter);
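
bdev_nr_sectors() above replaces part_nr_sects_read(): with hd_struct gone, a
partition's size is simply its bdev inode size. A sketch of the 5.11-era
helper (shape recalled from include/linux/genhd.h, so treat as approximate):

static inline sector_t bdev_nr_sectors(struct block_device *bdev)
{
	/* device/partition capacity is the backing inode size, in 512B units */
	return i_size_read(bdev->bd_inode) >> 9;
}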
block/blk-cgroup.c +25 −26
@@ -556,22 +556,22 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
}

/**
- * blkg_conf_prep - parse and prepare for per-blkg config update
+ * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @inputp: input string pointer
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
- * from @input and get and return the matching gendisk.  *@inputp is
+ * from @input and get and return the matching bdev.  *@inputp is
 * updated to point past the device node prefix.  Returns an ERR_PTR()
 * value on error.
 *
 * Use this function iff blkg_conf_prep() can't be used for some reason.
 */
-struct gendisk *blkcg_conf_get_disk(char **inputp)
+struct block_device *blkcg_conf_open_bdev(char **inputp)
{
	char *input = *inputp;
	unsigned int major, minor;
-	struct gendisk *disk;
-	int key_len, part;
+	struct block_device *bdev;
+	int key_len;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return ERR_PTR(-EINVAL);
@@ -581,16 +581,16 @@ struct gendisk *blkcg_conf_get_disk(char **inputp)
		return ERR_PTR(-EINVAL);
	input = skip_spaces(input);

-	disk = get_gendisk(MKDEV(major, minor), &part);
-	if (!disk)
+	bdev = blkdev_get_no_open(MKDEV(major, minor));
+	if (!bdev)
		return ERR_PTR(-ENODEV);
-	if (part) {
-		put_disk_and_module(disk);
+	if (bdev_is_partition(bdev)) {
+		blkdev_put_no_open(bdev);
		return ERR_PTR(-ENODEV);
	}

	*inputp = input;
-	return disk;
+	return bdev;
}

/**
@@ -607,18 +607,18 @@ struct gendisk *blkcg_conf_get_disk(char **inputp)
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
-	__acquires(rcu) __acquires(&disk->queue->queue_lock)
+	__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
{
-	struct gendisk *disk;
+	struct block_device *bdev;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

-	disk = blkcg_conf_get_disk(&input);
-	if (IS_ERR(disk))
-		return PTR_ERR(disk);
+	bdev = blkcg_conf_open_bdev(&input);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);

-	q = disk->queue;
+	q = bdev->bd_disk->queue;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);
@@ -689,7 +689,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
			goto success;
	}
success:
-	ctx->disk = disk;
+	ctx->bdev = bdev;
	ctx->blkg = blkg;
	ctx->body = input;
	return 0;
@@ -700,7 +700,7 @@ fail_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
fail:
-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
@@ -723,11 +723,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
-	__releases(&ctx->disk->queue->queue_lock) __releases(rcu)
+	__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
{
-	spin_unlock_irq(&ctx->disk->queue->queue_lock);
+	spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
	rcu_read_unlock();
-	put_disk_and_module(ctx->disk);
+	blkdev_put_no_open(ctx->bdev);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
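
For context, a policy's cgroup-file write handler pairs these two helpers
around its own parsing; a minimal sketch (example_policy and the handler name
are illustrative, not from this series):

static ssize_t example_conf_write(struct kernfs_open_file *of, char *buf,
				  size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	int ret;

	/* parses the "MAJ:MIN" prefix, opens the bdev, takes queue_lock;
	 * on success ctx.bdev, ctx.blkg and ctx.body are valid */
	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
	if (ret)
		return ret;

	/* ... parse ctx.body and update per-blkg state ... */

	blkg_conf_finish(&ctx);		/* unlocks and drops the bdev ref */
	return nbytes;
}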

@@ -820,9 +820,9 @@ static void blkcg_fill_root_iostats(void)

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
-		struct gendisk *disk = dev_to_disk(dev);
-		struct hd_struct *part = disk_get_part(disk, 0);
-		struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
+		struct block_device *bdev = dev_to_bdev(dev);
+		struct blkcg_gq *blkg =
+			blk_queue_root_blkg(bdev->bd_disk->queue);
		struct blkg_iostat tmp;
		int cpu;

@@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void)
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;

-			cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
+			cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
@@ -849,7 +849,6 @@ static void blkcg_fill_root_iostats(void)
			blkg_iostat_set(&blkg->iostat.cur, &tmp);
			u64_stats_update_end(&blkg->iostat.sync);
		}
-		disk_put_part(part);
	}
}

block/blk-core.c +32 −38
@@ -666,9 +666,9 @@ static int __init setup_fail_make_request(char *str)
}
__setup("fail_make_request=", setup_fail_make_request);

-static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
+static bool should_fail_request(struct block_device *part, unsigned int bytes)
{
-	return part->make_it_fail && should_fail(&fail_make_request, bytes);
+	return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
}

static int __init fail_make_request_debugfs(void)
@@ -683,7 +683,7 @@ late_initcall(fail_make_request_debugfs);

#else /* CONFIG_FAIL_MAKE_REQUEST */

-static inline bool should_fail_request(struct hd_struct *part,
+static inline bool should_fail_request(struct block_device *part,
					unsigned int bytes)
{
	return false;
@@ -691,11 +691,11 @@ static inline bool should_fail_request(struct hd_struct *part,

#endif /* CONFIG_FAIL_MAKE_REQUEST */

-static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+static inline bool bio_check_ro(struct bio *bio, struct block_device *part)
{
	const int op = bio_op(bio);

-	if (part->policy && op_is_write(op)) {
+	if (part->bd_read_only && op_is_write(op)) {
		char b[BDEVNAME_SIZE];

		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
@@ -703,7 +703,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)

		WARN_ONCE(1,
		       "Trying to write to read-only block-device %s (partno %d)\n",
-			bio_devname(bio, b), part->partno);
+			bio_devname(bio, b), part->bd_partno);
		/* Older lvm-tools actually trigger this */
		return false;
	}
@@ -713,7 +713,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)

static noinline int should_fail_bio(struct bio *bio)
{
-	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+	if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size))
		return -EIO;
	return 0;
}
@@ -742,7 +742,7 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
 */
static inline int blk_partition_remap(struct bio *bio)
{
-	struct hd_struct *p;
+	struct block_device *p;
	int ret = -EIO;

	rcu_read_lock();
@@ -755,11 +755,12 @@ static inline int blk_partition_remap(struct bio *bio)
		goto out;

	if (bio_sectors(bio)) {
-		if (bio_check_eod(bio, part_nr_sects_read(p)))
+		if (bio_check_eod(bio, bdev_nr_sectors(p)))
			goto out;
-		bio->bi_iter.bi_sector += p->start_sect;
-		trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
-				      bio->bi_iter.bi_sector - p->start_sect);
+		bio->bi_iter.bi_sector += p->bd_start_sect;
+		trace_block_bio_remap(bio, p->bd_dev,
+				      bio->bi_iter.bi_sector -
+				      p->bd_start_sect);
	}
	bio->bi_partno = 0;
	ret = 0;
@@ -829,7 +830,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
		if (unlikely(blk_partition_remap(bio)))
			goto end_io;
	} else {
-		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+		if (unlikely(bio_check_ro(bio, bio->bi_disk->part0)))
			goto end_io;
		if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
			goto end_io;
@@ -906,7 +907,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
	blkcg_bio_issue_init(bio);

	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
-		trace_block_bio_queue(q, bio);
+		trace_block_bio_queue(bio);
		/* Now that enqueuing has been traced, we need to trace
		 * completion as well.
		 */
@@ -1201,7 +1202,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
		return ret;

	if (rq->rq_disk &&
-	    should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
+	    should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq)))
		return BLK_STS_IOERR;

	if (blk_crypto_insert_cloned_request(rq))
@@ -1260,17 +1261,18 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);

-static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
+static void update_io_ticks(struct block_device *part, unsigned long now,
+		bool end)
{
	unsigned long stamp;
again:
-	stamp = READ_ONCE(part->stamp);
+	stamp = READ_ONCE(part->bd_stamp);
	if (unlikely(stamp != now)) {
-		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
+		if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp))
			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
	}
-	if (part->partno) {
-		part = &part_to_disk(part)->part0;
+	if (part->bd_partno) {
+		part = bdev_whole(part);
		goto again;
	}
}
@@ -1279,11 +1281,9 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
	if (req->part && blk_do_io_stat(req)) {
		const int sgrp = op_stat_group(req_op(req));
-		struct hd_struct *part;

		part_stat_lock();
-		part = req->part;
-		part_stat_add(part, sectors[sgrp], bytes >> 9);
+		part_stat_add(req->part, sectors[sgrp], bytes >> 9);
		part_stat_unlock();
	}
}
@@ -1298,17 +1298,12 @@ void blk_account_io_done(struct request *req, u64 now)
	if (req->part && blk_do_io_stat(req) &&
	    !(req->rq_flags & RQF_FLUSH_SEQ)) {
		const int sgrp = op_stat_group(req_op(req));
-		struct hd_struct *part;

		part_stat_lock();
-		part = req->part;
-
-		update_io_ticks(part, jiffies, true);
-		part_stat_inc(part, ios[sgrp]);
-		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
+		update_io_ticks(req->part, jiffies, true);
+		part_stat_inc(req->part, ios[sgrp]);
+		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
		part_stat_unlock();
-
-		hd_struct_put(part);
	}
}

@@ -1324,7 +1319,7 @@ void blk_account_io_start(struct request *rq)
	part_stat_unlock();
}

-static unsigned long __part_start_io_acct(struct hd_struct *part,
+static unsigned long __part_start_io_acct(struct block_device *part,
					  unsigned int sectors, unsigned int op)
{
	const int sgrp = op_stat_group(op);
@@ -1340,7 +1335,7 @@ static unsigned long __part_start_io_acct(struct hd_struct *part,
	return now;
}

-unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
+unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part,
				 struct bio *bio)
{
	*part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
@@ -1352,11 +1347,11 @@ EXPORT_SYMBOL_GPL(part_start_io_acct);
unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
				 unsigned int op)
{
-	return __part_start_io_acct(&disk->part0, sectors, op);
+	return __part_start_io_acct(disk->part0, sectors, op);
}
EXPORT_SYMBOL(disk_start_io_acct);

-static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
+static void __part_end_io_acct(struct block_device *part, unsigned int op,
			       unsigned long start_time)
{
	const int sgrp = op_stat_group(op);
@@ -1370,18 +1365,17 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
	part_stat_unlock();
}

-void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+void part_end_io_acct(struct block_device *part, struct bio *bio,
		      unsigned long start_time)
{
	__part_end_io_acct(part, bio_op(bio), start_time);
-	hd_struct_put(part);
}
EXPORT_SYMBOL_GPL(part_end_io_acct);

void disk_end_io_acct(struct gendisk *disk, unsigned int op,
		      unsigned long start_time)
{
-	__part_end_io_acct(&disk->part0, op, start_time);
+	__part_end_io_acct(disk->part0, op, start_time);
}
EXPORT_SYMBOL(disk_end_io_acct);
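
These are the helpers bio-based drivers wrap around each I/O for diskstats
accounting; a minimal sketch of a 5.11-style submit path (my_submit_bio and
the elided transfer are hypothetical):

static blk_qc_t my_submit_bio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_disk;
	unsigned long start;

	/* accounts against disk->part0 via __part_start_io_acct() */
	start = disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio));

	/* ... perform the transfer ... */

	disk_end_io_acct(disk, bio_op(bio), start);
	bio_endio(bio);
	return BLK_QC_T_NONE;
}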

block/blk-flush.c +26 −6
@@ -69,7 +69,6 @@
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
-#include <linux/lockdep.h>

#include "blk.h"
#include "blk-mq.h"
@@ -139,7 +138,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)

static void blk_account_io_flush(struct request *rq)
{
-	struct hd_struct *part = &rq->rq_disk->part0;
+	struct block_device *part = rq->rq_disk->part0;

	part_stat_lock();
	part_stat_inc(part, ios[STAT_FLUSH]);
@@ -474,9 +473,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
	INIT_LIST_HEAD(&fq->flush_queue[1]);
	INIT_LIST_HEAD(&fq->flush_data_in_flight);

-	lockdep_register_key(&fq->key);
-	lockdep_set_class(&fq->mq_flush_lock, &fq->key);
-
	return fq;

 fail_rq:
@@ -491,7 +487,31 @@ void blk_free_flush_queue(struct blk_flush_queue *fq)
	if (!fq)
		return;

-	lockdep_unregister_key(&fq->key);
	kfree(fq->flush_rq);
	kfree(fq);
}

+/*
+ * Allow driver to set its own lock class to fq->mq_flush_lock for
+ * avoiding lockdep complaint.
+ *
+ * flush_end_io() may be called recursively from some driver, such as
+ * nvme-loop, so lockdep may complain 'possible recursive locking' because
+ * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class
+ * key. We need to assign different lock class for these driver's
+ * fq->mq_flush_lock for avoiding the lockdep warning.
+ *
+ * Use dynamically allocated lock class key for each 'blk_flush_queue'
+ * instance is over-kill, and more worse it introduces horrible boot delay
+ * issue because synchronize_rcu() is implied in lockdep_unregister_key which
+ * is called for each hctx release. SCSI probing may synchronously create and
+ * destroy lots of MQ request_queues for non-existent devices, and some robot
+ * test kernel always enable lockdep option. It is observed that more than half
+ * an hour is taken during SCSI MQ probe with per-fq lock class.
+ */
+void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
+		struct lock_class_key *key)
+{
+	lockdep_set_class(&hctx->fq->mq_flush_lock, key);
+}
+EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
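
Per the comment above, the intended use is one static key per driver; a
sketch along the lines of the nvme-loop patch in this pull (paraphrased,
not verbatim):

static struct lock_class_key loop_hctx_fq_lock_key;

static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			       unsigned int hctx_idx)
{
	/* ... existing hctx setup ... */

	/*
	 * flush_end_io() can recurse on nvme-loop; give this driver's
	 * flush queues their own lock class so lockdep doesn't report
	 * false-positive recursive locking.
	 */
	blk_mq_hctx_set_fq_lock_class(hctx, &loop_hctx_fq_lock_key);
	return 0;
}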
block/blk-iocost.c +155 −132
@@ -39,7 +39,7 @@
 * On top of that, a size cost proportional to the length of the IO is
 * added.  While simple, this model captures the operational
 * characteristics of a wide varienty of devices well enough.  Default
- * paramters for several different classes of devices are provided and the
+ * parameters for several different classes of devices are provided and the
 * parameters can be configured from userspace via
 * /sys/fs/cgroup/io.cost.model.
 *
@@ -77,7 +77,7 @@
 *
 * This constitutes the basis of IO capacity distribution.  Each cgroup's
 * vtime is running at a rate determined by its hweight.  A cgroup tracks
- * the vtime consumed by past IOs and can issue a new IO iff doing so
+ * the vtime consumed by past IOs and can issue a new IO if doing so
 * wouldn't outrun the current device vtime.  Otherwise, the IO is
 * suspended until the vtime has progressed enough to cover it.
 *
@@ -155,7 +155,7 @@
 * Instead of debugfs or other clumsy monitoring mechanisms, this
 * controller uses a drgn based monitoring script -
 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
- * https://github.com/osandov/drgn.  The ouput looks like the following.
+ * https://github.com/osandov/drgn.  The output looks like the following.
 *
 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 *                 active      weight      hweight% inflt% dbt  delay usages%
@@ -370,8 +370,6 @@ enum {
	AUTOP_SSD_FAST,
};

-struct ioc_gq;
-
struct ioc_params {
	u32				qos[NR_QOS_PARAMS];
	u64				i_lcoefs[NR_I_LCOEFS];
@@ -492,7 +490,7 @@ struct ioc_gq {
	/*
	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
	 * issued.  If lagging behind device vtime, the delta represents
-	 * the currently available IO budget.  If runnning ahead, the
+	 * the currently available IO budget.  If running ahead, the
	 * overage.
	 *
	 * `vtime_done` is the same but progressed on completion rather
@@ -973,6 +971,58 @@ done:
	ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
}

+static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
+				  int nr_lagging, int nr_shortages,
+				  int prev_busy_level, u32 *missed_ppm)
+{
+	u64 vrate = ioc->vtime_base_rate;
+	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
+
+	if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
+		if (ioc->busy_level != prev_busy_level || nr_lagging)
+			trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+						   missed_ppm, rq_wait_pct,
+						   nr_lagging, nr_shortages);
+
+		return;
+	}
+
+	/* rq_wait signal is always reliable, ignore user vrate_min */
+	if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
+		vrate_min = VRATE_MIN;
+
+	/*
+	 * If vrate is out of bounds, apply clamp gradually as the
+	 * bounds can change abruptly.  Otherwise, apply busy_level
+	 * based adjustment.
+	 */
+	if (vrate < vrate_min) {
+		vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
+		vrate = min(vrate, vrate_min);
+	} else if (vrate > vrate_max) {
+		vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
+		vrate = max(vrate, vrate_max);
+	} else {
+		int idx = min_t(int, abs(ioc->busy_level),
+				ARRAY_SIZE(vrate_adj_pct) - 1);
+		u32 adj_pct = vrate_adj_pct[idx];
+
+		if (ioc->busy_level > 0)
+			adj_pct = 100 - adj_pct;
+		else
+			adj_pct = 100 + adj_pct;
+
+		vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
+			      vrate_min, vrate_max);
+	}
+
+	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
+				   nr_lagging, nr_shortages);
+
+	ioc->vtime_base_rate = vrate;
+	ioc_refresh_margins(ioc);
+}

/* take a snapshot of the current [v]time and vrate */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
@@ -1046,7 +1096,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,

		/*
		 * The delta between inuse and active sums indicates that
-		 * that much of weight is being given away.  Parent's inuse
+		 * much of weight is being given away.  Parent's inuse
		 * and active should reflect the ratio.
		 */
		if (parent->child_active_sum) {
@@ -2071,40 +2121,21 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
	}
}

-static void ioc_timer_fn(struct timer_list *timer)
+/*
+ * Check the active iocgs' state to avoid oversleeping and deactive
+ * idle iocgs.
+ *
+ * Since waiters determine the sleep durations based on the vrate
+ * they saw at the time of sleep, if vrate has increased, some
+ * waiters could be sleeping for too long. Wake up tardy waiters
+ * which should have woken up in the last period and expire idle
+ * iocgs.
+ */
+static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
{
-	struct ioc *ioc = container_of(timer, struct ioc, timer);
+	int nr_debtors = 0;
	struct ioc_gq *iocg, *tiocg;
-	struct ioc_now now;
-	LIST_HEAD(surpluses);
-	int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0;
-	u64 usage_us_sum = 0;
-	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
-	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
-	u32 missed_ppm[2], rq_wait_pct;
-	u64 period_vtime;
-	int prev_busy_level;

-	/* how were the latencies during the period? */
-	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
-
-	/* take care of active iocgs */
-	spin_lock_irq(&ioc->lock);
-
-	ioc_now(ioc, &now);
-
-	period_vtime = now.vnow - ioc->period_at_vtime;
-	if (WARN_ON_ONCE(!period_vtime)) {
-		spin_unlock_irq(&ioc->lock);
-		return;
-	}
-
-	/*
-	 * Waiters determine the sleep durations based on the vrate they
-	 * saw at the time of sleep.  If vrate has increased, some waiters
-	 * could be sleeping for too long.  Wake up tardy waiters which
-	 * should have woken up in the last period and expire idle iocgs.
-	 */
	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
		    !iocg->delay && !iocg_is_idle(iocg))
@@ -2114,24 +2145,24 @@ static void ioc_timer_fn(struct timer_list *timer)

		/* flush wait and indebt stat deltas */
		if (iocg->wait_since) {
-			iocg->local_stat.wait_us += now.now - iocg->wait_since;
-			iocg->wait_since = now.now;
+			iocg->local_stat.wait_us += now->now - iocg->wait_since;
+			iocg->wait_since = now->now;
		}
		if (iocg->indebt_since) {
			iocg->local_stat.indebt_us +=
-				now.now - iocg->indebt_since;
-			iocg->indebt_since = now.now;
+				now->now - iocg->indebt_since;
+			iocg->indebt_since = now->now;
		}
		if (iocg->indelay_since) {
			iocg->local_stat.indelay_us +=
-				now.now - iocg->indelay_since;
-			iocg->indelay_since = now.now;
+				now->now - iocg->indelay_since;
+			iocg->indelay_since = now->now;
		}

		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
		    iocg->delay) {
			/* might be oversleeping vtime / hweight changes, kick */
-			iocg_kick_waitq(iocg, true, &now);
+			iocg_kick_waitq(iocg, true, now);
			if (iocg->abs_vdebt || iocg->delay)
				nr_debtors++;
		} else if (iocg_is_idle(iocg)) {
@@ -2145,7 +2176,7 @@ static void ioc_timer_fn(struct timer_list *timer)
			 * error and throw away. On reactivation, it'll start
			 * with the target budget.
			 */
-			excess = now.vnow - vtime - ioc->margins.target;
+			excess = now->vnow - vtime - ioc->margins.target;
			if (excess > 0) {
				u32 old_hwi;

@@ -2154,13 +2185,46 @@ static void ioc_timer_fn(struct timer_list *timer)
							    WEIGHT_ONE);
			}

-			__propagate_weights(iocg, 0, 0, false, &now);
+			__propagate_weights(iocg, 0, 0, false, now);
			list_del_init(&iocg->active_list);
		}

		spin_unlock(&iocg->waitq.lock);
	}

	commit_weights(ioc);
+	return nr_debtors;
+}
+
+static void ioc_timer_fn(struct timer_list *timer)
+{
+	struct ioc *ioc = container_of(timer, struct ioc, timer);
+	struct ioc_gq *iocg, *tiocg;
+	struct ioc_now now;
+	LIST_HEAD(surpluses);
+	int nr_debtors, nr_shortages = 0, nr_lagging = 0;
+	u64 usage_us_sum = 0;
+	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+	u32 missed_ppm[2], rq_wait_pct;
+	u64 period_vtime;
+	int prev_busy_level;
+
+	/* how were the latencies during the period? */
+	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+
+	/* take care of active iocgs */
+	spin_lock_irq(&ioc->lock);
+
+	ioc_now(ioc, &now);
+
+	period_vtime = now.vnow - ioc->period_at_vtime;
+	if (WARN_ON_ONCE(!period_vtime)) {
+		spin_unlock_irq(&ioc->lock);
+		return;
+	}
+
+	nr_debtors = ioc_check_iocgs(ioc, &now);

	/*
	 * Wait and indebt stat are flushed above and the donation calculation
@@ -2170,8 +2234,8 @@ static void ioc_timer_fn(struct timer_list *timer)

	/* calc usage and see whether some weights need to be moved around */
	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
-		u64 vdone, vtime, usage_us, usage_dur;
-		u32 usage, hw_active, hw_inuse;
+		u64 vdone, vtime, usage_us;
+		u32 hw_active, hw_inuse;

		/*
		 * Collect unused and wind vtime closer to vnow to prevent
@@ -2202,10 +2266,19 @@ static void ioc_timer_fn(struct timer_list *timer)
		usage_us = iocg->usage_delta_us;
		usage_us_sum += usage_us;

+		/* see whether there's surplus vtime */
+		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
+		if (hw_inuse < hw_active ||
+		    (!waitqueue_active(&iocg->waitq) &&
+		     time_before64(vtime, now.vnow - ioc->margins.low))) {
+			u32 hwa, old_hwi, hwm, new_hwi, usage;
+			u64 usage_dur;
+
			if (vdone != vtime) {
				u64 inflight_us = DIV64_U64_ROUND_UP(
					cost_to_abs_cost(vtime - vdone, hw_inuse),
					ioc->vtime_base_rate);

				usage_us = max(usage_us, inflight_us);
			}

@@ -2220,13 +2293,6 @@ static void ioc_timer_fn(struct timer_list *timer)
						   usage_dur),
				1, WEIGHT_ONE);

-		/* see whether there's surplus vtime */
-		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
-		if (hw_inuse < hw_active ||
-		    (!waitqueue_active(&iocg->waitq) &&
-		     time_before64(vtime, now.vnow - ioc->margins.low))) {
-			u32 hwa, old_hwi, hwm, new_hwi;
-
			/*
			 * Already donating or accumulated enough to start.
			 * Determine the donation amount.
@@ -2309,51 +2375,8 @@ static void ioc_timer_fn(struct timer_list *timer)

	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);

-	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
-		u64 vrate = ioc->vtime_base_rate;
-		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
-
-		/* rq_wait signal is always reliable, ignore user vrate_min */
-		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
-			vrate_min = VRATE_MIN;
-
-		/*
-		 * If vrate is out of bounds, apply clamp gradually as the
-		 * bounds can change abruptly.  Otherwise, apply busy_level
-		 * based adjustment.
-		 */
-		if (vrate < vrate_min) {
-			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
-					  100);
-			vrate = min(vrate, vrate_min);
-		} else if (vrate > vrate_max) {
-			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
-					  100);
-			vrate = max(vrate, vrate_max);
-		} else {
-			int idx = min_t(int, abs(ioc->busy_level),
-					ARRAY_SIZE(vrate_adj_pct) - 1);
-			u32 adj_pct = vrate_adj_pct[idx];
-
-			if (ioc->busy_level > 0)
-				adj_pct = 100 - adj_pct;
-			else
-				adj_pct = 100 + adj_pct;
-
-			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
-				      vrate_min, vrate_max);
-		}
-
-		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
-					   nr_lagging, nr_shortages);
-
-		ioc->vtime_base_rate = vrate;
-		ioc_refresh_margins(ioc);
-	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
-		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
-					   missed_ppm, rq_wait_pct, nr_lagging,
-					   nr_shortages);
-	}
+	ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
+			      prev_busy_level, missed_ppm);

	ioc_refresh_params(ioc, false);

@@ -2400,7 +2423,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
		return cost;

	/*
-	 * We only increase inuse during period and do so iff the margin has
+	 * We only increase inuse during period and do so if the margin has
	 * deteriorated since the previous adjustment.
	 */
	if (margin >= iocg->saved_margin || margin >= margins->low ||
@@ -3120,23 +3143,23 @@ static const match_table_t qos_tokens = {
static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
			     size_t nbytes, loff_t off)
{
-	struct gendisk *disk;
+	struct block_device *bdev;
	struct ioc *ioc;
	u32 qos[NR_QOS_PARAMS];
	bool enable, user;
	char *p;
	int ret;

-	disk = blkcg_conf_get_disk(&input);
-	if (IS_ERR(disk))
-		return PTR_ERR(disk);
+	bdev = blkcg_conf_open_bdev(&input);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);

-	ioc = q_to_ioc(disk->queue);
+	ioc = q_to_ioc(bdev->bd_disk->queue);
	if (!ioc) {
-		ret = blk_iocost_init(disk->queue);
+		ret = blk_iocost_init(bdev->bd_disk->queue);
		if (ret)
			goto err;
-		ioc = q_to_ioc(disk->queue);
+		ioc = q_to_ioc(bdev->bd_disk->queue);
	}

	spin_lock_irq(&ioc->lock);
@@ -3231,12 +3254,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	return nbytes;
einval:
	ret = -EINVAL;
err:
-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	return ret;
}

@@ -3287,23 +3310,23 @@ static const match_table_t i_lcoef_tokens = {
static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
				    size_t nbytes, loff_t off)
{
-	struct gendisk *disk;
+	struct block_device *bdev;
	struct ioc *ioc;
	u64 u[NR_I_LCOEFS];
	bool user;
	char *p;
	int ret;

-	disk = blkcg_conf_get_disk(&input);
-	if (IS_ERR(disk))
-		return PTR_ERR(disk);
+	bdev = blkcg_conf_open_bdev(&input);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);

-	ioc = q_to_ioc(disk->queue);
+	ioc = q_to_ioc(bdev->bd_disk->queue);
	if (!ioc) {
-		ret = blk_iocost_init(disk->queue);
+		ret = blk_iocost_init(bdev->bd_disk->queue);
		if (ret)
			goto err;
-		ioc = q_to_ioc(disk->queue);
+		ioc = q_to_ioc(bdev->bd_disk->queue);
	}

	spin_lock_irq(&ioc->lock);
@@ -3356,13 +3379,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	return nbytes;

einval:
	ret = -EINVAL;
err:
-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	return ret;
}
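
Both write handlers above consume the "MAJ:MIN key=val ..." format parsed by
blkcg_conf_open_bdev(), and reject partitions. A hedged userspace sketch (the
cgroup2 mount point and the 8:0 device are assumptions about a typical setup):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* enable iocost QoS control for whole-device 8:0 */
static int enable_iocost_qos(void)
{
	const char cfg[] = "8:0 enable=1";
	int fd = open("/sys/fs/cgroup/io.cost.qos", O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, cfg, sizeof(cfg) - 1);
	close(fd);
	return ret < 0 ? -1 : 0;
}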
