Commit ac7ac461 authored by Linus Torvalds

Merge tag 'for-5.11/block-2020-12-14' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:
 "Another series of killing more code than what is being added, again
  thanks to Christoph's relentless cleanups and tech debt tackling.

  This contains:

   - blk-iocost improvements (Baolin Wang)

   - part0 iostat fix (Jeffle Xu)

   - Disable iopoll for split bios (Jeffle Xu)

   - block tracepoint cleanups (Christoph Hellwig)

   - Merging of struct block_device and hd_struct (Christoph Hellwig)

   - Rework/cleanup of how block device sizes are updated (Christoph
     Hellwig)

   - Simplification of gendisk lookup and removal of block device
     aliasing (Christoph Hellwig)

   - Block device ioctl cleanups (Christoph Hellwig)

   - Removal of bdget()/blkdev_get() as exported API (Christoph Hellwig)

   - Disk change rework, avoid ->revalidate_disk() (Christoph Hellwig)

   - sbitmap improvements (Pavel Begunkov)

   - Hybrid polling fix (Pavel Begunkov)

   - bvec iteration improvements (Pavel Begunkov)

   - Zone revalidation fixes (Damien Le Moal)

   - blk-throttle limit fix (Yu Kuai)

   - Various little fixes"

* tag 'for-5.11/block-2020-12-14' of git://git.kernel.dk/linux-block: (126 commits)
  blk-mq: fix msec comment from micro to milli seconds
  blk-mq: update arg in comment of blk_mq_map_queue
  blk-mq: add helper allocating tagset->tags
  Revert "block: Fix a lockdep complaint triggered by request queue flushing"
  nvme-loop: use blk_mq_hctx_set_fq_lock_class to set loop's lock class
  blk-mq: add new API of blk_mq_hctx_set_fq_lock_class
  block: disable iopoll for split bio
  block: Improve blk_revalidate_disk_zones() checks
  sbitmap: simplify wrap check
  sbitmap: replace CAS with atomic and
  sbitmap: remove swap_lock
  sbitmap: optimise sbitmap_deferred_clear()
  blk-mq: skip hybrid polling if iopoll doesn't spin
  blk-iocost: Factor out the base vrate change into a separate function
  blk-iocost: Factor out the active iocgs' state check into a separate function
  blk-iocost: Move the usage ratio calculation to the correct place
  blk-iocost: Remove unnecessary advance declaration
  blk-iocost: Fix some typos in comments
  blktrace: fix up a kerneldoc comment
  block: remove the request_queue to argument request based tracepoints
  ...
parents 48aba79b fa94ba8a
block/bio.c +5 −5
@@ -608,12 +608,12 @@ void bio_truncate(struct bio *bio, unsigned new_size)
void guard_bio_eod(struct bio *bio)
{
	sector_t maxsector;
-	struct hd_struct *part;
+	struct block_device *part;

	rcu_read_lock();
	part = __disk_get_part(bio->bi_disk, bio->bi_partno);
	if (part)
-		maxsector = part_nr_sects_read(part);
+		maxsector = bdev_nr_sectors(part);
	else
		maxsector = get_capacity(bio->bi_disk);
	rcu_read_unlock();
@@ -1212,8 +1212,8 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,

		flush_dcache_page(dst_bv.bv_page);

-		bio_advance_iter(src, src_iter, bytes);
-		bio_advance_iter(dst, dst_iter, bytes);
+		bio_advance_iter_single(src, src_iter, bytes);
+		bio_advance_iter_single(dst, dst_iter, bytes);
	}
}
EXPORT_SYMBOL(bio_copy_data_iter);
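
bdev_nr_sectors() above replaces part_nr_sects_read(): with hd_struct gone, a
partition's size is simply its bdev inode size. A sketch of the 5.11-era
helper (shape recalled from include/linux/genhd.h, so treat as approximate):

static inline sector_t bdev_nr_sectors(struct block_device *bdev)
{
	/* device/partition capacity is the backing inode size, in 512B units */
	return i_size_read(bdev->bd_inode) >> 9;
}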
block/blk-cgroup.c +25 −26
@@ -556,22 +556,22 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
}

/**
- * blkg_conf_prep - parse and prepare for per-blkg config update
+ * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @inputp: input string pointer
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
- * from @input and get and return the matching gendisk.  *@inputp is
+ * from @input and get and return the matching bdev.  *@inputp is
 * updated to point past the device node prefix.  Returns an ERR_PTR()
 * value on error.
 *
 * Use this function iff blkg_conf_prep() can't be used for some reason.
 */
-struct gendisk *blkcg_conf_get_disk(char **inputp)
+struct block_device *blkcg_conf_open_bdev(char **inputp)
{
	char *input = *inputp;
	unsigned int major, minor;
-	struct gendisk *disk;
-	int key_len, part;
+	struct block_device *bdev;
+	int key_len;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return ERR_PTR(-EINVAL);
@@ -581,16 +581,16 @@ struct gendisk *blkcg_conf_get_disk(char **inputp)
		return ERR_PTR(-EINVAL);
	input = skip_spaces(input);

-	disk = get_gendisk(MKDEV(major, minor), &part);
-	if (!disk)
+	bdev = blkdev_get_no_open(MKDEV(major, minor));
+	if (!bdev)
		return ERR_PTR(-ENODEV);
-	if (part) {
-		put_disk_and_module(disk);
+	if (bdev_is_partition(bdev)) {
+		blkdev_put_no_open(bdev);
		return ERR_PTR(-ENODEV);
	}

	*inputp = input;
-	return disk;
+	return bdev;
}

/**
@@ -607,18 +607,18 @@ struct gendisk *blkcg_conf_get_disk(char **inputp)
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
-	__acquires(rcu) __acquires(&disk->queue->queue_lock)
+	__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
{
-	struct gendisk *disk;
+	struct block_device *bdev;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

-	disk = blkcg_conf_get_disk(&input);
-	if (IS_ERR(disk))
-		return PTR_ERR(disk);
+	bdev = blkcg_conf_open_bdev(&input);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);

-	q = disk->queue;
+	q = bdev->bd_disk->queue;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);
@@ -689,7 +689,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
			goto success;
	}
success:
-	ctx->disk = disk;
+	ctx->bdev = bdev;
	ctx->blkg = blkg;
	ctx->body = input;
	return 0;
@@ -700,7 +700,7 @@ fail_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
fail:
-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
@@ -723,11 +723,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
-	__releases(&ctx->disk->queue->queue_lock) __releases(rcu)
+	__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
{
-	spin_unlock_irq(&ctx->disk->queue->queue_lock);
+	spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
	rcu_read_unlock();
-	put_disk_and_module(ctx->disk);
+	blkdev_put_no_open(ctx->bdev);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
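
For context, a policy's cgroup-file write handler pairs these two helpers
around its own parsing; a minimal sketch (example_policy and the handler name
are illustrative, not from this series):

static ssize_t example_conf_write(struct kernfs_open_file *of, char *buf,
				  size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	int ret;

	/* parses the "MAJ:MIN" prefix, opens the bdev, takes queue_lock;
	 * on success ctx.bdev, ctx.blkg and ctx.body are valid */
	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
	if (ret)
		return ret;

	/* ... parse ctx.body and update per-blkg state ... */

	blkg_conf_finish(&ctx);		/* unlocks and drops the bdev ref */
	return nbytes;
}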

@@ -820,9 +820,9 @@ static void blkcg_fill_root_iostats(void)

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
-		struct gendisk *disk = dev_to_disk(dev);
-		struct hd_struct *part = disk_get_part(disk, 0);
-		struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
+		struct block_device *bdev = dev_to_bdev(dev);
+		struct blkcg_gq *blkg =
+			blk_queue_root_blkg(bdev->bd_disk->queue);
		struct blkg_iostat tmp;
		int cpu;

@@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void)
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;

-			cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
+			cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
@@ -849,7 +849,6 @@ static void blkcg_fill_root_iostats(void)
			blkg_iostat_set(&blkg->iostat.cur, &tmp);
			u64_stats_update_end(&blkg->iostat.sync);
		}
-		disk_put_part(part);
	}
}

block/blk-core.c +32 −38
@@ -666,9 +666,9 @@ static int __init setup_fail_make_request(char *str)
}
__setup("fail_make_request=", setup_fail_make_request);

-static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
+static bool should_fail_request(struct block_device *part, unsigned int bytes)
{
-	return part->make_it_fail && should_fail(&fail_make_request, bytes);
+	return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
}

static int __init fail_make_request_debugfs(void)
@@ -683,7 +683,7 @@ late_initcall(fail_make_request_debugfs);

#else /* CONFIG_FAIL_MAKE_REQUEST */

-static inline bool should_fail_request(struct hd_struct *part,
+static inline bool should_fail_request(struct block_device *part,
					unsigned int bytes)
{
	return false;
@@ -691,11 +691,11 @@ static inline bool should_fail_request(struct hd_struct *part,

#endif /* CONFIG_FAIL_MAKE_REQUEST */

-static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+static inline bool bio_check_ro(struct bio *bio, struct block_device *part)
{
	const int op = bio_op(bio);

-	if (part->policy && op_is_write(op)) {
+	if (part->bd_read_only && op_is_write(op)) {
		char b[BDEVNAME_SIZE];

		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
@@ -703,7 +703,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)

		WARN_ONCE(1,
		       "Trying to write to read-only block-device %s (partno %d)\n",
-			bio_devname(bio, b), part->partno);
+			bio_devname(bio, b), part->bd_partno);
		/* Older lvm-tools actually trigger this */
		return false;
	}
@@ -713,7 +713,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)

static noinline int should_fail_bio(struct bio *bio)
{
-	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+	if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size))
		return -EIO;
	return 0;
}
@@ -742,7 +742,7 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
 */
static inline int blk_partition_remap(struct bio *bio)
{
-	struct hd_struct *p;
+	struct block_device *p;
	int ret = -EIO;

	rcu_read_lock();
@@ -755,11 +755,12 @@ static inline int blk_partition_remap(struct bio *bio)
		goto out;

	if (bio_sectors(bio)) {
-		if (bio_check_eod(bio, part_nr_sects_read(p)))
+		if (bio_check_eod(bio, bdev_nr_sectors(p)))
			goto out;
-		bio->bi_iter.bi_sector += p->start_sect;
-		trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
-				      bio->bi_iter.bi_sector - p->start_sect);
+		bio->bi_iter.bi_sector += p->bd_start_sect;
+		trace_block_bio_remap(bio, p->bd_dev,
+				      bio->bi_iter.bi_sector -
+				      p->bd_start_sect);
	}
	bio->bi_partno = 0;
	ret = 0;
@@ -829,7 +830,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
		if (unlikely(blk_partition_remap(bio)))
			goto end_io;
	} else {
-		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+		if (unlikely(bio_check_ro(bio, bio->bi_disk->part0)))
			goto end_io;
		if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
			goto end_io;
@@ -906,7 +907,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
	blkcg_bio_issue_init(bio);

	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
-		trace_block_bio_queue(q, bio);
+		trace_block_bio_queue(bio);
		/* Now that enqueuing has been traced, we need to trace
		 * completion as well.
		 */
@@ -1201,7 +1202,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
		return ret;

	if (rq->rq_disk &&
-	    should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
+	    should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq)))
		return BLK_STS_IOERR;

	if (blk_crypto_insert_cloned_request(rq))
@@ -1260,17 +1261,18 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);

-static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
+static void update_io_ticks(struct block_device *part, unsigned long now,
+		bool end)
{
	unsigned long stamp;
again:
-	stamp = READ_ONCE(part->stamp);
+	stamp = READ_ONCE(part->bd_stamp);
	if (unlikely(stamp != now)) {
-		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
+		if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp))
			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
	}
-	if (part->partno) {
-		part = &part_to_disk(part)->part0;
+	if (part->bd_partno) {
+		part = bdev_whole(part);
		goto again;
	}
}
@@ -1279,11 +1281,9 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
	if (req->part && blk_do_io_stat(req)) {
		const int sgrp = op_stat_group(req_op(req));
-		struct hd_struct *part;

		part_stat_lock();
-		part = req->part;
-		part_stat_add(part, sectors[sgrp], bytes >> 9);
+		part_stat_add(req->part, sectors[sgrp], bytes >> 9);
		part_stat_unlock();
	}
}
@@ -1298,17 +1298,12 @@ void blk_account_io_done(struct request *req, u64 now)
	if (req->part && blk_do_io_stat(req) &&
	    !(req->rq_flags & RQF_FLUSH_SEQ)) {
		const int sgrp = op_stat_group(req_op(req));
-		struct hd_struct *part;

		part_stat_lock();
-		part = req->part;
-
-		update_io_ticks(part, jiffies, true);
-		part_stat_inc(part, ios[sgrp]);
-		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
+		update_io_ticks(req->part, jiffies, true);
+		part_stat_inc(req->part, ios[sgrp]);
+		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
		part_stat_unlock();
-
-		hd_struct_put(part);
	}
}

@@ -1324,7 +1319,7 @@ void blk_account_io_start(struct request *rq)
	part_stat_unlock();
}

-static unsigned long __part_start_io_acct(struct hd_struct *part,
+static unsigned long __part_start_io_acct(struct block_device *part,
					  unsigned int sectors, unsigned int op)
{
	const int sgrp = op_stat_group(op);
@@ -1340,7 +1335,7 @@ static unsigned long __part_start_io_acct(struct hd_struct *part,
	return now;
}

-unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
+unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part,
				 struct bio *bio)
{
	*part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
@@ -1352,11 +1347,11 @@ EXPORT_SYMBOL_GPL(part_start_io_acct);
unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
				 unsigned int op)
{
-	return __part_start_io_acct(&disk->part0, sectors, op);
+	return __part_start_io_acct(disk->part0, sectors, op);
}
EXPORT_SYMBOL(disk_start_io_acct);

-static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
+static void __part_end_io_acct(struct block_device *part, unsigned int op,
			       unsigned long start_time)
{
	const int sgrp = op_stat_group(op);
@@ -1370,18 +1365,17 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
	part_stat_unlock();
}

-void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+void part_end_io_acct(struct block_device *part, struct bio *bio,
		      unsigned long start_time)
{
	__part_end_io_acct(part, bio_op(bio), start_time);
-	hd_struct_put(part);
}
EXPORT_SYMBOL_GPL(part_end_io_acct);

void disk_end_io_acct(struct gendisk *disk, unsigned int op,
		      unsigned long start_time)
{
-	__part_end_io_acct(&disk->part0, op, start_time);
+	__part_end_io_acct(disk->part0, op, start_time);
}
EXPORT_SYMBOL(disk_end_io_acct);
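
These are the helpers bio-based drivers wrap around each I/O for diskstats
accounting; a minimal sketch of a 5.11-style submit path (my_submit_bio and
the elided transfer are hypothetical):

static blk_qc_t my_submit_bio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_disk;
	unsigned long start;

	/* accounts against disk->part0 via __part_start_io_acct() */
	start = disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio));

	/* ... perform the transfer ... */

	disk_end_io_acct(disk, bio_op(bio), start);
	bio_endio(bio);
	return BLK_QC_T_NONE;
}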

block/blk-flush.c +26 −6
@@ -69,7 +69,6 @@
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
-#include <linux/lockdep.h>

#include "blk.h"
#include "blk-mq.h"
@@ -139,7 +138,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)

static void blk_account_io_flush(struct request *rq)
{
-	struct hd_struct *part = &rq->rq_disk->part0;
+	struct block_device *part = rq->rq_disk->part0;

	part_stat_lock();
	part_stat_inc(part, ios[STAT_FLUSH]);
@@ -474,9 +473,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
	INIT_LIST_HEAD(&fq->flush_queue[1]);
	INIT_LIST_HEAD(&fq->flush_data_in_flight);

-	lockdep_register_key(&fq->key);
-	lockdep_set_class(&fq->mq_flush_lock, &fq->key);
-
	return fq;

 fail_rq:
@@ -491,7 +487,31 @@ void blk_free_flush_queue(struct blk_flush_queue *fq)
	if (!fq)
		return;

-	lockdep_unregister_key(&fq->key);
	kfree(fq->flush_rq);
	kfree(fq);
}

+/*
+ * Allow driver to set its own lock class to fq->mq_flush_lock for
+ * avoiding lockdep complaint.
+ *
+ * flush_end_io() may be called recursively from some driver, such as
+ * nvme-loop, so lockdep may complain 'possible recursive locking' because
+ * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class
+ * key. We need to assign different lock class for these driver's
+ * fq->mq_flush_lock for avoiding the lockdep warning.
+ *
+ * Use dynamically allocated lock class key for each 'blk_flush_queue'
+ * instance is over-kill, and more worse it introduces horrible boot delay
+ * issue because synchronize_rcu() is implied in lockdep_unregister_key which
+ * is called for each hctx release. SCSI probing may synchronously create and
+ * destroy lots of MQ request_queues for non-existent devices, and some robot
+ * test kernel always enable lockdep option. It is observed that more than half
+ * an hour is taken during SCSI MQ probe with per-fq lock class.
+ */
+void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
+		struct lock_class_key *key)
+{
+	lockdep_set_class(&hctx->fq->mq_flush_lock, key);
+}
+EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
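
Per the comment above, the intended use is one static key per driver; a
sketch along the lines of the nvme-loop patch in this pull (paraphrased,
not verbatim):

static struct lock_class_key loop_hctx_fq_lock_key;

static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			       unsigned int hctx_idx)
{
	/* ... existing hctx setup ... */

	/*
	 * flush_end_io() can recurse on nvme-loop; give this driver's
	 * flush queues their own lock class so lockdep doesn't report
	 * false-positive recursive locking.
	 */
	blk_mq_hctx_set_fq_lock_class(hctx, &loop_hctx_fq_lock_key);
	return 0;
}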
block/blk-iocost.c +155 −132
@@ -39,7 +39,7 @@
 * On top of that, a size cost proportional to the length of the IO is
 * added.  While simple, this model captures the operational
 * characteristics of a wide varienty of devices well enough.  Default
- * paramters for several different classes of devices are provided and the
+ * parameters for several different classes of devices are provided and the
 * parameters can be configured from userspace via
 * /sys/fs/cgroup/io.cost.model.
 *
@@ -77,7 +77,7 @@
 *
 * This constitutes the basis of IO capacity distribution.  Each cgroup's
 * vtime is running at a rate determined by its hweight.  A cgroup tracks
- * the vtime consumed by past IOs and can issue a new IO iff doing so
+ * the vtime consumed by past IOs and can issue a new IO if doing so
 * wouldn't outrun the current device vtime.  Otherwise, the IO is
 * suspended until the vtime has progressed enough to cover it.
 *
@@ -155,7 +155,7 @@
 * Instead of debugfs or other clumsy monitoring mechanisms, this
 * controller uses a drgn based monitoring script -
 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
- * https://github.com/osandov/drgn.  The ouput looks like the following.
+ * https://github.com/osandov/drgn.  The output looks like the following.
 *
 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 *                 active      weight      hweight% inflt% dbt  delay usages%
@@ -370,8 +370,6 @@ enum {
	AUTOP_SSD_FAST,
};

-struct ioc_gq;
-
struct ioc_params {
	u32				qos[NR_QOS_PARAMS];
	u64				i_lcoefs[NR_I_LCOEFS];
@@ -492,7 +490,7 @@ struct ioc_gq {
	/*
	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
	 * issued.  If lagging behind device vtime, the delta represents
-	 * the currently available IO budget.  If runnning ahead, the
+	 * the currently available IO budget.  If running ahead, the
	 * overage.
	 *
	 * `vtime_done` is the same but progressed on completion rather
@@ -973,6 +971,58 @@ done:
	ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
}

+static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
+				  int nr_lagging, int nr_shortages,
+				  int prev_busy_level, u32 *missed_ppm)
+{
+	u64 vrate = ioc->vtime_base_rate;
+	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
+
+	if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
+		if (ioc->busy_level != prev_busy_level || nr_lagging)
+			trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+						   missed_ppm, rq_wait_pct,
+						   nr_lagging, nr_shortages);
+
+		return;
+	}
+
+	/* rq_wait signal is always reliable, ignore user vrate_min */
+	if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
+		vrate_min = VRATE_MIN;
+
+	/*
+	 * If vrate is out of bounds, apply clamp gradually as the
+	 * bounds can change abruptly.  Otherwise, apply busy_level
+	 * based adjustment.
+	 */
+	if (vrate < vrate_min) {
+		vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
+		vrate = min(vrate, vrate_min);
+	} else if (vrate > vrate_max) {
+		vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
+		vrate = max(vrate, vrate_max);
+	} else {
+		int idx = min_t(int, abs(ioc->busy_level),
+				ARRAY_SIZE(vrate_adj_pct) - 1);
+		u32 adj_pct = vrate_adj_pct[idx];
+
+		if (ioc->busy_level > 0)
+			adj_pct = 100 - adj_pct;
+		else
+			adj_pct = 100 + adj_pct;
+
+		vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
+			      vrate_min, vrate_max);
+	}
+
+	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
+				   nr_lagging, nr_shortages);
+
+	ioc->vtime_base_rate = vrate;
+	ioc_refresh_margins(ioc);
+}

/* take a snapshot of the current [v]time and vrate */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
@@ -1046,7 +1096,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,

		/*
		 * The delta between inuse and active sums indicates that
-		 * that much of weight is being given away.  Parent's inuse
+		 * much of weight is being given away.  Parent's inuse
		 * and active should reflect the ratio.
		 */
		if (parent->child_active_sum) {
@@ -2071,40 +2121,21 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
	}
}

-static void ioc_timer_fn(struct timer_list *timer)
+/*
+ * Check the active iocgs' state to avoid oversleeping and deactive
+ * idle iocgs.
+ *
+ * Since waiters determine the sleep durations based on the vrate
+ * they saw at the time of sleep, if vrate has increased, some
+ * waiters could be sleeping for too long. Wake up tardy waiters
+ * which should have woken up in the last period and expire idle
+ * iocgs.
+ */
+static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
{
-	struct ioc *ioc = container_of(timer, struct ioc, timer);
+	int nr_debtors = 0;
	struct ioc_gq *iocg, *tiocg;
-	struct ioc_now now;
-	LIST_HEAD(surpluses);
-	int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0;
-	u64 usage_us_sum = 0;
-	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
-	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
-	u32 missed_ppm[2], rq_wait_pct;
-	u64 period_vtime;
-	int prev_busy_level;

-	/* how were the latencies during the period? */
-	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
-
-	/* take care of active iocgs */
-	spin_lock_irq(&ioc->lock);
-
-	ioc_now(ioc, &now);
-
-	period_vtime = now.vnow - ioc->period_at_vtime;
-	if (WARN_ON_ONCE(!period_vtime)) {
-		spin_unlock_irq(&ioc->lock);
-		return;
-	}
-
-	/*
-	 * Waiters determine the sleep durations based on the vrate they
-	 * saw at the time of sleep.  If vrate has increased, some waiters
-	 * could be sleeping for too long.  Wake up tardy waiters which
-	 * should have woken up in the last period and expire idle iocgs.
-	 */
	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
		    !iocg->delay && !iocg_is_idle(iocg))
@@ -2114,24 +2145,24 @@ static void ioc_timer_fn(struct timer_list *timer)

		/* flush wait and indebt stat deltas */
		if (iocg->wait_since) {
-			iocg->local_stat.wait_us += now.now - iocg->wait_since;
-			iocg->wait_since = now.now;
+			iocg->local_stat.wait_us += now->now - iocg->wait_since;
+			iocg->wait_since = now->now;
		}
		if (iocg->indebt_since) {
			iocg->local_stat.indebt_us +=
-				now.now - iocg->indebt_since;
-			iocg->indebt_since = now.now;
+				now->now - iocg->indebt_since;
+			iocg->indebt_since = now->now;
		}
		if (iocg->indelay_since) {
			iocg->local_stat.indelay_us +=
-				now.now - iocg->indelay_since;
-			iocg->indelay_since = now.now;
+				now->now - iocg->indelay_since;
+			iocg->indelay_since = now->now;
		}

		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
		    iocg->delay) {
			/* might be oversleeping vtime / hweight changes, kick */
-			iocg_kick_waitq(iocg, true, &now);
+			iocg_kick_waitq(iocg, true, now);
			if (iocg->abs_vdebt || iocg->delay)
				nr_debtors++;
		} else if (iocg_is_idle(iocg)) {
@@ -2145,7 +2176,7 @@ static void ioc_timer_fn(struct timer_list *timer)
			 * error and throw away. On reactivation, it'll start
			 * with the target budget.
			 */
-			excess = now.vnow - vtime - ioc->margins.target;
+			excess = now->vnow - vtime - ioc->margins.target;
			if (excess > 0) {
				u32 old_hwi;

@@ -2154,13 +2185,46 @@ static void ioc_timer_fn(struct timer_list *timer)
							    WEIGHT_ONE);
			}

-			__propagate_weights(iocg, 0, 0, false, &now);
+			__propagate_weights(iocg, 0, 0, false, now);
			list_del_init(&iocg->active_list);
		}

		spin_unlock(&iocg->waitq.lock);
	}

	commit_weights(ioc);
+	return nr_debtors;
+}
+
+static void ioc_timer_fn(struct timer_list *timer)
+{
+	struct ioc *ioc = container_of(timer, struct ioc, timer);
+	struct ioc_gq *iocg, *tiocg;
+	struct ioc_now now;
+	LIST_HEAD(surpluses);
+	int nr_debtors, nr_shortages = 0, nr_lagging = 0;
+	u64 usage_us_sum = 0;
+	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+	u32 missed_ppm[2], rq_wait_pct;
+	u64 period_vtime;
+	int prev_busy_level;
+
+	/* how were the latencies during the period? */
+	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+
+	/* take care of active iocgs */
+	spin_lock_irq(&ioc->lock);
+
+	ioc_now(ioc, &now);
+
+	period_vtime = now.vnow - ioc->period_at_vtime;
+	if (WARN_ON_ONCE(!period_vtime)) {
+		spin_unlock_irq(&ioc->lock);
+		return;
+	}
+
+	nr_debtors = ioc_check_iocgs(ioc, &now);

	/*
	 * Wait and indebt stat are flushed above and the donation calculation
@@ -2170,8 +2234,8 @@ static void ioc_timer_fn(struct timer_list *timer)

	/* calc usage and see whether some weights need to be moved around */
	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
-		u64 vdone, vtime, usage_us, usage_dur;
-		u32 usage, hw_active, hw_inuse;
+		u64 vdone, vtime, usage_us;
+		u32 hw_active, hw_inuse;

		/*
		 * Collect unused and wind vtime closer to vnow to prevent
@@ -2202,10 +2266,19 @@ static void ioc_timer_fn(struct timer_list *timer)
		usage_us = iocg->usage_delta_us;
		usage_us_sum += usage_us;

+		/* see whether there's surplus vtime */
+		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
+		if (hw_inuse < hw_active ||
+		    (!waitqueue_active(&iocg->waitq) &&
+		     time_before64(vtime, now.vnow - ioc->margins.low))) {
+			u32 hwa, old_hwi, hwm, new_hwi, usage;
+			u64 usage_dur;
+
			if (vdone != vtime) {
				u64 inflight_us = DIV64_U64_ROUND_UP(
					cost_to_abs_cost(vtime - vdone, hw_inuse),
					ioc->vtime_base_rate);

				usage_us = max(usage_us, inflight_us);
			}

@@ -2220,13 +2293,6 @@ static void ioc_timer_fn(struct timer_list *timer)
						   usage_dur),
				1, WEIGHT_ONE);

-		/* see whether there's surplus vtime */
-		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
-		if (hw_inuse < hw_active ||
-		    (!waitqueue_active(&iocg->waitq) &&
-		     time_before64(vtime, now.vnow - ioc->margins.low))) {
-			u32 hwa, old_hwi, hwm, new_hwi;
-
			/*
			 * Already donating or accumulated enough to start.
			 * Determine the donation amount.
@@ -2309,51 +2375,8 @@ static void ioc_timer_fn(struct timer_list *timer)

	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);

-	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
-		u64 vrate = ioc->vtime_base_rate;
-		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
-
-		/* rq_wait signal is always reliable, ignore user vrate_min */
-		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
-			vrate_min = VRATE_MIN;
-
-		/*
-		 * If vrate is out of bounds, apply clamp gradually as the
-		 * bounds can change abruptly.  Otherwise, apply busy_level
-		 * based adjustment.
-		 */
-		if (vrate < vrate_min) {
-			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
-					  100);
-			vrate = min(vrate, vrate_min);
-		} else if (vrate > vrate_max) {
-			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
-					  100);
-			vrate = max(vrate, vrate_max);
-		} else {
-			int idx = min_t(int, abs(ioc->busy_level),
-					ARRAY_SIZE(vrate_adj_pct) - 1);
-			u32 adj_pct = vrate_adj_pct[idx];
-
-			if (ioc->busy_level > 0)
-				adj_pct = 100 - adj_pct;
-			else
-				adj_pct = 100 + adj_pct;
-
-			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
-				      vrate_min, vrate_max);
-		}
-
-		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
-					   nr_lagging, nr_shortages);
-
-		ioc->vtime_base_rate = vrate;
-		ioc_refresh_margins(ioc);
-	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
-		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
-					   missed_ppm, rq_wait_pct, nr_lagging,
-					   nr_shortages);
-	}
+	ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
+			      prev_busy_level, missed_ppm);

	ioc_refresh_params(ioc, false);

@@ -2400,7 +2423,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
		return cost;

	/*
-	 * We only increase inuse during period and do so iff the margin has
+	 * We only increase inuse during period and do so if the margin has
	 * deteriorated since the previous adjustment.
	 */
	if (margin >= iocg->saved_margin || margin >= margins->low ||
@@ -3120,23 +3143,23 @@ static const match_table_t qos_tokens = {
static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
			     size_t nbytes, loff_t off)
{
-	struct gendisk *disk;
+	struct block_device *bdev;
	struct ioc *ioc;
	u32 qos[NR_QOS_PARAMS];
	bool enable, user;
	char *p;
	int ret;

-	disk = blkcg_conf_get_disk(&input);
-	if (IS_ERR(disk))
-		return PTR_ERR(disk);
+	bdev = blkcg_conf_open_bdev(&input);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);

-	ioc = q_to_ioc(disk->queue);
+	ioc = q_to_ioc(bdev->bd_disk->queue);
	if (!ioc) {
-		ret = blk_iocost_init(disk->queue);
+		ret = blk_iocost_init(bdev->bd_disk->queue);
		if (ret)
			goto err;
-		ioc = q_to_ioc(disk->queue);
+		ioc = q_to_ioc(bdev->bd_disk->queue);
	}

	spin_lock_irq(&ioc->lock);
@@ -3231,12 +3254,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	return nbytes;
einval:
	ret = -EINVAL;
err:
-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	return ret;
}

@@ -3287,23 +3310,23 @@ static const match_table_t i_lcoef_tokens = {
static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
				    size_t nbytes, loff_t off)
{
-	struct gendisk *disk;
+	struct block_device *bdev;
	struct ioc *ioc;
	u64 u[NR_I_LCOEFS];
	bool user;
	char *p;
	int ret;

-	disk = blkcg_conf_get_disk(&input);
-	if (IS_ERR(disk))
-		return PTR_ERR(disk);
+	bdev = blkcg_conf_open_bdev(&input);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);

-	ioc = q_to_ioc(disk->queue);
+	ioc = q_to_ioc(bdev->bd_disk->queue);
	if (!ioc) {
-		ret = blk_iocost_init(disk->queue);
+		ret = blk_iocost_init(bdev->bd_disk->queue);
		if (ret)
			goto err;
-		ioc = q_to_ioc(disk->queue);
+		ioc = q_to_ioc(bdev->bd_disk->queue);
	}

	spin_lock_irq(&ioc->lock);
@@ -3356,13 +3379,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	return nbytes;

einval:
	ret = -EINVAL;
err:
-	put_disk_and_module(disk);
+	blkdev_put_no_open(bdev);
	return ret;
}
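
Both write handlers above consume the "MAJ:MIN key=val ..." format parsed by
blkcg_conf_open_bdev(), and reject partitions. A hedged userspace sketch (the
cgroup2 mount point and the 8:0 device are assumptions about a typical setup):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* enable iocost QoS control for whole-device 8:0 */
static int enable_iocost_qos(void)
{
	const char cfg[] = "8:0 enable=1";
	int fd = open("/sys/fs/cgroup/io.cost.qos", O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, cfg, sizeof(cfg) - 1);
	close(fd);
	return ret < 0 ? -1 : 0;
}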
