blk-mq: drain I/O when all CPUs in a hctx are offline (bf0beec0) · Commits · 戴 / test

block/blk-mq-debugfs.c

+2 −0

Original line number	Diff line number	Diff line
		@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
		HCTX_STATE_NAME(STOPPED),
		HCTX_STATE_NAME(TAG_ACTIVE),
		HCTX_STATE_NAME(SCHED_RESTART),
		HCTX_STATE_NAME(INACTIVE),
		};
		#undef HCTX_STATE_NAME

		@@ -239,6 +240,7 @@ static const char *const hctx_flag_name[] = {
		HCTX_FLAG_NAME(TAG_SHARED),
		HCTX_FLAG_NAME(BLOCKING),
		HCTX_FLAG_NAME(NO_SCHED),
		HCTX_FLAG_NAME(STACKING),
		};
		#undef HCTX_FLAG_NAME

block/blk-mq-tag.c

+8 −0

Original line number	Diff line number	Diff line
		@@ -180,6 +180,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
		sbitmap_finish_wait(bt, ws, &wait);

		found_tag:
		/*
		* Give up this allocation if the hctx is inactive. The caller will
		* retry on an active hctx.
		*/
		if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
		return BLK_MQ_NO_TAG;
		}
		return tag + tag_offset;
		}

block/blk-mq.c

+110 −2

Original line number	Diff line number	Diff line
		@@ -375,14 +375,30 @@ static struct request __blk_mq_alloc_request(struct blk_mq_alloc_data data)
		e->type->ops.limit_depth(data->cmd_flags, data);
		}

		retry:
		data->ctx = blk_mq_get_ctx(q);
		data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
		if (!(data->flags & BLK_MQ_REQ_INTERNAL))
		blk_mq_tag_busy(data->hctx);

		/*
		* Waiting allocations only fail because of an inactive hctx. In that
		* case just retry the hctx assignment and tag allocation as CPU hotplug
		* should have migrated us to an online CPU by now.
		*/
		tag = blk_mq_get_tag(data);
		if (tag == BLK_MQ_NO_TAG)
		if (tag == BLK_MQ_NO_TAG) {
		if (data->flags & BLK_MQ_REQ_NOWAIT)
		return NULL;

		/*
		* Give up the CPU and sleep for a random short time to ensure
		* that threads using a realtime scheduling class are migrated
		* off the CPU, and thus off the hctx that is going away.
		*/
		msleep(3);
		goto retry;
		}
		return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
		}

		@@ -2335,6 +2351,86 @@ fail:
		return -ENOMEM;
		}

		/*
		 * Cookie handed to blk_mq_has_request() through blk_mq_all_tag_iter().
		 * Carries the hardware context being probed and the result of the probe.
		 */
		struct rq_iter_data {
		/* hardware context whose requests are being looked for */
		struct blk_mq_hw_ctx *hctx;
		/* set to true by the callback once a request on @hctx is seen */
		bool has_rq;
		};

		static bool blk_mq_has_request(struct request rq, void data, bool reserved)
		{
		struct rq_iter_data *iter_data = data;

		if (rq->mq_hctx != iter_data->hctx)
		return true;
		iter_data->has_rq = true;
		return false;
		}

		static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
		{
		struct blk_mq_tags *tags = hctx->sched_tags ?
		hctx->sched_tags : hctx->tags;
		struct rq_iter_data data = {
		.hctx = hctx,
		};

		blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
		return data.has_rq;
		}

		/*
		 * True only when @cpu is both the first and the last CPU that is set in
		 * @hctx->cpumask and in cpu_online_mask, i.e. the sole online CPU still
		 * mapped to @hctx.
		 */
		static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
							   struct blk_mq_hw_ctx *hctx)
		{
			/*
			 * The first lookup starts before CPU 0, the second one right
			 * after @cpu; a result >= nr_cpu_ids means nothing was found.
			 */
			return cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) == cpu &&
			       cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) >= nr_cpu_ids;
		}

		/*
		 * CPU hotplug callback registered together with blk_mq_hctx_notify_online()
		 * for CPUHP_AP_BLK_MQ_ONLINE.
		 *
		 * @cpu:  CPU the hotplug core is invoking the callback for
		 * @node: the hctx's ->cpuhp_online list node
		 *
		 * When @cpu is the last online CPU mapped to the hctx, mark the hctx
		 * inactive and wait until no request is attached to it any more.
		 * Always returns 0.
		 */
		static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
		{
			struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
					struct blk_mq_hw_ctx, cpuhp_online);

			/* Nothing to do unless @cpu is the only online CPU left in this hctx. */
			if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
			    !blk_mq_last_cpu_in_hctx(cpu, hctx))
				return 0;

			/*
			 * Prevent new request from being allocated on the current hctx.
			 *
			 * The smp_mb__after_atomic() pairs with the implied barrier in
			 * test_and_set_bit_lock in sbitmap_get().  Ensures the inactive flag is
			 * seen once we return from the tag allocator.
			 */
			set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
			smp_mb__after_atomic();

			/*
			 * Try to grab a reference to the queue and wait for any outstanding
			 * requests.  If we could not grab a reference the queue has been
			 * frozen and there are no requests.
			 */
			if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
				/* Poll every 5 ms until the hctx has drained. */
				while (blk_mq_hctx_has_requests(hctx))
					msleep(5);
				percpu_ref_put(&hctx->queue->q_usage_counter);
			}

			return 0;
		}

		/*
		 * CPU hotplug callback registered together with blk_mq_hctx_notify_offline()
		 * for CPUHP_AP_BLK_MQ_ONLINE.
		 *
		 * Clears BLK_MQ_S_INACTIVE on the hctx when @cpu is part of its cpumask.
		 * Always returns 0.
		 */
		static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
		{
			struct blk_mq_hw_ctx *hw_queue;

			hw_queue = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online);

			/* CPUs outside this hctx's mask leave its state untouched. */
			if (!cpumask_test_cpu(cpu, hw_queue->cpumask))
				return 0;

			clear_bit(BLK_MQ_S_INACTIVE, &hw_queue->state);
			return 0;
		}

		/*
		* 'cpu' is going away. splice any existing rq_list entries from this
		* software queue to the hw queue dispatch list, and ensure that it
		@@ -2348,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
		enum hctx_type type;

		hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
		if (!cpumask_test_cpu(cpu, hctx->cpumask))
		return 0;

		ctx = __blk_mq_get_ctx(hctx->queue, cpu);
		type = hctx->type;

		@@ -2371,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)

		/*
		 * Detach @hctx from the CPU hotplug states it was added to.
		 *
		 * The CPUHP_AP_BLK_MQ_ONLINE instance is removed only when the hctx does
		 * not carry BLK_MQ_F_STACKING; the CPUHP_BLK_MQ_DEAD instance is removed
		 * unconditionally.
		 */
		static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
		{
			if (!(hctx->flags & BLK_MQ_F_STACKING)) {
				cpuhp_state_remove_instance_nocalls(
					CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online);
			}

			cpuhp_state_remove_instance_nocalls(
				CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
		}
		@@ -2430,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
		{
		hctx->queue_num = hctx_idx;

		if (!(hctx->flags & BLK_MQ_F_STACKING))
		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
		&hctx->cpuhp_online);
		cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);

		hctx->tags = set->tags[hctx_idx];
		@@ -3684,6 +3789,9 @@ static int __init blk_mq_init(void)
		{
		cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
		blk_mq_hctx_notify_dead);
		cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
		blk_mq_hctx_notify_online,
		blk_mq_hctx_notify_offline);
		return 0;
		}
		subsys_initcall(blk_mq_init);

drivers/block/loop.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -2037,7 +2037,7 @@ static int loop_add(struct loop_device **l, int i)
		lo->tag_set.queue_depth = 128;
		lo->tag_set.numa_node = NUMA_NO_NODE;
		lo->tag_set.cmd_size = sizeof(struct loop_cmd);
		lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
		lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE \| BLK_MQ_F_STACKING;
		lo->tag_set.driver_data = lo;

		err = blk_mq_alloc_tag_set(&lo->tag_set);

drivers/md/dm-rq.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device md, struct dm_table t)
		md->tag_set->ops = &dm_mq_ops;
		md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
		md->tag_set->numa_node = md->numa_node_id;
		md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
		md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE \| BLK_MQ_F_STACKING;
		md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
		md->tag_set->driver_data = md;

Admin message