Commit 84597a44 authored by Mikulas Patocka's avatar Mikulas Patocka Committed by Mike Snitzer
Browse files

dm integrity: add optional discard support



Add an argument "allow_discards" that enables discard processing on
dm-integrity device. Discards are only allowed to devices using
internal hash.

When a block is discarded the integrity tag is filled with
DISCARD_FILLER (0xf6) bytes.

Signed-off-by: default avatarMikulas Patocka <mpatocka@redhat.com>
Signed-off-by: default avatarMike Snitzer <snitzer@redhat.com>
parent 1ac2c15a
Loading
Loading
Loading
Loading
+145 −32
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@
#define RECALC_WRITE_SUPER		16
#define BITMAP_BLOCK_SIZE		4096	/* don't change it */
#define BITMAP_FLUSH_INTERVAL		(10 * HZ)
#define DISCARD_FILLER			0xf6

/*
 * Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -257,6 +258,7 @@ struct dm_integrity_c {
	bool just_formatted;
	bool recalculate_flag;
	bool fix_padding;
	bool discard;

	struct alg_spec internal_hash_alg;
	struct alg_spec journal_crypt_alg;
@@ -284,7 +286,7 @@ struct dm_integrity_io {
	struct work_struct work;

	struct dm_integrity_c *ic;
	bool write;
	enum req_opf op;
	bool fua;

	struct dm_integrity_range range;
@@ -1299,6 +1301,11 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
			       unsigned *metadata_offset, unsigned total_size, int op)
{
#define MAY_BE_FILLER		1
#define MAY_BE_HASH		2
	unsigned hash_offset = 0;
	unsigned may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);

	do {
		unsigned char *data, *dp;
		struct dm_buffer *b;
@@ -1322,16 +1329,33 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
			dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
		} else {
			/* e.g.: op == TAG_CMP */
			if (unlikely(memcmp(dp, tag, to_copy))) {
				unsigned i;

				for (i = 0; i < to_copy; i++) {
					if (dp[i] != tag[i])
						break;
					total_size--;
			if (likely(is_power_of_2(ic->tag_size))) {
				if (unlikely(memcmp(dp, tag, to_copy)))
					if (unlikely(!ic->discard) ||
					    unlikely(!memchr_inv(dp, DISCARD_FILLER, to_copy))) {
						goto thorough_test;
				}
			} else {
				unsigned i, ts;
thorough_test:
				ts = total_size;

				for (i = 0; i < to_copy; i++, ts--) {
					if (unlikely(dp[i] != tag[i]))
						may_be &= ~MAY_BE_HASH;
					if (likely(dp[i] != DISCARD_FILLER))
						may_be &= ~MAY_BE_FILLER;
					hash_offset++;
					if (unlikely(hash_offset == ic->tag_size)) {
						if (unlikely(!may_be)) {
							dm_bufio_release(b);
				return total_size;
							return ts;
						}
						hash_offset = 0;
						may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
					}
				}
			}
		}
		dm_bufio_release(b);
@@ -1342,10 +1366,17 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
			(*metadata_block)++;
			*metadata_offset = 0;
		}

		if (unlikely(!is_power_of_2(ic->tag_size))) {
			hash_offset = (hash_offset + to_copy) % ic->tag_size;
		}

		total_size -= to_copy;
	} while (unlikely(total_size));

	return 0;
#undef MAY_BE_FILLER
#undef MAY_BE_HASH
}

static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
@@ -1428,7 +1459,7 @@ static void dec_in_flight(struct dm_integrity_io *dio)

		remove_range(ic, &dio->range);

		if (unlikely(dio->write))
		if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))
			schedule_autocommit(ic);

		bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
@@ -1520,14 +1551,19 @@ static void integrity_metadata(struct work_struct *w)
		char *checksums;
		unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
		char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
		unsigned sectors_to_process = dio->range.n_sectors;
		sector_t sector = dio->range.logical_sector;
		sector_t sector;
		unsigned sectors_to_process;
		sector_t save_metadata_block;
		unsigned save_metadata_offset;

		if (unlikely(ic->mode == 'R'))
			goto skip_io;

		if (likely(dio->op != REQ_OP_DISCARD))
			checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
					    GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
		else
			checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
		if (!checksums) {
			checksums = checksums_onstack;
			if (WARN_ON(extra_space &&
@@ -1537,6 +1573,43 @@ static void integrity_metadata(struct work_struct *w)
			}
		}

		if (unlikely(dio->op == REQ_OP_DISCARD)) {
			sector_t bi_sector = dio->bio_details.bi_iter.bi_sector;
			unsigned bi_size = dio->bio_details.bi_iter.bi_size;
			unsigned max_size = likely(checksums != checksums_onstack) ? PAGE_SIZE : HASH_MAX_DIGESTSIZE;
			unsigned max_blocks = max_size / ic->tag_size;
			memset(checksums, DISCARD_FILLER, max_size);

			while (bi_size) {
				unsigned this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
				this_step_blocks = min(this_step_blocks, max_blocks);
				r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
							this_step_blocks * ic->tag_size, TAG_WRITE);
				if (unlikely(r)) {
					if (likely(checksums != checksums_onstack))
						kfree(checksums);
					goto error;
				}

				/*if (bi_size < this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block)) {
					printk("BUGG: bi_sector: %llx, bi_size: %u\n", bi_sector, bi_size);
					printk("BUGG: this_step_blocks: %u\n", this_step_blocks);
					BUG();
				}*/
				bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
				bi_sector += this_step_blocks << ic->sb->log2_sectors_per_block;
			}

			if (likely(checksums != checksums_onstack))
				kfree(checksums);
			goto skip_io;
		}

		save_metadata_block = dio->metadata_block;
		save_metadata_offset = dio->metadata_offset;
		sector = dio->range.logical_sector;
		sectors_to_process = dio->range.n_sectors;

		__bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
			unsigned pos;
			char *mem, *checksums_ptr;
@@ -1555,7 +1628,7 @@ again:
			kunmap_atomic(mem);

			r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
						checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
						checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
			if (unlikely(r)) {
				if (r > 0) {
					char b[BDEVNAME_SIZE];
@@ -1599,7 +1672,7 @@ again:
				tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
				this_len = min(biv.bv_len, data_to_process);
				r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
							this_len, !dio->write ? TAG_READ : TAG_WRITE);
							this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE);
				if (unlikely(r))
					goto error;
				data_to_process -= this_len;
@@ -1626,6 +1699,20 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)

	dio->ic = ic;
	dio->bi_status = 0;
	dio->op = bio_op(bio);

	if (unlikely(dio->op == REQ_OP_DISCARD)) {
		if (ti->max_io_len) {
			sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector);
			unsigned log2_max_io_len = __fls(ti->max_io_len);
			sector_t start_boundary = sec >> log2_max_io_len;
			sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len;
			if (start_boundary < end_boundary) {
				sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1));
				dm_accept_partial_bio(bio, len);
			}
		}
	}

	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
		submit_flush_bio(ic, dio);
@@ -1633,8 +1720,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
	}

	dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
	dio->write = bio_op(bio) == REQ_OP_WRITE;
	dio->fua = dio->write && bio->bi_opf & REQ_FUA;
	dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA;
	if (unlikely(dio->fua)) {
		/*
		 * Don't pass down the FUA flag because we have to flush
@@ -1655,7 +1741,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
		return DM_MAPIO_KILL;
	}

	if (ic->sectors_per_block > 1) {
	if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) {
		struct bvec_iter iter;
		struct bio_vec bv;
		bio_for_each_segment(bv, bio, iter) {
@@ -1688,7 +1774,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
		}
	}

	if (unlikely(ic->mode == 'R') && unlikely(dio->write))
	if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ))
		return DM_MAPIO_KILL;

	get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
@@ -1718,13 +1804,13 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
		bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
retry_kmap:
		mem = kmap_atomic(bv.bv_page);
		if (likely(dio->write))
		if (likely(dio->op == REQ_OP_WRITE))
			flush_dcache_page(bv.bv_page);

		do {
			struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);

			if (unlikely(!dio->write)) {
			if (unlikely(dio->op == REQ_OP_READ)) {
				struct journal_sector *js;
				char *mem_ptr;
				unsigned s;
@@ -1771,7 +1857,7 @@ retry_kmap:
					char *tag_addr;
					BUG_ON(PageHighMem(biv.bv_page));
					tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
					if (likely(dio->write))
					if (likely(dio->op == REQ_OP_WRITE))
						memcpy(tag_ptr, tag_addr, tag_now);
					else
						memcpy(tag_addr, tag_ptr, tag_now);
@@ -1779,12 +1865,12 @@ retry_kmap:
					tag_ptr += tag_now;
					tag_todo -= tag_now;
				} while (unlikely(tag_todo)); else {
					if (likely(dio->write))
					if (likely(dio->op == REQ_OP_WRITE))
						memset(tag_ptr, 0, tag_todo);
				}
			}

			if (likely(dio->write)) {
			if (likely(dio->op == REQ_OP_WRITE)) {
				struct journal_sector *js;
				unsigned s;

@@ -1820,12 +1906,12 @@ retry_kmap:
			bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
		} while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);

		if (unlikely(!dio->write))
		if (unlikely(dio->op == REQ_OP_READ))
			flush_dcache_page(bv.bv_page);
		kunmap_atomic(mem);
	} while (n_sectors);

	if (likely(dio->write)) {
	if (likely(dio->op == REQ_OP_WRITE)) {
		smp_mb();
		if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
			wake_up(&ic->copy_to_journal_wait);
@@ -1857,7 +1943,9 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
	unsigned journal_section, journal_entry;
	unsigned journal_read_pos;
	struct completion read_comp;
	bool need_sync_io = ic->internal_hash && !dio->write;
	bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ;
	if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D')
		need_sync_io = true;

	if (need_sync_io && from_map) {
		INIT_WORK(&dio->work, integrity_bio_wait);
@@ -1875,8 +1963,8 @@ retry:
	}
	dio->range.n_sectors = bio_sectors(bio);
	journal_read_pos = NOT_FOUND;
	if (likely(ic->mode == 'J')) {
		if (dio->write) {
	if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) {
		if (dio->op == REQ_OP_WRITE) {
			unsigned next_entry, i, pos;
			unsigned ws, we, range_sectors;

@@ -1979,7 +2067,7 @@ offload_to_thread:
		goto journal_read_write;
	}

	if (ic->mode == 'B' && dio->write) {
	if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) {
		if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
				     dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
			struct bitmap_block_status *bbs;
@@ -2008,6 +2096,18 @@ offload_to_thread:
	bio->bi_end_io = integrity_end_io;
	bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;

	if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) {
		integrity_metadata(&dio->work);
		dm_integrity_flush_buffers(ic);

		dio->in_flight = (atomic_t)ATOMIC_INIT(1);
		dio->completion = NULL;

		generic_make_request(bio);

		return;
	}

	generic_make_request(bio);

	if (need_sync_io) {
@@ -2969,6 +3069,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
		arg_count += !!ic->meta_dev;
		arg_count += ic->sectors_per_block != 1;
		arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
		arg_count += ic->discard;
		arg_count += ic->mode == 'J';
		arg_count += ic->mode == 'J';
		arg_count += ic->mode == 'B';
@@ -2985,6 +3086,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
			DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
			DMEMIT(" recalculate");
		if (ic->discard)
			DMEMIT(" allow_discards");
		DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
		DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
		DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
@@ -3771,6 +3874,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
				goto bad;
		} else if (!strcmp(opt_string, "recalculate")) {
			ic->recalculate_flag = true;
		} else if (!strcmp(opt_string, "allow_discards")) {
			ic->discard = true;
		} else if (!strcmp(opt_string, "fix_padding")) {
			ic->fix_padding = true;
		} else {
@@ -3829,6 +3934,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
		goto bad;
	}

	if (ic->discard && !ic->internal_hash) {
		r = -EINVAL;
		ti->error = "Discard can be only used with internal hash";
		goto bad;
	}

	ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
	ic->autocommit_msec = sync_msec;
	timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -4158,6 +4269,8 @@ try_smaller_buffer:

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	if (ic->discard)
		ti->num_discard_bios = 1;

	return 0;