Commit 0d0a60c9 authored by Theodore Ts'o's avatar Theodore Ts'o
Browse files

Merge branch 'rh/dioread-nolock-1k' into dev

parents 7855a57d c33fbe8f
Loading
Loading
Loading
Loading
+11 −2
Original line number Diff line number Diff line
@@ -198,6 +198,12 @@ struct ext4_system_blocks {
 */
#define	EXT4_IO_END_UNWRITTEN	0x0001

struct ext4_io_end_vec {
	struct list_head list;		/* list of io_end_vec */
	loff_t offset;			/* offset in the file */
	ssize_t size;			/* size of the extent */
};

/*
 * For converting unwritten extents on a work queue. 'handle' is used for
 * buffered writeback.
@@ -211,8 +217,7 @@ typedef struct ext4_io_end {
						 * bios covering the extent */
	unsigned int		flag;		/* unwritten or not */
	atomic_t		count;		/* reference counter */
	loff_t			offset;		/* offset in the file */
	ssize_t			size;		/* size of the extent */
	struct list_head	list_vec;	/* list of ext4_io_end_vec */
} ext4_io_end_t;

struct ext4_io_submit {
@@ -3264,6 +3269,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
			  loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
					  loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
					     ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
			   struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -3322,6 +3329,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
			       int len,
			       struct writeback_control *wbc,
			       bool keep_towrite);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);

/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+34 −15
Original line number Diff line number Diff line
@@ -4962,23 +4962,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
	int ret = 0;
	int ret2 = 0;
	struct ext4_map_blocks map;
	unsigned int credits, blkbits = inode->i_blkbits;
	unsigned int blkbits = inode->i_blkbits;
	unsigned int credits = 0;

	map.m_lblk = offset >> blkbits;
	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);

	/*
	 * This is somewhat ugly but the idea is clear: When transaction is
	 * reserved, everything goes into it. Otherwise we rather start several
	 * smaller transactions for conversion of each extent separately.
	 */
	if (handle) {
		handle = ext4_journal_start_reserved(handle,
						     EXT4_HT_EXT_CONVERT);
		if (IS_ERR(handle))
			return PTR_ERR(handle);
		credits = 0;
	} else {
	if (!handle) {
		/*
		 * credits to insert 1 extent into extent tree
		 */
@@ -5009,11 +4999,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
		if (ret <= 0 || ret2)
			break;
	}
	if (!credits)
		ret2 = ext4_journal_stop(handle);
	return ret > 0 ? ret2 : ret;
}

int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
	int ret, err = 0;
	struct ext4_io_end_vec *io_end_vec;

	/*
	 * This is somewhat ugly but the idea is clear: When transaction is
	 * reserved, everything goes into it. Otherwise we rather start several
	 * smaller transactions for conversion of each extent separately.
	 */
	if (handle) {
		handle = ext4_journal_start_reserved(handle,
						     EXT4_HT_EXT_CONVERT);
		if (IS_ERR(handle))
			return PTR_ERR(handle);
	}

	list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
		ret = ext4_convert_unwritten_extents(handle, io_end->inode,
						     io_end_vec->offset,
						     io_end_vec->size);
		if (ret)
			break;
	}

	if (handle)
		err = ext4_journal_stop(handle);

	return ret < 0 ? ret : err;
}

/*
 * If newes is not existing extent (newes->ec_pblk equals zero) find
 * delayed extent at start of newes and update newes accordingly and
+90 −46
Original line number Diff line number Diff line
@@ -2340,6 +2340,75 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
	return lblk < blocks;
}

/*
 * mpage_process_page - update page buffers corresponding to changed extent and
 *		       may submit fully mapped page for IO
 *
 * @mpd		- description of extent to map, on return next extent to map
 * @m_lblk	- logical block mapping.
 * @m_pblk	- corresponding physical mapping.
 * @map_bh	- determines on return whether this page requires any further
 *		  mapping or not.
 * Scan given page buffers corresponding to changed extent and update buffer
 * state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits.
 * If the given page is not fully mapped, we update @map to the next extent in
 * the given page that needs mapping & return @map_bh as true.
 */
static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
			      ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
			      bool *map_bh)
{
	struct buffer_head *head, *bh;
	ext4_io_end_t *io_end = mpd->io_submit.io_end;
	ext4_lblk_t lblk = *m_lblk;
	ext4_fsblk_t pblock = *m_pblk;
	int err = 0;
	int blkbits = mpd->inode->i_blkbits;
	ssize_t io_end_size = 0;
	struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);

	bh = head = page_buffers(page);
	do {
		if (lblk < mpd->map.m_lblk)
			continue;
		if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
			/*
			 * Buffer after end of mapped extent.
			 * Find next buffer in the page to map.
			 */
			mpd->map.m_len = 0;
			mpd->map.m_flags = 0;
			io_end_vec->size += io_end_size;
			io_end_size = 0;

			err = mpage_process_page_bufs(mpd, head, bh, lblk);
			if (err > 0)
				err = 0;
			if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
				io_end_vec = ext4_alloc_io_end_vec(io_end);
				io_end_vec->offset = mpd->map.m_lblk << blkbits;
			}
			*map_bh = true;
			goto out;
		}
		if (buffer_delay(bh)) {
			clear_buffer_delay(bh);
			bh->b_blocknr = pblock++;
		}
		clear_buffer_unwritten(bh);
		io_end_size += (1 << blkbits);
	} while (lblk++, (bh = bh->b_this_page) != head);

	io_end_vec->size += io_end_size;
	io_end_size = 0;
	*map_bh = false;
out:
	*m_lblk = lblk;
	*m_pblk = pblock;
	return err;
}

/*
 * mpage_map_buffers - update buffers corresponding to changed extent and
 *		       submit fully mapped pages for IO
@@ -2359,12 +2428,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
	struct pagevec pvec;
	int nr_pages, i;
	struct inode *inode = mpd->inode;
	struct buffer_head *head, *bh;
	int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
	pgoff_t start, end;
	ext4_lblk_t lblk;
	sector_t pblock;
	ext4_fsblk_t pblock;
	int err;
	bool map_bh = false;

	start = mpd->map.m_lblk >> bpp_bits;
	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2380,50 +2449,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			bh = head = page_buffers(page);
			do {
				if (lblk < mpd->map.m_lblk)
					continue;
				if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
					/*
					 * Buffer after end of mapped extent.
					 * Find next buffer in the page to map.
					 */
					mpd->map.m_len = 0;
					mpd->map.m_flags = 0;
					/*
					 * FIXME: If dioread_nolock supports
					 * blocksize < pagesize, we need to make
					 * sure we add size mapped so far to
					 * io_end->size as the following call
					 * can submit the page for IO.
					 */
					err = mpage_process_page_bufs(mpd, head,
								      bh, lblk);
					pagevec_release(&pvec);
					if (err > 0)
						err = 0;
					return err;
				}
				if (buffer_delay(bh)) {
					clear_buffer_delay(bh);
					bh->b_blocknr = pblock++;
				}
				clear_buffer_unwritten(bh);
			} while (lblk++, (bh = bh->b_this_page) != head);

			err = mpage_process_page(mpd, page, &lblk, &pblock,
						 &map_bh);
			/*
			 * FIXME: This is going to break if dioread_nolock
			 * supports blocksize < pagesize as we will try to
			 * convert potentially unmapped parts of inode.
			 * If map_bh is true, means page may require further bh
			 * mapping, or maybe the page was submitted for IO.
			 * So we return to call further extent mapping.
			 */
			mpd->io_submit.io_end->size += PAGE_SIZE;
			if (err < 0 || map_bh == true)
				goto out;
			/* Page fully mapped - let IO run! */
			err = mpage_submit_page(mpd, page);
			if (err < 0) {
				pagevec_release(&pvec);
				return err;
			}
			if (err < 0)
				goto out;
		}
		pagevec_release(&pvec);
	}
@@ -2431,6 +2469,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
	mpd->map.m_len = 0;
	mpd->map.m_flags = 0;
	return 0;
out:
	pagevec_release(&pvec);
	return err;
}

static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2510,9 +2551,10 @@ static int mpage_map_and_submit_extent(handle_t *handle,
	int err;
	loff_t disksize;
	int progress = 0;
	ext4_io_end_t *io_end = mpd->io_submit.io_end;
	struct ext4_io_end_vec *io_end_vec = ext4_alloc_io_end_vec(io_end);

	mpd->io_submit.io_end->offset =
				((loff_t)map->m_lblk) << inode->i_blkbits;
	io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
	do {
		err = mpage_map_one_extent(handle, mpd);
		if (err < 0) {
@@ -3613,6 +3655,7 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
			    ssize_t size, void *private)
{
        ext4_io_end_t *io_end = private;
	struct ext4_io_end_vec *io_end_vec;

	/* if not async direct IO just return */
	if (!io_end)
@@ -3630,8 +3673,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
		ext4_clear_io_unwritten_flag(io_end);
		size = 0;
	}
	io_end->offset = offset;
	io_end->size = size;
	io_end_vec = ext4_alloc_io_end_vec(io_end);
	io_end_vec->offset = offset;
	io_end_vec->size = size;
	ext4_put_io_end(io_end);

	return 0;
+73 −37
Original line number Diff line number Diff line
@@ -31,18 +31,56 @@
#include "acl.h"

static struct kmem_cache *io_end_cachep;
static struct kmem_cache *io_end_vec_cachep;

int __init ext4_init_pageio(void)
{
	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
	if (io_end_cachep == NULL)
		return -ENOMEM;

	io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
	if (io_end_vec_cachep == NULL) {
		kmem_cache_destroy(io_end_cachep);
		return -ENOMEM;
	}
	return 0;
}

void ext4_exit_pageio(void)
{
	kmem_cache_destroy(io_end_cachep);
	kmem_cache_destroy(io_end_vec_cachep);
}

struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
{
	struct ext4_io_end_vec *io_end_vec;

	io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
	if (!io_end_vec)
		return ERR_PTR(-ENOMEM);
	INIT_LIST_HEAD(&io_end_vec->list);
	list_add_tail(&io_end_vec->list, &io_end->list_vec);
	return io_end_vec;
}

static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
{
	struct ext4_io_end_vec *io_end_vec, *tmp;

	if (list_empty(&io_end->list_vec))
		return;
	list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
		list_del(&io_end_vec->list);
		kmem_cache_free(io_end_vec_cachep, io_end_vec);
	}
}

struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
{
	BUG_ON(list_empty(&io_end->list_vec));
	return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}

/*
@@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
		ext4_finish_bio(bio);
		bio_put(bio);
	}
	ext4_free_io_end_vec(io_end);
	kmem_cache_free(io_end_cachep, io_end);
}

@@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
 * completed (happens from ext4_free_ioend()).
 */
static int ext4_end_io(ext4_io_end_t *io)
static int ext4_end_io_end(ext4_io_end_t *io_end)
{
	struct inode *inode = io->inode;
	loff_t offset = io->offset;
	ssize_t size = io->size;
	handle_t *handle = io->handle;
	struct inode *inode = io_end->inode;
	handle_t *handle = io_end->handle;
	int ret = 0;

	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
	ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
		   "list->prev 0x%p\n",
		   io, inode->i_ino, io->list.next, io->list.prev);
		   io_end, inode->i_ino, io_end->list.next, io_end->list.prev);

	io->handle = NULL;	/* Following call will use up the handle */
	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
	io_end->handle = NULL;	/* Following call will use up the handle */
	ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
	if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
		ext4_msg(inode->i_sb, KERN_EMERG,
			 "failed to convert unwritten extents to written "
			 "extents -- potential data loss!  "
			 "(inode %lu, offset %llu, size %zd, error %d)",
			 inode->i_ino, offset, size, ret);
			 "(inode %lu, error %d)", inode->i_ino, ret);
	}
	ext4_clear_io_unwritten_flag(io);
	ext4_release_io_end(io);
	ext4_clear_io_unwritten_flag(io_end);
	ext4_release_io_end(io_end);
	return ret;
}

@@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef	EXT4FS_DEBUG
	struct list_head *cur, *before, *after;
	ext4_io_end_t *io, *io0, *io1;
	ext4_io_end_t *io_end, *io_end0, *io_end1;

	if (list_empty(head))
		return;

	ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
	list_for_each_entry(io, head, list) {
		cur = &io->list;
	list_for_each_entry(io_end, head, list) {
		cur = &io_end->list;
		before = cur->prev;
		io0 = container_of(before, ext4_io_end_t, list);
		io_end0 = container_of(before, ext4_io_end_t, list);
		after = cur->next;
		io1 = container_of(after, ext4_io_end_t, list);
		io_end1 = container_of(after, ext4_io_end_t, list);

		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
			    io, inode->i_ino, io0, io1);
			    io_end, inode->i_ino, io_end0, io_end1);
	}
#endif
}
@@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
static int ext4_do_flush_completed_IO(struct inode *inode,
				      struct list_head *head)
{
	ext4_io_end_t *io;
	ext4_io_end_t *io_end;
	struct list_head unwritten;
	unsigned long flags;
	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

	while (!list_empty(&unwritten)) {
		io = list_entry(unwritten.next, ext4_io_end_t, list);
		BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
		list_del_init(&io->list);
		io_end = list_entry(unwritten.next, ext4_io_end_t, list);
		BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
		list_del_init(&io_end->list);

		err = ext4_end_io(io);
		err = ext4_end_io_end(io_end);
		if (unlikely(!ret && err))
			ret = err;
	}
@@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work)

ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
	ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
	if (io) {
		io->inode = inode;
		INIT_LIST_HEAD(&io->list);
		atomic_set(&io->count, 1);
	ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);

	if (io_end) {
		io_end->inode = inode;
		INIT_LIST_HEAD(&io_end->list);
		INIT_LIST_HEAD(&io_end->list_vec);
		atomic_set(&io_end->count, 1);
	}
	return io;
	return io_end;
}

void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
	if (atomic_dec_and_test(&io_end->count)) {
		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
				list_empty(&io_end->list_vec)) {
			ext4_release_io_end(io_end);
			return;
		}
@@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end)

	if (atomic_dec_and_test(&io_end->count)) {
		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
			err = ext4_convert_unwritten_extents(io_end->handle,
						io_end->inode, io_end->offset,
						io_end->size);
			err = ext4_convert_unwritten_io_end_vec(io_end->handle,
								io_end);
			io_end->handle = NULL;
			ext4_clear_io_unwritten_flag(io_end);
		}
@@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio)
		struct inode *inode = io_end->inode;

		ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
			     "(offset %llu size %ld starting block %llu)",
			     "starting block %llu)",
			     bio->bi_status, inode->i_ino,
			     (unsigned long long) io_end->offset,
			     (long) io_end->size,
			     (unsigned long long)
			     bi_sector >> (inode->i_blkbits - 9));
		mapping_set_error(inode->i_mapping,
+0 −10
Original line number Diff line number Diff line
@@ -2105,16 +2105,6 @@ static int parse_options(char *options, struct super_block *sb,
		}
	}
#endif
	if (test_opt(sb, DIOREAD_NOLOCK)) {
		int blocksize =
			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);

		if (blocksize < PAGE_SIZE) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "dioread_nolock if block size != PAGE_SIZE");
			return 0;
		}
	}
	return 1;
}