Commit 52e2d0a3 authored by Martin Brandenburg, committed by Mike Marshall
Browse files

orangefs: write range tracking



Attach to each dirty page the actual range of bytes written, plus the
uid/gid responsible for the write.  This information must be sent to the
server when the page is written out.

Now write_begin, page_mkwrite, and invalidatepage keep this information
up to date.  There are several conditions under which they must write out
the page immediately before storing the new range; for example, two
non-contiguous ranges cannot be stored on a single page.

Signed-off-by: Martin Brandenburg <martin@omnibond.com>
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
parent 90fc0706
Loading
Loading
Loading
Loading
+7 −3
Original line number Diff line number Diff line
@@ -46,8 +46,8 @@ static int flush_racache(struct inode *inode)
 * Post and wait for the I/O upcall to finish
 */
ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
		loff_t *offset, struct iov_iter *iter,
		size_t total_size, loff_t readahead_size)
    loff_t *offset, struct iov_iter *iter, size_t total_size,
    loff_t readahead_size, struct orangefs_write_range *wr)
{
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
@@ -85,6 +85,10 @@ populate_shared_memory:
	new_op->upcall.req.io.buf_index = buffer_index;
	new_op->upcall.req.io.count = total_size;
	new_op->upcall.req.io.offset = *offset;
	if (type == ORANGEFS_IO_WRITE && wr) {
		new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
		new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): offset: %llu total_size: %zd\n",
@@ -329,7 +333,7 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf)
/*
 * VM operations table for OrangeFS file mappings: it supplies the fault,
 * map_pages and page_mkwrite handlers.
 */
static const struct vm_operations_struct orangefs_file_vm_ops = {
	.fault = orangefs_fault,
	.map_pages = filemap_map_pages,
	/*
	 * NOTE(review): .page_mkwrite is listed twice.  This looks like the
	 * removed and the added line of the change shown one after the
	 * other; if compiled as written, the later designated initializer
	 * (orangefs_page_mkwrite) would be the one in effect -- confirm.
	 */
	.page_mkwrite = filemap_page_mkwrite,
	.page_mkwrite = orangefs_page_mkwrite,
};

/*
+258 −31
Original line number Diff line number Diff line
@@ -15,9 +15,11 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
static int orangefs_writepage_locked(struct page *page,
    struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct orangefs_write_range *wr = NULL;
	struct iov_iter iter;
	struct bio_vec bv;
	size_t len, wlen;
@@ -26,34 +28,52 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc)

	set_page_writeback(page);

	off = page_offset(page);
	len = i_size_read(inode);
	if (off > len) {
		/* The file was truncated; there is nothing to write. */
		unlock_page(page);
		end_page_writeback(page);
		return 0;
	}
	if (PagePrivate(page)) {
		wr = (struct orangefs_write_range *)page_private(page);
		off = wr->pos;
		if (off + wr->len > len)
			wlen = len - off;
		else
			wlen = wr->len;
	} else {
		WARN_ON(1);
		off = page_offset(page);
		if (off + PAGE_SIZE > len)
			wlen = len - off;
		else
			wlen = PAGE_SIZE;
	}
	/* Should've been handled in orangefs_invalidatepage. */
	WARN_ON(off == len || off + wlen > len);

	bv.bv_page = page;
	bv.bv_len = wlen;
	bv.bv_offset = off % PAGE_SIZE;
	if (wlen == 0)
		dump_stack();
	WARN_ON(wlen == 0);
	iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);

	ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
	    len);
	    len, wr);
	if (ret < 0) {
		SetPageError(page);
		mapping_set_error(page->mapping, ret);
	} else {
		ret = 0;
	}
	if (wr) {
		kfree(wr);
		set_page_private(page, 0);
		ClearPagePrivate(page);
		put_page(page);
	}
	return ret;
}

static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
{
	int ret;
	ret = orangefs_writepage_locked(page, wbc);
	unlock_page(page);
	end_page_writeback(page);
	return ret;
@@ -74,7 +94,7 @@ static int orangefs_readpage(struct file *file, struct page *page)
	iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);

	ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
	    PAGE_SIZE, inode->i_size);
	    PAGE_SIZE, inode->i_size, NULL);
	/* this will only zero remaining unread portions of the page data */
	iov_iter_zero(~0U, &iter);
	/* takes care of potential aliasing */
@@ -92,6 +112,73 @@ static int orangefs_readpage(struct file *file, struct page *page)
	return ret;
}

static int orangefs_launder_page(struct page *);

/*
 * Prepare the page cache page covering @pos for a buffered write of @len
 * bytes, and record the byte range plus the fsuid/fsgid responsible for it
 * in a struct orangefs_write_range attached to the page.
 *
 * If the page already carries a write range that ends exactly at @pos and
 * was created under the same fsuid/fsgid, that range is extended by @len.
 * Otherwise the page is laundered first, so the old range is written out
 * before a new one is attached.
 *
 * Returns 0 on success with the page obtained from
 * grab_cache_page_write_begin() stored in *pagep.  On failure a negative
 * errno is returned, *pagep is left untouched, and the page is unlocked
 * and released again, since the caller does not call write_end after a
 * failed write_begin.
 */
static int orangefs_write_begin(struct file *file,
    struct address_space *mapping,
    loff_t pos, unsigned len, unsigned flags, struct page **pagep,
    void **fsdata)
{
	struct orangefs_write_range *wr;
	struct page *page;
	pgoff_t index;
	int ret;

	index = pos >> PAGE_SHIFT;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	if (PageDirty(page) && !PagePrivate(page)) {
		/*
		 * Should be impossible.  If it happens, launder the page
		 * since we don't know what's dirty.  This will WARN in
		 * orangefs_writepage_locked.
		 */
		ret = orangefs_launder_page(page);
		if (ret)
			goto fail;
	}
	if (PagePrivate(page)) {
		wr = (struct orangefs_write_range *)page_private(page);
		if (wr->pos + wr->len == pos &&
		    uid_eq(wr->uid, current_fsuid()) &&
		    gid_eq(wr->gid, current_fsgid())) {
			/* Contiguous write by the same identity: extend. */
			wr->len += len;
			goto okay;
		}
		/*
		 * Non-contiguous range or different identity: the existing
		 * range has to go out before a new one can be stored.
		 */
		ret = orangefs_launder_page(page);
		if (ret)
			goto fail;
	}

	wr = kmalloc(sizeof *wr, GFP_KERNEL);
	if (!wr) {
		ret = -ENOMEM;
		goto fail;
	}

	wr->pos = pos;
	wr->len = len;
	wr->uid = current_fsuid();
	wr->gid = current_fsgid();
	SetPagePrivate(page);
	set_page_private(page, (unsigned long)wr);
	/* The attached write range holds its own page reference. */
	get_page(page);
okay:

	if (!PageUptodate(page) && (len != PAGE_SIZE)) {
		unsigned from = pos & (PAGE_SIZE - 1);

		/* Zero the parts of the page this write will not cover. */
		zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
	}
	*pagep = page;
	return 0;

fail:
	/* Undo grab_cache_page_write_begin(): drop the lock and reference. */
	unlock_page(page);
	put_page(page);
	return ret;
}

static int orangefs_write_end(struct file *file, struct address_space *mapping,
    loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
{
@@ -105,24 +192,96 @@ static void orangefs_invalidatepage(struct page *page,
				 unsigned int offset,
				 unsigned int length)
{
	gossip_debug(GOSSIP_INODE_DEBUG,
		     "orangefs_invalidatepage called on page %p "
		     "(offset is %u)\n",
		     page,
		     offset);
	struct orangefs_write_range *wr;
	wr = (struct orangefs_write_range *)page_private(page);

	if (offset == 0 && length == PAGE_SIZE) {
		kfree((struct orangefs_write_range *)page_private(page));
		set_page_private(page, 0);
		ClearPagePrivate(page);
		put_page(page);
	/* write range entirely within invalidate range (or equal) */
	} else if (page_offset(page) + offset <= wr->pos &&
	    wr->pos + wr->len <= page_offset(page) + offset + length) {
		kfree((struct orangefs_write_range *)page_private(page));
		set_page_private(page, 0);
		ClearPagePrivate(page);
		put_page(page);
		/* XXX is this right? only caller in fs */
		cancel_dirty_page(page);
	/* invalidate range chops off end of write range */
	} else if (wr->pos < page_offset(page) + offset &&
	    wr->pos + wr->len <= page_offset(page) + offset + length &&
	     page_offset(page) + offset < wr->pos + wr->len) {
		size_t x;
		x = wr->pos + wr->len - (page_offset(page) + offset);
		WARN_ON(x > wr->len);
		wr->len -= x;
		wr->uid = current_fsuid();
		wr->gid = current_fsgid();
	/* invalidate range chops off beginning of write range */
	} else if (page_offset(page) + offset <= wr->pos &&
	    page_offset(page) + offset + length < wr->pos + wr->len &&
	    wr->pos < page_offset(page) + offset + length) {
		size_t x;
		x = page_offset(page) + offset + length - wr->pos;
		WARN_ON(x > wr->len);
		wr->pos += x;
		wr->len -= x;
		wr->uid = current_fsuid();
		wr->gid = current_fsgid();
	/* invalidate range entirely within write range (punch hole) */
	} else if (wr->pos < page_offset(page) + offset &&
	    page_offset(page) + offset + length < wr->pos + wr->len) {
		/* XXX what do we do here... should not WARN_ON */
		WARN_ON(1);
		/* punch hole */
		/*
		 * should we just ignore this and write it out anyway?
		 * it hardly makes sense
		 */
	/* non-overlapping ranges */
	} else {
		/* WARN if they do overlap */
		if (!((page_offset(page) + offset + length <= wr->pos) ^
		    (wr->pos + wr->len <= page_offset(page) + offset))) {
			WARN_ON(1);
			printk("invalidate range offset %llu length %u\n",
			    page_offset(page) + offset, length);
			printk("write range offset %llu length %zu\n",
			    wr->pos, wr->len);
		}
	}
}

	ClearPageUptodate(page);
	ClearPageMappedToDisk(page);
	return;
/*
 * Report whether the page may be released: 0 while PagePrivate is set on
 * the page, 1 otherwise.  The gfp argument is not used.
 */
static int orangefs_releasepage(struct page *page, gfp_t foo)
{
	if (PagePrivate(page))
		return 0;

	return 1;
}

/*
 * Drop the write range still attached to a page: free it, clear the
 * page's private state, and put the page reference that was taken when
 * the range was attached.  Does nothing if PagePrivate is not set.
 */
static void orangefs_freepage(struct page *page)
{
	struct orangefs_write_range *range;

	if (!PagePrivate(page))
		return;

	range = (struct orangefs_write_range *)page_private(page);
	kfree(range);
	set_page_private(page, 0);
	ClearPagePrivate(page);
	put_page(page);
}

static int orangefs_releasepage(struct page *page, gfp_t foo)
/*
 * Write a dirty page back synchronously.  Waits for writeback already in
 * progress on the page; then, if clear_page_dirty_for_io() reports that
 * the page was dirty, writes it with orangefs_writepage_locked() using a
 * WB_SYNC_ALL writeback_control and ends writeback.
 *
 * Returns the result of orangefs_writepage_locked(), or 0 when the page
 * was not dirty.
 */
static int orangefs_launder_page(struct page *page)
{
	/*
	 * NOTE(review): the gossip_debug() call and the "return 0;" that
	 * follow still name orangefs_releasepage; they look like removed
	 * lines of the old orangefs_releasepage body left in by the diff
	 * view rather than part of this function -- confirm.
	 */
	gossip_debug(GOSSIP_INODE_DEBUG,
		     "orangefs_releasepage called on page %p\n",
		     page);
	return 0;
	int r = 0;
	/* Synchronous writeback request for this one page. */
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 0,
	};
	wait_on_page_writeback(page);
	if (clear_page_dirty_for_io(page)) {
		r = orangefs_writepage_locked(page, &wbc);
		/* orangefs_writepage_locked() set writeback; finish it. */
		end_page_writeback(page);
	}
	return r;
}

static ssize_t orangefs_direct_IO(struct kiocb *iocb,
@@ -145,7 +304,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
	size_t count = iov_iter_count(iter);
	size_t ORIGINALcount = iov_iter_count(iter);
	ssize_t total_count = 0;
	ssize_t ret = -EINVAL;
	int i = 0;
@@ -192,7 +350,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
			     (int)*offset);

		ret = wait_for_direct_io(type, inode, offset, iter,
				each_count, 0);
				each_count, 0, NULL);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): return from wait_for_io:%d\n",
			     __func__,
@@ -247,13 +405,82 @@ static const struct address_space_operations orangefs_address_operations = {
	.writepage = orangefs_writepage,
	.readpage = orangefs_readpage,
	.set_page_dirty = __set_page_dirty_nobuffers,
	.write_begin = simple_write_begin,
	.write_begin = orangefs_write_begin,
	.write_end = orangefs_write_end,
	.invalidatepage = orangefs_invalidatepage,
	.releasepage = orangefs_releasepage,
	.freepage = orangefs_freepage,
	.launder_page = orangefs_launder_page,
	.direct_IO = orangefs_direct_IO,
};

/*
 * page_mkwrite handler: called when a page of a file mapping is about to
 * become writable.  Since the bytes that will be modified are unknown, the
 * whole page is recorded as the write range, together with the current
 * fsuid/fsgid.
 *
 * If the page already has a write range created under the same
 * fsuid/fsgid, it is widened to the whole page.  If the identity differs,
 * or the page is dirty without a range, the page is laundered first and a
 * fresh range is attached.
 *
 * Returns VM_FAULT_LOCKED with the page locked and dirty on success,
 * VM_FAULT_NOPAGE if the page no longer belongs to this inode's mapping,
 * or VM_FAULT_RETRY if laundering or allocation failed.  The page is
 * unlocked before returning anything other than VM_FAULT_LOCKED.
 */
vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret = VM_FAULT_LOCKED;
	struct orangefs_write_range *wr;

	/*
	 * Taken before anything can jump to "out", so that the
	 * sb_end_pagefault() there is balanced on every path.
	 */
	sb_start_pagefault(inode->i_sb);

	lock_page(page);
	if (PageDirty(page) && !PagePrivate(page)) {
		/*
		 * Should be impossible.  If it happens, launder the page
		 * since we don't know what's dirty.  This will WARN in
		 * orangefs_writepage_locked.
		 */
		if (orangefs_launder_page(page)) {
			/*
			 * NOTE(review): VM_FAULT_RETRY is kept from the
			 * original; confirm it is the intended code for a
			 * failed writeout.
			 */
			ret = VM_FAULT_RETRY;
			goto out;
		}
	}
	if (PagePrivate(page)) {
		wr = (struct orangefs_write_range *)page_private(page);
		if (uid_eq(wr->uid, current_fsuid()) &&
		    gid_eq(wr->gid, current_fsgid())) {
			/* Same identity: widen the range to the whole page. */
			wr->pos = page_offset(page);
			wr->len = PAGE_SIZE;
			goto okay;
		}
		/* Different identity: write the old range out first. */
		if (orangefs_launder_page(page)) {
			ret = VM_FAULT_RETRY;
			goto out;
		}
	}
	wr = kmalloc(sizeof *wr, GFP_KERNEL);
	if (!wr) {
		ret = VM_FAULT_RETRY;
		goto out;
	}
	wr->pos = page_offset(page);
	wr->len = PAGE_SIZE;
	wr->uid = current_fsuid();
	wr->gid = current_fsgid();
	SetPagePrivate(page);
	set_page_private(page, (unsigned long)wr);
	/* The attached write range holds its own page reference. */
	get_page(page);
okay:

	file_update_time(vmf->vma->vm_file);
	if (page->mapping != inode->i_mapping) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * We mark the page dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty page and writeprotect it again.
	 */
	set_page_dirty(page);
	wait_for_stable_page(page);
out:
	/*
	 * Every path reaching here holds the page lock.  Keep it only when
	 * reporting VM_FAULT_LOCKED, so the returned code always matches
	 * the actual lock state.
	 */
	if (!(ret & VM_FAULT_LOCKED))
		unlock_page(page);
	sb_end_pagefault(inode->i_sb);
	return ret;
}

static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
{
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+9 −1
Original line number Diff line number Diff line
@@ -230,6 +230,13 @@ struct orangefs_cached_xattr {
	unsigned long timeout;
};

/*
 * Describes the dirty byte range of a page and the identity the write was
 * made under.  A pointer to one of these is stored in the page's private
 * field while the page is dirty; the uid/gid are copied into the upcall
 * when the range is written out.
 */
struct orangefs_write_range {
	loff_t pos;	/* file offset of the first byte of the range */
	size_t len;	/* length of the range in bytes */
	kuid_t uid;	/* fsuid recorded when the range was created */
	kgid_t gid;	/* fsgid recorded when the range was created */
};

extern struct orangefs_stats orangefs_stats;

/*
@@ -342,6 +349,7 @@ void fsid_key_table_finalize(void);
/*
 * defined in inode.c
 */
vm_fault_t orangefs_page_mkwrite(struct vm_fault *);
struct inode *orangefs_new_inode(struct super_block *sb,
			      struct inode *dir,
			      int mode,
@@ -383,7 +391,7 @@ bool __is_daemon_in_service(void);
 * defined in file.c
 */
ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
    struct iov_iter *, size_t, loff_t);
    struct iov_iter *, size_t, loff_t, struct orangefs_write_range *);
ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
    struct iov_iter *);