Commit 555a6e8c authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull ext4 updates from Ted Ts'o:
 "Various bug fixes and cleanups for ext4; no new features this cycle"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (29 commits)
  ext4: remove unnecessary wbc parameter from ext4_bio_write_page
  ext4: avoid s_mb_prefetch to be zero in individual scenarios
  ext4: defer saving error info from atomic context
  ext4: simplify ext4 error translation
  ext4: move functions in super.c
  ext4: make ext4_abort() use __ext4_error()
  ext4: standardize error message in ext4_protect_reserved_inode()
  ext4: remove redundant sb checksum recomputation
  ext4: don't remount read-only with errors=continue on reboot
  ext4: fix deadlock with fs freezing and EA inodes
  jbd2: add a helper to find out number of fast commit blocks
  ext4: make fast_commit.h byte identical with e2fsprogs/fast_commit.h
  ext4: fix fall-through warnings for Clang
  ext4: add docs about fast commit idempotence
  ext4: remove the unused EXT4_CURRENT_REV macro
  ext4: fix an IS_ERR() vs NULL check
  ext4: check for invalid block size early when mounting a file system
  ext4: fix a memory leak of ext4_free_data
  ext4: delete nonsensical (commented-out) code inside ext4_xattr_block_set()
  ext4: update ext4_data_block_valid related comments
  ...
parents 2f2fce3d be993933
Loading
Loading
Loading
Loading
+50 −0
Original line number Diff line number Diff line
@@ -681,3 +681,53 @@ Here is the list of supported tags and their meanings:
     - Stores the TID of the commit and the CRC of the fast commit of which
       this tag represents the end

Fast Commit Replay Idempotence
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Fast commit tags are idempotent in nature provided the recovery code follows
certain rules. The guiding principle that the commit path follows while
committing is that it stores the result of a particular operation instead of
storing the procedure.

Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
was associated with inode 10. During fast commit, instead of storing this
operation as a procedure "rename a to b", we store the resulting file system
state as a "series" of outcomes:

- Link dirent b to inode 10
- Unlink dirent a
- Inode 10 with valid refcount

Now when recovery code runs, it needs to "enforce" this state on the file
system. This is what guarantees idempotence of fast commit replay.

Let's take an example of a procedure that is not idempotent and see how fast
commits make it idempotent. Consider the following sequence of operations:

1) rm A
2) mv B A
3) read A

If we store this sequence of operations as is, then the replay is not
idempotent. Let's say while in replay, we crash after (2). During the second
replay, file A (which was actually created as a result of the "mv B A"
operation) would get deleted. Thus, a file named A would be absent when we try
to read A. So, this sequence of operations is not idempotent. However, as
mentioned above, instead of storing the procedure, fast commits store the
outcome of each procedure. Thus, the fast commit log for the above procedure
would be as follows:

(Let's assume dirent A was linked to inode 10 and dirent B was linked to
inode 11 before the replay)

1) Unlink A
2) Link A to inode 11
3) Unlink B
4) Inode 11

If we crash after (3), we will have file A linked to inode 11. During the second
replay, we will remove file A (inode 11). But we will create it back and make
it point to inode 11. We won't find B, so we'll just skip that step. At this
point, the refcount for inode 11 is not reliable, but that gets fixed by the
replay of the last inode 11 tag. Thus, by converting a non-idempotent procedure
into a series of idempotent outcomes, fast commits ensure idempotence during
the replay.
+1 −1
Original line number Diff line number Diff line
@@ -185,7 +185,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_fsblk_t start, tmp;

	J_ASSERT_BH(bh, buffer_locked(bh));
	ASSERT(buffer_locked(bh));

	/* If checksum is bad mark all blocks used to prevent allocation
	 * essentially implementing a per-group read-only flag. */
+7 −9
Original line number Diff line number Diff line
@@ -176,12 +176,10 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
			err = add_system_zone(system_blks, map.m_pblk, n, ino);
			if (err < 0) {
				if (err == -EFSCORRUPTED) {
					__ext4_error(sb, __func__, __LINE__,
						     -err, map.m_pblk,
						     "blocks %llu-%llu from inode %u overlap system zone",
					EXT4_ERROR_INODE_ERR(inode, -err,
						"blocks %llu-%llu from inode overlap system zone",
						map.m_pblk,
						     map.m_pblk + map.m_len - 1,
						     ino);
						map.m_pblk + map.m_len - 1);
				}
				break;
			}
@@ -206,7 +204,7 @@ static void ext4_destroy_system_zone(struct rcu_head *rcu)
 *
 * The update of system_blks pointer in this function is protected by
 * sb->s_umount semaphore. However we have to be careful as we can be
 * racing with ext4_data_block_valid() calls reading system_blks rbtree
 * racing with ext4_inode_block_valid() calls reading system_blks rbtree
 * protected only by RCU. That's why we first build the rbtree and then
 * swap it in place.
 */
@@ -258,7 +256,7 @@ int ext4_setup_system_zone(struct super_block *sb)

	/*
	 * System blks rbtree complete, announce it once to prevent racing
	 * with ext4_data_block_valid() accessing the rbtree at the same
	 * with ext4_inode_block_valid() accessing the rbtree at the same
	 * time.
	 */
	rcu_assign_pointer(sbi->s_system_blks, system_blks);
@@ -278,7 +276,7 @@ err:
 *
 * The update of system_blks pointer in this function is protected by
 * sb->s_umount semaphore. However we have to be careful as we can be
 * racing with ext4_data_block_valid() calls reading system_blks rbtree
 * racing with ext4_inode_block_valid() calls reading system_blks rbtree
 * protected only by RCU. So we first clear the system_blks pointer and
 * then free the rbtree only after RCU grace period expires.
 */
+58 −19
Original line number Diff line number Diff line
@@ -98,6 +98,16 @@
#define ext_debug(ino, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * ASSERT - halt on a violated invariant.
 *
 * If @cond evaluates to false, emit a KERN_EMERG message naming the enclosing
 * function, the source file, the line number and the stringified expression,
 * then call BUG().
 */
#define ASSERT(cond)							\
do {									\
	if (unlikely(!(cond))) {					\
		printk(KERN_EMERG					\
		       "Assertion failure in %s() at %s:%d: '%s'\n",	\
		       __func__, __FILE__, __LINE__, #cond);		\
		BUG();							\
	}								\
} while (0)

/* data type for block offset of block group */
typedef int ext4_grpblk_t;

@@ -1619,6 +1629,27 @@ struct ext4_sb_info {
	errseq_t s_bdev_wb_err;
	spinlock_t s_bdev_wb_lock;

	/* Information about errors that happened during this mount */
	spinlock_t s_error_lock;
	int s_add_error_count;
	int s_first_error_code;
	__u32 s_first_error_line;
	__u32 s_first_error_ino;
	__u64 s_first_error_block;
	const char *s_first_error_func;
	time64_t s_first_error_time;
	int s_last_error_code;
	__u32 s_last_error_line;
	__u32 s_last_error_ino;
	__u64 s_last_error_block;
	const char *s_last_error_func;
	time64_t s_last_error_time;
	/*
	 * If we are in a context where we cannot update error information in
	 * the on-disk superblock, we queue this work to do it.
	 */
	struct work_struct s_error_work;

	/* Ext4 fast commit stuff */
	atomic_t s_fc_subtid;
	atomic_t s_fc_ineligible_updates;
@@ -1858,7 +1889,6 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_GOOD_OLD_REV	0	/* The good old (original) format */
#define EXT4_DYNAMIC_REV	1	/* V2 format w/ dynamic inode sizes */

#define EXT4_CURRENT_REV	EXT4_GOOD_OLD_REV
#define EXT4_MAX_SUPP_REV	EXT4_DYNAMIC_REV

#define EXT4_GOOD_OLD_INODE_SIZE 128
@@ -2952,9 +2982,9 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
					     ext4_group_t block_group,
					     unsigned int flags);

extern __printf(6, 7)
void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
		  const char *, ...);
extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
		  int, __u64, const char *, ...);
extern __printf(6, 7)
void __ext4_error_inode(struct inode *, const char *, unsigned int,
			ext4_fsblk_t, int, const char *, ...);
@@ -2963,9 +2993,6 @@ void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
		     const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
			     unsigned int, int);
extern __printf(5, 6)
void __ext4_abort(struct super_block *, const char *, unsigned int, int,
		  const char *, ...);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
		    const char *, ...);
@@ -2995,6 +3022,9 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define EXT4_ERROR_FILE(file, block, fmt, a...)				\
	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)

/*
 * ext4_abort - report an error through __ext4_error() on behalf of the caller.
 *
 * The caller's location is captured with __func__ and __LINE__; the bool
 * argument of __ext4_error() is passed as true and its __u64 argument as 0.
 * NOTE(review): the bool presumably requests an unconditional abort of the
 * file system -- confirm against the __ext4_error() definition.
 */
#define ext4_abort(sb, err, fmt, ...)					\
	__ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt),	\
		     ##__VA_ARGS__)

#ifdef CONFIG_PRINTK

#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
@@ -3005,11 +3035,11 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define ext4_error_file(file, func, line, block, fmt, ...)		\
	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...)					\
	__ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
	__ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt),	\
		##__VA_ARGS__)
#define ext4_error_err(sb, err, fmt, ...)				\
	__ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
#define ext4_abort(sb, err, fmt, ...)					\
	__ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
	__ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt),	\
		##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...)					\
	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...)				\
@@ -3042,17 +3072,12 @@ do { \
#define ext4_error(sb, fmt, ...)					\
do {									\
	no_printk(fmt, ##__VA_ARGS__);					\
	__ext4_error(sb, "", 0, 0, 0, " ");				\
	__ext4_error(sb, "", 0, false, 0, 0, " ");			\
} while (0)
#define ext4_error_err(sb, err, fmt, ...)				\
do {									\
	no_printk(fmt, ##__VA_ARGS__);					\
	__ext4_error(sb, "", 0, err, 0, " ");				\
} while (0)
#define ext4_abort(sb, err, fmt, ...)					\
do {									\
	no_printk(fmt, ##__VA_ARGS__);					\
	__ext4_abort(sb, "", 0, err, " ");				\
	__ext4_error(sb, "", 0, false, err, 0, " ");			\
} while (0)
#define ext4_warning(sb, fmt, ...)					\
do {									\
@@ -3361,6 +3386,21 @@ static inline void ext4_unlock_group(struct super_block *sb,
	spin_unlock(ext4_group_lock_ptr(sb, group));
}

#ifdef CONFIG_QUOTA
/*
 * Returns true when either the QUOTA mount option is set on @sb or the
 * file system has the quota feature; false otherwise.
 */
static inline bool ext4_quota_capable(struct super_block *sb)
{
	if (test_opt(sb, QUOTA))
		return true;

	return ext4_has_feature_quota(sb);
}

/*
 * Returns true when @sb has the quota feature, or when a quota file name is
 * recorded in s_qf_names[] for either USRQUOTA or GRPQUOTA; false otherwise.
 */
static inline bool ext4_is_quota_journalled(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (ext4_has_feature_quota(sb))
		return true;
	if (sbi->s_qf_names[USRQUOTA])
		return true;
	if (sbi->s_qf_names[GRPQUOTA])
		return true;

	return false;
}
#endif

/*
 * Block validity checking
 */
@@ -3609,7 +3649,6 @@ extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
			       struct page *page,
			       int len,
			       struct writeback_control *wbc,
			       bool keep_towrite);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
+2 −2
Original line number Diff line number Diff line
@@ -296,7 +296,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
	if (err) {
		ext4_journal_abort_handle(where, line, __func__,
					  bh, handle, err);
		__ext4_abort(inode->i_sb, where, line, -err,
		__ext4_error(inode->i_sb, where, line, true, -err, 0,
			     "error %d when attempting revoke", err);
	}
	BUFFER_TRACE(bh, "exit");
Loading