Commit 93848a99 authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Ben Myers
Browse files

xfs: add version 3 inode format with CRCs



Add a new inode version with a larger core.  The primary objective is
to allow for a crc of the inode, and location information (uuid and ino)
to verify it was written in the right place.  We also extend it by:

	a creation time (for Samba);
	a changecount (for NFSv4);
	a flush sequence (in LSN format for recovery);
	an additional inode flags field; and
	some additional padding.

These additional fields are not implemented yet, but already laid
out in the structure.

[dchinner@redhat.com] Added LSN and flags field, some factoring and rework to
capture all the necessary information in the crc calculation.

Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarDave Chinner <dchinner@redhat.com>
Reviewed-by: default avatarBen Myers <bpm@sgi.com>
Signed-off-by: default avatarBen Myers <bpm@sgi.com>
parent 3fe58f30
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -48,6 +48,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
#define XFS_BLF_AGF_BUF		(1<<6)
#define XFS_BLF_AGFL_BUF	(1<<7)
#define XFS_BLF_AGI_BUF		(1<<8)
#define XFS_BLF_DINO_BUF	(1<<9)

#define XFS_BLF_TYPE_MASK	\
		(XFS_BLF_UDQUOT_BUF | \
@@ -56,7 +57,8 @@ extern kmem_zone_t *xfs_buf_item_zone;
		 XFS_BLF_BTREE_BUF | \
		 XFS_BLF_AGF_BUF | \
		 XFS_BLF_AGFL_BUF | \
		 XFS_BLF_AGI_BUF)
		 XFS_BLF_AGI_BUF | \
		 XFS_BLF_DINO_BUF)

#define	XFS_BLF_CHUNK		128
#define	XFS_BLF_SHIFT		7
+29 −4
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@
#define	__XFS_DINODE_H__

#define	XFS_DINODE_MAGIC		0x494e	/* 'IN' */
#define XFS_DINODE_GOOD_VERSION(v)	(((v) == 1 || (v) == 2))
#define XFS_DINODE_GOOD_VERSION(v)	((v) >= 1 && (v) <= 3)

typedef struct xfs_timestamp {
	__be32		t_sec;		/* timestamp seconds */
@@ -70,10 +70,35 @@ typedef struct xfs_dinode {

	/* di_next_unlinked is the only non-core field in the old dinode */
	__be32		di_next_unlinked;/* agi unlinked list ptr */
} __attribute__((packed)) xfs_dinode_t;

	/* start of the extended dinode, writable fields */
	__le32		di_crc;		/* CRC of the inode */
	__be64		di_changecount;	/* number of attribute changes */
	__be64		di_lsn;		/* flush sequence */
	__be64		di_flags2;	/* more random flags */
	__u8		di_pad2[16];	/* more padding for future expansion */

	/* fields only written to during inode creation */
	xfs_timestamp_t	di_crtime;	/* time created */
	__be64		di_ino;		/* inode number */
	uuid_t		di_uuid;	/* UUID of the filesystem */

	/* structure must be padded to 64 bit alignment */
} xfs_dinode_t;

#define DI_MAX_FLUSH 0xffff

/*
 * Size of the core inode on disk.  Version 1 and 2 inodes have
 * the same size, but version 3 has grown a few additional fields.
 */
static inline uint xfs_dinode_size(int version)
{
	if (version == 3)
		return sizeof(struct xfs_dinode);
	return offsetof(struct xfs_dinode, di_crc);
}

/*
 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
@@ -105,7 +130,7 @@ typedef enum xfs_dinode_fmt {
 * Inode size for given fs.
 */
#define XFS_LITINO(mp, version) \
	((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
	((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))

#define XFS_BROOT_SIZE_ADJ(ip) \
	(XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
@@ -133,7 +158,7 @@ typedef enum xfs_dinode_fmt {
 * Return pointers to the data or attribute forks.
 */
#define XFS_DFORK_DPTR(dip) \
	((char *)(dip) + sizeof(struct xfs_dinode))
	((char *)dip + xfs_dinode_size(dip->di_version))
#define XFS_DFORK_APTR(dip)	\
	(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w)	\
+41 −9
Original line number Diff line number Diff line
@@ -167,6 +167,7 @@ xfs_ialloc_inode_init(
	int			version;
	int			i, j;
	xfs_daddr_t		d;
	xfs_ino_t		ino = 0;

	/*
	 * Loop over the new block(s), filling in the inodes.
@@ -185,13 +186,29 @@ xfs_ialloc_inode_init(
	}

	/*
	 * Figure out what version number to use in the inodes we create.
	 * If the superblock version has caught up to the one that supports
	 * the new inode format, then use the new inode version.  Otherwise
	 * use the old version so that old kernels will continue to be
	 * able to use the file system.
	 */
	if (xfs_sb_version_hasnlink(&mp->m_sb))
	 * Figure out what version number to use in the inodes we create.  If
	 * the superblock version has caught up to the one that supports the new
	 * inode format, then use the new inode version.  Otherwise use the old
	 * version so that old kernels will continue to be able to use the file
	 * system.
	 *
	 * For v3 inodes, we also need to write the inode number into the inode,
	 * so calculate the first inode number of the chunk here as
	 * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
	 * across multiple filesystem blocks (such as a cluster) and so cannot
	 * be used in the cluster buffer loop below.
	 *
	 * Further, because we are writing the inode directly into the buffer
	 * and calculating a CRC on the entire inode, we have ot log the entire
	 * inode so that the entire range the CRC covers is present in the log.
	 * That means for v3 inode we log the entire buffer rather than just the
	 * inode cores.
	 */
	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		version = 3;
		ino = XFS_AGINO_TO_INO(mp, agno,
				       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
	} else if (xfs_sb_version_hasnlink(&mp->m_sb))
		version = 2;
	else
		version = 1;
@@ -214,17 +231,32 @@ xfs_ialloc_inode_init(
		 *	individual transactions causing a lot of log traffic.
		 */
		fbuf->b_ops = &xfs_inode_buf_ops;
		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
		for (i = 0; i < ninodes; i++) {
			int	ioffset = i << mp->m_sb.sb_inodelog;
			uint	isize = sizeof(struct xfs_dinode);
			uint	isize = xfs_dinode_size(version);

			free = xfs_make_iptr(mp, fbuf, i);
			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
			free->di_version = version;
			free->di_gen = cpu_to_be32(gen);
			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
			xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);

			if (version == 3) {
				free->di_ino = cpu_to_be64(ino);
				ino++;
				uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
				xfs_dinode_calc_crc(mp, free);
			} else {
				/* just log the inode core */
				xfs_trans_log_buf(tp, fbuf, ioffset,
						  ioffset + isize - 1);
			}
		}
		if (version == 3) {
			/* need to log the entire buffer */
			xfs_trans_log_buf(tp, fbuf, 0,
					  BBTOB(fbuf->b_length) - 1);
		}
		xfs_trans_inode_alloc_buf(tp, fbuf);
	}
+125 −54
Original line number Diff line number Diff line
@@ -44,6 +44,7 @@
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"

@@ -866,6 +867,17 @@ xfs_dinode_from_disk(
	to->di_dmstate	= be16_to_cpu(from->di_dmstate);
	to->di_flags	= be16_to_cpu(from->di_flags);
	to->di_gen	= be32_to_cpu(from->di_gen);

	if (to->di_version == 3) {
		to->di_changecount = be64_to_cpu(from->di_changecount);
		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
		to->di_flags2 = be64_to_cpu(from->di_flags2);
		to->di_ino = be64_to_cpu(from->di_ino);
		to->di_lsn = be64_to_cpu(from->di_lsn);
		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
		uuid_copy(&to->di_uuid, &from->di_uuid);
	}
}

void
@@ -902,6 +914,17 @@ xfs_dinode_to_disk(
	to->di_dmstate = cpu_to_be16(from->di_dmstate);
	to->di_flags = cpu_to_be16(from->di_flags);
	to->di_gen = cpu_to_be32(from->di_gen);

	if (from->di_version == 3) {
		to->di_changecount = cpu_to_be64(from->di_changecount);
		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
		to->di_flags2 = cpu_to_be64(from->di_flags2);
		to->di_ino = cpu_to_be64(from->di_ino);
		to->di_lsn = cpu_to_be64(from->di_lsn);
		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
		uuid_copy(&to->di_uuid, &from->di_uuid);
	}
}

STATIC uint
@@ -962,6 +985,47 @@ xfs_dic2xflags(
				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}

static bool
xfs_dinode_verify(
	struct xfs_mount	*mp,
	struct xfs_inode	*ip,
	struct xfs_dinode	*dip)
{
	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
		return false;

	/* only version 3 or greater inodes are extensively verified here */
	if (dip->di_version < 3)
		return true;

	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return false;
	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
			      offsetof(struct xfs_dinode, di_crc)))
		return false;
	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
		return false;
	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
		return false;
	return true;
}

void
xfs_dinode_calc_crc(
	struct xfs_mount	*mp,
	struct xfs_dinode	*dip)
{
	__uint32_t		crc;

	if (dip->di_version < 3)
		return;

	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
			      offsetof(struct xfs_dinode, di_crc));
	dip->di_crc = xfs_end_cksum(crc);
}

/*
 * Read the disk inode attributes into the in-core inode structure.
 */
@@ -990,17 +1054,13 @@ xfs_iread(
	if (error)
		return error;

	/*
	 * If we got something that isn't an inode it means someone
	 * (nfs or dmi) has a stale handle.
	 */
	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
#ifdef DEBUG
		xfs_alert(mp,
			"%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
			__func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
#endif /* DEBUG */
		error = XFS_ERROR(EINVAL);
	/* even unallocated inodes are verified */
	if (!xfs_dinode_verify(mp, ip, dip)) {
		xfs_alert(mp, "%s: validation failed for inode %lld failed",
				__func__, ip->i_ino);

		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
		error = XFS_ERROR(EFSCORRUPTED);
		goto out_brelse;
	}

@@ -1022,10 +1082,20 @@ xfs_iread(
			goto out_brelse;
		}
	} else {
		/*
		 * Partial initialisation of the in-core inode. Just the bits
		 * that xfs_ialloc won't overwrite or relies on being correct.
		 */
		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
		ip->i_d.di_version = dip->di_version;
		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);

		if (dip->di_version == 3) {
			ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
			uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
		}

		/*
		 * Make sure to pull in the mode here as well in
		 * case the inode is released without being used.
@@ -1161,6 +1231,7 @@ xfs_ialloc(
	xfs_buf_t	**ialloc_context,
	xfs_inode_t	**ipp)
{
	struct xfs_mount *mp = tp->t_mountp;
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	uint		flags;
@@ -1187,7 +1258,7 @@ xfs_ialloc(
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
			 XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
@@ -1208,7 +1279,7 @@ xfs_ialloc(
	 * the inode version number now.  This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
	if (xfs_sb_version_hasnlink(&mp->m_sb) &&
	    ip->i_d.di_version == 1) {
		ip->i_d.di_version = 2;
		/*
@@ -1258,6 +1329,19 @@ xfs_ialloc(
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;

	if (ip->i_d.di_version == 3) {
		ASSERT(ip->i_d.di_ino == ino);
		ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
		ip->i_d.di_crc = 0;
		ip->i_d.di_changecount = 1;
		ip->i_d.di_lsn = 0;
		ip->i_d.di_flags2 = 0;
		memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
		ip->i_d.di_crtime = ip->i_d.di_mtime;
	}


	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
@@ -2716,20 +2800,18 @@ abort_out:

STATIC int
xfs_iflush_int(
	xfs_inode_t		*ip,
	xfs_buf_t		*bp)
	struct xfs_inode	*ip,
	struct xfs_buf		*bp)
{
	xfs_inode_log_item_t	*iip;
	xfs_dinode_t		*dip;
	xfs_mount_t		*mp;
	struct xfs_inode_log_item *iip = ip->i_itemp;
	struct xfs_dinode	*dip;
	struct xfs_mount	*mp = ip->i_mount;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
	ASSERT(xfs_isiflocked(ip));
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));

	iip = ip->i_itemp;
	mp = ip->i_mount;
	ASSERT(iip != NULL && iip->ili_fields != 0);

	/* set *dip = inode's place in the buffer */
	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -2790,9 +2872,9 @@ xfs_iflush_int(
	}
	/*
	 * bump the flush iteration count, used to detect flushes which
	 * postdate a log record during recovery.
	 * postdate a log record during recovery. This is redundant as we now
	 * log every change and hence this can't happen. Still, it doesn't hurt.
	 */

	ip->i_d.di_flushiter++;

	/*
@@ -2868,7 +2950,6 @@ xfs_iflush_int(
	 * need the AIL lock, because it is a 64 bit value that cannot be read
	 * atomically.
	 */
	if (iip != NULL && iip->ili_fields != 0) {
	iip->ili_last_fields = iip->ili_fields;
	iip->ili_fields = 0;
	iip->ili_logged = 1;
@@ -2884,25 +2965,15 @@ xfs_iflush_int(
	 */
	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);

	/* update the lsn in the on disk inode if required */
	if (ip->i_d.di_version == 3)
		dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);

	/* generate the checksum. */
	xfs_dinode_calc_crc(mp, dip);

	ASSERT(bp->b_fspriv != NULL);
	ASSERT(bp->b_iodone != NULL);
	} else {
		/*
		 * We're flushing an inode which is not in the AIL and has
		 * not been logged.  For this case we can immediately drop
		 * the inode flush lock because we can avoid the whole
		 * AIL state thing.  It's OK to drop the flush lock now,
		 * because we've already locked the buffer and to do anything
		 * you really need both.
		 */
		if (iip != NULL) {
			ASSERT(iip->ili_logged == 0);
			ASSERT(iip->ili_last_fields == 0);
			ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
		}
		xfs_ifunlock(ip);
	}

	return 0;

corrupt_out:
+26 −0
Original line number Diff line number Diff line
@@ -150,13 +150,38 @@ typedef struct xfs_icdinode {
	__uint16_t	di_dmstate;	/* DMIG state info */
	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
	__uint32_t	di_gen;		/* generation number */

	/* di_next_unlinked is the only non-core field in the old dinode */
	xfs_agino_t	di_next_unlinked;/* agi unlinked list ptr */

	/* start of the extended dinode, writable fields */
	__uint32_t	di_crc;		/* CRC of the inode */
	__uint64_t	di_changecount;	/* number of attribute changes */
	xfs_lsn_t	di_lsn;		/* flush sequence */
	__uint64_t	di_flags2;	/* more random flags */
	__uint8_t	di_pad2[16];	/* more padding for future expansion */

	/* fields only written to during inode creation */
	xfs_ictimestamp_t di_crtime;	/* time created */
	xfs_ino_t	di_ino;		/* inode number */
	uuid_t		di_uuid;	/* UUID of the filesystem */

	/* structure must be padded to 64 bit alignment */
} xfs_icdinode_t;

static inline uint xfs_icdinode_size(int version)
{
	if (version == 3)
		return sizeof(struct xfs_icdinode);
	return offsetof(struct xfs_icdinode, di_next_unlinked);
}

/*
 * Flags for xfs_ichgtime().
 */
#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */

/*
 * Per-fork incore inode flags.
@@ -556,6 +581,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
			       struct xfs_buf **, uint, uint);
int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
			  struct xfs_inode *, uint);
void		xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
void		xfs_dinode_to_disk(struct xfs_dinode *,
				   struct xfs_icdinode *);
void		xfs_idestroy_fork(struct xfs_inode *, int);
Loading