Merge with /home/shaggy/git/linus-clean/

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
author: Dave Kleikamp <shaggy@austin.ibm.com> 2005-07-19 13:46:53 -0500
committer: Dave Kleikamp <shaggy@austin.ibm.com> 2005-07-19 13:46:53 -0500
commit: 21d1ee8b375bcd180f1d6b8ccbb8d8f938596310 (patch)
tree: 2e82b65c16a4aaa88eeb7dd9f47f2d1c418e77d0 /fs
parent: 3d9b1cdd2455017c6aa25bc2442092b81438981f (diff)
parent: f60f700876cd51de9de69f3a3c865d95e287a24d (diff)
30 files changed, 2632 insertions, 1008 deletions
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index d44431d1a338..0aa5ac159c09 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -15,66 +15,79 @@
 #include "xip.h"
 
 static inline int
-__inode_direct_access(struct inode *inode, sector_t sector, unsigned long *data) {
+__inode_direct_access(struct inode *inode, sector_t sector,
+		      unsigned long *data)
+{
 	BUG_ON(!inode->i_sb->s_bdev->bd_disk->fops->direct_access);
 	return inode->i_sb->s_bdev->bd_disk->fops
 		->direct_access(inode->i_sb->s_bdev,sector,data);
 }
 
+static inline int
+__ext2_get_sector(struct inode *inode, sector_t offset, int create,
+		   sector_t *result)
+{
+	struct buffer_head tmp;
+	int rc;
+
+	memset(&tmp, 0, sizeof(struct buffer_head));
+	rc = ext2_get_block(inode, offset/ (PAGE_SIZE/512), &tmp,
+			    create);
+	*result = tmp.b_blocknr;
+
+	/* did we get a sparse block (hole in the file)? */
+	if (!(*result)) {
+		BUG_ON(create);
+		rc = -ENODATA;
+	}
+
+	return rc;
+}
+
 int
-ext2_clear_xip_target(struct inode *inode, int block) {
-	sector_t sector = block*(PAGE_SIZE/512);
+ext2_clear_xip_target(struct inode *inode, int block)
+{
+	sector_t sector = block * (PAGE_SIZE/512);
 	unsigned long data;
 	int rc;
 
 	rc = __inode_direct_access(inode, sector, &data);
-	if (rc)
-		return rc;
-	clear_page((void*)data);
-	return 0;
+	if (!rc)
+		clear_page((void*)data);
+	return rc;
 }
 
 void ext2_xip_verify_sb(struct super_block *sb)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-	if ((sbi->s_mount_opt & EXT2_MOUNT_XIP)) {
-		if ((sb->s_bdev == NULL) ||
-			sb->s_bdev->bd_disk == NULL ||
-			sb->s_bdev->bd_disk->fops == NULL ||
-			sb->s_bdev->bd_disk->fops->direct_access == NULL) {
-			sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
-			ext2_warning(sb, __FUNCTION__,
-				"ignoring xip option - not supported by bdev");
-		}
+	if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
+	    !sb->s_bdev->bd_disk->fops->direct_access) {
+		sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
+		ext2_warning(sb, __FUNCTION__,
+			     "ignoring xip option - not supported by bdev");
 	}
 }
 
-struct page*
-ext2_get_xip_page(struct address_space *mapping, sector_t blockno,
+struct page *
+ext2_get_xip_page(struct address_space *mapping, sector_t offset,
 		   int create)
 {
 	int rc;
 	unsigned long data;
-	struct buffer_head tmp;
+	sector_t sector;
 
-	tmp.b_state = 0;
-	tmp.b_blocknr = 0;
-	rc = ext2_get_block(mapping->host, blockno/(PAGE_SIZE/512) , &tmp,
-				create);
+	/* first, retrieve the sector number */
+	rc = __ext2_get_sector(mapping->host, offset, create, &sector);
 	if (rc)
-		return ERR_PTR(rc);
-	if (tmp.b_blocknr == 0) {
-		/* SPARSE block */
-		BUG_ON(create);
-		return ERR_PTR(-ENODATA);
-	}
+		goto error;
 
+	/* retrieve address of the target data */
 	rc = __inode_direct_access
-		(mapping->host,tmp.b_blocknr*(PAGE_SIZE/512) ,&data);
-	if (rc)
-		return ERR_PTR(rc);
+		(mapping->host, sector * (PAGE_SIZE/512), &data);
+	if (!rc)
+		return virt_to_page(data);
 
-	SetPageUptodate(virt_to_page(data));
-	return virt_to_page(data);
+ error:
+	return ERR_PTR(rc);
 }
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 6a4c0a3685da..787d84ac2bcd 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: erase.c,v 1.76 2005/05/03 15:11:40 dedekind Exp $
+ * $Id: erase.c,v 1.80 2005/07/14 19:46:24 joern Exp $
  *
  */
 
@@ -300,100 +300,86 @@ static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_erase
 	jeb->last_node = NULL;
 }
 
-static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset)
 {
-	struct jffs2_raw_node_ref *marker_ref = NULL;
-	unsigned char *ebuf;
+	void *ebuf;
+	uint32_t ofs;
 	size_t retlen;
-	int ret;
-	uint32_t bad_offset;
-
-	if ((!jffs2_cleanmarker_oob(c)) && (c->cleanmarker_size > 0)) {
-		marker_ref = jffs2_alloc_raw_node_ref();
-		if (!marker_ref) {
-			printk(KERN_WARNING "Failed to allocate raw node ref for clean marker\n");
-			/* Stick it back on the list from whence it came and come back later */
-			jffs2_erase_pending_trigger(c);
-			spin_lock(&c->erase_completion_lock);
-			list_add(&jeb->list, &c->erase_complete_list);
-			spin_unlock(&c->erase_completion_lock);
-			return;
-		}
-	}
+	int ret = -EIO;
+	
 	ebuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!ebuf) {
-		printk(KERN_WARNING "Failed to allocate page buffer for verifying erase at 0x%08x. Assuming it worked\n", jeb->offset);
-	} else {
-		uint32_t ofs = jeb->offset;
+		printk(KERN_WARNING "Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n", jeb->offset);
+		return -EAGAIN;
+	}
 
-		D1(printk(KERN_DEBUG "Verifying erase at 0x%08x\n", jeb->offset));
-		while(ofs < jeb->offset + c->sector_size) {
-			uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs);
-			int i;
+	D1(printk(KERN_DEBUG "Verifying erase at 0x%08x\n", jeb->offset));
 
-			bad_offset = ofs;
+	for (ofs = jeb->offset; ofs < jeb->offset + c->sector_size; ) {
+		uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs);
+		int i;
 
-			ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf);
+		*bad_offset = ofs;
 
-			if (ret) {
-				printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
-				goto bad;
-			}
-			if (retlen != readlen) {
-				printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen);
-				goto bad;
-			}
-			for (i=0; i<readlen; i += sizeof(unsigned long)) {
-				/* It's OK. We know it's properly aligned */
-				unsigned long datum = *(unsigned long *)(&ebuf[i]);
-				if (datum + 1) {
-					bad_offset += i;
-					printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", datum, bad_offset);
-				bad: 
-					if ((!jffs2_cleanmarker_oob(c)) && (c->cleanmarker_size > 0))
-						jffs2_free_raw_node_ref(marker_ref);
-					kfree(ebuf);
-				bad2:
-					spin_lock(&c->erase_completion_lock);
-					/* Stick it on a list (any list) so
-					   erase_failed can take it right off
-					   again.  Silly, but shouldn't happen
-					   often. */
-					list_add(&jeb->list, &c->erasing_list);
-					spin_unlock(&c->erase_completion_lock);
-					jffs2_erase_failed(c, jeb, bad_offset);
-					return;
-				}
+		ret = jffs2_flash_read(c, ofs, readlen, &retlen, ebuf);
+		if (ret) {
+			printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
+			goto fail;
+		}
+		if (retlen != readlen) {
+			printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen);
+			goto fail;
+		}
+		for (i=0; i<readlen; i += sizeof(unsigned long)) {
+			/* It's OK. We know it's properly aligned */
+			unsigned long *datum = ebuf + i;
+			if (*datum + 1) {
+				*bad_offset += i;
+				printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", *datum, *bad_offset);
+				goto fail;
 			}
-			ofs += readlen;
-			cond_resched();
 		}
-		kfree(ebuf);
+		ofs += readlen;
+		cond_resched();
 	}
+	ret = 0;
+fail:
+	kfree(ebuf);
+	return ret;
+}
 
-	bad_offset = jeb->offset;
+static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	struct jffs2_raw_node_ref *marker_ref = NULL;
+	size_t retlen;
+	int ret;
+	uint32_t bad_offset;
+
+	switch (jffs2_block_check_erase(c, jeb, &bad_offset)) {
+	case -EAGAIN:	goto refile;
+	case -EIO:	goto filebad;
+	}
 
 	/* Write the erase complete marker */	
 	D1(printk(KERN_DEBUG "Writing erased marker to block at 0x%08x\n", jeb->offset));
-	if (jffs2_cleanmarker_oob(c)) {
+	bad_offset = jeb->offset;
 
-		if (jffs2_write_nand_cleanmarker(c, jeb))
-			goto bad2;
-			
-		jeb->first_node = jeb->last_node = NULL;
+	/* Cleanmarker in oob area or no cleanmarker at all ? */
+	if (jffs2_cleanmarker_oob(c) || c->cleanmarker_size == 0) {
 
-		jeb->free_size = c->sector_size;
-		jeb->used_size = 0;
-		jeb->dirty_size = 0;
-		jeb->wasted_size = 0;
-	} else if (c->cleanmarker_size == 0) {
-		jeb->first_node = jeb->last_node = NULL;
+		if (jffs2_cleanmarker_oob(c)) {
+			if (jffs2_write_nand_cleanmarker(c, jeb))
+				goto filebad;
+		}
 
+		jeb->first_node = jeb->last_node = NULL;
 		jeb->free_size = c->sector_size;
 		jeb->used_size = 0;
 		jeb->dirty_size = 0;
 		jeb->wasted_size = 0;
+
 	} else {
+
 		struct kvec vecs[1];
 		struct jffs2_unknown_node marker = {
 			.magic =	cpu_to_je16(JFFS2_MAGIC_BITMASK),
@@ -401,21 +387,28 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 			.totlen =	cpu_to_je32(c->cleanmarker_size)
 		};
 
+		marker_ref = jffs2_alloc_raw_node_ref();
+		if (!marker_ref) {
+			printk(KERN_WARNING "Failed to allocate raw node ref for clean marker. Refiling\n");
+			goto refile;
+		}
+
 		marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
 
 		vecs[0].iov_base = (unsigned char *) &marker;
 		vecs[0].iov_len = sizeof(marker);
 		ret = jffs2_flash_direct_writev(c, vecs, 1, jeb->offset, &retlen);
 		
-		if (ret) {
-			printk(KERN_WARNING "Write clean marker to block at 0x%08x failed: %d\n",
-			       jeb->offset, ret);
-			goto bad2;
-		}
-		if (retlen != sizeof(marker)) {
-			printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n",
-			       jeb->offset, sizeof(marker), retlen);
-			goto bad2;
+		if (ret || retlen != sizeof(marker)) {
+			if (ret)
+				printk(KERN_WARNING "Write clean marker to block at 0x%08x failed: %d\n",
+				       jeb->offset, ret);
+			else
+				printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n",
+				       jeb->offset, sizeof(marker), retlen);
+
+			jffs2_free_raw_node_ref(marker_ref);
+			goto filebad;
 		}
 
 		marker_ref->next_in_ino = NULL;
@@ -444,5 +437,22 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 	c->nr_free_blocks++;
 	spin_unlock(&c->erase_completion_lock);
 	wake_up(&c->erase_wait);
-}
+	return;
+
+filebad:
+	spin_lock(&c->erase_completion_lock);
+	/* Stick it on a list (any list) so erase_failed can take it
+	   right off again.  Silly, but shouldn't happen often. */
+	list_add(&jeb->list, &c->erasing_list);
+	spin_unlock(&c->erase_completion_lock);
+	jffs2_erase_failed(c, jeb, bad_offset);
+	return;
 
+refile:
+	/* Stick it back on the list from whence it came and come back later */
+	jffs2_erase_pending_trigger(c);
+	spin_lock(&c->erase_completion_lock);
+	list_add(&jeb->list, &c->erase_complete_list);
+	spin_unlock(&c->erase_completion_lock);
+	return;
+}
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 1d2ad15f1533..9709fac6531d 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -1,21 +1,18 @@
 ToDo/Notes:
 	- Find and fix bugs.
-	- Checkpoint or disable the user space journal ($UsnJrnl).
 	- In between ntfs_prepare/commit_write, need exclusion between
-	  simultaneous file extensions. Need perhaps an NInoResizeUnderway()
-	  flag which we can set in ntfs_prepare_write() and clear again in
-	  ntfs_commit_write(). Just have to be careful in readpage/writepage,
-	  as well as in truncate, that we play nice... We might need to have
-	  a data_size field in the ntfs_inode to store the real attribute
-	  length. Also need to be careful with initialized_size extention in
+	  simultaneous file extensions.  This is given to us by holding i_sem
+	  on the inode.  The only places in the kernel when a file is resized
+	  are prepare/commit write and truncate for both of which i_sem is
+	  held.  Just have to be careful in readpage/writepage and all other
+	  helpers not running under i_sem that we play nice...
+	  Also need to be careful with initialized_size extension in
 	  ntfs_prepare_write. Basically, just be _very_ careful in this code...
-	  OTOH, perhaps i_sem, which is held accross generic_file_write is
-	  sufficient for synchronisation here. We then just need to make sure
-	  ntfs_readpage/writepage/truncate interoperate properly with us.
-	  UPDATE: The above is all ok as it is due to i_sem held.  The only
-	  thing that needs to be checked is ntfs_writepage() which does not
-	  hold i_sem.  It cannot change i_size but it needs to cope with a
-	  concurrent i_size change.
+	  UPDATE: The only things that need to be checked are read/writepage
+	  which do not hold i_sem.  Note writepage cannot change i_size but it
+	  needs to cope with a concurrent i_size change, just like readpage.
+	  Also both need to cope with concurrent changes to the other sizes,
+	  i.e. initialized/allocated/compressed size, as well.
 	- Implement mft.c::sync_mft_mirror_umount().  We currently will just
 	  leave the volume dirty on umount if the final iput(vol->mft_ino)
 	  causes a write of any mirrored mft records due to the mft mirror
@@ -25,12 +22,158 @@ ToDo/Notes:
 	- Enable the code for setting the NT4 compatibility flag when we start
 	  making NTFS 1.2 specific modifications.
 
-2.1.23-WIP
+2.1.23 - Implement extension of resident files and make writing safe as well as
+	 many bug fixes, cleanups, and enhancements...
 
 	- Add printk rate limiting for ntfs_warning() and ntfs_error() when
 	  compiled without debug.  This avoids a possible denial of service
 	  attack.  Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
 	  out.
+	- Fix compilation warnings on ia64.  (Randy Dunlap)
+	- Use i_size_{read,write}() instead of reading i_size by hand and cache
+	  the value where appropriate.
+	- Add size_lock to the ntfs_inode structure.  This is an rw spinlock
+	  and it locks against access to the inode sizes.  Note, ->size_lock
+	  is also accessed from irq context so you must use the _irqsave and
+	  _irqrestore lock and unlock functions, respectively.  Protect all
+	  accesses to allocated_size, initialized_size, and compressed_size.
+	- Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
+	- Implement extension of resident files in the regular file write code
+	  paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()).  At present
+	  this only works until the data attribute becomes too big for the mft
+	  record after which we abort the write returning -EOPNOTSUPP from
+	  ntfs_prepare_write().
+	- Add disable_sparse mount option together with a per volume sparse
+	  enable bit which is set appropriately and a per inode sparse disable
+	  bit which is preset on some system file inodes as appropriate.
+	- Enforce that sparse support is disabled on NTFS volumes pre 3.0.
+	- Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
+	  the creation of the unmapped runlist element for the base attribute
+	  extent.
+	- Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
+	  helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
+	  This allows us to map runlist fragments with the runlist lock already
+	  held without having to drop and reacquire it around the call.  Adapt
+	  all callers.
+	- Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
+	  runlist.  This allows us to find runlist elements with the runlist
+	  lock already held without having to drop and reacquire it around the
+	  call.  Adapt all callers.
+	- Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
+	  warning in the do_div() call on sparc32.  Thanks to Meelis Roos for
+	  the report and analysis of the warning.
+	- Fix a nasty runlist merge bug when merging two holes.
+	- Set the ntfs_inode->allocated_size to the real allocated size in the
+	  mft record for resident attributes (fs/ntfs/inode.c).
+	- Small readability cleanup to use "a" instead of "ctx->attr"
+	  everywhere (fs/ntfs/inode.c).
+	- Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
+	  definition of ntfs_export_ops from fs/ntfs/super.c to namei.c.  Also,
+	  declare ntfs_export_ops in fs/ntfs/ntfs.h.
+	- Correct sparse file handling.  The compressed values need to be
+	  checked and set in the ntfs inode as done for compressed files and
+	  the compressed size needs to be used for vfs inode->i_blocks instead
+	  of the allocated size, again, as done for compressed files.
+	- Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
+	  non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
+	- Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
+	  write code.
+	- Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
+	  dropping the read lock and taking the write lock we were not checking
+	  whether someone else did not already do the work we wanted to do.
+	- Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
+	  ntfs_attr_find_vcn_nolock() and update all callers.
+	- Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
+	- Fix sign of various error return values to be negative in
+	  fs/ntfs/lcnalloc.c.
+	- Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
+	  handle the case where an attribute is converted from resident to
+	  non-resident by a concurrent file write.
+	- Remove checks for NULL before calling kfree() since kfree() does the
+	  checking itself.  (Jesper Juhl)
+	- Some utilities modify the boot sector but do not update the checksum.
+	  Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
+	  only emit a warning when the checksum is incorrect rather than
+	  refusing the mount.  Thanks to Bernd Casimir for pointing this
+	  problem out.
+	- Update attribute definition handling.
+	- Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
+	- Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
+	- Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
+	  better code generation and one less sparse warning in fs/ntfs/aops.c.
+	- Remove spurious void pointer casts from fs/ntfs/.  (Pekka Enberg)
+	- Use C99 style structure initialization after memory allocation where
+	  possible (fs/ntfs/{attrib.c,index.c,super.c}).  Thanks to Al Viro and
+	  Pekka Enberg.
+	- Stamp the transaction log ($UsnJrnl), aka user space journal, if it
+	  is active on the volume and we are mounting read-write or remounting
+	  from read-only to read-write.
+	- Fix a bug in address space operations error recovery code paths where
+	  if the runlist was not mapped at all and a mapping error occurred we
+	  would leave the runlist locked on exit to the function so that the
+	  next access to the same file would try to take the lock and deadlock.
+	- Detect the case when Windows has been suspended to disk on the volume
+	  to be mounted and if this is the case do not allow (re)mounting
+	  read-write.  This is done by parsing hiberfil.sys if present.
+	- Fix several occurrences of a bug where we would perform 'var & ~const'
+	  with a 64-bit variable and an int, i.e. 32-bit, constant.  This causes
+	  the higher order 32-bits of the 64-bit variable to be zeroed.  To fix
+	  this cast the 'const' to the same 64-bit type as 'var'.
+	- Change the runlist terminator of the newly allocated cluster(s) to
+	  LCN_ENOENT in ntfs_attr_make_non_resident().  Otherwise the runlist
+	  code gets confused.
+	- Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
+	  and ntfs_mapping_pairs_build() to allow the runlist encoding to be
+	  partial which is desirable when filling holes in sparse attributes.
+	  Update all callers.
+	- Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
+	  if the requested vcn is inside it.  Otherwise we get into problems
+	  when we try to map an out of bounds vcn because we then try to map
+	  the already mapped runlist fragment which causes
+	  ntfs_mapping_pairs_decompress() to fail and return error.  Update
+	  ntfs_attr_find_vcn_nolock() accordingly.
+	- Fix a nasty deadlock that appeared in recent kernels.
+	  The situation: VFS inode X on a mounted ntfs volume is dirty.  For
+	  same inode X, the ntfs_inode is dirty and thus corresponding on-disk
+	  inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
+	  to the table of inodes, i.e. $MFT, inode 0.
+	  What happens:
+	  Process 1: sys_sync()/umount()/whatever...  calls
+	  __sync_single_inode() for $MFT -> do_writepages() -> write_page for
+	  the dirty page containing the on-disk inode X, the page is now locked
+	  -> ntfs_write_mst_block() which clears PageUptodate() on the page to
+	  prevent anyone else getting hold of it whilst it does the write out.
+	  This is necessary as the on-disk inode needs "fixups" applied before
+	  the write to disk which are removed again after the write and
+	  PageUptodate is then set again.  It then analyses the page looking
+	  for dirty on-disk inodes and when it finds one it calls
+	  ntfs_may_write_mft_record() to see if it is safe to write this
+	  on-disk inode.  This then calls ilookup5() to check if the
+	  corresponding VFS inode is in icache().  This in turn calls ifind()
+	  which waits on the inode lock via wait_on_inode whilst holding the
+	  global inode_lock.
+	  Process 2: pdflush results in a call to __sync_single_inode for the
+	  same VFS inode X on the ntfs volume.  This locks the inode (I_LOCK)
+	  then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
+	  read_cache_page() for the page (in page cache of table of inodes
+	  $MFT, inode 0) containing the on-disk inode.  This page has
+	  PageUptodate() clear because of Process 1 (see above) so
+	  read_cache_page() blocks when it tries to take the page lock for the
+	  page so it can call ntfs_read_page().
+	  Thus Process 1 is holding the page lock on the page containing the
+	  on-disk inode X and it is waiting on the inode X to be unlocked in
+	  ifind() so it can write the page out and then unlock the page.
+	  And Process 2 is holding the inode lock on inode X and is waiting for
+	  the page to be unlocked so it can call ntfs_readpage() or discover
+	  that Process 1 set PageUptodate() again and use the page.
+	  Thus we have a deadlock due to ifind() waiting on the inode lock.
+	  The solution: The fix is to use the newly introduced
+	  ilookup5_nowait() which does not wait on the inode's lock and hence
+	  avoids the deadlock.  This is safe as we do not care about the VFS
+	  inode and only use the fact that it is in the VFS inode cache and the
+	  fact that the vfs and ntfs inodes are one struct in memory to find
+	  the ntfs inode in memory if present.  Also, the ntfs inode has its
+	  own locking so it does not matter if the vfs inode is locked.
 
 2.1.22 - Many bug and race fixes and error handling improvements.
 
@@ -1037,7 +1180,7 @@ tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
 	- Further runlist merging work. (Richard Russon)
 	- Backwards compatibility for gcc-2.95. (Richard Russon)
 	- Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
-	- Convert to new file system declaration using ->ntfs_get_sb() and
+	- Convert to new filesystem declaration using ->ntfs_get_sb() and
 	  replacing ntfs_read_super() with ntfs_fill_super().
 	- Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
 	  overflow on 32-bit architectures.
@@ -1333,7 +1476,7 @@ tng-0.0.1 - The first useful version.
 	The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
 	though and it doesn't implement accesssing compressed files yet. Also,
 	accessing files with attribute list attributes is not implemented yet
-	either. But for small or simple file systems it should work and allow
+	either. But for small or simple filesystems it should work and allow
 	you to list directories, use stat on directory entries and the file
 	system, open, read, mmap and llseek around in files. A big mile stone
 	has been reached!
@@ -1341,7 +1484,7 @@ tng-0.0.1 - The first useful version.
 tng-0.0.0 - Initial version tag.
 
 	Initial driver implementation. The driver can mount and umount simple
-	NTFS file systems (i.e. ones without attribute lists in the system
+	NTFS filesystems (i.e. ones without attribute lists in the system
 	files). If the mount fails there might be problems in the error handling
 	code paths, so be warned. Otherwise it seems to be loading the system
 	files nicely and the mft record read mapping/unmapping seems to be
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 7b66381a0b0f..f083f27d8b69 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
 	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
 	     unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.22\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.23\"
 
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
@@ -15,5 +15,5 @@ endif
 ifeq ($(CONFIG_NTFS_RW),y)
 EXTRA_CFLAGS += -DNTFS_RW
 
-ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o
+ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
 endif
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 45d56e41ed98..3f43bfe6184e 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -2,7 +2,7 @@
  * aops.c - NTFS kernel address space operations and page cache handling.
  *	    Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -66,19 +66,22 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	ni = NTFS_I(page->mapping->host);
 
 	if (likely(uptodate)) {
-		s64 file_ofs;
+		s64 file_ofs, initialized_size;
 
 		set_buffer_uptodate(bh);
 
 		file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
 				bh_offset(bh);
+		read_lock_irqsave(&ni->size_lock, flags);
+		initialized_size = ni->initialized_size;
+		read_unlock_irqrestore(&ni->size_lock, flags);
 		/* Check for the current buffer head overflowing. */
-		if (file_ofs + bh->b_size > ni->initialized_size) {
+		if (file_ofs + bh->b_size > initialized_size) {
 			char *addr;
 			int ofs = 0;
 
-			if (file_ofs < ni->initialized_size)
-				ofs = ni->initialized_size - file_ofs;
+			if (file_ofs < initialized_size)
+				ofs = initialized_size - file_ofs;
 			addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
 			memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
 			flush_dcache_page(page);
@@ -132,7 +135,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 					i * rec_size), rec_size);
 		flush_dcache_page(page);
 		kunmap_atomic(addr, KM_BIO_SRC_IRQ);
-		if (likely(!PageError(page) && page_uptodate))
+		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
 	}
 	unlock_page(page);
@@ -168,6 +171,7 @@ static int ntfs_read_block(struct page *page)
 	runlist_element *rl;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
 	sector_t iblock, lblock, zblock;
+	unsigned long flags;
 	unsigned int blocksize, vcn_ofs;
 	int i, nr;
 	unsigned char blocksize_bits;
@@ -190,8 +194,10 @@ static int ntfs_read_block(struct page *page)
 	}
 
 	iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+	read_lock_irqsave(&ni->size_lock, flags);
 	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
 	zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;
+	read_unlock_irqrestore(&ni->size_lock, flags);
 
 	/* Loop through all the buffers in the page. */
 	rl = NULL;
@@ -258,7 +264,8 @@ lock_retry_remap:
 					goto lock_retry_remap;
 				rl = NULL;
 				lcn = err;
-			}
+			} else if (!rl)
+				up_read(&ni->runlist.lock);
 			/* Hard error, zero out region. */
 			bh->b_blocknr = -1;
 			SetPageError(page);
@@ -341,14 +348,15 @@ handle_zblock:
  */
 static int ntfs_readpage(struct file *file, struct page *page)
 {
-	loff_t i_size;
 	ntfs_inode *ni, *base_ni;
 	u8 *kaddr;
 	ntfs_attr_search_ctx *ctx;
 	MFT_RECORD *mrec;
+	unsigned long flags;
 	u32 attr_len;
 	int err = 0;
 
+retry_readpage:
 	BUG_ON(!PageLocked(page));
 	/*
 	 * This can potentially happen because we clear PageUptodate() during
@@ -383,9 +391,9 @@ static int ntfs_readpage(struct file *file, struct page *page)
 	 * Attribute is resident, implying it is not compressed or encrypted.
 	 * This also means the attribute is smaller than an mft record and
 	 * hence smaller than a page, so can simply zero out any pages with
-	 * index above 0.  We can also do this if the file size is 0.
+	 * index above 0.
 	 */
-	if (unlikely(page->index > 0 || !i_size_read(VFS_I(ni)))) {
+	if (unlikely(page->index > 0)) {
 		kaddr = kmap_atomic(page, KM_USER0);
 		memset(kaddr, 0, PAGE_CACHE_SIZE);
 		flush_dcache_page(page);
@@ -402,6 +410,14 @@ static int ntfs_readpage(struct file *file, struct page *page)
 		err = PTR_ERR(mrec);
 		goto err_out;
 	}
+	/*
+	 * If a parallel write made the attribute non-resident, drop the mft
+	 * record and retry the readpage.
+	 */
+	if (unlikely(NInoNonResident(ni))) {
+		unmap_mft_record(base_ni);
+		goto retry_readpage;
+	}
 	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
 	if (unlikely(!ctx)) {
 		err = -ENOMEM;
@@ -412,9 +428,10 @@ static int ntfs_readpage(struct file *file, struct page *page)
 	if (unlikely(err))
 		goto put_unm_err_out;
 	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
-	i_size = i_size_read(VFS_I(ni));
-	if (unlikely(attr_len > i_size))
-		attr_len = i_size;
+	read_lock_irqsave(&ni->size_lock, flags);
+	if (unlikely(attr_len > ni->initialized_size))
+		attr_len = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
 	kaddr = kmap_atomic(page, KM_USER0);
 	/* Copy the data to the page. */
 	memcpy(kaddr, (u8*)ctx->attr +
@@ -463,12 +480,15 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 {
 	VCN vcn;
 	LCN lcn;
+	s64 initialized_size;
+	loff_t i_size;
 	sector_t block, dblock, iblock;
 	struct inode *vi;
 	ntfs_inode *ni;
 	ntfs_volume *vol;
 	runlist_element *rl;
 	st
author	Dave Kleikamp <shaggy@austin.ibm.com>	2005-07-19 13:46:53 -0500
committer	Dave Kleikamp <shaggy@austin.ibm.com>	2005-07-19 13:46:53 -0500
commit	21d1ee8b375bcd180f1d6b8ccbb8d8f938596310 (patch)
tree	2e82b65c16a4aaa88eeb7dd9f47f2d1c418e77d0 /fs
parent	3d9b1cdd2455017c6aa25bc2442092b81438981f (diff)
parent	f60f700876cd51de9de69f3a3c865d95e287a24d (diff)