Merge branch 'linux-next' of git://git.infradead.org/ubifs-2.6

* 'linux-next' of git://git.infradead.org/ubifs-2.6:
  UBIFS: fix recovery bug
  UBIFS: add R/O compatibility
  UBIFS: fix compiler warnings
  UBIFS: fully sort GCed nodes
  UBIFS: fix commentaries
  UBIFS: introduce a helpful variable
  UBIFS: use KERN_CONT
  UBIFS: fix lprops committing bug
  UBIFS: fix bogus assertion
  UBIFS: fix bug where page is marked uptodate when out of space
  UBIFS: amend key_hash return value
  UBIFS: improve find function interface
  UBIFS: list usage cleanup
  UBIFS: fix dbg_chk_lpt_sz()
author: Linus Torvalds <torvalds@linux-foundation.org> 2009-04-06 15:00:19 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-04-06 15:00:19 -0700
commit: e0724bf6e4a1f2e678d2b2aab01cae22e17862f0 (patch)
tree: 559a8fa8e7a92f8ae0e0a27d4e71f408fa7cec62 /fs/ubifs
parent: 38d9aefb5ce8f26358b0d5cd933cfa9e267105b1 (diff)
parent: de0975781a1a8bc92e07eb7681d10ef9bb5e6df9 (diff)
17 files changed, 482 insertions, 265 deletions
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
 }
 
 /**
- * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
+ * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
  * @c: UBIFS file-system description object
  *
- * This function calculates and returns the number of eraseblocks which should
- * be kept for index usage.
+ * This function calculates and returns the number of LEBs which should be kept
+ * for index usage.
  */
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 {
-	int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
+	int idx_lebs;
 	long long idx_size;
 
 	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
-
 	/* And make sure we have thrice the index size of space reserved */
-	idx_size = idx_size + (idx_size << 1);
-
+	idx_size += idx_size << 1;
 	/*
 	 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
 	 * pair, nor similarly the two variables for the new index size, so we
 	 * have to do this costly 64-bit division on fast-path.
 	 */
-	idx_size += eff_leb_size - 1;
-	idx_lebs = div_u64(idx_size, eff_leb_size);
+	idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
 	/*
 	 * The index head is not available for the in-the-gaps method, so add an
 	 * extra LEB to compensate.
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
  * do_budget_space - reserve flash space for index and data growth.
  * @c: UBIFS file-system description object
  *
- * This function makes sure UBIFS has enough free eraseblocks for index growth
- * and data.
+ * This function makes sure UBIFS has enough free LEBs for index growth and
+ * data.
  *
  * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
  * would take if it was consolidated and written to the flash. This guarantees
  * that the "in-the-gaps" commit method always succeeds and UBIFS will always
  * be able to commit dirty index. So this function basically adds amount of
  * budgeted index space to the size of the current index, multiplies this by 3,
- * and makes sure this does not exceed the amount of free eraseblocks.
+ * and makes sure this does not exceed the amount of free LEBs.
  *
  * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
  * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
  *    be large, because UBIFS does not do any index consolidation as long as
  *    there is free space. IOW, the index may take a lot of LEBs, but the LEBs
  *    will contain a lot of dirt.
- * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be
- *   consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
+ *    the index may be consolidated to take up to @c->min_idx_lebs LEBs.
  *
  * This function returns zero in case of success, and %-ENOSPC in case of
  * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
  * This function calculates amount of free space to report to user-space.
  *
  * Because UBIFS may introduce substantial overhead (the index, node headers,
- * alignment, wastage at the end of eraseblocks, etc), it cannot report real
- * amount of free flash space it has (well, because not all dirty space is
- * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
- * it would bread user expectations about what free space is. Users seem to
- * accustomed to assume that if the file-system reports N bytes of free space,
- * they would be able to fit a file of N bytes to the FS. This almost works for
+ * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
+ * free flash space it has (well, because not all dirty space is reclaimable,
+ * UBIFS does not actually know the real amount). If UBIFS did so, it would
+ * break user expectations about what free space is. Users seem to be
+ * accustomed to assume that if the file-system reports N bytes of free space,
+ * they would be able to fit a file of N bytes to the FS. This almost works for
  * traditional file-systems, because they have way less overhead than UBIFS.
  * So, to keep users happy, UBIFS tries to take the overhead into account.
  */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 					  "bad or corrupted node)");
 		else {
 			for (i = 0; i < nlen && dent->name[i]; i++)
-				printk("%c", dent->name[i]);
+				printk(KERN_CONT "%c", dent->name[i]);
 		}
-		printk("\n");
+		printk(KERN_CONT "\n");
 
 		break;
 	}
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
 
 			/*
 			 * Make sure the last key in our znode is less or
-			 * equivalent than the the key in zbranch which goes
+			 * equivalent than the key in the zbranch which goes
 			 * after our pointing zbranch.
 			 */
 			cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e51..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	int skipped_read = 0;
 	struct page *page;
 
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 
 	if (!PageUptodate(page)) {
 		/* The page is not loaded from the flash */
-		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
 			/*
 			 * We change whole page so no need to load it. But we
 			 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 			 * the media.
 			 */
 			SetPageChecked(page);
-		else {
+			skipped_read = 1;
+		} else {
 			err = do_readpage(page);
 			if (err) {
 				unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(err)) {
 		ubifs_assert(err == -ENOSPC);
 		/*
+		 * If we skipped reading the page because we were going to
+		 * write all of it, then it is not up to date.
+		 */
+		if (skipped_read) {
+			ClearPageChecked(page);
+			ClearPageUptodate(page);
+		}
+		/*
 		 * Budgeting failed which means it would have to force
 		 * write-back but didn't, because we set the @fast flag in the
 		 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
  * whole index and correct all inode sizes, which is long an unacceptable.
  *
  * To prevent situations like this, UBIFS writes pages back only if they are
- * within last synchronized inode size, i.e. the the size which has been
+ * within the last synchronized inode size, i.e. the size which has been
  * written to the flash media last time. Otherwise, UBIFS forces inode
  * write-back, thus making sure the on-flash inode contains current inode size,
  * and then keeps writing pages back.
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
  * ubifs_find_free_space - find a data LEB with free space.
  * @c: the UBIFS file-system description object
  * @min_space: minimum amount of required free space
- * @free: contains amount of free space in the LEB on exit
+ * @offs: contains offset of where free space starts on exit
  * @squeeze: whether to try to find space in a non-empty LEB first
  *
  * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
  * failed to find a LEB with @min_space bytes of free space and other a negative
  * error codes in case of failure.
  */
-int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
 			  int squeeze)
 {
 	const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 		spin_unlock(&c->space_lock);
 	}
 
-	*free = lprops->free;
+	*offs = c->leb_size - lprops->free;
 	ubifs_release_lprops(c);
 
-	if (*free == c->leb_size) {
+	if (*offs == 0) {
 		/*
 		 * Ensure that empty LEBs have been unmapped. They may not have
 		 * been, for example, because of an unclean unmount.  Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 			return err;
 	}
 
-	dbg_find("found LEB %d, free %d", lnum, *free);
-	ubifs_assert(*free >= min_space);
+	dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
+	ubifs_assert(*offs <= c->leb_size - min_space);
 	return lnum;
 
 out:
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
  * have to waste large pieces of free space at the end of LEB B, because nodes
  * from LEB A would not fit. And the worst situation is when all nodes are of
  * maximum size. So dark watermark is the amount of free + dirty space in LEB
- * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
+ * which are guaranteed to be reclaimable. If LEB has less space, the GC might
  * be unable to reclaim it. So, LEBs with free + dirty greater than dark
  * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
  * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
 #include "ubifs.h"
 
 /*
- * GC tries to optimize the way it fit nodes to available space, and it sorts
- * nodes a little. The below constants are watermarks which define "large",
- * "medium", and "small" nodes.
- */
-#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
-#define SMALL_NODE_WM  UBIFS_MAX_DENT_NODE_SZ
-
-/*
  * GC may need to move more than one LEB to make progress. The below constants
  * define "soft" and "hard" limits on the number of LEBs the garbage collector
  * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
 }
 
 /**
- * joinup - bring data nodes for an inode together.
- * @c: UBIFS file-system description object
- * @sleb: describes scanned LEB
- * @inum: inode number
- * @blk: block number
- * @data: list to which to add data nodes
+ * list_sort - sort a list.
+ * @priv: private data, passed to @cmp
+ * @head: the list to sort
+ * @cmp: the elements comparison function
  *
- * This function looks at the first few nodes in the scanned LEB @sleb and adds
- * them to @data if they are data nodes from @inum and have a larger block
- * number than @blk. This function returns %0 on success and a negative error
- * code on failure.
+ * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
+ * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
+ * in ascending order.
+ *
+ * The comparison function @cmp is supposed to return a negative value if @a is
+ * less than @b, and a positive value if @a is greater than @b. If @a and @b
+ * are equivalent, then it does not matter what this function returns.
  */
-static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
-		  unsigned int blk, struct list_head *data)
+static void list_sort(void *priv, struct list_head *head,
+		      int (*cmp)(void *priv, struct list_head *a,
+				 struct list_head *b))
 {
-	int err, cnt = 6, lnum = sleb->lnum, offs;
-	struct ubifs_scan_node *snod, *tmp;
-	union ubifs_key *key;
+	struct list_head *p, *q, *e, *list, *tail, *oldhead;
+	int insize, nmerges, psize, qsize, i;
+
+	if (list_empty(head))
+		return;
+
+	list = head->next;
+	list_del(head);
+	insize = 1;
+	for (;;) {
+		p = oldhead = list;
+		list = tail = NULL;
+		nmerges = 0;
+
+		while (p) {
+			nmerges++;
+			q = p;
+			psize = 0;
+			for (i = 0; i < insize; i++) {
+				psize++;
+				q = q->next == oldhead ? NULL : q->next;
+				if (!q)
+					break;
+			}
 
-	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-		key = &snod->key;
-		if (key_inum(c, key) == inum &&
-		    key_type(c, key) == UBIFS_DATA_KEY &&
-		    key_block(c, key) > blk) {
-			offs = snod->offs;
-			err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
-			if (err < 0)
-				return err;
-			list_del(&snod->list);
-			if (err) {
-				list_add_tail(&snod->list, data);
-				blk = key_block(c, key);
-			} else
-				kfree(snod);
-			cnt = 6;
-		} else if (--cnt == 0)
+			qsize = insize;
+			while (psize > 0 || (qsize > 0 && q)) {
+				if (!psize) {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				} else if (!qsize || !q) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else if (cmp(priv, p, q) <= 0) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				}
+				if (tail)
+					tail->next = e;
+				else
+					list = e;
+				e->prev = tail;
+				tail = e;
+			}
+			p = q;
+		}
+
+		tail->next = list;
+		list->prev = tail;
+
+		if (nmerges <= 1)
 			break;
+
+		insize *= 2;
 	}
-	return 0;
+
+	head->next = list;
+	head->prev = list->prev;
+	list->prev->next = head;
+	list->prev = head;
 }
 
 /**
- * move_nodes - move nodes.
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @b: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		unsigned int blka = key_block(c, &sa->key);
+		unsigned int blkb = key_block(c, &sb->key);
+
+		if (blka <= blkb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @b: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and are sorted by length in descending order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash value order.
+ */
+int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	int typea, typeb;
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+	typea = key_type(c, &sa->key);
+	typeb = key_type(c, &sb->key);
+	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+
+	/* Inodes go before directory entries */
+	if (typea == UBIFS_INO_KEY) {
+		if (typeb == UBIFS_INO_KEY)
+			return sb->len - sa->len;
+		return -1;
+	}
+	if (typeb == UBIFS_INO_KEY)
+		return 1;
+
+	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		uint32_t hasha = key_hash(c, &sa->key);
+		uint32_t hashb = key_hash(c, &sb->key);
+
+		if (hasha <= hashb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * sort_nodes - sort nodes for GC.
  * @c: UBIFS file-system description object
- * @sleb: describes nodes to move
+ * @sleb: describes nodes to sort and contains the result on exit
+ * @nondata: contains non-data nodes on exit
+ * @min: minimum node size is returned here
  *
- * This function moves valid nodes from data LEB described by @sleb to the GC
- * journal head. The obsolete nodes are dropped.
+ * This function sorts the list of nodes to garbage collect. First of all, it
+ * kills obsolete nodes and separates data and non-data nodes to the
+ * @sleb->nodes and @nondata lists correspondingly.
+ *
+ * Data nodes are then sorted in block number order - this is important for
+ * bulk-read; data nodes with lower inode number go before data nodes with
+ * higher inode number, and data nodes with lower block number go before data
+ * nodes with higher block number.
  *
- * When moving nodes we have to deal with classical bin-packing problem: the
- * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
- * where the nodes in the @sleb->nodes list are the elements which should be
- * fit optimally to the bins. This function uses the "first fit decreasing"
- * strategy, although it does not really sort the nodes but just split them on
- * 3 classes - large, medium, and small, so they are roughly sorted.
+ * Non-data nodes are sorted as follows.
+ *   o First go inode nodes - they are sorted in descending length order.
+ *   o Then go directory entry nodes - they are sorted in hash order, which
+ *     should supposedly optimize 'readdir()'. Direntry nodes with lower parent
+ *     inode number go before direntry nodes with higher parent inode number,
+ *     and direntry nodes with lower name hash values go before direntry nodes
+ *     with higher name hash values.
  *
- * This function returns zero in case of success, %-EAGAIN if commit is
- * required, and other negative error codes in case of other failures.
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
  */
-static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		      struct list_head *nondata, int *min)
 {
 	struct ubifs_scan_node *snod, *tmp;
-	struct list_head data, large, medium, small;
-	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
-	int avail, err, min = INT_MAX;
-	unsigned int blk = 0;
-	ino_t inum = 0;
 
-	INIT_LIST_HEAD(&data);
-	INIT_LIST_HEAD(&large);
-	INIT_LIST_HEAD(&medium);
-	INIT_LIST_HEAD(&small);
+	*min = INT_MAX;
 
-	while (!list_empty(&sleb->nodes)) {
-		struct list_head *lst = sleb->nodes.next;
-
-		snod = list_entry(lst, struct ubifs_scan_node, list);
+	/* Separate data nodes and non-data nodes */
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		int err;
 
 		ubifs_assert(snod->type != UBIFS_IDX_NODE);
 		ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
 					 snod->offs, 0);
 		if (err < 0)
-			goto out;
+			return err;
 
-		list_del(lst);
 		if (!err) {
 			/* The node is obsolete, remove it from the list */
+			list_del(&snod->list);
 			kfree(snod);
 			continue;
 		}
 
-		/*
-		 * Sort the list of nodes so that data nodes go first, large
-		 * nodes go second, and small nodes go last.
-		 */
-		if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
-			if (inum != key_inum(c, &snod->key)) {
-				if (inum) {
-					/*
-					 * Try to move data nodes from the same
-					 * inode together.
-					 */
-					err = joinup(c, sleb, inum, blk, &data);
-					if (err)
-						goto out;
-				}
-				inum = key_inum(c, &snod->key);
-				blk = key_block(c, &snod->key);
-			}
-			list_add_tail(lst, &data);
-		} else if (snod->len > MEDIUM_NODE_WM)
-			list_add_tail(lst, &large);
-		else if (snod->len > SMALL_NODE_WM)
-			list_add_tail(lst, &medium);
-		else
-			list_add_tail(lst, &small);
-
-		/* And find the smallest node */
-		if (snod->len < min)
-			min = snod->len;
+		if (snod->len < *min)
+			*min = snod->len;
+
+		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
+			list_move_tail(&snod->list, nondata);
 	}
 
-	/*
-	 * Join the tree lists so that we'd have one roughly sorted list
-	 * ('large' will be the head of the joined list).
-	 */
-	list_splice(&data, &large);
-	list_splice(&medium, large.prev);
-	list_splice(&small, large.prev);
+	/* Sort data and non-data nodes */
+	list_sort(c, &sleb->nodes, &data_nodes_cmp);
+	list_sort(c, nondata, &nondata_nodes_cmp);
+	return 0;
+}
+
+/**
+ * move_node - move a node.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ * @snod: the node to move
+ * @wbuf: write-buffer to move node to
+ *
+ * This function moves node @snod to @wbuf, changes TNC correspondingly, and
+ * destroys @snod. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
+{
+	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
+
+	cond_resched();
+	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
+	if (err)
+		return err;
+
+	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+				snod->offs, new_lnum, new_offs,
+				snod->len);
+	list_del(&snod->list);
+	kfree(snod);
+	return err;
+}
+
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. This function returns zero in case of success, %-EAGAIN if
+ * commit is required, and other negative error codes in case of other
+ * failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+	int err, min;
+	LIST_HEAD(nondata);
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 
 	if (wbuf->lnum == -1) {
 		/*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		 */
 		err = switch_gc_head(c);
 		if (err)
-			goto out;
+			return err;
 	}
 
+	err = sort_nodes(c, sleb, &nondata, &min);
+	if (err)
+		goto out;
+
 	/* Write nodes to their new location. Use the first-fit strategy */
 	while (1) {
-		avail = c->leb_size - wbuf->offs - wbuf->used;
-		list_for_each_entry_safe(snod, tmp, &large, list) {
-			int new_lnum, new_offs;
+		int avail;
+		struct ubifs_scan_node *snod, *tmp;
+
+		/* Move data nodes */
+		list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
+			if  (snod->len > avail)
+				/*
+				 * Do not skip data nodes in order to optimize
+				 * bulk-read.
+				 */
+				break;
+
+			err = move_node(c, sleb, snod, wbuf);
+			if (err)
+				goto out;
+		}
 
+		/* Move non-data nodes */
+		list_for_each_entry_safe(snod, tmp, &nondata, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
 			if (avail < min)
 				break;
 
-			if (snod->len > avail)
-				/* This node does not fit */
+			if  (snod->len > avail) {
+				/*
+				 * Keep going only if this is an inode with
+				 * some data. Otherwise stop and switch the GC
+				 * head. IOW, we assume that data-less inode
+				 * nodes and direntry nodes are roughly of the
+				 * same size.
+				 */
+				if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+				    snod->len == UBIFS_INO_NODE_SZ)
+					break;
 				continue;
+			}
 
-			cond_resched();
-
-			new_lnum = wbuf->lnum;
-			new_offs = wbuf->offs + wbuf->used;
-			err = ubifs_wbuf_write_nolock(wbuf, snod->node,
-						      snod->len);
+			err = move_node(c, sleb, snod, wbuf);
 			if (err)
 				goto out;
-			err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
-						snod->offs, new_lnum, new_offs,
-						snod->len);
-			if (err)
-				goto out;
-
-			avail = c->leb_size - wbuf->offs - wbuf->used;
-			list_del(&snod->list);
-			kfree(snod);
 		}
 
-		if (list_empty(&large))
+		if (list_empty(&sleb->nodes) && list_empty(&nondata))
 			break;
 
 		/*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 	return 0;
 
 out:
-	list_for_each_entry_safe(snod, tmp, &large, list) {
-		list_del(&snod->list);
-		kfree(snod);
-	}
+	list_splice_tail(&nondata, &sleb->nodes);
 	return err;
 }
 
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
  */
 static int reserve_space(struct ubifs_info *c, int jhead, int len)
 {
-	int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
+	int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
 	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
 
 	/*
@@ -139,10 +139,9 @@ again:
 	 * Write buffer wasn't seek'ed or there is no enough space - look for an
 	 * LEB with some empty space.
 	 */
-	lnum = ubifs_find_free_space(c, len, &free, squeeze);
+	lnum = ubifs_find_free_space(c, len, &offs, squeeze);
 	if (lnum >= 0) {
 		/* Found an LEB, add it to the journal head */
-		offs = c->leb_size - free;
 		err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
 		if (err)
 			goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
  * @host: host inode
  *
  * This function writes the updated version of an extended attribute inode and
- * the host inode tho the journal (to the base head). The host inode is written
+ * the host inode to the journal (to the base head). The host inode is written
  * after the extended attribute inode in order to guarantee that the extended
  * attribute will be flushed when the inode is synchronized by 'fsync()' and
  * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
  * @c: UBIFS file-system description object
  * @key: the key to get hash from
  */
-static inline int key_hash(const struct ubifs_info *c,
-			   const union ubifs_key *key)
+static inline uint32_t key_hash(const struct ubifs_info *c,
+				const union ubifs_key *key)
 {
 	return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
 }
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
  * @c: UBIFS file-system description object
  * @k: the key to get hash from
  */
-static inline int key_hash_flash(const struct ubifs_info *c, const void *k)
+static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
 {
 	const union ubifs_key *key = k;
 
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 	}
 
 	/*
-	 * Make sure the the amount of space in buds will not exceed
+	 * Make sure the amount of space in buds will not exceed the
 	 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
 	 * limits.
 	 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
 				bud->jhead, c->leb_size - bud->start,
 				c->cmt_bud_bytes);
 			rb_erase(p1, &c->buds);
-			list_del(&bud->list);
 			/*
 			 * If the commit does not finish, the recovery will need
 			 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
 			 * commit i.e. do not allow them to be garbage
 			 * collected.
 			 */
-			list_add(&bud->list, &c->old_buds);
+			list_move(&bud->list, &c->old_buds);
 		}
 	}
 	spin_unlock(&c->buds_lock);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		while (offs + len > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->lsave_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->ltab_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
 						       alen, UBI_SHORTTERM);
 				if (err)
 					return err;
-				dbg_chk_lpt_sz(c, 4, alen - wlen);
 			}
-			dbg_chk_lpt_sz(c, 2, 0);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
-			offs = 0;
-			from = 0;
+			offs = from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 			err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
-			dbg_chk_lpt_sz(c, 2, alen - wlen);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
-			offs = 0;
+			offs = from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 			err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
-			dbg_chk_lpt_sz(c, 2, alen - wlen);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
author	Linus Torvalds <torvalds@linux-foundation.org>	2009-04-06 15:00:19 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-04-06 15:00:19 -0700
commit	e0724bf6e4a1f2e678d2b2aab01cae22e17862f0 (patch)
tree	559a8fa8e7a92f8ae0e0a27d4e71f408fa7cec62 /fs/ubifs
parent	38d9aefb5ce8f26358b0d5cd933cfa9e267105b1 (diff)
parent	de0975781a1a8bc92e07eb7681d10ef9bb5e6df9 (diff)