50 files changed, 695 insertions, 644 deletions
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 303c57a7fad9..8e63831971d5 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -263,14 +263,8 @@ A flag in the bio structure, BIO_BARRIER is used to identify a barrier i/o.
 The generic i/o scheduler would make sure that it places the barrier request and
 all other requests coming after it after all the previous requests in the
 queue. Barriers may be implemented in different ways depending on the
-driver. A SCSI driver for example could make use of ordered tags to
-preserve the necessary ordering with a lower impact on throughput. For IDE
-this might be two sync cache flush: a pre and post flush when encountering
-a barrier write.
-
-There is a provision for queues to indicate what kind of barriers they
-can provide. This is as of yet unmerged, details will be added here once it
-is in the kernel.
+driver. For more details regarding I/O barriers, please read barrier.txt
+in this directory.
 
 1.2.2 Request Priority/Latency
 
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 43fa20495688..8da3cf66894c 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -182,6 +182,9 @@ struct as_rq {
 
 static kmem_cache_t *arq_pool;
 
+static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
+static void as_antic_stop(struct as_data *ad);
+
 /*
  * IO Context helper functions
  */
@@ -370,7 +373,7 @@ static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
  * existing request against the same sector), which can happen when using
  * direct IO, then return the alias.
  */
-static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
+static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
 {
 	struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
 	struct rb_node *parent = NULL;
@@ -397,6 +400,16 @@ static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
 	return NULL;
 }
 
+static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
+{
+	struct as_rq *alias;
+
+	while ((unlikely(alias = __as_add_arq_rb(ad, arq)))) {
+		as_move_to_dispatch(ad, alias);
+		as_antic_stop(ad);
+	}
+}
+
 static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
 {
 	if (!ON_RB(&arq->rb_node)) {
@@ -1133,23 +1146,6 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
 	/*
 	 * take it off the sort and fifo list, add to dispatch queue
 	 */
-	while (!list_empty(&rq->queuelist)) {
-		struct request *__rq = list_entry_rq(rq->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_del(&__rq->queuelist);
-
-		elv_dispatch_add_tail(ad->q, __rq);
-
-		if (__arq->io_context && __arq->io_context->aic)
-			atomic_inc(&__arq->io_context->aic->nr_dispatched);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-		__arq->state = AS_RQ_DISPATCHED;
-
-		ad->nr_dispatched++;
-	}
-
 	as_remove_queued_request(ad->q, rq);
 	WARN_ON(arq->state != AS_RQ_QUEUED);
 
@@ -1326,49 +1322,12 @@ fifo_expired:
 }
 
 /*
- * Add arq to a list behind alias
- */
-static inline void
-as_add_aliased_request(struct as_data *ad, struct as_rq *arq,
-				struct as_rq *alias)
-{
-	struct request  *req = arq->request;
-	struct list_head *insert = alias->request->queuelist.prev;
-
-	/*
-	 * Transfer list of aliases
-	 */
-	while (!list_empty(&req->queuelist)) {
-		struct request *__rq = list_entry_rq(req->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_move_tail(&__rq->queuelist, &alias->request->queuelist);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-	}
-
-	/*
-	 * Another request with the same start sector on the rbtree.
-	 * Link this request to that sector. They are untangled in
-	 * as_move_to_dispatch
-	 */
-	list_add(&arq->request->queuelist, insert);
-
-	/*
-	 * Don't want to have to handle merges.
-	 */
-	as_del_arq_hash(arq);
-	arq->request->flags |= REQ_NOMERGE;
-}
-
-/*
  * add arq to rbtree and fifo
  */
 static void as_add_request(request_queue_t *q, struct request *rq)
 {
 	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(rq);
-	struct as_rq *alias;
 	int data_dir;
 
 	arq->state = AS_RQ_NEW;
@@ -1387,33 +1346,17 @@ static void as_add_request(request_queue_t *q, struct request *rq)
 		atomic_inc(&arq->io_context->aic->nr_queued);
 	}
 
-	alias = as_add_arq_rb(ad, arq);
-	if (!alias) {
-		/*
-		 * set expire time (only used for reads) and add to fifo list
-		 */
-		arq->expires = jiffies + ad->fifo_expire[data_dir];
-		list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
+	as_add_arq_rb(ad, arq);
+	if (rq_mergeable(arq->request))
+		as_add_arq_hash(ad, arq);
 
-		if (rq_mergeable(arq->request))
-			as_add_arq_hash(ad, arq);
-		as_update_arq(ad, arq); /* keep state machine up to date */
-
-	} else {
-		as_add_aliased_request(ad, arq, alias);
-
-		/*
-		 * have we been anticipating this request?
-		 * or does it come from the same process as the one we are
-		 * anticipating for?
-		 */
-		if (ad->antic_status == ANTIC_WAIT_REQ
-				|| ad->antic_status == ANTIC_WAIT_NEXT) {
-			if (as_can_break_anticipation(ad, arq))
-				as_antic_stop(ad);
-		}
-	}
+	/*
+	 * set expire time (only used for reads) and add to fifo list
+	 */
+	arq->expires = jiffies + ad->fifo_expire[data_dir];
+	list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
 
+	as_update_arq(ad, arq); /* keep state machine up to date */
 	arq->state = AS_RQ_QUEUED;
 }
 
@@ -1536,23 +1479,8 @@ static void as_merged_request(request_queue_t *q, struct request *req)
 	 * if the merge was a front merge, we need to reposition request
 	 */
 	if (rq_rb_key(req) != arq->rb_key) {
-		struct as_rq *alias, *next_arq = NULL;
-
-		if (ad->next_arq[arq->is_sync] == arq)
-			next_arq = as_find_next_arq(ad, arq);
-
-		/*
-		 * Note! We should really be moving any old aliased requests
-		 * off this request and try to insert them into the rbtree. We
-		 * currently don't bother. Ditto the next function.
-		 */
 		as_del_arq_rb(ad, arq);
-		if ((alias = as_add_arq_rb(ad, arq))) {
-			list_del_init(&arq->fifo);
-			as_add_aliased_request(ad, arq, alias);
-			if (next_arq)
-				ad->next_arq[arq->is_sync] = next_arq;
-		}
+		as_add_arq_rb(ad, arq);
 		/*
 		 * Note! At this stage of this and the next function, our next
 		 * request may not be optimal - eg the request may have "grown"
@@ -1579,18 +1507,8 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
 	as_add_arq_hash(ad, arq);
 
 	if (rq_rb_key(req) != arq->rb_key) {
-		struct as_rq *alias, *next_arq = NULL;
-
-		if (ad->next_arq[arq->is_sync] == arq)
-			next_arq = as_find_next_arq(ad, arq);
-
 		as_del_arq_rb(ad, arq);
-		if ((alias = as_add_arq_rb(ad, arq))) {
-			list_del_init(&arq->fifo);
-			as_add_aliased_request(ad, arq, alias);
-			if (next_arq)
-				ad->next_arq[arq->is_sync] = next_arq;
-		}
+		as_add_arq_rb(ad, arq);
 	}
 
 	/*
@@ -1610,18 +1528,6 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
 	}
 
 	/*
-	 * Transfer list of aliases
-	 */
-	while (!list_empty(&next->queuelist)) {
-		struct request *__rq = list_entry_rq(next->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_move_tail(&__rq->queuelist, &req->queuelist);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-	}
-
-	/*
 	 * kill knowledge of next, this one is a goner
 	 */
 	as_remove_queued_request(q, next);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee0bb41694b0..74fae2daf87e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -25,15 +25,15 @@
 /*
  * tunables
  */
-static int cfq_quantum = 4;		/* max queue in one round of service */
-static int cfq_queued = 8;		/* minimum rq allocate limit per-queue*/
-static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
-static int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
-static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
+static const int cfq_quantum = 4;		/* max queue in one round of service */
+static const int cfq_queued = 8;		/* minimum rq allocate limit per-queue*/
+static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
+static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
 
-static int cfq_slice_sync = HZ / 10;
+static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
-static int cfq_slice_async_rq = 2;
+static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 100;
 
 #define CFQ_IDLE_GRACE		(HZ / 10)
@@ -45,7 +45,7 @@ static int cfq_slice_idle = HZ / 100;
 /*
  * disable queueing at the driver/hardware level
  */
-static int cfq_max_depth = 2;
+static const int cfq_max_depth = 2;
 
 /*
  * for the hash of cfqq inside the cfqd
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 9cbec09e8415..27e494b1bf97 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -19,10 +19,10 @@
 /*
  * See Documentation/block/deadline-iosched.txt
  */
-static int read_expire = HZ / 2;  /* max time before a read is submitted. */
-static int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
-static int writes_starved = 2;    /* max times reads can starve a write */
-static int fifo_batch = 16;       /* # of sequential requests treated as one
+static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2;    /* max times reads can starve a write */
+static const int fifo_batch = 16;       /* # of sequential requests treated as one
 				     by the above parameters. For throughput. */
 
 static const int deadline_hash_shift = 5;
diff --git a/block/elevator.c b/block/elevator.c
index 6c3fc8a10bf2..39dcccc82ada 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -304,15 +304,7 @@ void elv_requeue_request(request_queue_t *q, struct request *rq)
 
 	rq->flags &= ~REQ_STARTED;
 
-	/*
-	 * if this is the flush, requeue the original instead and drop the flush
-	 */
-	if (rq->flags & REQ_BAR_FLUSH) {
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		rq = rq->end_io_data;
-	}
-
-	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
+	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE, 0);
 }
 
 static void elv_drain_elevator(request_queue_t *q)
@@ -332,8 +324,19 @@ static void elv_drain_elevator(request_queue_t *q)
 void __elv_add_request(request_queue_t *q, struct request *rq, int where,
 		       int plug)
 {
+	struct list_head *pos;
+	unsigned ordseq;
+
+	if (q->ordcolor)
+		rq->flags |= REQ_ORDERED_COLOR;
+
 	if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
 		/*
+		 * toggle ordered color
+		 */
+		q->ordcolor ^= 1;
+
+		/*
 		 * barriers implicitly indicate back insertion
 		 */
 		if (where == ELEVATOR_INSERT_SORT)
@@ -393,6 +396,30 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
+	case ELEVATOR_INSERT_REQUEUE:
+		/*
+		 * If ordered flush isn't in progress, we do front
+		 * insertion; otherwise, requests should be requeued
+		 * in ordseq order.
+		 */
+		rq->flags |= REQ_SOFTBARRIER;
+
+		if (q->ordseq == 0) {
+			list_add(&rq->queuelist, &q->queue_head);
+			break;
+		}
+
+		ordseq = blk_ordered_req_seq(rq);
+
+		list_for_each(pos, &q->queue_head) {
+			struct request *pos_rq = list_entry_rq(pos);
+			if (ordseq <= blk_ordered_req_seq(pos_rq))
+				break;
+		}
+
+		list_add_tail(&rq->queuelist, pos);
+		break;
+
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __FUNCTION__, where);
@@ -422,25 +449,16 @@ static inline struct request *__elv_next_request(request_queue_t *q)
 {
 	struct request *rq;
 
-	if (unlikely(list_empty(&q->queue_head) &&
-		     !q->elevator->ops->elevator_dispatch_fn(q, 0)))
-		return NULL;
-
-	rq = list_entry_rq(q->queue_head.next);
-
-	/*
-	 * if this is a barrier write and the device has to issue a
-	 * flush sequence to support it, check how far we are
-	 */
-	if (blk_fs_request(rq) && blk_barrier_rq(rq)) {
-		BUG_ON(q->ordered == QUEUE_ORDERED_NONE);
+	while (1) {
+		while (!list_empty(&q->queue_head)) {
+			rq = list_entry_rq(q->queue_head.next);
+			if (blk_do_ordered(q, &rq))
+				return rq;
+		}
 
-		if (q->ordered == QUEUE_ORDERED_FLUSH &&
-		    !blk_barrier_preflush(rq))
-			rq = blk_start_pre_flush(q, rq);
+		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+			return NULL;
 	}
-
-	return rq;
 }
 
 struct request *elv_next_request(request_queue_t *q)
@@ -498,7 +516,7 @@ struct request *elv_next_request(request_queue_t *q)
 			blkdev_dequeue_request(rq);
 			rq->flags |= REQ_QUIET;
 			end_that_request_chunk(rq, 0, nr_bytes);
-			end_that_request_last(rq);
+			end_that_request_last(rq, 0);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
 								ret);
@@ -593,7 +611,21 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 	 * request is released from the driver, io must be done
 	 */
 	if (blk_account_rq(rq)) {
+		struct request *first_rq = list_entry_rq(q->queue_head.next);
+
 		q->in_flight--;
+
+		/*
+		 * Check if the queue is waiting for fs requests to be
+		 * drained for the flush sequence.
+		 */
+		if (q->ordseq && q->in_flight == 0 &&
+		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
+		    blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
+			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
+			q->request_fn(q);
+		}
+
 		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index d4beb9a89ee0..91d3b4828c49 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -36,6 +36,8 @@
 static void blk_unplug_work(void *data);
 static void blk_unplug_timeout(unsigned long data);
 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
+static void init_request_from_bio(struct request *req, struct bio *bio);
+static int __make_request(request_queue_t *q, struct bio *bio);
 
 /*
  * For the allocated request tables
@@ -288,8 +290,8 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
 
 /**
  * blk_queue_ordered - does this queue support ordered writes
- * @q:     the request queue
- * @flag:  see below
+ * @q:        the request queue
+ * @ordered:  one of QUEUE_ORDERED_*
  *
  * Description:
  *   For journalled file systems, doing ordered writes on a commit
@@ -298,28 +300,30 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
  *   feature should call this function and indicate so.
  *
  **/
-void blk_queue_ordered(request_queue_t *q, int flag)
-{
-	switch (flag) {
-		case QUEUE_ORDERED_NONE:
-			if (q->flush_rq)
-				kmem_cache_free(request_cachep, q->flush_rq);
-			q->flush_rq = NULL;
-			q->ordered = flag;
-			break;
-		case QUEUE_ORDERED_TAG:
-			q->ordered = flag;
-			break;
-		case QUEUE_ORDERED_FLUSH:
-			q->ordered = flag;
-			if (!q->flush_rq)
-				q->flush_rq = kmem_cache_alloc(request_cachep,
-								GFP_KERNEL);
-			break;
-		default:
-			printk("blk_queue_ordered: bad value %d\n", flag);
-			break;
+int blk_queue_ordered(request_queue_t *q, unsigned ordered,
+		      prepare_flush_fn *prepare_flush_fn)
+{
+	if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
+	    prepare_flush_fn == NULL) {
+		printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
+		return -EINVAL;
+	}
+
+	if (ordered != QUEUE_ORDERED_NONE &&
+	    ordered != QUEUE_ORDERED_DRAIN &&
+	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
+	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
+	    ordered != QUEUE_ORDERED_TAG &&
+	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
+	    ordered != QUEUE_ORDERED_TAG_FUA) {
+		printk(KERN_ERR "blk_queue_ordered: bad value %u\n", ordered);
+		return -EINVAL;
 	}
+
+	q->next_ordered = ordered;
+	q->prepare_flush_fn = prepare_flush_fn;
+
+	return 0;
 }
 
 EXPORT_SYMBOL(blk_queue_ordered);
@@ -344,167 +348,265 @@ EXPORT_SYMBOL(blk_queue_issue_flush_fn);
 /*
  * Cache flushing for ordered writes handling
  */
-static void blk_pre_flush_end_io(struct request *flush_rq)
+inline unsigned blk_ordered_cur_seq(request_queue_t *q)
 {
-	struct request *rq = flush_rq->end_io_data;
-	request_queue_t *q = rq->q;
-
-	elv_completed_request(q, flush_rq);
-
-	rq->flags |= REQ_BAR_PREFLUSH;
-
-	if (!flush_rq->errors)
-		elv_requeue_request(q, rq);
-	else {
-		q->end_flush_fn(q, flush_rq);
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		q->request_fn(q);
-	}
+	if (!q->ordseq)
+		return 0;
+	return 1 << ffz(q->ordseq);
 }
 
-static void blk_post_flush_end_io(struct request *flush_rq)
+unsigned blk_ordered_req_seq(struct request *rq)
 {
-	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
 
-	elv_completed_request(q, flush_rq);
+	BUG_ON(q->ordseq == 0);
 
-	rq->flags |= REQ_BAR_POSTFLUSH;
+	if (rq == &q->pre_flush_rq)
+		return QUEUE_ORDSEQ_PREFLUSH;
+	if (rq == &q->bar_rq)
+		return QUEUE_ORDSEQ_BAR;
+	if (rq == &q->post_flush_rq)
+		return QUEUE_ORDSEQ_POSTFLUSH;
 
-	q->end_flush_fn(q, flush_rq);
-	clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-	q->request_fn(q);
+	if ((rq->flags & REQ_ORDERED_COLOR) ==
+	    (q->orig_bar_rq->flags & REQ_ORDERED_COLOR))
+		return QUEUE_ORDSEQ_DRAIN;
+	else
+		return QUEUE_ORDSEQ_DONE;
 }
 
-struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
+void blk_ordered_complete_seq(request_queue_t *q, unsigned seq, int error)
 {
-	struct request *flush_rq = q->flush_rq;
-
-	BUG_ON(!blk_barrier_rq(rq));
+	struct request *rq;
+	int uptodate;
 
-	if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags))
-		return NULL;
+	if (error && !q->orderr)
+		q->orderr = error;
 
-	rq_init(q, flush_rq);
-	flush_rq->elevator_private = NULL;
-	flush_rq->flags = REQ_BAR_FLUSH;
-	flush_rq->rq_disk = rq->rq_disk;
-	flush_rq->rl = NULL;
+	BUG_ON(q->ordseq & seq);
+	q->ordseq |= seq;
 
-	/*
-	 * prepare_flush returns 0 if no flush is needed, just mark both
-	 * pre and post flush as done in that case
-	 */
-	if (!q->prepare_flush_fn(q, flush_rq)) {
-		rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		return rq;
-	}
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
+		return;
 
 	/*
-	 * some drivers dequeue requests right away, some only after io
-	 * completion. make sure the request is dequeued.
+	 * Okay, sequence complete.
 	 */
-	if (!list_empty(&rq->queuelist))
-		blkdev_dequeue_request(rq);
+	rq = q->orig_bar_rq;
+	uptodate = q->orderr ? q->orderr : 1;
 
-	flush_rq->end_io_data = rq;
-	flush_rq->end_io = blk_pre_flush_end_io;
+	q->ordseq = 0;
 
-	__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-	return flush_rq;
+	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
+	end_that_request_last(rq, uptodate);
 }
 
-static void blk_start_post_flush(request_queue_t *q, struct request *rq)
+static void pre_flush_end_io(struct request *rq, int error)
 {
-	struct request *flush_rq = q->flush_rq;
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+}
 
-	BUG_ON(!blk_barrier_rq(rq));
+static void bar_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+}
 
-	rq_init(q, flush_rq);
-	flush_rq->elevator_private = NULL;
-	flush_rq->flags = REQ_BAR_FLUSH;
-	flush_rq->rq_disk = rq->rq_disk;
-	flush_rq->rl = NULL;
+static void post_flush_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+}
 
-	if (q->prepare_flush_fn(q, flush_rq)) {
-		flush_rq->end_io_data = rq;
-		flush_rq->end_io = blk_post_flush_end_io;
+static void queue_flush(request_queue_t *q, unsigned which)
+{
+	struct request *rq;
+	rq_end_io_fn *end_io;
 
-		__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-		q->request_fn(q);
+	if (which == QUEUE_ORDERED_PREFLUSH) {
+		rq = &q->pre_flush_rq;
+		end_io = pre_flush_end_io;
+	} else {
+		rq = &q->post_flush_rq;
+		end_io = post_flush_end_io;
 	}
+
+	rq_init(q, rq);
+	rq->flags = REQ_HARDBARRIER;
+	rq->elevator_private = NULL;
+	rq->rq_disk = q->bar_rq.rq_disk;
+	rq->rl = NULL;
+	rq->end_io = end_io;
+	q->prepare_flush_fn(q, rq);
+
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
 }
 
-static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
-					int sectors)
+static inline struct request *start_ordered(request_queue_t *q,
+					    struct request *rq)
 {
-	if (sectors > rq->nr_sectors)
-		sectors = rq->nr_sectors;
+	q->bi_size = 0;
+	q->orderr = 0;
+	q->ordered = q->next_ordered;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+
+	/*
+	 * Prep proxy barrier request.
+	 */
+	blkdev_dequeue_request(rq);
+	q->orig_bar_rq = rq;
+	rq = &q->bar_rq;
+	rq_init(q, rq);
+	rq->flags = bio_data_dir(q->orig_bar_rq->bio);
+	rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
+	rq->elevator_private = NULL;
+	rq->rl = NULL;
+	init_request_from_bio(rq, q->orig_bar_rq->bio);
+	rq->end_io = bar_end_io;
+
+	/*
+	 * Queue ordered sequence.  As we stack them at the head, we
+	 * need to queue in reverse order.  Note that we rely on the
+	 * fact that no fs request uses ELEVATOR_INSERT_FRONT, so no
+	 * fs request gets in between the ordered sequence.
+	 */
+	if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
+		queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+	else
+		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
+
+	if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
+		queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+		rq = &q->pre_flush_rq;
+	} else
+		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
 
-	rq->nr_sectors -= sectors;
-	return rq->nr_sectors;
+	if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+	else
+		rq = NULL;
+
+	return rq;
 }
 
-static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
-				     int sectors, int queue_locked)
+int blk_do_ordered(request_queue_t *q, struct request **rqp)
 {
-	if (q->ordered != QUEUE_ORDERED_FLUSH)
-		return 0;
-	if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
-		return 0;
-	if (blk_barrier_postflush(rq))
-		return 0;
+	struct request *rq = *rqp, *allowed_rq;
+	int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
 
-	if (!blk_check_end_barrier(q, rq, sectors)) {
-		unsigned long flags = 0;
+	if (!q->ordseq) {
+		if (!is_barrier)
+			return 1;
 
-		if (!queue_locked)
-			spin_lock_irqsave(q->queue_lock, flags);
+		if (q->next_ordered != QUEUE_ORDERED_NONE) {
+			*rqp = start_ordered(q, rq);
+			return 1;
+		} else {
+			/*
+			 * This can happen when the que