From 6e6938b6d3130305a5960c86b1a9b21e58cf6144 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 6 Jun 2010 10:38:15 -0600 Subject: writeback: introduce .tagged_writepages for the WB_SYNC_NONE sync stage sync(2) is performed in two stages: the WB_SYNC_NONE sync and the WB_SYNC_ALL sync. Identify the first stage with .tagged_writepages and do livelock prevention for it, too. Jan's commit f446daaea9 ("mm: implement writeback livelock avoidance using page tagging") is a partial fix in that it only fixed the WB_SYNC_ALL phase livelock. Although ext4 is tested to no longer livelock with commit f446daaea9, that may be due to some "redirty_tail() after pages_skipped" effect, which is by no means a guarantee for _all_ the file systems. Note that writeback_inodes_sb() is not called only by sync(); its other callers are treated the same because they also need livelock prevention. Impact: It changes the order in which pages/inodes are synced to disk. Now in the WB_SYNC_NONE stage, it won't proceed to write the next inode until finished with the current inode. Acked-by: Jan Kara CC: Dave Chinner Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 17e7ccc322a5..3f6542ca6198 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -47,6 +47,7 @@ struct writeback_control { unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ + unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ -- cgit v1.2.3 From cb9bd1159c5fe8995e151fa7df10fa19f8c119cc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 22:50:57 -0600 Subject: writeback: introduce writeback_control.inodes_written The flusher works on dirty inodes in batches, and may quit prematurely if the batch of inodes happens to be metadata-only dirtied: in this case wbc->nr_to_write won't be decreased at all, which stands for "no pages written" but is also mis-interpreted as "no progress". So introduce writeback_control.inodes_written to count the inodes that get cleaned from the VFS POV. A non-zero value means there is some progress on writeback, in which case more writeback can be tried. Acked-by: Jan Kara Acked-by: Mel Gorman Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f6542ca6198..7df9026f7129 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -34,6 +34,7 @@ struct writeback_control { long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ + long inodes_written; /* # of inodes written (at least) */ /* * For a_ops->writepages(): is start or end are non-zero then this is -- cgit v1.2.3 From f758eeabeb96f878c860e8f110f94ec8820822a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2011 18:19:44 -0600 Subject: writeback: split inode_wb_list_lock into bdi_writeback.list_lock Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, as it's currently the most contended lock in the system for metadata heavy workloads. 
It won't help for single-filesystem workloads for which we'll need the I/O-less balance_dirty_pages, but at least we can dedicate a cpu to spinning on each bdi now for larger systems. Based on earlier patches from Nick Piggin and Dave Chinner. It reduces lock contentions to 1/4 in this test case: 10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram lock_stat version 0.3 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- vanilla 2.6.39-rc3: inode_wb_list_lock: 42590 44433 0.12 147.74 144127.35 252274 886792 0.08 121.34 917211.23 ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 34 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 12893 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 10702 [] writeback_single_inode+0x16d/0x20a ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 19 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 5550 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 8511 [] writeback_sb_inodes+0x10f/0x157 2.6.39-rc3 + patch: &(&wb->list_lock)->rlock: 11383 11657 0.14 151.69 40429.51 90825 527918 0.11 145.90 556843.37 ------------------------ &(&wb->list_lock)->rlock 10 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 1493 [] writeback_inodes_wb+0x3d/0x150 &(&wb->list_lock)->rlock 3652 [] writeback_sb_inodes+0x123/0x16f &(&wb->list_lock)->rlock 1412 [] writeback_single_inode+0x17f/0x223 ------------------------ &(&wb->list_lock)->rlock 3 [] bdi_lock_two+0x46/0x4b &(&wb->list_lock)->rlock 6 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 2061 [] __mark_inode_dirty+0x173/0x1cf &(&wb->list_lock)->rlock 2629 [] writeback_sb_inodes+0x123/0x16f hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment Signed-off-by: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Wu Fengguang --- include/linux/backing-dev.h | 2 ++ include/linux/writeback.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 96f4094b706d..47feb2c4706a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -57,6 +57,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + spinlock_t list_lock; /* protects the b_* lists */ }; struct backing_dev_info { @@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 7df9026f7129..c2d957fb38d3 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ 
-9,8 +9,6 @@ struct backing_dev_info; -extern spinlock_t inode_wb_list_lock; - /* * fs/fs-writeback.c */ -- cgit v1.2.3 From e185dda89d69cde142b48059413a03561f41f78a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 23 Apr 2011 11:26:07 -0600 Subject: writeback: avoid extra sync work at enqueue time This removes writeback_control.wb_start and does more straightforward sync livelock prevention by setting .older_than_this to prevent extra inodes from being enqueued in the first place. Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index c2d957fb38d3..d8e96a480850 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -26,9 +26,6 @@ struct writeback_control { enum writeback_sync_modes sync_mode; unsigned long *older_than_this; /* If !NULL, only write back inodes older than this */ - unsigned long wb_start; /* Time writeback_inodes_wb was - called. This is needed to avoid - extra jobs and livelock */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ -- cgit v1.2.3 From b7a2441f9966fe3e1be960a876ab52e6029ea005 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 22:19:51 -0600 Subject: writeback: remove writeback_control.more_io When wbc.more_io was first introduced, it indicates whether there are at least one superblock whose s_more_io contains more IO work. Now with the per-bdi writeback, it can be replaced with a simple b_more_io test. Acked-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 1 - include/trace/events/ext4.h | 6 ++---- include/trace/events/writeback.h | 5 +---- 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d8e96a480850..8797b20dd22b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -46,7 +46,6 @@ struct writeback_control { unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ - unsigned more_io:1; /* more io to be dispatched */ }; /* diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index e09592d2f916..b225d0d8c87f 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -404,7 +404,6 @@ TRACE_EVENT(ext4_da_writepages_result, __field( int, pages_written ) __field( long, pages_skipped ) __field( int, sync_mode ) - __field( char, more_io ) __field( pgoff_t, writeback_index ) ), @@ -415,16 +414,15 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; - __entry->more_io = wbc->more_io; __entry->writeback_index = inode->i_mapping->writeback_index; ), TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " - " more_io %d sync_mode %d writeback_index %lu", + "sync_mode %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, - __entry->more_io, __entry->sync_mode, + __entry->sync_mode, (unsigned long) __entry->writeback_index) ); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 
4e249b927eaa..b2cfac5f3313 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -101,7 +101,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) - __field(int, more_io) __field(unsigned long, older_than_this) __field(long, range_start) __field(long, range_end) @@ -116,7 +115,6 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; - __entry->more_io = wbc->more_io; __entry->older_than_this = wbc->older_than_this ? *wbc->older_than_this : 0; __entry->range_start = (long)wbc->range_start; @@ -124,7 +122,7 @@ DECLARE_EVENT_CLASS(wbc_class, ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " - "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " + "bgrd=%d reclm=%d cyclic=%d older=0x%lx " "start=0x%lx end=0x%lx", __entry->name, __entry->nr_to_write, @@ -134,7 +132,6 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, - __entry->more_io, __entry->older_than_this, __entry->range_start, __entry->range_end) -- cgit v1.2.3 From 846d5a091b0506b75489577cde27f39b37a192a4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 5 May 2011 21:10:38 -0600 Subject: writeback: remove .nonblocking and .encountered_congestion Remove two unused struct writeback_control fields: .encountered_congestion (completely unused) .nonblocking (never set, checked/showed in XFS,NFS/btrfs) The .for_background check in nfs_write_inode() is also removed btw, as .for_background implies WB_SYNC_NONE. Reviewed-by: Jan Kara Proposed-by: Christoph Hellwig Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 2 -- include/trace/events/btrfs.h | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8797b20dd22b..2f1b512bd6e0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -39,8 +39,6 @@ struct writeback_control { loff_t range_start; loff_t range_end; - unsigned nonblocking:1; /* Don't get stuck on request queues */ - unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 4114129f0794..b31702ac15be 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -284,7 +284,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) - __field( char, nonblocking ) __field( char, for_kupdate ) __field( char, for_reclaim ) __field( char, range_cyclic ) @@ -299,7 +298,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; - __entry->nonblocking = wbc->nonblocking; __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; @@ -310,13 +308,13 @@ DECLARE_EVENT_CLASS(btrfs__writepage, TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " - "range_end = %llu, nonblocking = %d, for_kupdate = %d, " + "range_end = %llu, for_kupdate = %d, " "for_reclaim = %d, 
range_cyclic = %d, writeback_index = %lu", show_root_type(__entry->root_objectid), (unsigned long)__entry->ino, __entry->index, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, - __entry->nonblocking, __entry->for_kupdate, + __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic, (unsigned long)__entry->writeback_index) ); -- cgit v1.2.3 From 251d6a471c831e22880b3c146bb4556ddfb1dc82 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 1 Dec 2010 17:33:37 -0600 Subject: writeback: trace event writeback_single_inode It is valuable to know how the dirty inodes are iterated and their IO size. "writeback_single_inode: bdi 8:0: ino=134246746 state=I_DIRTY_SYNC|I_SYNC age=414 index=0 to_write=1024 wrote=0" - "state" reflects inode->i_state at the end of writeback_single_inode() - "index" reflects mapping->writeback_index after the ->writepages() call - "to_write" is the wbc->nr_to_write at entrance of writeback_single_inode() - "wrote" is the number of pages actually written v2: add trace event writeback_single_inode_requeue as proposed by Dave. CC: Dave Chinner Signed-off-by: Wu Fengguang --- include/trace/events/writeback.h | 70 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index b2cfac5f3313..898277bc89b4 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -8,6 +8,19 @@ #include #include +#define show_inode_state(state) \ + __print_flags(state, "|", \ + {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ + {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ + {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ + {I_NEW, "I_NEW"}, \ + {I_WILL_FREE, "I_WILL_FREE"}, \ + {I_FREEING, "I_FREEING"}, \ + {I_CLEAR, "I_CLEAR"}, \ + {I_SYNC, "I_SYNC"}, \ + {I_REFERENCED, "I_REFERENCED"} \ + ) + struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_work_class, @@ -184,6 +197,63 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, TP_ARGS(usec_timeout, usec_delayed) ); +DECLARE_EVENT_CLASS(writeback_single_inode_template, + + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + unsigned long nr_to_write + ), + + TP_ARGS(inode, wbc, nr_to_write), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, ino) + __field(unsigned long, state) + __field(unsigned long, age) + __field(unsigned long, writeback_index) + __field(long, nr_to_write) + __field(unsigned long, wrote) + ), + + TP_fast_assign( + strncpy(__entry->name, + dev_name(inode->i_mapping->backing_dev_info->dev), 32); + __entry->ino = inode->i_ino; + __entry->state = inode->i_state; + __entry->age = (jiffies - inode->dirtied_when) * + 1000 / HZ; + __entry->writeback_index = inode->i_mapping->writeback_index; + __entry->nr_to_write = nr_to_write; + __entry->wrote = nr_to_write - wbc->nr_to_write; + ), + + TP_printk("bdi %s: ino=%lu state=%s age=%lu " + "index=%lu to_write=%ld wrote=%lu", + __entry->name, + __entry->ino, + show_inode_state(__entry->state), + __entry->age, + __entry->writeback_index, + __entry->nr_to_write, + __entry->wrote + ) +); + +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue, + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + unsigned long nr_to_write), + TP_ARGS(inode, wbc, nr_to_write) +); + +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + unsigned long nr_to_write), 
+ TP_ARGS(inode, wbc, nr_to_write) ); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ -- cgit v1.2.3 From e84d0a4f8e39a73003a6ec9a11b07702745f4c1f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 23 Apr 2011 12:27:27 -0600 Subject: writeback: trace event writeback_queue_io Note that it adds a little overhead to account for the inodes moved/enqueued from b_dirty to b_io. The "moved" accounting may later be used to limit the number of inodes that can be moved in one shot, in order to keep spinlock hold time under control. Signed-off-by: Wu Fengguang --- include/trace/events/writeback.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 898277bc89b4..205d14919ef2 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -162,6 +162,31 @@ DEFINE_WBC_EVENT(wbc_balance_dirty_written); DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +TRACE_EVENT(writeback_queue_io, + TP_PROTO(struct bdi_writeback *wb, + unsigned long *older_than_this, + int moved), + TP_ARGS(wb, older_than_this, moved), + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, older) + __field(long, age) + __field(int, moved) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + __entry->older = older_than_this ? *older_than_this : 0; + __entry->age = older_than_this ? + (jiffies - *older_than_this) * 1000 / HZ : -1; + __entry->moved = moved; + ), + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d", + __entry->name, + __entry->older, /* older_than_this in jiffies */ + __entry->age, /* older_than_this in relative milliseconds */ + __entry->moved) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), -- cgit v1.2.3 From d46db3d58233be4be980eb1e42eebe7808bcabab Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 4 May 2011 19:54:37 -0600 Subject: writeback: make writeback_control.nr_to_write straight Pass struct wb_writeback_work all the way down to writeback_sb_inodes(), and initialize the struct writeback_control there. struct writeback_control is basically designed to control writeback of a single file, but we keep abusing it for writing multiple files in writeback_sb_inodes() and its callers. This immediately cleans things up: e.g. suddenly wbc.nr_to_write vs work->nr_pages starts to make sense, and instead of saving and restoring pages_skipped in writeback_sb_inodes it can always start with a clean zero value. It also makes a neat IO pattern change: large dirty files are now written in the full 4MB writeback chunk size, rather than in whatever quota remained in wbc->nr_to_write.
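To make the intended control flow concrete, here is a small stand-alone model of the new scheme (illustrative only, not the kernel code: struct wb_work, struct wbc and writeback_one_inode() are simplified stand-ins for wb_writeback_work, writeback_control and the per-inode writeback path). The point is that a fresh wbc is set up for each inode, while only work->nr_pages carries state across inodes:

#include <stdio.h>

#define MAX_WRITEBACK_PAGES	1024	/* 4MB worth of 4k pages */

struct wb_work {		/* stand-in for struct wb_writeback_work */
	long nr_pages;		/* pages still to be written for the whole job */
};

struct wbc {			/* stand-in for struct writeback_control */
	long nr_to_write;	/* per-inode budget, decremented as pages go out */
	long pages_skipped;
};

/* pretend the inode has 1500 dirty pages and write as many as the budget allows */
static void writeback_one_inode(struct wbc *wbc)
{
	long dirty = 1500;

	wbc->nr_to_write -= (dirty < wbc->nr_to_write) ? dirty : wbc->nr_to_write;
}

int main(void)
{
	struct wb_work work = { .nr_pages = 4096 };

	while (work.nr_pages > 0) {
		/* a fresh wbc per inode: pages_skipped always starts from zero */
		struct wbc wbc = {
			.nr_to_write	= MAX_WRITEBACK_PAGES,
			.pages_skipped	= 0,
		};

		writeback_one_inode(&wbc);

		/* only the per-job counter survives from one inode to the next */
		work.nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
		printf("wrote %ld pages, %ld left in the job\n",
		       MAX_WRITEBACK_PAGES - wbc.nr_to_write, work.nr_pages);
	}
	return 0;
}

The real writeback_sb_inodes() of course does much more (requeueing, pages_skipped handling, trace points), but the accounting relation modelled here is the part this patch straightens out.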
Acked-by: Jan Kara Proposed-by: Christoph Hellwig Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 6 +----- include/trace/events/writeback.h | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2f1b512bd6e0..df1b7f18f100 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -24,12 +24,9 @@ enum writeback_sync_modes { */ struct writeback_control { enum writeback_sync_modes sync_mode; - unsigned long *older_than_this; /* If !NULL, only write back inodes - older than this */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ - long inodes_written; /* # of inodes written (at least) */ /* * For a_ops->writepages(): is start or end are non-zero then this is @@ -56,8 +53,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); int writeback_inodes_sb_if_idle(struct super_block *); int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); void sync_inodes_sb(struct super_block *); -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void wakeup_flusher_threads(long nr_pages); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 205d14919ef2..3e7662a0cfa3 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -62,6 +62,9 @@ DEFINE_EVENT(writeback_work_class, name, \ DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); +DEFINE_WRITEBACK_WORK_EVENT(writeback_start); +DEFINE_WRITEBACK_WORK_EVENT(writeback_written); +DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), @@ -101,6 +104,30 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); DEFINE_WRITEBACK_EVENT(writeback_thread_stop); +DEFINE_WRITEBACK_EVENT(balance_dirty_start); +DEFINE_WRITEBACK_EVENT(balance_dirty_wait); + +TRACE_EVENT(balance_dirty_written, + + TP_PROTO(struct backing_dev_info *bdi, int written), + + TP_ARGS(bdi, written), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(int, written) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->written = written; + ), + + TP_printk("bdi %s written %d", + __entry->name, + __entry->written + ) +); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), @@ -114,7 +141,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) - __field(unsigned long, older_than_this) __field(long, range_start) __field(long, range_end) ), @@ -128,14 +154,12 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; - __entry->older_than_this = wbc->older_than_this ? 
- *wbc->older_than_this : 0; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " - "bgrd=%d reclm=%d cyclic=%d older=0x%lx " + "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx", __entry->name, __entry->nr_to_write, @@ -145,7 +169,6 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, - __entry->older_than_this, __entry->range_start, __entry->range_end) ) @@ -154,12 +177,6 @@ DECLARE_EVENT_CLASS(wbc_class, DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) -DEFINE_WBC_EVENT(wbc_writeback_start); -DEFINE_WBC_EVENT(wbc_writeback_written); -DEFINE_WBC_EVENT(wbc_writeback_wait); -DEFINE_WBC_EVENT(wbc_balance_dirty_start); -DEFINE_WBC_EVENT(wbc_balance_dirty_written); -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, -- cgit v1.2.3 From f7d2b1ecd0c714adefc7d3a942ef87beb828a763 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Dec 2010 22:44:24 -0600 Subject: writeback: account per-bdi accumulated written pages Introduce the BDI_WRITTEN counter. It will be used for estimating the bdi's write bandwidth. Peter Zijlstra : Move BDI_WRITTEN accounting into __bdi_writeout_inc(). This will cover and fix fuse, which only calls bdi_writeout_inc(). CC: Michael Rubin Reviewed-by: KOSAKI Motohiro Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 47feb2c4706a..469d56443c63 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -40,6 +40,7 @@ typedef int (congested_fn)(void *, int); enum bdi_stat_item { BDI_RECLAIMABLE, BDI_WRITEBACK, + BDI_WRITTEN, NR_BDI_STAT_ITEMS }; -- cgit v1.2.3 From e98be2d599207c6b31e9bb340d52a231b2f3662d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 29 Aug 2010 11:22:30 -0600 Subject: writeback: bdi write bandwidth estimation The estimation value will start from 100MB/s and adapt to the real bandwidth in seconds. It tries to update the bandwidth only when disk is fully utilized. Any inactive period of more than one second will be skipped. The estimated bandwidth will be reflecting how fast the device can writeout when _fully utilized_, and won't drop to 0 when it goes idle. The value will remain constant at disk idle time. At busy write time, if not considering fluctuations, it will also remain high unless be knocked down by possible concurrent reads that compete for the disk time and bandwidth with async writes. The estimation is not done purely in the flusher because there is no guarantee for write_cache_pages() to return timely to update bandwidth. The bdi->avg_write_bandwidth smoothing is very effective for filtering out sudden spikes, however may be a little biased in long term. The overheads are low because the bdi bandwidth update only occurs at 200ms intervals. The 200ms update interval is suitable, because it's not possible to get the real bandwidth for the instance at all, due to large fluctuations. The NFS commits can be as large as seconds worth of data. One XFS completion may be as large as half second worth of data if we are going to increase the write chunk to half second worth of data. In ext4, fluctuations with time period of around 5 seconds is observed. 
And there is another pattern of irregular periods of up to 20 seconds on SSD tests. That's why we are not only doing the estimation at 200ms intervals, but also averaging them over a period of 3 seconds and then go further to do another level of smoothing in avg_write_bandwidth. CC: Li Shaohua CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- include/linux/backing-dev.h | 5 +++++ include/linux/writeback.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 469d56443c63..a008982e7c08 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -73,6 +73,11 @@ struct backing_dev_info { struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + unsigned long bw_time_stamp; /* last time write bw is updated */ + unsigned long written_stamp; /* pages written at bw_time_stamp */ + unsigned long write_bandwidth; /* the estimated write bandwidth */ + unsigned long avg_write_bandwidth; /* further smoothed write bw */ + struct prop_local_percpu completions; int dirty_exceeded; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index df1b7f18f100..66862f2d90c8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -118,6 +118,9 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time); + void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied); -- cgit v1.2.3 From c42843f2f0bbc9d716a32caf667d18fc2bf3bc4c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 2 Mar 2011 15:54:09 -0600 Subject: writeback: introduce smoothed global dirty limit The start of a heavy weight application (ie. KVM) may instantly knock down determine_dirtyable_memory() if the swap is not enabled or full. global_dirty_limits() and bdi_dirty_limit() will in turn get global/bdi dirty thresholds that are _much_ lower than the global/bdi dirty pages. balance_dirty_pages() will then heavily throttle all dirtiers including the light ones, until the dirty pages drop below the new dirty thresholds. During this _deep_ dirty-exceeded state, the system may appear rather unresponsive to the users. About "deep" dirty-exceeded: task_dirty_limit() assigns 1/8 lower dirty threshold to heavy dirtiers than light ones, and the dirty pages will be throttled around the heavy dirtiers' dirty threshold and reasonably below the light dirtiers' dirty threshold. In this state, only the heavy dirtiers will be throttled and the dirty pages are carefully controlled to not exceed the light dirtiers' dirty threshold. However if the threshold itself suddenly drops below the number of dirty pages, the light dirtiers will get heavily throttled. So introduce global_dirty_limit for tracking the global dirty threshold with policies - follow downwards slowly - follow up in one shot global_dirty_limit can effectively mask out the impact of sudden drop of dirtyable memory. It will be used in the next patch for two new type of dirty limits. Note that the new dirty limits are not going to avoid throttling the light dirtiers, but could limit their sleep time to 200ms. 
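As a rough illustration of the "follow up in one shot, follow down slowly" policy, here is a stand-alone sketch (not the kernel implementation; the 1/32 decay step per update is an assumption chosen for the example):

#include <stdio.h>

static unsigned long global_dirty_limit;

static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
{
	unsigned long limit = global_dirty_limit;

	if (limit < thresh) {
		/* the threshold went up: follow it immediately */
		limit = thresh;
	} else {
		/* the threshold dropped: ease down slowly, and never below
		 * the larger of the new threshold and the dirty page count */
		unsigned long floor = (thresh > dirty) ? thresh : dirty;

		if (limit > floor)
			limit -= (limit - floor) / 32;
	}
	global_dirty_limit = limit;
}

int main(void)
{
	update_dirty_limit(1000, 400);	/* first call: jump straight to 1000 */
	update_dirty_limit(200, 900);	/* sudden drop: only ease toward max(200, 900) */
	printf("global_dirty_limit = %lu\n", global_dirty_limit);	/* 997, not 200 */
	return 0;
}

With the dirty pages sitting above the suddenly-shrunk threshold, the limit drifts down a little per invocation instead of collapsing, which is what keeps light dirtiers from being throttled against the new, much lower threshold.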
Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 66862f2d90c8..e9d371b6053b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -84,6 +84,8 @@ static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); +extern unsigned long global_dirty_limit; + /* These are exported to sysctl. */ extern int dirty_background_ratio; extern unsigned long dirty_background_bytes; @@ -119,6 +121,10 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time); void page_writeback_init(void); -- cgit v1.2.3 From ffd1f609ab10532e8137b4b981fdf903ef4d0b32 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 19 Jun 2011 22:18:42 -0600 Subject: writeback: introduce max-pause and pass-good dirty limits The max-pause limit helps to keep the sleep time inside balance_dirty_pages() within MAX_PAUSE=200ms. The 200ms max sleep means a per-task rate limit of 8 pages/200ms = 160KB/s when the dirty limit is exceeded, which is normally enough to stop dirtiers from continuing to push the dirty pages high, unless there is a sufficiently large number of slow dirtiers (eg. 500 tasks doing 160KB/s will still sum up to 80MB/s, exceeding the write bandwidth of a slow disk and hence accumulating more and more dirty pages). The pass-good limit helps to let go of the good bdi's in the presence of a blocked bdi (ie. NFS server not responding) or slow USB disk which for some reason builds up a large number of initial dirty pages that refuse to go away anytime soon. For example, given two bdi's A and B and the initial state bdi_thresh_A = dirty_thresh / 2 bdi_thresh_B = dirty_thresh / 2 bdi_dirty_A = dirty_thresh / 2 bdi_dirty_B = dirty_thresh / 2 Then A gets blocked; after a dozen seconds bdi_thresh_A = 0 bdi_thresh_B = dirty_thresh bdi_dirty_A = dirty_thresh / 2 bdi_dirty_B = dirty_thresh / 2 The (bdi_dirty_B < bdi_thresh_B) test is now useless and the dirty pages will be effectively throttled by condition (nr_dirty < dirty_thresh). This has two problems: (1) we lose the protections for light dirtiers (2) balance_dirty_pages() effectively becomes IO-less because the (bdi_nr_reclaimable > bdi_thresh) test won't be true. This is good for IO, but balance_dirty_pages() loses an important way to break out of the loop, which leads to more spread out throttle delays. DIRTY_PASSGOOD_AREA can eliminate the above issues. The only problem is, DIRTY_PASSGOOD_AREA needs to be defined as 2 to fully cover the above example, while this patch uses the more conservative value 8 so as not to surprise people with more dirty pages than expected. The max-pause limit won't noticeably impact the speed at which dirty pages are knocked down when there is a sudden drop of global/bdi dirty thresholds, because the heavy dirtiers will be throttled below 160KB/s, which is slow enough. It does help to avoid long dirty throttle delays and especially will make light dirtiers more responsive.
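The two limits amount to two extra early-exit conditions in the balance_dirty_pages() loop; a hedged sketch of the checks (illustrative only, with the bookkeeping around pause time and the per-bdi counters omitted):

#define DIRTY_MAXPAUSE_AREA	16
#define DIRTY_PASSGOOD_AREA	8

/*
 * Illustrative predicate, not the kernel code: may this dirtier leave the
 * throttle loop early under the max-pause / pass-good rules?
 */
static int can_break_out(unsigned long nr_dirty, unsigned long dirty_thresh,
			 unsigned long bdi_dirty, unsigned long bdi_thresh,
			 int paused_max_200ms)
{
	/* max-pause area: within 1/16 above the limit, never sleep beyond 200ms */
	if (nr_dirty < dirty_thresh + dirty_thresh / DIRTY_MAXPAUSE_AREA &&
	    paused_max_200ms)
		return 1;

	/*
	 * pass-good area: within 1/8 above the limit, let bdi's that stay
	 * under their own threshold go, so one blocked bdi (e.g. a dead NFS
	 * server) cannot keep throttling the dirtiers of healthy bdi's.
	 */
	if (nr_dirty < dirty_thresh + dirty_thresh / DIRTY_PASSGOOD_AREA &&
	    bdi_dirty < bdi_thresh)
		return 1;

	return 0;
}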
Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e9d371b6053b..b625073b80c8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,27 @@ #include #include +/* + * The 1/16 region above the global dirty limit will be put to maximum pauses: + * + * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) + * + * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put + * to loops: + * + * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA) + * + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long + * time) for the dirty pages to drop, unless written enough pages. + * + * The global dirty threshold is normally equal to the global dirty limit, + * except when the system suddenly allocates a lot of anonymous memory and + * knocks down the global dirty threshold quickly, in which case the global + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. + */ +#define DIRTY_MAXPAUSE_AREA 16 +#define DIRTY_PASSGOOD_AREA 8 + struct backing_dev_info; /* -- cgit v1.2.3 From e1cbe236013c82bcf9a156e98d7b47efb89d2674 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 6 Dec 2010 22:34:29 -0600 Subject: writeback: trace global_dirty_state Add trace event balance_dirty_state for showing the global dirty page counts and thresholds at each global_dirty_limits() invocation. This will cover the callers throttle_vm_writeout(), over_bground_thresh() and each balance_dirty_pages() loop. Signed-off-by: Wu Fengguang --- include/trace/events/writeback.h | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3e7662a0cfa3..6bca4cc0063c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -204,6 +204,52 @@ TRACE_EVENT(writeback_queue_io, __entry->moved) ); +TRACE_EVENT(global_dirty_state, + + TP_PROTO(unsigned long background_thresh, + unsigned long dirty_thresh + ), + + TP_ARGS(background_thresh, + dirty_thresh + ), + + TP_STRUCT__entry( + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_unstable) + __field(unsigned long, background_thresh) + __field(unsigned long, dirty_thresh) + __field(unsigned long, dirty_limit) + __field(unsigned long, nr_dirtied) + __field(unsigned long, nr_written) + ), + + TP_fast_assign( + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); + __entry->nr_writeback = global_page_state(NR_WRITEBACK); + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); + __entry->nr_dirtied = global_page_state(NR_DIRTIED); + __entry->nr_written = global_page_state(NR_WRITTEN); + __entry->background_thresh = background_thresh; + __entry->dirty_thresh = dirty_thresh; + __entry->dirty_limit = global_dirty_limit; + ), + + TP_printk("dirty=%lu writeback=%lu unstable=%lu " + "bg_thresh=%lu thresh=%lu limit=%lu " + "dirtied=%lu written=%lu", + __entry->nr_dirty, + __entry->nr_writeback, + __entry->nr_unstable, + __entry->background_thresh, + __entry->dirty_thresh, + __entry->dirty_limit, + __entry->nr_dirtied, + __entry->nr_written + ) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), -- cgit v1.2.3 From 1a12d8bd7b2998be01ee55edb64e7473728abb9c Mon Sep 
17 00:00:00 2001 From: Wu Fengguang Date: Sun, 29 Aug 2010 13:28:09 -0600 Subject: writeback: scale IO chunk size up to half device bandwidth Originally, MAX_WRITEBACK_PAGES was hard-coded to 1024 because of a concern of not holding I_SYNC for too long. (At least, that was the comment previously.) This doesn't make sense now because the only time we wait for I_SYNC is if we are calling sync or fsync, and in that case we need to write out all of the data anyway. Previously there may have been other code paths that waited on I_SYNC, but not any more. -- Theodore Ts'o So remove the MAX_WRITEBACK_PAGES constraint. The number of writeback pages will adapt to as many as the storage device can write within 500ms. XFS is observed to do IO completions in a batch, and the batch size is equal to the write chunk size. To avoid dirty pages suddenly dropping out of balance_dirty_pages()'s dirty control scope and creating large fluctuations, the chunk size is also limited to half the control scope. The balance_dirty_pages() control scope is [(background_thresh + dirty_thresh) / 2, dirty_thresh], which is by default [15%, 20%] of global dirty pages, whose range size is dirty_thresh / DIRTY_FULL_SCOPE. The adaptive write chunk size will be rounded to the nearest 4MB boundary. http://bugzilla.kernel.org/show_bug.cgi?id=13930 CC: Theodore Ts'o CC: Dave Chinner CC: Chris Mason CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b625073b80c8..f1bfa12ea246 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -8,6 +8,10 @@ #include /* + * The 1/4 region under the global dirty thresh is for smooth dirty throttling: + * + * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) + * * The 1/16 region above the global dirty limit will be put to maximum pauses: * * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) @@ -25,9 +29,16 @@ * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ +#define DIRTY_SCOPE 8 +#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) #define DIRTY_MAXPAUSE_AREA 16 #define DIRTY_PASSGOOD_AREA 8 +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + struct backing_dev_info; /* -- cgit v1.2.3
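The fs-writeback.c side of this last patch is not shown above (the listing is limited to include/), but the resulting chunk-size computation can be modelled roughly as below -- a simplified stand-alone sketch assuming 4k pages, not the verbatim kernel helper; the constants mirror the ones added to writeback.h:

#include <stdio.h>

#define PAGE_SIZE_KB		4UL
#define MIN_WRITEBACK_PAGES	(4096UL / PAGE_SIZE_KB)		/* 4MB in pages */
#define DIRTY_SCOPE		8

static unsigned long writeback_chunk_pages(unsigned long write_bw_pages_per_sec,
					    unsigned long dirty_limit_pages)
{
	/* half a second worth of writeout at the estimated bandwidth ... */
	unsigned long pages = write_bw_pages_per_sec / 2;

	/* ... but no more than half the balance_dirty_pages() control scope */
	if (pages > dirty_limit_pages / DIRTY_SCOPE)
		pages = dirty_limit_pages / DIRTY_SCOPE;

	/* round to the nearest 4MB boundary, with 4MB as the minimum */
	pages = (pages + MIN_WRITEBACK_PAGES / 2) / MIN_WRITEBACK_PAGES
		* MIN_WRITEBACK_PAGES;
	if (pages < MIN_WRITEBACK_PAGES)
		pages = MIN_WRITEBACK_PAGES;
	return pages;
}

int main(void)
{
	/* ~96MB/s disk (24576 pages/s), ~8GB dirty limit: 48MB (12288-page) chunks */
	printf("%lu pages per chunk\n", writeback_chunk_pages(24576, 2097152));
	return 0;
}

The rounding here is a simplification; the point is that the write chunk now grows with the measured bandwidth instead of being pinned at the old 1024-page MAX_WRITEBACK_PAGES.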