summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/blkio-controller.txt83
-rw-r--r--Documentation/cgroups/memory.txt1
-rw-r--r--block/bio.c35
-rw-r--r--block/blk-cgroup.c123
-rw-r--r--block/blk-core.c70
-rw-r--r--block/blk-integrity.c1
-rw-r--r--block/blk-sysfs.c3
-rw-r--r--block/blk-throttle.c2
-rw-r--r--block/bounce.c1
-rw-r--r--block/cfq-iosched.c2
-rw-r--r--block/elevator.c2
-rw-r--r--block/genhd.c1
-rw-r--r--drivers/block/drbd/drbd_int.h1
-rw-r--r--drivers/block/drbd/drbd_main.c10
-rw-r--r--drivers/block/pktcdvd.c1
-rw-r--r--drivers/char/raw.c1
-rw-r--r--drivers/md/bcache/request.c1
-rw-r--r--drivers/md/dm.c2
-rw-r--r--drivers/md/dm.h1
-rw-r--r--drivers/md/md.h1
-rw-r--r--drivers/md/raid1.c4
-rw-r--r--drivers/md/raid10.c2
-rw-r--r--drivers/mtd/devices/block2mtd.c1
-rw-r--r--drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h4
-rw-r--r--fs/9p/v9fs.c50
-rw-r--r--fs/9p/vfs_super.c8
-rw-r--r--fs/block_dev.c9
-rw-r--r--fs/buffer.c64
-rw-r--r--fs/ext2/super.c1
-rw-r--r--fs/ext4/extents.c1
-rw-r--r--fs/ext4/mballoc.c1
-rw-r--r--fs/ext4/super.c1
-rw-r--r--fs/f2fs/node.c4
-rw-r--r--fs/f2fs/segment.h3
-rw-r--r--fs/fat/file.c1
-rw-r--r--fs/fat/inode.c1
-rw-r--r--fs/fs-writeback.c1167
-rw-r--r--fs/fuse/file.c12
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hfs/super.c1
-rw-r--r--fs/hfsplus/super.c1
-rw-r--r--fs/inode.c1
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/nfs/filelayout/filelayout.c1
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/ocfs2/file.c1
-rw-r--r--fs/reiserfs/super.c1
-rw-r--r--fs/ufs/super.c1
-rw-r--r--fs/xfs/xfs_aops.c12
-rw-r--r--fs/xfs/xfs_file.c1
-rw-r--r--include/linux/backing-dev-defs.h255
-rw-r--r--include/linux/backing-dev.h557
-rw-r--r--include/linux/bio.h3
-rw-r--r--include/linux/blk-cgroup.h (renamed from block/blk-cgroup.h)32
-rw-r--r--include/linux/blkdev.h21
-rw-r--r--include/linux/cgroup.h25
-rw-r--r--include/linux/fs.h26
-rw-r--r--include/linux/memcontrol.h29
-rw-r--r--include/linux/mm.h8
-rw-r--r--include/linux/pagemap.h3
-rw-r--r--include/linux/writeback.h221
-rw-r--r--include/trace/events/writeback.h15
-rw-r--r--init/Kconfig5
-rw-r--r--mm/backing-dev.c634
-rw-r--r--mm/fadvise.c2
-rw-r--r--mm/filemap.c34
-rw-r--r--mm/madvise.c1
-rw-r--r--mm/memcontrol.c223
-rw-r--r--mm/page-writeback.c1231
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/truncate.c18
-rw-r--r--mm/vmscan.c79
74 files changed, 3890 insertions, 1242 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index cd556b914786..68b6a6a470b0 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -387,8 +387,81 @@ groups and put applications in that group which are not driving enough
IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
on individual groups and throughput should improve.
-What works
-==========
-- Currently only sync IO queues are support. All the buffered writes are
- still system wide and not per group. Hence we will not see service
- differentiation between buffered writes between groups.
+Writeback
+=========
+
+Page cache is dirtied through buffered writes and shared mmaps and
+written asynchronously to the backing filesystem by the writeback
+mechanism. Writeback sits between the memory and IO domains and
+regulates the proportion of dirty memory by balancing dirtying and
+write IOs.
+
+On traditional cgroup hierarchies, relationships between different
+controllers cannot be established, making it impossible for writeback
+to operate while accounting for cgroup resource restrictions, and all
+writeback IOs are attributed to the root cgroup.
+
+If both the blkio and memory controllers are used on the v2 hierarchy
+and the filesystem supports cgroup writeback, writeback operations
+correctly follow the resource restrictions imposed by both memory and
+blkio controllers.
+
+Writeback examines both system-wide and per-cgroup dirty memory status
+and enforces the more restrictive of the two. Also, writeback control
+parameters which are absolute values - vm.dirty_bytes and
+vm.dirty_background_bytes - are distributed across cgroups according
+to their current writeback bandwidth.
+
+There's a peculiarity stemming from the discrepancy in ownership
+granularity between memory controller and writeback. While memory
+controller tracks ownership per page, writeback operates on inode
+basis. cgroup writeback bridges the gap by tracking ownership by
+inode but migrating ownership if too many foreign pages, pages which
+don't match the current inode ownership, have been encountered while
+writing back the inode.
+
+This is a conscious design choice as writeback operations are
+inherently tied to inodes making strictly following page ownership
+complicated and inefficient. The only use case which suffers from
+this compromise is multiple cgroups concurrently dirtying disjoint
+regions of the same inode, which is an unlikely use case and is
+deliberately unsupported. Note that as memory controller assigns page
+ownership on the first use and doesn't update it until the page is
+released, even if cgroup writeback strictly follows page ownership,
+multiple cgroups dirtying overlapping areas wouldn't work as expected.
+In general, write-sharing an inode across multiple cgroups is not well
+supported.
+
+Filesystem support for cgroup writeback
+---------------------------------------
+
+A filesystem can make writeback IOs cgroup-aware by updating
+address_space_operations->writepage[s]() to annotate bio's using the
+following two functions.
+
+* wbc_init_bio(@wbc, @bio)
+
+ Should be called for each bio carrying writeback data and associates
+ the bio with the inode's owner cgroup. Can be called anytime
+ between bio allocation and submission.
+
+* wbc_account_io(@wbc, @page, @bytes)
+
+ Should be called for each data segment being written out. While
+ this function doesn't care exactly when it's called during the
+ writeback session, it's easiest and most natural to call it as
+ data segments are added to a bio.
+
+With writeback bio's annotated, cgroup support can be enabled per
+super_block by setting MS_CGROUPWB in ->s_flags. This allows for
+selective disabling of cgroup writeback support which is helpful when
+certain filesystem features, e.g. journaled data mode, are
+incompatible.
+
+wbc_init_bio() binds the specified bio to its cgroup. Depending on
+the configuration, the bio may be executed at a lower priority and, if
+the writeback session is holding shared resources, e.g. a journal
+entry, this may lead to priority inversion. There is no one easy solution
+for the problem. Filesystems can try to work around specific problem
+cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+directly.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index f456b4315e86..ff71e16cc752 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -493,6 +493,7 @@ pgpgin - # of charging events to the memory cgroup. The charging
pgpgout - # of uncharging events to the memory cgroup. The uncharging
event happens each time a page is unaccounted from the cgroup.
swap - # of bytes of swap usage
+dirty - # of bytes that are waiting to get written back to the disk.
writeback - # of bytes of file/anon cache that are queued for syncing to
disk.
inactive_anon - # of bytes of anonymous and swap cache memory on inactive
diff --git a/block/bio.c b/block/bio.c
index 259197d97de1..2a00d349cd68 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1988,6 +1988,28 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_
EXPORT_SYMBOL(bioset_create_nobvec);
#ifdef CONFIG_BLK_CGROUP
+
+/**
+ * bio_associate_blkcg - associate a bio with the specified blkcg
+ * @bio: target bio
+ * @blkcg_css: css of the blkcg to associate
+ *
+ * Associate @bio with the blkcg specified by @blkcg_css. Block layer will
+ * treat @bio as if it were issued by a task which belongs to the blkcg.
+ *
+ * This function takes an extra reference of @blkcg_css which will be put
+ * when @bio is released. The caller must own @bio and is responsible for
+ * synchronizing calls to this function.
+ */
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+{
+ if (unlikely(bio->bi_css))
+ return -EBUSY;
+ css_get(blkcg_css);
+ bio->bi_css = blkcg_css;
+ return 0;
+}
+
/**
* bio_associate_current - associate a bio with %current
* @bio: target bio
@@ -2004,26 +2026,17 @@ EXPORT_SYMBOL(bioset_create_nobvec);
int bio_associate_current(struct bio *bio)
{
struct io_context *ioc;
- struct cgroup_subsys_state *css;
- if (bio->bi_ioc)
+ if (bio->bi_css)
return -EBUSY;
ioc = current->io_context;
if (!ioc)
return -ENOENT;
- /* acquire active ref on @ioc and associate */
get_io_context_active(ioc);
bio->bi_ioc = ioc;
-
- /* associate blkcg if exists */
- rcu_read_lock();
- css = task_css(current, blkio_cgrp_id);
- if (css && css_tryget_online(css))
- bio->bi_css = css;
- rcu_read_unlock();
-
+ bio->bi_css = task_get_css(current, blkio_cgrp_id);
return 0;
}
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6e43fa355e71..9f97da52d006 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -19,11 +19,12 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -33,6 +34,8 @@ static DEFINE_MUTEX(blkcg_pol_mutex);
struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);
+struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static bool blkcg_policy_enabled(struct request_queue *q,
@@ -182,6 +185,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
+ struct bdi_writeback_congested *wb_congested;
int i, ret;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -193,22 +197,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
+ wb_congested = wb_congested_get_create(&q->backing_dev_info,
+ blkcg->css.id, GFP_ATOMIC);
+ if (!wb_congested) {
+ ret = -ENOMEM;
+ goto err_put_css;
+ }
+
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto err_put_css;
+ goto err_put_congested;
}
}
blkg = new_blkg;
+ blkg->wb_congested = wb_congested;
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
ret = -EINVAL;
- goto err_put_css;
+ goto err_put_congested;
}
blkg_get(blkg->parent);
}
@@ -238,18 +250,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg->online = true;
spin_unlock(&blkcg->lock);
- if (!ret) {
- if (blkcg == &blkcg_root) {
- q->root_blkg = blkg;
- q->root_rl.blkg = blkg;
- }
+ if (!ret)
return blkg;
- }
/* @blkg failed fully initialized, use the usual release path */
blkg_put(blkg);
return ERR_PTR(ret);
+err_put_congested:
+ wb_congested_put(wb_congested);
err_put_css:
css_put(&blkcg->css);
err_free_blkg:
@@ -343,15 +352,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
rcu_assign_pointer(blkcg->blkg_hint, NULL);
/*
- * If root blkg is destroyed. Just clear the pointer since root_rl
- * does not take reference on root blkg.
- */
- if (blkcg == &blkcg_root) {
- blkg->q->root_blkg = NULL;
- blkg->q->root_rl.blkg = NULL;
- }
-
- /*
* Put the reference taken at the time of creation so that when all
* queues are gone, group can be destroyed.
*/
@@ -405,6 +405,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
if (blkg->parent)
blkg_put(blkg->parent);
+ wb_congested_put(blkg->wb_congested);
+
blkg_free(blkg);
}
EXPORT_SYMBOL_GPL(__blkg_release_rcu);
@@ -812,6 +814,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock_irq(&blkcg->lock);
+
+ wb_blkcg_offline(blkcg);
}
static void blkcg_css_free(struct cgroup_subsys_state *css)
@@ -868,7 +872,9 @@ done:
spin_lock_init(&blkcg->lock);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+ INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
return &blkcg->css;
free_pd_blkcg:
@@ -892,9 +898,45 @@ free_blkcg:
*/
int blkcg_init_queue(struct request_queue *q)
{
- might_sleep();
+ struct blkcg_gq *new_blkg, *blkg;
+ bool preloaded;
+ int ret;
+
+ new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+ if (!new_blkg)
+ return -ENOMEM;
+
+ preloaded = !radix_tree_preload(GFP_KERNEL);
+
+ /*
+ * Make sure the root blkg exists and count the existing blkgs. As
+ * @q is bypassing at this point, blkg_lookup_create() can't be
+ * used. Open code insertion.
+ */
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_create(&blkcg_root, q, new_blkg);
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+
+ if (preloaded)
+ radix_tree_preload_end();
+
+ if (IS_ERR(blkg)) {
+ kfree(new_blkg);
+ return PTR_ERR(blkg);
+ }
- return blk_throtl_init(q);
+ q->root_blkg = blkg;
+ q->root_rl.blkg = blkg;
+
+ ret = blk_throtl_init(q);
+ if (ret) {
+ spin_lock_irq(q->queue_lock);
+ blkg_destroy_all(q);
+ spin_unlock_irq(q->queue_lock);
+ }
+ return ret;
}
/**
@@ -996,50 +1038,19 @@ int blkcg_activate_policy(struct request_queue *q,
{
LIST_HEAD(pds);
LIST_HEAD(cpds);
- struct blkcg_gq *blkg, *new_blkg;
+ struct blkcg_gq *blkg;
struct blkg_policy_data *pd, *nd;
struct blkcg_policy_data *cpd, *cnd;
int cnt = 0, ret;
- bool preloaded;
if (blkcg_policy_enabled(q, pol))
return 0;
- /* preallocations for root blkg */
- new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
- if (!new_blkg)
- return -ENOMEM;
-
+ /* count and allocate policy_data for all existing blkgs */
blk_queue_bypass_start(q);
-
- preloaded = !radix_tree_preload(GFP_KERNEL);
-
- /*
- * Make sure the root blkg exists and count the existing blkgs. As
- * @q is bypassing at this point, blkg_lookup_create() can't be
- * used. Open code it.
- */
spin_lock_irq(q->queue_lock);
-
- rcu_read_lock();
- blkg = __blkg_lookup(&blkcg_root, q, false);
- if (blkg)
- blkg_free(new_blkg);
- else
- blkg = blkg_create(&blkcg_root, q, new_blkg);
- rcu_read_unlock();
-
- if (preloaded)
- radix_tree_preload_end();
-
- if (IS_ERR(blkg)) {
- ret = PTR_ERR(blkg);
- goto out_unlock;
- }
-
list_for_each_entry(blkg, &q->blkg_list, q_node)
cnt++;
-
spin_unlock_irq(q->queue_lock);
/*
@@ -1140,10 +1151,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
__clear_bit(pol->plid, q->blkcg_pols);
- /* if no policy is left, no need for blkgs - shoot them down */
- if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
- blkg_destroy_all(q);
-
list_for_each_entry(blkg, &q->blkg_list, q_node) {
/* grab blkcg lock too while removing @pd from @blkg */
spin_lock(&blkg->blkcg->lock);
diff --git a/block/blk-core.c b/block/blk-core.c
index f6ab750060fe..688ae9482cb8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -32,12 +32,12 @@
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
#include "blk.h"
-#include "blk-cgroup.h"
#include "blk-mq.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -63,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
+static void blk_clear_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+ clear_wb_congested(rl->blkg->wb_congested, sync);
+#else
+ /*
+ * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
+ * flip its congestion state for events on other blkcgs.
+ */
+ if (rl == &rl->q->root_rl)
+ clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
+static void blk_set_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+ set_wb_congested(rl->blkg->wb_congested, sync);
+#else
+ /* see blk_clear_congested() */
+ if (rl == &rl->q->root_rl)
+ set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
void blk_queue_congestion_threshold(struct request_queue *q)
{
int nr;
@@ -623,8 +648,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
- q->backing_dev_info.state = 0;
- q->backing_dev_info.capabilities = 0;
+ q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->backing_dev_info.name = "block";
q->node = node_id;
@@ -847,13 +871,8 @@ static void __freed_request(struct request_list *rl, int sync)
{
struct request_queue *q = rl->q;
- /*
- * bdi isn't aware of blkcg yet. As all async IOs end up root
- * blkcg anyway, just use root blkcg state.
- */
- if (rl == &q->root_rl &&
- rl->count[sync] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, sync);
+ if (rl->count[sync] < queue_congestion_off_threshold(q))
+ blk_clear_congested(rl, sync);
if (rl->count[sync] + 1 <= q->nr_requests) {
if (waitqueue_active(&rl->wait[sync]))
@@ -886,25 +905,25 @@ static void freed_request(struct request_list *rl, unsigned int flags)
int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct request_list *rl;
+ int on_thresh, off_thresh;
spin_lock_irq(q->queue_lock);
q->nr_requests = nr;
blk_queue_congestion_threshold(q);
+ on_thresh = queue_congestion_on_threshold(q);
+ off_thresh = queue_congestion_off_threshold(q);
- /* congestion isn't cgroup aware and follows root blkcg for now */
- rl = &q->root_rl;
-
- if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, BLK_RW_SYNC);
- else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, BLK_RW_SYNC);
+ blk_queue_for_each_rl(rl, q) {
+ if (rl->count[BLK_RW_SYNC] >= on_thresh)
+ blk_set_congested(rl, BLK_RW_SYNC);
+ else if (rl->count[BLK_RW_SYNC] < off_thresh)
+ blk_clear_congested(rl, BLK_RW_SYNC);
- if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, BLK_RW_ASYNC);
- else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, BLK_RW_ASYNC);
+ if (rl->count[BLK_RW_ASYNC] >= on_thresh)
+ blk_set_congested(rl, BLK_RW_ASYNC);
+ else if (rl->count[BLK_RW_ASYNC] < off_thresh)
+ blk_clear_congested(rl, BLK_RW_ASYNC);
- blk_queue_for_each_rl(rl, q) {
if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
blk_set_rl_full(rl, BLK_RW_SYNC);
} else {
@@ -1014,12 +1033,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
}
}
}
- /*
- * bdi isn't aware of blkcg yet. As all async IOs end up
- * root blkcg anyway, just use root blkcg state.
- */
- if (rl == &q->root_rl)
<