From 3c9959e025472122a61faebb208525cf26b305d1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 28 Dec 2018 00:36:33 -0800 Subject: zram: fix lockdep warning of free block handling Patch series "zram idle page writeback", v3. Inherently, swap device has many idle pages which are rare touched since it was allocated. It is never problem if we use storage device as swap. However, it's just waste for zram-swap. This patchset supports zram idle page writeback feature. * Admin can define what is idle page "no access since X time ago" * Admin can define when zram should writeback them * Admin can define when zram should stop writeback to prevent wearout Details are in each patch's description. This patch (of 7): ================================ WARNING: inconsistent lock state 4.19.0+ #390 Not tainted -------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. zram_verify/2095 [HC0[0]:SC1[1]:HE1:SE0] takes: 00000000b1828693 (&(&zram->bitmap_lock)->rlock){+.?.}, at: put_entry_bdev+0x1e/0x50 {SOFTIRQ-ON-W} state was registered at: _raw_spin_lock+0x2c/0x40 zram_make_request+0x755/0xdc9 generic_make_request+0x373/0x6a0 submit_bio+0x6c/0x140 __swap_writepage+0x3a8/0x480 shrink_page_list+0x1102/0x1a60 shrink_inactive_list+0x21b/0x3f0 shrink_node_memcg.constprop.99+0x4f8/0x7e0 shrink_node+0x7d/0x2f0 do_try_to_free_pages+0xe0/0x300 try_to_free_pages+0x116/0x2b0 __alloc_pages_slowpath+0x3f4/0xf80 __alloc_pages_nodemask+0x2a2/0x2f0 __handle_mm_fault+0x42e/0xb50 handle_mm_fault+0x55/0xb0 __do_page_fault+0x235/0x4b0 page_fault+0x1e/0x30 irq event stamp: 228412 hardirqs last enabled at (228412): [] __slab_free+0x3e6/0x600 hardirqs last disabled at (228411): [] __slab_free+0x1c5/0x600 softirqs last enabled at (228396): [] __do_softirq+0x31e/0x427 softirqs last disabled at (228403): [] irq_exit+0xd1/0xe0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&zram->bitmap_lock)->rlock); lock(&(&zram->bitmap_lock)->rlock); *** DEADLOCK *** no locks held by zram_verify/2095. stack backtrace: CPU: 5 PID: 2095 Comm: zram_verify Not tainted 4.19.0+ #390 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Call Trace: dump_stack+0x67/0x9b print_usage_bug+0x1bd/0x1d3 mark_lock+0x4aa/0x540 __lock_acquire+0x51d/0x1300 lock_acquire+0x90/0x180 _raw_spin_lock+0x2c/0x40 put_entry_bdev+0x1e/0x50 zram_free_page+0xf6/0x110 zram_slot_free_notify+0x42/0xa0 end_swap_bio_read+0x5b/0x170 blk_update_request+0x8f/0x340 scsi_end_request+0x2c/0x1e0 scsi_io_completion+0x98/0x650 blk_done_softirq+0x9e/0xd0 __do_softirq+0xcc/0x427 irq_exit+0xd1/0xe0 do_IRQ+0x93/0x120 common_interrupt+0xf/0xf With writeback feature, zram_slot_free_notify could be called in softirq context by end_swap_bio_read. However, bitmap_lock is not aware of that so lockdep yell out: get_entry_bdev spin_lock(bitmap->lock); irq softirq end_swap_bio_read zram_slot_free_notify zram_slot_lock <-- deadlock prone zram_free_page put_entry_bdev spin_lock(bitmap->lock); <-- deadlock prone With akpm's suggestion (i.e. bitmap operation is already atomic), we could remove bitmap lock. It might fail to find a empty slot if serious contention happens. However, it's not severe problem because huge page writeback has already possiblity to fail if there is severe memory pressure. Worst case is just keeping the incompressible in memory, not storage. The other problem is zram_slot_lock in zram_slot_slot_free_notify. To make it safe is this patch introduces zram_slot_trylock where zram_slot_free_notify uses it. Although it's rare to be contented, this patch adds new debug stat "miss_free" to keep monitoring how often it happens. Link: http://lkml.kernel.org/r/20181127055429.251614-2-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Reviewed-by: Joey Pabalinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 38 +++++++++++++++++++++----------------- drivers/block/zram/zram_drv.h | 2 +- 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4879595200e1..21a7046958a3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -53,6 +53,11 @@ static size_t huge_class_size; static void zram_free_page(struct zram *zram, size_t index); +static int zram_slot_trylock(struct zram *zram, u32 index) +{ + return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].value); +} + static void zram_slot_lock(struct zram *zram, u32 index) { bit_spin_lock(ZRAM_LOCK, &zram->table[index].value); @@ -399,7 +404,6 @@ static ssize_t backing_dev_store(struct device *dev, goto out; reset_bdev(zram); - spin_lock_init(&zram->bitmap_lock); zram->old_block_size = old_block_size; zram->bdev = bdev; @@ -443,29 +447,24 @@ out: static unsigned long get_entry_bdev(struct zram *zram) { - unsigned long entry; - - spin_lock(&zram->bitmap_lock); + unsigned long blk_idx = 1; +retry: /* skip 0 bit to confuse zram.handle = 0 */ - entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1); - if (entry == zram->nr_pages) { - spin_unlock(&zram->bitmap_lock); + blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx); + if (blk_idx == zram->nr_pages) return 0; - } - set_bit(entry, zram->bitmap); - spin_unlock(&zram->bitmap_lock); + if (test_and_set_bit(blk_idx, zram->bitmap)) + goto retry; - return entry; + return blk_idx; } static void put_entry_bdev(struct zram *zram, unsigned long entry) { int was_set; - spin_lock(&zram->bitmap_lock); was_set = test_and_clear_bit(entry, zram->bitmap); - spin_unlock(&zram->bitmap_lock); WARN_ON_ONCE(!was_set); } @@ -886,9 +885,10 @@ static ssize_t debug_stat_show(struct device *dev, down_read(&zram->init_lock); ret = scnprintf(buf, PAGE_SIZE, - "version: %d\n%8llu\n", + "version: %d\n%8llu %8llu\n", version, - (u64)atomic64_read(&zram->stats.writestall)); + (u64)atomic64_read(&zram->stats.writestall), + (u64)atomic64_read(&zram->stats.miss_free)); up_read(&zram->init_lock); return ret; @@ -1400,10 +1400,14 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; - zram_slot_lock(zram, index); + atomic64_inc(&zram->stats.notify_free); + if (!zram_slot_trylock(zram, index)) { + atomic64_inc(&zram->stats.miss_free); + return; + } + zram_free_page(zram, index); zram_slot_unlock(zram, index); - atomic64_inc(&zram->stats.notify_free); } static int zram_rw_page(struct block_device *bdev, sector_t sector, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 72c8584b6dff..d1095dfdffa8 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -79,6 +79,7 @@ struct zram_stats { atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ + atomic64_t miss_free; /* no. of missed free */ }; struct zram { @@ -110,7 +111,6 @@ struct zram { unsigned int old_block_size; unsigned long *bitmap; unsigned long nr_pages; - spinlock_t bitmap_lock; #endif #ifdef CONFIG_ZRAM_MEMORY_TRACKING struct dentry *debugfs_dir; -- cgit v1.2.3 From 5547932dc67a48713eece4fa4703bfdf0cfcb818 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 28 Dec 2018 00:36:37 -0800 Subject: zram: fix double free backing device If blkdev_get fails, we shouldn't do blkdev_put. Otherwise, kernel emits below log. This patch fixes it. WARNING: CPU: 0 PID: 1893 at fs/block_dev.c:1828 blkdev_put+0x105/0x120 Modules linked in: CPU: 0 PID: 1893 Comm: swapoff Not tainted 4.19.0+ #453 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 RIP: 0010:blkdev_put+0x105/0x120 Call Trace: __x64_sys_swapoff+0x46d/0x490 do_syscall_64+0x5a/0x190 entry_SYSCALL_64_after_hwframe+0x49/0xbe irq event stamp: 4466 hardirqs last enabled at (4465): __free_pages_ok+0x1e3/0x490 hardirqs last disabled at (4466): trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (3420): __do_softirq+0x333/0x446 softirqs last disabled at (3407): irq_exit+0xd1/0xe0 Link: http://lkml.kernel.org/r/20181127055429.251614-3-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Reviewed-by: Joey Pabalinas Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 21a7046958a3..d1459cc1159f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -387,8 +387,10 @@ static ssize_t backing_dev_store(struct device *dev, bdev = bdgrab(I_BDEV(inode)); err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram); - if (err < 0) + if (err < 0) { + bdev = NULL; goto out; + } nr_pages = i_size_read(inode) >> PAGE_SHIFT; bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long); -- cgit v1.2.3 From 7e5292831b346bacb05558ca385cae366187314c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 28 Dec 2018 00:36:40 -0800 Subject: zram: refactor flags and writeback stuff Rename some variables and restructure some code for better readability in writeback and zs_free_page. Link: http://lkml.kernel.org/r/20181127055429.251614-4-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Reviewed-by: Joey Pabalinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 105 ++++++++++++++++-------------------------- drivers/block/zram/zram_drv.h | 8 ++-- 2 files changed, 44 insertions(+), 69 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d1459cc1159f..4457d0395bfb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -55,17 +55,17 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_slot_trylock(struct zram *zram, u32 index) { - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].value); + return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); } static void zram_slot_lock(struct zram *zram, u32 index) { - bit_spin_lock(ZRAM_LOCK, &zram->table[index].value); + bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); } static void zram_slot_unlock(struct zram *zram, u32 index) { - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].value); + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } static inline bool init_done(struct zram *zram) @@ -76,7 +76,7 @@ static inline bool init_done(struct zram *zram) static inline bool zram_allocated(struct zram *zram, u32 index) { - return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) || + return (zram->table[index].flags >> (ZRAM_FLAG_SHIFT + 1)) || zram->table[index].handle; } @@ -99,19 +99,19 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) static bool zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - return zram->table[index].value & BIT(flag); + return zram->table[index].flags & BIT(flag); } static void zram_set_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - zram->table[index].value |= BIT(flag); + zram->table[index].flags |= BIT(flag); } static void zram_clear_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - zram->table[index].value &= ~BIT(flag); + zram->table[index].flags &= ~BIT(flag); } static inline void zram_set_element(struct zram *zram, u32 index, @@ -127,15 +127,15 @@ static unsigned long zram_get_element(struct zram *zram, u32 index) static size_t zram_get_obj_size(struct zram *zram, u32 index) { - return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); } static void zram_set_obj_size(struct zram *zram, u32 index, size_t size) { - unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT; + unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; - zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; + zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; } #if PAGE_SIZE != 4096 @@ -282,16 +282,11 @@ static ssize_t mem_used_max_store(struct device *dev, } #ifdef CONFIG_ZRAM_WRITEBACK -static bool zram_wb_enabled(struct zram *zram) -{ - return zram->backing_dev; -} - static void reset_bdev(struct zram *zram) { struct block_device *bdev; - if (!zram_wb_enabled(zram)) + if (!zram->backing_dev) return; bdev = zram->bdev; @@ -318,7 +313,7 @@ static ssize_t backing_dev_show(struct device *dev, ssize_t ret; down_read(&zram->init_lock); - if (!zram_wb_enabled(zram)) { + if (!zram->backing_dev) { memcpy(buf, "none\n", 5); up_read(&zram->init_lock); return 5; @@ -447,7 +442,7 @@ out: return err; } -static unsigned long get_entry_bdev(struct zram *zram) +static unsigned long alloc_block_bdev(struct zram *zram) { unsigned long blk_idx = 1; retry: @@ -462,11 +457,11 @@ retry: return blk_idx; } -static void put_entry_bdev(struct zram *zram, unsigned long entry) +static void free_block_bdev(struct zram *zram, unsigned long blk_idx) { int was_set; - was_set = test_and_clear_bit(entry, zram->bitmap); + was_set = test_and_clear_bit(blk_idx, zram->bitmap); WARN_ON_ONCE(!was_set); } @@ -579,7 +574,7 @@ static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, if (!bio) return -ENOMEM; - entry = get_entry_bdev(zram); + entry = alloc_block_bdev(zram); if (!entry) { bio_put(bio); return -ENOSPC; @@ -590,7 +585,7 @@ static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) { bio_put(bio); - put_entry_bdev(zram, entry); + free_block_bdev(zram, entry); return -EIO; } @@ -608,18 +603,7 @@ static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, return 0; } -static void zram_wb_clear(struct zram *zram, u32 index) -{ - unsigned long entry; - - zram_clear_flag(zram, index, ZRAM_WB); - entry = zram_get_element(zram, index); - zram_set_element(zram, index, 0); - put_entry_bdev(zram, entry); -} - #else -static bool zram_wb_enabled(struct zram *zram) { return false; } static inline void reset_bdev(struct zram *zram) {}; static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, u32 index, struct bio *parent, @@ -634,7 +618,8 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, { return -EIO; } -static void zram_wb_clear(struct zram *zram, u32 index) {} + +static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {}; #endif #ifdef CONFIG_ZRAM_MEMORY_TRACKING @@ -656,11 +641,6 @@ static void zram_accessed(struct zram *zram, u32 index) zram->table[index].ac_time = ktime_get_boottime(); } -static void zram_reset_access(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = 0; -} - static ssize_t read_block_state(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -741,7 +721,6 @@ static void zram_debugfs_unregister(struct zram *zram) static void zram_debugfs_create(void) {}; static void zram_debugfs_destroy(void) {}; static void zram_accessed(struct zram *zram, u32 index) {}; -static void zram_reset_access(struct zram *zram, u32 index) {}; static void zram_debugfs_register(struct zram *zram) {}; static void zram_debugfs_unregister(struct zram *zram) {}; #endif @@ -942,17 +921,18 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; - zram_reset_access(zram, index); - +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + zram->table[index].ac_time = 0; +#endif if (zram_test_flag(zram, index, ZRAM_HUGE)) { zram_clear_flag(zram, index, ZRAM_HUGE); atomic64_dec(&zram->stats.huge_pages); } - if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { - zram_wb_clear(zram, index); - atomic64_dec(&zram->stats.pages_stored); - return; + if (zram_test_flag(zram, index, ZRAM_WB)) { + zram_clear_flag(zram, index, ZRAM_WB); + free_block_bdev(zram, zram_get_element(zram, index)); + goto out; } /* @@ -961,10 +941,8 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(zram, index, ZRAM_SAME)) { zram_clear_flag(zram, index, ZRAM_SAME); - zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); - atomic64_dec(&zram->stats.pages_stored); - return; + goto out; } handle = zram_get_handle(zram, index); @@ -975,10 +953,11 @@ static void zram_free_page(struct zram *zram, size_t index) atomic64_sub(zram_get_obj_size(zram, index), &zram->stats.compr_data_size); +out: atomic64_dec(&zram->stats.pages_stored); - zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); + WARN_ON_ONCE(zram->table[index].flags & ~(1UL << ZRAM_LOCK)); } static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, @@ -989,24 +968,20 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, unsigned int size; void *src, *dst; - if (zram_wb_enabled(zram)) { - zram_slot_lock(zram, index); - if (zram_test_flag(zram, index, ZRAM_WB)) { - struct bio_vec bvec; - - zram_slot_unlock(zram, index); + zram_slot_lock(zram, index); + if (zram_test_flag(zram, index, ZRAM_WB)) { + struct bio_vec bvec; - bvec.bv_page = page; - bvec.bv_len = PAGE_SIZE; - bvec.bv_offset = 0; - return read_from_bdev(zram, &bvec, - zram_get_element(zram, index), - bio, partial_io); - } zram_slot_unlock(zram, index); + + bvec.bv_page = page; + bvec.bv_len = PAGE_SIZE; + bvec.bv_offset = 0; + return read_from_bdev(zram, &bvec, + zram_get_element(zram, index), + bio, partial_io); } - zram_slot_lock(zram, index); handle = zram_get_handle(zram, index); if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { unsigned long value; @@ -1118,7 +1093,7 @@ compress_again: if (unlikely(comp_len >= huge_class_size)) { comp_len = PAGE_SIZE; - if (zram_wb_enabled(zram) && allow_wb) { + if (zram->backing_dev && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); if (!ret) { diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index d1095dfdffa8..4f83f1f14b0a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -30,7 +30,7 @@ /* - * The lower ZRAM_FLAG_SHIFT bits of table.value is for + * The lower ZRAM_FLAG_SHIFT bits of table.flags is for * object size (excluding header), the higher bits is for * zram_pageflags. * @@ -41,7 +41,7 @@ */ #define ZRAM_FLAG_SHIFT 24 -/* Flags for zram pages (table[page_no].value) */ +/* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { /* zram slot is locked */ ZRAM_LOCK = ZRAM_FLAG_SHIFT, @@ -60,7 +60,7 @@ struct zram_table_entry { unsigned long handle; unsigned long element; }; - unsigned long value; + unsigned long flags; #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif @@ -105,8 +105,8 @@ struct zram { * zram is claimed so open request will be failed */ bool claim; /* Protected by bdev->bd_mutex */ -#ifdef CONFIG_ZRAM_WRITEBACK struct file *backing_dev; +#ifdef CONFIG_ZRAM_WRITEBACK struct block_device *bdev; unsigned int old_block_size; unsigned long *bitmap; -- cgit v1.2.3 From e82592c4fd7eafe8dec12a70436e93e3afb28556 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 28 Dec 2018 00:36:44 -0800 Subject: zram: introduce ZRAM_IDLE flag To support idle page writeback with upcoming patches, this patch introduces a new ZRAM_IDLE flag. Userspace can mark zram slots as "idle" via "echo all > /sys/block/zramX/idle" which marks every allocated zram slot as ZRAM_IDLE. User could see it by /sys/kernel/debug/zram/zram0/block_state. 300 75.033841 ...i 301 63.806904 s..i 302 63.806919 ..hi Once there is IO for the slot, the mark will be disappeared. 300 75.033841 ... 301 63.806904 s..i 302 63.806919 ..hi Therefore, 300th block is idle zpage. With this feature, user can how many zram has idle pages which are waste of memory. Link: http://lkml.kernel.org/r/20181127055429.251614-5-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Reviewed-by: Joey Pabalinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 57 ++++++++++++++++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 1 + 2 files changed, 55 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4457d0395bfb..180613b478a6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -281,6 +281,47 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +static ssize_t idle_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + int index; + char mode_buf[8]; + ssize_t sz; + + sz = strscpy(mode_buf, buf, sizeof(mode_buf)); + if (sz <= 0) + return -EINVAL; + + /* ignore trailing new line */ + if (mode_buf[sz - 1] == '\n') + mode_buf[sz - 1] = 0x00; + + if (strcmp(mode_buf, "all")) + return -EINVAL; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } + + for (index = 0; index < nr_pages; index++) { + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + zram_set_flag(zram, index, ZRAM_IDLE); +next: + zram_slot_unlock(zram, index); + } + + up_read(&zram->init_lock); + + return len; +} + #ifdef CONFIG_ZRAM_WRITEBACK static void reset_bdev(struct zram *zram) { @@ -638,6 +679,7 @@ static void zram_debugfs_destroy(void) static void zram_accessed(struct zram *zram, u32 index) { + zram_clear_flag(zram, index, ZRAM_IDLE); zram->table[index].ac_time = ktime_get_boottime(); } @@ -670,12 +712,13 @@ static ssize_t read_block_state(struct file *file, char __user *buf, ts = ktime_to_timespec64(zram->table[index].ac_time); copied = snprintf(kbuf + written, count, - "%12zd %12lld.%06lu %c%c%c\n", + "%12zd %12lld.%06lu %c%c%c%c\n", index, (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC, zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', - zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.'); + zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', + zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.'); if (count < copied) { zram_slot_unlock(zram, index); @@ -720,7 +763,10 @@ static void zram_debugfs_unregister(struct zram *zram) #else static void zram_debugfs_create(void) {}; static void zram_debugfs_destroy(void) {}; -static void zram_accessed(struct zram *zram, u32 index) {}; +static void zram_accessed(struct zram *zram, u32 index) +{ + zram_clear_flag(zram, index, ZRAM_IDLE); +}; static void zram_debugfs_register(struct zram *zram) {}; static void zram_debugfs_unregister(struct zram *zram) {}; #endif @@ -924,6 +970,9 @@ static void zram_free_page(struct zram *zram, size_t index) #ifdef CONFIG_ZRAM_MEMORY_TRACKING zram->table[index].ac_time = 0; #endif + if (zram_test_flag(zram, index, ZRAM_IDLE)) + zram_clear_flag(zram, index, ZRAM_IDLE); + if (zram_test_flag(zram, index, ZRAM_HUGE)) { zram_clear_flag(zram, index, ZRAM_HUGE); atomic64_dec(&zram->stats.huge_pages); @@ -1589,6 +1638,7 @@ static DEVICE_ATTR_RO(initstate); static DEVICE_ATTR_WO(reset); static DEVICE_ATTR_WO(mem_limit); static DEVICE_ATTR_WO(mem_used_max); +static DEVICE_ATTR_WO(idle); static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); #ifdef CONFIG_ZRAM_WRITEBACK @@ -1602,6 +1652,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_compact.attr, &dev_attr_mem_limit.attr, &dev_attr_mem_used_max.attr, + &dev_attr_idle.attr, &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, #ifdef CONFIG_ZRAM_WRITEBACK diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 4f83f1f14b0a..a84611b97867 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -48,6 +48,7 @@ enum zram_pageflags { ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ ZRAM_HUGE, /* Incompressible page */ + ZRAM_IDLE, /* not accessed page since last idle marking */ __NR_ZRAM_PAGEFLAGS, }; -- cgit v1.2.3 From a939888ec38bf1f33e4a903056677e92a4844244 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 28 Dec 2018 00:36:47 -0800 Subject: zram: support idle/huge page writeback Add a new feature "zram idle/huge page writeback". In the zram-swap use case, zram usually has many idle/huge swap pages. It's pointless to keep them in memory (ie, zram). To solve this problem, this feature introduces idle/huge page writeback to the backing device so the goal is to save more memory space on embedded systems. Normal sequence to use idle/huge page writeback feature is as follows, while (1) { # mark allocated zram slot to idle echo all > /sys/block/zram0/idle # leave system working for several hours # Unless there is no access for some blocks on zram, # they are still IDLE marked pages. echo "idle" > /sys/block/zram0/writeback or/and echo "huge" > /sys/block/zram0/writeback # write the IDLE or/and huge marked slot into backing device # and free the memory. } Per the discussion at https://lore.kernel.org/lkml/20181122065926.GG3441@jagdpanzerIV/T/#u, This patch removes direct incommpressibe page writeback feature (d2afd25114f4 ("zram: write incompressible pages to backing device")). Below concerns from Sergey: == &< == "IDLE writeback" is superior to "incompressible writeback". "incompressible writeback" is completely unpredictable and uncontrollable; it depens on data patterns and compression algorithms. While "IDLE writeback" is predictable. I even suspect, that, *ideally*, we can remove "incompressible writeback". "IDLE pages" is a super set which also includes "incompressible" pages. So, technically, we still can do "incompressible writeback" from "IDLE writeback" path; but a much more reasonable one, based on a page idling period. I understand that you want to keep "direct incompressible writeback" around. ZRAM is especially popular on devices which do suffer from flash wearout, so I can see "incompressible writeback" path becoming a dead code, long term. == &< == Below concerns from Minchan: == &< == My concern is if we enable CONFIG_ZRAM_WRITEBACK in this implementation, both hugepage/idlepage writeck will turn on. However someuser want to enable only idlepage writeback so we need to introduce turn on/off knob for hugepage or new CONFIG_ZRAM_IDLEPAGE_WRITEBACK for those usecase. I don't want to make it complicated *if possible*. Long term, I imagine we need to make VM aware of new swap hierarchy a little bit different with as-is. For example, first high priority swap can return -EIO or -ENOCOMP, swap try to fallback to next lower priority swap device. With that, hugepage writeback will work tranparently. So we could regard it as regression because incompressible pages doesn't go to backing storage automatically. Instead, user should do it via "echo huge" > /sys/block/zram/writeback" manually. == &< == Link: http://lkml.kernel.org/r/20181127055429.251614-6-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Joey Pabalinas Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/Kconfig | 5 +- drivers/block/zram/zram_drv.c | 247 +++++++++++++++++++++++++++++------------- drivers/block/zram/zram_drv.h | 1 + 3 files changed, 178 insertions(+), 75 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index fcd055457364..1ffc64770643 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -15,7 +15,7 @@ config ZRAM See Documentation/blockdev/zram.txt for more information. config ZRAM_WRITEBACK - bool "Write back incompressible page to backing device" + bool "Write back incompressible or idle page to backing device" depends on ZRAM help With incompressible page, there is no memory saving to keep it @@ -23,6 +23,9 @@ config ZRAM_WRITEBACK For this feature, admin should set up backing device via /sys/block/zramX/backing_dev. + With /sys/block/zramX/{idle,writeback}, application could ask + idle page's writeback to the backing device to save in memory. + See Documentation/blockdev/zram.txt for more information. config ZRAM_MEMORY_TRACKING diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 180613b478a6..6b5a886c8f32 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -52,6 +52,9 @@ static unsigned int num_devices = 1; static size_t huge_class_size; static void zram_free_page(struct zram *zram, size_t index); +static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio); + static int zram_slot_trylock(struct zram *zram, u32 index) { @@ -73,13 +76,6 @@ static inline bool init_done(struct zram *zram) return zram->disksize; } -static inline bool zram_allocated(struct zram *zram, u32 index) -{ - - return (zram->table[index].flags >> (ZRAM_FLAG_SHIFT + 1)) || - zram->table[index].handle; -} - static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -138,6 +134,13 @@ static void zram_set_obj_size(struct zram *zram, zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; } +static inline bool zram_allocated(struct zram *zram, u32 index) +{ + return zram_get_obj_size(zram, index) || + zram_test_flag(zram, index, ZRAM_SAME) || + zram_test_flag(zram, index, ZRAM_WB); +} + #if PAGE_SIZE != 4096 static inline bool is_partial_io(struct bio_vec *bvec) { @@ -308,10 +311,14 @@ static ssize_t idle_store(struct device *dev, } for (index = 0; index < nr_pages; index++) { + /* + * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race. + * See the comment in writeback_store. + */ zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) + if (!zram_allocated(zram, index) || + zram_test_flag(zram, index, ZRAM_UNDER_WB)) goto next; - zram_set_flag(zram, index, ZRAM_IDLE); next: zram_slot_unlock(zram, index); @@ -546,6 +553,158 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, return 1; } +#define HUGE_WRITEBACK 0x1 +#define IDLE_WRITEBACK 0x2 + +static ssize_t writeback_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + unsigned long index; + struct bio bio; + struct bio_vec bio_vec; + struct page *page; + ssize_t ret, sz; + char mode_buf[8]; + unsigned long mode = -1UL; + unsigned long blk_idx = 0; + + sz = strscpy(mode_buf, buf, sizeof(mode_buf)); + if (sz <= 0) + return -EINVAL; + + /* ignore trailing newline */ + if (mode_buf[sz - 1] == '\n') + mode_buf[sz - 1] = 0x00; + + if (!strcmp(mode_buf, "idle")) + mode = IDLE_WRITEBACK; + else if (!strcmp(mode_buf, "huge")) + mode = HUGE_WRITEBACK; + + if (mode == -1UL) + return -EINVAL; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + ret = -EINVAL; + goto release_init_lock; + } + + if (!zram->backing_dev) { + ret = -ENODEV; + goto release_init_lock; + } + + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto release_init_lock; + } + + for (index = 0; index < nr_pages; index++) { + struct bio_vec bvec; + + bvec.bv_page = page; + bvec.bv_len = PAGE_SIZE; + bvec.bv_offset = 0; + + if (!blk_idx) { + blk_idx = alloc_block_bdev(zram); + if (!blk_idx) { + ret = -ENOSPC; + break; + } + } + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME) || + zram_test_flag(zram, index, ZRAM_UNDER_WB)) + goto next; + + if ((mode & IDLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_IDLE)) && + (mode & HUGE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_HUGE))) + goto next; + /* + * Clearing ZRAM_UNDER_WB is duty of caller. + * IOW, zram_free_page never clear it. + */ + zram_set_flag(zram, index, ZRAM_UNDER_WB); + /* Need for hugepage writeback racing */ + zram_set_flag(zram, index, ZRAM_IDLE); + zram_slot_unlock(zram, index); + if (zram_bvec_read(zram, &bvec, index, 0, NULL)) { + zram_slot_lock(zram, index); + zram_clear_flag(zram, index, ZRAM_UNDER_WB); + zram_clear_flag(zram, index, ZRAM_IDLE); + zram_slot_unlock(zram, index); + continue; + } + + bio_init(&bio, &bio_vec, 1); + bio_set_dev(&bio, zram->bdev); + bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); + bio.bi_opf = REQ_OP_WRITE | REQ_SYNC; + + bio_add_page(&bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset); + /* + * XXX: A single page IO would be inefficient for write + * but it would be not bad as starter. + */ + ret = submit_bio_wait(&bio); + if (ret) { + zram_slot_lock(zram, index); + zram_clear_flag(zram, index, ZRAM_UNDER_WB); + zram_clear_flag(zram, index, ZRAM_IDLE); + zram_slot_unlock(zram, index); + continue; + } + + /* + * We released zram_slot_lock so need to check if the slot was + * changed. If there is freeing for the slot, we can catch it + * easily by zram_allocated. + * A subtle case is the slot is freed/reallocated/marked as + * ZRAM_IDLE again. To close the race, idle_store doesn't + * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB. + * Thus, we could close the race by checking ZRAM_IDLE bit. + */ + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index) || + !zram_test_flag(zram, index, ZRAM_IDLE)) { + zram_clear_flag(zram, index, ZRAM_UNDER_WB); + zram_clear_flag(zram, index, ZRAM_IDLE); + goto next; + } + + zram_free_page(zram, index); + zram_clear_flag(zram, index, ZRAM_UNDER_WB); + zram_set_flag(zram, index, ZRAM_WB); + zram_set_element(zram, index, blk_idx); + blk_idx = 0; + atomic64_inc(&zram->stats.pages_stored); +next: + zram_slot_unlock(zram, index); + } + + if (blk_idx) + free_block_bdev(zram, blk_idx); + ret = len; + __free_page(page); +release_init_lock: + up_read(&zram->init_lock); + + return ret; +} + struct zram_work { struct work_struct work; struct zram *zram; @@ -603,57 +762,8 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, else return read_from_bdev_async(zram, bvec, entry, parent); } - -static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, - u32 index, struct bio *parent, - unsigned long *pentry) -{ - struct bio *bio; - unsigned long entry; - - bio = bio_alloc(GFP_ATOMIC, 1); - if (!bio) - return -ENOMEM; - - entry = alloc_block_bdev(zram); - if (!entry) { - bio_put(bio); - return -ENOSPC; - } - - bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); - bio_set_dev(bio, zram->bdev); - if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset)) { - bio_put(bio); - free_block_bdev(zram, entry); - return -EIO; - } - - if (!parent) { - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - bio->bi_end_io = zram_page_end_io; - } else { - bio->bi_opf = parent->bi_opf; - bio_chain(bio, parent); - } - - submit_bio(bio); - *pentry = entry; - - return 0; -} - #else static inline void reset_bdev(struct zram *zram) {}; -static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, - u32 index, struct bio *parent, - unsigned long *pentry) - -{ - return -EIO; -} - static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, unsigned long entry, struct bio *parent, bool sync) { @@ -1006,7 +1116,8 @@ out: atomic64_dec(&zram->stats.pages_stored); zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); - WARN_ON_ONCE(zram->table[index].flags & ~(1UL << ZRAM_LOCK)); + WARN_ON_ONCE(zram->table[index].flags & + ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); } static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, @@ -1115,7 +1226,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, struct page *page = bvec->bv_page; unsigned long element = 0; enum zram_pageflags flags = 0; - bool allow_wb = true; mem = kmap_atomic(page); if (page_same_filled(mem, &element)) { @@ -1140,21 +1250,8 @@ compress_again: return ret; } - if (unlikely(comp_len >= huge_class_size)) { + if (comp_len >= huge_class_size) comp_len = PAGE_SIZE; - if (zram->backing_dev && allow_wb) { - zcomp_stream_put(zram->comp); - ret = write_to_bdev(zram, bvec, index, bio, &element); - if (!ret) { - flags = ZRAM_WB; - ret = 1; - goto out; - } - allow_wb = false; - goto compress_again; - } - } - /* * handle allocation has 2 paths: * a) fast path is executed with preemption disabled (for @@ -1643,6 +1740,7 @@ static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); #ifdef CONFIG_ZRAM_WRITEBACK static DEVICE_ATTR_RW(backing_dev); +static DEVICE_ATTR_WO(writeback); #endif static struct attribute *zram_disk_attrs[] = { @@ -1657,6 +1755,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_comp_algorithm.attr, #ifdef CONFIG_ZRAM_WRITEBACK &dev_attr_backing_dev.attr, + &dev_attr_writeback.attr, #endif &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index a84611b97867..1ad74f030b6d 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -47,6 +47,7 @@ enum zram_pageflags { ZRAM_LOCK = ZRAM_FLAG_SHIFT, ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ + ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ -- cgit v1.2.3 From 23eddf39b2c28c05cb8f8203d38e61807d701b38 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 28 Dec 2018 00:36:51 -0800 Subject: zram: add bd_stat statistics bd_stat represents things that happened in the backing device. Currently it supports bd_counts, bd_reads and bd_writes which are helpful to understand wearout of flash and memory saving. [minchan@kernel.org: v4] Link: http://lkml.kernel.org/r/20181203024045.153534-7-minchan@kernel.org Link: http://lkml.kernel.org/r/20181127055429.251614-7-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Joey Pabalinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 29 +++++++++++++++++++++++++++++ drivers/block/zram/zram_drv.h | 5 +++++ 2 files changed, 34 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6b5a886c8f32..f1832fa3ba41 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -502,6 +502,7 @@ retry: if (test_and_set_bit(blk_idx, zram->bitmap)) goto retry; + atomic64_inc(&zram->stats.bd_count); return blk_idx; } @@ -511,6 +512,7 @@ static void free_block_bdev(struct zram *zram, unsigned long blk_idx) was_set = test_and_clear_bit(blk_idx, zram->bitmap); WARN_ON_ONCE(!was_set); + atomic64_dec(&zram->stats.bd_count); } static void zram_page_end_io(struct bio *bio) @@ -668,6 +670,7 @@ static ssize_t writeback_store(struct device *dev, continue; } + atomic64_inc(&zram->stats.bd_writes); /* * We released zram_slot_lock so need to check if the slot was * changed. If there is freeing for the slot, we can catch it @@ -757,6 +760,7 @@ static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, unsigned long entry, struct bio *parent, bool sync) { + atomic64_inc(&zram->stats.bd_reads); if (sync) return read_from_bdev_sync(zram, bvec, entry, parent); else @@ -1013,6 +1017,25 @@ static ssize_t mm_stat_show(struct device *dev, return ret; } +#ifdef CONFIG_ZRAM_WRITEBACK +static ssize_t bd_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.bd_count) * (PAGE_SHIFT - 12), + (u64)atomic64_read(&zram->stats.bd_reads) * (PAGE_SHIFT - 12), + (u64)atomic64_read(&zram->stats.bd_writes) * (PAGE_SHIFT - 12)); + up_read(&zram->init_lock); + + return ret; +} +#endif + static ssize_t debug_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1033,6 +1056,9 @@ static ssize_t debug_stat_show(struct device *dev, static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); +#ifdef CONFIG_ZRAM_WRITEBACK +static DEVICE_ATTR_RO(bd_stat); +#endif static DEVICE_ATTR_RO(debug_stat); static void zram_meta_free(struct zram *zram, u64 disksize) @@ -1759,6 +1785,9 @@ static struct attribute *zram_disk_attrs[] = { #endif &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, +#ifdef CONFIG_ZRAM_WRITEBACK + &dev_attr_bd_stat.attr, +#endif &dev_attr_debug_stat.attr, NULL, }; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 1ad74f030b6d..bc477803530d 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -82,6 +82,11 @@ struct zram_stats { atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ atomic64_t miss_free; /* no. of missed free */ +#ifdef CONFIG_ZRAM_WRITEBACK + atomic64_t bd_count; /* no. of pages in backing device */ + atomic64_t bd_reads; /* no. of reads from backing device */ + atomic64_t bd_writes; /* no. of writes from backing device */ +#endif }; struct zram { -- cgit v1.2.3 From bb416d18b850faaa44bd3bb67c9728922c3cce98 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 28 Dec 2018 00:36:54 -0800 Subject: zram: writeback throttle If there are lots of write IO with flash device, it could have a wearout problem of storage. To overcome the problem, admin needs to design write limitation to guarantee flash health for entire product life. This patch creates a new knob "writeback_limit" for zram. writeback_limit's default value is 0 so that it doesn't limit any writeback. If admin want to measure writeback count in a certain period, he could know it via /sys/block/zram0/bd_stat's 3rd column. If admin want to limit writeback as per-day 400M, he could do it like below. MB_SHIFT=20 4K_SHIFT=12 echo $((400<>4K_SHIFT)) > \ /sys/block/zram0/writeback_limit. If admin want to allow further write again, he could do it like below echo 0 > /sys/block/zram0/writeback_limit If admin want to see remaining writeback budget, cat /sys/block/zram0/writeback_limit The writeback_limit count will reset whenever you reset zram (e.g., system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of writeback happened until you reset the zram to allocate extra writeback budget in next setting is user's job. [minchan@kernel.org: v4] Link: http://lkml.kernel.org/r/20181203024045.153534-8-minchan@kernel.org Link: http://lkml.kernel.org/r/20181127055429.251614-8-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Joey Pabalinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 52 ++++++++++++++++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 2 ++ 2 files changed, 51 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f1832fa3ba41..33c5cc879f24 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -330,6 +330,39 @@ next: } #ifdef CONFIG_ZRAM_WRITEBACK +static ssize_t writeback_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + u64 val; + ssize_t ret = -EINVAL; + + if (kstrtoull(buf, 10, &val)) + return ret; + + down_read(&zram->init_lock); + atomic64_set(&zram->stats.bd_wb_limit, val); + if (val == 0) + zram->stop_writeback = false; + up_read(&zram->init_lock); + ret = len; + + return ret; +} + +static ssize_t writeback_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = atomic64_read(&zram->stats.bd_wb_limit); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); +} + static void reset_bdev(struct zram *zram) { struct block_device *bdev; @@ -612,6 +645,11 @@ static ssize_t writeback_store(struct device *dev, bvec.bv_len = PAGE_SIZE; bvec.bv_offset = 0; + if (zram->stop_writeback) { + ret = -EIO; + break; + } + if (!blk_idx) { blk_idx = alloc_block_bdev(zram); if (!blk_idx) { @@ -694,6 +732,11 @@ static ssize_t writeback_store(struct device *dev, zram_set_element(zram, index, blk_idx); blk_idx = 0; atomic64_inc(&zram->stats.pages_stored); + if (atomic64_add_unless(&zram->stats.bd_wb_limit, + -1 << (PAGE_SHIFT - 12), 0)) { + if (atomic64_read(&zram->stats.bd_wb_limit) == 0) + zram->stop_writeback = true; + } next: zram_slot_unlock(zram, index); } @@ -1018,6 +1061,7 @@ static ssize_t mm_stat_show(struct device *dev, } #ifdef CONFIG_ZRAM_WRITEBACK +#define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12))) static ssize_t bd_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1027,9 +1071,9 @@ static ssize_t bd_stat_show(struct device *dev, down_read(&zram->init_lock); ret = scnprintf(buf, PAGE_SIZE, "%8llu %8llu %8llu\n", - (u64)atomic64_read(&zram->stats.bd_count) * (PAGE_SHIFT - 12), - (u64)atomic64_read(&zram->stats.bd_reads) * (PAGE_SHIFT - 12), - (u64)atomic64_read(&zram->stats.bd_writes) * (PAGE_SHIFT - 12)); + FOUR_K((u64)atomic64_read(&zram->stats.bd_count)), + FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)), + FOUR_K((u64)atomic64_read(&zram->stats.bd_writes))); up_read(&zram->init_lock); return ret; @@ -1767,6 +1811,7 @@ static DEVICE_ATTR_RW(comp_algorithm); #ifdef CONFIG_ZRAM_WRITEBACK static DEVICE_ATTR_RW(backing_dev); static DEVICE_ATTR_WO(writeback); +static DEVICE_ATTR_RW(writeback_limit); #endif static struct attribute *zram_disk_attrs[] = { @@ -1782,6 +1827,7 @@ static struct attribute *zram_disk_attrs[] = { #ifdef CONFIG_ZRAM_WRITEBACK &dev_attr_backing_dev.attr, &dev_attr_writeback.attr, + &dev_attr_writeback_limit.attr, #endif &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index bc477803530d..4bd3afd15e83 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -86,6 +86,7 @@ struct zram_stats { atomic64_t bd_count; /* no. of pages in backing device */ atomic64_t bd_reads; /* no. of reads from backing device */ atomic64_t bd_writes; /* no. of writes from backing device */ + atomic64_t bd_wb_limit; /* writeback limit of backing device */ #endif }; @@ -113,6 +114,7 @@ struct zram { */ bool claim; /* Protected by bdev->bd_mutex */ struct file *backing_dev; + bool stop_writeback; #ifdef CONFIG_ZRAM_WRITEBACK struct block_device *bdev; unsigned int old_block_size; -- cgit v1.2.3