dm: add writecache target

The writecache target caches writes on persistent memory or SSD. It is intended for databases or other programs that need extremely low commit latency.

The writecache target doesn't cache reads because reads are supposed to be cached in page cache in normal RAM.

If persistent memory isn't available this target can still be used in SSD mode.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com> # fix missing goto
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> # fix compilation issue with !DAX
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> # use msecs_to_jiffies
Acked-by: Dan Williams <dan.j.williams@intel.com> # reworks to unify ARM and x86 flushing
Signed-off-by: Mike Snitzer <msnitzer@redhat.com>
author: Mikulas Patocka <mpatocka@redhat.com> 2018-03-08 08:25:24 -0500
committer: Mike Snitzer <snitzer@redhat.com> 2018-06-08 11:59:51 -0400
commit: 48debafe4f2feabcc99f8e2659e80557e3ca6b39 (patch)
tree: 898a7c9c33238b068a79d40e97c380b36b1498ee /drivers/md/dm-writecache.c
parent: 72d711c8768805b5f8cf2d23c575dfd188993e12 (diff)
1 files changed, 2305 insertions, 0 deletions
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
new file mode 100644
index 000000000000..5961c7794ef3
--- /dev/null
+++ b/drivers/md/dm-writecache.c
@@ -0,0 +1,2305 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/dax.h>
+#include <linux/pfn_t.h>
+#include <linux/libnvdimm.h>
+
+#define DM_MSG_PREFIX "writecache"
+
+/*
+ * Built-in default tunables.
+ * NOTE(review): the code that consumes these lies outside this chunk;
+ * presumably the watermarks are percentages and AUTOCOMMIT_MSEC is in
+ * milliseconds -- confirm against the constructor.
+ */
+#define HIGH_WATERMARK			50
+#define LOW_WATERMARK			45
+#define MAX_WRITEBACK_JOBS		0
+#define ENDIO_LATENCY			16
+#define WRITEBACK_LATENCY		64
+#define AUTOCOMMIT_BLOCKS_SSD		65536
+#define AUTOCOMMIT_BLOCKS_PMEM		64
+#define AUTOCOMMIT_MSEC			1000
+
+/*
+ * Number of bytes of the metadata mapping covered by one bit of
+ * dirty_bitmap (see writecache_flush_region() and ssd_commit_flushed()).
+ * The value is raised to PAGE_SIZE when PAGE_SIZE is larger than 65536.
+ */
+#define BITMAP_GRANULARITY	65536
+#if BITMAP_GRANULARITY < PAGE_SIZE
+#undef BITMAP_GRANULARITY
+#define BITMAP_GRANULARITY	PAGE_SIZE
+#endif
+
+/* PMEM mode is compiled in only when the arch PMEM API and a DAX driver are enabled. */
+#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
+#define DM_WRITECACHE_HAS_PMEM
+#endif
+
+/*
+ * pmem_assign(dest, src): store src into dest.
+ * With PMEM support the value is first copied into a temporary of dest's
+ * type and then written with memcpy_flushcache(); without it this is a
+ * plain assignment.
+ */
+#ifdef DM_WRITECACHE_HAS_PMEM
+#define pmem_assign(dest, src)					\
+do {								\
+	typeof(dest) uniq = (src);				\
+	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
+} while (0)
+#else
+#define pmem_assign(dest, src)	((dest) = (src))
+#endif
+
+/* Hardware-error handling needs both an arch memcpy_mcsafe() and PMEM support. */
+#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
+#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+#endif
+
+/*
+ * NOTE(review): presumably the expected contents of the magic and version
+ * fields of struct wc_memory_superblock; the code that checks them is
+ * outside this chunk -- confirm.
+ */
+#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
+#define MEMORY_SUPERBLOCK_VERSION	1
+
+/*
+ * Per-block metadata record kept in the entries[] array that follows the
+ * superblock in the cache mapping. Both fields are little-endian.
+ * clear_seq_count() stores -1 into seq_count when an entry is released.
+ */
+struct wc_memory_entry {
+	__le64 original_sector;
+	__le64 seq_count;
+};
+
+/*
+ * Header found at the very start of the cache mapping (see sb()).
+ * The named fields are overlaid on a 64-byte pad (8 x __le64), so the
+ * entries[] array always begins 64 bytes into the mapping.
+ */
+struct wc_memory_superblock {
+	union {
+		struct {
+			__le32 magic;
+			__le32 version;
+			__le32 block_size;
+			__le32 pad;
+			__le64 n_blocks;
+			__le64 seq_count;
+		};
+		__le64 padding[8];
+	};
+	/* C99 flexible array member instead of the GNU zero-length array */
+	struct wc_memory_entry entries[];
+};
+
+/*
+ * In-core descriptor of one cache block.
+ *
+ * rb_node links the entry either into wc->tree or, when the free entries
+ * are kept sorted, into wc->freetree; lru links it into wc->lru or
+ * wc->freelist. index selects the block's wc_memory_entry and data area
+ * (see memory_entry(), memory_data() and cache_sector()).
+ *
+ * On 64-bit builds write_in_progress and index are packed as bit-fields
+ * (1 and 47 bits). With DM_WRITECACHE_HANDLE_HARDWARE_ERRORS the entry also
+ * carries in-core copies of original_sector and seq_count, which the
+ * read_*() accessors return instead of reading the cache mapping.
+ */
+struct wc_entry {
+	struct rb_node rb_node;
+	struct list_head lru;
+	unsigned short wc_list_contiguous;
+	bool write_in_progress
+#if BITS_PER_LONG == 64
+		:1
+#endif
+	;
+	unsigned long index
+#if BITS_PER_LONG == 64
+		:47
+#endif
+	;
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	uint64_t original_sector;
+	uint64_t seq_count;
+#endif
+};
+
+/*
+ * Mode predicates. Without PMEM support both WC_MODE_PMEM() and
+ * WC_MODE_FUA() are the constant false. Free entries are kept in the
+ * sorted freetree exactly when the target is not in PMEM mode.
+ */
+#ifdef DM_WRITECACHE_HAS_PMEM
+#define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
+#define WC_MODE_FUA(wc)				((wc)->writeback_fua)
+#else
+#define WC_MODE_PMEM(wc)			false
+#define WC_MODE_FUA(wc)				false
+#endif
+#define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))
+
+/*
+ * Per-target state of one writecache instance.
+ */
+struct dm_writecache {
+	/* taken and released through wc_lock() / wc_unlock() */
+	struct mutex lock;
+	/* in-use entries; writecache_insert_entry() adds new ones at the head */
+	struct list_head lru;
+	/*
+	 * Free entries: a plain list, or an rb-tree plus cursor when
+	 * WC_MODE_SORT_FREELIST() is true. The two forms share storage.
+	 */
+	union {
+		struct list_head freelist;
+		struct {
+			struct rb_root freetree;
+			struct wc_entry *current_free;
+		};
+	};
+	/* in-use entries keyed by original sector */
+	struct rb_root tree;
+
+	size_t freelist_size;
+	size_t writeback_size;
+	size_t freelist_high_watermark;
+	size_t freelist_low_watermark;
+
+	unsigned uncommitted_blocks;
+	unsigned autocommit_blocks;
+	unsigned max_writeback_jobs;
+
+	/* first error recorded by writecache_error(); 0 while healthy */
+	int error;
+
+	unsigned long autocommit_jiffies;
+	struct timer_list autocommit_timer;
+	struct wait_queue_head freelist_wait;
+
+	/* indexed by direction (READ / WRITE), see writecache_wait_for_ios() */
+	atomic_t bio_in_progress[2];
+	struct wait_queue_head bio_in_progress_wait[2];
+
+	struct dm_target *ti;
+	struct dm_dev *dev;
+	struct dm_dev *ssd_dev;
+	/* start of the cache mapping; the superblock lives here (see sb()) */
+	void *memory_map;
+	uint64_t memory_map_size;
+	size_t metadata_sectors;
+	size_t n_blocks;
+	uint64_t seq_count;
+	void *block_start;
+	struct wc_entry *entries;
+	unsigned block_size;
+	unsigned char block_size_bits;
+
+	bool pmem_mode:1;
+	bool writeback_fua:1;
+
+	bool overwrote_committed:1;
+	/* set when memory_map came from vmap() and must be vunmap()ed */
+	bool memory_vmapped:1;
+
+	bool high_wm_percent_set:1;
+	bool low_wm_percent_set:1;
+	bool max_writeback_jobs_set:1;
+	bool autocommit_blocks_set:1;
+	bool autocommit_time_set:1;
+	bool writeback_fua_set:1;
+	bool flush_on_suspend:1;
+
+	unsigned writeback_all;
+	struct workqueue_struct *writeback_wq;
+	struct work_struct writeback_work;
+	struct work_struct flush_work;
+
+	struct dm_io_client *dm_io;
+
+	raw_spinlock_t endio_list_lock;
+	struct list_head endio_list;
+	struct task_struct *endio_thread;
+
+	struct task_struct *flush_thread;
+	struct bio_list flush_list;
+
+	struct dm_kcopyd_client *dm_kcopyd;
+	/* SSD mode: one bit per BITMAP_GRANULARITY bytes of metadata */
+	unsigned long *dirty_bitmap;
+	/* size of dirty_bitmap in bytes (it is used as a memset() length) */
+	unsigned dirty_bitmap_size;
+
+	struct bio_set bio_set;
+	mempool_t copy_pool;
+};
+
+/* Capacity of the wc_list_inline[] array embedded in struct writeback_struct. */
+#define WB_LIST_INLINE		16
+
+/*
+ * NOTE(review): the code using this structure is outside this chunk;
+ * presumably it describes one writeback bio together with the cache entries
+ * it covers -- confirm.
+ */
+struct writeback_struct {
+	struct list_head endio_entry;
+	struct dm_writecache *wc;
+	struct wc_entry **wc_list;
+	unsigned wc_list_n;
+	unsigned page_offset;
+	struct page *page;
+	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
+	struct bio bio;
+};
+
+/*
+ * NOTE(review): the code using this structure is outside this chunk;
+ * presumably it tracks one copy job over n_entries entries starting at e
+ * -- confirm.
+ */
+struct copy_struct {
+	struct list_head endio_entry;
+	struct dm_writecache *wc;
+	struct wc_entry *e;
+	unsigned n_entries;
+	int error;
+};
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
+					    "A percentage of time allocated for data copying");
+
+/* Acquire the per-target mutex wc->lock. */
+static void wc_lock(struct dm_writecache *wc)
+{
+	mutex_lock(&wc->lock);
+}
+
+/* Release the per-target mutex wc->lock. */
+static void wc_unlock(struct dm_writecache *wc)
+{
+	mutex_unlock(&wc->lock);
+}
+
+#ifdef DM_WRITECACHE_HAS_PMEM
+/*
+ * Map the cache device's persistent memory into wc->memory_map.
+ *
+ * wc->memory_map_size >> PAGE_SHIFT pages are requested through
+ * dax_direct_access(). If a single call covers all of them, the address it
+ * returned is used directly. Otherwise the page structs are collected piece
+ * by piece and joined with vmap(); wc->memory_vmapped records this so that
+ * persistent_memory_release() knows to vunmap() the range.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int persistent_memory_claim(struct dm_writecache *wc)
+{
+	int r;
+	loff_t s;
+	long p, da;
+	pfn_t pfn;
+	int id;
+	struct page **pages;
+
+	wc->memory_vmapped = false;
+
+	if (!wc->ssd_dev->dax_dev) {
+		r = -EOPNOTSUPP;
+		goto err1;
+	}
+	s = wc->memory_map_size;
+	p = s >> PAGE_SHIFT;
+	if (!p) {
+		r = -EINVAL;
+		goto err1;
+	}
+	/* the page count must survive the narrowing from loff_t to long */
+	if (p != s >> PAGE_SHIFT) {
+		r = -EOVERFLOW;
+		goto err1;
+	}
+
+	id = dax_read_lock();
+
+	da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
+	if (da < 0) {
+		wc->memory_map = NULL;
+		r = da;
+		goto err2;
+	}
+	if (!pfn_t_has_page(pfn)) {
+		wc->memory_map = NULL;
+		r = -EOPNOTSUPP;
+		goto err2;
+	}
+	if (da != p) {
+		long i;
+		wc->memory_map = NULL;
+		/* kvmalloc_array() rejects a p * sizeof() product that overflows */
+		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
+		if (!pages) {
+			r = -ENOMEM;
+			goto err2;
+		}
+		i = 0;
+		do {
+			long daa;
+			void *dummy_addr;
+			daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
+						&dummy_addr, &pfn);
+			if (daa <= 0) {
+				r = daa ? daa : -EINVAL;
+				goto err3;
+			}
+			if (!pfn_t_has_page(pfn)) {
+				r = -EOPNOTSUPP;
+				goto err3;
+			}
+			/* record every page of this extent, without exceeding p */
+			while (daa-- && i < p) {
+				pages[i++] = pfn_t_to_page(pfn);
+				pfn.val++;
+			}
+		} while (i < p);
+		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
+		if (!wc->memory_map) {
+			r = -ENOMEM;
+			goto err3;
+		}
+		kvfree(pages);
+		wc->memory_vmapped = true;
+	}
+
+	dax_read_unlock(id);
+	return 0;
+err3:
+	kvfree(pages);
+err2:
+	dax_read_unlock(id);
+err1:
+	return r;
+}
+#else
+/* Built without PMEM support: this variant only triggers BUG(). */
+static int persistent_memory_claim(struct dm_writecache *wc)
+{
+	BUG();
+}
+#endif
+
+/* Tear down the vmap() mapping of the cache, if one was created. */
+static void persistent_memory_release(struct dm_writecache *wc)
+{
+	if (!wc->memory_vmapped)
+		return;
+
+	vunmap(wc->memory_map);
+}
+
+/*
+ * Return the struct page behind addr, using vmalloc_to_page() for vmalloc
+ * addresses and virt_to_page() for everything else.
+ */
+static struct page *persistent_memory_page(void *addr)
+{
+	return is_vmalloc_addr(addr) ? vmalloc_to_page(addr) : virt_to_page(addr);
+}
+
+/* Return the offset of addr within its page. */
+static unsigned persistent_memory_page_offset(void *addr)
+{
+	unsigned long in_page_mask = PAGE_SIZE - 1;
+
+	return (unsigned long)addr & in_page_mask;
+}
+
+/* Call flush_kernel_vmap_range() on the range; a no-op for non-vmalloc addresses. */
+static void persistent_memory_flush_cache(void *ptr, size_t size)
+{
+	if (!is_vmalloc_addr(ptr))
+		return;
+
+	flush_kernel_vmap_range(ptr, size);
+}
+
+/* Call invalidate_kernel_vmap_range() on the range; a no-op for non-vmalloc addresses. */
+static void persistent_memory_invalidate_cache(void *ptr, size_t size)
+{
+	if (!is_vmalloc_addr(ptr))
+		return;
+
+	invalidate_kernel_vmap_range(ptr, size);
+}
+
+/* The superblock sits at the very start of the cache mapping. */
+static struct wc_memory_superblock *sb(struct dm_writecache *wc)
+{
+	return wc->memory_map;
+}
+
+/*
+ * Return the metadata record in the cache mapping that belongs to e.
+ * The record is selected by e->index. The previous alternative that
+ * derived the slot from pointer arithmetic was guarded by "&& 0" and
+ * could never execute, so it has been removed.
+ */
+static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+	return &sb(wc)->entries[e->index];
+}
+
+/* Return the address of e's data block inside the cache mapping. */
+static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
+{
+	char *first_block = wc->block_start;
+
+	return first_block + (e->index << wc->block_size_bits);
+}
+
+/*
+ * Return the sector on the cache device where e's data block starts:
+ * the data area begins right after wc->metadata_sectors.
+ */
+static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
+{
+	int sectors_per_block_bits = wc->block_size_bits - SECTOR_SHIFT;
+	sector_t data_offset = (sector_t)e->index << sectors_per_block_bits;
+
+	return wc->metadata_sectors + data_offset;
+}
+
+/*
+ * Return the origin-device sector cached by e. With hardware-error
+ * handling the in-core copy is used; otherwise the little-endian value is
+ * read from the cache mapping.
+ */
+static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
+{
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	return e->original_sector;
+#else
+	struct wc_memory_entry *me = memory_entry(wc, e);
+
+	return le64_to_cpu(me->original_sector);
+#endif
+}
+
+/*
+ * Return the sequence count of e. With hardware-error handling the
+ * in-core copy is used; otherwise the little-endian value is read from the
+ * cache mapping.
+ */
+static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
+{
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	return e->seq_count;
+#else
+	struct wc_memory_entry *me = memory_entry(wc, e);
+
+	return le64_to_cpu(me->seq_count);
+#endif
+}
+
+/*
+ * Set e's sequence count to -1, both in the in-core copy (when one exists)
+ * and in the cache mapping.
+ */
+static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
+{
+	struct wc_memory_entry *me = memory_entry(wc, e);
+
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	e->seq_count = -1;
+#endif
+	pmem_assign(me->seq_count, cpu_to_le64(-1));
+}
+
+/*
+ * Record original_sector and seq_count for e: update the in-core copies
+ * (when they exist) and store the whole little-endian record into the
+ * cache mapping with a single pmem_assign().
+ */
+static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
+					    uint64_t original_sector, uint64_t seq_count)
+{
+	struct wc_memory_entry me = {
+		.original_sector = cpu_to_le64(original_sector),
+		.seq_count = cpu_to_le64(seq_count),
+	};
+
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	e->original_sector = original_sector;
+	e->seq_count = seq_count;
+#endif
+	pmem_assign(*memory_entry(wc, e), me);
+}
+
+/*
+ * Record an error on the target. Only the first error is stored (cmpxchg
+ * from 0) and only that one is logged; freelist_wait is woken every time.
+ */
+#define writecache_error(wc, err, msg, arg...)				\
+do {									\
+	if (!cmpxchg(&(wc)->error, 0, err))				\
+		DMERR(msg, ##arg);					\
+	wake_up(&(wc)->freelist_wait);					\
+} while (0)
+
+/* True once writecache_error() has stored a non-zero error. */
+#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
+
+/* Outside PMEM mode, mark the whole dirty bitmap as dirty. */
+static void writecache_flush_all_metadata(struct dm_writecache *wc)
+{
+	if (WC_MODE_PMEM(wc))
+		return;
+
+	memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
+}
+
+/*
+ * Outside PMEM mode, set the dirty bit of the BITMAP_GRANULARITY-sized
+ * granule that contains ptr. Only the granule of ptr itself is marked;
+ * size is not consulted.
+ */
+static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
+{
+	if (WC_MODE_PMEM(wc))
+		return;
+
+	__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
+		  wc->dirty_bitmap);
+}
+
+static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
+
+/*
+ * Completion context shared by the asynchronous metadata writes issued in
+ * ssd_commit_flushed(): c is completed by writecache_notify_io() when
+ * count drops to zero.
+ */
+struct io_notify {
+	struct dm_writecache *wc;
+	struct completion c;
+	atomic_t count;
+};
+
+/*
+ * dm-io completion callback for metadata writes. A non-zero error is
+ * recorded as -EIO on the target; the completion fires once the last
+ * outstanding reference is dropped.
+ */
+static void writecache_notify_io(unsigned long error, void *context)
+{
+	struct io_notify *notify = context;
+
+	if (unlikely(error != 0))
+		writecache_error(notify->wc, -EIO, "error writing metadata");
+
+	BUG_ON(atomic_read(&notify->count) <= 0);
+	if (!atomic_dec_and_test(&notify->count))
+		return;
+
+	complete(&notify->c);
+}
+
+/*
+ * SSD mode: write every metadata region marked in dirty_bitmap to the cache
+ * device, wait for those writes, flush the device and clear the bitmap.
+ *
+ * endio.count starts at 1 so that the completion cannot fire while requests
+ * are still being submitted; the final writecache_notify_io(0, &endio) call
+ * drops that initial reference.
+ */
+static void ssd_commit_flushed(struct dm_writecache *wc)
+{
+	struct dm_io_region region;
+	struct dm_io_request req;
+	struct io_notify endio = {
+		wc,
+		COMPLETION_INITIALIZER_ONSTACK(endio.c),
+		ATOMIC_INIT(1),
+	};
+	/*
+	 * dirty_bitmap_size is a byte count (it is the length passed to
+	 * memset() on the bitmap), so the number of valid bits is bytes * 8.
+	 * Scaling by BITS_PER_LONG made the scan below read past the end of
+	 * the bitmap.
+	 */
+	unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_BYTE;
+	unsigned i = 0;
+
+	while (1) {
+		unsigned j;
+		/* [i, j) is the next run of consecutive dirty granules */
+		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
+		if (unlikely(i == bitmap_bits))
+			break;
+		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
+
+		region.bdev = wc->ssd_dev->bdev;
+		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
+		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
+
+		/* never write beyond the metadata area */
+		if (unlikely(region.sector >= wc->metadata_sectors))
+			break;
+		if (unlikely(region.sector + region.count > wc->metadata_sectors))
+			region.count = wc->metadata_sectors - region.sector;
+
+		atomic_inc(&endio.count);
+		req.bi_op = REQ_OP_WRITE;
+		req.bi_op_flags = REQ_SYNC;
+		req.mem.type = DM_IO_VMA;
+		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
+		req.client = wc->dm_io;
+		req.notify.fn = writecache_notify_io;
+		req.notify.context = &endio;
+
+		/* writing via async dm-io (implied by notify.fn above) won't return an error */
+		(void) dm_io(&req, 1, &region, NULL);
+		i = j;
+	}
+
+	writecache_notify_io(0, &endio);
+	wait_for_completion_io(&endio.c);
+
+	writecache_disk_flush(wc, wc->ssd_dev);
+
+	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
+}
+
+/*
+ * Commit previously flushed regions: a write barrier in PMEM mode, a
+ * write-out of the dirty metadata in SSD mode.
+ */
+static void writecache_commit_flushed(struct dm_writecache *wc)
+{
+	if (!WC_MODE_PMEM(wc)) {
+		ssd_commit_flushed(wc);
+		return;
+	}
+
+	wmb();
+}
+
+/*
+ * Issue a synchronous, zero-length REQ_PREFLUSH write to dev through dm-io.
+ * A failure is recorded on the target with writecache_error().
+ */
+static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
+{
+	struct dm_io_request req;
+	struct dm_io_region region = {
+		.bdev = dev->bdev,
+		.sector = 0,
+		.count = 0,
+	};
+	int err;
+
+	req.client = wc->dm_io;
+	req.bi_op = REQ_OP_WRITE;
+	req.bi_op_flags = REQ_PREFLUSH;
+	req.mem.type = DM_IO_KMEM;
+	req.mem.ptr.addr = NULL;
+	/* no notify callback: dm_io() returns only after the request is done */
+	req.notify.fn = NULL;
+
+	err = dm_io(&req, 1, &region, NULL);
+	if (unlikely(err))
+		writecache_error(wc, err, "error flushing metadata: %d", err);
+}
+
+/*
+ * Sleep until the bio_in_progress counter for the given direction
+ * (READ or WRITE) reaches zero.
+ */
+static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
+{
+	wait_event(wc->bio_in_progress_wait[direction],
+		   !atomic_read(&wc->bio_in_progress[direction]));
+}
+
+/*
+ * Flags for writecache_find_entry():
+ * WFE_RETURN_FOLLOWING - if no entry has exactly the requested sector,
+ *			  return the entry with the next higher sector.
+ * WFE_LOWEST_SEQ	- among entries with the requested sector, walk
+ *			  towards rb_prev() instead of rb_next().
+ */
+#define WFE_RETURN_FOLLOWING	1
+#define WFE_LOWEST_SEQ		2
+
+/*
+ * Look up an entry for sector "block" in wc->tree.
+ *
+ * Returns NULL when the tree is empty, when there is no exact match and
+ * WFE_RETURN_FOLLOWING is not set, or when WFE_RETURN_FOLLOWING is set but
+ * no entry with a higher sector exists.
+ *
+ * Several entries may carry the same sector (writecache_insert_entry()
+ * places an equal key to the right of existing ones). After a candidate
+ * has been found, the second loop keeps stepping to the neighbouring entry
+ * for as long as that neighbour's sector equals "block", and returns the
+ * last entry reached.
+ */
+static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
+					      uint64_t block, int flags)
+{
+	struct wc_entry *e;
+	struct rb_node *node = wc->tree.rb_node;
+
+	if (unlikely(!node))
+		return NULL;
+
+	while (1) {
+		e = container_of(node, struct wc_entry, rb_node);
+		if (read_original_sector(wc, e) == block)
+			break;
+		node = (read_original_sector(wc, e) >= block ?
+			e->rb_node.rb_left : e->rb_node.rb_right);
+		if (unlikely(!node)) {
+			/* ran off the tree without an exact match */
+			if (!(flags & WFE_RETURN_FOLLOWING)) {
+				return NULL;
+			}
+			if (read_original_sector(wc, e) >= block) {
+				break;
+			} else {
+				node = rb_next(&e->rb_node);
+				if (unlikely(!node)) {
+					return NULL;
+				}
+				e = container_of(node, struct wc_entry, rb_node);
+				break;
+			}
+		}
+	}
+
+	while (1) {
+		struct wc_entry *e2;
+		if (flags & WFE_LOWEST_SEQ)
+			node = rb_prev(&e->rb_node);
+		else
+			node = rb_next(&e->rb_node);
+		if (!node)
+			return e;
+		e2 = container_of(node, struct wc_entry, rb_node);
+		if (read_original_sector(wc, e2) != block)
+			return e;
+		e = e2;
+	}
+}
+
+/*
+ * Link ins into wc->tree, keyed by its original sector, and put it at the
+ * head of wc->lru. An entry whose key equals an existing one goes to the
+ * right of it.
+ */
+static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
+{
+	struct rb_node **link = &wc->tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct wc_entry *cur = container_of(*link, struct wc_entry, rb_node);
+
+		parent = *link;
+		if (read_original_sector(wc, cur) <= read_original_sector(wc, ins))
+			link = &parent->rb_right;
+		else
+			link = &parent->rb_left;
+	}
+
+	rb_link_node(&ins->rb_node, parent, link);
+	rb_insert_color(&ins->rb_node, &wc->tree);
+	list_add(&ins->lru, &wc->lru);
+}
+
+/* Remove e from both its LRU list and wc->tree. */
+static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
+{
+	list_del(&e->lru);
+	rb_erase(&e->rb_node, &wc->tree);
+}
+
+/*
+ * Hand e back to the pool of free entries and bump freelist_size.
+ * With a sorted free pool the entry is inserted into wc->freetree ordered
+ * by the address of its rb_node, and becomes current_free if the tree was
+ * empty; otherwise it is appended to wc->freelist.
+ */
+static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
+{
+	if (!WC_MODE_SORT_FREELIST(wc)) {
+		list_add_tail(&e->lru, &wc->freelist);
+	} else {
+		struct rb_node **link = &wc->freetree.rb_node;
+		struct rb_node *parent = NULL;
+
+		if (unlikely(!*link))
+			wc->current_free = e;
+		while (*link) {
+			parent = *link;
+			if (&e->rb_node < parent)
+				link = &parent->rb_left;
+			else
+				link = &parent->rb_right;
+		}
+		rb_link_node(&e->rb_node, parent, link);
+		rb_insert_color(&e->rb_node, &wc->freetree);
+	}
+
+	wc->freelist_size++;
+}
+
+/*
+ * Take one entry out of the free pool, or return NULL if it is empty.
+ *
+ * With a sorted free pool the entry at current_free is taken and the
+ * cursor moves to its successor, wrapping to the first node of the tree.
+ * Otherwise the head of wc->freelist is taken. If the free plus
+ * in-writeback count is at or below the high watermark afterwards,
+ * writeback work is queued.
+ */
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+{
+	struct wc_entry *e;
+
+	if (!WC_MODE_SORT_FREELIST(wc)) {
+		if (unlikely(list_empty(&wc->freelist)))
+			return NULL;
+		e = container_of(wc->freelist.next, struct wc_entry, lru);
+		list_del(&e->lru);
+	} else {
+		struct rb_node *successor;
+
+		e = wc->current_free;
+		if (unlikely(!e))
+			return NULL;
+		successor = rb_next(&e->rb_node);
+		rb_erase(&e->rb_node, &wc->freetree);
+		if (unlikely(!successor))
+			successor = rb_first(&wc->freetree);
+		if (successor)
+			wc->current_free = container_of(successor, struct wc_entry, rb_node);
+		else
+			wc->current_free = NULL;
+	}
+
+	wc->freelist_size--;
+	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
+		queue_work(wc->writeback_wq, &wc->writeback_work);
+
+	return e;
+}
+
+/*
+ * Release an in-use entry: unlink it from the tree and LRU, return it to
+ * the free pool, set its sequence count to -1 and mark its metadata record
+ * dirty. Waiters on freelist_wait are woken. Nothing is committed here;
+ * the callers visible in this file follow up with
+ * writecache_commit_flushed().
+ */
+static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+	writecache_unlink(wc, e);
+	writecache_add_to_freelist(wc, e);
+	clear_seq_count(wc, e);
+	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+	if (unlikely(waitqueue_active(&wc->freelist_wait)))
+		wake_up(&wc->freelist_wait);
+}
+
+/*
+ * Sleep once on freelist_wait. The function releases wc->lock while it
+ * sleeps and takes it again before returning, so it is written to be
+ * called with the lock held. The task is queued on the wait queue before
+ * the lock is dropped, so a wake-up sent after the unlock is not missed.
+ */
+static void writecache_wait_on_freelist(struct dm_writecache *wc)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
+	wc_unlock(wc);
+	io_schedule();
+	finish_wait(&wc->freelist_wait, &wait);
+	wc_lock(wc);
+}
+
+/*
+ * Overwrite the tree root and the list heads with poison values.
+ * wc->freelist shares storage with freetree/current_free, so this poisons
+ * the free pool in either representation. writecache_resume()
+ * reinitialises all of them.
+ */
+static void writecache_poison_lists(struct dm_writecache *wc)
+{
+	/*
+	 * Catch incorrect access to these values while the device is suspended.
+	 */
+	memset(&wc->tree, -1, sizeof wc->tree);
+	wc->lru.next = LIST_POISON1;
+	wc->lru.prev = LIST_POISON2;
+	wc->freelist.next = LIST_POISON1;
+	wc->freelist.prev = LIST_POISON2;
+}
+
+/*
+ * Pass e's metadata record to writecache_flush_region() and, in PMEM mode,
+ * its data block as well.
+ */
+static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+	struct wc_memory_entry *me = memory_entry(wc, e);
+
+	writecache_flush_region(wc, me, sizeof(*me));
+	if (!WC_MODE_PMEM(wc))
+		return;
+
+	writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
+}
+
+/* An entry is committed when its sequence count is below wc->seq_count. */
+static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
+{
+	return read_seq_count(wc, e) < wc->seq_count;
+}
+
+/*
+ * Commit all uncommitted entries.
+ *
+ * Steps:
+ *  1. Reset the uncommitted counter and cancel the autocommit timer.
+ *  2. If the entry at the head of the LRU is already committed, nothing
+ *     needs committing; only a device flush is issued when a committed
+ *     block was overwritten in place (overwrote_committed).
+ *  3. Walk the LRU from the head over the uncommitted entries, flushing
+ *     each, then commit the flushed regions and wait for in-flight writes.
+ *  4. Bump the sequence count in the superblock and commit again.
+ *  5. Walk back towards the LRU head; for each entry, free the tree
+ *     predecessor if it caches the same sector and is not under writeback.
+ *     A final commit is issued if anything was freed.
+ */
+static void writecache_flush(struct dm_writecache *wc)
+{
+	struct wc_entry *e, *e2;
+	bool need_flush_after_free;
+
+	wc->uncommitted_blocks = 0;
+	del_timer(&wc->autocommit_timer);
+
+	if (list_empty(&wc->lru))
+		return;
+
+	e = container_of(wc->lru.next, struct wc_entry, lru);
+	if (writecache_entry_is_committed(wc, e)) {
+		if (wc->overwrote_committed) {
+			writecache_wait_for_ios(wc, WRITE);
+			writecache_disk_flush(wc, wc->ssd_dev);
+			wc->overwrote_committed = false;
+		}
+		return;
+	}
+	/* after this loop e is the last uncommitted entry from the LRU head */
+	while (1) {
+		writecache_flush_entry(wc, e);
+		if (unlikely(e->lru.next == &wc->lru))
+			break;
+		e2 = container_of(e->lru.next, struct wc_entry, lru);
+		if (writecache_entry_is_committed(wc, e2))
+			break;
+		e = e2;
+		cond_resched();
+	}
+	writecache_commit_flushed(wc);
+
+	writecache_wait_for_ios(wc, WRITE);
+
+	/* advancing seq_count is what turns the flushed entries into committed ones */
+	wc->seq_count++;
+	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
+	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
+	writecache_commit_flushed(wc);
+
+	wc->overwrote_committed = false;
+
+	need_flush_after_free = false;
+	while (1) {
+		/* Free another committed entry with lower seq-count */
+		struct rb_node *rb_node = rb_prev(&e->rb_node);
+
+		if (rb_node) {
+			e2 = container_of(rb_node, struct wc_entry, rb_node);
+			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
+			    likely(!e2->write_in_progress)) {
+				writecache_free_entry(wc, e2);
+				need_flush_after_free = true;
+			}
+		}
+		if (unlikely(e->lru.prev == &wc->lru))
+			break;
+		e = container_of(e->lru.prev, struct wc_entry, lru);
+		cond_resched();
+	}
+
+	if (need_flush_after_free)
+		writecache_commit_flushed(wc);
+}
+
+/* Work item handler: run writecache_flush() under the target lock. */
+static void writecache_flush_work(struct work_struct *work)
+{
+	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
+
+	wc_lock(wc);
+	writecache_flush(wc);
+	wc_unlock(wc);
+}
+
+/*
+ * Autocommit timer callback: queue the flush work unless the target has
+ * already recorded an error.
+ */
+static void writecache_autocommit_timer(struct timer_list *t)
+{
+	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
+
+	if (writecache_has_error(wc))
+		return;
+
+	queue_work(wc->writeback_wq, &wc->flush_work);
+}
+
+/*
+ * Arm the autocommit timer autocommit_jiffies from now, leaving an already
+ * pending timer untouched.
+ */
+static void writecache_schedule_autocommit(struct dm_writecache *wc)
+{
+	if (timer_pending(&wc->autocommit_timer))
+		return;
+
+	mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
+}
+
+/*
+ * Drop every cached block whose original sector lies in [start, end).
+ *
+ * Entries currently under writeback (write_in_progress) are left alone.
+ * Before the first entry is freed, all in-flight reads and writes are
+ * waited for. The metadata changes are committed once at the end if
+ * anything was discarded.
+ */
+static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
+{
+	struct wc_entry *e;
+	bool discarded_something = false;
+
+	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
+	if (unlikely(!e))
+		return;
+
+	while (read_original_sector(wc, e) < end) {
+		/* fetch the successor first: freeing e takes it out of the tree */
+		struct rb_node *node = rb_next(&e->rb_node);
+
+		if (likely(!e->write_in_progress)) {
+			if (!discarded_something) {
+				writecache_wait_for_ios(wc, READ);
+				writecache_wait_for_ios(wc, WRITE);
+				discarded_something = true;
+			}
+			/*
+			 * An entry that was never committed is still counted in
+			 * uncommitted_blocks; take it out of the count so the
+			 * autocommit accounting matches what remains cached.
+			 */
+			if (!writecache_entry_is_committed(wc, e))
+				wc->uncommitted_blocks--;
+			writecache_free_entry(wc, e);
+		}
+
+		if (!node)
+			break;
+
+		e = container_of(node, struct wc_entry, rb_node);
+	}
+
+	if (discarded_something)
+		writecache_commit_flushed(wc);
+}
+
+/*
+ * If any writeback is outstanding, sleep once on the freelist wait queue
+ * and return true; return false when there is nothing to wait for.
+ */
+static bool writecache_wait_for_writeback(struct dm_writecache *wc)
+{
+	if (!wc->writeback_size)
+		return false;
+
+	writecache_wait_on_freelist(wc);
+	return true;
+}
+
+/*
+ * Suspend handler.
+ *
+ * Stops the autocommit timer, commits everything that is uncommitted and,
+ * if flush_on_suspend was requested, also triggers a full writeback
+ * (writeback_all is raised while the workqueue is drained). It then waits
+ * until no writeback is outstanding, flushes the CPU cache over the mapping
+ * in PMEM mode and finally poisons the in-core lists.
+ */
+static void writecache_suspend(struct dm_target *ti)
+{
+	struct dm_writecache *wc = ti->private;
+	bool flush_on_suspend;
+
+	del_timer_sync(&wc->autocommit_timer);
+
+	wc_lock(wc);
+	writecache_flush(wc);
+	flush_on_suspend = wc->flush_on_suspend;
+	if (flush_on_suspend) {
+		/* the request is one-shot */
+		wc->flush_on_suspend = false;
+		wc->writeback_all++;
+		queue_work(wc->writeback_wq, &wc->writeback_work);
+	}
+	wc_unlock(wc);
+
+	/* the lock is dropped here while the workqueue is drained */
+	flush_workqueue(wc->writeback_wq);
+
+	wc_lock(wc);
+	if (flush_on_suspend)
+		wc->writeback_all--;
+	while (writecache_wait_for_writeback(wc));
+
+	if (WC_MODE_PMEM(wc))
+		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
+
+	writecache_poison_lists(wc);
+
+	wc_unlock(wc);
+}
+
+/*
+ * Allocate the in-core wc_entry array (one element per cache block) unless
+ * it exists already, and initialise each element's index and
+ * write_in_progress flag.
+ *
+ * Returns 0 on success or -ENOMEM when the array cannot be allocated,
+ * including when its size in bytes would not fit in a size_t.
+ */
+static int writecache_alloc_entries(struct dm_writecache *wc)
+{
+	size_t b;
+
+	if (wc->entries)
+		return 0;
+	/* refuse a block count whose array size would wrap around */
+	if (wc->n_blocks > SIZE_MAX / sizeof(struct wc_entry))
+		return -ENOMEM;
+	wc->entries = vmalloc(sizeof(struct wc_entry) * wc->n_blocks);
+	if (!wc->entries)
+		return -ENOMEM;
+	for (b = 0; b < wc->n_blocks; b++) {
+		struct wc_entry *e = &wc->entries[b];
+		e->index = b;
+		e->write_in_progress = false;
+	}
+
+	return 0;
+}
+
+/*
+ * Resume handler: rebuild the in-core state from the metadata held in the
+ * cache mapping.
+ *
+ * The tree, LRU and free pool are reinitialised and the sequence count is
+ * read from the superblock. With hardware-error handling every metadata
+ * record is first copied into its wc_entry using memcpy_mcsafe(); once an
+ * error has been recorded the remaining entries are set to -1/-1.
+ *
+ * Each entry is then classified:
+ *  - not committed: its sequence count is cleared (unless it is already
+ *    -1) and it goes to the free pool;
+ *  - committed with no other entry for the same sector: inserted;
+ *  - committed with an existing entry for the same sector: the one with
+ *    the higher sequence count is kept and the other is released. Equal
+ *    sequence counts are reported as an error.
+ *
+ * If any metadata was modified, all metadata is marked dirty and
+ * committed.
+ */
+static void writecache_resume(struct dm_target *ti)
+{
+	struct dm_writecache *wc = ti->private;
+	size_t b;
+	bool need_flush = false;
+	__le64 sb_seq_count;
+	int r;
+
+	wc_lock(wc);
+
+	if (WC_MODE_PMEM(wc))
+		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
+
+	wc->tree = RB_ROOT;
+	INIT_LIST_HEAD(&wc->lru);
+	if (WC_MODE_SORT_FREELIST(wc)) {
+		wc->freetree = RB_ROOT;
+		wc->current_free = NULL;
+	} else {
+		INIT_LIST_HEAD(&wc->freelist);
+	}
+	wc->freelist_size = 0;
+
+	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
+	if (r) {
+		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
+		sb_seq_count = cpu_to_le64(0);
+	}
+	wc->seq_count = le64_to_cpu(sb_seq_count);
+
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	for (b = 0; b < wc->n_blocks; b++) {
+		struct wc_entry *e = &wc->entries[b];
+		struct wc_memory_entry wme;
+		if (writecache_has_error(wc)) {
+			e->original_sector = -1;
+			e->seq_count = -1;
+			continue;
+		}
+		r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+		if (r) {
+			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
+					 (unsigned long)b, r);
+			e->original_sector = -1;
+			e->seq_count = -1;
+		} else {
+			e->original_sector = le64_to_cpu(wme.original_sector);
+			e->seq_count = le64_to_cpu(wme.seq_count);
+		}
+	}
+#endif
+	for (b = 0; b < wc->n_blocks; b++) {
+		struct wc_entry *e = &wc->entries[b];
+		if (!writecache_entry_is_committed(wc, e)) {
+			if (read_seq_count(wc, e) != -1) {
+				/* also entered by goto from the duplicate-sector case below */
+erase_this:
+				clear_seq_count(wc, e);
+				need_flush = true;
+			}
+			writecache_add_to_freelist(wc, e);
+		} else {
+			struct wc_entry *old;
+
+			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
+			if (!old) {
+				writecache_insert_entry(wc, e);
+			} else {
+				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
+					writecache_error(wc, -EINVAL,
+						 "two identical entries, position %llu, sector %llu, sequence %llu",
+						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
+						 (unsigned long long)read_seq_count(wc, e));
+				}
+				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
+					goto erase_this;
+				} else {
+					writecache_free_entry(wc, old);
+					writecache_insert_entry(wc, e);
+					need_flush = true;
+				}
+			}
+		}
+		cond_resched();
+	}
+
+	if (need_flush) {
+		writecache_flush_all_metadata(wc);
+		writecache_commit_flushed(wc);
+	}
+
+	wc_unlock(wc);
+}
+
+/*
+ * Handle the "flush" message: commit the cache and write everything back.
+ *
+ * Returns -EINVAL if extra arguments were given, -EBUSY while the target is
+ * suspended, -EIO if the target is in an error state before or after the
+ * writeback, and 0 otherwise.
+ */
+static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+	int r = 0;
+
+	if (argc != 1)
+		return -EINVAL;
+
+	wc_lock(wc);
+	if (dm_suspended(wc->ti)) {
+		r = -EBUSY;
+		goto out_unlock;
+	}
+	if (writecache_has_error(wc)) {
+		r = -EIO;
+		goto out_unlock;
+	}
+
+	writecache_flush(wc);
+	wc->writeback_all++;
+	queue_work(wc->writeback_wq, &wc->writeback_work);
+	wc_unlock(wc);
+
+	/* drain the workqueue with the lock released */
+	flush_workqueue(wc->writeback_wq);
+
+	wc_lock(wc);
+	wc->writeback_all--;
+	if (writecache_has_error(wc))
+		r = -EIO;
+out_unlock:
+	wc_unlock(wc);
+	return r;
+}
+
+/*
+ * Handle the "flush_on_suspend" message: set the flush_on_suspend flag,
+ * which writecache_suspend() reads and clears. Returns -EINVAL if extra
+ * arguments were given, 0 otherwise.
+ */
+static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+	if (argc != 1)
+		return -EINVAL;
+
+	wc_lock(wc);
+	wc->flush_on_suspend = true;
+	wc_unlock(wc);
+
+	return 0;
+}
+
+/*
+ * Message handler. Dispatches on argv[0], compared case-insensitively:
+ * "flush" and "flush_on_suspend" are recognised; anything else is logged
+ * and rejected with -EINVAL.
+ */
+static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
+			      char *result, unsigned maxlen)
+{
+	struct dm_writecache *wc = ti->private;
+
+	if (!strcasecmp(argv[0], "flush"))
+		return process_flush_mesg(argc, argv, wc);
+	if (!strcasecmp(argv[0], "flush_on_suspend"))
+		return process_flush_on_suspend_mesg(argc, argv, wc);
+
+	DMERR("unrecognised message received: %s", argv[0]);
+	return -EINVAL;
+}
+
+/*
+ * Copy one cache block (wc->block_size bytes) between the bio and the cache
+ * memory at "data", advancing the bio as it goes.
+ *
+ * For a read the data is copied into the bio with memcpy_mcsafe(); a
+ * failure is recorded on the target and the bio is marked
+ * BLK_STS_IOERR. For a write the data is copied into the cache with
+ * memcpy_flushcache(). The loop repeats when one bio segment is shorter
+ * than a block.
+ */
+static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
+{
+	void *buf;
+	unsigned long flags;
+	unsigned size;
+	int rw = bio_data_dir(bio);
+	unsigned remaining_size = wc->block_size;
+
+	do {
+		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
+		buf = bvec_kmap_irq(&bv, &flags);
+		size = bv.bv_len;
+		/* never copy beyond the end of the current block */
+		if (unlikely(size > remaining_size))
+			size = remaining_size;
+
+		if (rw == READ) {
+			int r;
+			r = memcpy_mcsafe(buf, data, size);
+			flush_dcache_page(bio_page(bio));
+			if (unlikely(r)) {
+				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
+				bio->bi_status = BLK_STS_IOERR;
+			}
+		} else {
+			flush_dcache_page(bio_page(bio));
+			memcpy_flushcache(data, buf, size);
+		}
+
+		bvec_kunmap_irq(buf, &flags);
+
+		data = (char *)data + size;
+		remaining_size -= size;
+		bio_advance(bio, size);
+	} while (unlikely(remaining_size));
+}
+
+/*
+ * Kernel thread that services bios queued on wc->flush_list by
+ * writecache_offload_bio().
+ *
+ * A discard bio has its range dropped from the cache and is then sent on to
+ * the origin device. Any other queued bio triggers writecache_flush() and
+ * is completed here, with BLK_STS_IOERR if the target is in an error state.
+ * The thread exits when kthread_should_stop() is true and the list is
+ * empty.
+ */
+static int writecache_flush_thread(void *data)
+{
+	struct dm_writecache *wc = data;
+
+	while (1) {
+		struct bio *bio;
+
+		wc_lock(wc);
+		bio = bio_list_pop(&wc->flush_list);
+		if (!bio) {
+			/*
+			 * The task state is changed before the lock is dropped,
+			 * so a wake_up_process() issued after the unlock is not
+			 * lost.
+			 */
+			set_current_state(TASK_INTERRUPTIBLE);
+			wc_unlock(wc);
+
+			if (unlikely(kthread_should_stop())) {
+				set_current_state(TASK_RUNNING);
+				break;
+			}
+
+			schedule();
+			continue;
+		}
+
+		if (bio_op(bio) == REQ_OP_DISCARD) {
+			writecache_discard(wc, bio->bi_iter.bi_sector,
+					   bio_end_sector(bio));
+			wc_unlock(wc);
+			bio_set_dev(bio, wc->dev->bdev);
+			generic_make_request(bio);
+		} else {
+			writecache_flush(wc);
+			wc_unlock(wc);
+			if (writecache_has_error(wc))
+				bio->bi_status = BLK_STS_IOERR;
+			bio_endio(bio);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Queue a bio for the flush thread. The thread is woken only when the list
+ * was empty before this bio was added.
+ * NOTE(review): the visible callers in writecache_map() hold wc->lock when
+ * calling this -- confirm that no other caller does otherwise.
+ */
+static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
+{
+	if (bio_list_empty(&wc->flush_list))
+		wake_up_process(wc->flush_thread);
+	bio_list_add(&wc->flush_list, bio);
+}
+
+static int writecache_map(struct dm_target *ti, struct bio *bio)
+{
+	struct wc_entry *e;
+	struct dm_writecache *wc = ti->private;
+
+	bio->bi_private = NULL;
+
+	wc_lock(wc);
+
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+		if (writecache_has_error(wc))
+			goto unlock_error;
+		if (WC_MODE_PMEM(wc)) {
+			writecache_flush(wc);
+			if (writecache_has_error(wc))
+				goto unlock_error;
+			goto unlock_submit;
+		} else {
+			writecache_offload_bio(wc, bio);
+			goto unlock_return;
+		}
+	}
+
+	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
+				(wc->block_size / 512 - 1)) != 0)) {
+		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
+		      (unsigned long long)bio->bi_iter.bi_sector,
+		      bio->bi_iter.bi_size, wc->block_size);
+		goto unlock_error;
+	}
+
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+		if (writecache_has_error(wc))
+			goto unlock_error;
+		if (WC_MODE_PMEM(wc)) {
+			writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
+			goto unlock_remap_origin;
+		} else {
+			writecache_offload_bio(wc, bio);
+			goto unlock_return;
+		}
+	}
+
+	if (bio_data_dir(bio) == READ) {
+read_next_block:
+		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+		if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
+			if (WC_MODE_PMEM(wc)) {
+				bio_copy_block(wc, bio, memory_data(wc, e));
+				if (bio->bi_iter.bi_size)
+					goto read_next_block;
+				goto unlock_submit;
+			} else {
+				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+				bio_set_dev(bio, wc->ssd_dev->bdev);
+				bio->bi_iter.bi_sector = cache_sector(wc, e);
+				if (!writecache_entry_is_committed(wc, e))
+					writecache_wait_for_ios(wc, WRITE);
+				goto unlock_remap;
+			}
+		} else {
+			if (e) {
+				sector_t next_boundary =
+					read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+				if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
+					dm_accept_partial_bio(bio, next_boundary);
+				}
+			}
+			goto unlock_remap_origin;
+		}
+	} else {
+		do {
+			if (writecache_has_error(wc))
+				goto unlock_error;
+			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
+			if (e) {
+				if (!writecache_entry_is_committed(wc, e))
+					goto bio_copy;
+				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
+					wc->overwrote_committed = true;
+					goto bio_copy;
+				}
+			}
+			e = writecache_pop_from_freelist(wc);
+			if (unlikely(!e)) {
+				writecache_wait_on_freelist(wc);
+				continue;
+			}
+			write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
+			writecache_insert_entry(wc, e);
+			wc->uncommitted_blocks++;
+bio_copy:
+			if (WC_MODE_PMEM(wc)) {
+				bio_copy_block(wc, bio, mem
author	Mikulas Patocka <mpatocka@redhat.com>	2018-03-08 08:25:24 -0500
committer	Mike Snitzer <snitzer@redhat.com>	2018-06-08 11:59:51 -0400
commit	48debafe4f2feabcc99f8e2659e80557e3ca6b39 (patch)
tree	898a7c9c33238b068a79d40e97c380b36b1498ee /drivers/md/dm-writecache.c
parent	72d711c8768805b5f8cf2d23c575dfd188993e12 (diff)