/*
* Copyright (C) 2014 Facebook. All rights reserved.
*
* This file is released under the GPL.
*/
#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/uio.h>
#define DM_MSG_PREFIX "log-writes"
/*
* This target will sequentially log all writes to the target device onto the
* log device. This is helpful for replaying writes to check for fs consistency
* at all times. This target provides a mechanism to mark specific events to
* check data at a later time. So for example you would:
*
* write data
* fsync
* dmsetup message /dev/whatever mark mymark
* unmount /mnt/test
*
* Then replay the log up to mymark and check the contents of the replay to
* verify it matches what was written.
*
* We log writes only after they have been flushed, this makes the log describe
* close to the order in which the data hits the actual disk, not its cache. So
* for example the following sequence (W means write, C means complete)
*
* Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
*
* Would result in the log looking like this:
*
* c,a,b,flush,fuad,<other writes>,<next flush>
*
* This is meant to help expose problems where file systems do not properly wait
* on data being written before invoking a FLUSH. FUA bypasses cache so once it
* completes it is added to the log as it should be on disk.
*
* We treat DISCARDs as if they don't bypass cache so that they are logged in
* order of completion along with the normal writes. If we didn't do it this
* way we would process all the discards first and then write all the data, when
* in fact we want to do the data and the discard in the order that they
* completed.
*/
#define LOG_FLUSH_FLAG (1 << 0)
#define LOG_FUA_FLAG (1 << 1)
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)
#define LOG_METADATA_FLAG (1 << 4)
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
#define WRITE_LOG_SUPER_SECTOR 0
/*
* The disk format for this is braindead simple.
*
* At byte 0 we have our super, followed by the following sequence for
* nr_entries:
*
* [ 1 sector ][ entry->nr_sectors ]
* [log_write_entry][ data written ]
*
* The log_write_entry takes up a full sector so we can have arbitrary length
* marks and it leaves us room for extra content in the future.
*/
/*
* Basic info about the log for userspace.
*/
struct log_write_super {
__le64 magic;
__le64 version;
__le64 nr_entries;
__le32 sectorsize;
};
/*
* sector - the sector we wrote.
* nr_sectors - the number of sectors we wrote.
* flags - flags for this log entry.
* data_len - the size of the data in this log entry, this is for private log
* entry stuff, the MARK data provided by userspace for example.
*/
struct log_write_entry {
__le64 sector;
__le64 nr_sectors;
__le64 flags;
__le64 data_len;
};
struct log_writes_c {
struct dm_dev *dev;
struct dm_dev *logdev;
u64 logged_entries;
u32 sectorsize;
u32 sectorshift;
atomic_t io_blocks;
atomic_t pending_blocks;
sector_t next_sector;
sector_t end_sector;
bool logging_enabled;
bool device_supports_discard;
spinlock_t blocks_lock;
struct list_head unflushed_blocks;
struct list_head logging_blocks;
wait_queue_head_t wait;
struct task_struct *log_kthread;
struct completion super_done;
};
struct pending_block {
int vec_cnt;
u64 flags;
sector_t sector;
sector_t nr_sectors;
char *data;
u32 datalen;
struct list_head list;
struct bio_vec vecs[];
};
struct per_bio_data {
struct pending_block *block;
};
static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
sector_t sectors)
{
return sectors >> (lc->sectorshift - SECTOR_SHIFT);
}
static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
sector_t sectors)
{
return sectors << (lc->sectorshift - SECTOR_SHIFT);
}
static void put_pending_block(struct log_writes_c *lc)
{
if (atomic_dec_and_test(&lc->pending_blocks)) {
smp_mb__after_atomic();
if (waitqueue_active(&lc->wait))
wake_up(&lc->wait);
}
}
static void put_io_block(struct log_writes_c *lc)
{
if (atomic_dec_and_test(&lc->io_blocks)) {
smp_mb__after_atomic();
if (waitqueue_active(&lc->wait))
wake_up(&lc->wait);
}
}
static void log_end_io(struct bio *bio)
{
struct log_writes_c *lc = bio->bi_private;
if (bio->bi_status) {
unsigned long flags;
DMERR("Error writing log block, error=%d", bio->bi_status);
spin_lock_irqsave(&lc->blocks_lock, flags);
lc->logging_enabled = false;
spin_unlock_irqrestore(&lc->blocks_lock, flags);
}
bio_free_pages(bio);
put_io_block(lc);
bio_put(bio);
}
static void log_end_super(struct bio *bio)
{
struct log_writes_c *lc = bio->bi_private;
complete(&lc->super_done);
log_end_io(bio);
}
/*
* Meant to be called if there is an error, it will free all the pages
* associated with the block.
*/
static void free_pending_block(struct log_writes_c *lc,
struct pending_block