/*
* Copyright (C) 2015 Shaohua Li <shli@fb.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
*/
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"
/*
* metadata/data stored in disk with 4k size unit (a block) regardless
* underneath hardware sector size. only works with PAGE_SIZE == 4096
*/
#define BLOCK_SECTORS (8)
/*
* reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent
* recovery scans a very long log
*/
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
struct r5l_log {
struct md_rdev *rdev;
u32 uuid_checksum;
sector_t device_size; /* log device size, round to
* BLOCK_SECTORS */
sector_t max_free_space; /* reclaim run if free space is at
* this size */
sector_t last_checkpoint; /* log tail. where recovery scan
* starts from */
u64 last_cp_seq; /* log tail sequence */
sector_t log_start; /* log head. where new data appends */
u64 seq; /* log head sequence */
sector_t next_checkpoint;
u64 next_cp_seq;
struct mutex io_mutex;
struct r5l_io_unit *current_io; /* current io_unit accepting new data */
spinlock_t io_list_lock;
struct list_head running_ios; /* io_units which are still running,
* and have not yet been completely
* written to the log */
struct list_head io_end_ios; /* io_units which have been completely
* written to the log but not yet written
* to the RAID */
struct list_head flushing_ios; /* io_units which are waiting for log
* cache flush */
struct list_head finished_ios; /* io_units which settle down in log disk */
struct bio flush_bio;
struct kmem_cache *io_kc;
struct md_thread *reclaim_thread;
unsigned long reclaim_target; /* number of space that need to be
* reclaimed. if it's 0, reclaim spaces
* used by io_units which are in
* IO_UNIT_STRIPE_END state (eg, reclaim
* dones't wait for specific io_unit
* switching to IO_UNIT_STRIPE_END
* state) */
wait_queue_head_t iounit_wait;
struct list_head no_space_stripes; /* pending stripes, log has no space */
spinlock_t no_space_stripes_lock;
bool need_cache_flush;
bool in_teardown;
};
/*
* an IO range starts from a meta data block and end at the next meta data
* block. The io unit's the meta data block tracks data/parity followed it. io
* unit is written to log disk with normal write, as we always flush log disk
* first and then start move data to raid disks, there is no requirement to
* write io unit with FLUSH/FUA
*/
struct r5l_io_unit {
struct r5l_log *log;
struct page *meta_page; /* store meta block */
int meta_offset; /* current offset in meta_page */
struct bio *current_bio;/* current_bio accepting new data */
atomic_t pending_stripe;/* how many stripes not flushed to raid */
u64 seq; /* seq number of the metablock */
sector_t log_start; /* where the io_unit starts */
sector_t log_end; /* where the io_unit ends */
struct list_head log_sibling; /* log->running_ios */
struct list_head stripe_list; /* stripes added to the io_unit */
int state;
bool need_split_bio;
};
/* r5l_io_unit state */
enum r5l_io_unit_state {
IO_UNIT_RUNNING = 0, /* accepting new IO */
IO_UNIT_IO_START = 1, /* io_unit bio start writing to log,
* don't accepting new bio */
IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
};
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
start += inc;
if (start >= log->device_size)
start = start - log->device_size;
return start;
}
static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
sector_t end)
{
if (end >= start