summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2012-01-16 15:26:02 -0500
committerChris Mason <chris.mason@oracle.com>2012-01-16 15:26:02 -0500
commit27263e28321db438bc43dc0c0be432ce91526224 (patch)
treeb7571dd52d6c1d48528e383d273b330da94439bd
parent64e05503ab5c73b2ffb8d55d2f7aab74f34fc691 (diff)
parent19a39dce3b9bf0244d19a446718ad6f7605ff099 (diff)
Merge branch 'restriper' of git://github.com/idryomov/btrfs-unstable into integration
-rw-r--r--fs/btrfs/ctree.h205
-rw-r--r--fs/btrfs/disk-io.c18
-rw-r--r--fs/btrfs/extent-tree.c128
-rw-r--r--fs/btrfs/ioctl.c226
-rw-r--r--fs/btrfs/ioctl.h54
-rw-r--r--fs/btrfs/super.c11
-rw-r--r--fs/btrfs/volumes.c790
-rw-r--r--fs/btrfs/volumes.h51
8 files changed, 1385 insertions, 98 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..dfc136cc07d7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
__le16 name_len;
} __attribute__ ((__packed__));
+struct btrfs_disk_balance_args {
+ /*
+ * profiles to operate on, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 profiles;
+
+ /* usage filter */
+ __le64 usage;
+
+ /* devid filter */
+ __le64 devid;
+
+ /* devid subset filter [pstart..pend) */
+ __le64 pstart;
+ __le64 pend;
+
+ /* btrfs virtual address space subset filter [vstart..vend) */
+ __le64 vstart;
+ __le64 vend;
+
+ /*
+ * profile to convert to, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 target;
+
+ /* BTRFS_BALANCE_ARGS_* */
+ __le64 flags;
+
+ __le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+ /* BTRFS_BALANCE_* */
+ __le64 flags;
+
+ struct btrfs_disk_balance_args data;
+ struct btrfs_disk_balance_args meta;
+ struct btrfs_disk_balance_args sys;
+
+ __le64 unused[4];
+} __attribute__ ((__packed__));
+
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
} __attribute__ ((__packed__));
/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
-#define BTRFS_NR_RAID_TYPES 5
+#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES 5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
+ BTRFS_BLOCK_GROUP_SYSTEM | \
+ BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available. This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk). The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
struct btrfs_block_group_item {
__le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_balance_control;
struct btrfs_delayed_root;
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
+ /*
+ * these three are in extended format (availability of single
+ * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
- u64 data_alloc_profile;
- u64 metadata_alloc_profile;
- u64 system_alloc_profile;
+
+ /* restriper state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_running;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
unsigned data_chunk_allocations;
unsigned metadata_ratio;
@@ -1383,6 +1464,8 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
+#define BTRFS_BALANCE_ITEM_KEY 248
+
/*
* string items are for debugging. They just store a short string of
* data in the FS
@@ -1413,6 +1496,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
#define BTRFS_MOUNT_RECOVERY (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2161,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2500,6 +2662,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
+ kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
@@ -2510,6 +2673,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->super_for_commit);
kfree(fs_info);
}
+/**
+ * profile_is_valid - tests whether a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static inline int profile_is_valid(u64 flags, int extended)
+{
+ u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+ if (extended)
+ mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & mask)
+ return 0;
+ /* true if zero or exactly one bit set */
+ return (flags & (~flags + 1)) == flags;
+}
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f44b3928dc2d..9c1a744e595b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2002,6 +2002,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_rwsem(&fs_info->scrub_super_lock);
fs_info->scrub_workers_refcnt = 0;
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
+
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
@@ -2321,9 +2329,6 @@ retry_root_backup:
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
ret = btrfs_init_space_info(fs_info);
if (ret) {
@@ -2426,6 +2431,10 @@ retry_root_backup:
if (!err)
err = btrfs_orphan_cleanup(fs_info->tree_root);
up_read(&fs_info->cleanup_work_sem);
+
+ if (!err)
+ err = btrfs_recover_balance(fs_info->tree_root);
+
if (err) {
close_ctree(tree_root);
return ERR_PTR(err);
@@ -2975,6 +2984,9 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* pause restriper - we want to resume on mount */
+ btrfs_pause_balance(root->fs_info);
+
btrfs_scrub_cancel(root);
/* wait for any defraggers to finish */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 37594e4bf660..352083ad233c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA;
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -2999,9 +2998,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
spin_lock_init(&found->lock);
- found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
- BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA);
+ found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
found->total_bytes = total_bytes;
found->disk_total = total_bytes * factor;
found->bytes_used = bytes_used;
@@ -3022,20 +3019,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP);
- if (extra_flags) {
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits |= extra_flags;
- }
+ u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ /* chunk -> extended profile */
+ if (extra_flags == 0)
+ extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
}
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format. If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
/*
@@ -3046,6 +3050,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ /* pick restriper's target profile if it's available */
+ spin_lock(&root->fs_info->balance_lock);
+ if (root->fs_info->balance_ctl) {
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ u64 tgt = 0;
+
+ if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+ (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->data.target)) {
+ tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->sys.target)) {
+ tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->meta.target)) {
+ tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ if (tgt) {
+ spin_unlock(&root->fs_info->balance_lock);
+ flags = tgt;
+ goto out;
+ }
+ }
+ spin_unlock(&root->fs_info->balance_lock);
+
if (num_devices == 1)
flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
if (num_devices < 4)
@@ -3065,22 +3097,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
((flags & BTRFS_BLOCK_GROUP_RAID1) |
(flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP)))
+ (flags & BTRFS_BLOCK_GROUP_DUP))) {
flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+ }
+
+out:
+ /* extended -> chunk profile */
+ flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
return flags;
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits &
- root->fs_info->data_alloc_profile;
+ flags |= root->fs_info->avail_data_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits &
- root->fs_info->system_alloc_profile;
+ flags |= root->fs_info->avail_system_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits &
- root->fs_info->metadata_alloc_profile;
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+
return btrfs_reduce_alloc_profile(root, flags);
}
@@ -3282,7 +3317,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
- flags = btrfs_reduce_alloc_profile(extent_root, flags);
+ BUG_ON(!profile_is_valid(flags, 0));
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
@@ -6792,6 +6827,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+ if (root->fs_info->balance_ctl) {
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ u64 tgt = 0;
+
+ /* pick restriper's target profile and return */
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ if (tgt) {
+ /* extended -> chunk profile */
+ tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ return tgt;
+ }
+ }
+
/*
* we add in the count of missing devices because we want
* to make sure that any RAID levels on a degraded FS
@@ -7466,6 +7524,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return 0;
}
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ /* chunk -> extended profile */
+ if (extra_flags == 0)
+ extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start)
{
@@ -7476,6 +7550,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct inode *inode;
int ret;
+ int index;
int factor;
root = root->fs_info->extent_root;
@@ -7491,6 +7566,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, block_group);
memcpy(&key, &block_group->key, sizeof(key));
+ index = get_block_group_index(block_group);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7641,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index]))
+ clear_avail_alloc_bits(root->fs_info, block_group->flags);
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c04f02c7d5bb..1e7a9bac31ab 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1203,13 +1203,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- mutex_lock(&root->fs_info->volume_mutex);
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
@@ -1226,7 +1234,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (!strcmp(sizestr, "max"))
new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1249,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
new_size = memparse(sizestr, NULL);
if (new_size == 0) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
}
@@ -1250,7 +1258,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
new_size = old_size - new_size;
} else if (mod > 0) {
@@ -1259,11 +1267,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (new_size < 256 * 1024 * 1024) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (new_size > device->bdev->bd_inode->i_size) {
ret = -EFBIG;
- goto out_unlock;
+ goto out_free;
}
do_div(new_size, root->sectorsize);
@@ -1276,7 +1284,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_unlock;
+ goto out_free;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
@@ -1284,9 +1292,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
ret = btrfs_shrink_device(device, new_size);
}
-out_unlock:
- mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2052,14 +2061,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2074,14 +2094,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_rm_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -3034,6 +3065,163 @@ out:
return ret;
}
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ bargs->flags = bctl->flags;
+
+ if (atomic_read(&fs_info->balance_running))
+ bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+ if (atomic_read(&fs_info->balance_pause_req))
+ bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+ if (atomic_read(&fs_info->balance_cancel_req))
+ bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+ memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+ memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+ memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+ if (lock) {
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
+ } else {
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ }
+}
+
+static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_balance_control *bctl;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (arg) {
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ goto out;
+ }
+
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out_bargs;
+ }
+
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
+ goto do_balance;
+ }
+ } else {
+ bargs = NULL;
+ }
+
+ if (fs_info->balance_ctl) {
+ ret = -EINPROGRESS;
+ goto out_bargs;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out_bargs;
+ }
+
+ bctl->fs_info = fs_info;
+ if (arg) {
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+ bctl->flags = bargs->flags;
+ } else {
+ /* balance everything - no filters */
+ bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ }
+
+do_balance:
+ ret = btrfs_balance(bctl, bargs);
+ /*
+ * bctl is freed in __cancel_balance or in free_fs_info if
+ * restriper was paused all the way until unmount
+ */
+ if (arg) {
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+ }
+
+out_bargs:
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case BTRFS_BALANCE_CTL_PAUSE:
+ return btrfs_pause_balance(root->fs_info);
+ case BTRFS_BALANCE_CTL_CANCEL:
+ return btrfs_cancel_balance(root->fs_info);
+ }
+
+ return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ if (!bargs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ update_ioctl_balance_args(fs_info, 1, bargs);
+
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -3078,7 +3266,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
- return btrfs_balance(root->fs_info->dev_root);
+ return btrfs_ioctl_balance(root, NULL);
case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg, 0, 0, 0);
case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3298,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_scrub_cancel(root, argp);
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(root, argp);
+ case BTRFS_IOC_BALANCE_V2:
+ return btrfs_ioctl_balance(root, argp);
+ case BTRFS_IOC_BALANCE_CTL:
+ return btrfs_ioctl_balance_ctl(root, arg);
+ case BTRFS_IOC_BALANCE_PROGRESS:
+ return btrfs_ioctl_balance_progress(root, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
__u64 reserved[124]; /* pad to 1k */
};
+/* balance control ioctl modes */
+#define BTRFS_BALANCE_CTL_PAUSE 1
+#define BTRFS_BALANCE_CTL_CANCEL 2
+
+/*
+ * this is packed, because it should be exactly the same as its disk
+ * byte order counterpart (struct btrfs_disk_balance_args)
+ */
+struct btrfs_balance_args {
+ __u64 profiles;
+ __u64 usage;
+ __u64 devid;
+ __u64 pstart;
+ __u64 pend;
+ __u64 vstart;
+ __u64 vend;
+
+ __u64 target;
+
+ __u64 flags;
+
+ __u64 unused[8];
+} __attribute__ ((__packed__));
+
+/* report balance progress to userspace */
+struct btrfs_balance_progress {
+ __u64 expected; /* estimated # of chunks that will be
+ * relocated to fulfill the request */
+ __u64 considered; /* # of chunks we have considered so far */
+ __u64 completed; /* # of chunks relocated so far */
+};
+
+#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
+#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
+#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
+
+struct btrfs_ioctl_balance_args {
+ __u64 flags; /* in/out */
+ __u64 state; /* out */
+
+ struct btrfs_balance_args data; /* in/out */
+ struct btrfs_balance_args meta; /* in/out */
+ struct btrfs_balance_args sys; /* in/out */
+
+ struct btrfs_balance_progress stat; /* out */
+
+ __u64 unused[72]; /* pad to 1k */
+};
+
#define BTRFS_INO_LOOKUP_PATH_MAX 4080
struct btrfs_ioctl_ino_lookup_args {
__u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
struct btrfs_ioctl_dev_info_args)
#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
+ struct btrfs_ioctl_balance_args)
+#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
+#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
+ struct btrfs_ioctl_balance_args)
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 200f63bc6675..5a7227fa9380 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -164,8 +164,9 @@ enum {
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
- Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+ Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+ Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+ Opt_err,
};
static match_table_t tokens = {
@@ -200,6 +201,7 @@ static match_table_t tokens = {
{Opt_inode_cache, "inode_cache"},
{Opt_no_space_cache, "nospace_cache"},
{Opt_recovery, "recovery"},
+ {Opt_skip_balance, "skip_balance"},
{Opt_err, NULL},
};
@@ -398,6 +400,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
printk(KERN_INFO "btrfs: enabling auto recovery");
btrfs_set_opt(info->mount_opt, RECOVERY);
break;
+ case Opt_skip_balance:
+ btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -723,6 +728,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(root, INODE_MAP_CACHE))
seq_puts(seq, ",inode_cache");
+ if (btrfs_test_opt(root, SKIP_BALANCE))
+ seq_puts(seq, ",skip_balance");
return 0;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ac00e3aa80a1..9489a2aca47b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
+#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
@@ -1282,7 +1283,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
bool clear_super = false;
mutex_lock(&uuid_mutex);
- mutex_lock(&root->fs_info->volume_mutex);
all_avail = root->fs_info->avail_data_alloc_bits |
root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1452,6 @@ error_close:
if (bdev)
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);