From a1f765061e1491d5ec467429d0d6adfd9df2f6d9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 16 Sep 2010 14:29:55 -0400 Subject: Btrfs: stop trying to shrink delalloc if there are no inodes to reclaim In very severe ENOSPC cases we can run out of inodes to do delalloc on, which means we'll just keep looping trying to shrink delalloc. Instead, if we fail to shrink delalloc 3 times in a row break out since we're not likely to make any progress. Tested this with a 100mb fs an xfstests test 13. Before the patch it would hang the box, with the patch we get -ENOSPC like we should. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 32d094002a57..c6a5d9095d5f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3115,6 +3115,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 reserved; u64 max_reclaim; u64 reclaimed = 0; + int no_reclaim = 0; int pause = 1; int ret; @@ -3131,12 +3132,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); if (!ret) { + if (no_reclaim > 2) + break; + no_reclaim++; __set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(pause); pause <<= 1; if (pause > HZ / 10) pause = HZ / 10; } else { + no_reclaim = 0; pause = 1; } -- cgit v1.2.3 From bf5fc093c5b625e4259203f1cee7ca73488a5620 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Sep 2010 11:22:36 -0400 Subject: Btrfs: fix the df ioctl to report raid types The new ENOSPC stuff broke the df ioctl since we no longer create seperate space info's for each RAID type. So instead, loop through each space info's raid lists so we can get the right RAID information which will allow the df ioctl to tell us RAID types again. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 100 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 24 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9254b3d58dbe..db0b8fc59235 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1879,6 +1879,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return 0; } +static void get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) +{ + struct btrfs_block_group_cache *block_group; + + space->total_bytes = 0; + space->used_bytes = 0; + space->flags = 0; + list_for_each_entry(block_group, groups_list, list) { + space->flags = block_group->flags; + space->total_bytes += block_group->key.offset; + space->used_bytes += + btrfs_block_group_used(&block_group->item); + } +} + long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_space_args space_args; @@ -1887,27 +1903,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info *user_dest; struct btrfs_space_info *info; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; int alloc_size; int ret = 0; int slot_count = 0; + int i, c; if (copy_from_user(&space_args, (struct btrfs_ioctl_space_args __user *)arg, sizeof(space_args))) return -EFAULT; - /* first we count slots */ - rcu_read_lock(); - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) - slot_count++; - rcu_read_unlock(); + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) + slot_count++; + } + up_read(&info->groups_sem); + } /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; goto out; } + + slot_count = min_t(int, space_args.space_slots, slot_count); + alloc_size = sizeof(*dest) * slot_count; + /* we generally have at most 6 or so space infos, one for each raid * level. So, a whole page should be more than enough for everyone */ @@ -1921,27 +1966,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) dest_orig = dest; /* now we have a buffer to copy into */ - rcu_read_lock(); - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { - /* make sure we don't copy more than we allocated - * in our buffer - */ - if (slot_count == 0) - break; - slot_count--; - - /* make sure userland has enough room in their buffer */ - if (space_args.total_spaces >= space_args.space_slots) - break; + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); - space.flags = info->flags; - space.total_bytes = info->total_bytes; - space.used_bytes = info->bytes_used; - memcpy(dest, &space, sizeof(space)); - dest++; - space_args.total_spaces++; + if (!info) + continue; + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) { + get_block_group_info(&info->block_groups[c], + &space); + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + } + } + up_read(&info->groups_sem); } - rcu_read_unlock(); user_dest = (struct btrfs_ioctl_space_info *) (arg + sizeof(struct btrfs_ioctl_space_args)); -- cgit v1.2.3 From 89a55897a2fbbceb94480952784004bf23911d38 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Oct 2010 14:52:27 -0400 Subject: Btrfs: fix df regression The new ENOSPC stuff breaks out the raid types which breaks the way we were reporting df to the system. This fixes it back so that Available is the total space available to data and used is the actual bytes used by the filesystem. This means that Available is Total - data used - all of the metadata space. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 5 ++++- fs/btrfs/extent-tree.c | 10 ++++++++++ fs/btrfs/super.c | 11 +++++++++-- 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 29c20092847e..014fd52c01bf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,7 +675,8 @@ struct btrfs_block_group_item { struct btrfs_space_info { u64 flags; - u64 total_bytes; /* total bytes in the space */ + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ u64 bytes_used; /* total bytes used, this does't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the @@ -687,6 +688,8 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ int full; /* indicates that we cannot allocate any more chunks for this space */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c6a5d9095d5f..4669c6f8a44d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2763,6 +2763,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (found) { spin_lock(&found->lock); found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; found->full = 0; @@ -2782,6 +2783,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; + found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; found->disk_used = bytes_used * factor; found->bytes_pinned = 0; @@ -8095,6 +8097,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_free_cluster *cluster; struct btrfs_key key; int ret; + int factor; root = root->fs_info->extent_root; @@ -8103,6 +8106,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, BUG_ON(!block_group->ro); memcpy(&key, &block_group->key, sizeof(key)); + if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; /* make sure this block group isn't part of an allocation cluster */ cluster = &root->fs_info->data_alloc_cluster; @@ -8143,6 +8152,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; + block_group->space_info->disk_total -= block_group->key.offset * factor; spin_unlock(&block_group->space_info->lock); btrfs_clear_space_info_full(root->fs_info); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f2393b390318..afab6ca14d03 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -716,18 +716,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; + u64 total_used_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry_rcu(found, head, list) { + if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_SYSTEM)) + total_used_data += found->disk_total; + else + total_used_data += found->disk_used; total_used += found->disk_used; + } rcu_read_unlock(); buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_bfree; + buf->f_bavail = buf->f_blocks - (total_used_data >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; -- cgit v1.2.3 From 6d48755d02b150de7f47e7b4753202f2fc9f990f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Oct 2010 15:13:32 -0400 Subject: Btrfs: fix reservation code for mixed block groups The global reservation stuff tries to add together DATA and METADATA used in order to figure out how much to reserve for everything, but this doesn't work right for mixed block groups. Instead if we have mixed block groups just set data used to 0. Also with mixed block groups we will use bytes_may_use for keeping track of delalloc bytes, so we need to take that into account in our reservation calculations. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4669c6f8a44d..0f27f7b48804 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3213,7 +3213,8 @@ static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, spin_lock(&space_info->lock); unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly; + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; if (unused < space_info->total_bytes) unused = space_info->total_bytes - unused; @@ -3507,6 +3508,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&sinfo->lock); + if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) + data_used = 0; meta_used = sinfo->bytes_used; spin_unlock(&sinfo->lock); @@ -3534,7 +3537,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->size = num_bytes; num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + - sinfo->bytes_reserved + sinfo->bytes_readonly; + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; -- cgit v1.2.3 From 0019f10db6f596f3e14a19f9bd7059a1b85b0853 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Oct 2010 15:18:40 -0400 Subject: Btrfs: re-work delalloc flushing Currently we try and flush delalloc, but we only do that in a sort of weak way, which works fine in most cases but if we're under heavy pressure we need to be able to wait for flushing to happen. Also instead of checking the bytes reserved in the block_rsv, check the space info since it is more accurate. The sync option will be used in a future patch. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/extent-tree.c | 26 ++++++++++++++------------ fs/btrfs/inode.c | 24 ++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 014fd52c01bf..f32404db2c5d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2376,7 +2376,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0f27f7b48804..2846cebc9427 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3111,9 +3111,10 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim) + struct btrfs_root *root, u64 to_reclaim, int sync) { struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; @@ -3122,9 +3123,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, int ret; block_rsv = &root->fs_info->delalloc_block_rsv; - spin_lock(&block_rsv->lock); - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + space_info = block_rsv->space_info; + spin_lock(&space_info->lock); + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); if (reserved == 0) return 0; @@ -3132,7 +3134,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, max_reclaim = min(reserved, to_reclaim); while (1) { - ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); + ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0, sync); if (!ret) { if (no_reclaim > 2) break; @@ -3147,11 +3149,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, pause = 1; } - spin_lock(&block_rsv->lock); - if (reserved > block_rsv->reserved) - reclaimed = reserved - block_rsv->reserved; - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + spin_lock(&space_info->lock); + if (reserved > space_info->bytes_reserved) + reclaimed += reserved - space_info->bytes_reserved; + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); if (reserved == 0 || reclaimed >= max_reclaim) break; @@ -3180,7 +3182,7 @@ static int should_retry_reserve(struct btrfs_trans_handle *trans, if (trans && trans->transaction->in_commit) return -ENOSPC; - ret = shrink_delalloc(trans, root, num_bytes); + ret = shrink_delalloc(trans, root, num_bytes, 0); if (ret) return ret; @@ -3729,7 +3731,7 @@ again: block_rsv_add_bytes(block_rsv, to_reserve, 1); if (block_rsv->size > 512 * 1024 * 1024) - shrink_delalloc(NULL, root, to_reserve); + shrink_delalloc(NULL, root, to_reserve, 0); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1bff92ad4744..5f9e4fc20a73 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6603,7 +6603,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return 0; } -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync) { struct btrfs_inode *binode; struct inode *inode = NULL; @@ -6625,7 +6626,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) spin_unlock(&root->fs_info->delalloc_lock); if (inode) { - write_inode_now(inode, 0); + if (sync) { + filemap_write_and_wait(inode->i_mapping); + /* + * We have to do this because compression doesn't + * actually set PG_writeback until it submits the pages + * for IO, which happens in an async thread, so we could + * race and not actually wait for any writeback pages + * because they've not been submitted yet. Technically + * this could still be the case for the ordered stuff + * since the async thread may not have started to do its + * work yet. If this becomes the case then we need to + * figure out a way to make sure that in writepage we + * wait for any async pages to be submitted before + * returning so that fdatawait does what its supposed to + * do. + */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + } else { + filemap_flush(inode->i_mapping); + } if (delay_iput) btrfs_add_delayed_iput(inode); else -- cgit v1.2.3 From 14ed0ca6e8236f2d264c4a8faec9e3a2b3d04377 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Oct 2010 15:23:48 -0400 Subject: Btrfs: don't allocate chunks as aggressively Because the ENOSPC code over reserves super aggressively we end up allocating chunks way more often than we should. For example with my fs_mark tests on a 2gb fs I can end up reserved 1gb just for metadata, when only 34mb of that is being used. So instead check to see if the amount of space actually used is less than 30% of the total space, and if so don't allocate a chunk, but only if we have at least 256mb of free space to make sure we don't put too much pressure on free space. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2846cebc9427..aca3314ef8b9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3000,8 +3000,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } -static int should_alloc_chunk(struct btrfs_space_info *sinfo, - u64 alloc_bytes) +static int should_alloc_chunk(struct btrfs_space_info *sinfo, u64 alloc_bytes) { u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; @@ -3013,6 +3012,10 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo, alloc_bytes < div_factor(num_bytes, 8)) return 0; + if (num_bytes > 256 * 1024 * 1024 && + sinfo->bytes_used < div_factor(num_bytes, 3)) + return 0; + return 1; } -- cgit v1.2.3 From 8bb8ab2e93f9c3c9453e13be0f37d344a32a3a6d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Oct 2010 16:52:49 -0400 Subject: Btrfs: rework how we reserve metadata bytes With multi-threaded writes we were getting ENOSPC early because somebody would come in, start flushing delalloc because they couldn't make their reservation, and in the meantime other threads would come in and use the space that was getting freed up, so when the original thread went to check to see if they had space they didn't and they'd return ENOSPC. So instead if we have some free space but not enough for our reservation, take the reservation and then start doing the flushing. The only time we don't take reservations is when we've already overcommitted our space, that way we don't have people who come late to the party way overcommitting ourselves. This also moves all of the retrying and flushing code into reserve_metdata_bytes so it's all uniform. This keeps my fs_mark test from returning -ENOSPC as soon as it starts and actually lets me fill up the disk. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 +- fs/btrfs/extent-tree.c | 238 +++++++++++++++++++++++++++---------------------- fs/btrfs/relocation.c | 14 +-- fs/btrfs/transaction.c | 7 +- 4 files changed, 136 insertions(+), 127 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f32404db2c5d..47bc66e34da7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2082,7 +2082,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries); + int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, @@ -2103,7 +2103,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries); + u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aca3314ef8b9..180a50146ddf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3078,38 +3078,6 @@ out: return ret; } -static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 num_bytes) -{ - int ret; - int end_trans = 0; - - if (sinfo->full) - return 0; - - spin_lock(&sinfo->lock); - ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); - spin_unlock(&sinfo->lock); - if (!ret) - return 0; - - if (!trans) { - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - end_trans = 1; - } - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, - get_alloc_profile(root, sinfo->flags), 0); - - if (end_trans) - btrfs_end_transaction(trans, root); - - return ret == 1 ? 1 : 0; -} - /* * shrink metadata reservation for delalloc */ @@ -3167,79 +3135,138 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } -static int should_retry_reserve(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) +/* + * Retries tells us how many times we've called reserve_metadata_bytes. The + * idea is if this is the first call (retries == 0) then we will add to our + * reserved count if we can't make the allocation in order to hold our place + * while we go and try and free up space. That way for retries > 1 we don't try + * and add space, we just check to see if the amount of unused space is >= the + * total space, meaning that our reservation is valid. + * + * However if we don't intend to retry this reservation, pass -1 as retries so + * that it short circuits this logic. + */ +static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - int ret; + u64 unused; + u64 num_bytes = orig_bytes; + int retries = 0; + int ret = 0; + bool reserved = false; - if ((*retries) > 2) - return -ENOSPC; +again: + ret = -ENOSPC; + if (reserved) + num_bytes = 0; - ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); - if (ret) - return 1; + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; - if (trans && trans->transaction->in_commit) - return -ENOSPC; + /* + * The idea here is that we've not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we've already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (unused <= space_info->total_bytes) { + unused -= space_info->total_bytes; + if (unused >= num_bytes) { + if (!reserved) + space_info->bytes_reserved += orig_bytes; + ret = 0; + } else { + /* + * Ok set num_bytes to orig_bytes since we aren't + * overocmmitted, this way we only try and reclaim what + * we need. + */ + num_bytes = orig_bytes; + } + } else { + /* + * Ok we're over committed, set num_bytes to the overcommitted + * amount plus the amount of bytes that we need for this + * reservation. + */ + num_bytes = unused - space_info->total_bytes + + (orig_bytes * (retries + 1)); + } - ret = shrink_delalloc(trans, root, num_bytes, 0); - if (ret) - return ret; + /* + * Couldn't make our reservation, save our place so while we're trying + * to reclaim space we can actually use it instead of somebody else + * stealing it from us. + */ + if (ret && !reserved) { + space_info->bytes_reserved += orig_bytes; + reserved = true; + } - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < num_bytes) - ret = 1; spin_unlock(&space_info->lock); - if (ret) - return -ENOSPC; - - (*retries)++; - if (trans) - return -EAGAIN; + if (!ret) + return 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (!flush) + goto out; - return 1; -} + /* + * We do synchronous shrinking since we don't actually unreserve + * metadata until after the IO is completed. + */ + ret = shrink_delalloc(trans, root, num_bytes, 1); + if (ret > 0) + return 0; + else if (ret < 0) + goto out; -static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 unused; - int ret = -ENOSPC; + /* + * So if we were overcommitted it's possible that somebody else flushed + * out enough space and we simply didn't have enough space to reclaim, + * so go back around and try again. + */ + if (retries < 2) { + retries++; + goto again; + } spin_lock(&space_info->lock); - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + /* + * Not enough space to be reclaimed, don't bother committing the + * transaction. + */ + if (space_info->bytes_pinned < orig_bytes) + ret = -ENOSPC; + spin_unlock(&space_info->lock); + if (ret) + goto out; - if (unused < space_info->total_bytes) - unused = space_info->total_bytes - unused; - else - unused = 0; + ret = -EAGAIN; + if (trans) + goto out; - if (unused >= num_bytes) { - if (block_rsv->priority >= 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } else { - if ((unused + block_rsv->reserved) * - block_rsv->priority >= - (num_bytes + block_rsv->reserved) * 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } - } + + ret = -ENOSPC; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + goto out; + ret = btrfs_commit_transaction(trans, root); + if (!ret) + goto again; + +out: + if (reserved) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= orig_bytes; + spin_unlock(&space_info->lock); } - spin_unlock(&space_info->lock); return ret; } @@ -3383,23 +3410,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) + u64 num_bytes) { int ret; if (num_bytes == 0) return 0; -again: - ret = reserve_metadata_bytes(block_rsv, num_bytes); + + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; } - ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries); - if (ret > 0) - goto again; - return ret; } @@ -3434,7 +3457,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(block_rsv, num_bytes); + ret = reserve_metadata_bytes(trans, root, block_rsv, + num_bytes, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -3614,7 +3638,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries) + int num_items) { u64 num_bytes; int ret; @@ -3624,7 +3648,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = calc_trans_metadata_size(root, num_items); ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes, retries); + num_bytes); if (!ret) { trans->bytes_reserved += num_bytes; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -3698,14 +3722,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve; int nr_extents; - int retries = 0; int ret; if (btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); -again: + spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; if (nr_extents > BTRFS_I(inode)->reserved_extents) { @@ -3715,18 +3738,14 @@ again: nr_extents = 0; to_reserve = 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); to_reserve += calc_csum_metadata_size(inode, num_bytes); - ret = reserve_metadata_bytes(block_rsv, to_reserve); - if (ret) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, - &retries); - if (ret > 0) - goto again; + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + if (ret) return ret; - } + spin_lock(&BTRFS_I(inode)->accounting_lock); BTRFS_I(inode)->reserved_extents += nr_extents; atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_unlock(&BTRFS_I(inode)->accounting_lock); @@ -5325,7 +5344,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(block_rsv, blocksize); + ret = reserve_metadata_bytes(trans, root, block_rsv, + blocksize, 0); if (ret) return ERR_PTR(ret); return block_rsv; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b37d723b9d4a..39adb68a653f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -178,8 +178,6 @@ struct reloc_control { u64 search_start; u64 extents_found; - int block_rsv_retries; - unsigned int stage:8; unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; @@ -2133,7 +2131,6 @@ int prepare_to_merge(struct reloc_control *rc, int err) LIST_HEAD(reloc_roots); u64 num_bytes = 0; int ret; - int retries = 0; mutex_lock(&root->fs_info->trans_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; @@ -2143,7 +2140,7 @@ again: if (!err) { num_bytes = rc->merging_rsv_size; ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes, &retries); + num_bytes); if (ret) err = ret; } @@ -2155,7 +2152,6 @@ again: btrfs_end_transaction(trans, rc->extent_root); btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); - retries = 0; goto again; } } @@ -2405,15 +2401,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, - &rc->block_rsv_retries); + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; return ret; } - rc->block_rsv_retries = 0; return 0; } @@ -3554,8 +3548,7 @@ int prepare_to_relocate(struct reloc_control *rc) * is no reservation in transaction handle. */ ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - &rc->block_rsv_retries); + rc->extent_root->nodesize * 256); if (ret) return ret; @@ -3567,7 +3560,6 @@ int prepare_to_relocate(struct reloc_control *rc) rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; - rc->block_rsv_retries = 0; rc->create_reloc_tree = 1; set_reloc_control(rc); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 66e4c66cc63b..abbec80aaa44 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -179,7 +179,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; int ret; again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); @@ -212,8 +211,7 @@ again: } if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items, - &retries); + ret = btrfs_trans_reserve_metadata(h, root, num_items); if (ret == -EAGAIN) { btrfs_commit_transaction(h, root); goto again; @@ -836,7 +834,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct extent_buffer *tmp; struct extent_buffer *old; int ret; - int retries = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; @@ -858,7 +855,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (to_reserve > 0) { ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, - to_reserve, &retries); + to_reserve); if (ret) { pending->error = ret; goto fail; -- cgit v1.2.3 From 0e78340f3c1fc603e8016c8ac304766bcc65506e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Oct 2010 15:26:53 -0400 Subject: Btrfs: fix error handling in btrfs_get_sb If we failed to find the root subvol id, or the subvol=, we would deactivate the locked super and close the devices. The problem is at this point we have gotten the SB all setup, which includes setting super_operations, so when we'd deactiveate the super, we'd do a close_ctree() which closes the devices, so we'd end up closing the devices twice. So if you do something like this mount /dev/sda1 /mnt/test1 mount /dev/sda1 /mnt/test2 -o subvol=xxx umount /mnt/test1 it would blow up (if subvol xxx doesn't exist). This patch fixes that problem. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index afab6ca14d03..d1867cda92a7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -629,7 +629,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, if (IS_ERR(root)) { error = PTR_ERR(root); deactivate_locked_super(s); - goto error; + goto error_free_subvol_name; } /* if they gave us a subvolume name bind mount into that */ if (strcmp(subvol_name, ".")) { @@ -643,14 +643,14 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, deactivate_locked_super(s); error = PTR_ERR(new_root); dput(root); - goto error_close_devices; + goto error_free_subvol_name; } if (!new_root->d_inode) { dput(root); dput(new_root); deactivate_locked_super(s); error = -ENXIO; - goto error_close_devices; + goto error_free_subvol_name; } dput(root); root = new_root; @@ -668,7 +668,6 @@ error_close_devices: btrfs_close_devices(fs_devices); error_free_subvol_name: kfree(subvol_name); -error: return error; } -- cgit v1.2.3 From 382279336f428c80f344edfc30d53797e3e76146 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Oct 2010 12:52:53 -0400 Subject: Btrfs: set trans to null in reserve_metadata_bytes if we commit the transaction btrfs_commit_transaction will free our trans, but because we pass trans to shrink_delalloc we could possibly have a use after free situation. So instead if we commit the transaction, set trans to null and set committed to true so we don't keep trying to commit a transaction. This fixes a panic I could reproduce at will. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 180a50146ddf..e2dfd4ab3b9b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3157,6 +3157,7 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, int retries = 0; int ret = 0; bool reserved = false; + bool committed = false; again: ret = -ENOSPC; @@ -3249,17 +3250,19 @@ again: goto out; ret = -EAGAIN; - if (trans) + if (trans || committed) goto out; - ret = -ENOSPC; trans = btrfs_join_transaction(root, 1); if (IS_ERR(trans)) goto out; ret = btrfs_commit_transaction(trans, root); - if (!ret) + if (!ret) { + trans = NULL; + committed = true; goto again; + } out: if (reserved) { -- cgit v1.2.3 From e9bb7f10d3617304ef94ff7aa8fefbce3078f08b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Oct 2010 12:55:03 -0400 Subject: Btrfs: remove warn_on from use_block_rsv Because btrfs_dirty_inode does a btrfs_join_transaction, it doesn't actually reserve space. It does this so we can try and dirty the inode quickly without having to deal with the ENOSPC problems. But if it does get back ENOSPC it handles it properly. The problem is use_block_rsv does a WARN_ON whenever this case happens, even tho btrfs_dirty_inode takes it into account and actually expects to get -ENOSPC if things are particularly tight. So instead just remove the warning. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e2dfd4ab3b9b..33785333f293 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5358,11 +5358,6 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; - WARN_ON(1); - printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", - block_rsv->size, block_rsv->reserved, - block_rsv->freed[0], block_rsv->freed[1]); - return ERR_PTR(-ENOSPC); } -- cgit v1.2.3