summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
commitc8b978188c9a0fd3d535c13debd19d522b726f1f (patch)
tree873628723fb82fe2a7c77adc65fa93eca1d61c0c
parent26ce34a9c47334ff7984769e4661b2f1883594ff (diff)
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption or the 'other' field are currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software only limit, the disk format supports u64 sized compressed extents. In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically singled threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time. Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/compression.c454
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/ctree.h99
-rw-r--r--fs/btrfs/disk-io.c18
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c27
-rw-r--r--fs/btrfs/extent_io.c411
-rw-r--r--fs/btrfs/extent_io.h17
-rw-r--r--fs/btrfs/extent_map.c9
-rw-r--r--fs/btrfs/extent_map.h6
-rw-r--r--fs/btrfs/file-item.c75
-rw-r--r--fs/btrfs/file.c263
-rw-r--r--fs/btrfs/inode.c584
-rw-r--r--fs/btrfs/ordered-data.c9
-rw-r--r--fs/btrfs/ordered-data.h10
-rw-r--r--fs/btrfs/print-tree.c7
-rw-r--r--fs/btrfs/super.c10
-rw-r--r--fs/btrfs/tree-log.c3
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/zlib.c637
22 files changed, 2315 insertions, 379 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 18f5a85b47c6..31cce5d88b1a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -501,6 +501,8 @@ config BTRFS_FS
tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
depends on EXPERIMENTAL
select LIBCRC32C
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
help
Btrfs is a new filesystem with extents, writable snapshotting,
support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142b..d2cf5a54a4b8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
- ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+ compression.o
else
# Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..c5470367ca5c
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+ /* number of bios pending for this compressed extent */
+ atomic_t pending_bios;
+
+ /* the pages with the compressed data on them */
+ struct page **compressed_pages;
+
+ /* inode that owns this data */
+ struct inode *inode;
+
+ /* starting offset in the inode for our pages */
+ u64 start;
+
+ /* number of bytes in the inode we're working on */
+ unsigned long len;
+
+ /* number of bytes on disk */
+ unsigned long compressed_len;
+
+ /* number of compressed pages in the array */
+ unsigned long nr_pages;
+
+ /* IO errors */
+ int errors;
+
+ /* for reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+};
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+ u64 first_byte, gfp_t gfp_flags)
+{
+ struct bio *bio;
+ int nr_vecs;
+
+ nr_vecs = bio_get_nr_vecs(bdev);
+ bio = bio_alloc(gfp_flags, nr_vecs);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+
+ if (bio) {
+ bio->bi_size = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_sector = first_byte >> 9;
+ }
+ return bio;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+ int ret;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ /* ok, we're the last bio for this extent, lets start
+ * the decompression.
+ */
+ inode = cb->inode;
+ tree = &BTRFS_I(inode)->io_tree;
+ ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+ cb->start,
+ cb->orig_bio->bi_io_vec,
+ cb->orig_bio->bi_vcnt,
+ cb->compressed_len);
+ if (ret)
+ cb->errors = 1;
+
+ /* release the compressed pages */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* do io completion on the original bio */
+ if (cb->errors)
+ bio_io_error(cb->orig_bio);
+ else
+ bio_endio(cb->orig_bio, 0);
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+ struct page *pages[16];
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int ret;
+
+ while(nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min(nr_pages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ nr_pages -= 1;
+ index += 1;
+ continue;
+ }
+ for (i = 0; i < ret; i++) {
+ end_page_writeback(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ }
+ /* the inode may be gone now */
+ return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ /* ok, we're the last bio for this extent, step one is to
+ * call back into the FS and do all the end_io operations
+ */
+ inode = cb->inode;
+ tree = &BTRFS_I(inode)->io_tree;
+ tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+ cb->start,
+ cb->start + cb->len - 1,
+ NULL, 1);
+
+ end_compressed_writeback(inode, cb->start, cb->len);
+ /* note, our inode could be gone now */
+
+ /*
+ * release the compressed pages, these came from alloc_page and
+ * are not attached to the inode at all
+ */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages)
+{
+ struct bio *bio = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct compressed_bio *cb;
+ unsigned long bytes_left;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int page_index = 0;
+ struct page *page;
+ u64 first_byte = disk_start;
+ struct block_device *bdev;
+ int ret;
+
+ WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+ cb = kmalloc(sizeof(*cb), GFP_NOFS);
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+ cb->start = start;
+ cb->len = len;
+ cb->compressed_pages = compressed_pages;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = NULL;
+ cb->nr_pages = nr_pages;
+
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ ret = btrfs_csum_file_bytes(root, inode, start, len);
+ BUG_ON(ret);
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ atomic_inc(&cb->pending_bios);
+
+ /* create and submit bios for the compressed pages */
+ bytes_left = compressed_len;
+ while(bytes_left > 0) {
+ page = compressed_pages[page_index];
+ page->mapping = inode->i_mapping;
+ if (bio->bi_size)
+ ret = io_tree->ops->merge_bio_hook(page, 0,
+ PAGE_CACHE_SIZE,
+ bio, 0);
+ else
+ ret = 0;
+
+ if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret);
+
+ bio_put(bio);
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ atomic_inc(&cb->pending_bios);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ page_index++;
+ bytes_left -= PAGE_CACHE_SIZE;
+ first_byte += PAGE_CACHE_SIZE;
+ }
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret);
+
+ bio_put(bio);
+ return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it. We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *em_tree;
+ struct compressed_bio *cb;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ unsigned long compressed_len;
+ unsigned long nr_pages;
+ unsigned long page_index;
+ struct page *page;
+ struct block_device *bdev;
+ struct bio *comp_bio;
+ u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+ struct extent_map *em;
+ int ret;
+
+ tree = &BTRFS_I(inode)->io_tree;
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ /* we need the actual starting offset of this extent in the file */
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree,
+ page_offset(bio->bi_io_vec->bv_page),
+ PAGE_CACHE_SIZE);
+ spin_unlock(&em_tree->lock);
+
+ cb = kmalloc(sizeof(*cb), GFP_NOFS);
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+
+ cb->start = em->start;
+ compressed_len = em->block_len;
+ free_extent_map(em);
+
+ cb->len = uncompressed_len;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = bio;
+
+ nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE;
+ cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+ GFP_NOFS);
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ for (page_index = 0; page_index < nr_pages; page_index++) {
+ cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+ __GFP_HIGHMEM);
+ }
+ cb->nr_pages = nr_pages;
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+ comp_bio->bi_private = cb;
+ comp_bio->bi_end_io = end_compressed_bio_read;
+ atomic_inc(&cb->pending_bios);
+
+ for (page_index = 0; page_index < nr_pages; page_index++) {
+ page = cb->compressed_pages[page_index];
+ page->mapping = inode->i_mapping;
+ if (comp_bio->bi_size)
+ ret = tree->ops->merge_bio_hook(page, 0,
+ PAGE_CACHE_SIZE,
+ comp_bio, 0);
+ else
+ ret = 0;
+
+ if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+ BUG_ON(ret);
+
+ bio_put(comp_bio);
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+ GFP_NOFS);
+ atomic_inc(&cb->pending_bios);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ cur_disk_byte += PAGE_CACHE_SIZE;
+ }
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+ BUG_ON(ret);
+
+ bio_put(comp_bio);
+ return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47f..793d8fdda244 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
__le32 nsec;
} __attribute__ ((__packed__));
-/*
- * there is no padding here on purpose. If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+ BTRFS_ENCRYPTION_NONE = 0,
+ BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+
struct btrfs_inode_item {
/* nfs style generation number */
__le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
__le64 rdev;
__le16 flags;
__le16 compat_flags;
+
struct btrfs_timespec atime;
struct btrfs_timespec ctime;
struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
#define BTRFS_FILE_EXTENT_INLINE 1
struct btrfs_file_extent_item {
+ /*
+ * transaction id that created this extent
+ */
__le64 generation;
+ /*
+ * max number of bytes to hold this extent in ram
+ * when we split a compressed extent we can't know how big
+ * each of the resulting pieces will be. So, this is
+ * an upper limit on the size of the extent in ram instead of
+ * an exact limit.
+ */
+ __le64 ram_bytes;
+
+ /*
+ * 32 bits for the various ways we might encode the data,
+ * including compression and encryption. If any of these
+ * are set to something a given disk format doesn't understand
+ * it is treated like an incompat flag for reading and writing,
+ * but not for stat.
+ */
+ u8 compression;
+ u8 encryption;
+ __le16 other_encoding; /* spare for later use */
+
+ /* are we inline data or a real extent? */
u8 type;
+
/*
* disk space consumed by the extent, checksum blocks are included
* in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
*/
__le64 offset;
/*
- * the logical number of file blocks (no csums included)
+ * the logical number of file blocks (no csums included). This
+ * always reflects the size uncompressed and without encoding.
*/
__le64 num_bytes;
+
} __attribute__ ((__packed__));
struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
#define BTRFS_MOUNT_NOBARRIER (1 << 2)
#define BTRFS_MOUNT_SSD (1 << 3)
#define BTRFS_MOUNT_DEGRADED (1 << 4)
+#define BTRFS_MOUNT_COMPRESS (1 << 5)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
#define BTRFS_INODE_NODATASUM (1 << 0)
#define BTRFS_INODE_NODATACOW (1 << 1)
#define BTRFS_INODE_READONLY (1 << 2)
+#define BTRFS_INODE_NOCOMPRESS (1 << 3)
#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
~BTRFS_INODE_##flag)
#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
}
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
- struct btrfs_item *e)
-{
- unsigned long offset;
- offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
- return btrfs_item_size(eb, e) - offset;
-}
-
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
disk_bytenr, 64);
BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
offset, 64);
BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+ ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+ compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+ struct btrfs_file_extent_item *e)
+{
+ return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers. If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+ struct btrfs_item *e)
+{
+ unsigned long offset;
+ offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+ return btrfs_item_size(eb, e) - offset;
+}
static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
{
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos, u64 disk_offset,
- u64 disk_num_bytes,
- u64 num_bytes, u64 offset);
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+ u64 start, unsigned long len);
struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
int namelen);
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio);
+ size_t size, struct bio *bio, unsigned long bio_flags);
unsigned long btrfs_force_ra(struct address_space *mapping,
struct file_ra_state *ra, struct file *file,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb6194..dc95f636a11b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
extent_submit_bio_hook_t *submit_bio_hook;
int rw;
int mirror_num;
+ unsigned long bio_flags;
struct btrfs_work work;
};
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
}
em->start = 0;
em->len = (u64)-1;
+ em->block_len = (u64)-1;
em->block_start = 0;
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
wake_up(&fs_info->async_submit_wait);
async->submit_bio_hook(async->inode, async->rw, async->bio,
- async->mirror_num);
+ async->mirror_num, async->bio_flags);
kfree(async);
}
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
extent_submit_bio_hook_t *submit_bio_hook)
{
struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_hook = submit_bio_hook;
async->work.func = run_one_async_submit;
async->work.flags = 0;
+ async->bio_flags = bio_flags;
while(atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
}
static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num)
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num)
+ int mirror_num, unsigned long bio_flags)
{
/*
* kthread helpers are used to submit writes so that checksumming
* can happen in parallel across all CPUs
*/
if (!(rw & (1 << BIO_RW))) {
- return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+ return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
}
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, mirror_num,
+ inode, rw, bio, mirror_num, 0,
__btree_submit_bio_hook);
}
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->btree_inode = new_inode(sb);
fs_info->btree_inode->i_ino = 1;
fs_info->btree_inode->i_nlink = 1;
+
fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
*/
btrfs_init_workers(&fs_info->workers, "worker",
fs_info->thread_pool_size);
+
btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+ fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+ 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
nodesize = btrfs_super_nodesize(disk_super);
leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbbb..4eb1f1408d21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
extent_submit_bio_hook_t *submit_bio_hook);
int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6d..bbf04e80a1a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
em->start = extent_key->objectid - offset;
em->len = extent_key->offset;
+ em->block_len = extent_key->offset;
em->block_start = extent_key->objectid;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
};
struct disk_extent {
+ u64 ram_bytes;
u64 disk_bytenr;
u64 disk_num_bytes;
u64 offset;
u64 num_bytes;
+ u8 compression;
+ u8 encryption;
+ u16 other_encoding;
};
static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
btrfs_file_extent_disk_num_bytes(leaf, fi);
exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+ exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+ exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+ exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+ fi);
WARN_ON(exts[nr].offset > 0);
WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
@@ -3846,6 +3856,8 @@ next:
new_extents[0].disk_bytenr);
btrfs_set_file_extent_disk_num_bytes(leaf, fi,
new_extents[0].disk_num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi,
+ new_extents[0].ram_bytes);
ext_offset += new_extents[0].offset;
btrfs_set_file_extent_offset(leaf, fi, ext_offset);
btrfs_mark_buffer_dirty(leaf);