summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/squashfs.txt225
-rw-r--r--MAINTAINERS7
-rw-r--r--fs/Kconfig52
-rw-r--r--fs/Makefile1
-rw-r--r--fs/squashfs/Makefile8
-rw-r--r--fs/squashfs/block.c274
-rw-r--r--fs/squashfs/cache.c412
-rw-r--r--fs/squashfs/dir.c235
-rw-r--r--fs/squashfs/export.c155
-rw-r--r--fs/squashfs/file.c502
-rw-r--r--fs/squashfs/fragment.c98
-rw-r--r--fs/squashfs/id.c94
-rw-r--r--fs/squashfs/inode.c346
-rw-r--r--fs/squashfs/namei.c242
-rw-r--r--fs/squashfs/squashfs.h90
-rw-r--r--fs/squashfs/squashfs_fs.h381
-rw-r--r--fs/squashfs/squashfs_fs_i.h45
-rw-r--r--fs/squashfs/squashfs_fs_sb.h76
-rw-r--r--fs/squashfs/super.c440
-rw-r--r--fs/squashfs/symlink.c118
-rw-r--r--init/do_mounts_rd.c14
21 files changed, 3815 insertions, 0 deletions
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
new file mode 100644
index 000000000000..3e79e4a7a392
--- /dev/null
+++ b/Documentation/filesystems/squashfs.txt
@@ -0,0 +1,225 @@
+SQUASHFS 4.0 FILESYSTEM
+=======================
+
+Squashfs is a compressed read-only filesystem for Linux.
+It uses zlib compression to compress files, inodes and directories.
+Inodes in the system are very small and all blocks are packed to minimise
+data overhead. Block sizes greater than 4K are supported up to a maximum
+of 1Mbytes (default block size 128K).
+
+Squashfs is intended for general read-only filesystem use, for archival
+use (i.e. in cases where a .tar.gz file may be used), and in constrained
+block device/memory systems (e.g. embedded systems) where low overhead is
+needed.
+
+Mailing list: squashfs-devel@lists.sourceforge.net
+Web site: www.squashfs.org
+
+1. FILESYSTEM FEATURES
+----------------------
+
+Squashfs filesystem features versus Cramfs:
+
+ Squashfs Cramfs
+
+Max filesystem size: 2^64 16 MiB
+Max file size: ~ 2 TiB 16 MiB
+Max files: unlimited unlimited
+Max directories: unlimited unlimited
+Max entries per directory: unlimited unlimited
+Max block size: 1 MiB 4 KiB
+Metadata compression: yes no
+Directory indexes: yes no
+Sparse file support: yes no
+Tail-end packing (fragments): yes no
+Exportable (NFS etc.): yes no
+Hard link support: yes no
+"." and ".." in readdir: yes no
+Real inode numbers: yes no
+32-bit uids/gids: yes no
+File creation time: yes no
+Xattr and ACL support: no no
+
+Squashfs compresses data, inodes and directories. In addition, inode and
+directory data are highly compacted, and packed on byte boundaries. Each
+compressed inode is on average 8 bytes in length (the exact length varies on
+file type, i.e. regular file, directory, symbolic link, and block/char device
+inodes have different sizes).
+
+2. USING SQUASHFS
+-----------------
+
+As squashfs is a read-only filesystem, the mksquashfs program must be used to
+create populated squashfs filesystems. This and other squashfs utilities
+can be obtained from http://www.squashfs.org. Usage instructions can be
+obtained from this site also.
+
+
+3. SQUASHFS FILESYSTEM DESIGN
+-----------------------------
+
+A squashfs filesystem consists of seven parts, packed together on a byte
+alignment:
+
+ ---------------
+ | superblock |
+ |---------------|
+ | datablocks |
+ | & fragments |
+ |---------------|
+ | inode table |
+ |---------------|
+ | directory |
+ | table |
+ |---------------|
+ | fragment |
+ | table |
+ |---------------|
+ | export |
+ | table |
+ |---------------|
+ | uid/gid |
+ | lookup table |
+ ---------------
+
+Compressed data blocks are written to the filesystem as files are read from
+the source directory, and checked for duplicates. Once all file data has been
+written the completed inode, directory, fragment, export and uid/gid lookup
+tables are written.
+
+3.1 Inodes
+----------
+
+Metadata (inodes and directories) are compressed in 8Kbyte blocks. Each
+compressed block is prefixed by a two byte length, the top bit is set if the
+block is uncompressed. A block will be uncompressed if the -noI option is set,
+or if the compressed block was larger than the uncompressed block.
+
+Inodes are packed into the metadata blocks, and are not aligned to block
+boundaries, therefore inodes overlap compressed blocks. Inodes are identified
+by a 48-bit number which encodes the location of the compressed metadata block
+containing the inode, and the byte offset into that block where the inode is
+placed (<block, offset>).
+
+To maximise compression there are different inodes for each file type
+(regular file, directory, device, etc.), the inode contents and length
+varying with the type.
+
+To further maximise compression, two types of regular file inode and
+directory inode are defined: inodes optimised for frequently occurring
+regular files and directories, and extended types where extra
+information has to be stored.
+
+3.2 Directories
+---------------
+
+Like inodes, directories are packed into compressed metadata blocks, stored
+in a directory table. Directories are accessed using the start address of
+the metablock containing the directory and the offset into the
+decompressed block (<block, offset>).
+
+Directories are organised in a slightly complex way, and are not simply
+a list of file names. The organisation takes advantage of the
+fact that (in most cases) the inodes of the files will be in the same
+compressed metadata block, and therefore, can share the start block.
+Directories are therefore organised in a two level list, a directory
+header containing the shared start block value, and a sequence of directory
+entries, each of which share the shared start block. A new directory header
+is written once/if the inode start block changes. The directory
+header/directory entry list is repeated as many times as necessary.
+
+Directories are sorted, and can contain a directory index to speed up
+file lookup. Directory indexes store one entry per metablock, each entry
+storing the index/filename mapping to the first directory header
+in each metadata block. Directories are sorted in alphabetical order,
+and at lookup the index is scanned linearly looking for the first filename
+alphabetically larger than the filename being looked up. At this point the
+location of the metadata block the filename is in has been found.
+The general idea of the index is ensure only one metadata block needs to be
+decompressed to do a lookup irrespective of the length of the directory.
+This scheme has the advantage that it doesn't require extra memory overhead
+and doesn't require much extra storage on disk.
+
+3.3 File data
+-------------
+
+Regular files consist of a sequence of contiguous compressed blocks, and/or a
+compressed fragment block (tail-end packed block). The compressed size
+of each datablock is stored in a block list contained within the
+file inode.
+
+To speed up access to datablocks when reading 'large' files (256 Mbytes or
+larger), the code implements an index cache that caches the mapping from
+block index to datablock location on disk.
+
+The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
+retaining a simple and space-efficient block list on disk. The cache
+is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
+Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
+The index cache is designed to be memory efficient, and by default uses
+16 KiB.
+
+3.4 Fragment lookup table
+-------------------------
+
+Regular files can contain a fragment index which is mapped to a fragment
+location on disk and compressed size using a fragment lookup table. This
+fragment lookup table is itself stored compressed into metadata blocks.
+A second index table is used to locate these. This second index table for
+speed of access (and because it is small) is read at mount time and cached
+in memory.
+
+3.5 Uid/gid lookup table
+------------------------
+
+For space efficiency regular files store uid and gid indexes, which are
+converted to 32-bit uids/gids using an id look up table. This table is
+stored compressed into metadata blocks. A second index table is used to
+locate these. This second index table for speed of access (and because it
+is small) is read at mount time and cached in memory.
+
+3.6 Export table
+----------------
+
+To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems
+can optionally (disabled with the -no-exports Mksquashfs option) contain
+an inode number to inode disk location lookup table. This is required to
+enable Squashfs to map inode numbers passed in filehandles to the inode
+location on disk, which is necessary when the export code reinstantiates
+expired/flushed inodes.
+
+This table is stored compressed into metadata blocks. A second index table is
+used to locate these. This second index table for speed of access (and because
+it is small) is read at mount time and cached in memory.
+
+
+4. TODOS AND OUTSTANDING ISSUES
+-------------------------------
+
+4.1 Todo list
+-------------
+
+Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks
+for these but the code has not been written. Once the code has been written
+the existing layout should not require modification.
+
+4.2 Squashfs internal cache
+---------------------------
+
+Blocks in Squashfs are compressed. To avoid repeatedly decompressing
+recently accessed data Squashfs uses two small metadata and fragment caches.
+
+The cache is not used for file datablocks, these are decompressed and cached in
+the page-cache in the normal way. The cache is used to temporarily cache
+fragment and metadata blocks which have been read as a result of a metadata
+(i.e. inode or directory) or fragment access. Because metadata and fragments
+are packed together into blocks (to gain greater compression) the read of a
+particular piece of metadata or fragment will retrieve other metadata/fragments
+which have been packed with it, these because of locality-of-reference may be
+read in the near future. Temporarily caching them ensures they are available
+for near future access without requiring an additional read and decompress.
+
+In the future this internal cache may be replaced with an implementation which
+uses the kernel page cache. Because the page cache operates on page sized
+units this may introduce additional complexity in terms of locking and
+associated race conditions.
diff --git a/MAINTAINERS b/MAINTAINERS
index 57e0309243cc..6f65a269cb17 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4081,6 +4081,13 @@ L: cbe-oss-dev@ozlabs.org
W: http://www.ibm.com/developerworks/power/cell/
S: Supported
+SQUASHFS FILE SYSTEM
+P: Phillip Lougher
+M: phillip@lougher.demon.co.uk
+L: squashfs-devel@lists.sourceforge.net (subscribers-only)
+W: http://squashfs.org.uk
+S: Maintained
+
SRM (Alpha) environment access
P: Jan-Benedict Glaw
M: jbglaw@lug-owl.de
diff --git a/fs/Kconfig b/fs/Kconfig
index 02cff86af1b4..51307b0fdf0f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -932,6 +932,58 @@ config CRAMFS
If unsure, say N.
+config SQUASHFS
+ tristate "SquashFS 4.0 - Squashed file system support"
+ depends on BLOCK
+ select ZLIB_INFLATE
+ help
+ Saying Y here includes support for SquashFS 4.0 (a Compressed
+ Read-Only File System). Squashfs is a highly compressed read-only
+ filesystem for Linux. It uses zlib compression to compress both
+ files, inodes and directories. Inodes in the system are very small
+ and all blocks are packed to minimise data overhead. Block sizes
+ greater than 4K are supported up to a maximum of 1 Mbytes (default
+ block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
+ (larger than 4GB), full uid/gid information, hard links and
+ timestamps.
+
+ Squashfs is intended for general read-only filesystem use, for
+ archival use (i.e. in cases where a .tar.gz file may be used), and in
+ embedded systems where low overhead is needed. Further information
+ and tools are available from http://squashfs.sourceforge.net.
+
+ If you want to compile this as a module ( = code which can be
+ inserted in and removed from the running kernel whenever you want),
+ say M here and read <file:Documentation/modules.txt>. The module
+ will be called squashfs. Note that the root file system (the one
+ containing the directory /) cannot be compiled as a module.
+
+ If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+
+ bool "Additional option for memory-constrained systems"
+ depends on SQUASHFS
+ default n
+ help
+ Saying Y here allows you to specify cache size.
+
+ If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+ int "Number of fragments cached" if SQUASHFS_EMBEDDED
+ depends on SQUASHFS
+ default "3"
+ help
+ By default SquashFS caches the last 3 fragments read from
+ the filesystem. Increasing this amount may mean SquashFS
+ has to re-read fragments less often from disk, at the expense
+ of extra system memory. Decreasing this amount will mean
+ SquashFS uses less memory at the expense of extra reads from disk.
+
+ Note there must be at least one cached fragment. Anything
+ much more than three will probably not make much difference.
+
config VXFS_FS
tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
depends on BLOCK
diff --git a/fs/Makefile b/fs/Makefile
index bc4e14df1082..38bc735c67ad 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_EXT2_FS) += ext2/
obj-$(CONFIG_CRAMFS) += cramfs/
+obj-$(CONFIG_SQUASHFS) += squashfs/
obj-y += ramfs/
obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
obj-$(CONFIG_CODA_FS) += coda/
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
new file mode 100644
index 000000000000..8258cf9a0317
--- /dev/null
+++ b/fs/squashfs/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the linux squashfs routines.
+#
+
+obj-$(CONFIG_SQUASHFS) += squashfs.o
+squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
+squashfs-y += namei.o super.o symlink.o
+#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
new file mode 100644
index 000000000000..c837dfc2b3c6
--- /dev/null
+++ b/fs/squashfs/block.c
@@ -0,0 +1,274 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * block.c
+ */
+
+/*
+ * This file implements the low-level routines to read and decompress
+ * datablocks and metadata blocks.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Read the metadata block length, this is stored in the first two
+ * bytes of the metadata block.
+ */
+static struct buffer_head *get_block_length(struct super_block *sb,
+ u64 *cur_index, int *offset, int *length)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ struct buffer_head *bh;
+
+ bh = sb_bread(sb, *cur_index);
+ if (bh == NULL)
+ return NULL;
+
+ if (msblk->devblksize - *offset == 1) {
+ *length = (unsigned char) bh->b_data[*offset];
+ put_bh(bh);
+ bh = sb_bread(sb, ++(*cur_index));
+ if (bh == NULL)
+ return NULL;
+ *length |= (unsigned char) bh->b_data[0] << 8;
+ *offset = 1;
+ } else {
+ *length = (unsigned char) bh->b_data[*offset] |
+ (unsigned char) bh->b_data[*offset + 1] << 8;
+ *offset += 2;
+ }
+
+ return bh;
+}
+
+
+/*
+ * Read and decompress a metadata block or datablock. Length is non-zero
+ * if a datablock is being read (the size is stored elsewhere in the
+ * filesystem), otherwise the length is obtained from the first two bytes of
+ * the metadata block. A bit in the length field indicates if the block
+ * is stored uncompressed in the filesystem (usually because compression
+ * generated a larger block - this does occasionally happen with zlib).
+ */
+int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
+ int length, u64 *next_index, int srclength)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ struct buffer_head **bh;
+ int offset = index & ((1 << msblk->devblksize_log2) - 1);
+ u64 cur_index = index >> msblk->devblksize_log2;
+ int bytes, compressed, b = 0, k = 0, page = 0, avail;
+
+
+ bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
+ sizeof(*bh), GFP_KERNEL);
+ if (bh == NULL)
+ return -ENOMEM;
+
+ if (length) {
+ /*
+ * Datablock.
+ */
+ bytes = -offset;
+ compressed = SQUASHFS_COMPRESSED_BLOCK(length);
+ length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
+ if (next_index)
+ *next_index = index + length;
+
+ TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
+ index, compressed ? "" : "un", length, srclength);
+
+ if (length < 0 || length > srclength ||
+ (index + length) > msblk->bytes_used)
+ goto read_failure;
+
+ for (b = 0; bytes < length; b++, cur_index++) {
+ bh[b] = sb_getblk(sb, cur_index);
+ if (bh[b] == NULL)
+ goto block_release;
+ bytes += msblk->devblksize;
+ }
+ ll_rw_block(READ, b, bh);
+ } else {
+ /*
+ * Metadata block.
+ */
+ if ((index + 2) > msblk->bytes_used)
+ goto read_failure;
+
+ bh[0] = get_block_length(sb, &cur_index, &offset, &length);
+ if (bh[0] == NULL)
+ goto read_failure;
+ b = 1;
+
+ bytes = msblk->devblksize - offset;
+ compressed = SQUASHFS_COMPRESSED(length);
+ length = SQUASHFS_COMPRESSED_SIZE(length);
+ if (next_index)
+ *next_index = index + length + 2;
+
+ TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
+ compressed ? "" : "un", length);
+
+ if (length < 0 || length > srclength ||
+ (index + length) > msblk->bytes_used)
+ goto block_release;
+
+ for (; bytes < length; b++) {
+ bh[b] = sb_getblk(sb, ++cur_index);
+ if (bh[b] == NULL)
+ goto block_release;
+ bytes += msblk->devblksize;
+ }
+ ll_rw_block(READ, b - 1, bh + 1);
+ }
+
+ if (compressed) {
+ int zlib_err = 0, zlib_init = 0;
+
+ /*
+ * Uncompress block.
+ */
+
+ mutex_lock(&msblk->read_data_mutex);
+
+ msblk->stream.avail_out = 0;
+ msblk->stream.avail_in = 0;
+
+ bytes = length;
+ do {
+ if (msblk->stream.avail_in == 0 && k < b) {
+ avail = min(bytes, msblk->devblksize - offset);
+ bytes -= avail;
+ wait_on_buffer(bh[k]);
+ if (!buffer_uptodate(bh[k]))
+ goto release_mutex;
+
+ if (avail == 0) {
+ offset = 0;
+ put_bh(bh[k++]);
+ continue;
+ }
+
+ msblk->stream.next_in = bh[k]->b_data + offset;
+ msblk->stream.avail_in = avail;
+ offset = 0;
+ }
+
+ if (msblk->stream.avail_out == 0) {
+ msblk->stream.next_out = buffer[page++];
+ msblk->stream.avail_out = PAGE_CACHE_SIZE;
+ }
+
+ if (!zlib_init) {
+ zlib_err = zlib_inflateInit(&msblk->stream);
+ if (zlib_err != Z_OK) {
+ ERROR("zlib_inflateInit returned"
+ " unexpected result 0x%x,"
+ " srclength %d\n", zlib_err,
+ srclength);
+ goto release_mutex;
+ }
+ zlib_init = 1;
+ }
+
+ zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
+
+ if (msblk->stream.avail_in == 0 && k < b)
+ put_bh(bh[k++]);
+ } while (zlib_err == Z_OK);
+
+ if (zlib_err != Z_STREAM_END) {
+ ERROR("zlib_inflate returned unexpected result"
+ " 0x%x, srclength %d, avail_in %d,"
+ " avail_out %d\n", zlib_err, srclength,
+ msblk->stream.avail_in,
+ msblk->stream.avail_out);
+ goto release_mutex;
+ }
+
+ zlib_err = zlib_inflateEnd(&msblk->stream);
+ if (zlib_err != Z_OK) {
+ ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
+ " srclength %d\n", zlib_err, srclength);
+ goto release_mutex;
+ }
+ length = msblk->stream.total_out;
+ mutex_unlock(&msblk->read_data_mutex);
+ } else {
+ /*
+ * Block is uncompressed.
+ */
+ int i, in, pg_offset = 0;
+
+ for (i = 0; i < b; i++) {
+ wait_on_buffer(bh[i]);
+ if (!buffer_uptodate(bh[i]))
+ goto block_release;
+ }
+
+ for (bytes = length; k < b; k++) {
+ in = min(bytes, msblk->devblksize - offset);
+ bytes -= in;
+ while (in) {
+ if (pg_offset == PAGE_CACHE_SIZE) {
+ page++;
+ pg_offset = 0;
+ }
+ avail = min_t(int, in, PAGE_CACHE_SIZE -
+ pg_offset);
+ memcpy(buffer[page] + pg_offset,
+ bh[k]->b_data + offset, avail);
+ in -= avail;
+ pg_offset += avail;
+ offset += avail;
+ }
+ offset = 0;
+ put_bh(bh[k]);
+ }
+ }
+
+ kfree(bh);
+ return length;
+
+release_mutex:
+ mutex_unlock(&msblk->read_data_mutex);
+
+block_release:
+ for (; k < b; k++)
+ put_bh(bh[k]);
+
+read_failure:
+ ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
+ kfree(bh);
+ return -EIO;
+}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 000000000000..f29eda16d25e
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,412 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * cache.c
+ */
+
+/*
+ * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
+ * recently accessed data Squashfs uses two small metadata and fragment caches.
+ *
+ * This file implements a generic cache implementation used for both caches,
+ * plus functions layered ontop of the generic cache implementation to
+ * access the metadata and fragment caches.
+ *
+ * To avoid out of memory and fragmentation isssues with vmalloc the cache
+ * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
+ *
+ * It should be noted that the cache is not used for file datablocks, these
+ * are decompressed and cached in the page-cache in the normal way. The
+ * cache is only used to temporarily cache fragment and metadata blocks
+ * which have been read as as a result of a metadata (i.e. inode or
+ * directory) or fragment access. Because metadata and fragments are packed
+ * together into blocks (to gain greater compression) the read of a particular
+ * piece of metadata or fragment will retrieve other metadata/fragments which
+ * have been packed with it, these because of locality-of-reference may be read
+ * in the near future. Temporarily caching them ensures they are available for
+ * near future access without requiring an additional read and decompress.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/zlib.h>
+#include <linux/pagemap.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look-up block in cache, and increment usage count. If not in cache, read
+ * and decompress it from disk.
+ */
+struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
+ struct squashfs_cache *cache, u64 block, int length)
+{
+ int i, n;
+ struct squashfs_cache_entry *entry;
+
+ spin_lock(&cache->lock);
+
+ while (1) {
+ for (i = 0; i < cache->entries; i++)
+ if (cache->entry[i].block == block)
+ break;
+
+ if (i == cache->entries) {
+ /*
+ * Block not in cache, if all cache entries are used
+ * go to sleep waiting for one to become available.
+ */
+ if (cache->unused == 0) {
+ cache->num_waiters++;
+ spin_unlock(&cache->lock);
+ wait_event(cache->wait_queue, cache->unused);
+ spin_lock(&cache->lock);
+ cache->num_waiters--;
+ continue;
+ }
+
+ /*
+ * At least one unused cache entry. A simple
+ * round-robin strategy is used to choose the entry to
+ * be evicted from the cache.
+ */
+ i = cache->next_blk;
+ for (n = 0; n < cache->entries; n++) {
+ if (cache->entry[i].refcount == 0)
+ break;
+ i = (i + 1) % cache->entries;
+ }
+
+ cache->next_blk = (i + 1) % cache->entries;
+ entry = &cache->entry[i];
+
+ /*
+ * Initialise choosen cache entry, and fill it in from
+ * disk.
+ */
+ cache->unused--;
+ entry->block = block;
+ entry->refcount = 1;
+ entry->pending = 1;
+ entry->num_waiters = 0;
+ entry->error = 0;
+ spin_unlock(&cache->lock);
+
+ entry->length = squashfs_read_data(sb, entry->data,
+ block, length, &entry->next_index,
+ cache->block_size);
+
+ spin_lock(&cache->lock);
+
+ if (entry->length < 0)
+ entry->error = entry->length;
+
+ entry->pending = 0;
+
+ /*
+ * While filling this entry one or more other processes
+ * have looked it up in the cache, and have slept
+ * waiting for it to become available.
+ */
+ if (entry->num_waiters) {
+ spin_unlock(&cache->lock);
+ wake_up_all(&entry->wait_queue);
+ } else
+ spin_unlock(&cache->lock);
+
+ goto out;
+ }
+
+ /*
+ * Block already in cache. Increment refcount so it doesn't
+ * get reused until we're finished with it, if it was
+ * previously unused there's one less cache entry available
+ * for reuse.
+ */
+ entry = &cache->entry[i];
+ if (entry->refcount == 0)
+ cache->unused--;
+ entry->refcount++;
+
+ /*
+ * If the entry is currently being filled in by another process
+ * go to sleep waiting for it to become available.
+ */
+ if (entry->pending) {
+ entry->num_waiters++;
+ spin_unlock(&cache->lock);
+ wait_event(entry->wait_queue, !entry->pending);
+ } else
+ spin_unlock(&cache->lock);
+
+ goto out;
+ }
+
+out:
+ TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
+ cache->name, i, entry->block, entry->refcount, entry->error);
+
+ if (entry->error)
+ ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
+ block);
+ return entry;
+}
+
+
+/*
+ * Release cache entry, once usage count is zero it can be reused.
+ */
+void squashfs_cache_put(struct squashfs_cache_entry *entry)
+{
+ struct squashfs_cache *cache = entry->cache;
+
+ spin_lock(&cache->lock);
+ entry->refcount--;
+ if (entry->refcount == 0) {
+ cache->unused++;
+ /*
+ * If there's any processes waiting for a block to become
+ * available, wake one up.
+ */
+ if (cache->num_waiters) {
+ spin_unlock(&cache->lock);
+ wake_up(&cache->wait_queue);
+ return;
+ }
+ }
+ spin_unlock(&cache->lock);
+}
+
+/*
+ * Delete cache reclaiming all kmalloced buffers.
+ */
+void squashfs_cache_delete(struct squashfs_cache *cache)
+{
+ int i, j;
+
+ if (cache == NULL)
+ return;
+
+ for (i = 0; i < cache->entries; i++) {
+ if (cache->entry[i].data) {
+ for (j = 0; j < cache->pages; j++)
+ kfree(cache->entry[i].data[j]);
+ kfree(cache->entry[i].data);
+ }
+ }
+
+ kfree(cache->entry);
+ kfree(cache);
+}
+
+
+/*
+ * Initialise cache allocating the specified number of entries, each of
+ * size block_size. To avoid vmalloc fragmentation issues each entry
+ * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
+ */
+struct squashfs_cache *squashfs_cache_init(char *name, int entries,
+ int block_size)
+{
+ int i, j;
+ struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+
+ if (cache == NULL) {
+ ERROR("Failed to allocate %s cache\n", name);
+ return NULL;
+ }
+
+ cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
+ if (cache->entry == NULL) {
+ ERROR("Failed to allocate %s cache\n", name);
+ goto cleanup;
+ }
+
+ cache->next_blk = 0;
+ cache->unused = entries;
+ cache->entries = entries;
+ cache->block_size = block_size;
+ cache->pages = block_size >> PAGE_CACHE_SHIFT;
+ cache->name = name;
+ cache->num_waiters = 0;
+ spin_lock_init(&cache->lock);
+ init_waitqueue_head(&cache->wait_queue);
+
+ for (i = 0; i < entries; i++) {
+ struct squashfs_cache_entry *entry = &cache->entry[i];
+
+ init_waitqueue_head(&cache->entry[i].wait_queue);
+ entry->cache = cache;
+ entry->block = SQUASHFS_INVALID_BLK;
+ entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
+ if (entry->data == NULL) {
+ ERROR("Failed to allocate %s cache entry\n", name);
+ goto cleanup;
+ }
+
+ for (j = 0; j < cache->pages; j++) {
+ entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ if (entry->data[j] == NULL) {
+ ERROR("Failed to allocate %s buffer\n", name);
+ goto cleanup;
+ }
+ }
+ }
+
+ return cache;
+
+cleanup:
+ squashfs_cache_delete(cache);
+ return NULL;
+}
+
+
+/*
+ * Copy upto length bytes from cache entry to buffer starting at offset bytes
+ * into the cache entry. If there's not length bytes then copy the number of
+ * bytes available. In all cases return the number of bytes copied.
+ */
+int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
+ int offset, int length)
+{
+ int remaining = length;
+
+ if (length == 0)
+ return 0;
+ else if (buffer == NULL)
+ return min(length, entry->length - offset);
+
+ while (offset < entry->length) {
+ void *buff = entry->data[offset / PAGE_CACHE_SIZE]
+ + (offset % PAGE_CACHE_SIZE);
+ int bytes = min_t(int, entry->length - offset,
+ PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
+
+ if (bytes >= remaining) {
+ memcpy(buffer, buff, remaining);
+ remaining = 0;
+ break;
+ }
+
+ memcpy(buffer, buff, bytes);
+ buffer += bytes;
+ remaining -= bytes;
+ offset += bytes;
+ }
+
+ return length - remaining;
+}
+
+
+/*
+ * Read length bytes from metadata position <block, offset> (block is the
+ * start of the compressed block on disk, and offset is the offset into
+ * the block once decompressed). Data is packed into consecutive blocks,
+ * and length bytes may require reading more than one block.
+ */
+int squashfs_read_metadata(struct super_block *sb, void *buffer,
+ u64 *block, int *offset, int length)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int bytes, copied = length;
+ struct squashfs_cache_entry *entry;