summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/nfs/00-INDEX2
-rw-r--r--Documentation/filesystems/nfs/pnfs.txt48
-rw-r--r--fs/nfs/Kconfig8
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback_proc.c8
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/inode.c3
-rw-r--r--fs/nfs/nfs4filelayout.c280
-rw-r--r--fs/nfs/nfs4filelayout.h94
-rw-r--r--fs/nfs/nfs4filelayoutdev.c448
-rw-r--r--fs/nfs/nfs4proc.c218
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfs4xdr.c360
-rw-r--r--fs/nfs/pnfs.c783
-rw-r--r--fs/nfs/pnfs.h189
-rw-r--r--fs/nfs/read.c3
-rw-r--r--fs/nfsd/nfs4callback.c1
-rw-r--r--include/linux/nfs4.h62
-rw-r--r--include/linux/nfs_fs.h5
-rw-r--r--include/linux/nfs_fs_sb.h3
-rw-r--r--include/linux/nfs_xdr.h50
-rw-r--r--include/linux/sunrpc/xdr.h7
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c2
24 files changed, 2550 insertions, 45 deletions
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
index 3225a5662114..a57e12411d2a 100644
--- a/Documentation/filesystems/nfs/00-INDEX
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -12,6 +12,8 @@ nfs-rdma.txt
- how to install and setup the Linux NFS/RDMA client and server software
nfsroot.txt
- short guide on setting up a diskless box with NFS root filesystem.
+pnfs.txt
+ - short explanation of some of the internals of the pnfs client code
rpc-cache.txt
- introduction to the caching mechanisms in the sunrpc layer.
idmapper.txt
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
new file mode 100644
index 000000000000..bc0b9cfe095b
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -0,0 +1,48 @@
+Reference counting in pnfs:
+==========================
+
+The are several inter-related caches. We have layouts which can
+reference multiple devices, each of which can reference multiple data servers.
+Each data server can be referenced by multiple devices. Each device
+can be referenced by multiple layouts. To keep all of this straight,
+we need to reference count.
+
+
+struct pnfs_layout_hdr
+----------------------
+The on-the-wire command LAYOUTGET corresponds to struct
+pnfs_layout_segment, usually referred to by the variable name lseg.
+Each nfs_inode may hold a pointer to a cache of of these layout
+segments in nfsi->layout, of type struct pnfs_layout_hdr.
+
+We reference the header for the inode pointing to it, across each
+outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN,
+LAYOUTCOMMIT), and for each lseg held within.
+
+Each header is also (when non-empty) put on a list associated with
+struct nfs_client (cl_layouts). Being put on this list does not bump
+the reference count, as the layout is kept around by the lseg that
+keeps it in the list.
+
+deviceid_cache
+--------------
+lsegs reference device ids, which are resolved per nfs_client and
+layout driver type. The device ids are held in a RCU cache (struct
+nfs4_deviceid_cache). The cache itself is referenced across each
+mount. The entries (struct nfs4_deviceid) themselves are held across
+the lifetime of each lseg referencing them.
+
+RCU is used because the deviceid is basically a write once, read many
+data structure. The hlist size of 32 buckets needs better
+justification, but seems reasonable given that we can have multiple
+deviceid's per filesystem, and multiple filesystems per nfs_client.
+
+The hash code is copied from the nfsd code base. A discussion of
+hashing and variations of this algorithm can be found at:
+http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809
+
+data server cache
+-----------------
+file driver devices refer to data servers, which are kept in a module
+level cache. Its reference is held over the lifetime of the deviceid
+pointing to it.
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 5c55c26af165..fd667652c502 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -77,13 +77,17 @@ config NFS_V4
config NFS_V4_1
bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
- depends on NFS_V4 && EXPERIMENTAL
+ depends on NFS_FS && NFS_V4 && EXPERIMENTAL
+ select PNFS_FILE_LAYOUT
help
This option enables support for minor version 1 of the NFSv4 protocol
- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
+ (RFC 5661) in the kernel's NFS client.
If unsure, say N.
+config PNFS_FILE_LAYOUT
+ tristate
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639eac..4776ff9e3814 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
delegation.o idmap.o \
callback.o callback_xdr.o callback_proc.o \
nfs4namespace.o
+nfs-$(CONFIG_NFS_V4_1) += pnfs.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 930d10fecdaf..2950fca0c61b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
if (delegation == NULL)
return 0;
- /* seqid is 4-bytes long */
- if (((u32 *) &stateid->data)[0] != 0)
+ if (stateid->stateid.seqid != 0)
return 0;
- if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
- sizeof(stateid->data)-4))
+ if (memcmp(&delegation->stateid.stateid.other,
+ &stateid->stateid.other,
+ NFS4_STATEID_OTHER_SIZE))
return 0;
return 1;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a882785eba41..fd6f0a70021b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_CLIENT
@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
cred = rpc_lookup_machine_cred();
if (!IS_ERR(cred))
clp->cl_machine_cred = cred;
-
+#if defined(CONFIG_NFS_V4_1)
+ INIT_LIST_HEAD(&clp->cl_layouts);
+#endif
nfs_fscache_get_client_cookie(clp);
return clp;
@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
nfs_free_client(clp);
}
}
+EXPORT_SYMBOL_GPL(nfs_put_client);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
/*
@@ -900,6 +904,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ set_pnfs_layoutdriver(server, fsinfo->layouttype);
+
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
@@ -939,6 +945,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
}
fsinfo.fattr = fattr;
+ fsinfo.layouttype = 0;
error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
if (error < 0)
goto out_error;
@@ -1021,6 +1028,7 @@ void nfs_free_server(struct nfs_server *server)
{
dprintk("--> nfs_free_server()\n");
+ unset_pnfs_layoutdriver(server);
spin_lock(&nfs_client_lock);
list_del(&server->client_link);
list_del(&server->master_link);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e18c31e08a28..e756075637b0 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_FILE
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
+ pnfs_update_layout(mapping->host,
+ nfs_file_open_context(file),
+ IOMODE_RW);
+
start:
/*
* Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6eec28656415..314f57164602 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
#include "internal.h"
#include "fscache.h"
#include "dns_resolve.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -1410,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode)
{
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
+ pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
/* First call standard NFS clear_inode() code */
@@ -1447,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
nfsi->delegation = NULL;
nfsi->delegation_state = 0;
init_rwsem(&nfsi->rwsem);
+ nfsi->layout = NULL;
#endif
}
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 000000000000..2e92f0d8d654
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
+/*
+ * Module for the pnfs nfs4 file layout driver.
+ * Defines all I/O and Policy interface operations, plus code
+ * to register itself with the pNFS client.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+#include "nfs4filelayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
+static int
+filelayout_set_layoutdriver(struct nfs_server *nfss)
+{
+ int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
+ nfs4_fl_free_deviceid_callback);
+ if (status) {
+ printk(KERN_WARNING "%s: deviceid cache could not be "
+ "initialized\n", __func__);
+ return status;
+ }
+ dprintk("%s: deviceid cache has been initialized successfully\n",
+ __func__);
+ return 0;
+}
+
+/* Clear out the layout by destroying its device list */
+static int
+filelayout_clear_layoutdriver(struct nfs_server *nfss)
+{
+ dprintk("--> %s\n", __func__);
+
+ if (nfss->nfs_client->cl_devid_cache)
+ pnfs_put_deviceid_cache(nfss->nfs_client);
+ return 0;
+}
+
+/*
+ * filelayout_check_layout()
+ *
+ * Make sure layout segment parameters are sane WRT the device.
+ * At this point no generic layer initialization of the lseg has occurred,
+ * and nothing has been added to the layout_hdr cache.
+ *
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+ struct nfs4_filelayout_segment *fl,
+ struct nfs4_layoutget_res *lgr,
+ struct nfs4_deviceid *id)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr;
+ int status = -EINVAL;
+ struct nfs_server *nfss = NFS_SERVER(lo->inode);
+
+ dprintk("--> %s\n", __func__);
+
+ if (fl->pattern_offset > lgr->range.offset) {
+ dprintk("%s pattern_offset %lld to large\n",
+ __func__, fl->pattern_offset);
+ goto out;
+ }
+
+ if (fl->stripe_unit % PAGE_SIZE) {
+ dprintk("%s Stripe unit (%u) not page aligned\n",
+ __func__, fl->stripe_unit);
+ goto out;
+ }
+
+ /* find and reference the deviceid */
+ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+ if (dsaddr == NULL) {
+ dsaddr = get_device_info(lo->inode, id);
+ if (dsaddr == NULL)
+ goto out;
+ }
+ fl->dsaddr = dsaddr;
+
+ if (fl->first_stripe_index < 0 ||
+ fl->first_stripe_index >= dsaddr->stripe_count) {
+ dprintk("%s Bad first_stripe_index %d\n",
+ __func__, fl->first_stripe_index);
+ goto out_put;
+ }
+
+ if ((fl->stripe_type == STRIPE_SPARSE &&
+ fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
+ (fl->stripe_type == STRIPE_DENSE &&
+ fl->num_fh != dsaddr->stripe_count)) {
+ dprintk("%s num_fh %u not valid for given packing\n",
+ __func__, fl->num_fh);
+ goto out_put;
+ }
+
+ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
+ dprintk("%s Stripe unit (%u) not aligned with rsize %u "
+ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
+ nfss->wsize);
+ }
+
+ status = 0;
+out:
+ dprintk("--> %s returns %d\n", __func__, status);
+ return status;
+out_put:
+ pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
+ goto out;
+}
+
+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+{
+ int i;
+
+ for (i = 0; i < fl->num_fh; i++) {
+ if (!fl->fh_array[i])
+ break;
+ kfree(fl->fh_array[i]);
+ }
+ kfree(fl->fh_array);
+ fl->fh_array = NULL;
+}
+
+static void
+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+ filelayout_free_fh_array(fl);
+ kfree(fl);
+}
+
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+ struct nfs4_filelayout_segment *fl,
+ struct nfs4_layoutget_res *lgr,
+ struct nfs4_deviceid *id)
+{
+ uint32_t *p = (uint32_t *)lgr->layout.buf;
+ uint32_t nfl_util;
+ int i;
+
+ dprintk("%s: set_layout_map Begin\n", __func__);
+
+ memcpy(id, p, sizeof(*id));
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+ print_deviceid(id);
+
+ nfl_util = be32_to_cpup(p++);
+ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+ fl->commit_through_mds = 1;
+ if (nfl_util & NFL4_UFLG_DENSE)
+ fl->stripe_type = STRIPE_DENSE;
+ else
+ fl->stripe_type = STRIPE_SPARSE;
+ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+
+ fl->first_stripe_index = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &fl->pattern_offset);
+ fl->num_fh = be32_to_cpup(p++);
+
+ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+ __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+ fl->pattern_offset);
+
+ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
+ GFP_KERNEL);
+ if (!fl->fh_array)
+ return -ENOMEM;
+
+ for (i = 0; i < fl->num_fh; i++) {
+ /* Do we want to use a mempool here? */
+ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+ if (!fl->fh_array[i]) {
+ filelayout_free_fh_array(fl);
+ return -ENOMEM;
+ }
+ fl->fh_array[i]->size = be32_to_cpup(p++);
+ if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
+ printk(KERN_ERR "Too big fh %d received %d\n",
+ i, fl->fh_array[i]->size);
+ filelayout_free_fh_array(fl);
+ return -EIO;
+ }
+ memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
+ p += XDR_QUADLEN(fl->fh_array[i]->size);
+ dprintk("DEBUG: %s: fh len %d\n", __func__,
+ fl->fh_array[i]->size);
+ }
+
+ return 0;
+}
+
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+ struct nfs4_layoutget_res *lgr)
+{
+ struct nfs4_filelayout_segment *fl;
+ int rc;
+ struct nfs4_deviceid id;
+
+ dprintk("--> %s\n", __func__);
+ fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+ if (!fl)
+ return NULL;
+
+ rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
+ if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
+ _filelayout_free_lseg(fl);
+ return NULL;
+ }
+ return &fl->generic_hdr;
+}
+
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+
+ dprintk("--> %s\n", __func__);
+ pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
+ &fl->dsaddr->deviceid);
+ _filelayout_free_lseg(fl);
+}
+
+static struct pnfs_layoutdriver_type filelayout_type = {
+ .id = LAYOUT_NFSV4_1_FILES,
+ .name = "LAYOUT_NFSV4_1_FILES",
+ .owner = THIS_MODULE,
+ .set_layoutdriver = filelayout_set_layoutdriver,
+ .clear_layoutdriver = filelayout_clear_layoutdriver,
+ .alloc_lseg = filelayout_alloc_lseg,
+ .free_lseg = filelayout_free_lseg,
+};
+
+static int __init nfs4filelayout_init(void)
+{
+ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+ __func__);
+ return pnfs_register_layoutdriver(&filelayout_type);
+}
+
+static void __exit nfs4filelayout_exit(void)
+{
+ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
+ __func__);
+ pnfs_unregister_layoutdriver(&filelayout_type);
+}
+
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 000000000000..bbf60dd2ab9d
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
+/*
+ * NFSv4 file layout driver data structures.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include "pnfs.h"
+
+/*
+ * Field testing shows we need to support upto 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
+
+enum stripetype4 {
+ STRIPE_SPARSE = 1,
+ STRIPE_DENSE = 2
+};
+
+/* Individual ip address */
+struct nfs4_pnfs_ds {
+ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
+ u32 ds_ip_addr;
+ u32 ds_port;
+ struct nfs_client *ds_clp;
+ atomic_t ds_count;
+};
+
+struct nfs4_file_layout_dsaddr {
+ struct pnfs_deviceid_node deviceid;
+ u32 stripe_count;
+ u8 *stripe_indices;
+ u32 ds_num;
+ struct nfs4_pnfs_ds *ds_list[1];
+};
+
+struct nfs4_filelayout_segment {
+ struct pnfs_layout_segment generic_hdr;
+ u32 stripe_type;
+ u32 commit_through_mds;
+ u32 stripe_unit;
+ u32 first_stripe_index;
+ u64 pattern_offset;
+ struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+ unsigned int num_fh;
+ struct nfs_fh **fh_array;
+};
+
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+ return container_of(lseg,
+ struct nfs4_filelayout_segment,
+ generic_hdr);
+}
+
+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+extern void print_ds(struct nfs4_pnfs_ds *ds);
+extern void print_deviceid(struct nfs4_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 000000000000..51fe64ace55a
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,448 @@
+/*
+ * Device operations for the pnfs nfs4 file layout driver.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ * Garth Goodson <Garth.Goodson@netapp.com>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+#include "nfs4filelayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ * - set to 1 on allocation
+ * - incremented when a device id maps a data server already in the cache.
+ * - decremented when deviceid is removed from the cache.
+ */
+DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+
+/* Debug routines */
+void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+ if (ds == NULL) {
+ printk("%s NULL device\n", __func__);
+ return;
+ }
+ printk(" ip_addr %x port %hu\n"
+ " ref count %d\n"
+ " client %p\n"
+ " cl_exchange_flags %x\n",
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+ atomic_read(&ds->ds_count), ds->ds_clp,
+ ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+
+void
+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+ int i;
+
+ ifdebug(FACILITY) {
+ printk("%s dsaddr->ds_num %d\n", __func__,
+ dsaddr->ds_num);
+ for (i = 0; i < dsaddr->ds_num; i++)
+ print_ds(dsaddr->ds_list[i]);
+ }
+}
+
+void print_deviceid(struct nfs4_deviceid *id)
+{
+ u32 *p = (u32 *)id;
+
+ dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+ p[0], p[1], p[2], p[3]);
+}
+
+/* nfs4_ds_cache_lock is held */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(u32 ip_addr, u32 port)
+{
+ struct nfs4_pnfs_ds *ds;
+
+ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
+ ntohl(ip_addr), ntohs(port));
+
+ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
+ if (ds->ds_ip_addr == ip_addr &&
+ ds->ds_port == port) {
+ return ds;
+ }
+ }
+ return NULL;
+}
+
+static void
+destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+ dprintk("--> %s\n", __func__);
+ ifdebug(FACILITY)
+ print_ds(ds);
+
+ if (ds->ds_clp)
+ nfs_put_client(ds->ds_clp);
+ kfree(ds);
+}
+
+static void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+ struct nfs4_pnfs_ds *ds;
+ int i;
+
+ print_deviceid(&dsaddr->deviceid.de_id);
+
+ for (i = 0; i < dsaddr->ds_num; i++) {
+ ds = dsaddr->ds_list[i];
+ if (ds != NULL) {
+ if (atomic_dec_and_lock(&ds->ds_count,
+ &nfs4_ds_cache_lock)) {
+ list_del_init(&ds->ds_node);
+ spin_unlock(&nfs4_ds_cache_lock);
+ destroy_ds(ds);
+ }
+ }
+ }
+ kfree(dsaddr->stripe_indices);
+ kfree(dsaddr);
+}
+
+void
+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr =
+ container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
+
+ nfs4_fl_free_deviceid(dsaddr);
+}
+
+static struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
+{
+ struct nfs4_pnfs_ds *tmp_ds, *ds;
+
+ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
+ if (!ds)
+ goto out;
+
+ spin_lock(&nfs4_ds_cache_lock);
+ tmp_ds = _data_server_lookup_locked(ip_addr, port);
+ if (tmp_ds == NULL) {
+ ds->ds_ip_addr = ip_addr;
+ ds->ds_port = port;
+ atomic_set(&ds->ds_count, 1);
+ INIT_LIST_HEAD(&ds->ds_node);
+ ds->ds_clp = NULL;
+ list_add(&ds->ds_node, &nfs4_data_server_cache);
+ dprintk("%s add new data server ip 0x%x\n", __func__,
+ ds->ds_ip_addr);
+ } else {
+ kfree(ds);
+ atomic_inc(&tmp_ds->ds_count);
+ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
+ __func__, tmp_ds->ds_ip_addr,
+ atomic_read(&tmp_ds->ds_count));
+ ds = tmp_ds;
+ }
+ spin_unlock(&nfs4_ds_cache_lock);
+out:
+ return ds;
+}
+
+/*
+ * Currently only support ipv4, and one multi-path address.
+ */
+static struct nfs4_pnfs_ds *
+decode_and_add_ds(__be32 **pp, struct inode *inode)
+{
+ struct nfs4_pnfs_ds *ds = NULL;
+ char *buf;
+ const char *ipend, *pstr;
+ u32 ip_addr, port;
+ int nlen, rlen, i;
+ int tmp[2];
+ __be32 *r_netid, *r_addr, *p = *pp;
+
+ /* r_netid */
+ nlen = be32_to_cpup(p++);
+ r_netid = p;
+ p += XDR_QUADLEN(nlen);
+
+ /* r_addr */
+ rlen = be32_to_cpup(p++);
+ r_addr = p;
+ p += XDR_QUADLEN(rlen);
+ *pp = p;
+
+ /* Check that netid is "tcp" */
+ if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) {
+ dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
+ goto out_err;
+ }
+
+ /* ipv6 length plus port is legal */
+ if (rlen > INET6_ADDRSTRLEN + 8) {
+ dprintk("%s Invalid address, length %d\n", __func__,
+ rlen);
+ goto out_err;
+ }
+ buf = kmalloc(rlen + 1, GFP_KERNEL);
+ buf[rlen] = '\0';
+ memcpy(buf, r_addr, rlen);
+
+ /* replace the port dots with dashes for the in4_pton() delimiter*/
+ for (i = 0; i < 2; i++) {
+ char *res = strrchr(buf, '.');
+ *res = '-';
+ }
+
+ /* Currently only support ipv4 address */
+ if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
+ dprintk("%s: Only ipv4 addresses supported\n", __func__);
+ goto out_free;
+ }
+
+ /* port */
+ pstr = ipend;
+ sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
+ port = htons((tmp[0] << 8) | (tmp[1]));
+
+ ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
+ dprintk("%s Decoded address and port %s\n", __func__, buf);
+out_free:
+ kfree(buf);
+out_err:
+ return ds;
+}
+
+/* Decode opaque device data and return the result */
+static struct nfs4_file_layout_dsaddr*
+decode_device(struct inode *ino, struct pnfs_device *pdev)
+{
+ int i, dummy;
+ u32 cnt, num;
+ u8 *indexp;
+ __be32 *p = (__be32 *)pdev->area, *indicesp;
+ struct nfs4_file_layout_dsaddr *dsaddr;
+
+ /* Get the stripe count (number of stripe index) */
+ cnt = be32_to_cpup(p++);
+ dprintk("%s stripe count %d\n", __func__, cnt);
+ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+ printk(KERN_WARNING "%s: stripe count %d greater than "
+ "supported maximum %d\n", __func__,
+ cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+ goto out_err;
+ }
+
+ /* Check the multipath list count */
+ indicesp = p;
+ p += XDR_QUADLEN(cnt << 2);
+ num = be32_to_cpup(p++);
+ dprintk("%s ds_num %u\n", __func__, num);
+ if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+ printk(KERN_WARNING "%s: multipath count %d greater than "
+ "supported maximum %d\n", __func__,
+ num, NFS4_PNFS_MAX_MULTI_CNT);
+ goto out_err;
+ }
+ dsaddr = kzalloc(sizeof(*dsaddr) +
+ (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+ GFP_KERNEL);
+ if (!dsaddr)
+ goto out_err;
+
+ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
+ if (!dsaddr->stripe_indices)
+ goto out_err_free;
+
+ dsaddr->stripe_count = cnt;
+ dsaddr->ds_num = num;
+
+ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+
+ /* Go back an read stripe indices */
+ p = indicesp;
+ indexp = &dsaddr->stripe_indices[0];
+ for (i = 0; i < dsaddr->stripe_count; i++) {
+ *indexp = be32_to_cpup(p++);
+ if (*indexp >= num)
+ goto out_err_free;
+ indexp++;
+ }
+ /* Skip already read multipath list count */
+ p++;