Merge tag 'ofs-pull-tag-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux

Pull orangefs filesystem from Mike Marshall. This finally merges the long-pending orangefs filesystem, which has been much cleaned up with input from Al Viro over the last six months. From the documentation file: "OrangeFS is an LGPL userspace scale-out parallel storage system. It is ideal for large storage problems faced by HPC, BigData, Streaming Video, Genomics, Bioinformatics. Orangefs, originally called PVFS, was first developed in 1993 by Walt Ligon and Eric Blumer as a parallel file system for Parallel Virtual Machine (PVM) as part of a NASA grant to study the I/O patterns of parallel programs. Orangefs features include: - Distributes file data among multiple file servers - Supports simultaneous access by multiple clients - Stores file data and metadata on servers using local file system and access methods - Userspace implementation is easy to install and maintain - Direct MPI support - Stateless" see Documentation/filesystems/orangefs.txt for more in-depth details. * tag 'ofs-pull-tag-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux: (174 commits) orangefs: fix orangefs_superblock locking orangefs: fix do_readv_writev() handling of error halfway through orangefs: have ->kill_sb() evict the VFS side of things first orangefs: sanitize ->llseek() orangefs-bufmap.h: trim unused junk orangefs: saner calling conventions for getting a slot orangefs_copy_{to,from}_bufmap(): don't pass bufmap pointer orangefs: get rid of readdir_handle_s ornagefs: ensure that truncate has an up to date inode size orangefs: move code which sets i_link to orangefs_inode_getattr orangefs: remove needless wrapper around GFP_KERNEL orangefs: remove wrapper around mutex_lock(&inode->i_mutex) orangefs: refactor inode type or link_target change detection orangefs: use new getattr for revalidate and remove old getattr orangefs: use new getattr in inode getattr and permission orangefs: use new orangefs_inode_getattr to get size in write and llseek orangefs: use new 
orangefs_inode_getattr to create new inodes orangefs: rename orangefs_inode_getattr to orangefs_inode_old_getattr orangefs: remove inode->i_lock wrapper orangefs: put register_chrdev immediately before register_filesystem ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2016-03-26 12:59:04 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-03-26 12:59:04 -0700
commit: 698f415cf5756e320623bdb015a600945743377c (patch)
tree: cf39b9233a9aea178156e876843fb742fa5ed070 /fs
parent: b4cec5f66849872d2e9573bc95c2016cb8e530ec (diff)
parent: 45996492e5c85aa0ac93a95d1b2d1ed56851c865 (diff)
30 files changed, 10742 insertions, 0 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 9d757673bf40..6725f59c18e6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -209,6 +209,7 @@ menuconfig MISC_FILESYSTEMS
 
 if MISC_FILESYSTEMS
 
+source "fs/orangefs/Kconfig"
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 252c96898a43..85b6e13b62d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_OVERLAY_FS)	+= overlayfs/
+obj-$(CONFIG_ORANGEFS_FS)       += orangefs/
 obj-$(CONFIG_UDF_FS)		+= udf/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_OMFS_FS)		+= omfs/
diff --git a/fs/orangefs/Kconfig b/fs/orangefs/Kconfig
new file mode 100644
index 000000000000..1554c02489de
--- /dev/null
+++ b/fs/orangefs/Kconfig
@@ -0,0 +1,6 @@
+config ORANGEFS_FS
+	tristate "ORANGEFS (Powered by PVFS) support"
+	select FS_POSIX_ACL
+	help
+	   Orange is a parallel file system designed for use on high end
+	   computing (HEC) systems.
diff --git a/fs/orangefs/Makefile b/fs/orangefs/Makefile
new file mode 100644
index 000000000000..a9d6a968fe6d
--- /dev/null
+++ b/fs/orangefs/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the ORANGEFS filesystem.
+#
+
+obj-$(CONFIG_ORANGEFS_FS) += orangefs.o
+
+orangefs-objs := acl.o file.o orangefs-cache.o orangefs-utils.o xattr.o \
+		 dcache.o inode.o orangefs-sysfs.o orangefs-mod.o super.o \
+		 devorangefs-req.o namei.o symlink.o dir.o orangefs-bufmap.o \
+		 orangefs-debugfs.o waitqueue.o
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
new file mode 100644
index 000000000000..03f89dbb2512
--- /dev/null
+++ b/fs/orangefs/acl.c
@@ -0,0 +1,175 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/fs_struct.h>
+
+/*
+ * Read the POSIX ACL of the requested type for an inode.
+ *
+ * The ACL is fetched as an extended attribute and converted to its
+ * in-memory representation. Returns the ACL, NULL when the xattr lookup
+ * reports -ENODATA or -ENOSYS, or an ERR_PTR() for a bogus type, an
+ * allocation failure, or any other lookup error.
+ */
+struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *result;
+	char *xattr_name = NULL;
+	char *buf = NULL;
+	int len;
+
+	if (type == ACL_TYPE_ACCESS) {
+		xattr_name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+	} else if (type == ACL_TYPE_DEFAULT) {
+		xattr_name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+	} else {
+		gossip_err("orangefs_get_acl: bogus value of type %d\n", type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * A buffer of the maximum value length is allocated up front so
+	 * that no extra network round trip is needed just to learn the
+	 * exact attribute size.
+	 */
+	buf = kmalloc(ORANGEFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "inode %pU, key %s, type %d\n",
+		     get_khandle_from_ino(inode),
+		     xattr_name,
+		     type);
+
+	len = orangefs_inode_getxattr(inode,
+				      "",
+				      xattr_name,
+				      buf,
+				      ORANGEFS_MAX_XATTR_VALUELEN);
+
+	if (len > 0) {
+		/* The attribute exists: build the in-memory ACL from it. */
+		result = posix_acl_from_xattr(&init_user_ns, buf, len);
+	} else if (len == -ENODATA || len == -ENOSYS) {
+		result = NULL;
+	} else {
+		gossip_err("inode %pU retrieving acl's failed with error %d\n",
+			   get_khandle_from_ino(inode),
+			   len);
+		result = ERR_PTR(len);
+	}
+
+	kfree(buf);
+	return result;
+}
+
+/*
+ * Store the POSIX ACL of the given type on an inode, or remove it when
+ * acl is NULL, by writing the matching extended attribute.
+ *
+ * For ACL_TYPE_ACCESS the inode mode is first updated from the ACL; when
+ * posix_acl_equiv_mode() returns 0 the ACL is dropped (acl = NULL), which
+ * turns the xattr write below into a removal.
+ *
+ * Returns 0 on success or a negative errno. The cached ACL is updated
+ * only on success.
+ */
+int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	int error = 0;
+	void *value = NULL;
+	size_t size = 0;
+	const char *name = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+		if (acl) {
+			umode_t mode = inode->i_mode;
+			/*
+			 * can we represent this with the traditional file
+			 * mode permission bits?
+			 */
+			error = posix_acl_equiv_mode(acl, &mode);
+			if (error < 0) {
+				gossip_err("%s: posix_acl_equiv_mode err: %d\n",
+					   __func__,
+					   error);
+				return error;
+			}
+
+			/*
+			 * NOTE(review): the mode is changed here, before the
+			 * xattr write below is known to have succeeded.
+			 */
+			if (inode->i_mode != mode)
+				SetModeFlag(orangefs_inode);
+			inode->i_mode = mode;
+			mark_inode_dirty_sync(inode);
+			if (error == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+		break;
+	default:
+		gossip_err("%s: invalid type %d!\n", __func__, type);
+		return -EINVAL;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: inode %pU, key %s type %d\n",
+		     __func__, get_khandle_from_ino(inode),
+		     name,
+		     type);
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_KERNEL);
+		if (!value)
+			return -ENOMEM;
+
+		error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (error < 0)
+			goto out;
+	}
+
+	/* size is a size_t, so it is printed with %zu rather than %zd. */
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: name %s, value %p, size %zu, acl %p\n",
+		     __func__, name, value, size, acl);
+	/*
+	 * Go ahead and set the extended attribute now. NOTE: if acl was
+	 * NULL, then value is NULL and size is 0, which translates to a
+	 * removexattr. However, we don't want removexattr to complain if
+	 * the attribute does not exist.
+	 */
+	error = orangefs_inode_setxattr(inode, "", name, value, size, 0);
+
+out:
+	/* value is NULL when no ACL was serialized; kfree(NULL) is a no-op. */
+	kfree(value);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+	return error;
+}
+
+/*
+ * Set up the ACLs of a newly created inode from its parent directory.
+ *
+ * posix_acl_create() yields the default and access ACLs together with a
+ * possibly adjusted mode. Each ACL that is returned is stored with
+ * orangefs_set_acl() and then released. Returns 0 on success or the
+ * first error encountered.
+ */
+int orangefs_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct posix_acl *default_acl, *acl;
+	umode_t mode = inode->i_mode;
+	int error = 0;
+
+	ClearModeFlag(orangefs_inode);
+
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error)
+		return error;
+
+	if (default_acl) {
+		error = orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		posix_acl_release(default_acl);
+	}
+
+	if (acl) {
+		/*
+		 * Skip storing the access ACL if the default ACL failed,
+		 * but still drop the reference.
+		 */
+		if (!error)
+			error = orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		posix_acl_release(acl);
+	}
+
+	/* If mode of the inode was changed, then do a forcible ->setattr */
+	if (mode != inode->i_mode) {
+		SetModeFlag(orangefs_inode);
+		inode->i_mode = mode;
+		orangefs_flush_inode(inode);
+	}
+
+	return error;
+}
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
new file mode 100644
index 000000000000..5dfc4f3cfe68
--- /dev/null
+++ b/fs/orangefs/dcache.c
@@ -0,0 +1,138 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Implementation of dentry (directory cache) functions.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/*
+ * Re-run a lookup of dentry's name in its parent and compare the result
+ * with what the dentry currently caches.
+ *
+ * A positive dentry is trusted only if the lookup succeeds and returns
+ * the handle of the cached inode. A negative dentry is trusted only if
+ * the lookup fails with -ENOENT.
+ *
+ * Returns 1 if dentry can still be trusted, else 0 (including when the
+ * lookup op cannot be allocated).
+ */
+static int orangefs_revalidate_lookup(struct dentry *dentry)
+{
+	struct dentry *parent_dentry = dget_parent(dentry);
+	struct inode *parent_inode = parent_dentry->d_inode;
+	struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode);
+	struct inode *inode = dentry->d_inode;
+	struct orangefs_kernel_op_s *new_op;
+	int ret = 0;
+	int err = 0;
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
+	if (!new_op)
+		goto out_put_parent;
+
+	new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
+	new_op->upcall.req.lookup.parent_refn = parent->refn;
+	/*
+	 * strncpy() does not terminate the destination when the source is
+	 * at least as long as the bound, so copy one byte less and
+	 * terminate explicitly.
+	 */
+	strncpy(new_op->upcall.req.lookup.d_name,
+		dentry->d_name.name,
+		ORANGEFS_NAME_MAX - 1);
+	new_op->upcall.req.lookup.d_name[ORANGEFS_NAME_MAX - 1] = '\0';
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s:%s:%d interrupt flag [%d]\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     get_interruptible_flag(parent_inode));
+
+	err = service_operation(new_op, "orangefs_lookup",
+			get_interruptible_flag(parent_inode));
+
+	/* Positive dentry: reject if error or not the same inode. */
+	if (inode) {
+		if (err) {
+			gossip_debug(GOSSIP_DCACHE_DEBUG,
+			    "%s:%s:%d lookup failure.\n",
+			    __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+		if (!match_handle(new_op->downcall.resp.lookup.refn.khandle,
+		    inode)) {
+			gossip_debug(GOSSIP_DCACHE_DEBUG,
+			    "%s:%s:%d no match.\n",
+			    __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+
+	/* Negative dentry: reject if success or error other than ENOENT. */
+	} else {
+		gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n",
+		    __func__);
+		/* err == 0 (success) is also != -ENOENT, so one test suffices. */
+		if (err != -ENOENT) {
+			if (new_op->downcall.status != 0)
+				gossip_debug(GOSSIP_DCACHE_DEBUG,
+				    "%s:%s:%d lookup failure.\n",
+				    __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+	}
+
+	ret = 1;
+out_release_op:
+	op_release(new_op);
+out_put_parent:
+	dput(parent_dentry);
+	return ret;
+out_drop:
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n",
+	    __FILE__, __func__, __LINE__);
+	goto out_release_op;
+}
+
+/*
+ * ->d_revalidate() for orangefs dentries.
+ *
+ * Returns -ECHILD in RCU-walk mode, 1 if dentry can still be trusted,
+ * else 0.
+ */
+static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	int changed;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n",
+		     __func__, dentry);
+
+	/* The root handle is never looked up again. */
+	if (dentry->d_inode && is_root_handle(dentry->d_inode))
+		return 1;
+
+	/*
+	 * The lookup must confirm that a positive dentry still exists or
+	 * that a negative dentry still does not exist.
+	 */
+	if (!orangefs_revalidate_lookup(dentry))
+		return 0;
+
+	/* Only positive dentries need their inode contents checked. */
+	if (dentry->d_inode) {
+		changed = orangefs_inode_check_changed(dentry->d_inode);
+		if (changed < 0) {
+			gossip_debug(GOSSIP_DCACHE_DEBUG,
+			    "%s:%s:%d getattr failure.\n",
+			    __FILE__, __func__, __LINE__);
+			return 0;
+		}
+		if (changed == 0)
+			return 0;
+	}
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+	    "%s: negative dentry or positive dentry and inode valid.\n",
+	    __func__);
+	return 1;
+}
+
+/* Dentry operations for orangefs; only revalidation is overridden. */
+const struct dentry_operations orangefs_dentry_operations = {
+	.d_revalidate = orangefs_d_revalidate,
+};
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
new file mode 100644
index 000000000000..db170beba797
--- /dev/null
+++ b/fs/orangefs/devorangefs-req.c
@@ -0,0 +1,943 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * Changes by Acxiom Corporation to add protocol version to kernel
+ * communication, Copyright Acxiom Corporation, 2005.
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-dev-proto.h"
+#include "orangefs-bufmap.h"
+
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+/* this file implements the /dev/pvfs2-req device node */
+
+/* Nonzero while the request device is open; set to 1 by a successful open. */
+static int open_access_count;
+
+/*
+ * Log a multi-line explanation, via gossip_err(), that the request device
+ * may only be opened once, together with the current open_access_count.
+ */
+#define DUMP_DEVICE_ERROR()                                                   \
+do {                                                                          \
+	gossip_err("*****************************************************\n");\
+	gossip_err("ORANGEFS Device Error:  You cannot open the device file ");  \
+	gossip_err("\n/dev/%s more than once.  Please make sure that\nthere " \
+		   "are no ", ORANGEFS_REQDEVICE_NAME);                          \
+	gossip_err("instances of a program using this device\ncurrently "     \
+		   "running. (You must verify this!)\n");                     \
+	gossip_err("For example, you can use the lsof program as follows:\n");\
+	gossip_err("'lsof | grep %s' (run this as root)\n",                   \
+		   ORANGEFS_REQDEVICE_NAME);                                     \
+	gossip_err("  open_access_count = %d\n", open_access_count);          \
+	gossip_err("*****************************************************\n");\
+} while (0)
+
+/* Map a tag to a bucket index: the remainder of tag divided by table_size. */
+static int hash_func(__u64 tag, int table_size)
+{
+	unsigned int buckets = (unsigned int)table_size;
+
+	/* do_div() divides tag in place and evaluates to the remainder. */
+	return do_div(tag, buckets);
+}
+
+/* Append op to the in-progress hash bucket selected by its tag. */
+static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
+{
+	struct list_head *bucket;
+
+	bucket = &htable_ops_in_progress[hash_func(op->tag, hash_table_size)];
+	list_add_tail(&op->list, bucket);
+}
+
+/*
+ * Look up the op carrying this tag in the in-progress hash table and
+ * unlink it. Ops that are purged or given up are ignored.
+ *
+ * Returns the unlinked op, or NULL if no usable op has this tag.
+ */
+static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
+{
+	struct orangefs_kernel_op_s *cursor, *tmp;
+	struct orangefs_kernel_op_s *found = NULL;
+	int bucket = hash_func(tag, hash_table_size);
+
+	spin_lock(&htable_ops_in_progress_lock);
+	list_for_each_entry_safe(cursor,
+				 tmp,
+				 &htable_ops_in_progress[bucket],
+				 list) {
+		if (cursor->tag != tag)
+			continue;
+		if (op_state_purged(cursor) || op_state_given_up(cursor))
+			continue;
+
+		list_del_init(&cursor->list);
+		found = cursor;
+		break;
+	}
+	spin_unlock(&htable_ops_in_progress_lock);
+
+	return found;
+}
+
+/*
+ * Flag every registered superblock as needing a remount.
+ *
+ * Returns 1 if the superblock list was empty (nothing to remount),
+ * otherwise 0.
+ */
+static int mark_all_pending_mounts(void)
+{
+	struct orangefs_sb_info_s *sb = NULL;
+	int none_registered = 1;
+
+	spin_lock(&orangefs_superblocks_lock);
+	list_for_each_entry(sb, &orangefs_superblocks, list) {
+		/* Every file system on the list requires a remount. */
+		none_registered = 0;
+		sb->mount_pending = 1;
+	}
+	spin_unlock(&orangefs_superblocks_lock);
+
+	return none_registered;
+}
+
+/*
+ * Report whether the file system with the given fsid awaits a remount.
+ *
+ * Returns -1 if no registered superblock has this fsid, otherwise that
+ * superblock's mount_pending value (0 if already mounted, 1 if it needs
+ * a remount).
+ */
+static int fs_mount_pending(__s32 fsid)
+{
+	struct orangefs_sb_info_s *sb = NULL;
+	int pending = -1;
+
+	spin_lock(&orangefs_superblocks_lock);
+	list_for_each_entry(sb, &orangefs_superblocks, list) {
+		if (sb->fs_id != fsid)
+			continue;
+		pending = sb->mount_pending;
+		break;
+	}
+	spin_unlock(&orangefs_superblocks_lock);
+
+	return pending;
+}
+
+/*
+ * open() handler for the request device.
+ *
+ * The device must be opened with O_NONBLOCK (-EINVAL otherwise) and may
+ * be open only once at a time (-EACCES otherwise). Returns 0 on success.
+ */
+static int orangefs_devreq_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		gossip_err("%s: device cannot be opened in blocking mode\n",
+			   __func__);
+		ret = -EINVAL;
+	} else {
+		gossip_debug(GOSSIP_DEV_DEBUG,
+			     "client-core: opening device\n");
+
+		mutex_lock(&devreq_mutex);
+		if (open_access_count != 0) {
+			DUMP_DEVICE_ERROR();
+			ret = -EACCES;
+		} else {
+			open_access_count = 1;
+			ret = 0;
+		}
+		mutex_unlock(&devreq_mutex);
+	}
+
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "pvfs2-client-core: open device complete (ret = %d)\n",
+		     ret);
+	return ret;
+}
+
+/*
+ * Function for read() callers into the device.
+ *
+ * Takes the first eligible op off orangefs_request_list, copies the
+ * protocol version, the magic, the op's tag and its upcall to userspace,
+ * then marks the op in progress and moves it to the in-progress hash
+ * table.
+ *
+ * Returns MAX_DEV_REQ_UPSIZE on success, -EINVAL for a blocking file or a
+ * read of the wrong size, -EAGAIN when no op can be handed out, and
+ * -EFAULT when copying to userspace fails.
+ */
+static ssize_t orangefs_devreq_read(struct file *file,
+				 char __user *buf,
+				 size_t count, loff_t *offset)
+{
+	struct orangefs_kernel_op_s *op, *temp;
+	__s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION;
+	static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
+	struct orangefs_kernel_op_s *cur_op = NULL;
+	unsigned long ret;
+
+	/* We do not support blocking IO. */
+	if (!(file->f_flags & O_NONBLOCK)) {
+		gossip_err("%s: blocking read from client-core.\n",
+			   __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * The client will do an ioctl to find MAX_DEV_REQ_UPSIZE, then
+	 * always read with that size buffer.
+	 */
+	if (count != MAX_DEV_REQ_UPSIZE) {
+		gossip_err("orangefs: client-core tried to read wrong size\n");
+		return -EINVAL;
+	}
+
+restart:
+	/*
+	 * Forget any op picked on a previous pass. Without this, a restart
+	 * that finds no eligible op would carry on with the stale, already
+	 * completed op and unlock a lock that is not held.
+	 */
+	cur_op = NULL;
+	/* Get next op (if any) from top of list. */
+	spin_lock(&orangefs_request_list_lock);
+	list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
+		__s32 fsid;
+		/* This lock is held past the end of the loop when we break. */
+		spin_lock(&op->lock);
+		if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
+			spin_unlock(&op->lock);
+			continue;
+		}
+
+		fsid = fsid_of_op(op);
+		if (fsid != ORANGEFS_FS_ID_NULL) {
+			/* Named so it does not shadow the outer 'ret'. */
+			int pending;
+			/* Skip ops whose filesystem needs to be mounted. */
+			pending = fs_mount_pending(fsid);
+			if (pending == 1) {
+				gossip_debug(GOSSIP_DEV_DEBUG,
+				    "%s: mount pending, skipping op tag "
+				    "%llu %s\n",
+				    __func__,
+				    llu(op->tag),
+				    get_opname_string(op));
+				spin_unlock(&op->lock);
+				continue;
+			/*
+			 * Skip ops whose filesystem we don't know about unless
+			 * it is being mounted.
+			 */
+			/* XXX: is there a better way to detect this? */
+			} else if (pending == -1 &&
+				   !(op->upcall.type ==
+					ORANGEFS_VFS_OP_FS_MOUNT ||
+				     op->upcall.type ==
+					ORANGEFS_VFS_OP_GETATTR)) {
+				gossip_debug(GOSSIP_DEV_DEBUG,
+				    "orangefs: skipping op tag %llu %s\n",
+				    llu(op->tag), get_opname_string(op));
+				gossip_err(
+				    "orangefs: ERROR: fs_mount_pending %d\n",
+				    fsid);
+				spin_unlock(&op->lock);
+				continue;
+			}
+		}
+		/*
+		 * Either this op does not pertain to a filesystem, is mounting
+		 * a filesystem, or pertains to a mounted filesystem. Let it
+		 * through.
+		 */
+		cur_op = op;
+		break;
+	}
+
+	/*
+	 * At this point we either have a valid op and can continue or have not
+	 * found an op and must ask the client to try again later.
+	 */
+	if (!cur_op) {
+		spin_unlock(&orangefs_request_list_lock);
+		return -EAGAIN;
+	}
+
+	gossip_debug(GOSSIP_DEV_DEBUG, "%s: reading op tag %llu %s\n",
+		     __func__,
+		     llu(cur_op->tag),
+		     get_opname_string(cur_op));
+
+	/*
+	 * Such an op should never be on the list in the first place. If so, we
+	 * will abort.
+	 */
+	if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
+		gossip_err("orangefs: ERROR: Current op already queued.\n");
+		list_del_init(&cur_op->list);
+		spin_unlock(&cur_op->lock);
+		spin_unlock(&orangefs_request_list_lock);
+		return -EAGAIN;
+	}
+
+	list_del_init(&cur_op->list);
+	spin_unlock(&orangefs_request_list_lock);
+
+	spin_unlock(&cur_op->lock);
+
+	/* Push the upcall out. */
+	ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
+			   sizeof(struct orangefs_upcall_s));
+	if (ret != 0)
+		goto error;
+
+	spin_lock(&htable_ops_in_progress_lock);
+	spin_lock(&cur_op->lock);
+	if (unlikely(op_state_given_up(cur_op))) {
+		/* The op was given up while we copied; try the next one. */
+		spin_unlock(&cur_op->lock);
+		spin_unlock(&htable_ops_in_progress_lock);
+		complete(&cur_op->waitq);
+		goto restart;
+	}
+
+	/*
+	 * Set the operation to be in progress and move it between lists since
+	 * it has been sent to the client.
+	 */
+	set_op_state_inprogress(cur_op);
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "%s: 1 op:%s: op_state:%d: process:%s:\n",
+		     __func__,
+		     get_opname_string(cur_op),
+		     cur_op->op_state,
+		     current->comm);
+	orangefs_devreq_add_op(cur_op);
+	spin_unlock(&cur_op->lock);
+	spin_unlock(&htable_ops_in_progress_lock);
+
+	/* The client only asks to read one size buffer. */
+	return MAX_DEV_REQ_UPSIZE;
+error:
+	/*
+	 * We were unable to copy the op data to the client. Put the op back in
+	 * list. If client has crashed, the op will be purged later when the
+	 * device is released.
+	 */
+	gossip_err("orangefs: Failed to copy data to user space\n");
+	spin_lock(&orangefs_request_list_lock);
+	spin_lock(&cur_op->lock);
+	if (likely(!op_state_given_up(cur_op))) {
+		set_op_state_waiting(cur_op);
+		gossip_debug(GOSSIP_DEV_DEBUG,
+			     "%s: 2 op:%s: op_state:%d: process:%s:\n",
+			     __func__,
+			     get_opname_string(cur_op),
+			     cur_op->op_state,
+			     current->comm);
+		list_add(&cur_op->list, &orangefs_request_list);
+		spin_unlock(&cur_op->lock);
+	} else {
+		spin_unlock(&cur_op->lock);
+		complete(&cur_op->waitq);
+	}
+	spin_unlock(&orangefs_request_list_lock);
+	return -EFAULT;
+}
+
+/*
+ * Function for writev() callers into the device.
+ *
+ * Userspace should have written:
+ *  - __u32 version
+ *  - __u32 magic
+ *  - __u64 tag
+ *  - struct orangefs_downcall_s
+ *  - trailer buffer (in the case of READDIR operations)
+ *
+ * The op matching the tag is removed from the in-progress hash table,
+ * the downcall (and trailer, for READDIR) is copied into it, and the op
+ * is marked serviced.
+ *
+ * Returns the number of bytes written on success (also when no op is
+ * waiting for the tag), -EFAULT for a short or malformed write, -EPROTO
+ * for a bad version or magic, and -ENOMEM if the trailer cannot be
+ * allocated.
+ */
+static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
+				      struct iov_iter *iter)
+{
+	ssize_t ret;
+	struct orangefs_kernel_op_s *op = NULL;
+	struct {
+		__u32 version;
+		__u32 magic;
+		__u64 tag;
+	} head;
+	int total = ret = iov_iter_count(iter);
+	int n;
+	int downcall_size = sizeof(struct orangefs_downcall_s);
+	int head_size = sizeof(head);
+
+	gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n",
+		     __func__,
+		     total,
+		     ret);
+
+	if (total < MAX_DEV_REQ_DOWNSIZE) {
+		gossip_err("%s: total:%d: must be at least:%u:\n",
+			   __func__,
+			   total,
+			   (unsigned int) MAX_DEV_REQ_DOWNSIZE);
+		return -EFAULT;
+	}
+
+	n = copy_from_iter(&head, head_size, iter);
+	if (n < head_size) {
+		gossip_err("%s: failed to copy head.\n", __func__);
+		return -EFAULT;
+	}
+
+	if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) {
+		/* Trailing space keeps "version" and the number apart. */
+		gossip_err("%s: userspace claims version "
+			   "%d, minimum version required: %d.\n",
+			   __func__,
+			   head.version,
+			   ORANGEFS_MINIMUM_USERSPACE_VERSION);
+		return -EPROTO;
+	}
+
+	if (head.magic != ORANGEFS_DEVREQ_MAGIC) {
+		gossip_err("Error: Device magic number does not match.\n");
+		return -EPROTO;
+	}
+
+	/* remove the op from the in progress hash table */
+	op = orangefs_devreq_remove_op(head.tag);
+	if (!op) {
+		gossip_err("WARNING: No one's waiting for tag %llu\n",
+			   llu(head.tag));
+		return ret;
+	}
+
+	n = copy_from_iter(&op->downcall, downcall_size, iter);
+	if (n != downcall_size) {
+		gossip_err("%s: failed to copy downcall.\n", __func__);
+		goto Efault;
+	}
+
+	/* A failed downcall carries no trailer; hand it straight back. */
+	if (op->downcall.status)
+		goto wakeup;
+
+	/*
+	 * We've successfully peeled off the head and the downcall.
+	 * Something has gone awry if total doesn't equal the
+	 * sum of head_size, downcall_size and trailer_size.
+	 */
+	if ((head_size + downcall_size + op->downcall.trailer_size) != total) {
+		gossip_err("%s: funky write, head_size:%d"
+			   ": downcall_size:%d: trailer_size:%lld"
+			   ": total size:%d:\n",
+			   __func__,
+			   head_size,
+			   downcall_size,
+			   op->downcall.trailer_size,
+			   total);
+		goto Efault;
+	}
+
+	/* Only READDIR operations should have trailers. */
+	if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) &&
+	    (op->downcall.trailer_size != 0)) {
+		gossip_err("%s: %x operation with trailer.\n",
+			   __func__,
+			   op->downcall.type);
+		goto Efault;
+	}
+
+	/* READDIR operations should always have trailers. */
+	if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) &&
+	    (op->downcall.trailer_size == 0)) {
+		gossip_err("%s: %x operation with no trailer.\n",
+			   __func__,
+			   op->downcall.type);
+		goto Efault;
+	}
+
+	if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
+		goto wakeup;
+
+	op->downcall.trailer_buf =
+		vmalloc(op->downcall.trailer_size);
+	if (op->downcall.trailer_buf == NULL) {
+		gossip_err("%s: failed trailer vmalloc.\n",
+			   __func__);
+		goto Enomem;
+	}
+	memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
+	n = copy_from_iter(op->downcall.trailer_buf,
+			   op->downcall.trailer_size,
+			   iter);
+	if (n != op->downcall.trailer_size) {
+		gossip_err("%s: failed to copy trailer.\n", __func__);
+		vfree(op->downcall.trailer_buf);
+		/* Do not leave a dangling pointer in the op we hand back. */
+		op->downcall.trailer_buf = NULL;
+		goto Efault;
+	}
+
+wakeup:
+	/*
+	 * Return to vfs waitqueue, and back to service_operation
+	 * through wait_for_matching_downcall.
+	 */
+	spin_lock(&op->lock);
+	if (unlikely(op_is_cancel(op))) {
+		spin_unlock(&op->lock);
+		put_cancel(op);
+	} else if (unlikely(op_state_given_up(op))) {
+		spin_unlock(&op->lock);
+		complete(&op->waitq);
+	} else {
+		set_op_state_serviced(op);
+		gossip_debug(GOSSIP_DEV_DEBUG,
+			     "%s: op:%s: op_state:%d: process:%s:\n",
+			     __func__,
+			     get_opname_string(op),
+			     op->op_state,
+			     current->comm);
+		spin_unlock(&op->lock);
+	}
+	return ret;
+
+Efault:
+	/* Report the failure to the waiter through the downcall status. */
+	op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
+	ret = -EFAULT;
+	goto wakeup;
+
+Enomem:
+	op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
+	ret = -ENOMEM;
+	goto wakeup;
+}
+
+/*
+ * NOTE: gets called when the last reference to this device is dropped.
+ * Using the open_access_count variable, we enforce a reference count
+ * on this file so that it can be opened by only one process at a time.
+ * the devreq_mutex is used to make sure all i/o has completed
+ * before we call orangefs_bufmap_finalize, and similar such tricky
+ * situations
+ */
+static int orangefs_devreq_release(struct inode *inode, struct file *file)
+{
+	int unmounted = 0;
+
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "%s:pvfs2-client-core: exiting, clo
author	Linus Torvalds <torvalds@linux-foundation.org>	2016-03-26 12:59:04 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-03-26 12:59:04 -0700
commit	698f415cf5756e320623bdb015a600945743377c (patch)
tree	cf39b9233a9aea178156e876843fb742fa5ed070 /fs
parent	b4cec5f66849872d2e9573bc95c2016cb8e530ec (diff)
parent	45996492e5c85aa0ac93a95d1b2d1ed56851c865 (diff)