Orangefs: kernel client part 3

Signed-off-by: Mike Marshall <hubcap@omnibond.com>
author: Mike Marshall <hubcap@omnibond.com> 2015-07-17 10:38:13 -0400
committer: Mike Marshall <hubcap@omnibond.com> 2015-10-03 11:39:55 -0400
commit: 274dcf55bd4ab12af1cc1d3b77416285bef8ebf4 (patch)
tree: 0b4bba244b5deccb8c853a2578e0e4c6b92d0bac /fs/orangefs
parent: 5db11c21a929cd9d8c0484006efb1014fc723c93 (diff)
5 files changed, 2477 insertions, 0 deletions
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
new file mode 100644
index 000000000000..747fe6a690af
--- /dev/null
+++ b/fs/orangefs/namei.c
@@ -0,0 +1,473 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Linux VFS namei operations.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+
+/*
+ * Create a regular file for the negative dentry @dentry inside @dir.
+ *
+ * A PVFS2_VFS_OP_CREATE upcall carrying the parent reference, default
+ * attributes derived from @mode and the entry name is serviced first.
+ * On success an inode is built from the reference returned in the
+ * downcall, attached to @dentry, and the parent's mtime/ctime are
+ * refreshed.  @exclusive is not consulted here.
+ *
+ * Returns 0 on success, -ENOMEM when no op can be allocated, or the
+ * negative error from service_operation() / pvfs2_new_inode().
+ */
+static int pvfs2_create(struct inode *dir,
+			struct dentry *dentry,
+			umode_t mode,
+			bool exclusive)
+{
+	struct pvfs2_inode_s *dir_info = PVFS2_I(dir);
+	struct pvfs2_kernel_op_s *op;
+	struct inode *file_inode;
+	int status;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+	op = op_alloc(PVFS2_VFS_OP_CREATE);
+	if (op == NULL)
+		return -ENOMEM;
+
+	/* describe the new entry in the upcall */
+	op->upcall.req.create.parent_refn = dir_info->refn;
+	fill_default_sys_attrs(op->upcall.req.create.attributes,
+			       PVFS_TYPE_METAFILE, mode);
+	strncpy(op->upcall.req.create.d_name,
+		dentry->d_name.name, PVFS2_NAME_LEN);
+
+	status = service_operation(op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Create Got PVFS2 handle %pU on fsid %d (ret=%d)\n",
+		     &op->downcall.resp.create.refn.khandle,
+		     op->downcall.resp.create.refn.fs_id, status);
+
+	if (status < 0) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, status);
+		goto release;
+	}
+
+	file_inode = pvfs2_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
+				     &op->downcall.resp.create.refn);
+	if (IS_ERR(file_inode)) {
+		gossip_err("*** Failed to allocate pvfs2 file inode\n");
+		status = PTR_ERR(file_inode);
+		goto release;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Assigned file inode new number of %pU\n",
+		     get_khandle_from_ino(file_inode));
+
+	/* attach the inode to the dentry, then unlock_new_inode() it */
+	d_instantiate(dentry, file_inode);
+	unlock_new_inode(file_inode);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Inode (Regular File) %pU -> %s\n",
+		     get_khandle_from_ino(file_inode),
+		     dentry->d_name.name);
+
+	/* the directory gained an entry: flag it and stamp its times */
+	SetMtimeFlag(dir_info);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	status = 0;
+release:
+	op_release(op);
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: returning %d\n", __func__, status);
+	return status;
+}
+
+/*
+ * Attempt to resolve an object name (dentry->d_name), parent handle, and
+ * fsid into a handle for the object.
+ *
+ * @dir:    directory being searched
+ * @dentry: dentry carrying the name to resolve
+ * @flags:  lookup flags; only LOOKUP_FOLLOW is examined and it is
+ *          forwarded in the upcall as sym_follow
+ *
+ * Returns the result of d_splice_alias() on success, NULL after a
+ * negative dentry has been added for -ENOENT, or an ERR_PTR():
+ * -ENAMETOOLONG when the name is longer than PVFS2_NAME_LEN - 1 bytes,
+ * -ENOMEM when no op can be allocated, or the error reported by
+ * service_operation() / pvfs2_iget().
+ */
+static struct dentry *pvfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op_s *new_op;
+	struct inode *inode;
+	struct dentry *res;
+	int ret = -EINVAL;
+
+	/*
+	 * In theory we could skip the lookup here when the intent is to
+	 * create, in order to avoid a lookup that is expected to fail.
+	 * Doing so, however, could skip a valid lookup and then try to
+	 * create a file that already exists (e.g. the vfs already handles
+	 * checking for -EEXIST on O_EXCL opens, which would be broken if
+	 * this lookup were skipped in the create path), so the lookup is
+	 * always performed.
+	 */
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
+		     __func__, dentry->d_name.name);
+
+	/* leave room for the terminating NUL in the upcall name field */
+	if (dentry->d_name.len > (PVFS2_NAME_LEN - 1))
+		return ERR_PTR(-ENAMETOOLONG);
+
+	new_op = op_alloc(PVFS2_VFS_OP_LOOKUP);
+	if (!new_op)
+		return ERR_PTR(-ENOMEM);
+
+	new_op->upcall.req.lookup.sym_follow = flags & LOOKUP_FOLLOW;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     &parent->refn.khandle);
+	new_op->upcall.req.lookup.parent_refn = parent->refn;
+
+	strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: doing lookup on %s under %pU,%d (follow=%s)\n",
+		     __func__,
+		     new_op->upcall.req.lookup.d_name,
+		     &new_op->upcall.req.lookup.parent_refn.khandle,
+		     new_op->upcall.req.lookup.parent_refn.fs_id,
+		     ((new_op->upcall.req.lookup.sym_follow ==
+		       PVFS2_LOOKUP_LINK_FOLLOW) ? "yes" : "no"));
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	/* logged unconditionally, i.e. also when the operation failed */
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Lookup Got %pU, fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.lookup.refn.khandle,
+		     new_op->downcall.resp.lookup.refn.fs_id,
+		     ret);
+
+	if (ret < 0) {
+		if (ret == -ENOENT) {
+			/*
+			 * if no inode was found, add a negative dentry to
+			 * dcache anyway; if we don't, we don't hold expected
+			 * lookup semantics and we most noticeably break
+			 * during directory renames.
+			 *
+			 * however, if the operation failed or exited, do not
+			 * add the dentry (e.g. in the case that a touch is
+			 * issued on a file that already exists that was
+			 * interrupted during this lookup -- no need to add
+			 * another negative dentry for an existing file)
+			 */
+
+			gossip_debug(GOSSIP_NAME_DEBUG,
+				     "pvfs2_lookup: Adding *negative* dentry "
+				     "%p for %s\n",
+				     dentry,
+				     dentry->d_name.name);
+
+			d_add(dentry, NULL);
+			res = NULL;
+			goto out;
+		}
+
+		/* must be a non-recoverable error */
+		res = ERR_PTR(ret);
+		goto out;
+	}
+
+	/* obtain the in-core inode for the reference the lookup returned */
+	inode = pvfs2_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+	if (IS_ERR(inode)) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			"error %ld from iget\n", PTR_ERR(inode));
+		res = ERR_CAST(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s:%s:%d "
+		     "Found good inode [%lu] with count [%d]\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     inode->i_ino,
+		     (int)atomic_read(&inode->i_count));
+
+	/* update dentry/inode pair into dcache */
+	res = d_splice_alias(inode, dentry);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Lookup success (inode ct = %d)\n",
+		     (int)atomic_read(&inode->i_count));
+out:
+	/* every path past op_alloc() releases the op exactly once */
+	op_release(new_op);
+	return res;
+}
+
+/*
+ * Remove the entry named by @dentry from directory @dir.  This handler is
+ * installed for both .unlink and .rmdir in the directory inode operations.
+ *
+ * A PVFS2_VFS_OP_REMOVE upcall is serviced; on success the link count of
+ * the removed inode is dropped and the parent's mtime/ctime are refreshed.
+ *
+ * Returns 0 on success, -ENOMEM if no op could be allocated, otherwise
+ * the non-zero status from service_operation().
+ */
+static int pvfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *victim = dentry->d_inode;
+	struct pvfs2_inode_s *dir_info = PVFS2_I(dir);
+	struct pvfs2_kernel_op_s *op;
+	int status;
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: called on %s\n"
+		     "  (inode %pU): Parent is %pU | fs_id %d\n",
+		     __func__,
+		     dentry->d_name.name,
+		     get_khandle_from_ino(victim),
+		     &dir_info->refn.khandle,
+		     dir_info->refn.fs_id);
+
+	op = op_alloc(PVFS2_VFS_OP_REMOVE);
+	if (op == NULL)
+		return -ENOMEM;
+
+	op->upcall.req.remove.parent_refn = dir_info->refn;
+	strncpy(op->upcall.req.remove.d_name, dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	status = service_operation(op, "pvfs2_unlink",
+				   get_interruptible_flag(victim));
+
+	/* the op is released whatever the outcome was */
+	op_release(op);
+
+	if (status != 0)
+		return status;
+
+	drop_nlink(victim);
+
+	/* the directory lost an entry: flag it and stamp its times */
+	SetMtimeFlag(dir_info);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	return 0;
+}
+
+/*
+ * pvfs2_link() is only implemented here to make sure that we return a
+ * reasonable error code (the kernel will return a misleading EPERM
+ * otherwise).  PVFS2 does not support hard links.
+ *
+ * All three arguments are ignored; the function always returns
+ * -EOPNOTSUPP.
+ */
+static int pvfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * pvfs2_mknod() is only implemented here to make sure that we return a
+ * reasonable error code (the kernel will return a misleading EPERM
+ * otherwise).  PVFS2 does not support special files such as fifos or devices.
+ *
+ * All four arguments are ignored; the function always returns
+ * -EOPNOTSUPP.
+ */
+static int pvfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       umode_t mode,
+		       dev_t rdev)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Create the symbolic link @dentry in directory @dir with target @symname.
+ *
+ * A PVFS2_VFS_OP_SYMLINK upcall carrying the parent reference, default
+ * attributes, the entry name and the target is serviced.  On success an
+ * S_IFLNK inode is built from the reference in the downcall, attached to
+ * @dentry, and the parent's mtime/ctime are refreshed.
+ *
+ * Returns 0 on success, -EINVAL for a NULL @symname, -ENAMETOOLONG when
+ * the target (including its NUL) does not fit in PVFS2_NAME_LEN bytes,
+ * -ENOMEM when no op can be allocated, or the negative error from
+ * service_operation() / pvfs2_new_inode().
+ */
+static int pvfs2_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	struct pvfs2_inode_s *parent = PVFS2_I(dir);
+	struct pvfs2_kernel_op_s *new_op;
+	struct inode *inode;
+	/* octal rwxr-xr-x; a bare 755 is decimal, i.e. octal 01363 */
+	int mode = 0755;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+	if (!symname)
+		return -EINVAL;
+
+	/*
+	 * The target is copied below with a PVFS2_NAME_LEN bound; refuse
+	 * anything that would be truncated or left without its NUL
+	 * instead of silently creating a link to a different path.
+	 */
+	if (strlen(symname) + 1 > PVFS2_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	new_op = op_alloc(PVFS2_VFS_OP_SYMLINK);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.sym.parent_refn = parent->refn;
+
+	fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
+			       PVFS_TYPE_SYMLINK,
+			       mode);
+
+	strncpy(new_op->upcall.req.sym.entry_name,
+		dentry->d_name.name,
+		PVFS2_NAME_LEN);
+	strncpy(new_op->upcall.req.sym.target, symname, PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Symlink Got PVFS2 handle %pU on fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.sym.refn.khandle,
+		     new_op->downcall.resp.sym.refn.fs_id, ret);
+
+	if (ret < 0) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			    "%s: failed with error code %d\n",
+			    __func__, ret);
+		goto out;
+	}
+
+	inode = pvfs2_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
+				&new_op->downcall.resp.sym.refn);
+	if (IS_ERR(inode)) {
+		gossip_err
+		    ("*** Failed to allocate pvfs2 symlink inode\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Assigned symlink inode new number of %pU\n",
+		     get_khandle_from_ino(inode));
+
+	/* attach the inode to the dentry, then unlock_new_inode() it */
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Inode (Symlink) %pU -> %s\n",
+		     get_khandle_from_ino(inode),
+		     dentry->d_name.name);
+
+	/* the directory gained an entry: flag it and stamp its times */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	ret = 0;
+out:
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Create the directory @dentry inside @dir with permission bits @mode.
+ *
+ * A PVFS2_VFS_OP_MKDIR upcall carrying the parent reference, default
+ * attributes and the entry name is serviced.  On success an S_IFDIR
+ * inode is built from the reference in the downcall, attached to
+ * @dentry, and the parent's mtime/ctime are refreshed.
+ *
+ * Returns the (non-negative) status of service_operation() on success,
+ * -ENOMEM when no op can be allocated, or the negative error from
+ * service_operation() / pvfs2_new_inode().
+ */
+static int pvfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct pvfs2_inode_s *dir_info = PVFS2_I(dir);
+	struct pvfs2_kernel_op_s *op;
+	struct inode *subdir;
+	int status;
+
+	op = op_alloc(PVFS2_VFS_OP_MKDIR);
+	if (op == NULL)
+		return -ENOMEM;
+
+	/* describe the new directory in the upcall */
+	op->upcall.req.mkdir.parent_refn = dir_info->refn;
+	fill_default_sys_attrs(op->upcall.req.mkdir.attributes,
+			       PVFS_TYPE_DIRECTORY, mode);
+	strncpy(op->upcall.req.mkdir.d_name,
+		dentry->d_name.name, PVFS2_NAME_LEN);
+
+	status = service_operation(op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Mkdir Got PVFS2 handle %pU on fsid %d\n",
+		     &op->downcall.resp.mkdir.refn.khandle,
+		     op->downcall.resp.mkdir.refn.fs_id);
+
+	if (status < 0) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, status);
+		goto release;
+	}
+
+	subdir = pvfs2_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
+				 &op->downcall.resp.mkdir.refn);
+	if (IS_ERR(subdir)) {
+		gossip_err("*** Failed to allocate pvfs2 dir inode\n");
+		status = PTR_ERR(subdir);
+		goto release;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Assigned dir inode new number of %pU\n",
+		     get_khandle_from_ino(subdir));
+
+	/* attach the inode to the dentry, then unlock_new_inode() it */
+	d_instantiate(dentry, subdir);
+	unlock_new_inode(subdir);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Inode (Directory) %pU -> %s\n",
+		     get_khandle_from_ino(subdir),
+		     dentry->d_name.name);
+
+	/*
+	 * NOTE: directory nlink cannot be kept consistent across clients,
+	 * so it is left alone here; only the parent's times are updated.
+	 */
+	SetMtimeFlag(dir_info);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+release:
+	op_release(op);
+	return status;
+}
+
+/*
+ * Rename @old_dentry in @old_dir to @new_dentry in @new_dir.
+ *
+ * A PVFS2_VFS_OP_RENAME upcall carrying both parent references and both
+ * entry names is serviced.  If @new_dentry already has an inode, its
+ * ctime is stamped afterwards (regardless of the upcall's outcome, as
+ * before).
+ *
+ * Returns the status of service_operation(), or -ENOMEM when no op can
+ * be allocated (matching every other operation in this file).
+ */
+static int pvfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	struct pvfs2_kernel_op_s *new_op;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "pvfs2_rename: called (%s/%s => %s/%s) ct=%d\n",
+		     old_dentry->d_parent->d_name.name,
+		     old_dentry->d_name.name,
+		     new_dentry->d_parent->d_name.name,
+		     new_dentry->d_name.name,
+		     d_count(new_dentry));
+
+	new_op = op_alloc(PVFS2_VFS_OP_RENAME);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.rename.old_parent_refn = PVFS2_I(old_dir)->refn;
+	new_op->upcall.req.rename.new_parent_refn = PVFS2_I(new_dir)->refn;
+
+	strncpy(new_op->upcall.req.rename.d_old_name,
+		old_dentry->d_name.name,
+		PVFS2_NAME_LEN);
+	strncpy(new_op->upcall.req.rename.d_new_name,
+		new_dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	ret = service_operation(new_op,
+				"pvfs2_rename",
+				get_interruptible_flag(old_dentry->d_inode));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "pvfs2_rename: got downcall status %d\n",
+		     ret);
+
+	/* a pre-existing target inode gets a fresh ctime */
+	if (new_dentry->d_inode)
+		new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * PVFS2 implementation of VFS inode operations for directories.
+ *
+ * Handlers defined in this file: lookup, create, link, unlink, symlink,
+ * mkdir, mknod and rename.  Note that .rmdir reuses pvfs2_unlink, and
+ * that .link / .mknod are stubs returning -EOPNOTSUPP.  The remaining
+ * handlers (ACL, attribute and xattr operations) are defined elsewhere.
+ */
+struct inode_operations pvfs2_dir_inode_operations = {
+	.lookup = pvfs2_lookup,
+	.get_acl = pvfs2_get_acl,
+	.set_acl = pvfs2_set_acl,
+	.create = pvfs2_create,
+	.link = pvfs2_link,
+	.unlink = pvfs2_unlink,
+	.symlink = pvfs2_symlink,
+	.mkdir = pvfs2_mkdir,
+	.rmdir = pvfs2_unlink,
+	.mknod = pvfs2_mknod,
+	.rename = pvfs2_rename,
+	.setattr = pvfs2_setattr,
+	.getattr = pvfs2_getattr,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = pvfs2_listxattr,
+};
diff --git a/fs/orangefs/pvfs2-bufmap.c b/fs/orangefs/pvfs2-bufmap.c
new file mode 100644
index 000000000000..aa14c37d0216
--- /dev/null
+++ b/fs/orangefs/pvfs2-bufmap.c
@@ -0,0 +1,970 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/* woken by pvfs_bufmap_initialize() once a bufmap has been installed */
+DECLARE_WAIT_QUEUE_HEAD(pvfs2_bufmap_init_waitq);
+
+/*
+ * Book-keeping for the user buffer described by a PVFS_dev_map_desc.
+ * The single live instance is reachable through __pvfs2_bufmap.
+ */
+struct pvfs2_bufmap {
+	/* starts at 1 in pvfs2_bufmap_alloc(); see pvfs2_bufmap_ref/unref */
+	atomic_t refcnt;
+
+	int desc_size;		/* user_desc->size */
+	int desc_shift;		/* ilog2(desc_size) */
+	int desc_count;		/* user_desc->count */
+	int total_size;		/* user_desc->total_size */
+	int page_count;		/* total_size / PAGE_SIZE */
+
+	/* pages filled in by get_user_pages() in pvfs2_bufmap_map() */
+	struct page **page_array;
+	/* desc_count entries, each pointing into page_array */
+	struct pvfs_bufmap_desc *desc_array;
+
+	/* array to track usage of buffer descriptors (0 free, 1 in use) */
+	int *buffer_index_array;
+	spinlock_t buffer_index_lock;
+
+	/* array to track usage of buffer descriptors for readdir */
+	int readdir_index_array[PVFS2_READDIR_DEFAULT_DESC_COUNT];
+	spinlock_t readdir_index_lock;
+} *__pvfs2_bufmap;
+
+/* held while __pvfs2_bufmap is installed, referenced or cleared */
+static DEFINE_SPINLOCK(pvfs2_bufmap_lock);
+
+/*
+ * Release every page recorded in @bufmap->page_array.  The array itself
+ * is not freed here; pvfs2_bufmap_free() does that.
+ */
+static void
+pvfs2_bufmap_unmap(struct pvfs2_bufmap *bufmap)
+{
+	int idx = 0;
+
+	while (idx < bufmap->page_count) {
+		page_cache_release(bufmap->page_array[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Free the three arrays owned by @bufmap and then @bufmap itself.
+ * The pages referenced from page_array are not released here; see
+ * pvfs2_bufmap_unmap().
+ */
+static void
+pvfs2_bufmap_free(struct pvfs2_bufmap *bufmap)
+{
+	kfree(bufmap->page_array);
+	kfree(bufmap->desc_array);
+	kfree(bufmap->buffer_index_array);
+	kfree(bufmap);
+}
+
+/*
+ * Take a reference on the installed bufmap.
+ *
+ * Returns the bufmap with its refcnt incremented, or NULL when none is
+ * installed.  A non-NULL result must be balanced by pvfs2_bufmap_unref().
+ */
+struct pvfs2_bufmap *pvfs2_bufmap_ref(void)
+{
+	struct pvfs2_bufmap *map;
+
+	spin_lock(&pvfs2_bufmap_lock);
+	map = __pvfs2_bufmap;
+	if (map != NULL)
+		atomic_inc(&map->refcnt);
+	spin_unlock(&pvfs2_bufmap_lock);
+
+	return map;
+}
+
+/*
+ * Drop one reference on @bufmap.  When the count reaches zero the global
+ * pointer is cleared under pvfs2_bufmap_lock, and the pages and memory
+ * are then released outside the lock.  @bufmap must not be NULL.
+ */
+void pvfs2_bufmap_unref(struct pvfs2_bufmap *bufmap)
+{
+	/* not the last reference: nothing more to do */
+	if (!atomic_dec_and_lock(&bufmap->refcnt, &pvfs2_bufmap_lock))
+		return;
+
+	__pvfs2_bufmap = NULL;
+	spin_unlock(&pvfs2_bufmap_lock);
+
+	pvfs2_bufmap_unmap(bufmap);
+	pvfs2_bufmap_free(bufmap);
+}
+
+/*
+ * Report the size of one buffer descriptor (desc_size) of the installed
+ * bufmap, or 0 when no bufmap is installed.
+ */
+inline int pvfs_bufmap_size_query(void)
+{
+	struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref();
+	int size = 0;
+
+	/* only drop the reference if one was actually taken */
+	if (bufmap) {
+		size = bufmap->desc_size;
+		pvfs2_bufmap_unref(bufmap);
+	}
+	return size;
+}
+
+/*
+ * Report desc_shift (ilog2 of the descriptor size) of the installed
+ * bufmap, or 0 when no bufmap is installed.
+ */
+inline int pvfs_bufmap_shift_query(void)
+{
+	struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref();
+	int shift = 0;
+
+	/* only drop the reference if one was actually taken */
+	if (bufmap) {
+		shift = bufmap->desc_shift;
+		pvfs2_bufmap_unref(bufmap);
+	}
+	return shift;
+}
+
+/* sleepers waiting for an I/O slot resp. a readdir slot to be put back */
+static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
+static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);
+
+/*
+ * get_bufmap_init
+ *
+ * Reports whether a bufmap is currently installed, i.e. whether the
+ * shared memory system, including the buffer_index_array, is available.
+ * The global pointer is sampled without taking pvfs2_bufmap_lock.
+ *
+ * returns 1 if __pvfs2_bufmap is set, 0 otherwise
+ */
+int get_bufmap_init(void)
+{
+	return __pvfs2_bufmap ? 1 : 0;
+}
+
+
+/*
+ * Allocate a bufmap and its tracking arrays, sized from @user_desc.
+ *
+ * The returned bufmap has refcnt 1 and zeroed arrays; no pages have been
+ * pinned yet (see pvfs2_bufmap_map()).  The values in @user_desc are
+ * used as given; this function performs no validation of its own.
+ *
+ * Returns the new bufmap, or NULL if any allocation fails (everything
+ * allocated so far is freed again).
+ */
+static struct pvfs2_bufmap *
+pvfs2_bufmap_alloc(struct PVFS_dev_map_desc *user_desc)
+{
+	struct pvfs2_bufmap *bufmap;
+
+	bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL);
+	if (!bufmap)
+		goto out;
+
+	atomic_set(&bufmap->refcnt, 1);
+	bufmap->total_size = user_desc->total_size;
+	bufmap->desc_count = user_desc->count;
+	bufmap->desc_size = user_desc->size;
+	bufmap->desc_shift = ilog2(bufmap->desc_size);
+
+	spin_lock_init(&bufmap->buffer_index_lock);
+	/* one in-use flag per descriptor, all initially 0 (free) */
+	bufmap->buffer_index_array =
+		kcalloc(bufmap->desc_count, sizeof(int), GFP_KERNEL);
+	if (!bufmap->buffer_index_array) {
+		gossip_err("pvfs2: could not allocate %d buffer indices\n",
+				bufmap->desc_count);
+		goto out_free_bufmap;
+	}
+	spin_lock_init(&bufmap->readdir_index_lock);
+
+	bufmap->desc_array =
+		kcalloc(bufmap->desc_count, sizeof(struct pvfs_bufmap_desc),
+			GFP_KERNEL);
+	if (!bufmap->desc_array) {
+		gossip_err("pvfs2: could not allocate %d descriptors\n",
+				bufmap->desc_count);
+		goto out_free_index_array;
+	}
+
+	bufmap->page_count = bufmap->total_size / PAGE_SIZE;
+
+	/* allocate storage to track our page mappings */
+	bufmap->page_array =
+		kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL);
+	if (!bufmap->page_array)
+		goto out_free_desc_array;
+
+	return bufmap;
+
+	/* unwind in reverse order of allocation */
+out_free_desc_array:
+	kfree(bufmap->desc_array);
+out_free_index_array:
+	kfree(bufmap->buffer_index_array);
+out_free_bufmap:
+	kfree(bufmap);
+out:
+	return NULL;
+}
+
+/*
+ * Pin the user pages starting at @user_desc->ptr into
+ * @bufmap->page_array and carve them into @bufmap->desc_count
+ * descriptors of desc_size / PAGE_SIZE pages each.
+ *
+ * Returns 0 on success, the negative error from get_user_pages(), or
+ * -ENOMEM when fewer than page_count pages could be obtained (the pages
+ * that were obtained are released again).  On failure the caller still
+ * owns @bufmap and its arrays.
+ */
+static int
+pvfs2_bufmap_map(struct pvfs2_bufmap *bufmap,
+		struct PVFS_dev_map_desc *user_desc)
+{
+	int pages_per_desc = bufmap->desc_size / PAGE_SIZE;
+	int offset = 0, ret, i;
+
+	/* map the pages */
+	down_write(&current->mm->mmap_sem);
+	/*
+	 * NOTE(review): the literal 1 and 0 are presumably the write and
+	 * force arguments of this kernel's get_user_pages() -- confirm.
+	 */
+	ret = get_user_pages(current,
+			     current->mm,
+			     (unsigned long)user_desc->ptr,
+			     bufmap->page_count,
+			     1,
+			     0,
+			     bufmap->page_array,
+			     NULL);
+	up_write(&current->mm->mmap_sem);
+
+	if (ret < 0)
+		return ret;
+
+	/* a short count is treated as failure: give back what we got */
+	if (ret != bufmap->page_count) {
+		gossip_err("pvfs2 error: asked for %d pages, only got %d.\n",
+				bufmap->page_count, ret);
+
+		for (i = 0; i < ret; i++) {
+			SetPageError(bufmap->page_array[i]);
+			page_cache_release(bufmap->page_array[i]);
+		}
+		return -ENOMEM;
+	}
+
+	/*
+	 * ideally we want to get kernel space pointers for each page, but
+	 * we can't kmap that many pages at once if highmem is being used.
+	 * so instead, we just kmap/kunmap the page address each time the
+	 * kaddr is needed.
+	 */
+	for (i = 0; i < bufmap->page_count; i++)
+		flush_dcache_page(bufmap->page_array[i]);
+
+	/* build a list of available descriptors */
+	for (offset = 0, i = 0; i < bufmap->desc_count; i++) {
+		/* descriptor i covers pages [offset, offset + pages_per_desc) */
+		bufmap->desc_array[i].page_array = &bufmap->page_array[offset];
+		bufmap->desc_array[i].array_count = pages_per_desc;
+		bufmap->desc_array[i].uaddr =
+		    (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE));
+		offset += pages_per_desc;
+	}
+
+	return 0;
+}
+
+/*
+ * pvfs_bufmap_initialize()
+ *
+ * initializes the mapped buffer interface: validates @user_desc,
+ * allocates a bufmap, pins the user pages, installs the bufmap as
+ * __pvfs2_bufmap and wakes anyone sleeping on pvfs2_bufmap_init_waitq.
+ *
+ * returns 0 on success, -errno on failure:
+ *   -EINVAL    @user_desc fails one of the alignment/size checks
+ *   -ENOMEM    pvfs2_bufmap_alloc() failed
+ *   -EALREADY  a bufmap is already installed
+ *   otherwise the error from pvfs2_bufmap_map()
+ */
+int pvfs_bufmap_initialize(struct PVFS_dev_map_desc *user_desc)
+{
+	struct pvfs2_bufmap *bufmap;
+	int ret = -EINVAL;
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "pvfs_bufmap_initialize: called (ptr ("
+		     "%p) sz (%d) cnt(%d).\n",
+		     user_desc->ptr,
+		     user_desc->size,
+		     user_desc->count);
+
+	/*
+	 * sanity check alignment and size of buffer that caller wants to
+	 * work with
+	 */
+	if (PAGE_ALIGN((unsigned long)user_desc->ptr) !=
+	    (unsigned long)user_desc->ptr) {
+		gossip_err("pvfs2 error: memory alignment (front). %p\n",
+			   user_desc->ptr);
+		goto out;
+	}
+
+	if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size))
+	    != (unsigned long)(user_desc->ptr + user_desc->total_size)) {
+		gossip_err("pvfs2 error: memory alignment (back).(%p + %d)\n",
+			   user_desc->ptr,
+			   user_desc->total_size);
+		goto out;
+	}
+
+	/* the region must consist of exactly count descriptors of size */
+	if (user_desc->total_size != (user_desc->size * user_desc->count)) {
+		gossip_err("pvfs2 error: user provided an oddly sized buffer: (%d, %d, %d)\n",
+			   user_desc->total_size,
+			   user_desc->size,
+			   user_desc->count);
+		goto out;
+	}
+
+	if ((user_desc->size % PAGE_SIZE) != 0) {
+		gossip_err("pvfs2 error: bufmap size not page size divisible (%d).\n",
+			   user_desc->size);
+		goto out;
+	}
+
+	ret = -ENOMEM;
+	bufmap = pvfs2_bufmap_alloc(user_desc);
+	if (!bufmap)
+		goto out;
+
+	ret = pvfs2_bufmap_map(bufmap, user_desc);
+	if (ret)
+		goto out_free_bufmap;
+
+
+	/* install the new bufmap unless one is already in place */
+	spin_lock(&pvfs2_bufmap_lock);
+	if (__pvfs2_bufmap) {
+		spin_unlock(&pvfs2_bufmap_lock);
+		gossip_err("pvfs2: error: bufmap already initialized.\n");
+		ret = -EALREADY;
+		goto out_unmap_bufmap;
+	}
+	__pvfs2_bufmap = bufmap;
+	spin_unlock(&pvfs2_bufmap_lock);
+
+	/*
+	 * If there are operations in pvfs2_bufmap_init_waitq, wake them up.
+	 * This scenario occurs when the client-core is restarted and I/O
+	 * requests in the in-progress or waiting tables are restarted.  I/O
+	 * requests cannot be restarted until the shared memory system is
+	 * completely re-initialized, so we put the I/O requests in this
+	 * waitq until initialization has completed.  NOTE:  the I/O requests
+	 * are also on a timer, so they don't wait forever just in case the
+	 * client-core doesn't come back up.
+	 */
+	wake_up_interruptible(&pvfs2_bufmap_init_waitq);
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "pvfs_bufmap_initialize: exiting normally\n");
+	return 0;
+
+	/* pages are only released when mapping had succeeded */
+out_unmap_bufmap:
+	pvfs2_bufmap_unmap(bufmap);
+out_free_bufmap:
+	pvfs2_bufmap_free(bufmap);
+out:
+	return ret;
+}
+
+/*
+ * pvfs_bufmap_finalize()
+ *
+ * shuts down the mapped buffer interface by dropping the reference that
+ * pvfs2_bufmap_alloc() set up; the resources are released once the last
+ * reference is gone (see pvfs2_bufmap_unref()).
+ *
+ * Calling this while no bufmap is installed triggers BUG_ON().
+ *
+ * no return value
+ */
+void pvfs_bufmap_finalize(void)
+{
+	gossip_debug(GOSSIP_BUFMAP_DEBUG, "pvfs2_bufmap_finalize: called\n");
+	BUG_ON(!__pvfs2_bufmap);
+	pvfs2_bufmap_unref(__pvfs2_bufmap);
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "pvfs2_bufmap_finalize: exiting normally\n");
+}
+
+/*
+ * Describes one pool of slots so that wait_for_a_slot() and
+ * put_back_slot() can serve both the I/O and the readdir pools.
+ */
+struct slot_args {
+	int slot_count;			/* number of entries in slot_array */
+	int *slot_array;		/* 0 = free, 1 = in use */
+	spinlock_t *slot_lock;		/* protects slot_array */
+	wait_queue_head_t *slot_wq;	/* where waiters for a slot sleep */
+};
+
+/*
+ * Claim a free slot from the pool described by @slargs, sleeping on its
+ * wait queue until one is available.
+ *
+ * On success the slot is marked in use and its index is stored in
+ * *@buffer_index.
+ *
+ * Returns 0 on success, -ETIMEDOUT if schedule_timeout() ran for the
+ * full slot_timeout_secs without a wake-up, or -EINTR if a signal is
+ * pending.
+ */
+static int wait_for_a_slot(struct slot_args *slargs, int *buffer_index)
+{
+	int ret = -1;
+	int i = 0;
+	DECLARE_WAITQUEUE(my_wait, current);
+
+
+	add_wait_queue_exclusive(slargs->slot_wq, &my_wait);
+
+	while (1) {
+		/* set the state before scanning, then sleep if nothing is free */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/*
+		 * check for available desc, slot_lock is the appropriate
+		 * index_lock
+		 */
+		spin_lock(slargs->slot_lock);
+		for (i = 0; i < slargs->slot_count; i++)
+			if (slargs->slot_array[i] == 0) {
+				slargs->slot_array[i] = 1;
+				*buffer_index = i;
+				ret = 0;
+				break;
+			}
+		spin_unlock(slargs->slot_lock);
+
+		/* if we acquired a buffer, then break out of while */
+		if (ret == 0)
+			break;
+
+		if (!signal_pending(current)) {
+			int timeout =
+			    MSECS_TO_JIFFIES(1000 * slot_timeout_secs);
+			gossip_debug(GOSSIP_BUFMAP_DEBUG,
+				     "[BUFMAP]: waiting %d "
+				     "seconds for a slot\n",
+				     slot_timeout_secs);
+			/* a zero return means the whole timeout elapsed */
+			if (!schedule_timeout(timeout)) {
+				gossip_debug(GOSSIP_BUFMAP_DEBUG,
+					     "*** wait_for_a_slot timed out\n");
+				ret = -ETIMEDOUT;
+				break;
+			}
+			gossip_debug(GOSSIP_BUFMAP_DEBUG,
+			  "[BUFMAP]: woken up by a slot becoming available.\n");
+			/* rescan: another waiter may have taken the slot */
+			continue;
+		}
+
+		gossip_debug(GOSSIP_BUFMAP_DEBUG, "pvfs2: %s interrupted.\n",
+			     __func__);
+		ret = -EINTR;
+		break;
+	}
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(slargs->slot_wq, &my_wait);
+	return ret;
+}
+
+/*
+ * Mark slot @buffer_index of the pool described by @slargs as free and
+ * wake sleepers on the pool's wait queue.  An index outside
+ * [0, slot_count) is ignored and nobody is woken.
+ */
+static void put_back_slot(struct slot_args *slargs, int buffer_index)
+{
+	int in_range;
+
+	/* slot_lock is the appropriate index_lock */
+	spin_lock(slargs->slot_lock);
+	in_range = !(buffer_index < 0 || buffer_index >= slargs->slot_count);
+	if (in_range)
+		slargs->slot_array[buffer_index] = 0;
+	spin_unlock(slargs->slot_lock);
+
+	/* only a slot that was really released is worth a wake-up */
+	if (in_range)
+		wake_up_interruptible(slargs->slot_wq);
+}
+
+/*
+ * pvfs_bufmap_get()
+ *
+ * gets a free mapped buffer descriptor, will sleep until one becomes
+ * available if necessary
+ *
+ * On success *@mapp holds a referenced bufmap and *@buffer_index the
+ * claimed slot; both are handed back with pvfs_bufmap_put().  On failure
+ * no reference is held and *@mapp is left untouched, so the caller is
+ * never given a pointer whose reference has already been dropped.
+ *
+ * returns 0 on success, -errno on failure (-EIO when no bufmap is
+ * installed, otherwise the error from wait_for_a_slot())
+ */
+int pvfs_bufmap_get(struct pvfs2_bufmap **mapp, int *buffer_index)
+{
+	struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref();
+	struct slot_args slargs;
+	int ret;
+
+	if (!bufmap) {
+		gossip_err("pvfs2: please confirm that pvfs2-client daemon is running.\n");
+		return -EIO;
+	}
+
+	slargs.slot_count = bufmap->desc_count;
+	slargs.slot_array = bufmap->buffer_index_array;
+	slargs.slot_lock = &bufmap->buffer_index_lock;
+	slargs.slot_wq = &bufmap_waitq;
+	ret = wait_for_a_slot(&slargs, buffer_index);
+	if (ret) {
+		/* no slot: give the reference back, publish nothing */
+		pvfs2_bufmap_unref(bufmap);
+		return ret;
+	}
+	*mapp = bufmap;
+	return 0;
+}
+
+/*
+ * pvfs_bufmap_put()
+ *
+ * hands I/O slot @buffer_index back to @bufmap's pool and drops the
+ * bufmap reference that pvfs_bufmap_get() took
+ *
+ * no return value
+ */
+void pvfs_bufmap_put(struct pvfs2_bufmap *bufmap, int buffer_index)
+{
+	struct slot_args pool = {
+		.slot_count = bufmap->desc_count,
+		.slot_array = bufmap->buffer_index_array,
+		.slot_lock = &bufmap->buffer_index_lock,
+		.slot_wq = &bufmap_waitq,
+	};
+
+	put_back_slot(&pool, buffer_index);
+	pvfs2_bufmap_unref(bufmap);
+}
+
+/*
+ * readdir_index_get()
+ *
+ * gets a free descriptor, will sleep until one becomes
+ * available if necessary.
+ * Although the readdir buffers are not mapped into kernel space
+ * we could do that at a later point of time. Regardless, these
+ * indices are used by the client-core.
+ *
+ * On success *@mapp holds a referenced bufmap and *@buffer_index the
+ * claimed slot; both are handed back with readdir_index_put().  On
+ * failure no reference is held and *@mapp is left untouched, so the
+ * caller is never given a pointer whose reference has already been
+ * dropped.
+ *
+ * returns 0 on success, -errno on failure (-EIO when no bufmap is
+ * installed, otherwise the error from wait_for_a_slot())
+ */
+int readdir_index_get(struct pvfs2_bufmap **mapp, int *buffer_index)
+{
+	struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref();
+	struct slot_args slargs;
+	int ret;
+
+	if (!bufmap) {
+		gossip_err("pvfs2: please confirm that pvfs2-client daemon is running.\n");
+		return -EIO;
+	}
+
+	slargs.slot_count = PVFS2_READDIR_DEFAULT_DESC_COUNT;
+	slargs.slot_array = bufmap->readdir_index_array;
+	slargs.slot_lock = &bufmap->readdir_index_lock;
+	slargs.slot_wq = &readdir_waitq;
+	ret = wait_for_a_slot(&slargs, buffer_index);
+	if (ret) {
+		/* no slot: give the reference back, publish nothing */
+		pvfs2_bufmap_unref(bufmap);
+		return ret;
+	}
+	*mapp = bufmap;
+	return 0;
+}
+
+/*
+ * Hand readdir slot @buffer_index back to @bufmap's readdir pool and
+ * drop the bufmap reference that readdir_index_get() took.
+ */
+void readdir_index_put(struct pvfs2_bufmap *bufmap, int buffer_index)
+{
+	struct slot_args pool = {
+		.slot_count = PVFS2_READDIR_DEFAULT_DESC_COUNT,
+		.slot_array = bufmap->readdir_index_array,
+		.slot_lock = &bufmap->readdir_index_lock,
+		.slot_wq = &readdir_waitq,
+	};
+
+	put_back_slot(&pool, buffer_index);
+	pvfs2_bufmap_unref(bufmap);
+}
+
+/*
+ * pvfs_bufmap_copy_iovec_from_user()
+ *
+ * copies data from several user space addresses in an iovec
+ * to a mapped buffer
+ *
+ * @bufmap:       bufmap whose descriptor receives the data
+ * @buffer_index: descriptor to fill; it is used as given, without a
+ *                range check in this function
+ * @iov, @nr_segs: source segments (a private copy is modified, the
+ *                caller's array is left alone)
+ * @size:         total number of bytes to copy
+ *
+ * Note that the mapped buffer is a series of pages and therefore
+ * the copies have to be split by PAGE_SIZE bytes at a time.
+ * Note that this routine checks that summation of iov_len
+ * across all the elements of iov is equal to size.
+ *
+ * returns 0 on success, -errno on failure: -ENOMEM if the iovec copy
+ * cannot be allocated, -EINVAL if the segment lengths do not add up to
+ * @size, -EFAULT if copy_from_user() reports a failure
+ */
+int pvfs_bufmap_copy_iovec_from_user(struct pvfs2_bufmap *bufmap,
+				     int buffer_index,
+				     const struct iovec *iov,
+				     unsigned long nr_segs,
+				     size_t size)
+{
+	size_t ret = 0;
+	size_t amt_copied = 0;
+	size_t cur_copy_size = 0;
+	unsigned int to_page_offset = 0;
+	unsigned int to_page_index = 0;
+	void *to_kaddr = NULL;
+	void __user *from_addr = NULL;
+	struct iovec *copied_iovec = NULL;
+	struct pvfs_bufmap_desc *to;
+	unsigned int seg;
+	/* tmp_printer/tmp_int only feed the first-byte debug message below */
+	char *tmp_printer = NULL;
+	int tmp_int = 0;
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "pvfs_bufmap_copy_iovec_from_user: index %d, "
+		     "size %zd\n",
+		     buffer_index,
+		     size);
+
+	to = &bufmap->desc_array[buffer_index];
+
+	/*
+	 * copy the passed in iovec so that we can change some of its fields
+	 */
+	copied_iovec = kmalloc_array(nr_segs,
+				     sizeof(*copied_iovec),
+				     PVFS2_BUFMAP_GFP_FLAGS);
+	if (copied_iovec == NULL)
+		return -ENOMEM;
+
+	memcpy(copied_iovec, iov, nr_segs * sizeof(*copied_iovec));
+	/*
+	 * Go through each segment in the iovec and make sure that
+	 * the summation of iov_len matches the given size.
+	 */
+	for (seg = 0, amt_copied = 0; seg < nr_segs; seg++)
+		amt_copied += copied_iovec[seg].iov_len;
+	if (amt_copied != size) {
+		gossip_err(
+		    "pvfs2_bufmap_copy_iovec_from_user: computed total ("
+		    "%zd) is not equal to (%zd)\n",
+		    amt_copied,
+		    size);
+		kfree(copied_iovec);
+		return -EINVAL;
+	}
+
+	to_page_index = 0;
+	to_page_offset = 0;
+	amt_copied = 0;
+	seg = 0;
+	/*
+	 * Go through each segment in the iovec and copy its
+	 * buffer into the mapped buffer one page at a time though
+	 */
+	while (amt_copied < size) {
+		struct iovec *iv = &copied_iovec[seg];
+		int inc_to_page_index;
+
+		if (iv->iov_len < (PAGE_SIZE - to_page_offset)) {
+			/* segment ends before the page does: stay on this page */
+			cur_copy_size =
+			    PVFS_util_min(iv->iov_len, size - amt_copied);
+			seg++;
+			from_addr = iv->iov_base;
+			inc_to_page_index = 0;
+		} else if (iv->iov_len == (PAGE_SIZE - to_page_offset)) {
+			/* segment fills the page exactly: advance both */
+			cur_copy_size =
+			    PVFS_util_min(iv->iov_len, size - amt_copied);
+			seg++;
+			from_addr = iv->iov_base;
+			inc_to_page_index = 1;
+		} else {
+			/*
+			 * segment is longer than the room left on the page:
+			 * copy what fits and shrink the (copied) segment so
+			 * the remainder is picked up on the next page
+			 */
+			cur_copy_size =
+			    PVFS_util_min(PAGE_SIZE - to_page_offset,
+					  size - amt_copied);
+			from_addr = iv->iov_base;
+			iv->iov_base += cur_copy_size;
+			iv->iov_len -= cur_copy_size;
+			inc_to_page_index = 1;
+		}
+		to_kaddr = pvfs2_kmap(to->page_array[to_page_index]);
+		ret =
+		    copy_from_user(to_kaddr + to_page_offset,
+				   from_addr,
+				   cur_copy_size);
+		if (!PageReserved(to->page_array[to_page_index]))
+			SetPageDirty(to->page_array[to_page_index]);
+
+		/* debug aid: log the first destination byte, once per call */
+		if (!tmp_printer) {
+			tmp_printer = (char *)(to_kaddr + to_page_offset);
+			tmp_int += tmp_printer[0];
+			gossip_debug(GOSSIP_BUFMAP_DEBUG,
+				     "First character (integer value) in pvfs_bufmap_copy_from_user: %d\n",
+				     tmp_int);
+		}
+
+		pvfs2_kunmap(to->page_array[to_page_index]);
+		/* any non-zero result from copy_from_user() is a failure */
+		if (ret) {
+			gossip_err("Failed to copy data from user space\n");
+			kfree(copied_iovec);
+			return -EFAULT;
+		}
+
+		amt_copied += cur_copy_size;
+		if (inc_to_page_index) {
+			to_page_offset = 0;
+			to_page_index++;
+		} else {
+			to_page_offset += cur_copy_size;
+		}
+	}
+	kfree(copied_iovec);
+	return 0;
+}
+
+/*
+ * pvfs_bufmap_copy_iovec_from_kernel()
+ *
+ * copies data from several kernel space address's in an iovec
+ * to a mapped buffer
+ *
+ * Note that the mapped buffer is a series of pages and therefore
+ * the copies have to be split by PAGE_SIZE bytes at a time.
+ * Note that this routine checks that summation of iov_len
+ * across all the elements of iov is equal to size.
+ *
+ * returns 0 on success, -errno on failure
+ */
+int pvfs_bufmap_copy_iovec_from_kernel(struct pvfs2_bufmap *bufmap,
+		int buffer_index, const struct iovec *iov,
+		unsigned long nr_segs, size_t size)
+{
+	size_t amt_copied = 0;
+	size_t cur_copy_size = 0;
+	int to_page_index = 0;
+	void *to_kaddr = NULL;
+	void *from_kaddr = NULL;
+	struct iovec *copied_iovec = NULL;
+	struct pvfs_bufmap_desc *to;
+	unsigned int seg;
+	unsigned to_page_offset = 0;
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "pvfs_bufmap_copy_iovec_from_kernel: index %d, "
+		     "size %zd\n",
+		     buffer_index,
+		     size);
+
author	Mike Marshall <hubcap@omnibond.com>	2015-07-17 10:38:13 -0400
committer	Mike Marshall <hubcap@omnibond.com>	2015-10-03 11:39:55 -0400
commit	274dcf55bd4ab12af1cc1d3b77416285bef8ebf4 (patch)
tree	0b4bba244b5deccb8c853a2578e0e4c6b92d0bac /fs/orangefs
parent	5db11c21a929cd9d8c0484006efb1014fc723c93 (diff)