diff options
Diffstat (limited to 'fs/orangefs')
-rw-r--r-- | fs/orangefs/namei.c | 473 | ||||
-rw-r--r-- | fs/orangefs/pvfs2-bufmap.c | 970 | ||||
-rw-r--r-- | fs/orangefs/pvfs2-cache.c | 260 | ||||
-rw-r--r-- | fs/orangefs/pvfs2-debugfs.c | 458 | ||||
-rw-r--r-- | fs/orangefs/pvfs2-mod.c | 316 |
5 files changed, 2477 insertions, 0 deletions
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c new file mode 100644 index 000000000000..747fe6a690af --- /dev/null +++ b/fs/orangefs/namei.c @@ -0,0 +1,473 @@ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ + +/* + * Linux VFS namei operations. + */ + +#include "protocol.h" +#include "pvfs2-kernel.h" + +/* + * Get a newly allocated inode to go with a negative dentry. + */ +static int pvfs2_create(struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool exclusive) +{ + struct pvfs2_inode_s *parent = PVFS2_I(dir); + struct pvfs2_kernel_op_s *new_op; + struct inode *inode; + int ret; + + gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__); + + new_op = op_alloc(PVFS2_VFS_OP_CREATE); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.create.parent_refn = parent->refn; + + fill_default_sys_attrs(new_op->upcall.req.create.attributes, + PVFS_TYPE_METAFILE, mode); + + strncpy(new_op->upcall.req.create.d_name, + dentry->d_name.name, PVFS2_NAME_LEN); + + ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Create Got PVFS2 handle %pU on fsid %d (ret=%d)\n", + &new_op->downcall.resp.create.refn.khandle, + new_op->downcall.resp.create.refn.fs_id, ret); + + if (ret < 0) { + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: failed with error code %d\n", + __func__, ret); + goto out; + } + + inode = pvfs2_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, + &new_op->downcall.resp.create.refn); + if (IS_ERR(inode)) { + gossip_err("*** Failed to allocate pvfs2 file inode\n"); + ret = PTR_ERR(inode); + goto out; + } + + gossip_debug(GOSSIP_NAME_DEBUG, + "Assigned file inode new number of %pU\n", + get_khandle_from_ino(inode)); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Inode (Regular File) %pU -> %s\n", + get_khandle_from_ino(inode), + dentry->d_name.name); + + SetMtimeFlag(parent); + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + mark_inode_dirty_sync(dir); + ret = 0; +out: + op_release(new_op); + gossip_debug(GOSSIP_NAME_DEBUG, "%s: returning %d\n", __func__, ret); + return ret; +} + +/* + * Attempt to resolve an object name (dentry->d_name), parent handle, and + * fsid into a handle for the object. + */ +static struct dentry *pvfs2_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct pvfs2_inode_s *parent = PVFS2_I(dir); + struct pvfs2_kernel_op_s *new_op; + struct inode *inode; + struct dentry *res; + int ret = -EINVAL; + + /* + * in theory we could skip a lookup here (if the intent is to + * create) in order to avoid a potentially failed lookup, but + * leaving it in can skip a valid lookup and try to create a file + * that already exists (e.g. the vfs already handles checking for + * -EEXIST on O_EXCL opens, which is broken if we skip this lookup + * in the create path) + */ + gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n", + __func__, dentry->d_name.name); + + if (dentry->d_name.len > (PVFS2_NAME_LEN - 1)) + return ERR_PTR(-ENAMETOOLONG); + + new_op = op_alloc(PVFS2_VFS_OP_LOOKUP); + if (!new_op) + return ERR_PTR(-ENOMEM); + + new_op->upcall.req.lookup.sym_follow = flags & LOOKUP_FOLLOW; + + gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n", + __FILE__, + __func__, + __LINE__, + &parent->refn.khandle); + new_op->upcall.req.lookup.parent_refn = parent->refn; + + strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name, + PVFS2_NAME_LEN); + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: doing lookup on %s under %pU,%d (follow=%s)\n", + __func__, + new_op->upcall.req.lookup.d_name, + &new_op->upcall.req.lookup.parent_refn.khandle, + new_op->upcall.req.lookup.parent_refn.fs_id, + ((new_op->upcall.req.lookup.sym_follow == + PVFS2_LOOKUP_LINK_FOLLOW) ? "yes" : "no")); + + ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Lookup Got %pU, fsid %d (ret=%d)\n", + &new_op->downcall.resp.lookup.refn.khandle, + new_op->downcall.resp.lookup.refn.fs_id, + ret); + + if (ret < 0) { + if (ret == -ENOENT) { + /* + * if no inode was found, add a negative dentry to + * dcache anyway; if we don't, we don't hold expected + * lookup semantics and we most noticeably break + * during directory renames. + * + * however, if the operation failed or exited, do not + * add the dentry (e.g. in the case that a touch is + * issued on a file that already exists that was + * interrupted during this lookup -- no need to add + * another negative dentry for an existing file) + */ + + gossip_debug(GOSSIP_NAME_DEBUG, + "pvfs2_lookup: Adding *negative* dentry " + "%p for %s\n", + dentry, + dentry->d_name.name); + + d_add(dentry, NULL); + res = NULL; + goto out; + } + + /* must be a non-recoverable error */ + res = ERR_PTR(ret); + goto out; + } + + inode = pvfs2_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); + if (IS_ERR(inode)) { + gossip_debug(GOSSIP_NAME_DEBUG, + "error %ld from iget\n", PTR_ERR(inode)); + res = ERR_CAST(inode); + goto out; + } + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s:%s:%d " + "Found good inode [%lu] with count [%d]\n", + __FILE__, + __func__, + __LINE__, + inode->i_ino, + (int)atomic_read(&inode->i_count)); + + /* update dentry/inode pair into dcache */ + res = d_splice_alias(inode, dentry); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Lookup success (inode ct = %d)\n", + (int)atomic_read(&inode->i_count)); +out: + op_release(new_op); + return res; +} + +/* return 0 on success; non-zero otherwise */ +static int pvfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct pvfs2_inode_s *parent = PVFS2_I(dir); + struct pvfs2_kernel_op_s *new_op; + int ret; + + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: called on %s\n" + " (inode %pU): Parent is %pU | fs_id %d\n", + __func__, + dentry->d_name.name, + get_khandle_from_ino(inode), + &parent->refn.khandle, + parent->refn.fs_id); + + new_op = op_alloc(PVFS2_VFS_OP_REMOVE); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.remove.parent_refn = parent->refn; + strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name, + PVFS2_NAME_LEN); + + ret = service_operation(new_op, "pvfs2_unlink", + get_interruptible_flag(inode)); + + /* when request is serviced properly, free req op struct */ + op_release(new_op); + + if (!ret) { + drop_nlink(inode); + + SetMtimeFlag(parent); + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + mark_inode_dirty_sync(dir); + } + return ret; +} + +/* + * pvfs2_link() is only implemented here to make sure that we return a + * reasonable error code (the kernel will return a misleading EPERM + * otherwise). PVFS2 does not support hard links. + */ +static int pvfs2_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + return -EOPNOTSUPP; +} + +/* + * pvfs2_mknod() is only implemented here to make sure that we return a + * reasonable error code (the kernel will return a misleading EPERM + * otherwise). PVFS2 does not support special files such as fifos or devices. + */ +static int pvfs2_mknod(struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return -EOPNOTSUPP; +} + +static int pvfs2_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct pvfs2_inode_s *parent = PVFS2_I(dir); + struct pvfs2_kernel_op_s *new_op; + struct inode *inode; + int mode = 755; + int ret; + + gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__); + + if (!symname) + return -EINVAL; + + new_op = op_alloc(PVFS2_VFS_OP_SYMLINK); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.sym.parent_refn = parent->refn; + + fill_default_sys_attrs(new_op->upcall.req.sym.attributes, + PVFS_TYPE_SYMLINK, + mode); + + strncpy(new_op->upcall.req.sym.entry_name, + dentry->d_name.name, + PVFS2_NAME_LEN); + strncpy(new_op->upcall.req.sym.target, symname, PVFS2_NAME_LEN); + + ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Symlink Got PVFS2 handle %pU on fsid %d (ret=%d)\n", + &new_op->downcall.resp.sym.refn.khandle, + new_op->downcall.resp.sym.refn.fs_id, ret); + + if (ret < 0) { + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: failed with error code %d\n", + __func__, ret); + goto out; + } + + inode = pvfs2_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, + &new_op->downcall.resp.sym.refn); + if (IS_ERR(inode)) { + gossip_err + ("*** Failed to allocate pvfs2 symlink inode\n"); + ret = PTR_ERR(inode); + goto out; + } + + gossip_debug(GOSSIP_NAME_DEBUG, + "Assigned symlink inode new number of %pU\n", + get_khandle_from_ino(inode)); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Inode (Symlink) %pU -> %s\n", + get_khandle_from_ino(inode), + dentry->d_name.name); + + SetMtimeFlag(parent); + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + mark_inode_dirty_sync(dir); + ret = 0; +out: + op_release(new_op); + return ret; +} + +static int pvfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct pvfs2_inode_s *parent = PVFS2_I(dir); + struct pvfs2_kernel_op_s *new_op; + struct inode *inode; + int ret; + + new_op = op_alloc(PVFS2_VFS_OP_MKDIR); + if (!new_op) + return -ENOMEM; + + new_op->upcall.req.mkdir.parent_refn = parent->refn; + + fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes, + PVFS_TYPE_DIRECTORY, mode); + + strncpy(new_op->upcall.req.mkdir.d_name, + dentry->d_name.name, PVFS2_NAME_LEN); + + ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Mkdir Got PVFS2 handle %pU on fsid %d\n", + &new_op->downcall.resp.mkdir.refn.khandle, + new_op->downcall.resp.mkdir.refn.fs_id); + + if (ret < 0) { + gossip_debug(GOSSIP_NAME_DEBUG, + "%s: failed with error code %d\n", + __func__, ret); + goto out; + } + + inode = pvfs2_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, + &new_op->downcall.resp.mkdir.refn); + if (IS_ERR(inode)) { + gossip_err("*** Failed to allocate pvfs2 dir inode\n"); + ret = PTR_ERR(inode); + goto out; + } + + gossip_debug(GOSSIP_NAME_DEBUG, + "Assigned dir inode new number of %pU\n", + get_khandle_from_ino(inode)); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + gossip_debug(GOSSIP_NAME_DEBUG, + "Inode (Directory) %pU -> %s\n", + get_khandle_from_ino(inode), + dentry->d_name.name); + + /* + * NOTE: we have no good way to keep nlink consistent for directories + * across clients; keep constant at 1. + */ + SetMtimeFlag(parent); + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + mark_inode_dirty_sync(dir); +out: + op_release(new_op); + return ret; +} + +static int pvfs2_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct pvfs2_kernel_op_s *new_op; + int ret; + + gossip_debug(GOSSIP_NAME_DEBUG, + "pvfs2_rename: called (%s/%s => %s/%s) ct=%d\n", + old_dentry->d_parent->d_name.name, + old_dentry->d_name.name, + new_dentry->d_parent->d_name.name, + new_dentry->d_name.name, + d_count(new_dentry)); + + new_op = op_alloc(PVFS2_VFS_OP_RENAME); + if (!new_op) + return -EINVAL; + + new_op->upcall.req.rename.old_parent_refn = PVFS2_I(old_dir)->refn; + new_op->upcall.req.rename.new_parent_refn = PVFS2_I(new_dir)->refn; + + strncpy(new_op->upcall.req.rename.d_old_name, + old_dentry->d_name.name, + PVFS2_NAME_LEN); + strncpy(new_op->upcall.req.rename.d_new_name, + new_dentry->d_name.name, + PVFS2_NAME_LEN); + + ret = service_operation(new_op, + "pvfs2_rename", + get_interruptible_flag(old_dentry->d_inode)); + + gossip_debug(GOSSIP_NAME_DEBUG, + "pvfs2_rename: got downcall status %d\n", + ret); + + if (new_dentry->d_inode) + new_dentry->d_inode->i_ctime = CURRENT_TIME; + + op_release(new_op); + return ret; +} + +/* PVFS2 implementation of VFS inode operations for directories */ +struct inode_operations pvfs2_dir_inode_operations = { + .lookup = pvfs2_lookup, + .get_acl = pvfs2_get_acl, + .set_acl = pvfs2_set_acl, + .create = pvfs2_create, + .link = pvfs2_link, + .unlink = pvfs2_unlink, + .symlink = pvfs2_symlink, + .mkdir = pvfs2_mkdir, + .rmdir = pvfs2_unlink, + .mknod = pvfs2_mknod, + .rename = pvfs2_rename, + .setattr = pvfs2_setattr, + .getattr = pvfs2_getattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = pvfs2_listxattr, +}; diff --git a/fs/orangefs/pvfs2-bufmap.c b/fs/orangefs/pvfs2-bufmap.c new file mode 100644 index 000000000000..aa14c37d0216 --- /dev/null +++ b/fs/orangefs/pvfs2-bufmap.c @@ -0,0 +1,970 @@ +/* + * (C) 2001 Clemson University and The University of Chicago + * + * See COPYING in top-level directory. + */ +#include "protocol.h" +#include "pvfs2-kernel.h" +#include "pvfs2-bufmap.h" + +DECLARE_WAIT_QUEUE_HEAD(pvfs2_bufmap_init_waitq); + +struct pvfs2_bufmap { + atomic_t refcnt; + + int desc_size; + int desc_shift; + int desc_count; + int total_size; + int page_count; + + struct page **page_array; + struct pvfs_bufmap_desc *desc_array; + + /* array to track usage of buffer descriptors */ + int *buffer_index_array; + spinlock_t buffer_index_lock; + + /* array to track usage of buffer descriptors for readdir */ + int readdir_index_array[PVFS2_READDIR_DEFAULT_DESC_COUNT]; + spinlock_t readdir_index_lock; +} *__pvfs2_bufmap; + +static DEFINE_SPINLOCK(pvfs2_bufmap_lock); + +static void +pvfs2_bufmap_unmap(struct pvfs2_bufmap *bufmap) +{ + int i; + + for (i = 0; i < bufmap->page_count; i++) + page_cache_release(bufmap->page_array[i]); +} + +static void +pvfs2_bufmap_free(struct pvfs2_bufmap *bufmap) +{ + kfree(bufmap->page_array); + kfree(bufmap->desc_array); + kfree(bufmap->buffer_index_array); + kfree(bufmap); +} + +struct pvfs2_bufmap *pvfs2_bufmap_ref(void) +{ + struct pvfs2_bufmap *bufmap = NULL; + + spin_lock(&pvfs2_bufmap_lock); + if (__pvfs2_bufmap) { + bufmap = __pvfs2_bufmap; + atomic_inc(&bufmap->refcnt); + } + spin_unlock(&pvfs2_bufmap_lock); + return bufmap; +} + +void pvfs2_bufmap_unref(struct pvfs2_bufmap *bufmap) +{ + if (atomic_dec_and_lock(&bufmap->refcnt, &pvfs2_bufmap_lock)) { + __pvfs2_bufmap = NULL; + spin_unlock(&pvfs2_bufmap_lock); + + pvfs2_bufmap_unmap(bufmap); + pvfs2_bufmap_free(bufmap); + } +} + +inline int pvfs_bufmap_size_query(void) +{ + struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref(); + int size = bufmap ? bufmap->desc_size : 0; + + pvfs2_bufmap_unref(bufmap); + return size; +} + +inline int pvfs_bufmap_shift_query(void) +{ + struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref(); + int shift = bufmap ? bufmap->desc_shift : 0; + + pvfs2_bufmap_unref(bufmap); + return shift; +} + +static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq); +static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq); + +/* + * get_bufmap_init + * + * If bufmap_init is 1, then the shared memory system, including the + * buffer_index_array, is available. Otherwise, it is not. + * + * returns the value of bufmap_init + */ +int get_bufmap_init(void) +{ + return __pvfs2_bufmap ? 1 : 0; +} + + +static struct pvfs2_bufmap * +pvfs2_bufmap_alloc(struct PVFS_dev_map_desc *user_desc) +{ + struct pvfs2_bufmap *bufmap; + + bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL); + if (!bufmap) + goto out; + + atomic_set(&bufmap->refcnt, 1); + bufmap->total_size = user_desc->total_size; + bufmap->desc_count = user_desc->count; + bufmap->desc_size = user_desc->size; + bufmap->desc_shift = ilog2(bufmap->desc_size); + + spin_lock_init(&bufmap->buffer_index_lock); + bufmap->buffer_index_array = + kcalloc(bufmap->desc_count, sizeof(int), GFP_KERNEL); + if (!bufmap->buffer_index_array) { + gossip_err("pvfs2: could not allocate %d buffer indices\n", + bufmap->desc_count); + goto out_free_bufmap; + } + spin_lock_init(&bufmap->readdir_index_lock); + + bufmap->desc_array = + kcalloc(bufmap->desc_count, sizeof(struct pvfs_bufmap_desc), + GFP_KERNEL); + if (!bufmap->desc_array) { + gossip_err("pvfs2: could not allocate %d descriptors\n", + bufmap->desc_count); + goto out_free_index_array; + } + + bufmap->page_count = bufmap->total_size / PAGE_SIZE; + + /* allocate storage to track our page mappings */ + bufmap->page_array = + kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL); + if (!bufmap->page_array) + goto out_free_desc_array; + + return bufmap; + +out_free_desc_array: + kfree(bufmap->desc_array); +out_free_index_array: + kfree(bufmap->buffer_index_array); +out_free_bufmap: + kfree(bufmap); +out: + return NULL; +} + +static int +pvfs2_bufmap_map(struct pvfs2_bufmap *bufmap, + struct PVFS_dev_map_desc *user_desc) +{ + int pages_per_desc = bufmap->desc_size / PAGE_SIZE; + int offset = 0, ret, i; + + /* map the pages */ + down_write(¤t->mm->mmap_sem); + ret = get_user_pages(current, + current->mm, + (unsigned long)user_desc->ptr, + bufmap->page_count, + 1, + 0, + bufmap->page_array, + NULL); + up_write(¤t->mm->mmap_sem); + + if (ret < 0) + return ret; + + if (ret != bufmap->page_count) { + gossip_err("pvfs2 error: asked for %d pages, only got %d.\n", + bufmap->page_count, ret); + + for (i = 0; i < ret; i++) { + SetPageError(bufmap->page_array[i]); + page_cache_release(bufmap->page_array[i]); + } + return -ENOMEM; + } + + /* + * ideally we want to get kernel space pointers for each page, but + * we can't kmap that many pages at once if highmem is being used. + * so instead, we just kmap/kunmap the page address each time the + * kaddr is needed. + */ + for (i = 0; i < bufmap->page_count; i++) + flush_dcache_page(bufmap->page_array[i]); + + /* build a list of available descriptors */ + for (offset = 0, i = 0; i < bufmap->desc_count; i++) { + bufmap->desc_array[i].page_array = &bufmap->page_array[offset]; + bufmap->desc_array[i].array_count = pages_per_desc; + bufmap->desc_array[i].uaddr = + (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE)); + offset += pages_per_desc; + } + + return 0; +} + +/* + * pvfs_bufmap_initialize() + * + * initializes the mapped buffer interface + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_initialize(struct PVFS_dev_map_desc *user_desc) +{ + struct pvfs2_bufmap *bufmap; + int ret = -EINVAL; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_initialize: called (ptr (" + "%p) sz (%d) cnt(%d).\n", + user_desc->ptr, + user_desc->size, + user_desc->count); + + /* + * sanity check alignment and size of buffer that caller wants to + * work with + */ + if (PAGE_ALIGN((unsigned long)user_desc->ptr) != + (unsigned long)user_desc->ptr) { + gossip_err("pvfs2 error: memory alignment (front). %p\n", + user_desc->ptr); + goto out; + } + + if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size)) + != (unsigned long)(user_desc->ptr + user_desc->total_size)) { + gossip_err("pvfs2 error: memory alignment (back).(%p + %d)\n", + user_desc->ptr, + user_desc->total_size); + goto out; + } + + if (user_desc->total_size != (user_desc->size * user_desc->count)) { + gossip_err("pvfs2 error: user provided an oddly sized buffer: (%d, %d, %d)\n", + user_desc->total_size, + user_desc->size, + user_desc->count); + goto out; + } + + if ((user_desc->size % PAGE_SIZE) != 0) { + gossip_err("pvfs2 error: bufmap size not page size divisible (%d).\n", + user_desc->size); + goto out; + } + + ret = -ENOMEM; + bufmap = pvfs2_bufmap_alloc(user_desc); + if (!bufmap) + goto out; + + ret = pvfs2_bufmap_map(bufmap, user_desc); + if (ret) + goto out_free_bufmap; + + + spin_lock(&pvfs2_bufmap_lock); + if (__pvfs2_bufmap) { + spin_unlock(&pvfs2_bufmap_lock); + gossip_err("pvfs2: error: bufmap already initialized.\n"); + ret = -EALREADY; + goto out_unmap_bufmap; + } + __pvfs2_bufmap = bufmap; + spin_unlock(&pvfs2_bufmap_lock); + + /* + * If there are operations in pvfs2_bufmap_init_waitq, wake them up. + * This scenario occurs when the client-core is restarted and I/O + * requests in the in-progress or waiting tables are restarted. I/O + * requests cannot be restarted until the shared memory system is + * completely re-initialized, so we put the I/O requests in this + * waitq until initialization has completed. NOTE: the I/O requests + * are also on a timer, so they don't wait forever just in case the + * client-core doesn't come back up. + */ + wake_up_interruptible(&pvfs2_bufmap_init_waitq); + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_initialize: exiting normally\n"); + return 0; + +out_unmap_bufmap: + pvfs2_bufmap_unmap(bufmap); +out_free_bufmap: + pvfs2_bufmap_free(bufmap); +out: + return ret; +} + +/* + * pvfs_bufmap_finalize() + * + * shuts down the mapped buffer interface and releases any resources + * associated with it + * + * no return value + */ +void pvfs_bufmap_finalize(void) +{ + gossip_debug(GOSSIP_BUFMAP_DEBUG, "pvfs2_bufmap_finalize: called\n"); + BUG_ON(!__pvfs2_bufmap); + pvfs2_bufmap_unref(__pvfs2_bufmap); + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs2_bufmap_finalize: exiting normally\n"); +} + +struct slot_args { + int slot_count; + int *slot_array; + spinlock_t *slot_lock; + wait_queue_head_t *slot_wq; +}; + +static int wait_for_a_slot(struct slot_args *slargs, int *buffer_index) +{ + int ret = -1; + int i = 0; + DECLARE_WAITQUEUE(my_wait, current); + + + add_wait_queue_exclusive(slargs->slot_wq, &my_wait); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + /* + * check for available desc, slot_lock is the appropriate + * index_lock + */ + spin_lock(slargs->slot_lock); + for (i = 0; i < slargs->slot_count; i++) + if (slargs->slot_array[i] == 0) { + slargs->slot_array[i] = 1; + *buffer_index = i; + ret = 0; + break; + } + spin_unlock(slargs->slot_lock); + + /* if we acquired a buffer, then break out of while */ + if (ret == 0) + break; + + if (!signal_pending(current)) { + int timeout = + MSECS_TO_JIFFIES(1000 * slot_timeout_secs); + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "[BUFMAP]: waiting %d " + "seconds for a slot\n", + slot_timeout_secs); + if (!schedule_timeout(timeout)) { + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "*** wait_for_a_slot timed out\n"); + ret = -ETIMEDOUT; + break; + } + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "[BUFMAP]: woken up by a slot becoming available.\n"); + continue; + } + + gossip_debug(GOSSIP_BUFMAP_DEBUG, "pvfs2: %s interrupted.\n", + __func__); + ret = -EINTR; + break; + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(slargs->slot_wq, &my_wait); + return ret; +} + +static void put_back_slot(struct slot_args *slargs, int buffer_index) +{ + /* slot_lock is the appropriate index_lock */ + spin_lock(slargs->slot_lock); + if (buffer_index < 0 || buffer_index >= slargs->slot_count) { + spin_unlock(slargs->slot_lock); + return; + } + + /* put the desc back on the queue */ + slargs->slot_array[buffer_index] = 0; + spin_unlock(slargs->slot_lock); + + /* wake up anyone who may be sleeping on the queue */ + wake_up_interruptible(slargs->slot_wq); +} + +/* + * pvfs_bufmap_get() + * + * gets a free mapped buffer descriptor, will sleep until one becomes + * available if necessary + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_get(struct pvfs2_bufmap **mapp, int *buffer_index) +{ + struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref(); + struct slot_args slargs; + int ret; + + if (!bufmap) { + gossip_err("pvfs2: please confirm that pvfs2-client daemon is running.\n"); + return -EIO; + } + + slargs.slot_count = bufmap->desc_count; + slargs.slot_array = bufmap->buffer_index_array; + slargs.slot_lock = &bufmap->buffer_index_lock; + slargs.slot_wq = &bufmap_waitq; + ret = wait_for_a_slot(&slargs, buffer_index); + if (ret) + pvfs2_bufmap_unref(bufmap); + *mapp = bufmap; + return ret; +} + +/* + * pvfs_bufmap_put() + * + * returns a mapped buffer descriptor to the collection + * + * no return value + */ +void pvfs_bufmap_put(struct pvfs2_bufmap *bufmap, int buffer_index) +{ + struct slot_args slargs; + + slargs.slot_count = bufmap->desc_count; + slargs.slot_array = bufmap->buffer_index_array; + slargs.slot_lock = &bufmap->buffer_index_lock; + slargs.slot_wq = &bufmap_waitq; + put_back_slot(&slargs, buffer_index); + pvfs2_bufmap_unref(bufmap); +} + +/* + * readdir_index_get() + * + * gets a free descriptor, will sleep until one becomes + * available if necessary. + * Although the readdir buffers are not mapped into kernel space + * we could do that at a later point of time. Regardless, these + * indices are used by the client-core. + * + * returns 0 on success, -errno on failure + */ +int readdir_index_get(struct pvfs2_bufmap **mapp, int *buffer_index) +{ + struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref(); + struct slot_args slargs; + int ret; + + if (!bufmap) { + gossip_err("pvfs2: please confirm that pvfs2-client daemon is running.\n"); + return -EIO; + } + + slargs.slot_count = PVFS2_READDIR_DEFAULT_DESC_COUNT; + slargs.slot_array = bufmap->readdir_index_array; + slargs.slot_lock = &bufmap->readdir_index_lock; + slargs.slot_wq = &readdir_waitq; + ret = wait_for_a_slot(&slargs, buffer_index); + if (ret) + pvfs2_bufmap_unref(bufmap); + *mapp = bufmap; + return ret; +} + +void readdir_index_put(struct pvfs2_bufmap *bufmap, int buffer_index) +{ + struct slot_args slargs; + + slargs.slot_count = PVFS2_READDIR_DEFAULT_DESC_COUNT; + slargs.slot_array = bufmap->readdir_index_array; + slargs.slot_lock = &bufmap->readdir_index_lock; + slargs.slot_wq = &readdir_waitq; + put_back_slot(&slargs, buffer_index); + pvfs2_bufmap_unref(bufmap); +} + +/* + * pvfs_bufmap_copy_iovec_from_user() + * + * copies data from several user space address's in an iovec + * to a mapped buffer + * + * Note that the mapped buffer is a series of pages and therefore + * the copies have to be split by PAGE_SIZE bytes at a time. + * Note that this routine checks that summation of iov_len + * across all the elements of iov is equal to size. + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_copy_iovec_from_user(struct pvfs2_bufmap *bufmap, + int buffer_index, + const struct iovec *iov, + unsigned long nr_segs, + size_t size) +{ + size_t ret = 0; + size_t amt_copied = 0; + size_t cur_copy_size = 0; + unsigned int to_page_offset = 0; + unsigned int to_page_index = 0; + void *to_kaddr = NULL; + void __user *from_addr = NULL; + struct iovec *copied_iovec = NULL; + struct pvfs_bufmap_desc *to; + unsigned int seg; + char *tmp_printer = NULL; + int tmp_int = 0; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_copy_iovec_from_user: index %d, " + "size %zd\n", + buffer_index, + size); + + to = &bufmap->desc_array[buffer_index]; + + /* + * copy the passed in iovec so that we can change some of its fields + */ + copied_iovec = kmalloc_array(nr_segs, + sizeof(*copied_iovec), + PVFS2_BUFMAP_GFP_FLAGS); + if (copied_iovec == NULL) + return -ENOMEM; + + memcpy(copied_iovec, iov, nr_segs * sizeof(*copied_iovec)); + /* + * Go through each segment in the iovec and make sure that + * the summation of iov_len matches the given size. + */ + for (seg = 0, amt_copied = 0; seg < nr_segs; seg++) + amt_copied += copied_iovec[seg].iov_len; + if (amt_copied != size) { + gossip_err( + "pvfs2_bufmap_copy_iovec_from_user: computed total (" + "%zd) is not equal to (%zd)\n", + amt_copied, + size); + kfree(copied_iovec); + return -EINVAL; + } + + to_page_index = 0; + to_page_offset = 0; + amt_copied = 0; + seg = 0; + /* + * Go through each segment in the iovec and copy its + * buffer into the mapped buffer one page at a time though + */ + while (amt_copied < size) { + struct iovec *iv = &copied_iovec[seg]; + int inc_to_page_index; + + if (iv->iov_len < (PAGE_SIZE - to_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + from_addr = iv->iov_base; + inc_to_page_index = 0; + } else if (iv->iov_len == (PAGE_SIZE - to_page_offset)) { + cur_copy_size = + PVFS_util_min(iv->iov_len, size - amt_copied); + seg++; + from_addr = iv->iov_base; + inc_to_page_index = 1; + } else { + cur_copy_size = + PVFS_util_min(PAGE_SIZE - to_page_offset, + size - amt_copied); + from_addr = iv->iov_base; + iv->iov_base += cur_copy_size; + iv->iov_len -= cur_copy_size; + inc_to_page_index = 1; + } + to_kaddr = pvfs2_kmap(to->page_array[to_page_index]); + ret = + copy_from_user(to_kaddr + to_page_offset, + from_addr, + cur_copy_size); + if (!PageReserved(to->page_array[to_page_index])) + SetPageDirty(to->page_array[to_page_index]); + + if (!tmp_printer) { + tmp_printer = (char *)(to_kaddr + to_page_offset); + tmp_int += tmp_printer[0]; + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "First character (integer value) in pvfs_bufmap_copy_from_user: %d\n", + tmp_int); + } + + pvfs2_kunmap(to->page_array[to_page_index]); + if (ret) { + gossip_err("Failed to copy data from user space\n"); + kfree(copied_iovec); + return -EFAULT; + } + + amt_copied += cur_copy_size; + if (inc_to_page_index) { + to_page_offset = 0; + to_page_index++; + } else { + to_page_offset += cur_copy_size; + } + } + kfree(copied_iovec); + return 0; +} + +/* + * pvfs_bufmap_copy_iovec_from_kernel() + * + * copies data from several kernel space address's in an iovec + * to a mapped buffer + * + * Note that the mapped buffer is a series of pages and therefore + * the copies have to be split by PAGE_SIZE bytes at a time. + * Note that this routine checks that summation of iov_len + * across all the elements of iov is equal to size. + * + * returns 0 on success, -errno on failure + */ +int pvfs_bufmap_copy_iovec_from_kernel(struct pvfs2_bufmap *bufmap, + int buffer_index, const struct iovec *iov, + unsigned long nr_segs, size_t size) +{ + size_t amt_copied = 0; + size_t cur_copy_size = 0; + int to_page_index = 0; + void *to_kaddr = NULL; + void *from_kaddr = NULL; + struct iovec *copied_iovec = NULL; + struct pvfs_bufmap_desc *to; + unsigned int seg; + unsigned to_page_offset = 0; + + gossip_debug(GOSSIP_BUFMAP_DEBUG, + "pvfs_bufmap_copy_iovec_from_kernel: index %d, " + "size %zd\n", + buffer_index, + size); + + to = &bufmap->desc_array[buffer_index]; + /* + * copy the passed in iovec so that we can change some of its fields + */ + copied_iovec = kmalloc_array(nr_segs, + sizeof(*copied_iovec), + PVFS2_BUFMAP_GFP_FLAGS); + if (copied_iovec == NULL) + return -ENOMEM; + + memcpy(copied_iovec, iov, nr_segs * sizeof(*copied_iovec)); + /* + * Go through each segment in the iovec and make sure that + * the summation of iov_len matches the given size. + */ + for (seg = 0, amt_copied = 0; seg < nr_segs; seg++) + amt_copied += copied_iovec[seg].iov_len; + if (amt_copied != size) { + gossip_err("pvfs2_bufmap_copy_iovec_from_kernel: computed total(%zd) is not equal to (%zd)\n", + amt_copied, + size); + kfree(copied_iovec); + return -EINVAL; + } |