From f727a0c324ce2c7e7cbe478d22895bf7bc8ed0a6 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:46 -0500 Subject: staging/hfi1: Add function stubs for TID caching Add mmu notify helper functions and TID caching function stubs in preparation for the TID caching implementation. TID caching makes use of the MMU notifier to allow the driver to respond to the user freeing memory which is allocated to the HFI. This patch implements the basic MMU notifier functions to insert, find and remove buffer pages from memory based on the mmu_notifier being invoked. In addition it places stubs in place for the main entry points by follow on code. Follow up patches will complete the implementation of the interaction with user space and makes use of these functions. Signed-off-by: Mitko Haralanov Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/Kconfig | 1 + drivers/staging/rdma/hfi1/Makefile | 2 +- drivers/staging/rdma/hfi1/hfi.h | 4 + drivers/staging/rdma/hfi1/user_exp_rcv.c | 264 +++++++++++++++++++++++++++++++ drivers/staging/rdma/hfi1/user_exp_rcv.h | 8 + 5 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 drivers/staging/rdma/hfi1/user_exp_rcv.c (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig index fd25078ee923..bd0249bcf199 100644 --- a/drivers/staging/rdma/hfi1/Kconfig +++ b/drivers/staging/rdma/hfi1/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_HFI1 tristate "Intel OPA Gen1 support" depends on X86_64 + select MMU_NOTIFIER default m ---help--- This is a low-level driver for Intel OPA Gen1 adapter. diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile index 68c5a315e557..e63251b9c56b 100644 --- a/drivers/staging/rdma/hfi1/Makefile +++ b/drivers/staging/rdma/hfi1/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o hfi1-y := chip.o cq.o device.o diag.o dma.o driver.o efivar.o eprom.o file_ops.o firmware.o \ init.o intr.o keys.o mad.o mmap.o mr.o pcie.o pio.o pio_copy.o \ qp.o qsfp.o rc.o ruc.o sdma.o srq.o sysfs.o trace.o twsi.o \ - uc.o ud.o user_pages.o user_sdma.o verbs_mcast.o verbs.o + uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs_mcast.o verbs.o hfi1-$(CONFIG_DEBUG_FS) += debugfs.o CFLAGS_trace.o = -I$(src) diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h index 2611bb2e764d..ddb21f0fffe7 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -65,6 +65,8 @@ #include #include #include +#include +#include #include "chip_registers.h" #include "common.h" @@ -1125,6 +1127,8 @@ struct hfi1_devdata { #define PT_EAGER 1 #define PT_INVALID 2 +struct mmu_rb_node; + /* Private data for file operations */ struct hfi1_filedata { struct hfi1_ctxtdata *uctxt; diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c new file mode 100644 index 000000000000..bafeddf67c8f --- /dev/null +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -0,0 +1,264 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include + +#include "user_exp_rcv.h" +#include "trace.h" + +struct mmu_rb_node { + struct rb_node rbnode; + unsigned long virt; + unsigned long phys; + unsigned long len; + struct tid_group *grp; + u32 rcventry; + dma_addr_t dma_addr; + bool freed; + unsigned npages; + struct page *pages[0]; +}; + +enum mmu_call_types { + MMU_INVALIDATE_PAGE = 0, + MMU_INVALIDATE_RANGE = 1 +}; + +static const char * const mmu_types[] = { + "PAGE", + "RANGE" +}; + +static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long, + unsigned long); +static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *, + unsigned long) __maybe_unused; +static inline struct mmu_rb_node *mmu_rb_search_by_entry(struct rb_root *, + u32); +static int mmu_rb_insert_by_addr(struct rb_root *, + struct mmu_rb_node *) __maybe_unused; +static int mmu_rb_insert_by_entry(struct rb_root *, + struct mmu_rb_node *) __maybe_unused; +static void mmu_notifier_mem_invalidate(struct mmu_notifier *, + unsigned long, unsigned long, + enum mmu_call_types); +static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *, + unsigned long); +static inline void mmu_notifier_range_start(struct mmu_notifier *, + struct mm_struct *, + unsigned long, unsigned long); + +static struct mmu_notifier_ops __maybe_unused mn_opts = { + .invalidate_page = mmu_notifier_page, + .invalidate_range_start = mmu_notifier_range_start, +}; + +/* + * Initialize context and file private data needed for Expected + * receive caching. This needs to be done after the context has + * been configured with the eager/expected RcvEntry counts. + */ +int hfi1_user_exp_rcv_init(struct file *fp) +{ + return -EINVAL; +} + +int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) +{ + return -EINVAL; +} + +int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) +{ + return -EINVAL; +} + +int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo) +{ + return -EINVAL; +} + +int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo) +{ + return -EINVAL; +} + +static inline void mmu_notifier_page(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long addr) +{ + mmu_notifier_mem_invalidate(mn, addr, addr + PAGE_SIZE, + MMU_INVALIDATE_PAGE); +} + +static inline void mmu_notifier_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + mmu_notifier_mem_invalidate(mn, start, end, MMU_INVALIDATE_RANGE); +} + +static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, + unsigned long start, unsigned long end, + enum mmu_call_types type) +{ + /* Stub for now */ +} + +static inline int mmu_addr_cmp(struct mmu_rb_node *node, unsigned long addr, + unsigned long len) +{ + if ((addr + len) <= node->virt) + return -1; + else if (addr >= node->virt && addr < (node->virt + node->len)) + return 0; + else + return 1; +} + +static inline int mmu_entry_cmp(struct mmu_rb_node *node, u32 entry) +{ + if (entry < node->rcventry) + return -1; + else if (entry > node->rcventry) + return 1; + else + return 0; +} + +static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *root, + unsigned long addr) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct mmu_rb_node *mnode = + container_of(node, struct mmu_rb_node, rbnode); + /* + * When searching, use at least one page length for size. The + * MMU notifier will not give us anything less than that. We + * also don't need anything more than a page because we are + * guaranteed to have non-overlapping buffers in the tree. + */ + int result = mmu_addr_cmp(mnode, addr, PAGE_SIZE); + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return mnode; + } + return NULL; +} + +static inline struct mmu_rb_node *mmu_rb_search_by_entry(struct rb_root *root, + u32 index) +{ + struct mmu_rb_node *rbnode; + struct rb_node *node; + + if (root && !RB_EMPTY_ROOT(root)) + for (node = rb_first(root); node; node = rb_next(node)) { + rbnode = rb_entry(node, struct mmu_rb_node, rbnode); + if (rbnode->rcventry == index) + return rbnode; + } + return NULL; +} + +static int mmu_rb_insert_by_entry(struct rb_root *root, + struct mmu_rb_node *node) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + + while (*new) { + struct mmu_rb_node *this = + container_of(*new, struct mmu_rb_node, rbnode); + int result = mmu_entry_cmp(this, node->rcventry); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + return 1; + } + + rb_link_node(&node->rbnode, parent, new); + rb_insert_color(&node->rbnode, root); + return 0; +} + +static int mmu_rb_insert_by_addr(struct rb_root *root, struct mmu_rb_node *node) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct mmu_rb_node *this = + container_of(*new, struct mmu_rb_node, rbnode); + int result = mmu_addr_cmp(this, node->virt, node->len); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + return 1; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&node->rbnode, parent, new); + rb_insert_color(&node->rbnode, root); + + return 0; +} diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.h b/drivers/staging/rdma/hfi1/user_exp_rcv.h index 4f4876e1d353..28ef98a45a1e 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.h +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.h @@ -50,6 +50,8 @@ * */ +#include "hfi.h" + #define EXP_TID_TIDLEN_MASK 0x7FFULL #define EXP_TID_TIDLEN_SHIFT 0 #define EXP_TID_TIDCTRL_MASK 0x3ULL @@ -71,4 +73,10 @@ (tid) |= EXP_TID_SET(field, (value)); \ } while (0) +int hfi1_user_exp_rcv_init(struct file *); +int hfi1_user_exp_rcv_free(struct hfi1_filedata *); +int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *); +int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *); +int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *); + #endif /* _HFI1_USER_EXP_RCV_H */ -- cgit v1.2.3 From 955ad36dcde4639664253c2bd39f626cd88d2acf Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:48 -0500 Subject: uapi/hfi1_user: Add command and event for TID caching TID caching will use a new event to signal userland that cache invalidation has occurred and needs a matching command code that will be used to read the invalidated TIDs. Add the event bit and the new command to the exported header file. The command is also added to the switch() statement in file_ops.c for completeness and in preparation for its usage later. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/file_ops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c index d57d549052c8..c66693532be0 100644 --- a/drivers/staging/rdma/hfi1/file_ops.c +++ b/drivers/staging/rdma/hfi1/file_ops.c @@ -241,6 +241,7 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, must_be_root = 1; /* validate user */ copy = 0; break; + case HFI1_CMD_TID_INVAL_READ: default: ret = -EINVAL; goto bail; -- cgit v1.2.3 From a86cd357e5be1b7eae3b399c02b972a92808c38a Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:49 -0500 Subject: staging/hfi1: Add definitions needed for TID cache In preparation for adding the TID caching support, there is a set of headers, structures, and variables which will be needed. This commit adds them to the hfi.h header file. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/hfi.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h index ddb21f0fffe7..51ecf45ef70b 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -179,6 +179,11 @@ struct ctxt_eager_bufs { } *rcvtids; }; +struct exp_tid_set { + struct list_head list; + u32 count; +}; + struct hfi1_ctxtdata { /* shadow the ctxt's RcvCtrl register */ u64 rcvctrl; @@ -247,6 +252,11 @@ struct hfi1_ctxtdata { struct page **tid_pg_list; /* dma handles for exp tid pages */ dma_addr_t *physshadow; + + struct exp_tid_set tid_group_list; + struct exp_tid_set tid_used_list; + struct exp_tid_set tid_full_list; + /* lock protecting all Expected TID data */ spinlock_t exp_lock; /* number of pio bufs for this ctxt (all procs, if shared) */ @@ -1137,6 +1147,16 @@ struct hfi1_filedata { struct hfi1_user_sdma_pkt_q *pq; /* for cpu affinity; -1 if none */ int rec_cpu_num; + struct mmu_notifier mn; + struct rb_root tid_rb_root; + spinlock_t tid_lock; /* protect tid_[limit,used] counters */ + u32 tid_limit; + u32 tid_used; + spinlock_t rb_lock; /* protect tid_rb_root RB tree */ + u32 *invalid_tids; + u32 invalid_tid_idx; + spinlock_t invalid_lock; /* protect the invalid_tids array */ + int (*mmu_rb_insert)(struct rb_root *, struct mmu_rb_node *); }; extern struct list_head hfi1_dev_list; -- cgit v1.2.3 From acac10fdd75a85b10a638381127f7bbed632580d Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:50 -0500 Subject: staging/hfi1: Remove un-needed variable There is no need to use a separate variable for a return value and a label when returning right away would do just as well. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/file_ops.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c index c66693532be0..76fe60315bb4 100644 --- a/drivers/staging/rdma/hfi1/file_ops.c +++ b/drivers/staging/rdma/hfi1/file_ops.c @@ -1037,22 +1037,19 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, static int init_subctxts(struct hfi1_ctxtdata *uctxt, const struct hfi1_user_info *uinfo) { - int ret = 0; unsigned num_subctxts; num_subctxts = uinfo->subctxt_cnt; - if (num_subctxts > HFI1_MAX_SHARED_CTXTS) { - ret = -EINVAL; - goto bail; - } + if (num_subctxts > HFI1_MAX_SHARED_CTXTS) + return -EINVAL; uctxt->subctxt_cnt = uinfo->subctxt_cnt; uctxt->subctxt_id = uinfo->subctxt_id; uctxt->active_slaves = 1; uctxt->redirect_seq_cnt = 1; set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); -bail: - return ret; + + return 0; } static int setup_subctxt(struct hfi1_ctxtdata *uctxt) -- cgit v1.2.3 From b8abe346737215c6ee6b50c01771b4ca1746801d Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:51 -0500 Subject: staging/hfi1: TID group definitions and support funcs Definitions and functions use to manage sets of TID/RcvArray groups. These will be used by the TID cacheline functionality coming with later patches. TID groups (or RcvArray groups) are groups of TID/RcvArray entries organized in sets of 8 and aligned on cacheline boundaries. The TID/RcvArray entries are managed in this way to make taking advantage of write-combining easier - each group is a entire cacheline. rcv_array_wc_fill() is provided to allow of generating writes to TIDs which are not currently being used in order to cause the flush of the write-combining buffer. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/user_exp_rcv.c | 64 ++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c index bafeddf67c8f..7f15024daab9 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.c +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -52,6 +52,14 @@ #include "user_exp_rcv.h" #include "trace.h" +struct tid_group { + struct list_head list; + unsigned base; + u8 size; + u8 used; + u8 map; +}; + struct mmu_rb_node { struct rb_node rbnode; unsigned long virt; @@ -75,6 +83,8 @@ static const char * const mmu_types[] = { "RANGE" }; +#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) + static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long, unsigned long); static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *, @@ -94,6 +104,43 @@ static inline void mmu_notifier_range_start(struct mmu_notifier *, struct mm_struct *, unsigned long, unsigned long); +static inline void exp_tid_group_init(struct exp_tid_set *set) +{ + INIT_LIST_HEAD(&set->list); + set->count = 0; +} + +static inline void tid_group_remove(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_del_init(&grp->list); + set->count--; +} + +static inline void tid_group_add_tail(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_add_tail(&grp->list, &set->list); + set->count++; +} + +static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) +{ + struct tid_group *grp = + list_first_entry(&set->list, struct tid_group, list); + list_del_init(&grp->list); + set->count--; + return grp; +} + +static inline void tid_group_move(struct tid_group *group, + struct exp_tid_set *s1, + struct exp_tid_set *s2) +{ + tid_group_remove(group, s1); + tid_group_add_tail(group, s2); +} + static struct mmu_notifier_ops __maybe_unused mn_opts = { .invalidate_page = mmu_notifier_page, .invalidate_range_start = mmu_notifier_range_start, @@ -114,6 +161,23 @@ int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) return -EINVAL; } +/* + * Write an "empty" RcvArray entry. + * This function exists so the TID registaration code can use it + * to write to unused/unneeded entries and still take advantage + * of the WC performance improvements. The HFI will ignore this + * write to the RcvArray entry. + */ +static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) +{ + /* + * Doing the WC fill writes only makes sense if the device is + * present and the RcvArray has been mapped as WC memory. + */ + if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) + writeq(0, dd->rcvarray_wc + (index * 8)); +} + int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) { return -EINVAL; -- cgit v1.2.3 From f88e0c8a139dc737b997876203885a3168c32e95 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:52 -0500 Subject: staging/hfi1: Add building blocks for TID caching Functions added by this patch are building blocks for the upcoming TID caching functionality. The functions added are currently unsed (and marked as such.) The functions' purposes are to find physically contigous pages in the user's virtual buffer, program the RcvArray group entries with these physical chunks, and unprogram the RcvArray groups. Reviewed-by: Ira Weiny Signed-off-by: Mitko Haralanov Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/user_exp_rcv.c | 337 +++++++++++++++++++++++++++++++ 1 file changed, 337 insertions(+) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c index 7f15024daab9..5a7e455b9f58 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.c +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -83,8 +83,20 @@ static const char * const mmu_types[] = { "RANGE" }; +struct tid_pageset { + u16 idx; + u16 count; +}; + #define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) +static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, + struct rb_root *) __maybe_unused; +static u32 find_phys_blocks(struct page **, unsigned, + struct tid_pageset *) __maybe_unused; +static int set_rcvarray_entry(struct file *, unsigned long, u32, + struct tid_group *, struct page **, + unsigned) __maybe_unused; static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long, unsigned long); static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *, @@ -103,6 +115,21 @@ static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *, static inline void mmu_notifier_range_start(struct mmu_notifier *, struct mm_struct *, unsigned long, unsigned long); +static int program_rcvarray(struct file *, unsigned long, struct tid_group *, + struct tid_pageset *, unsigned, u16, struct page **, + u32 *, unsigned *, unsigned *) __maybe_unused; +static int unprogram_rcvarray(struct file *, u32, + struct tid_group **) __maybe_unused; +static void clear_tid_node(struct hfi1_filedata *, u16, + struct mmu_rb_node *) __maybe_unused; + +static inline u32 rcventry2tidinfo(u32 rcventry) +{ + u32 pair = rcventry & ~0x1; + + return EXP_TID_SET(IDX, pair >> 1) | + EXP_TID_SET(CTRL, 1 << (rcventry - pair)); +} static inline void exp_tid_group_init(struct exp_tid_set *set) { @@ -193,6 +220,316 @@ int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo) return -EINVAL; } +static u32 find_phys_blocks(struct page **pages, unsigned npages, + struct tid_pageset *list) +{ + unsigned pagecount, pageidx, setcount = 0, i; + unsigned long pfn, this_pfn; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + pfn = page_to_pfn(pages[0]); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_pfn = i < npages ? page_to_pfn(pages[i]) : 0; + + /* + * If the pfn's are not sequential, pages are not physically + * contiguous. + */ + if (this_pfn != ++pfn) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. + */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + pfn = this_pfn; + } else { + pagecount++; + } + } + return setcount; +} + +/** + * program_rcvarray() - program an RcvArray group with receive buffers + * @fp: file pointer + * @vaddr: starting user virtual address + * @grp: RcvArray group + * @sets: array of struct tid_pageset holding information on physically + * contiguous chunks from the user buffer + * @start: starting index into sets array + * @count: number of struct tid_pageset's to program + * @pages: an array of struct page * for the user buffer + * @tidlist: the array of u32 elements when the information about the + * programmed RcvArray entries is to be encoded. + * @tididx: starting offset into tidlist + * @pmapped: (output parameter) number of pages programmed into the RcvArray + * entries. + * + * This function will program up to 'count' number of RcvArray entries from the + * group 'grp'. To make best use of write-combining writes, the function will + * perform writes to the unused RcvArray entries which will be ignored by the + * HW. Each RcvArray entry will be programmed with a physically contiguous + * buffer chunk from the user's virtual buffer. + * + * Return: + * -EINVAL if the requested count is larger than the size of the group, + * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or + * number of RcvArray entries programmed. + */ +static int program_rcvarray(struct file *fp, unsigned long vaddr, + struct tid_group *grp, + struct tid_pageset *sets, + unsigned start, u16 count, struct page **pages, + u32 *tidlist, unsigned *tididx, unsigned *pmapped) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + u16 idx; + u32 tidinfo = 0, rcventry, useidx = 0; + int mapped = 0; + + /* Count should never be larger than the group size */ + if (count > grp->size) + return -EINVAL; + + /* Find the first unused entry in the group */ + for (idx = 0; idx < grp->size; idx++) { + if (!(grp->map & (1 << idx))) { + useidx = idx; + break; + } + rcv_array_wc_fill(dd, grp->base + idx); + } + + idx = 0; + while (idx < count) { + u16 npages, pageidx, setidx = start + idx; + int ret = 0; + + /* + * If this entry in the group is used, move to the next one. + * If we go past the end of the group, exit the loop. + */ + if (useidx >= grp->size) { + break; + } else if (grp->map & (1 << useidx)) { + rcv_array_wc_fill(dd, grp->base + useidx); + useidx++; + continue; + } + + rcventry = grp->base + useidx; + npages = sets[setidx].count; + pageidx = sets[setidx].idx; + + ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE), + rcventry, grp, pages + pageidx, + npages); + if (ret) + return ret; + mapped += npages; + + tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) | + EXP_TID_SET(LEN, npages); + tidlist[(*tididx)++] = tidinfo; + grp->used++; + grp->map |= 1 << useidx++; + idx++; + } + + /* Fill the rest of the group with "blank" writes */ + for (; useidx < grp->size; useidx++) + rcv_array_wc_fill(dd, grp->base + useidx); + *pmapped = mapped; + return idx; +} + +static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, + u32 rcventry, struct tid_group *grp, + struct page **pages, unsigned npages) +{ + int ret; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct mmu_rb_node *node; + struct hfi1_devdata *dd = uctxt->dd; + struct rb_root *root = &fd->tid_rb_root; + dma_addr_t phys; + + /* + * Allocate the node first so we can handle a potential + * failure before we've programmed anything. + */ + node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages), + GFP_KERNEL); + if (!node) + return -ENOMEM; + + phys = pci_map_single(dd->pcidev, + __va(page_to_phys(pages[0])), + npages * PAGE_SIZE, PCI_DMA_FROMDEVICE); + if (dma_mapping_error(&dd->pcidev->dev, phys)) { + dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n", + phys); + kfree(node); + return -EFAULT; + } + + node->virt = vaddr; + node->phys = page_to_phys(pages[0]); + node->len = npages * PAGE_SIZE; + node->npages = npages; + node->rcventry = rcventry; + node->dma_addr = phys; + node->grp = grp; + node->freed = false; + memcpy(node->pages, pages, sizeof(struct page *) * npages); + + spin_lock(&fd->rb_lock); + ret = fd->mmu_rb_insert(root, node); + spin_unlock(&fd->rb_lock); + + if (ret) { + hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d", + node->rcventry, node->virt, node->phys, ret); + pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE, + PCI_DMA_FROMDEVICE); + kfree(node); + return -EFAULT; + } + hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1); + return 0; +} + +static int unprogram_rcvarray(struct file *fp, u32 tidinfo, + struct tid_group **grp) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + struct mmu_rb_node *node; + u8 tidctrl = EXP_TID_GET(tidinfo, CTRL); + u32 tidbase = uctxt->expected_base, + tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry; + + if (tididx >= uctxt->expected_count) { + dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", + tididx, uctxt->ctxt); + return -EINVAL; + } + + if (tidctrl == 0x3) + return -EINVAL; + + rcventry = tidbase + tididx + (tidctrl - 1); + + spin_lock(&fd->rb_lock); + node = mmu_rb_search_by_entry(&fd->tid_rb_root, rcventry); + if (!node) { + spin_unlock(&fd->rb_lock); + return -EBADF; + } + rb_erase(&node->rbnode, &fd->tid_rb_root); + spin_unlock(&fd->rb_lock); + if (grp) + *grp = node->grp; + clear_tid_node(fd, fd->subctxt, node); + return 0; +} + +static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt, + struct mmu_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + + hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0); + /* + * Make sure device has seen the write before we unpin the + * pages. + */ + flush_wc(); + + pci_unmap_single(dd->pcidev, node->dma_addr, node->len, + PCI_DMA_FROMDEVICE); + hfi1_release_user_pages(node->pages, node->npages, true); + + node->grp->used--; + node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); + + if (node->grp->used == node->grp->size - 1) + tid_group_move(node->grp, &uctxt->tid_full_list, + &uctxt->tid_used_list); + else if (!node->grp->used) + tid_group_move(node->grp, &uctxt->tid_used_list, + &uctxt->tid_group_list); + kfree(node); +} + +static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, + struct exp_tid_set *set, struct rb_root *root) +{ + struct tid_group *grp, *ptr; + struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata, + tid_rb_root); + int i; + + list_for_each_entry_safe(grp, ptr, &set->list, list) { + list_del_init(&grp->list); + + spin_lock(&fd->rb_lock); + for (i = 0; i < grp->size; i++) { + if (grp->map & (1 << i)) { + u16 rcventry = grp->base + i; + struct mmu_rb_node *node; + + node = mmu_rb_search_by_entry(root, rcventry); + if (!node) + continue; + rb_erase(&node->rbnode, root); + clear_tid_node(fd, -1, node); + } + } + spin_unlock(&fd->rb_lock); + } +} + static inline void mmu_notifier_page(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long addr) { -- cgit v1.2.3 From 463e6ebc86578ef3ff5bb500f6fc9449afaeea7e Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:53 -0500 Subject: staging/hfi1: Convert lock to mutex The exp_lock lock does not need to be a spinlock as all its uses are in process context and allowing the process to sleep when the mutex is contended might be beneficial. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/file_ops.c | 12 ++++++------ drivers/staging/rdma/hfi1/hfi.h | 2 +- drivers/staging/rdma/hfi1/init.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c index 76fe60315bb4..b0348263b901 100644 --- a/drivers/staging/rdma/hfi1/file_ops.c +++ b/drivers/staging/rdma/hfi1/file_ops.c @@ -1611,14 +1611,14 @@ static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo) * reserved, we don't need the lock anymore since we * are guaranteed the groups. */ - spin_lock(&uctxt->exp_lock); + mutex_lock(&uctxt->exp_lock); if (uctxt->tidusemap[useidx] == -1ULL || bitidx >= BITS_PER_LONG) { /* no free groups in the set, use the next */ useidx = (useidx + 1) % uctxt->tidmapcnt; idx++; bitidx = 0; - spin_unlock(&uctxt->exp_lock); + mutex_unlock(&uctxt->exp_lock); continue; } ngroups = ((npages - mapped) / dd->rcv_entries.group_size) + @@ -1635,13 +1635,13 @@ static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo) * as 0 because we don't check the entire bitmap but * we start from bitidx. */ - spin_unlock(&uctxt->exp_lock); + mutex_unlock(&uctxt->exp_lock); continue; } bits_used = min(free, ngroups); tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx; uctxt->tidusemap[useidx] |= tidmap[useidx]; - spin_unlock(&uctxt->exp_lock); + mutex_unlock(&uctxt->exp_lock); /* * At this point, we know where in the map we have free bits. @@ -1677,10 +1677,10 @@ static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo) * Let go of the bits that we reserved since we are not * going to use them. */ - spin_lock(&uctxt->exp_lock); + mutex_lock(&uctxt->exp_lock); uctxt->tidusemap[useidx] &= ~(((1ULL << bits_used) - 1) << bitidx); - spin_unlock(&uctxt->exp_lock); + mutex_unlock(&uctxt->exp_lock); goto done; } /* diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h index 51ecf45ef70b..53f464cc40ef 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -258,7 +258,7 @@ struct hfi1_ctxtdata { struct exp_tid_set tid_full_list; /* lock protecting all Expected TID data */ - spinlock_t exp_lock; + struct mutex exp_lock; /* number of pio bufs for this ctxt (all procs, if shared) */ u32 piocnt; /* first pio buffer for this ctxt */ diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index 4dd8051aba7e..72c51431b2bf 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -227,7 +227,7 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt) rcd->numa_id = numa_node_id(); rcd->rcv_array_groups = dd->rcv_entries.ngroups; - spin_lock_init(&rcd->exp_lock); + mutex_init(&rcd->exp_lock); /* * Calculate the context's RcvArray entry starting point. -- cgit v1.2.3 From 3abb33ac652135da9c3c36d9def73ede67e4ba03 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:54 -0500 Subject: staging/hfi1: Add TID cache receive init and free funcs The upcoming TID caching feature requires different data structures and, by extension, different initialization for each of the MPI processes. The two new functions (currently unused) perform the required initialization and freeing of required resources and structures. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/user_exp_rcv.c | 154 +++++++++++++++++++++++++++++-- 1 file changed, 144 insertions(+), 10 deletions(-) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c index 5a7e455b9f58..843023e2e2c7 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.c +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -90,23 +90,25 @@ struct tid_pageset { #define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) +#define num_user_pages(vaddr, len) \ + (1 + (((((unsigned long)(vaddr) + \ + (unsigned long)(len) - 1) & PAGE_MASK) - \ + ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT)) + static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, - struct rb_root *) __maybe_unused; + struct rb_root *); static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *) __maybe_unused; static int set_rcvarray_entry(struct file *, unsigned long, u32, - struct tid_group *, struct page **, - unsigned) __maybe_unused; + struct tid_group *, struct page **, unsigned); static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long, unsigned long); static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *, unsigned long) __maybe_unused; static inline struct mmu_rb_node *mmu_rb_search_by_entry(struct rb_root *, u32); -static int mmu_rb_insert_by_addr(struct rb_root *, - struct mmu_rb_node *) __maybe_unused; -static int mmu_rb_insert_by_entry(struct rb_root *, - struct mmu_rb_node *) __maybe_unused; +static int mmu_rb_insert_by_addr(struct rb_root *, struct mmu_rb_node *); +static int mmu_rb_insert_by_entry(struct rb_root *, struct mmu_rb_node *); static void mmu_notifier_mem_invalidate(struct mmu_notifier *, unsigned long, unsigned long, enum mmu_call_types); @@ -168,7 +170,7 @@ static inline void tid_group_move(struct tid_group *group, tid_group_add_tail(group, s2); } -static struct mmu_notifier_ops __maybe_unused mn_opts = { +static struct mmu_notifier_ops mn_opts = { .invalidate_page = mmu_notifier_page, .invalidate_range_start = mmu_notifier_range_start, }; @@ -180,12 +182,144 @@ static struct mmu_notifier_ops __maybe_unused mn_opts = { */ int hfi1_user_exp_rcv_init(struct file *fp) { - return -EINVAL; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned tidbase; + int i, ret = 0; + + INIT_HLIST_NODE(&fd->mn.hlist); + spin_lock_init(&fd->rb_lock); + spin_lock_init(&fd->tid_lock); + spin_lock_init(&fd->invalid_lock); + fd->mn.ops = &mn_opts; + fd->tid_rb_root = RB_ROOT; + + if (!uctxt->subctxt_cnt || !fd->subctxt) { + exp_tid_group_init(&uctxt->tid_group_list); + exp_tid_group_init(&uctxt->tid_used_list); + exp_tid_group_init(&uctxt->tid_full_list); + + tidbase = uctxt->expected_base; + for (i = 0; i < uctxt->expected_count / + dd->rcv_entries.group_size; i++) { + struct tid_group *grp; + + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) { + /* + * If we fail here, the groups already + * allocated will be freed by the close + * call. + */ + ret = -ENOMEM; + goto done; + } + grp->size = dd->rcv_entries.group_size; + grp->base = tidbase; + tid_group_add_tail(grp, &uctxt->tid_group_list); + tidbase += dd->rcv_entries.group_size; + } + } + + if (!HFI1_CAP_IS_USET(TID_UNMAP)) { + fd->invalid_tid_idx = 0; + fd->invalid_tids = kzalloc(uctxt->expected_count * + sizeof(u32), GFP_KERNEL); + if (!fd->invalid_tids) { + ret = -ENOMEM; + goto done; + } else { + /* + * Register MMU notifier callbacks. If the registration + * fails, continue but turn off the TID caching for + * all user contexts. + */ + ret = mmu_notifier_register(&fd->mn, current->mm); + if (ret) { + dd_dev_info(dd, + "Failed MMU notifier registration %d\n", + ret); + HFI1_CAP_USET(TID_UNMAP); + ret = 0; + } + } + } + + if (HFI1_CAP_IS_USET(TID_UNMAP)) + fd->mmu_rb_insert = mmu_rb_insert_by_entry; + else + fd->mmu_rb_insert = mmu_rb_insert_by_addr; + + /* + * PSM does not have a good way to separate, count, and + * effectively enforce a limit on RcvArray entries used by + * subctxts (when context sharing is used) when TID caching + * is enabled. To help with that, we calculate a per-process + * RcvArray entry share and enforce that. + * If TID caching is not in use, PSM deals with usage on its + * own. In that case, we allow any subctxt to take all of the + * entries. + * + * Make sure that we set the tid counts only after successful + * init. + */ + if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) { + u16 remainder; + + fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt; + remainder = uctxt->expected_count % uctxt->subctxt_cnt; + if (remainder && fd->subctxt < remainder) + fd->tid_limit++; + } else { + fd->tid_limit = uctxt->expected_count; + } +done: + return ret; } int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) { - return -EINVAL; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct tid_group *grp, *gptr; + + /* + * The notifier would have been removed when the process'es mm + * was freed. + */ + if (current->mm && !HFI1_CAP_IS_USET(TID_UNMAP)) + mmu_notifier_unregister(&fd->mn, current->mm); + + kfree(fd->invalid_tids); + + if (!uctxt->cnt) { + if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list)) + unlock_exp_tids(uctxt, &uctxt->tid_full_list, + &fd->tid_rb_root); + if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list)) + unlock_exp_tids(uctxt, &uctxt->tid_used_list, + &fd->tid_rb_root); + list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list, + list) { + list_del_init(&grp->list); + kfree(grp); + } + spin_lock(&fd->rb_lock); + if (!RB_EMPTY_ROOT(&fd->tid_rb_root)) { + struct rb_node *node; + struct mmu_rb_node *rbnode; + + while ((node = rb_first(&fd->tid_rb_root))) { + rbnode = rb_entry(node, struct mmu_rb_node, + rbnode); + rb_erase(&rbnode->rbnode, &fd->tid_rb_root); + kfree(rbnode); + } + } + spin_unlock(&fd->rb_lock); + hfi1_clear_tids(uctxt); + } + return 0; } /* -- cgit v1.2.3 From b5eb3b2ffd1bf5be17df08565f4ab56c3fdae43e Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:55 -0500 Subject: staging/hfi1: Add MMU notifier callback function TID caching will rely on the MMU notifier to be told when memory is being invalidated. When the callback is called, the driver will find all RcvArray entries that span the invalidated buffer and "schedule" them to be freed by the PSM library. This function is currently unused and is being added in preparation for the TID caching feature. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/user_exp_rcv.c | 67 +++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c index 843023e2e2c7..1787c55d21d6 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.c +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -104,7 +104,7 @@ static int set_rcvarray_entry(struct file *, unsigned long, u32, static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long, unsigned long); static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *, - unsigned long) __maybe_unused; + unsigned long); static inline struct mmu_rb_node *mmu_rb_search_by_entry(struct rb_root *, u32); static int mmu_rb_insert_by_addr(struct rb_root *, struct mmu_rb_node *); @@ -683,7 +683,70 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, unsigned long start, unsigned long end, enum mmu_call_types type) { - /* Stub for now */ + struct hfi1_filedata *fd = container_of(mn, struct hfi1_filedata, mn); + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct rb_root *root = &fd->tid_rb_root; + struct mmu_rb_node *node; + unsigned long addr = start; + + spin_lock(&fd->rb_lock); + while (addr < end) { + node = mmu_rb_search_by_addr(root, addr); + + if (!node) { + /* + * Didn't find a node at this address. However, the + * range could be bigger than what we have registered + * so we have to keep looking. + */ + addr += PAGE_SIZE; + continue; + } + + /* + * The next address to be looked up is computed based + * on the node's starting address. This is due to the + * fact that the range where we start might be in the + * middle of the node's buffer so simply incrementing + * the address by the node's size would result is a + * bad address. + */ + addr = node->virt + (node->npages * PAGE_SIZE); + if (node->freed) + continue; + + node->freed = true; + + spin_lock(&fd->invalid_lock); + if (fd->invalid_tid_idx < uctxt->expected_count) { + fd->invalid_tids[fd->invalid_tid_idx] = + rcventry2tidinfo(node->rcventry - + uctxt->expected_base); + fd->invalid_tids[fd->invalid_tid_idx] |= + EXP_TID_SET(LEN, node->npages); + if (!fd->invalid_tid_idx) { + unsigned long *ev; + + /* + * hfi1_set_uevent_bits() sets a user event flag + * for all processes. Because calling into the + * driver to process TID cache invalidations is + * expensive and TID cache invalidations are + * handled on a per-process basis, we can + * optimize this to set the flag only for the + * process in question. + */ + ev = uctxt->dd->events + + (((uctxt->ctxt - + uctxt->dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fd->subctxt); + set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } + fd->invalid_tid_idx++; + } + spin_unlock(&fd->invalid_lock); + } + spin_unlock(&fd->rb_lock); } static inline int mmu_addr_cmp(struct mmu_rb_node *node, unsigned long addr, -- cgit v1.2.3 From 455d7f1ab86b7b1703898c75c4bc01df869da4a6 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:56 -0500 Subject: staging/hfi1: Add TID free/clear function bodies Up to now, the functions which cleared the programmed TID entries and gave PSM the list of invalidated TID entries were just stubs. With this commit, the bodies of these functions are added. This commit is a bit asymmetric as it only contains the free code path. This is done on purpose to help with patch reviews as the programming code path is much longer. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/user_exp_rcv.c | 91 +++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 6 deletions(-) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c index 1787c55d21d6..776ce003248e 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.c +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -120,10 +120,8 @@ static inline void mmu_notifier_range_start(struct mmu_notifier *, static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, u32 *, unsigned *, unsigned *) __maybe_unused; -static int unprogram_rcvarray(struct file *, u32, - struct tid_group **) __maybe_unused; -static void clear_tid_node(struct hfi1_filedata *, u16, - struct mmu_rb_node *) __maybe_unused; +static int unprogram_rcvarray(struct file *, u32, struct tid_group **); +static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *); static inline u32 rcventry2tidinfo(u32 rcventry) { @@ -264,6 +262,7 @@ int hfi1_user_exp_rcv_init(struct file *fp) * Make sure that we set the tid counts only after successful * init. */ + spin_lock(&fd->tid_lock); if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) { u16 remainder; @@ -274,6 +273,7 @@ int hfi1_user_exp_rcv_init(struct file *fp) } else { fd->tid_limit = uctxt->expected_count; } + spin_unlock(&fd->tid_lock); done: return ret; } @@ -346,12 +346,91 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo) { - return -EINVAL; + int ret = 0; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + u32 *tidinfo; + unsigned tididx; + + tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL); + if (!tidinfo) + return -ENOMEM; + + if (copy_from_user(tidinfo, (void __user *)(unsigned long) + tinfo->tidlist, sizeof(tidinfo[0]) * + tinfo->tidcnt)) { + ret = -EFAULT; + goto done; + } + + mutex_lock(&uctxt->exp_lock); + for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { + ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL); + if (ret) { + hfi1_cdbg(TID, "Failed to unprogram rcv array %d", + ret); + break; + } + } + spin_lock(&fd->tid_lock); + fd->tid_used -= tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + mutex_unlock(&uctxt->exp_lock); +done: + kfree(tidinfo); + return ret; } int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo) { - return -EINVAL; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + unsigned long *ev = uctxt->dd->events + + (((uctxt->ctxt - uctxt->dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fd->subctxt); + u32 *array; + int ret = 0; + + if (!fd->invalid_tids) + return -EINVAL; + + /* + * copy_to_user() can sleep, which will leave the invalid_lock + * locked and cause the MMU notifier to be blocked on the lock + * for a long time. + * Copy the data to a local buffer so we can release the lock. + */ + array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL); + if (!array) + return -EFAULT; + + spin_lock(&fd->invalid_lock); + if (fd->invalid_tid_idx) { + memcpy(array, fd->invalid_tids, sizeof(*array) * + fd->invalid_tid_idx); + memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) * + fd->invalid_tid_idx); + tinfo->tidcnt = fd->invalid_tid_idx; + fd->invalid_tid_idx = 0; + /* + * Reset the user flag while still holding the lock. + * Otherwise, PSM can miss events. + */ + clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } else { + tinfo->tidcnt = 0; + } + spin_unlock(&fd->invalid_lock); + + if (tinfo->tidcnt) { + if (copy_to_user((void __user *)tinfo->tidlist, + array, sizeof(*array) * tinfo->tidcnt)) + ret = -EFAULT; + } + kfree(array); + + return ret; } static u32 find_phys_blocks(struct page **pages, unsigned npages, -- cgit v1.2.3 From 7e7a436ecb6e703a232df0613b5f24accbe3d7d2 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:57 -0500 Subject: staging/hfi1: Add TID entry program function body The previous patch in the series added the free/invalidate function bodies. Now, it's time for the programming side. This large function takes the user's buffer, breaks it up into manageable chunks, allocates enough RcvArray groups and programs the chunks into the RcvArray entries in the hardware. With this function, the TID caching functionality is implemented. However, it is still unused. The switch will come in a later patch in the series, which will remove the old functionality and switch the driver over to TID caching. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/user_exp_rcv.c | 263 ++++++++++++++++++++++++++++++- 1 file changed, 259 insertions(+), 4 deletions(-) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c index 776ce003248e..d33f579675b7 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.c +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -97,8 +97,7 @@ struct tid_pageset { static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, struct rb_root *); -static u32 find_phys_blocks(struct page **, unsigned, - struct tid_pageset *) __maybe_unused; +static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); static int set_rcvarray_entry(struct file *, unsigned long, u32, struct tid_group *, struct page **, unsigned); static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long, @@ -119,7 +118,7 @@ static inline void mmu_notifier_range_start(struct mmu_notifier *, unsigned long, unsigned long); static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, - u32 *, unsigned *, unsigned *) __maybe_unused; + u32 *, unsigned *, unsigned *); static int unprogram_rcvarray(struct file *, u32, struct tid_group **); static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *); @@ -339,9 +338,265 @@ static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) writeq(0, dd->rcvarray_wc + (index * 8)); } +/* + * RcvArray entry allocation for Expected Receives is done by the + * following algorithm: + * + * The context keeps 3 lists of groups of RcvArray entries: + * 1. List of empty groups - tid_group_list + * This list is created during user context creation and + * contains elements which describe sets (of 8) of empty + * RcvArray entries. + * 2. List of partially used groups - tid_used_list + * This list contains sets of RcvArray entries which are + * not completely used up. Another mapping request could + * use some of all of the remaining entries. + * 3. List of full groups - tid_full_list + * This is the list where sets that are completely used + * up go. + * + * An attempt to optimize the usage of RcvArray entries is + * made by finding all sets of physically contiguous pages in a + * user's buffer. + * These physically contiguous sets are further split into + * sizes supported by the receive engine of the HFI. The + * resulting sets of pages are stored in struct tid_pageset, + * which describes the sets as: + * * .count - number of pages in this set + * * .idx - starting index into struct page ** array + * of this set + * + * From this point on, the algorithm deals with the page sets + * described above. The number of pagesets is divided by the + * RcvArray group size to produce the number of full groups + * needed. + * + * Groups from the 3 lists are manipulated using the following + * rules: + * 1. For each set of 8 pagesets, a complete group from + * tid_group_list is taken, programmed, and moved to + * the tid_full_list list. + * 2. For all remaining pagesets: + * 2.1 If the tid_used_list is empty and the tid_group_list + * is empty, stop processing pageset and return only + * what has been programmed up to this point. + * 2.2 If the tid_used_list is empty and the tid_group_list + * is not empty, move a group from tid_group_list to + * tid_used_list. + * 2.3 For each group is tid_used_group, program as much as + * can fit into the group. If the group becomes fully + * used, move it to tid_full_list. + */ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) { - return -EINVAL; + int ret = 0, need_group = 0, pinned; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets, + tididx = 0, mapped, mapped_pages = 0; + unsigned long vaddr = tinfo->vaddr; + struct page **pages = NULL; + u32 *tidlist = NULL; + struct tid_pageset *pagesets = NULL; + + /* Get the number of pages the user buffer spans */ + npages = num_user_pages(vaddr, tinfo->length); + if (!npages) + return -EINVAL; + + if (npages > uctxt->expected_count) { + dd_dev_err(dd, "Expected buffer too big\n"); + return -EINVAL; + } + + /* Verify that access is OK for the user buffer */ + if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, + npages * PAGE_SIZE)) { + dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, npages); + return -EFAULT; + } + + pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets), + GFP_KERNEL); + if (!pagesets) + return -ENOMEM; + + /* Allocate the array of struct page pointers needed for pinning */ + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto bail; + } + + /* + * Pin all the pages of the user buffer. If we can't pin all the + * pages, accept the amount pinned so far and program only that. + * User space knows how to deal with partially programmed buffers. + */ + pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages); + if (pinned <= 0) { + ret = pinned; + goto bail; + } + + /* Find sets of physically contiguous pages */ + npagesets = find_phys_blocks(pages, pinned, pagesets); + + /* + * We don't need to access this under a lock since tid_used is per + * process and the same process cannot be in hfi1_user_exp_rcv_clear() + * and hfi1_user_exp_rcv_setup() at the same time. + */ + spin_lock(&fd->tid_lock); + if (fd->tid_used + npagesets > fd->tid_limit) + pageset_count = fd->tid_limit - fd->tid_used; + else + pageset_count = npagesets; + spin_unlock(&fd->tid_lock); + + if (!pageset_count) + goto bail; + + ngroups = pageset_count / dd->rcv_entries.group_size; + tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); + if (!tidlist) { + ret = -ENOMEM; + goto nomem; + } + + tididx = 0; + + /* + * From this point on, we are going to be using shared (between master + * and subcontexts) context resources. We need to take the lock. + */ + mutex_lock(&uctxt->exp_lock); + /* + * The first step is to program the RcvArray entries which are complete + * groups. + */ + while (ngroups && uctxt->tid_group_list.count) { + struct tid_group *grp = + tid_group_pop(&uctxt->tid_group_list); + + ret = program_rcvarray(fp, vaddr, grp, pagesets, + pageidx, dd->rcv_entries.group_size, + pages, tidlist, &tididx, &mapped); + /* + * If there was a failure to program the RcvArray + * entries for the entire group, reset the grp fields + * and add the grp back to the free group list. + */ + if (ret <= 0) { + tid_group_add_tail(grp, &uctxt->tid_group_list); + hfi1_cdbg(TID, + "Failed to program RcvArray group %d", ret); + goto unlock; + } + + tid_group_add_tail(grp, &uctxt->tid_full_list); + ngroups--; + pageidx += ret; + mapped_pages += mapped; + } + + while (pageidx < pageset_count) { + struct tid_group *grp, *ptr; + /* + * If we don't have any partially used tid groups, check + * if we have empty groups. If so, take one from there and + * put in the partially used list. + */ + if (!uctxt->tid_used_list.count || need_group) { + if (!uctxt->tid_group_list.count) + goto unlock; + + grp = tid_group_pop(&uctxt->tid_group_list); + tid_group_add_tail(grp, &uctxt->tid_used_list); + need_group = 0; + } + /* + * There is an optimization opportunity here - instead of + * fitting as many page sets as we can, check for a group + * later on in the list that could fit all of them. + */ + list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, + list) { + unsigned use = min_t(unsigned, pageset_count - pageidx, + grp->size - grp->used); + + ret = program_rcvarray(fp, vaddr, grp, pagesets, + pageidx, use, pages, tidlist, + &tididx, &mapped); + if (ret < 0) { + hfi1_cdbg(TID, + "Failed to program RcvArray entries %d", + ret); + ret = -EFAULT; + goto unlock; + } else if (ret > 0) { + if (grp->used == grp->size) + tid_group_move(grp, + &uctxt->tid_used_list, + &uctxt->tid_full_list); + pageidx += ret; + mapped_pages += mapped; + need_group = 0; + /* Check if we are done so we break out early */ + if (pageidx >= pageset_count) + break; + } else if (WARN_ON(ret == 0)) { + /* + * If ret is 0, we did not program any entries + * into this group, which can only happen if + * we've screwed up the accounting somewhere. + * Warn and try to continue. + */ + need_group = 1; + } + } + } +unlock: + mutex_unlock(&uctxt->exp_lock); +nomem: + hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, + mapped_pages, ret); + if (tididx) { + spin_lock(&fd->tid_lock); + fd->tid_used += tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist, + tidlist, sizeof(tidlist[0]) * tididx)) { + /* + * On failure to copy to the user level, we need to undo + * everything done so far so we don't leak resources. + */ + tinfo->tidlist = (unsigned long)&tidlist; + hfi1_user_exp_rcv_clear(fp, tinfo); + tinfo->tidlist = 0; + ret = -EFAULT; + goto bail; + } + } + + /* + * If not everything was mapped (due to insufficient RcvArray entries, + * for example), unpin all unmapped pages so we can pin them nex time. + */ + if (mapped_pages != pinned) + hfi1_release_user_pages(&pages[mapped_pages], + pinned - mapped_pages, + false); +bail: + kfree(pagesets); + kfree(pages); + kfree(tidlist); + return ret > 0 ? 0 : ret; } int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo) -- cgit v1.2.3 From 0b091fb32c5ae4737bf606a313e6625dad34bbc6 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Fri, 5 Feb 2016 11:57:58 -0500 Subject: staging/hfi1: Enable TID caching feature This commit "flips the switch" on the TID caching feature implemented in this patch series. As well as enabling the new feature by tying the new function with the PSM API, it also cleans up the old unneeded code, data structure members, and variables. Due to difference in operation and information, the tracing functions related to expected receives had to be changed. This patch include these changes. The tracing function changes could not be split into a separate commit without including both tracing variants at the same time. This would have caused other complications and ugliness. Signed-off-by: Mitko Haralanov Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/file_ops.c | 448 +++---------------------------- drivers/staging/rdma/hfi1/hfi.h | 14 - drivers/staging/rdma/hfi1/init.c | 3 - drivers/staging/rdma/hfi1/trace.h | 132 +++++---- drivers/staging/rdma/hfi1/user_exp_rcv.c | 12 + drivers/staging/rdma/hfi1/user_pages.c | 14 - 6 files changed, 131 insertions(+), 492 deletions(-) (limited to 'drivers/staging') diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c index b0348263b901..d36588934f99 100644 --- a/drivers/staging/rdma/hfi1/file_ops.c +++ b/drivers/staging/rdma/hfi1/file_ops.c @@ -96,9 +96,6 @@ static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); static int vma_fault(struct vm_area_struct *, struct vm_fault *); -static int exp_tid_setup(struct file *, struct hfi1_tid_info *); -static int exp_tid_free(struct file *, struct hfi1_tid_info *); -static void unlock_exp_tids(struct hfi1_ctxtdata *); static const struct file_operations hfi1_file_ops = { .owner = THIS_MODULE, @@ -188,6 +185