/*
* Copyright (c) 2007, 2020 Oracle and/or its affiliates.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
#include "rds.h"
/*
* XXX
* - build with sparse
* - should we detect duplicate keys on a socket? hmm.
* - an rdma is an mlock, apply rlimit?
*/
/*
* get the number of pages by looking at the page indices that the start and
* end addresses fall in.
*
* Returns 0 if the vec is invalid. It is invalid if the number of bytes
* causes the address to wrap or overflows an unsigned int. This comes
* from being stored in the 'length' member of 'struct scatterlist'.
*/
static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
{
if ((vec->addr + vec->bytes <= vec->addr) ||
(vec->bytes > (u64)UINT_MAX))
return 0;
return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
(vec->addr >> PAGE_SHIFT);
}
static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
struct rds_mr *insert)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct rds_mr *mr;
while (*p) {
parent = *p;
mr = rb_entry(parent, struct rds_mr, r_rb_node);
if (key < mr->r_key)
p = &(*p)->rb_left;
else if (key > mr->r_key)
p = &(*p)->rb_right;
else
return mr;
}
if (insert) {
rb_link_node(&insert->r_rb_node, parent, p);
rb_insert_color(&insert->r_rb_node, root);
kref_get(&insert->r_kref);
}
return NULL;
}
/*
* Destroy the transport-specific part of a MR.
*/
static void rds_destroy_mr(struct rds_mr *mr)
{
struct rds_sock *rs = mr->r_sock;
void *trans_private = NULL;
unsigned long flags;
rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
mr->r_key, kref_read(&mr->r_kref));
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
if (!RB_EMPTY_NODE(&mr->r_rb_node))
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
trans_private = mr->r_trans_private;
mr->r_trans_private = NULL;
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (trans_private)
mr->r_trans->free_mr(trans_private, mr->r_invalidate);
}
void __rds_put_mr_final(struct kref *kref)
{
struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref);
rds_destroy_mr(mr);
kfree(mr);
}
/*
* By the time this is called we can't have any more ioctls called on
* the socket so we don't need to worry about racing with others.
*/
void rds_rdma_drop_keys(struct rds_sock *rs)
{
struct rds_mr *mr;
struct rb_node *node;
unsigned long flags;
/* Release any MRs associated with this socket */
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
while ((node = rb_first(&rs->rs_rdma_keys))) {
mr = rb_entry(node, struct rds_mr, r_rb_node);
if (mr->r_trans == rs->rs_transport)
mr->r_invalidate = 0;
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
kref_put(&mr->r_kref, __rds_put_mr_final);
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
}
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (rs->rs_transport && rs->rs_transport->flush_mrs)
rs->rs_transport->flush_mrs();
}
/*
* Helper function to pin user pages.
*/
static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
struct page **pages, int write)
{
unsigned int gup_flags = FOLL_LONGTERM;
int ret;
if (write)
gup_flags |= FOLL_WRITE