diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-07 14:02:24 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-07 14:02:24 -0700 |
commit | 4e4adb2f462889b9eac736dd06d60658beb091b6 (patch) | |
tree | 3582dab57d97bbb30add005b3b2f8a8d8412121e /fs | |
parent | 77a78806c7df8d414c33031a1ca5121876910c4f (diff) | |
parent | 5445b1fbd123420bffed5e629a420aa2a16bf849 (diff) |
Merge tag 'nfs-for-4.3-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
"Highlights include:
Stable patches:
- Fix atomicity of pNFS commit list updates
- Fix NFSv4 handling of open(O_CREAT|O_EXCL|O_RDONLY)
- nfs_set_pgio_error sometimes misses errors
- Fix a thinko in xs_connect()
- Fix borkage in _same_data_server_addrs_locked()
- Fix a NULL pointer dereference of migration recovery ops for v4.2
client
- Don't let the ctime override attribute barriers.
- Revert "NFSv4: Remove incorrect check in can_open_delegated()"
- Ensure flexfiles pNFS driver updates the inode after write finishes
- flexfiles must not pollute the attribute cache with attrbutes from
the DS
- Fix a protocol error in layoutreturn
- Fix a protocol issue with NFSv4.1 CLOSE stateids
Bugfixes + cleanups
- pNFS blocks bugfixes from Christoph
- Various cleanups from Anna
- More fixes for delegation corner cases
- Don't fsync twice for O_SYNC/IS_SYNC files
- Fix pNFS and flexfiles layoutstats bugs
- pnfs/flexfiles: avoid duplicate tracking of mirror data
- pnfs: Fix layoutget/layoutreturn/return-on-close serialisation
issues
- pnfs/flexfiles: error handling retries a layoutget before fallback
to MDS
Features:
- Full support for the OPEN NFS4_CREATE_EXCLUSIVE4_1 mode from
Kinglong
- More RDMA client transport improvements from Chuck
- Removal of the deprecated ib_reg_phys_mr() and ib_rereg_phys_mr()
verbs from the SUNRPC, Lustre and core infiniband tree.
- Optimise away the close-to-open getattr if there is no cached data"
* tag 'nfs-for-4.3-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (108 commits)
NFSv4: Respect the server imposed limit on how many changes we may cache
NFSv4: Express delegation limit in units of pages
Revert "NFS: Make close(2) asynchronous when closing NFS O_DIRECT files"
NFS: Optimise away the close-to-open getattr if there is no cached data
NFSv4.1/flexfiles: Clean up ff_layout_write_done_cb/ff_layout_commit_done_cb
NFSv4.1/flexfiles: Mark the layout for return in ff_layout_io_track_ds_error()
nfs: Remove unneeded checking of the return value from scnprintf
nfs: Fix truncated client owner id without proto type
NFSv4.1/flexfiles: Mark layout for return if the mirrors are invalid
NFSv4.1/flexfiles: RW layouts are valid only if all mirrors are valid
NFSv4.1/flexfiles: Fix incorrect usage of pnfs_generic_mark_devid_invalid()
NFSv4.1/flexfiles: Fix freeing of mirrors
NFSv4.1/pNFS: Don't request a minimal read layout beyond the end of file
NFSv4.1/pnfs: Handle LAYOUTGET return values correctly
NFSv4.1/pnfs: Don't ask for a read layout for an empty file.
NFSv4.1: Fix a protocol issue with CLOSE stateids
NFSv4.1/flexfiles: Don't mark the entire deviceid as bad for file errors
SUNRPC: Prevent SYN+SYNACK+RST storms
SUNRPC: xs_reset_transport must mark the connection as disconnected
NFSv4.1/pnfs: Ensure layoutreturn reserves space for the opaque payload
...
Diffstat (limited to 'fs')
34 files changed, 967 insertions, 645 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 92dca9e90d8d..c556640dcf3b 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -46,13 +46,6 @@ struct pnfs_block_dev; -enum pnfs_block_volume_type { - PNFS_BLOCK_VOLUME_SIMPLE = 0, - PNFS_BLOCK_VOLUME_SLICE = 1, - PNFS_BLOCK_VOLUME_CONCAT = 2, - PNFS_BLOCK_VOLUME_STRIPE = 3, -}; - #define PNFS_BLOCK_MAX_UUIDS 4 #define PNFS_BLOCK_MAX_DEVICES 64 @@ -117,13 +110,6 @@ struct pnfs_block_dev { struct pnfs_block_dev_map *map); }; -enum exstate4 { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ - PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ -}; - /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { union { @@ -134,15 +120,12 @@ struct pnfs_block_extent { sector_t be_f_offset; /* the starting offset in the file */ sector_t be_length; /* the size of the extent */ sector_t be_v_offset; /* the starting offset in the volume */ - enum exstate4 be_state; /* the state of this extent */ + enum pnfs_block_extent_state be_state; /* the state of this extent */ #define EXTENT_WRITTEN 1 #define EXTENT_COMMITTING 2 unsigned int be_tag; }; -/* on the wire size of the extent */ -#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) - struct pnfs_block_layout { struct pnfs_layout_hdr bl_layout; struct rb_root bl_ext_rw; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index e535599a0719..a861bbdfe577 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev) kfree(dev->children); } else { if (dev->bdev) - blkdev_put(dev->bdev, FMODE_READ); + blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); } } @@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) return -EIO; p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); b->simple.sigs[i].sig_len = be32_to_cpup(p++); + if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) { + pr_info("signature too long: %d\n", + b->simple.sigs[i].sig_len); + return -EIO; + } p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); if (!p) @@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, if (!dev) return -EIO; - d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); if (IS_ERR(d->bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 31d0b5e53dfd..c59a59c37f3d 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -462,6 +462,12 @@ out: return err; } +static size_t ext_tree_layoutupdate_size(size_t count) +{ + return sizeof(__be32) /* number of entries */ + + PNFS_BLOCK_EXTENT_SIZE * count; +} + static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, size_t buffer_size) { @@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, continue; (*count)++; - if (*count * BL_EXTENT_SIZE > buffer_size) { + if (ext_tree_layoutupdate_size(*count) > buffer_size) { /* keep counting.. */ ret = -ENOSPC; continue; @@ -530,7 +536,7 @@ retry: if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); - buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + buffer_size = ext_tree_layoutupdate_size(count); count = 0; arg->layoutupdate_pages = @@ -549,17 +555,14 @@ retry: } *start_p = cpu_to_be32(count); - arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + arg->layoutupdate_len = ext_tree_layoutupdate_size(count); if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { - __be32 *p = start_p; + void *p = start_p, *end = p + arg->layoutupdate_len; int i = 0; - for (p = start_p; - p < start_p + arg->layoutupdate_len; - p += PAGE_SIZE) { + for ( ; p < end; p += PAGE_SIZE) arg->layoutupdate_pages[i++] = vmalloc_to_page(p); - } } dprintk("%s found %zu ranges\n", __func__, count); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 2c4a0b565d28..75f7c0a7538a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv) spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(rqstp)) { - svc_xprt_put(serv->sv_bc_xprt); - serv->sv_bc_xprt = NULL; - } dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); return rqstp; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 29e3c1b011b7..b85cf7a30232 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL, + -ntohl(res->status)); goto out; + } nfsi = NFS_I(inode); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); @@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, res->status = 0; out_iput: rcu_read_unlock(); + trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); @@ -194,6 +198,7 @@ unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); + trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); iput(ino); out: return rv; @@ -554,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, status = htonl(NFS4_OK); nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); - nfs41_server_notify_target_slotid_update(cps->clp); + nfs41_notify_server(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4a90c9bb3135..57c5a02f6213 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -20,6 +20,7 @@ #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> @@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_put_client); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/* - * Test if two ip6 socket addresses refer to the same socket by - * comparing relevant fields. The padding bytes specifically, are not - * compared. sin6_flowinfo is not compared because it only affects QoS - * and sin6_scope_id is only compared if the address is "link local" - * because "link local" addresses need only be unique to a specific - * link. Conversely, ordinary unicast addresses might have different - * sin6_scope_id. - * - * The caller should ensure both socket addresses are AF_INET6. - */ -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - - if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) - return 0; - else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) - return sin1->sin6_scope_id == sin2->sin6_scope_id; - - return 1; -} -#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - return 0; -} -#endif - -/* - * Test if two ip4 socket addresses refer to the same socket, by - * comparing relevant fields. The padding bytes specifically, are - * not compared. - * - * The caller should ensure both socket addresses are AF_INET. - */ -static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - - return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; -} - -static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - - return nfs_sockaddr_match_ipaddr6(sa1, sa2) && - (sin1->sin6_port == sin2->sin6_port); -} - -static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - - return nfs_sockaddr_match_ipaddr4(sa1, sa2) && - (sin1->sin_port == sin2->sin_port); -} - -#if defined(CONFIG_NFS_V4_1) -/* - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, excluding the port number. - */ -int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) - return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_match_ipaddr4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_match_ipaddr6(sa1, sa2); - } - return 0; -} -EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr); -#endif /* CONFIG_NFS_V4_1 */ - -/* - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, including the port number. - */ -static int nfs_sockaddr_cmp(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) - return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_cmp_ip4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_cmp_ip6(sa1, sa2); - } - return 0; -} - /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. @@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat if (clp->cl_minorversion != data->minorversion) continue; /* Match the full socket address */ - if (!nfs_sockaddr_cmp(sap, clap)) + if (!rpc_cmp_addr_port(sap, clap)) continue; atomic_inc(&clp->cl_count); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 029d688a969f..2714ef835bdd 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, if (delegation->inode != NULL) { nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; + delegation->pagemod_limit = res->pagemod_limit; oldcred = delegation->cred; delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, @@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct return -ENOMEM; nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; + delegation->pagemod_limit = res->pagemod_limit; delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; @@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, rcu_read_unlock(); return ret; } + +/** + * nfs4_delegation_flush_on_close - Check if we must flush file on close + * @inode: inode to check + * + * This function checks the number of outstanding writes to the file + * against the delegation 'space_limit' field to see if + * the spec requires us to flush the file on close. + */ +bool nfs4_delegation_flush_on_close(const struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + bool ret = true; + + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || !(delegation->type & FMODE_WRITE)) + goto out; + if (nfsi->nrequests < delegation->pagemod_limit) + ret = false; +out: + rcu_read_unlock(); + return ret; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index e3c20a3ccc93..a44829173e57 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -18,7 +18,7 @@ struct nfs_delegation { struct inode *inode; nfs4_stateid stateid; fmode_t type; - loff_t maxsize; + unsigned long pagemod_limit; __u64 change_attr; unsigned long flags; spinlock_t lock; @@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); +bool nfs4_delegation_flush_on_close(const struct inode *inode); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 547308a5ec6f..3d8e4ffa0a33 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -583,26 +583,19 @@ out_nopages: } static -void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +void nfs_readdir_free_pages(struct page **pages, unsigned int npages) { unsigned int i; for (i = 0; i < npages; i++) put_page(pages[i]); } -static -void nfs_readdir_free_large_page(void *ptr, struct page **pages, - unsigned int npages) -{ - nfs_readdir_free_pagearray(pages, npages); -} - /* * nfs_readdir_large_page will allocate pages that must be freed with a call - * to nfs_readdir_free_large_page + * to nfs_readdir_free_pagearray */ static -int nfs_readdir_large_page(struct page **pages, unsigned int npages) +int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) { unsigned int i; @@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages) return 0; out_freepages: - nfs_readdir_free_pagearray(pages, i); + nfs_readdir_free_pages(pages, i); return -ENOMEM; } @@ -623,7 +616,6 @@ static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { struct page *pages[NFS_MAX_READDIR_PAGES]; - void *pages_ptr = NULL; struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; @@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; - status = nfs_readdir_large_page(pages, array_size); + status = nfs_readdir_alloc_pages(pages, array_size); if (status < 0) goto out_release_array; do { @@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, } } while (array->eof_index < 0); - nfs_readdir_free_large_page(pages_ptr, pages, array_size); + nfs_readdir_free_pages(pages, array_size); out_release_array: nfs_readdir_release_array(page); out_label_free: diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cc4fa1ed61fc..c0f9b1ed12b9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp) dprintk("NFS: release(%pD2)\n", filp); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); - return nfs_release(inode, filp); + nfs_file_clear_open_context(filp); + return 0; } EXPORT_SYMBOL_GPL(nfs_file_release); @@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek); /* * Flush all dirty pages, and check for write errors. */ -int +static int nfs_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); @@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) if ((file->f_mode & FMODE_WRITE) == 0) return 0; - /* - * If we're holding a write delegation, then just start the i/o - * but don't wait for completion (or send a commit). - */ - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - return filemap_fdatawrite(file->f_mapping); - /* Flush writes to the server and return any errors */ return vfs_fsync(file, 0); } -EXPORT_SYMBOL_GPL(nfs_file_flush); ssize_t nfs_file_read(struct kiocb *iocb, struct iov_iter *to) @@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_sync_write(struct file *filp, struct inode *inode) +static int nfs_need_check_write(struct file *filp, struct inode *inode) { struct nfs_open_context *ctx; - if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) - return 1; ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || nfs_ctx_key_to_expire(ctx)) @@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result > 0) written = result; - /* Return error values for O_DSYNC and IS_SYNC() */ - if (result >= 0 && nfs_need_sync_write(file, inode)) { + /* Return error values */ + if (result >= 0 && nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b3289d701eea..fbc5a56de875 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) ffl = kzalloc(sizeof(*ffl), gfp_flags); if (ffl) { INIT_LIST_HEAD(&ffl->error_list); + INIT_LIST_HEAD(&ffl->mirrors); return &ffl->generic_hdr; } else return NULL; @@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id) return 0; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + int i, j; + + if (m1->fh_versions_cnt != m2->fh_versions_cnt) + return false; + for (i = 0; i < m1->fh_versions_cnt; i++) { + bool found_fh = false; + for (j = 0; j < m2->fh_versions_cnt; i++) { + if (nfs_compare_fh(&m1->fh_versions[i], + &m2->fh_versions[j]) == 0) { + found_fh = true; + break; + } + } + if (!found_fh) + return false; + } + return true; +} + +static struct nfs4_ff_layout_mirror * +ff_layout_add_mirror(struct pnfs_layout_hdr *lo, + struct nfs4_ff_layout_mirror *mirror) +{ + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); + struct nfs4_ff_layout_mirror *pos; + struct inode *inode = lo->plh_inode; + + spin_lock(&inode->i_lock); + list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { + if (mirror->mirror_ds != pos->mirror_ds) + continue; + if (!ff_mirror_match_fh(mirror, pos)) + continue; + if (atomic_inc_not_zero(&pos->ref)) { + spin_unlock(&inode->i_lock); + return pos; + } + } + list_add(&mirror->mirrors, &ff_layout->mirrors); + mirror->layout = lo; + spin_unlock(&inode->i_lock); + return mirror; +} + +static void +ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + struct inode *inode; + if (mirror->layout == NULL) + return; + inode = mirror->layout->plh_inode; + spin_lock(&inode->i_lock); + list_del(&mirror->mirrors); + spin_unlock(&inode->i_lock); + mirror->layout = NULL; +} + +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +{ + struct nfs4_ff_layout_mirror *mirror; + + mirror = kzalloc(sizeof(*mirror), gfp_flags); + if (mirror != NULL) { + spin_lock_init(&mirror->lock); + atomic_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + } + return mirror; +} + +static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + ff_layout_remove_mirror(mirror); + kfree(mirror->fh_versions); + if (mirror->cred) + put_rpccred(mirror->cred); + nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + kfree(mirror); +} + +static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror != NULL && atomic_dec_and_test(&mirror->ref)) + ff_layout_free_mirror(mirror); +} + static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) { int i; @@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) /* normally mirror_ds is freed in * .free_deviceid_node but we still do it here * for .alloc_lseg error path */ - if (fls->mirror_array[i]) { - kfree(fls->mirror_array[i]->fh_versions); - nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); - kfree(fls->mirror_array[i]); - } + ff_layout_put_mirror(fls->mirror_array[i]); } kfree(fls->mirror_array); fls->mirror_array = NULL; @@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) } } +static bool +ff_lseg_range_is_after(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 end1, end2; + + if (l1->iomode != l2->iomode) + return l1->iomode != IOMODE_READ; + end1 = pnfs_calc_offset_end(l1->offset, l1->length); + end2 = pnfs_calc_offset_end(l2->offset, l2->length); + if (end1 < l2->offset) + return false; + if (end2 < l1->offset) + return true; + return l2->offset <= l1->offset; +} + +static bool +ff_lseg_merge(struct pnfs_layout_segment *new, + struct pnfs_layout_segment *old) +{ + u64 new_end, old_end; + + if (new->pls_range.iomode != old->pls_range.iomode) + return false; + old_end = pnfs_calc_offset_end(old->pls_range.offset, + old->pls_range.length); + if (old_end < new->pls_range.offset) + return false; + new_end = pnfs_calc_offset_end(new->pls_range.offset, + new->pls_range.length); + if (new_end < old->pls_range.offset) + return false; + + /* Mergeable: copy info from 'old' to 'new' */ + if (new_end < old_end) + new_end = old_end; + if (new->pls_range.offset < old->pls_range.offset) + new->pls_range.offset = old->pls_range.offset; + new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset, + new_end); + if (test_bit(NFS_LSEG_ROC, &old->pls_flags)) + set_bit(NFS_LSEG_ROC, &new->pls_flags); + if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) + set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags); + return true; +} + +static void +ff_layout_add_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + pnfs_generic_layout_insert_lseg(lo, lseg, + ff_lseg_range_is_after, + ff_lseg_merge, + free_me); +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; @@ -246,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; for (i = 0; i < fls->mirror_array_cnt; i++) { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; struct nfs4_deviceid_node *idnode; u32 ds_count; @@ -262,17 +408,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (ds_count != 1) goto out_err_free; - fls->mirror_array[i] = - kzalloc(sizeof(struct nfs4_ff_layout_mirror), - gfp_flags); + fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - spin_lock_init(&fls->mirror_array[i]->lock); fls->mirror_array[i]->ds_count = ds_count; - fls->mirror_array[i]->lseg = &fls->generic_hdr; /* deviceid */ rc = decode_deviceid(&stream, &devid); @@ -338,6 +480,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (rc) goto out_err_free; + mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); + if ( |