diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-12 20:03:53 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-12 20:03:53 -0700 |
commit | cdb897e3279ad1677138d6bdf1cfaf1393718a08 (patch) | |
tree | 05c5162aadcee8e56a384ef059adcb7e85c48d43 /fs | |
parent | b31ff3cdf540110da4572e3e29bd172087af65cc (diff) | |
parent | 15b51bd6badbb373c723aa019cf530c8263efd7e (diff) |
Merge tag 'ceph-for-4.14-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"The highlights include:
 - a large series of fixes and improvements to the snapshot-handling
   code (Zheng Yan)
 - individual read/write OSD requests passed down to libceph are now
   limited to 16M in size to avoid hitting OSD-side limits (Zheng Yan)
 - encode MStatfs v2 message to allow for more accurate space usage
   reporting (Douglas Fuller)
 - switch to the new writeback error tracking infrastructure (Jeff
   Layton)"
* tag 'ceph-for-4.14-rc1' of git://github.com/ceph/ceph-client: (35 commits)
ceph: stop on-going cached readdir if mds revokes FILE_SHARED cap
ceph: wait on writeback after writing snapshot data
ceph: fix capsnap dirty pages accounting
ceph: ignore wbc->range_{start,end} when write back snapshot data
ceph: fix "range cyclic" mode writepages
ceph: cleanup local variables in ceph_writepages_start()
ceph: optimize pagevec iterating in ceph_writepages_start()
ceph: make writepage_nounlock() invalidate page that beyonds EOF
ceph: properly get capsnap's size in get_oldest_context()
ceph: remove stale check in ceph_invalidatepage()
ceph: queue cap snap only when snap realm's context changes
ceph: handle race between vmtruncate and queuing cap snap
ceph: fix message order check in handle_cap_export()
ceph: fix NULL pointer dereference in ceph_flush_snaps()
ceph: adjust 36 checks for NULL pointers
ceph: delete an unnecessary return statement in update_dentry_lease()
ceph: ENOMEM pr_err in __get_or_create_frag() is redundant
ceph: check negative offsets in ceph_llseek()
ceph: more accurate statfs
ceph: properly set snap follows for cap reconnect
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ceph/addr.c | 403 | ||||
-rw-r--r-- | fs/ceph/cache.c | 2 | ||||
-rw-r--r-- | fs/ceph/caps.c | 40 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 2 | ||||
-rw-r--r-- | fs/ceph/dir.c | 6 | ||||
-rw-r--r-- | fs/ceph/file.c | 50 | ||||
-rw-r--r-- | fs/ceph/inode.c | 53 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 37 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 6 | ||||
-rw-r--r-- | fs/ceph/snap.c | 37 | ||||
-rw-r--r-- | fs/ceph/super.c | 78 | ||||
-rw-r--r-- | fs/ceph/super.h | 16 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 8 |
13 files changed, 408 insertions, 330 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1bc709fe330a..b3e3edc09d80 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -152,17 +152,10 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, ceph_invalidate_fscache_page(inode, page); + WARN_ON(!PageLocked(page)); if (!PagePrivate(page)) return; - /* - * We can get non-dirty pages here due to races between - * set_page_dirty and truncate_complete_page; just spit out a - * warning, in case we end up with accounting problems later. - */ - if (!PageDirty(page)) - pr_err("%p invalidatepage %p page not dirty\n", inode, page); - ClearPageChecked(page); dout("%p invalidatepage %p idx %lu full dirty page\n", @@ -455,13 +448,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc == 0) goto out; - if (fsc->mount_options->rsize >= PAGE_SIZE) - max = (fsc->mount_options->rsize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - - dout("readpages %p file %p nr_pages %d max %d\n", inode, - file, nr_pages, - max); + max = fsc->mount_options->rsize >> PAGE_SHIFT; + dout("readpages %p file %p nr_pages %d max %d\n", + inode, file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); if (rc < 0) @@ -474,14 +463,22 @@ out: return rc; } +struct ceph_writeback_ctl +{ + loff_t i_size; + u64 truncate_size; + u32 truncate_seq; + bool size_stable; + bool head_snapc; +}; + /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. 
*/ -static struct ceph_snap_context *get_oldest_context(struct inode *inode, - loff_t *snap_size, - u64 *truncate_size, - u32 *truncate_seq) +static struct ceph_snap_context * +get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, + struct ceph_snap_context *page_snapc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -491,30 +488,78 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); - if (capsnap->dirty_pages) { - snapc = ceph_get_snap_context(capsnap->context); - if (snap_size) - *snap_size = capsnap->size; - if (truncate_size) - *truncate_size = capsnap->truncate_size; - if (truncate_seq) - *truncate_seq = capsnap->truncate_seq; - break; + if (!capsnap->dirty_pages) + continue; + + /* get i_size, truncate_{seq,size} for page_snapc? */ + if (snapc && capsnap->context != page_snapc) + continue; + + if (ctl) { + if (capsnap->writing) { + ctl->i_size = i_size_read(inode); + ctl->size_stable = false; + } else { + ctl->i_size = capsnap->size; + ctl->size_stable = true; + } + ctl->truncate_size = capsnap->truncate_size; + ctl->truncate_seq = capsnap->truncate_seq; + ctl->head_snapc = false; } + + if (snapc) + break; + + snapc = ceph_get_snap_context(capsnap->context); + if (!page_snapc || + page_snapc == snapc || + page_snapc->seq > snapc->seq) + break; } if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); - if (truncate_size) - *truncate_size = ci->i_truncate_size; - if (truncate_seq) - *truncate_seq = ci->i_truncate_seq; + if (ctl) { + ctl->i_size = i_size_read(inode); + ctl->truncate_size = ci->i_truncate_size; + ctl->truncate_seq = ci->i_truncate_seq; + ctl->size_stable = false; + ctl->head_snapc = true; + } } 
spin_unlock(&ci->i_ceph_lock); return snapc; } +static u64 get_writepages_data_length(struct inode *inode, + struct page *page, u64 start) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_cap_snap *capsnap = NULL; + u64 end = i_size_read(inode); + + if (snapc != ci->i_head_snapc) { + bool found = false; + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + if (!capsnap->writing) + end = capsnap->size; + found = true; + break; + } + } + spin_unlock(&ci->i_ceph_lock); + WARN_ON(!found); + } + if (end > page_offset(page) + PAGE_SIZE) + end = page_offset(page) + PAGE_SIZE; + return end > start ? end - start : 0; +} + /* * Write a single page, but leave the page locked. * @@ -526,30 +571,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct inode *inode; struct ceph_inode_info *ci; struct ceph_fs_client *fsc; - struct ceph_osd_client *osdc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); - loff_t snap_size = -1; long writeback_stat; - u64 truncate_size; - u32 truncate_seq; int err, len = PAGE_SIZE; + struct ceph_writeback_ctl ceph_wbc; dout("writepage %p idx %lu\n", page, page->index); inode = page->mapping->host; ci = ceph_inode(inode); fsc = ceph_inode_to_client(inode); - osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ snapc = page_snap_context(page); - if (snapc == NULL) { + if (!snapc) { dout("writepage %p page %p not dirty?\n", inode, page); return 0; } - oldest = get_oldest_context(inode, &snap_size, - &truncate_size, &truncate_seq); + oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, snapc); @@ -561,20 +601,18 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } 
ceph_put_snap_context(oldest); - if (snap_size == -1) - snap_size = i_size_read(inode); - /* is this a partial page at end of file? */ - if (page_off >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); + if (page_off >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", page, ceph_wbc.i_size); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); return 0; } - if (snap_size < page_off + len) - len = snap_size - page_off; + if (ceph_wbc.i_size < page_off + len) + len = ceph_wbc.i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", - inode, page, page->index, page_off, len, snapc); + dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", + inode, page, page->index, page_off, len, snapc, snapc->seq); writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > @@ -582,10 +620,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - err = ceph_osdc_writepages(osdc, ceph_vino(inode), - &ci->i_layout, snapc, - page_off, len, - truncate_seq, truncate_size, + err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, snapc, page_off, len, + ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { struct writeback_control tmp_wbc; @@ -746,31 +784,17 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_vino vino = ceph_vino(inode); - pgoff_t index, start, end; - int range_whole = 0; - int should_loop = 1; - pgoff_t max_pages = 0, max_pages_ever = 0; + pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct pagevec pvec; - int done = 0; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; - int do_sync = 0; 
- loff_t snap_size, i_size; - u64 truncate_size; - u32 truncate_seq; + struct ceph_writeback_ctl ceph_wbc; + bool should_loop, range_whole = false; + bool stop, done = false; - /* - * Include a 'sync' in the OSD request if this is a data - * integrity write (e.g., O_SYNC write or fsync()), or if our - * cap is being revoked. - */ - if ((wbc->sync_mode == WB_SYNC_ALL) || - ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) - do_sync = 1; - dout("writepages_start %p dosync=%d (mode=%s)\n", - inode, do_sync, + dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); @@ -783,35 +807,17 @@ static int ceph_writepages_start(struct address_space *mapping, mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } - if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - if (wsize < PAGE_SIZE) - wsize = PAGE_SIZE; - max_pages_ever = wsize >> PAGE_SHIFT; pagevec_init(&pvec, 0); - /* where to start/end? */ - if (wbc->range_cyclic) { - start = mapping->writeback_index; /* Start from prev offset */ - end = -1; - dout(" cyclic, start at %lu\n", start); - } else { - start = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - should_loop = 0; - dout(" not cyclic, %lu to %lu\n", start, end); - } - index = start; + start_index = wbc->range_cyclic ? mapping->writeback_index : 0; + index = start_index; retry: /* find oldest snap context with dirty data */ - ceph_put_snap_context(snapc); - snap_size = -1; - snapc = get_oldest_context(inode, &snap_size, - &truncate_size, &truncate_seq); + snapc = get_oldest_context(inode, &ceph_wbc, NULL); if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? 
*/ @@ -821,40 +827,56 @@ retry: dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); - i_size = i_size_read(inode); - - if (last_snapc && snapc != last_snapc) { - /* if we switched to a newer snapc, restart our scan at the - * start of the original file range. */ - dout(" snapc differs from last pass, restarting at %lu\n", - index); - index = start; + should_loop = false; + if (ceph_wbc.head_snapc && snapc != last_snapc) { + /* where to start/end? */ + if (wbc->range_cyclic) { + index = start_index; + end = -1; + if (index > 0) + should_loop = true; + dout(" cyclic, start at %lu\n", index); + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = true; + dout(" not cyclic, %lu to %lu\n", index, end); + } + } else if (!ceph_wbc.head_snapc) { + /* Do not respect wbc->range_{start,end}. Dirty pages + * in that range can be associated with newer snapc. 
+ * They are not writeable until we write all dirty pages + * associated with 'snapc' get written */ + if (index > 0 || wbc->sync_mode != WB_SYNC_NONE) + should_loop = true; + dout(" non-head snapc, range whole\n"); } + + ceph_put_snap_context(last_snapc); last_snapc = snapc; - while (!done && index <= end) { - unsigned i; - int first; - pgoff_t strip_unit_end = 0; + stop = false; + while (!stop && index <= end) { int num_ops = 0, op_idx; - int pvec_pages, locked_pages = 0; + unsigned i, pvec_pages, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; - int want; + pgoff_t strip_unit_end = 0; u64 offset = 0, len = 0; - max_pages = max_pages_ever; + max_pages = wsize >> PAGE_SHIFT; get_more_pages: - first = -1; - want = min(end - index, - min((pgoff_t)PAGEVEC_SIZE, - max_pages - (pgoff_t)locked_pages) - 1) - + 1; + pvec_pages = min_t(unsigned, PAGEVEC_SIZE, + max_pages - locked_pages); + if (end - index < (u64)(pvec_pages - 1)) + pvec_pages = (unsigned)(end - index) + 1; + pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - want); + pvec_pages); dout("pagevec_lookup_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; @@ -871,11 +893,15 @@ get_more_pages: unlikely(page->mapping != mapping)) { dout("!dirty or !mapping %p\n", page); unlock_page(page); - break; + continue; } - if (!wbc->range_cyclic && page->index > end) { + if (page->index > end) { dout("end of range %p\n", page); - done = 1; + /* can't be range_cyclic (1st pass) because + * end == -1 in that case. */ + stop = true; + if (ceph_wbc.head_snapc) + done = true; unlock_page(page); break; } @@ -884,39 +910,37 @@ get_more_pages: unlock_page(page); break; } - if (wbc->sync_mode != WB_SYNC_NONE) { - dout("waiting on writeback %p\n", page); - wait_on_page_writeback(page); - } - if (page_offset(page) >= - (snap_size == -1 ? 
i_size : snap_size)) { - dout("%p page eof %llu\n", page, - (snap_size == -1 ? i_size : snap_size)); - done = 1; + if (page_offset(page) >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", + page, ceph_wbc.i_size); + /* not done if range_cyclic */ + stop = true; unlock_page(page); break; } if (PageWriteback(page)) { - dout("%p under writeback\n", page); - unlock_page(page); - break; + if (wbc->sync_mode == WB_SYNC_NONE) { + dout("%p under writeback\n", page); + unlock_page(page); + continue; + } + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); } /* only if matching snap context */ pgsnapc = page_snap_context(page); - if (pgsnapc->seq > snapc->seq) { - dout("page snapc %p %lld > oldest %p %lld\n", + if (pgsnapc != snapc) { + dout("page snapc %p %lld != oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); unlock_page(page); - if (!locked_pages) - continue; /* keep looking for snap */ - break; + continue; } if (!clear_page_dirty_for_io(page)) { dout("%p !clear_page_dirty_for_io\n", page); unlock_page(page); - break; + continue; } /* @@ -942,7 +966,7 @@ get_more_pages: break; } - num_ops = 1 + do_sync; + num_ops = 1; strip_unit_end = page->index + ((len - 1) >> PAGE_SHIFT); @@ -972,8 +996,6 @@ get_more_pages: } /* note position of first page in pvec */ - if (first < 0) - first = i; dout("%p will write page %p idx %lu\n", inode, page, page->index); @@ -984,8 +1006,10 @@ get_more_pages: BLK_RW_ASYNC); } - pages[locked_pages] = page; - locked_pages++; + + pages[locked_pages++] = page; + pvec.pages[i] = NULL; + len += PAGE_SIZE; } @@ -993,23 +1017,23 @@ get_more_pages: if (!locked_pages) goto release_pvec_pages; if (i) { - int j; - BUG_ON(!locked_pages || first < 0); + unsigned j, n = 0; + /* shift unused page to beginning of pvec */ + for (j = 0; j < pvec_pages; j++) { + if (!pvec.pages[j]) + continue; + if (n < j) + pvec.pages[n] = pvec.pages[j]; + n++; + } + pvec.nr = n; if (pvec_pages && i == pvec_pages && locked_pages < max_pages) { 
dout("reached end pvec, trying for more\n"); - pagevec_reinit(&pvec); + pagevec_release(&pvec); goto get_more_pages; } - - /* shift unused pages over in the pvec... we - * will need to release them below. */ - for (j = i; j < pvec_pages; j++) { - dout(" pvec leftover page %p\n", pvec.pages[j]); - pvec.pages[j-i+first] = pvec.pages[j]; - } - pvec.nr -= i-first; } new_request: @@ -1019,10 +1043,9 @@ new_request: req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, offset, &len, 0, num_ops, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, - truncate_size, false); + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, false); if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, @@ -1031,8 +1054,8 @@ new_request: CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, - truncate_size, true); + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } BUG_ON(len < page_offset(pages[locked_pages - 1]) + @@ -1048,7 +1071,7 @@ new_request: for (i = 0; i < locked_pages; i++) { u64 cur_offset = page_offset(pages[i]); if (offset + len != cur_offset) { - if (op_idx + do_sync + 1 == req->r_num_ops) + if (op_idx + 1 == req->r_num_ops) break; osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); @@ -1069,14 +1092,15 @@ new_request: len += PAGE_SIZE; } - if (snap_size != -1) { - len = min(len, snap_size - offset); + if (ceph_wbc.size_stable) { + len = min(len, ceph_wbc.i_size - offset); } else if (i == locked_pages) { /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ u64 min_len = len + 1 - PAGE_SIZE; - len = min(len, (u64)i_size_read(inode) - offset); + len = get_writepages_data_length(inode, pages[i - 1], + offset); len = max(len, min_len); } dout("writepages got pages at %llu~%llu\n", offset, len); @@ -1085,17 +1109,12 @@ 
new_request: 0, !!pool, false); osd_req_op_extent_update(req, op_idx, len); - if (do_sync) { - op_idx++; - osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); - } BUG_ON(op_idx + 1 != req->r_num_ops); pool = NULL; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; - num_ops += do_sync; locked_pages -= i; /* allocate new pages array for next request */ @@ -1127,22 +1146,50 @@ new_request: if (pages) goto new_request; - if (wbc->nr_to_write <= 0) - done = 1; + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) + done = stop = true; release_pvec_pages: dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, pvec.nr ? pvec.pages[0] : NULL); pagevec_release(&pvec); - - if (locked_pages && !done) - goto retry; } if (should_loop && !done) { /* more to do; loop back to beginning of file */ dout("writepages looping back to beginning of file\n"); - should_loop = 0; + end = start_index - 1; /* OK even when start_index == 0 */ + + /* to write dirty pages associated with next snapc, + * we need to wait until current writes complete */ + if (wbc->sync_mode != WB_SYNC_NONE && + start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc.head_snapc) { + struct page *page; + unsigned i, nr; + index = 0; + while ((index <= end) && + (nr = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + PAGEVEC_SIZE))) { + for (i = 0; i < nr; i++) { + page = pvec.pages[i]; + if (page_snap_context(page) != snapc) + continue; + wait_on_page_writeback(page); + } + pagevec_release(&pvec); + cond_resched(); + } + } + + start_index = 0; index = 0; goto retry; } @@ -1152,8 +1199,8 @@ release_pvec_pages: out: ceph_osdc_put_request(req); - ceph_put_snap_context(snapc); - dout("writepages done, rc = %d\n", 
rc); + ceph_put_snap_context(last_snapc); + dout("writepages dend - startone, rc = %d\n", rc); return rc; } @@ -1165,8 +1212,7 @@ out: static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { - struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, - NULL, NULL); + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); @@ -1211,8 +1257,7 @@ retry_locked: * this page is already dirty in another (older) snap * context! is it writeable now? */ - oldest = get_oldest_context(inode, NULL, NULL, NULL); - + oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); dout(" page %p snapc %p not current or oldest\n", diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 174d6e6569a8..a3ab265d3215 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -209,7 +209,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ - if (fsc->fscache == NULL) + if (!fsc->fscache) return; /* Only cache for regular files that are read only */ diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7007ae2a5ad2..157fe59fbabe 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -490,13 +490,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, } /* - * if we are newly issued FILE_SHARED, mark dir not complete; we - * don't know what happened to this directory while we didn't - * have the cap. + * If FILE_SHARED is newly issued, mark dir not complete. We don't + * know what happened to this directory while we didn't have the cap. + * If FILE_SHARED is being revoked, also mark dir not complete. It + * stops on-going cached readdir. 
*/ - if ((issued & CEPH_CAP_FILE_SHARED) && - (had & CEPH_CAP_FILE_SHARED) == 0) { - ci->i_shared_gen++; + if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { + if (issued & CEPH_CAP_FILE_SHARED) + ci->i_shared_gen++; if (S_ISDIR(ci->vfs_inode.i_mode)) { dout(" marking %p NOT complete\n", &ci->vfs_inode); __ceph_dir_clear_complete(ci); @@ -611,7 +612,7 @@ void ceph_add_cap(struct inode *inode, } if (flags & CEPH_CAP_FLAG_AUTH) { - if (ci->i_auth_cap == NULL || + if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { ci->i_auth_cap = cap; cap->mds_wanted = wanted; @@ -728,7 +729,7 @@ static void __touch_cap(struct ceph_cap *cap) struct ceph_mds_session *s = cap->session; spin_lock(&s->s_cap_lock); - if (s->s_cap_iterator == NULL) { + if (!s->s_cap_iterator) { dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); @@ -1248,7 +1249,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.mode = inode->i_mode; arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - arg.flags = 0; + if (list_empty(&ci->i_cap_snaps)) + arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; + else + arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; if (sync) arg.flags |= CEPH_CLIENT_CAPS_SYNC; @@ -1454,13 +1458,19 @@ retry: goto retry; } + // make sure flushsnap messages are sent in proper order. 
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + __kick_flushing_caps(mdsc, session, ci, 0); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + } + __ceph_flush_snaps(ci, session); out: spin_unlock(&ci->i_ceph_lock); if (psession) { *psession = session; - } else { + } else if (session) { mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); } @@ -1901,11 +1911,7 @@ ack: (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { - spin_lock(&mdsc->cap_dirty_lock); - oldest_flush_tid = __get_oldest_flush_tid(mdsc); - spin_unlock(&mdsc->cap_dirty_lock); - __kick_flushing_caps(mdsc, session, ci, - oldest_flush_tid); + __kick_flushing_caps(mdsc, session, ci, 0); ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) @@ -2110,7 +2116,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret < 0) goto out; @@ -3422,7 +3428,7 @@ retry: tcap = __get_cap_for_mds(ci, target); if (tcap) { /* already have caps from the target */ - if (tcap->cap_id != t_cap_id || + if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_seq) < 0) { dout(" updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 4e2d112c982f..d635496ea189 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -24,7 +24,7 @@ static int mdsmap_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mdsmap *mdsmap; - if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) + if (!fsc->mdsc || !fsc->mdsc->mdsmap) return 0; mdsmap = fsc->mdsc->mdsmap; seq_printf(s, "epoch %d\n", mdsmap->m_epoch); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ef7240ace576..019c2036d36f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -377,8 
+377,10 @@ more: } /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; - req->r_direct_hash = ceph_frag_value(frag); - __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + if (op == CEPH_MDS_OP_READDIR) { + req->r_direct_hash = ceph_frag_value(frag); + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + } if (fi->last_name) { req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3d48c415f3cb..65a6fa12c857 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -175,7 +175,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); - if (cf == NULL) { + if (!cf) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } @@ -562,8 +562,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ssize_t ret; size_t len = iov_iter_count(to); - dout("sync_read on file %p %llu~%u %s\n", file, off, - (unsigned)len, + dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? 
"O_DIRECT" : ""); if (!len) @@ -788,7 +787,7 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); @@ -800,7 +799,6 @@ static void ceph_aio_retry_work(struct work_struct *work) } req->r_ops[0] = orig_req->r_ops[0]; - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = aio_req->mtime; req->r_data_offset = req->r_ops[0].extent.offset; @@ -847,8 +845,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_read_write (%s) on file %p %lld~%u\n", - (write ? "write" : "read"), file, pos, (unsigned)count); + dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", + (write ? "write" : "read"), file, pos, (unsigned)count, + snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) @@ -861,7 +860,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; } else { flags = CEPH_OSD_FLAG_READ; } @@ -874,8 +873,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, - /*include a 'startsync' command*/ - write ? 2 : 1, + 1, write ? 
CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ, flags, snapc, @@ -887,6 +885,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + len = size; pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { @@ -922,7 +925,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_SIZE - 1)); - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = mtime; } @@ -1048,7 +1050,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", + file, pos, (unsigned)count, snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) @@ -1060,7 +1063,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | C |