From d84560f74d852ea0cf663edeaee3a470917c2f36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Aug 2019 14:42:27 +0200 Subject: jbd2: Simplify journal_unmap_buffer() journal_unmap_buffer() checks first whether the buffer head is a journal. If so it takes locks and then invokes jbd2_journal_grab_journal_head() followed by another check whether this is journal head buffer. The double checking is pointless. Replace the initial check with jbd2_journal_grab_journal_head() which alredy checks whether the buffer head is actually a journal. Allows also early access to the journal head pointer for the upcoming conversion of state lock to a regular spinlock. Signed-off-by: Thomas Gleixner Reviewed-by: Jan Kara Cc: linux-ext4@vger.kernel.org Cc: "Theodore Ts'o" Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20190809124233.13277-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index bee8498d7792..2273b1067d3a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2199,7 +2199,8 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, * holding the page lock. --sct */ - if (!buffer_jbd(bh)) + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ @@ -2207,10 +2208,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) - goto zap_buffer_no_jh; - /* * We cannot remove the buffer from checkpoint lists until the * transaction adding inode to orphan list (let's call it T) @@ -2332,7 +2329,6 @@ zap_buffer: */ jh->b_modified = 0; jbd2_journal_put_journal_head(jh); -zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); write_unlock(&journal->j_state_lock); -- cgit v1.2.3 From 93108ebb848df8d4948d51db14714a14c4e81111 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 9 Aug 2019 14:42:29 +0200 Subject: jbd2: Move dropping of jh reference out of un/re-filing functions __jbd2_journal_unfile_buffer() and __jbd2_journal_refile_buffer() drop transaction's jh reference when they remove jh from a transaction. This will be however inconvenient once we move state lock into journal_head itself as we still need to unlock it and we'd need to grab jh reference just for that. Move dropping of jh reference out of these functions into the few callers. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20190809124233.13277-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 5 ++++- fs/jbd2/transaction.c | 23 +++++++++++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 132fb92098c7..a0a191f3df77 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -918,6 +918,7 @@ restart_loop: transaction_t *cp_transaction; struct buffer_head *bh; int try_to_free = 0; + bool drop_ref; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); @@ -1022,8 +1023,10 @@ restart_loop: try_to_free = 1; } JBUFFER_TRACE(jh, "refile or unfile buffer"); - __jbd2_journal_refile_buffer(jh); + drop_ref = __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); + if (drop_ref) + jbd2_journal_put_journal_head(jh); if (try_to_free) release_buffer_page(bh); /* Drops bh reference */ else diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2273b1067d3a..620113068e03 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1598,6 +1598,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); + jbd2_journal_put_journal_head(jh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); goto not_jbd; @@ -1971,17 +1972,15 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) } /* - * Remove buffer from all transactions. + * Remove buffer from all transactions. The caller is responsible for dropping + * the jh reference that belonged to the transaction. * * Called with bh_state lock and j_list_lock - * - * jh and bh may be already freed when this function returns. */ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; - jbd2_journal_put_journal_head(jh); } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) @@ -1995,6 +1994,7 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); __brelse(bh); } @@ -2133,6 +2133,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) } else { JBUFFER_TRACE(jh, "on running transaction"); __jbd2_journal_unfile_buffer(jh); + jbd2_journal_put_journal_head(jh); } return may_free; } @@ -2496,9 +2497,11 @@ void jbd2_journal_file_buffer(struct journal_head *jh, * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) * - * jh and bh may be already free when this function returns + * When this function returns true, there's no next transaction to refile to + * and the caller has to drop jh reference through + * jbd2_journal_put_journal_head(). */ -void __jbd2_journal_refile_buffer(struct journal_head *jh) +bool __jbd2_journal_refile_buffer(struct journal_head *jh) { int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); @@ -2510,7 +2513,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) /* If the buffer is now unused, just drop it. */ if (jh->b_next_transaction == NULL) { __jbd2_journal_unfile_buffer(jh); - return; + return true; } /* @@ -2538,6 +2541,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) if (was_dirty) set_buffer_jbddirty(bh); + return false; } /* @@ -2549,15 +2553,18 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + bool drop; /* Get reference so that buffer cannot be freed before we unlock it */ get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __jbd2_journal_refile_buffer(jh); + drop = __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); spin_unlock(&journal->j_list_lock); __brelse(bh); + if (drop) + jbd2_journal_put_journal_head(jh); } /* -- cgit v1.2.3 From 6d69843e5d3f0c394e1e3004cc2b36efbe402b71 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 9 Aug 2019 14:42:30 +0200 Subject: jbd2: Drop unnecessary branch from jbd2_journal_forget() We have cleared both dirty & jbddirty bits from the bh. So there's no difference between bforget() and brelse(). Thus there's no point jumping to no_jbd branch. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20190809124233.13277-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 620113068e03..2d42bc42933e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1599,10 +1599,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) } else { __jbd2_journal_unfile_buffer(jh); jbd2_journal_put_journal_head(jh); - if (!buffer_jbd(bh)) { - spin_unlock(&journal->j_list_lock); - goto not_jbd; - } } spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { -- cgit v1.2.3 From 2e710ff03fc4599059eeda68c8de2383e65af825 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 9 Aug 2019 14:42:31 +0200 Subject: jbd2: Don't call __bforget() unnecessarily jbd2_journal_forget() jumps to 'not_jbd' branch which calls __bforget() in cases where the buffer is clean which is pointless. In case of failed assertion, it can be even argued that it is safer not to touch buffer's dirty bits. Also logically it makes more sense to just jump to 'drop' and that will make logic also simpler when we switch bh_state_lock to a spinlock. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20190809124233.13277-6-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2d42bc42933e..f2af4afc690a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1550,7 +1550,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!J_EXPECT_JH(jh, !jh->b_committed_data, "inconsistent data on disk")) { err = -EIO; - goto not_jbd; + goto drop; } /* keep track of whether or not this transaction modified us */ @@ -1640,7 +1640,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "belongs to none transaction"); spin_unlock(&journal->j_list_lock); - goto not_jbd; + goto drop; } /* @@ -1650,7 +1650,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!buffer_dirty(bh)) { __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); - goto not_jbd; + goto drop; } /* @@ -1663,10 +1663,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); spin_unlock(&journal->j_list_lock); } - +drop: jbd_unlock_bh_state(bh); __brelse(bh); -drop: if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ handle->h_buffer_credits++; -- cgit v1.2.3 From 464170647b5648bb81f3615567485fcb9a685bed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Aug 2019 14:42:32 +0200 Subject: jbd2: Make state lock a spinlock Bit-spinlocks are problematic on PREEMPT_RT if functions which might sleep on RT, e.g. spin_lock(), alloc/free(), are invoked inside the lock held region because bit spinlocks disable preemption even on RT. A first attempt was to replace state lock with a spinlock placed in struct buffer_head and make the locking conditional on PREEMPT_RT and DEBUG_BIT_SPINLOCKS. Jan pointed out that there is a 4 byte hole in struct journal_head where a regular spinlock fits in and he would not object to convert the state lock to a spinlock unconditionally. Aside of solving the RT problem, this also gains lockdep coverage for the journal head state lock (bit-spinlocks are not covered by lockdep as it's hard to fit a lockdep map into a single bit). The trivial change would have been to convert the jbd_*lock_bh_state() inlines, but that comes with the downside that these functions take a buffer head pointer which needs to be converted to a journal head pointer which adds another level of indirection. As almost all functions which use this lock have a journal head pointer readily available, it makes more sense to remove the lock helper inlines and write out spin_*lock() at all call sites. Fixup all locking comments as well. Suggested-by: Jan Kara Signed-off-by: Thomas Gleixner Signed-off-by: Jan Kara Cc: "Theodore Ts'o" Cc: Mark Fasheh Cc: Joseph Qi Cc: Joel Becker Cc: Jan Kara Cc: linux-ext4@vger.kernel.org Link: https://lore.kernel.org/r/20190809124233.13277-7-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 8 ++-- fs/jbd2/journal.c | 10 +++-- fs/jbd2/transaction.c | 100 ++++++++++++++++++++++++-------------------------- fs/ocfs2/suballoc.c | 19 ++++++---- 4 files changed, 68 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a0a191f3df77..8e1ff875de43 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); jbd2_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } jbd2_journal_refile_buffer(journal, jh); } @@ -928,7 +928,7 @@ restart_loop: * done with it. */ get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); /* @@ -1024,7 +1024,7 @@ restart_loop: } JBUFFER_TRACE(jh, "refile or unfile buffer"); drop_ref = __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); if (drop_ref) jbd2_journal_put_journal_head(jh); if (try_to_free) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1c58859aa592..5d4192f05879 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); - jbd_lock_bh_state(bh_in); + spin_lock(&jh_in->b_state_lock); repeat: /* * If a new transaction has already done a buffer copy-out, then @@ -405,13 +405,13 @@ repeat: if (need_copy_out && !done_copy_out) { char *tmp; - jbd_unlock_bh_state(bh_in); + spin_unlock(&jh_in->b_state_lock); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); if (!tmp) { brelse(new_bh); return -ENOMEM; } - jbd_lock_bh_state(bh_in); + spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); goto repeat; @@ -464,7 +464,7 @@ repeat: __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); spin_unlock(&journal->j_list_lock); set_buffer_shadow(bh_in); - jbd_unlock_bh_state(bh_in); + spin_unlock(&jh_in->b_state_lock); return do_escape | (done_copy_out << 1); } @@ -2410,6 +2410,8 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); } + if (ret) + spin_lock_init(&ret->b_state_lock); return ret; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f2af4afc690a..7c11afe60532 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -879,7 +879,7 @@ repeat: start_lock = jiffies; lock_buffer(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); /* If it takes too long to lock the buffer, trace it */ time_lock = jbd2_time_diff(start_lock, jiffies); @@ -929,7 +929,7 @@ repeat: error = -EROFS; if (is_handle_aborted(handle)) { - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); goto out; } error = 0; @@ -993,7 +993,7 @@ repeat: */ if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); goto repeat; } @@ -1014,7 +1014,7 @@ repeat: JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS | __GFP_NOFAIL); goto repeat; @@ -1033,7 +1033,7 @@ attach_next: jh->b_next_transaction = transaction; done: - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); /* * If we are about to journal a buffer, then any revoke pending on it is @@ -1172,7 +1172,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * that case: the transaction must have deleted the buffer for it to be * reused here. */ - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1207,7 +1207,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); /* * akpm: I added this. ext3_alloc_branch can pick up new indirect @@ -1275,13 +1275,13 @@ repeat: committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS|__GFP_NOFAIL); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (!jh->b_committed_data) { /* Copy out the current buffer contents into the * preserved, committed copy. */ JBUFFER_TRACE(jh, "generate b_committed data"); if (!committed_data) { - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); goto repeat; } @@ -1289,7 +1289,7 @@ repeat: committed_data = NULL; memcpy(jh->b_committed_data, bh->b_data, bh->b_size); } - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); out: jbd2_journal_put_journal_head(jh); if (unlikely(committed_data)) @@ -1390,16 +1390,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction && jh->b_next_transaction != transaction) { - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. */ if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) { - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) pr_err("JBD2: assertion failure: h_type=%u " @@ -1409,13 +1409,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_jlist); J_ASSERT_JH(jh, jh->b_transaction != transaction || jh->b_jlist == BJ_Metadata); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } goto out; } journal = transaction->t_journal; - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (jh->b_modified == 0) { /* @@ -1501,7 +1501,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); spin_unlock(&journal->j_list_lock); out_unlock_bh: - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); out: JBUFFER_TRACE(jh, "exit"); return ret; @@ -1539,11 +1539,13 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); - jbd_lock_bh_state(bh); + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { + __bforget(bh); + return 0; + } - if (!buffer_jbd(bh)) - goto not_jbd; - jh = bh2jh(bh); + spin_lock(&jh->b_state_lock); /* Critical error: attempting to delete a bitmap buffer, maybe? * Don't do any jbd operations, and return an error. */ @@ -1664,18 +1666,14 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); } drop: - jbd_unlock_bh_state(bh); __brelse(bh); + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ handle->h_buffer_credits++; } return err; - -not_jbd: - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; } /** @@ -1874,7 +1872,7 @@ free_and_exit: * * j_list_lock is held. * - * jbd_lock_bh_state(jh2bh(jh)) is held. + * jh->b_state_lock is held. */ static inline void @@ -1898,7 +1896,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh) * * Called with j_list_lock held, and the journal may not be locked. * - * jbd_lock_bh_state(jh2bh(jh)) is held. + * jh->b_state_lock is held. */ static inline void @@ -1930,7 +1928,7 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -1984,11 +1982,11 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) /* Get reference so that buffer cannot be freed before we unlock it */ get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); __brelse(bh); } @@ -1996,7 +1994,7 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) /* * Called from jbd2_journal_try_to_free_buffers(). * - * Called under jbd_lock_bh_state(bh) + * Called under jh->b_state_lock */ static void __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) @@ -2083,10 +2081,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, if (!jh) continue; - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); __journal_try_to_free_buffer(journal, bh); + spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); - jbd_unlock_bh_state(bh); if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); @@ -2107,7 +2105,7 @@ busy: * * Called under j_list_lock. * - * Called under jbd_lock_bh_state(bh). + * Called under jh->b_state_lock. */ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) { @@ -2201,7 +2199,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, /* OK, we have data buffer in journaled mode */ write_lock(&journal->j_state_lock); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); /* @@ -2282,10 +2280,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, * for commit and try again. */ if (partial_page) { - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); return -EBUSY; } /* @@ -2297,10 +2295,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); return 0; } else { /* Good, the buffer belongs to the running transaction. @@ -2324,10 +2322,10 @@ zap_buffer: * here. */ jh->b_modified = 0; - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2414,7 +2412,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -2476,11 +2474,11 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, void jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { - jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&jh->b_state_lock); spin_lock(&transaction->t_journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, jlist); spin_unlock(&transaction->t_journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + spin_unlock(&jh->b_state_lock); } /* @@ -2490,7 +2488,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh, * buffer on that transaction's metadata list. * * Called under j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)) + * Called under jh->b_state_lock * * When this function returns true, there's no next transaction to refile to * and the caller has to drop jh reference through @@ -2501,7 +2499,7 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh) int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); @@ -2547,17 +2545,13 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh) */ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { - struct buffer_head *bh = jh2bh(jh); bool drop; - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); drop = __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); spin_unlock(&journal->j_list_lock); - __brelse(bh); if (drop) jbd2_journal_put_journal_head(jh); } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 69c21a3843af..4180c3ef0a68 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr) { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct journal_head *jh; int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) @@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, if (!buffer_jbd(bg_bh)) return 1; - jbd_lock_bh_state(bg_bh); - bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + jh = bh2jh(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; if (bg) ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); else ret = 1; - jbd_unlock_bh_state(bg_bh); + spin_unlock(&jh->b_state_lock); return ret; } @@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, int status; unsigned int tmp; struct ocfs2_group_desc *undo_bg = NULL; + struct journal_head *jh; /* The caller got this descriptor from * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, goto bail; } + jh = bh2jh(group_bh); if (undo_fn) { - jbd_lock_bh_state(group_bh); - undo_bg = (struct ocfs2_group_desc *) - bh2jh(group_bh)->b_committed_data; + spin_lock(&jh->b_state_lock); + undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; BUG_ON(!undo_bg); } @@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), @@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); ocfs2_journal_dirty(handle, group_bh); bail: -- cgit v1.2.3 From 7855a57d008b2354dd52078974529e08b889f98a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Aug 2019 14:42:33 +0200 Subject: jbd2: Free journal head outside of locked region On PREEMPT_RT bit-spinlocks have the same semantics as on PREEMPT_RT=n, i.e. they disable preemption. That means functions which are not safe to be called in preempt disabled context on RT trigger a might_sleep() assert. The journal head bit spinlock is mostly held for short code sequences with trivial RT safe functionality, except for one place: jbd2_journal_put_journal_head() invokes __journal_remove_journal_head() with the journal head bit spinlock held. __journal_remove_journal_head() invokes kmem_cache_free() which must not be called with preemption disabled on RT. Jan suggested to rework the removal function so the actual free happens outside the bit-spinlocked region. Split it into two parts: - Do the sanity checks and the buffer head detach under the lock - Do the actual free after dropping the lock There is error case handling in the free part which needs to dereference the b_size field of the now detached buffer head. Due to paranoia (caused by ignorance) the size is retrieved in the detach function and handed into the free function. Might be over-engineered, but better safe than sorry. This makes the journal head bit-spinlock usage RT compliant and also avoids nested locking which is not covered by lockdep. Suggested-by: Jan Kara Signed-off-by: Thomas Gleixner Cc: linux-ext4@vger.kernel.org Cc: "Theodore Ts'o" Cc: Jan Kara Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20190809124233.13277-8-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5d4192f05879..f6034ce4a107 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2531,17 +2531,23 @@ static void __journal_remove_journal_head(struct buffer_head *bh) J_ASSERT_BH(bh, buffer_jbd(bh)); J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); + + /* Unlink before dropping the lock */ + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); +} + +static void journal_release_journal_head(struct journal_head *jh, size_t b_size) +{ if (jh->b_frozen_data) { printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); - jbd2_free(jh->b_frozen_data, bh->b_size); + jbd2_free(jh->b_frozen_data, b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); - jbd2_free(jh->b_committed_data, bh->b_size); + jbd2_free(jh->b_committed_data, b_size); } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); journal_free_journal_head(jh); } @@ -2559,9 +2565,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) if (!jh->b_jcount) { __journal_remove_journal_head(bh); jbd_unlock_bh_journal_head(bh); + journal_release_journal_head(jh, bh->b_size); __brelse(bh); - } else + } else { jbd_unlock_bh_journal_head(bh); + } } /* -- cgit v1.2.3 From 821ff38d192a42a95fe33b5928ce5ed15cbe8564 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 16 Oct 2019 13:07:07 +0530 Subject: ext4: keep uniform naming convention for io & io_end variables Let's keep uniform naming convention for ext4_submit_io (io) & ext4_end_io_t (io_end) structures, to avoid any confusion. No functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/20191016073711.4141-2-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 55 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 12ceadef32c5..d9b96fc976a3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -136,19 +136,19 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) * cannot get to ext4_ext_truncate() before all IOs overlapping that range are * completed (happens from ext4_free_ioend()). */ -static int ext4_end_io(ext4_io_end_t *io) +static int ext4_end_io_end(ext4_io_end_t *io_end) { - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - handle_t *handle = io->handle; + struct inode *inode = io_end->inode; + loff_t offset = io_end->offset; + ssize_t size = io_end->size; + handle_t *handle = io_end->handle; int ret = 0; - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); + io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io->handle = NULL; /* Following call will use up the handle */ + io_end->handle = NULL; /* Following call will use up the handle */ ret = ext4_convert_unwritten_extents(handle, inode, offset, size); if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { ext4_msg(inode->i_sb, KERN_EMERG, @@ -157,8 +157,8 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - ext4_clear_io_unwritten_flag(io); - ext4_release_io_end(io); + ext4_clear_io_unwritten_flag(io_end); + ext4_release_io_end(io_end); return ret; } @@ -166,21 +166,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; + ext4_io_end_t *io_end, *io_end0, *io_end1; if (list_empty(head)) return; ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); - list_for_each_entry(io, head, list) { - cur = &io->list; + list_for_each_entry(io_end, head, list) { + cur = &io_end->list; before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); + io_end0 = container_of(before, ext4_io_end_t, list); after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); + io_end1 = container_of(after, ext4_io_end_t, list); ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); + io_end, inode->i_ino, io_end0, io_end1); } #endif } @@ -207,7 +207,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) static int ext4_do_flush_completed_IO(struct inode *inode, struct list_head *head) { - ext4_io_end_t *io; + ext4_io_end_t *io_end; struct list_head unwritten; unsigned long flags; struct ext4_inode_info *ei = EXT4_I(inode); @@ -219,11 +219,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode, spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { - io = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); - list_del_init(&io->list); + io_end = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io_end->list); - err = ext4_end_io(io); + err = ext4_end_io_end(io_end); if (unlikely(!ret && err)) ret = err; } @@ -242,13 +242,14 @@ void ext4_end_io_rsv_work(struct work_struct *work) ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { - ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); - if (io) { - io->inode = inode; - INIT_LIST_HEAD(&io->list); - atomic_set(&io->count, 1); + ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags); + + if (io_end) { + io_end->inode = inode; + INIT_LIST_HEAD(&io_end->list); + atomic_set(&io_end->count, 1); } - return io; + return io_end; } void ext4_put_io_end_defer(ext4_io_end_t *io_end) -- cgit v1.2.3 From a00713ea982b7f2f9880a336b03bbf70f2202cbf Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 16 Oct 2019 13:07:08 +0530 Subject: ext4: Add API to bring in support for unwritten io_end_vec conversion This patch just brings in the API for conversion of unwritten io_end_vec extents which will be required for blocksize < pagesize support for dioread_nolock feature. No functional changes in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/20191016073711.4141-3-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents.c | 42 +++++++++++++++++++++++++++--------------- fs/ext4/page-io.c | 7 +++---- 3 files changed, 32 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 03db3e71676c..8d924bd19ca7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3264,6 +3264,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, + ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fb0f99dc8c22..731e67ccab22 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4962,23 +4962,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, int ret = 0; int ret2 = 0; struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - /* - * This is somewhat ugly but the idea is clear: When transaction is - * reserved, everything goes into it. Otherwise we rather start several - * smaller transactions for conversion of each extent separately. - */ - if (handle) { - handle = ext4_journal_start_reserved(handle, - EXT4_HT_EXT_CONVERT); - if (IS_ERR(handle)) - return PTR_ERR(handle); - credits = 0; - } else { + if (!handle) { /* * credits to insert 1 extent into extent tree */ @@ -5009,11 +4999,33 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, if (ret <= 0 || ret2) break; } - if (!credits) - ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } +int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) +{ + int ret, err = 0; + + /* + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. + */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + + ret = ext4_convert_unwritten_extents(handle, io_end->inode, + io_end->offset, io_end->size); + if (handle) + err = ext4_journal_stop(handle); + + return ret < 0 ? ret : err; +} + /* * If newes is not existing extent (newes->ec_pblk equals zero) find * delayed extent at start of newes and update newes accordingly and diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d9b96fc976a3..3ccf54a380b2 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -149,7 +149,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) io_end, inode->i_ino, io_end->list.next, io_end->list.prev); io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_extents(handle, inode, offset, size); + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " @@ -269,9 +269,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end) if (atomic_dec_and_test(&io_end->count)) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_extents(io_end->handle, - io_end->inode, io_end->offset, - io_end->size); + err = ext4_convert_unwritten_io_end_vec(io_end->handle, + io_end); io_end->handle = NULL; ext4_clear_io_unwritten_flag(io_end); } -- cgit v1.2.3 From 2943fdbc688e7a1b4d9e3bc76a8e5ba624550213 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 16 Oct 2019 13:07:09 +0530 Subject: ext4: Refactor mpage_map_and_submit_buffers function This patch refactors mpage_map_and_submit_buffers to take out the page buffers processing, as a separate function. This will be required to add support for blocksize < pagesize for dioread_nolock feature. No functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/20191016073711.4141-4-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 125 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 83 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 516faa280ced..5081f91cc1c4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2340,6 +2340,75 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, return lblk < blocks; } +/* + * mpage_process_page - update page buffers corresponding to changed extent and + * may submit fully mapped page for IO + * + * @mpd - description of extent to map, on return next extent to map + * @m_lblk - logical block mapping. + * @m_pblk - corresponding physical mapping. + * @map_bh - determines on return whether this page requires any further + * mapping or not. + * Scan given page buffers corresponding to changed extent and update buffer + * state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits. + * If the given page is not fully mapped, we update @map to the next extent in + * the given page that needs mapping & return @map_bh as true. + */ +static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, + ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, + bool *map_bh) +{ + struct buffer_head *head, *bh; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + ext4_lblk_t lblk = *m_lblk; + ext4_fsblk_t pblock = *m_pblk; + int err = 0; + + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + + /* + * FIXME: If dioread_nolock supports + * blocksize < pagesize, we need to make + * sure we add size mapped so far to + * io_end->size as the following call + * can submit the page for IO. + */ + err = mpage_process_page_bufs(mpd, head, bh, lblk); + if (err > 0) + err = 0; + *map_bh = true; + goto out; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + } while (lblk++, (bh = bh->b_this_page) != head); + /* + * FIXME: This is going to break if dioread_nolock + * supports blocksize < pagesize as we will try to + * convert potentially unmapped parts of inode. + */ + io_end->size += PAGE_SIZE; + *map_bh = false; +out: + *m_lblk = lblk; + *m_pblk = pblock; + return err; +} + /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO @@ -2359,12 +2428,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct pagevec pvec; int nr_pages, i; struct inode *inode = mpd->inode; - struct buffer_head *head, *bh; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; - sector_t pblock; + ext4_fsblk_t pblock; int err; + bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; @@ -2380,50 +2449,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - bh = head = page_buffers(page); - do { - if (lblk < mpd->map.m_lblk) - continue; - if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { - /* - * Buffer after end of mapped extent. - * Find next buffer in the page to map. - */ - mpd->map.m_len = 0; - mpd->map.m_flags = 0; - /* - * FIXME: If dioread_nolock supports - * blocksize < pagesize, we need to make - * sure we add size mapped so far to - * io_end->size as the following call - * can submit the page for IO. - */ - err = mpage_process_page_bufs(mpd, head, - bh, lblk); - pagevec_release(&pvec); - if (err > 0) - err = 0; - return err; - } - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock++; - } - clear_buffer_unwritten(bh); - } while (lblk++, (bh = bh->b_this_page) != head); - + err = mpage_process_page(mpd, page, &lblk, &pblock, + &map_bh); /* - * FIXME: This is going to break if dioread_nolock - * supports blocksize < pagesize as we will try to - * convert potentially unmapped parts of inode. + * If map_bh is true, means page may require further bh + * mapping, or maybe the page was submitted for IO. + * So we return to call further extent mapping. */ - mpd->io_submit.io_end->size += PAGE_SIZE; + if (err < 0 || map_bh == true) + goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); - if (err < 0) { - pagevec_release(&pvec); - return err; - } + if (err < 0) + goto out; } pagevec_release(&pvec); } @@ -2431,6 +2469,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; +out: + pagevec_release(&pvec); + return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) -- cgit v1.2.3 From c8cc88163f40df39e50cda63ac361631864b453e Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 16 Oct 2019 13:07:10 +0530 Subject: ext4: Add support for blocksize < pagesize in dioread_nolock This patch adds the support for blocksize < pagesize for dioread_nolock feature. Since in case of blocksize < pagesize, we can have multiple small buffers of page as unwritten extents, we need to maintain a vector of these unwritten extents which needs the conversion after the IO is complete. Thus, we maintain a list of tuple pair (io_end_vec) for this & traverse this list to do the unwritten to written conversion. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/20191016073711.4141-5-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 11 +++++++++-- fs/ext4/extents.c | 11 +++++++++-- fs/ext4/inode.c | 37 ++++++++++++++++++++----------------- fs/ext4/page-io.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 82 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8d924bd19ca7..3616f1b0c987 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -198,6 +198,12 @@ struct ext4_system_blocks { */ #define EXT4_IO_END_UNWRITTEN 0x0001 +struct ext4_io_end_vec { + struct list_head list; /* list of io_end_vec */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +}; + /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. @@ -211,8 +217,7 @@ typedef struct ext4_io_end { * bios covering the extent */ unsigned int flag; /* unwritten or not */ atomic_t count; /* reference counter */ - loff_t offset; /* offset in the file */ - ssize_t size; /* size of the extent */ + struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { @@ -3324,6 +3329,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc, bool keep_towrite); +extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); +extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 731e67ccab22..cf6c5f64cb58 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5005,6 +5005,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) { int ret, err = 0; + struct ext4_io_end_vec *io_end_vec; /* * This is somewhat ugly but the idea is clear: When transaction is @@ -5018,8 +5019,14 @@ int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) return PTR_ERR(handle); } - ret = ext4_convert_unwritten_extents(handle, io_end->inode, - io_end->offset, io_end->size); + list_for_each_entry(io_end_vec, &io_end->list_vec, list) { + ret = ext4_convert_unwritten_extents(handle, io_end->inode, + io_end_vec->offset, + io_end_vec->size); + if (ret) + break; + } + if (handle) err = ext4_journal_stop(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5081f91cc1c4..da9fed86086c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2364,6 +2364,9 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; + int blkbits = mpd->inode->i_blkbits; + ssize_t io_end_size = 0; + struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = page_buffers(page); do { @@ -2376,17 +2379,16 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, */ mpd->map.m_len = 0; mpd->map.m_flags = 0; + io_end_vec->size += io_end_size; + io_end_size = 0; - /* - * FIXME: If dioread_nolock supports - * blocksize < pagesize, we need to make - * sure we add size mapped so far to - * io_end->size as the following call - * can submit the page for IO. - */ err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; + if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { + io_end_vec = ext4_alloc_io_end_vec(io_end); + io_end_vec->offset = mpd->map.m_lblk << blkbits; + } *map_bh = true; goto out; } @@ -2395,13 +2397,11 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); + io_end_size += (1 << blkbits); } while (lblk++, (bh = bh->b_this_page) != head); - /* - * FIXME: This is going to break if dioread_nolock - * supports blocksize < pagesize as we will try to - * convert potentially unmapped parts of inode. - */ - io_end->size += PAGE_SIZE; + + io_end_vec->size += io_end_size; + io_end_size = 0; *map_bh = false; out: *m_lblk = lblk; @@ -2551,9 +2551,10 @@ static int mpage_map_and_submit_extent(handle_t *handle, int err; loff_t disksize; int progress = 0; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + struct ext4_io_end_vec *io_end_vec = ext4_alloc_io_end_vec(io_end); - mpd->io_submit.io_end->offset = - ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -3654,6 +3655,7 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { ext4_io_end_t *io_end = private; + struct ext4_io_end_vec *io_end_vec; /* if not async direct IO just return */ if (!io_end) @@ -3671,8 +3673,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext4_clear_io_unwritten_flag(io_end); size = 0; } - io_end->offset = offset; - io_end->size = size; + io_end_vec = ext4_alloc_io_end_vec(io_end); + io_end_vec->offset = offset; + io_end_vec->size = size; ext4_put_io_end(io_end); return 0; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3ccf54a380b2..893913d1c2fe 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -31,18 +31,56 @@ #include "acl.h" static struct kmem_cache *io_end_cachep; +static struct kmem_cache *io_end_vec_cachep; int __init ext4_init_pageio(void) { io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); if (io_end_cachep == NULL) return -ENOMEM; + + io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0); + if (io_end_vec_cachep == NULL) { + kmem_cache_destroy(io_end_cachep); + return -ENOMEM; + } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_end_vec_cachep); +} + +struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec; + + io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS); + if (!io_end_vec) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&io_end_vec->list); + list_add_tail(&io_end_vec->list, &io_end->list_vec); + return io_end_vec; +} + +static void ext4_free_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec, *tmp; + + if (list_empty(&io_end->list_vec)) + return; + list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) { + list_del(&io_end_vec->list); + kmem_cache_free(io_end_vec_cachep, io_end_vec); + } +} + +struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end) +{ + BUG_ON(list_empty(&io_end->list_vec)); + return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list); } /* @@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) ext4_finish_bio(bio); bio_put(bio); } + ext4_free_io_end_vec(io_end); kmem_cache_free(io_end_cachep, io_end); } @@ -139,8 +178,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; - loff_t offset = io_end->offset; - ssize_t size = io_end->size; handle_t *handle = io_end->handle; int ret = 0; @@ -154,8 +191,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " - "(inode %lu, offset %llu, size %zd, error %d)", - inode->i_ino, offset, size, ret); + "(inode %lu, error %d)", inode->i_ino, ret); } ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); @@ -247,6 +283,7 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) if (io_end) { io_end->inode = inode; INIT_LIST_HEAD(&io_end->list); + INIT_LIST_HEAD(&io_end->list_vec); atomic_set(&io_end->count, 1); } return io_end; @@ -255,7 +292,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (atomic_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || + list_empty(&io_end->list_vec)) { ext4_release_io_end(io_end); return; } @@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio) struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " - "(offset %llu size %ld starting block %llu)", + "starting block %llu)", bio->bi_status, inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); mapping_set_error(inode->i_mapping, -- cgit v1.2.3 From c33fbe8f673c55c178bfe69d0d9f06f1a68bf6cf Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 16 Oct 2019 13:07:11 +0530 Subject: ext4: Enable blocksize < pagesize for dioread_nolock All support is now added for blocksize < pagesize for dioread_nolock. This patch removes those checks which disables dioread_nolock feature for blocksize != pagesize. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/20191016073711.4141-6-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dd654e53ba3d..7796e2ffc294 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2105,16 +2105,6 @@ static int parse_options(char *options, struct super_block *sb, } } #endif - if (test_opt(sb, DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - return 0; - } - } return 1; } -- cgit v1.2.3 From 53e5cca56795a301bbe8465781dab084f7ae8d54 Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Tue, 5 Nov 2019 22:58:55 +1100 Subject: ext4: reorder map.m_flags checks within ext4_iomap_begin() For the direct I/O changes that follow in this patch series, we need to accommodate for the case where the block mapping flags passed through to ext4_map_blocks() result in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits set. In order for any allocated unwritten extents to be converted correctly in the ->end_io() handler, the iomap->type must be set to IOMAP_UNWRITTEN for cases where the EXT4_MAP_UNWRITTEN bit has been set within m_flags. Hence the reason why we need to reshuffle this conditional statement around. This change is a no-op for DAX as the block mapping flags passed through to ext4_map_blocks() i.e. EXT4_GET_BLOCKS_CREATE_ZERO never results in both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN being set at once. Signed-off-by: Matthew Bobrowski Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/1309ad80d31a637b2deed55a85283d582a54a26a.1572949325.git.mbobrowski@mbobrowski.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0d8971b819e9..e4b0722717b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3577,10 +3577,20 @@ retry: iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } else { - if (map.m_flags & EXT4_MAP_MAPPED) { - iomap->type = IOMAP_MAPPED; - } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { + /* + * Flags passed into ext4_map_blocks() for direct I/O writes + * can result in m_flags having both EXT4_MAP_MAPPED and + * EXT4_MAP_UNWRITTEN bits set. In order for any allocated + * unwritten extents to be converted into written extents + * correctly within the ->end_io() handler, we need to ensure + * that the iomap->type is set appropriately. Hence the reason + * why we need to check whether EXT4_MAP_UNWRITTEN is set + * first. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; + } else if (map.m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; } else { WARN_ON_ONCE(1); return -EIO; -- cgit v1.2.3 From 548feebec7e93e58b647dba70b3303dcb569c914 Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Tue, 5 Nov 2019 22:59:22 +1100 Subject: ext4: update direct I/O read lock pattern for IOCB_NOWAIT This patch updates the lock pattern in ext4_direct_IO_read() to not block on inode lock in cases of IOCB_NOWAIT direct I/O reads. The locking condition implemented here is similar to that of 942491c9e6d6 ("xfs: fix AIM7 regression"). Fixes: 16c54688592c ("ext4: Allow parallel DIO reads") Signed-off-by: Matthew Bobrowski Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/c5d5e759f91747359fbd2c6f9a36240cf75ad79f.1572949325.git.mbobrowski@mbobrowski.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e4b0722717b3..f33fa86fff67 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3881,7 +3881,13 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) * writes & truncates and since we take care of writing back page cache, * we are protected against page writeback as well. */ - inode_lock_shared(inode); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, iocb->ki_pos + count - 1); if (ret) -- cgit v1.2.3 From 2e9b51d78229d5145725a481bb5464ebc0a3f9b2 Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Tue, 5 Nov 2019 22:59:37 +1100 Subject: ext4: iomap that extends beyond EOF should be marked dirty This patch addresses what Dave Chinner had discovered and fixed within commit: 7684e2c4384d. This changes does not have any user visible impact for ext4 as none of the current users of ext4_iomap_begin() that extend files depend on IOMAP_F_DIRTY. When doing a direct IO that spans the current EOF, and there are written blocks beyond EOF that extend beyond the current write, the only metadata update that needs to be done is a file size extension. However, we don't mark such iomaps as IOMAP_F_DIRTY to indicate that there is IO completion metadata updates required, and hence we may fail to correctly sync file size extensions made in IO completion when O_DSYNC writes are being used and the hardware supports FUA. Hence when setting IOMAP_F_DIRTY, we need to also take into account whether the iomap spans the current EOF. If it does, then we need to mark it dirty so that IO completion will call generic_write_sync() to flush the inode size update to stable storage correctly. Signed-off-by: Matthew Bobrowski Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/8b43ee9ee94bee5328da56ba0909b7d2229ef150.1572949325.git.mbobrowski@mbobrowski.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f33fa86fff67..b422d9b8c0bd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3565,8 +3565,14 @@ retry: return ret; } + /* + * Writes that span EOF might trigger an I/O size update on completion, + * so consider them to be dirty for the purposes of O_DSYNC, even if + * there is no other metadata changes being made or are pending here. + */ iomap->flags = 0; - if (ext4_inode_datasync_dirty(inode)) + if (ext4_inode_datasync_dirty(inode) || + offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; -- cgit v1.2.3 From c8fdfe294187455b70e42a15df35a3e1882f332d Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Tue, 5 Nov 2019 22:59:56 +1100 Subject: ext4: move set iomap routines into a separate helper ext4_set_iomap() Separate the iomap field population code that is currently within ext4_iomap_begin() into a separate helper ext4_set_iomap(). The intent of this function is self explanatory, however the rationale behind taking this step is to reeduce the overall clutter that we currently have within the ext4_iomap_begin() callback. Signed-off-by: Matthew Bobrowski Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/1ea34da65eecffcddffb2386668ae06134e8deaf.1572949325.git.mbobrowski@mbobrowski.org Signed-off-by: Theodore Ts'o --- fs/e