ignore: check ignore rules before issuing stat calls

This seems like an obvious optimization but becomes critical when filesystem operations even as simple as stat can result in significant overheads; an example of this was a bespoke filesystem layer in Windows that hosted files remotely and would download them on-demand when particular filesystem operations occurred. Users of this system who ensured correct file-type fileters were being used could still get unnecessary file access resulting in large downloads. Fixes #1657, Closes #1660
author: Richard Khoury <richen.khoury@gmail.com> 2020-08-17 09:08:31 +1000
committer: Andrew Gallant <jamslam@gmail.com> 2021-05-31 21:51:18 -0400
commit: a28e664abd0d8912e4a2d85039fe282b9d37b994 (patch)
tree: 5640047f814d31879bd4005ff2be6da11290e9b1
parent: 0ca96e004c539e44242607fee5c4661edf27a5cd (diff)
2 files changed, 23 insertions, 7 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e98fc5e9..a8c47c1b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,11 @@ Security fixes:
   This is the public facing issue tracking CVE-2021-3013. ripgrep's README
   now contains a section describing how to report a vulnerability.
 
+Performance improvements:
+
+* [PERF #1657](https://github.com/BurntSushi/ripgrep/discussions/1657):
+  Check if a file should be ignored first before issuing stat calls.
+
 Feature enhancements:
 
 * Added or improved file type filtering for ASP, Bazel, dvc, FlatBuffers,
diff --git a/crates/ignore/src/walk.rs b/crates/ignore/src/walk.rs
index 31c1d9f1..4d4054a9 100644
--- a/crates/ignore/src/walk.rs
+++ b/crates/ignore/src/walk.rs
@@ -934,15 +934,23 @@ impl Walk {
         if ent.depth() == 0 {
             return Ok(false);
         }
-
+        // We ensure that trivial skipping is done before any other potentially
+        // expensive operations (stat, filesystem other) are done. This seems
+        // like an obvious optimization but becomes critical when filesystem
+        // operations even as simple as stat can result in significant
+        // overheads; an example of this was a bespoke filesystem layer in
+        // Windows that hosted files remotely and would download them on-demand
+        // when particular filesystem operations occurred. Users of this system
+        // who ensured correct file-type fileters were being used could still
+        // get unnecessary file access resulting in large downloads.
+        if should_skip_entry(&self.ig, ent) {
+            return Ok(true);
+        }
         if let Some(ref stdout) = self.skip {
             if path_equals(ent, stdout)? {
                 return Ok(true);
             }
         }
-        if should_skip_entry(&self.ig, ent) {
-            return Ok(true);
-        }
         if self.max_filesize.is_some() && !ent.is_dir() {
             return Ok(skip_filesize(
                 self.max_filesize.unwrap(),
@@ -1549,6 +1557,11 @@ impl<'s> Worker<'s> {
                 }
             }
         }
+        // N.B. See analogous call in the single-threaded implementation about
+        // why it's important for this to come before the checks below.
+        if should_skip_entry(ig, &dent) {
+            return WalkState::Continue;
+        }
         if let Some(ref stdout) = self.skip {
             let is_stdout = match path_equals(&dent, stdout) {
                 Ok(is_stdout) => is_stdout,
@@ -1558,7 +1571,6 @@ impl<'s> Worker<'s> {
                 return WalkState::Continue;
             }
         }
-        let should_skip_path = should_skip_entry(ig, &dent);
         let should_skip_filesize =
             if self.max_filesize.is_some() && !dent.is_dir() {
                 skip_filesize(
@@ -1575,8 +1587,7 @@ impl<'s> Worker<'s> {
             } else {
                 false
             };
-        if !should_skip_path && !should_skip_filesize && !should_skip_filtered
-        {
+        if !should_skip_filesize && !should_skip_filtered {
             self.send(Work { dent, ignore: ig.clone(), root_device });
         }
         WalkState::Continue
author	Richard Khoury <richen.khoury@gmail.com>	2020-08-17 09:08:31 +1000
committer	Andrew Gallant <jamslam@gmail.com>	2021-05-31 21:51:18 -0400
commit	a28e664abd0d8912e4a2d85039fe282b9d37b994 (patch)
tree	5640047f814d31879bd4005ff2be6da11290e9b1
parent	0ca96e004c539e44242607fee5c4661edf27a5cd (diff)