searcher: do UTF-8 BOM sniffing like UTF-16

Previously, we only looked for a UTF-16 BOM when determining whether to transcode. We should look for a UTF-8 BOM as well. Fixes #1638, Closes #1697
author: Alessandro Menezes <alessandroasm@gmail.com> 2020-10-02 16:17:39 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2021-05-31 21:51:18 -0400
commit: 2295061e8079b146e656526ce1b264bf8f217585 (patch)
tree: 99386fa5097984ebad0d32b4c96aa5d50aceda92
parent: 53c4855517ff3d6ef0cb612579ede1f6e281a506 (diff)
3 files changed, 34 insertions, 4 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27ba4956..9a987858 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -54,6 +54,8 @@ Bug fixes:
   Fix stdin detection when using PowerShell in UNIX environments.
 * [BUG #1765](https://github.com/BurntSushi/ripgrep/issues/1765):
   Fix panic when `--crlf` is used in some cases.
+* [BUG #1638](https://github.com/BurntSushi/ripgrep/issues/1638):
+  Correctly sniff UTF-8 and do transcoding, like we do for UTF-16.
 * [BUG #1816](https://github.com/BurntSushi/ripgrep/issues/1816):
   Add documentation for glob alternate syntax, e.g., `{a,b,..}`.
 * [BUG #1847](https://github.com/BurntSushi/ripgrep/issues/1847):
diff --git a/crates/searcher/src/searcher/mod.rs b/crates/searcher/src/searcher/mod.rs
index 1cbe6660..241f9a41 100644
--- a/crates/searcher/src/searcher/mod.rs
+++ b/crates/searcher/src/searcher/mod.rs
@@ -788,7 +788,7 @@ impl Searcher {
     /// Returns true if and only if the given slice needs to be transcoded.
     fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
         self.config.encoding.is_some()
-            || (self.config.bom_sniffing && slice_has_utf16_bom(slice))
+            || (self.config.bom_sniffing && slice_has_bom(slice))
     }
 }
 
@@ -973,16 +973,18 @@ impl Searcher {
     }
 }
 
-/// Returns true if and only if the given slice begins with a UTF-16 BOM.
+/// Returns true if and only if the given slice begins with a UTF-8 or UTF-16
+/// BOM.
 ///
 /// This is used by the searcher to determine if a transcoder is necessary.
 /// Otherwise, it is advantageous to search the slice directly.
-fn slice_has_utf16_bom(slice: &[u8]) -> bool {
+fn slice_has_bom(slice: &[u8]) -> bool {
     let enc = match encoding_rs::Encoding::for_bom(slice) {
         None => return false,
         Some((enc, _)) => enc,
     };
-    [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE].contains(&enc)
+    [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8]
+        .contains(&enc)
 }
 
 #[cfg(test)]
@@ -1009,4 +1011,21 @@ mod tests {
         let res = searcher.search_slice(matcher, &[], sink);
         assert!(res.is_err());
     }
+
+    #[test]
+    fn uft8_bom_sniffing() {
+        // See: https://github.com/BurntSushi/ripgrep/issues/1638
+        // ripgrep must sniff utf-8 BOM, just like it does with utf-16
+        let matcher = RegexMatcher::new("foo");
+        let haystack: &[u8] = &[0xef, 0xbb, 0xbf, 0x66, 0x6f, 0x6f];
+
+        let mut sink = KitchenSink::new();
+        let mut searcher = SearcherBuilder::new().build();
+
+        let res = searcher.search_slice(matcher, haystack, &mut sink);
+        assert!(res.is_ok());
+
+        let sink_output = String::from_utf8(sink.as_bytes().to_vec()).unwrap();
+        assert_eq!(sink_output, "1:0:foo\nbyte count:3\n");
+    }
 }
diff --git a/tests/regression.rs b/tests/regression.rs
index 2ecd2399..1bf239b8 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -867,6 +867,15 @@ use B;
     eqnice!("2\n", cmd.stdout());
 });
 
+// See: https://github.com/BurntSushi/ripgrep/issues/1638
+//
+// Tests if UTF-8 BOM is sniffed, then the column index is correct.
+rgtest!(r1638, |dir: Dir, mut cmd: TestCommand| {
+    dir.create_bytes("foo", b"\xef\xbb\xbfx");
+
+    eqnice!("foo:1:1:x\n", cmd.arg("--column").arg("x").stdout());
+});
+
 // See: https://github.com/BurntSushi/ripgrep/issues/1765
 rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| {
     dir.create("test", "\n");
author	Alessandro Menezes <alessandroasm@gmail.com>	2020-10-02 16:17:39 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2021-05-31 21:51:18 -0400
commit	2295061e8079b146e656526ce1b264bf8f217585 (patch)
tree	99386fa5097984ebad0d32b4c96aa5d50aceda92
parent	53c4855517ff3d6ef0cb612579ede1f6e281a506 (diff)