diff options
author | Alessandro Menezes <alessandroasm@gmail.com> | 2020-10-02 16:17:39 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2021-05-31 21:51:18 -0400 |
commit | 2295061e8079b146e656526ce1b264bf8f217585 (patch) | |
tree | 99386fa5097984ebad0d32b4c96aa5d50aceda92 | |
parent | 53c4855517ff3d6ef0cb612579ede1f6e281a506 (diff) |
searcher: do UTF-8 BOM sniffing like UTF-16
Previously, we only looked for the UTF-16 BOM when determining
whether to transcode. We should look for the UTF-8 BOM
as well.
Fixes #1638, Closes #1697
-rw-r--r-- | CHANGELOG.md | 2 | ||||
-rw-r--r-- | crates/searcher/src/searcher/mod.rs | 27 | ||||
-rw-r--r-- | tests/regression.rs | 9 |
3 files changed, 34 insertions, 4 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 27ba4956..9a987858 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,8 @@ Bug fixes: Fix stdin detection when using PowerShell in UNIX environments. * [BUG #1765](https://github.com/BurntSushi/ripgrep/issues/1765): Fix panic when `--crlf` is used in some cases. +* [BUG #1638](https://github.com/BurntSushi/ripgrep/issues/1638): + Correctly sniff UTF-8 and do transcoding, like we do for UTF-16. * [BUG #1816](https://github.com/BurntSushi/ripgrep/issues/1816): Add documentation for glob alternate syntax, e.g., `{a,b,..}`. * [BUG #1847](https://github.com/BurntSushi/ripgrep/issues/1847): diff --git a/crates/searcher/src/searcher/mod.rs b/crates/searcher/src/searcher/mod.rs index 1cbe6660..241f9a41 100644 --- a/crates/searcher/src/searcher/mod.rs +++ b/crates/searcher/src/searcher/mod.rs @@ -788,7 +788,7 @@ impl Searcher { /// Returns true if and only if the given slice needs to be transcoded. fn slice_needs_transcoding(&self, slice: &[u8]) -> bool { self.config.encoding.is_some() - || (self.config.bom_sniffing && slice_has_utf16_bom(slice)) + || (self.config.bom_sniffing && slice_has_bom(slice)) } } @@ -973,16 +973,18 @@ impl Searcher { } } -/// Returns true if and only if the given slice begins with a UTF-16 BOM. +/// Returns true if and only if the given slice begins with a UTF-8 or UTF-16 +/// BOM. /// /// This is used by the searcher to determine if a transcoder is necessary. /// Otherwise, it is advantageous to search the slice directly. 
-fn slice_has_utf16_bom(slice: &[u8]) -> bool { +fn slice_has_bom(slice: &[u8]) -> bool { let enc = match encoding_rs::Encoding::for_bom(slice) { None => return false, Some((enc, _)) => enc, }; - [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE].contains(&enc) + [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8] + .contains(&enc) } #[cfg(test)] @@ -1009,4 +1011,21 @@ mod tests { let res = searcher.search_slice(matcher, &[], sink); assert!(res.is_err()); } + + #[test] + fn uft8_bom_sniffing() { + // See: https://github.com/BurntSushi/ripgrep/issues/1638 + // ripgrep must sniff utf-8 BOM, just like it does with utf-16 + let matcher = RegexMatcher::new("foo"); + let haystack: &[u8] = &[0xef, 0xbb, 0xbf, 0x66, 0x6f, 0x6f]; + + let mut sink = KitchenSink::new(); + let mut searcher = SearcherBuilder::new().build(); + + let res = searcher.search_slice(matcher, haystack, &mut sink); + assert!(res.is_ok()); + + let sink_output = String::from_utf8(sink.as_bytes().to_vec()).unwrap(); + assert_eq!(sink_output, "1:0:foo\nbyte count:3\n"); + } } diff --git a/tests/regression.rs b/tests/regression.rs index 2ecd2399..1bf239b8 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -867,6 +867,15 @@ use B; eqnice!("2\n", cmd.stdout()); }); +// See: https://github.com/BurntSushi/ripgrep/issues/1638 +// +// Tests if UTF-8 BOM is sniffed, then the column index is correct. +rgtest!(r1638, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("foo", b"\xef\xbb\xbfx"); + + eqnice!("foo:1:1:x\n", cmd.arg("--column").arg("x").stdout()); +}); + // See: https://github.com/BurntSushi/ripgrep/issues/1765 rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| { dir.create("test", "\n"); |