author     lesnyrumcajs <lesny.rumcajs@gmail.com>  2019-03-04 17:18:45 +0100
committer  Andrew Gallant <jamslam@gmail.com>      2019-04-06 09:36:32 -0400
commit     b00cd69a40329b14a304bb75567d13964856d733 (patch)
tree       629cd01258a613052e2e0df134bf64095afb51d4
parent     77439f99a4535e3b7ba997e5c825dd885a96b725 (diff)
searcher: add option to disable BOM sniffing
This commit adds a new encoding feature where the -E/--encoding flag will now
accept a value of 'none'. When given this value, all encoding related machinery
is disabled and ripgrep will search the raw bytes of the file, including the
BOM if it's present.

Closes #1207, Closes #1208
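As a quick illustration of the new mode, the raw-byte UTF-16 search from the
updated GUIDE (the file name here is hypothetical) looks like this:

```
$ rg -E none -a '(?-u)\(\x045\x04@\x04;\x04>\x04:\x04' some-utf16-file
```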
-rw-r--r--  Cargo.lock                          6
-rw-r--r--  GUIDE.md                           32
-rw-r--r--  complete/_rg                        2
-rw-r--r--  grep-regex/src/matcher.rs           2
-rw-r--r--  grep-searcher/Cargo.toml            2
-rw-r--r--  grep-searcher/src/searcher/mod.rs  43
-rw-r--r--  src/app.rs                          4
-rw-r--r--  src/args.rs                        69
-rw-r--r--  tests/feature.rs                   32
9 files changed, 158 insertions, 34 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 095ed414..a8420de4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -114,7 +114,7 @@ dependencies = [
[[package]]
name = "encoding_rs_io"
-version = "0.1.5"
+version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -226,7 +226,7 @@ dependencies = [
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
- "encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
+ "encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"grep-matcher 0.1.1",
"grep-regex 0.1.2",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -704,7 +704,7 @@ dependencies = [
"checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b"
"checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c"
"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed"
-"checksum encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f94ef2bcdb2f5d58e982ef565baa1ecfd04b7cb653d0bf1b49af1dd472faa8d8"
+"checksum encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9619ee7a2bf4e777e020b95c1439abaf008f8ea8041b78a0552c4f1bcf4df32c"
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
diff --git a/GUIDE.md b/GUIDE.md
index 0094a7b4..8022f292 100644
--- a/GUIDE.md
+++ b/GUIDE.md
@@ -603,7 +603,7 @@ topic, but we can try to summarize its relevancy to ripgrep:
* Files are generally just a bundle of bytes. There is no reliable way to know
their encoding.
* Either the encoding of the pattern must match the encoding of the files being
- searched, or a form of transcoding must be performed converts either the
+ searched, or a form of transcoding must be performed that converts either the
pattern or the file to the same encoding as the other.
* ripgrep tends to work best on plain text files, and among plain text files,
the most popular encodings likely consist of ASCII, latin1 or UTF-8. As
@@ -626,12 +626,15 @@ given, which is the default:
they correspond to a UTF-16 BOM, then ripgrep will transcode the contents of
the file from UTF-16 to UTF-8, and then execute the search on the transcoded
version of the file. (This incurs a performance penalty since transcoding
- is slower than regex searching.)
+ is slower than regex searching.) If the file contains invalid UTF-16, then
+ the Unicode replacement codepoint is substituted in place of invalid code
+ units.
* To handle other cases, ripgrep provides a `-E/--encoding` flag, which permits
you to specify an encoding from the
[Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
- ripgrep will assume *all* files searched are the encoding specified and
- will perform a transcoding step just like in the UTF-16 case described above.
+ ripgrep will assume *all* files searched are the encoding specified (unless
+ the file has a BOM) and will perform a transcoding step just like in the
+ UTF-16 case described above.
By default, ripgrep will not require its input be valid UTF-8. That is, ripgrep
can and will search arbitrary bytes. The key here is that if you're searching
@@ -641,9 +644,26 @@ pattern won't find anything. With all that said, this mode of operation is
important, because it lets you find ASCII or UTF-8 *within* files that are
otherwise arbitrary bytes.
+As a special case, the `-E/--encoding` flag supports the value `none`, which
+will completely disable all encoding related logic, including BOM sniffing.
+When `-E/--encoding` is set to `none`, ripgrep will search the raw bytes of
+the underlying file with no transcoding step. For example, here's how you might
+search the raw UTF-16 encoding of the string `Шерлок`:
+
+```
+$ rg '(?-u)\(\x045\x04@\x04;\x04>\x04:\x04' -E none -a some-utf16-file
+```
+
+Of course, that's just an example meant to show how one can drop down into
+raw bytes. Namely, the simpler command works as you might expect automatically:
+
+```
+$ rg 'Шерлок' some-utf16-file
+```
+
Finally, it is possible to disable ripgrep's Unicode support from within the
-pattern regular expression. For example, let's say you wanted `.` to match any
-byte rather than any Unicode codepoint. (You might want this while searching a
+regular expression. For example, let's say you wanted `.` to match any byte
+rather than any Unicode codepoint. (You might want this while searching a
binary file, since `.` by default will not match invalid UTF-8.) You could do
this by disabling Unicode via a regular expression flag:
diff --git a/complete/_rg b/complete/_rg
index 2e5c1937..c4a983ac 100644
--- a/complete/_rg
+++ b/complete/_rg
@@ -378,7 +378,7 @@ _rg_encodings() {
shift{-,_}jis csshiftjis {,x-}sjis ms_kanji ms932
utf{,-}8 utf-16{,be,le} unicode-1-1-utf-8
windows-{31j,874,949,125{0..8}} dos-874 tis-620 ansi_x3.4-1968
- x-user-defined auto
+ x-user-defined auto none
)
_wanted encodings expl encoding compadd -a "$@" - _encodings
diff --git a/grep-regex/src/matcher.rs b/grep-regex/src/matcher.rs
index 391439d9..d71f5777 100644
--- a/grep-regex/src/matcher.rs
+++ b/grep-regex/src/matcher.rs
@@ -52,7 +52,7 @@ impl RegexMatcherBuilder {
}
let matcher = RegexMatcherImpl::new(&chir)?;
- trace!("final regex: {:?}", matcher.regex());
+ trace!("final regex: {:?}", matcher.regex().to_string());
Ok(RegexMatcher {
config: self.config.clone(),
matcher: matcher,
diff --git a/grep-searcher/Cargo.toml b/grep-searcher/Cargo.toml
index f4875d9f..f3120a80 100644
--- a/grep-searcher/Cargo.toml
+++ b/grep-searcher/Cargo.toml
@@ -16,7 +16,7 @@ license = "Unlicense/MIT"
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
bytecount = "0.5"
encoding_rs = "0.8.14"
-encoding_rs_io = "0.1.4"
+encoding_rs_io = "0.1.6"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
log = "0.4.5"
memmap = "0.7"
diff --git a/grep-searcher/src/searcher/mod.rs b/grep-searcher/src/searcher/mod.rs
index c70b3a0e..729b491b 100644
--- a/grep-searcher/src/searcher/mod.rs
+++ b/grep-searcher/src/searcher/mod.rs
@@ -155,6 +155,8 @@ pub struct Config {
/// An encoding that, when present, causes the searcher to transcode all
/// input from the encoding to UTF-8.
encoding: Option<Encoding>,
+ /// Whether to do automatic transcoding based on a BOM or not.
+ bom_sniffing: bool,
}
impl Default for Config {
@@ -171,6 +173,7 @@ impl Default for Config {
binary: BinaryDetection::default(),
multi_line: false,
encoding: None,
+ bom_sniffing: true,
}
}
}
@@ -303,12 +306,15 @@ impl SearcherBuilder {
config.before_context = 0;
config.after_context = 0;
}
+
let mut decode_builder = DecodeReaderBytesBuilder::new();
decode_builder
.encoding(self.config.encoding.as_ref().map(|e| e.0))
.utf8_passthru(true)
- .strip_bom(true)
- .bom_override(true);
+ .strip_bom(self.config.bom_sniffing)
+ .bom_override(true)
+ .bom_sniffing(self.config.bom_sniffing);
+
Searcher {
config: config,
decode_builder: decode_builder,
@@ -506,12 +512,13 @@ impl SearcherBuilder {
/// transcoding process encounters an error, then bytes are replaced with
/// the Unicode replacement codepoint.
///
- /// When no encoding is specified (the default), then BOM sniffing is used
- /// to determine whether the source data is UTF-8 or UTF-16, and
- /// transcoding will be performed automatically. If no BOM could be found,
- /// then the source data is searched _as if_ it were UTF-8. However, so
- /// long as the source data is at least ASCII compatible, then it is
- /// possible for a search to produce useful results.
+ /// When no encoding is specified (the default), then BOM sniffing is
+ /// used (if it's enabled, which it is, by default) to determine whether
+ /// the source data is UTF-8 or UTF-16, and transcoding will be performed
+ /// automatically. If no BOM could be found, then the source data is
+ /// searched _as if_ it were UTF-8. However, so long as the source data is
+ /// at least ASCII compatible, then it is possible for a search to produce
+ /// useful results.
pub fn encoding(
&mut self,
encoding: Option<Encoding>,
@@ -519,6 +526,23 @@ impl SearcherBuilder {
self.config.encoding = encoding;
self
}
+
+ /// Enable automatic transcoding based on BOM sniffing.
+ ///
+ /// When this is enabled and an explicit encoding is not set, then this
+ /// searcher will try to detect the encoding of the bytes being searched
+ /// by sniffing its byte-order mark (BOM). In particular, when this is
+ /// enabled, UTF-16 encoded files will be searched seamlessly.
+ ///
+ /// When this is disabled and an explicit encoding is not set, then the
+ /// bytes from the source stream will be passed through unchanged,
+ /// including the BOM, if one is present.
+ ///
+ /// This is enabled by default.
+ pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
+ self.config.bom_sniffing = yes;
+ self
+ }
}
/// A searcher executes searches over a haystack and writes results to a caller
@@ -738,7 +762,8 @@ impl Searcher {
/// Returns true if and only if the given slice needs to be transcoded.
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
- self.config.encoding.is_some() || slice_has_utf16_bom(slice)
+ self.config.encoding.is_some()
+ || (self.config.bom_sniffing && slice_has_utf16_bom(slice))
}
}
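To make the new knob concrete, here is a minimal sketch of how a caller might
drive `SearcherBuilder::bom_sniffing` through the `grep-searcher` and
`grep-regex` APIs. The haystack bytes mirror the new tests, but the program
itself is illustrative and not part of this commit:

```
use grep_regex::RegexMatcher;
use grep_searcher::SearcherBuilder;
use grep_searcher::sinks::Lossy;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A UTF-16LE BOM (FF FE) followed by the code unit 00 62.
    let haystack: &[u8] = b"\xFF\xFE\x00\x62";

    // Match a literal NUL byte.
    let matcher = RegexMatcher::new(r"\x00")?;

    // With bom_sniffing(false) and no explicit encoding, the searcher passes
    // the raw bytes through untouched, BOM included, so the NUL byte is
    // still there to be found.
    let mut searcher = SearcherBuilder::new()
        .bom_sniffing(false)
        .build();

    searcher.search_slice(&matcher, haystack, Lossy(|lnum, line| {
        // The BOM bytes are not valid UTF-8, so the lossy sink renders them
        // as Unicode replacement characters.
        println!("line {}: {}", lnum, line);
        Ok(true)
    }))?;
    Ok(())
}
```

Flipping `bom_sniffing(false)` back to the default makes the same search come
up empty: the BOM triggers UTF-16 transcoding and the NUL byte disappears.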
diff --git a/src/app.rs b/src/app.rs
index b4c81a7c..66eaedb4 100644
--- a/src/app.rs
+++ b/src/app.rs
@@ -984,7 +984,9 @@ Specify the text encoding that ripgrep will use on all files searched. The
default value is 'auto', which will cause ripgrep to do a best effort automatic
detection of encoding on a per-file basis. Automatic detection in this case
only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM).
-No other automatic detection is performend.
+No other automatic detection is performed. One can also specify 'none', which
+will completely disable BOM sniffing and always result in searching the raw
+bytes, including a BOM if it's present, regardless of the file's true encoding.
Other supported values can be found in the list of labels here:
https://encoding.spec.whatwg.org/#concept-encoding-get
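As a sketch of how the three kinds of values behave (the pattern and file name
are hypothetical; 'sjis' is one of the labels from the Encoding Standard list
above):

```
$ rg -E auto 'needle' haystack.txt   # default: sniff a UTF-8/UTF-16 BOM and transcode if found
$ rg -E none 'needle' haystack.txt   # no sniffing, no transcoding: search the raw bytes
$ rg -E sjis 'needle' haystack.txt   # assume Shift JIS and transcode to UTF-8 before searching
```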
diff --git a/src/args.rs b/src/args.rs
index c9f2405b..166bc126 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -483,6 +483,37 @@ impl SortByKind {
}
}
+/// Encoding mode the searcher will use.
+#[derive(Clone, Debug)]
+enum EncodingMode {
+ /// Use an explicit encoding forcefully, but let BOM sniffing override it.
+ Some(Encoding),
+ /// Use only BOM sniffing to auto-detect an encoding.
+ Auto,
+ /// Use no explicit encoding and disable all BOM sniffing. This will
+ /// always result in searching the raw bytes, regardless of their
+ /// true encoding.
+ Disabled,
+}
+
+impl EncodingMode {
+ /// Checks if an explicit encoding has been set. Returns false for both
+ /// automatic BOM sniffing and disabled sniffing.
+ ///
+ /// This is only used to determine whether PCRE2 needs to have its own
+ /// UTF-8 checking enabled. If we have an explicit encoding set, then
+ /// we're always guaranteed to get UTF-8, so we can disable PCRE2's check.
+ /// Otherwise, we have no such guarantee, and must enable PCRE2's UTF-8
+ /// check.
+ #[cfg(feature = "pcre2")]
+ fn has_explicit_encoding(&self) -> bool {
+ match self {
+ EncodingMode::Some(_) => true,
+ _ => false
+ }
+ }
+}
+
impl ArgMatches {
/// Create an ArgMatches from clap's parse result.
fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches {
@@ -650,7 +681,7 @@ impl ArgMatches {
}
if self.pcre2_unicode() {
builder.utf(true).ucp(true);
- if self.encoding()?.is_some() {
+ if self.encoding()?.has_explicit_encoding() {
// SAFETY: If an encoding was specified, then we're guaranteed
// to get valid UTF-8, so we can disable PCRE2's UTF checking.
// (Feeding invalid UTF-8 to PCRE2 is undefined behavior.)
@@ -766,8 +797,16 @@ impl ArgMatches {
.after_context(ctx_after)
.passthru(self.is_present("passthru"))
.memory_map(self.mmap_choice(paths))
- .binary_detection(self.binary_detection())
- .encoding(self.encoding()?);
+ .binary_detection(self.binary_detection());
+ match self.encoding()? {
+ EncodingMode::Some(enc) => {
+ builder.encoding(Some(enc));
+ }
+ EncodingMode::Auto => {} // default for the searcher
+ EncodingMode::Disabled => {
+ builder.bom_sniffing(false);
+ }
+ }
Ok(builder.build())
}
@@ -952,24 +991,30 @@ impl ArgMatches {
u64_to_usize("dfa-size-limit", r)
}
- /// Returns the type of encoding to use.
+ /// Returns the encoding mode to use.
///
- /// This only returns an encoding if one is explicitly specified. When no
- /// encoding is present, the Searcher will still do BOM sniffing for UTF-16
- /// and transcode seamlessly.
- fn encoding(&self) -> Result<Option<Encoding>> {
+ /// This only returns an encoding if one is explicitly specified. Otherwise,
+ /// if set to automatic, the Searcher will do BOM sniffing for UTF-16 and
+ /// transcode seamlessly. If disabled, neither BOM sniffing nor transcoding
+ /// will occur.
+ fn encoding(&self) -> Result<EncodingMode> {
if self.is_present("no-encoding") {
- return Ok(None);
+ return Ok(EncodingMode::Auto);
}
+
let label = match self.value_of_lossy("encoding") {
None if self.pcre2_unicode() => "utf-8".to_string(),
- None => return Ok(None),
+ None => return Ok(EncodingMode::Auto),
Some(label) => label,
};
+
if label == "auto" {
- return Ok(None);
+ return Ok(EncodingMode::Auto);
+ } else if label == "none" {
+ return Ok(EncodingMode::Disabled);
}
- Ok(Some(Encoding::new(&label)?))
+
+ Ok(EncodingMode::Some(Encoding::new(&label)?))
}
/// Return the file separator to use based on the CLI configuration.
diff --git a/tests/feature.rs b/tests/feature.rs
index 1e7ecc48..d7b343f1 100644
--- a/tests/feature.rs
+++ b/tests/feature.rs
@@ -645,3 +645,35 @@ rgtest!(f1138_no_ignore_dot, |dir: Dir, mut cmd: TestCommand| {
eqnice!("bar\nquux\n", cmd.arg("--no-ignore-dot").stdout());
eqnice!("bar\n", cmd.arg("--ignore-file").arg(".fzf-ignore").stdout());
});
+
+
+// See: https://github.com/BurntSushi/ripgrep/issues/1207
+//
+// Tests that, without the 'none' encoding, NUL bytes are consumed by
+// automatic encoding detection.
+rgtest!(f1207_auto_encoding, |dir: Dir, mut cmd: TestCommand| {
+ dir.create_bytes(
+ "foo",
+ b"\xFF\xFE\x00\x62"
+ );
+ cmd.arg("-a").arg("\\x00").arg("foo");
+ cmd.assert_exit_code(1);
+});
+
+// See: https://github.com/BurntSushi/ripgrep/issues/1207
+//
+// Tests that the 'none' encoding treats the file as raw bytes.
+rgtest!(f1207_ignore_encoding, |dir: Dir, mut cmd: TestCommand| {
+ // PCRE2 chokes on this test because it can't search invalid non-UTF-8
+ // and the point of this test is to search raw UTF-16.
+ if dir.is_pcre2() {
+ return;
+ }
+
+ dir.create_bytes(
+ "foo",
+ b"\xFF\xFE\x00\x62"
+ );
+ cmd.arg("--encoding").arg("none").arg("-a").arg("\\x00").arg("foo");
+ eqnice!("\u{FFFD}\u{FFFD}\x00b\n", cmd.stdout());
+});
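The two tests above can be reproduced by hand roughly as follows (assuming a
printf that understands \x escapes):

```
$ printf '\xff\xfe\x00\x62' > foo
$ rg -a '\x00' foo              # exits 1: BOM sniffing transcodes the UTF-16, so the NUL is gone
$ rg -E none -a '\x00' foo      # matches: the raw NUL byte is still in the haystack
```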