diff options
author | lesnyrumcajs <lesny.rumcajs@gmail.com> | 2019-03-04 17:18:45 +0100 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2019-04-06 10:35:08 -0400 |
commit | 5962abc4655a0f07ece6fc6bd45142e8ee1cab0c (patch) | |
tree | 56b1f051f3e803cd24aa2d980c3edcf765756bda /src | |
parent | 1604a18db3d896514e1d536781810642de4b31c1 (diff) |
searcher: add option to disable BOM sniffing
This commit adds a new encoding feature where the -E/--encoding flag
will now accept a value of 'none'. When given this value, all encoding
related machinery is disabled and ripgrep will search the raw bytes of
the file, including the BOM if it's present.
Closes #1207, Closes #1208
Diffstat (limited to 'src')
-rw-r--r-- | src/app.rs | 4 | ||||
-rw-r--r-- | src/args.rs | 69 |
2 files changed, 60 insertions, 13 deletions
@@ -984,7 +984,9 @@ Specify the text encoding that ripgrep will use on all files searched. The default value is 'auto', which will cause ripgrep to do a best effort automatic detection of encoding on a per-file basis. Automatic detection in this case only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM). -No other automatic detection is performend. +No other automatic detection is performed. One can also specify 'none' which +will then completely disable BOM sniffing and always result in searching the +raw bytes, including a BOM if it's present, regardless of its encoding. Other supported values can be found in the list of labels here: https://encoding.spec.whatwg.org/#concept-encoding-get diff --git a/src/args.rs b/src/args.rs index 6d9549ed..babaf09c 100644 --- a/src/args.rs +++ b/src/args.rs @@ -483,6 +483,37 @@ impl SortByKind { } } +/// Encoding mode the searcher will use. +#[derive(Clone, Debug)] +enum EncodingMode { + /// Use an explicit encoding forcefully, but let BOM sniffing override it. + Some(Encoding), + /// Use only BOM sniffing to auto-detect an encoding. + Auto, + /// Use no explicit encoding and disable all BOM sniffing. This will + /// always result in searching the raw bytes, regardless of their + /// true encoding. + Disabled, +} + +impl EncodingMode { + /// Checks if an explicit encoding has been set. Returns false for + /// automatic BOM sniffing and no sniffing. + /// + /// This is only used to determine whether PCRE2 needs to have its own + /// UTF-8 checking enabled. If we have an explicit encoding set, then + /// we're always guaranteed to get UTF-8, so we can disable PCRE2's check. + /// Otherwise, we have no such guarantee, and must enable PCRE2' UTF-8 + /// check. + #[cfg(feature = "pcre2")] + fn has_explicit_encoding(&self) -> bool { + match self { + EncodingMode::Some(_) => true, + _ => false + } + } +} + impl ArgMatches { /// Create an ArgMatches from clap's parse result. fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches { @@ -650,7 +681,7 @@ impl ArgMatches { } if self.pcre2_unicode() { builder.utf(true).ucp(true); - if self.encoding()?.is_some() { + if self.encoding()?.has_explicit_encoding() { // SAFETY: If an encoding was specified, then we're guaranteed // to get valid UTF-8, so we can disable PCRE2's UTF checking. // (Feeding invalid UTF-8 to PCRE2 is undefined behavior.) @@ -766,8 +797,16 @@ impl ArgMatches { .after_context(ctx_after) .passthru(self.is_present("passthru")) .memory_map(self.mmap_choice(paths)) - .binary_detection(self.binary_detection()) - .encoding(self.encoding()?); + .binary_detection(self.binary_detection()); + match self.encoding()? { + EncodingMode::Some(enc) => { + builder.encoding(Some(enc)); + } + EncodingMode::Auto => {} // default for the searcher + EncodingMode::Disabled => { + builder.bom_sniffing(false); + } + } Ok(builder.build()) } @@ -952,24 +991,30 @@ impl ArgMatches { u64_to_usize("dfa-size-limit", r) } - /// Returns the type of encoding to use. + /// Returns the encoding mode to use. /// - /// This only returns an encoding if one is explicitly specified. When no - /// encoding is present, the Searcher will still do BOM sniffing for UTF-16 - /// and transcode seamlessly. - fn encoding(&self) -> Result<Option<Encoding>> { + /// This only returns an encoding if one is explicitly specified. Otherwise + /// if set to automatic, the Searcher will do BOM sniffing for UTF-16 + /// and transcode seamlessly. If disabled, no BOM sniffing nor transcoding + /// will occur. + fn encoding(&self) -> Result<EncodingMode> { if self.is_present("no-encoding") { - return Ok(None); + return Ok(EncodingMode::Auto); } + let label = match self.value_of_lossy("encoding") { None if self.pcre2_unicode() => "utf-8".to_string(), - None => return Ok(None), + None => return Ok(EncodingMode::Auto), Some(label) => label, }; + if label == "auto" { - return Ok(None); + return Ok(EncodingMode::Auto); + } else if label == "none" { + return Ok(EncodingMode::Disabled); } - Ok(Some(Encoding::new(&label)?)) + + Ok(EncodingMode::Some(Encoding::new(&label)?)) } /// Return the file separator to use based on the CLI configuration. |