summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author lesnyrumcajs <lesny.rumcajs@gmail.com> 2019-03-04 17:18:45 +0100
committer Andrew Gallant <jamslam@gmail.com> 2019-04-06 10:35:08 -0400
commit 5962abc4655a0f07ece6fc6bd45142e8ee1cab0c (patch)
tree 56b1f051f3e803cd24aa2d980c3edcf765756bda /src
parent 1604a18db3d896514e1d536781810642de4b31c1 (diff)
searcher: add option to disable BOM sniffing
This commit adds a new encoding feature where the -E/--encoding flag will now accept a value of 'none'. When given this value, all encoding related machinery is disabled and ripgrep will search the raw bytes of the file, including the BOM if it's present. Closes #1207, Closes #1208
Diffstat (limited to 'src')
-rw-r--r-- src/app.rs 4
-rw-r--r-- src/args.rs 69
2 files changed, 60 insertions, 13 deletions
diff --git a/src/app.rs b/src/app.rs
index b4c81a7c..66eaedb4 100644
--- a/src/app.rs
+++ b/src/app.rs
@@ -984,7 +984,9 @@ Specify the text encoding that ripgrep will use on all files searched. The
default value is 'auto', which will cause ripgrep to do a best effort automatic
detection of encoding on a per-file basis. Automatic detection in this case
only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM).
-No other automatic detection is performend.
+No other automatic detection is performed. One can also specify 'none' which
+will then completely disable BOM sniffing and always result in searching the
+raw bytes, including a BOM if it's present, regardless of its encoding.
Other supported values can be found in the list of labels here:
https://encoding.spec.whatwg.org/#concept-encoding-get
diff --git a/src/args.rs b/src/args.rs
index 6d9549ed..babaf09c 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -483,6 +483,37 @@ impl SortByKind {
}
}
+/// Encoding mode the searcher will use.
+#[derive(Clone, Debug)]
+enum EncodingMode {
+ /// Use an explicit encoding forcefully, but let BOM sniffing override it.
+ Some(Encoding),
+ /// Use only BOM sniffing to auto-detect an encoding.
+ Auto,
+ /// Use no explicit encoding and disable all BOM sniffing. This will
+ /// always result in searching the raw bytes, regardless of their
+ /// true encoding.
+ Disabled,
+}
+
+impl EncodingMode {
+ /// Checks if an explicit encoding has been set. Returns false for
+ /// automatic BOM sniffing and no sniffing.
+ ///
+ /// This is only used to determine whether PCRE2 needs to have its own
+ /// UTF-8 checking enabled. If we have an explicit encoding set, then
+ /// we're always guaranteed to get UTF-8, so we can disable PCRE2's check.
+ /// Otherwise, we have no such guarantee, and must enable PCRE2's UTF-8
+ /// check.
+ #[cfg(feature = "pcre2")]
+ fn has_explicit_encoding(&self) -> bool {
+ match self {
+ EncodingMode::Some(_) => true,
+ _ => false
+ }
+ }
+}
+
impl ArgMatches {
/// Create an ArgMatches from clap's parse result.
fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches {
@@ -650,7 +681,7 @@ impl ArgMatches {
}
if self.pcre2_unicode() {
builder.utf(true).ucp(true);
- if self.encoding()?.is_some() {
+ if self.encoding()?.has_explicit_encoding() {
// SAFETY: If an encoding was specified, then we're guaranteed
// to get valid UTF-8, so we can disable PCRE2's UTF checking.
// (Feeding invalid UTF-8 to PCRE2 is undefined behavior.)
@@ -766,8 +797,16 @@ impl ArgMatches {
.after_context(ctx_after)
.passthru(self.is_present("passthru"))
.memory_map(self.mmap_choice(paths))
- .binary_detection(self.binary_detection())
- .encoding(self.encoding()?);
+ .binary_detection(self.binary_detection());
+ match self.encoding()? {
+ EncodingMode::Some(enc) => {
+ builder.encoding(Some(enc));
+ }
+ EncodingMode::Auto => {} // default for the searcher
+ EncodingMode::Disabled => {
+ builder.bom_sniffing(false);
+ }
+ }
Ok(builder.build())
}
@@ -952,24 +991,30 @@ impl ArgMatches {
u64_to_usize("dfa-size-limit", r)
}
- /// Returns the type of encoding to use.
+ /// Returns the encoding mode to use.
///
- /// This only returns an encoding if one is explicitly specified. When no
- /// encoding is present, the Searcher will still do BOM sniffing for UTF-16
- /// and transcode seamlessly.
- fn encoding(&self) -> Result<Option<Encoding>> {
+ /// This only returns an encoding if one is explicitly specified. Otherwise,
+ /// if set to automatic, the Searcher will do BOM sniffing for UTF-16
+ /// and transcode seamlessly. If disabled, neither BOM sniffing nor
+ /// transcoding will occur.
+ fn encoding(&self) -> Result<EncodingMode> {
if self.is_present("no-encoding") {
- return Ok(None);
+ return Ok(EncodingMode::Auto);
}
+
let label = match self.value_of_lossy("encoding") {
None if self.pcre2_unicode() => "utf-8".to_string(),
- None => return Ok(None),
+ None => return Ok(EncodingMode::Auto),
Some(label) => label,
};
+
if label == "auto" {
- return Ok(None);
+ return Ok(EncodingMode::Auto);
+ } else if label == "none" {
+ return Ok(EncodingMode::Disabled);
}
- Ok(Some(Encoding::new(&label)?))
+
+ Ok(EncodingMode::Some(Encoding::new(&label)?))
}
/// Return the file separator to use based on the CLI configuration.