summaryrefslogtreecommitdiffstats
path: root/grep-searcher
diff options
context:
space:
mode:
authorlesnyrumcajs <lesny.rumcajs@gmail.com>2019-03-04 17:18:45 +0100
committerAndrew Gallant <jamslam@gmail.com>2019-04-06 10:35:08 -0400
commit5962abc4655a0f07ece6fc6bd45142e8ee1cab0c (patch)
tree56b1f051f3e803cd24aa2d980c3edcf765756bda /grep-searcher
parent1604a18db3d896514e1d536781810642de4b31c1 (diff)
searcher: add option to disable BOM sniffing
This commit adds a new encoding feature where the -E/--encoding flag will now accept a value of 'none'. When given this value, all encoding-related machinery is disabled and ripgrep will search the raw bytes of the file, including the BOM if it's present. Closes #1207, Closes #1208
Diffstat (limited to 'grep-searcher')
-rw-r--r--grep-searcher/Cargo.toml2
-rw-r--r--grep-searcher/src/searcher/mod.rs43
2 files changed, 35 insertions, 10 deletions
diff --git a/grep-searcher/Cargo.toml b/grep-searcher/Cargo.toml
index f4875d9f..f3120a80 100644
--- a/grep-searcher/Cargo.toml
+++ b/grep-searcher/Cargo.toml
@@ -16,7 +16,7 @@ license = "Unlicense/MIT"
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
bytecount = "0.5"
encoding_rs = "0.8.14"
-encoding_rs_io = "0.1.4"
+encoding_rs_io = "0.1.6"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
log = "0.4.5"
memmap = "0.7"
diff --git a/grep-searcher/src/searcher/mod.rs b/grep-searcher/src/searcher/mod.rs
index c70b3a0e..729b491b 100644
--- a/grep-searcher/src/searcher/mod.rs
+++ b/grep-searcher/src/searcher/mod.rs
@@ -155,6 +155,8 @@ pub struct Config {
/// An encoding that, when present, causes the searcher to transcode all
/// input from the encoding to UTF-8.
encoding: Option<Encoding>,
+ /// Whether to do automatic transcoding based on a BOM or not.
+ bom_sniffing: bool,
}
impl Default for Config {
@@ -171,6 +173,7 @@ impl Default for Config {
binary: BinaryDetection::default(),
multi_line: false,
encoding: None,
+ bom_sniffing: true,
}
}
}
@@ -303,12 +306,15 @@ impl SearcherBuilder {
config.before_context = 0;
config.after_context = 0;
}
+
let mut decode_builder = DecodeReaderBytesBuilder::new();
decode_builder
.encoding(self.config.encoding.as_ref().map(|e| e.0))
.utf8_passthru(true)
- .strip_bom(true)
- .bom_override(true);
+ .strip_bom(self.config.bom_sniffing)
+ .bom_override(true)
+ .bom_sniffing(self.config.bom_sniffing);
+
Searcher {
config: config,
decode_builder: decode_builder,
@@ -506,12 +512,13 @@ impl SearcherBuilder {
/// transcoding process encounters an error, then bytes are replaced with
/// the Unicode replacement codepoint.
///
- /// When no encoding is specified (the default), then BOM sniffing is used
- /// to determine whether the source data is UTF-8 or UTF-16, and
- /// transcoding will be performed automatically. If no BOM could be found,
- /// then the source data is searched _as if_ it were UTF-8. However, so
- /// long as the source data is at least ASCII compatible, then it is
- /// possible for a search to produce useful results.
+ /// When no encoding is specified (the default), then BOM sniffing is
+ /// used (if it's enabled, which it is, by default) to determine whether
+ /// the source data is UTF-8 or UTF-16, and transcoding will be performed
+ /// automatically. If no BOM could be found, then the source data is
+ /// searched _as if_ it were UTF-8. However, so long as the source data is
+ /// at least ASCII compatible, then it is possible for a search to produce
+ /// useful results.
pub fn encoding(
&mut self,
encoding: Option<Encoding>,
@@ -519,6 +526,23 @@ impl SearcherBuilder {
self.config.encoding = encoding;
self
}
+
+ /// Enable automatic transcoding based on BOM sniffing.
+ ///
+ /// When this is enabled and an explicit encoding is not set, then this
+ /// searcher will try to detect the encoding of the bytes being searched
+ /// by sniffing its byte-order mark (BOM). In particular, when this is
+ /// enabled, UTF-16 encoded files will be searched seamlessly.
+ ///
+ /// When this is disabled and an explicit encoding is not set, then
+ /// the bytes from the source stream will be passed through unchanged,
+ /// including its BOM, if one is present.
+ ///
+ /// This is enabled by default.
+ pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
+ self.config.bom_sniffing = yes;
+ self
+ }
}
/// A searcher executes searches over a haystack and writes results to a caller
@@ -738,7 +762,8 @@ impl Searcher {
/// Returns true if and only if the given slice needs to be transcoded.
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
- self.config.encoding.is_some() || slice_has_utf16_bom(slice)
+ self.config.encoding.is_some()
+ || (self.config.bom_sniffing && slice_has_utf16_bom(slice))
}
}