summaryrefslogtreecommitdiffstats
path: root/grep
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2016-11-28 18:31:58 -0500
committerAndrew Gallant <jamslam@gmail.com>2016-11-28 18:31:58 -0500
commit0473df1ef5721143941fb7f883e22b17292b35bb (patch)
tree5a3ad00a963f619e090051bdd29fdf46a12bc523 /grep
parent301a3fd71d3a923419d6d3e7604979b314121801 (diff)
Disable Unicode mode for literal regex.
When ripgrep detects literals, it emits them as raw hex-escaped byte sequences to Regex::new. This permits literal optimizations for arbitrary byte sequences (i.e., possibly invalid UTF-8). The problem is that Regex::new interprets hex-escaped byte sequences as *Unicode codepoints* by default, but we want them to actually stand for their raw byte values. Therefore, disable Unicode mode. This is OK, since the regex is composed entirely of literals and literal extraction does Unicode case folding. Fixes #251
Diffstat (limited to 'grep')
-rw-r--r--grep/src/literals.rs4
-rw-r--r--grep/src/search.rs3
2 files changed, 3 insertions, 4 deletions
diff --git a/grep/src/literals.rs b/grep/src/literals.rs
index d931f135..3e68d24e 100644
--- a/grep/src/literals.rs
+++ b/grep/src/literals.rs
@@ -79,12 +79,12 @@ impl LiteralSets {
debug!("required literals found: {:?}", req_lits);
let alts: Vec<String> =
req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
- Some(RegexBuilder::new(&alts.join("|")))
+ Some(RegexBuilder::new(&alts.join("|")).unicode(false))
} else if lit.is_empty() {
None
} else {
debug!("required literal found: {:?}", show(lit));
- Some(RegexBuilder::new(&bytes_to_regex(lit)))
+ Some(RegexBuilder::new(&bytes_to_regex(&lit)).unicode(false))
}
}
}
diff --git a/grep/src/search.rs b/grep/src/search.rs
index 850c8d62..cf1a4c3f 100644
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -167,14 +167,13 @@ impl GrepBuilder {
/// Creates a new regex from the given expression with the current
/// configuration.
fn regex(&self, expr: &Expr) -> Result<Regex> {
- self.regex_build(RegexBuilder::new(&expr.to_string()))
+ self.regex_build(RegexBuilder::new(&expr.to_string()).unicode(true))
}
/// Builds a new regex from the given builder using the caller's settings.
fn regex_build(&self, builder: RegexBuilder) -> Result<Regex> {
builder
.multi_line(true)
- .unicode(true)
.size_limit(self.opts.size_limit)
.dfa_size_limit(self.opts.dfa_size_limit)
.compile()