summaryrefslogtreecommitdiffstats
path: root/grep
diff options
context:
space:
mode:
author: Andrew Gallant <jamslam@gmail.com> 2016-09-25 20:10:28 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2016-09-25 20:10:28 -0400
commit: 6a8051b258408343c4cf164acdc4cc2cd7928129 (patch)
tree: cb9b49be8c46284c168ea21a6b53e6fc8f7cf411 /grep
parent: a13ac3e3d49729403a734c9f3eab2d99a66e21b0 (diff)
Don't union inner literals of repetitions.
If we do union them, we end up extracting `foofoofoo` from `(\wfoo){3}`, which is wrong: every repetition must also match a `\w` before its `foo`, so the three `foo`s are never adjacent in a match. This does prevent us from extracting `foofoofoo` from `(foo){3}`, which is unfortunate, but we miss plenty of other stuff too. Literal extraction needs a good rethink (all the way down into the regex engine). Fixes #93
Diffstat (limited to 'grep')
-rw-r--r-- grep/src/literals.rs 11
1 files changed, 6 insertions, 5 deletions
diff --git a/grep/src/literals.rs b/grep/src/literals.rs
index f1685270..4cd34a87 100644
--- a/grep/src/literals.rs
+++ b/grep/src/literals.rs
@@ -8,7 +8,6 @@ Note that this implementation is incredibly suspicious. We need something more
principled.
*/
use std::cmp;
-use std::iter;
use regex::bytes::Regex;
use syntax::{
@@ -181,8 +180,6 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
lits: &mut Literals,
mut f: F,
) {
- use syntax::Expr::*;
-
if min == 0 {
// This is a bit conservative. If `max` is set, then we could
// treat this as a finite set of alternations. For now, we
@@ -190,8 +187,12 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
lits.cut();
} else {
let n = cmp::min(lits.limit_size(), min as usize);
- let es = iter::repeat(e.clone()).take(n).collect();
- f(&Concat(es), lits);
+ // We only extract literals from a single repetition, even though
+ // we could do more. e.g., `a{3}` will have `a` extracted instead of
+ // `aaa`. The reason is that inner literal extraction can't be unioned
+ // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
+ // is wrong.
+ f(e, lits);
if n < min as usize {
lits.cut();
}