summaryrefslogtreecommitdiffstats
path: root/grep-regex/src/multi.rs
blob: 6e43e9759bf67e7a671d78ea370222d9b69e11fb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use grep_matcher::{Matcher, Match, NoError};
use regex_syntax::hir::Hir;

use error::Error;
use matcher::RegexCaptures;

/// A matcher for an alternation of literals.
///
/// Ideally, this optimization would be pushed down into the regex engine, but
/// making this work correctly there would require quite a bit of refactoring.
/// Moreover, doing it one layer above lets us do thing like, "if we
/// specifically only want to search for literals, then don't bother with
/// regex parsing at all."
#[derive(Clone, Debug)]
pub struct MultiLiteralMatcher {
    /// The Aho-Corasick automaton.
    ac: AhoCorasick,
}

impl MultiLiteralMatcher {
    /// Create a new multi-literal matcher from the given literals.
    pub fn new<B: AsRef<[u8]>>(
        literals: &[B],
    ) -> Result<MultiLiteralMatcher, Error> {
        let ac = AhoCorasickBuilder::new()
            .match_kind(MatchKind::LeftmostFirst)
            .auto_configure(literals)
            .build_with_size::<usize, _, _>(literals)
            .map_err(Error::regex)?;
        Ok(MultiLiteralMatcher { ac })
    }
}

impl Matcher for MultiLiteralMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        match self.ac.find(&haystack[at..]) {
            None => Ok(None),
            Some(m) => Ok(Some(Match::new(at + m.start(), at + m.end()))),
        }
    }

    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        Ok(RegexCaptures::simple())
    }

    fn capture_count(&self) -> usize {
        1
    }

    fn capture_index(&self, _: &str) -> Option<usize> {
        None
    }

    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        caps.set_simple(None);
        let mat = self.find_at(haystack, at)?;
        caps.set_simple(mat);
        Ok(mat.is_some())
    }

    // We specifically do not implement other methods like find_iter. Namely,
    // the iter methods are guaranteed to be correct by virtue of implementing
    // find_at above.
}

/// Alternation literals checks if the given HIR is a simple alternation of
/// literals, and if so, returns them. Otherwise, this returns None.
pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
    use regex_syntax::hir::{HirKind, Literal};

    // This is pretty hacky, but basically, if `is_alternation_literal` is
    // true, then we can make several assumptions about the structure of our
    // HIR. This is what justifies the `unreachable!` statements below.

    if !expr.is_alternation_literal() {
        return None;
    }
    let alts = match *expr.kind() {
        HirKind::Alternation(ref alts) => alts,
        _ => return None, // one literal isn't worth it
    };

    let extendlit = |lit: &Literal, dst: &mut Vec<u8>| {
        match *lit {
            Literal::Unicode(c) => {
                let mut buf = [0; 4];
                dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
            }
            Literal::Byte(b) => {
                dst.push(b);
            }
        }
    };

    let mut lits = vec![];
    for alt in alts {
        let mut lit = vec![];
        match *alt.kind() {
            HirKind::Empty => {}
            HirKind::Literal(ref x) => extendlit(x, &mut lit),
            HirKind::Concat(ref exprs) => {
                for e in exprs {
                    match *e.kind() {
                        HirKind::Literal(ref x) => extendlit(x, &mut lit),
                        _ => unreachable!("expected literal, got {:?}", e),
                    }
                }
            }
            _ => unreachable!("expected literal or concat, got {:?}", alt),
        }
        lits.push(lit);
    }
    Some(lits)
}