summaryrefslogtreecommitdiffstats
path: root/crates/pcre2/src/matcher.rs
blob: ecc7c16fbfc4abbbc2fdff6388f974db27e2c7a9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
use std::collections::HashMap;

use {
    grep_matcher::{Captures, Match, Matcher},
    pcre2::bytes::{CaptureLocations, Regex, RegexBuilder},
};

use crate::error::Error;

/// A builder for configuring the compilation of a PCRE2 regex.
#[derive(Clone, Debug)]
pub struct RegexMatcherBuilder {
    builder: RegexBuilder,
    case_smart: bool,
    word: bool,
    fixed_strings: bool,
    whole_line: bool,
}

impl RegexMatcherBuilder {
    /// Create a new matcher builder with a default configuration.
    pub fn new() -> RegexMatcherBuilder {
        RegexMatcherBuilder {
            builder: RegexBuilder::new(),
            case_smart: false,
            word: false,
            fixed_strings: false,
            whole_line: false,
        }
    }

    /// Compile the given pattern into a PCRE matcher using the current
    /// configuration.
    ///
    /// If there was a problem compiling the pattern, then an error is
    /// returned.
    pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
        self.build_many(&[pattern])
    }

    /// Compile all of the given patterns into a single regex that matches when
    /// at least one of the patterns matches.
    ///
    /// If there was a problem building the regex, then an error is returned.
    pub fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<RegexMatcher, Error> {
        let mut builder = self.builder.clone();
        let mut pats = Vec::with_capacity(patterns.len());
        for p in patterns.iter() {
            pats.push(if self.fixed_strings {
                format!("(?:{})", pcre2::escape(p.as_ref()))
            } else {
                format!("(?:{})", p.as_ref())
            });
        }
        let mut singlepat = pats.join("|");
        if self.case_smart && !has_uppercase_literal(&singlepat) {
            builder.caseless(true);
        }
        if self.whole_line {
            singlepat = format!(r"(?m:^)(?:{})(?m:$)", singlepat);
        } else if self.word {
            // We make this option exclusive with whole_line because when
            // whole_line is enabled, all matches necessary fall on word
            // boundaries. So this extra goop is strictly redundant.
            singlepat = format!(r"(?<!\w)(?:{})(?!\w)", singlepat);
        }
        log::trace!("final regex: {:?}", singlepat);
        builder.build(&singlepat).map_err(Error::regex).map(|regex| {
            let mut names = HashMap::new();
            for (i, name) in regex.capture_names().iter().enumerate() {
                if let Some(ref name) = *name {
                    names.insert(name.to_string(), i);
                }
            }
            RegexMatcher { regex, names }
        })
    }

    /// Enables case insensitive matching.
    ///
    /// If the `utf` option is also set, then Unicode case folding is used
    /// to determine case insensitivity. When the `utf` option is not set,
    /// then only standard ASCII case insensitivity is considered.
    ///
    /// This option corresponds to the `i` flag.
    pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.caseless(yes);
        self
    }

    /// Whether to enable "smart case" or not.
    ///
    /// When smart case is enabled, the builder will automatically enable
    /// case insensitive matching based on how the pattern is written. Namely,
    /// case insensitive mode is enabled when both of the following things
    /// are believed to be true:
    ///
    /// 1. The pattern contains at least one literal character. For example,
    ///    `a\w` contains a literal (`a`) but `\w` does not.
    /// 2. Of the literals in the pattern, none of them are considered to be
    ///    uppercase according to Unicode. For example, `foo\pL` has no
    ///    uppercase literals but `Foo\pL` does.
    ///
    /// Note that the implementation of this is not perfect. Namely, `\p{Ll}`
    /// will prevent case insensitive matching even though it is part of a meta
    /// sequence. This bug will probably never be fixed.
    pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.case_smart = yes;
        self
    }

    /// Enables "dot all" matching.
    ///
    /// When enabled, the `.` metacharacter in the pattern matches any
    /// character, include `\n`. When disabled (the default), `.` will match
    /// any character except for `\n`.
    ///
    /// This option corresponds to the `s` flag.
    pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.dotall(yes);
        self
    }

    /// Enable "extended" mode in the pattern, where whitespace is ignored.
    ///
    /// This option corresponds to the `x` flag.
    pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.extended(yes);
        self
    }

    /// Enable multiline matching mode.
    ///
    /// When enabled, the `^` and `$` anchors will match both at the beginning
    /// and end of a subject string, in addition to matching at the start of
    /// a line and the end of a line. When disabled, the `^` and `$` anchors
    /// will only match at the beginning and end of a subject string.
    ///
    /// This option corresponds to the `m` flag.
    pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.multi_line(yes);
        self
    }

    /// Enable matching of CRLF as a line terminator.
    ///
    /// When enabled, anchors such as `^` and `$` will match any of the
    /// following as a line terminator: `\r`, `\n` or `\r\n`.
    ///
    /// This is disabled by default, in which case, only `\n` is recognized as
    /// a line terminator.
    pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.crlf(yes);
        self
    }

    /// Require that all matches occur on word boundaries.
    ///
    /// Enabling this option is subtly different than putting `\b` assertions
    /// on both sides of your pattern. In particular, a `\b` assertion requires
    /// that one side of it match a word character while the other match a
    /// non-word character. This option, in contrast, merely requires that
    /// one side match a non-word character.
    ///
    /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
    /// word character. However, `-2` with this `word` option enabled will
    /// match the `-2` in `foo -2 bar`.
    pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.word = yes;
        self
    }

    /// Whether the patterns should be treated as literal strings or not. When
    /// this is active, all characters, including ones that would normally be
    /// special regex meta characters, are matched literally.
    pub fn fixed_strings(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.fixed_strings = yes;
        self
    }

    /// Whether each pattern should match the entire line or not. This is
    /// equivalent to surrounding the pattern with `(?m:^)` and `(?m:$)`.
    pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.whole_line = yes;
        self
    }

    /// Enable Unicode matching mode.
    ///
    /// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
    /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
    ///
    /// When set, this implies UTF matching mode. It is not possible to enable
    /// Unicode matching mode without enabling UTF matching mode.
    ///
    /// This is disabled by default.
    pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.ucp(yes);
        self
    }

    /// Enable UTF matching mode.
    ///
    /// When enabled, characters are treated as sequences of code units that
    /// make up a single codepoint instead of as single bytes. For example,
    /// this will cause `.` to match any single UTF-8 encoded codepoint, where
    /// as when this is disabled, `.` will any single byte (except for `\n` in
    /// both cases, unless "dot all" mode is enabled).
    ///
    /// Note that when UTF matching mode is enabled, every search performed
    /// will do a UTF-8 validation check, which can impact performance. The
    /// UTF-8 check can be disabled via the `disable_utf_check` option, but it
    /// is undefined behavior to enable UTF matching mode and search invalid
    /// UTF-8.
    ///
    /// This is disabled by default.
    pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.utf(yes);
        self
    }

    /// This is now deprecated and is a no-op.
    ///
    /// Previously, this option permitted disabling PCRE2's UTF-8 validity
    /// check, which could result in undefined behavior if the haystack was
    /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`,
    /// in 10.34 which this crate always sets. When this option is enabled,
    /// PCRE2 claims to not have undefined behavior when the haystack is
    /// invalid UTF-8.
    ///
    /// Therefore, disabling the UTF-8 check is not something that is exposed
    /// by this crate.
    #[deprecated(
        since = "0.2.4",
        note = "now a no-op due to new PCRE2 features"
    )]
    pub fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder {
        self
    }

    /// Enable PCRE2's JIT and return an error if it's not available.
    ///
    /// This generally speeds up matching quite a bit. The downside is that it
    /// can increase the time it takes to compile a pattern.
    ///
    /// If the JIT isn't available or if JIT compilation returns an error, then
    /// regex compilation will fail with the corresponding error.
    ///
    /// This is disabled by default, and always overrides `jit_if_available`.
    pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.jit(yes);
        self
    }

    /// Enable PCRE2's JIT if it's available.
    ///
    /// This generally speeds up matching quite a bit. The downside is that it
    /// can increase the time it takes to compile a pattern.
    ///
    /// If the JIT isn't available or if JIT compilation returns an error,
    /// then a debug message with the error will be emitted and the regex will
    /// otherwise silently fall back to non-JIT matching.
    ///
    /// This is disabled by default, and always overrides `jit`.
    pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.builder.jit_if_available(</