summaryrefslogtreecommitdiffstats
path: root/crates/matcher/src/interpolate.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/matcher/src/interpolate.rs')
-rw-r--r--crates/matcher/src/interpolate.rs328
1 files changed, 328 insertions, 0 deletions
diff --git a/crates/matcher/src/interpolate.rs b/crates/matcher/src/interpolate.rs
new file mode 100644
index 00000000..c768bfdd
--- /dev/null
+++ b/crates/matcher/src/interpolate.rs
@@ -0,0 +1,328 @@
+use std::str;
+
+use memchr::memchr;
+
+/// Interpolate capture references in `replacement` and write the interpolation
+/// result to `dst`. References in `replacement` take the form of $N or $name,
+/// where `N` is a capture group index and `name` is a capture group name. The
+/// function provided, `name_to_index`, maps capture group names to indices.
+///
+/// The `append` function given is responsible for writing the replacement
+/// to the `dst` buffer. That is, it is called with the capture group index
+/// of a capture group reference and is expected to resolve the index to its
+/// corresponding matched text. If no such match exists, then `append` should
+/// not write anything to its given buffer.
+pub fn interpolate<A, N>(
+ mut replacement: &[u8],
+ mut append: A,
+ mut name_to_index: N,
+ dst: &mut Vec<u8>,
+) where
+ A: FnMut(usize, &mut Vec<u8>),
+ N: FnMut(&str) -> Option<usize>,
+{
+ while !replacement.is_empty() {
+ match memchr(b'$', replacement) {
+ None => break,
+ Some(i) => {
+ dst.extend(&replacement[..i]);
+ replacement = &replacement[i..];
+ }
+ }
+ if replacement.get(1).map_or(false, |&b| b == b'$') {
+ dst.push(b'$');
+ replacement = &replacement[2..];
+ continue;
+ }
+ debug_assert!(!replacement.is_empty());
+ let cap_ref = match find_cap_ref(replacement) {
+ Some(cap_ref) => cap_ref,
+ None => {
+ dst.push(b'$');
+ replacement = &replacement[1..];
+ continue;
+ }
+ };
+ replacement = &replacement[cap_ref.end..];
+ match cap_ref.cap {
+ Ref::Number(i) => append(i, dst),
+ Ref::Named(name) => {
+ if let Some(i) = name_to_index(name) {
+ append(i, dst);
+ }
+ }
+ }
+ }
+ dst.extend(replacement);
+}
+
+/// `CaptureRef` represents a reference to a capture group inside some text.
+/// The reference is either a capture group name or a number.
+///
+/// It is also tagged with the position in the text immediately proceding the
+/// capture reference.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+struct CaptureRef<'a> {
+ cap: Ref<'a>,
+ end: usize,
+}
+
+/// A reference to a capture group in some text.
+///
+/// e.g., `$2`, `$foo`, `${foo}`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Ref<'a> {
+ Named(&'a str),
+ Number(usize),
+}
+
+impl<'a> From<&'a str> for Ref<'a> {
+ fn from(x: &'a str) -> Ref<'a> {
+ Ref::Named(x)
+ }
+}
+
+impl From<usize> for Ref<'static> {
+ fn from(x: usize) -> Ref<'static> {
+ Ref::Number(x)
+ }
+}
+
+/// Parses a possible reference to a capture group name in the given text,
+/// starting at the beginning of `replacement`.
+///
+/// If no such valid reference could be found, None is returned.
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
+ let mut i = 0;
+ if replacement.len() <= 1 || replacement[0] != b'$' {
+ return None;
+ }
+ let mut brace = false;
+ i += 1;
+ if replacement[i] == b'{' {
+ brace = true;
+ i += 1;
+ }
+ let mut cap_end = i;
+ while replacement.get(cap_end).map_or(false, is_valid_cap_letter) {
+ cap_end += 1;
+ }
+ if cap_end == i {
+ return None;
+ }
+ // We just verified that the range 0..cap_end is valid ASCII, so it must
+ // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
+ // check with an unchecked conversion or by parsing the number straight
+ // from &[u8].
+ let cap = str::from_utf8(&replacement[i..cap_end])
+ .expect("valid UTF-8 capture name");
+ if brace {
+ if !replacement.get(cap_end).map_or(false, |&b| b == b'}') {
+ return None;
+ }
+ cap_end += 1;
+ }
+ Some(CaptureRef {
+ cap: match cap.parse::<u32>() {
+ Ok(i) => Ref::Number(i as usize),
+ Err(_) => Ref::Named(cap),
+ },
+ end: cap_end,
+ })
+}
+
+/// Returns true if and only if the given byte is allowed in a capture name.
+fn is_valid_cap_letter(b: &u8) -> bool {
+ match *b {
+ b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
+ _ => false,
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{find_cap_ref, interpolate, CaptureRef};
+
+ macro_rules! find {
+ ($name:ident, $text:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(None, find_cap_ref($text.as_bytes()));
+ }
+ };
+ ($name:ident, $text:expr, $capref:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
+ }
+ };
+ }
+
+ macro_rules! c {
+ ($name_or_number:expr, $pos:expr) => {
+ CaptureRef { cap: $name_or_number.into(), end: $pos }
+ };
+ }
+
+ find!(find_cap_ref1, "$foo", c!("foo", 4));
+ find!(find_cap_ref2, "${foo}", c!("foo", 6));
+ find!(find_cap_ref3, "$0", c!(0, 2));
+ find!(find_cap_ref4, "$5", c!(5, 2));
+ find!(find_cap_ref5, "$10", c!(10, 3));
+ find!(find_cap_ref6, "$42a", c!("42a", 4));
+ find!(find_cap_ref7, "${42}a", c!(42, 5));
+ find!(find_cap_ref8, "${42");
+ find!(find_cap_ref9, "${42 ");
+ find!(find_cap_ref10, " $0 ");
+ find!(find_cap_ref11, "$");
+ find!(find_cap_ref12, " ");
+ find!(find_cap_ref13, "");
+
+ // A convenience routine for using interpolate's unwieldy but flexible API.
+ fn interpolate_string(
+ mut name_to_index: Vec<(&'static str, usize)>,
+ caps: Vec<&'static str>,
+ replacement: &str,
+ ) -> String {
+ name_to_index.sort_by_key(|x| x.0);
+
+ let mut dst = vec![];
+ interpolate(
+ replacement.as_bytes(),
+ |i, dst| {
+ if let Some(&s) = caps.get(i) {
+ dst.extend(s.as_bytes());
+ }
+ },
+ |name| -> Option<usize> {
+ name_to_index
+ .binary_search_by_key(&name, |x| x.0)
+ .ok()
+ .map(|i| name_to_index[i].1)
+ },
+ &mut dst,
+ );
+ String::from_utf8(dst).unwrap()
+ }
+
+ macro_rules! interp {
+ ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
+ #[test]
+ fn $name() {
+ assert_eq!($expected, interpolate_string($map, $caps, $hay));
+ }
+ };
+ }
+
+ interp!(
+ interp1,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $foo test",
+ "test xxx test",
+ );
+
+ interp!(
+ interp2,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test$footest",
+ "test",
+ );
+
+ interp!(
+ interp3,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test${foo}test",
+ "testxxxtest",
+ );
+
+ interp!(
+ interp4,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test$2test",
+ "test",
+ );
+
+ interp!(
+ interp5,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test${2}test",
+ "testxxxtest",
+ );
+
+ interp!(
+ interp6,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $$foo test",
+ "test $foo test",
+ );
+
+ interp!(
+ interp7,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $foo",
+ "test xxx",
+ );
+
+ interp!(
+ interp8,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "$foo test",
+ "xxx test",
+ );
+
+ interp!(
+ interp9,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test $bar$foo",
+ "test yyyxxx",
+ );
+
+ interp!(
+ interp10,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test $ test",
+ "test $ test",
+ );
+
+ interp!(
+ interp11,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${} test",
+ "test ${} test",
+ );
+
+ interp!(
+ interp12,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${ } test",
+ "test ${ } test",
+ );
+
+ interp!(
+ interp13,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${a b} test",
+ "test ${a b} test",
+ );
+
+ interp!(
+ interp14,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${a} test",
+ "test test",
+ );
+}