Merge pull request #220 from dandavison/word-diff-regexp

New option --word-diff-regex
author: Dan Davison <dandavison7@gmail.com> 2020-06-11 20:50:09 -0400
committer: GitHub <noreply@github.com> 2020-06-11 20:50:09 -0400
commit: 1b3499e7a8e41853b0f40edb8f115c4ba5b1e6d1 (patch)
tree: 1905558d19979de62b6b5b45b7a2ba2825cbb2f1 /src/edits.rs
parent: f469492bb63763bf54f99559ad40138d3f8e24fd (diff)
parent: 97116f284bd826cd0c2805ed3b0f4359b310ad6a (diff)
1 file changed, 158 insertions, 30 deletions
diff --git a/src/edits.rs b/src/edits.rs
index 7b6e4674..d8f18377 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,6 @@
 use regex::Regex;
 
-use lazy_static::lazy_static;
+use unicode_segmentation::UnicodeSegmentation;
 use unicode_width::UnicodeWidthStr;
 
 use crate::align;
@@ -16,6 +16,7 @@ pub fn infer_edits<'a, EditOperation>(
     deletion: EditOperation,
     noop_insertion: EditOperation,
     insertion: EditOperation,
+    tokenization_regex: &Regex,
     max_line_distance: f64,
     max_line_distance_for_naively_paired_lines: f64,
 ) -> (
@@ -34,7 +35,10 @@ where
     'minus_lines_loop: for minus_line in minus_lines {
         let mut considered = 0; // plus lines considered so far as match for minus_line
         for plus_line in &plus_lines[emitted..] {
-            let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line));
+            let alignment = align::Alignment::new(
+                tokenize(minus_line, tokenization_regex),
+                tokenize(plus_line, tokenization_regex),
+            );
             let (annotated_minus_line, annotated_plus_line, distance) = annotate(
                 alignment,
                 noop_deletion,
@@ -76,25 +80,29 @@ where
     (annotated_minus_lines, annotated_plus_lines)
 }
 
-lazy_static! {
-    static ref TOKENIZATION_REGEXP: Regex = Regex::new(r#"[\t ,;.:()\[\]<>/'"-]+"#).unwrap();
-}
-
 /// Split line into tokens for alignment. The alignment algorithm aligns sequences of substrings;
 /// not individual characters.
-fn tokenize(line: &str) -> Vec<&str> {
+fn tokenize<'a>(line: &'a str, regex: &Regex) -> Vec<&'a str> {
     let mut tokens = Vec::new();
     let mut offset = 0;
-    for m in TOKENIZATION_REGEXP.find_iter(line) {
-        tokens.push(&line[offset..m.start()]);
+    for m in regex.find_iter(line) {
+        if offset == 0 && m.start() > 0 {
+            tokens.push("");
+        }
         // Align separating text as multiple single-character tokens.
-        for i in m.start()..m.end() {
-            tokens.push(&line[i..i + 1]);
+        for t in line[offset..m.start()].graphemes(true) {
+            tokens.push(t);
         }
+        tokens.push(&line[m.start()..m.end()]);
         offset = m.end();
     }
     if offset < line.len() {
-        tokens.push(&line[offset..line.len()]);
+        if offset == 0 {
+            tokens.push("");
+        }
+        for t in line[offset..line.len()].graphemes(true) {
+            tokens.push(t);
+        }
     }
     tokens
 }
@@ -226,8 +234,14 @@ where
 #[cfg(test)]
 mod tests {
     use super::*;
+    use itertools::Itertools;
+    use lazy_static::lazy_static;
     use unicode_segmentation::UnicodeSegmentation;
 
+    lazy_static! {
+        static ref DEFAULT_TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap();
+    }
+
     #[derive(Clone, Copy, Debug, PartialEq)]
     enum EditOperation {
         MinusNoop,
@@ -244,15 +258,26 @@ mod tests {
     use EditOperation::*;
 
     #[test]
+    fn test_tokenize_0() {
+        assert_tokenize("", &[]);
+        assert_tokenize(";", &["", ";"]);
+        assert_tokenize(";;", &["", ";", ";"]);
+        assert_tokenize(";;a", &["", ";", ";", "a"]);
+        assert_tokenize(";;ab", &["", ";", ";", "ab"]);
+        assert_tokenize(";;ab;", &["", ";", ";", "ab", ";"]);
+        assert_tokenize(";;ab;;", &["", ";", ";", "ab", ";", ";"]);
+    }
+
+    #[test]
     fn test_tokenize_1() {
-        assert_eq!(tokenize("aaa bbb"), vec!["aaa", " ", "bbb"])
+        assert_tokenize("aaa bbb", &["aaa", " ", "bbb"])
     }
 
     #[test]
     fn test_tokenize_2() {
-        assert_eq!(
-            tokenize("fn coalesce_edits<'a, EditOperation>("),
-            vec![
+        assert_tokenize(
+            "fn coalesce_edits<'a, EditOperation>(",
+            &[
                 "fn",
                 " ",
                 "coalesce_edits",
@@ -263,16 +288,16 @@ mod tests {
                 " ",
                 "EditOperation",
                 ">",
-                "("
-            ]
+                "(",
+            ],
         );
     }
 
     #[test]
     fn test_tokenize_3() {
-        assert_eq!(
-            tokenize("fn coalesce_edits<'a, 'b, EditOperation>("),
-            vec![
+        assert_tokenize(
+            "fn coalesce_edits<'a, 'b, EditOperation>(",
+            &[
                 "fn",
                 " ",
                 "coalesce_edits",
@@ -287,21 +312,22 @@ mod tests {
                 " ",
                 "EditOperation",
                 ">",
-                "("
-            ]
+                "(",
+            ],
         );
     }
 
     #[test]
     fn test_tokenize_4() {
-        assert_eq!(
-            tokenize("annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);"),
-            vec![
+        assert_tokenize(
+            "annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);",
+            &[
                 "annotated_plus_lines",
                 ".",
                 "push",
                 "(",
-                "vec!",
+                "vec",
+                "!",
                 "[",
                 "(",
                 "noop_insertion",
@@ -311,12 +337,112 @@ mod tests {
                 ")",
                 "]",
                 ")",
-                ";"
-            ]
+                ";",
+            ],
         );
     }
 
     #[test]
+    fn test_tokenize_5() {
+        assert_tokenize(
+            "         let col = Color::from_str(s).unwrap_or_else(|_| die());",
+            &[
+                "",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                " ",
+                "let",
+                " ",
+                "col",
+                " ",
+                "=",
+                " ",
+                "Color",
+                ":",
+                ":",
+                "from_str",
+                "(",
+                "s",
+                ")",
+                ".",
+                "unwrap_or_else",
+                "(",
+                "|",
+                "_",
+                "|",
+                " ",
+                "die",
+                "(",
+                ")",
+                ")",
+                ";",
+            ],
+        )
+    }
+
+    #[test]
+    fn test_tokenize_6() {
+        assert_tokenize(
+            "         (minus_file, plus_file) => format!(\"renamed: {} ⟶  {}\", minus_file, plus_file),",
+            &["",
+              " ",
+              " ",
+              " ",
+              " ",
+              " ",
+              " ",
+              " ",
+              " ",
+              " ",
+              "(",
+              "minus_file",
+              ",",
+              " ",
+              "plus_file",
+              ")",
+              " ",
+              "=",
+              ">",
+              " ",
+              "format",
+              "!",
+              "(",
+              "\"",
+              "renamed",
+              ":",
+              " ",
+              "{",
+              "}",
+              " ",
+              "⟶",
+              " ",
+              " ",
+              "{",
+              "}",
+              "\"",
+              ",",
+              " ",
+              "minus_file",
+              ",",
+              " ",
+              "plus_file",
+              ")",
+              ","])
+    }
+
+    fn assert_tokenize(text: &str, expected_tokens: &[&str]) {
+        let actual_tokens = tokenize(text, &*DEFAULT_TOKENIZATION_REGEXP);
+        assert_eq!(text, expected_tokens.iter().join(""));
+        assert_eq!(actual_tokens, expected_tokens);
+    }
+
+    #[test]
     fn test_infer_edits_1() {
         assert_paired_edits(
             vec!["aaa"],
@@ -435,7 +561,7 @@ mod tests {
                 "             s0.zip(s1)",
                 "                 .take_while(|((_, c0), (_, c1))| c0 == c1) // TODO: Don't consume one-past-the-end!",
                 "                 .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len())"
-            ], 0.66)
+            ], 0.5)
     }
 
     #[test]
@@ -590,9 +716,11 @@ mod tests {
             Deletion,
             PlusNoop,
             Insertion,
+            &*DEFAULT_TOKENIZATION_REGEXP,
             max_line_distance,
             0.0,
         );
+        // compare_annotated_lines(actual_edits, expected_edits);
         assert_eq!(actual_edits, expected_edits);
     }
author	Dan Davison <dandavison7@gmail.com>	2020-06-11 20:50:09 -0400
committer	GitHub <noreply@github.com>	2020-06-11 20:50:09 -0400
commit	1b3499e7a8e41853b0f40edb8f115c4ba5b1e6d1 (patch)
tree	1905558d19979de62b6b5b45b7a2ba2825cbb2f1 /src/edits.rs
parent	f469492bb63763bf54f99559ad40138d3f8e24fd (diff)
parent	97116f284bd826cd0c2805ed3b0f4359b310ad6a (diff)