author     Dan Davison <dandavison7@gmail.com>    2019-07-26 19:21:16 -0400
committer  Dan Davison <dandavison7@gmail.com>    2019-07-27 13:12:32 -0400
commit     5c6d42d494ebb1573a9c5eb67d032d348fb29b72 (patch)
tree       5d0b1503c7182dcbececaac8cd7f3ee9fb138a6b /src/edits.rs
parent     a598a92700233130b56677c1269b6171eeb73ef7 (diff)
Refactor: edits
Diffstat (limited to 'src/edits.rs')
-rw-r--r--  src/edits.rs  460
1 file changed, 227 insertions, 233 deletions
diff --git a/src/edits.rs b/src/edits.rs
index 1793fea9..2695cf77 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,16 +1,13 @@
-use std::cmp::{max, min};
+use crate::edits::line_pair::LinePair;
-use crate::edits::string_pair::StringPair;
-
-/// Infer the edit operations responsible for the differences between
-/// a collection of old and new lines.
+/// Infer the edit operations responsible for the differences between a collection of old and new
+/// lines.
/*
- This function is called iff a region of n minus lines followed
- by n plus lines is encountered, e.g. n successive lines have
- been partially changed.
+ This function is called iff a region of n minus lines followed by n plus lines is encountered,
+ e.g. n successive lines have been partially changed.
- Consider the i-th such line and let m, p be the i-th minus and
- i-th plus line, respectively. The following cases exist:
+ Consider the i-th such line and let m, p be the i-th minus and i-th plus line, respectively. The
+ following cases exist:
1. Whitespace deleted at line beginning.
=> The deleted section is highlighted in m; p is unstyled.
@@ -27,17 +24,16 @@ use crate::edits::string_pair::StringPair;
5. An internal section of the line containing a non-whitespace character has been inserted.
=> The inserted section is highlighted in p; m is unstyled.
- Note that whitespace can be neither deleted nor inserted at the
- end of the line: the line by definition has no trailing
- whitespace.
+ Note that whitespace can be neither deleted nor inserted at the end of the line: the line by
+ definition has no trailing whitespace.
*/
pub fn infer_edit_sections<'m, 'p, EditOperationTag>(
minus_lines: &'m Vec<String>,
plus_lines: &'p Vec<String>,
- minus_line_noop: EditOperationTag,
- delete: EditOperationTag,
- plus_line_noop: EditOperationTag,
- insert: EditOperationTag,
+ non_deletion: EditOperationTag,
+ deletion: EditOperationTag,
+ non_insertion: EditOperationTag,
+ insertion: EditOperationTag,
similarity_threshold: f64,
) -> (
Vec<Vec<(EditOperationTag, &'m str)>>,
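As an aside, a minimal usage sketch of the refactored infer_edit_sections, modeled on the test module later in this file; the Tag enum and its derives are illustrative assumptions, not part of this commit (the function's where clause is assumed to require Copy on the tag type, as the repeated tag arguments suggest):

    // Illustrative tag type, mirroring the EditOperationTag enum used in the tests below.
    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Tag {
        MinusNoop,
        Deletion,
        PlusNoop,
        Insertion,
    }

    let minus_lines = vec!["aaa\n".to_string()];
    let plus_lines = vec!["aba\n".to_string()];
    let (minus_sections, plus_sections) = infer_edit_sections(
        &minus_lines,
        &plus_lines,
        Tag::MinusNoop, // non_deletion
        Tag::Deletion,  // deletion
        Tag::PlusNoop,  // non_insertion
        Tag::Insertion, // insertion
        1.0,            // similarity_threshold
    );
    // The common prefix "a" and common suffix "a\n" are left unhighlighted; only the
    // differing middle grapheme is tagged as a deletion/insertion.
    assert_eq!(
        minus_sections,
        vec![vec![(Tag::MinusNoop, "a"), (Tag::Deletion, "a"), (Tag::MinusNoop, "a\n")]]
    );
    assert_eq!(
        plus_sections,
        vec![vec![(Tag::PlusNoop, "a"), (Tag::Insertion, "b"), (Tag::PlusNoop, "a\n")]]
    );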
@@ -50,72 +46,215 @@ where
let mut plus_line_sections = Vec::new();
for (minus, plus) in minus_lines.iter().zip(plus_lines.iter()) {
- let string_pair = StringPair::new(minus, plus);
-
- // We require that (right-trimmed length) >= (common prefix length). Consider:
- // minus = "a "
- // plus = "a b "
- // Here, the right-trimmed length of minus is 1, yet the common prefix length is
- // 2. We resolve this by taking the following maxima:
- let minus_length = max(string_pair.lengths[0], string_pair.common_prefix_length);
- let plus_length = max(string_pair.lengths[1], string_pair.common_prefix_length);
-
- // Work backwards from the end of the strings. The end of the
- // change region is equal to the start of their common
- // suffix. To find the start of the change region, start with
- // the end of their common prefix, and then move leftwards
- // until it is before the start of the common suffix in both
- // strings.
- let minus_change_end = minus_length - string_pair.common_suffix_length;
- let plus_change_end = plus_length - string_pair.common_suffix_length;
- let change_begin = min(
- string_pair.common_prefix_length,
- min(minus_change_end, plus_change_end),
- );
+ let line_pair = LinePair::new(minus, plus);
- let minus_edit = Edit {
- change_begin,
- change_end: minus_change_end,
- string_length: minus_length,
- };
- let plus_edit = Edit {
- change_begin,
- change_end: plus_change_end,
- string_length: plus_length,
- };
-
- if minus_edit.appears_genuine(similarity_threshold)
- && plus_edit.appears_genuine(similarity_threshold)
- {
+ if line_pair.is_homologous_pair(similarity_threshold) {
+ let (minus_edit, plus_edit) = (line_pair.minus_edit, line_pair.plus_edit);
minus_line_sections.push(vec![
- (minus_line_noop, &minus[0..change_begin]),
- (delete, &minus[change_begin..minus_change_end]),
- (minus_line_noop, &minus[minus_change_end..]),
+ (non_deletion, &minus[0..minus_edit.start]),
+ (deletion, &minus[minus_edit.start..minus_edit.end]),
+ (non_deletion, &minus[minus_edit.end..]),
]);
plus_line_sections.push(vec![
- (plus_line_noop, &plus[0..change_begin]),
- (insert, &plus[change_begin..plus_change_end]),
- (plus_line_noop, &plus[plus_change_end..]),
+ (non_insertion, &plus[0..plus_edit.start]),
+ (insertion, &plus[plus_edit.start..plus_edit.end]),
+ (non_insertion, &plus[plus_edit.end..]),
]);
} else {
- minus_line_sections.push(vec![(minus_line_noop, minus)]);
- plus_line_sections.push(vec![(plus_line_noop, plus)]);
+ minus_line_sections.push(vec![(non_deletion, minus)]);
+ plus_line_sections.push(vec![(non_insertion, plus)]);
}
}
(minus_line_sections, plus_line_sections)
}
-struct Edit {
- change_begin: usize,
- change_end: usize,
- string_length: usize,
-}
+mod line_pair {
+ use std::cmp::{max, min};
+
+ use itertools::Itertools;
+ use unicode_segmentation::UnicodeSegmentation;
+
+ /// A pair of right-trimmed strings.
+ pub struct LinePair {
+ pub minus_edit: Edit,
+ pub plus_edit: Edit,
+ }
+
+ pub struct Edit {
+ pub start: usize,
+ pub end: usize,
+ string_length: usize,
+ }
+
+ impl Edit {
+ // TODO: exclude leading whitespace in this calculation
+ fn appears_genuine(&self, similarity_threshold: f64) -> bool {
+ ((self.end - self.start) as f64 / self.string_length as f64) < similarity_threshold
+ }
+ }
+
+ impl LinePair {
+ pub fn new(s0: &str, s1: &str) -> LinePair {
+ let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
+ let common_prefix_length = LinePair::common_prefix_length(g0, g1);
+ // TODO: Don't compute grapheme segmentation twice?
+ let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
+ let (common_suffix_length, trailing_whitespace) = LinePair::suffix_data(g0, g1);
+ let lengths = [
+ s0.len() - trailing_whitespace[0],
+ s1.len() - trailing_whitespace[1],
+ ];
+
+ // We require that (right-trimmed length) >= (common prefix length). Consider:
+ // minus = "a "
+ // plus = "a b "
+ // Here, the right-trimmed length of minus is 1, yet the common prefix length is 2. We
+ // resolve this by taking the following maxima:
+ let minus_length = max(lengths[0], common_prefix_length);
+ let plus_length = max(lengths[1], common_prefix_length);
+
+ // Work backwards from the end of the strings. The end of the change region is equal to
+ // the start of their common suffix. To find the start of the change region, start with
+ // the end of their common prefix, and then move leftwards until it is before the start
+ // of the common suffix in both strings.
+ let minus_change_end = minus_length - common_suffix_length;
+ let plus_change_end = plus_length - common_suffix_length;
+ let change_begin = min(common_prefix_length, min(minus_change_end, plus_change_end));
+
+ LinePair {
+ minus_edit: Edit {
+ start: change_begin,
+ end: minus_change_end,
+ string_length: minus_length,
+ },
+ plus_edit: Edit {
+ start: change_begin,
+ end: plus_change_end,
+ string_length: plus_length,
+ },
+ }
+ }
+
+ pub fn is_homologous_pair(&self, similarity_threshold: f64) -> bool {
+ self.minus_edit.appears_genuine(similarity_threshold)
+ && self.plus_edit.appears_genuine(similarity_threshold)
+ }
+
+ /// Align the two strings at their left ends and consider only the bytes up to the length of
+ /// the shorter string. Return the byte offset of the first differing grapheme cluster, or
+ /// the byte length of shorter string if they do not differ.
+ fn common_prefix_length<'a, I>(s0: I, s1: I) -> usize
+ where
+ I: Iterator<Item = (usize, &'a str)>,
+ I: Itertools,
+ {
+ s0.zip(s1)
+ .peekable()
+ .peeking_take_while(|((_, c0), (_, c1))| c0 == c1)
+ .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len())
+ }
+
+ /// Trim trailing whitespace and align the two strings at their right ends. Fix the origin
+ /// at their right ends and, looking left, consider only the bytes up to the length of the
+ /// shorter string. Return the byte offset of the first differing grapheme cluster, or the
+ /// byte length of the shorter string if they do not differ. Also return the number of bytes
+ /// of whitespace trimmed from each string.
+ fn suffix_data<'a, I>(s0: I, s1: I) -> (usize, [usize; 2])
+ where
+ I: DoubleEndedIterator<Item = (usize, &'a str)>,
+ I: Itertools,
+ {
+ let mut s0 = s0.rev().peekable();
+ let mut s1 = s1.rev().peekable();
+
+ let is_whitespace = |(_, c): &(usize, &str)| *c == " " || *c == "\n";
+ let n0 = (&mut s0).peeking_take_while(is_whitespace).count();
+ let n1 = (&mut s1).peeking_take_while(is_whitespace).count();
-impl Edit {
- // TODO: exclude leading whitespace in this calculation
- fn appears_genuine(&self, similarity_threshold: f64) -> bool {
- ((self.change_end - self.change_begin) as f64 / self.string_length as f64)
- < similarity_threshold
+ (LinePair::common_prefix_length(s0, s1), [n0, n1])
+ }
+ }
+
+ #[cfg(test)]
+ mod tests {
+ use super::*;
+
+ fn common_prefix_length(s1: &str, s2: &str) -> usize {
+ super::LinePair::common_prefix_length(
+ s1.grapheme_indices(true),
+ s2.grapheme_indices(true),
+ )
+ }
+
+ fn common_suffix_length(s1: &str, s2: &str) -> usize {
+ super::LinePair::suffix_data(s1.grapheme_indices(true), s2.grapheme_indices(true)).0
+ }
+
+ #[test]
+ fn test_common_prefix_length() {
+ assert_eq!(common_prefix_length("", ""), 0);
+ assert_eq!(common_prefix_length("", "a"), 0);
+ assert_eq!(common_prefix_length("a", ""), 0);
+ assert_eq!(common_prefix_length("a", "b"), 0);
+ assert_eq!(common_prefix_length("a", "a"), 1);
+ assert_eq!(common_prefix_length("a", "ab"), 1);
+ assert_eq!(common_prefix_length("ab", "a"), 1);
+ assert_eq!(common_prefix_length("ab", "aba"), 2);
+ assert_eq!(common_prefix_length("aba", "ab"), 2);
+ }
+
+ #[test]
+ fn test_common_prefix_length_with_leading_whitespace() {
+ assert_eq!(common_prefix_length(" ", ""), 0);
+ assert_eq!(common_prefix_length(" ", " "), 1);
+ assert_eq!(common_prefix_length(" a", " a"), 2);
+ assert_eq!(common_prefix_length(" a", "a"), 0);
+ }
+
+ #[test]
+ fn test_common_suffix_length() {
+ assert_eq!(common_suffix_length("", ""), 0);
+ assert_eq!(common_suffix_length("", "a"), 0);
+ assert_eq!(common_suffix_length("a", ""), 0);
+ assert_eq!(common_suffix_length("a", "b"), 0);
+ assert_eq!(common_suffix_length("a", "a"), 1);
+ assert_eq!(common_suffix_length("a", "ab"), 0);
+ assert_eq!(common_suffix_length("ab", "a"), 0);
+ assert_eq!(common_suffix_length("ab", "b"), 1);
+ assert_eq!(common_suffix_length("ab", "aab"), 2);
+ assert_eq!(common_suffix_length("aba", "ba"), 2);
+ }
+
+ #[test]
+ fn test_common_suffix_length_with_trailing_whitespace() {
+ assert_eq!(common_suffix_length("", " "), 0);
+ assert_eq!(common_suffix_length(" ", "a"), 0);
+ assert_eq!(common_suffix_length("a ", ""), 0);
+ assert_eq!(common_suffix_length("a", "b "), 0);
+ assert_eq!(common_suffix_length("a", "a "), 1);
+ assert_eq!(common_suffix_length("a ", "ab "), 0);
+ assert_eq!(common_suffix_length("ab", "a "), 0);
+ assert_eq!(common_suffix_length("ab ", "b "), 1);
+ assert_eq!(common_suffix_length("ab ", "aab "), 2);
+ assert_eq!(common_suffix_length("aba ", "ba"), 2);
+ }
+
+ #[test]
+ fn test_common_suffix_length_with_trailing_whitespace_nonascii() {
+ assert_eq!(common_suffix_length(" ", "á"), 0);
+ assert_eq!(common_suffix_length("á ", ""), 0);
+ assert_eq!(common_suffix_length("á", "b "), 0);
+ assert_eq!(common_suffix_length("á", "á "), "á".len());
+ assert_eq!(common_suffix_length("a ", "áb "), 0);
+ assert_eq!(common_suffix_length("ab", "á "), 0);
+ assert_eq!(common_suffix_length("áb ", "b "), 1);
+ assert_eq!(common_suffix_length("áb ", "aáb "), 1 + "á".len());
+ assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len());
+ assert_eq!(
+ common_suffix_length("áaáabá ", "ááabá "),
+ 2 + 2 * "á".len()
+ );
+ }
}
}
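For orientation, a small sketch (not part of the commit) of what LinePair::new computes for the "a " / "a b " pair discussed in the comment above, assuming it is exercised from within the line_pair module, or via line_pair::LinePair from its parent, where the public Edit fields are visible:

    let pair = LinePair::new("a ", "a b ");
    // Right-trimmed lengths are 1 and 3; the common prefix "a " has length 2, so the
    // minus length is clamped up to max(1, 2) = 2, and there is no common suffix.
    assert_eq!((pair.minus_edit.start, pair.minus_edit.end), (2, 2));
    assert_eq!((pair.plus_edit.start, pair.plus_edit.end), (2, 3));
    // appears_genuine compares (end - start) / string_length with the threshold:
    // 0/2 and 1/3 are both below 1.0, so this pair is treated as homologous.
    assert!(pair.is_homologous_pair(1.0));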
@@ -127,8 +266,8 @@ mod tests {
enum EditOperationTag {
MinusNoop,
PlusNoop,
- Delete,
- Insert,
+ Deletion,
+ Insertion,
}
use EditOperationTag::*;
@@ -141,14 +280,14 @@ mod tests {
&minus_lines,
&plus_lines,
MinusNoop,
- Delete,
+ Deletion,
PlusNoop,
- Insert,
+ Insertion,
1.0,
);
let expected_edits = (
- vec![vec![(MinusNoop, "a"), (Delete, "a"), (MinusNoop, "a\n")]],
- vec![vec![(PlusNoop, "a"), (Insert, "b"), (PlusNoop, "a\n")]],
+ vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a\n")]],
+ vec![vec![(PlusNoop, "a"), (Insertion, "b"), (PlusNoop, "a\n")]],
);
assert_consistent(&expected_edits);
@@ -164,14 +303,14 @@ mod tests {
&minus_lines,
&plus_lines,
MinusNoop,
- Delete,
+ Deletion,
PlusNoop,
- Insert,
+ Insertion,
1.0,
);
let expected_edits = (
- vec![vec![(MinusNoop, "á"), (Delete, "aa"), (MinusNoop, "\n")]],
- vec![vec![(PlusNoop, "á"), (Insert, "áb"), (PlusNoop, "\n")]],
+ vec![vec![(MinusNoop, "á"), (Deletion, "aa"), (MinusNoop, "\n")]],
+ vec![vec![(PlusNoop, "á"), (Insertion, "áb"), (PlusNoop, "\n")]],
);
assert_consistent(&expected_edits);
@@ -188,20 +327,20 @@ mod tests {
&minus_lines,
&plus_lines,
MinusNoop,
- Delete,
+ Deletion,
PlusNoop,
- Insert,
+ Insertion,
1.0,
);
let expected_edits = (
vec![vec![
(MinusNoop, "d."),
- (Delete, "iter"),
+ (Deletion, "iter"),
(MinusNoop, "items()\n"),
]],
vec![vec![
(PlusNoop, "d."),
- (Insert, ""),
+ (Insertion, ""),
(PlusNoop, "items()\n"),
]],
);
@@ -274,159 +413,14 @@ mod tests {
fn fmt_edit(edit: EditOperationTag) -> &'static str {
match edit {
MinusNoop => "MinusNoop",
- Delete => "Delete",
+ Deletion => "Deletion",
PlusNoop => "PlusNoop",
- Insert => "Insert",
+ Insertion => "Insertion",
}
}
fn is_edit(edit: &EditOperationTag) -> bool {
- *edit == Delete || *edit == Insert
+ *edit == Deletion || *edit == Insertion
}
}
-
-mod string_pair {
- use itertools::Itertools;
- use unicode_segmentation::UnicodeSegmentation;
-
- /// A pair of right-trimmed strings.
- pub struct StringPair {
- pub common_prefix_length: usize,
- pub common_suffix_length: usize,
- pub lengths: [usize; 2],
- }
-
- impl StringPair {
- pub fn new(s0: &str, s1: &str) -> StringPair {
- let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
- let common_prefix_length = StringPair::common_prefix_length(g0, g1);
- // TODO: Don't compute grapheme segmentation twice?
- let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
- let (common_suffix_length, trailing_whitespace) = StringPair::suffix_data(g0, g1);
- StringPair {
- common_prefix_length,
- common_suffix_length,
- lengths: [
- s0.len() - trailing_whitespace[0],
- s1.len() - trailing_whitespace[1],
- ],
- }
- }
-
- /// Align the two strings at their left ends and consider only
- /// the bytes up to the length of the shorter string. Return
- /// the byte offset of the first differing grapheme cluster,
- /// or the byte length of shorter string if they do not
- /// differ.
- fn common_prefix_length<'a, I>(s0: I, s1: I) -> usize
- where
- I: Iterator<Item = (usize, &'a str)>,
- I: Itertools,
- {
- s0.zip(s1)
- .peekable()
- .peeking_take_while(|((_, c0), (_, c1))| c0 == c1)
- .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len())
- }
-
- /// Trim trailing whitespace and align the two strings at
- /// their right ends. Fix the origin at their right ends and,
- /// looking left, consider only the bytes up to the length of
- /// the shorter string. Return the byte offset of the first
- /// differing grapheme cluster, or the byte length of the
- /// shorter string if they do not differ. Also return the
- /// number of bytes of whitespace trimmed from each string.
- fn suffix_data<'a, I>(s0: I, s1: I) -> (usize, [usize; 2])
- where
- I: DoubleEndedIterator<Item = (usize, &'a str)>,
- I: Itertools,
- {
- let mut s0 = s0.rev().peekable();
- let mut s1 = s1.rev().peekable();
-
- let is_whitespace = |(_, c): &(usize, &str)| *c == " " || *c == "\n";
- let n0 = (&mut s0).peeking_take_while(is_whitespace).count();
- let n1 = (&mut s1).peeking_take_while(is_whitespace).count();
-
- (StringPair::common_prefix_length(s0, s1), [n0, n1])
- }
- }
-
- #[cfg(test)]
- mod tests {
- fn common_prefix_length(s1: &str, s2: &str) -> usize {
- super::StringPair::new(s1, s2).common_prefix_length
- }
-
- fn common_suffix_length(s1: &str, s2: &str) -> usize {
- super::StringPair::new(s1, s2).common_suffix_length
- }
-
- #[test]
- fn test_common_prefix_length() {
- assert_eq!(common_prefix_length("", ""), 0);
- assert_eq!(common_prefix_length("", "a"), 0);
- assert_eq!(common_prefix_length("a", ""), 0);
- assert_eq!(common_prefix_length("a", "b"), 0);
- assert_eq!(common_prefix_length("a", "a"), 1);
- assert_eq!(common_prefix_length("a", "ab"), 1);
- assert_eq!(common_prefix_length("ab", "a"), 1);
- assert_eq!(common_prefix_length("ab", "aba"), 2);
- assert_eq!(common_prefix_length("aba", "ab"), 2);
- }
-
- #[test]
- fn test_common_prefix_length_with_leading_whitespace() {
- assert_eq!(common_prefix_length(" ", ""), 0);
- assert_eq!(common_prefix_length(" ", " "), 1);
- assert_eq!(common_prefix_length(" a", " a"), 2);
- assert_eq!(common_prefix_length(" a", "a"), 0);
- }
-
- #[test]
- fn test_common_suffix_length() {
- assert_eq!(common_suffix_length("", ""), 0);
- assert_eq!(common_suffix_length("", "a"), 0);
- assert_eq!(common_suffix_length("a", ""), 0);
- assert_eq!(common_suffix_length("a", "b"), 0);
- assert_eq!(common_suffix_length("a", "a"), 1);
- assert_eq!(common_suffix_length("a", "ab"), 0);
- assert_eq!(common_suffix_length("ab", "a"), 0);
- assert_eq!(common_suffix_length("ab", "b"), 1);
- assert_eq!(common_suffix_length("ab", "aab"), 2);
- assert_eq!(common_suffix_length("aba", "ba"), 2);
- }
-
- #[test]
- fn test_common_suffix_length_with_trailing_whitespace() {
- assert_eq!(common_suffix_length("", " "), 0);
- assert_eq!(common_suffix_length(" ", "a"), 0);
- assert_eq!(common_suffix_length("a ", ""), 0);
- assert_eq!(common_suffix_length("a", "b "), 0);
- assert_eq!(common_suffix_length("a", "a "), 1);
- assert_eq!(common_suffix_length("a ", "ab "), 0);
- assert_eq!(common_suffix_length("ab", "a "), 0);
- assert_eq!(common_suffix_length("ab ", "b "), 1);
- assert_eq!(common_suffix_length("ab ", "aab "), 2);
- assert_eq!(common_suffix_length("aba ", "ba"), 2);
- }
-
- #[test]
- fn test_common_suffix_length_with_trailing_whitespace_nonascii() {
- assert_eq!(common_suffix_length(" ", "á"), 0);
- assert_eq!(common_suffix_length("á ", ""), 0);
- assert_eq!(common_suffix_length("á", "b "), 0);
- assert_eq!(common_suffix_length("á", "á "), "á".len());
- assert_eq!(common_suffix_length("a ", "áb "), 0);
- assert_eq!(common_suffix_length("ab", "á "), 0);
- assert_eq!(common_suffix_length("áb ", "b "), 1);
- assert_eq!(common_suffix_length("áb ", "aáb "), 1 + "á".len());
- assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len());
- assert_eq!(
- common_suffix_length("áaáabá ", "ááabá "),
- 2 + 2 * "á".len()
- );
- }
- }
-}
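A closing sketch, not part of the commit, of the grapheme-cluster helpers that line_pair retains from the removed string_pair module. Offsets are byte offsets but comparison is per grapheme cluster; the calls assume the same in-module access that the tests above use, since both helpers are private to the module:

    use unicode_segmentation::UnicodeSegmentation;

    // One multi-byte grapheme in common: the prefix length is "á".len() = 2 bytes.
    let prefix = LinePair::common_prefix_length(
        "áb".grapheme_indices(true),
        "ác".grapheme_indices(true),
    );
    assert_eq!(prefix, "á".len());

    // suffix_data trims trailing whitespace first and also reports how many bytes
    // of whitespace were trimmed from each string.
    let (suffix, trimmed) = LinePair::suffix_data(
        "áb ".grapheme_indices(true),
        "b  ".grapheme_indices(true),
    );
    assert_eq!(suffix, 1);       // common suffix "b"
    assert_eq!(trimmed, [1, 2]); // trailing whitespace bytes trimmed from each string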