Use Needleman-Wunsch / Wagner-Fischer algorithm

author: Dan Davison <dandavison7@gmail.com> 2019-07-30 12:10:17 -0400
committer: Dan Davison <dandavison7@gmail.com> 2019-08-06 22:50:31 -0700
commit: 41c43314da3cee8bbaa98c1deff0e7c45646b28d (patch)
tree: 4202e4bcebd74cee1ed52e27f1a7860fec77a91a /src/edits.rs
parent: 1bfd17539cba33589e07827f9e577d3d9ced018e (diff)
1 files changed, 131 insertions, 302 deletions
diff --git a/src/edits.rs b/src/edits.rs
index 060c2851..95c7191e 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,31 +1,11 @@
-use crate::edits::line_pair::LinePair;
+use unicode_segmentation::UnicodeSegmentation;
 
-/*
-  Consider minus line m and paired plus line p, respectively.  The following cases exist:
-
-  1. Whitespace deleted at line beginning.
-     => The deleted section is highlighted in m; p is unstyled.
-
-  2. Whitespace inserted at line beginning.
-     => The inserted section is highlighted in p; m is unstyled.
-
-  3. An internal section of the line containing a non-whitespace character has been deleted.
-     => The deleted section is highlighted in m; p is unstyled.
-
-  4. An internal section of the line containing a non-whitespace character has been changed.
-     => The original section is highlighted in m; the replacement is highlighted in p.
-
-  5. An internal section of the line containing a non-whitespace character has been inserted.
-     => The inserted section is highlighted in p; m is unstyled.
-
-  Note that whitespace can be neither deleted nor inserted at the end of the line: the line by
-  definition has no trailing whitespace.
-*/
-
-type AnnotatedLine<'a, EditOperation> = Vec<(EditOperation, &'a str)>;
+use crate::align;
 
 /// Infer the edit operations responsible for the differences between a collection of old and new
-/// lines. Return the input minus and plus lines, in annotated form.
+/// lines. A "line" is a string. An annotated line is a Vec of (op, &str) pairs, where the &str
+/// slices are slices of the line, and their concatenation equals the line. Return the input minus
+/// and plus lines, in annotated form.
 pub fn infer_edits<'a, EditOperation>(
     minus_lines: &'a Vec<String>,
     plus_lines: &'a Vec<String>,
@@ -35,42 +15,58 @@ pub fn infer_edits<'a, EditOperation>(
     insertion: EditOperation,
     distance_threshold: f64,
 ) -> (
-    Vec<AnnotatedLine<'a, EditOperation>>,
-    Vec<AnnotatedLine<'a, EditOperation>>,
+    Vec<Vec<(EditOperation, &'a str)>>, // annotated minus lines
+    Vec<Vec<(EditOperation, &'a str)>>, // annotated plus lines
 )
 where
     EditOperation: Copy,
+    EditOperation: PartialEq,
 {
-    let mut annotated_minus_lines = Vec::<AnnotatedLine<EditOperation>>::new();
-    let mut annotated_plus_lines = Vec::<AnnotatedLine<EditOperation>>::new();
+    let mut annotated_minus_lines = Vec::<Vec<(EditOperation, &str)>>::new();
+    let mut annotated_plus_lines = Vec::<Vec<(EditOperation, &str)>>::new();
 
     let mut emitted = 0; // plus lines emitted so far
 
     'minus_lines_loop: for minus_line in minus_lines {
         let mut considered = 0; // plus lines considered so far as match for minus_line
+        let minus_line = minus_line.trim_end();
         for plus_line in &plus_lines[emitted..] {
-            let line_pair = LinePair::new(minus_line, plus_line);
-            if line_pair.distance < distance_threshold {
+            let plus_line = plus_line.trim_end();
+            let alignment = align::Alignment::new(
+                minus_line
+                    .grapheme_indices(true)
+                    .collect::<Vec<(usize, &str)>>(),
+                plus_line
+                    .grapheme_indices(true)
+                    .collect::<Vec<(usize, &str)>>(),
+            );
+
+            if alignment.normalized_edit_distance() < distance_threshold {
                 // minus_line and plus_line are inferred to be a homologous pair.
 
                 // Emit as unpaired the plus lines already considered and rejected
                 for plus_line in &plus_lines[emitted..(emitted + considered)] {
-                    annotated_plus_lines.push(vec![(non_insertion, plus_line)]);
+                    annotated_plus_lines.push(vec![(non_insertion, plus_line.trim_end())]);
                 }
                 emitted += considered;
 
                 // Emit the homologous pair.
-                let (minus_edit, plus_edit) = (line_pair.minus_edit, line_pair.plus_edit);
-                annotated_minus_lines.push(vec![
-                    (non_deletion, &minus_line[0..minus_edit.start]),
-                    (deletion, &minus_line[minus_edit.start..minus_edit.end]),
-                    (non_deletion, &minus_line[minus_edit.end..]),
-                ]);
-                annotated_plus_lines.push(vec![
-                    (non_insertion, &plus_line[0..plus_edit.start]),
-                    (insertion, &plus_line[plus_edit.start..plus_edit.end]),
-                    (non_insertion, &plus_line[plus_edit.end..]),
-                ]);
+                annotated_minus_lines.push(coalesce_minus_edits(
+                    &alignment,
+                    minus_line,
+                    non_deletion,
+                    deletion,
+                    non_insertion,
+                    insertion,
+                ));
+                annotated_plus_lines.push(coalesce_plus_edits(
+                    &alignment,
+                    plus_line,
+                    non_deletion,
+                    deletion,
+                    non_insertion,
+                    insertion,
+                ));
                 emitted += 1;
 
                 // Move on to the next minus line.
@@ -80,260 +76,85 @@ where
             }
         }
         // No homolog was found for minus i; emit as unpaired.
-        annotated_minus_lines.push(vec![(non_deletion, minus_line)]);
+        annotated_minus_lines.push(vec![(non_deletion, minus_line.trim_end())]);
     }
     // Emit any remaining plus lines
     for plus_line in &plus_lines[emitted..] {
-        annotated_plus_lines.push(vec![(non_insertion, plus_line)]);
+        annotated_plus_lines.push(vec![(non_insertion, plus_line.trim_end())]);
     }
 
     (annotated_minus_lines, annotated_plus_lines)
 }
 
-mod line_pair {
-    use std::cmp::{max, min};
-
-    use itertools::{Itertools, PeekingNext};
-    use unicode_segmentation::UnicodeSegmentation;
+pub fn coalesce_minus_edits<'a, EditOperation>(
+    alignment: &align::Alignment<'a>,
+    line: &'a str,
+    non_deletion: EditOperation,
+    deletion: EditOperation,
+    _non_insertion: EditOperation,
+    insertion: EditOperation,
+) -> Vec<(EditOperation, &'a str)>
+where
+    EditOperation: Copy,
+    EditOperation: PartialEq,
+{
+    coalesce_edits(
+        alignment.edit_operations(non_deletion, deletion, deletion, insertion, true),
+        line,
+        insertion,
+    )
+}
 
-    /// A pair of right-trimmed strings.
-    pub struct LinePair<'a> {
-        pub minus_line: &'a str,
-        pub plus_line: &'a str,
-        pub minus_edit: Edit,
-        pub plus_edit: Edit,
-        pub distance: f64,
-    }
+pub fn coalesce_plus_edits<'a, EditOperation>(
+    alignment: &align::Alignment<'a>,
+    line: &'a str,
+    _non_deletion: EditOperation,
+    deletion: EditOperation,
+    non_insertion: EditOperation,
+    insertion: EditOperation,
+) -> Vec<(EditOperation, &'a str)>
+where
+    EditOperation: Copy,
+    EditOperation: PartialEq,
+{
+    coalesce_edits(
+        alignment.edit_operations(non_insertion, insertion, deletion, insertion, false),
+        line,
+        deletion,
+    )
+}
 
-    #[derive(Debug)]
-    pub struct Edit {
-        pub start: usize,
-        pub end: usize,
-        string_length: usize,
+fn coalesce_edits<'a, 'b, EditOperation>(
+    operations: Vec<(EditOperation, (usize, &'b str))>,
+    line: &'a str,
+    irrelevant: EditOperation,
+) -> Vec<(EditOperation, &'a str)>
+where
+    EditOperation: Copy,
+    EditOperation: PartialEq,
+{
+    let mut edits = Vec::new(); // TODO capacity
+    let mut operations = operations.iter().filter(|(op, _)| *op != irrelevant);
+    let next = operations.next();
+    if next.is_none() {
+        return edits;
     }
-
-    impl Edit {
-        fn distance(&self) -> f64 {
-            (self.end - self.start) as f64 / self.string_length as f64
+    let (mut last_op, (mut last_offset, _)) = next.unwrap();
+    let mut curr_op = last_op;
+    let mut curr_offset;
+    for (op, (offset, _)) in operations {
+        curr_op = *op;
+        curr_offset = *offset;
+        if curr_op != last_op {
+            edits.push((last_op, &line[last_offset..*offset]));
+            last_offset = curr_offset;
+            last_op = curr_op;
         }
     }
-
-    impl<'a> LinePair<'a> {
-        pub fn new(s0: &'a str, s1: &'a str) -> Self {
-            let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
-            let (prefix_length, leading_whitespace) = LinePair::prefix_data(g0, g1);
-            let common_prefix_length = [
-                leading_whitespace[0] + prefix_length,
-                leading_whitespace[1] + prefix_length,
-            ];
-            // TODO: Don't compute grapheme segmentation twice?
-            let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
-            let (common_suffix_length, trailing_whitespace) = LinePair::suffix_data(g0, g1);
-            let lengths = [
-                s0.len() - trailing_whitespace[0],
-                s1.len() - trailing_whitespace[1],
-            ];
-
-            // We require that (right-trimmed length) >= (common prefix length). Consider:
-            // minus = "a    "
-            // plus  = "a b  "
-            // Here, the right-trimmed length of minus is 1, yet the common prefix length is 2. We
-            // resolve this by taking the following maxima:
-            let minus_length = max(lengths[0], common_prefix_length[0]);
-            let plus_length = max(lengths[1], common_prefix_length[1]);
-
-            // Work backwards from the end of the strings. The end of the change region is equal to
-            // the start of their common suffix. To find the start of the change region, start with
-            // the end of their common prefix, and then move leftwards until it is before the start
-            // of the common suffix in both strings.
-            let minus_change_end = minus_length - common_suffix_length;
-            let plus_change_end = plus_length - common_suffix_length;
-            let change_begin = [
-                min(
-                    common_prefix_length[0],
-                    min(minus_change_end, plus_change_end),
-                ),
-                min(
-                    common_prefix_length[1],
-                    min(minus_change_end, plus_change_end),
-                ),
-            ];
-
-            let minus_edit = Edit {
-                start: change_begin[0],
-                end: minus_change_end,
-                string_length: minus_length - leading_whitespace[0],
-            };
-            let plus_edit = Edit {
-                start: change_begin[1],
-                end: plus_change_end,
-                string_length: plus_length - leading_whitespace[1],
-            };
-            let distance = minus_edit.distance() + plus_edit.distance();
-            LinePair {
-                minus_line: s0,
-                plus_line: s1,
-                minus_edit,
-                plus_edit,
-                distance,
-            }
-        }
-
-        #[allow(dead_code)]
-        pub fn format(&self) -> String {
-            format!(
-                "LinePair\n \
-                 \t{} {} {}\n \
-                 \t{} {} {}\n \
-                 \t{}",
-                self.minus_line.trim_end(),
-                self.minus_edit.start,
-                self.minus_edit.end,
-                self.plus_line.trim_end(),
-                self.plus_edit.start,
-                self.plus_edit.end,
-                self.distance
-            )
-        }
-
-        /// Align the two strings at their left ends and consider only the bytes up to the length of
-        /// the shorter string. Return the byte offset of the first differing grapheme cluster, or
-        /// the byte length of shorter string if they do not differ.
-        fn prefix_data<'b, I>(s0: I, s1: I) -> (usize, [usize; 2])
-        where
-            I: Iterator<Item = (usize, &'b str)>,
-            I: Itertools,
-        {
-            let mut s0 = s0.peekable();
-            let mut s1 = s1.peekable();
-
-            let w0 = consume_whitespace(&mut s0);
-            let w1 = consume_whitespace(&mut s1);
-
-            (
-                s0.zip(s1)
-                    .peekable()
-                    .peeking_take_while(|((_, c0), (_, c1))| c0 == c1)
-                    .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len()),
-                [w0, w1],
-            )
-        }
-
-        /// Trim trailing whitespace and align the two strings at their right ends. Fix the origin
-        /// at their right ends and, looking left, consider only the bytes up to the length of the
-        /// shorter string. Return the byte offset of the first differing grapheme cluster, or the
-        /// byte length of the shorter string if they do not differ. Also return the number of bytes
-        /// of whitespace trimmed from each string.
-        fn suffix_data<'b, I>(s0: I, s1: I) -> (usize, [usize; 2])
-        where
-            I: DoubleEndedIterator<Item = (usize, &'b str)>,
-            I: Itertools,
-        {
-            let mut s0 = s0.rev().peekable();
-            let mut s1 = s1.rev().peekable();
-
-            let w0 = consume_whitespace(&mut s0);
-            let w1 = consume_whitespace(&mut s1);
-
-            let (suffix_length, [_, _]) = LinePair::prefix_data(s0, s1);
-            (suffix_length, [w0, w1])
-        }
-    }
-
-    fn consume_whitespace<'b, I>(s: &mut I) -> usize
-    where
-        I: Iterator<Item = (usize, &'b str)>,
-        I: Itertools,
-        I: PeekingNext,
-    {
-        let is_whitespace = |(_, c): &(usize, &str)| *c == " " || *c == "\n";
-        s.peeking_take_while(is_whitespace).count()
-    }
-
-    #[cfg(test)]
-    mod tests {
-        use super::*;
-
-        fn common_prefix_length(s1: &str, s2: &str) -> usize {
-            let (prefix_length, [w0, w1]) =
-                super::LinePair::prefix_data(s1.grapheme_indices(true), s2.grapheme_indices(true));
-            assert_eq!(
-                w0, w1,
-                "Lines with different amounts of leading whitespace are not supported"
-            );
-            w0 + prefix_length
-        }
-
-        fn common_suffix_length(s1: &str, s2: &str) -> usize {
-            super::LinePair::suffix_data(s1.grapheme_indices(true), s2.grapheme_indices(true)).0
-        }
-
-        #[test]
-        fn test_common_prefix_length() {
-            assert_eq!(common_prefix_length("", ""), 0);
-            assert_eq!(common_prefix_length("", "a"), 0);
-            assert_eq!(common_prefix_length("a", ""), 0);
-            assert_eq!(common_prefix_length("a", "b"), 0);
-            assert_eq!(common_prefix_length("a", "a"), 1);
-            assert_eq!(common_prefix_length("a", "ab"), 1);
-            assert_eq!(common_prefix_length("ab", "a"), 1);
-            assert_eq!(common_prefix_length("ab", "aba"), 2);
-            assert_eq!(common_prefix_length("aba", "ab"), 2);
-        }
-
-        #[test]
-        fn test_common_prefix_length_with_leading_whitespace() {
-            // assert_eq!(common_prefix_length(" ", ""), 0);
-            assert_eq!(common_prefix_length(" ", " "), 1);
-            assert_eq!(common_prefix_length(" a", " a"), 2);
-            // assert_eq!(common_prefix_length(" a", "a"), 0);
-        }
-
-        #[test]
-        fn test_common_suffix_length() {
-            assert_eq!(common_suffix_length("", ""), 0);
-            assert_eq!(common_suffix_length("", "a"), 0);
-            assert_eq!(common_suffix_length("a", ""), 0);
-            assert_eq!(common_suffix_length("a", "b"), 0);
-            assert_eq!(common_suffix_length("a", "a"), 1);
-            assert_eq!(common_suffix_length("a", "ab"), 0);
-            assert_eq!(common_suffix_length("ab", "a"), 0);
-            assert_eq!(common_suffix_length("ab", "b"), 1);
-            assert_eq!(common_suffix_length("ab", "aab"), 2);
-            assert_eq!(common_suffix_length("aba", "ba"), 2);
-        }
-
-        #[test]
-        fn test_common_suffix_length_with_trailing_whitespace() {
-            assert_eq!(common_suffix_length("", "  "), 0);
-            assert_eq!(common_suffix_length("  ", "a"), 0);
-            assert_eq!(common_suffix_length("a  ", ""), 0);
-            assert_eq!(common_suffix_length("a", "b  "), 0);
-            assert_eq!(common_suffix_length("a", "a  "), 1);
-            assert_eq!(common_suffix_length("a  ", "ab  "), 0);
-            assert_eq!(common_suffix_length("ab", "a  "), 0);
-            assert_eq!(common_suffix_length("ab  ", "b "), 1);
-            assert_eq!(common_suffix_length("ab ", "aab  "), 2);
-            assert_eq!(common_suffix_length("aba ", "ba"), 2);
-        }
-
-        #[test]
-        fn test_common_suffix_length_with_trailing_whitespace_nonascii() {
-            assert_eq!(common_suffix_length("  ", "á"), 0);
-            assert_eq!(common_suffix_length("á  ", ""), 0);
-            assert_eq!(common_suffix_length("á", "b  "), 0);
-            assert_eq!(common_suffix_length("á", "á  "), "á".len());
-            assert_eq!(common_suffix_length("a  ", "áb  "), 0);
-            assert_eq!(common_suffix_length("ab", "á  "), 0);
-            assert_eq!(common_suffix_length("áb  ", "b "), 1);
-            assert_eq!(common_suffix_length("áb ", "aáb  "), 1 + "á".len());
-            assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len());
-            assert_eq!(
-                common_suffix_length("áaáabá ", "ááabá   "),
-                2 + 2 * "á".len()
-            );
-        }
+    if curr_op == last_op {
+        edits.push((last_op, &line[last_offset..]));
     }
+    edits
 }
 
 #[cfg(test)]
@@ -359,13 +180,25 @@ mod tests {
     const DISTANCE_MAX: f64 = 2.0;
 
     #[test]
+    fn test_coalesce_edits_1() {
+        assert_eq!(
+            coalesce_edits(
+                vec![(MinusNoop, (0, "a")), (MinusNoop, (1, "b"))],
+                "ab",
+                Insertion
+            ),
+            vec![(MinusNoop, "ab")]
+        )
+    }
+
+    #[test]
     fn test_infer_edits_1() {
         assert_paired_edits(
             vec!["aaa\n"],
             vec!["aba\n"],
             (
-                vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a\n")]],
-                vec![vec![(PlusNoop, "a"), (Insertion, "b"), (PlusNoop, "a\n")]],
+                vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a")]],
+                vec![vec![(PlusNoop, "a"), (Insertion, "b"), (PlusNoop, "a")]],
             ),
         )
     }
@@ -376,8 +209,8 @@ mod tests {
             vec!["áaa\n"],
             vec!["ááb\n"],
             (
-                vec![vec![(MinusNoop, "á"), (Deletion, "aa"), (MinusNoop, "\n")]],
-                vec![vec![(PlusNoop, "á"), (Insertion, "áb"), (PlusNoop, "\n")]],
+                vec![vec![(MinusNoop, "á"), (Deletion, "aa")]],
+                vec![vec![(PlusNoop, "á"), (Insertion, "áb")]],
             ),
         )
     }
@@ -391,13 +224,9 @@ mod tests {
                 vec![vec![
                     (MinusNoop, "d."),
                     (Deletion, "iter"),
-                    (MinusNoop, "items()\n"),
-                ]],
-                vec![vec![
-                    (PlusNoop, "d."),
-                    (Insertion, ""),
-                    (PlusNoop, "items()\n"),
+                    (MinusNoop, "items()"),
                 ]],
+                vec![vec![(PlusNoop, "d.items()")]],
             ),
         )
     }
@@ -409,17 +238,17 @@ mod tests {
             vec!["áábáácááb\n"],
             (
                 vec![
-                    vec![(MinusNoop, "áaaáaaáaa\n")],
+                    vec![(MinusNoop, "áaaáaaáaa")],
                     vec![
                         (MinusNoop, "áábáá"),
                         (Deletion, "b"),
-                        (MinusNoop, "ááb\n"),
+                        (MinusNoop, "ááb"),
                     ],
                 ],
                 vec![vec![
                     (PlusNoop, "áábáá"),
                     (Insertion, "c"),
-                    (PlusNoop, "ááb\n"),
+                    (PlusNoop, "ááb"),
                 ]],
             ),
             0.66,
@@ -433,14 +262,14 @@ mod tests {
             vec!["bbbb!bbb\n", "dddddddd\n", "cccc!ccc\n"],
             (
                 vec![
-                    vec![(MinusNoop, "aaaaaaaa\n")],
-                    vec![(MinusNoop, "bbbb"), (Deletion, "b"), (MinusNoop, "bbb\n")],
-                    vec![(MinusNoop, "cccc"), (Deletion, "c"), (MinusNoop, "ccc\n")],
+                    vec![(MinusNoop, "aaaaaaaa")],
+                    vec![(MinusNoop, "bbbb"), (Deletion, "b"), (MinusNoop, "bbb")],
+                    vec![(MinusNoop, "cccc"), (Deletion, "c"), (MinusNoop, "ccc")],
                 ],
                 vec![
-                    vec![(PlusNoop, "bbbb"), (Insertion, "!"), (PlusNoop, "bbb\n")],
-                    vec![(PlusNoop, "dddddddd\n")],
-                    vec![(PlusNoop, "cccc"), (Insertion, "!"), (PlusNoop, "ccc\n")],
+                    vec![(PlusNoop, "bbbb"), (Insertion, "!"), (PlusNoop, "bbb")],
+                    vec![(PlusNoop, "dddddddd")],
+                    vec![(PlusNoop, "cccc"), (Insertion, "!"), (PlusNoop, "ccc")],
                 ],
             ),
             0.66,
author	Dan Davison <dandavison7@gmail.com>	2019-07-30 12:10:17 -0400
committer	Dan Davison <dandavison7@gmail.com>	2019-08-06 22:50:31 -0700
commit	41c43314da3cee8bbaa98c1deff0e7c45646b28d (patch)
tree	4202e4bcebd74cee1ed52e27f1a7860fec77a91a /src/edits.rs
parent	1bfd17539cba33589e07827f9e577d3d9ced018e (diff)