diff options
author | Dan Davison <dandavison7@gmail.com> | 2019-07-30 12:10:17 -0400 |
---|---|---|
committer | Dan Davison <dandavison7@gmail.com> | 2019-08-06 22:50:31 -0700 |
commit | 41c43314da3cee8bbaa98c1deff0e7c45646b28d (patch) | |
tree | 4202e4bcebd74cee1ed52e27f1a7860fec77a91a /src/edits.rs | |
parent | 1bfd17539cba33589e07827f9e577d3d9ced018e (diff) |
Use Needleman-Wunsch / Wagner-Fischer algorithm
Diffstat (limited to 'src/edits.rs')
-rw-r--r-- | src/edits.rs | 433 |
1 files changed, 131 insertions, 302 deletions
diff --git a/src/edits.rs b/src/edits.rs index 060c2851..95c7191e 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -1,31 +1,11 @@ -use crate::edits::line_pair::LinePair; +use unicode_segmentation::UnicodeSegmentation; -/* - Consider minus line m and paired plus line p, respectively. The following cases exist: - - 1. Whitespace deleted at line beginning. - => The deleted section is highlighted in m; p is unstyled. - - 2. Whitespace inserted at line beginning. - => The inserted section is highlighted in p; m is unstyled. - - 3. An internal section of the line containing a non-whitespace character has been deleted. - => The deleted section is highlighted in m; p is unstyled. - - 4. An internal section of the line containing a non-whitespace character has been changed. - => The original section is highlighted in m; the replacement is highlighted in p. - - 5. An internal section of the line containing a non-whitespace character has been inserted. - => The inserted section is highlighted in p; m is unstyled. - - Note that whitespace can be neither deleted nor inserted at the end of the line: the line by - definition has no trailing whitespace. -*/ - -type AnnotatedLine<'a, EditOperation> = Vec<(EditOperation, &'a str)>; +use crate::align; /// Infer the edit operations responsible for the differences between a collection of old and new -/// lines. Return the input minus and plus lines, in annotated form. +/// lines. A "line" is a string. An annotated line is a Vec of (op, &str) pairs, where the &str +/// slices are slices of the line, and their concatenation equals the line. Return the input minus +/// and plus lines, in annotated form. pub fn infer_edits<'a, EditOperation>( minus_lines: &'a Vec<String>, plus_lines: &'a Vec<String>, @@ -35,42 +15,58 @@ pub fn infer_edits<'a, EditOperation>( insertion: EditOperation, distance_threshold: f64, ) -> ( - Vec<AnnotatedLine<'a, EditOperation>>, - Vec<AnnotatedLine<'a, EditOperation>>, + Vec<Vec<(EditOperation, &'a str)>>, // annotated minus lines + Vec<Vec<(EditOperation, &'a str)>>, // annotated plus lines ) where EditOperation: Copy, + EditOperation: PartialEq, { - let mut annotated_minus_lines = Vec::<AnnotatedLine<EditOperation>>::new(); - let mut annotated_plus_lines = Vec::<AnnotatedLine<EditOperation>>::new(); + let mut annotated_minus_lines = Vec::<Vec<(EditOperation, &str)>>::new(); + let mut annotated_plus_lines = Vec::<Vec<(EditOperation, &str)>>::new(); let mut emitted = 0; // plus lines emitted so far 'minus_lines_loop: for minus_line in minus_lines { let mut considered = 0; // plus lines considered so far as match for minus_line + let minus_line = minus_line.trim_end(); for plus_line in &plus_lines[emitted..] { - let line_pair = LinePair::new(minus_line, plus_line); - if line_pair.distance < distance_threshold { + let plus_line = plus_line.trim_end(); + let alignment = align::Alignment::new( + minus_line + .grapheme_indices(true) + .collect::<Vec<(usize, &str)>>(), + plus_line + .grapheme_indices(true) + .collect::<Vec<(usize, &str)>>(), + ); + + if alignment.normalized_edit_distance() < distance_threshold { // minus_line and plus_line are inferred to be a homologous pair. // Emit as unpaired the plus lines already considered and rejected for plus_line in &plus_lines[emitted..(emitted + considered)] { - annotated_plus_lines.push(vec![(non_insertion, plus_line)]); + annotated_plus_lines.push(vec![(non_insertion, plus_line.trim_end())]); } emitted += considered; // Emit the homologous pair. - let (minus_edit, plus_edit) = (line_pair.minus_edit, line_pair.plus_edit); - annotated_minus_lines.push(vec![ - (non_deletion, &minus_line[0..minus_edit.start]), - (deletion, &minus_line[minus_edit.start..minus_edit.end]), - (non_deletion, &minus_line[minus_edit.end..]), - ]); - annotated_plus_lines.push(vec![ - (non_insertion, &plus_line[0..plus_edit.start]), - (insertion, &plus_line[plus_edit.start..plus_edit.end]), - (non_insertion, &plus_line[plus_edit.end..]), - ]); + annotated_minus_lines.push(coalesce_minus_edits( + &alignment, + minus_line, + non_deletion, + deletion, + non_insertion, + insertion, + )); + annotated_plus_lines.push(coalesce_plus_edits( + &alignment, + plus_line, + non_deletion, + deletion, + non_insertion, + insertion, + )); emitted += 1; // Move on to the next minus line. @@ -80,260 +76,85 @@ where } } // No homolog was found for minus i; emit as unpaired. - annotated_minus_lines.push(vec![(non_deletion, minus_line)]); + annotated_minus_lines.push(vec![(non_deletion, minus_line.trim_end())]); } // Emit any remaining plus lines for plus_line in &plus_lines[emitted..] { - annotated_plus_lines.push(vec![(non_insertion, plus_line)]); + annotated_plus_lines.push(vec![(non_insertion, plus_line.trim_end())]); } (annotated_minus_lines, annotated_plus_lines) } -mod line_pair { - use std::cmp::{max, min}; - - use itertools::{Itertools, PeekingNext}; - use unicode_segmentation::UnicodeSegmentation; +pub fn coalesce_minus_edits<'a, EditOperation>( + alignment: &align::Alignment<'a>, + line: &'a str, + non_deletion: EditOperation, + deletion: EditOperation, + _non_insertion: EditOperation, + insertion: EditOperation, +) -> Vec<(EditOperation, &'a str)> +where + EditOperation: Copy, + EditOperation: PartialEq, +{ + coalesce_edits( + alignment.edit_operations(non_deletion, deletion, deletion, insertion, true), + line, + insertion, + ) +} - /// A pair of right-trimmed strings. - pub struct LinePair<'a> { - pub minus_line: &'a str, - pub plus_line: &'a str, - pub minus_edit: Edit, - pub plus_edit: Edit, - pub distance: f64, - } +pub fn coalesce_plus_edits<'a, EditOperation>( + alignment: &align::Alignment<'a>, + line: &'a str, + _non_deletion: EditOperation, + deletion: EditOperation, + non_insertion: EditOperation, + insertion: EditOperation, +) -> Vec<(EditOperation, &'a str)> +where + EditOperation: Copy, + EditOperation: PartialEq, +{ + coalesce_edits( + alignment.edit_operations(non_insertion, insertion, deletion, insertion, false), + line, + deletion, + ) +} - #[derive(Debug)] - pub struct Edit { - pub start: usize, - pub end: usize, - string_length: usize, +fn coalesce_edits<'a, 'b, EditOperation>( + operations: Vec<(EditOperation, (usize, &'b str))>, + line: &'a str, + irrelevant: EditOperation, +) -> Vec<(EditOperation, &'a str)> +where + EditOperation: Copy, + EditOperation: PartialEq, +{ + let mut edits = Vec::new(); // TODO capacity + let mut operations = operations.iter().filter(|(op, _)| *op != irrelevant); + let next = operations.next(); + if next.is_none() { + return edits; } - - impl Edit { - fn distance(&self) -> f64 { - (self.end - self.start) as f64 / self.string_length as f64 + let (mut last_op, (mut last_offset, _)) = next.unwrap(); + let mut curr_op = last_op; + let mut curr_offset; + for (op, (offset, _)) in operations { + curr_op = *op; + curr_offset = *offset; + if curr_op != last_op { + edits.push((last_op, &line[last_offset..*offset])); + last_offset = curr_offset; + last_op = curr_op; } } - - impl<'a> LinePair<'a> { - pub fn new(s0: &'a str, s1: &'a str) -> Self { - let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true)); - let (prefix_length, leading_whitespace) = LinePair::prefix_data(g0, g1); - let common_prefix_length = [ - leading_whitespace[0] + prefix_length, - leading_whitespace[1] + prefix_length, - ]; - // TODO: Don't compute grapheme segmentation twice? - let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true)); - let (common_suffix_length, trailing_whitespace) = LinePair::suffix_data(g0, g1); - let lengths = [ - s0.len() - trailing_whitespace[0], - s1.len() - trailing_whitespace[1], - ]; - - // We require that (right-trimmed length) >= (common prefix length). Consider: - // minus = "a " - // plus = "a b " - // Here, the right-trimmed length of minus is 1, yet the common prefix length is 2. We - // resolve this by taking the following maxima: - let minus_length = max(lengths[0], common_prefix_length[0]); - let plus_length = max(lengths[1], common_prefix_length[1]); - - // Work backwards from the end of the strings. The end of the change region is equal to - // the start of their common suffix. To find the start of the change region, start with - // the end of their common prefix, and then move leftwards until it is before the start - // of the common suffix in both strings. - let minus_change_end = minus_length - common_suffix_length; - let plus_change_end = plus_length - common_suffix_length; - let change_begin = [ - min( - common_prefix_length[0], - min(minus_change_end, plus_change_end), - ), - min( - common_prefix_length[1], - min(minus_change_end, plus_change_end), - ), - ]; - - let minus_edit = Edit { - start: change_begin[0], - end: minus_change_end, - string_length: minus_length - leading_whitespace[0], - }; - let plus_edit = Edit { - start: change_begin[1], - end: plus_change_end, - string_length: plus_length - leading_whitespace[1], - }; - let distance = minus_edit.distance() + plus_edit.distance(); - LinePair { - minus_line: s0, - plus_line: s1, - minus_edit, - plus_edit, - distance, - } - } - - #[allow(dead_code)] - pub fn format(&self) -> String { - format!( - "LinePair\n \ - \t{} {} {}\n \ - \t{} {} {}\n \ - \t{}", - self.minus_line.trim_end(), - self.minus_edit.start, - self.minus_edit.end, - self.plus_line.trim_end(), - self.plus_edit.start, - self.plus_edit.end, - self.distance - ) - } - - /// Align the two strings at their left ends and consider only the bytes up to the length of - /// the shorter string. Return the byte offset of the first differing grapheme cluster, or - /// the byte length of shorter string if they do not differ. - fn prefix_data<'b, I>(s0: I, s1: I) -> (usize, [usize; 2]) - where - I: Iterator<Item = (usize, &'b str)>, - I: Itertools, - { - let mut s0 = s0.peekable(); - let mut s1 = s1.peekable(); - - let w0 = consume_whitespace(&mut s0); - let w1 = consume_whitespace(&mut s1); - - ( - s0.zip(s1) - .peekable() - .peeking_take_while(|((_, c0), (_, c1))| c0 == c1) - .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len()), - [w0, w1], - ) - } - - /// Trim trailing whitespace and align the two strings at their right ends. Fix the origin - /// at their right ends and, looking left, consider only the bytes up to the length of the - /// shorter string. Return the byte offset of the first differing grapheme cluster, or the - /// byte length of the shorter string if they do not differ. Also return the number of bytes - /// of whitespace trimmed from each string. - fn suffix_data<'b, I>(s0: I, s1: I) -> (usize, [usize; 2]) - where - I: DoubleEndedIterator<Item = (usize, &'b str)>, - I: Itertools, - { - let mut s0 = s0.rev().peekable(); - let mut s1 = s1.rev().peekable(); - - let w0 = consume_whitespace(&mut s0); - let w1 = consume_whitespace(&mut s1); - - let (suffix_length, [_, _]) = LinePair::prefix_data(s0, s1); - (suffix_length, [w0, w1]) - } - } - - fn consume_whitespace<'b, I>(s: &mut I) -> usize - where - I: Iterator<Item = (usize, &'b str)>, - I: Itertools, - I: PeekingNext, - { - let is_whitespace = |(_, c): &(usize, &str)| *c == " " || *c == "\n"; - s.peeking_take_while(is_whitespace).count() - } - - #[cfg(test)] - mod tests { - use super::*; - - fn common_prefix_length(s1: &str, s2: &str) -> usize { - let (prefix_length, [w0, w1]) = - super::LinePair::prefix_data(s1.grapheme_indices(true), s2.grapheme_indices(true)); - assert_eq!( - w0, w1, - "Lines with different amounts of leading whitespace are not supported" - ); - w0 + prefix_length - } - - fn common_suffix_length(s1: &str, s2: &str) -> usize { - super::LinePair::suffix_data(s1.grapheme_indices(true), s2.grapheme_indices(true)).0 - } - - #[test] - fn test_common_prefix_length() { - assert_eq!(common_prefix_length("", ""), 0); - assert_eq!(common_prefix_length("", "a"), 0); - assert_eq!(common_prefix_length("a", ""), 0); - assert_eq!(common_prefix_length("a", "b"), 0); - assert_eq!(common_prefix_length("a", "a"), 1); - assert_eq!(common_prefix_length("a", "ab"), 1); - assert_eq!(common_prefix_length("ab", "a"), 1); - assert_eq!(common_prefix_length("ab", "aba"), 2); - assert_eq!(common_prefix_length("aba", "ab"), 2); - } - - #[test] - fn test_common_prefix_length_with_leading_whitespace() { - // assert_eq!(common_prefix_length(" ", ""), 0); - assert_eq!(common_prefix_length(" ", " "), 1); - assert_eq!(common_prefix_length(" a", " a"), 2); - // assert_eq!(common_prefix_length(" a", "a"), 0); - } - - #[test] - fn test_common_suffix_length() { - assert_eq!(common_suffix_length("", ""), 0); - assert_eq!(common_suffix_length("", "a"), 0); - assert_eq!(common_suffix_length("a", ""), 0); - assert_eq!(common_suffix_length("a", "b"), 0); - assert_eq!(common_suffix_length("a", "a"), 1); - assert_eq!(common_suffix_length("a", "ab"), 0); - assert_eq!(common_suffix_length("ab", "a"), 0); - assert_eq!(common_suffix_length("ab", "b"), 1); - assert_eq!(common_suffix_length("ab", "aab"), 2); - assert_eq!(common_suffix_length("aba", "ba"), 2); - } - - #[test] - fn test_common_suffix_length_with_trailing_whitespace() { - assert_eq!(common_suffix_length("", " "), 0); - assert_eq!(common_suffix_length(" ", "a"), 0); - assert_eq!(common_suffix_length("a ", ""), 0); - assert_eq!(common_suffix_length("a", "b "), 0); - assert_eq!(common_suffix_length("a", "a "), 1); - assert_eq!(common_suffix_length("a ", "ab "), 0); - assert_eq!(common_suffix_length("ab", "a "), 0); - assert_eq!(common_suffix_length("ab ", "b "), 1); - assert_eq!(common_suffix_length("ab ", "aab "), 2); - assert_eq!(common_suffix_length("aba ", "ba"), 2); - } - - #[test] - fn test_common_suffix_length_with_trailing_whitespace_nonascii() { - assert_eq!(common_suffix_length(" ", "á"), 0); - assert_eq!(common_suffix_length("á ", ""), 0); - assert_eq!(common_suffix_length("á", "b "), 0); - assert_eq!(common_suffix_length("á", "á "), "á".len()); - assert_eq!(common_suffix_length("a ", "áb "), 0); - assert_eq!(common_suffix_length("ab", "á "), 0); - assert_eq!(common_suffix_length("áb ", "b "), 1); - assert_eq!(common_suffix_length("áb ", "aáb "), 1 + "á".len()); - assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len()); - assert_eq!( - common_suffix_length("áaáabá ", "ááabá "), - 2 + 2 * "á".len() - ); - } + if curr_op == last_op { + edits.push((last_op, &line[last_offset..])); } + edits } #[cfg(test)] @@ -359,13 +180,25 @@ mod tests { const DISTANCE_MAX: f64 = 2.0; #[test] + fn test_coalesce_edits_1() { + assert_eq!( + coalesce_edits( + vec![(MinusNoop, (0, "a")), (MinusNoop, (1, "b"))], + "ab", + Insertion + ), + vec![(MinusNoop, "ab")] + ) + } + + #[test] fn test_infer_edits_1() { assert_paired_edits( vec!["aaa\n"], vec!["aba\n"], ( - vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a\n")]], - vec![vec![(PlusNoop, "a"), (Insertion, "b"), (PlusNoop, "a\n")]], + vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a")]], + vec![vec![(PlusNoop, "a"), (Insertion, "b"), (PlusNoop, "a")]], ), ) } @@ -376,8 +209,8 @@ mod tests { vec!["áaa\n"], vec!["ááb\n"], ( - vec![vec![(MinusNoop, "á"), (Deletion, "aa"), (MinusNoop, "\n")]], - vec![vec![(PlusNoop, "á"), (Insertion, "áb"), (PlusNoop, "\n")]], + vec![vec![(MinusNoop, "á"), (Deletion, "aa")]], + vec![vec![(PlusNoop, "á"), (Insertion, "áb")]], ), ) } @@ -391,13 +224,9 @@ mod tests { vec![vec![ (MinusNoop, "d."), (Deletion, "iter"), - (MinusNoop, "items()\n"), - ]], - vec![vec![ - (PlusNoop, "d."), - (Insertion, ""), - (PlusNoop, "items()\n"), + (MinusNoop, "items()"), ]], + vec![vec![(PlusNoop, "d.items()")]], ), ) } @@ -409,17 +238,17 @@ mod tests { vec!["áábáácááb\n"], ( vec![ - vec![(MinusNoop, "áaaáaaáaa\n")], + vec![(MinusNoop, "áaaáaaáaa")], vec![ (MinusNoop, "áábáá"), (Deletion, "b"), - (MinusNoop, "ááb\n"), + (MinusNoop, "ááb"), ], ], vec![vec![ (PlusNoop, "áábáá"), (Insertion, "c"), - (PlusNoop, "ááb\n"), + (PlusNoop, "ááb"), ]], ), 0.66, @@ -433,14 +262,14 @@ mod tests { vec!["bbbb!bbb\n", "dddddddd\n", "cccc!ccc\n"], ( vec![ - vec![(MinusNoop, "aaaaaaaa\n")], - vec![(MinusNoop, "bbbb"), (Deletion, "b"), (MinusNoop, "bbb\n")], - vec![(MinusNoop, "cccc"), (Deletion, "c"), (MinusNoop, "ccc\n")], + vec![(MinusNoop, "aaaaaaaa")], + vec![(MinusNoop, "bbbb"), (Deletion, "b"), (MinusNoop, "bbb")], + vec![(MinusNoop, "cccc"), (Deletion, "c"), (MinusNoop, "ccc")], ], vec![ - vec![(PlusNoop, "bbbb"), (Insertion, "!"), (PlusNoop, "bbb\n")], - vec![(PlusNoop, "dddddddd\n")], - vec![(PlusNoop, "cccc"), (Insertion, "!"), (PlusNoop, "ccc\n")], + vec![(PlusNoop, "bbbb"), (Insertion, "!"), (PlusNoop, "bbb")], + vec![(PlusNoop, "dddddddd")], + vec![(PlusNoop, "cccc"), (Insertion, "!"), (PlusNoop, "ccc")], ], ), 0.66, |