From 5c6d42d494ebb1573a9c5eb67d032d348fb29b72 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Fri, 26 Jul 2019 19:21:16 -0400 Subject: Refactor: edits --- src/edits.rs | 460 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 227 insertions(+), 233 deletions(-) (limited to 'src/edits.rs') diff --git a/src/edits.rs b/src/edits.rs index 1793fea9..2695cf77 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -1,16 +1,13 @@ -use std::cmp::{max, min}; +use crate::edits::line_pair::LinePair; -use crate::edits::string_pair::StringPair; - -/// Infer the edit operations responsible for the differences between -/// a collection of old and new lines. +/// Infer the edit operations responsible for the differences between a collection of old and new +/// lines. /* - This function is called iff a region of n minus lines followed - by n plus lines is encountered, e.g. n successive lines have - been partially changed. + This function is called iff a region of n minus lines followed by n plus lines is encountered, + e.g. n successive lines have been partially changed. - Consider the i-th such line and let m, p be the i-th minus and - i-th plus line, respectively. The following cases exist: + Consider the i-th such line and let m, p be the i-th minus and i-th plus line, respectively. The + following cases exist: 1. Whitespace deleted at line beginning. => The deleted section is highlighted in m; p is unstyled. @@ -27,17 +24,16 @@ use crate::edits::string_pair::StringPair; 5. An internal section of the line containing a non-whitespace character has been inserted. => The inserted section is highlighted in p; m is unstyled. - Note that whitespace can be neither deleted nor inserted at the - end of the line: the line by definition has no trailing - whitespace. + Note that whitespace can be neither deleted nor inserted at the end of the line: the line by + definition has no trailing whitespace. */ pub fn infer_edit_sections<'m, 'p, EditOperationTag>( minus_lines: &'m Vec, plus_lines: &'p Vec, - minus_line_noop: EditOperationTag, - delete: EditOperationTag, - plus_line_noop: EditOperationTag, - insert: EditOperationTag, + non_deletion: EditOperationTag, + deletion: EditOperationTag, + non_insertion: EditOperationTag, + insertion: EditOperationTag, similarity_threshold: f64, ) -> ( Vec>, @@ -50,72 +46,215 @@ where let mut plus_line_sections = Vec::new(); for (minus, plus) in minus_lines.iter().zip(plus_lines.iter()) { - let string_pair = StringPair::new(minus, plus); - - // We require that (right-trimmed length) >= (common prefix length). Consider: - // minus = "a " - // plus = "a b " - // Here, the right-trimmed length of minus is 1, yet the common prefix length is - // 2. We resolve this by taking the following maxima: - let minus_length = max(string_pair.lengths[0], string_pair.common_prefix_length); - let plus_length = max(string_pair.lengths[1], string_pair.common_prefix_length); - - // Work backwards from the end of the strings. The end of the - // change region is equal to the start of their common - // suffix. To find the start of the change region, start with - // the end of their common prefix, and then move leftwards - // until it is before the start of the common suffix in both - // strings. - let minus_change_end = minus_length - string_pair.common_suffix_length; - let plus_change_end = plus_length - string_pair.common_suffix_length; - let change_begin = min( - string_pair.common_prefix_length, - min(minus_change_end, plus_change_end), - ); + let line_pair = LinePair::new(minus, plus); - let minus_edit = Edit { - change_begin, - change_end: minus_change_end, - string_length: minus_length, - }; - let plus_edit = Edit { - change_begin, - change_end: plus_change_end, - string_length: plus_length, - }; - - if minus_edit.appears_genuine(similarity_threshold) - && plus_edit.appears_genuine(similarity_threshold) - { + if line_pair.is_homologous_pair(similarity_threshold) { + let (minus_edit, plus_edit) = (line_pair.minus_edit, line_pair.plus_edit); minus_line_sections.push(vec![ - (minus_line_noop, &minus[0..change_begin]), - (delete, &minus[change_begin..minus_change_end]), - (minus_line_noop, &minus[minus_change_end..]), + (non_deletion, &minus[0..minus_edit.start]), + (deletion, &minus[minus_edit.start..minus_edit.end]), + (non_deletion, &minus[minus_edit.end..]), ]); plus_line_sections.push(vec![ - (plus_line_noop, &plus[0..change_begin]), - (insert, &plus[change_begin..plus_change_end]), - (plus_line_noop, &plus[plus_change_end..]), + (non_insertion, &plus[0..plus_edit.start]), + (insertion, &plus[plus_edit.start..plus_edit.end]), + (non_insertion, &plus[plus_edit.end..]), ]); } else { - minus_line_sections.push(vec![(minus_line_noop, minus)]); - plus_line_sections.push(vec![(plus_line_noop, plus)]); + minus_line_sections.push(vec![(non_deletion, minus)]); + plus_line_sections.push(vec![(non_insertion, plus)]); } } (minus_line_sections, plus_line_sections) } -struct Edit { - change_begin: usize, - change_end: usize, - string_length: usize, -} +mod line_pair { + use std::cmp::{max, min}; + + use itertools::Itertools; + use unicode_segmentation::UnicodeSegmentation; + + /// A pair of right-trimmed strings. + pub struct LinePair { + pub minus_edit: Edit, + pub plus_edit: Edit, + } + + pub struct Edit { + pub start: usize, + pub end: usize, + string_length: usize, + } + + impl Edit { + // TODO: exclude leading whitespace in this calculation + fn appears_genuine(&self, similarity_threshold: f64) -> bool { + ((self.end - self.start) as f64 / self.string_length as f64) < similarity_threshold + } + } + + impl LinePair { + pub fn new(s0: &str, s1: &str) -> LinePair { + let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true)); + let common_prefix_length = LinePair::common_prefix_length(g0, g1); + // TODO: Don't compute grapheme segmentation twice? + let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true)); + let (common_suffix_length, trailing_whitespace) = LinePair::suffix_data(g0, g1); + let lengths = [ + s0.len() - trailing_whitespace[0], + s1.len() - trailing_whitespace[1], + ]; + + // We require that (right-trimmed length) >= (common prefix length). Consider: + // minus = "a " + // plus = "a b " + // Here, the right-trimmed length of minus is 1, yet the common prefix length is 2. We + // resolve this by taking the following maxima: + let minus_length = max(lengths[0], common_prefix_length); + let plus_length = max(lengths[1], common_prefix_length); + + // Work backwards from the end of the strings. The end of the change region is equal to + // the start of their common suffix. To find the start of the change region, start with + // the end of their common prefix, and then move leftwards until it is before the start + // of the common suffix in both strings. + let minus_change_end = minus_length - common_suffix_length; + let plus_change_end = plus_length - common_suffix_length; + let change_begin = min(common_prefix_length, min(minus_change_end, plus_change_end)); + + LinePair { + minus_edit: Edit { + start: change_begin, + end: minus_change_end, + string_length: minus_length, + }, + plus_edit: Edit { + start: change_begin, + end: plus_change_end, + string_length: plus_length, + }, + } + } + + pub fn is_homologous_pair(&self, similarity_threshold: f64) -> bool { + self.minus_edit.appears_genuine(similarity_threshold) + && self.plus_edit.appears_genuine(similarity_threshold) + } + + /// Align the two strings at their left ends and consider only the bytes up to the length of + /// the shorter string. Return the byte offset of the first differing grapheme cluster, or + /// the byte length of shorter string if they do not differ. + fn common_prefix_length<'a, I>(s0: I, s1: I) -> usize + where + I: Iterator, + I: Itertools, + { + s0.zip(s1) + .peekable() + .peeking_take_while(|((_, c0), (_, c1))| c0 == c1) + .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len()) + } + + /// Trim trailing whitespace and align the two strings at their right ends. Fix the origin + /// at their right ends and, looking left, consider only the bytes up to the length of the + /// shorter string. Return the byte offset of the first differing grapheme cluster, or the + /// byte length of the shorter string if they do not differ. Also return the number of bytes + /// of whitespace trimmed from each string. + fn suffix_data<'a, I>(s0: I, s1: I) -> (usize, [usize; 2]) + where + I: DoubleEndedIterator, + I: Itertools, + { + let mut s0 = s0.rev().peekable(); + let mut s1 = s1.rev().peekable(); + + let is_whitespace = |(_, c): &(usize, &str)| *c == " " || *c == "\n"; + let n0 = (&mut s0).peeking_take_while(is_whitespace).count(); + let n1 = (&mut s1).peeking_take_while(is_whitespace).count(); -impl Edit { - // TODO: exclude leading whitespace in this calculation - fn appears_genuine(&self, similarity_threshold: f64) -> bool { - ((self.change_end - self.change_begin) as f64 / self.string_length as f64) - < similarity_threshold + (LinePair::common_prefix_length(s0, s1), [n0, n1]) + } + } + + #[cfg(test)] + mod tests { + use super::*; + + fn common_prefix_length(s1: &str, s2: &str) -> usize { + super::LinePair::common_prefix_length( + s1.grapheme_indices(true), + s2.grapheme_indices(true), + ) + } + + fn common_suffix_length(s1: &str, s2: &str) -> usize { + super::LinePair::suffix_data(s1.grapheme_indices(true), s2.grapheme_indices(true)).0 + } + + #[test] + fn test_common_prefix_length() { + assert_eq!(common_prefix_length("", ""), 0); + assert_eq!(common_prefix_length("", "a"), 0); + assert_eq!(common_prefix_length("a", ""), 0); + assert_eq!(common_prefix_length("a", "b"), 0); + assert_eq!(common_prefix_length("a", "a"), 1); + assert_eq!(common_prefix_length("a", "ab"), 1); + assert_eq!(common_prefix_length("ab", "a"), 1); + assert_eq!(common_prefix_length("ab", "aba"), 2); + assert_eq!(common_prefix_length("aba", "ab"), 2); + } + + #[test] + fn test_common_prefix_length_with_leading_whitespace() { + assert_eq!(common_prefix_length(" ", ""), 0); + assert_eq!(common_prefix_length(" ", " "), 1); + assert_eq!(common_prefix_length(" a", " a"), 2); + assert_eq!(common_prefix_length(" a", "a"), 0); + } + + #[test] + fn test_common_suffix_length() { + assert_eq!(common_suffix_length("", ""), 0); + assert_eq!(common_suffix_length("", "a"), 0); + assert_eq!(common_suffix_length("a", ""), 0); + assert_eq!(common_suffix_length("a", "b"), 0); + assert_eq!(common_suffix_length("a", "a"), 1); + assert_eq!(common_suffix_length("a", "ab"), 0); + assert_eq!(common_suffix_length("ab", "a"), 0); + assert_eq!(common_suffix_length("ab", "b"), 1); + assert_eq!(common_suffix_length("ab", "aab"), 2); + assert_eq!(common_suffix_length("aba", "ba"), 2); + } + + #[test] + fn test_common_suffix_length_with_trailing_whitespace() { + assert_eq!(common_suffix_length("", " "), 0); + assert_eq!(common_suffix_length(" ", "a"), 0); + assert_eq!(common_suffix_length("a ", ""), 0); + assert_eq!(common_suffix_length("a", "b "), 0); + assert_eq!(common_suffix_length("a", "a "), 1); + assert_eq!(common_suffix_length("a ", "ab "), 0); + assert_eq!(common_suffix_length("ab", "a "), 0); + assert_eq!(common_suffix_length("ab ", "b "), 1); + assert_eq!(common_suffix_length("ab ", "aab "), 2); + assert_eq!(common_suffix_length("aba ", "ba"), 2); + } + + #[test] + fn test_common_suffix_length_with_trailing_whitespace_nonascii() { + assert_eq!(common_suffix_length(" ", "á"), 0); + assert_eq!(common_suffix_length("á ", ""), 0); + assert_eq!(common_suffix_length("á", "b "), 0); + assert_eq!(common_suffix_length("á", "á "), "á".len()); + assert_eq!(common_suffix_length("a ", "áb "), 0); + assert_eq!(common_suffix_length("ab", "á "), 0); + assert_eq!(common_suffix_length("áb ", "b "), 1); + assert_eq!(common_suffix_length("áb ", "aáb "), 1 + "á".len()); + assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len()); + assert_eq!( + common_suffix_length("áaáabá ", "ááabá "), + 2 + 2 * "á".len() + ); + } } } @@ -127,8 +266,8 @@ mod tests { enum EditOperationTag { MinusNoop, PlusNoop, - Delete, - Insert, + Deletion, + Insertion, } use EditOperationTag::*; @@ -141,14 +280,14 @@ mod tests { &minus_lines, &plus_lines, MinusNoop, - Delete, + Deletion, PlusNoop, - Insert, + Insertion, 1.0, ); let expected_edits = ( - vec![vec![(MinusNoop, "a"), (Delete, "a"), (MinusNoop, "a\n")]], - vec![vec![(PlusNoop, "a"), (Insert, "b"), (PlusNoop, "a\n")]], + vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a\n")]], + vec![vec![(PlusNoop, "a"), (Insertion, "b"), (PlusNoop, "a\n")]], ); assert_consistent(&expected_edits); @@ -164,14 +303,14 @@ mod tests { &minus_lines, &plus_lines, MinusNoop, - Delete, + Deletion, PlusNoop, - Insert, + Insertion, 1.0, ); let expected_edits = ( - vec![vec![(MinusNoop, "á"), (Delete, "aa"), (MinusNoop, "\n")]], - vec![vec![(PlusNoop, "á"), (Insert, "áb"), (PlusNoop, "\n")]], + vec![vec![(MinusNoop, "á"), (Deletion, "aa"), (MinusNoop, "\n")]], + vec![vec![(PlusNoop, "á"), (Insertion, "áb"), (PlusNoop, "\n")]], ); assert_consistent(&expected_edits); @@ -188,20 +327,20 @@ mod tests { &minus_lines, &plus_lines, MinusNoop, - Delete, + Deletion, PlusNoop, - Insert, + Insertion, 1.0, ); let expected_edits = ( vec![vec![ (MinusNoop, "d."), - (Delete, "iter"), + (Deletion, "iter"), (MinusNoop, "items()\n"), ]], vec![vec![ (PlusNoop, "d."), - (Insert, ""), + (Insertion, ""), (PlusNoop, "items()\n"), ]], ); @@ -274,159 +413,14 @@ mod tests { fn fmt_edit(edit: EditOperationTag) -> &'static str { match edit { MinusNoop => "MinusNoop", - Delete => "Delete", + Deletion => "Deletion", PlusNoop => "PlusNoop", - Insert => "Insert", + Insertion => "Insertion", } } fn is_edit(edit: &EditOperationTag) -> bool { - *edit == Delete || *edit == Insert + *edit == Deletion || *edit == Insertion } } - -mod string_pair { - use itertools::Itertools; - use unicode_segmentation::UnicodeSegmentation; - - /// A pair of right-trimmed strings. - pub struct StringPair { - pub common_prefix_length: usize, - pub common_suffix_length: usize, - pub lengths: [usize; 2], - } - - impl StringPair { - pub fn new(s0: &str, s1: &str) -> StringPair { - let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true)); - let common_prefix_length = StringPair::common_prefix_length(g0, g1); - // TODO: Don't compute grapheme segmentation twice? - let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true)); - let (common_suffix_length, trailing_whitespace) = StringPair::suffix_data(g0, g1); - StringPair { - common_prefix_length, - common_suffix_length, - lengths: [ - s0.len() - trailing_whitespace[0], - s1.len() - trailing_whitespace[1], - ], - } - } - - /// Align the two strings at their left ends and consider only - /// the bytes up to the length of the shorter string. Return - /// the byte offset of the first differing grapheme cluster, - /// or the byte length of shorter string if they do not - /// differ. - fn common_prefix_length<'a, I>(s0: I, s1: I) -> usize - where - I: Iterator, - I: Itertools, - { - s0.zip(s1) - .peekable() - .peeking_take_while(|((_, c0), (_, c1))| c0 == c1) - .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len()) - } - - /// Trim trailing whitespace and align the two strings at - /// their right ends. Fix the origin at their right ends and, - /// looking left, consider only the bytes up to the length of - /// the shorter string. Return the byte offset of the first - /// differing grapheme cluster, or the byte length of the - /// shorter string if they do not differ. Also return the - /// number of bytes of whitespace trimmed from each string. - fn suffix_data<'a, I>(s0: I, s1: I) -> (usize, [usize; 2]) - where - I: DoubleEndedIterator, - I: Itertools, - { - let mut s0 = s0.rev().peekable(); - let mut s1 = s1.rev().peekable(); - - let is_whitespace = |(_, c): &(usize, &str)| *c == " " || *c == "\n"; - let n0 = (&mut s0).peeking_take_while(is_whitespace).count(); - let n1 = (&mut s1).peeking_take_while(is_whitespace).count(); - - (StringPair::common_prefix_length(s0, s1), [n0, n1]) - } - } - - #[cfg(test)] - mod tests { - fn common_prefix_length(s1: &str, s2: &str) -> usize { - super::StringPair::new(s1, s2).common_prefix_length - } - - fn common_suffix_length(s1: &str, s2: &str) -> usize { - super::StringPair::new(s1, s2).common_suffix_length - } - - #[test] - fn test_common_prefix_length() { - assert_eq!(common_prefix_length("", ""), 0); - assert_eq!(common_prefix_length("", "a"), 0); - assert_eq!(common_prefix_length("a", ""), 0); - assert_eq!(common_prefix_length("a", "b"), 0); - assert_eq!(common_prefix_length("a", "a"), 1); - assert_eq!(common_prefix_length("a", "ab"), 1); - assert_eq!(common_prefix_length("ab", "a"), 1); - assert_eq!(common_prefix_length("ab", "aba"), 2); - assert_eq!(common_prefix_length("aba", "ab"), 2); - } - - #[test] - fn test_common_prefix_length_with_leading_whitespace() { - assert_eq!(common_prefix_length(" ", ""), 0); - assert_eq!(common_prefix_length(" ", " "), 1); - assert_eq!(common_prefix_length(" a", " a"), 2); - assert_eq!(common_prefix_length(" a", "a"), 0); - } - - #[test] - fn test_common_suffix_length() { - assert_eq!(common_suffix_length("", ""), 0); - assert_eq!(common_suffix_length("", "a"), 0); - assert_eq!(common_suffix_length("a", ""), 0); - assert_eq!(common_suffix_length("a", "b"), 0); - assert_eq!(common_suffix_length("a", "a"), 1); - assert_eq!(common_suffix_length("a", "ab"), 0); - assert_eq!(common_suffix_length("ab", "a"), 0); - assert_eq!(common_suffix_length("ab", "b"), 1); - assert_eq!(common_suffix_length("ab", "aab"), 2); - assert_eq!(common_suffix_length("aba", "ba"), 2); - } - - #[test] - fn test_common_suffix_length_with_trailing_whitespace() { - assert_eq!(common_suffix_length("", " "), 0); - assert_eq!(common_suffix_length(" ", "a"), 0); - assert_eq!(common_suffix_length("a ", ""), 0); - assert_eq!(common_suffix_length("a", "b "), 0); - assert_eq!(common_suffix_length("a", "a "), 1); - assert_eq!(common_suffix_length("a ", "ab "), 0); - assert_eq!(common_suffix_length("ab", "a "), 0); - assert_eq!(common_suffix_length("ab ", "b "), 1); - assert_eq!(common_suffix_length("ab ", "aab "), 2); - assert_eq!(common_suffix_length("aba ", "ba"), 2); - } - - #[test] - fn test_common_suffix_length_with_trailing_whitespace_nonascii() { - assert_eq!(common_suffix_length(" ", "á"), 0); - assert_eq!(common_suffix_length("á ", ""), 0); - assert_eq!(common_suffix_length("á", "b "), 0); - assert_eq!(common_suffix_length("á", "á "), "á".len()); - assert_eq!(common_suffix_length("a ", "áb "), 0); - assert_eq!(common_suffix_length("ab", "á "), 0); - assert_eq!(common_suffix_length("áb ", "b "), 1); - assert_eq!(common_suffix_length("áb ", "aáb "), 1 + "á".len()); - assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len()); - assert_eq!( - common_suffix_length("áaáabá ", "ááabá "), - 2 + 2 * "á".len() - ); - } - } -} -- cgit v1.2.3