author     Dan Davison <dandavison7@gmail.com>    2019-07-26 19:21:16 -0400
committer  Dan Davison <dandavison7@gmail.com>    2019-07-27 13:12:32 -0400
commit     5c6d42d494ebb1573a9c5eb67d032d348fb29b72 (patch)
tree       5d0b1503c7182dcbececaac8cd7f3ee9fb138a6b /src/edits.rs
parent     a598a92700233130b56677c1269b6171eeb73ef7 (diff)
Refactor: edits
Diffstat (limited to 'src/edits.rs')
-rw-r--r--  src/edits.rs  460
1 file changed, 227 insertions, 233 deletions
diff --git a/src/edits.rs b/src/edits.rs
index 1793fea9..2695cf77 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,16 +1,13 @@
-use std::cmp::{max, min};
+use crate::edits::line_pair::LinePair;
-use crate::edits::string_pair::StringPair;
-
-/// Infer the edit operations responsible for the differences between
-/// a collection of old and new lines.
+/// Infer the edit operations responsible for the differences between a collection of old and new
+/// lines.
/*
- This function is called iff a region of n minus lines followed
- by n plus lines is encountered, e.g. n successive lines have
- been partially changed.
+ This function is called iff a region of n minus lines followed by n plus lines is encountered,
+ e.g. n successive lines have been partially changed.
- Consider the i-th such line and let m, p be the i-th minus and
- i-th plus line, respectively. The following cases exist:
+ Consider the i-th such line and let m, p be the i-th minus and i-th plus line, respectively. The
+ following cases exist:
1. Whitespace deleted at line beginning.
=> The deleted section is highlighted in m; p is unstyled.
@@ -27,17 +24,16 @@ use crate::edits::string_pair::StringPair;
5. An internal section of the line containing a non-whitespace character has been inserted.
=> The inserted section is highlighted in p; m is unstyled.
- Note that whitespace can be neither deleted nor inserted at the
- end of the line: the line by definition has no trailing
- whitespace.
+ Note that whitespace can be neither deleted nor inserted at the end of the line: the line by
+ definition has no trailing whitespace.
*/
pub fn infer_edit_sections<'m, 'p, EditOperationTag>(
minus_lines: &'m Vec<String>,
plus_lines: &'p Vec<String>,
- minus_line_noop: EditOperationTag,
- delete: EditOperationTag,
- plus_line_noop: EditOperationTag,
- insert: EditOperationTag,
+ non_deletion: EditOperationTag,
+ deletion: EditOperationTag,
+ non_insertion: EditOperationTag,
+ insertion: EditOperationTag,
similarity_threshold: f64,
) -> (
Vec<Vec<(EditOperationTag, &'m str)>>,
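As an aside, a minimal usage sketch of the refactored infer_edit_sections, modeled on the test module later in this file; the Tag enum and its derives are illustrative assumptions, not part of this commit (the function's where clause is assumed to require Copy on the tag type, as the repeated tag arguments suggest):

    // Illustrative tag type, mirroring the EditOperationTag enum used in the tests below.
    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Tag {
        MinusNoop,
        Deletion,
        PlusNoop,
        Insertion,
    }

    let minus_lines = vec!["aaa\n".to_string()];
    let plus_lines = vec!["aba\n".to_string()];
    let (minus_sections, plus_sections) = infer_edit_sections(
        &minus_lines,
        &plus_lines,
        Tag::MinusNoop, // non_deletion
        Tag::Deletion,  // deletion
        Tag::PlusNoop,  // non_insertion
        Tag::Insertion, // insertion
        1.0,            // similarity_threshold
    );
    // The common prefix "a" and common suffix "a\n" are left unhighlighted; only the
    // differing middle grapheme is tagged as a deletion/insertion.
    assert_eq!(
        minus_sections,
        vec![vec![(Tag::MinusNoop, "a"), (Tag::Deletion, "a"), (Tag::MinusNoop, "a\n")]]
    );
    assert_eq!(
        plus_sections,
        vec![vec![(Tag::PlusNoop, "a"), (Tag::Insertion, "b"), (Tag::PlusNoop, "a\n")]]
    );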
@@ -50,72 +46,215 @@ where
let mut plus_line_sections = Vec::new();
for (minus, plus) in minus_lines.iter().zip(plus_lines.iter()) {
- let string_pair = StringPair::new(minus, plus);
-
- // We require that (right-trimmed length) >= (common prefix length). Consider:
- // minus = "a "
- // plus = "a b "
- // Here, the right-trimmed length of minus is 1, yet the common prefix length is
- // 2. We resolve this by taking the following maxima:
- let minus_length = max(string_pair.lengths[0], string_pair.common_prefix_length);
- let plus_length = max(string_pair.lengths[1], string_pair.common_prefix_length);
-
- // Work backwards from the end of the strings. The end of the
- // change region is equal to the start of their common
- // suffix. To find the start of the change region, start with
- // the end of their common prefix, and then move leftwards
- // until it is before the start of the common suffix in both
- // strings.
- let minus_change_end = minus_length - string_pair.common_suffix_length;
- let plus_change_end = plus_length - string_pair.common_suffix_length;
- let change_begin = min(
- string_pair.common_prefix_length,
- min(minus_change_end, plus_change_end),
- );
+ let line_pair = LinePair::new(minus, plus);
- let minus_edit = Edit {
- change_begin,
- change_end: minus_change_end,
- string_length: minus_length,
- };
- let plus_edit = Edit {
- change_begin,
- change_end: plus_change_end,
- string_length: plus_length,
- };
-
- if minus_edit.appears_genuine(similarity_threshold)
- && plus_edit.appears_genuine(similarity_threshold)
- {
+ if line_pair.is_homologous_pair(similarity_threshold) {
+ let (minus_edit, plus_edit) = (line_pair.minus_edit, line_pair.plus_edit);
minus_line_sections.push(vec![
- (minus_line_noop, &minus[0..change_begin]),
- (delete, &minus[change_begin..minus_change_end]),
- (minus_line_noop, &minus[minus_change_end..]),
+ (non_deletion, &minus[0..minus_edit.start]),
+ (deletion, &minus[minus_edit.start..minus_edit.end]),
+ (non_deletion, &minus[minus_edit.end..]),
]);
plus_line_sections.push(vec![
- (plus_line_noop, &plus[0..change_begin]),
- (insert, &plus[change_begin..plus_change_end]),
- (plus_line_noop, &plus[plus_change_end..]),
+ (non_insertion, &plus[0..plus_edit.start]),
+ (insertion, &plus[plus_edit.start..plus_edit.end]),
+ (non_insertion, &plus[plus_edit.end..]),
]);
} else {
- minus_line_sections.push(vec![(minus_line_noop, minus)]);
- plus_line_sections.push(vec![(plus_line_noop, plus)]);
+ minus_line_sections.push(vec![(non_deletion, minus)]);
+ plus_line_sections.push(vec![(non_insertion, plus)]);
}
}
(minus_line_sections, plus_line_sections)
}
-struct Edit {
- change_begin: usize,
- change_end: usize,
- string_length: usize,
-}
+mod line_pair {
+ use std::cmp::{max, min};
+
+ use itertools::Itertools;
+ use unicode_segmentation::UnicodeSegmentation;
+
+ /// A pair of right-trimmed strings.
+ pub struct LinePair {
+ pub minus_edit: Edit,
+ pub plus_edit: Edit,
+ }
+
+ pub struct Edit {
+ pub start: usize,
+ pub end: usize,
+ string_length: usize,
+ }
+
+ impl Edit {
+ // TODO: exclude leading whitespace in this calculation
+ fn appears_genuine(&self, similarity_threshold: f64) -> bool {
+ ((self.end - self.start) as f64 / self.string_length as f64) < similarity_threshold
+ }
+ }
+
+ impl LinePair {
+ pub fn new(s0: &str, s1: &str) -> LinePair {
+ let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
+ let common_prefix_length = LinePair::common_prefix_length(g0, g1);
+ // TODO: Don't compute grapheme segmentation twice?
+ let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
+ let (common_suffix_length, trailing_whitespace) = LinePair::suffix_data(g0, g1);
+ let lengths = [
+ s0.len() - trailing_whitespace[0],
+ s1.len() - trailing_whitespace[1],
+ ];
+
+ // We require that (right-trimmed length) >= (common prefix length). Consider:
+ // minus = "a "
+ // plus = "a b "
+ // Here, the right-trimmed length of minus is 1, yet the common prefix length is 2. We
+ // resolve this by taking the following maxima:
+ let minus_length = max(lengths[0], common_prefix_length);
+ let plus_length = max(lengths[1], common_prefix_length);
+
+ // Work backwards from the end of the strings. The end of the change region is equal to
+ // the start of their common suffix. To find the start of the change region, start with
+ // the end of their common prefix, and then move leftwards until it is before the start
+ // of the common suffix in both strings.
+ let minus_change_end = minus_length - common_suffix_length;
+ let plus_change_end = plus_length - common_suffix_length;
+ let change_begin = min(common_prefix_length, min(minus_change_end, plus_change_end));
+
+ LinePair {
+ minus_edit: Edit {
+ start: change_begin,
+ end: minus_change_end,
+ string_length: minus_length,
+ },
+ plus_edit: Edit {
+ start: change_begin,
+ end: plus_change_end,
+ string_length: plus_length,
+ },
+ }
+ }
+
+ pub fn is_homologous_pair(&self, similarity_threshold: f64) -> bool {
+ self.minus_edit.appears_genuine(similarity_threshold)
+ && self.plus_edit.appears_genuine(similarity_threshold)
+ }
+
+ /// Align the two strings at their left ends and consider only the bytes up to the length of
+ /// the shorter string. Return the byte offset of the first differing grapheme cluster, or
+ /// the byte length of shorter string if they do not differ.
+ fn common_prefix_length<'a, I>(s0: I, s1: I) -> usize
+ where
+ I: Iterator<Item = (usize, &'a str)>,
+ I: Itertools,
+ {
+ s0.zip(s1)
+ .peekable()
+ .peeking_take_while(|((_, c0), (_, c1))| c0 == c1)
+ .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len())
+ }
+
+ /// Trim trailing whitespace and align the two strings at their right ends. Fix the origin
+ /// at their right ends and, looking left, consider only the bytes up to the length of the
+ /// shorter string. Return the byte offset of the first differing grapheme cluster, or the
+ /// byte length of the shorter string if they do not differ. Also return the number of bytes
+ /// of whitespace trimmed from each string.
+ fn suffix_data<'a, I>(s0: I, s1: I) -> (usize, [usize; 2])
+ where
+ I: DoubleEndedIterator<Item = (usize, &'a str)>,
+ I: Itertools,
+ {
+ let mut s0 = s0.rev().peekable();
+ let mut s1 = s1.rev().peekable();
+
+ let is_whitespace = |(_, c): &(usize, &str)| *c == " " || *c == "\n";
+ let n0 = (&mut s0).peeking_take_while(is_whitespace).count();
+ let n1 = (&mut s1).peeking_take_while(is_whitespace).count();
-impl Edit {
- // TODO: exclude leading whitespace in this calculation
- fn appears_genuine(&self, similarity_threshold: f64) -> bool {
- ((self.change_end - self.change_begin) as f64 / self.string_length as f64)
- < similarity_threshold
+ (LinePair::common_prefix_length(s0, s1), [n0, n1])
+ }
+ }
+
+ #[cfg(test)]
+ mod tests {
+ use super::*;
+
+ fn common_prefix_length(s1: &str, s2: &str) -> usize {
+ super::LinePair::common_prefix_length(
+ s1.grapheme_indices(true),
+ s2.grapheme_indices(true),
+ )
+ }
+
+ fn common_suffix_length(s1: &str, s2: &str) -> usize {
+ super::LinePair::suffix_data(s1.grapheme_indices(true), s2.grapheme_indices(true)).0
+ }
+
+ #[test]
+ fn test_common_prefix_length() {
+ assert_eq!(common_prefix_length("", ""), 0);
+ assert_eq!(common_prefix_length("", "a"), 0);
+ assert_eq!(common_prefix_length("a", ""), 0);
+ assert_eq!(common_prefix_length("a", "b"), 0);
+ assert_eq!(common_prefix_length("a", "a"), 1);
+ assert_eq!(common_prefix_length("a", "ab"), 1);
+ assert_eq!(common_prefix_length("ab", "a"), 1);
+ assert_eq!(common_prefix_length("ab", "aba"), 2);
+ assert_eq!(common_prefix_length("aba", "ab"), 2);
+ }
+
+ #[test]
+ fn test_common_prefix_length_with_leading_whitespace() {
+ assert_eq!(common_prefix_length(" ", ""), 0);
+ assert_eq!(common_prefix_length(" ", " "), 1);
+ assert_eq!(common_prefix_length(" a", " a"), 2);
+ assert_eq!(common_prefix_length(" a", "a"), 0);
+ }
+
+ #[test]
+ fn test_common_suffix_length() {
+ assert_eq!(common_suffix_length("", ""), 0);
+ assert_eq!(common_suffix_length("", "a"), 0);
+ assert_eq!(common_suffix_length("a", ""), 0);
+ assert_eq!(common_suffix_length("a", "b"), 0);
+ assert_eq!(common_suffix_length("a", "a"), 1);
+ assert_eq!(common_suffix_length("a", "ab"), 0);
+ assert_eq!(common_suffix_length("ab", "a"), 0);
+ assert_eq!(common_suffix_length("ab", "b"), 1);
+ assert_eq!(common_suffix_length("ab", "aab"), 2);
+ assert_eq!(common_suffix_length("aba", "ba"), 2);
+ }
+
+ #[test]
+ fn test_common_suffix_length_with_trailing_whitespace() {
+ assert_eq!(common_suffix_length("", " "), 0);
+ assert_eq!(common_suffix_length(" ", "a"), 0);
+ assert_eq!(common_suffix_length("a ", ""), 0);
+ assert_eq!(common_suffix_length("a", "b "), 0);
+ assert_eq!(common_suffix_length("a", "a "), 1);
+ assert_eq!(common_suffix_length("a ", "ab "), 0);
+ assert_eq!(common_suffix_length("ab", "a "), 0);
+ assert_eq!(common_suffix_length("ab ", "b "), 1);
+ assert_eq!(common_suffix_length("ab ", "aab "), 2);
+ assert_eq!(common_suffix_length("aba ", "ba"), 2);
+ }
+
+ #[test]
+ fn test_common_suffix_length_with_trailing_whitespace_nonascii() {
+ assert_eq!(common_suffix_length(" ", "á"), 0);
+ assert_eq!(common_suffix_length("á ", ""), 0);
+ assert_eq!(common_suffix_length("á", "b "), 0);
+ assert_eq!(common_suffix_length("á", "á "), "á".len());
+ assert_eq!(common_suffix_length("a ", "áb "), 0);
+ assert_eq!(common_suffix_length("ab", "á "), 0);
+ assert_eq!(common_suffix_length("áb ", "b "), 1);
+ assert_eq!(common_suffix_length("áb ", "aáb "), 1 + "á".len());
+ assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len());
+ assert_eq!(
+ common_suffix_length("áaáabá ", "ááabá "),
+ 2 + 2 * "á".len()
+ );
+ }
}
}
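For orientation, a small sketch (not part of the commit) of what LinePair::new computes for the "a " / "a b " pair discussed in the comment above, assuming it is exercised from within the line_pair module, or via line_pair::LinePair from its parent, where the public Edit fields are visible:

    let pair = LinePair::new("a ", "a b ");
    // Right-trimmed lengths are 1 and 3; the common prefix "a " has length 2, so the
    // minus length is clamped up to max(1, 2) = 2, and there is no common suffix.
    assert_eq!((pair.minus_edit.start, pair.minus_edit.end), (2, 2));
    assert_eq!((pair.plus_edit.start, pair.plus_edit.end), (2, 3));
    // appears_genuine compares (end - start) / string_length with the threshold:
    // 0/2 and 1/3 are both below 1.0, so this pair is treated as homologous.
    assert!(pair.is_homologous_pair(1.0));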
@@ -127,8 +266,8 @@ mod tests {
enum EditOperationTag {
MinusNoop,
PlusNoop,
- Delete,
- Insert,
+ Deletion,
+ Insertion,
}
use EditOperationTag::*;
@@ -141,14 +280,14 @@ mod tests {
&minus_lines,
&plus_lines,
MinusNoop,
- Delete,
+ Deletion,
PlusNoop,
- Insert,
+ Insertion,
1.0,
);
let expected_edits = (
- vec![vec![(MinusNoop, "a"), (Delete, "a"), (MinusNoop, "a\n")]],
- vec![vec![(PlusNoop, "a"), (Insert, "b"), (PlusNoop, "a\n")]],
+ vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a\n")]],
+ vec![vec![(PlusNoop, "a"), (Insertion, "b"), (PlusNoop, "a\n")]],
);
assert_consistent(&expected_edits);
@@ -164,14 +303,14 @@ mod tests {
&minus_lines,
&plus_lines,
MinusNoop,
- Delete,
+ Deletion,
PlusNoop,
- Insert,
+ Insertion,
1.0,
);
let expected_edits = (
- vec![vec![(MinusNoop, "á"), (Delete, "aa"), (MinusNoop, "\n")]],
- vec![vec![(PlusNoop, "á"), (Insert, "áb"), (PlusNoop, "\n")]],
+ vec![vec![(MinusNoop, "á"), (Deletion, "aa"), (MinusNoop, "\n")]],
+ vec![vec![(PlusNoop, "á"), (Insertion, "áb"), (PlusNoop, "\n")]],
);
assert_consistent(&expected_edits);
@@ -188,20 +327,20 @@ mod tests {
&minus_lines,
&plus_lines,
MinusNoop,
- Delete,
+ Deletion,
PlusNoop,
- Insert,
+ Insertion,
1.0,
);
let expected_edits = (
vec![vec![
(MinusNoop, "d."),
- (Delete, "iter"),
+ (Deletion, "iter"),
(MinusNoop, "items()\n"),
]],
vec![vec![
(PlusNoop, "d."),
- (Insert, ""),
+ (Insertion, ""),
(PlusNoop, "items()\n"),
]],
);
@@ -274,159 +413,14 @@ mod tests {
fn fmt_edit(edit: EditOperationTag) -> &'static str {
match edit {
MinusNoop => "MinusNoop",
- Delete => "Delete",
+ Deletion => "Deletion",
PlusNoop => "PlusNoop",
- Insert => "Insert",
+ Insertion => "Insertion",
}
}
fn is_edit(edit: &EditOperationTag) -> bool {
- *edit == Delete || *edit == Insert
+ *edit == Deletion || *edit == Insertion
}
}
-
-mod string_pair {
- use itertools::Itertools;
- use unicode_segmentation::UnicodeSegmentation;
-
- /// A pair of right-trimmed strings.
- pub struct StringPair {
- pub common_prefix_length: usize,
- pub common_suffix_length: usize,
- pub lengths: [usize; 2],
- }
-
- impl StringPair {
- pub fn new(s0: &str, s1: &str) -> StringPair {
- let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
- let common_prefix_length = StringPair::common_prefix_length(g0, g1);
- // TODO: Don't compute grapheme segmentation twice?
- let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
- let (common_suffix_length, trailing_whitespace) = StringPair::suffix_data(g0, g1);
- StringPair {
- common_prefix_length,
- common_suffix_length,
- lengths: [
- s0.len() - trailing_whitespace[0],
- s1.len() - trailing_whitespace[1],
- ],
- }
- }
-
- /// Align the two strings at their left ends and consider only
- /// the bytes up to the length of the shorter string. Return
- /// the byte offset of the first differing grapheme cluster,
- /// or the byte length of shorter string if they do not
- /// differ.
- fn common_prefix_length<'a, I>(s0: I, s1: I) -> usize
- where
- I: Iterator<Item = (usize, &'a str)>,
- I: Itertools,
- {
- s0.zip(s1)
- .peekable()
- .peeking_take_while(|((_, c0), (_, c1))| c0 == c1)
- .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len())
- }
-
- /// Trim trailing whitespace and align the two strings at
- /// their right ends. Fix the origin at their right ends and,
- /// looking left, consider only the bytes up to the length of
- /// the shorter string. Return the byte offset of the first
- /// differing grapheme cluster, or the byte length of the
- /// shorter string if they do not differ. Also return the
- /// number of bytes of whitespace trimmed from each string.
- fn suffix_data<'a, I>(s0: I, s1: I) -> (usize, [usize; 2])
- where
- I: DoubleEndedIterator<Item = (usize, &'a str)>,
- I: Itertools,
- {
- let mut s0 = s0.rev().peekable();
- let mut s1 = s1.rev().peekable();
-
- let is_whitespace = |(_, c): &(usize, &str)| *c == " " || *c == "\n";
- let n0 = (&mut s0).peeking_take_while(is_whitespace).count();
- let n1 = (&mut s1).peeking_take_while(is_whitespace).count();
-
- (StringPair::common_prefix_length(s0, s1), [n0, n1])
- }
- }
-
- #[cfg(test)]
- mod tests {
- fn common_prefix_length(s1: &str, s2: &str) -> usize {
- super::StringPair::new(s1, s2).common_prefix_length
- }
-
- fn common_suffix_length(s1: &str, s2: &str) -> usize {
- super::StringPair::new(s1, s2).common_suffix_length
- }
-
- #[test]
- fn test_common_prefix_length() {
- assert_eq!(common_prefix_length("", ""), 0);
- assert_eq!(common_prefix_length("", "a"), 0);
- assert_eq!(common_prefix_length("a", ""), 0);
- assert_eq!(common_prefix_length("a", "b"), 0);
- assert_eq!(common_prefix_length("a", "a"), 1);
- assert_eq!(common_prefix_length("a", "ab"), 1);
- assert_eq!(common_prefix_length("ab", "a"), 1);
- assert_eq!(common_prefix_length("ab", "aba"), 2);
- assert_eq!(common_prefix_length("aba", "ab"), 2);
- }
-
- #[test]
- fn test_common_prefix_length_with_leading_whitespace() {
- assert_eq!(common_prefix_length(" ", ""), 0);
- assert_eq!(common_prefix_length(" ", " "), 1);
- assert_eq!(common_prefix_length(" a", " a"), 2);
- assert_eq!(common_prefix_length(" a", "a"), 0);
- }
-
- #[test]
- fn test_common_suffix_length() {
- assert_eq!(common_suffix_length("", ""), 0);
- assert_eq!(common_suffix_length("", "a"), 0);
- assert_eq!(common_suffix_length("a", ""), 0);
- assert_eq!(common_suffix_length("a", "b"), 0);
- assert_eq!(common_suffix_length("a", "a"), 1);
- assert_eq!(common_suffix_length("a", "ab"), 0);
- assert_eq!(common_suffix_length("ab", "a"), 0);
- assert_eq!(common_suffix_length("ab", "b"), 1);
- assert_eq!(common_suffix_length("ab", "aab"), 2);
- assert_eq!(common_suffix_length("aba", "ba"), 2);
- }
-
- #[test]
- fn test_common_suffix_length_with_trailing_whitespace() {
- assert_eq!(common_suffix_length("", " "), 0);
- assert_eq!(common_suffix_length(" ", "a"), 0);
- assert_eq!(common_suffix_length("a ", ""), 0);
- assert_eq!(common_suffix_length("a", "b "), 0);
- assert_eq!(common_suffix_length("a", "a "), 1);
- assert_eq!(common_suffix_length("a ", "ab "), 0);
- assert_eq!(common_suffix_length("ab", "a "), 0);
- assert_eq!(common_suffix_length("ab ", "b "), 1);
- assert_eq!(common_suffix_length("ab ", "aab "), 2);
- assert_eq!(common_suffix_length("aba ", "ba"), 2);
- }
-
- #[test]
- fn test_common_suffix_length_with_trailing_whitespace_nonascii() {
- assert_eq!(common_suffix_length(" ", "á"), 0);
- assert_eq!(common_suffix_length("á ", ""), 0);
- assert_eq!(common_suffix_length("á", "b "), 0);
- assert_eq!(common_suffix_length("á", "á "), "á".len());
- assert_eq!(common_suffix_length("a ", "áb "), 0);
- assert_eq!(common_suffix_length("ab", "á "), 0);
- assert_eq!(common_suffix_length("áb ", "b "), 1);
- assert_eq!(common_suffix_length("áb ", "aáb "), 1 + "á".len());
- assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len());
- assert_eq!(
- common_suffix_length("áaáabá ", "ááabá "),
- 2 + 2 * "á".len()
- );
- }
- }
-}
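A closing sketch, not part of the commit, of the grapheme-cluster helpers that line_pair retains from the removed string_pair module. Offsets are byte offsets but comparison is per grapheme cluster; the calls assume the same in-module access that the tests above use, since both helpers are private to the module:

    use unicode_segmentation::UnicodeSegmentation;

    // One multi-byte grapheme in common: the prefix length is "á".len() = 2 bytes.
    let prefix = LinePair::common_prefix_length(
        "áb".grapheme_indices(true),
        "ác".grapheme_indices(true),
    );
    assert_eq!(prefix, "á".len());

    // suffix_data trims trailing whitespace first and also reports how many bytes
    // of whitespace were trimmed from each string.
    let (suffix, trimmed) = LinePair::suffix_data(
        "áb ".grapheme_indices(true),
        "b  ".grapheme_indices(true),
    );
    assert_eq!(suffix, 1);       // common suffix "b"
    assert_eq!(trimmed, [1, 2]); // trailing whitespace bytes trimmed from each string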