diff options
author:    Dan Davison <dandavison7@gmail.com>   2019-08-04 00:05:19 -0700
committer: Dan Davison <dandavison7@gmail.com>   2019-08-06 23:05:52 -0700
commit:    1bb19ef8ba6af230df1148c86ccec240a03f8f89 (patch)
tree:      a5d9af5754186592bb5643ae7eface53bd1a7808 /src/edits.rs
parent:    41c43314da3cee8bbaa98c1deff0e7c45646b28d (diff)
Align tokenized string instead of characters/graphemes
Thanks @clnoll
Diffstat (limited to 'src/edits.rs')
-rw-r--r--   src/edits.rs   89
1 file changed, 76 insertions, 13 deletions
diff --git a/src/edits.rs b/src/edits.rs index 95c7191e..992b3e66 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -1,4 +1,4 @@ -use unicode_segmentation::UnicodeSegmentation; +use regex::Regex; use crate::align; @@ -32,14 +32,8 @@ where let minus_line = minus_line.trim_end(); for plus_line in &plus_lines[emitted..] { let plus_line = plus_line.trim_end(); - let alignment = align::Alignment::new( - minus_line - .grapheme_indices(true) - .collect::<Vec<(usize, &str)>>(), - plus_line - .grapheme_indices(true) - .collect::<Vec<(usize, &str)>>(), - ); + + let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line)); if alignment.normalized_edit_distance() < distance_threshold { // minus_line and plus_line are inferred to be a homologous pair. @@ -86,6 +80,14 @@ where (annotated_minus_lines, annotated_plus_lines) } +fn tokenize(line: &str) -> Vec<(usize, &str)> { + let regex = Regex::new("[^ ]*( +|$)").unwrap(); + regex + .find_iter(line) + .map(|m| (m.start(), m.as_str())) + .collect() +} + pub fn coalesce_minus_edits<'a, EditOperation>( alignment: &align::Alignment<'a>, line: &'a str, @@ -180,6 +182,38 @@ mod tests { const DISTANCE_MAX: f64 = 2.0; #[test] + fn test_tokenize_1() { + assert_eq!(tokenize("aaa bbb"), vec![(0, "aaa "), (4, "bbb")]) + } + + #[test] + fn test_tokenize_2() { + assert_eq!( + tokenize("fn coalesce_edits<'a, EditOperation>("), + vec![ + (0, "fn "), + (3, "coalesce_edits<'a, "), + (22, "EditOperation>(") + ] + ); + } + + #[test] + fn test_tokenize_3() { + assert_eq!( + tokenize("fn coalesce_edits<'a, 'b, EditOperation>("), + vec![ + (0, "fn "), + (3, "coalesce_edits<'a, "), + (22, "'b, "), + (26, "EditOperation>(") + ] + ); + } + + // vec!["fn coalesce_edits<'a, 'b, EditOperation>("], + + #[test] fn test_coalesce_edits_1() { assert_eq!( coalesce_edits( @@ -197,8 +231,20 @@ mod tests { vec!["aaa\n"], vec!["aba\n"], ( - vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a")]], - vec![vec![(PlusNoop, "a"), (Insertion, "b"), 
(PlusNoop, "a")]], + vec![vec![(Deletion, "aaa")]], + vec![vec![(Insertion, "aba")]], + ), + ) + } + + #[test] + fn test_infer_edits_1_2() { + assert_paired_edits( + vec!["aaa ccc\n"], + vec!["aba ccc\n"], + ( + vec![vec![(Deletion, "aaa "), (MinusNoop, "ccc")]], + vec![vec![(Insertion, "aba "), (PlusNoop, "ccc")]], ), ) } @@ -209,8 +255,8 @@ mod tests { vec!["áaa\n"], vec!["ááb\n"], ( - vec![vec![(MinusNoop, "á"), (Deletion, "aa")]], - vec![vec![(PlusNoop, "á"), (Insertion, "áb")]], + vec![vec![(Deletion, "áaa")]], + vec![vec![(Insertion, "ááb")]], ), ) } @@ -297,6 +343,23 @@ mod tests { ], 0.66) } + #[test] + fn test_infer_edits_7() { + assert_edits( + vec!["fn coalesce_edits<'a, EditOperation>("], + vec!["fn coalesce_edits<'a, 'b, EditOperation>("], + ( + vec![vec![(MinusNoop, "fn coalesce_edits<'a, EditOperation>(")]], + vec![vec![ + (PlusNoop, "fn coalesce_edits<'a, "), + (Insertion, "'b, "), + (PlusNoop, "EditOperation>("), + ]], + ), + 0.66, + ) + } + fn assert_edits( minus_lines: Vec<&str>, plus_lines: Vec<&str>, |