diff options
author | Dan Davison <dandavison7@gmail.com> | 2020-06-11 20:50:09 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-11 20:50:09 -0400 |
commit | 1b3499e7a8e41853b0f40edb8f115c4ba5b1e6d1 (patch) | |
tree | 1905558d19979de62b6b5b45b7a2ba2825cbb2f1 /src/edits.rs | |
parent | f469492bb63763bf54f99559ad40138d3f8e24fd (diff) | |
parent | 97116f284bd826cd0c2805ed3b0f4359b310ad6a (diff) |
Merge pull request #220 from dandavison/word-diff-regexp
New option --word-diff-regex
Diffstat (limited to 'src/edits.rs')
-rw-r--r-- | src/edits.rs | 188 |
1 files changed, 158 insertions, 30 deletions
diff --git a/src/edits.rs b/src/edits.rs index 7b6e4674..d8f18377 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -1,6 +1,6 @@ use regex::Regex; -use lazy_static::lazy_static; +use unicode_segmentation::UnicodeSegmentation; use unicode_width::UnicodeWidthStr; use crate::align; @@ -16,6 +16,7 @@ pub fn infer_edits<'a, EditOperation>( deletion: EditOperation, noop_insertion: EditOperation, insertion: EditOperation, + tokenization_regex: &Regex, max_line_distance: f64, max_line_distance_for_naively_paired_lines: f64, ) -> ( @@ -34,7 +35,10 @@ where 'minus_lines_loop: for minus_line in minus_lines { let mut considered = 0; // plus lines considered so far as match for minus_line for plus_line in &plus_lines[emitted..] { - let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line)); + let alignment = align::Alignment::new( + tokenize(minus_line, tokenization_regex), + tokenize(plus_line, tokenization_regex), + ); let (annotated_minus_line, annotated_plus_line, distance) = annotate( alignment, noop_deletion, @@ -76,25 +80,29 @@ where (annotated_minus_lines, annotated_plus_lines) } -lazy_static! { - static ref TOKENIZATION_REGEXP: Regex = Regex::new(r#"[\t ,;.:()\[\]<>/'"-]+"#).unwrap(); -} - /// Split line into tokens for alignment. The alignment algorithm aligns sequences of substrings; /// not individual characters. -fn tokenize(line: &str) -> Vec<&str> { +fn tokenize<'a>(line: &'a str, regex: &Regex) -> Vec<&'a str> { let mut tokens = Vec::new(); let mut offset = 0; - for m in TOKENIZATION_REGEXP.find_iter(line) { - tokens.push(&line[offset..m.start()]); + for m in regex.find_iter(line) { + if offset == 0 && m.start() > 0 { + tokens.push(""); + } // Align separating text as multiple single-character tokens. - for i in m.start()..m.end() { - tokens.push(&line[i..i + 1]); + for t in line[offset..m.start()].graphemes(true) { + tokens.push(t); } + tokens.push(&line[m.start()..m.end()]); offset = m.end(); } if offset < line.len() { - tokens.push(&line[offset..line.len()]); + if offset == 0 { + tokens.push(""); + } + for t in line[offset..line.len()].graphemes(true) { + tokens.push(t); + } } tokens } @@ -226,8 +234,14 @@ where #[cfg(test)] mod tests { use super::*; + use itertools::Itertools; + use lazy_static::lazy_static; use unicode_segmentation::UnicodeSegmentation; + lazy_static! { + static ref DEFAULT_TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap(); + } + #[derive(Clone, Copy, Debug, PartialEq)] enum EditOperation { MinusNoop, @@ -244,15 +258,26 @@ mod tests { use EditOperation::*; #[test] + fn test_tokenize_0() { + assert_tokenize("", &[]); + assert_tokenize(";", &["", ";"]); + assert_tokenize(";;", &["", ";", ";"]); + assert_tokenize(";;a", &["", ";", ";", "a"]); + assert_tokenize(";;ab", &["", ";", ";", "ab"]); + assert_tokenize(";;ab;", &["", ";", ";", "ab", ";"]); + assert_tokenize(";;ab;;", &["", ";", ";", "ab", ";", ";"]); + } + + #[test] fn test_tokenize_1() { - assert_eq!(tokenize("aaa bbb"), vec!["aaa", " ", "bbb"]) + assert_tokenize("aaa bbb", &["aaa", " ", "bbb"]) } #[test] fn test_tokenize_2() { - assert_eq!( - tokenize("fn coalesce_edits<'a, EditOperation>("), - vec![ + assert_tokenize( + "fn coalesce_edits<'a, EditOperation>(", + &[ "fn", " ", "coalesce_edits", @@ -263,16 +288,16 @@ mod tests { " ", "EditOperation", ">", - "(" - ] + "(", + ], ); } #[test] fn test_tokenize_3() { - assert_eq!( - tokenize("fn coalesce_edits<'a, 'b, EditOperation>("), - vec![ + assert_tokenize( + "fn coalesce_edits<'a, 'b, EditOperation>(", + &[ "fn", " ", "coalesce_edits", @@ -287,21 +312,22 @@ mod tests { " ", "EditOperation", ">", - "(" - ] + "(", + ], ); } #[test] fn test_tokenize_4() { - assert_eq!( - tokenize("annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);"), - vec![ + assert_tokenize( + "annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);", + &[ "annotated_plus_lines", ".", "push", "(", - "vec!", + "vec", + "!", "[", "(", "noop_insertion", @@ -311,12 +337,112 @@ mod tests { ")", "]", ")", - ";" - ] + ";", + ], ); } #[test] + fn test_tokenize_5() { + assert_tokenize( + " let col = Color::from_str(s).unwrap_or_else(|_| die());", + &[ + "", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "let", + " ", + "col", + " ", + "=", + " ", + "Color", + ":", + ":", + "from_str", + "(", + "s", + ")", + ".", + "unwrap_or_else", + "(", + "|", + "_", + "|", + " ", + "die", + "(", + ")", + ")", + ";", + ], + ) + } + + #[test] + fn test_tokenize_6() { + assert_tokenize( + " (minus_file, plus_file) => format!(\"renamed: {} ⟶ {}\", minus_file, plus_file),", + &["", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "(", + "minus_file", + ",", + " ", + "plus_file", + ")", + " ", + "=", + ">", + " ", + "format", + "!", + "(", + "\"", + "renamed", + ":", + " ", + "{", + "}", + " ", + "⟶", + " ", + " ", + "{", + "}", + "\"", + ",", + " ", + "minus_file", + ",", + " ", + "plus_file", + ")", + ","]) + } + + fn assert_tokenize(text: &str, expected_tokens: &[&str]) { + let actual_tokens = tokenize(text, &*DEFAULT_TOKENIZATION_REGEXP); + assert_eq!(text, expected_tokens.iter().join("")); + assert_eq!(actual_tokens, expected_tokens); + } + + #[test] fn test_infer_edits_1() { assert_paired_edits( vec!["aaa"], @@ -435,7 +561,7 @@ mod tests { " s0.zip(s1)", " .take_while(|((_, c0), (_, c1))| c0 == c1) // TODO: Don't consume one-past-the-end!", " .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len())" - ], 0.66) + ], 0.5) } #[test] @@ -590,9 +716,11 @@ mod tests { Deletion, PlusNoop, Insertion, + &*DEFAULT_TOKENIZATION_REGEXP, max_line_distance, 0.0, ); + // compare_annotated_lines(actual_edits, expected_edits); assert_eq!(actual_edits, expected_edits); } |