author     Dan Davison <dandavison7@gmail.com>      2020-06-11 20:50:09 -0400
committer  GitHub <noreply@github.com>              2020-06-11 20:50:09 -0400
commit     1b3499e7a8e41853b0f40edb8f115c4ba5b1e6d1 (patch)
tree       1905558d19979de62b6b5b45b7a2ba2825cbb2f1 /src/edits.rs
parent     f469492bb63763bf54f99559ad40138d3f8e24fd (diff)
parent     97116f284bd826cd0c2805ed3b0f4359b310ad6a (diff)
Merge pull request #220 from dandavison/word-diff-regexp
New option --word-diff-regex
Diffstat (limited to 'src/edits.rs')
-rw-r--r--  src/edits.rs | 188
1 file changed, 158 insertions(+), 30 deletions(-)
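For orientation: this merge threads a caller-supplied tokenization regex (exposed to users as the new --word-diff-regex option, with `\w+` as the default used by the tests below) from infer_edits down into tokenize, replacing the old hard-coded separator class. A minimal sketch of what the new default implies, assuming only the regex crate; note that under `\w+` a token such as `vec!` now splits into `vec` and `!`, which is why the expectations in test_tokenize_4 change below:

    use regex::Regex;

    fn main() {
        // Under the default `\w+`, maximal runs of word characters survive
        // as whole tokens during alignment; the text between matches is
        // later split into single-grapheme tokens by tokenize().
        let re = Regex::new(r"\w+").unwrap();
        let words: Vec<&str> = re
            .find_iter("vec![(noop_insertion, plus_line)]")
            .map(|m| m.as_str())
            .collect();
        assert_eq!(words, vec!["vec", "noop_insertion", "plus_line"]);
    }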
diff --git a/src/edits.rs b/src/edits.rs
index 7b6e4674..d8f18377 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,6 @@
use regex::Regex;
-use lazy_static::lazy_static;
+use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthStr;
use crate::align;
@@ -16,6 +16,7 @@ pub fn infer_edits<'a, EditOperation>(
deletion: EditOperation,
noop_insertion: EditOperation,
insertion: EditOperation,
+ tokenization_regex: &Regex,
max_line_distance: f64,
max_line_distance_for_naively_paired_lines: f64,
) -> (
@@ -34,7 +35,10 @@ where
'minus_lines_loop: for minus_line in minus_lines {
let mut considered = 0; // plus lines considered so far as a match for minus_line
for plus_line in &plus_lines[emitted..] {
- let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line));
+ let alignment = align::Alignment::new(
+ tokenize(minus_line, tokenization_regex),
+ tokenize(plus_line, tokenization_regex),
+ );
let (annotated_minus_line, annotated_plus_line, distance) = annotate(
alignment,
noop_deletion,
@@ -76,25 +80,29 @@ where
(annotated_minus_lines, annotated_plus_lines)
}
-lazy_static! {
- static ref TOKENIZATION_REGEXP: Regex = Regex::new(r#"[\t ,;.:()\[\]<>/'"-]+"#).unwrap();
-}
-
/// Split a line into tokens for alignment. The alignment algorithm aligns sequences of
/// substrings, not individual characters.
-fn tokenize(line: &str) -> Vec<&str> {
+fn tokenize<'a>(line: &'a str, regex: &Regex) -> Vec<&'a str> {
let mut tokens = Vec::new();
let mut offset = 0;
- for m in TOKENIZATION_REGEXP.find_iter(line) {
- tokens.push(&line[offset..m.start()]);
+ for m in regex.find_iter(line) {
+ if offset == 0 && m.start() > 0 {
+ tokens.push("");
+ }
// Align separating text as multiple single-character tokens.
- for i in m.start()..m.end() {
- tokens.push(&line[i..i + 1]);
+ for t in line[offset..m.start()].graphemes(true) {
+ tokens.push(t);
}
+ tokens.push(&line[m.start()..m.end()]);
offset = m.end();
}
if offset < line.len() {
- tokens.push(&line[offset..line.len()]);
+ if offset == 0 {
+ tokens.push("");
+ }
+ for t in line[offset..line.len()].graphemes(true) {
+ tokens.push(t);
+ }
}
tokens
}
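To make the new contract of tokenize concrete, here is a sketch of its observable behavior, mirroring test_tokenize_0 and test_tokenize_1 below: each regex match becomes one whole token, the text between matches is split into grapheme clusters, and a leading empty token is emitted whenever the line does not begin with a match, so the alignment always sees a token boundary at the start of the line:

    let re = Regex::new(r"\w+").unwrap();
    // Matches become whole tokens; the separator " " is a grapheme token.
    assert_eq!(tokenize("aaa bbb", &re), vec!["aaa", " ", "bbb"]);
    // The line starts with non-match text, so a leading "" sentinel appears.
    assert_eq!(tokenize(";;ab;", &re), vec!["", ";", ";", "ab", ";"]);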
@@ -226,8 +234,14 @@ where
#[cfg(test)]
mod tests {
use super::*;
+ use itertools::Itertools;
+ use lazy_static::lazy_static;
use unicode_segmentation::UnicodeSegmentation;
+ lazy_static! {
+ static ref DEFAULT_TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap();
+ }
+
#[derive(Clone, Copy, Debug, PartialEq)]
enum EditOperation {
MinusNoop,
@@ -244,15 +258,26 @@ mod tests {
use EditOperation::*;
#[test]
+ fn test_tokenize_0() {
+ assert_tokenize("", &[]);
+ assert_tokenize(";", &["", ";"]);
+ assert_tokenize(";;", &["", ";", ";"]);
+ assert_tokenize(";;a", &["", ";", ";", "a"]);
+ assert_tokenize(";;ab", &["", ";", ";", "ab"]);
+ assert_tokenize(";;ab;", &["", ";", ";", "ab", ";"]);
+ assert_tokenize(";;ab;;", &["", ";", ";", "ab", ";", ";"]);
+ }
+
+ #[test]
fn test_tokenize_1() {
- assert_eq!(tokenize("aaa bbb"), vec!["aaa", " ", "bbb"])
+ assert_tokenize("aaa bbb", &["aaa", " ", "bbb"])
}
#[test]
fn test_tokenize_2() {
- assert_eq!(
- tokenize("fn coalesce_edits<'a, EditOperation>("),
- vec![
+ assert_tokenize(
+ "fn coalesce_edits<'a, EditOperation>(",
+ &[
"fn",
" ",
"coalesce_edits",
@@ -263,16 +288,16 @@ mod tests {
" ",
"EditOperation",
">",
- "("
- ]
+ "(",
+ ],
);
}
#[test]
fn test_tokenize_3() {
- assert_eq!(
- tokenize("fn coalesce_edits<'a, 'b, EditOperation>("),
- vec![
+ assert_tokenize(
+ "fn coalesce_edits<'a, 'b, EditOperation>(",
+ &[
"fn",
" ",
"coalesce_edits",
@@ -287,21 +312,22 @@ mod tests {
" ",
"EditOperation",
">",
- "("
- ]
+ "(",
+ ],
);
}
#[test]
fn test_tokenize_4() {
- assert_eq!(
- tokenize("annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);"),
- vec![
+ assert_tokenize(
+ "annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);",
+ &[
"annotated_plus_lines",
".",
"push",
"(",
- "vec!",
+ "vec",
+ "!",
"[",
"(",
"noop_insertion",
@@ -311,12 +337,112 @@ mod tests {
")",
"]",
")",
- ";"
- ]
+ ";",
+ ],
);
}
#[test]
+ fn test_tokenize_5() {
+ assert_tokenize(
+ " let col = Color::from_str(s).unwrap_or_else(|_| die());",
+ &[
+ "",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ "let",
+ " ",
+ "col",
+ " ",
+ "=",
+ " ",
+ "Color",
+ ":",
+ ":",
+ "from_str",
+ "(",
+ "s",
+ ")",
+ ".",
+ "unwrap_or_else",
+ "(",
+ "|",
+ "_",
+ "|",
+ " ",
+ "die",
+ "(",
+ ")",
+ ")",
+ ";",
+ ],
+ )
+ }
+
+ #[test]
+ fn test_tokenize_6() {
+ assert_tokenize(
+ " (minus_file, plus_file) => format!(\"renamed: {} ⟶ {}\", minus_file, plus_file),",
+ &["",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ "(",
+ "minus_file",
+ ",",
+ " ",
+ "plus_file",
+ ")",
+ " ",
+ "=",
+ ">",
+ " ",
+ "format",
+ "!",
+ "(",
+ "\"",
+ "renamed",
+ ":",
+ " ",
+ "{",
+ "}",
+ " ",
+ "⟶",
+ " ",
+ " ",
+ "{",
+ "}",
+ "\"",
+ ",",
+ " ",
+ "minus_file",
+ ",",
+ " ",
+ "plus_file",
+ ")",
+ ","])
+ }
+
+ fn assert_tokenize(text: &str, expected_tokens: &[&str]) {
+ let actual_tokens = tokenize(text, &*DEFAULT_TOKENIZATION_REGEXP);
+ assert_eq!(text, expected_tokens.iter().join(""));
+ assert_eq!(actual_tokens, expected_tokens);
+ }
+
+ #[test]
fn test_infer_edits_1() {
assert_paired_edits(
vec!["aaa"],
@@ -435,7 +561,7 @@ mod tests {
" s0.zip(s1)",
" .take_while(|((_, c0), (_, c1))| c0 == c1) // TODO: Don't consume one-past-the-end!",
" .fold(0, |offset, ((_, c0), (_, _))| offset + c0.len())"
- ], 0.66)
+ ], 0.5)
}
#[test]
@@ -590,9 +716,11 @@ mod tests {
Deletion,
PlusNoop,
Insertion,
+ &*DEFAULT_TOKENIZATION_REGEXP,
max_line_distance,
0.0,
);
+ // compare_annotated_lines(actual_edits, expected_edits);
assert_eq!(actual_edits, expected_edits);
}