summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/align.rs251
-rw-r--r--src/edits.rs195
2 files changed, 331 insertions, 115 deletions
diff --git a/src/align.rs b/src/align.rs
index 92262cb1..b2e2bdfe 100644
--- a/src/align.rs
+++ b/src/align.rs
@@ -1,14 +1,14 @@
use std::cmp::max;
use std::collections::VecDeque;
-const SUBSTITUTION_COST: usize = 1;
-const DELETION_COST: usize = 1;
-const INSERTION_COST: usize = 1;
+const DELETION_COST: usize = 2;
+const INSERTION_COST: usize = 2;
+// extra cost for starting a new group of changed tokens
+const INITIAL_MISMATCH_PENALITY: usize = 1;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Operation {
NoOp,
- Substitution,
Deletion,
Insertion,
}
@@ -61,14 +61,14 @@ impl<'a> Alignment<'a> {
self.table[i] = Cell {
parent: 0,
operation: Deletion,
- cost: i,
+ cost: i * DELETION_COST + INITIAL_MISMATCH_PENALITY,
};
}
for j in 1..self.dim[0] {
self.table[j * self.dim[1]] = Cell {
parent: 0,
operation: Insertion,
- cost: j,
+ cost: j * INSERTION_COST + INITIAL_MISMATCH_PENALITY,
};
}
@@ -76,22 +76,32 @@ impl<'a> Alignment<'a> {
for (j, y_j) in self.y.iter().enumerate() {
let (left, diag, up) =
(self.index(i, j + 1), self.index(i, j), self.index(i + 1, j));
+ // The order of the candidates matters if two of them have the
+ // same cost as in that case we choose the first one. Consider
+ // insertions and deletions before matches in order to group
+ // changes together. Insertions are preferred to deletions in
+ // order to highlight moved tokens as a deletion followed by an
+ // insertion (as the edit sequence is read backwards we need to
+ // choose the insertion first)
let candidates = [
Cell {
+ parent: up,
+ operation: Insertion,
+ cost: self.mismatch_cost(up, INSERTION_COST),
+ },
+ Cell {
parent: left,
operation: Deletion,
- cost: self.table[left].cost + DELETION_COST,
+ cost: self.mismatch_cost(left, DELETION_COST),
},
Cell {
parent: diag,
- operation: if x_i == y_j { NoOp } else { Substitution },
- cost: self.table[diag].cost
- + if x_i == y_j { 0 } else { SUBSTITUTION_COST },
- },
- Cell {
- parent: up,
- operation: Insertion,
- cost: self.table[up].cost + INSERTION_COST,
+ operation: NoOp,
+ cost: if x_i == y_j {
+ self.table[diag].cost
+ } else {
+ usize::MAX
+ },
},
];
let index = self.index(i + 1, j + 1);
@@ -104,6 +114,16 @@ impl<'a> Alignment<'a> {
}
}
+ fn mismatch_cost(&self, parent: usize, basic_cost: usize) -> usize {
+ self.table[parent].cost
+ + basic_cost
+ + if self.table[parent].operation == NoOp {
+ INITIAL_MISMATCH_PENALITY
+ } else {
+ 0
+ }
+ }
+
/// Read edit operations from the table.
pub fn operations(&self) -> Vec<Operation> {
let mut ops = VecDeque::with_capacity(max(self.x.len(), self.y.len()));
@@ -161,7 +181,6 @@ impl<'a> Alignment<'a> {
let parent = &self.table[cell.parent];
let op = match cell.operation {
Deletion => "-",
- Substitution => "*",
Insertion => "+",
NoOp => ".",
};
@@ -240,68 +259,176 @@ mod tests {
#[test]
fn test_0() {
- let (before, after) = ("aaa", "aba");
- assert_string_distance_parts(before, after, (1, 3));
- assert_eq!(operations(before, after), vec![NoOp, Substitution, NoOp,]);
+ TestCase {
+ before: "aaa",
+ after: "aba",
+ distance: 5,
+ parts: (2, 4),
+ operations: vec![NoOp, Deletion, Insertion, NoOp],
+ }
+ .run();
}
#[test]
fn test_0_nonascii() {
- let (before, after) = ("ááb", "áaa");
- assert_string_distance_parts(before, after, (2, 3));
- assert_eq!(
- operations(before, after),
- vec![NoOp, Substitution, Substitution,]
- );
+ TestCase {
+ before: "ááb",
+ after: "áaa",
+ distance: 9,
+ parts: (4, 5),
+ operations: vec![NoOp, Deletion, Deletion, Insertion, Insertion],
+ }
+ .run();
}
#[test]
fn test_1() {
- let (before, after) = ("kitten", "sitting");
- assert_string_distance_parts(before, after, (3, 7));
- assert_eq!(
- operations(before, after),
- vec![
- Substitution, // K S
- NoOp, // I I
- NoOp, // T T
- NoOp, // T T
- Substitution, // E I
- NoOp, // N N
- Insertion // - G
- ]
- );
+ TestCase {
+ before: "kitten",
+ after: "sitting",
+ distance: 13,
+ parts: (5, 9),
+ operations: vec![
+ Deletion, // K -
+ Insertion, // - S
+ NoOp, // I I
+ NoOp, // T T
+ NoOp, // T T
+ Deletion, // E -
+ Insertion, // - I
+ NoOp, // N N
+ Insertion, // - G
+ ],
+ }
+ .run();
}
#[test]
fn test_2() {
- let (before, after) = ("saturday", "sunday");
- assert_string_distance_parts(before, after, (3, 8));
- assert_eq!(
- operations(before, after),
- vec![
- NoOp, // S S
- Deletion, // A -
- Deletion, // T -
- NoOp, // U U
- Substitution, // R N
- NoOp, // D D
- NoOp, // A A
- NoOp // Y Y
- ]
- );
+ TestCase {
+ before: "saturday",
+ after: "sunday",
+ distance: 10,
+ parts: (4, 9),
+ operations: vec![
+ NoOp, // S S
+ Deletion, // A -
+ Deletion, // T -
+ NoOp, // U U
+ Deletion, // R -
+ Insertion, // - N
+ NoOp, // D D
+ NoOp, // A A
+ NoOp, // Y Y
+ ],
+ }
+ .run();
}
- fn assert_string_distance_parts(s1: &str, s2: &str, parts: (usize, usize)) {
- let (numer, _) = parts;
- assert_string_levenshtein_distance(s1, s2, numer);
- assert_eq!(string_distance_parts(s1, s2), parts);
- assert_eq!(string_distance_parts(s2, s1), parts);
+ #[test]
+ fn test_3() {
+ TestCase {
+ // Prefer [Deletion NoOp Insertion] over [Insertion NoOp Deletion]
+ before: "ab",
+ after: "ba",
+ distance: 6,
+ parts: (2, 3),
+ operations: vec![
+ Deletion, // a -
+ NoOp, // b b
+ Insertion, // - a
+ ],
+ }
+ .run();
+ }
+
+ #[test]
+ fn test_4() {
+ // Deletions are grouped together.
+ TestCase {
+ before: "AABB",
+ after: "AB",
+ distance: 5,
+ parts: (2, 4),
+ operations: vec![
+ NoOp, // A A
+ Deletion, // A -
+ Deletion, // B -
+ NoOp, // B B
+ ],
+ }
+ .run();
}
- fn assert_string_levenshtein_distance(s1: &str, s2: &str, d: usize) {
- assert_eq!(string_levenshtein_distance(s1, s2), d);
- assert_eq!(string_levenshtein_distance(s2, s1), d);
+ #[test]
+ fn test_5() {
+ // Insertions are grouped together.
+ TestCase {
+ before: "AB",
+ after: "AABB",
+ distance: 5,
+ parts: (2, 4),
+ operations: vec![
+ NoOp, // A A
+ Insertion, // - A
+ Insertion, // - B
+ NoOp, // B B
+ ],
+ }
+ .run();
+ }
+
+ #[test]
+ fn test_6() {
+ // Insertion and Deletion are grouped together.
+ TestCase {
+ before: "AAABBB",
+ after: "ACB",
+ distance: 11,
+ parts: (5, 7),
+ operations: vec![
+ NoOp, // A A
+ Deletion, // A -
+ Deletion, // A -
+ Deletion, // B -
+ Deletion, // B -
+ Insertion, // - C
+ NoOp, // B B
+ ],
+ }
+ .run();
+ }
+
+ struct TestCase<'a> {
+ before: &'a str,
+ after: &'a str,
+ distance: usize,
+ parts: (usize, usize),
+ operations: Vec<Operation>,
+ }
+
+ impl<'a> TestCase<'a> {
+ pub fn run(&self) -> () {
+ self.assert_string_distance_parts();
+ assert_eq!(operations(self.before, self.after), self.operations);
+ }
+
+ fn assert_string_distance_parts(&self) {
+ self.assert_string_levenshtein_distance();
+ assert_eq!(string_distance_parts(self.before, self.after), self.parts);
+ assert_eq!(string_distance_parts(self.after, self.before), self.parts);
+ }
+
+ fn assert_string_levenshtein_distance(&self) {
+ assert_eq!(
+ string_levenshtein_distance(self.before, self.after),
+ self.distance
+ );
+ assert_eq!(
+ string_levenshtein_distance(self.after, self.before),
+ self.distance
+ );
+ }
}
fn string_distance_parts(x: &str, y: &str) -> (usize, usize) {
diff --git a/src/edits.rs b/src/edits.rs
index 9ee4d199..bcbead11 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -191,23 +191,11 @@ where
*substrings_offset += n;
&line[old_offset..*line_offset]
};
- let mut minus_section = |n| {
- get_section(
- n,
- &mut minus_line_offset,
- &mut x_offset,
- &alignment.x,
- minus_line,
- )
+ let mut minus_section = |n: usize, offset: &mut usize| {
+ get_section(n, &mut minus_line_offset, offset, &alignment.x, minus_line)
};
- let mut plus_section = |n| {
- get_section(
- n,
- &mut plus_line_offset,
- &mut y_offset,
- &alignment.y,
- plus_line,
- )
+ let mut plus_section = |n: usize, offset: &mut usize| {
+ get_section(n, &mut plus_line_offset, offset, &alignment.y, plus_line)
};
let distance_contribution = |section: &str| UnicodeWidthStr::width(section.trim());
@@ -215,7 +203,7 @@ where
for (op, n) in alignment.coalesced_operations() {
match op {
align::Operation::Deletion => {
- let minus_section = minus_section(n);
+ let minus_section = minus_section(n, &mut x_offset);
let n_d = distance_contribution(minus_section);
d_denom += n_d;
d_numer += n_d;
@@ -223,12 +211,14 @@ where
minus_op_prev = deletion;
}
align::Operation::NoOp => {
- let minus_section = minus_section(n);
+ let minus_section = minus_section(n, &mut x_offset);
let n_d = distance_contribution(minus_section);
- d_denom += n_d;
+ d_denom += 2 * n_d;
let is_space = minus_section.trim().is_empty();
let coalesce_space_with_previous = is_space
- && ((minus_op_prev == deletion && plus_op_prev == insertion)
+ && ((minus_op_prev == deletion
+ && plus_op_prev == insertion
+ && (x_offset < alignment.x.len() - 1 || y_offset < alignment.y.len() - 1))
|| (minus_op_prev == noop_deletion && plus_op_prev == noop_insertion));
annotated_minus_line.push((
if coalesce_space_with_previous {
@@ -244,23 +234,13 @@ where
} else {
noop_insertion
},
- plus_section(n),
+ plus_section(n, &mut y_offset),
));
minus_op_prev = noop_deletion;
plus_op_prev = noop_insertion;
}
- align::Operation::Substitution => {
- let minus_section = minus_section(n);
- let n_d = distance_contribution(minus_section);
- d_denom += n_d;
- d_numer += n_d;
- annotated_minus_line.push((deletion, minus_section));
- annotated_plus_line.push((insertion, plus_section(n)));
- minus_op_prev = deletion;
- plus_op_prev = insertion;
- }
align::Operation::Insertion => {
- let plus_section = plus_section(n);
+ let plus_section = plus_section(n, &mut y_offset);
let n_d = distance_contribution(plus_section);
d_denom += n_d;
d_numer += n_d;
@@ -631,13 +611,13 @@ mod tests {
vec!["fn coalesce_edits<'a, 'b, EditOperation>("],
(
vec![vec![
- (MinusNoop, "fn coalesce_edits<'a"),
- (MinusNoop, ", EditOperation>("),
+ (MinusNoop, "fn coalesce_edits<'a, "),
+ (MinusNoop, "EditOperation>("),
]],
vec![vec![
- (PlusNoop, "fn coalesce_edits<'a"),
- (Insertion, ", 'b"),
- (PlusNoop, ", EditOperation>("),
+ (PlusNoop, "fn coalesce_edits<'a, "),
+ (Insertion, "'b, "),
+ (PlusNoop, "EditOperation>("),
]],
),
0.66,
@@ -652,15 +632,15 @@ mod tests {
(
vec![vec![
(MinusNoop, "for _ in range(0, "),
- (MinusNoop, "options[\"count\"]"),
- (MinusNoop, "):"),
+ (MinusNoop, "options[\"count\"])"),
+ (MinusNoop, ":"),
]],
vec![vec![
(PlusNoop, "for _ in range(0, "),
(Insertion, "int("),
- (PlusNoop, "options[\"count\"]"),
+ (PlusNoop, "options[\"count\"])"),
(Insertion, ")"),
- (PlusNoop, "):"),
+ (PlusNoop, ":"),
]],
),
0.3,
@@ -673,8 +653,8 @@ mod tests {
vec!["a a"],
vec!["a b a"],
(
- vec![vec![(MinusNoop, "a"), (MinusNoop, " a")]],
- vec![vec![(PlusNoop, "a"), (Insertion, " b"), (PlusNoop, " a")]],
+ vec![vec![(MinusNoop, "a "), (MinusNoop, "a")]],
+ vec![vec![(PlusNoop, "a "), (Insertion, "b "), (PlusNoop, "a")]],
),
1.0,
);
@@ -682,8 +662,8 @@ mod tests {
vec!["a a"],
vec!["a b b a"],
(
- vec![vec![(MinusNoop, "a"), (MinusNoop, " a")]],
- vec![vec![(PlusNoop, "a"), (Insertion, " b b"), (PlusNoop, " a")]],
+ vec![vec![(MinusNoop, "a "), (MinusNoop, "a")]],
+ vec![vec![(PlusNoop, "a "), (Insertion, "b b "), (PlusNoop, "a")]],
),
1.0,
);
@@ -698,20 +678,17 @@ mod tests {
// TODO: Coalesce runs of the same operation.
vec![vec![
(MinusNoop, "so it is safe to read "),
- (Deletion, "the"),
+ (Deletion, "the commit"),
(Deletion, " "),
- (Deletion, "commit"),
- (Deletion, " "),
- (Deletion, "number "),
- (MinusNoop, "from any one of them."),
+ (Deletion, "number"),
+ (MinusNoop, " from any one of them."),
]],
vec![vec![
(PlusNoop, "so it is safe to read "),
(Insertion, "build"),
(Insertion, " "),
(Insertion, "info"),
- (Insertion, " "),
- (PlusNoop, "from any one of them."),
+ (PlusNoop, " from any one of them."),
]],
),
1.0,
@@ -735,7 +712,6 @@ mod tests {
}
#[test]
- #[ignore]
fn test_infer_edits_12() {
assert_edits(
vec![" (xxxxxxxxx, \"build info\"),"],
@@ -755,6 +731,119 @@ mod tests {
);
}
+ #[test]
+ fn test_infer_edits_13() {
+ assert_paired_edits(
+ vec!["'b '", "[element,]"],
+ vec!["' b'", "[element],"],
+ (
+ vec![
+ vec![
+ (MinusNoop, "'"),
+ (Deletion, "b"),
+ (MinusNoop, " "),
+ (MinusNoop, "'"),
+ ],
+ vec![(MinusNoop, "[element"), (Deletion, ","), (MinusNoop, "]")],
+ ],
+ vec![
+ vec![
+ (PlusNoop, "'"),
+ (PlusNoop, " "),
+ (Insertion, "b"),
+ (PlusNoop, "'"),
+ ],
+ vec![(PlusNoop, "[element"), (PlusNoop, "]"), (Insertion, ",")],
+ ],
+ ),
+ );
+ }
+
+ #[test]
+ fn test_infer_edits_14() {
+ assert_paired_edits(
+ vec!["a b c d ", "p "],
+ vec!["x y c z ", "q r"],
+ (
+ vec![
+ vec![
+ (MinusNoop, ""),
+ (Deletion, "a"),
+ (Deletion, " "),
+ (Deletion, "b"),
+ (MinusNoop, " c "),
+ (Deletion, "d"),
+ (MinusNoop, " "),
+ ],
+ vec![(MinusNoop, ""), (Deletion, "p"), (Deletion, " ")],
+ ],
+ vec![
+ vec![
+ (PlusNoop, ""),
+ (Insertion, "x"),
+ (Insertion, " "),
+ (Insertion, "y"),
+ (PlusNoop, " c "),
+ (Insertion, "z"),
+ (PlusNoop, " "),
+ ],
+ vec![
+ (PlusNoop, ""),
+ (Insertion, "q"),
+ (Insertion, " "),
+ (Insertion, "r"),
+ ],
+ ],
+ ),
+ );
+ }
+
+ #[test]
+ fn test_infer_edits_15() {
+ assert_paired_edits(
+ vec![r#"printf "%s\n" s y y | git add -p &&"#],
+ vec!["test_write_lines s y y | git add -p &&"],
+ (
+ vec![vec![
+ (MinusNoop, ""),
+ (Deletion, r#"printf "%s\n""#),
+ (MinusNoop, " s y y | git add -p &&"),
+ ]],
+ vec![vec![
+ (PlusNoop, ""),
+ (Insertion, "test_write_lines"),
+ (PlusNoop, " s y y | git add -p &&"),
+ ]],
+ ),
+ );
+ }
+
+ #[test]
+ fn test_infer_edits_16() {
+ assert_edits(
+ vec!["a a a a a a b b b"],
+ vec!["c a a a a a a c c"],
+ (
+ vec![vec![
+ (MinusNoop, ""),
+ (MinusNoop, "a a a a a a "),
+ (Deletion, "b b"),
+ (Deletion, " "),
+ (Deletion, "b"),
+ ]],
+ vec![vec![
+ (PlusNoop, ""),
+ (Insertion, "c "),
+ (PlusNoop, "a a a a a a "),
+ (Insertion, "c"),
+ (Insertion, " "),
+ (Insertion, "c"),
+ ]],
+ ),
+ 1.0,
+ );
+ }
+
fn assert_edits(
minus_lines: Vec<&str>,
plus_lines: Vec<&str>,