use std::cmp::max; use std::collections::VecDeque; const DELETION_COST: usize = 2; const INSERTION_COST: usize = 2; // extra cost for starting a new group of changed tokens const INITIAL_MISMATCH_PENALTY: usize = 1; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum Operation { NoOp, Deletion, Insertion, } use Operation::*; /// Needleman-Wunsch / Wagner-Fischer table for computation of edit distance and associated /// alignment. #[derive(Clone, Debug)] struct Cell { parent: usize, operation: Operation, cost: usize, } #[derive(Debug)] pub struct Alignment<'a> { pub x: Vec<&'a str>, pub y: Vec<&'a str>, table: Vec, dim: [usize; 2], } impl<'a> Alignment<'a> { /// Fill table for Levenshtein distance / alignment computation pub fn new(x: Vec<&'a str>, y: Vec<&'a str>) -> Self { // TODO: Something downstream of the alignment algorithm requires that the first token in // both x and y is "", so this is explicitly inserted in `tokenize()`. let dim = [y.len() + 1, x.len() + 1]; let table = vec![ Cell { parent: 0, operation: NoOp, cost: 0 }; dim[0] * dim[1] ]; let mut alignment = Self { x, y, table, dim }; alignment.fill(); alignment } /// Fill table for Levenshtein distance / alignment computation pub fn fill(&mut self) { // x is written along the top of the table; y is written down the left side of the // table. Also, we insert a 0 in cell (0, 0) of the table, so x and y are shifted by one // position. Therefore, the element corresponding to (x[i], y[j]) is in column (i + 1) and // row (j + 1); the index of this element is given by index(i, j). for i in 1..self.dim[1] { self.table[i] = Cell { parent: 0, operation: Deletion, cost: i * DELETION_COST + INITIAL_MISMATCH_PENALTY, }; } for j in 1..self.dim[0] { self.table[j * self.dim[1]] = Cell { parent: 0, operation: Insertion, cost: j * INSERTION_COST + INITIAL_MISMATCH_PENALTY, }; } for (i, x_i) in self.x.iter().enumerate() { for (j, y_j) in self.y.iter().enumerate() { let (left, diag, up) = (self.index(i, j + 1), self.index(i, j), self.index(i + 1, j)); // The order of the candidates matters if two of them have the // same cost as in that case we choose the first one. Consider // insertions and deletions before matches in order to group // changes together. Insertions are preferred to deletions in // order to highlight moved tokens as a deletion followed by an // insertion (as the edit sequence is read backwards we need to // choose the insertion first) let candidates = [ Cell { parent: up, operation: Insertion, cost: self.mismatch_cost(up, INSERTION_COST), }, Cell { parent: left, operation: Deletion, cost: self.mismatch_cost(left, DELETION_COST), }, Cell { parent: diag, operation: NoOp, cost: if x_i == y_j { self.table[diag].cost } else { usize::MAX }, }, ]; let index = self.index(i + 1, j + 1); self.table[index] = candidates .iter() .min_by_key(|cell| cell.cost) .unwrap() .clone(); } } } fn mismatch_cost(&self, parent: usize, basic_cost: usize) -> usize { self.table[parent].cost + basic_cost + if self.table[parent].operation == NoOp { INITIAL_MISMATCH_PENALTY } else { 0 } } /// Read edit operations from the table. pub fn operations(&self) -> Vec { let mut ops = VecDeque::with_capacity(max(self.x.len(), self.y.len())); let mut cell = &self.table[self.index(self.x.len(), self.y.len())]; loop { ops.push_front(cell.operation); if cell.parent == 0 { break; } cell = &self.table[cell.parent]; } Vec::from(ops) } pub fn coalesced_operations(&self) -> Vec<(Operation, usize)> { run_length_encode(self.operations()) } // Row-major storage of 2D array. fn index(&self, i: usize, j: usize) -> usize { j * self.dim[1] + i } } fn run_length_encode(sequence: Vec) -> Vec<(T, usize)> where T: Copy, T: PartialEq, { let mut encoded = Vec::with_capacity(sequence.len()); if sequence.is_empty() { return encoded; } let end = sequence.len(); let (mut i, mut j) = (0, 1); let mut curr = &sequence[i]; loop { if j == end || sequence[j] != *curr { encoded.push((*curr, j - i)); if j == end { return encoded; } else { curr = &sequence[j]; i = j; } } j += 1; } } #[cfg(test)] mod tests { use super::*; use unicode_segmentation::UnicodeSegmentation; #[test] fn test_run_length_encode() { assert_eq!(run_length_encode::(vec![]), vec![]); assert_eq!(run_length_encode(vec![0]), vec![(0, 1)]); assert_eq!(run_length_encode(vec!["0", "0"]), vec![("0", 2)]); assert_eq!( run_length_encode(vec![0, 0, 1, 2, 2, 2, 3, 4, 4, 4]), vec![(0, 2), (1, 1), (2, 3), (3, 1), (4, 3)] ); } #[test] fn test_0() { TestCase { before: "aaa", after: "aba", distance: 5, parts: (2, 4), operations: vec![NoOp, Deletion, Insertion, NoOp], } .run(); } #[test] fn test_0_nonascii() { TestCase { before: "ááb", after: "áaa", distance: 9, parts: (4, 5), operations: vec![NoOp, Deletion, Deletion, Insertion, Insertion], } .run(); } #[test] fn test_1() { TestCase { before: "kitten", after: "sitting", distance: 13, parts: (5, 9), operations: vec![ Deletion, // K - Insertion, // - S NoOp, // I I NoOp, // T T NoOp, // T T Deletion, // E - Insertion, // - I NoOp, // N N Insertion, // - G ], } .run(); } #[test] fn test_2() { TestCase { before: "saturday", after: "sunday", distance: 10, parts: (4, 9), operations: vec![ NoOp, // S S Deletion, // A - Deletion, // T - NoOp, // U U Deletion, // R - Insertion, // - N NoOp, // D D NoOp, // A A NoOp, // Y Y ], } .run(); } #[test] fn test_3() { TestCase { // Prefer [Deletion NoOp Insertion] over [Insertion NoOp Deletion] before: "ab", after: "ba", distance: 6, parts: (2, 3), operations: vec![ Deletion, // a - NoOp, // b b Insertion, // - a ], } .run(); } #[test] fn test_4() { // Deletions are grouped together. TestCase { before: "AABB", after: "AB", distance: 5, parts: (2, 4), operations: vec![ NoOp, // A A Deletion, // A - Deletion, // B - NoOp, // B B ], } .run(); } #[test] fn test_5() { // Insertions are grouped together. TestCase { before: "AB", after: "AABB", distance: 5, parts: (2, 4), operations: vec![ NoOp, // A A Insertion, // - A Insertion, // - B NoOp, // B B ], } .run(); } #[test] fn test_6() { // Insertion and Deletion are grouped together. TestCase { before: "AAABBB", after: "ACB", distance: 11, parts: (5, 7), operations: vec![ NoOp, // A A Deletion, // A - Deletion, // A - Deletion, // B - Deletion, // B - Insertion, // - C NoOp, // B B ], } .run(); } struct TestCase<'a> { before: &'a str, after: &'a str, distance: usize, parts: (usize, usize), operations: Vec, } impl<'a> TestCase<'a> { pub fn run(&self) { self.assert_string_distance_parts(); assert_eq!(operations(self.before, self.after), self.operations); } fn assert_string_distance_parts(&self) { self.assert_string_levenshtein_distance(); assert_eq!(string_distance_parts(self.before, self.after), self.parts); assert_eq!(string_distance_parts(self.after, self.before), self.parts); } fn assert_string_levenshtein_distance(&self) { assert_eq!( string_levenshtein_distance(self.before, self.after), self.distance ); assert_eq!( string_levenshtein_distance(self.after, self.before), self.distance ); } } fn string_distance_parts(x: &str, y: &str) -> (usize, usize) { let (x, y) = ( x.graphemes(true).collect::>(), y.graphemes(true).collect::>(), ); Alignment::new(x, y).distance_parts() } fn string_levenshtein_distance(x: &str, y: &str) -> usize { let (x, y) = ( x.graphemes(true).collect::>(), y.graphemes(true).collect::>(), ); Alignment::new(x, y).levenshtein_distance() } fn operations<'a>(x: &'a str, y: &'a str) -> Vec { let (x, y) = ( x.graphemes(true).collect::>(), y.graphemes(true).collect::>(), ); Alignment::new(x, y).operations() } impl<'a> Alignment<'a> { pub fn distance_parts(&self) -> (usize, usize) { let (mut numer, mut denom) = (0, 0); for op in self.operations() { if op != NoOp { numer += 1; } denom += 1; } (numer, denom) } /// Compute levenshtein distance from the filled table. pub fn levenshtein_distance(&self) -> usize { self.table[self.index(self.x.len(), self.y.len())].cost } #[allow(dead_code)] fn format_cell(&self, cell: &Cell) -> String { let parent = &self.table[cell.parent]; let op = match cell.operation { Deletion => "-", Insertion => "+", NoOp => ".", }; format!("{}{}{}", parent.cost, op, cell.cost) } #[allow(dead_code)] fn print(&self) { println!("x: {:?}", self.x); println!("y: {:?}", self.y); println!(); print!(" "); for j in 0..self.dim[1] { print!("{} ", if j > 0 { self.x[j - 1] } else { " " }) } println!(); for i in 0..self.dim[0] { for j in 0..self.dim[1] { if j == 0 { print!("{} ", if i > 0 { self.y[i - 1] } else { " " }) } let cell = &self.table[self.index(j, i)]; print!("{} ", self.format_cell(cell)); } println!(); } println!(); } } }