diff options
author | Dan Davison <dandavison7@gmail.com> | 2019-08-06 20:13:42 -0700 |
---|---|---|
committer | Dan Davison <dandavison7@gmail.com> | 2019-08-06 23:45:44 -0700 |
commit | 6369e3ced7fd027470d1a2455820c786663d0ec8 (patch) | |
tree | 287b47f504f3563925d6723813631727651b8121 /src/edits.rs | |
parent | 736d21c011b28450840506a9b5bb84358455d5e6 (diff) |
Use alignment to annotate line pair
Diffstat (limited to 'src/edits.rs')
-rw-r--r-- | src/edits.rs | 185 |
1 files changed, 151 insertions, 34 deletions
diff --git a/src/edits.rs b/src/edits.rs index 34bce3d2..4e95f070 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -9,10 +9,10 @@ use crate::align; pub fn infer_edits<'a, EditOperation>( minus_lines: &'a Vec<String>, plus_lines: &'a Vec<String>, - non_deletion: EditOperation, - _deletion: EditOperation, - non_insertion: EditOperation, - _insertion: EditOperation, + noop_deletion: EditOperation, + deletion: EditOperation, + noop_insertion: EditOperation, + insertion: EditOperation, max_line_distance: f64, ) -> ( Vec<Vec<(EditOperation, &'a str)>>, // annotated minus lines @@ -36,44 +36,163 @@ where // Emit as unpaired the plus lines already considered and rejected for plus_line in &plus_lines[emitted..(emitted + considered)] { - annotated_plus_lines.push(vec![(non_insertion, plus_line)]); + annotated_plus_lines.push(vec![(noop_insertion, plus_line)]); } emitted += considered; + let (annotated_minus_line, annotated_plus_line) = annotate( + alignment, + noop_deletion, + deletion, + noop_insertion, + insertion, + minus_line, + plus_line, + ); + annotated_minus_lines.push(annotated_minus_line); + annotated_plus_lines.push(annotated_plus_line); emitted += 1; - // Move on to the next minus line. + // Greedy: move on to the next minus line. continue 'minus_lines_loop; } else { considered += 1; } } // No homolog was found for minus i; emit as unpaired. - annotated_minus_lines.push(vec![(non_deletion, minus_line)]); + annotated_minus_lines.push(vec![(noop_deletion, minus_line)]); } // Emit any remaining plus lines for plus_line in &plus_lines[emitted..] { - annotated_plus_lines.push(vec![(non_insertion, plus_line)]); + annotated_plus_lines.push(vec![(noop_insertion, plus_line)]); } (annotated_minus_lines, annotated_plus_lines) } -fn tokenize(line: &str) -> Vec<(usize, &str)> { +fn tokenize(line: &str) -> Vec<&str> { let separators = Regex::new(r"[ ,;.:()\[\]<>]+").unwrap(); let mut tokens = Vec::new(); let mut offset = 0; for m in separators.find_iter(line) { - let (start, end) = (m.start(), m.end()); - tokens.push((offset, &line[offset..start])); - tokens.push((start, m.as_str())); - offset = end; + tokens.push(&line[offset..m.start()]); + tokens.push(m.as_str()); + offset = m.end(); } if offset < line.len() { - tokens.push((offset, &line[offset..line.len()])); + tokens.push(&line[offset..line.len()]); } tokens } +/// Use alignment to "annotate" minus and plus lines. An "annotated" line is a sequence of +/// (s: &str, a: EditOperation) pairs, where the &strs reference the memory +/// of the original line and their concatenation equals the line. +fn annotate<'a, Annotation>( + alignment: align::Alignment<'a>, + noop_deletion: Annotation, + deletion: Annotation, + noop_insertion: Annotation, + insertion: Annotation, + minus_line: &'a str, + plus_line: &'a str, +) -> (Vec<(Annotation, &'a str)>, Vec<(Annotation, &'a str)>) +where + Annotation: Copy, +{ + let mut annotated_minus_line = Vec::new(); + let mut annotated_plus_line = Vec::new(); + + let (mut x_offset, mut y_offset) = (0, 0); + let (mut minus_line_offset, mut plus_line_offset) = (0, 0); + + // Note that the inputs to align::Alignment are not the original strings themselves, but + // sequences of substrings derived from the tokenization process. We have just applied + // run_length_encoding to "coalesce" runs of the same edit operation into a single + // operation. We now need to form a &str, pointing into the memory of the original line, + // identifying a "section" which is the concatenation of the substrings involved in this + // coalesced operation. That's what the following closures do. Note that they must be called + // once only since they advance offset pointers. + let get_section = |n: usize, + line_offset: &mut usize, + substrings_offset: &mut usize, + substrings: &[&str], + line: &'a str| { + let section_length = substrings[*substrings_offset..*substrings_offset + n] + .iter() + .fold(0, |n, s| n + s.len()); + let old_offset = *line_offset; + *line_offset += section_length; + *substrings_offset += n; + &line[old_offset..*line_offset] + }; + let mut minus_section = |n| { + get_section( + n, + &mut minus_line_offset, + &mut x_offset, + &alignment.x, + minus_line, + ) + }; + let mut plus_section = |n| { + get_section( + n, + &mut plus_line_offset, + &mut y_offset, + &alignment.y, + plus_line, + ) + }; + + for (op, n) in run_length_encode(alignment.operations()) { + match op { + align::Operation::Deletion => { + annotated_minus_line.push((deletion, minus_section(n))); + } + align::Operation::NoOp => { + annotated_minus_line.push((noop_deletion, minus_section(n))); + annotated_plus_line.push((noop_insertion, plus_section(n))); + } + align::Operation::Substitution => { + annotated_minus_line.push((deletion, minus_section(n))); + annotated_plus_line.push((insertion, plus_section(n))); + } + align::Operation::Insertion => { + annotated_plus_line.push((insertion, plus_section(n))); + } + } + } + (annotated_minus_line, annotated_plus_line) +} + +fn run_length_encode<T>(sequence: Vec<T>) -> Vec<(T, usize)> +where + T: Copy, + T: PartialEq, +{ + let mut encoded = Vec::with_capacity(sequence.len()); + + if sequence.len() == 0 { + return encoded; + } + + let end = sequence.len(); + let (mut i, mut j) = (0, 1); + let mut curr = &sequence[i]; + loop { + if j == end || sequence[j] != *curr { + encoded.push((*curr, j - i)); + if j == end { + return encoded; + } else { + curr = &sequence[j]; + i = j; + } + } + j += 1; + } +} + #[cfg(test)] mod tests { use super::*; @@ -96,17 +215,14 @@ mod tests { #[test] fn test_tokenize_1() { - assert_eq!( - tokenize("aaa bbb"), - substring_indices(vec!["aaa", " ", "bbb"]) - ) + assert_eq!(tokenize("aaa bbb"), vec!["aaa", " ", "bbb"]) } #[test] fn test_tokenize_2() { assert_eq!( tokenize("fn coalesce_edits<'a, EditOperation>("), - substring_indices(vec![ + vec![ "fn", " ", "coalesce_edits", @@ -115,7 +231,7 @@ mod tests { ", ", "EditOperation", ">(" - ]) + ] ); } @@ -123,7 +239,7 @@ mod tests { fn test_tokenize_3() { assert_eq!( tokenize("fn coalesce_edits<'a, 'b, EditOperation>("), - substring_indices(vec![ + vec![ "fn", " ", "coalesce_edits", @@ -134,37 +250,38 @@ mod tests { ", ", "EditOperation", ">(" - ]) + ] ); } #[test] fn test_tokenize_4() { assert_eq!( - tokenize("annotated_plus_lines.push(vec![(non_insertion, plus_line)]);"), - substring_indices(vec![ + tokenize("annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);"), + vec![ "annotated_plus_lines", ".", "push", "(", "vec!", "[(", - "non_insertion", + "noop_insertion", ", ", "plus_line", ")]);" - ]) + ] ); } - fn substring_indices(substrings: Vec<&str>) -> Vec<(usize, &str)> { - let mut offset = 0; - let mut with_offsets = Vec::new(); - for s in substrings { - with_offsets.push((offset, s)); - offset += s.len(); - } - with_offsets + #[test] + fn test_run_length_encode() { + assert_eq!(run_length_encode::<usize>(vec![]), vec![]); + assert_eq!(run_length_encode(vec![0]), vec![(0, 1)]); + assert_eq!(run_length_encode(vec!["0", "0"]), vec![("0", 2)]); + assert_eq!( + run_length_encode(vec![0, 0, 1, 2, 2, 2, 3, 4, 4, 4]), + vec![(0, 2), (1, 1), (2, 3), (3, 1), (4, 3)] + ); } #[test] |