use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthStr;
use crate::align;
/// Infer the edit operations responsible for the differences between a collection of old and new
/// lines. A "line" is a string. An annotated line is a Vec of (op, &str) pairs, where the &str
/// slices are slices of the line, and their concatenation equals the line. Return the input minus
/// and plus lines, in annotated form. Also return a specification of the inferred alignment of
/// minus and plus lines. `noop_deletions[i]` is the appropriate deletion operation tag to be used
/// for `minus_lines[i]`; `noop_deletions` is guaranteed to be the same length as `minus_lines`.
/// The equivalent statements hold for `plus_insertions` and `plus_lines`.
pub fn infer_edits<'a, EditOperation>(
minus_lines: Vec<&'a str>,
plus_lines: Vec<&'a str>,
noop_deletions: Vec<EditOperation>,
deletion: EditOperation,
noop_insertions: Vec<EditOperation>,
insertion: EditOperation,
tokenization_regex: &Regex,
max_line_distance: f64,
max_line_distance_for_naively_paired_lines: f64,
) -> (
Vec<Vec<(EditOperation, &'a str)>>, // annotated minus lines
Vec<Vec<(EditOperation, &'a str)>>, // annotated plus lines
Vec<(Option<usize>, Option<usize>)>, // line alignment
)
where
EditOperation: Copy,
EditOperation: PartialEq,
{
let mut annotated_minus_lines = Vec::<Vec<(EditOperation, &str)>>::new();
let mut annotated_plus_lines = Vec::<Vec<(EditOperation, &str)>>::new();
let mut line_alignment = Vec::<(Option<usize>, Option<usize>)>::new();
let mut plus_index = 0; // plus lines emitted so far
'minus_lines_loop: for (minus_index, minus_line) in minus_lines.iter().enumerate() {
let mut considered = 0; // plus lines considered so far as match for minus_line
for plus_line in &plus_lines[plus_index..] {
let alignment = align::Alignment::new(
tokenize(minus_line, tokenization_regex),
tokenize(plus_line, tokenization_regex),
);
let (annotated_minus_line, annotated_plus_line, distance) = annotate(
alignment,
noop_deletions[minus_index],
deletion,
noop_insertions[plus_index],
insertion,
minus_line,
plus_line,
);
if minus_lines.len() == plus_lines.len()
&& distance <= max_line_distance_for_naively_paired_lines
|| distance <= max_line_distance
{
// minus_line and plus_line are inferred to be a homologous pair.
// Emit as unpaired the plus lines already considered and rejected
for plus_line in &plus_lines[plus_index..(plus_index + considered)] {
annotated_plus_lines.push(vec![(noop_insertions[plus_index], plus_line)]);
line_alignment.push((None, Some(plus_index)));
plus_index += 1;
}
annotated_minus_lines.push(annotated_minus_line);
annotated_plus_lines.push(annotated_plus_line);
line_alignment.push((Some(minus_index), Some(plus_index)));
plus_index += 1;
// Greedy: move on to the next minus line.
continue 'minus_lines_loop;
} else {
considered += 1;
}
}
// No homolog was found for minus i; emit as unpaired.
annotated_minus_lines.push(vec![(noop_deletions[minus_index], minus_line)]);
line_alignment.push((Some(minus_index), None));
}
// Emit any remaining plus lines
for plus_line in &plus_lines[plus_index..] {
annotated_plus_lines.push(vec![(