use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthStr;
use crate::align;
/// Infer the edit operations responsible for the differences between a collection of old and new
/// lines. A "line" is a string. An annotated line is a Vec of (op, &str) pairs, where the &str
/// slices are slices of the line, and their concatenation equals the line. Return the input minus
/// and plus lines, in annotated form.
pub fn infer_edits<'a, EditOperation>(
minus_lines: &'a [String],
plus_lines: &'a [String],
noop_deletion: EditOperation,
deletion: EditOperation,
noop_insertion: EditOperation,
insertion: EditOperation,
tokenization_regex: &Regex,
max_line_distance: f64,
max_line_distance_for_naively_paired_lines: f64,
) -> (
Vec<Vec<(EditOperation, &'a str)>>, // annotated minus lines
Vec<Vec<(EditOperation, &'a str)>>, // annotated plus lines
)
where
EditOperation: Copy,
EditOperation: PartialEq,
{
let mut annotated_minus_lines = Vec::<Vec<(EditOperation, &str)>>::new();
let mut annotated_plus_lines = Vec::<Vec<(EditOperation, &str)>>::new();
let mut plus_index = 0; // plus lines emitted so far
'minus_lines_loop: for minus_line in minus_lines {
let mut considered = 0; // plus lines considered so far as match for minus_line
for plus_line in &plus_lines[plus_index..] {
let alignment = align::Alignment::new(
tokenize(minus_line, tokenization_regex),
tokenize(plus_line, tokenization_regex),
);
let (annotated_minus_line, annotated_plus_line, distance) = annotate(
alignment,
noop_deletion,
deletion,
noop_insertion,
insertion,
minus_line,
plus_line,
);
if minus_lines.len() == plus_lines.len()
&& distance <= max_line_distance_for_naively_paired_lines
|| distance <= max_line_distance
{
// minus_line and plus_line are inferred to be a homologous pair.
// Emit as unpaired the plus lines already considered and rejected
for plus_line in &plus_lines[plus_index..(plus_index + considered)] {
annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);
}
plus_index += considered;
annotated_minus_lines.push(annotated_minus_line);
annotated_plus_lines.push(annotated_plus_line);
plus_index += 1;
// Greedy: move on to the next minus line.
continue 'minus_lines_loop;
} else {
considered += 1;
}
}
// No homolog was found for minus i; emit as unpaired.
annotated_minus_lines.push(vec![(noop_deletion, minus_line)]);
}
// Emit any remaining plus lines
for plus_line in &plus_lines[plus_index..] {
annotated_plus_lines.push(vec![(noop_insertion, plus_line)]);
}
(annotated_minus_lines, annotated_plus_lines)
}
/// Split line into tokens for alignment. The alignment algorithm aligns sequences of substrings;
/// not individual characters.
fn tokenize<'a>(line: &'a str, regex: &Regex) -> Vec<&'a str> {
let mut tokens = Vec::new();
let mut offset = 0;
for m in regex.find_iter(line) {
if offset == 0 && m.start() > 0 {
tokens.push("");
}
// Align separating text as multiple single-character tokens.
for t in line[offset..m.start()].graphemes(true) {
tokens.push(t);
}
tokens.push(&line[m.start()..m.end()]);
offset = m.end();
}
if offset < line.len() {
if offset == 0