summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/align.rs87
-rw-r--r--src/cli.rs6
-rw-r--r--src/edits.rs19
-rw-r--r--src/paint.rs2
4 files changed, 81 insertions, 33 deletions
diff --git a/src/align.rs b/src/align.rs
index ee1052f2..5100676f 100644
--- a/src/align.rs
+++ b/src/align.rs
@@ -1,4 +1,6 @@
-use std::cmp::{max, min};
+use std::cmp::min;
+
+use unicode_segmentation::UnicodeSegmentation;
/// Needleman-Wunsch / Wagner-Fischer table for computation of edit distance and associated
/// alignment.
@@ -81,15 +83,6 @@ impl<'a> Alignment<'a> {
}
}
- /// Read edit distance from the table.
- pub fn edit_distance(&self) -> usize {
- self.table[self.table.len() - 1]
- }
-
- pub fn normalized_edit_distance(&self) -> f64 {
- self.edit_distance() as f64 / max(self.xx.len(), self.yy.len()) as f64
- }
-
/// Read edit operations from the table.
pub fn edit_operations<EditOperation>(
&self,
@@ -126,12 +119,40 @@ impl<'a> Alignment<'a> {
}
ops.into_iter().rev().collect()
}
+
+ /// Compute custom distance metric from the filled table. The distance metric is
+ ///
+ /// (total length of edits) / (total length of longer string)
+ ///
+ /// where length is measured in number of unicode grapheme clusters.
+ pub fn distance(&self) -> f64 {
+ let (numer, denom) = self.distance_parts();
+ (numer as f64) / (denom as f64)
+ }
+
+ fn distance_parts(&self) -> (usize, usize) {
+ let noop = 0;
+ let (mut numer, mut denom) = (0, 0);
+ for (op, (_, s)) in self.edit_operations(0, 1, 1, 1, true) {
+ let n = s.graphemes(true).count();
+ if op != noop {
+ numer += n;
+ }
+ denom += n;
+ }
+ (numer, denom)
+ }
+
+ /// Compute levenshtein distance from the filled table.
+ #[allow(dead_code)]
+ pub fn levenshtein_distance(&self) -> usize {
+ self.table[self.table.len() - 1]
+ }
}
#[cfg(test)]
mod tests {
use super::*;
- use unicode_segmentation::UnicodeSegmentation;
#[derive(Copy, Clone, Debug, PartialEq)]
enum EditOperation {
@@ -145,27 +166,30 @@ mod tests {
#[test]
fn test_0() {
- assert_eq!(levenshtein_distance("aaa", "aba"), 1);
+ let (before, after) = ("aaa", "aba");
+ assert_string_distance_parts(before, after, (1, 3));
assert_eq!(
- edit_operations("aaa", "aba"),
+ edit_operations(before, after),
vec![NoOp, Substitution, NoOp,]
);
}
#[test]
fn test_0_nonascii() {
- assert_eq!(levenshtein_distance("áaa", "ááb"), 2);
+ let (before, after) = ("ááb", "áaa");
+ assert_string_distance_parts(before, after, (2, 3));
assert_eq!(
- edit_operations("áaa", "ááb"),
+ edit_operations(before, after),
vec![NoOp, Substitution, Substitution,]
);
}
#[test]
fn test_1() {
- assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
+ let (before, after) = ("kitten", "sitting");
+ assert_string_distance_parts(before, after, (3, 7));
assert_eq!(
- edit_operations("kitten", "sitting"),
+ edit_operations(before, after),
vec![
Substitution, // K S
NoOp, // I I
@@ -180,9 +204,10 @@ mod tests {
#[test]
fn test_2() {
- assert_eq!(levenshtein_distance("saturday", "sunday"), 3);
+ let (before, after) = ("saturday", "sunday");
+ assert_string_distance_parts(before, after, (3, 8));
assert_eq!(
- edit_operations("saturday", "sunday"),
+ edit_operations(before, after),
vec![
NoOp, // S S
Deletion, // A -
@@ -196,12 +221,32 @@ mod tests {
);
}
- fn levenshtein_distance(x: &str, y: &str) -> usize {
+ fn assert_string_distance_parts(s1: &str, s2: &str, parts: (usize, usize)) {
+ let (numer, _) = parts;
+ assert_string_levenshtein_distance(s1, s2, numer);
+ assert_eq!(string_distance_parts(s1, s2), parts);
+ assert_eq!(string_distance_parts(s2, s1), parts);
+ }
+
+ fn assert_string_levenshtein_distance(s1: &str, s2: &str, d: usize) {
+ assert_eq!(string_levenshtein_distance(s1, s2), d);
+ assert_eq!(string_levenshtein_distance(s2, s1), d);
+ }
+
+ fn string_distance_parts(x: &str, y: &str) -> (usize, usize) {
+ let (x, y) = (
+ x.grapheme_indices(true).collect::<Vec<(usize, &str)>>(),
+ y.grapheme_indices(true).collect::<Vec<(usize, &str)>>(),
+ );
+ Alignment::new(x, y).distance_parts()
+ }
+
+ fn string_levenshtein_distance(x: &str, y: &str) -> usize {
let (x, y) = (
x.grapheme_indices(true).collect::<Vec<(usize, &str)>>(),
y.grapheme_indices(true).collect::<Vec<(usize, &str)>>(),
);
- Alignment::new(x, y).edit_distance()
+ Alignment::new(x, y).levenshtein_distance()
}
fn edit_operations<'a>(x: &'a str, y: &'a str) -> Vec<EditOperation> {
diff --git a/src/cli.rs b/src/cli.rs
index 8c711baf..ad369eb1 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -86,6 +86,12 @@ pub struct Opt {
/// For example: `git show --color=always | delta --compare-themes`.
#[structopt(long = "compare-themes")]
pub compare_themes: bool,
+
+ /// The maximum distance between two lines for them to be inferred to be homologous. Homologous
+ /// line pairs are highlighted according to the deletion and insertion operations transforming
+ /// one into the other.
+ #[structopt(long = "max-line-distance", default_value = "0.3")]
+ pub max_line_distance: f64,
}
#[derive(Debug, PartialEq)]
diff --git a/src/edits.rs b/src/edits.rs
index 230f1e22..67933688 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -13,7 +13,7 @@ pub fn infer_edits<'a, EditOperation>(
deletion: EditOperation,
non_insertion: EditOperation,
insertion: EditOperation,
- distance_threshold: f64,
+ max_line_distance: f64,
) -> (
Vec<Vec<(EditOperation, &'a str)>>, // annotated minus lines
Vec<Vec<(EditOperation, &'a str)>>, // annotated plus lines
@@ -34,8 +34,7 @@ where
let plus_line = plus_line.trim_end();
let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line));
-
- if alignment.normalized_edit_distance() < distance_threshold {
+ if alignment.distance() <= max_line_distance {
// minus_line and plus_line are inferred to be a homologous pair.
// Emit as unpaired the plus lines already considered and rejected
@@ -187,8 +186,6 @@ mod tests {
use EditOperation::*;
- const DISTANCE_MAX: f64 = 2.0;
-
#[test]
fn test_tokenize_1() {
assert_eq!(
@@ -413,7 +410,7 @@ mod tests {
minus_lines: Vec<&str>,
plus_lines: Vec<&str>,
expected_edits: Edits,
- distance_threshold: f64,
+ max_line_distance: f64,
) {
let minus_lines = minus_lines
.into_iter()
@@ -430,25 +427,25 @@ mod tests {
Deletion,
PlusNoop,
Insertion,
- distance_threshold,
+ max_line_distance,
);
assert_eq!(actual_edits, expected_edits);
}
// Assert that no edits are inferred for the supplied minus and plus lines.
- fn assert_no_edits(minus_lines: Vec<&str>, plus_lines: Vec<&str>, distance_threshold: f64) {
+ fn assert_no_edits(minus_lines: Vec<&str>, plus_lines: Vec<&str>, max_line_distance: f64) {
let expected_edits = (
minus_lines.iter().map(|s| vec![(MinusNoop, *s)]).collect(),
plus_lines.iter().map(|s| vec![(PlusNoop, *s)]).collect(),
);
- assert_edits(minus_lines, plus_lines, expected_edits, distance_threshold)
+ assert_edits(minus_lines, plus_lines, expected_edits, max_line_distance)
}
// Assertions for a single pair of lines, considered as a homologous pair. We set
- // distance_threshold = DISTANCE_MAX in order that the pair will be inferred to be homologous.
+ // max_line_distance = 1.0 in order that the pair will be inferred to be homologous.
fn assert_paired_edits(minus_lines: Vec<&str>, plus_lines: Vec<&str>, expected_edits: Edits) {
assert_consistent_pairs(&expected_edits);
- assert_edits(minus_lines, plus_lines, expected_edits, DISTANCE_MAX);
+ assert_edits(minus_lines, plus_lines, expected_edits, 1.0);
}
fn assert_consistent_pairs(edits: &Edits) {
diff --git a/src/paint.rs b/src/paint.rs
index 4fcb593b..15771338 100644
--- a/src/paint.rs
+++ b/src/paint.rs
@@ -155,7 +155,7 @@ impl<'a> Painter<'a> {
config.minus_emph_style_modifier,
config.plus_style_modifier,
config.plus_emph_style_modifier,
- 0.66,
+ config.opt.max_line_distance,
)
}
}