summaryrefslogtreecommitdiffstats
path: root/src/edits.rs
diff options
context:
space:
mode:
authorDan Davison <dandavison7@gmail.com>2019-08-04 00:05:19 -0700
committerDan Davison <dandavison7@gmail.com>2019-08-06 23:05:52 -0700
commit1bb19ef8ba6af230df1148c86ccec240a03f8f89 (patch)
treea5d9af5754186592bb5643ae7eface53bd1a7808 /src/edits.rs
parent41c43314da3cee8bbaa98c1deff0e7c45646b28d (diff)
Align tokenized string instead of characters/graphemes
Thanks @clnoll
Diffstat (limited to 'src/edits.rs')
-rw-r--r--src/edits.rs89
1 files changed, 76 insertions, 13 deletions
diff --git a/src/edits.rs b/src/edits.rs
index 95c7191e..992b3e66 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,4 +1,4 @@
-use unicode_segmentation::UnicodeSegmentation;
+use regex::Regex;
use crate::align;
@@ -32,14 +32,8 @@ where
let minus_line = minus_line.trim_end();
for plus_line in &plus_lines[emitted..] {
let plus_line = plus_line.trim_end();
- let alignment = align::Alignment::new(
- minus_line
- .grapheme_indices(true)
- .collect::<Vec<(usize, &str)>>(),
- plus_line
- .grapheme_indices(true)
- .collect::<Vec<(usize, &str)>>(),
- );
+
+ let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line));
if alignment.normalized_edit_distance() < distance_threshold {
// minus_line and plus_line are inferred to be a homologous pair.
@@ -86,6 +80,14 @@ where
(annotated_minus_lines, annotated_plus_lines)
}
+/// Split `line` into the tokens that are fed to the alignment.
+///
+/// Each token is a (possibly empty) run of non-space characters together with
+/// the run of spaces that follows it, or with the end of the line. Only the
+/// space character `' '` separates tokens; other whitespace stays inside a token.
+///
+/// Returns one `(start, text)` pair per regex match, where `start` is the
+/// offset in `line` at which the match begins and `text` is the matched slice.
+/// For example, `"aaa bbb"` yields `[(0, "aaa "), (4, "bbb")]`.
+fn tokenize(line: &str) -> Vec<(usize, &str)> {
+ // NOTE(review): the pattern is compiled on every call, and the caller invokes
+ // this twice per minus/plus line pair inside a loop; consider compiling once.
+ // The pattern is a fixed literal, so `unwrap` can only fail if the literal is invalid.
+ let regex = Regex::new("[^ ]*( +|$)").unwrap();
+ regex
+ .find_iter(line)
+ .map(|m| (m.start(), m.as_str()))
+ .collect()
+}
+
pub fn coalesce_minus_edits<'a, EditOperation>(
alignment: &align::Alignment<'a>,
line: &'a str,
@@ -180,6 +182,38 @@ mod tests {
const DISTANCE_MAX: f64 = 2.0;
#[test]
+ fn test_tokenize_1() {
+ // Two words separated by one space: the space stays attached to the first
+ // token, and the second token starts at offset 4.
+ assert_eq!(tokenize("aaa bbb"), vec![(0, "aaa "), (4, "bbb")])
+ }
+
+ // Punctuation is not a separator: `coalesce_edits<'a, ` is a single token
+ // because only spaces end a token.
+ #[test]
+ fn test_tokenize_2() {
+ assert_eq!(
+ tokenize("fn coalesce_edits<'a, EditOperation>("),
+ vec![
+ (0, "fn "),
+ (3, "coalesce_edits<'a, "),
+ (22, "EditOperation>(")
+ ]
+ );
+ }
+
+ // Same line as test_tokenize_2 with an extra `'b, ` parameter: it appears as
+ // its own token at offset 22 and shifts the final token to offset 26.
+ #[test]
+ fn test_tokenize_3() {
+ assert_eq!(
+ tokenize("fn coalesce_edits<'a, 'b, EditOperation>("),
+ vec![
+ (0, "fn "),
+ (3, "coalesce_edits<'a, "),
+ (22, "'b, "),
+ (26, "EditOperation>(")
+ ]
+ );
+ }
+
+ // vec!["fn coalesce_edits<'a, 'b, EditOperation>("],
+
+ #[test]
fn test_coalesce_edits_1() {
assert_eq!(
coalesce_edits(
@@ -197,8 +231,20 @@ mod tests {
vec!["aaa\n"],
vec!["aba\n"],
(
- vec![vec![(MinusNoop, "a"), (Deletion, "a"), (MinusNoop, "a")]],
- vec![vec![(PlusNoop, "a"), (Insertion, "b"), (PlusNoop, "a")]],
+ vec![vec![(Deletion, "aaa")]],
+ vec![vec![(Insertion, "aba")]],
+ ),
+ )
+ }
+
+ // The lines differ only inside their first token, so that whole token
+ // (including its trailing space) is expected as a Deletion/Insertion, while
+ // the shared `ccc` token is expected as a no-op. The expected segments do not
+ // include the trailing newline of the inputs.
+ #[test]
+ fn test_infer_edits_1_2() {
+ assert_paired_edits(
+ vec!["aaa ccc\n"],
+ vec!["aba ccc\n"],
+ (
+ vec![vec![(Deletion, "aaa "), (MinusNoop, "ccc")]],
+ vec![vec![(Insertion, "aba "), (PlusNoop, "ccc")]],
),
)
}
@@ -209,8 +255,8 @@ mod tests {
vec!["áaa\n"],
vec!["ááb\n"],
(
- vec![vec![(MinusNoop, "á"), (Deletion, "aa")]],
- vec![vec![(PlusNoop, "á"), (Insertion, "áb")]],
+ vec![vec![(Deletion, "áaa")]],
+ vec![vec![(Insertion, "ááb")]],
),
)
}
@@ -297,6 +343,23 @@ mod tests {
], 0.66)
}
+ // The plus line adds a whole token (`'b, `) to the minus line. Expected
+ // result: the minus line is a single no-op segment, and the plus line marks
+ // only the added token as an Insertion between two no-op segments.
+ #[test]
+ fn test_infer_edits_7() {
+ assert_edits(
+ vec!["fn coalesce_edits<'a, EditOperation>("],
+ vec!["fn coalesce_edits<'a, 'b, EditOperation>("],
+ (
+ vec![vec![(MinusNoop, "fn coalesce_edits<'a, EditOperation>(")]],
+ vec![vec![
+ (PlusNoop, "fn coalesce_edits<'a, "),
+ (Insertion, "'b, "),
+ (PlusNoop, "EditOperation>("),
+ ]],
+ ),
+ 0.66,
+ )
+ }
+
fn assert_edits(
minus_lines: Vec<&str>,
plus_lines: Vec<&str>,