author    Dan Davison <dandavison7@gmail.com>  2020-06-11 20:28:53 -0400
committer Dan Davison <dandavison7@gmail.com>  2020-06-11 20:43:51 -0400
commit    97116f284bd826cd0c2805ed3b0f4359b310ad6a
tree      1905558d19979de62b6b5b45b7a2ba2825cbb2f1 /src/edits.rs
parent    37a6c55d11b3ffe8f9b88ac2c772f82d6b8f0efa
New option --word-diff-regex
Fixes #184
Diffstat (limited to 'src/edits.rs')
-rw-r--r--  src/edits.rs  23
1 file changed, 14 insertions(+), 9 deletions(-)
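The substance of the change: the hard-coded \w+ tokenization pattern stops being a module-level lazy_static and becomes a &Regex parameter, threaded from the new --word-diff-regex option down to tokenize(). A quick illustration of why the pattern matters for within-line highlighting; the custom pattern here is an assumption chosen for contrast, not a delta default:

use regex::Regex;

fn main() {
    // Default pattern: runs of word characters.
    let default = Regex::new(r"\w+").unwrap();
    // A pattern a user might pass via --word-diff-regex to keep
    // dotted identifiers together (illustrative only).
    let custom = Regex::new(r"[\w.]+").unwrap();

    let line = "foo.bar = 1";
    let words = |re: &Regex| -> Vec<&str> { re.find_iter(line).map(|m| m.as_str()).collect() };
    assert_eq!(words(&default), vec!["foo", "bar", "1"]);
    assert_eq!(words(&custom), vec!["foo.bar", "1"]);
}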
diff --git a/src/edits.rs b/src/edits.rs
index 3707d195..d8f18377 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,5 @@
 use regex::Regex;
-use lazy_static::lazy_static;
 use unicode_segmentation::UnicodeSegmentation;
 use unicode_width::UnicodeWidthStr;
@@ -17,6 +16,7 @@ pub fn infer_edits<'a, EditOperation>(
     deletion: EditOperation,
     noop_insertion: EditOperation,
     insertion: EditOperation,
+    tokenization_regex: &Regex,
     max_line_distance: f64,
     max_line_distance_for_naively_paired_lines: f64,
 ) -> (
@@ -35,7 +35,10 @@ where
     'minus_lines_loop: for minus_line in minus_lines {
         let mut considered = 0; // plus lines considered so far as match for minus_line
         for plus_line in &plus_lines[emitted..] {
-            let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line));
+            let alignment = align::Alignment::new(
+                tokenize(minus_line, tokenization_regex),
+                tokenize(plus_line, tokenization_regex),
+            );
             let (annotated_minus_line, annotated_plus_line, distance) = annotate(
                 alignment,
                 noop_deletion,
@@ -77,16 +80,12 @@ where
     (annotated_minus_lines, annotated_plus_lines)
 }
 
-lazy_static! {
-    static ref TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap();
-}
-
 /// Split line into tokens for alignment. The alignment algorithm aligns sequences of substrings;
 /// not individual characters.
-fn tokenize(line: &str) -> Vec<&str> {
+fn tokenize<'a>(line: &'a str, regex: &Regex) -> Vec<&'a str> {
     let mut tokens = Vec::new();
     let mut offset = 0;
-    for m in TOKENIZATION_REGEXP.find_iter(line) {
+    for m in regex.find_iter(line) {
         if offset == 0 && m.start() > 0 {
             tokens.push("");
         }
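Only the top of tokenize's loop appears in this hunk. For orientation, here is a self-contained sketch of the regex-driven tokenization approach: split the line into the regex's matches plus the text between them, so the tokens concatenate back to the original line. The loop tail is elided above, so its handling of inter-match and trailing text here is an assumption about shape, not a copy of delta's code:

use regex::Regex;

// Sketch only: mirrors the visible part of tokenize() above.
fn tokenize_sketch<'a>(line: &'a str, regex: &Regex) -> Vec<&'a str> {
    let mut tokens = Vec::new();
    let mut offset = 0;
    for m in regex.find_iter(line) {
        if offset == 0 && m.start() > 0 {
            tokens.push(""); // leading sentinel, as in the hunk above
        }
        if m.start() > offset {
            tokens.push(&line[offset..m.start()]); // text between matches
        }
        tokens.push(m.as_str()); // the match itself
        offset = m.end();
    }
    if offset < line.len() {
        tokens.push(&line[offset..]); // trailing text after the last match
    }
    tokens
}

fn main() {
    let re = Regex::new(r"\w+").unwrap();
    let tokens = tokenize_sketch("fn tokenize(line)", &re);
    assert_eq!(tokens, vec!["fn", " ", "tokenize", "(", "line", ")"]);
    assert_eq!(tokens.concat(), "fn tokenize(line)");
}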
@@ -236,8 +235,13 @@ where
 mod tests {
     use super::*;
     use itertools::Itertools;
+    use lazy_static::lazy_static;
     use unicode_segmentation::UnicodeSegmentation;
 
+    lazy_static! {
+        static ref DEFAULT_TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap();
+    }
+
     #[derive(Clone, Copy, Debug, PartialEq)]
     enum EditOperation {
         MinusNoop,
@@ -433,7 +437,7 @@ mod tests {
     }
 
     fn assert_tokenize(text: &str, expected_tokens: &[&str]) {
-        let actual_tokens = tokenize(text);
+        let actual_tokens = tokenize(text, &*DEFAULT_TOKENIZATION_REGEXP);
         assert_eq!(text, expected_tokens.iter().join(""));
         assert_eq!(actual_tokens, expected_tokens);
     }
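A note on the &*DEFAULT_TOKENIZATION_REGEXP form used in the tests: lazy_static! wraps the static in a generated type that implements Deref<Target = Regex>, so &* explicitly dereferences to Regex and reborrows, matching the &Regex parameter added to tokenize. Minimal sketch:

use lazy_static::lazy_static;
use regex::Regex;

lazy_static! {
    // Compiled once, on first access.
    static ref WORD: Regex = Regex::new(r"\w+").unwrap();
}

fn takes_regex(re: &Regex) -> bool {
    re.is_match("word")
}

fn main() {
    // `WORD` has a generated wrapper type that derefs to `Regex`;
    // `&*WORD` turns it into the plain `&Regex` the parameter expects.
    assert!(takes_regex(&*WORD));
}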
@@ -712,6 +716,7 @@ mod tests {
             Deletion,
             PlusNoop,
             Insertion,
+            &*DEFAULT_TOKENIZATION_REGEXP,
             max_line_distance,
             0.0,
         );
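Not shown in this file: wherever the --word-diff-regex value is read, it has to be compiled into the Regex that gets passed down to infer_edits. A hypothetical sketch of that step; the helper name and fallback behavior are assumptions, not delta's actual option handling:

use regex::Regex;

// Hypothetical helper, not from this commit: compile the user-supplied
// --word-diff-regex value, falling back to the default word pattern if
// the expression fails to parse.
fn compile_tokenization_regex(pattern: &str) -> Regex {
    Regex::new(pattern).unwrap_or_else(|err| {
        eprintln!("invalid --word-diff-regex {:?}: {}; falling back to \\w+", pattern, err);
        Regex::new(r"\w+").unwrap()
    })
}

fn main() {
    let regex = compile_tokenization_regex(r"[A-Za-z]+|[0-9]+");
    assert!(regex.is_match("edits184"));
}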