summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Davison <dandavison7@gmail.com>2020-06-11 20:28:53 -0400
committerDan Davison <dandavison7@gmail.com>2020-06-11 20:43:51 -0400
commit97116f284bd826cd0c2805ed3b0f4359b310ad6a (patch)
tree1905558d19979de62b6b5b45b7a2ba2825cbb2f1
parent37a6c55d11b3ffe8f9b88ac2c772f82d6b8f0efa (diff)
New option --word-diff-regex
Fixes #184
-rw-r--r--src/cli.rs6
-rw-r--r--src/config.rs13
-rw-r--r--src/edits.rs23
-rw-r--r--src/paint.rs1
-rw-r--r--src/rewrite.rs1
5 files changed, 35 insertions, 9 deletions
diff --git a/src/cli.rs b/src/cli.rs
index 29e0371c..9780e98b 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -363,6 +363,12 @@ pub struct Opt {
#[structopt(long = "list-syntax-themes")]
pub list_syntax_themes: bool,
+ /// The regular expression used to decide what a word is for the within-line highlight
+ /// algorithm. For less fine-grained matching than the default try --word-diff-regex="\S+"
+ /// --max-line-distance=1.0 (this is more similar to `git diff --word-diff`).
+ #[structopt(long = "word-diff-regex", default_value = r"\w+")]
+ pub tokenization_regex: String,
+
/// The maximum distance between two lines for them to be inferred to be homologous. Homologous
/// line pairs are highlighted according to the deletion and insertion operations transforming
/// one into the other.
diff --git a/src/config.rs b/src/config.rs
index f8513281..d10f0dcd 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -4,6 +4,7 @@ use std::process;
use console::Term;
use git2;
+use regex::Regex;
use structopt::{clap, StructOpt};
use syntect::highlighting::Style as SyntectStyle;
use syntect::highlighting::Theme as SyntaxTheme;
@@ -68,6 +69,7 @@ pub struct Config<'a> {
pub syntax_theme_name: String,
pub tab_width: usize,
pub true_color: bool,
+ pub tokenization_regex: Regex,
pub zero_style: Style,
}
@@ -248,6 +250,16 @@ impl<'a> From<cli::Opt> for Config<'a> {
.map(|s| s.parse::<f64>().unwrap_or(0.0))
.unwrap_or(0.0);
+ let tokenization_regex = Regex::new(&opt.tokenization_regex).unwrap_or_else(|_| {
+ eprintln!(
+ "Invalid word-diff-regex: {}. \
+ The value must be a valid Rust regular expression. \
+ See https://docs.rs/regex.",
+ opt.tokenization_regex
+ );
+ process::exit(1);
+ });
+
Self {
background_color_extends_to_terminal_width,
commit_style,
@@ -291,6 +303,7 @@ impl<'a> From<cli::Opt> for Config<'a> {
syntax_theme,
syntax_theme_name,
tab_width: opt.tab_width,
+ tokenization_regex,
true_color,
zero_style,
}
diff --git a/src/edits.rs b/src/edits.rs
index 3707d195..d8f18377 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,5 @@
use regex::Regex;
-use lazy_static::lazy_static;
use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthStr;
@@ -17,6 +16,7 @@ pub fn infer_edits<'a, EditOperation>(
deletion: EditOperation,
noop_insertion: EditOperation,
insertion: EditOperation,
+ tokenization_regex: &Regex,
max_line_distance: f64,
max_line_distance_for_naively_paired_lines: f64,
) -> (
@@ -35,7 +35,10 @@ where
'minus_lines_loop: for minus_line in minus_lines {
let mut considered = 0; // plus lines considered so far as match for minus_line
for plus_line in &plus_lines[emitted..] {
- let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line));
+ let alignment = align::Alignment::new(
+ tokenize(minus_line, tokenization_regex),
+ tokenize(plus_line, tokenization_regex),
+ );
let (annotated_minus_line, annotated_plus_line, distance) = annotate(
alignment,
noop_deletion,
@@ -77,16 +80,12 @@ where
(annotated_minus_lines, annotated_plus_lines)
}
-lazy_static! {
- static ref TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap();
-}
-
/// Split line into tokens for alignment. The alignment algorithm aligns sequences of substrings;
/// not individual characters.
-fn tokenize(line: &str) -> Vec<&str> {
+fn tokenize<'a>(line: &'a str, regex: &Regex) -> Vec<&'a str> {
let mut tokens = Vec::new();
let mut offset = 0;
- for m in TOKENIZATION_REGEXP.find_iter(line) {
+ for m in regex.find_iter(line) {
if offset == 0 && m.start() > 0 {
tokens.push("");
}
@@ -236,8 +235,13 @@ where
mod tests {
use super::*;
use itertools::Itertools;
+ use lazy_static::lazy_static;
use unicode_segmentation::UnicodeSegmentation;
+ lazy_static! {
+ static ref DEFAULT_TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap();
+ }
+
#[derive(Clone, Copy, Debug, PartialEq)]
enum EditOperation {
MinusNoop,
@@ -433,7 +437,7 @@ mod tests {
}
fn assert_tokenize(text: &str, expected_tokens: &[&str]) {
- let actual_tokens = tokenize(text);
+ let actual_tokens = tokenize(text, &*DEFAULT_TOKENIZATION_REGEXP);
assert_eq!(text, expected_tokens.iter().join(""));
assert_eq!(actual_tokens, expected_tokens);
}
@@ -712,6 +716,7 @@ mod tests {
Deletion,
PlusNoop,
Insertion,
+ &*DEFAULT_TOKENIZATION_REGEXP,
max_line_distance,
0.0,
);
diff --git a/src/paint.rs b/src/paint.rs
index 2a2481f6..9ed065c4 100644
--- a/src/paint.rs
+++ b/src/paint.rs
@@ -292,6 +292,7 @@ impl<'a> Painter<'a> {
config.minus_emph_style,
config.plus_style,
config.plus_emph_style,
+ &config.tokenization_regex,
config.max_line_distance,
config.max_line_distance_for_naively_paired_lines,
);
diff --git a/src/rewrite.rs b/src/rewrite.rs
index 9bdcd579..685a2b0a 100644
--- a/src/rewrite.rs
+++ b/src/rewrite.rs
@@ -94,6 +94,7 @@ fn rewrite_options_to_honor_git_config(
("plus-non-emph-style", plus_non_emph_style),
("plus-style", plus_style),
("true-color", true_color),
+ ("word-diff-regex", tokenization_regex),
("zero-style", zero_style)
],
opt,