diff options
author | Dan Davison <dandavison7@gmail.com> | 2020-06-11 20:28:53 -0400 |
---|---|---|
committer | Dan Davison <dandavison7@gmail.com> | 2020-06-11 20:43:51 -0400 |
commit | 97116f284bd826cd0c2805ed3b0f4359b310ad6a (patch) | |
tree | 1905558d19979de62b6b5b45b7a2ba2825cbb2f1 | |
parent | 37a6c55d11b3ffe8f9b88ac2c772f82d6b8f0efa (diff) |
New option --word-diff-regex
Fixes #184
-rw-r--r-- | src/cli.rs | 6 |
-rw-r--r-- | src/config.rs | 13 |
-rw-r--r-- | src/edits.rs | 23 |
-rw-r--r-- | src/paint.rs | 1 |
-rw-r--r-- | src/rewrite.rs | 1 |
5 files changed, 35 insertions, 9 deletions
```diff
@@ -363,6 +363,12 @@ pub struct Opt {
     #[structopt(long = "list-syntax-themes")]
     pub list_syntax_themes: bool,
 
+    /// The regular expression used to decide what a word is for the within-line highlight
+    /// algorithm. For less fine-grained matching than the default try --word-diff-regex="\S+"
+    /// --max-line-distance=1.0 (this is more similar to `git --word-diff`).
+    #[structopt(long = "word-diff-regex", default_value = r"\w+")]
+    pub tokenization_regex: String,
+
     /// The maximum distance between two lines for them to be inferred to be homologous. Homologous
     /// line pairs are highlighted according to the deletion and insertion operations transforming
     /// one into the other.
diff --git a/src/config.rs b/src/config.rs
index f8513281..d10f0dcd 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -4,6 +4,7 @@ use std::process;
 
 use console::Term;
 use git2;
+use regex::Regex;
 use structopt::{clap, StructOpt};
 use syntect::highlighting::Style as SyntectStyle;
 use syntect::highlighting::Theme as SyntaxTheme;
@@ -68,6 +69,7 @@ pub struct Config<'a> {
     pub syntax_theme_name: String,
     pub tab_width: usize,
     pub true_color: bool,
+    pub tokenization_regex: Regex,
     pub zero_style: Style,
 }
 
@@ -248,6 +250,16 @@ impl<'a> From<cli::Opt> for Config<'a> {
             .map(|s| s.parse::<f64>().unwrap_or(0.0))
             .unwrap_or(0.0);
 
+        let tokenization_regex = Regex::new(&opt.tokenization_regex).unwrap_or_else(|_| {
+            eprintln!(
+                "Invalid word-diff-regex: {}. \
+                 The value must be a valid Rust regular expression. \
+                 See https://docs.rs/regex.",
+                opt.tokenization_regex
+            );
+            process::exit(1);
+        });
+
         Self {
             background_color_extends_to_terminal_width,
             commit_style,
@@ -291,6 +303,7 @@ impl<'a> From<cli::Opt> for Config<'a> {
             syntax_theme,
             syntax_theme_name,
             tab_width: opt.tab_width,
+            tokenization_regex,
             true_color,
             zero_style,
         }
diff --git a/src/edits.rs b/src/edits.rs
index 3707d195..d8f18377 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,5 @@
 use regex::Regex;
 
-use lazy_static::lazy_static;
 use unicode_segmentation::UnicodeSegmentation;
 use unicode_width::UnicodeWidthStr;
 
@@ -17,6 +16,7 @@ pub fn infer_edits<'a, EditOperation>(
     deletion: EditOperation,
     noop_insertion: EditOperation,
     insertion: EditOperation,
+    tokenization_regex: &Regex,
     max_line_distance: f64,
     max_line_distance_for_naively_paired_lines: f64,
 ) -> (
@@ -35,7 +35,10 @@ where
     'minus_lines_loop: for minus_line in minus_lines {
         let mut considered = 0; // plus lines considered so far as match for minus_line
         for plus_line in &plus_lines[emitted..] {
-            let alignment = align::Alignment::new(tokenize(minus_line), tokenize(plus_line));
+            let alignment = align::Alignment::new(
+                tokenize(minus_line, tokenization_regex),
+                tokenize(plus_line, tokenization_regex),
+            );
             let (annotated_minus_line, annotated_plus_line, distance) = annotate(
                 alignment,
                 noop_deletion,
@@ -77,16 +80,12 @@ where
     (annotated_minus_lines, annotated_plus_lines)
 }
 
-lazy_static! {
-    static ref TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap();
-}
-
 /// Split line into tokens for alignment. The alignment algorithm aligns sequences of substrings;
 /// not individual characters.
-fn tokenize(line: &str) -> Vec<&str> {
+fn tokenize<'a>(line: &'a str, regex: &Regex) -> Vec<&'a str> {
     let mut tokens = Vec::new();
     let mut offset = 0;
-    for m in TOKENIZATION_REGEXP.find_iter(line) {
+    for m in regex.find_iter(line) {
         if offset == 0 && m.start() > 0 {
             tokens.push("");
         }
@@ -236,8 +235,13 @@ where
 mod tests {
     use super::*;
     use itertools::Itertools;
+    use lazy_static::lazy_static;
     use unicode_segmentation::UnicodeSegmentation;
 
+    lazy_static! {
+        static ref DEFAULT_TOKENIZATION_REGEXP: Regex = Regex::new(r#"\w+"#).unwrap();
+    }
+
     #[derive(Clone, Copy, Debug, PartialEq)]
     enum EditOperation {
         MinusNoop,
@@ -433,7 +437,7 @@ mod tests {
     }
 
     fn assert_tokenize(text: &str, expected_tokens: &[&str]) {
-        let actual_tokens = tokenize(text);
+        let actual_tokens = tokenize(text, &*DEFAULT_TOKENIZATION_REGEXP);
         assert_eq!(text, expected_tokens.iter().join(""));
         assert_eq!(actual_tokens, expected_tokens);
     }
@@ -712,6 +716,7 @@ mod tests {
             Deletion,
             PlusNoop,
             Insertion,
+            &*DEFAULT_TOKENIZATION_REGEXP,
             max_line_distance,
             0.0,
         );
diff --git a/src/paint.rs b/src/paint.rs
index 2a2481f6..9ed065c4 100644
--- a/src/paint.rs
+++ b/src/paint.rs
@@ -292,6 +292,7 @@ impl<'a> Painter<'a> {
             config.minus_emph_style,
             config.plus_style,
             config.plus_emph_style,
+            &config.tokenization_regex,
             config.max_line_distance,
             config.max_line_distance_for_naively_paired_lines,
         );
diff --git a/src/rewrite.rs b/src/rewrite.rs
index 9bdcd579..685a2b0a 100644
--- a/src/rewrite.rs
+++ b/src/rewrite.rs
@@ -94,6 +94,7 @@ fn rewrite_options_to_honor_git_config(
             ("plus-non-emph-style", plus_non_emph_style),
             ("plus-style", plus_style),
             ("true-color", true_color),
+            ("word-diff-regex", tokenization_regex),
             ("zero-style", zero_style)
         ],
         opt,
```