Use grapheme units for all visible character calculations (0.0.4)

author: Dan Davison <dandavison7@gmail.com> 2019-07-19 17:45:09 -0400
committer: Dan Davison <dandavison7@gmail.com> 2019-07-20 18:10:27 -0400
commit: 2404597c29f1ca4125740891224216d00e75ac6f (patch)
tree: 09deb1306a8f7f9911f8af89004a4389d5aca40c
parent: 003d3c464888f603df63b727adab639b024a6ab3 (diff)
4 files changed, 47 insertions, 25 deletions
diff --git a/Cargo.toml b/Cargo.toml
index aa04d10d..a9ab9b3b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ console = "0.7.7"
 shell-words = "0.1.0"
 structopt = "0.2.16"
 syntect = "3.2"
+unicode-segmentation = "1.3.0"
 
 [dependencies.error-chain]
 version = "0.12"
diff --git a/src/delta.rs b/src/delta.rs
index 425f87b7..57777bfe 100644
--- a/src/delta.rs
+++ b/src/delta.rs
@@ -3,6 +3,7 @@ use std::io::Write;
 use ansi_term::Colour::{Blue, Yellow};
 use console::strip_ansi_codes;
 use syntect::easy::HighlightLines;
+use unicode_segmentation::UnicodeSegmentation;
 
 use crate::bat::assets::HighlightingAssets;
 use crate::cli;
@@ -249,9 +250,10 @@ fn prepare(_line: &str, config: &Config) -> String {
         line.push_str(" ");
         line.push_str(&_line[1..]);
     }
+    let line_length = line.graphemes(true).count();
     match config.width {
-        Some(width) if width > line.len() => {
-            format!("{}{}\n", line, " ".repeat(width - line.len()))
+        Some(width) if width > line_length => {
+            format!("{}{}\n", line, " ".repeat(width - line_length))
         }
         _ => format!("{}\n", line),
     }
diff --git a/src/draw.rs b/src/draw.rs
index 1018614f..7fe54e1d 100644
--- a/src/draw.rs
+++ b/src/draw.rs
@@ -3,6 +3,7 @@ use std::io::Write;
 use ansi_term::Style;
 use box_drawing;
 use console::strip_ansi_codes;
+use unicode_segmentation::UnicodeSegmentation;
 
 /// Write text to stream, surrounded by a box, leaving the cursor just
 /// beyond the bottom right corner.
@@ -18,7 +19,7 @@ pub fn write_boxed(
     } else {
         box_drawing::light::UP_LEFT
     };
-    let box_width = strip_ansi_codes(text).len() + 1;
+    let box_width = strip_ansi_codes(text).graphemes(true).count() + 1;
     write_boxed_partial(writer, text, box_width, line_style, heavy)?;
     write!(writer, "{}", line_style.paint(up_left))?;
     Ok(())
@@ -33,7 +34,7 @@ pub fn write_boxed_with_line(
     line_style: Style,
     heavy: bool,
 ) -> std::io::Result<()> {
-    let box_width = strip_ansi_codes(text).len() + 1;
+    let box_width = strip_ansi_codes(text).graphemes(true).count() + 1;
     write_boxed_with_horizontal_whisker(writer, text, box_width, line_style, heavy)?;
     write_horizontal_line(writer, line_width - box_width - 1, line_style, heavy)?;
     Ok(())
diff --git a/src/edits.rs b/src/edits.rs
index b3eb43fd..20893b4e 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -305,6 +305,7 @@ mod tests {
 
 mod string_pair {
     use std::iter::Peekable;
+    use unicode_segmentation::UnicodeSegmentation;
 
     /// A pair of right-trimmed strings.
     pub struct StringPair {
@@ -315,9 +316,10 @@ mod string_pair {
 
     impl StringPair {
         pub fn new(s0: &str, s1: &str) -> StringPair {
-            let common_prefix_length = StringPair::common_prefix_length(s0.chars(), s1.chars());
-            let (common_suffix_length, trailing_whitespace) =
-                StringPair::suffix_data(s0.chars(), s1.chars());
+            let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
+            let common_prefix_length = StringPair::common_prefix_length(g0, g1); // TODO: pass references
+            let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true));
+            let (common_suffix_length, trailing_whitespace) = StringPair::suffix_data(g0, g1);
             StringPair {
                 common_prefix_length,
                 common_suffix_length,
@@ -328,26 +330,36 @@ mod string_pair {
             }
         }
 
-        fn common_prefix_length<I>(s0: I, s1: I) -> usize
+        /// Align the two strings at their left ends and consider only
+        /// the bytes up to the length of the shorter string. Return
+        /// the byte offset of the first differing grapheme cluster,
+        /// or the byte length of the shorter string if they do not
+        /// differ.
+        fn common_prefix_length<'a, I>(s0: I, s1: I) -> usize
         where
-            I: Iterator,
-            I::Item: PartialEq,
+            I: Iterator<Item = (usize, &'a str)>,
         {
             let mut i = 0;
-            for (c0, c1) in s0.zip(s1) {
+            for ((_, c0), (_, c1)) in s0.zip(s1) {
                 if c0 != c1 {
                     break;
                 } else {
-                    i += 1;
+                    i += c0.len();
                 }
             }
             i
         }
 
-        /// Return common suffix length and number of trailing whitespace characters on each string.
-        fn suffix_data<I>(s0: I, s1: I) -> (usize, [usize; 2])
+        /// Trim trailing whitespace and align the two strings at
+        /// their right ends. Fix the origin at their right ends and,
+        /// looking left, consider only the bytes up to the length of
+        /// the shorter string. Return the byte offset of the first
+        /// differing grapheme cluster, or the byte length of the
+        /// shorter string if they do not differ. Also return the
+        /// number of bytes of whitespace trimmed from each string.
+        fn suffix_data<'a, I>(s0: I, s1: I) -> (usize, [usize; 2])
         where
-            I: DoubleEndedIterator<Item = char>,
+            I: DoubleEndedIterator<Item = (usize, &'a str)>,
         {
             let mut s0 = s0.rev().peekable();
             let mut s1 = s1.rev().peekable();
@@ -358,21 +370,24 @@ mod string_pair {
         }
 
         /// Consume leading whitespace; return number of characters consumed.
-        fn consume_whitespace<I>(s: &mut Peekable<I>) -> usize
+        fn consume_whitespace<'a, I>(s: &mut Peekable<I>) -> usize
         where
-            I: Iterator<Item = char>,
+            I: Iterator<Item = (usize, &'a str)>,
         {
-            let mut i = 0;
+            let mut n = 0;
             loop {
                 match s.peek() {
-                    Some('\n') | Some(' ') => {
+                    // TODO: Use a whitespace unicode character class?
+                    // Allow for whitespace grapheme clusters > 1
+                    // byte?
+                    Some(&(_, "\n")) | Some(&(_, " ")) => {
                         s.next();
-                        i += 1;
+                        n += 1;
                     }
                     _ => break,
                 }
             }
-            i
+            n
         }
     }
 
@@ -440,13 +455,16 @@ mod string_pair {
             assert_eq!(common_suffix_length("  ", "á"), 0);
             assert_eq!(common_suffix_length("á  ", ""), 0);
             assert_eq!(common_suffix_length("á", "b  "), 0);
-            assert_eq!(common_suffix_length("á", "á  "), 1);
+            assert_eq!(common_suffix_length("á", "á  "), "á".len());
             assert_eq!(common_suffix_length("a  ", "áb  "), 0);
             assert_eq!(common_suffix_length("ab", "á  "), 0);
             assert_eq!(common_suffix_length("áb  ", "b "), 1);
-            assert_eq!(common_suffix_length("áb ", "aáb  "), 2);
-            assert_eq!(common_suffix_length("abá ", "bá"), 2);
-            assert_eq!(common_suffix_length("áaáabá ", "ááabá   "), 4);
+            assert_eq!(common_suffix_length("áb ", "aáb  "), 1 + "á".len());
+            assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len());
+            assert_eq!(
+                common_suffix_length("áaáabá ", "ááabá   "),
+                2 + 2 * "á".len()
+            );
         }
     }
 }
author	Dan Davison <dandavison7@gmail.com>	2019-07-19 17:45:09 -0400
committer	Dan Davison <dandavison7@gmail.com>	2019-07-20 18:10:27 -0400
commit	2404597c29f1ca4125740891224216d00e75ac6f (patch)
tree	09deb1306a8f7f9911f8af89004a4389d5aca40c
parent	003d3c464888f603df63b727adab639b024a6ab3 (diff)