From 2404597c29f1ca4125740891224216d00e75ac6f Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Fri, 19 Jul 2019 17:45:09 -0400 Subject: Use grapheme units for all visible character calculations --- src/edits.rs | 60 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 21 deletions(-) (limited to 'src/edits.rs') diff --git a/src/edits.rs b/src/edits.rs index b3eb43fd..20893b4e 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -305,6 +305,7 @@ mod tests { mod string_pair { use std::iter::Peekable; + use unicode_segmentation::UnicodeSegmentation; /// A pair of right-trimmed strings. pub struct StringPair { @@ -315,9 +316,10 @@ mod string_pair { impl StringPair { pub fn new(s0: &str, s1: &str) -> StringPair { - let common_prefix_length = StringPair::common_prefix_length(s0.chars(), s1.chars()); - let (common_suffix_length, trailing_whitespace) = - StringPair::suffix_data(s0.chars(), s1.chars()); + let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true)); + let common_prefix_length = StringPair::common_prefix_length(g0, g1); // TODO: pass references + let (g0, g1) = (s0.grapheme_indices(true), s1.grapheme_indices(true)); + let (common_suffix_length, trailing_whitespace) = StringPair::suffix_data(g0, g1); StringPair { common_prefix_length, common_suffix_length, @@ -328,26 +330,36 @@ mod string_pair { } } - fn common_prefix_length(s0: I, s1: I) -> usize + /// Align the two strings at their left ends and consider only + /// the bytes up to the length of the shorter string. Return + /// the byte offset of the first differing grapheme cluster, + /// or the byte length of shorter string if they do not + /// differ. + fn common_prefix_length<'a, I>(s0: I, s1: I) -> usize where - I: Iterator, - I::Item: PartialEq, + I: Iterator, { let mut i = 0; - for (c0, c1) in s0.zip(s1) { + for ((_, c0), (_, c1)) in s0.zip(s1) { if c0 != c1 { break; } else { - i += 1; + i += c0.len(); } } i } - /// Return common suffix length and number of trailing whitespace characters on each string. - fn suffix_data(s0: I, s1: I) -> (usize, [usize; 2]) + /// Trim trailing whitespace and align the two strings at + /// their right ends. Fix the origin at their right ends and, + /// looking left, consider only the bytes up to the length of + /// the shorter string. Return the byte offset of the first + /// differing grapheme cluster, or the byte length of the + /// shorter string if they do not differ. Also return the + /// number of bytes of whitespace trimmed from each string. + fn suffix_data<'a, I>(s0: I, s1: I) -> (usize, [usize; 2]) where - I: DoubleEndedIterator, + I: DoubleEndedIterator, { let mut s0 = s0.rev().peekable(); let mut s1 = s1.rev().peekable(); @@ -358,21 +370,24 @@ mod string_pair { } /// Consume leading whitespace; return number of characters consumed. - fn consume_whitespace(s: &mut Peekable) -> usize + fn consume_whitespace<'a, I>(s: &mut Peekable) -> usize where - I: Iterator, + I: Iterator, { - let mut i = 0; + let mut n = 0; loop { match s.peek() { - Some('\n') | Some(' ') => { + // TODO: Use a whitespace unicode character class? + // Allow for whitespace grapheme clusters > 1 + // byte? + Some(&(_, "\n")) | Some(&(_, " ")) => { s.next(); - i += 1; + n += 1; } _ => break, } } - i + n } } @@ -440,13 +455,16 @@ mod string_pair { assert_eq!(common_suffix_length(" ", "á"), 0); assert_eq!(common_suffix_length("á ", ""), 0); assert_eq!(common_suffix_length("á", "b "), 0); - assert_eq!(common_suffix_length("á", "á "), 1); + assert_eq!(common_suffix_length("á", "á "), "á".len()); assert_eq!(common_suffix_length("a ", "áb "), 0); assert_eq!(common_suffix_length("ab", "á "), 0); assert_eq!(common_suffix_length("áb ", "b "), 1); - assert_eq!(common_suffix_length("áb ", "aáb "), 2); - assert_eq!(common_suffix_length("abá ", "bá"), 2); - assert_eq!(common_suffix_length("áaáabá ", "ááabá "), 4); + assert_eq!(common_suffix_length("áb ", "aáb "), 1 + "á".len()); + assert_eq!(common_suffix_length("abá ", "bá"), 1 + "á".len()); + assert_eq!( + common_suffix_length("áaáabá ", "ááabá "), + 2 + 2 * "á".len() + ); } } } -- cgit v1.2.3