diff options
author | Clement Tsang <34804052+ClementTsang@users.noreply.github.com> | 2023-11-27 08:30:43 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-27 03:30:43 -0500 |
commit | eab8736dfdbdcdcbc87ebfcffba756e916129539 (patch) | |
tree | a2e548a28cde3aec4efee03c1aa030b96d7754f9 | |
parent | 94e4573ebcfb25728a0265f1907e203263b889a1 (diff) |
refactor: iteratively detect ASCII and build string when truncating (#1334)
* refactor: iteratively detect ASCII and build string when truncating
* more tests
* test going by usize
* Revert "test going by usize"
This reverts commit 4fe71260e70696a0bb5907c97a2ca049fc23214c.
-rw-r--r-- | src/utils/gen_util.rs | 182 |
1 files changed, 135 insertions, 47 deletions
diff --git a/src/utils/gen_util.rs b/src/utils/gen_util.rs index e423b6b2..4b6f8e4e 100644 --- a/src/utils/gen_util.rs +++ b/src/utils/gen_util.rs @@ -1,4 +1,4 @@ -use std::cmp::Ordering; +use std::{cmp::Ordering, num::NonZeroUsize}; use tui::text::{Line, Span, Text}; use unicode_segmentation::UnicodeSegmentation; @@ -59,6 +59,8 @@ pub fn get_decimal_prefix(quantity: u64, unit: &str) -> (f64, String) { } /// Truncates text if it is too long, and adds an ellipsis at the end if needed. +/// +/// TODO: Maybe cache results from this function for some cases? e.g. columns pub fn truncate_to_text<'a, U: Into<usize>>(content: &str, width: U) -> Text<'a> { Text { lines: vec![Line::from(vec![Span::raw(truncate_str(content, width))])], @@ -93,10 +95,66 @@ fn grapheme_width(g: &str) -> usize { } } -/// Truncates a string with an ellipsis character. +enum AsciiIterationResult { + Complete, + Remaining(usize), +} + +/// Greedily add characters to the output until a non-ASCII grapheme is found, or +/// the output is `width` long. +#[inline] +fn greedy_ascii_add(content: &str, width: NonZeroUsize) -> (String, AsciiIterationResult) { + let width: usize = width.into(); + + let mut text = Vec::with_capacity(width); + + let s = content.as_bytes(); + + let mut current_index = 0; + + while current_index < width - 1 { + let current_byte = s[current_index]; + if current_byte.is_ascii() { + text.push(current_byte); + current_index += 1; + } else { + debug_assert!(text.is_ascii()); + + let current_index = AsciiIterationResult::Remaining(current_index); + + // SAFETY: This conversion is safe to do unchecked, we only push ASCII characters up to + // this point. + let current_text = unsafe { String::from_utf8_unchecked(text) }; + + return (current_text, current_index); + } + } + + // If we made it all the way through, then we probably hit the width limit. + debug_assert!(text.is_ascii()); + + let current_index = if s[current_index].is_ascii() { + let mut ellipsis = [0; 3]; + '…'.encode_utf8(&mut ellipsis); + text.extend_from_slice(&ellipsis); + AsciiIterationResult::Complete + } else { + AsciiIterationResult::Remaining(current_index) + }; + + // SAFETY: This conversion is safe to do unchecked, we only push ASCII characters up to + // this point. + let current_text = unsafe { String::from_utf8_unchecked(text) }; + + (current_text, current_index) +} + +/// Truncates a string to the specified width with an ellipsis character. /// /// NB: This probably does not handle EVERY case, but I think it handles most cases /// we will use this function for fine... hopefully. +/// +/// TODO: Maybe fuzz this function? #[inline] fn truncate_str<U: Into<usize>>(content: &str, width: U) -> String { let width = width.into(); @@ -106,54 +164,54 @@ fn truncate_str<U: Into<usize>>(content: &str, width: U) -> String { // need to copy the entire string over. content.to_owned() - } else if width > 0 { - if content.is_ascii() { - // If the entire string is ASCII, we can use a much simpler approach - // in regards to what we truncate. - - let mut text = String::with_capacity(width); - let (keep, _throw) = content.split_at(width - 1); - text.push_str(keep); - text.push('…'); - - text - } else { - // Otherwise iterate by grapheme and greedily fit as many graphemes - // as width will allow. - - let mut text = String::with_capacity(width); - let mut curr_width = 0; - let mut early_break = false; - - // This tracks the length of the last added string - note this does NOT match the grapheme *width*. - let mut last_added_str_len = 0; - - // Cases to handle: - // - Completes adding the entire string. - // - Adds a character up to the boundary, then fails. - // - Adds a character not up to the boundary, then fails. - // Inspired by https://tomdebruijn.com/posts/rust-string-length-width-calculations/ - for g in UnicodeSegmentation::graphemes(content, true) { - let g_width = grapheme_width(g); - - if curr_width + g_width <= width { - curr_width += g_width; - last_added_str_len = g.len(); - text.push_str(g); - } else { - early_break = true; - break; + } else if let Some(nz_width) = NonZeroUsize::new(width) { + // What we are essentially doing is optimizing for the case that + // most, if not all of the string is ASCII. As such: + // - Step through each byte until (width - 1) is hit or we find a non-ascii + // byte. + // - If the byte is ascii, then add it. + // + // If we didn't get a complete truncated string, then continue on treating the rest as graphemes. + + let (mut text, res) = greedy_ascii_add(content, nz_width); + match res { + AsciiIterationResult::Complete => text, + AsciiIterationResult::Remaining(current_index) => { + let mut curr_width = text.len(); + let mut early_break = false; + + // This tracks the length of the last added string - note this does NOT match the grapheme *width*. + // Since the previous characters are always ASCII, this is always initialized as 1, unless the string + // is empty. + let mut last_added_str_len = if text.is_empty() { 0 } else { 1 }; + + // Cases to handle: + // - Completes adding the entire string. + // - Adds a character up to the boundary, then fails. + // - Adds a character not up to the boundary, then fails. + // Inspired by https://tomdebruijn.com/posts/rust-string-length-width-calculations/ + for g in UnicodeSegmentation::graphemes(&content[current_index..], true) { + let g_width = grapheme_width(g); + + if curr_width + g_width <= width { + curr_width += g_width; + last_added_str_len = g.len(); + text.push_str(g); + } else { + early_break = true; + break; + } } - } - if early_break { - if curr_width == width { - // Remove the last grapheme cluster added. - text.truncate(text.len() - last_added_str_len); + if early_break { + if curr_width == width { + // Remove the last grapheme cluster added. + text.truncate(text.len() - last_added_str_len); + } + text.push('…'); } - text.push('…'); + text } - text } } else { String::default() @@ -336,7 +394,7 @@ mod test { } #[test] - fn test_truncate_mixed() { + fn test_truncate_mixed_one() { let test = "Test (施氏食獅史) Test"; assert_eq!( @@ -357,6 +415,8 @@ mod test { "should truncate the t and replace the s with ellipsis" ); + assert_eq!(truncate_str(test, 20_usize), "Test (施氏食獅史) T…"); + assert_eq!(truncate_str(test, 19_usize), "Test (施氏食獅史) …"); assert_eq!(truncate_str(test, 18_usize), "Test (施氏食獅史)…"); assert_eq!(truncate_str(test, 17_usize), "Test (施氏食獅史…"); assert_eq!(truncate_str(test, 16_usize), "Test (施氏食獅…"); @@ -366,6 +426,34 @@ mod test { assert_eq!(truncate_str(test, 8_usize), "Test (…"); assert_eq!(truncate_str(test, 7_usize), "Test (…"); assert_eq!(truncate_str(test, 6_usize), "Test …"); + assert_eq!(truncate_str(test, 5_usize), "Test…"); + assert_eq!(truncate_str(test, 4_usize), "Tes…"); + } + + #[test] + fn test_truncate_mixed_two() { + let test = "Test (施氏abc食abc獅史) Test"; + + assert_eq!( + truncate_str(test, 30_usize), + test, + "should match base string as there is extra room" + ); + + assert_eq!( + truncate_str(test, 28_usize), + test, + "should match base string as there is just enough room" + ); + + assert_eq!(truncate_str(test, 26_usize), "Test (施氏abc食abc獅史) T…"); + assert_eq!(truncate_str(test, 21_usize), "Test (施氏abc食abc獅…"); + assert_eq!(truncate_str(test, 20_usize), "Test (施氏abc食abc…"); + assert_eq!(truncate_str(test, 16_usize), "Test (施氏abc食…"); + assert_eq!(truncate_str(test, 15_usize), "Test (施氏abc…"); + assert_eq!(truncate_str(test, 14_usize), "Test (施氏abc…"); + assert_eq!(truncate_str(test, 11_usize), "Test (施氏…"); + assert_eq!(truncate_str(test, 10_usize), "Test (施…"); } #[test] |