summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorClement Tsang <34804052+ClementTsang@users.noreply.github.com>2023-11-24 04:45:03 +0000
committerGitHub <noreply@github.com>2023-11-23 23:45:03 -0500
commita93521d2b10b0d3d05c103a7837a5a1a2a1a1b64 (patch)
treeb7b408ba5a3d4da2d124b39191d1dcb8ab42234d
parent3a50d7622e2e776c792cb77e3d67f464e6e49868 (diff)
refactor: add fast branch for ascii-only string truncate (#1330)
This is just a first attempt to speed up what looked like a hot spot in samply's profiling results. Benchmark code and results here: https://gist.github.com/ClementTsang/e242f12f7e1d1902ed414dcc18c3b321
-rw-r--r--.gitignore3
-rw-r--r--src/utils/gen_util.rs111
2 files changed, 78 insertions, 36 deletions
diff --git a/.gitignore b/.gitignore
index 1d84d108..2e446385 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,3 +42,6 @@ dhat/
# cargo vet
supply-chain/
+
+# samply profiling
+profile.json
diff --git a/src/utils/gen_util.rs b/src/utils/gen_util.rs
index cc72c21a..97c6dbc1 100644
--- a/src/utils/gen_util.rs
+++ b/src/utils/gen_util.rs
@@ -100,43 +100,59 @@ fn grapheme_width(g: &str) -> usize {
#[inline]
fn truncate_str<U: Into<usize>>(content: &str, width: U) -> String {
let width = width.into();
- let mut text = String::with_capacity(width);
if width > 0 {
- let mut curr_width = 0;
- let mut early_break = false;
-
- // This tracks the length of the last added string - note this does NOT match the grapheme *width*.
- let mut last_added_str_len = 0;
-
- // Cases to handle:
- // - Completes adding the entire string.
- // - Adds a character up to the boundary, then fails.
- // - Adds a character not up to the boundary, then fails.
- // Inspired by https://tomdebruijn.com/posts/rust-string-length-width-calculations/
- for g in UnicodeSegmentation::graphemes(content, true) {
- let g_width = grapheme_width(g);
-
- if curr_width + g_width <= width {
- curr_width += g_width;
- last_added_str_len = g.len();
- text.push_str(g);
+ if content.is_ascii() {
+ // If the entire string is ASCII, we can use a much simpler approach.
+
+ if content.len() <= width {
+ content.to_owned()
} else {
- early_break = true;
- break;
+ let mut text = String::with_capacity(width);
+ let (keep, _throw) = content.split_at(width - 1);
+ text.push_str(keep);
+ text.push('…');
+
+ text
+ }
+ } else {
+ let mut text = String::with_capacity(width);
+ let mut curr_width = 0;
+ let mut early_break = false;
+
+ // This tracks the length of the last added string - note this does NOT match the grapheme *width*.
+ let mut last_added_str_len = 0;
+
+ // Cases to handle:
+ // - Completes adding the entire string.
+ // - Adds a character up to the boundary, then fails.
+ // - Adds a character not up to the boundary, then fails.
+ // Inspired by https://tomdebruijn.com/posts/rust-string-length-width-calculations/
+ for g in UnicodeSegmentation::graphemes(content, true) {
+ let g_width = grapheme_width(g);
+
+ if curr_width + g_width <= width {
+ curr_width += g_width;
+ last_added_str_len = g.len();
+ text.push_str(g);
+ } else {
+ early_break = true;
+ break;
+ }
}
- }
- if early_break {
- if curr_width == width {
- // Remove the last grapheme cluster added.
- text.truncate(text.len() - last_added_str_len);
+ if early_break {
+ if curr_width == width {
+ // Remove the last grapheme cluster added.
+ text.truncate(text.len() - last_added_str_len);
+ }
+ text.push('…');
}
- text.push('…');
+ text
}
+ } else {
+ String::default()
}
-
- text
}
#[inline]
@@ -252,7 +268,7 @@ mod test {
assert_eq!(
truncate_str(cpu_header, 8_usize),
cpu_header,
- "should match base string as there is enough room"
+ "should match base string as there is extra room"
);
assert_eq!(
@@ -269,13 +285,36 @@ mod test {
}
#[test]
- fn truncate_cjk() {
+ fn test_truncate_ascii() {
+ let content = "0123456";
+
+ assert_eq!(
+ truncate_str(content, 8_usize),
+ content,
+ "should match base string as there is extra room"
+ );
+
+ assert_eq!(
+ truncate_str(content, 7_usize),
+ content,
+ "should match base string as there is enough room"
+ );
+
+ assert_eq!(truncate_str(content, 6_usize), "01234…");
+ assert_eq!(truncate_str(content, 5_usize), "0123…");
+ assert_eq!(truncate_str(content, 4_usize), "012…");
+ assert_eq!(truncate_str(content, 1_usize), "…");
+ assert_eq!(truncate_str(content, 0_usize), "");
+ }
+
+ #[test]
+ fn test_truncate_cjk() {
let cjk = "施氏食獅史";
assert_eq!(
truncate_str(cjk, 11_usize),
cjk,
- "should match base string as there is enough room"
+ "should match base string as there is extra room"
);
assert_eq!(
@@ -292,13 +331,13 @@ mod test {
}
#[test]
- fn truncate_mixed() {
+ fn test_truncate_mixed() {
let test = "Test (施氏食獅史) Test";
assert_eq!(
truncate_str(test, 30_usize),
test,
- "should match base string as there is enough room"
+ "should match base string as there is extra room"
);
assert_eq!(
@@ -325,7 +364,7 @@ mod test {
}
#[test]
- fn truncate_flags() {
+ fn test_truncate_flags() {
let flag = "🇨🇦";
assert_eq!(truncate_str(flag, 3_usize), flag);
assert_eq!(truncate_str(flag, 2_usize), flag);
@@ -368,7 +407,7 @@ mod test {
/// This might not be the best way to handle it, but this at least tests that it doesn't crash...
#[test]
- fn truncate_hindi() {
+ fn test_truncate_hindi() {
// cSpell:disable
let test = "हिन्दी";
assert_eq!(truncate_str(test, 10_usize), test);