From a93521d2b10b0d3d05c103a7837a5a1a2a1a1b64 Mon Sep 17 00:00:00 2001
From: Clement Tsang <34804052+ClementTsang@users.noreply.github.com>
Date: Fri, 24 Nov 2023 04:45:03 +0000
Subject: refactor: add fast branch for ascii-only string truncate (#1330)

This is just a first attempt to speed up what looked like a hot spot in samply's profiling results.

Benchmark code and results here: https://gist.github.com/ClementTsang/e242f12f7e1d1902ed414dcc18c3b321
---
 .gitignore            |   3 ++
 src/utils/gen_util.rs | 111 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 78 insertions(+), 36 deletions(-)
diff --git a/.gitignore b/.gitignore
index 1d84d108..2e446385 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,3 +42,6 @@ dhat/
 
 # cargo vet
 supply-chain/
+
+# samply profiling
+profile.json
diff --git a/src/utils/gen_util.rs b/src/utils/gen_util.rs
index cc72c21a..97c6dbc1 100644
--- a/src/utils/gen_util.rs
+++ b/src/utils/gen_util.rs
@@ -100,43 +100,59 @@ fn grapheme_width(g: &str) -> usize {
 #[inline]
 fn truncate_str<U: Into<usize>>(content: &str, width: U) -> String {
     let width = width.into();
-    let mut text = String::with_capacity(width);
 
     if width > 0 {
-        let mut curr_width = 0;
-        let mut early_break = false;
-
-        // This tracks the length of the last added string - note this does NOT match the grapheme *width*.
-        let mut last_added_str_len = 0;
-
-        // Cases to handle:
-        // - Completes adding the entire string.
-        // - Adds a character up to the boundary, then fails.
-        // - Adds a character not up to the boundary, then fails.
-        // Inspired by https://tomdebruijn.com/posts/rust-string-length-width-calculations/
-        for g in UnicodeSegmentation::graphemes(content, true) {
-            let g_width = grapheme_width(g);
-
-            if curr_width + g_width <= width {
-                curr_width += g_width;
-                last_added_str_len = g.len();
-                text.push_str(g);
+        if content.is_ascii() {
+            // If the entire string is ASCII, we can use a much simpler approach.
+
+            if content.len() <= width {
+                content.to_owned()
             } else {
-                early_break = true;
-                break;
+                let mut text = String::with_capacity(width);
+                let (keep, _throw) = content.split_at(width - 1);
+                text.push_str(keep);
+                text.push('…');
+
+                text
+            }
+        } else {
+            let mut text = String::with_capacity(width);
+            let mut curr_width = 0;
+            let mut early_break = false;
+
+            // This tracks the length of the last added string - note this does NOT match the grapheme *width*.
+            let mut last_added_str_len = 0;
+
+            // Cases to handle:
+            // - Completes adding the entire string.
+            // - Adds a character up to the boundary, then fails.
+            // - Adds a character not up to the boundary, then fails.
+            // Inspired by https://tomdebruijn.com/posts/rust-string-length-width-calculations/
+            for g in UnicodeSegmentation::graphemes(content, true) {
+                let g_width = grapheme_width(g);
+
+                if curr_width + g_width <= width {
+                    curr_width += g_width;
+                    last_added_str_len = g.len();
+                    text.push_str(g);
+                } else {
+                    early_break = true;
+                    break;
+                }
             }
-        }
 
-        if early_break {
-            if curr_width == width {
-                // Remove the last grapheme cluster added.
-                text.truncate(text.len() - last_added_str_len);
+            if early_break {
+                if curr_width == width {
+                    // Remove the last grapheme cluster added.
+                    text.truncate(text.len() - last_added_str_len);
+                }
+                text.push('…');
             }
-            text.push('…');
+            text
         }
+    } else {
+        String::default()
     }
-
-    text
 }
 
 #[inline]
@@ -252,7 +268,7 @@ mod test {
         assert_eq!(
             truncate_str(cpu_header, 8_usize),
             cpu_header,
-            "should match base string as there is enough room"
+            "should match base string as there is extra room"
         );
 
         assert_eq!(
@@ -269,13 +285,36 @@ mod test {
     }
 
     #[test]
-    fn truncate_cjk() {
+    fn test_truncate_ascii() {
+        let content = "0123456";
+
+        assert_eq!(
+            truncate_str(content, 8_usize),
+            content,
+            "should match base string as there is extra room"
+        );
+
+        assert_eq!(
+            truncate_str(content, 7_usize),
+            content,
+            "should match base string as there is enough room"
+        );
+
+        assert_eq!(truncate_str(content, 6_usize), "01234…");
+        assert_eq!(truncate_str(content, 5_usize), "0123…");
+        assert_eq!(truncate_str(content, 4_usize), "012…");
+        assert_eq!(truncate_str(content, 1_usize), "…");
+        assert_eq!(truncate_str(content, 0_usize), "");
+    }
+
+    #[test]
+    fn test_truncate_cjk() {
         let cjk = "施氏食獅史";
 
         assert_eq!(
             truncate_str(cjk, 11_usize),
             cjk,
-            "should match base string as there is enough room"
+            "should match base string as there is extra room"
         );
 
         assert_eq!(
@@ -292,13 +331,13 @@ mod test {
     }
 
     #[test]
-    fn truncate_mixed() {
+    fn test_truncate_mixed() {
         let test = "Test (施氏食獅史) Test";
 
         assert_eq!(
             truncate_str(test, 30_usize),
             test,
-            "should match base string as there is enough room"
+            "should match base string as there is extra room"
         );
 
         assert_eq!(
@@ -325,7 +364,7 @@ mod test {
     }
 
     #[test]
-    fn truncate_flags() {
+    fn test_truncate_flags() {
         let flag = "🇨🇦";
         assert_eq!(truncate_str(flag, 3_usize), flag);
         assert_eq!(truncate_str(flag, 2_usize), flag);
@@ -368,7 +407,7 @@ mod test {
 
     /// This might not be the best way to handle it, but this at least tests that it doesn't crash...
     #[test]
-    fn truncate_hindi() {
+    fn test_truncate_hindi() {
         // cSpell:disable
         let test = "हिन्दी";
         assert_eq!(truncate_str(test, 10_usize), test);
-- 
cgit v1.2.3