diff options
author | Dan Davison <dandavison7@gmail.com> | 2020-06-11 19:27:24 -0400 |
---|---|---|
committer | Dan Davison <dandavison7@gmail.com> | 2020-06-11 19:27:24 -0400 |
commit | 37a6c55d11b3ffe8f9b88ac2c772f82d6b8f0efa (patch) | |
tree | 9d742c676d31b267f73904f9f27eaf9e277c3106 /src/edits.rs | |
parent | 12e251d2d0a3117929c5f097071f5361a2be4d1a (diff) |
Bug fix: fix tokenization of non-ASCII text
Diffstat (limited to 'src/edits.rs')
-rw-r--r-- | src/edits.rs | 9 |
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/src/edits.rs b/src/edits.rs
index a7142d6c..3707d195 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,7 @@
 use regex::Regex;
 use lazy_static::lazy_static;
+use unicode_segmentation::UnicodeSegmentation;
 use unicode_width::UnicodeWidthStr;

 use crate::align;
@@ -90,8 +91,8 @@ fn tokenize(line: &str) -> Vec<&str> {
                 tokens.push("");
             }
             // Align separating text as multiple single-character tokens.
-            for i in offset..m.start() {
-                tokens.push(&line[i..i + 1]);
+            for t in line[offset..m.start()].graphemes(true) {
+                tokens.push(t);
             }
             tokens.push(&line[m.start()..m.end()]);
             offset = m.end();
@@ -100,8 +101,8 @@ fn tokenize(line: &str) -> Vec<&str> {
     if offset == 0 {
         tokens.push("");
     }
-    for i in offset..line.len() {
-        tokens.push(&line[i..i + 1]);
+    for t in line[offset..line.len()].graphemes(true) {
+        tokens.push(t);
     }
     tokens