summary | refs | log | tree | commit | diff | stats
path: root/src/edits.rs
diff options
context:
space:
mode:
author Dan Davison <dandavison7@gmail.com> 2020-06-11 19:27:24 -0400
committer Dan Davison <dandavison7@gmail.com> 2020-06-11 19:27:24 -0400
commit 37a6c55d11b3ffe8f9b88ac2c772f82d6b8f0efa (patch)
tree 9d742c676d31b267f73904f9f27eaf9e277c3106 /src/edits.rs
parent 12e251d2d0a3117929c5f097071f5361a2be4d1a (diff)
Bug fix: fix tokenization of non-ASCII text
Diffstat (limited to 'src/edits.rs')
-rw-r--r-- src/edits.rs 9
1 file changed, 5 insertions, 4 deletions
diff --git a/src/edits.rs b/src/edits.rs
index a7142d6c..3707d195 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,7 @@
use regex::Regex;
use lazy_static::lazy_static;
+use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthStr;
use crate::align;
@@ -90,8 +91,8 @@ fn tokenize(line: &str) -> Vec<&str> {
tokens.push("");
}
// Align separating text as multiple single-character tokens.
- for i in offset..m.start() {
- tokens.push(&line[i..i + 1]);
+ for t in line[offset..m.start()].graphemes(true) {
+ tokens.push(t);
}
tokens.push(&line[m.start()..m.end()]);
offset = m.end();
@@ -100,8 +101,8 @@ fn tokenize(line: &str) -> Vec<&str> {
if offset == 0 {
tokens.push("");
}
- for i in offset..line.len() {
- tokens.push(&line[i..i + 1]);
+ for t in line[offset..line.len()].graphemes(true) {
+ tokens.push(t);
}
}
tokens