From 37a6c55d11b3ffe8f9b88ac2c772f82d6b8f0efa Mon Sep 17 00:00:00 2001
From: Dan Davison
Date: Thu, 11 Jun 2020 19:27:24 -0400
Subject: Bug fix: fix tokenization of non-ASCII text

---
 src/edits.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/edits.rs')

diff --git a/src/edits.rs b/src/edits.rs
index a7142d6c..3707d195 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,7 @@
 use regex::Regex;
 
 use lazy_static::lazy_static;
+use unicode_segmentation::UnicodeSegmentation;
 use unicode_width::UnicodeWidthStr;
 
 use crate::align;
@@ -90,8 +91,8 @@ fn tokenize(line: &str) -> Vec<&str> {
             tokens.push("");
         }
         // Align separating text as multiple single-character tokens.
-        for i in offset..m.start() {
-            tokens.push(&line[i..i + 1]);
+        for t in line[offset..m.start()].graphemes(true) {
+            tokens.push(t);
         }
         tokens.push(&line[m.start()..m.end()]);
         offset = m.end();
@@ -100,8 +101,8 @@
     if offset == 0 {
         tokens.push("");
     }
-    for i in offset..line.len() {
-        tokens.push(&line[i..i + 1]);
+    for t in line[offset..line.len()].graphemes(true) {
+        tokens.push(t);
     }
 }
 tokens
--
cgit v1.2.3