From 37a6c55d11b3ffe8f9b88ac2c772f82d6b8f0efa Mon Sep 17 00:00:00 2001
From: Dan Davison
Date: Thu, 11 Jun 2020 19:27:24 -0400
Subject: Bug fix: fix tokenization of non-ASCII text

---
 src/edits.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/edits.rs')

diff --git a/src/edits.rs b/src/edits.rs
index a7142d6c..3707d195 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -1,6 +1,7 @@
 use regex::Regex;
 
 use lazy_static::lazy_static;
+use unicode_segmentation::UnicodeSegmentation;
 use unicode_width::UnicodeWidthStr;
 
 use crate::align;
@@ -90,8 +91,8 @@ fn tokenize(line: &str) -> Vec<&str> {
             tokens.push("");
         }
         // Align separating text as multiple single-character tokens.
-        for i in offset..m.start() {
-            tokens.push(&line[i..i + 1]);
+        for t in line[offset..m.start()].graphemes(true) {
+            tokens.push(t);
         }
         tokens.push(&line[m.start()..m.end()]);
         offset = m.end();
@@ -100,8 +101,8 @@
     if offset == 0 {
         tokens.push("");
     }
-    for i in offset..line.len() {
-        tokens.push(&line[i..i + 1]);
+    for t in line[offset..line.len()].graphemes(true) {
+        tokens.push(t);
     }
 }
 tokens
--
cgit v1.2.3