diff options
author | Dan Davison <dandavison7@gmail.com> | 2019-08-04 11:13:11 -0700 |
---|---|---|
committer | Dan Davison <dandavison7@gmail.com> | 2019-08-06 23:11:00 -0700 |
commit | 8db9a7a786525e560985786448783d234ba9c2cd (patch) | |
tree | fde216ef7daac475c03629949b3e4ebc656ac5a7 /src/edits.rs | |
parent | ae69199ba3240bc01e464ef16dfa884b7463412c (diff) |
Change tokenization algorithm
Diffstat (limited to 'src/edits.rs')
-rw-r--r-- | src/edits.rs | 67 |
1 file changed, 56 insertions(+), 11 deletions(-)
diff --git a/src/edits.rs b/src/edits.rs index acba837c..230f1e22 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -81,11 +81,19 @@ where } fn tokenize(line: &str) -> Vec<(usize, &str)> { - let regex = Regex::new("[^ ]*( +|$)").unwrap(); - regex - .find_iter(line) - .map(|m| (m.start(), m.as_str())) - .collect() + let separators = Regex::new(r"[ ,;.:()\[\]<>]+").unwrap(); + let mut tokens = Vec::new(); + let mut offset = 0; + for m in separators.find_iter(line) { + let (start, end) = (m.start(), m.end()); + tokens.push((offset, &line[offset..start])); + tokens.push((start, m.as_str())); + offset = end; + } + if offset < line.len() { + tokens.push((offset, &line[offset..line.len()])); + } + tokens } pub fn coalesce_minus_edits<'a, EditOperation>( @@ -183,14 +191,26 @@ mod tests { #[test] fn test_tokenize_1() { - assert_eq!(tokenize("aaa bbb"), vec![(0, "aaa "), (4, "bbb")]) + assert_eq!( + tokenize("aaa bbb"), + substring_indices(vec!["aaa", " ", "bbb"]) + ) } #[test] fn test_tokenize_2() { assert_eq!( tokenize("fn coalesce_edits<'a, EditOperation>("), - substring_indices(vec!["fn ", "coalesce_edits<'a, ", "EditOperation>("]) + substring_indices(vec![ + "fn", + " ", + "coalesce_edits", + "<", + "'a", + ", ", + "EditOperation", + ">(" + ]) ); } @@ -199,10 +219,35 @@ mod tests { assert_eq!( tokenize("fn coalesce_edits<'a, 'b, EditOperation>("), substring_indices(vec![ - "fn ", - "coalesce_edits<'a, ", - "'b, ", - "EditOperation>(" + "fn", + " ", + "coalesce_edits", + "<", + "'a", + ", ", + "'b", + ", ", + "EditOperation", + ">(" + ]) + ); + } + + #[test] + fn test_tokenize_4() { + assert_eq!( + tokenize("annotated_plus_lines.push(vec![(non_insertion, plus_line)]);"), + substring_indices(vec![ + "annotated_plus_lines", + ".", + "push", + "(", + "vec!", + "[(", + "non_insertion", + ", ", + "plus_line", + ")]);" ]) ); } |