summary refs log tree commit diff stats
path: root/src/edits.rs
diff options
context:
space:
mode:
author    Dan Davison <dandavison7@gmail.com>  2019-08-04 11:13:11 -0700
committer Dan Davison <dandavison7@gmail.com>  2019-08-06 23:11:00 -0700
commit    8db9a7a786525e560985786448783d234ba9c2cd (patch)
tree      fde216ef7daac475c03629949b3e4ebc656ac5a7  /src/edits.rs
parent    ae69199ba3240bc01e464ef16dfa884b7463412c (diff)
Change tokenization algorithm
Diffstat (limited to 'src/edits.rs')
-rw-r--r--  src/edits.rs  67
1 file changed, 56 insertions(+), 11 deletions(-)
diff --git a/src/edits.rs b/src/edits.rs
index acba837c..230f1e22 100644
--- a/src/edits.rs
+++ b/src/edits.rs
@@ -81,11 +81,19 @@ where
}
fn tokenize(line: &str) -> Vec<(usize, &str)> {
- let regex = Regex::new("[^ ]*( +|$)").unwrap();
- regex
- .find_iter(line)
- .map(|m| (m.start(), m.as_str()))
- .collect()
+ let separators = Regex::new(r"[ ,;.:()\[\]<>]+").unwrap();
+ let mut tokens = Vec::new();
+ let mut offset = 0;
+ for m in separators.find_iter(line) {
+ let (start, end) = (m.start(), m.end());
+ tokens.push((offset, &line[offset..start]));
+ tokens.push((start, m.as_str()));
+ offset = end;
+ }
+ if offset < line.len() {
+ tokens.push((offset, &line[offset..line.len()]));
+ }
+ tokens
}
pub fn coalesce_minus_edits<'a, EditOperation>(
@@ -183,14 +191,26 @@ mod tests {
#[test]
fn test_tokenize_1() {
- assert_eq!(tokenize("aaa bbb"), vec![(0, "aaa "), (4, "bbb")])
+ assert_eq!(
+ tokenize("aaa bbb"),
+ substring_indices(vec!["aaa", " ", "bbb"])
+ )
}
#[test]
fn test_tokenize_2() {
assert_eq!(
tokenize("fn coalesce_edits<'a, EditOperation>("),
- substring_indices(vec!["fn ", "coalesce_edits<'a, ", "EditOperation>("])
+ substring_indices(vec![
+ "fn",
+ " ",
+ "coalesce_edits",
+ "<",
+ "'a",
+ ", ",
+ "EditOperation",
+ ">("
+ ])
);
}
@@ -199,10 +219,35 @@ mod tests {
assert_eq!(
tokenize("fn coalesce_edits<'a, 'b, EditOperation>("),
substring_indices(vec![
- "fn ",
- "coalesce_edits<'a, ",
- "'b, ",
- "EditOperation>("
+ "fn",
+ " ",
+ "coalesce_edits",
+ "<",
+ "'a",
+ ", ",
+ "'b",
+ ", ",
+ "EditOperation",
+ ">("
+ ])
+ );
+ }
+
+ #[test]
+ fn test_tokenize_4() {
+ assert_eq!(
+ tokenize("annotated_plus_lines.push(vec![(non_insertion, plus_line)]);"),
+ substring_indices(vec![
+ "annotated_plus_lines",
+ ".",
+ "push",
+ "(",
+ "vec!",
+ "[(",
+ "non_insertion",
+ ", ",
+ "plus_line",
+ ")]);"
])
);
}