globset: use bstr

This simplifies the various path-related functions and pushes more platform-dependent code down into bstr. This likely also makes things a bit more efficient on Windows, since we now only do a single UTF-8 check for each file path.
author: Andrew Gallant <jamslam@gmail.com> 2019-04-04 18:33:41 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2019-04-05 23:24:08 -0400
commit: 8a7f43b84dfd4b3e186804a29c66215b3cfeb8f7 (patch)
tree: f708126be8e455e99e909a38ddd35c1783210c8f
parent: d968a27ed5298d99e46ff65b68a7f6c2c641105f (diff)
4 files changed, 60 insertions, 94 deletions
diff --git a/globset/Cargo.toml b/globset/Cargo.toml
index 81cb2c31..371a3272 100644
--- a/globset/Cargo.toml
+++ b/globset/Cargo.toml
@@ -20,10 +20,10 @@ bench = false
 
 [dependencies]
 aho-corasick = "0.7.3"
+bstr = { version = "0.1.2", default-features = false, features = ["std"] }
 fnv = "1.0.6"
 log = "0.4.5"
-memchr = "2.1.0"
-regex = "1.1.0"
+regex = "1.1.5"
 
 [dev-dependencies]
 glob = "0.2.11"
diff --git a/globset/src/glob.rs b/globset/src/glob.rs
index eccfb2d3..5e635a20 100644
--- a/globset/src/glob.rs
+++ b/globset/src/glob.rs
@@ -120,7 +120,7 @@ impl GlobMatcher {
 
     /// Tests whether the given path matches this pattern or not.
     pub fn is_match_candidate(&self, path: &Candidate) -> bool {
-        self.re.is_match(&path.path)
+        self.re.is_match(path.path.as_bytes())
     }
 }
 
@@ -145,7 +145,7 @@ impl GlobStrategic {
 
     /// Tests whether the given path matches this pattern or not.
     fn is_match_candidate(&self, candidate: &Candidate) -> bool {
-        let byte_path = &*candidate.path;
+        let byte_path = candidate.path.as_bytes();
 
         match self.strategy {
             MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path,
diff --git a/globset/src/lib.rs b/globset/src/lib.rs
index a558e15b..de5948da 100644
--- a/globset/src/lib.rs
+++ b/globset/src/lib.rs
@@ -104,27 +104,25 @@ or to enable case insensitive matching.
 #![deny(missing_docs)]
 
 extern crate aho_corasick;
+extern crate bstr;
 extern crate fnv;
 #[macro_use]
 extern crate log;
-extern crate memchr;
 extern crate regex;
 
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::error::Error as StdError;
-use std::ffi::OsStr;
 use std::fmt;
 use std::hash;
 use std::path::Path;
 use std::str;
 
 use aho_corasick::AhoCorasick;
+use bstr::{B, BStr, BString};
 use regex::bytes::{Regex, RegexBuilder, RegexSet};
 
-use pathutil::{
-    file_name, file_name_ext, normalize_path, os_str_bytes, path_bytes,
-};
+use pathutil::{file_name, file_name_ext, normalize_path};
 use glob::MatchStrategy;
 pub use glob::{Glob, GlobBuilder, GlobMatcher};
 
@@ -489,24 +487,25 @@ impl GlobSetBuilder {
 /// path against multiple globs or sets of globs.
 #[derive(Clone, Debug)]
 pub struct Candidate<'a> {
-    path: Cow<'a, [u8]>,
-    basename: Cow<'a, [u8]>,
-    ext: Cow<'a, [u8]>,
+    path: Cow<'a, BStr>,
+    basename: Cow<'a, BStr>,
+    ext: Cow<'a, BStr>,
 }
 
 impl<'a> Candidate<'a> {
     /// Create a new candidate for matching from the given path.
     pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> {
-        let path = path.as_ref();
-        let basename = file_name(path).unwrap_or(OsStr::new(""));
+        let path = normalize_path(BString::from_path_lossy(path.as_ref()));
+        let basename = file_name(&path).unwrap_or(Cow::Borrowed(B("")));
+        let ext = file_name_ext(&basename).unwrap_or(Cow::Borrowed(B("")));
         Candidate {
-            path: normalize_path(path_bytes(path)),
-            basename: os_str_bytes(basename),
-            ext: file_name_ext(basename).unwrap_or(Cow::Borrowed(b"")),
+            path: path,
+            basename: basename,
+            ext: ext,
         }
     }
 
-    fn path_prefix(&self, max: usize) -> &[u8] {
+    fn path_prefix(&self, max: usize) -> &BStr {
         if self.path.len() <= max {
             &*self.path
         } else {
@@ -514,7 +513,7 @@ impl<'a> Candidate<'a> {
         }
     }
 
-    fn path_suffix(&self, max: usize) -> &[u8] {
+    fn path_suffix(&self, max: usize) -> &BStr {
         if self.path.len() <= max {
             &*self.path
         } else {
@@ -575,12 +574,12 @@ impl LiteralStrategy {
     }
 
     fn is_match(&self, candidate: &Candidate) -> bool {
-        self.0.contains_key(&*candidate.path)
+        self.0.contains_key(candidate.path.as_bytes())
     }
 
     #[inline(never)]
     fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
-        if let Some(hits) = self.0.get(&*candidate.path) {
+        if let Some(hits) = self.0.get(candidate.path.as_bytes()) {
             matches.extend(hits);
         }
     }
@@ -602,7 +601,7 @@ impl BasenameLiteralStrategy {
         if candidate.basename.is_empty() {
             return false;
         }
-        self.0.contains_key(&*candidate.basename)
+        self.0.contains_key(candidate.basename.as_bytes())
     }
 
     #[inline(never)]
@@ -610,7 +609,7 @@ impl BasenameLiteralStrategy {
         if candidate.basename.is_empty() {
             return;
         }
-        if let Some(hits) = self.0.get(&*candidate.basename) {
+        if let Some(hits) = self.0.get(candidate.basename.as_bytes()) {
             matches.extend(hits);
         }
     }
@@ -632,7 +631,7 @@ impl ExtensionStrategy {
         if candidate.ext.is_empty() {
             return false;
         }
-        self.0.contains_key(&*candidate.ext)
+        self.0.contains_key(candidate.ext.as_bytes())
     }
 
     #[inline(never)]
@@ -640,7 +639,7 @@ impl ExtensionStrategy {
         if candidate.ext.is_empty() {
             return;
         }
-        if let Some(hits) = self.0.get(&*candidate.ext) {
+        if let Some(hits) = self.0.get(candidate.ext.as_bytes()) {
             matches.extend(hits);
         }
     }
@@ -710,11 +709,11 @@ impl RequiredExtensionStrategy {
         if candidate.ext.is_empty() {
             return false;
         }
-        match self.0.get(&*candidate.ext) {
+        match self.0.get(candidate.ext.as_bytes()) {
             None => false,
             Some(regexes) => {
                 for &(_, ref re) in regexes {
-                    if re.is_match(&*candidate.path) {
+                    if re.is_match(candidate.path.as_bytes()) {
                         return true;
                     }
                 }
@@ -728,9 +727,9 @@ impl RequiredExtensionStrategy {
         if candidate.ext.is_empty() {
             return;
         }
-        if let Some(regexes) = self.0.get(&*candidate.ext) {
+        if let Some(regexes) = self.0.get(candidate.ext.as_bytes()) {
             for &(global_index, ref re) in regexes {
-                if re.is_match(&*candidate.path) {
+                if re.is_match(candidate.path.as_bytes()) {
                     matches.push(global_index);
                 }
             }
@@ -746,11 +745,11 @@ struct RegexSetStrategy {
 
 impl RegexSetStrategy {
     fn is_match(&self, candidate: &Candidate) -> bool {
-        self.matcher.is_match(&*candidate.path)
+        self.matcher.is_match(candidate.path.as_bytes())
     }
 
     fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
-        for i in self.matcher.matches(&*candidate.path) {
+        for i in self.matcher.matches(candidate.path.as_bytes()) {
             matches.push(self.map[i]);
         }
     }
diff --git a/globset/src/pathutil.rs b/globset/src/pathutil.rs
index 4b808e86..62a68322 100644
--- a/globset/src/pathutil.rs
+++ b/globset/src/pathutil.rs
@@ -1,41 +1,30 @@
 use std::borrow::Cow;
-use std::ffi::OsStr;
-use std::path::Path;
+
+use bstr::BStr;
 
 /// The final component of the path, if it is a normal file.
 ///
 /// If the path terminates in ., .., or consists solely of a root of prefix,
 /// file_name will return None.
-#[cfg(unix)]
-pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
-    path: &'a P,
-) -> Option<&'a OsStr> {
-    use std::os::unix::ffi::OsStrExt;
-    use memchr::memrchr;
-
-    let path = path.as_ref().as_os_str().as_bytes();
+pub fn file_name<'a>(path: &Cow<'a, BStr>) -> Option<Cow<'a, BStr>> {
     if path.is_empty() {
         return None;
     } else if path.len() == 1 && path[0] == b'.' {
         return None;
-    } else if path.last() == Some(&b'.') {
+    } else if path.last() == Some(b'.') {
         return None;
-    } else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] {
+    } else if path.len() >= 2 && &path[path.len() - 2..] == ".." {
         return None;
     }
-    let last_slash = memrchr(b'/', path).map(|i| i + 1).unwrap_or(0);
-    Some(OsStr::from_bytes(&path[last_slash..]))
-}
-
-/// The final component of the path, if it is a normal file.
-///
-/// If the path terminates in ., .., or consists solely of a root of prefix,
-/// file_name will return None.
-#[cfg(not(unix))]
-pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
-    path: &'a P,
-) -> Option<&'a OsStr> {
-    path.as_ref().file_name()
+    let last_slash = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0);
+    Some(match *path {
+        Cow::Borrowed(path) => Cow::Borrowed(&path[last_slash..]),
+        Cow::Owned(ref path) => {
+            let mut path = path.clone();
+            path.drain_bytes(..last_slash);
+            Cow::Owned(path)
+        }
+    })
 }
 
 /// Return a file extension given a path's file name.
@@ -54,59 +43,34 @@ pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
 /// a pattern like `*.rs` is obviously trying to match files with a `rs`
 /// extension, but it also matches files like `.rs`, which doesn't have an
 /// extension according to std::path::Path::extension.
-pub fn file_name_ext(name: &OsStr) -> Option<Cow<[u8]>> {
+pub fn file_name_ext<'a>(name: &Cow<'a, BStr>) -> Option<Cow<'a, BStr>> {
     if name.is_empty() {
         return None;
     }
-    let name = os_str_bytes(name);
     let last_dot_at = {
         let result = name
-            .iter().enumerate().rev()
-            .find(|&(_, &b)| b == b'.')
+            .bytes().enumerate().rev()
+            .find(|&(_, b)| b == b'.')
             .map(|(i, _)| i);
         match result {
             None => return None,
             Some(i) => i,
         }
     };
-    Some(match name {
+    Some(match *name {
         Cow::Borrowed(name) => Cow::Borrowed(&name[last_dot_at..]),
-        Cow::Owned(mut name) => {
-            name.drain(..last_dot_at);
+        Cow::Owned(ref name) => {
+            let mut name = name.clone();
+            name.drain_bytes(..last_dot_at);
             Cow::Owned(name)
         }
     })
 }
 
-/// Return raw bytes of a path, transcoded to UTF-8 if necessary.
-pub fn path_bytes(path: &Path) -> Cow<[u8]> {
-    os_str_bytes(path.as_os_str())
-}
-
-/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
-#[cfg(unix)]
-pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
-    use std::os::unix::ffi::OsStrExt;
-    Cow::Borrowed(s.as_bytes())
-}
-
-/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
-#[cfg(not(unix))]
-pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
-    // TODO(burntsushi): On Windows, OS strings are WTF-8, which is a superset
-    // of UTF-8, so even if we could get at the raw bytes, they wouldn't
-    // be useful. We *must* convert to UTF-8 before doing path matching.
-    // Unfortunate, but necessary.
-    match s.to_string_lossy() {
-        Cow::Owned(s) => Cow::Owned(s.into_bytes()),
-        Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
-    }
-}
-
 /// Normalizes a path to use `/` as a separator everywhere, even on platforms
 /// that recognize other characters as separators.
 #[cfg(unix)]
-pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> {
+pub fn normalize_path(path: Cow<BStr>) -> Cow<BStr> {
     // UNIX only uses /, so we're good.
     path
 }
@@ -114,7 +78,7 @@ pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> {
 /// Normalizes a path to use `/` as a separator everywhere, even on platforms
 /// that recognize other characters as separators.
 #[cfg(not(unix))]
-pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> {
+pub fn normalize_path(mut path: Cow<BStr>) -> Cow<BStr> {
     use std::path::is_separator;
 
     for i in 0..path.len() {
@@ -129,7 +93,8 @@ pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> {
 #[cfg(test)]
 mod tests {
     use std::borrow::Cow;
-    use std::ffi::OsStr;
+
+    use bstr::{B, BString};
 
     use super::{file_name_ext, normalize_path};
 
@@ -137,8 +102,9 @@ mod tests {
         ($name:ident, $file_name:expr, $ext:expr) => {
             #[test]
             fn $name() {
-                let got = file_name_ext(OsStr::new($file_name));
-                assert_eq!($ext.map(|s| Cow::Borrowed(s.as_bytes())), got);
+                let bs = BString::from($file_name);
+                let got = file_name_ext(&Cow::Owned(bs));
+                assert_eq!($ext.map(|s| Cow::Borrowed(B(s))), got);
             }
         };
     }
@@ -153,7 +119,8 @@ mod tests {
         ($name:ident, $path:expr, $expected:expr) => {
             #[test]
             fn $name() {
-                let got = normalize_path(Cow::Owned($path.to_vec()));
+                let bs = BString::from_slice($path);
+                let got = normalize_path(Cow::Owned(bs));
                 assert_eq!($expected.to_vec(), got.into_owned());
             }
         };
author	Andrew Gallant <jamslam@gmail.com>	2019-04-04 18:33:41 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2019-04-05 23:24:08 -0400
commit	8a7f43b84dfd4b3e186804a29c66215b3cfeb8f7 (patch)
tree	f708126be8e455e99e909a38ddd35c1783210c8f
parent	d968a27ed5298d99e46ff65b68a7f6c2c641105f (diff)