From 8a7f43b84dfd4b3e186804a29c66215b3cfeb8f7 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 4 Apr 2019 18:33:41 -0400 Subject: globset: use bstr This simplifies the various path-related functions and pushes more platform-dependent code down into bstr. This likely also makes things a bit more efficient on Windows, since we now only do a single UTF-8 check for each file path. --- globset/Cargo.toml | 4 +-- globset/src/glob.rs | 4 +-- globset/src/lib.rs | 53 ++++++++++++++-------------- globset/src/pathutil.rs | 93 ++++++++++++++++--------------------------------- 4 files changed, 60 insertions(+), 94 deletions(-) diff --git a/globset/Cargo.toml b/globset/Cargo.toml index 81cb2c31..371a3272 100644 --- a/globset/Cargo.toml +++ b/globset/Cargo.toml @@ -20,10 +20,10 @@ bench = false [dependencies] aho-corasick = "0.7.3" +bstr = { version = "0.1.2", default-features = false, features = ["std"] } fnv = "1.0.6" log = "0.4.5" -memchr = "2.1.0" -regex = "1.1.0" +regex = "1.1.5" [dev-dependencies] glob = "0.2.11" diff --git a/globset/src/glob.rs b/globset/src/glob.rs index eccfb2d3..5e635a20 100644 --- a/globset/src/glob.rs +++ b/globset/src/glob.rs @@ -120,7 +120,7 @@ impl GlobMatcher { /// Tests whether the given path matches this pattern or not. pub fn is_match_candidate(&self, path: &Candidate) -> bool { - self.re.is_match(&path.path) + self.re.is_match(path.path.as_bytes()) } } @@ -145,7 +145,7 @@ impl GlobStrategic { /// Tests whether the given path matches this pattern or not. fn is_match_candidate(&self, candidate: &Candidate) -> bool { - let byte_path = &*candidate.path; + let byte_path = candidate.path.as_bytes(); match self.strategy { MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path, diff --git a/globset/src/lib.rs b/globset/src/lib.rs index a558e15b..de5948da 100644 --- a/globset/src/lib.rs +++ b/globset/src/lib.rs @@ -104,27 +104,25 @@ or to enable case insensitive matching. 
#![deny(missing_docs)] extern crate aho_corasick; +extern crate bstr; extern crate fnv; #[macro_use] extern crate log; -extern crate memchr; extern crate regex; use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::error::Error as StdError; -use std::ffi::OsStr; use std::fmt; use std::hash; use std::path::Path; use std::str; use aho_corasick::AhoCorasick; +use bstr::{B, BStr, BString}; use regex::bytes::{Regex, RegexBuilder, RegexSet}; -use pathutil::{ - file_name, file_name_ext, normalize_path, os_str_bytes, path_bytes, -}; +use pathutil::{file_name, file_name_ext, normalize_path}; use glob::MatchStrategy; pub use glob::{Glob, GlobBuilder, GlobMatcher}; @@ -489,24 +487,25 @@ impl GlobSetBuilder { /// path against multiple globs or sets of globs. #[derive(Clone, Debug)] pub struct Candidate<'a> { - path: Cow<'a, [u8]>, - basename: Cow<'a, [u8]>, - ext: Cow<'a, [u8]>, + path: Cow<'a, BStr>, + basename: Cow<'a, BStr>, + ext: Cow<'a, BStr>, } impl<'a> Candidate<'a> { /// Create a new candidate for matching from the given path. 
pub fn new + ?Sized>(path: &'a P) -> Candidate<'a> { - let path = path.as_ref(); - let basename = file_name(path).unwrap_or(OsStr::new("")); + let path = normalize_path(BString::from_path_lossy(path.as_ref())); + let basename = file_name(&path).unwrap_or(Cow::Borrowed(B(""))); + let ext = file_name_ext(&basename).unwrap_or(Cow::Borrowed(B(""))); Candidate { - path: normalize_path(path_bytes(path)), - basename: os_str_bytes(basename), - ext: file_name_ext(basename).unwrap_or(Cow::Borrowed(b"")), + path: path, + basename: basename, + ext: ext, } } - fn path_prefix(&self, max: usize) -> &[u8] { + fn path_prefix(&self, max: usize) -> &BStr { if self.path.len() <= max { &*self.path } else { @@ -514,7 +513,7 @@ impl<'a> Candidate<'a> { } } - fn path_suffix(&self, max: usize) -> &[u8] { + fn path_suffix(&self, max: usize) -> &BStr { if self.path.len() <= max { &*self.path } else { @@ -575,12 +574,12 @@ impl LiteralStrategy { } fn is_match(&self, candidate: &Candidate) -> bool { - self.0.contains_key(&*candidate.path) + self.0.contains_key(candidate.path.as_bytes()) } #[inline(never)] fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { - if let Some(hits) = self.0.get(&*candidate.path) { + if let Some(hits) = self.0.get(candidate.path.as_bytes()) { matches.extend(hits); } } @@ -602,7 +601,7 @@ impl BasenameLiteralStrategy { if candidate.basename.is_empty() { return false; } - self.0.contains_key(&*candidate.basename) + self.0.contains_key(candidate.basename.as_bytes()) } #[inline(never)] @@ -610,7 +609,7 @@ impl BasenameLiteralStrategy { if candidate.basename.is_empty() { return; } - if let Some(hits) = self.0.get(&*candidate.basename) { + if let Some(hits) = self.0.get(candidate.basename.as_bytes()) { matches.extend(hits); } } @@ -632,7 +631,7 @@ impl ExtensionStrategy { if candidate.ext.is_empty() { return false; } - self.0.contains_key(&*candidate.ext) + self.0.contains_key(candidate.ext.as_bytes()) } #[inline(never)] @@ -640,7 +639,7 @@ impl 
ExtensionStrategy { if candidate.ext.is_empty() { return; } - if let Some(hits) = self.0.get(&*candidate.ext) { + if let Some(hits) = self.0.get(candidate.ext.as_bytes()) { matches.extend(hits); } } @@ -710,11 +709,11 @@ impl RequiredExtensionStrategy { if candidate.ext.is_empty() { return false; } - match self.0.get(&*candidate.ext) { + match self.0.get(candidate.ext.as_bytes()) { None => false, Some(regexes) => { for &(_, ref re) in regexes { - if re.is_match(&*candidate.path) { + if re.is_match(candidate.path.as_bytes()) { return true; } } @@ -728,9 +727,9 @@ impl RequiredExtensionStrategy { if candidate.ext.is_empty() { return; } - if let Some(regexes) = self.0.get(&*candidate.ext) { + if let Some(regexes) = self.0.get(candidate.ext.as_bytes()) { for &(global_index, ref re) in regexes { - if re.is_match(&*candidate.path) { + if re.is_match(candidate.path.as_bytes()) { matches.push(global_index); } } @@ -746,11 +745,11 @@ struct RegexSetStrategy { impl RegexSetStrategy { fn is_match(&self, candidate: &Candidate) -> bool { - self.matcher.is_match(&*candidate.path) + self.matcher.is_match(candidate.path.as_bytes()) } fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { - for i in self.matcher.matches(&*candidate.path) { + for i in self.matcher.matches(candidate.path.as_bytes()) { matches.push(self.map[i]); } } diff --git a/globset/src/pathutil.rs b/globset/src/pathutil.rs index 4b808e86..62a68322 100644 --- a/globset/src/pathutil.rs +++ b/globset/src/pathutil.rs @@ -1,41 +1,30 @@ use std::borrow::Cow; -use std::ffi::OsStr; -use std::path::Path; + +use bstr::BStr; /// The final component of the path, if it is a normal file. /// /// If the path terminates in ., .., or consists solely of a root of prefix, /// file_name will return None. 
-#[cfg(unix)] -pub fn file_name<'a, P: AsRef + ?Sized>( - path: &'a P, -) -> Option<&'a OsStr> { - use std::os::unix::ffi::OsStrExt; - use memchr::memrchr; - - let path = path.as_ref().as_os_str().as_bytes(); +pub fn file_name<'a>(path: &Cow<'a, BStr>) -> Option> { if path.is_empty() { return None; } else if path.len() == 1 && path[0] == b'.' { return None; - } else if path.last() == Some(&b'.') { + } else if path.last() == Some(b'.') { return None; - } else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] { + } else if path.len() >= 2 && &path[path.len() - 2..] == ".." { return None; } - let last_slash = memrchr(b'/', path).map(|i| i + 1).unwrap_or(0); - Some(OsStr::from_bytes(&path[last_slash..])) -} - -/// The final component of the path, if it is a normal file. -/// -/// If the path terminates in ., .., or consists solely of a root of prefix, -/// file_name will return None. -#[cfg(not(unix))] -pub fn file_name<'a, P: AsRef + ?Sized>( - path: &'a P, -) -> Option<&'a OsStr> { - path.as_ref().file_name() + let last_slash = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0); + Some(match *path { + Cow::Borrowed(path) => Cow::Borrowed(&path[last_slash..]), + Cow::Owned(ref path) => { + let mut path = path.clone(); + path.drain_bytes(..last_slash); + Cow::Owned(path) + } + }) } /// Return a file extension given a path's file name. @@ -54,59 +43,34 @@ pub fn file_name<'a, P: AsRef + ?Sized>( /// a pattern like `*.rs` is obviously trying to match files with a `rs` /// extension, but it also matches files like `.rs`, which doesn't have an /// extension according to std::path::Path::extension. 
-pub fn file_name_ext(name: &OsStr) -> Option> { +pub fn file_name_ext<'a>(name: &Cow<'a, BStr>) -> Option> { if name.is_empty() { return None; } - let name = os_str_bytes(name); let last_dot_at = { let result = name - .iter().enumerate().rev() - .find(|&(_, &b)| b == b'.') + .bytes().enumerate().rev() + .find(|&(_, b)| b == b'.') .map(|(i, _)| i); match result { None => return None, Some(i) => i, } }; - Some(match name { + Some(match *name { Cow::Borrowed(name) => Cow::Borrowed(&name[last_dot_at..]), - Cow::Owned(mut name) => { - name.drain(..last_dot_at); + Cow::Owned(ref name) => { + let mut name = name.clone(); + name.drain_bytes(..last_dot_at); Cow::Owned(name) } }) } -/// Return raw bytes of a path, transcoded to UTF-8 if necessary. -pub fn path_bytes(path: &Path) -> Cow<[u8]> { - os_str_bytes(path.as_os_str()) -} - -/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8. -#[cfg(unix)] -pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { - use std::os::unix::ffi::OsStrExt; - Cow::Borrowed(s.as_bytes()) -} - -/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8. -#[cfg(not(unix))] -pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { - // TODO(burntsushi): On Windows, OS strings are WTF-8, which is a superset - // of UTF-8, so even if we could get at the raw bytes, they wouldn't - // be useful. We *must* convert to UTF-8 before doing path matching. - // Unfortunate, but necessary. - match s.to_string_lossy() { - Cow::Owned(s) => Cow::Owned(s.into_bytes()), - Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), - } -} - /// Normalizes a path to use `/` as a separator everywhere, even on platforms /// that recognize other characters as separators. #[cfg(unix)] -pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> { +pub fn normalize_path(path: Cow) -> Cow { // UNIX only uses /, so we're good. 
path } @@ -114,7 +78,7 @@ pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> { /// Normalizes a path to use `/` as a separator everywhere, even on platforms /// that recognize other characters as separators. #[cfg(not(unix))] -pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> { +pub fn normalize_path(mut path: Cow) -> Cow { use std::path::is_separator; for i in 0..path.len() { @@ -129,7 +93,8 @@ pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> { #[cfg(test)] mod tests { use std::borrow::Cow; - use std::ffi::OsStr; + + use bstr::{B, BString}; use super::{file_name_ext, normalize_path}; @@ -137,8 +102,9 @@ mod tests { ($name:ident, $file_name:expr, $ext:expr) => { #[test] fn $name() { - let got = file_name_ext(OsStr::new($file_name)); - assert_eq!($ext.map(|s| Cow::Borrowed(s.as_bytes())), got); + let bs = BString::from($file_name); + let got = file_name_ext(&Cow::Owned(bs)); + assert_eq!($ext.map(|s| Cow::Borrowed(B(s))), got); } }; } @@ -153,7 +119,8 @@ mod tests { ($name:ident, $path:expr, $expected:expr) => { #[test] fn $name() { - let got = normalize_path(Cow::Owned($path.to_vec())); + let bs = BString::from_slice($path); + let got = normalize_path(Cow::Owned(bs)); assert_eq!($expected.to_vec(), got.into_owned()); } }; -- cgit v1.2.3