summaryrefslogtreecommitdiffstats
path: root/grep-cli/src/pattern.rs
diff options
context:
space:
mode:
Diffstat (limited to 'grep-cli/src/pattern.rs')
-rw-r--r--grep-cli/src/pattern.rs205
1 files changed, 205 insertions, 0 deletions
diff --git a/grep-cli/src/pattern.rs b/grep-cli/src/pattern.rs
new file mode 100644
index 00000000..ed1d95a5
--- /dev/null
+++ b/grep-cli/src/pattern.rs
@@ -0,0 +1,205 @@
+use std::error;
+use std::ffi::OsStr;
+use std::fmt;
+use std::fs::File;
+use std::io::{self, BufRead};
+use std::path::Path;
+use std::str;
+
+use escape::{escape, escape_os};
+
/// The error reported when a pattern cannot be interpreted as valid UTF-8.
///
/// This exists so that patterns supplied by end users which are not valid
/// UTF-8 fail with a targeted, actionable message instead of a generic one.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InvalidPatternError {
    /// The text of the offending pattern as it is shown in the error message.
    original: String,
    /// The byte offset up to which the pattern was verified as valid UTF-8.
    valid_up_to: usize,
}

impl InvalidPatternError {
    /// Returns the byte index in the original pattern up to which valid
    /// UTF-8 was verified.
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }
}

impl error::Error for InvalidPatternError {
    fn description(&self) -> &str {
        "invalid pattern"
    }
}

impl fmt::Display for InvalidPatternError {
    /// Writes the byte offset of the invalid UTF-8, a hint about hex escape
    /// sequences, and the stored rendering of the pattern.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let offset = self.valid_up_to;
        let shown = &self.original;
        write!(
            f,
            concat!(
                "found invalid UTF-8 in pattern at byte offset {} ",
                "(use hex escape sequences to match arbitrary bytes ",
                "in a pattern, e.g., \\xFF): '{}'",
            ),
            offset,
            shown,
        )
    }
}

impl From<InvalidPatternError> for io::Error {
    /// Wraps the pattern error in an I/O error of kind `Other`.
    fn from(paterr: InvalidPatternError) -> io::Error {
        let kind = io::ErrorKind::Other;
        io::Error::new(kind, paterr)
    }
}
+
+/// Convert an OS string into a regular expression pattern.
+///
+/// This conversion fails if the given pattern is not valid UTF-8, in which
+/// case, a targeted error with more information about where the invalid UTF-8
+/// occurs is given. The error also suggests the use of hex escape sequences,
+/// which are supported by many regex engines.
+pub fn pattern_from_os(pattern: &OsStr) -> Result<&str, InvalidPatternError> {
+ pattern.to_str().ok_or_else(|| {
+ let valid_up_to = pattern
+ .to_string_lossy()
+ .find('\u{FFFD}')
+ .expect("a Unicode replacement codepoint for invalid UTF-8");
+ InvalidPatternError {
+ original: escape_os(pattern),
+ valid_up_to: valid_up_to,
+ }
+ })
+}
+
+/// Convert arbitrary bytes into a regular expression pattern.
+///
+/// This conversion fails if the given pattern is not valid UTF-8, in which
+/// case, a targeted error with more information about where the invalid UTF-8
+/// occurs is given. The error also suggests the use of hex escape sequences,
+/// which are supported by many regex engines.
+pub fn pattern_from_bytes(
+ pattern: &[u8],
+) -> Result<&str, InvalidPatternError> {
+ str::from_utf8(pattern).map_err(|err| {
+ InvalidPatternError {
+ original: escape(pattern),
+ valid_up_to: err.valid_up_to(),
+ }
+ })
+}
+
+/// Read patterns from a file path, one per line.
+///
+/// If there was a problem reading or if any of the patterns contain invalid
+/// UTF-8, then an error is returned. If there was a problem with a specific
+/// pattern, then the error message will include the line number and the file
+/// path.
+pub fn patterns_from_path<P: AsRef<Path>>(path: P) -> io::Result<Vec<String>> {
+ let path = path.as_ref();
+ let file = File::open(path).map_err(|err| {
+ io::Error::new(
+ io::ErrorKind::Other,
+ format!("{}: {}", path.display(), err),
+ )
+ })?;
+ patterns_from_reader(file).map_err(|err| {
+ io::Error::new(
+ io::ErrorKind::Other,
+ format!("{}:{}", path.display(), err),
+ )
+ })
+}
+
+/// Read patterns from stdin, one per line.
+///
+/// If there was a problem reading or if any of the patterns contain invalid
+/// UTF-8, then an error is returned. If there was a problem with a specific
+/// pattern, then the error message will include the line number and the fact
+/// that it came from stdin.
+pub fn patterns_from_stdin() -> io::Result<Vec<String>> {
+ let stdin = io::stdin();
+ let locked = stdin.lock();
+ patterns_from_reader(locked).map_err(|err| {
+ io::Error::new(
+ io::ErrorKind::Other,
+ format!("<stdin>:{}", err),
+ )
+ })
+}
+
+/// Read patterns from any reader, one per line.
+///
+/// If there was a problem reading or if any of the patterns contain invalid
+/// UTF-8, then an error is returned. If there was a problem with a specific
+/// pattern, then the error message will include the line number.
+///
+/// Note that this routine uses its own internal buffer, so the caller should
+/// not provide their own buffered reader if possible.
+///
+/// # Example
+///
+/// This shows how to parse patterns, one per line.
+///
+/// ```
+/// use grep_cli::patterns_from_reader;
+///
+/// # fn example() -> Result<(), Box<::std::error::Error>> {
+/// let patterns = "\
+/// foo
+/// bar\\s+foo
+/// [a-z]{3}
+/// ";
+///
+/// assert_eq!(patterns_from_reader(patterns.as_bytes())?, vec![
+/// r"foo",
+/// r"bar\s+foo",
+/// r"[a-z]{3}",
+/// ]);
+/// # Ok(()) }
+/// ```
+pub fn patterns_from_reader<R: io::Read>(rdr: R) -> io::Result<Vec<String>> {
+ let mut patterns = vec![];
+ let mut bufrdr = io::BufReader::new(rdr);
+ let mut line = vec![];
+ let mut line_number = 0;
+ while {
+ line.clear();
+ line_number += 1;
+ bufrdr.read_until(b'\n', &mut line)? > 0
+ } {
+ line.pop().unwrap(); // remove trailing '\n'
+ if line.last() == Some(&b'\r') {
+ line.pop().unwrap();
+ }
+ match pattern_from_bytes(&line) {
+ Ok(pattern) => patterns.push(pattern.to_string()),
+ Err(err) => {
+ return Err(io::Error::new(
+ io::ErrorKind::Other,
+ format!("{}: {}", line_number, err),
+ ));
+ }
+ }
+ }
+ Ok(patterns)
+}
+
#[cfg(test)]
mod tests {
    use super::{pattern_from_bytes, pattern_from_os};

    /// An invalid byte in a byte-slice pattern is reported at its offset.
    #[test]
    fn bytes() {
        let failure = pattern_from_bytes(b"abc\xFFxyz").unwrap_err();
        assert_eq!(3, failure.valid_up_to());
    }

    /// An invalid byte in an OS-string pattern is reported at its offset.
    #[test]
    #[cfg(unix)]
    fn os() {
        use std::ffi::OsStr;
        use std::os::unix::ffi::OsStrExt;

        let raw = OsStr::from_bytes(b"abc\xFFxyz");
        let failure = pattern_from_os(raw).unwrap_err();
        assert_eq!(3, failure.valid_up_to());
    }
}