struggling with printing contexts, what a mess

author: Andrew Gallant <jamslam@gmail.com> 2016-08-31 20:02:59 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2016-08-31 20:02:59 -0400
commit: 5aa3b9bc58e752c6c0c8f50a299eaf4e4bc00607 (patch)
tree: 31d718e33fb534f525a713313c85ca9dab646cdd /src
parent: 79f62012d7e59c0c2e6ee778e8f8eeeff83753ce (diff)
2 files changed, 321 insertions, 30 deletions
diff --git a/src/printer.rs b/src/printer.rs
index 88fad900..80d99242 100644
--- a/src/printer.rs
+++ b/src/printer.rs
@@ -58,6 +58,24 @@ impl<W: io::Write> Printer<W> {
         self.write(b"\n");
     }
 
+    pub fn context<P: AsRef<Path>>(
+        &mut self,
+        path: P,
+        buf: &[u8],
+        start: usize,
+        end: usize,
+        line_number: Option<u64>,
+    ) {
+        self.write(path.as_ref().to_string_lossy().as_bytes());
+        self.write(b"-");
+        if let Some(line_number) = line_number {
+            self.write(line_number.to_string().as_bytes());
+            self.write(b"-");
+        }
+        self.write(&buf[start..end]);
+        self.write(b"\n");
+    }
+
     pub fn binary_matched<P: AsRef<Path>>(&mut self, path: P) {
         wln!(&mut self.wtr, "binary file {} matches", path.as_ref().display());
     }
diff --git a/src/search.rs b/src/search.rs
index 7e2f948d..e0f8db5c 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -66,9 +66,13 @@ pub struct Searcher<'a, R, W: 'a> {
     match_count: u64,
     line_count: Option<u64>,
     last_match: Match,
+    last_printed: (usize, usize),
+    after_context_remaining: usize,
     count: bool,
     invert_match: bool,
     line_number: bool,
+    before_context: usize,
+    after_context: usize,
 }
 
 impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
@@ -100,9 +104,13 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
             match_count: 0,
             line_count: None,
             last_match: Match::default(),
+            last_printed: (0, 0),
+            after_context_remaining: 0,
             count: false,
             invert_match: false,
             line_number: false,
+            before_context: 0,
+            after_context: 0,
         }
     }
 
@@ -128,6 +136,20 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
         self
     }
 
+    /// The number of contextual lines to show before each match. The default
+    /// is zero.
+    pub fn before_context(mut self, count: usize) -> Self {
+        self.before_context = count;
+        self
+    }
+
+    /// The number of contextual lines to show after each match. The default
+    /// is zero.
+    pub fn after_context(mut self, count: usize) -> Self {
+        self.after_context = count;
+        self
+    }
+
     /// Execute the search. Results are written to the printer and the total
     /// number of matches is returned.
     #[inline(never)]
@@ -136,8 +158,19 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
         self.match_count = 0;
         self.line_count = if self.line_number { Some(0) } else { None };
         self.last_match = Match::default();
+        self.last_printed = (0, 0);
+        self.after_context_remaining = 0;
         loop {
-            let ok = try!(self.inp.fill(&mut self.haystack).map_err(|err| {
+            let mut keep_from = self.inp.lastnl;
+            if self.before_context > 0 {
+                let start_of_keep = cmp::min(
+                    self.inp.lastnl, self.last_printed.1 + 2);
+                keep_from = start_of_previous_lines(
+                    &self.inp.buf[start_of_keep..],
+                    self.inp.lastnl - start_of_keep,
+                    self.before_context);
+            }
+            let ok = try!(self.inp.fill(&mut self.haystack, keep_from).map_err(|err| {
                 Error::from_io(err, &self.path)
             }));
             if !ok {
@@ -173,13 +206,19 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
                         let upto = cmp::min(
                             self.inp.lastnl, self.last_match.end() + 1);
                         self.count_lines(upto);
-                        self.printer.matched(
-                            self.path,
-                            &self.inp.buf,
-                            self.last_match.start(),
-                            self.last_match.end(),
-                            self.line_count,
-                        );
+                        let start = self.last_match.start();
+                        let end = self.last_match.end();
+
+                        if self.before_context > 0 {
+                            let start = start.saturating_sub(1);
+                            let before_context_start = start_of_previous_lines(
+                                &self.inp.buf, start, self.before_context);
+                            let mut it = IterLines::new(before_context_start);
+                            while let Some((s, e)) = it.next(&self.inp.buf[..start]) {
+                                self.print_context(s, e);
+                            }
+                        }
+                        self.print_match(start, end);
                     }
                     // Move the position one past the end of the match so that
                     // the next search starts after the nl. If we're at EOF,
@@ -197,22 +236,15 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
     #[inline(always)]
     fn find_inverted_matches(&mut self, upto: usize) {
         debug_assert!(self.invert_match);
-        while self.inp.pos < upto {
-            let pos = memchr(b'\n', &self.inp.buf[self.inp.pos..upto])
-                      .unwrap_or(upto);
+        let mut it = IterLines::new(self.inp.pos);
+        while let Some((start, end)) = it.next(&self.inp.buf[..upto]) {
             if !self.count {
                 if let Some(ref mut line_count) = self.line_count {
                     *line_count += 1;
                 }
-                self.printer.matched(
-                    &self.path,
-                    &self.inp.buf,
-                    self.inp.pos,
-                    self.inp.pos + pos,
-                    self.line_count,
-                );
+                self.print_match(start, end);
             }
-            self.inp.pos += pos + 1;
+            self.inp.pos = end + 1;
             self.match_count += 1;
         }
     }
@@ -223,12 +255,28 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
             *line_count += count_lines(&self.inp.buf[self.inp.pos..upto]);
         }
     }
+
+    #[inline(always)]
+    fn print_match(&mut self, start: usize, end: usize) {
+        self.printer.matched(
+            &self.path, &self.inp.buf, start, end, self.line_count);
+        self.last_printed = (start, end);
+        self.after_context_remaining = self.after_context;
+    }
+
+    #[inline(always)]
+    fn print_context(&mut self, start: usize, end: usize) {
+        self.printer.context(
+            &self.path, &self.inp.buf, start, end, self.line_count);
+        self.last_printed = (start, end);
+    }
 }
 
 pub struct InputBuffer {
     read_size: usize,
     buf: Vec<u8>,
-    tmp: Vec<u8>,
+    tmp1: Vec<u8>,
+    tmp2: Vec<u8>,
     pos: usize,
     lastnl: usize,
     end: usize,
@@ -246,6 +294,8 @@ impl InputBuffer {
     ///
     /// The capacity determines the size of each read from the underlying
     /// reader.
+    ///
+    /// `cap` must be a minimum of `1`.
     pub fn with_capacity(mut cap: usize) -> InputBuffer {
         if cap == 0 {
             cap = 1;
@@ -253,7 +303,8 @@ impl InputBuffer {
         InputBuffer {
             read_size: cap,
             buf: vec![0; cap],
-            tmp: vec![],
+            tmp1: vec![],
+            tmp2: vec![],
             pos: 0,
             lastnl: 0,
             end: 0,
@@ -270,16 +321,34 @@ impl InputBuffer {
         self.is_binary = false;
     }
 
-    fn fill<R: io::Read>(&mut self, rdr: &mut R) -> Result<bool, io::Error> {
+    fn fill<R: io::Read>(
+        &mut self,
+        rdr: &mut R,
+        keep_from: usize,
+    ) -> Result<bool, io::Error> {
+        self.pos = 0;
+        self.tmp1.clear();
+        self.tmp2.clear();
+
+        // Save the leftovers from the previous fill before anything else.
         if self.lastnl < self.end {
-            self.tmp.clear();
-            self.tmp.extend_from_slice(&self.buf[self.lastnl..self.end]);
-            self.buf[0..self.tmp.len()].copy_from_slice(&self.tmp);
-            self.end = self.tmp.len();
+            self.tmp1.extend_from_slice(&self.buf[self.lastnl..self.end]);
+        }
+        // If we need to save lines to account for context, do that here.
+        // These context lines have already been searched, but make up the
+        // first bytes of this buffer.
+        if keep_from < self.lastnl {
+            self.tmp2.extend_from_slice(&self.buf[keep_from..self.lastnl]);
+            self.buf[0..self.tmp2.len()].copy_from_slice(&self.tmp2);
+            self.pos = self.tmp2.len();
+        }
+        if !self.tmp1.is_empty() {
+            let (start, end) = (self.pos, self.pos + self.tmp1.len());
+            self.buf[start..end].copy_from_slice(&self.tmp1);
+            self.end = end;
         } else {
-            self.end = 0;
+            self.end = self.pos;
         }
-        self.pos = 0;
         self.lastnl = 0;
         while self.lastnl == 0 {
             if self.buf.len() - self.end < self.read_size {
@@ -296,7 +365,7 @@ impl InputBuffer {
             }
             self.first = false;
             if n == 0 {
-                if self.end == 0 {
+                if self.end - self.pos == 0 {
                     return Ok(false);
                 }
                 self.lastnl = self.end;
@@ -316,6 +385,10 @@ impl InputBuffer {
     }
 }
 
+/// Returns true if and only if the given buffer is determined to be "binary"
+/// or otherwise not contain text data that is usefully searchable.
+///
+/// Note that this may return both false positives and false negatives!
 #[inline(always)]
 fn is_binary(buf: &[u8]) -> bool {
     if buf.len() >= 4 && &buf[0..4] == b"%PDF" {
@@ -324,6 +397,7 @@ fn is_binary(buf: &[u8]) -> bool {
     memchr(b'\x00', &buf[0..cmp::min(1024, buf.len())]).is_some()
 }
 
+/// Count the number of lines in the given buffer.
 #[inline(always)]
 fn count_lines(mut buf: &[u8]) -> u64 {
     let mut count = 0;
@@ -334,6 +408,96 @@ fn count_lines(mut buf: &[u8]) -> u64 {
     count
 }
 
+/// An "iterator" over lines in a particular buffer.
+///
+/// Idiomatic Rust would borrow the buffer and use it as internal state to
+/// advance over the positions of each line. We neglect that approach to avoid
+/// the borrow in the search code. (Because the borrow prevents composition
+/// through other mutable methods.)
+struct IterLines {
+    pos: usize,
+}
+
+impl IterLines {
+    /// Creates a new iterator over lines starting at the position given.
+    ///
+    /// The buffer is passed to the `next` method.
+    #[inline(always)]
+    fn new(start: usize) -> IterLines {
+        IterLines {
+            pos: start,
+        }
+    }
+
+    /// Return the start and end position of the next line in the buffer. The
+    /// buffer given should be the same on every call.
+    #[inline(always)]
+    fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> {
+        match memchr(b'\n', &buf[self.pos..]) {
+            None => {
+                if self.pos < buf.len() {
+                    let start = self.pos;
+                    self.pos = buf.len();
+                    Some((start, buf.len()))
+                } else {
+                    None
+                }
+            }
+            Some(end) => {
+                let start = self.pos;
+                let end = self.pos + end;
+                self.pos = end + 1;
+                Some((start, end))
+            }
+        }
+    }
+}
+
+/// Returns the starting index of the Nth line preceding `end`.
+///
+/// If `buf` is empty, then `0` is returned. If `count` is `0`, then `end` is
+/// returned.
+///
+/// If `end` points at a new line in `buf`, then searching starts as if `end`
+/// pointed immediately before the new line.
+///
+/// The position returned corresponds to the first byte in the given line.
+#[inline(always)]
+fn start_of_previous_lines(
+    buf: &[u8],
+    mut end: usize,
+    mut count: usize,
+) -> usize {
+    if buf.is_empty() {
+        return 0;
+    }
+    if count == 0 {
+        return end;
+    }
+    if end == buf.len() {
+        end -= 1;
+    }
+    if buf[end] == b'\n' {
+        end -= 1;
+    }
+    while count > 0 {
+        match memrchr(b'\n', &buf[..end]) {
+            None => {
+                return 0;
+            }
+            Some(i) => {
+                count -= 1;
+                end = i;
+                if end == 0 {
+                    return end;
+                }
+                end -= 1;
+            }
+        }
+    }
+    end + 2
+}
+
 #[cfg(test)]
 mod tests {
     use std::io;
@@ -343,7 +507,7 @@ mod tests {
 
     use printer::Printer;
 
-    use super::{InputBuffer, Searcher};
+    use super::{InputBuffer, Searcher, start_of_previous_lines};
 
     fn hay(s: &str) -> io::Cursor<Vec<u8>> {
         io::Cursor::new(s.to_string().into_bytes())
@@ -358,6 +522,88 @@ mod tests {
     }
 
     #[test]
+    fn previous_lines() {
+        let text = &b"\
+For the Doctor Watsons of this world, as opposed to the Sherlock
+Holmeses, success in the province of detective work must always
+be, to a very large extent, the result of luck. Sherlock Holmes
+can extract a clew from a wisp of straw or a flake of cigar ash;
+but Doctor Watson has to have it taken out for him and dusted,
+and exhibited clearly, with a label attached.\
+"[..];
+        assert_eq!(366, text.len());
+
+        assert_eq!(0, start_of_previous_lines(text, 366, 100));
+        assert_eq!(366, start_of_previous_lines(text, 366, 0));
+
+        assert_eq!(321, start_of_previous_lines(text, 366, 1));
+        assert_eq!(321, start_of_previous_lines(text, 365, 1));
+        assert_eq!(321, start_of_previous_lines(text, 364, 1));
+        assert_eq!(321, start_of_previous_lines(text, 322, 1));
+        assert_eq!(321, start_of_previous_lines(text, 321, 1));
+        assert_eq!(258, start_of_previous_lines(text, 320, 1));
+
+        assert_eq!(258, start_of_previous_lines(text, 366, 2));
+        assert_eq!(258, start_of_previous_lines(text, 365, 2));
+        assert_eq!(258, start_of_previous_lines(text, 364, 2));
+        assert_eq!(258, start_of_previous_lines(text, 322, 2));
+        assert_eq!(258, start_of_previous_lines(text, 321, 2));
+        assert_eq!(193, start_of_previous_lines(text, 320, 2));
+
+        assert_eq!(65, start_of_previous_lines(text, 66, 1));
+        assert_eq!(0, start_of_previous_lines(text, 66, 2));
+        assert_eq!(64, start_of_previous_lines(text, 64, 0));
+        assert_eq!(0, start_of_previous_lines(text, 64, 1));
+        assert_eq!(0, start_of_previous_lines(text, 64, 2));
+
+        assert_eq!(0, start_of_previous_lines(text, 0, 2));
+        assert_eq!(0, start_of_previous_lines(text, 0, 1));
+    }
+
+    #[test]
+    fn previous_lines_short() {
+        let text = &b"a\nb\nc\nd\ne\nf\n"[..];
+        assert_eq!(12, text.len());
+
+        assert_eq!(10, start_of_previous_lines(text, 12, 1));
+        assert_eq!(8, start_of_previous_lines(text, 12, 2));
+        assert_eq!(6, start_of_previous_lines(text, 12, 3));
+        assert_eq!(4, start_of_previous_lines(text, 12, 4));
+        assert_eq!(2, start_of_previous_lines(text, 12, 5));
+        assert_eq!(0, start_of_previous_lines(text, 12, 6));
+        assert_eq!(0, start_of_previous_lines(text, 12, 7));
+        assert_eq!(10, start_of_previous_lines(text, 11, 1));
+        assert_eq!(8, start_of_previous_lines(text, 11, 2));
+        assert_eq!(6, start_of_previous_lines(text, 11, 3));
+        assert_eq!(4, start_of_previous_lines(text, 11, 4));
+        assert_eq!(2, start_of_previous_lines(text, 11, 5));
+        assert_eq!(0, start_of_previous_lines(text, 11, 6));
+        assert_eq!(0, start_of_previous_lines(text, 11, 7));
+        assert_eq!(10, start_of_previous_lines(text, 10, 1));
+        assert_eq!(8, start_of_previous_lines(text, 10, 2));
+        assert_eq!(6, start_of_previous_lines(text, 10, 3));
+        assert_eq!(4, start_of_previous_lines(text, 10, 4));
+        assert_eq!(2, start_of_previous_lines(text, 10, 5));
+        assert_eq!(0, start_of_previous_lines(text, 10, 6));
+        assert_eq!(0, start_of_previous_lines(text, 10, 7));
+
+        assert_eq!(8, start_of_previous_lines(text, 9, 1));
+        assert_eq!(8, start_of_previous_lines(text, 8, 1));
+
+        assert_eq!(6, start_of_previous_lines(text, 7, 1));
+        assert_eq!(6, start_of_previous_lines(text, 6, 1));
+
+        assert_eq!(4, start_of_previous_lines(text, 5, 1));
+        assert_eq!(4, start_of_previous_lines(text, 4, 1));
+
+        assert_eq!(2, start_of_previous_lines(text, 3, 1));
+        assert_eq!(2, start_of_previous_lines(text, 2, 1));
+
+        assert_eq!(0, start_of_previous_lines(text, 1, 1));
+        assert_eq!(0, start_of_previous_lines(text, 0, 1));
+    }
+
+    #[test]
     fn basic_search() {
         let text = hay("\
 For the Doctor Watsons of this world, as opposed to the Sherlock
@@ -510,4 +756,31 @@ and exhibited clearly, with a label attached.\
         let out = String::from_utf8(pp.into_inner()).unwrap();
         assert_eq!(out, "/baz.rs:4\n");
     }
+
+    #[test]
+    fn before_context_one() {
+        let text = hay("\
+For the Doctor Watsons of this world, as opposed to the Sherlock
+Holmeses, success in the province of detective work must always
+be, to a very large extent, the result of luck. Sherlock Holmes
+can extract a clew from a wisp of straw or a flake of cigar ash;
+but Doctor Watson has to have it taken out for him and dusted,
+and exhibited clearly, with a label attached.\
+");
+        let mut inp = InputBuffer::with_capacity(1);
+        let mut pp = Printer::new(vec![]);
+        let grep = matcher("Sherlock");
+        let count = {
+            let searcher = Searcher::new(
+                &mut inp, &mut pp, &grep, test_path(), text);
+            searcher.before_context(1).run().unwrap()
+        };
+        assert_eq!(2, count);
+        let out = String::from_utf8(pp.into_inner()).unwrap();
+        assert_eq!(out, "\
+/baz.rs:For the Doctor Watsons of this world, as opposed to the Sherlock
+/baz.rs-Holmeses, success in the province of detective work must always
+/baz.rs:be, to a very large extent, the result of luck. Sherlock Holmes
+");
+    }
 }
author	Andrew Gallant <jamslam@gmail.com>	2016-08-31 20:02:59 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2016-08-31 20:02:59 -0400
commit	5aa3b9bc58e752c6c0c8f50a299eaf4e4bc00607 (patch)
tree	31d718e33fb534f525a713313c85ca9dab646cdd /src
parent	79f62012d7e59c0c2e6ee778e8f8eeeff83753ce (diff)