summaryrefslogtreecommitdiffstats
path: root/buffered-reader/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'buffered-reader/src/lib.rs')
-rw-r--r--buffered-reader/src/lib.rs372
1 files changed, 372 insertions, 0 deletions
diff --git a/buffered-reader/src/lib.rs b/buffered-reader/src/lib.rs
new file mode 100644
index 00000000..951b8c71
--- /dev/null
+++ b/buffered-reader/src/lib.rs
@@ -0,0 +1,372 @@
+//! An improved `BufRead` interface.
+
+extern crate flate2;
+extern crate bzip2;
+
+use std::str;
+use std::io;
+use std::io::{Error,ErrorKind};
+use std::cmp;
+use std::fmt;
+
+mod generic;
+mod memory;
+mod limitor;
+mod decompress;
+
+pub use self::generic::BufferedReaderGeneric;
+pub use self::memory::BufferedReaderMemory;
+pub use self::limitor::BufferedReaderLimitor;
+pub use self::decompress::BufferedReaderDeflate;
+pub use self::decompress::BufferedReaderZlib;
+pub use self::decompress::BufferedReaderBzip;
+
+// The default buffer size.
+const DEFAULT_BUF_SIZE: usize = 8 * 1024;
+
+/// A `BufferedReader` is a type of `Read`er that has an internal
+/// buffer, and allows working directly from that buffer.
+///
+/// Like a `BufRead`er, the internal buffer amortizes system calls,
+/// and it is exposed so that a user can work with the data in place
+/// rather than having to first copy it to a local buffer.  However,
+/// unlike `BufRead`, `BufferedReader` allows the caller to ensure
+/// that the internal buffer has a certain amount of data.
+pub trait BufferedReader : io::Read + fmt::Debug {
+    /// Returns the data in the internal buffer.
+    ///
+    /// Normally, the returned buffer will contain *at least* `amount`
+    /// bytes worth of data.  Less data may be returned if the end of
+    /// the file is reached or an error occurs.  In these cases, any
+    /// remaining data is returned.  Note: the error is not discarded,
+    /// but will be returned when `data` is called and the internal
+    /// buffer is empty.
+    ///
+    /// This function does not advance the cursor.  Thus, multiple
+    /// calls will return the same data.  To advance the cursor, use
+    /// `consume`.
+    fn data(&mut self, amount: usize) -> Result<&[u8], io::Error>;
+
+    /// Like `data`, but returns an error if fewer than `amount` bytes
+    /// are available.
+    ///
+    /// # Errors
+    ///
+    /// Any error reported by `data` is passed through unchanged.  If
+    /// `data` succeeds but yields fewer than `amount` bytes, an error
+    /// of kind `ErrorKind::UnexpectedEof` is returned.
+    fn data_hard(&mut self, amount: usize) -> Result<&[u8], io::Error> {
+        let buffer = self.data(amount)?;
+        if buffer.len() < amount {
+            return Err(Error::new(ErrorKind::UnexpectedEof, "unexpected EOF"));
+        }
+        Ok(buffer)
+    }
+
+    /// Returns all of the data until EOF.  Like `data`, this does not
+    /// actually consume the data that is read.
+    ///
+    /// In general, you shouldn't use this function as it can cause an
+    /// enormous amount of buffering.  But, if you know that the
+    /// amount of data is limited, this is acceptable.
+    fn data_eof(&mut self) -> Result<&[u8], io::Error> {
+        // Don't just read std::usize::MAX bytes at once.  The
+        // implementation might try to actually allocate a buffer that
+        // large!  Instead, try with increasingly larger buffers until
+        // the read is (strictly) shorter than the specified size.
+        let mut s = DEFAULT_BUF_SIZE;
+        while s < std::usize::MAX {
+            // We really want to return the buffer from inside the
+            // loop, but the borrow checker won't let us:
+            //
+            //   error[E0499]: cannot borrow `*self` as mutable more
+            //   than once at a time.
+            //
+            // Instead, we only look at the length here, break out of
+            // the loop, and call self.data(s) again below.  This
+            // extra call shouldn't have any significant cost, because
+            // the buffer should already be prepared.
+            if self.data(s)?.len() < s {
+                break;
+            }
+
+            // Saturate rather than overflow: a plain `s *= 2` panics
+            // in debug builds and wraps to 0 in release builds, and
+            // once `s` is 0 the short-read test above can never
+            // succeed, so the loop would never terminate.
+            s = s.saturating_mul(2);
+        }
+        self.data(s)
+    }
+
+    /// Marks the first `amount` bytes of the internal buffer as
+    /// consumed.
+    ///
+    /// It is an error to call this function without having first
+    /// successfully called `data` (or a related function) to buffer
+    /// `amount` bytes.
+    ///
+    /// This function returns the data that has been consumed.
+    fn consume(&mut self, amount: usize) -> &[u8];
+
+    /// A convenience function that effectively combines `data` and
+    /// `consume`.
+    fn data_consume(&mut self, amount: usize)
+                    -> Result<&[u8], std::io::Error>;
+
+    /// A convenience function that effectively combines `data_hard`
+    /// and `consume`.
+    fn data_consume_hard(&mut self, amount: usize) -> Result<&[u8], io::Error>;
+
+    /// A convenience function for reading a 16-bit unsigned integer
+    /// in big endian format.
+    ///
+    /// The two bytes are obtained with `data_consume_hard`, so any
+    /// error it reports is returned unchanged.
+    fn read_be_u16(&mut self) -> Result<u16, std::io::Error> {
+        let input = self.data_consume_hard(2)?;
+        Ok(((input[0] as u16) << 8) | (input[1] as u16))
+    }
+
+    /// A convenience function for reading a 32-bit unsigned integer
+    /// in big endian format.
+    ///
+    /// The four bytes are obtained with `data_consume_hard`, so any
+    /// error it reports is returned unchanged.
+    fn read_be_u32(&mut self) -> Result<u32, std::io::Error> {
+        let input = self.data_consume_hard(4)?;
+        Ok(((input[0] as u32) << 24) | ((input[1] as u32) << 16)
+           | ((input[2] as u32) << 8) | (input[3] as u32))
+    }
+
+    /// Reads and consumes `amount` bytes, and returns them in a
+    /// caller owned buffer.  Implementations may optimize this to
+    /// avoid a copy.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `data_consume_hard` succeeds but hands back fewer
+    /// than `amount` bytes.
+    fn steal(&mut self, amount: usize) -> Result<Vec<u8>, std::io::Error> {
+        let data = self.data_consume_hard(amount)?;
+        assert!(data.len() >= amount);
+        // More than `amount` bytes may come back; only the first
+        // `amount` are copied out.
+        Ok(data[..amount].to_vec())
+    }
+
+    /// Like `steal`, but instead of stealing a fixed number of bytes,
+    /// it steals all of the data it can.
+    fn steal_eof(&mut self) -> Result<Vec<u8>, std::io::Error> {
+        let len = self.data_eof()?.len();
+        self.steal(len)
+    }
+
+    /// Consumes the boxed reader and returns a boxed `BufferedReader`,
+    /// or `None`.
+    ///
+    /// NOTE(review): presumably this yields the reader that `self`
+    /// wraps, and `None` when there is no such reader -- confirm
+    /// against the implementations.
+    fn into_inner<'a>(self: Box<Self>) -> Option<Box<BufferedReader + 'a>>
+        where Self: 'a;
+}
+
+/// Implements the `std::io::Read::read` method in terms of the
+/// `data_consume` method.
+///
+/// We can't use the `std::io::Read` interface of an underlying
+/// reader, because the `BufferedReader` may have buffered some data
+/// internally (in which case a read will not return the buffered
+/// data, but the following data).  This implementation is generic.
+/// When deriving a `BufferedReader`, you can include the following:
+///
+/// ```text
+/// impl<'a, T: BufferedReader> std::io::Read for BufferedReaderXXX<'a, T> {
+///     fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
+///         return buffered_reader_generic_read_impl(self, buf);
+///     }
+/// }
+/// ```
+///
+/// It would be nice if we could do:
+///
+/// ```text
+/// impl <T: BufferedReader> std::io::Read for T { ... }
+/// ```
+///
+/// but, alas, Rust doesn't like that ("error[E0119]: conflicting
+/// implementations of trait `std::io::Read` for type `&mut _`").
+///
+/// Returns the number of bytes copied into `buf`, which is the
+/// smaller of `buf.len()` and the length of the slice handed back by
+/// `data_consume`.  Errors from `data_consume` are passed through.
+pub fn buffered_reader_generic_read_impl<T: BufferedReader>
+        (bio: &mut T, buf: &mut [u8]) -> Result<usize, io::Error> {
+    let wanted = buf.len();
+    let source = bio.data_consume(wanted)?;
+
+    // Copy only as much as both slices can hold, in case the reader
+    // hands back a different number of bytes than was asked for.
+    let count = cmp::min(wanted, source.len());
+    buf[..count].copy_from_slice(&source[..count]);
+    Ok(count)
+}
+
+/// Make a `Box<BufferedReader>` look like a `BufferedReader`.
+///
+/// Every method simply forwards to the boxed reader, including the
+/// methods that have default implementations in the trait, so that
+/// any overrides provided by the boxed reader are used.
+impl <'a> BufferedReader for Box<BufferedReader + 'a> {
+    fn data(&mut self, amount: usize) -> Result<&[u8], io::Error> {
+        (**self).data(amount)
+    }
+
+    fn data_hard(&mut self, amount: usize) -> Result<&[u8], io::Error> {
+        (**self).data_hard(amount)
+    }
+
+    fn data_eof(&mut self) -> Result<&[u8], io::Error> {
+        (**self).data_eof()
+    }
+
+    fn consume(&mut self, amount: usize) -> &[u8] {
+        (**self).consume(amount)
+    }
+
+    fn data_consume(&mut self, amount: usize)
+                    -> Result<&[u8], std::io::Error> {
+        (**self).data_consume(amount)
+    }
+
+    fn data_consume_hard(&mut self, amount: usize) -> Result<&[u8], io::Error> {
+        (**self).data_consume_hard(amount)
+    }
+
+    fn read_be_u16(&mut self) -> Result<u16, std::io::Error> {
+        (**self).read_be_u16()
+    }
+
+    fn read_be_u32(&mut self) -> Result<u32, std::io::Error> {
+        (**self).read_be_u32()
+    }
+
+    fn steal(&mut self, amount: usize) -> Result<Vec<u8>, std::io::Error> {
+        (**self).steal(amount)
+    }
+
+    fn steal_eof(&mut self) -> Result<Vec<u8>, std::io::Error> {
+        (**self).steal_eof()
+    }
+
+    fn into_inner<'b>(self: Box<Self>) -> Option<Box<BufferedReader + 'b>>
+            where Self: 'b {
+        // `self` is a box holding a box: move the inner box out of
+        // the outer one, then let the boxed reader answer.
+        let unboxed = *self;
+        unboxed.into_inner()
+    }
+}
+
+// Checks that `bio` yields the contents of buffered-reader-test.txt,
+// reading and consuming one record at a time.
+//
+// The file was created as follows:
+//
+//   for i in $(seq 0 9999); do printf "%04d\n" $i; done > buffered-reader-test.txt
+#[cfg(test)]
+fn buffered_reader_test_data_check<'a, T: BufferedReader + 'a>(bio: &mut T) {
+    // Each number is 4 bytes plus a newline character.
+    const RECORD_LEN: usize = 5;
+
+    for i in 0 .. 10000 {
+        {
+            let outcome = bio.data_hard(RECORD_LEN);
+            if outcome.is_err() {
+                // Say which record failed before the unwrap below
+                // panics.
+                println!("Error for i == {}: {:?}", i, outcome);
+            }
+            let record = outcome.unwrap();
+            assert!(record.len() >= RECORD_LEN);
+            assert_eq!(format!("{:04}\n", i),
+                       str::from_utf8(&record[0..RECORD_LEN]).unwrap());
+        }
+
+        bio.consume(RECORD_LEN);
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Checks that `data_eof` reports everything up to EOF, first for
+    /// a reader built directly from the test data and then for the
+    /// same kind of reader wrapped in a limitor.
+    #[test]
+    fn buffered_reader_eof_test() {
+        let data : &[u8] = include_bytes!("buffered-reader-test.txt");
+
+        // Make sure data_eof works: once everything it reported has
+        // been consumed, nothing may be left.
+        {
+            let mut reader = BufferedReaderMemory::new(data);
+            let total = reader.data_eof().unwrap().len();
+            reader.consume(total);
+            assert_eq!(reader.data(1).unwrap().len(), 0);
+        }
+
+        // Try it again with a limitor set to half of the data.
+        {
+            let limit = data.len() / 2;
+            let inner = BufferedReaderMemory::new(data);
+            let mut reader = BufferedReaderLimitor::new(inner, limit as u64);
+            let total = reader.data_eof().unwrap().len();
+            assert_eq!(total, limit);
+            reader.consume(total);
+            assert_eq!(reader.data(1).unwrap().len(), 0);
+        }
+    }
+
+    /// Reads `bio` in fixed-size chunks through `read`, peeking with
+    /// `data` before every read, and compares what comes back against
+    /// `data`.
+    #[cfg(test)]
+    fn buffered_reader_read_test_aux<'a, T: BufferedReader + 'a>
+            (mut bio: T, data: &[u8]) {
+        let mut chunk = [0; 99];
+        let chunk_len = chunk.len();
+
+        // Make sure the test file has more than one chunk worth of
+        // data.
+        assert!(chunk_len < data.len());
+
+        // The number of reads we'll have to perform.
+        let iters = (data.len() + chunk_len - 1) / chunk_len;
+        // Do one round more than the number of required reads to
+        // check what happens when we try to read beyond the end of
+        // the file.
+        for round in 0..iters + 1 {
+            let offset = round * chunk_len;
+
+            // We don't want to just check that read works in
+            // isolation.  We want to be able to mix .read and .data
+            // calls.
+            {
+                let peeked = bio.data(chunk_len).unwrap();
+                if !peeked.is_empty() {
+                    assert_eq!(peeked,
+                               &data[offset..offset + peeked.len()]);
+                }
+            }
+
+            // Now do the actual read.
+            let got = bio.read(&mut chunk[..]).unwrap();
+            if got > 0 {
+                assert_eq!(&chunk[0..got], &data[offset..offset + got]);
+            }
+
+            if round >= iters {
+                // We should have read everything.
+                assert!(got == 0);
+            } else if round + 1 == iters {
+                // The last read.  This may be less than a full chunk.
+                // But it should include at least one byte.
+                assert!(0 < got);
+                assert!(got <= chunk_len);
+            } else {
+                assert_eq!(got, chunk_len);
+            }
+        }
+    }
+
+    /// Runs the chunked read check against a reader built from the
+    /// embedded test data and against one built on the test file
+    /// opened from disk.
+    #[test]
+    fn buffered_reader_read_test() {
+        let data : &[u8] = include_bytes!("buffered-reader-test.txt");
+
+        {
+            let reader = BufferedReaderMemory::new(data);
+            buffered_reader_read_test_aux(reader, data);
+        }
+
+        {
+            use std::fs::File;
+            use std::path::PathBuf;
+
+            let path : PathBuf = [env!("CARGO_MANIFEST_DIR"),
+                                  "src",
+                                  "buffered-reader-test.txt"]
+                .iter().collect();
+
+            let mut file = File::open(&path).expect(&path.to_string_lossy());
+            let reader = BufferedReaderGeneric::new(&mut file, None);
+            buffered_reader_read_test_aux(reader, data);
+        }
+    }
+}