diff options
Diffstat (limited to 'buffered-reader/src/lib.rs')
-rw-r--r-- | buffered-reader/src/lib.rs | 372 |
1 files changed, 372 insertions, 0 deletions
diff --git a/buffered-reader/src/lib.rs b/buffered-reader/src/lib.rs new file mode 100644 index 00000000..951b8c71 --- /dev/null +++ b/buffered-reader/src/lib.rs @@ -0,0 +1,372 @@ +//! An improved `BufRead` interface. + +extern crate flate2; +extern crate bzip2; + +use std::str; +use std::io; +use std::io::{Error,ErrorKind}; +use std::cmp; +use std::fmt; + +mod generic; +mod memory; +mod limitor; +mod decompress; + +pub use self::generic::BufferedReaderGeneric; +pub use self::memory::BufferedReaderMemory; +pub use self::limitor::BufferedReaderLimitor; +pub use self::decompress::BufferedReaderDeflate; +pub use self::decompress::BufferedReaderZlib; +pub use self::decompress::BufferedReaderBzip; + +// The default buffer size. +const DEFAULT_BUF_SIZE: usize = 8 * 1024; + +/// A `BufferedReader` is a type of `Read`er that has an internal +/// buffer, and allows working directly from that buffer. Like a +/// `BufRead`er, the internal buffer amortizes system calls. And, +/// like a `BufRead`, a `BufferedReader` exposes the internal buffer +/// so that a user can work with the data in place rather than having +/// to first copy it to a local buffer. However, unlike `BufRead`, +/// `BufferedReader` allows the caller to ensure that the internal +/// buffer has a certain amount of data. +pub trait BufferedReader : io::Read + fmt::Debug { + /// Return the data in the internal buffer. Normally, the + /// returned buffer will contain *at least* `amount` bytes worth + /// of data. Less data may be returned if the end of the file is + /// reached or an error occurs. In these cases, any remaining + /// data is returned. Note: the error is not discarded, but will + /// be returned when data is called and the internal buffer is + /// empty. + /// + /// This function does not advance the cursor. Thus, multiple + /// calls will return the same data. To advance the cursor, use + /// `consume`. + fn data(&mut self, amount: usize) -> Result<&[u8], io::Error>; + + /// Like `data`, but returns an error if there is not at least + /// `amount` bytes available. + fn data_hard(&mut self, amount: usize) -> Result<&[u8], io::Error> { + let result = self.data(amount); + if let Ok(buffer) = result { + if buffer.len() < amount { + return Err(Error::new(ErrorKind::UnexpectedEof, "unepxected EOF")); + } + } + return result; + } + + /// Return all of the data until EOF. Like `data`, this does not + /// actually consume the data that is read. + /// + /// In general, you shouldn't use this function as it can cause an + /// enormous amount of buffering. But, if you know that the + /// amount of data is limited, this is acceptable. + fn data_eof(&mut self) -> Result<&[u8], io::Error> { + // Don't just read std::usize::MAX bytes at once. The + // implementation might try to actually allocate a buffer that + // large! Instead, try with increasingly larger buffers until + // the read is (strictly) shorter than the specified size. + let mut s = DEFAULT_BUF_SIZE; + while s < std::usize::MAX { + match self.data(s) { + Ok(ref buffer) => + if buffer.len() < s { + // We really want to do + // + // return Ok(buffer); + // + // But, the borrower checker won't let us: + // + // error[E0499]: cannot borrow `*self` as + // mutable more than once at a time. + // + // Instead, we break out of the loop, and then + // call self.data(s) again. This extra call + // shouldn't have any significant cost, + // because the buffer should already be + // prepared. + break; + } else { + s *= 2; + }, + Err(err) => + return Err(err), + } + } + return self.data(s); + } + + /// Mark the first `amount` bytes of the internal buffer as + /// consumed. It is an error to call this function without having + /// first successfully called `data` (or a related function) to + /// buffer `amount` bytes. + /// + /// This function returns the data that has been consumed. + fn consume(&mut self, amount: usize) -> &[u8]; + + /// This is a convenient function that effectively combines data() + /// and consume(). + fn data_consume(&mut self, amount: usize) + -> Result<&[u8], std::io::Error>; + + + // This is a convenient function that effectively combines + // data_hard() and consume(). + fn data_consume_hard(&mut self, amount: usize) -> Result<&[u8], io::Error>; + + /// A convenience function for reading a 16-bit unsigned integer + /// in big endian format. + fn read_be_u16(&mut self) -> Result<u16, std::io::Error> { + let input = self.data_consume_hard(2)?; + return Ok(((input[0] as u16) << 8) + (input[1] as u16)); + } + + /// A convenience function for reading a 32-bit unsigned integer + /// in big endian format. + fn read_be_u32(&mut self) -> Result<u32, std::io::Error> { + let input = self.data_consume_hard(4)?; + return Ok(((input[0] as u32) << 24) + ((input[1] as u32) << 16) + + ((input[2] as u32) << 8) + (input[3] as u32)); + } + + /// Reads and consumes `amount` bytes, and returns them in a + /// caller owned buffer. Implementations may optimize this to + /// avoid a copy. + fn steal(&mut self, amount: usize) -> Result<Vec<u8>, std::io::Error> { + let mut data = self.data_consume_hard(amount)?; + assert!(data.len() >= amount); + if data.len() > amount { + data = &data[..amount]; + } + return Ok(data.to_vec()); + } + + /// Like steal, but instead of stealing a fixed number of bytes, + /// it steals all of the data it can. + fn steal_eof(&mut self) -> Result<Vec<u8>, std::io::Error> { + let len = self.data_eof()?.len(); + let data = self.steal(len)?; + return Ok(data); + } + + fn into_inner<'a>(self: Box<Self>) -> Option<Box<BufferedReader + 'a>> + where Self: 'a; +} + +/// This function implements the `std::io::Read::read` method in terms +/// of the `data_consume` method. We can't use the `io::std::Read` +/// interface, because the `BufferedReader` may have buffered some +/// data internally (in which case a read will not return the buffered +/// data, but the following data). This implementation is generic. +/// When deriving a `BufferedReader`, you can include the following: +/// +/// ```text +/// impl<'a, T: BufferedReader> std::io::Read for BufferedReaderXXX<'a, T> { +/// fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> { +/// return buffered_reader_generic_read_impl(self, buf); +/// } +/// } +/// ``` +/// +/// It would be nice if we could do: +/// +/// ```text +/// impl <T: BufferedReader> std::io::Read for T { ... } +/// ``` +/// +/// but, alas, Rust doesn't like that ("error[E0119]: conflicting +/// implementations of trait `std::io::Read` for type `&mut _`"). +pub fn buffered_reader_generic_read_impl<T: BufferedReader> + (bio: &mut T, buf: &mut [u8]) -> Result<usize, io::Error> { + match bio.data_consume(buf.len()) { + Ok(inner) => { + let amount = cmp::min(buf.len(), inner.len()); + buf[0..amount].copy_from_slice(&inner[0..amount]); + return Ok(amount); + }, + Err(err) => return Err(err), + } +} + +/// Make a `Box<BufferedReader>` look like a BufferedReader. +impl <'a> BufferedReader for Box<BufferedReader + 'a> { + fn data(&mut self, amount: usize) -> Result<&[u8], io::Error> { + return self.as_mut().data(amount); + } + + fn data_hard(&mut self, amount: usize) -> Result<&[u8], io::Error> { + return self.as_mut().data_hard(amount); + } + + fn data_eof(&mut self) -> Result<&[u8], io::Error> { + return self.as_mut().data_eof(); + } + + fn consume(&mut self, amount: usize) -> &[u8] { + return self.as_mut().consume(amount); + } + + fn data_consume(&mut self, amount: usize) + -> Result<&[u8], std::io::Error> { + return self.as_mut().data_consume(amount); + } + + fn data_consume_hard(&mut self, amount: usize) -> Result<&[u8], io::Error> { + return self.as_mut().data_consume_hard(amount); + } + + fn read_be_u16(&mut self) -> Result<u16, std::io::Error> { + return self.as_mut().read_be_u16(); + } + + fn read_be_u32(&mut self) -> Result<u32, std::io::Error> { + return self.as_mut().read_be_u32(); + } + + fn steal(&mut self, amount: usize) -> Result<Vec<u8>, std::io::Error> { + return self.as_mut().steal(amount); + } + + fn steal_eof(&mut self) -> Result<Vec<u8>, std::io::Error> { + return self.as_mut().steal_eof(); + } + + fn into_inner<'b>(self: Box<Self>) -> Option<Box<BufferedReader + 'b>> + where Self: 'b { + // Strip the outer box. + (*self).into_inner() + } +} + +// The file was created as follows: +// +// for i in $(seq 0 9999); do printf "%04d\n" $i; done > buffered-reader-test.txt +#[cfg(test)] +fn buffered_reader_test_data_check<'a, T: BufferedReader + 'a>(bio: &mut T) { + for i in 0 .. 10000 { + let consumed = { + // Each number is 4 bytes plus a newline character. + let d = bio.data_hard(5); + if d.is_err() { + println!("Error for i == {}: {:?}", i, d); + } + let d = d.unwrap(); + assert!(d.len() >= 5); + assert_eq!(format!("{:04}\n", i), str::from_utf8(&d[0..5]).unwrap()); + + 5 + }; + + bio.consume(consumed); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn buffered_reader_eof_test() { + let data : &[u8] = include_bytes!("buffered-reader-test.txt"); + + // Make sure data_eof works. + { + let mut bio = BufferedReaderMemory::new(data); + let amount = { + bio.data_eof().unwrap().len() + }; + bio.consume(amount); + assert_eq!(bio.data(1).unwrap().len(), 0); + } + + // Try it again with a limitor. + { + let bio = BufferedReaderMemory::new(data); + let mut bio2 = BufferedReaderLimitor::new(bio, (data.len() / 2) as u64); + let amount = { + bio2.data_eof().unwrap().len() + }; + assert_eq!(amount, data.len() / 2); + bio2.consume(amount); + assert_eq!(bio2.data(1).unwrap().len(), 0); + } + } + + #[cfg(test)] + fn buffered_reader_read_test_aux<'a, T: BufferedReader + 'a> + (mut bio: T, data: &[u8]) { + let mut buffer = [0; 99]; + + // Make sure the test file has more than buffer.len() bytes + // worth of data. + assert!(buffer.len() < data.len()); + + // The number of reads we'll have to perform. + let iters = (data.len() + buffer.len() - 1) / buffer.len(); + // Iterate more than the number of required reads to check + // what happens when we try to read beyond the end of the + // file. + for i in 1..iters + 2 { + let data_start = (i - 1) * buffer.len(); + + // We don't want to just check that read works in + // isolation. We want to be able to mix .read and .data + // calls. + { + let result = bio.data(buffer.len()); + let buffer = result.unwrap(); + if buffer.len() > 0 { + assert_eq!(buffer, + &data[data_start..data_start + buffer.len()]); + } + } + + // Now do the actual read. + let result = bio.read(&mut buffer[..]); + let got = result.unwrap(); + if got > 0 { + assert_eq!(&buffer[0..got], + &data[data_start..data_start + got]); + } + + if i > iters { + // We should have read everything. + assert!(got == 0); + } else if i == iters { + // The last read. This may be less than buffer.len(). + // But it should include at least one byte. + assert!(0 < got); + assert!(got <= buffer.len()); + } else { + assert_eq!(got, buffer.len()); + } + } + } + + #[test] + fn buffered_reader_read_test() { + let data : &[u8] = include_bytes!("buffered-reader-test.txt"); + + { + let bio = BufferedReaderMemory::new(data); + buffered_reader_read_test_aux (bio, data); + } + + { + use std::path::PathBuf; + use std::fs::File; + + let path : PathBuf = [env!("CARGO_MANIFEST_DIR"), + "src", + "buffered-reader-test.txt"] + .iter().collect(); + + let mut f = File::open(&path).expect(&path.to_string_lossy()); + let bio = BufferedReaderGeneric::new(&mut f, None); + buffered_reader_read_test_aux (bio, data); + } + } +} |