diff options
Diffstat (limited to 'grep-printer/src/jsont.rs')
-rw-r--r-- | grep-printer/src/jsont.rs | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/grep-printer/src/jsont.rs b/grep-printer/src/jsont.rs new file mode 100644 index 00000000..5028349a --- /dev/null +++ b/grep-printer/src/jsont.rs @@ -0,0 +1,213 @@ +// This module defines the types we use for JSON serialization. We specifically +// omit deserialization, partially because there isn't a clear use case for +// them at this time, but also because deserialization will complicate things. +// Namely, the types below are designed in a way that permits JSON +// serialization with little or no allocation. Allocation is often quite +// convenient for deserialization however, so these types would become a bit +// more complex. + +use std::borrow::Cow; +use std::path::Path; +use std::str; + +use base64; +use serde::{Serialize, Serializer}; + +use stats::Stats; + +#[derive(Serialize)] +#[serde(tag = "type", content = "data")] +#[serde(rename_all = "snake_case")] +pub enum Message<'a> { + Begin(Begin<'a>), + End(End<'a>), + Match(Match<'a>), + Context(Context<'a>), +} + +#[derive(Serialize)] +pub struct Begin<'a> { + #[serde(serialize_with = "ser_path")] + pub path: Option<&'a Path>, +} + +#[derive(Serialize)] +pub struct End<'a> { + #[serde(serialize_with = "ser_path")] + pub path: Option<&'a Path>, + pub binary_offset: Option<u64>, + pub stats: Stats, +} + +#[derive(Serialize)] +pub struct Match<'a> { + #[serde(serialize_with = "ser_path")] + pub path: Option<&'a Path>, + #[serde(serialize_with = "ser_bytes")] + pub lines: &'a [u8], + pub line_number: Option<u64>, + pub absolute_offset: u64, + pub submatches: &'a [SubMatch<'a>], +} + +#[derive(Serialize)] +pub struct Context<'a> { + #[serde(serialize_with = "ser_path")] + pub path: Option<&'a Path>, + #[serde(serialize_with = "ser_bytes")] + pub lines: &'a [u8], + pub line_number: Option<u64>, + pub absolute_offset: u64, + pub submatches: &'a [SubMatch<'a>], +} + +#[derive(Serialize)] +pub struct SubMatch<'a> { + #[serde(rename = "match")] + #[serde(serialize_with = "ser_bytes")] + pub m: &'a [u8], + pub start: usize, + pub end: usize, +} + +/// Data represents things that look like strings, but may actually not be +/// valid UTF-8. To handle this, `Data` is serialized as an object with one +/// of two keys: `text` (for valid UTF-8) or `bytes` (for invalid UTF-8). +/// +/// The happy path is valid UTF-8, which streams right through as-is, since +/// it is natively supported by JSON. When invalid UTF-8 is found, then it is +/// represented as arbitrary bytes and base64 encoded. +#[derive(Clone, Debug, Hash, PartialEq, Eq, Serialize)] +#[serde(untagged)] +enum Data<'a> { + Text { text: Cow<'a, str> }, + Bytes { + #[serde(serialize_with = "to_base64")] + bytes: &'a [u8], + }, +} + +impl<'a> Data<'a> { + fn from_bytes(bytes: &[u8]) -> Data { + match str::from_utf8(bytes) { + Ok(text) => Data::Text { text: Cow::Borrowed(text) }, + Err(_) => Data::Bytes { bytes }, + } + } + + #[cfg(unix)] + fn from_path(path: &Path) -> Data { + use std::os::unix::ffi::OsStrExt; + + match path.to_str() { + Some(text) => Data::Text { text: Cow::Borrowed(text) }, + None => Data::Bytes { bytes: path.as_os_str().as_bytes() }, + } + } + + #[cfg(not(unix))] + fn from_path(path: &Path) -> Data { + // Using lossy conversion means some paths won't round trip precisely, + // but it's not clear what we should actually do. Serde rejects + // non-UTF-8 paths, and OsStr's are serialized as a sequence of UTF-16 + // code units on Windows. Neither seem appropriate for this use case, + // so we do the easy thing for now. + Data::Text { text: path.to_string_lossy() } + } + + // Unused deserialization routines. + + /* + fn into_bytes(self) -> Vec<u8> { + match self { + Data::Text { text } => text.into_bytes(), + Data::Bytes { bytes } => bytes, + } + } + + #[cfg(unix)] + fn into_path_buf(&self) -> PathBuf { + use std::os::unix::ffi::OsStrExt; + + match self { + Data::Text { text } => PathBuf::from(text), + Data::Bytes { bytes } => { + PathBuf::from(OsStr::from_bytes(bytes)) + } + } + } + + #[cfg(not(unix))] + fn into_path_buf(&self) -> PathBuf { + match self { + Data::Text { text } => PathBuf::from(text), + Data::Bytes { bytes } => { + PathBuf::from(String::from_utf8_lossy(&bytes).into_owned()) + } + } + } + */ +} + +fn to_base64<T, S>( + bytes: T, + ser: S, +) -> Result<S::Ok, S::Error> +where T: AsRef<[u8]>, + S: Serializer +{ + ser.serialize_str(&base64::encode(&bytes)) +} + +fn ser_bytes<T, S>( + bytes: T, + ser: S, +) -> Result<S::Ok, S::Error> +where T: AsRef<[u8]>, + S: Serializer +{ + Data::from_bytes(bytes.as_ref()).serialize(ser) +} + +fn ser_path<P, S>( + path: &Option<P>, + ser: S, +) -> Result<S::Ok, S::Error> +where P: AsRef<Path>, + S: Serializer +{ + path.as_ref().map(|p| Data::from_path(p.as_ref())).serialize(ser) +} + +// The following are some deserialization helpers, in case we decide to support +// deserialization of the above types. + +/* +fn from_base64<'de, D>( + de: D, +) -> Result<Vec<u8>, D::Error> +where D: Deserializer<'de> +{ + let encoded = String::deserialize(de)?; + let decoded = base64::decode(encoded.as_bytes()) + .map_err(D::Error::custom)?; + Ok(decoded) +} + +fn deser_bytes<'de, D>( + de: D, +) -> Result<Vec<u8>, D::Error> +where D: Deserializer<'de> +{ + Data::deserialize(de).map(|datum| datum.into_bytes()) +} + +fn deser_path<'de, D>( + de: D, +) -> Result<Option<PathBuf>, D::Error> +where D: Deserializer<'de> +{ + Option::<Data>::deserialize(de) + .map(|opt| opt.map(|datum| datum.into_path_buf())) +} +*/ |