diff options
author | Andrew Gallant <jamslam@gmail.com> | 2016-07-31 13:15:55 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2016-07-31 13:15:55 -0400 |
commit | 275f22b41f22f81174e04dac997a6dff40db3ba1 (patch) | |
tree | 21b3d895c51fb19bf0051825de9aa2c764ca1654 | |
parent | 786764d7c50cfa0d18bcffa1262227d11a745e64 (diff) |
Add 'sum' statistic for numeric fields.
Closes #34.
-rw-r--r-- | src/cmd/stats.rs | 64 | ||||
-rw-r--r-- | tests/test_stats.rs | 8 |
2 files changed, 68 insertions, 4 deletions
diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 09b1445..40965c8 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -22,7 +22,7 @@ use self::FieldType::{TUnknown, TNull, TUnicode, TFloat, TInteger}; static USAGE: &'static str = " Computes basic statistics on CSV data. -Basic statistics includes mean, median, mode, standard deviation, max and +Basic statistics includes mean, median, mode, standard deviation, sum, max and min values. Note that some statistics are expensive to compute, so they must be enabled explicitly. By default, the following statistics are reported for *every* column in the CSV data: mean, max, min and standard deviation. The @@ -199,6 +199,7 @@ impl Args { fn new_stats(&self, record_len: usize) -> Vec<Stats> { repeat(Stats::new(WhichStats { include_nulls: self.flag_nulls, + sum: true, range: true, dist: true, cardinality: self.flag_cardinality || self.flag_everything, @@ -209,7 +210,7 @@ impl Args { fn stat_headers(&self) -> Vec<String> { let mut fields = vec![ - "field", "type", "min", "max", "min_length", "max_length", + "field", "type", "sum", "min", "max", "min_length", "max_length", "mean", "stddev", ]; let all = self.flag_everything; @@ -223,6 +224,7 @@ impl Args { #[derive(Clone, Debug, Eq, PartialEq)] struct WhichStats { include_nulls: bool, + sum: bool, range: bool, dist: bool, cardinality: bool, @@ -239,6 +241,7 @@ impl Commute for WhichStats { #[derive(Clone)] struct Stats { typ: FieldType, + sum: Option<TypedSum>, minmax: Option<TypedMinMax>, online: Option<OnlineStats>, mode: Option<Unsorted<ByteString>>, @@ -248,14 +251,16 @@ struct Stats { impl Stats { fn new(which: WhichStats) -> Stats { - let (mut minmax, mut online) = (None, None); - let (mut mode, mut median) = (None, None); + let (mut sum, mut minmax, mut online, mut mode, mut median) = + (None, None, None, None, None); + if which.sum { sum = Some(Default::default()); } if which.range { minmax = Some(Default::default()); } if which.dist { online = Some(Default::default()); } if which.mode || which.cardinality { mode = Some(Default::default()); } if which.median { median = Some(Default::default()); } Stats { typ: Default::default(), + sum: sum, minmax: minmax, online: online, mode: mode, @@ -269,6 +274,7 @@ impl Stats { self.typ.merge(sample_type); let t = self.typ; + self.sum.as_mut().map(|v| v.add(t, sample)); self.minmax.as_mut().map(|v| v.add(t, sample)); self.mode.as_mut().map(|v| v.add(sample.to_vec())); match self.typ { @@ -299,6 +305,10 @@ impl Stats { let empty = || "".to_owned(); pieces.push(self.typ.to_string()); + match self.sum.as_ref().and_then(|sum| sum.show(typ)) { + Some(sum) => { pieces.push(sum); } + None => { pieces.push(empty()); } + } match self.minmax.as_ref().and_then(|mm| mm.show(typ)) { Some(mm) => { pieces.push(mm.0); pieces.push(mm.1); } None => { pieces.push(empty()); pieces.push(empty()); } @@ -434,6 +444,52 @@ impl fmt::Display for FieldType { } } +/// TypedSum keeps a rolling sum of the data seen. +/// +/// It sums integers until it sees a float, at which point it sums floats. +#[derive(Clone, Default)] +struct TypedSum { + integer: i64, + float: Option<f64>, +} + +impl TypedSum { + fn add(&mut self, typ: FieldType, sample: &[u8]) { + if sample.is_empty() { + return; + } + match typ { + TFloat => { + let float: f64 = from_bytes(sample).unwrap(); + match self.float { + None => { + self.float = Some((self.integer as f64) + float); + } + Some(ref mut float) => { + *float += from_bytes(sample).unwrap(); + } + } + } + TInteger => { + if let Some(ref mut float) = self.float { + *float += from_bytes(sample).unwrap(); + } else { + self.integer += from_bytes(sample).unwrap(); + } + } + _ => {} + } + } + + fn show(&self, typ: FieldType) -> Option<String> { + match typ { + TNull | TUnicode | TUnknown => None, + TInteger => Some(self.integer.to_string()), + TFloat => Some(self.float.unwrap_or(0.0).to_string()), + } + } +} + /// TypedMinMax keeps track of minimum/maximum values for each possible type /// where min/max makes sense. #[derive(Clone)] diff --git a/tests/test_stats.rs b/tests/test_stats.rs index 0338e72..b63b396 100644 --- a/tests/test_stats.rs +++ b/tests/test_stats.rs @@ -142,6 +142,14 @@ stats_tests!(stats_includenulls_null_mode, "mode", &[""], "N/A", true); stats_tests!(stats_includenulls_mean, "mean", &["5", "", "15", "10"], "7.5", true); +stats_tests!(stats_sum_integers, "sum", &["1", "2"], "3"); +stats_tests!(stats_sum_floats, "sum", &["1.5", "2.8"], "4.3"); +stats_tests!(stats_sum_mixed1, "sum", &["1.5", "2"], "3.5"); +stats_tests!(stats_sum_mixed2, "sum", &["2", "1.5"], "3.5"); +stats_tests!(stats_sum_mixed3, "sum", &["1.5", "hi", "2.8"], "4.3"); +stats_tests!(stats_sum_nulls1, "sum", &["1", "", "2"], "3"); +stats_tests!(stats_sum_nulls2, "sum", &["", "1", "2"], "3"); + stats_tests!(stats_min, "min", &["2", "1.1"], "1.1"); stats_tests!(stats_max, "max", &["2", "1.1"], "2"); stats_tests!(stats_min_mix, "min", &["2", "a", "1.1"], "1.1"); |