summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Andrew Gallant <jamslam@gmail.com>, 2016-07-31 13:15:55 -0400
committer: Andrew Gallant <jamslam@gmail.com>, 2016-07-31 13:15:55 -0400
commit: 275f22b41f22f81174e04dac997a6dff40db3ba1 (patch)
tree: 21b3d895c51fb19bf0051825de9aa2c764ca1654
parent: 786764d7c50cfa0d18bcffa1262227d11a745e64 (diff)
Add 'sum' statistic for numeric fields.
Closes #34.
-rw-r--r-- src/cmd/stats.rs | 64
-rw-r--r-- tests/test_stats.rs | 8
2 files changed, 68 insertions, 4 deletions
diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs
index 09b1445..40965c8 100644
--- a/src/cmd/stats.rs
+++ b/src/cmd/stats.rs
@@ -22,7 +22,7 @@ use self::FieldType::{TUnknown, TNull, TUnicode, TFloat, TInteger};
static USAGE: &'static str = "
Computes basic statistics on CSV data.
-Basic statistics includes mean, median, mode, standard deviation, max and
+Basic statistics includes mean, median, mode, standard deviation, sum, max and
min values. Note that some statistics are expensive to compute, so they must
be enabled explicitly. By default, the following statistics are reported for
*every* column in the CSV data: mean, max, min and standard deviation. The
@@ -199,6 +199,7 @@ impl Args {
fn new_stats(&self, record_len: usize) -> Vec<Stats> {
repeat(Stats::new(WhichStats {
include_nulls: self.flag_nulls,
+ sum: true,
range: true,
dist: true,
cardinality: self.flag_cardinality || self.flag_everything,
@@ -209,7 +210,7 @@ impl Args {
fn stat_headers(&self) -> Vec<String> {
let mut fields = vec![
- "field", "type", "min", "max", "min_length", "max_length",
+ "field", "type", "sum", "min", "max", "min_length", "max_length",
"mean", "stddev",
];
let all = self.flag_everything;
@@ -223,6 +224,7 @@ impl Args {
#[derive(Clone, Debug, Eq, PartialEq)]
struct WhichStats {
include_nulls: bool,
+ sum: bool,
range: bool,
dist: bool,
cardinality: bool,
@@ -239,6 +241,7 @@ impl Commute for WhichStats {
#[derive(Clone)]
struct Stats {
typ: FieldType,
+ sum: Option<TypedSum>,
minmax: Option<TypedMinMax>,
online: Option<OnlineStats>,
mode: Option<Unsorted<ByteString>>,
@@ -248,14 +251,16 @@ struct Stats {
impl Stats {
fn new(which: WhichStats) -> Stats {
- let (mut minmax, mut online) = (None, None);
- let (mut mode, mut median) = (None, None);
+ let (mut sum, mut minmax, mut online, mut mode, mut median) =
+ (None, None, None, None, None);
+ if which.sum { sum = Some(Default::default()); }
if which.range { minmax = Some(Default::default()); }
if which.dist { online = Some(Default::default()); }
if which.mode || which.cardinality { mode = Some(Default::default()); }
if which.median { median = Some(Default::default()); }
Stats {
typ: Default::default(),
+ sum: sum,
minmax: minmax,
online: online,
mode: mode,
@@ -269,6 +274,7 @@ impl Stats {
self.typ.merge(sample_type);
let t = self.typ;
+ self.sum.as_mut().map(|v| v.add(t, sample));
self.minmax.as_mut().map(|v| v.add(t, sample));
self.mode.as_mut().map(|v| v.add(sample.to_vec()));
match self.typ {
@@ -299,6 +305,10 @@ impl Stats {
let empty = || "".to_owned();
pieces.push(self.typ.to_string());
+ match self.sum.as_ref().and_then(|sum| sum.show(typ)) {
+ Some(sum) => { pieces.push(sum); }
+ None => { pieces.push(empty()); }
+ }
match self.minmax.as_ref().and_then(|mm| mm.show(typ)) {
Some(mm) => { pieces.push(mm.0); pieces.push(mm.1); }
None => { pieces.push(empty()); pieces.push(empty()); }
@@ -434,6 +444,52 @@ impl fmt::Display for FieldType {
}
}
+/// TypedSum keeps a rolling sum of the numeric samples seen in a column.
+///
+/// Integers are summed exactly in `integer` until either a float sample is
+/// seen or the integer total would overflow `i64`. From that point on the
+/// running total lives in `float` and every later sample is added as `f64`.
+#[derive(Clone, Default)]
+struct TypedSum {
+    /// Exact integer total. Only meaningful while `float` is `None`.
+    integer: i64,
+    /// Floating point total. `Some` once the sum has been promoted to `f64`.
+    float: Option<f64>,
+}
+
+impl TypedSum {
+    /// Adds `sample` to the running sum.
+    ///
+    /// `typ` is the field type the caller has inferred for the column so
+    /// far. Empty samples are skipped, and samples are ignored entirely when
+    /// `typ` is neither `TInteger` nor `TFloat`.
+    ///
+    /// Panics if `sample` cannot be parsed as the numeric type implied by
+    /// `typ` (the `unwrap` calls below).
+    fn add(&mut self, typ: FieldType, sample: &[u8]) {
+        if sample.is_empty() {
+            return;
+        }
+        match typ {
+            TFloat => {
+                // Parse exactly once; the first float seen promotes the
+                // integer total accumulated so far.
+                let value: f64 = from_bytes(sample).unwrap();
+                let base = self.float.unwrap_or(self.integer as f64);
+                self.float = Some(base + value);
+            }
+            TInteger => {
+                match self.float {
+                    Some(ref mut total) => {
+                        // Already promoted: keep accumulating as f64.
+                        let value: f64 = from_bytes(sample).unwrap();
+                        *total += value;
+                    }
+                    None => {
+                        let value: i64 = from_bytes(sample).unwrap();
+                        match self.integer.checked_add(value) {
+                            Some(total) => {
+                                self.integer = total;
+                            }
+                            None => {
+                                // The exact total no longer fits in i64.
+                                // Promote to f64 instead of panicking (debug)
+                                // or silently wrapping (release).
+                                self.float = Some(
+                                    (self.integer as f64) + (value as f64));
+                            }
+                        }
+                    }
+                }
+            }
+            _ => {}
+        }
+    }
+
+    /// Renders the sum for a column whose final inferred type is `typ`.
+    ///
+    /// Returns `None` for `TNull`, `TUnicode` and `TUnknown`. For `TInteger`
+    /// the exact integer total is shown unless the sum was promoted to `f64`
+    /// (integer overflow), in which case the floating point total is shown.
+    /// For `TFloat` the floating point total is shown, or `0` if no float
+    /// total was ever recorded.
+    fn show(&self, typ: FieldType) -> Option<String> {
+        match typ {
+            TNull | TUnicode | TUnknown => None,
+            TInteger => {
+                match self.float {
+                    Some(total) => Some(total.to_string()),
+                    None => Some(self.integer.to_string()),
+                }
+            }
+            TFloat => Some(self.float.unwrap_or(0.0).to_string()),
+        }
+    }
+}
+
/// TypedMinMax keeps track of minimum/maximum values for each possible type
/// where min/max makes sense.
#[derive(Clone)]
diff --git a/tests/test_stats.rs b/tests/test_stats.rs
index 0338e72..b63b396 100644
--- a/tests/test_stats.rs
+++ b/tests/test_stats.rs
@@ -142,6 +142,14 @@ stats_tests!(stats_includenulls_null_mode, "mode", &[""], "N/A", true);
stats_tests!(stats_includenulls_mean,
"mean", &["5", "", "15", "10"], "7.5", true);
+// Tests for the `sum` statistic. Each `stats_tests!` call presumably takes
+// the test name, the statistic column to inspect, the input rows and the
+// expected cell value -- confirm against the macro definition.
+// The cases cover integer-only, float-only, mixed integer/float (in both
+// orders), a non-numeric row between floats, and empty (null) rows.
+// NOTE(review): `stats_sum_mixed3` contains the non-numeric row "hi"; verify
+// how the inferred column type affects the reported sum for that input.
+stats_tests!(stats_sum_integers, "sum", &["1", "2"], "3");
+stats_tests!(stats_sum_floats, "sum", &["1.5", "2.8"], "4.3");
+stats_tests!(stats_sum_mixed1, "sum", &["1.5", "2"], "3.5");
+stats_tests!(stats_sum_mixed2, "sum", &["2", "1.5"], "3.5");
+stats_tests!(stats_sum_mixed3, "sum", &["1.5", "hi", "2.8"], "4.3");
+stats_tests!(stats_sum_nulls1, "sum", &["1", "", "2"], "3");
+stats_tests!(stats_sum_nulls2, "sum", &["", "1", "2"], "3");
+
stats_tests!(stats_min, "min", &["2", "1.1"], "1.1");
stats_tests!(stats_max, "max", &["2", "1.1"], "2");
stats_tests!(stats_min_mix, "min", &["2", "a", "1.1"], "1.1");