The OpenD Programming Language

1 /**Summary statistics such as mean, median, sum, variance, skewness, kurtosis.
2  * Except for median and median absolute deviation, which cannot be calculated
3  * online, all summary statistics have both an input range interface and an
4  * output range interface.
5  *
6  * Notes: The put method on the structs defined in this module returns this by
7  *        ref.  The use case for returning this is to enable these structs
8  *        to be used with std.algorithm.reduce.  The rationale for returning
9  *        by ref is that the return value usually won't be used, and the
10  *        overhead of returning a large struct by value should be avoided.
11  *
12  * Bugs:  This whole module assumes that input will be doubles or types implicitly
13  *        convertible to double.  No allowances are made for user-defined numeric
14  *        types such as BigInts.  This is necessary for simplicity.  However,
15  *        if you have a function that converts your data to doubles, most of
16  *        these functions work with any input range, so you can simply map
17  *        this function onto your range.
18  *
19  * Author:  David Simcha
20  */
21 /*
22  * Copyright (C) 2008-2010 David Simcha
23  *
24  * License:
25  * Boost Software License - Version 1.0 - August 17th, 2003
26  *
27  * Permission is hereby granted, free of charge, to any person or organization
28  * obtaining a copy of the software and accompanying documentation covered by
29  * this license (the "Software") to use, reproduce, display, distribute,
30  * execute, and transmit the Software, and to prepare derivative works of the
31  * Software, and to permit third-parties to whom the Software is furnished to
32  * do so, all subject to the following:
33  *
34  * The copyright notices in the Software and this entire statement, including
35  * the above license grant, this restriction and the following disclaimer,
36  * must be included in all copies of the Software, in whole or in part, and
37  * all derivative works of the Software, unless such copies or derivative
38  * works are solely in the form of machine-executable object code generated by
39  * a source language processor.
40  *
41  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
44  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
45  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
46  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
47  * DEALINGS IN THE SOFTWARE.
48  */
49 
50 
51 module dstats.summary;
52 
53 import std.functional, std.conv, std.range, std.array,
54     std.traits, std.math;
55 
56 import std.algorithm : reduce, min, max, swap, map, filter;
57 
58 import dstats.sort, dstats.base, dstats.alloc;
59 
60 version(unittest) {
61     import std.stdio, dstats.random;
62 }
63 
64 /**Finds median of an input range in O(N) time on average.  In the case of an
65  * even number of elements, the mean of the two middle elements is returned.
66  * This is a convenience founction designed specifically for numeric types,
67  * where the averaging of the two middle elements is desired.  A more general
68  * selection algorithm that can handle any type with a total ordering, as well
69  * as selecting any position in the ordering, can be found at
70  * dstats.sort.quickSelect() and dstats.sort.partitionK().
71  * Allocates memory, does not reorder input data.*/
72 double median(T)(T data)
73 if(doubleInput!(T)) {
74     auto alloc = newRegionAllocator();
75     auto dataDup = alloc.array(data);
76     return medianPartition(dataDup);
77 }
78 
79 /**Median finding as in median(), but will partition input data such that
80  * elements less than the median will have smaller indices than that of the
81  * median, and elements larger than the median will have larger indices than
82  * that of the median. Useful both for its partititioning and to avoid
83  * memory allocations.  Requires a random access range with swappable
84  * elements.*/
85 double medianPartition(T)(T data)
86 if(isRandomAccessRange!(T) &&
87    is(ElementType!(T) : double) &&
88    hasSwappableElements!(T) &&
89    hasLength!(T))
90 {
91     if(data.length == 0) {
92         return double.nan;
93     }
94     // Upper half of median in even length case is just the smallest element
95     // with an index larger than the lower median, after the array is
96     // partially sorted.
97     if(data.length == 1) {
98         return data[0];
99     } else if(data.length & 1) {  //Is odd.
100         return cast(double) partitionK(data, data.length / 2);
101     } else {
102         auto lower = partitionK(data, data.length / 2 - 1);
103         auto upper = ElementType!(T).max;
104 
105         // Avoid requiring slicing to be supported.
106         foreach(i; data.length / 2..data.length) {
107             if(data[i] < upper) {
108                 upper = data[i];
109             }
110         }
111         return lower * 0.5 + upper * 0.5;
112     }
113 }
114 
115 unittest {
116     float brainDeadMedian(float[] foo) {
117         qsort(foo);
118         if(foo.length & 1)
119             return foo[$ / 2];
120         return (foo[$ / 2] + foo[$ / 2 - 1]) / 2;
121     }
122 
123     float[] test = new float[1000];
124     size_t upperBound, lowerBound;
125     foreach(testNum; 0..1000) {
126         foreach(ref e; test) {
127             e = uniform(0f, 1000f);
128         }
129         do {
130             upperBound = uniform(0u, test.length);
131             lowerBound = uniform(0u, test.length);
132         } while(lowerBound == upperBound);
133         if(lowerBound > upperBound) {
134             swap(lowerBound, upperBound);
135         }
136         auto quickRes = median(test[lowerBound..upperBound]);
137         auto accurateRes = brainDeadMedian(test[lowerBound..upperBound]);
138 
139         // Off by some tiny fraction in even N case because of division.
140         // No idea why, but it's too small a rounding error to care about.
141         assert(approxEqual(quickRes, accurateRes));
142     }
143 
144     // Make sure everything works with lowest common denominator range type.
145     static struct Count {
146         uint num;
147         uint upTo;
148         @property size_t front() {
149             return num;
150         }
151         void popFront() {
152             num++;
153         }
154         @property bool empty() {
155             return num >= upTo;
156         }
157     }
158 
159     Count a;
160     a.upTo = 100;
161     assert(approxEqual(median(a), 49.5));
162 }
163 
164 /**Plain old data holder struct for median, median absolute deviation.
165  * Alias this'd to the median absolute deviation member.
166  */
167 struct MedianAbsDev {
168     double median;
169     double medianAbsDev;
170 
171     alias medianAbsDev this;
172 }
173 
174 /**Calculates the median absolute deviation of a dataset.  This is the median
175  * of all absolute differences from the median of the dataset.
176  *
177  * Returns:  A MedianAbsDev struct that contains the median (since it is
178  * computed anyhow) and the median absolute deviation.
179  *
180  * Notes:  No bias correction is used in this implementation, since using
181  * one would require assumptions about the underlying distribution of the data.
182  */
183 MedianAbsDev medianAbsDev(T)(T data)
184 if(doubleInput!(T)) {
185     auto alloc = newRegionAllocator();
186     auto dataDup = alloc.array(data);
187     immutable med = medianPartition(dataDup);
188     immutable len = dataDup.length;
189     alloc.freeLast();
190 
191     double[] devs = alloc.uninitializedArray!(double[])(len);
192 
193     size_t i = 0;
194     foreach(elem; data) {
195         devs[i++] = abs(med - elem);
196     }
197     auto ret = medianPartition(devs);
198     alloc.freeLast();
199     return MedianAbsDev(med, ret);
200 }
201 
202 unittest {
203     assert(approxEqual(medianAbsDev([7,1,8,2,8,1,9,2,8,4,5,9].dup).medianAbsDev, 2.5L));
204     assert(approxEqual(medianAbsDev([8,6,7,5,3,0,999].dup).medianAbsDev, 2.0L));
205 }
206 
207 /**Computes the interquantile range of data at the given quantile value in O(N)
208  * time complexity.  For example, using a quantile value of either 0.25 or 0.75
209  * will give the interquartile range.  (This is the default since it is
210  * apparently the most common interquantile range in common usage.)
211  * Using a quantile value of 0.2 or 0.8 will give the interquntile range.
212  *
213  * If the quantile point falls between two indices, linear interpolation is
214  * used.
215  *
216  * This function is somewhat more efficient than simply finding the upper and
217  * lower quantile and subtracting them.
218  *
219  * Tip:  A quantile of 0 or 1 is handled as a special case and will compute the
220  *       plain old range of the data in a single pass.
221  */
222 double interquantileRange(R)(R data, double quantile = 0.25)
223 if(doubleInput!R) {
224     alias quantile q;  // Save typing.
225     dstatsEnforce(q >= 0 && q <= 1,
226         "Quantile must be between 0, 1 for interquantileRange.");
227 
228     auto alloc = newRegionAllocator();
229     if(q > 0.5) {
230         q = 1.0 - q;
231     }
232 
233     if(q == 0) {  // Special case:  Compute the plain old range.
234         double minElem = double.infinity;
235         double maxElem = -double.infinity;
236 
237         foreach(elem; data) {
238             minElem = min(minElem, elem);
239             maxElem = max(maxElem, elem);
240         }
241 
242         return maxElem - minElem;
243     }
244 
245     // Common case.
246     auto duped = alloc.array(data);
247     immutable double N = duped.length;
248     if(duped.length < 2) {
249         return double.nan;  // Can't do it.
250     }
251 
252     immutable lowEnd = to!size_t((N - 1) * q);
253     immutable lowFract = (N - 1) * q - lowEnd;
254 
255     partitionK(duped, lowEnd);
256     immutable lowQuantile1 = duped[lowEnd];
257     double minAbove = double.infinity;
258 
259     foreach(elem; duped[lowEnd + 1..$]) {
260         minAbove = min(minAbove, elem);
261     }
262 
263     immutable lowerQuantile =
264         lowFract * minAbove + (1 - lowFract) * lowQuantile1;
265 
266     immutable highEnd = to!size_t((N - 1) * (1.0 - q) - lowEnd);
267     immutable highFract = (N - 1) * (1.0 - q) - lowEnd - highEnd;
268     duped = duped[lowEnd..$];
269     assert(highEnd < duped.length - 1);
270 
271     partitionK(duped, highEnd);
272     immutable minAbove2 = reduce!min(double.infinity, duped[highEnd + 1..$]);
273     immutable upperQuantile = minAbove2 * highFract
274                             + duped[highEnd] * (1 - highFract);
275 
276     return upperQuantile - lowerQuantile;
277 }
278 
279 unittest {
280     // 0 3 5 6 7 8 9
281     assert(approxEqual(interquantileRange([1,2,3,4,5,6,7,8]), 3.5));
282     assert(approxEqual(interquantileRange([1,2,3,4,5,6,7,8,9]), 4));
283     assert(interquantileRange([1,9,2,4,3,6,8], 0) == 8);
284     assert(approxEqual(interquantileRange([8,6,7,5,3,0,9], 0.2), 4.4));
285 }
286 
287 /**Output range to calculate the mean online.  Getter for mean costs a branch to
288  * check for N == 0.  This struct uses O(1) space and does *NOT* store the
289  * individual elements.
290  *
291  * Note:  This struct can implicitly convert to the value of the mean.
292  *
293  * Examples:
294  * ---
295  * Mean summ;
296  * summ.put(1);
297  * summ.put(2);
298  * summ.put(3);
299  * summ.put(4);
300  * summ.put(5);
301  * assert(summ.mean == 3);
302  * ---*/
303 struct Mean {
304 private:
305     double result = 0;
306     double k = 0;
307 
308 public:
309     ///// Allow implicit casting to double, by returning the current mean.
310     alias mean this;
311 
312     ///
313     void put(double element) pure nothrow @safe {
314         result += (element - result) / ++k;
315     }
316 
317     /**Adds the contents of rhs to this instance.
318      *
319      * Examples:
320      * ---
321      * Mean mean1, mean2, combined;
322      * foreach(i; 0..5) {
323      *     mean1.put(i);
324      * }
325      *
326      * foreach(i; 5..10) {
327      *     mean2.put(i);
328      * }
329      *
330      * mean1.put(mean2);
331      *
332      * foreach(i; 0..10) {
333      *     combined.put(i);
334      * }
335      *
336      * assert(approxEqual(combined.mean, mean1.mean));
337      * ---
338      */
339      void put(typeof(this) rhs) pure nothrow @safe {
340          immutable totalN = k + rhs.k;
341          result = result * (k / totalN) + rhs.result * (rhs.k / totalN);
342          k = totalN;
343      }
344 
345     const pure nothrow @property @safe {
346 
347         ///
348         double sum() {
349             return result * k;
350         }
351 
352         ///
353         double mean() {
354             return (k == 0) ? double.nan : result;
355         }
356 
357         ///
358         double N() {
359             return k;
360         }
361 
362         /**Simply returns this.  Useful in generic programming contexts.*/
363         Mean toMean() {
364             return this;
365         }
366     }
367 
368     ///
369     string toString() const {
370         return to!(string)(mean);
371     }
372 }
373 
374 /**Finds the arithmetic mean of any input range whose elements are implicitly
375  * convertible to double.*/
376 Mean mean(T)(T data)
377 if(doubleIterable!(T)) {
378 
379     static if(isRandomAccessRange!T && hasLength!T) {
380         // This is optimized for maximum instruction level parallelism:
381         // The loop is unrolled such that there are 1 / (nILP)th the data
382         // dependencies of the naive algorithm.
383         enum nILP = 8;
384 
385         Mean ret;
386         size_t i = 0;
387         if(data.length > 2 * nILP) {
388             double k = 0;
389             double[nILP] means = 0;
390             for(; i + nILP < data.length; i += nILP) {
391                 immutable kNeg1 = 1 / ++k;
392 
393                 foreach(j; StaticIota!nILP) {
394                     means[j] += (data[i + j] - means[j]) * kNeg1;
395                 }
396             }
397 
398             ret.k = k;
399             ret.result = means[0];
400             foreach(m; means[1..$]) {
401                 ret.put( Mean(m, k));
402             }
403         }
404 
405         // Handle the remainder.
406         for(; i < data.length; i++) {
407             ret.put(data[i]);
408         }
409         return ret;
410 
411     } else {
412         // Just submit everything to a single Mean struct and return it.
413         Mean meanCalc;
414 
415         foreach(element; data) {
416             meanCalc.put(element);
417         }
418         return meanCalc;
419     }
420 }
421 
422 /**Output range to calculate the geometric mean online.
423  * Operates similarly to dstats.summary.Mean*/
424 struct GeometricMean {
425 private:
426     Mean m;
427 public:
428     /////Allow implicit casting to double, by returning current geometric mean.
429     alias geoMean this;
430 
431     ///
432     void put(double element) pure nothrow @safe {
433         m.put(log2(element));
434     }
435 
436     /// Combine two GeometricMean's.
437     void put(typeof(this) rhs) pure nothrow @safe {
438         m.put(rhs.m);
439     }
440 
441     const pure nothrow @property {
442         ///
443         double geoMean() {
444             return exp2(m.mean);
445         }
446 
447         ///
448         double N() {
449             return m.k;
450         }
451     }
452 
453     ///
454     string toString() const {
455         return to!(string)(geoMean);
456     }
457 }
458 
459 /**Calculates the geometric mean of any input range that has elements implicitly
460  * convertible to double*/
461 double geometricMean(T)(T data)
462 if(doubleIterable!(T)) {
463     // This is relatively seldom used and the log function is the bottleneck
464     // anyhow, not worth ILP optimizing.
465     GeometricMean m;
466     foreach(elem; data) {
467         m.put(elem);
468     }
469     return m.geoMean;
470 }
471 
472 unittest {
473     string[] data = ["1", "2", "3", "4", "5"];
474     auto foo = map!(to!(uint))(data);
475 
476     auto result = geometricMean(map!(to!(uint))(data));
477     assert(approxEqual(result, 2.60517));
478 
479     Mean mean1, mean2, combined;
480     foreach(i; 0..5) {
481       mean1.put(i);
482     }
483 
484     foreach(i; 5..10) {
485       mean2.put(i);
486     }
487 
488     mean1.put(mean2);
489 
490     foreach(i; 0..10) {
491       combined.put(i);
492     }
493 
494     assert(approxEqual(combined.mean, mean1.mean),
495         text(combined.mean, "  ", mean1.mean));
496     assert(combined.N == mean1.N);
497 }
498 
499 /**Finds the sum of an input range whose elements implicitly convert to double.
500  * User has option of making U a different type than T to prevent overflows
501  * on large array summing operations.  However, by default, return type is
502  * T (same as input type).*/
503 U sum(T, U = Unqual!(ForeachType!(T)))(T data)
504 if(doubleIterable!(T)) {
505 
506     static if(isRandomAccessRange!T && hasLength!T) {
507         enum nILP = 8;
508         U[nILP] sum = 0;
509 
510         size_t i = 0;
511         if(data.length > 2 * nILP) {
512 
513             for(; i + nILP < data.length; i += nILP) {
514                 foreach(j; StaticIota!nILP) {
515                     sum[j] += data[i + j];
516                 }
517             }
518 
519             foreach(j; 1..nILP) {
520                 sum[0] += sum[j];
521             }
522         }
523 
524         for(; i < data.length; i++) {
525             sum[0] += data[i];
526         }
527 
528         return sum[0];
529     } else {
530         U sum = 0;
531         foreach(elem; data) {
532             sum += elem;
533         }
534 
535         return sum;
536     }
537 }
538 
539 unittest {
540     assert(sum([1,2,3,4,5,6,7,8,9,10][]) == 55);
541     assert(sum(filter!"true"([1,2,3,4,5,6,7,8,9,10][])) == 55);
542     assert(sum(cast(int[]) [1,2,3,4,5])==15);
543     assert(approxEqual( sum(cast(int[]) [40.0, 40.1, 5.2]), 85.3));
544     assert(mean(cast(int[]) [1,2,3]).mean == 2);
545     assert(mean(cast(int[]) [1.0, 2.0, 3.0]).mean == 2.0);
546     assert(mean([1, 2, 5, 10, 17][]).mean == 7);
547     assert(mean([1, 2, 5, 10, 17][]).sum == 35);
548     assert(approxEqual(mean([8,6,7,5,3,0,9,3,6,2,4,3,6][]).mean, 4.769231));
549 
550     // Test the OO struct a little, since we're using the new ILP algorithm.
551     Mean m;
552     m.put(1);
553     m.put(2);
554     m.put(5);
555     m.put(10);
556     m.put(17);
557     assert(m.mean == 7);
558 
559     foreach(i; 0..100) {
560         // Monte carlo test the unrolled version.
561         auto foo = randArray!rNormal(uniform(5, 100), 0, 1);
562         auto res1 = mean(foo);
563         Mean res2;
564         foreach(elem; foo) {
565             res2.put(elem);
566         }
567 
568         foreach(ti, elem; res1.tupleof) {
569             assert(approxEqual(elem, res2.tupleof[ti]));
570         }
571     }
572 }
573 
574 
575 /**Output range to compute mean, stdev, variance online.  Getter methods
576  * for stdev, var cost a few floating point ops.  Getter for mean costs
577  * a single branch to check for N == 0.  Relatively expensive floating point
578  * ops, if you only need mean, try Mean.  This struct uses O(1) space and
579  * does *NOT* store the individual elements.
580  *
581  * Note:  This struct can implicitly convert to a Mean struct.
582  *
583  * References: Computing Higher-Order Moments Online.
584  * http://people.xiph.org/~tterribe/notes/homs.html
585  *
586  * Examples:
587  * ---
588  * MeanSD summ;
589  * summ.put(1);
590  * summ.put(2);
591  * summ.put(3);
592  * summ.put(4);
593  * summ.put(5);
594  * assert(summ.mean == 3);
595  * assert(summ.stdev == sqrt(2.5));
596  * assert(summ.var == 2.5);
597  * ---*/
598 struct MeanSD {
599 private:
600     double _mean = 0;
601     double _var = 0;
602     double _k = 0;
603 public:
604     ///
605     void put(double element) pure nothrow @safe {
606         immutable kMinus1 = _k;
607         immutable delta = element - _mean;
608         immutable deltaN = delta / ++_k;
609 
610         _mean += deltaN;
611         _var += kMinus1 * deltaN * delta;
612         return;
613     }
614 
615     /// Combine two MeanSD's.
616     void put(typeof(this) rhs) pure nothrow @safe {
617         if(_k == 0) {
618             foreach(ti, elem; rhs.tupleof) {
619                 this.tupleof[ti] = elem;
620             }
621 
622             return;
623         } else if(rhs._k == 0) {
624             return;
625         }
626 
627         immutable totalN = _k + rhs._k;
628         immutable delta = rhs._mean - _mean;
629         _mean = _mean * (_k / totalN) + rhs._mean * (rhs._k / totalN);
630 
631         _var = _var + rhs._var + (_k / totalN * rhs._k * delta * delta);
632         _k = totalN;
633     }
634 
635     const pure nothrow @property @safe {
636 
637         ///
638         double sum() {
639             return _k * _mean;
640         }
641 
642         ///
643         double mean() {
644             return (_k == 0) ? double.nan : _mean;
645         }
646 
647         ///
648         double stdev() {
649             return sqrt(var);
650         }
651 
652         ///
653         double var() {
654             return (_k < 2) ? double.nan : _var / (_k - 1);
655         }
656 
657         /**
658         Mean squared error.  In other words, a biased estimate of variance.
659         */
660         double mse() {
661             return (_k < 1) ? double.nan : _var / _k;
662         }
663 
664         ///
665         double N() {
666             return _k;
667         }
668 
669         /**Converts this struct to a Mean struct.  Also called when an
670          * implicit conversion via alias this takes place.
671          */
672         Mean toMean() {
673             return Mean(_mean, _k);
674         }
675 
676         /**Simply returns this.  Useful in generic programming contexts.*/
677         MeanSD toMeanSD() const  {
678             return this;
679         }
680     }
681 
682     alias toMean this;
683 
684     ///
685     string toString() const {
686         return text("N = ", cast(ulong) _k, "\nMean = ", mean, "\nVariance = ",
687                var, "\nStdev = ", stdev);
688     }
689 }
690 
691 /**Puts all elements of data into a MeanSD struct,
692  * then returns this struct.  This can be faster than doing this manually
693  * due to ILP optimizations.*/
694 MeanSD meanStdev(T)(T data)
695 if(doubleIterable!(T)) {
696 
697     MeanSD ret;
698 
699     static if(isRandomAccessRange!T && hasLength!T) {
700         // Optimize for instruction level parallelism.
701         enum nILP = 6;
702         double k = 0;
703         double[nILP] means = 0;
704         double[nILP] variances = 0;
705         size_t i = 0;
706 
707         if(data.length > 2 * nILP) {
708             for(; i + nILP < data.length; i += nILP) {
709                 immutable kMinus1 = k;
710                 immutable kNeg1 = 1 / ++k;
711 
712                 foreach(j; StaticIota!nILP) {
713                     immutable double delta = data[i + j] - means[j];
714                     immutable deltaN = delta * kNeg1;
715 
716                     means[j] += deltaN;
717                     variances[j] += kMinus1 * deltaN * delta;
718                 }
719             }
720 
721             ret._mean = means[0];
722             ret._var = variances[0];
723             ret._k = k;
724 
725             foreach(j; 1..nILP) {
726                 ret.put( MeanSD(means[j], variances[j], k));
727             }
728         }
729 
730         // Handle remainder.
731         for(; i < data.length; i++) {
732             ret.put(data[i]);
733         }
734     } else {
735         foreach(elem; data) {
736             ret.put(elem);
737         }
738     }
739     return ret;
740 }
741 
742 /**Finds the variance of an input range with members implicitly convertible
743  * to doubles.*/
744 double variance(T)(T data)
745 if(doubleIterable!(T)) {
746     return meanStdev(data).var;
747 }
748 
749 /**Calculate the standard deviation of an input range with members
750  * implicitly converitble to double.*/
751 double stdev(T)(T data)
752 if(doubleIterable!(T)) {
753     return meanStdev(data).stdev;
754 }
755 
756 unittest {
757     auto res = meanStdev(cast(int[]) [3, 1, 4, 5]);
758     assert(approxEqual(res.stdev, 1.7078));
759     assert(approxEqual(res.mean, 3.25));
760     res = meanStdev(cast(double[]) [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
761     assert(approxEqual(res.stdev, 2.160247));
762     assert(approxEqual(res.mean, 4));
763     assert(approxEqual(res.sum, 28));
764 
765     MeanSD mean1, mean2, combined;
766     foreach(i; 0..5) {
767       mean1.put(i);
768     }
769 
770     foreach(i; 5..10) {
771       mean2.put(i);
772     }
773 
774     mean1.put(mean2);
775 
776     foreach(i; 0..10) {
777       combined.put(i);
778     }
779 
780     assert(approxEqual(combined.mean, mean1.mean));
781     assert(approxEqual(combined.stdev, mean1.stdev));
782     assert(combined.N == mean1.N);
783     assert(approxEqual(combined.mean, 4.5));
784     assert(approxEqual(combined.stdev, 3.027650));
785 
786     foreach(i; 0..100) {
787         // Monte carlo test the unrolled version.
788         auto foo = randArray!rNormal(uniform(5, 100), 0, 1);
789         auto res1 = meanStdev(foo);
790         MeanSD res2;
791         foreach(elem; foo) {
792             res2.put(elem);
793         }
794 
795         foreach(ti, elem; res1.tupleof) {
796             assert(approxEqual(elem, res2.tupleof[ti]));
797         }
798 
799         MeanSD resCornerCase;  // Test corner cases where one of the N's is 0.
800         resCornerCase.put(res1);
801         MeanSD dummy;
802         resCornerCase.put(dummy);
803         foreach(ti, elem; res1.tupleof) {
804             assert(elem == resCornerCase.tupleof[ti]);
805         }
806     }
807 }
808 
809 /**Output range to compute mean, stdev, variance, skewness, kurtosis, min, and
810  * max online. Using this struct is relatively expensive, so if you just need
811  * mean and/or stdev, try MeanSD or Mean. Getter methods for stdev,
812  * var cost a few floating point ops.  Getter for mean costs a single branch to
813  * check for N == 0.  Getters for skewness and kurtosis cost a whole bunch of
814  * floating point ops.  This struct uses O(1) space and does *NOT* store the
815  * individual elements.
816  *
817  * Note:  This struct can implicitly convert to a MeanSD.
818  *
819  * References: Computing Higher-Order Moments Online.
820  * http://people.xiph.org/~tterribe/notes/homs.html
821  *
822  * Examples:
823  * ---
824  * Summary summ;
825  * summ.put(1);
826  * summ.put(2);
827  * summ.put(3);
828  * summ.put(4);
829  * summ.put(5);
830  * assert(summ.N == 5);
831  * assert(summ.mean == 3);
832  * assert(summ.stdev == sqrt(2.5));
833  * assert(summ.var == 2.5);
834  * assert(approxEqual(summ.kurtosis, -1.9120));
835  * assert(summ.min == 1);
836  * assert(summ.max == 5);
837  * assert(summ.sum == 15);
838  * ---*/
839 struct Summary {
840 private:
841     double _mean = 0;
842     double _m2 = 0;
843     double _m3 = 0;
844     double _m4 = 0;
845     double _k = 0;
846     double _min = double.infinity;
847     double _max = -double.infinity;
848 public:
849     ///
850     void put(double element) pure nothrow @safe {
851         immutable kMinus1 = _k;
852         immutable kNeg1 = 1.0 / ++_k;
853         _min = (element < _min) ? element : _min;
854         _max = (element > _max) ? element : _max;
855 
856         immutable delta = element - _mean;
857         immutable deltaN = delta * kNeg1;
858         _mean += deltaN;
859 
860         _m4 += kMinus1 * deltaN * (_k * _k - 3 * _k + 3) * deltaN * deltaN * delta +
861             6 * _m2 * deltaN * deltaN - 4 * deltaN * _m3;
862         _m3 += kMinus1 * deltaN * (_k - 2) * deltaN * delta - 3 * delta * _m2 * kNeg1;
863         _m2 += kMinus1 * deltaN * delta;
864     }
865 
866     /// Combine two Summary's.
867     void put(typeof(this) rhs) pure nothrow @safe {
868         if(_k == 0) {
869             foreach(ti, elem; rhs.tupleof) {
870                 this.tupleof[ti] = elem;
871             }
872 
873             return;
874         } else if(rhs._k == 0) {
875             return;
876         }
877 
878         immutable totalN = _k + rhs._k;
879         immutable delta = rhs._mean - _mean;
880         immutable deltaN = delta / totalN;
881         _mean = _mean * (_k / totalN) + rhs._mean * (rhs._k / totalN);
882 
883         _m4 = _m4 + rhs._m4 +
884             deltaN * _k * deltaN * rhs._k * deltaN * delta *
885             (_k * _k - _k * rhs._k + rhs._k * rhs._k) +
886             6 * deltaN * _k * deltaN * _k * rhs._m2 +
887             6 * deltaN * rhs._k * deltaN * rhs._k * _m2 +
888             4 * deltaN * _k * rhs._m3 -
889             4 * deltaN * rhs._k * _m3;
890 
891         _m3 = _m3 + rhs._m3 + deltaN * _k * deltaN * rhs._k * (_k - rhs._k) +
892             3 * deltaN * _k * rhs._m2 -
893             3 * deltaN * rhs._k * _m2;
894 
895         _m2 = _m2 + rhs._m2 + (_k / totalN * rhs._k * delta * delta);
896 
897         _k = totalN;
898         _max = (_max > rhs._max) ? _max : rhs._max;
899         _min = (_min < rhs._min) ? _min : rhs._min;
900     }
901 
902     const pure nothrow @property @safe {
903 
904         ///
905         double sum() {
906             return _mean * _k;
907         }
908 
909         ///
910         double mean() {
911             return (_k == 0) ? double.nan : _mean;
912         }
913 
914         ///
915         double stdev() {
916             return sqrt(var);
917         }
918 
919         ///
920         double var() {
921             return (_k < 2) ? double.nan : _m2 / (_k - 1);
922         }
923 
924         /**
925         Mean squared error.  In other words, a biased estimate of variance.
926         */
927         double mse() {
928             return (_k < 1) ? double.nan : _m2 / _k;
929         }
930 
931         ///
932         double skewness() {
933             immutable sqM2 = sqrt(_m2);
934             return _m3 / (sqM2 * sqM2 * sqM2) * sqrt(_k);
935         }
936 
937         ///
938         double kurtosis() {
939             return _m4 / _m2 * _k  / _m2 - 3;
940         }
941 
942         ///
943         double N() {
944             return _k;
945         }
946 
947         ///
948         double min() {
949             return _min;
950         }
951 
952         ///
953         double max() {
954             return _max;
955         }
956 
957         /**Converts this struct to a MeanSD.  Called via alias this when an
958          * implicit conversion is attetmpted.
959          */
960         MeanSD toMeanSD() {
961             return MeanSD(_mean, _m2, _k);
962         }
963     }
964 
965     alias toMeanSD this;
966 
967     ///
968     string toString() const {
969         return text("N = ", roundTo!long(_k),
970                   "\nMean = ", mean,
971                   "\nVariance = ", var,
972                   "\nStdev = ", stdev,
973                   "\nSkewness = ", skewness,
974                   "\nKurtosis = ", kurtosis,
975                   "\nMin = ", _min,
976                   "\nMax = ", _max);
977     }
978 }
979 
980 unittest {
981     // Everything else is tested indirectly through kurtosis, skewness.  Test
982     // put(typeof(this)).
983 
984     Summary mean1, mean2, combined;
985     foreach(i; 0..5) {
986       mean1.put(i);
987     }
988 
989     foreach(i; 5..10) {
990       mean2.put(i);
991     }
992 
993     auto m1_2 = mean1;
994     auto m2_2 = mean2;
995     m1_2.put(m2_2);
996 
997     mean1.put(mean2);
998 
999     foreach(i; 0..10) {
1000       combined.put(i);
1001     }
1002 
1003     foreach(ti, elem; mean1.tupleof) {
1004         assert(approxEqual(elem, combined.tupleof[ti]));
1005     }
1006 
1007     Summary summCornerCase;  // Case where one N is zero.
1008     summCornerCase.put(mean1);
1009     Summary dummy;
1010     summCornerCase.put(dummy);
1011     foreach(ti, elem; summCornerCase.tupleof) {
1012         assert(elem == mean1.tupleof[ti]);
1013     }
1014 }
1015 
1016 /**Excess kurtosis relative to normal distribution.  High kurtosis means that
1017  * the variance is due to infrequent, large deviations from the mean.  Low
1018  * kurtosis means that the variance is due to frequent, small deviations from
1019  * the mean.  The normal distribution is defined as having kurtosis of 0.
1020  * Input must be an input range with elements implicitly convertible to double.*/
1021 double kurtosis(T)(T data)
1022 if(doubleIterable!(T)) {
1023     // This is too infrequently used and has too much ILP within a single
1024     // iteration to be worth ILP optimizing.
1025     Summary kCalc;
1026     foreach(elem; data) {
1027         kCalc.put(elem);
1028     }
1029     return kCalc.kurtosis;
1030 }
1031 
1032 unittest {
1033     // Values from Matlab.
1034     assert(approxEqual(kurtosis([1, 1, 1, 1, 10].dup), 0.25));
1035     assert(approxEqual(kurtosis([2.5, 3.5, 4.5, 5.5].dup), -1.36));
1036     assert(approxEqual(kurtosis([1,2,2,2,2,2,100].dup), 2.1657));
1037 }
1038 
1039 /**Skewness is a measure of symmetry of a distribution.  Positive skewness
1040  * means that the right tail is longer/fatter than the left tail.  Negative
1041  * skewness means the left tail is longer/fatter than the right tail.  Zero
1042  * skewness indicates a symmetrical distribution.  Input must be an input
1043  * range with elements implicitly convertible to double.*/
1044 double skewness(T)(T data)
1045 if(doubleIterable!(T)) {
1046     // This is too infrequently used and has too much ILP within a single
1047     // iteration to be worth ILP optimizing.
1048     Summary sCalc;
1049     foreach(elem; data) {
1050         sCalc.put(elem);
1051     }
1052     return sCalc.skewness;
1053 }
1054 
1055 unittest {
1056     // Values from Octave.
1057     assert(approxEqual(skewness([1,2,3,4,5].dup), 0));
1058     assert(approxEqual(skewness([3,1,4,1,5,9,2,6,5].dup), 0.5443));
1059     assert(approxEqual(skewness([2,7,1,8,2,8,1,8,2,8,4,5,9].dup), -0.0866));
1060 
1061     // Test handling of ranges that are not arrays.
1062     string[] stringy = ["3", "1", "4", "1", "5", "9", "2", "6", "5"];
1063     auto intified = map!(to!(int))(stringy);
1064     assert(approxEqual(skewness(intified), 0.5443));
1065 }
1066 
1067 /**Convenience function.  Puts all elements of data into a Summary struct,
1068  * and returns this struct.*/
1069 Summary summary(T)(T data)
1070 if(doubleIterable!(T)) {
1071     // This is too infrequently used and has too much ILP within a single
1072     // iteration to be worth ILP optimizing.
1073     Summary summ;
1074     foreach(elem; data) {
1075         summ.put(elem);
1076     }
1077     return summ;
1078 }
1079 // Just a convenience function for a well-tested struct.  No unittest really
1080 // necessary.  (Famous last words.)
1081 
1082 ///
1083 struct ZScore(T) if(isForwardRange!(T) && is(ElementType!(T) : double)) {
1084 private:
1085     T range;
1086     double mean;
1087     double sdNeg1;
1088 
1089     double z(double elem) {
1090         return (elem - mean) * sdNeg1;
1091     }
1092 
1093 public:
1094     this(T range) {
1095         this.range = range;
1096         auto msd = meanStdev(range);
1097         this.mean = msd.mean;
1098         this.sdNeg1 = 1.0 / msd.stdev;
1099     }
1100 
1101     this(T range, double mean, double sd) {
1102         this.range = range;
1103         this.mean = mean;
1104         this.sdNeg1 = 1.0 / sd;
1105     }
1106 
1107     ///
1108     @property double front() {
1109         return z(range.front);
1110     }
1111 
1112     ///
1113     void popFront() {
1114         range.popFront;
1115     }
1116 
1117     ///
1118     @property bool empty() {
1119         return range.empty;
1120     }
1121 
1122     static if(isForwardRange!(T)) {
1123         ///
1124         @property typeof(this) save() {
1125             auto ret = this;
1126             ret.range = range.save;
1127             return ret;
1128         }
1129     }
1130 
1131     static if(isRandomAccessRange!(T)) {
1132         ///
1133         double opIndex(size_t index) {
1134             return z(range[index]);
1135         }
1136     }
1137 
1138     static if(isBidirectionalRange!(T)) {
1139         ///
1140         @property double back() {
1141             return z(range.back);
1142         }
1143 
1144         ///
1145         void popBack() {
1146             range.popBack;
1147         }
1148     }
1149 
1150     static if(hasLength!(T)) {
1151         ///
1152         @property size_t length() {
1153             return range.length;
1154         }
1155     }
1156 }
1157 
1158 /**Returns a range with whatever properties T has (forward range, random
1159  * access range, bidirectional range, hasLength, etc.),
1160  * of the z-scores of the underlying
1161  * range.  A z-score of an element in a range is defined as
1162  * (element - mean(range)) / stdev(range).
1163  *
1164  * Notes:
1165  *
1166  * If the data contained in the range is a sample of a larger population,
1167  * rather than an entire population, then technically, the results output
1168  * from the ZScore range are T statistics, not Z statistics.  This is because
1169  * the sample mean and standard deviation are only estimates of the population
1170  * parameters.  This does not affect the mechanics of using this range,
1171  * but it does affect the interpretation of its output.
1172  *
1173  * Accessing elements of this range is fairly expensive, as a
1174  * floating point multiply is involved.  Also, constructing this range is
1175  * costly, as the entire input range has to be iterated over to find the
1176  * mean and standard deviation.
1177  */
1178 ZScore!(T) zScore(T)(T range)
1179 if(isForwardRange!(T) && doubleInput!(T)) {
1180     return ZScore!(T)(range);
1181 }
1182 
1183 /**Allows the construction of a ZScore range with precomputed mean and
1184  * stdev.
1185  */
1186 ZScore!(T) zScore(T)(T range, double mean, double sd)
1187 if(isForwardRange!(T) && doubleInput!(T)) {
1188     return ZScore!(T)(range, mean, sd);
1189 }
1190 
1191 unittest {
1192     int[] arr = [1,2,3,4,5];
1193     auto m = mean(arr).mean;
1194     auto sd = stdev(arr);
1195     auto z = zScore(arr);
1196 
1197     size_t pos = 0;
1198     foreach(elem; z) {
1199         assert(approxEqual(elem, (arr[pos++] - m) / sd));
1200     }
1201 
1202     assert(z.length == 5);
1203     foreach(i; 0..z.length) {
1204         assert(approxEqual(z[i], (arr[i] - m) / sd));
1205     }
1206 }