1 /**Summary statistics such as mean, median, sum, variance, skewness, kurtosis. 2 * Except for median and median absolute deviation, which cannot be calculated 3 * online, all summary statistics have both an input range interface and an 4 * output range interface. 5 * 6 * Notes: The put method on the structs defined in this module returns this by 7 * ref. The use case for returning this is to enable these structs 8 * to be used with std.algorithm.reduce. The rationale for returning 9 * by ref is that the return value usually won't be used, and the 10 * overhead of returning a large struct by value should be avoided. 11 * 12 * Bugs: This whole module assumes that input will be doubles or types implicitly 13 * convertible to double. No allowances are made for user-defined numeric 14 * types such as BigInts. This is necessary for simplicity. However, 15 * if you have a function that converts your data to doubles, most of 16 * these functions work with any input range, so you can simply map 17 * this function onto your range. 18 * 19 * Author: David Simcha 20 */ 21 /* 22 * Copyright (C) 2008-2010 David Simcha 23 * 24 * License: 25 * Boost Software License - Version 1.0 - August 17th, 2003 26 * 27 * Permission is hereby granted, free of charge, to any person or organization 28 * obtaining a copy of the software and accompanying documentation covered by 29 * this license (the "Software") to use, reproduce, display, distribute, 30 * execute, and transmit the Software, and to prepare derivative works of the 31 * Software, and to permit third-parties to whom the Software is furnished to 32 * do so, all subject to the following: 33 * 34 * The copyright notices in the Software and this entire statement, including 35 * the above license grant, this restriction and the following disclaimer, 36 * must be included in all copies of the Software, in whole or in part, and 37 * all derivative works of the Software, unless such copies or derivative 38 * works are solely in the form of machine-executable object code generated by 39 * a source language processor. 40 * 41 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 42 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 43 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 44 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 45 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 46 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 47 * DEALINGS IN THE SOFTWARE. 48 */ 49 50 51 module dstats.summary; 52 53 import std.functional, std.conv, std.range, std.array, 54 std.traits, std.math; 55 56 import std.algorithm : reduce, min, max, swap, map, filter; 57 58 import dstats.sort, dstats.base, dstats.alloc; 59 60 version(unittest) { 61 import std.stdio, dstats.random; 62 } 63 64 /**Finds median of an input range in O(N) time on average. In the case of an 65 * even number of elements, the mean of the two middle elements is returned. 66 * This is a convenience founction designed specifically for numeric types, 67 * where the averaging of the two middle elements is desired. A more general 68 * selection algorithm that can handle any type with a total ordering, as well 69 * as selecting any position in the ordering, can be found at 70 * dstats.sort.quickSelect() and dstats.sort.partitionK(). 71 * Allocates memory, does not reorder input data.*/ 72 double median(T)(T data) 73 if(doubleInput!(T)) { 74 auto alloc = newRegionAllocator(); 75 auto dataDup = alloc.array(data); 76 return medianPartition(dataDup); 77 } 78 79 /**Median finding as in median(), but will partition input data such that 80 * elements less than the median will have smaller indices than that of the 81 * median, and elements larger than the median will have larger indices than 82 * that of the median. Useful both for its partititioning and to avoid 83 * memory allocations. Requires a random access range with swappable 84 * elements.*/ 85 double medianPartition(T)(T data) 86 if(isRandomAccessRange!(T) && 87 is(ElementType!(T) : double) && 88 hasSwappableElements!(T) && 89 hasLength!(T)) 90 { 91 if(data.length == 0) { 92 return double.nan; 93 } 94 // Upper half of median in even length case is just the smallest element 95 // with an index larger than the lower median, after the array is 96 // partially sorted. 97 if(data.length == 1) { 98 return data[0]; 99 } else if(data.length & 1) { //Is odd. 100 return cast(double) partitionK(data, data.length / 2); 101 } else { 102 auto lower = partitionK(data, data.length / 2 - 1); 103 auto upper = ElementType!(T).max; 104 105 // Avoid requiring slicing to be supported. 106 foreach(i; data.length / 2..data.length) { 107 if(data[i] < upper) { 108 upper = data[i]; 109 } 110 } 111 return lower * 0.5 + upper * 0.5; 112 } 113 } 114 115 unittest { 116 float brainDeadMedian(float[] foo) { 117 qsort(foo); 118 if(foo.length & 1) 119 return foo[$ / 2]; 120 return (foo[$ / 2] + foo[$ / 2 - 1]) / 2; 121 } 122 123 float[] test = new float[1000]; 124 size_t upperBound, lowerBound; 125 foreach(testNum; 0..1000) { 126 foreach(ref e; test) { 127 e = uniform(0f, 1000f); 128 } 129 do { 130 upperBound = uniform(0u, test.length); 131 lowerBound = uniform(0u, test.length); 132 } while(lowerBound == upperBound); 133 if(lowerBound > upperBound) { 134 swap(lowerBound, upperBound); 135 } 136 auto quickRes = median(test[lowerBound..upperBound]); 137 auto accurateRes = brainDeadMedian(test[lowerBound..upperBound]); 138 139 // Off by some tiny fraction in even N case because of division. 140 // No idea why, but it's too small a rounding error to care about. 141 assert(approxEqual(quickRes, accurateRes)); 142 } 143 144 // Make sure everything works with lowest common denominator range type. 145 static struct Count { 146 uint num; 147 uint upTo; 148 @property size_t front() { 149 return num; 150 } 151 void popFront() { 152 num++; 153 } 154 @property bool empty() { 155 return num >= upTo; 156 } 157 } 158 159 Count a; 160 a.upTo = 100; 161 assert(approxEqual(median(a), 49.5)); 162 } 163 164 /**Plain old data holder struct for median, median absolute deviation. 165 * Alias this'd to the median absolute deviation member. 166 */ 167 struct MedianAbsDev { 168 double median; 169 double medianAbsDev; 170 171 alias medianAbsDev this; 172 } 173 174 /**Calculates the median absolute deviation of a dataset. This is the median 175 * of all absolute differences from the median of the dataset. 176 * 177 * Returns: A MedianAbsDev struct that contains the median (since it is 178 * computed anyhow) and the median absolute deviation. 179 * 180 * Notes: No bias correction is used in this implementation, since using 181 * one would require assumptions about the underlying distribution of the data. 182 */ 183 MedianAbsDev medianAbsDev(T)(T data) 184 if(doubleInput!(T)) { 185 auto alloc = newRegionAllocator(); 186 auto dataDup = alloc.array(data); 187 immutable med = medianPartition(dataDup); 188 immutable len = dataDup.length; 189 alloc.freeLast(); 190 191 double[] devs = alloc.uninitializedArray!(double[])(len); 192 193 size_t i = 0; 194 foreach(elem; data) { 195 devs[i++] = abs(med - elem); 196 } 197 auto ret = medianPartition(devs); 198 alloc.freeLast(); 199 return MedianAbsDev(med, ret); 200 } 201 202 unittest { 203 assert(approxEqual(medianAbsDev([7,1,8,2,8,1,9,2,8,4,5,9].dup).medianAbsDev, 2.5L)); 204 assert(approxEqual(medianAbsDev([8,6,7,5,3,0,999].dup).medianAbsDev, 2.0L)); 205 } 206 207 /**Computes the interquantile range of data at the given quantile value in O(N) 208 * time complexity. For example, using a quantile value of either 0.25 or 0.75 209 * will give the interquartile range. (This is the default since it is 210 * apparently the most common interquantile range in common usage.) 211 * Using a quantile value of 0.2 or 0.8 will give the interquntile range. 212 * 213 * If the quantile point falls between two indices, linear interpolation is 214 * used. 215 * 216 * This function is somewhat more efficient than simply finding the upper and 217 * lower quantile and subtracting them. 218 * 219 * Tip: A quantile of 0 or 1 is handled as a special case and will compute the 220 * plain old range of the data in a single pass. 221 */ 222 double interquantileRange(R)(R data, double quantile = 0.25) 223 if(doubleInput!R) { 224 alias quantile q; // Save typing. 225 dstatsEnforce(q >= 0 && q <= 1, 226 "Quantile must be between 0, 1 for interquantileRange."); 227 228 auto alloc = newRegionAllocator(); 229 if(q > 0.5) { 230 q = 1.0 - q; 231 } 232 233 if(q == 0) { // Special case: Compute the plain old range. 234 double minElem = double.infinity; 235 double maxElem = -double.infinity; 236 237 foreach(elem; data) { 238 minElem = min(minElem, elem); 239 maxElem = max(maxElem, elem); 240 } 241 242 return maxElem - minElem; 243 } 244 245 // Common case. 246 auto duped = alloc.array(data); 247 immutable double N = duped.length; 248 if(duped.length < 2) { 249 return double.nan; // Can't do it. 250 } 251 252 immutable lowEnd = to!size_t((N - 1) * q); 253 immutable lowFract = (N - 1) * q - lowEnd; 254 255 partitionK(duped, lowEnd); 256 immutable lowQuantile1 = duped[lowEnd]; 257 double minAbove = double.infinity; 258 259 foreach(elem; duped[lowEnd + 1..$]) { 260 minAbove = min(minAbove, elem); 261 } 262 263 immutable lowerQuantile = 264 lowFract * minAbove + (1 - lowFract) * lowQuantile1; 265 266 immutable highEnd = to!size_t((N - 1) * (1.0 - q) - lowEnd); 267 immutable highFract = (N - 1) * (1.0 - q) - lowEnd - highEnd; 268 duped = duped[lowEnd..$]; 269 assert(highEnd < duped.length - 1); 270 271 partitionK(duped, highEnd); 272 immutable minAbove2 = reduce!min(double.infinity, duped[highEnd + 1..$]); 273 immutable upperQuantile = minAbove2 * highFract 274 + duped[highEnd] * (1 - highFract); 275 276 return upperQuantile - lowerQuantile; 277 } 278 279 unittest { 280 // 0 3 5 6 7 8 9 281 assert(approxEqual(interquantileRange([1,2,3,4,5,6,7,8]), 3.5)); 282 assert(approxEqual(interquantileRange([1,2,3,4,5,6,7,8,9]), 4)); 283 assert(interquantileRange([1,9,2,4,3,6,8], 0) == 8); 284 assert(approxEqual(interquantileRange([8,6,7,5,3,0,9], 0.2), 4.4)); 285 } 286 287 /**Output range to calculate the mean online. Getter for mean costs a branch to 288 * check for N == 0. This struct uses O(1) space and does *NOT* store the 289 * individual elements. 290 * 291 * Note: This struct can implicitly convert to the value of the mean. 292 * 293 * Examples: 294 * --- 295 * Mean summ; 296 * summ.put(1); 297 * summ.put(2); 298 * summ.put(3); 299 * summ.put(4); 300 * summ.put(5); 301 * assert(summ.mean == 3); 302 * ---*/ 303 struct Mean { 304 private: 305 double result = 0; 306 double k = 0; 307 308 public: 309 ///// Allow implicit casting to double, by returning the current mean. 310 alias mean this; 311 312 /// 313 void put(double element) pure nothrow @safe { 314 result += (element - result) / ++k; 315 } 316 317 /**Adds the contents of rhs to this instance. 318 * 319 * Examples: 320 * --- 321 * Mean mean1, mean2, combined; 322 * foreach(i; 0..5) { 323 * mean1.put(i); 324 * } 325 * 326 * foreach(i; 5..10) { 327 * mean2.put(i); 328 * } 329 * 330 * mean1.put(mean2); 331 * 332 * foreach(i; 0..10) { 333 * combined.put(i); 334 * } 335 * 336 * assert(approxEqual(combined.mean, mean1.mean)); 337 * --- 338 */ 339 void put(typeof(this) rhs) pure nothrow @safe { 340 immutable totalN = k + rhs.k; 341 result = result * (k / totalN) + rhs.result * (rhs.k / totalN); 342 k = totalN; 343 } 344 345 const pure nothrow @property @safe { 346 347 /// 348 double sum() { 349 return result * k; 350 } 351 352 /// 353 double mean() { 354 return (k == 0) ? double.nan : result; 355 } 356 357 /// 358 double N() { 359 return k; 360 } 361 362 /**Simply returns this. Useful in generic programming contexts.*/ 363 Mean toMean() { 364 return this; 365 } 366 } 367 368 /// 369 string toString() const { 370 return to!(string)(mean); 371 } 372 } 373 374 /**Finds the arithmetic mean of any input range whose elements are implicitly 375 * convertible to double.*/ 376 Mean mean(T)(T data) 377 if(doubleIterable!(T)) { 378 379 static if(isRandomAccessRange!T && hasLength!T) { 380 // This is optimized for maximum instruction level parallelism: 381 // The loop is unrolled such that there are 1 / (nILP)th the data 382 // dependencies of the naive algorithm. 383 enum nILP = 8; 384 385 Mean ret; 386 size_t i = 0; 387 if(data.length > 2 * nILP) { 388 double k = 0; 389 double[nILP] means = 0; 390 for(; i + nILP < data.length; i += nILP) { 391 immutable kNeg1 = 1 / ++k; 392 393 foreach(j; StaticIota!nILP) { 394 means[j] += (data[i + j] - means[j]) * kNeg1; 395 } 396 } 397 398 ret.k = k; 399 ret.result = means[0]; 400 foreach(m; means[1..$]) { 401 ret.put( Mean(m, k)); 402 } 403 } 404 405 // Handle the remainder. 406 for(; i < data.length; i++) { 407 ret.put(data[i]); 408 } 409 return ret; 410 411 } else { 412 // Just submit everything to a single Mean struct and return it. 413 Mean meanCalc; 414 415 foreach(element; data) { 416 meanCalc.put(element); 417 } 418 return meanCalc; 419 } 420 } 421 422 /**Output range to calculate the geometric mean online. 423 * Operates similarly to dstats.summary.Mean*/ 424 struct GeometricMean { 425 private: 426 Mean m; 427 public: 428 /////Allow implicit casting to double, by returning current geometric mean. 429 alias geoMean this; 430 431 /// 432 void put(double element) pure nothrow @safe { 433 m.put(log2(element)); 434 } 435 436 /// Combine two GeometricMean's. 437 void put(typeof(this) rhs) pure nothrow @safe { 438 m.put(rhs.m); 439 } 440 441 const pure nothrow @property { 442 /// 443 double geoMean() { 444 return exp2(m.mean); 445 } 446 447 /// 448 double N() { 449 return m.k; 450 } 451 } 452 453 /// 454 string toString() const { 455 return to!(string)(geoMean); 456 } 457 } 458 459 /**Calculates the geometric mean of any input range that has elements implicitly 460 * convertible to double*/ 461 double geometricMean(T)(T data) 462 if(doubleIterable!(T)) { 463 // This is relatively seldom used and the log function is the bottleneck 464 // anyhow, not worth ILP optimizing. 465 GeometricMean m; 466 foreach(elem; data) { 467 m.put(elem); 468 } 469 return m.geoMean; 470 } 471 472 unittest { 473 string[] data = ["1", "2", "3", "4", "5"]; 474 auto foo = map!(to!(uint))(data); 475 476 auto result = geometricMean(map!(to!(uint))(data)); 477 assert(approxEqual(result, 2.60517)); 478 479 Mean mean1, mean2, combined; 480 foreach(i; 0..5) { 481 mean1.put(i); 482 } 483 484 foreach(i; 5..10) { 485 mean2.put(i); 486 } 487 488 mean1.put(mean2); 489 490 foreach(i; 0..10) { 491 combined.put(i); 492 } 493 494 assert(approxEqual(combined.mean, mean1.mean), 495 text(combined.mean, " ", mean1.mean)); 496 assert(combined.N == mean1.N); 497 } 498 499 /**Finds the sum of an input range whose elements implicitly convert to double. 500 * User has option of making U a different type than T to prevent overflows 501 * on large array summing operations. However, by default, return type is 502 * T (same as input type).*/ 503 U sum(T, U = Unqual!(ForeachType!(T)))(T data) 504 if(doubleIterable!(T)) { 505 506 static if(isRandomAccessRange!T && hasLength!T) { 507 enum nILP = 8; 508 U[nILP] sum = 0; 509 510 size_t i = 0; 511 if(data.length > 2 * nILP) { 512 513 for(; i + nILP < data.length; i += nILP) { 514 foreach(j; StaticIota!nILP) { 515 sum[j] += data[i + j]; 516 } 517 } 518 519 foreach(j; 1..nILP) { 520 sum[0] += sum[j]; 521 } 522 } 523 524 for(; i < data.length; i++) { 525 sum[0] += data[i]; 526 } 527 528 return sum[0]; 529 } else { 530 U sum = 0; 531 foreach(elem; data) { 532 sum += elem; 533 } 534 535 return sum; 536 } 537 } 538 539 unittest { 540 assert(sum([1,2,3,4,5,6,7,8,9,10][]) == 55); 541 assert(sum(filter!"true"([1,2,3,4,5,6,7,8,9,10][])) == 55); 542 assert(sum(cast(int[]) [1,2,3,4,5])==15); 543 assert(approxEqual( sum(cast(int[]) [40.0, 40.1, 5.2]), 85.3)); 544 assert(mean(cast(int[]) [1,2,3]).mean == 2); 545 assert(mean(cast(int[]) [1.0, 2.0, 3.0]).mean == 2.0); 546 assert(mean([1, 2, 5, 10, 17][]).mean == 7); 547 assert(mean([1, 2, 5, 10, 17][]).sum == 35); 548 assert(approxEqual(mean([8,6,7,5,3,0,9,3,6,2,4,3,6][]).mean, 4.769231)); 549 550 // Test the OO struct a little, since we're using the new ILP algorithm. 551 Mean m; 552 m.put(1); 553 m.put(2); 554 m.put(5); 555 m.put(10); 556 m.put(17); 557 assert(m.mean == 7); 558 559 foreach(i; 0..100) { 560 // Monte carlo test the unrolled version. 561 auto foo = randArray!rNormal(uniform(5, 100), 0, 1); 562 auto res1 = mean(foo); 563 Mean res2; 564 foreach(elem; foo) { 565 res2.put(elem); 566 } 567 568 foreach(ti, elem; res1.tupleof) { 569 assert(approxEqual(elem, res2.tupleof[ti])); 570 } 571 } 572 } 573 574 575 /**Output range to compute mean, stdev, variance online. Getter methods 576 * for stdev, var cost a few floating point ops. Getter for mean costs 577 * a single branch to check for N == 0. Relatively expensive floating point 578 * ops, if you only need mean, try Mean. This struct uses O(1) space and 579 * does *NOT* store the individual elements. 580 * 581 * Note: This struct can implicitly convert to a Mean struct. 582 * 583 * References: Computing Higher-Order Moments Online. 584 * http://people.xiph.org/~tterribe/notes/homs.html 585 * 586 * Examples: 587 * --- 588 * MeanSD summ; 589 * summ.put(1); 590 * summ.put(2); 591 * summ.put(3); 592 * summ.put(4); 593 * summ.put(5); 594 * assert(summ.mean == 3); 595 * assert(summ.stdev == sqrt(2.5)); 596 * assert(summ.var == 2.5); 597 * ---*/ 598 struct MeanSD { 599 private: 600 double _mean = 0; 601 double _var = 0; 602 double _k = 0; 603 public: 604 /// 605 void put(double element) pure nothrow @safe { 606 immutable kMinus1 = _k; 607 immutable delta = element - _mean; 608 immutable deltaN = delta / ++_k; 609 610 _mean += deltaN; 611 _var += kMinus1 * deltaN * delta; 612 return; 613 } 614 615 /// Combine two MeanSD's. 616 void put(typeof(this) rhs) pure nothrow @safe { 617 if(_k == 0) { 618 foreach(ti, elem; rhs.tupleof) { 619 this.tupleof[ti] = elem; 620 } 621 622 return; 623 } else if(rhs._k == 0) { 624 return; 625 } 626 627 immutable totalN = _k + rhs._k; 628 immutable delta = rhs._mean - _mean; 629 _mean = _mean * (_k / totalN) + rhs._mean * (rhs._k / totalN); 630 631 _var = _var + rhs._var + (_k / totalN * rhs._k * delta * delta); 632 _k = totalN; 633 } 634 635 const pure nothrow @property @safe { 636 637 /// 638 double sum() { 639 return _k * _mean; 640 } 641 642 /// 643 double mean() { 644 return (_k == 0) ? double.nan : _mean; 645 } 646 647 /// 648 double stdev() { 649 return sqrt(var); 650 } 651 652 /// 653 double var() { 654 return (_k < 2) ? double.nan : _var / (_k - 1); 655 } 656 657 /** 658 Mean squared error. In other words, a biased estimate of variance. 659 */ 660 double mse() { 661 return (_k < 1) ? double.nan : _var / _k; 662 } 663 664 /// 665 double N() { 666 return _k; 667 } 668 669 /**Converts this struct to a Mean struct. Also called when an 670 * implicit conversion via alias this takes place. 671 */ 672 Mean toMean() { 673 return Mean(_mean, _k); 674 } 675 676 /**Simply returns this. Useful in generic programming contexts.*/ 677 MeanSD toMeanSD() const { 678 return this; 679 } 680 } 681 682 alias toMean this; 683 684 /// 685 string toString() const { 686 return text("N = ", cast(ulong) _k, "\nMean = ", mean, "\nVariance = ", 687 var, "\nStdev = ", stdev); 688 } 689 } 690 691 /**Puts all elements of data into a MeanSD struct, 692 * then returns this struct. This can be faster than doing this manually 693 * due to ILP optimizations.*/ 694 MeanSD meanStdev(T)(T data) 695 if(doubleIterable!(T)) { 696 697 MeanSD ret; 698 699 static if(isRandomAccessRange!T && hasLength!T) { 700 // Optimize for instruction level parallelism. 701 enum nILP = 6; 702 double k = 0; 703 double[nILP] means = 0; 704 double[nILP] variances = 0; 705 size_t i = 0; 706 707 if(data.length > 2 * nILP) { 708 for(; i + nILP < data.length; i += nILP) { 709 immutable kMinus1 = k; 710 immutable kNeg1 = 1 / ++k; 711 712 foreach(j; StaticIota!nILP) { 713 immutable double delta = data[i + j] - means[j]; 714 immutable deltaN = delta * kNeg1; 715 716 means[j] += deltaN; 717 variances[j] += kMinus1 * deltaN * delta; 718 } 719 } 720 721 ret._mean = means[0]; 722 ret._var = variances[0]; 723 ret._k = k; 724 725 foreach(j; 1..nILP) { 726 ret.put( MeanSD(means[j], variances[j], k)); 727 } 728 } 729 730 // Handle remainder. 731 for(; i < data.length; i++) { 732 ret.put(data[i]); 733 } 734 } else { 735 foreach(elem; data) { 736 ret.put(elem); 737 } 738 } 739 return ret; 740 } 741 742 /**Finds the variance of an input range with members implicitly convertible 743 * to doubles.*/ 744 double variance(T)(T data) 745 if(doubleIterable!(T)) { 746 return meanStdev(data).var; 747 } 748 749 /**Calculate the standard deviation of an input range with members 750 * implicitly converitble to double.*/ 751 double stdev(T)(T data) 752 if(doubleIterable!(T)) { 753 return meanStdev(data).stdev; 754 } 755 756 unittest { 757 auto res = meanStdev(cast(int[]) [3, 1, 4, 5]); 758 assert(approxEqual(res.stdev, 1.7078)); 759 assert(approxEqual(res.mean, 3.25)); 760 res = meanStdev(cast(double[]) [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); 761 assert(approxEqual(res.stdev, 2.160247)); 762 assert(approxEqual(res.mean, 4)); 763 assert(approxEqual(res.sum, 28)); 764 765 MeanSD mean1, mean2, combined; 766 foreach(i; 0..5) { 767 mean1.put(i); 768 } 769 770 foreach(i; 5..10) { 771 mean2.put(i); 772 } 773 774 mean1.put(mean2); 775 776 foreach(i; 0..10) { 777 combined.put(i); 778 } 779 780 assert(approxEqual(combined.mean, mean1.mean)); 781 assert(approxEqual(combined.stdev, mean1.stdev)); 782 assert(combined.N == mean1.N); 783 assert(approxEqual(combined.mean, 4.5)); 784 assert(approxEqual(combined.stdev, 3.027650)); 785 786 foreach(i; 0..100) { 787 // Monte carlo test the unrolled version. 788 auto foo = randArray!rNormal(uniform(5, 100), 0, 1); 789 auto res1 = meanStdev(foo); 790 MeanSD res2; 791 foreach(elem; foo) { 792 res2.put(elem); 793 } 794 795 foreach(ti, elem; res1.tupleof) { 796 assert(approxEqual(elem, res2.tupleof[ti])); 797 } 798 799 MeanSD resCornerCase; // Test corner cases where one of the N's is 0. 800 resCornerCase.put(res1); 801 MeanSD dummy; 802 resCornerCase.put(dummy); 803 foreach(ti, elem; res1.tupleof) { 804 assert(elem == resCornerCase.tupleof[ti]); 805 } 806 } 807 } 808 809 /**Output range to compute mean, stdev, variance, skewness, kurtosis, min, and 810 * max online. Using this struct is relatively expensive, so if you just need 811 * mean and/or stdev, try MeanSD or Mean. Getter methods for stdev, 812 * var cost a few floating point ops. Getter for mean costs a single branch to 813 * check for N == 0. Getters for skewness and kurtosis cost a whole bunch of 814 * floating point ops. This struct uses O(1) space and does *NOT* store the 815 * individual elements. 816 * 817 * Note: This struct can implicitly convert to a MeanSD. 818 * 819 * References: Computing Higher-Order Moments Online. 820 * http://people.xiph.org/~tterribe/notes/homs.html 821 * 822 * Examples: 823 * --- 824 * Summary summ; 825 * summ.put(1); 826 * summ.put(2); 827 * summ.put(3); 828 * summ.put(4); 829 * summ.put(5); 830 * assert(summ.N == 5); 831 * assert(summ.mean == 3); 832 * assert(summ.stdev == sqrt(2.5)); 833 * assert(summ.var == 2.5); 834 * assert(approxEqual(summ.kurtosis, -1.9120)); 835 * assert(summ.min == 1); 836 * assert(summ.max == 5); 837 * assert(summ.sum == 15); 838 * ---*/ 839 struct Summary { 840 private: 841 double _mean = 0; 842 double _m2 = 0; 843 double _m3 = 0; 844 double _m4 = 0; 845 double _k = 0; 846 double _min = double.infinity; 847 double _max = -double.infinity; 848 public: 849 /// 850 void put(double element) pure nothrow @safe { 851 immutable kMinus1 = _k; 852 immutable kNeg1 = 1.0 / ++_k; 853 _min = (element < _min) ? element : _min; 854 _max = (element > _max) ? element : _max; 855 856 immutable delta = element - _mean; 857 immutable deltaN = delta * kNeg1; 858 _mean += deltaN; 859 860 _m4 += kMinus1 * deltaN * (_k * _k - 3 * _k + 3) * deltaN * deltaN * delta + 861 6 * _m2 * deltaN * deltaN - 4 * deltaN * _m3; 862 _m3 += kMinus1 * deltaN * (_k - 2) * deltaN * delta - 3 * delta * _m2 * kNeg1; 863 _m2 += kMinus1 * deltaN * delta; 864 } 865 866 /// Combine two Summary's. 867 void put(typeof(this) rhs) pure nothrow @safe { 868 if(_k == 0) { 869 foreach(ti, elem; rhs.tupleof) { 870 this.tupleof[ti] = elem; 871 } 872 873 return; 874 } else if(rhs._k == 0) { 875 return; 876 } 877 878 immutable totalN = _k + rhs._k; 879 immutable delta = rhs._mean - _mean; 880 immutable deltaN = delta / totalN; 881 _mean = _mean * (_k / totalN) + rhs._mean * (rhs._k / totalN); 882 883 _m4 = _m4 + rhs._m4 + 884 deltaN * _k * deltaN * rhs._k * deltaN * delta * 885 (_k * _k - _k * rhs._k + rhs._k * rhs._k) + 886 6 * deltaN * _k * deltaN * _k * rhs._m2 + 887 6 * deltaN * rhs._k * deltaN * rhs._k * _m2 + 888 4 * deltaN * _k * rhs._m3 - 889 4 * deltaN * rhs._k * _m3; 890 891 _m3 = _m3 + rhs._m3 + deltaN * _k * deltaN * rhs._k * (_k - rhs._k) + 892 3 * deltaN * _k * rhs._m2 - 893 3 * deltaN * rhs._k * _m2; 894 895 _m2 = _m2 + rhs._m2 + (_k / totalN * rhs._k * delta * delta); 896 897 _k = totalN; 898 _max = (_max > rhs._max) ? _max : rhs._max; 899 _min = (_min < rhs._min) ? _min : rhs._min; 900 } 901 902 const pure nothrow @property @safe { 903 904 /// 905 double sum() { 906 return _mean * _k; 907 } 908 909 /// 910 double mean() { 911 return (_k == 0) ? double.nan : _mean; 912 } 913 914 /// 915 double stdev() { 916 return sqrt(var); 917 } 918 919 /// 920 double var() { 921 return (_k < 2) ? double.nan : _m2 / (_k - 1); 922 } 923 924 /** 925 Mean squared error. In other words, a biased estimate of variance. 926 */ 927 double mse() { 928 return (_k < 1) ? double.nan : _m2 / _k; 929 } 930 931 /// 932 double skewness() { 933 immutable sqM2 = sqrt(_m2); 934 return _m3 / (sqM2 * sqM2 * sqM2) * sqrt(_k); 935 } 936 937 /// 938 double kurtosis() { 939 return _m4 / _m2 * _k / _m2 - 3; 940 } 941 942 /// 943 double N() { 944 return _k; 945 } 946 947 /// 948 double min() { 949 return _min; 950 } 951 952 /// 953 double max() { 954 return _max; 955 } 956 957 /**Converts this struct to a MeanSD. Called via alias this when an 958 * implicit conversion is attetmpted. 959 */ 960 MeanSD toMeanSD() { 961 return MeanSD(_mean, _m2, _k); 962 } 963 } 964 965 alias toMeanSD this; 966 967 /// 968 string toString() const { 969 return text("N = ", roundTo!long(_k), 970 "\nMean = ", mean, 971 "\nVariance = ", var, 972 "\nStdev = ", stdev, 973 "\nSkewness = ", skewness, 974 "\nKurtosis = ", kurtosis, 975 "\nMin = ", _min, 976 "\nMax = ", _max); 977 } 978 } 979 980 unittest { 981 // Everything else is tested indirectly through kurtosis, skewness. Test 982 // put(typeof(this)). 983 984 Summary mean1, mean2, combined; 985 foreach(i; 0..5) { 986 mean1.put(i); 987 } 988 989 foreach(i; 5..10) { 990 mean2.put(i); 991 } 992 993 auto m1_2 = mean1; 994 auto m2_2 = mean2; 995 m1_2.put(m2_2); 996 997 mean1.put(mean2); 998 999 foreach(i; 0..10) { 1000 combined.put(i); 1001 } 1002 1003 foreach(ti, elem; mean1.tupleof) { 1004 assert(approxEqual(elem, combined.tupleof[ti])); 1005 } 1006 1007 Summary summCornerCase; // Case where one N is zero. 1008 summCornerCase.put(mean1); 1009 Summary dummy; 1010 summCornerCase.put(dummy); 1011 foreach(ti, elem; summCornerCase.tupleof) { 1012 assert(elem == mean1.tupleof[ti]); 1013 } 1014 } 1015 1016 /**Excess kurtosis relative to normal distribution. High kurtosis means that 1017 * the variance is due to infrequent, large deviations from the mean. Low 1018 * kurtosis means that the variance is due to frequent, small deviations from 1019 * the mean. The normal distribution is defined as having kurtosis of 0. 1020 * Input must be an input range with elements implicitly convertible to double.*/ 1021 double kurtosis(T)(T data) 1022 if(doubleIterable!(T)) { 1023 // This is too infrequently used and has too much ILP within a single 1024 // iteration to be worth ILP optimizing. 1025 Summary kCalc; 1026 foreach(elem; data) { 1027 kCalc.put(elem); 1028 } 1029 return kCalc.kurtosis; 1030 } 1031 1032 unittest { 1033 // Values from Matlab. 1034 assert(approxEqual(kurtosis([1, 1, 1, 1, 10].dup), 0.25)); 1035 assert(approxEqual(kurtosis([2.5, 3.5, 4.5, 5.5].dup), -1.36)); 1036 assert(approxEqual(kurtosis([1,2,2,2,2,2,100].dup), 2.1657)); 1037 } 1038 1039 /**Skewness is a measure of symmetry of a distribution. Positive skewness 1040 * means that the right tail is longer/fatter than the left tail. Negative 1041 * skewness means the left tail is longer/fatter than the right tail. Zero 1042 * skewness indicates a symmetrical distribution. Input must be an input 1043 * range with elements implicitly convertible to double.*/ 1044 double skewness(T)(T data) 1045 if(doubleIterable!(T)) { 1046 // This is too infrequently used and has too much ILP within a single 1047 // iteration to be worth ILP optimizing. 1048 Summary sCalc; 1049 foreach(elem; data) { 1050 sCalc.put(elem); 1051 } 1052 return sCalc.skewness; 1053 } 1054 1055 unittest { 1056 // Values from Octave. 1057 assert(approxEqual(skewness([1,2,3,4,5].dup), 0)); 1058 assert(approxEqual(skewness([3,1,4,1,5,9,2,6,5].dup), 0.5443)); 1059 assert(approxEqual(skewness([2,7,1,8,2,8,1,8,2,8,4,5,9].dup), -0.0866)); 1060 1061 // Test handling of ranges that are not arrays. 1062 string[] stringy = ["3", "1", "4", "1", "5", "9", "2", "6", "5"]; 1063 auto intified = map!(to!(int))(stringy); 1064 assert(approxEqual(skewness(intified), 0.5443)); 1065 } 1066 1067 /**Convenience function. Puts all elements of data into a Summary struct, 1068 * and returns this struct.*/ 1069 Summary summary(T)(T data) 1070 if(doubleIterable!(T)) { 1071 // This is too infrequently used and has too much ILP within a single 1072 // iteration to be worth ILP optimizing. 1073 Summary summ; 1074 foreach(elem; data) { 1075 summ.put(elem); 1076 } 1077 return summ; 1078 } 1079 // Just a convenience function for a well-tested struct. No unittest really 1080 // necessary. (Famous last words.) 1081 1082 /// 1083 struct ZScore(T) if(isForwardRange!(T) && is(ElementType!(T) : double)) { 1084 private: 1085 T range; 1086 double mean; 1087 double sdNeg1; 1088 1089 double z(double elem) { 1090 return (elem - mean) * sdNeg1; 1091 } 1092 1093 public: 1094 this(T range) { 1095 this.range = range; 1096 auto msd = meanStdev(range); 1097 this.mean = msd.mean; 1098 this.sdNeg1 = 1.0 / msd.stdev; 1099 } 1100 1101 this(T range, double mean, double sd) { 1102 this.range = range; 1103 this.mean = mean; 1104 this.sdNeg1 = 1.0 / sd; 1105 } 1106 1107 /// 1108 @property double front() { 1109 return z(range.front); 1110 } 1111 1112 /// 1113 void popFront() { 1114 range.popFront; 1115 } 1116 1117 /// 1118 @property bool empty() { 1119 return range.empty; 1120 } 1121 1122 static if(isForwardRange!(T)) { 1123 /// 1124 @property typeof(this) save() { 1125 auto ret = this; 1126 ret.range = range.save; 1127 return ret; 1128 } 1129 } 1130 1131 static if(isRandomAccessRange!(T)) { 1132 /// 1133 double opIndex(size_t index) { 1134 return z(range[index]); 1135 } 1136 } 1137 1138 static if(isBidirectionalRange!(T)) { 1139 /// 1140 @property double back() { 1141 return z(range.back); 1142 } 1143 1144 /// 1145 void popBack() { 1146 range.popBack; 1147 } 1148 } 1149 1150 static if(hasLength!(T)) { 1151 /// 1152 @property size_t length() { 1153 return range.length; 1154 } 1155 } 1156 } 1157 1158 /**Returns a range with whatever properties T has (forward range, random 1159 * access range, bidirectional range, hasLength, etc.), 1160 * of the z-scores of the underlying 1161 * range. A z-score of an element in a range is defined as 1162 * (element - mean(range)) / stdev(range). 1163 * 1164 * Notes: 1165 * 1166 * If the data contained in the range is a sample of a larger population, 1167 * rather than an entire population, then technically, the results output 1168 * from the ZScore range are T statistics, not Z statistics. This is because 1169 * the sample mean and standard deviation are only estimates of the population 1170 * parameters. This does not affect the mechanics of using this range, 1171 * but it does affect the interpretation of its output. 1172 * 1173 * Accessing elements of this range is fairly expensive, as a 1174 * floating point multiply is involved. Also, constructing this range is 1175 * costly, as the entire input range has to be iterated over to find the 1176 * mean and standard deviation. 1177 */ 1178 ZScore!(T) zScore(T)(T range) 1179 if(isForwardRange!(T) && doubleInput!(T)) { 1180 return ZScore!(T)(range); 1181 } 1182 1183 /**Allows the construction of a ZScore range with precomputed mean and 1184 * stdev. 1185 */ 1186 ZScore!(T) zScore(T)(T range, double mean, double sd) 1187 if(isForwardRange!(T) && doubleInput!(T)) { 1188 return ZScore!(T)(range, mean, sd); 1189 } 1190 1191 unittest { 1192 int[] arr = [1,2,3,4,5]; 1193 auto m = mean(arr).mean; 1194 auto sd = stdev(arr); 1195 auto z = zScore(arr); 1196 1197 size_t pos = 0; 1198 foreach(elem; z) { 1199 assert(approxEqual(elem, (arr[pos++] - m) / sd)); 1200 } 1201 1202 assert(z.length == 5); 1203 foreach(i; 0..z.length) { 1204 assert(approxEqual(z[i], (arr[i] - m) / sd)); 1205 } 1206 }