1 /**Relatively low-level primitives on which to build higher-level math/stat 2 * functionality. Some are used internally, some are just things that may be 3 * useful to users of this library. This module is starting to take on the 4 * appearance of a small utility library. 5 * 6 * Note: In several functions in this module that return arrays, the last 7 * parameter is an optional buffer for storing the return value. If this 8 * parameter is ommitted or the buffer is not large enough, one will be 9 * allocated on the GC heap. 10 * 11 * Author: David Simcha*/ 12 /* 13 * License: 14 * Boost Software License - Version 1.0 - August 17th, 2003 15 * 16 * Permission is hereby granted, free of charge, to any person or organization 17 * obtaining a copy of the software and accompanying documentation covered by 18 * this license (the "Software") to use, reproduce, display, distribute, 19 * execute, and transmit the Software, and to prepare derivative works of the 20 * Software, and to permit third-parties to whom the Software is furnished to 21 * do so, all subject to the following: 22 ** The copyright notices in the Software and this entire statement, including 23 * the above license grant, this restriction and the following disclaimer, 24 * must be included in all copies of the Software, in whole or in part, and 25 * all derivative works of the Software, unless such copies or derivative 26 * works are solely in the form of machine-executable object code generated by 27 * a source language processor. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 32 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 33 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 34 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 35 * DEALINGS IN THE SOFTWARE. 36 */ 37 module dstats.base; 38 39 import std.math, std.mathspecial, std.traits, std.typecons, std.algorithm, 40 std.range, std.exception, std.conv, std.functional, std.typetuple, 41 std.numeric; 42 43 import dstats.alloc, dstats.sort; 44 45 // Returns the number of dimensions in an array T. 46 package template nDims(T) 47 { 48 static if(isArray!T) 49 { 50 enum nDims = 1 + nDims!(typeof(T.init[0])); 51 } 52 else 53 { 54 enum nDims = 0; 55 } 56 } 57 58 version(unittest) 59 { 60 static assert(nDims!(uint[]) == 1); 61 static assert(nDims!(float[][]) == 2); 62 } 63 64 import std.string : strip; 65 66 immutable double[] logFactorialTable; 67 68 private enum size_t staticFacTableLen = 10_000; 69 70 shared static this() { 71 // Allocating on heap instead of static data segment to avoid 72 // false pointer GC issues. 73 double[] sfTemp = new double[staticFacTableLen]; 74 sfTemp[0] = 0; 75 for(uint i = 1; i < staticFacTableLen; i++) { 76 sfTemp[i] = sfTemp[i - 1] + log(cast(double)i); 77 } 78 logFactorialTable = assumeUnique(sfTemp); 79 } 80 81 version(unittest) { 82 import std.stdio, std.random, std.file; 83 } 84 85 /** 86 This is the exception that is thrown on invalid arguments to a dstats function. 87 */ 88 class DstatsArgumentException : Exception { 89 this(string msg, string file, int line) { 90 super(msg, file, line); 91 } 92 } 93 94 T dstatsEnforce(T, string file = __FILE__, int line = __LINE__) 95 (T value, lazy const(char)[] msg = null) { 96 if(!value) { 97 const(char)[] lazyMsg = msg; 98 auto exceptMsg = (lazyMsg !is null) ? lazyMsg.idup : "Invalid argument."; 99 throw new DstatsArgumentException(exceptMsg, file, line); 100 } 101 102 return value; 103 } 104 105 package void enforceConfidence 106 (string file = __FILE__, int line = __LINE__)(double conf) { 107 dstatsEnforce!(bool, file, line)(conf >= 0 && conf <= 1, 108 text("Confidence intervals must be between 0 and 1, not ", conf, ".")); 109 } 110 111 /** 112 Tests whether T is an input range whose elements can be implicitly 113 converted to doubles.*/ 114 template doubleInput(T) { 115 enum doubleInput = isInputRange!(T) && is(ElementType!(T) : double); 116 } 117 118 /**Tests whether T is iterable and has elements of a type implicitly 119 * convertible to double.*/ 120 template doubleIterable(T) { 121 static if(!isIterable!T) { 122 enum doubleIterable = false; 123 } else { 124 enum doubleIterable = is(ForeachType!(T) : double); 125 } 126 } 127 128 /** 129 Given a tuple possibly containing forward ranges, returns a tuple where 130 save() has been called on all forward ranges. 131 */ 132 Tuple!T saveAll(T...)(T args) { 133 Tuple!T ret; 134 135 foreach(ti, elem; args) { 136 static if(isForwardRange!(typeof(elem))) { 137 ret.field[ti] = elem.save; 138 } else { 139 ret.field[ti] = elem; 140 } 141 } 142 143 return ret; 144 } 145 146 /**Bins data into nbin equal width bins, indexed from 147 * 0 to nbin - 1, with 0 being the smallest bin, etc. 148 * The values returned are the counts for each bin. 149 * 150 * Works with any forward range with elements implicitly convertible to double. 151 */ 152 Ret[] binCounts(Ret = uint, T)(T data, uint nbin, Ret[] buf = null) 153 if(isForwardRange!(T) && doubleInput!(T)) { 154 dstatsEnforce(nbin > 0, "Cannot bin data into zero bins."); 155 156 alias Unqual!(ElementType!(T)) E; 157 E min = data.front, max = data.front; 158 foreach(elem; data) { 159 if(elem > max) 160 max = elem; 161 else if(elem < min) 162 min = elem; 163 } 164 E range = max - min; 165 166 Ret[] bins; 167 if(buf.length < nbin) { 168 bins = new Ret[nbin]; 169 } else { 170 bins = buf[0..nbin]; 171 bins[] = 0; 172 } 173 174 foreach(elem; data) { 175 // Using the truncation as a feature. 176 uint whichBin = cast(uint) ((elem - min) * nbin / range); 177 178 // Handle edge case by putting largest item in largest bin. 179 if(whichBin == nbin) whichBin--; 180 181 bins[whichBin]++; 182 } 183 184 return bins; 185 } 186 187 unittest { 188 double[] data = [0.0, .01, .03, .05, .11, .31, .51, .71, .89, 1]; 189 auto res = binCounts(data, 10); 190 assert(res == [4U, 1, 0, 1, 0, 1, 0, 1, 1, 1]); 191 192 auto buf = new uint[10]; 193 foreach(ref elem; buf) { 194 elem = uniform(0, 34534); 195 } 196 197 res = binCounts(data, 10, buf); 198 assert(res == [4U, 1, 0, 1, 0, 1, 0, 1, 1, 1]); 199 } 200 201 /**Bins data into nbin equal width bins, indexed from 202 * 0 to nbin - 1, with 0 being the smallest bin, etc. 203 * The values returned are the bin index for each element. 204 * 205 * Default return type is ubyte, because in the dstats.infotheory, 206 * entropy() and related functions specialize on ubytes, and become 207 * substandially faster. However, if you're using more than 255 bins, 208 * you'll have to provide a different return type as a template parameter.*/ 209 Ret[] bin(Ret = ubyte, T)(T data, uint nbin, Ret[] buf = null) 210 if(isForwardRange!(T) && doubleInput!(T) && isIntegral!(Ret)) { 211 dstatsEnforce(nbin > 0, "Cannot bin data into zero bins."); 212 dstatsEnforce(nbin <= (cast(uint) Ret.max + 1), "Cannot bin into " ~ 213 to!string(nbin) ~ " bins and store the results in a " ~ 214 Ret.stringof ~ "."); 215 alias ElementType!(T) E; 216 Unqual!(E) min = data.front, max = data.front; 217 218 auto dminmax = data; 219 dminmax.popFront; 220 foreach(elem; dminmax) { 221 if(elem > max) 222 max = elem; 223 else if(elem < min) 224 min = elem; 225 } 226 E range = max - min; 227 Ret[] bins; 228 if(buf.length < data.length) { 229 bins = uninitializedArray!(Ret[])(data.length); 230 } else { 231 bins = buf[0..data.length]; 232 } 233 234 foreach(i, elem; data) { 235 // Using the truncation as a feature. 236 uint whichBin = cast(uint) ((elem - min) * nbin / range); 237 238 // Handle edge case by putting largest item in largest bin. 239 if(whichBin == nbin) 240 whichBin--; 241 242 bins[i] = cast(Ret) whichBin; 243 } 244 245 return bins; 246 } 247 248 unittest { 249 auto alloc = newRegionAllocator(); 250 double[] data = [0.0, .01, .03, .05, .11, .31, .51, .71, .89, 1]; 251 auto res = bin(data, 10); 252 assert(res == to!(ubyte[])([0, 0, 0, 0, 1, 3, 5, 7, 8, 9])); 253 254 auto buf = new ubyte[20]; 255 foreach(ref elem; buf) { 256 elem = cast(ubyte) uniform(0U, 255); 257 } 258 259 res = bin(data, 10, buf); 260 assert(res == to!(ubyte[])([0, 0, 0, 0, 1, 3, 5, 7, 8, 9])); 261 262 // Make sure this throws: 263 try { 264 auto foo = bin( seq(0U, 1_000U), 512); 265 assert(0); 266 } catch(Exception e) { 267 // It's supposed to throw. 268 } 269 } 270 271 /**Bins data into nbin equal frequency bins, indexed from 272 * 0 to nbin - 1, with 0 being the smallest bin, etc. 273 * The values returned are the bin index for each element. 274 * 275 * Default return type is ubyte, because in the dstats.infotheory, 276 * entropy() and related functions specialize on ubytes, and become 277 * substandially faster. However, if you're using more than 256 bins, 278 * you'll have to provide a different return type as a template parameter.*/ 279 Ret[] frqBin(Ret = ubyte, T)(T data, uint nbin, Ret[] buf = null) 280 if(doubleInput!(T) && isForwardRange!(T) && hasLength!(T) && isIntegral!(Ret)) { 281 dstatsEnforce(nbin > 0, "Cannot bin data into zero bins."); 282 dstatsEnforce(nbin <= data.length, 283 "Cannot equal frequency bin data into more than data.length bins."); 284 dstatsEnforce(nbin <= (cast(ulong) Ret.max) + 1UL, "Cannot bin into " ~ 285 to!string(nbin) ~ " bins and store the results in a " ~ 286 Ret.stringof ~ "."); 287 288 Ret[] result; 289 if(buf.length < data.length) { 290 result = uninitializedArray!(Ret[])(data.length); 291 } else { 292 result = buf[0..data.length]; 293 } 294 295 auto alloc = newRegionAllocator(); 296 auto perm = alloc.uninitializedArray!(size_t[])(data.length); 297 298 foreach(i, ref e; perm) { 299 e = i; 300 } 301 302 static if(isRandomAccessRange!(T)) { 303 bool compare(size_t lhs, size_t rhs) { 304 return data[lhs] < data[rhs]; 305 } 306 307 qsort!compare(perm); 308 } else { 309 auto dd = alloc.array(data); 310 qsort(dd, perm); 311 alloc.freeLast(); 312 } 313 314 auto rem = data.length % nbin; 315 Ret bin = 0; 316 auto i = 0, frq = data.length / nbin; 317 while(i < data.length) { 318 foreach(j; 0..(bin < rem) ? frq + 1 : frq) { 319 result[perm[i++]] = bin; 320 } 321 bin++; 322 } 323 return result; 324 } 325 326 unittest { 327 double[] data = [5U, 1, 3, 8, 30, 10, 7]; 328 auto res = frqBin(data, 3); 329 assert(res == to!(ubyte[])([0, 0, 0, 1, 2, 2, 1])); 330 data = [3, 1, 4, 1, 5, 9, 2, 6, 5]; 331 332 auto buf = new ubyte[32]; 333 foreach(i, ref elem; buf) { 334 elem = cast(ubyte) i; 335 } 336 337 res = frqBin(data, 4, buf); 338 assert(res == to!(ubyte[])([1, 0, 1, 0, 2, 3, 0, 3, 2])); 339 data = [3U, 1, 4, 1, 5, 9, 2, 6, 5, 3, 4, 8, 9, 7, 9, 2]; 340 res = frqBin(data, 4); 341 assert(res == to!(ubyte[])([1, 0, 1, 0, 2, 3, 0, 2, 2, 1, 1, 3, 3, 2, 3, 0])); 342 343 // Make sure this throws: 344 try { 345 auto foo = frqBin( seq(0U, 1_000U), 512); 346 assert(0); 347 } catch(Exception e) { 348 // It's supposed to throw. 349 } 350 } 351 352 /**Generates a sequence from [start..end] by increment. Includes start, 353 * excludes end. Does so eagerly as an array. 354 * 355 * Examples: 356 * --- 357 * auto s = seq(0, 5); 358 * assert(s == [0, 1, 2, 3, 4]); 359 * --- 360 */ 361 CommonType!(T, U)[] seq(T, U, V = uint)(T start, U end, V increment = 1U) { 362 dstatsEnforce(increment > 0, "Cannot have a seq increment <= 0."); 363 dstatsEnforce(end >= start, "End must be >= start in seq."); 364 365 alias CommonType!(T, U) R; 366 auto output = uninitializedArray!(R[]) 367 (cast(size_t) ((end - start) / increment)); 368 369 size_t count = 0; 370 for(T i = start; i < end; i += increment) { 371 output[count++] = i; 372 } 373 return output; 374 } 375 376 unittest { 377 auto s = seq(0, 5); 378 assert(s == [0, 1, 2, 3, 4]); 379 } 380 381 /**Given an input array, outputs an array containing the rank from 382 * [1, input.length] corresponding to each element. Ties are dealt with by 383 * averaging. This function does not reorder the input range. 384 * Return type is float[] by default, but if you are sure you have no ties, 385 * ints can be used for efficiency (in which case ties will not be averaged), 386 * and if you need more precision when averaging ties, you can use double or 387 * real. 388 * 389 * Works with any input range. 390 * 391 * Examples: 392 * --- 393 * uint[] test = [3, 5, 3, 1, 2]; 394 * assert(rank!("a < b", float)(test) == [3.5f, 5f, 3.5f, 1f, 2f]); 395 * assert(test == [3U, 5, 3, 1, 2]); 396 * ---*/ 397 Ret[] rank(alias compFun = "a < b", Ret = double, T)(T input, Ret[] buf = null) 398 if(isInputRange!(T) && is(typeof(input.front < input.front))) { 399 auto alloc = newRegionAllocator(); 400 401 static if(!isRandomAccessRange!(T) || !hasLength!(T)) { 402 return rankSort!(compFun, Ret)( alloc.array(input), buf); 403 } else { 404 405 /* It's faster to duplicate the input and then use rankSort on the 406 * duplicate, but more space efficient to create an index array. Check 407 * TempAlloc to see whether we can fit everything we need on the current 408 * frame. If yes, use the faster algorithm since we're effectively 409 * not saving any space by using the space-efficient algorithm. 410 * If no, use the more space-efficient algorithm. 411 */ 412 auto bytesNeeded = input.length * (ElementType!(T).sizeof + size_t.sizeof); 413 if(bytesNeeded < alloc.segmentSlack) { 414 return rankSort!(compFun, Ret)( alloc.array(input), buf); 415 } else { 416 return rankUsingIndex!(compFun, Ret)(input, buf); 417 } 418 } 419 420 assert(0); 421 } 422 423 /* Space-efficient but slow way of computing ranks. The extra indirection 424 * caused by creating the index array wreaks havock with CPU caches, especially 425 * for large arrays that don't fit entirely in cache. 426 */ 427 private Ret[] rankUsingIndex(alias compFun, Ret, T)(T input, Ret[] buf) { 428 auto alloc = newRegionAllocator(); 429 size_t[] indices = alloc.uninitializedArray!(size_t[])(input.length); 430 foreach(i, ref elem; indices) { 431 elem = i; 432 } 433 434 bool compare(size_t lhs, size_t rhs) { 435 alias binaryFun!(compFun) innerComp; 436 return innerComp(input[lhs], input[rhs]); 437 } 438 439 dstats.sort.qsort!compare(indices); 440 441 Ret[] ret; 442 if(buf.length < indices.length) { 443 ret = uninitializedArray!(Ret[])(indices.length); 444 } else { 445 ret = buf[0..indices.length]; 446 } 447 448 foreach(i, index; indices) { 449 ret[index] = i + 1; 450 } 451 452 auto myIndexed = Indexed!(T)(input, indices); 453 454 static if(!isIntegral!Ret) { 455 averageTies(myIndexed, ret, indices); 456 } 457 458 return ret; 459 } 460 461 private struct Indexed(T) { 462 T someRange; 463 size_t[] indices; 464 465 ElementType!T opIndex(size_t index) { 466 return someRange[indices[index]]; 467 } 468 469 @property size_t length() { 470 return indices.length; 471 } 472 } 473 474 /**Same as rank(), but also sorts the input range. 475 * The array returned will still be identical to that returned by rank(), i.e. 476 * the rank of each element will correspond to the ranks of the elements in the 477 * input array before sorting. 478 * 479 * Examples: 480 * --- 481 * uint[] test = [3, 5, 3, 1, 2]; 482 * assert(rankSort(test) == [3.5, 5, 3.5, 1.0, 2.0]); 483 * assert(test == [1U, 2, 3, 4, 5]); 484 * --- 485 */ 486 Ret[] rankSort(alias compFun = "a < b", Ret = double, T)(T input, Ret[] buf = null) 487 if(isRandomAccessRange!(T) && hasLength!(T) && is(typeof(input.front < input.front))) { 488 auto alloc = newRegionAllocator(); 489 Ret[] ranks; 490 if(buf.length < input.length) { 491 ranks = uninitializedArray!(Ret[])(input.length); 492 } else { 493 ranks = buf[0..input.length]; 494 } 495 496 size_t[] perms = alloc.uninitializedArray!(size_t[])(input.length); 497 foreach(i, ref p; perms) { 498 p = i; 499 } 500 501 dstats.sort.qsort!compFun(input, perms); 502 foreach(i; 0..perms.length) { 503 ranks[perms[i]] = i + 1; 504 } 505 506 static if(!isIntegral!Ret) { 507 averageTies(input, ranks, perms); 508 } 509 510 return ranks; 511 } 512 513 unittest { 514 uint[] test = [3, 5, 3, 1, 2]; 515 516 float[] dummy; 517 assert(rankUsingIndex!("a < b", float)(test, dummy) == [3.5f, 5f, 3.5f, 1f, 2f]); 518 assert(test == [3U, 5, 3, 1, 2]); 519 520 double[] dummy2; 521 assert(rankUsingIndex!("a < b", double)(test, dummy2) == [3.5, 5, 3.5, 1, 2]); 522 assert(rankSort(test) == [3.5, 5.0, 3.5, 1.0, 2.0]); 523 assert(test == [1U,2,3,3,5]); 524 525 uint[] test2 = [3,3,1,2]; 526 assert(rank(test2) == [3.5,3.5,1,2]); 527 } 528 529 private void averageTies(T, U)(T sortedInput, U[] ranks, size_t[] perms) 530 in { 531 assert(sortedInput.length == ranks.length); 532 assert(ranks.length == perms.length); 533 } do { 534 size_t tieCount = 1; 535 foreach(i; 1..ranks.length) { 536 if(sortedInput[i] == sortedInput[i - 1]) { 537 tieCount++; 538 } else if(tieCount > 1) { 539 double avg = 0; 540 immutable increment = 1.0 / tieCount; 541 542 foreach(perm; perms[i - tieCount..i]) { 543 avg += ranks[perm] * increment; 544 } 545 546 foreach(perm; perms[i - tieCount..i]) { 547 ranks[perm] = avg; 548 } 549 tieCount = 1; 550 } 551 } 552 553 if(tieCount > 1) { // Handle the end. 554 double avg = 0; 555 immutable increment = 1.0 / tieCount; 556 557 foreach(perm; perms[perms.length - tieCount..$]) { 558 avg += ranks[perm] * increment; 559 } 560 561 foreach(perm; perms[perms.length - tieCount..$]) { 562 ranks[perm] = avg; 563 } 564 } 565 } 566 567 /**Returns an associative array of counts of every element in input. 568 * Works w/ any iterable. 569 * 570 * Examples: 571 * --- 572 * int[] foo = [1,2,3,1,2,4]; 573 * uint[int] frq = frequency(foo); 574 * assert(frq.length == 4); 575 * assert(frq[1] == 2); 576 * assert(frq[4] == 1); 577 * ---*/ 578 uint[ForeachType!(T)] frequency(T)(T input) 579 if(isIterable!(T)) { 580 typeof(return) output; 581 foreach(i; input) { 582 output[i]++; 583 } 584 return output; 585 } 586 587 unittest { 588 int[] foo = [1,2,3,1,2,4]; 589 uint[int] frq = frequency(foo); 590 assert(frq.length == 4); 591 assert(frq[1] == 2); 592 assert(frq[4] == 1); 593 } 594 595 /**Given a range of values and a range of categories, separates values 596 * by category. This function also guarantees that the order within each 597 * category will be maintained. 598 * 599 * Note: While the general convention of this library is to try to avoid 600 * heap allocations whenever possible so that multithreaded code scales well and 601 * false pointers aren't an issue, this function allocates like crazy 602 * because there's basically no other way to implement it. Don't use it in 603 * performance-critical multithreaded code. 604 * 605 * Examples: 606 * --- 607 * uint[] values = [1,2,3,4,5,6,7,8]; 608 * bool[] categories = [false, false, false, false, true, true, true, true]; 609 * auto separated = byCategory(values, categories); 610 * auto tResult = studentsTTest(separated.values); 611 * --- 612 */ 613 ElementType!(V)[][ElementType!(C)] byCategory(V, C)(V values, C categories) 614 if(isInputRange!(V) && isInputRange!(C) && !is(ElementType!C == bool)) { 615 alias ElementType!(V) EV; 616 alias ElementType!(C) EC; 617 618 Appender!(EV[])[EC] aa; 619 while(!values.empty && !categories.empty) { 620 scope(exit) { 621 values.popFront(); 622 categories.popFront(); 623 } 624 625 auto category = categories.front; 626 auto ptr = category in aa; 627 if(ptr is null) { 628 aa[category] = typeof(aa[category]).init; 629 ptr = category in aa; 630 } 631 632 ptr.put(values.front); 633 } 634 635 EV[][EC] ret; 636 foreach(k, v; aa) { 637 ret[k] = v.data; 638 } 639 640 return ret; 641 } 642 643 /**Special case implementation for when ElementType!C is boolean.*/ 644 ElementType!(V)[][2] byCategory(V, C)(V values, C categories) 645 if(isInputRange!(V) && isInputRange!(C) && is(ElementType!C == bool)) { 646 typeof(return) ret; 647 648 static if(hasLength!V || hasLength!C) { 649 // Preallocate. 650 size_t zeroIndex, oneIndex; 651 652 static if(!hasLength!V) { 653 ret[0].length = categories.length; 654 ret[1].length = values.length; 655 } else static if(!hasLength!C) { 656 ret[0].length = values.length; 657 ret[1].length = values.length; 658 } else { 659 immutable len = min(categories.length, values.length); 660 ret[0].length = len; 661 ret[1].length = len; 662 } 663 664 while(!values.empty && !categories.empty) { 665 scope(exit) { 666 values.popFront(); 667 categories.popFront(); 668 } 669 670 auto category = categories.front; 671 if(category) { 672 ret[1][oneIndex++] = values.front; 673 } else { 674 ret[0][zeroIndex++] = values.front; 675 } 676 } 677 678 ret[0] = ret[0][0..zeroIndex]; 679 ret[1] = ret[1][0..oneIndex]; 680 } else { 681 auto app0 = appender!(ElementType!(V)[])(); 682 auto app1 = appender!(ElementType!(V)[])(); 683 684 while(!values.empty && !categories.empty) { 685 scope(exit) { 686 values.popFront(); 687 categories.popFront(); 688 } 689 690 auto category = categories.front; 691 if(category) { 692 app1.put(values.front); 693 } else { 694 app0.put(values.front); 695 } 696 } 697 698 ret[0] = app0.data; 699 ret[1] = app1.data; 700 } 701 702 return ret; 703 } 704 705 unittest { 706 int[] nums = [1,2,3,4,5,6,7,8,9]; 707 int[] categories = [0,1,2,0,1,2,0,1,2]; 708 709 // The filter is just to prevent having a length. 710 auto categories2 = filter!"a == a"(map!"a % 2 == 0"(nums)); 711 712 auto result = byCategory(nums, categories); 713 assert(result[0] == [1,4,7]); 714 assert(result[1] == [2,5,8]); 715 assert(result[2] == [3,6,9]); 716 717 auto res2 = byCategory(filter!"a == a"(nums), categories2); 718 assert(res2[0] == [1,3,5,7,9]); 719 assert(res2[1] == [2,4,6,8]); 720 721 auto res3 = byCategory(nums, 722 [false, true, false, true, false, true, false, true, false]); 723 assert(res2 == res3); 724 } 725 726 /**Finds the area under the ROC curve (a curve with sensitivity on the Y-axis 727 * and 1 - specificity on the X-axis). This is a useful metric for 728 * determining how well a test statistic discriminates between two classes. 729 * The following assumptions are made in this implementation: 730 * 731 * 1. For some cutoff value c and test statistic T, your decision rule is of 732 * the form "Class A if T > c, Class B if T < c". 733 * 734 * 2. In the case of ties, i.e. if class A and class B both have an identical 735 * value, linear interpolation is used. This is because changing the 736 * value of c infinitesimally will change both sensitivity and specificity 737 * in these cases. 738 */ 739 double auroc(R1, R2)(R1 classATs, R2 classBTs) 740 if(isNumeric!(ElementType!R1) && isNumeric!(ElementType!R2)) { 741 auto alloc = newRegionAllocator(); 742 auto classA = alloc.array(classATs); 743 auto classB = alloc.array(classBTs); 744 qsort(classA); 745 qsort(classB); 746 747 // Start cutoff at -infinity, such that we get everything in class A, i.e. 748 // perfect specificity, zero sensitivity. We arbitrarily define class B 749 // as our "positive" class. 750 double tp = 0, tn = classA.length, fp = 0, fn = classB.length; 751 double[2] lastPoint = 0; 752 753 Unqual!(CommonType!(ElementType!R1, ElementType!R2)) currentVal; 754 755 ElementType!R1 popA() { 756 tn--; 757 fp++; 758 auto ret = classA.front; 759 classA.popFront(); 760 return ret; 761 } 762 763 ElementType!R2 popB() { 764 fn--; 765 tp++; 766 auto ret = classB.front; 767 classB.popFront(); 768 return ret; 769 } 770 771 double area = 0; 772 while(!classA.empty && !classB.empty) { 773 if(classA.front < classB.front) { 774 currentVal = popA(); 775 } else { 776 currentVal = popB(); 777 } 778 779 // Handle ties. 780 while(!classA.empty && classA.front == currentVal) { 781 popA(); 782 } 783 784 while(!classB.empty && classB.front == currentVal) { 785 popB(); 786 } 787 788 double[2] curPoint; 789 curPoint[0] = 1.0 - tn / (fp + tn); 790 curPoint[1] = tp / (tp + fn); 791 792 immutable xDist = curPoint[0] - lastPoint[0]; 793 area += xDist * lastPoint[1]; // Rectangular part. 794 area += xDist * 0.5 * (curPoint[1] - lastPoint[1]); // Triangular part. 795 lastPoint[] = curPoint[]; 796 } 797 798 if(classA.length > 0 && classB.length == 0) { 799 // Then we already have a sensitivity of 1, move straight to the right 800 // to the point (1, 1). 801 802 immutable xDist = 1 - lastPoint[0]; 803 area += xDist * lastPoint[1]; // Rectangular part. 804 area += xDist * 0.5 * (1 - lastPoint[1]); // Triangular part. 805 } 806 807 return area; 808 } 809 810 unittest { 811 // Values worked out by hand on paper. If you don't believe me, work 812 // them out yourself. 813 assert(auroc([4,5,6], [1,2,3]) == 1); 814 assert(approxEqual(auroc([8,6,7,5,3,0,9], [3,6,2,4,3,6]), 0.6904762)); 815 assert(approxEqual(auroc([2,7,1,8,2,8,1,8], [3,1,4,1,5,9,2,6]), 0.546875)); 816 } 817 818 /// 819 T sign(T)(T num) pure nothrow if(is(typeof(num < 0))) { 820 if (num > 0) return 1; 821 if (num < 0) return -1; 822 return 0; 823 } 824 825 unittest { 826 assert(sign(3.14159265)==1); 827 assert(sign(-3)==-1); 828 assert(sign(-2.7182818)==-1); 829 } 830 831 /// 832 /*Values up to 9,999 are pre-calculated and stored in 833 * an immutable global array, for performance. After this point, the gamma 834 * function is used, because caching would take up too much memory, and if 835 * done lazily, would cause threading issues.*/ 836 double logFactorial(ulong n) { 837 //Input is uint, can't be less than 0, no need to check. 838 if(n < staticFacTableLen) { 839 return logFactorialTable[cast(size_t) n]; 840 } else return logGamma(n + 1); 841 } 842 843 unittest { 844 // Cache branch. 845 assert(cast(uint) round(exp(logFactorial(4)))==24); 846 assert(cast(uint) round(exp(logFactorial(5)))==120); 847 assert(cast(uint) round(exp(logFactorial(6)))==720); 848 assert(cast(uint) round(exp(logFactorial(7)))==5040); 849 assert(cast(uint) round(exp(logFactorial(3)))==6); 850 // Gamma branch. 851 assert(approxEqual(logFactorial(12000), 1.007175584216837e5, 1e-14)); 852 assert(approxEqual(logFactorial(14000), 1.196610688711534e5, 1e-14)); 853 } 854 855 ///Log of (n choose k). 856 double logNcomb(ulong n, ulong k) 857 in { 858 assert(k <= n); 859 } do { 860 if(n < k) return -double.infinity; 861 //Extra parentheses increase numerical accuracy. 862 return logFactorial(n) - (logFactorial(k) + logFactorial(n - k)); 863 } 864 865 unittest { 866 assert(cast(uint) round(exp(logNcomb(4,2)))==6); 867 assert(cast(uint) round(exp(logNcomb(30,8)))==5852925); 868 assert(cast(uint) round(exp(logNcomb(28,5)))==98280); 869 } 870 871 static if(size_t.sizeof == 4) { 872 private enum MAX_PERM_LEN = 12; 873 } else { 874 private enum MAX_PERM_LEN = 20; 875 } 876 877 /** 878 A struct that generates all possible permutations of a sequence. 879 880 Notes: 881 882 Permutations are output in undefined order. 883 884 The array returned by front is recycled across iterations. To preserve 885 it across iterations, wrap this range using map!"a.dup" or 886 map!"a.idup". 887 888 Bugs: Only supports iterating over up to size_t.max permutations. 889 This means the max permutation length is 12 on 32-bit machines, or 20 890 on 64-bit. This was a conscious tradeoff to allow this range to have a 891 length of type size_t, since iterating over such huge permutation spaces 892 would be insanely slow anyhow. 893 894 Examples: 895 --- 896 double[][] res; 897 auto perm = map!"a.dup"(Perm!(double)([1.0, 2.0, 3.0][])); 898 foreach(p; perm) { 899 res ~= p; 900 } 901 902 auto sorted = sort(res); 903 assert(sorted.canFind([1.0, 2.0, 3.0])); 904 assert(sorted.canFind([1.0, 3.0, 2.0])); 905 assert(sorted.canFind([2.0, 1.0, 3.0])); 906 assert(sorted.canFind([2.0, 3.0, 1.0])); 907 assert(sorted.canFind([3.0, 1.0, 2.0])); 908 assert(sorted.canFind([3.0, 2.0, 1.0])); 909 assert(sorted.length == 6); 910 --- 911 */ 912 struct Perm(T) { 913 private: 914 915 // Optimization: Since we know this thing can't get too big (there's 916 // an dstatsEnforce statement for it in the c'tor), just use arrays of the max 917 // possible size for stuff and store them inline, if it's all just bytes. 918 static if(T.sizeof == 1) { 919 T[MAX_PERM_LEN] perm; 920 } else { 921 T* perm; 922 } 923 924 // The length of this range. 925 size_t nPerms; 926 927 ubyte[MAX_PERM_LEN] Is; 928 ubyte currentIndex; 929 930 // The length of these arrays. Stored once to minimize overhead. 931 ubyte len; 932 alias const(T)[] PermArray; 933 934 public: 935 /**Generate permutations from an input range. 936 * Create a duplicate of this sequence 937 * so that the original sequence is not modified.*/ 938 this(U)(U input) 939 if(isForwardRange!(U)) { 940 941 static if(ElementType!(U).sizeof > 1) { 942 auto arr = array(input); 943 dstatsEnforce(arr.length <= MAX_PERM_LEN, text( 944 "Can't iterate permutations of an array this long. (Max length: ", 945 MAX_PERM_LEN, ")")); 946 len = cast(ubyte) arr.length; 947 perm = arr.ptr; 948 } else { 949 foreach(elem; input) { 950 dstatsEnforce(len < MAX_PERM_LEN, text( 951 "Can't iterate permutations of an array this long. (Max length: ", 952 MAX_PERM_LEN, ")")); 953 954 perm[len++] = elem; 955 } 956 } 957 958 popFront(); 959 960 nPerms = 1; 961 for(size_t i = 2; i <= len; i++) { 962 nPerms *= i; 963 } 964 } 965 966 /** 967 Returns the current permutation. The array is const because it is 968 recycled across iterations and modifying it would destroy the state of 969 the permutation generator. 970 */ 971 @property const(T)[] front() { 972 return perm[0..len]; 973 } 974 975 /** 976 Get the next permutation in the sequence. This will overwrite the 977 contents of the array returned by the last call to front. 978 */ 979 void popFront() { 980 if(len == 0) { 981 nPerms--; 982 return; 983 } 984 if(currentIndex == len - 1) { 985 currentIndex--; 986 nPerms--; 987 return; 988 } 989 990 uint max = len - currentIndex; 991 if(Is[currentIndex] == max) { 992 993 if(currentIndex == 0) { 994 nPerms--; 995 assert(nPerms == 0, to!string(nPerms)); 996 return; 997 } 998 999 Is[currentIndex..len] = 0; 1000 currentIndex--; 1001 return popFront(); 1002 } else { 1003 rotateLeft(perm[currentIndex..len]); 1004 Is[currentIndex]++; 1005 currentIndex++; 1006 return popFront(); 1007 } 1008 } 1009 1010 /// 1011 @property bool empty() { 1012 return nPerms == 0; 1013 } 1014 1015 /** 1016 The number of permutations left. 1017 */ 1018 @property size_t length() const pure nothrow { 1019 return nPerms; 1020 } 1021 1022 /// 1023 @property typeof(this) save() { 1024 auto ret = this; 1025 static if(T.sizeof > 1) { 1026 ret.perm = (ret.perm[0..len].dup).ptr; 1027 } 1028 return ret; 1029 } 1030 } 1031 1032 /** 1033 Create a Perm struct from a range or of a set of bounds. 1034 1035 Examples: 1036 --- 1037 auto p = perm([1,2,3]); // All permutations of [1,2,3]. 1038 auto p = perm(5); // All permutations of [0,1,2,3,4]. 1039 auto p = perm(-1, 2); // All permutations of [-1, 0, 1]. 1040 --- 1041 */ 1042 auto perm(T...)(T stuff) { 1043 static if(isForwardRange!(T[0])) { 1044 return Perm!(ElementType!T)(stuff); 1045 } else static if(T.length == 1) { 1046 static assert(isIntegral!(T[0]), 1047 "If one argument is passed to perm(), it must be an integer."); 1048 1049 dstatsEnforce(stuff[0] >= 0, "Cannot generate permutations of length < 0."); 1050 dstatsEnforce(stuff[0] <= MAX_PERM_LEN, text( 1051 "Can't iterate permutations of an array of length ", 1052 stuff[0], ". (Max length: ", MAX_PERM_LEN, ")")); 1053 1054 // Optimization: Since we know the lower 1055 // bound is zero and the upper bound can't be > byte.max, use bytes 1056 // instead of bigger integer types. 1057 return Perm!byte(seq(cast(byte) 0, cast(byte) stuff[0])); 1058 } else { 1059 static assert(stuff.length == 2); 1060 return Perm!(CommonType!(T[0], T[1]))(seq(stuff[0], stuff[1])); 1061 } 1062 } 1063 1064 unittest { 1065 // Test degenerate case of len == 0; 1066 uint nZero = 0; 1067 foreach(elem; perm(0)) { 1068 assert(elem.length == 0); 1069 nZero++; 1070 } 1071 assert(nZero == 1); 1072 1073 double[][] res; 1074 auto p1 = map!"a.dup"(perm([1.0, 2.0, 3.0][])); 1075 assert(p1.length == 6); 1076 foreach(p; p1) { 1077 res ~= p; 1078 } 1079 auto sortedRes = sort(res); 1080 assert(sortedRes.contains([1.0, 2.0, 3.0])); 1081 assert(sortedRes.contains([1.0, 3.0, 2.0])); 1082 assert(sortedRes.contains([2.0, 1.0, 3.0])); 1083 assert(sortedRes.contains([2.0, 3.0, 1.0])); 1084 assert(sortedRes.contains([3.0, 1.0, 2.0])); 1085 assert(sortedRes.contains([3.0, 2.0, 1.0])); 1086 assert(res.length == 6); 1087 byte[][] res2; 1088 auto perm2 = map!"a.dup"(perm(3)); 1089 foreach(p; perm2) { 1090 res2 ~= p; 1091 } 1092 auto sortedRes2 = sort(res2); 1093 assert(sortedRes2.contains( to!(byte[])([0, 1, 2]))); 1094 assert(sortedRes2.contains( to!(byte[])([0, 2, 1]))); 1095 assert(sortedRes2.contains( to!(byte[])([1, 0, 2]))); 1096 assert(sortedRes2.contains( to!(byte[])([1, 2, 0]))); 1097 assert(sortedRes2.contains( to!(byte[])([2, 0, 1]))); 1098 assert(sortedRes2.contains( to!(byte[])([2, 1, 0]))); 1099 assert(res2.length == 6); 1100 1101 // Indirect tests: If the elements returned are unique, there are N! of 1102 // them, and they contain what they're supposed to contain, the result is 1103 // correct. 1104 auto perm3 = perm(0U, 6U); 1105 bool[uint[]] table; 1106 foreach(p; perm3) { 1107 table[p.idup] = true; 1108 } 1109 assert(table.length == 720); 1110 foreach(elem, val; table) { 1111 assert(elem.dup.insertionSort == [0U, 1, 2, 3, 4, 5]); 1112 } 1113 auto perm4 = perm(5); 1114 bool[byte[]] table2; 1115 foreach(p; perm4) { 1116 table2[p.idup] = true; 1117 } 1118 assert(table2.length == 120); 1119 foreach(elem, val; table2) { 1120 assert(elem.dup.insertionSort == to!(byte[])([0, 1, 2, 3, 4])); 1121 } 1122 } 1123 1124 /** 1125 Generates every possible combination of r elements of the given sequence, or 1126 array indices from zero to N, depending on which c'tor is called. Uses 1127 an input range interface. 1128 1129 Note: The buffer that is returned by front is recycled across iterations. 1130 To duplicate it instead, use map!"a.dup" or map!"a.idup". 1131 1132 Bugs: Only supports iterating over up to size_t.max combinations. 1133 This was a conscious tradeoff to allow this range to have a 1134 length of type size_t, since iterating over such huge combination spaces 1135 would be insanely slow anyhow. 1136 1137 Examples: 1138 --- 1139 auto comb1 = map!"a.dup"(Comb!(uint)(5, 2)); 1140 uint[][] vals; 1141 foreach(c; comb1) { 1142 vals ~= c; 1143 } 1144 auto sorted = sort(vals); 1145 assert(sorted.canFind([0u,1])); 1146 assert(sorted.canFind([0u,2])); 1147 assert(sorted.canFind([0u,3])); 1148 assert(sorted.canFind([0u,4])); 1149 assert(sorted.canFind([1u,2])); 1150 assert(sorted.canFind([1u,3])); 1151 assert(sorted.canFind([1u,4])); 1152 assert(sorted.canFind([2u,3])); 1153 assert(sorted.canFind([2u,4])); 1154 assert(sorted.canFind([3u,4])); 1155 assert(sorted.length == 10); 1156 --- 1157 */ 1158 struct Comb(T) { 1159 private: 1160 int N; 1161 int R; 1162 int diff; 1163 uint* pos; 1164 T* myArray; 1165 T* chosen; 1166 size_t _length; 1167 1168 alias const(T)[] CombArray; 1169 1170 void popFrontNum() { 1171 int index = R - 1; 1172 for(; index != -1 && pos[index] == diff + index; --index) {} 1173 if(index == -1) { 1174 _length--; 1175 return; 1176 } 1177 pos[index]++; 1178 for(uint i = index + 1; i < R; ++i) { 1179 pos[i] = pos[index] + i - index; 1180 } 1181 _length--; 1182 } 1183 1184 void popFrontArray() { 1185 int index = R - 1; 1186 for(; index != -1 && pos[index] == diff + index; --index) {} 1187 if(index == -1) { 1188 _length--; 1189 return; 1190 } 1191 pos[index]++; 1192 chosen[index] = myArray[pos[index]]; 1193 for(uint i = index + 1; i < R; ++i) { 1194 pos[i] = pos[index] + i - index; 1195 chosen[i] = myArray[pos[i]]; 1196 } 1197 _length--; 1198 } 1199 1200 void setLen() { 1201 // Used at construction. 1202 auto rLen = exp( logNcomb(N, R)); 1203 dstatsEnforce(rLen < size_t.max, "Too many combinations."); 1204 _length = roundTo!size_t(rLen); 1205 } 1206 1207 public: 1208 1209 /** 1210 Ctor to generate all possible combinations of array indices for a length r 1211 array. This is a special-case optimization and is faster than simply 1212 using the other ctor to generate all length r combinations from 1213 seq(0, length). 1214 1215 For efficiency, uint is used instead of size_t since, on a 64-bit system, 1216 generating all possible combinations of an array bigger than uint.max 1217 wouldn't be feasible anyhow. 1218 */ 1219 static if(is(T == uint)) { 1220 this(uint n, uint r) 1221 in { 1222 assert(n >= r); 1223 } do { 1224 if(r > 0) { 1225 pos = (seq(0U, r)).ptr; 1226 pos[r - 1]--; 1227 } 1228 N = n; 1229 R = r; 1230 diff = N - R; 1231 popFront(); 1232 setLen(); 1233 } 1234 } 1235 1236 /**General ctor. array is a sequence from which to generate the 1237 * combinations. r is the length of the combinations to be generated.*/ 1238 this(T[] array, uint r) { 1239 if(r > 0) { 1240 pos = (seq(0U, r)).ptr; 1241 pos[r - 1]--; 1242 } 1243 N = to!uint(array.length); 1244 R = r; 1245 diff = N - R; 1246 auto temp = array.dup; 1247 myArray = temp.ptr; 1248 chosen = (new T[r]).ptr; 1249 foreach(i; 0..r) { 1250 chosen[i] = myArray[pos[i]]; 1251 } 1252 popFront(); 1253 setLen(); 1254 } 1255 1256 /** 1257 Gets the current combination. 1258 */ 1259 @property const(T)[] front() { 1260 static if(!is(T == uint)) { 1261 return chosen[0..R].dup; 1262 } else { 1263 return (myArray is null) ? pos[0..R] : chosen[0..R]; 1264 } 1265 } 1266 1267 /** 1268 Advances to the next combination. The array returned by front will be 1269 overwritten with the new results. 1270 */ 1271 void popFront() { 1272 return (myArray is null) ? popFrontNum() : popFrontArray(); 1273 } 1274 1275 /// 1276 @property bool empty() const pure nothrow { 1277 return length == 0; 1278 } 1279 1280 /// 1281 @property size_t length() const pure nothrow { 1282 return _length; 1283 } 1284 1285 /// 1286 @property typeof(this) save() { 1287 auto ret = this; 1288 ret.pos = (pos[0..R].dup).ptr; 1289 1290 if(chosen !is null) { 1291 ret.chosen = (chosen[0..R].dup).ptr; 1292 } 1293 1294 return ret; 1295 } 1296 } 1297 1298 /**Create a Comb struct from a range or of a set of bounds. 1299 * 1300 * Examples: 1301 * --- 1302 * auto c1 = comb([1,2,3], 2); // Any two elements from [1,2,3]. 1303 * auto c2 = comb(5, 3); // Any three elements from [0,1,2,3,4]. 1304 * --- 1305 */ 1306 auto comb(T)(T stuff, uint r) { 1307 static if(isForwardRange!(T)) { 1308 return Comb!(ElementType!T)(stuff, r); 1309 } else { 1310 static assert(isIntegral!T, "Can only call comb on ints and ranges."); 1311 return Comb!(uint)(cast(uint) stuff, r); 1312 } 1313 } 1314 1315 unittest { 1316 // Test degenerate case of r == 0. Shouldn't segfault. 1317 uint nZero = 0; 1318 foreach(elem; comb(5, 0)) { 1319 assert(elem.length == 0); 1320 nZero++; 1321 } 1322 assert(nZero == 1); 1323 1324 nZero = 0; 1325 uint[] foo = [1,2,3,4,5]; 1326 foreach(elem; comb(foo, 0)) { 1327 assert(elem.length == 0); 1328 nZero++; 1329 } 1330 assert(nZero == 1); 1331 1332 // Test indexing verison first. 1333 auto comb1 = map!"a.dup"(comb(5, 2)); 1334 uint[][] vals; 1335 foreach(c; comb1) { 1336 vals ~= c; 1337 } 1338 1339 auto sortedVals = sort(vals); 1340 assert(sortedVals.contains([0u,1].dup)); 1341 assert(sortedVals.contains([0u,2].dup)); 1342 assert(sortedVals.contains([0u,3].dup)); 1343 assert(sortedVals.contains([0u,4].dup)); 1344 assert(sortedVals.contains([1u,2].dup)); 1345 assert(sortedVals.contains([1u,3].dup)); 1346 assert(sortedVals.contains([1u,4].dup)); 1347 assert(sortedVals.contains([2u,3].dup)); 1348 assert(sortedVals.contains([2u,4].dup)); 1349 assert(sortedVals.contains([3u,4].dup)); 1350 assert(vals.length == 10); 1351 1352 // Now, test the array version. 1353 auto comb2 = map!"a.dup"(comb(seq(5U, 10U), 3)); 1354 vals = null; 1355 foreach(c; comb2) { 1356 vals ~= c; 1357 } 1358 sortedVals = sort(vals); 1359 assert(sortedVals.contains([5u, 6, 7].dup)); 1360 assert(sortedVals.contains([5u, 6, 8].dup)); 1361 assert(sortedVals.contains([5u, 6, 9].dup)); 1362 assert(sortedVals.contains([5u, 7, 8].dup)); 1363 assert(sortedVals.contains([5u, 7, 9].dup)); 1364 assert(sortedVals.contains([5u, 8, 9].dup)); 1365 assert(sortedVals.contains([6U, 7, 8].dup)); 1366 assert(sortedVals.contains([6u, 7, 9].dup)); 1367 assert(sortedVals.contains([6u, 8, 9].dup)); 1368 assert(sortedVals.contains([7u, 8, 9].dup)); 1369 assert(vals.length == 10); 1370 1371 // Now a test of a larger dataset where more subtle bugs could hide. 1372 // If the values returned are unique even after sorting, are composed of 1373 // the correct elements, and there is the right number of them, this thing 1374 // works. 1375 1376 bool[uint[]] results; // Keep track of how many UNIQUE items we have. 1377 auto comb3 = Comb!(uint)(seq(10U, 22U), 6); 1378 foreach(c; comb3) { 1379 auto dupped = c.dup.sort(); 1380 // Make sure all elems are unique and within range. 1381 assert(dupped.length == 6); 1382 assert(dupped[0] > 9 && dupped[0] < 22); 1383 foreach(i; 1..dupped.length) { 1384 // Make sure elements are unique. Remember, the array is sorted. 1385 assert(dupped[i] > dupped[i - 1]); 1386 assert(dupped[i] > 9 && dupped[i] < 22); 1387 } 1388 results[dupped.release().idup] = true; 1389 } 1390 assert(results.length == 924); // (12 choose 6). 1391 } 1392 1393 /**Converts a range with arbitrary element types (usually strings) to a 1394 * range of reals lazily. Ignores any elements that could not be successfully 1395 * converted. Useful for creating an input range that can be used with this 1396 * lib out of a File without having to read the whole file into an array first. 1397 * The advantages to this over just using std.algorithm.map are that it's 1398 * less typing and that it ignores non-convertible elements, such as blank 1399 * lines. 1400 * 1401 * If rangeIn is an inputRange, then the result of this function is an input 1402 * range. Otherwise, the result is a forward range. 1403 * 1404 * Note: The reason this struct doesn't have length or random access, 1405 * even if this is supported by rangeIn, is because it has to be able to 1406 * filter out non-convertible elements. 1407 * 1408 * Examples: 1409 * --- 1410 * // Perform a T-test to see whether the mean of the data being input as text 1411 * // from stdin is different from zero. This data might not even fit in memory 1412 * // if it were read in eagerly. 1413 * 1414 * auto myRange = toNumericRange( stdin.byLine() ); 1415 * TestRes result = studentsTTest(myRange); 1416 * writeln(result); 1417 * --- 1418 */ 1419 ToNumericRange!R toNumericRange(R)(R rangeIn) if(isInputRange!R) { 1420 alias ToNumericRange!R RT; 1421 return RT(rangeIn); 1422 } 1423 1424 /// 1425 struct ToNumericRange(R) if(isInputRange!R) { 1426 private: 1427 alias ElementType!R E; 1428 R inputRange; 1429 real _front; 1430 1431 public: 1432 this(R inputRange) { 1433 this.inputRange = inputRange; 1434 try { 1435 _front = to!real(inputRange.front); 1436 } catch(ConvException) { 1437 popFront(); 1438 } 1439 } 1440 1441 @property real front() { 1442 return _front; 1443 } 1444 1445 void popFront() { 1446 while(true) { 1447 inputRange.popFront(); 1448 if(inputRange.empty) { 1449 return; 1450 } 1451 auto inFront = inputRange.front; 1452 1453 // If inFront is some string, strip the whitespace. 1454 static if( is(typeof(strip(inFront)))) { 1455 inFront = strip(inFront); 1456 } 1457 1458 try { 1459 _front = to!real(inFront); 1460 return; 1461 } catch(ConvException) { 1462 continue; 1463 } 1464 } 1465 } 1466 1467 @property bool empty() { 1468 return inputRange.empty; 1469 } 1470 } 1471 1472 unittest { 1473 // Test both with non-convertible element as first element and without. 1474 // This is because non-convertible elements as the first element are 1475 // handled as a special case in the implementation. 1476 string[2] dataArr = ["3.14\n2.71\n8.67\nabracadabra\n362436", 1477 "foobar\n3.14\n2.71\n8.67\nabracadabra\n362436"]; 1478 1479 foreach(data; dataArr) { 1480 std.file.write("NumericFileTestDeleteMe.txt", data); 1481 scope(exit) std.file.remove("NumericFileTestDeleteMe.txt"); 1482 auto myFile = File("NumericFileTestDeleteMe.txt"); 1483 auto rng = toNumericRange(myFile.byLine()); 1484 assert(approxEqual(rng.front, 3.14)); 1485 rng.popFront; 1486 assert(approxEqual(rng.front, 2.71)); 1487 rng.popFront; 1488 assert(approxEqual(rng.front, 8.67)); 1489 rng.popFront; 1490 assert(approxEqual(rng.front, 362435)); 1491 assert(!rng.empty); 1492 rng.popFront; 1493 assert(rng.empty); 1494 myFile.close(); 1495 } 1496 } 1497 1498 // Used for ILP optimizations. 1499 package template StaticIota(size_t upTo) { 1500 static if(upTo == 0) { 1501 alias TypeTuple!() StaticIota; 1502 } else { 1503 alias TypeTuple!(StaticIota!(upTo - 1), upTo - 1) StaticIota; 1504 } 1505 } 1506 1507 package: 1508 1509 // If SciD is available, it is used for matrix storage and operations 1510 // such as Cholesky decomposition. Otherwise, some old, crappy routines 1511 // that were around since before SciD are used. 1512 version(scid) { 1513 import scid.matvec, scid.linalg, scid.exception; 1514 1515 alias ExternalMatrixView!double DoubleMatrix; 1516 1517 DoubleMatrix doubleMatrix( 1518 size_t nRows, 1519 size_t nColumns, 1520 RegionAllocator alloc 1521 ) { 1522 return DoubleMatrix(nRows, nColumns, alloc); 1523 } 1524 1525 void choleskySolve(DoubleMatrix a, double[] b, double[] x) { 1526 choleskyDestructive(a); 1527 auto ans = externalVectorView(x); 1528 auto vec = externalVectorView(b); 1529 scid.linalg.choleskySolve(a, vec, ans); 1530 } 1531 1532 void invert(DoubleMatrix from, DoubleMatrix to) { 1533 try { 1534 to[] = inv(from); 1535 } catch(Exception) { 1536 foreach(i; 0..to.rows) foreach(j; 0..to.columns) { 1537 to[i, j] = double.nan; 1538 } 1539 } 1540 } 1541 1542 } else { 1543 version = noscid; 1544 1545 struct DoubleMatrix { 1546 double[][] arrayOfArrays; 1547 //alias arrayOfArraysFun this; 1548 1549 const pure nothrow @property { 1550 size_t rows() { 1551 return arrayOfArrays.length; 1552 } 1553 1554 size_t columns() { 1555 return (arrayOfArrays.length) ? 1556 (arrayOfArrays[0].length) : 0; 1557 } 1558 } 1559 1560 ref double opIndex(size_t i, size_t j) { 1561 return arrayOfArrays[i][j]; 1562 } 1563 } 1564 1565 DoubleMatrix doubleMatrix( 1566 size_t nRows, 1567 size_t nColumns, 1568 RegionAllocator alloc 1569 ) { 1570 return DoubleMatrix( 1571 alloc.uninitializedArray!(double[][])(nRows, nColumns) 1572 ); 1573 } 1574 1575 void invert(DoubleMatrix from, DoubleMatrix to) { 1576 invert(from.arrayOfArrays, to.arrayOfArrays); 1577 } 1578 1579 // Uses Gauss-Jordan elim. w/ row pivoting to invert from. Stores the results 1580 // in to and leaves from in an undefined state. 1581 package void invert(double[][] from, double[][] to) { 1582 // Normalize. 1583 foreach(i, row; from) { 1584 double absMax = 1.0 / reduce!(max)(map!(abs)(row[0..from.length])); 1585 row[] *= absMax; 1586 to[i][] = 0; 1587 to[i][i] = absMax; 1588 } 1589 1590 foreach(col; 0..from.length) { 1591 size_t bestRow; 1592 double biggest = 0; 1593 foreach(row; col..from.length) { 1594 if(abs(from[row][col]) > biggest) { 1595 bestRow = row; 1596 biggest = abs(from[row][col]); 1597 } 1598 } 1599 1600 swap(from[col], from[bestRow]); 1601 swap(to[col], to[bestRow]); 1602 immutable pivotFactor = from[col][col]; 1603 1604 foreach(row; 0..from.length) if(row != col) { 1605 immutable ratio = from[row][col] / pivotFactor; 1606 1607 // If you're ever looking to optimize this code, good luck. The 1608 // bottleneck is almost ENTIRELY this one line: 1609 from[row][] -= from[col][] * ratio; 1610 to[row][] -= to[col][] * ratio; 1611 } 1612 } 1613 1614 foreach(i; 0..from.length) { 1615 immutable diagVal = from[i][i]; 1616 from[i][] /= diagVal; 1617 to[i][] /= diagVal; 1618 } 1619 } 1620 1621 unittest { 1622 auto mat = [[1.0, 2, 3], [4.0, 7, 6], [7.0, 8, 9]]; 1623 auto toMat = [new double[3], new double[3], new double[3]]; 1624 invert(mat, toMat); 1625 assert(approxEqual(toMat[0], [-0.625, -0.25, 0.375])); 1626 assert(approxEqual(toMat[1], [-0.25, 0.5, -0.25])); 1627 assert(approxEqual(toMat[2], [0.708333, -0.25, 0.041667])); 1628 } 1629 1630 void solve(DoubleMatrix mat, double[] vec) { 1631 solve(mat.arrayOfArrays, vec); 1632 } 1633 1634 // Solve a system of linear equations mat * b = vec for b. The result is 1635 // stored in vec, and mat is left in an undefined state. Uses Gaussian 1636 // elimination w/ row pivoting. Roughly 4x faster than using inversion to 1637 // solve the system, and uses roughly half the memory. 1638 void solve(double[][] mat, double[] vec) { 1639 // Normalize. 1640 foreach(i, row; mat) { 1641 double absMax = 1.0 / reduce!(max)(map!(abs)(row[0..mat.length])); 1642 row[] *= absMax; 1643 vec[i] *= absMax; 1644 } 1645 1646 foreach(col; 0..mat.length - 1) { 1647 size_t bestRow; 1648 double biggest = 0; 1649 foreach(row; col..mat.length) { 1650 if(abs(mat[row][col]) > biggest) { 1651 bestRow = row; 1652 biggest = abs(mat[row][col]); 1653 } 1654 } 1655 1656 swap(mat[col], mat[bestRow]); 1657 swap(vec[col], vec[bestRow]); 1658 immutable pivotFactor = mat[col][col]; 1659 1660 foreach(row; col + 1..mat.length) { 1661 immutable ratio = mat[row][col] / pivotFactor; 1662 1663 // If you're ever looking to optimize this code, good luck. The 1664 // bottleneck is almost ENTIRELY this one line: 1665 mat[row][col..$] -= mat[col][col..$] * ratio; 1666 vec[row] -= vec[col] * ratio; 1667 } 1668 } 1669 1670 foreach(i; 0..mat.length) { 1671 double diagVal = mat[i][i]; 1672 mat[i][] /= diagVal; 1673 vec[i] /= diagVal; 1674 } 1675 1676 // Do back substitution. 1677 for(size_t row = mat.length - 1; row != size_t.max; row--) { 1678 auto v1 = vec[row + 1..$]; 1679 auto v2 = mat[row][$ - v1.length..$]; 1680 vec[row] -= dotProduct(v1, v2); 1681 } 1682 } 1683 1684 unittest { 1685 auto mat = [[2.0, 1, -1], [-3.0, -1, 2], [-2.0, 1, 2]]; 1686 auto vec = [8.0, -11, -3]; 1687 solve(mat, vec); 1688 assert(approxEqual(vec, [2, 3, -1])); 1689 1690 auto mat2 = [[1.0, 2, 3], [4.0, 7, 6], [7.0, 8, 9]]; 1691 auto vec2 = [8.0, 6, 7]; 1692 solve(mat2, vec2); 1693 assert(approxEqual(vec2, [-3.875, -0.75, 4.45833])); 1694 } 1695 1696 // Cholesky decomposition functions adapted from Don Clugston's MathExtra 1697 // lib, used w/ permission. 1698 void choleskyDecompose(double[][] a, double[] diag) { 1699 immutable N = diag.length; 1700 1701 foreach(i; 0..N) { 1702 const ai = a[i]; 1703 double sum = ai[i]; 1704 1705 for(sizediff_t k = i - 1; k >= 0; --k) { 1706 sum -= ai[k] * ai[k]; 1707 } 1708 1709 if (sum > 0.0) { 1710 diag[i] = sqrt(sum); 1711 1712 foreach(j; i + 1..N) { 1713 sum = ai[j] - dotProduct(ai[0..i], a[j][0..i]); 1714 a[j][i] = sum / diag[i]; 1715 } 1716 } else { 1717 // not positive definite (could be caused by rounding errors) 1718 diag[i] = 0; 1719 // make this whole row zero so they have no further effect 1720 foreach(j; i + 1..N) a[j][i] = 0; 1721 } 1722 } 1723 } 1724 1725 void choleskySolve(double[][] a, double[] diag, double[] b, double[] x) { 1726 immutable N = x.length; 1727 1728 foreach(i; 0..N) { 1729 const ai = a[i]; 1730 1731 if(diag[i] > 0) { 1732 double sum = b[i]; 1733 sum -= dotProduct(ai[0..i], x[0..i]); 1734 x[i] = sum / diag[i]; 1735 } else x[i] = 0; // skip pos definite rows 1736 } 1737 1738 for(sizediff_t i = N - 1; i >= 0; --i) { 1739 if (diag[i] > 0) { 1740 double sum = x[i]; 1741 for(sizediff_t k = i + 1; k < N; ++k) sum -= a[k][i] * x[k]; 1742 x[i] = sum / diag[i]; 1743 } else x[i] = 0; // skip pos definite rows 1744 } 1745 // Convert failed columns in solution to NANs if required. 1746 foreach(i; 0..N) { 1747 if(diag[i].isNaN() || diag[i] <= 0) x[i] = double.nan; 1748 } 1749 } 1750 1751 void choleskySolve(DoubleMatrix a, double[] b, double[] x) { 1752 auto alloc = newRegionAllocator(); 1753 auto diag = alloc.uninitializedArray!(double[])(x.length); 1754 choleskyDecompose(a.arrayOfArrays, diag); 1755 choleskySolve(a.arrayOfArrays, diag, b, x); 1756 } 1757 }