/**Probability distribution CDFs, PDFs/PMFs, and a few inverse CDFs.
 *
 * Authors: David Simcha, Don Clugston*/
/*
 * Acknowledgements: Some of this module was borrowed from the mathstat module
 * of Don Clugston's MathExtra library. This was done to create a
 * coherent, complete library without massive dependencies, and without
 * reinventing the wheel. These functions have been renamed to
 * fit the naming conventions of this library, and are noted below.
 * The code from Don Clugston's MathExtra library was based on the Cephes
 * library by Stephen Moshier.
 *
 * Conventions:
 * Cumulative distribution functions are named <distribution>CDF. For
 * discrete distributions, these are P(X <= x) where X is the random variable,
 * NOT P(X < x).
 *
 * All CDFs have a complement, named <distribution>CDFR, which stands for
 * "Cumulative Distribution Function Right". For discrete distributions,
 * this is P(X >= x), NOT P(X > x) and is therefore NOT equal to
 * 1 - <distribution>CDF. Also, even for continuous distributions, the
 * numerical accuracy is higher for small p-values if the CDFR is used than
 * if 1 - CDF is used.
 *
 * If a PDF/PMF function is included for a distribution, it is named
 * <distribution>PMF or <distribution>PDF (PMF for discrete, PDF for
 * continuous distributions).
 *
 * If an inverse CDF is included, it is named inv<Distribution>CDF.
 *
 * For all distributions, the test statistic is the first function parameter
 * and the distribution parameters are further down the function parameter
 * list. This is important for certain generic code, such as tests and
 * the parametrize template.
 *
 * The following functions are identical or functionally equivalent to
 * functions found in MathExtra/Tango.Math.probability. This information
 * might be useful if someone is trying to integrate this library into other code:
 *
 * normalCDF <=> normalDistribution
 *
 * normalCDFR <=> normalDistributionCompl
 *
 * invNormalCDF <=> normalDistributionComplInv
 *
 * studentsTCDF <=> studentsTDistribution (Note reversal in argument order)
 *
 * invStudentsTCDF <=> studentsTDistributionInv (Again, arg order reversed)
 *
 * binomialCDF <=> binomialDistribution
 *
 * negBinomCDF <=> negativeBinomialDistribution
 *
 * poissonCDF <=> poissonDistribution
 *
 * chiSqrCDF <=> chiSqrDistribution (Note reversed arg order)
 *
 * chiSqrCDFR <=> chiSqrDistributionCompl (Note reversed arg order)
 *
 * invChiSqCDFR <=> chiSqrDistributionComplInv
 *
 * fisherCDF <=> fDistribution (Note reversed arg order)
 *
 * fisherCDFR <=> fDistributionCompl (Note reversed arg order)
 *
 * invFisherCDFR <=> fDistributionComplInv
 *
 * gammaCDF <=> gammaDistribution (Note arg reversal)
 *
 * gammaCDFR <=> gammaDistributionCompl (Note arg reversal)
 *
 * Note that CDFRs/Compls of discrete distributions are not equivalent,
 * because in Tango/MathExtra they represent P(X > x) while in dstats they
 * represent P(X >= x).
 *
 *
 * Copyright (c) 2008-2009, David Simcha and Don Clugston
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 *     * Neither the name of the authors nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/

module dstats.distrib;

import std.algorithm, std.conv, std.exception, std.math, std.traits,
    std.mathspecial, std.range;

alias std.mathspecial.erfc erfc;
alias std.mathspecial.erf erf;

import dstats.base;

// CTFE doesn't work yet for sqrt() in GDC. This value is sqrt(2 * PI).
enum SQ2PI = 2.50662827463100050241576528481104525300698674060993831662992;

version(unittest) {
    import std.stdio, std.random;

    alias std.math.approxEqual ae;
}

/**Takes a distribution function (CDF or PDF/PMF) as a template argument, and
 * parameters as function arguments in the order that they appear in the
 * function declaration and returns a delegate that binds the supplied
 * parameters to the distribution function. Assumes the non-parameter
 * argument is the first argument to the distribution function.
 *
 * Examples:
 * ---
 * auto stdNormal = parametrize!(normalCDF)(0.0L, 1.0L);
 * ---
 *
 * stdNormal is now a delegate for the normal(0, 1) distribution.*/
double delegate(ParameterTypeTuple!(distrib)[0])
parametrize(alias distrib)(ParameterTypeTuple!(distrib)[1..$] parameters) {

    double calculate(ParameterTypeTuple!(distrib)[0] arg) {
        return distrib(arg, parameters);
    }

    return &calculate;
}

unittest {
    // Just basically see if this compiles.
    auto stdNormal = parametrize!normalCDF(0, 1);
    assert(approxEqual(stdNormal(2.5), normalCDF(2.5, 0, 1)));
}
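
/* Illustrative sketch of the CDF/CDFR convention described in the module
 * header, added as an example only:  for a discrete distribution,
 * <distribution>CDF(k) is P(X <= k) and <distribution>CDFR(k) is P(X >= k),
 * so CDF(k) + CDFR(k + 1) == 1 while using the same k on both sides counts
 * P(X == k) twice.  This assumes binomialCDF, binomialCDFR and binomialPMF
 * behave as documented below.
 */
unittest {
    double left  = binomialCDF(3, 10, 0.4);   // P(X <= 3)
    double right = binomialCDFR(4, 10, 0.4);  // P(X >= 4)
    assert(approxEqual(left + right, 1.0));

    // Same k on both sides double-counts P(X == k).
    assert(approxEqual(binomialCDF(3, 10, 0.4) + binomialCDFR(3, 10, 0.4),
        1.0 + binomialPMF(3, 10, 0.4)));
}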

///
struct ParamFunctor(alias distrib) {
    ParameterTypeTuple!(distrib)[1..$] parameters;

    double opCall(ParameterTypeTuple!(distrib)[0] arg) {
        return distrib(arg, parameters);
    }
}

/**Takes a distribution function (CDF or PDF/PMF) as a template argument, and
 * parameters as function arguments in the order that they appear in the
 * function declaration and returns a functor that binds the supplied
 * parameters to the distribution function. Assumes the non-parameter
 * argument is the first argument to the distribution function.
 *
 * Examples:
 * ---
 * auto stdNormal = paramFunctor!(normalCDF)(0.0L, 1.0L);
 * ---
 *
 * stdNormal is now a functor for the normal(0, 1) distribution.*/
ParamFunctor!(distrib) paramFunctor(alias distrib)
(ParameterTypeTuple!(distrib)[1..$] parameters) {
    ParamFunctor!(distrib) ret;
    foreach(ti, elem; parameters) {
        ret.tupleof[ti] = elem;
    }
    return ret;
}

unittest {
    // Just basically see if this compiles.
    auto stdNormal = paramFunctor!normalCDF(0, 1);
    assert(approxEqual(stdNormal(2.5), normalCDF(2.5, 0, 1)));
}

///
double uniformPDF(double X, double lower, double upper) {
    dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
    dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");
    return 1.0L / (upper - lower);
}

///
double uniformCDF(double X, double lower, double upper) {
    dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
    dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");

    return (X - lower) / (upper - lower);
}

///
double uniformCDFR(double X, double lower, double upper) {
    dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
    dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");

    return (upper - X) / (upper - lower);
}

///
double poissonPMF(ulong k, double lambda) {
    dstatsEnforce(lambda > 0, "Cannot have a Poisson with lambda <= 0 or nan.");

    return exp(cast(double) k * log(lambda) -
        (lambda + logFactorial(k)));  //Grouped for best precision.
}

unittest {
    assert(approxEqual(poissonPMF(1, .1), .0904837));
}

enum POISSON_NORMAL = 1UL << 12;  // Where to switch to normal approx.

// The gamma incomplete function is too unstable and the distribution
// is for all practical purposes normal anyhow.
private double normApproxPoisCDF(ulong k, double lambda)
in {
    assert(lambda > 0);
} do {
    double sd = sqrt(lambda);
    // mean == lambda.
    return normalCDF(k + 0.5L, lambda, sd);
}

/**P(K <= k) where K is r.v.*/
double poissonCDF(ulong k, double lambda) {
    dstatsEnforce(lambda > 0, "Cannot have a poisson with lambda <= 0 or nan.");

    return (max(k, lambda) >= POISSON_NORMAL) ?
        normApproxPoisCDF(k, lambda) :
        gammaIncompleteCompl(k + 1, lambda);
}

unittest {
    // Make sure this jives with adding up PMF elements, since this is a
    // discrete distribution.
    static double pmfSum(uint k, double lambda) {
        double ret = 0;
        foreach(i; 0..k + 1) {
            ret += poissonPMF(i, lambda);
        }
        return ret;
    }

    assert(approxEqual(poissonCDF(1, 0.5), pmfSum(1, 0.5)));
    assert(approxEqual(poissonCDF(3, 0.7), pmfSum(3, 0.7)));

    // Absurdly huge values: Test normal approximation.
    // Values from R.
    double ans = poissonCDF( (1UL << 50) - 10_000_000, 1UL << 50);
    assert(approxEqual(ans, 0.3828427));

    // Make sure cutoff is reasonable, i.e. make sure gamma incomplete branch
    // and normal branch get roughly the same answer near the cutoff.
    for(double lambda = POISSON_NORMAL / 2; lambda <= POISSON_NORMAL * 2; lambda += 100) {
        for(ulong k = POISSON_NORMAL / 2; k <= POISSON_NORMAL * 2; k += 100) {
            double normAns = normApproxPoisCDF(k, lambda);
            double gammaAns = gammaIncompleteCompl(k + 1, lambda);
            assert(abs(normAns - gammaAns) < 0.01, text(normAns, '\t', gammaAns));
        }
    }
}

// The gamma incomplete function is too unstable and the distribution
// is for all practical purposes normal anyhow.
private double normApproxPoisCDFR(ulong k, double lambda)
in {
    assert(lambda > 0);
} do {
    double sd = sqrt(lambda);
    // mean == lambda.
    return normalCDFR(k - 0.5L, lambda, sd);
}

/**P(K >= k) where K is r.v.*/
double poissonCDFR(ulong k, double lambda) {
    dstatsEnforce(lambda > 0, "Can't have a poisson with lambda <= 0 or nan.");

    return (max(k, lambda) >= POISSON_NORMAL) ?
        normApproxPoisCDFR(k, lambda) :
        gammaIncomplete(k, lambda);
}

unittest {
    // Make sure this jives with adding up PMF elements, since this is a
    // discrete distribution.
    static double pmfSum(uint k, double lambda) {
        double ret = 0;
        foreach(i; 0..k + 1) {
            ret += poissonPMF(i, lambda);
        }
        return ret;
    }

    assert(approxEqual(poissonCDFR(1, 0.5), 1 - pmfSum(0, 0.5)));
    assert(approxEqual(poissonCDFR(3, 0.7), 1 - pmfSum(2, 0.7)));

    // Absurdly huge value to test normal approximation.
    // Values from R.
    double ans = poissonCDFR( (1UL << 50) - 10_000_000, 1UL << 50);
    assert(approxEqual(ans, 0.6171573));

    // Make sure cutoff is reasonable, i.e. make sure gamma incomplete branch
    // and normal branch get roughly the same answer near the cutoff.
    for(double lambda = POISSON_NORMAL / 2; lambda <= POISSON_NORMAL * 2; lambda += 100) {
        for(ulong k = POISSON_NORMAL / 2; k <= POISSON_NORMAL * 2; k += 100) {
            double normAns = normApproxPoisCDFR(k, lambda);
            double gammaAns = gammaIncomplete(k, lambda);
            assert(abs(normAns - gammaAns) < 0.01, text(normAns, '\t', gammaAns));
        }
    }
}
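
/* The inverse discrete CDFs in this module (invPoissonCDF, invBinomialCDF,
 * invNegBinomCDF) share one pattern:  get a cheap initial guess for k from a
 * continuous (normal or gamma) approximation, then refine it by adding or
 * subtracting one PMF term at a time until the CDF stops getting closer to
 * the requested p-value.  The unittest below is an illustrative sketch of
 * that local search, written only against poissonCDF/poissonPMF; the helper
 * name searchSketch is made up for illustration and is not used by the
 * implementations that follow.
 */
unittest {
    static uint searchSketch(double pVal, uint guess, double lambda) {
        double guessP = poissonCDF(guess, lambda);
        if(guessP == pVal) return guess;

        if(guessP < pVal) {
            // Walk right, adding one PMF term at a time.
            for(uint k = guess + 1; ; k++) {
                immutable newP = guessP + poissonPMF(k, lambda);
                if(newP >= 1 || abs(newP - pVal) > abs(guessP - pVal)) {
                    return (newP >= 1) ? k : k - 1;
                }
                guessP = newP;
            }
        } else {
            // Walk left, subtracting one PMF term at a time.
            for(uint k = guess; k > 0; k--) {
                immutable newP = guessP - poissonPMF(k, lambda);
                if(abs(newP - pVal) > abs(guessP - pVal)) return k;
                guessP = newP;
            }
            return 0;
        }
    }

    // Starting from deliberately bad guesses on either side, the local
    // search still converges to the k that generated the p-value.
    immutable lambda = 4.0;
    immutable pVal = poissonCDF(6, lambda);
    assert(searchSketch(pVal, 2, lambda) == 6);
    assert(searchSketch(pVal, 10, lambda) == 6);
}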

/**Returns the value of k for the given p-value and lambda. If p-val
 * doesn't exactly map to a value of k, the k for which poissonCDF(k, lambda)
 * is closest to pVal is used.*/
uint invPoissonCDF(double pVal, double lambda) {
    dstatsEnforce(lambda > 0, "Cannot have a poisson with lambda <= 0 or nan.");
    dstatsEnforce(pVal >= 0 && pVal <= 1, "P-values must be between 0, 1.");

    // Use normal approximation to get approx answer, then brute force search.
    // This works better than you think because for small n, there's not much
    // search space and for large n, the normal approx. is really good.
    uint guess = cast(uint) max(round(
        invNormalCDF(pVal, lambda, sqrt(lambda)) + 0.5), 0.0L);
    double guessP = poissonCDF(guess, lambda);

    if(guessP == pVal) {
        return guess;
    } else if(guessP < pVal) {
        for(uint k = guess + 1; ; k++) {
            double newP = guessP + poissonPMF(k, lambda);
            if(newP >= 1)
                return k;
            if(abs(newP - pVal) > abs(guessP - pVal)) {
                return k - 1;
            } else {
                guessP = newP;
            }
        }
    } else {
        for(uint k = guess - 1; k != uint.max; k--) {
            double newP = guessP - poissonPMF(k + 1, lambda);
            if(abs(newP - pVal) > abs(guessP - pVal)) {
                return k + 1;
            } else {
                guessP = newP;
            }
        }
        return 0;
    }
}

unittest {
    foreach(i; 0..1_000) {
        // Restricted variable ranges are because, in the tails, more than one
        // value of k can map to the same p-value at machine precision.
        // Obviously, this is one of those corner cases that nothing can be
        // done about.
        double lambda = uniform(.05L, 8.0L);
        uint k = uniform(0U, cast(uint) ceil(3.0L * lambda));
        double pVal = poissonCDF(k, lambda);
        assert(invPoissonCDF(pVal, lambda) == k);
    }
}

///
double binomialPMF(ulong k, ulong n, double p) {
    dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
    dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
    return exp(logNcomb(n, k) + k * log(p) + (n - k) * log(1 - p));
}

unittest {
    assert(approxEqual(binomialPMF(0, 10, .5), cast(double) 1/1024));
    assert(approxEqual(binomialPMF(100, 1000, .11), .024856));
}

// Determines what value of n we switch to normal approximation at b/c
// betaIncomplete becomes unstable.
private enum BINOM_APPROX = 1UL << 24;

// Cutoff value of n * p for deciding whether to go w/ normal or poisson approx
// when betaIncomplete becomes unstable.
private enum BINOM_POISSON = 1_024;

// betaIncomplete is numerically unstable for huge values of n.
// Luckily this is exactly when the normal approximation becomes
// for all practical purposes exact.
private double normApproxBinomCDF(double k, double n, double p)
in {
    assert(k <= n);
    assert(p >= 0 && p <= 1);
} do {
    double mu = p * n;
    double sd = sqrt( to!double(n) ) * sqrt(p) * sqrt(1 - p);
    double xCC = k + 0.5L;
    return normalCDF(xCC, mu, sd);
}

///P(K <= k) where K is random variable.
double binomialCDF(ulong k, ulong n, double p) {
    dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
    dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");

    if(k == n) {
        return 1;
    } else if(k == 0) {
        return pow(1.0 - p, cast(double) n);
    }

    if(n > BINOM_APPROX) {
        if(n * p < BINOM_POISSON) {
            return poissonCDF(k, n * p);
        } else if(n * (1 - p) < BINOM_POISSON) {
            return poissonCDFR(n - k, n * (1 - p));
        } else {
            return normApproxBinomCDF(k, n, p);
        }
    }

    return betaIncomplete(n - k, k + 1, 1.0 - p);
}

unittest {
    assert(approxEqual(binomialCDF(10, 100, .11), 0.4528744401));
    assert(approxEqual(binomialCDF(15, 100, .12), 0.8585510507));
    assert(approxEqual(binomialCDF(50, 1000, .04), 0.95093595));
    assert(approxEqual(binomialCDF(7600, 15000, .5), .9496193045414));
    assert(approxEqual(binomialCDF(0, 10, 0.2), 0.1073742));

    // Absurdly huge numbers:
    {
        ulong k = (1UL << 60) - 100_000_000;
        ulong n = 1UL << 61;
        assert(approxEqual(binomialCDF(k, n, 0.5L), 0.4476073));
    }

    // Test Poisson branch.
    double poisAns = binomialCDF(85, 1UL << 26, 1.49e-6);
    assert(approxEqual(poisAns, 0.07085327));

    // Test poissonCDFR branch.
    poisAns = binomialCDF( (1UL << 25) - 100, 1UL << 25, 0.9999975L);
    assert(approxEqual(poisAns, 0.04713316));

    // Make sure cutoff is reasonable: Just below it, we should get similar
    // results for normal, exact.
    for(ulong n = BINOM_APPROX / 2; n < BINOM_APPROX; n += 200_000) {
        for(double p = 0.01; p <= 0.99; p += 0.05) {

            long lowerK = roundTo!long( n * p * 0.99);
            long upperK = roundTo!long( n * p / 0.99);

            for(ulong k = lowerK; k <= min(n, upperK); k += 1_000) {
                double normRes = normApproxBinomCDF(k, n, p);
                double exactRes = binomialCDF(k, n, p);
                assert(abs(normRes - exactRes) < 0.001,
                    text(normRes, '\t', exactRes));
            }
        }
    }

}

// betaIncomplete is numerically unstable for huge values of n.
// Luckily this is exactly when the normal approximation becomes
// for all practical purposes exact.
private double normApproxBinomCDFR(ulong k, ulong n, double p)
in {
    assert(k <= n);
    assert(p >= 0 && p <= 1);
} do {
    double mu = p * n;
    double sd = sqrt( to!double(n) ) * sqrt(p) * sqrt(1 - p);
    double xCC = k - 0.5L;
    return normalCDFR(xCC, mu, sd);
}

///P(K >= k) where K is random variable.
double binomialCDFR(ulong k, ulong n, double p) {
    dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
    dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");

    if(k == 0) {
        return 1;
    } else if(k == n) {
        return pow(p, cast(double) n);
    }

    if(n > BINOM_APPROX) {
        if(n * p < BINOM_POISSON) {
            return poissonCDFR(k, n * p);
        } else if(n * (1 - p) < BINOM_POISSON) {
            return poissonCDF(n - k, n * (1 - p));
        } else {
            return normApproxBinomCDFR(k, n, p);
        }
    }

    return betaIncomplete(k, n - k + 1, p);
}

unittest {
    // Values from R, Maxima.
    assert(approxEqual(binomialCDF(10, 100, .11), 1 -
        binomialCDFR(11, 100, .11)));
    assert(approxEqual(binomialCDF(15, 100, .12), 1 -
        binomialCDFR(16, 100, .12)));
    assert(approxEqual(binomialCDF(50, 1000, .04), 1 -
        binomialCDFR(51, 1000, .04)));
    assert(approxEqual(binomialCDF(7600, 15000, .5), 1 -
        binomialCDFR(7601, 15000, .5)));
    assert(approxEqual(binomialCDF(9, 10, 0.3), 1 -
        binomialCDFR(10, 10, 0.3)));

    // Absurdly huge numbers, test normal branch.
    {
        ulong k = (1UL << 60) - 100_000_000;
        ulong n = 1UL << 61;
        assert(approxEqual(binomialCDFR(k, n, 0.5L), 0.5523927));
    }

    // Test Poisson inversion branch.
    double poisRes = binomialCDFR((1UL << 25) - 70, 1UL << 25, 0.9999975L);
    assert(approxEqual(poisRes, 0.06883905));

    // Test Poisson branch.
    poisRes = binomialCDFR(350, 1UL << 25, 1e-5);
    assert(approxEqual(poisRes, 0.2219235));

    // Make sure cutoff is reasonable: Just below it, we should get similar
    // results for normal, exact.
    for(ulong n = BINOM_APPROX / 2; n < BINOM_APPROX; n += 200_000) {
        for(double p = 0.01; p <= 0.99; p += 0.05) {

            long lowerK = roundTo!long( n * p * 0.99);
            long upperK = roundTo!long( n * p / 0.99);

            for(ulong k = lowerK; k <= min(n, upperK); k += 1_000) {
                double normRes = normApproxBinomCDFR(k, n, p);
                double exactRes = binomialCDFR(k, n, p);
                assert(abs(normRes - exactRes) < 0.001,
                    text(normRes, '\t', exactRes));
            }
        }
    }
}

/**Returns the value of k for the given p-value, n and p. If p-value does
 * not exactly map to a value of k, the value for which binomialCDF(k, n, p)
 * is closest to pVal is used.*/
uint invBinomialCDF(double pVal, uint n, double p) {
    dstatsEnforce(pVal >= 0 && pVal <= 1, "p-values must be between 0, 1.");
    dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");

    // Use normal approximation to get approx answer, then brute force search.
    // This works better than you think because for small n, there's not much
    // search space and for large n, the normal approx. is really good.
    uint guess = cast(uint) max(round(
        invNormalCDF(pVal, n * p, sqrt(n * p * (1 - p)))) + 0.5, 0);
    if(guess > n) {
        if(pVal < 0.5)  // Numerical issues/overflow.
            guess = 0;
        else guess = n;
    }
    double guessP = binomialCDF(guess, n, p);

    if(guessP == pVal) {
        return guess;
    } else if(guessP < pVal) {
        for(uint k = guess + 1; k <= n; k++) {
            double newP = guessP + binomialPMF(k, n, p);
            if(abs(newP - pVal) > abs(guessP - pVal)) {
                return k - 1;
            } else {
                guessP = newP;
            }
        }
        return n;
    } else {
        for(uint k = guess - 1; k != uint.max; k--) {
            double newP = guessP - binomialPMF(k + 1, n, p);
            if(abs(newP - pVal) > abs(guessP - pVal)) {
                return k + 1;
            } else {
                guessP = newP;
            }
        }
        return 0;
    }
}

unittest {
    Random gen = Random(unpredictableSeed);
    foreach(i; 0..1_000) {
        // Restricted variable ranges are because, in the tails, more than one
        // value of k can map to the same p-value at machine precision.
        // Obviously, this is one of those corner cases that nothing can be
        // done about. Using small n's, moderate p's prevents this.
        uint n = uniform(5U, 10U);
        uint k = uniform(0U, n);
        double p = uniform(0.1L, 0.9L);
        double pVal = binomialCDF(k, n, p);
        assert(invBinomialCDF(pVal, n, p) == k);
    }
}

///
double hypergeometricPMF(long x, long n1, long n2, long n)
in {
    assert(x <= n);
} do {
    if(x > n1 || x < (n - n2)) {
        return 0;
    }
    double result = logNcomb(n1, x) + logNcomb(n2, n - x) - logNcomb(n1 + n2, n);
    return exp(result);
}

unittest {
    assert(approxEqual(hypergeometricPMF(5, 10, 10, 10), .3437182));
    assert(approxEqual(hypergeometricPMF(9, 12, 10, 15), .27089783));
    assert(approxEqual(hypergeometricPMF(9, 100, 100, 15), .15500003));
}

/**P(X <= x), where X is random variable. Uses either direct summation,
 * normal or binomial approximation depending on parameters.*/
// If anyone knows a better algorithm for this, feel free...
// I've read a decent amount about it, though, and getting hypergeometric
// CDFs that are both accurate and fast is just plain hard. This
// implementation attempts to strike a balance between the two, so that
// both speed and accuracy are "good enough" for most practical purposes.
double hypergeometricCDF(long x, long n1, long n2, long n) {
    dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
    dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
    dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");

    ulong expec = (n1 * n) / (n1 + n2);
    long nComp = n1 + n2 - n, xComp = n2 + x - n;

    // Try to reduce number of calculations using identities.
    if(x >= n1 || x == n) {
        return 1;
    } else if(x > expec && x > n / 2) {
        return 1 - hypergeometricCDF(n - x - 1, n2, n1, n);
    } else if(xComp < x && xComp > 0) {
        return hypergeometricCDF(xComp, n2, n1, nComp);
    }

    // Speed depends on x mostly, so always use exact for small x.
    if(x <= 100) {
        return hyperExact(x, n1, n2, n);
    }

    // Determine whether to use exact, normal approx or binomial approx.
    // Using obviously arbitrary but relatively stringent standards
    // for determining whether to approximate.
    enum NEXACT = 50L;

    double p = cast(double) n1 / (n1 + n2);
    double pComp = cast(double) n2 / (n1 + n2);
    double pMin = min(p, pComp);
    if(min(n, nComp) * pMin >= 100) {
        // Since high relative error in the lower tail is a significant problem,
        // this is a hack to improve the normal approximation: Use the normal
        // approximation, except calculate the last NEXACT elements exactly,
        // since elements around the e.v. are where absolute error is highest.
        // For large x, gives most of the accuracy of an exact calculation in
        // only a small fraction of the time.
        if(x <= expec + NEXACT / 2) {
            return min(1, normApproxHyper(x - NEXACT, n1, n2, n) +
                hyperExact(x, n1, n2, n, x - NEXACT + 1));
        } else {
            // Just use plain old normal approx. Since P is large, the
            // relative error won't be so bad anyhow.
            return normApproxHyper(x, n1, n2, n);
        }
    }
    // Try to make n as small as possible by applying mathematically equivalent
    // transformations so that binomial approx. works as well as possible.
    ulong bSc1 = (n1 + n2) / n, bSc2 = (n1 + n2) / n1;

    if(bSc1 >= 50 && bSc1 > bSc2) {
        // Same hack as normal approximation for rel. acc. in lower tail.
        if(x <= expec + NEXACT / 2) {
            return min(1, binomialCDF(cast(uint) (x - NEXACT), cast(uint) n, p) +
                hyperExact(x, n1, n2, n, x - NEXACT + 1));
        } else {
            return binomialCDF(cast(uint) x, cast(uint) n, p);
        }
    } else if(bSc2 >= 50 && bSc2 > bSc1) {
        double p2 = cast(double) n / (n1 + n2);
        if(x <= expec + NEXACT / 2) {
            return min(1, binomialCDF(cast(uint) (x - NEXACT), cast(uint) n1, p2) +
                hyperExact(x, n1, n2, n, x - NEXACT + 1));
        } else {
            return binomialCDF(cast(uint) x, cast(uint) n1, p2);
        }
    } else {
        return hyperExact(x, n1, n2, n);
    }
}

unittest {
    // Values from R and the Maxima CAS.
    // Test exact branch, including reversing, complementing.
    assert(approxEqual(hypergeometricCDF(5, 10, 10, 10), 0.6718591));
    assert(approxEqual(hypergeometricCDF(3, 11, 15, 10), 0.27745322));
    assert(approxEqual(hypergeometricCDF(18, 27, 31, 35), 0.88271714));
    assert(approxEqual(hypergeometricCDF(21, 29, 31, 35), 0.99229253));

    // Normal branch.
    assert(approxEqual(hypergeometricCDF(501, 2000, 1000, 800), 0.002767073));
    assert(approxEqual(hypergeometricCDF(565, 2000, 1000, 800), 0.9977068));
    assert(approxEqual(hypergeometricCDF(2700, 10000, 20000, 8000), 0.825652));

    // Binomial branch. One for each transformation.
    assert(approxEqual(hypergeometricCDF(110, 5000, 7000, 239), 0.9255627));
    assert(approxEqual(hypergeometricCDF(19840, 2950998, 12624, 19933), 0.2020618));
    assert(approxEqual(hypergeometricCDF(130, 24195, 52354, 295), 0.9999973));
    assert(approxEqual(hypergeometricCDF(103, 901, 49014, 3522), 0.999999));
}

///P(X >= x), where X is random variable.
double hypergeometricCDFR(ulong x, ulong n1, ulong n2, ulong n) {
    dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
    dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
    dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");

    return hypergeometricCDF(n - x, n2, n1, n);
}

unittest {
    //Reverses n1, n2 and subtracts x from n to get mirror image.
    assert(approxEqual(hypergeometricCDF(5,10,10,10),
        hypergeometricCDFR(5,10,10,10)));
    assert(approxEqual(hypergeometricCDF(3, 11, 15, 10),
        hypergeometricCDFR(7, 15, 11, 10)));
    assert(approxEqual(hypergeometricCDF(18, 27, 31, 35),
        hypergeometricCDFR(17, 31, 27, 35)));
    assert(approxEqual(hypergeometricCDF(21, 29, 31, 35),
        hypergeometricCDFR(14, 31, 29, 35)));
}
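
/* Usage sketch (illustrative only):  a common use of the hypergeometric CDFs
 * is a one-sided Fisher's exact test on a 2x2 contingency table
 * [[a, b], [c, d]].  The p-value for over-representation is P(X >= a) where
 * X is hypergeometric with n1 = a + b, n2 = c + d and n = a + c.  This only
 * illustrates the parameter convention; it is not a full Fisher's exact test.
 * The expected value 17/70 is (C(4,3)*C(4,1) + C(4,4)*C(4,0)) / C(8,4),
 * computed by hand.
 */
unittest {
    // Table:  [[3, 1],
    //          [1, 3]]
    ulong a = 3, b = 1, c = 1, d = 3;
    double oneSidedP = hypergeometricCDFR(a, a + b, c + d, a + c);
    assert(approxEqual(oneSidedP, 17.0 / 70.0));
}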

double hyperExact(ulong x, ulong n1, ulong n2, ulong n, ulong startAt = 0) {
    dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
    dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
    dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");

    immutable double constPart = logFactorial(n1) + logFactorial(n2) +
        logFactorial(n) + logFactorial(n1 + n2 - n) - logFactorial(n1 + n2);
    double sum = 0;
    for(ulong i = x; i != startAt - 1; i--) {
        double oldSum = sum;
        sum += exp(constPart - logFactorial(i) - logFactorial(n1 - i) -
            logFactorial(n2 + i - n) - logFactorial(n - i));
        if(isIdentical(sum, oldSum)) {  // At full machine precision.
            break;
        }
    }
    return sum;
}

double normApproxHyper(ulong x, ulong n1, ulong n2, ulong n) {
    double p1 = cast(double) n1 / (n1 + n2);
    double p2 = cast(double) n2 / (n1 + n2);
    double numer = x + 0.5L - n * p1;
    double denom = sqrt(n * p1 * p2 * (n1 + n2 - n) / (n1 + n2 - 1));
    return normalCDF(numer / denom);
}

// Aliases for old names. Not documented because new names should be used.
deprecated {
    alias chiSquareCDF chiSqrCDF;
    alias chiSquareCDFR chiSqrCDFR;
    alias invChiSquareCDFR invChiSqCDFR;
}

///
double chiSquarePDF(double x, double v) {
    dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
    dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");

    // Calculate in log space for stability.
    immutable logX = log(x);
    immutable numerator = logX * (0.5 * v - 1) - 0.5 * x;
    immutable denominator = LN2 * (0.5 * v) + logGamma(0.5 * v);
    return exp(numerator - denominator);
}

unittest {
    assert( approxEqual(chiSquarePDF(1, 2), 0.3032653));
    assert( approxEqual(chiSquarePDF(2, 1), 0.1037769));
}

/**
 * $(POWER χ,2) distribution function and its complement.
 *
 * Returns the area under the left hand tail (from 0 to x)
 * of the Chi square probability density function with
 * v degrees of freedom. The complement returns the area under
 * the right hand tail (from x to ∞).
 *
 * chiSquareCDF(x | v) = ($(INTEGRATE 0, x)
 * $(POWER t, v/2-1) $(POWER e, -t/2) dt )
 * / $(POWER 2, v/2) $(GAMMA)(v/2)
 *
 * chiSquareCDFR(x | v) = ($(INTEGRATE x, ∞)
 * $(POWER t, v/2-1) $(POWER e, -t/2) dt )
 * / $(POWER 2, v/2) $(GAMMA)(v/2)
 *
 * Params:
 *   v = degrees of freedom. Must be positive.
 *   x = the $(POWER χ,2) variable. Must be positive.
 *
 */
double chiSquareCDF(double x, double v) {
    dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
    dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");

    // These are very common special cases where we can make the calculation
    // a lot faster and/or more accurate.
    if(v == 1) {
        // Then it's the square of a normal(0, 1).
        return 1.0L - erfc(sqrt(x) * SQRT1_2);
    } else if(v == 2) {
        // Then it's an exponential w/ lambda == 1/2.
        return 1.0L - exp(-0.5 * x);
    } else {
        return gammaIncomplete(0.5 * v, 0.5 * x);
    }
}

///
double chiSquareCDFR(double x, double v) {
    dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
    dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");

    // These are very common special cases where we can make the calculation
    // a lot faster and/or more accurate.
    if(v == 1) {
        // Then it's the square of a normal(0, 1).
        return erfc(sqrt(x) * SQRT1_2);
    } else if(v == 2) {
        // Then it's an exponential w/ lambda == 1/2.
        return exp(-0.5 * x);
    } else {
        return gammaIncompleteCompl(0.5 * v, 0.5 * x);
    }
}
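
/* The v == 1 and v == 2 special cases above follow from two standard facts:
 * a chi-square variable with 1 degree of freedom is the square of a standard
 * normal, so P(X <= x) = 2 * normalCDF(sqrt(x)) - 1, and a chi-square with
 * 2 degrees of freedom is an exponential with lambda = 1/2.  An illustrative
 * check of both identities against the general functions in this module:
 */
unittest {
    foreach(x; [0.1, 0.5, 1.0, 2.5, 7.0]) {
        // v == 1:  square of a standard normal.
        assert(approxEqual(chiSquareCDF(x, 1), 2 * normalCDF(sqrt(x)) - 1));

        // v == 2:  exponential with lambda == 0.5.
        assert(approxEqual(chiSquareCDF(x, 2), exponentialCDF(x, 0.5)));
        assert(approxEqual(chiSquareCDFR(x, 2), exponentialCDFR(x, 0.5)));
    }
}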

/**
 * Inverse of complemented $(POWER χ, 2) distribution
 *
 * Finds the $(POWER χ, 2) argument x such that the integral
 * from x to ∞ of the $(POWER χ, 2) density is equal
 * to the given cumulative probability p.
 *
 * Params:
 *   p = Cumulative probability. 0 <= p <= 1.
 *   v = Degrees of freedom. Must be positive.
 *
 */
double invChiSquareCDFR(double v, double p) {
    dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
    dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
    return 2.0 * gammaIncompleteComplInverse( 0.5*v, p);
}

unittest {
    assert(feqrel(chiSquareCDFR(invChiSquareCDFR(3.5, 0.1), 3.5), 0.1) >= double.mant_dig - 3);
    assert(approxEqual(
        chiSquareCDF(0.4L, 19.02L) + chiSquareCDFR(0.4L, 19.02L), 1.0L));
    assert(ae( invChiSquareCDFR( 3, chiSquareCDFR(1, 3)), 1));

    assert(ae(chiSquareCDFR(0.2, 1), 0.6547208));
    assert(ae(chiSquareCDFR(0.2, 2), 0.9048374));
    assert(ae(chiSquareCDFR(0.8, 1), 0.3710934));
    assert(ae(chiSquareCDFR(0.8, 2), 0.67032));

    assert(ae(chiSquareCDF(0.2, 1), 0.3452792));
    assert(ae(chiSquareCDF(0.2, 2), 0.09516258));
    assert(ae(chiSquareCDF(0.8, 1), 0.6289066));
    assert(ae(chiSquareCDF(0.8, 2), 0.3296800));
}

///
double normalPDF(double x, double mean = 0, double sd = 1) {
    dstatsEnforce(sd > 0, "Standard deviation must be > 0 for normal distribution.");
    double dev = x - mean;
    return exp(-(dev * dev) / (2 * sd * sd)) / (sd * SQ2PI);
}

unittest {
    assert(approxEqual(normalPDF(3, 1, 2), 0.1209854));
}

///P(X < x) for normal distribution where X is random var.
double normalCDF(double x, double mean = 0, double stdev = 1) {
    dstatsEnforce(stdev > 0, "Standard deviation must be > 0 for normal distribution.");

    // Using a slightly non-obvious implementation in terms of erfc because
    // it seems more accurate than erf for very small values of Z.

    double Z = (-x + mean) / stdev;
    return erfc(Z * SQRT1_2) / 2;
}

unittest {
    assert(approxEqual(normalCDF(2), .9772498));
    assert(approxEqual(normalCDF(-2), .02275013));
    assert(approxEqual(normalCDF(1.3), .90319951));
}

///P(X > x) for normal distribution where X is random var.
double normalCDFR(double x, double mean = 0, double stdev = 1) {
    dstatsEnforce(stdev > 0, "Standard deviation must be > 0 for normal distribution.");

    double Z = (x - mean) / stdev;
    return erfc(Z * SQRT1_2) / 2;
}

unittest {
    //Should be essentially a mirror image of normalCDF.
    for(double i = -8; i < 8; i += .1) {
        assert(approxEqual(normalCDF(i), normalCDFR(-i)));
    }
}

private enum SQRT2PI = 0x1.40d931ff62705966p+1;  // 2.5066282746310005024
private enum EXP_2 = 0.13533528323661269189L;    /* exp(-2) */

/******************************
 * Inverse of Normal distribution function
 *
 * Returns the argument, x, for which the area under the
 * Normal probability density function (integrated from
 * minus infinity to x) is equal to p.
 */
double invNormalCDF(double p, double mean = 0, double sd = 1) {
    dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
    dstatsEnforce(sd > 0, "Standard deviation must be > 0 for normal distribution.");

    return normalDistributionInverse(p) * sd + mean;
}


unittest {
    // The values below are from Excel 2003.
    assert(fabs(invNormalCDF(0.001) - (-3.09023230616779)) < 0.00000000000005);
    assert(fabs(invNormalCDF(1e-50) - (-14.9333375347885)) < 0.00000000000005);
    assert(feqrel(invNormalCDF(0.999), -invNormalCDF(0.001)) > double.mant_dig - 6);

    // Excel 2003 gets all the following values wrong!
    assert(invNormalCDF(0.0) == -double.infinity);
    assert(invNormalCDF(1.0) == double.infinity);
    assert(invNormalCDF(0.5) == 0);

    // I don't know the correct result for low values
    // (Excel 2003 returns norminv(p) = -30 for all p < 1e-200).
    // The value tested here is the one the function returned in Jan 2006.
    double unknown1 = invNormalCDF(1e-250L);
    assert( fabs(unknown1 - (-33.79958617269L) ) < 0.00000005);

    Random gen;
    gen.seed(unpredictableSeed);
    // normalCDF function trivial given ERF, unlikely to contain subtle bugs.
    // Just make sure invNormalCDF works like it should as the inverse.
    foreach(i; 0..1000) {
        double x = uniform(0.0L, 1.0L);
        double mean = uniform(0.0L, 100.0L);
        double sd = uniform(1.0L, 3.0L);
        double inv = invNormalCDF(x, mean, sd);
        double rec = normalCDF(inv, mean, sd);
        assert(approxEqual(x, rec));
    }
}

///
double logNormalPDF(double x, double mu = 0, double sigma = 1) {
    dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");

    immutable mulTerm = 1.0L / (x * sigma * SQ2PI);
    double expTerm = log(x) - mu;
    expTerm *= expTerm;
    expTerm /= 2 * sigma * sigma;
    return mulTerm * exp(-expTerm);
}

unittest {
    // Values from R.
    assert(approxEqual(logNormalPDF(1, 0, 1), 0.3989423));
    assert(approxEqual(logNormalPDF(2, 2, 3), 0.06047173));
}

///
double logNormalCDF(double x, double mu = 0, double sigma = 1) {
    dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");

    return 0.5L + 0.5L * erf((log(x) - mu) / (sigma * SQRT2));
}

unittest {
    assert(approxEqual(logNormalCDF(4), 0.9171715));
    assert(approxEqual(logNormalCDF(1, -2, 3), 0.7475075));
}

///
double logNormalCDFR(double x, double mu = 0, double sigma = 1) {
    dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");

    return 0.5L - 0.5L * erf((log(x) - mu) / (sigma * SQRT2));
}

unittest {
    assert(approxEqual(logNormalCDF(4) + logNormalCDFR(4), 1));
    assert(approxEqual(logNormalCDF(1, -2, 3) + logNormalCDFR(1, -2, 3), 1));
}

///
double weibullPDF(double x, double shape, double scale = 1) {
    dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
    dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");

    if(x < 0) {
        return 0;
    }
    double ret = pow(x / scale, shape - 1) * exp( -pow(x / scale, shape));
    return ret * (shape / scale);
}

unittest {
    assert(approxEqual(weibullPDF(2,1,3), 0.1711390));
}

///
double weibullCDF(double x, double shape, double scale = 1) {
    dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
    dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");

    double exponent = pow(x / scale, shape);
    return 1 - exp(-exponent);
}

unittest {
    assert(approxEqual(weibullCDF(2, 3, 4), 0.1175031));
}

///
double weibullCDFR(double x, double shape, double scale = 1) {
    dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
    dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");

    double exponent = pow(x / scale, shape);
    return exp(-exponent);
}

unittest {
    assert(approxEqual(weibullCDF(2, 3, 4) + weibullCDFR(2, 3, 4), 1));
}

// For K-S tests in dstats.random. Todo: Flesh out.
double waldCDF(double x, double mu, double lambda) {
    double sqr = sqrt(lambda / (2 * x));
    double term1 = 1 + erf(sqr * (x / mu - 1));
    double term2 = exp(2 * lambda / mu);
    double term3 = 1 - erf(sqr * (x / mu + 1));
    return 0.5L * term1 + 0.5L * term2 * term3;
}

// ditto.
double rayleighCDF(double x, double mode) {
    return 1.0L - exp(-x * x / (2 * mode * mode));
}

///
double studentsTPDF(double t, double df) {
    dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");

    immutable logPart = logGamma(0.5 * df + 0.5) - logGamma(0.5 * df);
    immutable term1 = exp(logPart) / sqrt(df * PI);
    immutable term2 = (1.0 + t / df * t) ^^ (-0.5 * df - 0.5);
    return term1 * term2;
}

///
double studentsTCDF(double t, double df) {
    dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");

    double x = (t + sqrt(t * t + df)) / (2 * sqrt(t * t + df));
    return betaIncomplete(df * 0.5L, df * 0.5L, x);
}

///
double studentsTCDFR(double t, double df) {
    return studentsTCDF(-t, df);
}

unittest {
    assert(approxEqual(studentsTPDF(1, 1), 0.1591549));
    assert(approxEqual(studentsTPDF(3, 10), 0.0114055));
    assert(approxEqual(studentsTPDF(-4, 5), 0.005123727));

    assert(approxEqual(studentsTCDF(1, 1), 0.75));
    assert(approxEqual(studentsTCDF(1.061, 2), 0.8));
    assert(approxEqual(studentsTCDF(5.959, 5), 0.9995));
    assert(approxEqual(studentsTCDF(.667, 20), 0.75));
    assert(approxEqual(studentsTCDF(2.353, 3), 0.95));
}

/******************************************
 * Inverse of Student's t distribution
 *
 * Given probability p and degrees of freedom df,
 * finds the argument t such that the one-sided
 * studentsTCDF(t, df) is equal to p.
 * Used to test whether two distributions have the same
 * standard deviation.
 *
 * Params:
 *   df = degrees of freedom. Must be >1
 *   p = probability. 0 < p < 1
 */
double invStudentsTCDF(double p, double df) {
    dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
    dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");

    if (p == 0) return -double.infinity;
    if (p == 1) return double.infinity;

    double rk, z;
    rk = df;

    if ( p > 0.25L && p < 0.75L ) {
        if ( p == 0.5L ) return 0;
        z = 1.0L - 2.0L * p;
        z = betaIncompleteInverse( 0.5L, 0.5L*rk, fabs(z) );
        double t = sqrt( rk*z/(1.0L-z) );
        if( p < 0.5L )
            t = -t;
        return t;
    }
    int rflg = -1;  // sign of the result
    if (p >= 0.5L) {
        p = 1.0L - p;
        rflg = 1;
    }
    z = betaIncompleteInverse( 0.5L*rk, 0.5L, 2.0L*p );

    if (z < 0) return rflg * double.infinity;
    return rflg * sqrt( rk/z - rk );
}
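
/* Usage sketch (illustrative only):  a two-sided p-value for an observed t
 * statistic is twice the upper tail, and the critical value for a 95%
 * two-sided confidence interval is the 0.975 quantile.  The reference values
 * below are standard t-table entries (t_{0.975, 10} is approximately 2.228).
 */
unittest {
    // Two-sided p-value for t = 2.353 with 3 degrees of freedom.
    double twoSidedP = 2 * studentsTCDFR(2.353, 3);
    assert(approxEqual(twoSidedP, 0.10));

    // Critical value for a 95% two-sided interval with 10 degrees of freedom.
    assert(approxEqual(invStudentsTCDF(0.975, 10), 2.228));

    // Round trip.
    assert(approxEqual(studentsTCDF(invStudentsTCDF(0.975, 10), 10), 0.975));
}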

unittest {
    // The remaining values listed here are from Excel, and are unlikely to be accurate
    // in the last decimal places. However, they are helpful as a sanity check.

    // Microsoft Excel 2003 gives TINV(2*(1-0.995), 10) == 3.16927267160917
    assert(approxEqual(invStudentsTCDF(0.995, 10), 3.169_272_67L));
    assert(approxEqual(invStudentsTCDF(0.6, 8), 0.261_921_096_769_043L));
    assert(approxEqual(invStudentsTCDF(0.4, 18), -0.257_123_042_655_869L));
    assert(approxEqual(studentsTCDF(invStudentsTCDF(0.4L, 18), 18), .4L));
    assert(approxEqual(studentsTCDF( invStudentsTCDF(0.9L, 11), 11), 0.9L));
}

/**
 * The Fisher distribution, its complement, and inverse.
 *
 * The F density function (also known as Snedecor's density or the
 * variance ratio density) is the density
 * of x = (u1/df1)/(u2/df2), where u1 and u2 are random
 * variables having $(POWER χ,2) distributions with df1
 * and df2 degrees of freedom, respectively.
 *
 * fisherCDF returns the area from zero to x under the F density
 * function. The complementary function,
 * fisherCDFR, returns the area from x to ∞ under the F density function.
 *
 * The inverse of the complemented Fisher distribution,
 * invFisherCDFR, finds the argument x such that the integral
 * from x to infinity of the F density is equal to the given probability y.
 *
 * Params:
 *   df1 = Degrees of freedom of the first variable. Must be >= 1
 *   df2 = Degrees of freedom of the second variable. Must be >= 1
 *   x = Must be >= 0
 */
double fisherCDF(double x, double df1, double df2) {
    dstatsEnforce(df1 > 0 && df2 > 0,
        "Fisher distribution must have >0 degrees of freedom.");
    dstatsEnforce(x >= 0, "x must be >=0 for Fisher distribution.");

    double a = cast(double)(df1);
    double b = cast(double)(df2);
    double w = a * x;
    w = w / (b + w);
    return betaIncomplete(0.5L*a, 0.5L*b, w);
}

/** ditto */
double fisherCDFR(double x, double df1, double df2) {
    dstatsEnforce(df1 > 0 && df2 > 0,
        "Fisher distribution must have >0 degrees of freedom.");
    dstatsEnforce(x >= 0, "x must be >=0 for Fisher distribution.");

    double a = cast(double)(df1);
    double b = cast(double)(df2);
    double w = b / (b + a * x);
    return betaIncomplete( 0.5L*b, 0.5L*a, w );
}
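
/* Usage sketch (illustrative only):  the right-tail area fisherCDFR(F, df1, df2)
 * is the p-value for an observed F statistic, e.g. from an ANOVA or a variance
 * ratio test.  A handy identity for checking it:  if T has a Student's t
 * distribution with df degrees of freedom, then T^2 is F(1, df), so the
 * two-sided t p-value equals the right tail of F at t^2.
 */
unittest {
    immutable t = 2.1, df = 12.0;
    assert(approxEqual(fisherCDFR(t * t, 1, df), 2 * studentsTCDFR(t, df)));
    assert(approxEqual(fisherCDF(t * t, 1, df) + fisherCDFR(t * t, 1, df), 1));
}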

/**
 * Inverse of complemented Fisher distribution
 *
 * Finds the F density argument x such that the integral
 * from x to infinity of the F density is equal to the
 * given probability p.
 *
 * This is accomplished using the inverse beta integral
 * function and the relations
 *
 *      z = betaIncompleteInverse( df2/2, df1/2, p ),
 *      x = df2 (1-z) / (df1 z).
 *
 * Note that the following relations hold for the inverse of
 * the uncomplemented F distribution:
 *
 *      z = betaIncompleteInverse( df1/2, df2/2, p ),
 *      x = df2 z / (df1 (1-z)).
 */
double invFisherCDFR(double df1, double df2, double p ) {
    dstatsEnforce(df1 > 0 && df2 > 0,
        "Fisher distribution must have >0 degrees of freedom.");
    dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");

    double a = df1;
    double b = df2;
    /* Compute probability for x = 0.5. */
    double w = betaIncomplete( 0.5L*b, 0.5L*a, 0.5L );
    /* If that is greater than p, then the solution w < .5.
       Otherwise, solve at 1-p to remove cancellation in (b - b*w). */
    if ( w > p || p < 0.001L) {
        w = betaIncompleteInverse( 0.5L*b, 0.5L*a, p );
        return (b - b*w)/(a*w);
    } else {
        w = betaIncompleteInverse( 0.5L*a, 0.5L*b, 1.0L - p );
        return b*w/(a*(1.0L-w));
    }
}

unittest {
    // fDistCompl(df1, df2, x) = Excel's FDIST(x, df1, df2)
    assert(fabs(fisherCDFR(16.5, 6, 4) - 0.00858719177897249L) < 0.0000000000005L);
    assert(fabs((1 - fisherCDF(0.1, 12, 23)) - 0.99990562845505L) < 0.0000000000005L);
    assert(fabs(invFisherCDFR(8, 34, 0.2) - 1.48267037661408L) < 0.0000000005L);
    assert(fabs(invFisherCDFR(4, 16, 0.008) - 5.043_537_593_48596L) < 0.0000000005L);
    // This one used to fail because of a bug in the definition of MINLOG.
    assert(approxEqual(fisherCDFR(invFisherCDFR(4, 16, 0.008), 4, 16), 0.008));
}

///
double negBinomPMF(ulong k, ulong n, double p) {
    dstatsEnforce(p >= 0 && p <= 1,
        "p must be between 0, 1 for negative binomial distribution.");

    return exp(logNcomb(k - 1 + n, k) + n * log(p) + k * log(1 - p));
}

unittest {
    // Values from R.
    assert(approxEqual(negBinomPMF(1, 8, 0.7), 0.1383552));
    assert(approxEqual(negBinomPMF(3, 2, 0.5), 0.125));
}


/**********************
 * Negative binomial distribution.
 *
 * Returns the sum of the terms 0 through k of the negative
 * binomial distribution:
 *
 * $(BIGSUM j=0, k) $(CHOOSE n+j-1, j) $(POWER p, n) $(POWER (1-p), j)
 *
 * (Note: MathWorld writes this as
 * $(BIGSUM j=0, k) $(CHOOSE n+j-1, j-1) $(POWER p, j) $(POWER (1-p), n);
 * the conventions differ.)
 *
 * In a sequence of Bernoulli trials, this is the probability
 * that k or fewer failures precede the n-th success.
 *
 * The arguments must be positive, with 0 < p < 1 and n > 0.
 *
 * The Geometric Distribution is a special case of the negative binomial
 * distribution.
 * -----------------------
 * geometricDistribution(k, p) = negativeBinomialDistribution(k, 1, p);
 * -----------------------
 * References:
 * $(LINK http://mathworld.wolfram.com/NegativeBinomialDistribution.html)
 */
double negBinomCDF(ulong k, ulong n, double p ) {
    dstatsEnforce(p >= 0 && p <= 1,
        "p must be between 0, 1 for negative binomial distribution.");
    if ( k == 0 ) return pow(p, cast(double) n);
    return betaIncomplete( n, k + 1, p );
}

unittest {
    // Values from R.
    assert(approxEqual(negBinomCDF(50, 50, 0.5), 0.5397946));
    assert(approxEqual(negBinomCDF(2, 1, 0.5), 0.875));
}

/**Probability that k or more failures precede the nth success.*/
double negBinomCDFR(ulong k, ulong n, double p) {
    dstatsEnforce(p >= 0 && p <= 1,
        "p must be between 0, 1 for negative binomial distribution.");

    if(k == 0)
        return 1;
    return betaIncomplete(k, n, 1.0L - p);
}

unittest {
    assert(approxEqual(negBinomCDFR(10, 20, 0.5), 1 - negBinomCDF(9, 20, 0.5)));
}
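
/* The geometric distribution is the n == 1 special case mentioned in the
 * documentation for negBinomCDF above:  the probability that at most k
 * failures precede the first success is 1 - (1 - p)^(k + 1).  An illustrative
 * check against negBinomCDF:
 */
unittest {
    immutable p = 0.3;
    foreach(k; 0..10) {
        assert(approxEqual(negBinomCDF(k, 1, p), 1 - (1 - p) ^^ (k + 1)));
    }
}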

///
ulong invNegBinomCDF(double pVal, ulong n, double p) {
    dstatsEnforce(p >= 0 && p <= 1,
        "p must be between 0, 1 for negative binomial distribution.");
    dstatsEnforce(pVal >= 0 && pVal <= 1,
        "P-values must be between 0, 1.");

    // Normal or gamma approx, then adjust.
    double mean = n * (1 - p) / p;
    double var = n * (1 - p) / (p * p);
    double skew = (2 - p) / sqrt(n * (1 - p));
    double kk = 4.0L / (skew * skew);
    double theta = sqrt(var / kk);
    double offset = (kk * theta) - mean + 0.5L;
    ulong guess;

    // invGammaCDFR is very expensive, but worth it in cases where normal approx
    // would be the worst. Otherwise, use normal b/c it's *MUCH* cheaper to
    // calculate.
    if(skew > 1.5 && var > 1_048_576)
        guess = cast(long) max(round(
            invGammaCDFR(1 - pVal, 1 / theta, kk) - offset), 0.0L);
    else
        guess = cast(long) max(round(
            invNormalCDF(pVal, mean, sqrt(var)) + 0.5), 0.0L);

    // This is pretty arbitrary behavior, but I don't want to use exceptions
    // and it has to be handled as a special case.
    if(pVal > 1 - double.epsilon)
        return ulong.max;
    double guessP = negBinomCDF(guess, n, p);

    if(guessP == pVal) {
        return guess;
    } else if(guessP < pVal) {
        for(ulong k = guess + 1; ; k++) {
            double newP = guessP + negBinomPMF(k, n, p);
            // Test for aliasing.
            if(newP == guessP)
                return k - 1;
            if(abs(pVal - newP) > abs(guessP - pVal)) {
                return k - 1;
            } else if(newP >= 1) {
                return k;
            } else {
                guessP = newP;
            }
        }
    } else {
        for(ulong k = guess - 1; k != ulong.max; k--) {
            double newP = guessP - negBinomPMF(k + 1, n, p);
            // Test for aliasing.
            if(newP == guessP)
                return k + 1;
            if(abs(newP - pVal) > abs(guessP - pVal)) {
                return k + 1;
            } else {
                guessP = newP;
            }
        }
        return 0;
    }
}

unittest {
    Random gen = Random(unpredictableSeed);
    uint nSkipped;
    foreach(i; 0..1000) {
        uint n = uniform(1u, 10u);
        double p = uniform(0.0L, 1L);
        uint k = uniform(0, 20);
        double pVal = negBinomCDF(k, n, p);

        // In extreme tails, p-values can alias, giving incorrect results.
        // This is a corner case that nothing can be done about. Just skip
        // these.
        if(pVal >= 1 - 10 * double.epsilon) {
            nSkipped++;
            continue;
        }
        assert(invNegBinomCDF(pVal, n, p) == k);
    }
}

///
double exponentialPDF(double x, double lambda) {
    dstatsEnforce(x >= 0, "x must be >= 0 in exponential distribution");
    dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");

    return lambda * exp(-lambda * x);
}

///
double exponentialCDF(double x, double lambda) {
    dstatsEnforce(x >= 0, "x must be >= 0 in exponential distribution");
    dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");

    return 1.0 - exp(-lambda * x);
}

///
double exponentialCDFR(double x, double lambda) {
    dstatsEnforce(x >= 0, "x must be >= 0 in exponential distribution");
    dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");

    return exp(-lambda * x);
}

///
double invExponentialCDF(double p, double lambda) {
    dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in exponential distribution");
    dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
    return log(-1.0 / (p - 1.0)) / lambda;
}

unittest {
    // Values from R.
    assert(approxEqual(exponentialPDF(0.75, 3), 0.3161977));
    assert(approxEqual(exponentialCDF(0.75, 3), 0.8946008));
    assert(approxEqual(exponentialCDFR(0.75, 3), 0.1053992));

    assert(approxEqual(invExponentialCDF(0.8, 2), 0.804719));
    assert(approxEqual(invExponentialCDF(0.2, 7), 0.03187765));
}

///
double gammaPDF(double x, double rate, double shape) {
    dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
    dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
    dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");

    immutable scale = 1.0 / rate;
    immutable firstPart = x ^^ (shape - 1);
    immutable logNumer = -x / scale;
    immutable logDenom = logGamma(shape) + shape * log(scale);
    return firstPart * exp(logNumer - logDenom);
}

///
double gammaCDF(double x, double rate, double shape) {
    dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
    dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
    dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");

    return gammaIncomplete(shape, rate * x);
}

///
double gammaCDFR(double x, double rate, double shape) {
    dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
    dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
    dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");

    return gammaIncompleteCompl(shape, rate * x);
}

/**This just calls invGammaCDFR w/ 1 - p b/c invGammaCDFR is more accurate,
 * but this function is necessary for consistency.
 */
double invGammaCDF(double p, double rate, double shape) {
    return invGammaCDFR(1.0 - p, rate, shape);
}

///
double invGammaCDFR(double p, double rate, double shape) {
    dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in gamma distribution.");
    dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
    dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");

    double ratex = gammaIncompleteComplInverse(shape, p);
    return ratex / rate;
}

unittest {
    assert(approxEqual(gammaPDF(1, 2, 5), 0.1804470));
    assert(approxEqual(gammaPDF(0.5, 8, 4), 1.562935));
    assert(approxEqual(gammaPDF(3, 2, 7), 0.3212463));
    assert(approxEqual(gammaCDF(1, 2, 5), 0.05265302));
    assert(approxEqual(gammaCDFR(1, 2, 5), 0.947347));

    double inv = invGammaCDFR(0.78, 2, 1);
    assert(approxEqual(gammaCDFR(inv, 2, 1), 0.78));

    double inv2 = invGammaCDF(0.78, 2, 1);
    assert(approxEqual(gammaCDF(inv2, 2, 1), 0.78));
}

///
double betaPDF(double x, double alpha, double beta) {
    dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
    dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
    dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");

    return x ^^ (alpha - 1) * (1 - x) ^^ (beta - 1) /
        std.mathspecial.beta(alpha, beta);
}

///
double betaCDF(double x, double alpha, double beta) {
    dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
    dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
    dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");

    return std.mathspecial.betaIncomplete(alpha, beta, x);
}
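
/* The regularized incomplete beta function links the beta and binomial
 * distributions:  for 0 < k < n, P(Binomial(n, p) >= k) equals the beta CDF
 * of p with parameters (k, n - k + 1), which is essentially how binomialCDF
 * and binomialCDFR are implemented above for moderate n.  An illustrative
 * check of both directions of the identity:
 */
unittest {
    enum k = 3, n = 10;
    immutable p = 0.4;
    assert(approxEqual(binomialCDFR(k, n, p), betaCDF(p, k, n - k + 1)));
    assert(approxEqual(binomialCDF(k, n, p), betaCDFR(p, k + 1, n - k)));
}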

///
double betaCDFR(double x, double alpha, double beta) {
    dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
    dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
    dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");

    return std.mathspecial.betaIncomplete(beta, alpha, 1 - x);
}

///
double invBetaCDF(double p, double alpha, double beta) {
    dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
    dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
    dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 for beta distribution.");

    return std.mathspecial.betaIncompleteInverse(alpha, beta, p);
}

unittest {
    // Values from R.
    assert(approxEqual(betaPDF(0.3, 2, 3), 1.764));
    assert(approxEqual(betaPDF(0.78, 0.9, 4), 0.03518569));

    assert(approxEqual(betaCDF(0.3, 2, 3), 0.3483));
    assert(approxEqual(betaCDF(0.78, 0.9, 4), 0.9980752));

    assert(approxEqual(betaCDFR(0.3, 2, 3), 0.6517));
    assert(approxEqual(betaCDFR(0.78, 0.9, 4), 0.001924818));

    assert(approxEqual(invBetaCDF(0.3483, 2, 3), 0.3));
    assert(approxEqual(invBetaCDF(0.9980752, 0.9, 4), 0.78));
}

/**
The Dirichlet probability density.

Params:

x = An input range of observed values. All must be between [0, 1]. They
must also sum to 1, though this is not checked because small deviations from
this may result due to numerical error.

alpha = A forward range of parameters. This must have the same length as
x.
*/
double dirichletPDF(X, A)(X x, A alpha)
if(isInputRange!X && isForwardRange!A && is(ElementType!X : double) &&
   is(ElementType!A : double)) {

    // Evaluating the multinomial beta function = product(gamma(alpha_i)) over
    // gamma(sum(alpha)), in log space.
/**
The Dirichlet probability density.

Params:

x = An input range of observed values. All must be in the interval [0, 1].
They must also sum to 1, though this is not checked because small deviations
from 1 may occur due to numerical error.

alpha = A forward range of parameters. This must have the same length as
x.
*/
double dirichletPDF(X, A)(X x, A alpha)
if(isInputRange!X && isForwardRange!A && is(ElementType!X : double) &&
is(ElementType!A : double)) {

    // Evaluate the multinomial beta function, product(gamma(alpha_i)) /
    // gamma(sum(alpha)), in log space.
    double logNormalizer = 0;
    double sumAlpha = 0;

    foreach(a; alpha.save) {
        dstatsEnforce(a > 0, "All alpha values must be > 0 for Dirichlet distribution.");
        logNormalizer += logGamma(a);
        sumAlpha += a;
    }

    logNormalizer -= logGamma(sumAlpha);
    double sum = 0;
    foreach(xElem, a; lockstep(x, alpha)) {
        dstatsEnforce(xElem > 0, "All x values must be > 0 for Dirichlet distribution.");
        sum += log(xElem) * (a - 1);
    }

    sum -= logNormalizer;
    return exp(sum);
}

unittest {
    // Test against beta.
    assert(approxEqual(dirichletPDF([0.1, 0.9], [2, 3]), betaPDF(0.1, 2, 3)));

    // A few values from R's gregmisc package.
    assert(approxEqual(dirichletPDF([0.1, 0.2, 0.7], [4, 5, 6]), 1.356672));
    assert(approxEqual(dirichletPDF([0.8, 0.05, 0.15], [8, 5, 6]), 0.04390199));
}

///
double cauchyPDF(double X, double X0 = 0, double gamma = 1) {
    dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");

    double toSquare = (X - X0) / gamma;
    return 1.0L / (PI * gamma * (1 + toSquare * toSquare));
}

unittest {
    assert(approxEqual(cauchyPDF(5), 0.01224269));
    assert(approxEqual(cauchyPDF(2), 0.06366198));
}

///
double cauchyCDF(double X, double X0 = 0, double gamma = 1) {
    dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");

    return M_1_PI * atan((X - X0) / gamma) + 0.5L;
}

unittest {
    // Values from R.
    assert(approxEqual(cauchyCDF(-10), 0.03172552));
    assert(approxEqual(cauchyCDF(1), 0.75));
}

///
double cauchyCDFR(double X, double X0 = 0, double gamma = 1) {
    dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");

    return M_1_PI * atan((X0 - X) / gamma) + 0.5L;
}

unittest {
    // Values from R.
    assert(approxEqual(1 - cauchyCDFR(-10), 0.03172552));
    assert(approxEqual(1 - cauchyCDFR(1), 0.75));
}

///
double invCauchyCDF(double p, double X0 = 0, double gamma = 1) {
    dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
    dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");

    return X0 + gamma * tan(PI * (p - 0.5L));
}

unittest {
    // cauchyCDF already tested. Just make sure this is the inverse.
    assert(approxEqual(invCauchyCDF(cauchyCDF(.5)), .5));
    assert(approxEqual(invCauchyCDF(cauchyCDF(.99)), .99));
    assert(approxEqual(invCauchyCDF(cauchyCDF(.03)), .03));
}

// For K-S tests in dstats.random. To be fleshed out later. Intentionally
// lacking ddoc.
double logisticCDF(double x, double loc, double shape) {
    return 1.0L / (1 + exp(-(x - loc) / shape));
}

///
double laplacePDF(double x, double mu = 0, double b = 1) {
    dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");

    return (exp(-abs(x - mu) / b)) / (2 * b);
}

unittest {
    // Values from Maxima.
    assert(approxEqual(laplacePDF(3, 2, 1), 0.18393972058572));
    assert(approxEqual(laplacePDF(-8, 6, 7), 0.0096668059454723));
}
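// Added example (not part of the original module): the Cauchy, logistic, and
// Laplace distributions are all symmetric about their location parameter, so
// the CDF at the location is exactly one half, the Laplace density peaks at
// 1 / (2 * b), and invCauchyCDF(0.5) recovers the location. A minimal sketch
// with arbitrary parameter values.
unittest {
    assert(approxEqual(cauchyCDF(2.0, 2.0, 3.0), 0.5));
    assert(approxEqual(invCauchyCDF(0.5, 2.0, 3.0), 2.0));
    assert(approxEqual(logisticCDF(-1.0, -1.0, 4.0), 0.5));
    assert(approxEqual(laplacePDF(6.0, 6.0, 7.0), 1.0 / 14));
}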
///
double laplaceCDF(double X, double mu = 0, double b = 1) {
    dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");

    double diff = (X - mu);
    double sign = (diff > 0) ? 1 : -1;
    return 0.5L * (1 + sign * (1 - exp(-abs(diff) / b)));
}

unittest {
    // Values from Octave.
    assert(approxEqual(laplaceCDF(5), 0.9963));
    assert(approxEqual(laplaceCDF(-3.14), .021641));
    assert(approxEqual(laplaceCDF(0.012), 0.50596));
}

///
double laplaceCDFR(double X, double mu = 0, double b = 1) {
    dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");

    double diff = (mu - X);
    double sign = (diff > 0) ? 1 : -1;
    return 0.5L * (1 + sign * (1 - exp(-abs(diff) / b)));
}

unittest {
    // Values from Octave.
    assert(approxEqual(1 - laplaceCDFR(5), 0.9963));
    assert(approxEqual(1 - laplaceCDFR(-3.14), .021641));
    assert(approxEqual(1 - laplaceCDFR(0.012), 0.50596));
}

///
double invLaplaceCDF(double p, double mu = 0, double b = 1) {
    dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
    dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");

    double p05 = p - 0.5L;
    double sign = (p05 < 0) ? -1.0L : 1.0L;
    return mu - b * sign * log(1.0L - 2 * abs(p05));
}

unittest {
    assert(approxEqual(invLaplaceCDF(0.012), -3.7297));
    assert(approxEqual(invLaplaceCDF(0.82), 1.0217));
}

double kolmDist()(double x) {
    pragma(msg, "kolmDist is scheduled for deprecation. Please use " ~
        "kolmogorovDistrib instead.");

    return kolmogorovDistrib(x);
}

/**Kolmogorov distribution. Used in Kolmogorov-Smirnov testing.
 *
 * References: http://en.wikipedia.org/wiki/Kolmogorov-Smirnov
 */
double kolmogorovDistrib(immutable double x) {
    dstatsEnforce(x >= 0, "x must be >= 0 for Kolmogorov distribution.");

    if(x == 0) {
        // Handle as a special case. Otherwise we'd get NaN because of
        // division by zero.
        return 0;
    }

    double result = 0;
    double i = 1;
    immutable xSquared = x * x;
    while(true) {
        immutable delta = exp(-(2 * i - 1) * (2 * i - 1) * PI * PI / (8 * xSquared));
        i++;

        immutable oldResult = result;
        result += delta;
        if(isNaN(result) || oldResult == result) {
            break;
        }
    }

    result *= (sqrt(2 * PI) / x);
    return result;
}

unittest {
    assert(approxEqual(1 - kolmogorovDistrib(.75), 0.627167));
    assert(approxEqual(1 - kolmogorovDistrib(.5), 0.9639452436));
    assert(approxEqual(1 - kolmogorovDistrib(.9), 0.39273070));
    assert(approxEqual(1 - kolmogorovDistrib(1.2), 0.112249666));
}
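// Added example (not part of the original module): kolmogorovDistrib is a
// CDF, so it should increase monotonically from 0 toward 1. The comparison
// points reuse the arguments from the unittest above; the check at x = 4
// assumes only that essentially all of the distribution's mass lies below 4.
unittest {
    assert(kolmogorovDistrib(0.0) == 0);
    assert(kolmogorovDistrib(0.5) < kolmogorovDistrib(0.9));
    assert(kolmogorovDistrib(0.9) < kolmogorovDistrib(1.2));
    assert(approxEqual(kolmogorovDistrib(4.0), 1.0));
}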