1 /++ 2 This module contains algorithms for the $(LINK2 https://en.wikipedia.org/wiki/Hypergeometric_distribution, Hypergeometric Distribution). 3 4 There are multiple alternative parameterizations of the Hypergeometric Distribution. 5 The formulation in this module measures the number of draws (`k`) with a 6 specific feature in `n` total draws without replacement from a population of 7 size `N` such that `K` of these have the feature of interest. 8 9 Hypergeometric distribution functions can utilize different algorithms. The 10 default is `HypergeometricAlgo.direct`, which can be more time-consuming for 11 large values of the parameters. Additional algorithms are provided to the user 12 to choose the trade-off between running time and accuracy. 13 14 License: $(HTTP www.apache.org/licenses/LICENSE-2.0, Apache-2.0) 15 16 Authors: John Michael Hall 17 18 Copyright: 2023 Mir Stat Authors. 19 20 +/ 21 22 module mir.stat.distribution.hypergeometric; 23 24 import mir.bignum.fp: Fp; 25 import mir.internal.utility: isFloatingPoint; 26 27 /++ 28 Algorithms used to calculate hypergeometric distribution. 29 30 `HypergeometricAlgo.direct` can be more time-consuming for large values of the 31 parameters. Additional algorithms are provided to the user to choose the 32 trade-off between running time and accuracy. 33 +/ 34 enum HypergeometricAlgo { 35 /++ 36 Direct 37 +/ 38 direct, 39 /++ 40 Approximates hypergeometric distribution with binomial distribution. 41 +/ 42 approxBinomial, 43 /++ 44 Approximates hypergeometric distribution with poisson distribution (uses gamma approximation, except for inverse CDF). 45 +/ 46 approxPoisson, 47 /++ 48 Approximates hypergeometric distribution with normal distribution. 49 +/ 50 approxNormal, 51 /++ 52 Approximates hypergeometric distribution with normal distribution (including continuity correction). 53 +/ 54 approxNormalContinuityCorrection, 55 } 56 57 private 58 @safe pure @nogc nothrow 59 T hypergeometricPMFImpl(T, HypergeometricAlgo hypergeometricAlgo) 60 (const size_t k, const size_t N, const size_t K, const size_t n) 61 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.direct) 62 in (K <= N, "K must be less than or equal to N") 63 in (n <= N, "n must be less than or equal to N") 64 in (k <= n, "n - k must be greater than or equal to zero") 65 in (k <= K, "k must be less than or equal to K") 66 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 67 { 68 import mir.combinatorics: binomial; 69 return cast(T) (binomial(K, k) * binomial(N - K, n - k)) / binomial(N, n); 70 } 71 72 private 73 @safe pure @nogc nothrow 74 T hypergeometricPMFImpl(T, HypergeometricAlgo hypergeometricAlgo) 75 (const size_t k, const size_t N, const size_t K, const size_t n) 76 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.approxBinomial) 77 in (K <= N, "K must be less than or equal to N") 78 in (n <= N, "n must be less than or equal to N") 79 in (k <= n, "n - k must be greater than or equal to zero") 80 in (k <= K, "k must be less than or equal to K") 81 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 82 { 83 import mir.stat.distribution.binomial: binomialPMF; 84 85 return binomialPMF(k, n, cast(T) K / N); 86 } 87 88 private 89 @safe pure @nogc nothrow 90 T hypergeometricPMFImpl(T, HypergeometricAlgo hypergeometricAlgo) 91 (const size_t k, const size_t N, const size_t K, const size_t n) 92 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.approxPoisson) 93 in (K <= N, "K must be less than or equal to N") 94 in (n <= N, "n must be less than or equal to N") 95 in (k <= n, "n - k must be greater than or equal to zero") 96 in (k <= K, "k must be less than or equal to K") 97 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 98 { 99 import mir.stat.distribution.poisson: poissonPMF, PoissonAlgo; 100 101 return poissonPMF!(PoissonAlgo.gamma)(k, cast(T) n * K / N); 102 } 103 104 private 105 @safe pure @nogc nothrow 106 T hypergeometricPMFImpl(T, HypergeometricAlgo hypergeometricAlgo) 107 (const size_t k, const size_t N, const size_t K, const size_t n) 108 if (isFloatingPoint!T && 109 hypergeometricAlgo == HypergeometricAlgo.approxNormal) 110 in (K <= N, "K must be less than or equal to N") 111 in (n <= N, "n must be less than or equal to N") 112 in (k <= n, "n - k must be greater than or equal to zero") 113 in (k <= K, "k must be less than or equal to K") 114 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 115 { 116 import mir.math.common: sqrt; 117 import mir.stat.distribution.normal: normalPDF; 118 119 T l = k; 120 return normalPDF(l, cast(T) (n * K) / N, sqrt(n * (cast(T) K / N) * (cast(T) (N - K) / N) * (cast(T) (N - n) / (N - 1)))); 121 } 122 123 private 124 @safe pure @nogc nothrow 125 T hypergeometricPMFImpl(T, HypergeometricAlgo hypergeometricAlgo) 126 (const size_t k, const size_t N, const size_t K, const size_t n) 127 if (isFloatingPoint!T && 128 hypergeometricAlgo == HypergeometricAlgo.approxNormalContinuityCorrection) 129 in (K <= N, "K must be less than or equal to N") 130 in (n <= N, "n must be less than or equal to N") 131 in (k <= n, "n - k must be greater than or equal to zero") 132 in (k <= K, "k must be less than or equal to K") 133 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 134 { 135 import mir.math.common: sqrt; 136 import mir.stat.distribution.normal: normalCDF; 137 138 T location = cast(T) (n * K) / N; 139 T scale = sqrt(n * (cast(T) K / N) * (cast(T) (N - K) / N) * (cast(T) (N - n) / (N - 1))); 140 return normalCDF(k + 0.5, location, scale) - normalCDF(k - 0.5, location, scale); 141 } 142 143 /++ 144 Computes the hypergeometric probability mass function (PMF). 145 146 Additional algorithms may be provided for calculating PMF that allow trading off 147 time and accuracy. If `approxPoisson` is provided, `PoissonAlgo.gamma` is assumed. 148 149 Params: 150 k = value to evaluate PMF (e.g. number of correct draws of object of interest) 151 N = total population size 152 K = number of relevant objects in population 153 n = number of draws 154 155 See_also: $(LINK2 https://en.wikipedia.org/wiki/Hypergeometric_distribution, Hypergeometric Distribution) 156 +/ 157 @safe pure @nogc nothrow 158 T hypergeometricPMF(T, HypergeometricAlgo hypergeometricAlgo = HypergeometricAlgo.direct) 159 (const size_t k, const size_t N, const size_t K, const size_t n) 160 if (isFloatingPoint!T) 161 in (K <= N, "K must be less than or equal to N") 162 in (n <= N, "n must be less than or equal to N") 163 in (k <= n, "n - k must be greater than or equal to zero") 164 in (k <= K, "k must be less than or equal to K") 165 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 166 { 167 if (n == 0) 168 return 1; 169 return hypergeometricPMFImpl!(T, hypergeometricAlgo)(k, N, K, n); 170 } 171 172 /// ditto 173 template hypergeometricPMF(HypergeometricAlgo hypergeometricAlgo = HypergeometricAlgo.direct) 174 { 175 alias hypergeometricPMF = hypergeometricPMF!(double, hypergeometricAlgo); 176 } 177 178 /// ditto 179 @safe pure nothrow @nogc 180 template hypergeometricPMF(T, string hypergeometricAlgo) 181 { 182 mixin("alias hypergeometricPMF = .hypergeometricPMF!(T, HypergeometricAlgo." ~ hypergeometricAlgo ~ ");"); 183 } 184 185 /// ditto 186 @safe pure nothrow @nogc 187 template hypergeometricPMF(string hypergeometricAlgo) 188 { 189 mixin("alias hypergeometricPMF = .hypergeometricPMF!(double, HypergeometricAlgo." ~ hypergeometricAlgo ~ ");"); 190 } 191 192 /// 193 @safe pure @nogc nothrow 194 version(mir_stat_test) 195 unittest 196 { 197 import mir.test: shouldApprox; 198 199 0.hypergeometricPMF(7, 4, 3).shouldApprox == 0.02857143; 200 1.hypergeometricPMF(7, 4, 3).shouldApprox == 0.3428571; 201 2.hypergeometricPMF(7, 4, 3).shouldApprox == 0.5142857; 202 3.hypergeometricPMF(7, 4, 3).shouldApprox == 0.1142857; 203 204 // can also provide a template argument to change output type 205 static assert(is(typeof(hypergeometricPMF!float(3, 7, 4, 3)) == float)); 206 } 207 208 // Check n=0 condition 209 @safe pure @nogc nothrow 210 version(mir_stat_test) 211 unittest 212 { 213 import mir.test: shouldApprox; 214 215 0.hypergeometricPMF(7, 4, 0).shouldApprox == 1.0; 216 } 217 218 /// Alternate algorithms 219 @safe pure @nogc nothrow 220 version(mir_stat_test) 221 unittest 222 { 223 import mir.test: shouldApprox; 224 import mir.math.common: exp; 225 226 // Can approximate hypergeometric with binomial distribution 227 20.hypergeometricPMF!"approxBinomial"(750_000, 250_000, 50).shouldApprox == exp(hypergeometricLPMF(20, 750_000, 250_000, 50)); 228 // Can approximate hypergeometric with poisson distribution 229 20.hypergeometricPMF!"approxPoisson"(100_000, 100, 5_000).shouldApprox == exp(hypergeometricLPMF(20, 100_000, 100, 5_000)); 230 // Can approximate hypergeometric with normal distribution 231 3_500.hypergeometricPMF!"approxNormal"(10_000, 7_500, 5_000).shouldApprox == exp(hypergeometricLPMF(3_500, 10_000, 7_500, 5_000)); 232 // Can approximate hypergeometric with normal distribution (with continuity correction) 233 3_500.hypergeometricPMF!"approxNormalContinuityCorrection"(10_000, 7_500, 5_000).shouldApprox == exp(hypergeometricLPMF(3_500, 10_000, 7_500, 5_000)); 234 } 235 236 /++ 237 Computes the hypergeometric probability mass function (PMF) with extended 238 floating point types (e.g. `Fp!128`), which provides additional accuracy for 239 large values of `k`, `N`, `K`, or `n`. 240 241 Params: 242 k = value to evaluate PMF (e.g. number of correct draws of object of interest) 243 N = total population size 244 K = number of relevant objects in population 245 n = number of draws 246 247 See_also: $(LINK2 https://en.wikipedia.org/wiki/Hypergeometric_distribution, Hypergeometric Distribution) 248 +/ 249 @safe pure @nogc nothrow 250 T fp_hypergeometricPMF(T = Fp!128)(const size_t k, const size_t N, const size_t K, const size_t n) 251 if (is(T == Fp!size, size_t size)) 252 in (K <= N, "K must be less than or equal to N") 253 in (n <= N, "n must be less than or equal to N") 254 in (k <= n, "n - k must be greater than or equal to zero") 255 in (k <= K, "k must be less than or equal to K") 256 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 257 { 258 import mir.math.numeric: binomialCoefficient; 259 return binomialCoefficient(K, cast(const uint) k) * 260 binomialCoefficient(N - K, cast(const uint) (n - k)) / 261 binomialCoefficient(N, cast(const uint) n); 262 } 263 264 /// 265 @safe pure @nogc nothrow 266 version(mir_stat_test) 267 unittest 268 { 269 import mir.bignum.fp: Fp, fp_log; 270 import mir.test: shouldApprox; 271 272 enum size_t val = 1_000_000; 273 size_t N = val + 5; 274 size_t K = val / 2; 275 size_t n = val / 100; 276 0.fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(0, N, K, n); 277 1.fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(1, N, K, n); 278 2.fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(2, N, K, n); 279 5.fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(5, N, K, n); 280 (n / 2).fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(n / 2, N, K, n); 281 (n - 5).fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(n - 5, N, K, n); 282 (n - 2).fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(n - 2, N, K, n); 283 (n - 1).fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(n - 1, N, K, n); 284 n.fp_hypergeometricPMF(N, K, n).fp_log!double.shouldApprox == hypergeometricLPMF(n, N, K, n); 285 } 286 287 private 288 @safe pure @nogc nothrow 289 T hypergeometricCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 290 (const size_t k, const size_t N, const size_t K, const size_t n) 291 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.direct) 292 in (K <= N, "K must be less than or equal to N") 293 in (n <= N, "n must be less than or equal to N") 294 in (k <= n, "n - k must be greater than or equal to zero") 295 in (k <= K, "k must be less than or equal to K") 296 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 297 { 298 import mir.combinatorics: binomial; 299 300 T result = 0; 301 const denominator = binomial(N, n); 302 if (k <= n / 2) { 303 foreach (size_t i; 0 .. (k + 1)) { 304 result += cast(T) (binomial(K, i) * binomial(N - K, n - i)) / denominator; 305 } 306 } else { 307 result = 1 - hypergeometricCCDFImpl!(T, hypergeometricAlgo)(k, N, K, n); 308 } 309 return result; 310 } 311 312 private 313 @safe pure @nogc nothrow 314 T hypergeometricCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 315 (const size_t k, const size_t N, const size_t K, const size_t n) 316 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.approxBinomial) 317 in (K <= N, "K must be less than or equal to N") 318 in (n <= N, "n must be less than or equal to N") 319 in (k <= n, "n - k must be greater than or equal to zero") 320 in (k <= K, "k must be less than or equal to K") 321 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 322 { 323 import mir.stat.distribution.binomial: binomialCDF; 324 325 return binomialCDF(k, n, cast(T) K / N); 326 } 327 328 private 329 @safe pure @nogc nothrow 330 T hypergeometricCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 331 (const size_t k, const size_t N, const size_t K, const size_t n) 332 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.approxPoisson) 333 in (K <= N, "K must be less than or equal to N") 334 in (n <= N, "n must be less than or equal to N") 335 in (k <= n, "n - k must be greater than or equal to zero") 336 in (k <= K, "k must be less than or equal to K") 337 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 338 { 339 import mir.stat.distribution.poisson: poissonCDF, PoissonAlgo; 340 341 return poissonCDF!(PoissonAlgo.gamma)(k, cast(T) n * K / N); 342 } 343 344 private 345 @safe pure @nogc nothrow 346 T hypergeometricCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 347 (const size_t k, const size_t N, const size_t K, const size_t n) 348 if (isFloatingPoint!T && 349 (hypergeometricAlgo == HypergeometricAlgo.approxNormal || 350 hypergeometricAlgo == HypergeometricAlgo.approxNormalContinuityCorrection)) 351 in (K <= N, "K must be less than or equal to N") 352 in (n <= N, "n must be less than or equal to N") 353 in (k <= n, "n - k must be greater than or equal to zero") 354 in (k <= K, "k must be less than or equal to K") 355 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 356 { 357 import mir.math.common: sqrt; 358 import mir.stat.distribution.normal: normalCDF; 359 360 T l = k; 361 static if (hypergeometricAlgo == HypergeometricAlgo.approxNormalContinuityCorrection) { 362 l += 0.5; 363 } 364 return normalCDF(l, cast(T) (n * K) / N, sqrt(n * (cast(T) K / N) * (cast(T) (N - K) / N) * (cast(T) (N - n) / (N - 1)))); 365 } 366 367 /++ 368 Computes the hypergeometric cumulative distribution function (CDF). 369 370 Additional algorithms may be provided for calculating CDF that allow trading off 371 time and accuracy. If `approxPoisson` is provided, `PoissonAlgo.gamma` is assumed. 372 373 Setting `hypergeometricAlgo = HypergeometricAlgo.direct` results in direct 374 summation being used, which can result in significant slowdowns for large values 375 of `k`. 376 377 Params: 378 k = value to evaluate CDF (e.g. number of correct draws of object of interest) 379 N = total population size 380 K = number of relevant objects in population 381 n = number of draws 382 383 See_also: $(LINK2 https://en.wikipedia.org/wiki/Hypergeometric_distribution, Hypergeometric Distribution) 384 +/ 385 @safe pure @nogc nothrow 386 T hypergeometricCDF(T, HypergeometricAlgo hypergeometricAlgo = HypergeometricAlgo.direct) 387 (const size_t k, const size_t N, const size_t K, const size_t n) 388 if (isFloatingPoint!T) 389 in (K <= N, "K must be less than or equal to N") 390 in (n <= N, "n must be less than or equal to N") 391 in (k <= n, "n - k must be greater than or equal to zero") 392 in (k <= K, "k must be less than or equal to K") 393 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 394 { 395 if (k == n || n == 0) 396 return 1; 397 return hypergeometricCDFImpl!(T, hypergeometricAlgo)(k, N, K, n); 398 } 399 400 /// ditto 401 template hypergeometricCDF(HypergeometricAlgo hypergeometricAlgo = HypergeometricAlgo.direct) 402 { 403 alias hypergeometricCDF = hypergeometricCDF!(double, hypergeometricAlgo); 404 } 405 406 /// ditto 407 @safe pure nothrow @nogc 408 template hypergeometricCDF(T, string hypergeometricAlgo) 409 { 410 mixin("alias hypergeometricCDF = .hypergeometricCDF!(T, HypergeometricAlgo." ~ hypergeometricAlgo ~ ");"); 411 } 412 413 /// ditto 414 @safe pure nothrow @nogc 415 template hypergeometricCDF(string hypergeometricAlgo) 416 { 417 mixin("alias hypergeometricCDF = .hypergeometricCDF!(double, HypergeometricAlgo." ~ hypergeometricAlgo ~ ");"); 418 } 419 420 /// 421 @safe pure @nogc nothrow 422 version(mir_stat_test) 423 unittest 424 { 425 import mir.test: shouldApprox; 426 427 0.hypergeometricCDF(7, 4, 3).shouldApprox == 0.02857143; 428 1.hypergeometricCDF(7, 4, 3).shouldApprox == 0.3714286; 429 2.hypergeometricCDF(7, 4, 3).shouldApprox == 0.8857143; 430 3.hypergeometricCDF(7, 4, 3).shouldApprox == 1.0; 431 432 // can also provide a template argument to change output type 433 static assert(is(typeof(hypergeometricCDF!float(3, 7, 4, 3)) == float)); 434 } 435 436 // Check n=0 condition 437 @safe pure @nogc nothrow 438 version(mir_stat_test) 439 unittest 440 { 441 import mir.test: shouldApprox; 442 443 0.hypergeometricCDF(7, 4, 0).shouldApprox == 1.0; 444 } 445 446 /// Alternate algorithms 447 @safe pure @nogc nothrow 448 version(mir_stat_test) 449 unittest 450 { 451 import mir.test: shouldApprox; 452 453 // Can approximate hypergeometric with binomial distribution 454 20.hypergeometricCDF!"approxBinomial"(750_000, 250_000, 50).shouldApprox(1e-2) == 0.8740839; 455 // Can approximate hypergeometric with poisson distribution 456 8.hypergeometricCDF!"approxPoisson"(100_000, 100, 5_000).shouldApprox(1e-2) == 0.9370063; 457 // Can approximate hypergeometric with normal distribution 458 3_750.hypergeometricCDF!"approxNormal"(10_000, 7_500, 5_000).shouldApprox(2e-2) == 0.5092122; 459 // Can approximate hypergeometric with normal distribution 460 3_750.hypergeometricCDF!"approxNormalContinuityCorrection"(10_000, 7_500, 5_000).shouldApprox(1e-2) == 0.5092122; 461 } 462 463 private 464 @safe pure @nogc nothrow 465 T hypergeometricCCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 466 (const size_t k, const size_t N, const size_t K, const size_t n) 467 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.direct) 468 in (K <= N, "K must be less than or equal to N") 469 in (n <= N, "n must be less than or equal to N") 470 in (k <= n, "n - k must be greater than or equal to zero") 471 in (k <= K, "k must be less than or equal to K") 472 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 473 { 474 import mir.combinatorics: binomial; 475 476 T result = 0; 477 const denominator = binomial(N, n); 478 if (k > n / 2) { 479 foreach (size_t i; (k + 1) .. (n + 1)) { 480 result += cast(T) (binomial(K, i) * binomial(N - K, n - i)) / denominator; 481 } 482 } else { 483 result = 1 - hypergeometricCDFImpl!(T, hypergeometricAlgo)(k, N, K, n); 484 } 485 return result; 486 } 487 488 private 489 @safe pure @nogc nothrow 490 T hypergeometricCCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 491 (const size_t k, const size_t N, const size_t K, const size_t n) 492 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.approxBinomial) 493 in (K <= N, "K must be less than or equal to N") 494 in (n <= N, "n must be less than or equal to N") 495 in (k <= n, "n - k must be greater than or equal to zero") 496 in (k <= K, "k must be less than or equal to K") 497 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 498 { 499 import mir.stat.distribution.binomial: binomialCCDF; 500 501 return binomialCCDF(k, n, cast(T) K / N); 502 } 503 504 private 505 @safe pure @nogc nothrow 506 T hypergeometricCCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 507 (const size_t k, const size_t N, const size_t K, const size_t n) 508 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.approxPoisson) 509 in (K <= N, "K must be less than or equal to N") 510 in (n <= N, "n must be less than or equal to N") 511 in (k <= n, "n - k must be greater than or equal to zero") 512 in (k <= K, "k must be less than or equal to K") 513 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 514 { 515 import mir.stat.distribution.poisson: poissonCCDF, PoissonAlgo; 516 517 return poissonCCDF!(PoissonAlgo.gamma)(k, cast(T) n * K / N); 518 } 519 520 private 521 @safe pure @nogc nothrow 522 T hypergeometricCCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 523 (const size_t k, const size_t N, const size_t K, const size_t n) 524 if (isFloatingPoint!T && 525 (hypergeometricAlgo == HypergeometricAlgo.approxNormal || 526 hypergeometricAlgo == HypergeometricAlgo.approxNormalContinuityCorrection)) 527 in (K <= N, "K must be less than or equal to N") 528 in (n <= N, "n must be less than or equal to N") 529 in (k <= n, "n - k must be greater than or equal to zero") 530 in (k <= K, "k must be less than or equal to K") 531 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 532 { 533 import mir.math.common: sqrt; 534 import mir.stat.distribution.normal: normalCCDF; 535 536 T l = k; 537 static if (hypergeometricAlgo == HypergeometricAlgo.approxNormalContinuityCorrection) { 538 l += 0.5; 539 } 540 return normalCCDF(l, cast(T) (n * K) / N, sqrt(n * (cast(T) K / N) * (cast(T) (N - K) / N) * (cast(T) (N - n) / (N - 1)))); 541 } 542 543 /++ 544 Computes the hypergeometric complementary cumulative distribution function (CCDF). 545 546 Additional algorithms may be provided for calculating CCDF that allow trading off 547 time and accuracy. If `approxPoisson` is provided, `PoissonAlgo.gamma` is assumed. 548 549 Setting `hypergeometricAlgo = HypergeometricAlgo.direct` results in direct 550 summation being used, which can result in significant slowdowns for large values 551 of `k`. 552 553 Params: 554 k = value to evaluate CCDF (e.g. number of correct draws of object of interest) 555 N = total population size 556 K = number of relevant objects in population 557 n = number of draws 558 559 See_also: $(LINK2 https://en.wikipedia.org/wiki/Hypergeometric_distribution, Hypergeometric Distribution) 560 +/ 561 @safe pure @nogc nothrow 562 T hypergeometricCCDF(T, HypergeometricAlgo hypergeometricAlgo = HypergeometricAlgo.direct) 563 (const size_t k, const size_t N, const size_t K, const size_t n) 564 if (isFloatingPoint!T) 565 in (K <= N, "K must be less than or equal to N") 566 in (n <= N, "n must be less than or equal to N") 567 in (k <= n, "n - k must be greater than or equal to zero") 568 in (k <= K, "k must be less than or equal to K") 569 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 570 { 571 if (k == n || n == 0) 572 return 0; 573 return hypergeometricCCDFImpl!(T, hypergeometricAlgo)(k, N, K, n); 574 } 575 576 /// ditto 577 template hypergeometricCCDF(HypergeometricAlgo hypergeometricAlgo = HypergeometricAlgo.direct) 578 { 579 alias hypergeometricCCDF = hypergeometricCCDF!(double, hypergeometricAlgo); 580 } 581 582 /// ditto 583 @safe pure nothrow @nogc 584 template hypergeometricCCDF(T, string hypergeometricAlgo) 585 { 586 mixin("alias hypergeometricCCDF = .hypergeometricCCDF!(T, HypergeometricAlgo." ~ hypergeometricAlgo ~ ");"); 587 } 588 589 /// ditto 590 @safe pure nothrow @nogc 591 template hypergeometricCCDF(string hypergeometricAlgo) 592 { 593 mixin("alias hypergeometricCCDF = .hypergeometricCCDF!(double, HypergeometricAlgo." ~ hypergeometricAlgo ~ ");"); 594 } 595 596 /// 597 @safe pure @nogc nothrow 598 version(mir_stat_test) 599 unittest 600 { 601 import mir.test: shouldApprox; 602 603 0.hypergeometricCCDF(7, 4, 3).shouldApprox == 0.9714286; 604 1.hypergeometricCCDF(7, 4, 3).shouldApprox == 0.6285714; 605 2.hypergeometricCCDF(7, 4, 3).shouldApprox == 0.1142857; 606 3.hypergeometricCCDF(7, 4, 3).shouldApprox == 0.0; 607 608 // can also provide a template argument to change output type 609 static assert(is(typeof(hypergeometricCCDF!float(3, 7, 4, 3)) == float)); 610 } 611 612 // Check n=0 condition 613 @safe pure @nogc nothrow 614 version(mir_stat_test) 615 unittest 616 { 617 import mir.test: shouldApprox; 618 619 0.hypergeometricCCDF(7, 4, 0).shouldApprox == 0.0; 620 } 621 622 /// Alternate algorithms 623 @safe pure @nogc nothrow 624 version(mir_stat_test) 625 unittest 626 { 627 import mir.test: shouldApprox; 628 import mir.math.common: exp; 629 630 // Can approximate hypergeometric with binomial distribution 631 20.hypergeometricCCDF!"approxBinomial"(750_000, 250_000, 50).shouldApprox(1e-2) == 0.1259161; 632 // Can approximate hypergeometric with poisson distribution 633 8.hypergeometricCCDF!"approxPoisson"(100_000, 100, 5_000).shouldApprox(1e-1) == 0.0629937; 634 // Can approximate hypergeometric with normal distribution 635 3_750.hypergeometricCCDF!"approxNormal"(10_000, 7_500, 5_000).shouldApprox(2e-2) == 0.4907878; 636 // Can approximate hypergeometric with normal distribution 637 3_750.hypergeometricCCDF!"approxNormalContinuityCorrection"(10_000, 7_500, 5_000).shouldApprox(1e-2) == 0.4907878; 638 } 639 640 private 641 @safe pure nothrow @nogc 642 size_t hypergeometricInvCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 643 (const T p, const size_t N, const size_t K, const size_t n) 644 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.direct) 645 in (K <= N, "K must be less than or equal to N") 646 in (n <= N, "n must be less than or equal to N") 647 in (p >= 0, "p must be greater than or equal to 0") 648 in (p <= 1, "p must be less than or equal to 1") 649 { 650 size_t guess = 0; 651 if (n < 0.1 * N) { 652 if ((n * N > 9 * (N - K)) && 653 (n * (N - K) > 9 * K)) { 654 guess = hypergeometricInvCDFImpl!(T, HypergeometricAlgo.approxNormalContinuityCorrection)(p, N, K, n); 655 } else if (K < 0.1 * N) { 656 guess = hypergeometricInvCDFImpl!(T, HypergeometricAlgo.approxPoisson)(p, N, K, n); 657 } else { 658 guess = hypergeometricInvCDFImpl!(T, HypergeometricAlgo.approxBinomial)(p, N, K, n); 659 } 660 } 661 T cdfGuess = hypergeometricCDF!(T, hypergeometricAlgo)(guess, N, K, n); 662 663 if (p <= cdfGuess) { 664 if (guess == 0) { 665 return guess; 666 } 667 for (size_t i = (guess - 1); guess >= 0; i--) { 668 cdfGuess -= hypergeometricPMF!(T, hypergeometricAlgo)(i + 1, N, K, n); 669 if (p > cdfGuess) { 670 guess = i + 1; 671 break; 672 } 673 } 674 } else { 675 while(p > cdfGuess) { 676 guess++; 677 cdfGuess += hypergeometricPMF!(T, hypergeometricAlgo)(guess, N, K, n); 678 } 679 } 680 return guess; 681 } 682 683 private 684 @safe pure @nogc nothrow 685 size_t hypergeometricInvCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 686 (const T p, const size_t N, const size_t K, const size_t n) 687 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.approxBinomial) 688 in (K <= N, "K must be less than or equal to N") 689 in (n <= N, "n must be less than or equal to N") 690 in (p >= 0, "p must be greater than or equal to 0") 691 in (p <= 1, "p must be less than or equal to 1") 692 { 693 import mir.stat.distribution.binomial: binomialInvCDF; 694 695 return binomialInvCDF(p, n, cast(T) K / N); 696 } 697 698 private 699 @safe pure @nogc nothrow 700 size_t hypergeometricInvCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 701 (const T p, const size_t N, const size_t K, const size_t n) 702 if (isFloatingPoint!T && hypergeometricAlgo == HypergeometricAlgo.approxPoisson) 703 in (K <= N, "K must be less than or equal to N") 704 in (n <= N, "n must be less than or equal to N") 705 in (p >= 0, "p must be greater than or equal to 0") 706 in (p <= 1, "p must be less than or equal to 1") 707 { 708 import mir.stat.distribution.poisson: poissonInvCDF, PoissonAlgo; 709 710 // Using PoissonAlgo.direct because PoissonAlgo.gamma does not return the 711 // same result 712 return poissonInvCDF!(PoissonAlgo.direct)(p, cast(T) n * K / N); 713 } 714 715 private 716 @safe pure @nogc nothrow 717 size_t hypergeometricInvCDFImpl(T, HypergeometricAlgo hypergeometricAlgo) 718 (const T p, const size_t N, const size_t K, const size_t n) 719 if (isFloatingPoint!T && 720 (hypergeometricAlgo == HypergeometricAlgo.approxNormal || 721 hypergeometricAlgo == HypergeometricAlgo.approxNormalContinuityCorrection)) 722 in (K <= N, "K must be less than or equal to N") 723 in (n <= N, "n must be less than or equal to N") 724 in (p >= 0, "p must be greater than or equal to 0") 725 in (p <= 1, "p must be less than or equal to 1") 726 { 727 import mir.math.common: floor, sqrt; 728 import mir.stat.distribution.normal: normalCDF, normalInvCDF; 729 730 T location = cast(T) (n * K) / N; 731 T scale = sqrt(n * (cast(T) K / N) * (cast(T) (N - K) / N) * (cast(T) (N - n) / (N - 1))); 732 733 // Handles case where p is small or large, better than just using pLowerBound = 0 or pUpperBound = 1 734 T pLowerBound = 0; 735 T pUpperBound = 1; 736 T lowerValue = 0; 737 T upperValue = K; 738 static if (hypergeometricAlgo == HypergeometricAlgo.approxNormalContinuityCorrection) { 739 lowerValue += 0.5; 740 upperValue -= 0.5; 741 } 742 pLowerBound = normalCDF(lowerValue, location, scale); 743 pUpperBound = normalCDF(upperValue, location, scale); 744 if (p <= pLowerBound) { 745 return 0; 746 } else if (p >= pUpperBound) { 747 return K; 748 } 749 750 auto result = normalInvCDF(p, location, scale); 751 static if (hypergeometricAlgo == HypergeometricAlgo.approxNormalContinuityCorrection) { 752 result = result - 0.5; 753 } 754 return cast(size_t) floor(result); 755 } 756 757 /++ 758 Computes the hypergeometric inverse cumulative distribution function (InvCDF). 759 760 Additional algorithms may be provided for calculating InvCDF that allow trading off 761 time and accuracy. If `approxPoisson` is provided, `PoissonAlgo.direct` is assumed. 762 This is different from other functions that use `PoissonAlgo.gamma` since in this 763 case it does not provide the same result. 764 765 Setting `hypergeometricAlgo = HypergeometricAlgo.direct` results in direct 766 summation being used, which can result in significant slowdowns for large values 767 of `k`. 768 769 Params: 770 p = value to evaluate InvCDF 771 N = total population size 772 K = number of relevant objects in population 773 n = number of draws 774 775 See_also: $(LINK2 https://en.wikipedia.org/wiki/Hypergeometric_distribution, Hypergeometric Distribution) 776 +/ 777 @safe pure @nogc nothrow 778 size_t hypergeometricInvCDF(T, HypergeometricAlgo hypergeometricAlgo = HypergeometricAlgo.direct) 779 (const T p, const size_t N, const size_t K, const size_t n) 780 if (isFloatingPoint!T) 781 in (K <= N, "K must be less than or equal to N") 782 in (n <= N, "n must be less than or equal to N") 783 in (p >= 0, "p must be greater than or equal to 0") 784 in (p <= 1, "p must be less than or equal to 1") 785 { 786 if (n == 0 || p == 0) { 787 return 0; 788 } else if (p == 1) { 789 return K; 790 } 791 return hypergeometricInvCDFImpl!(T, hypergeometricAlgo)(p, N, K, n); 792 } 793 794 /// ditto 795 @safe pure nothrow @nogc 796 template hypergeometricInvCDF(T, string hypergeometricAlgo) 797 { 798 mixin("alias hypergeometricInvCDF = .hypergeometricInvCDF!(T, HypergeometricAlgo." ~ hypergeometricAlgo ~ ");"); 799 } 800 801 /// ditto 802 @safe pure nothrow @nogc 803 template hypergeometricInvCDF(string hypergeometricAlgo) 804 { 805 mixin("alias hypergeometricInvCDF = .hypergeometricInvCDF!(double, HypergeometricAlgo." ~ hypergeometricAlgo ~ ");"); 806 } 807 808 /// 809 @safe pure @nogc nothrow 810 version(mir_stat_test) 811 unittest 812 { 813 import mir.test: should; 814 815 0.0.hypergeometricInvCDF(40, 15, 20).should == 0; 816 0.1.hypergeometricInvCDF(40, 15, 20).should == 6; 817 0.2.hypergeometricInvCDF(40, 15, 20).should == 6; 818 0.3.hypergeometricInvCDF(40, 15, 20).should == 7; 819 0.4.hypergeometricInvCDF(40, 15, 20).should == 7; 820 0.5.hypergeometricInvCDF(40, 15, 20).should == 7; 821 0.6.hypergeometricInvCDF(40, 15, 20).should == 8; 822 0.7.hypergeometricInvCDF(40, 15, 20).should == 8; 823 0.8.hypergeometricInvCDF(40, 15, 20).should == 9; 824 0.9.hypergeometricInvCDF(40, 15, 20).should == 9; 825 1.0.hypergeometricInvCDF(40, 15, 20).should == 15; 826 } 827 828 // Check n=0 condition 829 @safe pure @nogc nothrow 830 version(mir_stat_test) 831 unittest 832 { 833 import mir.test: should; 834 835 0.0.hypergeometricInvCDF(7, 4, 0).should == 0; 836 0.5.hypergeometricInvCDF(7, 4, 0).should == 0; 837 1.0.hypergeometricInvCDF(7, 4, 0).should == 0; 838 } 839 840 /// Alternate algorithms 841 @safe pure @nogc nothrow 842 version(mir_stat_test) 843 unittest 844 { 845 import mir.test: shouldApprox; 846 import mir.math.common: exp; 847 848 // Can approximate hypergeometric with binomial distribution 849 0.5.hypergeometricInvCDF!"approxBinomial"(750_000, 250_000, 50).shouldApprox!double == 17; 850 // Can approximate hypergeometric with poisson distribution 851 0.4.hypergeometricInvCDF!"approxPoisson"(100_000, 100, 5_000).shouldApprox!double == 4; 852 // Can approximate hypergeometric with normal distribution 853 0.6.hypergeometricInvCDF!"approxNormal"(10_000, 7_500, 5_000).shouldApprox!double == 3755; 854 // Can approximate hypergeometric with normal distribution 855 0.6.hypergeometricInvCDF!"approxNormalContinuityCorrection"(10_000, 7_500, 5_000).shouldApprox!double(1) == 3755; 856 } 857 858 // test approxNormal / approxNormalContinuityCorrection 859 version(mir_stat_test) 860 @safe pure nothrow @nogc 861 unittest { 862 import mir.math.common: floor, sqrt; 863 import mir.stat.distribution.normal: normalInvCDF; 864 import mir.test: should; 865 866 size_t N = 50; 867 size_t K = 15; 868 size_t n = 25; 869 double location = cast(double) (n * K) / N; 870 double scale = sqrt(n * (cast(double) K / N) * (cast(double) (N - K) / N) * (cast(double) (N - n) / (N - 1))); 871 0.0000001.hypergeometricInvCDF!"approxNormal"(N, K, n).should == 0; 872 0.0000001.hypergeometricInvCDF!"approxNormalContinuityCorrection"(N, K, n).should == 0; 873 0.9999999.hypergeometricInvCDF!"approxNormal"(N, K, n).should == K; 874 0.9999999.hypergeometricInvCDF!"approxNormalContinuityCorrection"(N, K, n).should == K; 875 double checkValue; 876 for (double x = 0.05; x < 1; x = x + 0.05) { 877 checkValue = normalInvCDF(x, location, scale); 878 x.hypergeometricInvCDF!"approxNormal"(N, K, n).should == floor(checkValue); 879 x.hypergeometricInvCDF!"approxNormalContinuityCorrection"(N, K, n).should == floor(checkValue - 0.5); 880 } 881 } 882 883 // test alternate direct guess paths 884 version(mir_stat_test) 885 @safe pure nothrow @nogc 886 unittest { 887 import mir.math.common: approxEqual; 888 889 static immutable size_t[] Ns = [120, 100, 200]; 890 static immutable size_t[] Ks = [ 50, 50, 9]; 891 static immutable size_t[] ns = [ 10, 9, 1]; 892 893 size_t value; 894 for (size_t i; i < Ns.length; i++) { 895 for (double x = 0.01; x < 1; x = x + 0.01) { 896 value = x.hypergeometricInvCDF(Ns[i], Ks[i], ns[i]); 897 assert(value.hypergeometricCDF(Ns[i], Ks[i], ns[i]) >= x); 898 if (value > 1) 899 assert((value - 1).hypergeometricCDF(Ns[i], Ks[i], ns[i]) < x); 900 } 901 } 902 } 903 904 /++ 905 Computes the hypergeometric log probability mass function (LPMF). 906 907 Params: 908 k = value to evaluate LPMF (e.g. number of correct draws of object of interest) 909 N = total population size 910 K = number of relevant objects in population 911 n = number of draws 912 913 See_also: $(LINK2 https://en.wikipedia.org/wiki/Hypergeometric_distribution, Hypergeometric Distribution) 914 +/ 915 @safe pure @nogc nothrow 916 T hypergeometricLPMF(T = double)(const size_t k, const size_t N, const size_t K, const size_t n) 917 if (isFloatingPoint!T) 918 in (K <= N, "K must be less than or equal to N") 919 in (n <= N, "n must be less than or equal to N") 920 in (k <= n, "n - k must be greater than or equal to zero") 921 in (k <= K, "k must be less than or equal to K") 922 in (k + N >= n + K, "`N - K` must be greater than or equal to `n - k`") 923 { 924 import mir.math.internal.log_binomial: logBinomialCoefficient; 925 926 return logBinomialCoefficient(K, cast(const uint) k) + 927 logBinomialCoefficient(N - K, cast(const uint) (n - k)) - 928 logBinomialCoefficient(N, cast(const uint) n); 929 } 930 931 /// 932 @safe pure @nogc nothrow 933 version(mir_stat_test) 934 unittest 935 { 936 import mir.math.common: log; 937 import mir.test: shouldApprox; 938 939 0.hypergeometricLPMF(7, 4, 3).shouldApprox == log(0.02857143); 940 1.hypergeometricLPMF(7, 4, 3).shouldApprox == log(0.3428571); 941 2.hypergeometricLPMF(7, 4, 3).shouldApprox == log(0.5142857); 942 3.hypergeometricLPMF(7, 4, 3).shouldApprox == log(0.1142857); 943 }