The OpenD Programming Language

1 /**A module for performing linear regression.  This module has an unusual
2  * interface, as it is range-based instead of matrix based. Values for
3  * independent variables are provided as either a tuple or a range of ranges.
4  * This means that one can use, for example, map, to fit high order models and
5  * lazily evaluate certain values.  (For details, see examples below.)
6  *
7  * Author:  David Simcha*/
8  /*
9  * License:
10  * Boost Software License - Version 1.0 - August 17th, 2003
11  *
12  * Permission is hereby granted, free of charge, to any person or organization
13  * obtaining a copy of the software and accompanying documentation covered by
14  * this license (the "Software") to use, reproduce, display, distribute,
15  * execute, and transmit the Software, and to prepare derivative works of the
16  * Software, and to permit third-parties to whom the Software is furnished to
17  * do so, all subject to the following:
18  *
19  * The copyright notices in the Software and this entire statement, including
20  * the above license grant, this restriction and the following disclaimer,
21  * must be included in all copies of the Software, in whole or in part, and
22  * all derivative works of the Software, unless such copies or derivative
23  * works are solely in the form of machine-executable object code generated by
24  * a source language processor.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
29  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
30  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
31  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
32  * DEALINGS IN THE SOFTWARE.
33  */
34 module dstats.regress;
35 
36 import std.math, std.traits, std.array, std.traits, std.string,
37     std.exception, std.typetuple, std.typecons, std.numeric, std.parallelism;
38 
39 import std.algorithm : map, copy, max, min, filter, reduce;
40 
41 alias std.range.repeat repeat;
42 
43 import dstats.alloc, std.range, std.conv, dstats.distrib, dstats.cor,
44     dstats.base, dstats.summary, dstats.sort;
45 
46 version(unittest) {
47     import std.stdio, dstats.random, std.functional;
48 }
49 
50 ///
51 struct PowMap(ExpType, T)
52 if(isForwardRange!(T)) {
53     Unqual!T range;
54     Unqual!ExpType exponent;
55     double cache;
56 
57     this(T range, ExpType exponent) {
58         this.range = range;
59         this.exponent = exponent;
60 
61         static if(isIntegral!ExpType) {
62             cache = pow(cast(double) range.front, exponent);
63         } else {
64             cache = pow(cast(ExpType) range.front, exponent);
65         }
66     }
67 
68     @property double front() const pure nothrow {
69         return cache;
70     }
71 
72     void popFront() {
73         range.popFront;
74         if(!range.empty) {
75             cache = pow(cast(double) range.front, exponent);
76         }
77     }
78 
79     @property typeof(this) save() {
80         return this;
81     }
82 
83     @property bool empty() {
84         return range.empty;
85     }
86 }
87 
88 /**Maps a forward range to a power determined at runtime.  ExpType is the type
89  * of the exponent.  Using an int is faster than using a double, but obviously
90  * less flexible.*/
91 PowMap!(ExpType, T) powMap(ExpType, T)(T range, ExpType exponent) {
92     alias PowMap!(ExpType, T) RT;
93     return RT(range, exponent);
94 }
95 
96 // Very ad-hoc, does a bunch of matrix ops for linearRegress and
97 // linearRegressBeta.  Specifically, computes xTx and xTy.
98 // Written specifically to be efficient in the context used here.
99 private void rangeMatrixMulTrans(U, T...)(
100     ref double[] xTy,
101     ref DoubleMatrix xTx,
102     U y, ref T matIn,
103     RegionAllocator alloc
104 ) {
105     static if(isArray!(T[0]) &&
106         isInputRange!(typeof(matIn[0][0])) && matIn.length == 1) {
107         alias matIn[0] mat;
108     } else {
109         alias matIn mat;
110     }
111 
112     bool someEmpty() {
113         if(y.empty) {
114             return true;
115         }
116         foreach(range; mat) {
117             if(range.empty) {
118                 return true;
119             }
120         }
121         return false;
122     }
123 
124     void popAll() {
125         foreach(ti, range; mat) {
126             mat[ti].popFront;
127         }
128         y.popFront;
129     }
130 
131     xTy[] = 0;
132     xTx = doubleMatrix(mat.length, mat.length, alloc);
133     foreach(i; 0..mat.length) foreach(j; 0..mat.length) {
134         xTx[i, j] = 0;
135     }
136 
137     auto alloc2 = newRegionAllocator();
138     auto deltas = alloc2.uninitializedArray!(double[])(mat.length);
139 
140     // Using an algorithm similar to the one for Pearson cor to improve
141     // numerical stability.  Calculate means and covariances, then
142     // combine them:  Sum of squares = mean1 * N * mean2 + N * cov.
143     double k = 0;
144     double[] means = alloc2.uninitializedArray!(double[])(mat.length);
145     means[] = 0;
146     double yMean = 0;
147 
148     while(!someEmpty) {
149         foreach(i, elem; mat) {
150             deltas[i] = cast(double) elem.front;
151         }
152 
153         immutable kMinus1 = k;
154         immutable kNeg1 = 1 / ++k;
155         deltas[] -= means[];
156         means[] += deltas[] * kNeg1;
157 
158         immutable yDelta = cast(double) y.front - yMean;
159         yMean += yDelta * kNeg1;
160 
161         foreach(i, delta1; deltas) {
162             xTy[i] += kMinus1 * delta1 * kNeg1 * yDelta;
163             xTx[i, i] += kMinus1 * delta1 * kNeg1 * delta1;
164 
165             foreach(j, delta2; deltas[0..i]) {
166                 xTx[i, j] += kMinus1 * delta1 * kNeg1 * delta2;
167             }
168         }
169         popAll();
170     }
171 
172     // k is n now that we're done looping over the data.
173     alias k n;
174 
175     // mat now consists of covariance * n.    Divide by n and add mean1 * mean2
176     // to get sum of products.
177     foreach(i; 0..xTx.rows) foreach(j; 0..i + 1) {
178         xTx[i, j] /= n;
179         xTx[i, j] += means[i] * means[j];
180     }
181 
182     // Similarly for the xTy vector
183     foreach(i, ref elem; xTy) {
184         xTy[i] /= n;
185         elem += yMean * means[i];
186     }
187     symmetrize(xTx);
188 }
189 
190 // Copies values from lower triangle to upper triangle.
191 private void symmetrize(ref DoubleMatrix mat) {
192     foreach(i; 1..mat.rows) {
193         foreach(j; 0..i) {
194             mat[j, i] = mat[i, j];
195         }
196     }
197 }
198 
199 /**Struct that holds the results of a linear regression.  It's a plain old
200  * data struct.*/
201 struct RegressRes {
202     /**The coefficients, one for each range in X.  These will be in the order
203      * that the X ranges were passed in.*/
204     double[] betas;
205 
206     /**The standard error terms of the X ranges passed in.*/
207     double[] stdErr;
208 
209     /**The lower confidence bounds of the beta terms, at the confidence level
210      * specificied.  (Default 0.95).*/
211     double[] lowerBound;
212 
213     /**The upper confidence bounds of the beta terms, at the confidence level
214      * specificied.  (Default 0.95).*/
215     double[] upperBound;
216 
217     /**The P-value for the alternative that the corresponding beta value is
218      * different from zero against the null that it is equal to zero.*/
219     double[] p;
220 
221     /**The coefficient of determination.*/
222     double R2;
223 
224     /**The adjusted coefficient of determination.*/
225     double adjustedR2;
226 
227     /**The root mean square of the residuals.*/
228     double residualError;
229 
230     /**The P-value for the model as a whole.  Based on an F-statistic.  The
231      * null here is that the model has no predictive value, the alternative
232      * is that it does.*/
233     double overallP;
234 
235     // Just used internally.
236     private static string arrStr(T)(T arr) {
237         return text(arr)[1..$ - 1];
238     }
239 
240     /**Print out the results in the default format.*/
241     string toString() {
242         return "Betas:  " ~ arrStr(betas) ~ "\nLower Conf. Int.:  " ~
243             arrStr(lowerBound) ~ "\nUpper Conf. Int.:  " ~ arrStr(upperBound) ~
244             "\nStd. Err:  " ~ arrStr(stdErr) ~ "\nP Values:  " ~ arrStr(p) ~
245             "\nR^2:  " ~ text(R2) ~
246             "\nAdjusted R^2:  " ~ text(adjustedR2) ~
247             "\nStd. Residual Error:  " ~ text(residualError)
248             ~ "\nOverall P:  " ~ text(overallP);
249     }
250 }
251 
252 /**Forward Range for holding the residuals from a regression analysis.*/
253 struct Residuals(F, U, T...) {
254     static if(T.length == 1 && isForwardRange!(ElementType!(T[0]))) {
255         alias T[0] R;
256         alias typeof(array(R.init)) XType;
257         enum bool needDup = true;
258     } else {
259         alias T R;
260         alias staticMap!(Unqual, R) XType;
261         enum bool needDup = false;
262     }
263 
264     Unqual!U Y;
265     XType X;
266     F[] betas;
267     double residual;
268     bool _empty;
269 
270     void nextResidual() {
271         double sum = 0;
272         size_t i = 0;
273         foreach(elem; X) {
274             double frnt = elem.front;
275             sum += frnt * betas[i];
276             i++;
277         }
278         residual = Y.front - sum;
279     }
280 
281     this(F[] betas, U Y, R X) {
282         static if(is(typeof(X.length))) {
283             dstatsEnforce(X.length == betas.length,
284                 "Betas and X must have same length for residuals.");
285         } else {
286             dstatsEnforce(walkLength(X) == betas.length,
287                 "Betas and X must have same length for residuals.");
288         }
289 
290         static if(needDup) {
291             this.X = array(X);
292         } else {
293             this.X = X;
294         }
295 
296         foreach(i, elem; this.X) {
297             static if(isForwardRange!(typeof(elem))) {
298                 this.X[i] = this.X[i].save;
299             }
300         }
301 
302         this.Y = Y;
303         this.betas = betas;
304         if(Y.empty) {
305             _empty = true;
306             return;
307         }
308         foreach(elem; X) {
309             if(elem.empty) {
310                 _empty = true;
311                 return;
312             }
313         }
314         nextResidual;
315     }
316 
317     @property double front() const pure nothrow {
318         return residual;
319     }
320 
321     void popFront() {
322         Y.popFront;
323         if(Y.empty) {
324             _empty = true;
325             return;
326         }
327         foreach(ti, elem; X) {
328             X[ti].popFront;
329             if(X[ti].empty) {
330                 _empty = true;
331                 return;
332             }
333         }
334         nextResidual;
335     }
336 
337     @property bool empty() const pure nothrow {
338         return _empty;
339     }
340 
341     @property typeof(this) save() {
342         auto ret = this;
343         ret.Y = ret.Y.save;
344         foreach(ti, xElem; ret.X) {
345             ret.X[ti] = ret.X[ti].save;
346         }
347 
348         return ret;
349     }
350 }
351 
352 /**Given the beta coefficients from a linear regression, and X and Y values,
353  * returns a range that lazily computes the residuals.
354  */
355 Residuals!(F, U, T) residuals(F, U, T...)(F[] betas, U Y, T X)
356 if(isFloatingPoint!F && isForwardRange!U && allSatisfy!(isForwardRange, T)) {
357     alias Residuals!(F, U, T) RT;
358     return RT(betas, Y, X);
359 }
360 
361 // Compiles summary statistics while iterating, to allow ridge regression over
362 // input ranges.
363 private struct SummaryIter(R) {
364     R range;
365     MeanSD summ;
366 
367     this(R range) {
368         this.range = range;
369     }
370 
371     double front() @property {
372         return range.front;
373     }
374 
375     void popFront() {
376         summ.put(range.front);
377         range.popFront();
378     }
379 
380     bool empty() @property {
381         return range.empty;
382     }
383 
384     double mse() @property const pure nothrow { return summ.mse; }
385 
386     double N() @property const pure nothrow { return summ.N; }
387 }
388 
389 private template SummaryType(R) {
390     alias SummaryIter!R SummaryType;
391 }
392 
393 /**
394 Perform a linear regression and return just the beta values.  The advantages
395 to just returning the beta values are that it's faster and that each range
396 needs to be iterated over only once, and thus can be just an input range.
397 The beta values are returned such that the smallest index corresponds to
398 the leftmost element of X.  X can be either a tuple or a range of input
399 ranges.  Y must be an input range.
400 
401 If, after all X variables are passed in, a numeric type is passed as the last
402 parameter, this is treated as a ridge parameter and ridge regression is
403 performed.  Ridge regression is a form of regression that penalizes the L2 norm
404 of the beta vector and therefore results in more parsimonious models.
405 However, it makes statistical inference such as that supported by
406 linearRegress() difficult to impossible.  Therefore, linearRegress() doesn't
407 support ridges.
408 
409 If no ridge parameter is passed, or equivalently if the ridge parameter is
410 zero, then ordinary least squares regression is performed.
411 
412 Notes:  The X ranges are traversed in lockstep, but the traversal is stopped
413 at the end of the shortest one.  Therefore, using infinite ranges is safe.
414 For example, using repeat(1) to get an intercept term works.
415 
416 References:
417 
418 http://www.mathworks.com/help/toolbox/stats/ridge.html
419 
420 Venables, W. N. & Ripley, B. D. (2002) Modern Applied Statistics with S.
421 Fourth Edition. Springer, New York. ISBN 0-387-95457-0
422 (This is the citation for the MASS R package.)
423 
424 Examples:
425 ---
426 int[] nBeers = [8,6,7,5,3,0,9];
427 int[] nCoffees = [3,6,2,4,3,6,8];
428 int[] musicVolume = [3,1,4,1,5,9,2];
429 int[] programmingSkill = [2,7,1,8,2,8,1];
430 double[] betas = linearRegressBeta(programmingSkill, repeat(1), nBeers, nCoffees,
431     musicVolume, map!"a * a"(musicVolume));
432 
433 // Now throw in a ridge parameter of 2.5.
434 double[] ridgeBetas = linearRegressBeta(programmingSkill, repeat(1), nBeers,
435     nCoffees, musicVolume, map!"a * a"(musicVolume), 2.5);
436 ---
437  */
438 double[] linearRegressBeta(U, T...)(U Y, T XIn)
439 if(doubleInput!(U)) {
440     double[] dummy;
441     return linearRegressBetaBuf!(U, T)(dummy, Y, XIn);
442 }
443 
444 /**
445 Same as linearRegressBeta, but allows the user to specify a buffer for
446 the beta terms.  If the buffer is too short, a new one is allocated.
447 Otherwise, the results are returned in the user-provided buffer.
448  */
449 double[] linearRegressBetaBuf(U, TRidge...)(double[] buf, U Y, TRidge XRidge)
450 if(doubleInput!(U)) {
451     auto alloc = newRegionAllocator();
452 
453     static if(isFloatingPoint!(TRidge[$ - 1]) || isIntegral!(TRidge[$ - 1])) {
454         // ridge param.
455         alias XRidge[$ - 1] ridgeParam;
456         alias TRidge[0..$ - 1] T;
457         alias XRidge[0..$ - 1] XIn;
458         enum bool ridge = true;
459         dstatsEnforce(ridgeParam >= 0,
460             "Cannot do ridge regerssion with ridge param <= 0.");
461 
462         static SummaryIter!R summaryIter(R)(R range) {
463             return typeof(return)(range);
464         }
465 
466     } else {
467         enum bool ridge = false;
468         enum ridgeParam = 0;
469         alias TRidge T;
470         alias XRidge XIn;
471 
472         static R summaryIter(R)(R range) {
473             return range;
474         }
475     }
476 
477     static if(isArray!(T[0]) && isInputRange!(typeof(XIn[0][0])) &&
478         T.length == 1) {
479         auto X = alloc.array(map!(summaryIter)(XIn[0]));
480         alias typeof(X[0]) E;
481     } else {
482         static if(ridge) {
483             alias staticMap!(SummaryType, T) XType;
484             XType X;
485 
486             foreach(ti, elem; XIn) {
487                 X[ti] = summaryIter(elem);
488             }
489         } else {
490             alias XIn X;
491         }
492     }
493 
494     DoubleMatrix xTx;
495     double[] xTy = alloc.uninitializedArray!(double[])(X.length);
496     double[] ret;
497     if(buf.length < X.length) {
498         ret = new double[X.length];
499     } else {
500         ret = buf[0..X.length];
501     }
502 
503     rangeMatrixMulTrans(xTy, xTx, Y, X, alloc);
504 
505     static if(ridge) {
506         if(ridgeParam > 0) {
507             foreach(i, range; X) {
508                 // The whole matrix is scaled by N, as well as the xTy vector,
509                 // so scale the ridge param similarly.
510                 xTx[i, i] += ridgeParam * range.mse / range.N;
511             }
512         }
513     }
514 
515     choleskySolve(xTx, xTy, ret);
516     return ret;
517 }
518 
519 /**
520 Perform a linear regression as in linearRegressBeta, but return a
521 RegressRes with useful stuff for statistical inference.  If the last element
522 of input is a real, this is used to specify the confidence intervals to
523 be calculated.  Otherwise, the default of 0.95 is used.  The rest of input
524 should be the elements of X.
525 
526 When using this function, which provides several useful statistics useful
527 for inference, each range must be traversed twice.  This means:
528 
529 1.  They have to be forward ranges, not input ranges.
530 
531 2.  If you have a large amount of data and you're mapping it to some
532     expensive function, you may want to do this eagerly instead of lazily.
533 
534 Notes:
535 
536 The X ranges are traversed in lockstep, but the traversal is stopped
537 at the end of the shortest one.  Therefore, using infinite ranges is safe.
538 For example, using repeat(1) to get an intercept term works.
539 
540 If the confidence interval specified is exactly 0, this is treated as a
541 special case and confidence interval calculation is skipped.  This can speed
542 things up significantly and therefore can be useful in monte carlo and possibly
543 data mining contexts.
544 
545 Bugs:  The statistical tests performed in this function assume that an
546 intercept term is included in your regression model.  If no intercept term
547 is included, the P-values, confidence intervals and adjusted R^2 values
548 calculated by this function will be wrong.
549 
550 Examples:
551 ---
552 int[] nBeers = [8,6,7,5,3,0,9];
553 int[] nCoffees = [3,6,2,4,3,6,8];
554 int[] musicVolume = [3,1,4,1,5,9,2];
555 int[] programmingSkill = [2,7,1,8,2,8,1];
556 
557 // Using default confidence interval:
558 auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees,
559     musicVolume, map!"a * a"(musicVolume));
560 
561 // Using user-specified confidence interval:
562 auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees,
563     musicVolume, map!"a * a"(musicVolume), 0.8675309);
564 ---
565 */
566 RegressRes linearRegress(U, TC...)(U Y, TC input) {
567     static if(is(TC[$ - 1] : double)) {
568         double confLvl = input[$ - 1];
569         enforceConfidence(confLvl);
570         alias TC[0..$ - 1] T;
571         alias input[0..$ - 1] XIn;
572     } else {
573         double confLvl = 0.95; // Default;
574         alias TC T;
575         alias input XIn;
576     }
577 
578     auto alloc = newRegionAllocator();
579     static if(isForwardRange!(T[0]) && isForwardRange!(typeof(XIn[0].front())) &&
580         T.length == 1) {
581 
582         enum bool arrayX = true;
583         alias typeof(XIn[0].front) E;
584         E[] X = alloc.array(XIn[0]);
585     } else static if(allSatisfy!(isForwardRange, T)) {
586         enum bool arrayX = false;
587         alias XIn X;
588     } else {
589         static assert(0, "Linear regression can only be performed with " ~
590             "tuples of forward ranges or ranges of forward ranges.");
591     }
592 
593     auto xTx = doubleMatrix(X.length, X.length, alloc);
594     auto xTxNeg1 = doubleMatrix(X.length, X.length, alloc);
595     double[] xTy = alloc.uninitializedArray!(double[])(X.length);
596 
597     static if(arrayX) {
598         auto xSaved = alloc.array(X);
599         foreach(ref elem; xSaved) {
600             elem = elem.save;
601         }
602     } else {
603         auto xSaved = X;
604         foreach(ti, Type; X) {
605             xSaved[ti] = X[ti].save;
606         }
607     }
608 
609     rangeMatrixMulTrans(xTy, xTx, Y.save, X, alloc);
610     invert(xTx, xTxNeg1);
611     auto betas = new double[X.length];
612     foreach(i; 0..betas.length) {
613         betas[i] = 0;
614         foreach(j; 0..betas.length) {
615             betas[i] += xTxNeg1[i, j] * xTy[j];
616         }
617     }
618 
619     X = xSaved;
620     auto residuals = residuals(betas, Y, X);
621     double S = 0;
622     ulong n = 0;
623     PearsonCor R2Calc;
624     for(; !residuals.empty; residuals.popFront) {
625         immutable residual = residuals.front;
626         S += residual * residual;
627         immutable Yfront = residuals.Y.front;
628         immutable predicted = Yfront - residual;
629         R2Calc.put(predicted, Yfront);
630         n++;
631     }
632     immutable ulong df =  n - X.length;
633     immutable double R2 = R2Calc.cor ^^ 2;
634     immutable double adjustedR2 = 1.0L - (1.0L - R2) * ((n - 1.0L) / df);
635 
636     immutable double sigma2 = S / (n - X.length);
637 
638     double[] stdErr = new double[betas.length];
639     foreach(i, ref elem; stdErr) {
640         elem = sqrt( S * xTxNeg1[i, i] / df / n);
641     }
642 
643     double[] lowerBound, upperBound;
644     if(confLvl == 0) {
645         // Then we're going to skip the computation to save time.  (See below.)
646         lowerBound = betas;
647         upperBound = betas;
648     } else {
649         lowerBound = new double[betas.length];
650         upperBound = new double[betas.length];
651     }
652     auto p = new double[betas.length];
653 
654     foreach(i, beta; betas) {
655         try {
656             p[i] = 2 * min(studentsTCDF(beta / stdErr[i], df),
657                            studentsTCDFR(beta / stdErr[i], df));
658         } catch(DstatsArgumentException) {
659             // Leave it as a NaN.
660         }
661 
662         if(confLvl > 0) {
663             // Skip confidence level computation if level is zero, to save
664             // computation time.  This is important in monte carlo and possibly
665             // data mining contexts.
666             try {
667                 double delta = invStudentsTCDF(0.5 * (1 - confLvl), df) *
668                      stdErr[i];
669                 upperBound[i] = beta - delta;
670                 lowerBound[i] = beta + delta;
671             } catch(DstatsArgumentException) {
672                 // Leave confidence bounds as NaNs.
673             }
674         }
675     }
676 
677     double F = (R2 / (X.length - 1)) / ((1 - R2) / (n - X.length));
678     double overallP;
679     try {
680         overallP = fisherCDFR(F, X.length - 1, n - X.length);
681     } catch(DstatsArgumentException) {
682         // Leave it as a NaN.
683     }
684 
685     return RegressRes(betas, stdErr, lowerBound, upperBound, p, R2,
686         adjustedR2, sqrt(sigma2), overallP);
687 }
688 
689 
690 /**Struct returned by polyFit.*/
691 struct PolyFitRes(T) {
692 
693     /**The array of PowMap ranges created by polyFit.*/
694     T X;
695 
696     /**The rest of the results.  This is alias this'd.*/
697     RegressRes regressRes;
698     alias regressRes this;
699 }
700 
701 /**Convenience function that takes a forward range X and a forward range Y,
702  * creates an array of PowMap structs for integer powers from 0 through N,
703  * and calls linearRegressBeta.
704  *
705  * Returns:  An array of doubles.  The index of each element corresponds to
706  * the exponent.  For example, the X<sup>2</sup> term will have an index of
707  * 2.
708  */
709 double[] polyFitBeta(T, U)(U Y, T X, uint N, double ridge = 0) {
710     double[] dummy;
711     return polyFitBetaBuf!(T, U)(dummy, Y, X, N);
712 }
713 
714 /**Same as polyFitBeta, but allows the caller to provide an explicit buffer
715  * to return the coefficients in.  If it's too short, a new one will be
716  * allocated.  Otherwise, results will be returned in the user-provided buffer.
717  */
718 double[] polyFitBetaBuf(T, U)(double[] buf, U Y, T X, uint N, double ridge = 0) {
719     auto alloc = newRegionAllocator();
720     auto pows = alloc.uninitializedArray!(PowMap!(uint, T)[])(N + 1);
721     foreach(exponent; 0..N + 1) {
722         pows[exponent] = powMap(X, exponent);
723     }
724 
725     if(ridge == 0) {
726         return linearRegressBetaBuf(buf, Y, pows);
727     } else {
728         return linearRegressBetaBuf(buf, Y, pows, ridge);
729     }
730 }
731 
732 /**Convenience function that takes a forward range X and a forward range Y,
733  * creates an array of PowMap structs for integer powers 0 through N,
734  * and calls linearRegress.
735  *
736  * Returns:  A PolyFitRes containing the array of PowMap structs created and
737  * a RegressRes.  The PolyFitRes is alias this'd to the RegressRes.*/
738 PolyFitRes!(PowMap!(uint, T)[])
739 polyFit(T, U)(U Y, T X, uint N, double confInt = 0.95) {
740     enforceConfidence(confInt);
741     auto pows = new PowMap!(uint, T)[N + 1];
742     foreach(exponent; 0..N + 1) {
743         pows[exponent] = powMap(X, exponent);
744     }
745     alias PolyFitRes!(typeof(pows)) RT;
746     RT ret;
747     ret.X = pows;
748     ret.regressRes = linearRegress(Y, pows, confInt);
749     return ret;
750 }
751 
752 unittest {
753     // These are a bunch of values gleaned from various examples on the Web.
754     double[] heights = [1.47,1.5,1.52,1.55,1.57,1.60,1.63,1.65,1.68,1.7,1.73,1.75,
755         1.78,1.8,1.83];
756     double[] weights = [52.21,53.12,54.48,55.84,57.2,58.57,59.93,61.29,63.11,64.47,
757         66.28,68.1,69.92,72.19,74.46];
758     float[] diseaseSev = [1.9, 3.1, 3.3, 4.8, 5.3, 6.1, 6.4, 7.6, 9.8, 12.4];
759     ubyte[] temperature = [2,1,5,5,20,20,23,10,30,25];
760 
761     // Values from R.
762     auto res1 = polyFit(diseaseSev, temperature, 1);
763     assert(approxEqual(res1.betas[0], 2.6623));
764     assert(approxEqual(res1.betas[1], 0.2417));
765     assert(approxEqual(res1.stdErr[0], 1.1008));
766     assert(approxEqual(res1.stdErr[1], 0.0635));
767     assert(approxEqual(res1.p[0], 0.0419));
768     assert(approxEqual(res1.p[1], 0.0052));
769     assert(approxEqual(res1.R2, 0.644));
770     assert(approxEqual(res1.adjustedR2, 0.6001));
771     assert(approxEqual(res1.residualError, 2.03));
772     assert(approxEqual(res1.overallP, 0.00518));
773 
774 
775     auto res2 = polyFit(weights, heights, 2);
776     assert(approxEqual(res2.betas[0], 128.813));
777     assert(approxEqual(res2.betas[1], -143.162));
778     assert(approxEqual(res2.betas[2], 61.960));
779 
780     assert(approxEqual(res2.stdErr[0], 16.308));
781     assert(approxEqual(res2.stdErr[1], 19.833));
782     assert(approxEqual(res2.stdErr[2], 6.008));
783 
784     assert(approxEqual(res2.p[0], 4.28e-6));
785     assert(approxEqual(res2.p[1], 1.06e-5));
786     assert(approxEqual(res2.p[2], 2.57e-7));
787 
788     assert(approxEqual(res2.R2, 0.9989, 0.0001));
789     assert(approxEqual(res2.adjustedR2, 0.9987, 0.0001));
790 
791     assert(approxEqual(res2.lowerBound[0], 92.9, 0.01));
792     assert(approxEqual(res2.lowerBound[1], -186.8, 0.01));
793     assert(approxEqual(res2.lowerBound[2], 48.7, 0.01));
794     assert(approxEqual(res2.upperBound[0], 164.7, 0.01));
795     assert(approxEqual(res2.upperBound[1], -99.5, 0.01));
796     assert(approxEqual(res2.upperBound[2], 75.2, 0.01));
797 
798     auto res3 = linearRegress(weights, repeat(1), heights, map!"a * a"(heights));
799     assert(res2.betas == res3.betas);
800 
801     double[2] beta1Buf;
802     auto beta1 = linearRegressBetaBuf
803         (beta1Buf[], diseaseSev, repeat(1), temperature);
804     assert(beta1Buf.ptr == beta1.ptr);
805     assert(beta1Buf[] == beta1[]);
806     assert(approxEqual(beta1, res1.betas));
807     auto beta2 = polyFitBeta(weights, heights, 2);
808     assert(approxEqual(beta2, res2.betas));
809 
810     auto res4 = linearRegress(weights, repeat(1), heights);
811     assert(approxEqual(res4.p, 3.604e-14));
812     assert(approxEqual(res4.betas, [-39.062, 61.272]));
813     assert(approxEqual(res4.p, [6.05e-9, 3.60e-14]));
814     assert(approxEqual(res4.R2, 0.9892));
815     assert(approxEqual(res4.adjustedR2, 0.9884));
816     assert(approxEqual(res4.residualError, 0.7591));
817     assert(approxEqual(res4.lowerBound, [-45.40912, 57.43554]));
818     assert(approxEqual(res4.upperBound, [-32.71479, 65.10883]));
819 
820     // Test residuals.
821     assert(approxEqual(residuals(res4.betas, weights, repeat(1), heights),
822         [1.20184170, 0.27367611,  0.40823237, -0.06993322,  0.06462305,
823          -0.40354255, -0.88170814,  -0.74715188, -0.76531747, -0.63076120,
824          -0.65892680, -0.06437053, -0.08253613,  0.96202014,  1.39385455]));
825 
826     // Test nonzero ridge parameters.
827         // Values from R's MASS package.
828     auto a = [1, 2, 3, 4, 5, 6, 7];
829     auto b = [8, 6, 7, 5, 3, 0, 9];
830     auto c = [2, 7, 1, 8, 2, 8, 1];
831 
832     // With a ridge param. of zero, ridge regression reduces to regular
833     // OLS regression.
834     assert(approxEqual(linearRegressBeta(a, repeat(1), b, c, 0),
835         linearRegressBeta(a, repeat(1), b, c)));
836 
837     // Test the ridge regression. Values from R MASS package.
838     auto ridge1 = linearRegressBeta(a, repeat(1), b, c, 1);
839     auto ridge2 = linearRegressBeta(a, repeat(1), b, c, 2);
840     auto ridge3 = linearRegressBeta(c, [[1,1,1,1,1,1,1], a, b], 10);
841     assert(approxEqual(ridge1, [6.0357757, -0.2729671, -0.1337131]));
842     assert(approxEqual(ridge2, [5.62367784, -0.22449854, -0.09775174]));
843     assert(approxEqual(ridge3, [5.82653624, -0.05197246, -0.27185592 ]));
844 }
845 
846 private MeanSD[] calculateSummaries(X...)(X xIn, RegionAllocator alloc) {
847     // This is slightly wasteful because it sticks this shallow dup in
848     // an unfreeable pos on TempAlloc.
849     static if(X.length == 1 && isRoR!(X[0])) {
850         auto ret = alloc.uninitializedArray!(MeanSD[])(xIn[0].length);
851         auto alloc2 = newRegionAllocator();
852         auto x = alloc2.array(xIn[0]);
853 
854         foreach(ref range; x) {
855             range = range.save;
856         }
857 
858         enum allHaveLength = hasLength!(ElementType!(typeof(x)));
859 
860     } else {
861         auto ret = alloc2.uninitializedArray!MeanSD(xIn.length);
862         alias xIn x;
863 
864         foreach(ti, R; X) {
865             x[ti] = x[ti].save;
866         }
867 
868         enum allHaveLength = allSatisfy!(hasLength, X);
869     }
870 
871     ret[] = MeanSD.init;
872 
873     static if(allHaveLength) {
874         size_t minLen = size_t.max;
875         foreach(range; x) {
876             minLen = min(minLen, range.length);
877         }
878 
879         foreach(i, range; x) {
880             ret[i] = meanStdev(take(range, minLen));
881         }
882 
883     } else {
884         bool someEmpty() {
885             foreach(range; x) {
886                 if(range.empty) return true;
887             }
888 
889             return false;
890         }
891 
892         void popAll() {
893             foreach(ti, elem; x) {
894                 x[ti].popFront();
895             }
896         }
897 
898         while(!someEmpty) {
899             foreach(i, range; x) {
900                 ret[i].put(range.front);
901             }
902             popAll();
903         }
904     }
905 
906     return ret;
907 }
908 
909 private double softThresh(double z, double gamma) {
910     if(gamma >= abs(z)) {
911         return 0;
912     } else if(z > 0) {
913         return z - gamma;
914     } else {
915         return z + gamma;
916     }
917 }
918 
919 private struct PreprocessedData {
920     MeanSD[] xSumm;
921     MeanSD ySumm;
922     double[] y;
923     double[][] x;
924 }
925 
926 private PreprocessedData preprocessStandardize(Y, X...)
927 (Y yIn, X xIn, RegionAllocator alloc) {
928     static if(X.length == 1 && isRoR!(X[0])) {
929         auto xRaw = alloc.array(xIn[0]);
930     } else {
931         alias xIn xRaw;
932     }
933 
934     auto summaries = calculateSummaries(xRaw, alloc);
935     immutable minLen = to!size_t(
936         reduce!min(
937             map!"a.N"(summaries)
938         )
939     );
940 
941     auto x = alloc.uninitializedArray!(double[][])(summaries.length);
942     foreach(i, range; xRaw) {
943         x[i] = alloc.array(map!(to!double)(take(range, minLen)));
944         x[i][] -= summaries[i].mean;
945         x[i][] /= sqrt(summaries[i].mse);
946     }
947 
948     double[] y;
949     MeanSD ySumm;
950     if(yIn.length) {
951         y = alloc.array(map!(to!double)(take(yIn, minLen)));
952         ySumm = meanStdev(y);
953         y[] -= ySumm.mean;
954     }
955 
956     return PreprocessedData(summaries, ySumm, y, x);
957 }
958 
959 /**
960 Performs lasso (L1) and/or ridge (L2) penalized linear regression.  Due to the
961 way the data is standardized, no intercept term should be included in x
962 (unlike linearRegress and linearRegressBeta).  The intercept coefficient is
963 implicitly included and returned in the first element of the returned array.
964 Usage is otherwise identical.
965 
966 Note:  Setting lasso equal to zero is equivalent to performing ridge regression.
967        This can also be done with linearRegressBeta.  However, the
968        linearRegressBeta algorithm is optimized for memory efficiency and
969        large samples.  This algorithm is optimized for large feature sets.
970 
971 Returns:  The beta coefficients for the regression model.
972 
973 References:
974 
975 Friedman J, et al Pathwise coordinate optimization. Ann. Appl. Stat.
976 2007;2:302-332.
977 
978 Goeman, J. J., L1 penalized estimation in the Cox proportional hazards model.
979 Biometrical Journal 52(1), 70{84.
980 
981 Eilers, P., Boer, J., Van Ommen, G., Van Houwelingen, H. 2001 Classification of
982 microarray data with penalized logistic regression. Proceedings of SPIE.
983 Progress in Biomedical Optics and Images vol. 4266, pp. 187-198
984 
985 Douglas M. Hawkins and Xiangrong Yin.  A faster algorithm for ridge regression
986 of reduced rank data.  Computational Statistics & Data Analysis Volume 40,
987 Issue 2, 28 August 2002, Pages 253-262
988 
989 http://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
990 */
991 double[] linearRegressPenalized(Y, X...)
992 (Y yIn, X xIn, double lasso, double ridge) {
993     auto alloc = newRegionAllocator();
994 
995     auto preproc = preprocessStandardize(yIn, xIn, alloc);
996 
997     auto summaries = preproc.xSumm;
998     auto ySumm = preproc.ySumm;
999     auto x = preproc.x;
1000     auto y = preproc.y;
1001 
1002     auto betasFull = new double[x.length + 1];
1003     betasFull[] = 0;
1004     auto betas = betasFull[1..$];
1005 
1006     if(lasso > 0) {
1007         coordDescent(y, x, betas, lasso, ridge, null);
1008     } else if(y.length > x.length) {
1009         // Correct for different )#*$# scaling conventions.
1010         foreach(i, feature; x) {
1011             feature[] /= sqrt(summaries[i].mse);
1012         }
1013 
1014         linearRegressBetaBuf(betas, y, x, ridge);
1015 
1016         // More correction for diff. scaling factors.
1017         foreach(i, ref b; betas) {
1018             b /= sqrt(summaries[i].mse);
1019         }
1020 
1021     } else {
1022         ridgeLargeP(y, x, ridge, betas, null);
1023     }
1024 
1025     foreach(i, ref elem; betas) {
1026         elem /= sqrt(summaries[i].mse);
1027     }
1028 
1029     betasFull[0] = ySumm.mean;
1030     foreach(i, beta; betas) {
1031         betasFull[0] -= beta * summaries[i].mean;
1032     }
1033 
1034     return betasFull;
1035 }
1036 
1037 private void coordDescent(
1038     double[] y,
1039     double[][] x,
1040     double[] betas,
1041     double lasso,
1042     double ridge,
1043     double[] w
1044 ) {
1045     auto alloc = newRegionAllocator();
1046 
1047     auto predictions = alloc.uninitializedArray!(double[])(y.length);
1048     predictions[] = 0;
1049 
1050     void makePredictions() {
1051         foreach(j, beta; betas) {
1052             predictions[] += x[j][] * beta;
1053         }
1054     }
1055 
1056     if(reduce!max(0.0, map!abs(betas)) > 0) {
1057         makePredictions();
1058     }
1059 
1060     auto residuals = alloc.uninitializedArray!(double[])(y.length);
1061 
1062     uint iter = 0;
1063     enum maxIter = 10_000;
1064     enum relEpsilon = 1e-5;
1065     enum absEpsilon = 1e-10;
1066     immutable n = cast(double) y.length;
1067     auto perm = alloc.array(iota(0U, x.length));
1068 
1069     auto weightDots = alloc.uninitializedArray!(double[])(x.length);
1070     if(w.length == 0) {
1071         ridge /= n;
1072         lasso /= n;
1073         weightDots[] = 1;
1074     } else {
1075         foreach(j, col; x) {
1076             weightDots[j] = dotProduct(w, map!"a * a"(col));
1077         }
1078     }
1079 
1080     double doIter(double[] betas, double[][] x) {
1081         double maxRelError = 0;
1082         bool predictionsChanged = true;
1083 
1084         foreach(j, ref b; betas) {
1085             if(b == 0) {
1086                 if(predictionsChanged) {
1087                     residuals[] = y[] - predictions[];
1088                 }
1089             } else {
1090                 predictions[] -= x[j][] * b;
1091                 residuals[] = y[] - predictions[];
1092             }
1093 
1094             double z;
1095             if(w.length) {
1096                 z = 0;
1097                 foreach(i, weight; w) {
1098                     z += weight * x[j][i] * residuals[i];
1099                 }
1100             } else {
1101                 z = dotProduct(residuals, x[j]) / n;
1102             }
1103 
1104             immutable newB = softThresh(z, lasso) / (weightDots[j] + ridge);
1105             immutable absErr = abs(b - newB);
1106             immutable err = absErr / max(abs(b), abs(newB));
1107 
1108             if(absErr > absEpsilon) {
1109                 maxRelError = max(maxRelError, err);
1110             }
1111 
1112             if(b == 0 && newB == 0) {
1113                 predictionsChanged = false;
1114             } else {
1115                 predictionsChanged = true;
1116             }
1117 
1118             b = newB;
1119             if(b != 0) {
1120                 predictions[] += x[j][] * b;
1121             }
1122         }
1123 
1124         return maxRelError;
1125     }
1126 
1127     void toConvergence() {
1128         double maxRelErr = doIter(betas, x);
1129         iter++;
1130         if(maxRelErr < relEpsilon) return;
1131 
1132         static bool absGreater(double x, double y) { return abs(x) > abs(y); }
1133 
1134         while(iter < maxIter) {
1135             size_t split = 0;
1136             while(split < betas.length && abs(betas[split]) > 0) {
1137                 split++;
1138             }
1139 
1140             try {
1141                 qsort!absGreater(betas, x, perm, weightDots);
1142             } catch(SortException) {
1143                 betas[] = double.nan;
1144                 break;
1145             }
1146 
1147             maxRelErr = double.infinity;
1148             for(; !(maxRelErr < relEpsilon) && split < betas.length
1149             && iter < maxIter; iter++) {
1150                 maxRelErr = doIter(betas[0..split], x[0..split]);
1151             }
1152 
1153             maxRelErr = doIter(betas, x);
1154             iter++;
1155             if(maxRelErr < relEpsilon) break;
1156         }
1157     }
1158 
1159     toConvergence();
1160     try {
1161         qsort(perm, x, betas);
1162     } catch(SortException) {
1163         return;
1164     }
1165 }
1166 
1167 /*
1168 Ridge regression, case where P > N, where P is number of features and N
1169 is number of samples.
1170 */
1171 private void ridgeLargeP(
1172     const(double)[] y,
1173     const double[][] x,
1174     immutable double lambda,
1175     double[] betas,
1176     const double[] w = null
1177 ) {
1178     static if(haveSvd) {
1179         return eilersRidge(y, x, lambda, betas, w);
1180     } else {
1181         return shermanMorrisonRidge(y, x, lambda, betas, w);
1182     }
1183 }
1184 
1185 version(scid) {
1186     version(nodeps) {
1187         enum haveSvd = false;
1188     } else {
1189         enum haveSvd = true;
1190     }
1191 } else {
1192     enum haveSvd = false;
1193 }
1194 
1195 /*
1196 An implementation of ridge regression for large dimension.  Taken from:
1197 
1198 Eilers, P., Boer, J., Van Ommen, G., Van Houwelingen, H. 2001 Classification of
1199 microarray data with penalized logistic regression. Proceedings of SPIE.
1200 Progress in Biomedical Optics and Images vol. 4266, pp. 187-198
1201 
1202 This algorithm is very fast, O(N^2 * P) but requires singular value
1203 decomposition.  Therefore, we only use if we're using SciD with full
1204 dependency support.
1205 */
1206 static if(haveSvd) private void eilersRidge(
1207     const(double)[] yArr,
1208     const double[][] x,
1209     immutable double lambda,
1210     double[] betas,
1211     const double[] weightArr
1212 ) {
1213     if(x.length == 0) return;
1214     auto alloc = newRegionAllocator();
1215     immutable n = x[0].length;
1216     immutable p = x.length;
1217 
1218     auto xMat = ExternalMatrixView!double(n, p, alloc);
1219     foreach(i; 0..n) foreach(j; 0..p) {
1220         xMat[i, j] = x[j][i];
1221     }
1222 
1223     typeof(svdDestructive(xMat, SvdType.economy, alloc)) svdRes;
1224     try {
1225         svdRes = svdDestructive(xMat, SvdType.economy, alloc);
1226     } catch(Exception) {
1227         betas[] = double.nan;
1228         return;
1229     }
1230 
1231     auto us = eval(svdRes.u * svdRes.s, alloc);
1232     ExternalMatrixView!double usWus;
1233     ExternalVectorView!double usWy;
1234 
1235     // Have to cast away const because ExternalVectorView doesn't play
1236     // nice w/ it yet.
1237     auto y = ExternalVectorView!double(cast(double[]) yArr);
1238 
1239     if(weightArr.length) {
1240         // Once we've multiplied s by u, we don't need it anymore.  Overwrite
1241         // its contents with the weight array to get weights in the form of
1242         // a diagonal matrix.
1243         auto w = svdRes.s;
1244         svdRes.s = typeof(svdRes.s).init;
1245 
1246         foreach(i, weight; weightArr) {
1247             w[i, i] = weight;
1248         }
1249 
1250         usWus = eval(us.t * w * us, alloc);
1251         usWy = eval(us.t * w * y, alloc);
1252     } else {
1253         usWus = eval(us.t * us, alloc);
1254         usWy = eval(us.t * y, alloc);
1255     }
1256 
1257     assert(usWus.rows == usWus.columns);
1258     assert(usWus.rows == n);
1259     foreach(i; 0..n) {
1260         usWus[i, i] += lambda;
1261     }
1262 
1263     auto theta = eval(inv(usWus) * usWy, alloc);
1264 
1265     // Work around weird SciD bug by transposing matrix manually.
1266     auto v = ExternalMatrixView!(double, StorageOrder.RowMajor)(
1267         svdRes.vt.columns,
1268         svdRes.vt.data[0..svdRes.vt.rows * svdRes.vt.columns]
1269     );
1270     eval(v * theta, betas);
1271 }
1272 
1273 /*
1274 This algorithm is used as a fallback in case we don't have SVD available.
1275 It's O(N P^2) instead of O(N^2 * P).  It was adapted from the
1276 following paper:
1277 
1278 Douglas M. Hawkins and Xiangrong Yin.  A faster algorithm for ridge regression
1279 of reduced rank data.  Computational Statistics & Data Analysis Volume 40,
1280 Issue 2, 28 August 2002, Pages 253-262
1281 
1282 It also uses Wikipedia's page on Sherman-Morrison:
1283 
1284 http://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
1285 
1286 I simplified it by only doing rank-1 updates of inv(x' * x * lambda * I),
1287 since I'm just trying to calculate the ridge estimate, not do efficient
1288 cross-validation.  The idea is to start with an empty dataset.  Here,
1289 inv(x' * x + lambda * I) = 1 / lambda * I.  Then, add each sample (row of x)
1290 sequentially.  This is equivalent to adding s * s' to (x' * x + lambda * I)
1291 where s is the vector of predictors for the sample.  This is known as a
1292 dyadic product.  If p is the number of features/dimensions, and n is the
1293 number of samples, we have n updates, each of which is O(P^2) for an
1294 O(N * P^2) algorithm.  Naive algoriths would be O(P^3).
1295 */
1296 static if(!haveSvd)
1297 private void shermanMorrisonRidge(
1298     const(double)[] y,
1299     const(double[])[] x,  // Column major
1300     immutable double lambda,
1301     double[] betas,
1302     const(double)[] w
1303 ) in {
1304     assert(lambda > 0);
1305     foreach(col; x) assert(col.length == x[0].length);
1306     if(x.length) assert(y.length == x[0].length);
1307 } do {
1308     auto alloc = newRegionAllocator();
1309     immutable p = x.length;
1310     if(p == 0) return;
1311     immutable n = x[0].length;
1312 
1313     auto v = alloc.uninitializedArray!(double[])(p);
1314     double[] u;
1315     if(w.length) {
1316         u = alloc.uninitializedArray!(double[])(p);
1317     } else {
1318         u = v;
1319     }
1320 
1321     auto vTxTxNeg1 = alloc.uninitializedArray!(double[])(p);
1322     auto xTxNeg1u = alloc.uninitializedArray!(double[])(p);
1323 
1324     // Before any updates are done, x'x = I * lambda, so inv(x'x) = I / lambda.
1325     auto xTxNeg1 = alloc.uninitializedArray!(double[][])(p, p);
1326     foreach(i; 0..p) foreach(j; 0..p) {
1327         xTxNeg1[i][j] = (i == j) ? (1.0 / lambda) : 0;
1328     }
1329 
1330     foreach(sampleIndex; 0..n) {
1331         copy(transversal(x[], sampleIndex), v);
1332         if(w.length) {
1333             u[] = w[sampleIndex] * v[];
1334         }
1335 
1336         // Calculate denominator of update:  1 + v' * xTxNeg1 * u
1337         vTxTxNeg1[] = 0;
1338         foreach(rowIndex, row; xTxNeg1) {
1339             vTxTxNeg1[] += v[rowIndex] * row[];
1340         }
1341         immutable denom = 1.0 + dotProduct(vTxTxNeg1, u);
1342 
1343         // Calculate numerator.  The parentheses indicate how the operation
1344         // is coded.  This is for computational efficiency.  Removing the
1345         // parentheses would be mathematically equivalent due to associativity:
1346         // (xTxNeg1 * u) * (v' * xTxNeg1).
1347         xTxNeg1u[] = 0;
1348         foreach(rowIndex, row; xTxNeg1) {
1349             xTxNeg1u[rowIndex] = dotProduct(row, u);
1350         }
1351 
1352         foreach(i, row; xTxNeg1) {
1353             immutable multiplier = xTxNeg1u[i] / denom;
1354             row[] -= multiplier * vTxTxNeg1[];
1355         }
1356     }
1357 
1358     auto xTy = alloc.uninitializedArray!(double[])(p);
1359     if(w.length) {
1360         auto yw = alloc.uninitializedArray!(double[])(n);
1361         yw[] = y[] * w[];
1362         y = yw;
1363     }
1364 
1365     foreach(colIndex, col; x) {
1366         xTy[colIndex] = dotProduct(col[], y[]);
1367     }
1368 
1369     foreach(rowIndex, row; xTxNeg1) {
1370         betas[rowIndex] = dotProduct(row, xTy);
1371     }
1372 }
1373 
1374 unittest {
1375     // Test ridge regression.  We have three impls for all kinds of diff.
1376     // scenarios.  See if they all agree.  Note that the ridiculously small but
1377     // nonzero lasso param is to force the use of the coord descent algo.
1378     auto y = new double[12];
1379     auto x = new double[][16];
1380     foreach(ref elem; x) elem = new double[12];
1381     x[0][] = 1;
1382     auto gen = Random(31415);  // For random but repeatable results.
1383 
1384     foreach(iter; 0..1000) {
1385         foreach(col; x[1..$]) foreach(ref elem; col) elem = rNormal(0, 1, gen);
1386         foreach(ref elem; y) elem = rNormal(0, 1, gen);
1387         immutable ridge = uniform(0.1, 10.0, gen);
1388 
1389         auto normalEq = linearRegressBeta(y, x, ridge);
1390         auto coordDescent = linearRegressPenalized(
1391             y, x[1..$], double.min_normal, ridge);
1392         auto linalgTrick = linearRegressPenalized(y, x[1..$], 0, ridge);
1393 
1394         // Every once in a blue moon coordinate descent doesn't converge that
1395         // well.  These small errors are of no practical significance, hence
1396         // the wide tolerance.  However, if the direct normal equations
1397         // and linalg trick don't agree extremely closely, then something's
1398         // fundamentally wrong.
1399         assert(approxEqual(normalEq, coordDescent, 0.02, 1e-4), text(
1400             normalEq, coordDescent));
1401         assert(approxEqual(linalgTrick, coordDescent, 0.02, 1e-4), text(
1402             linalgTrick, coordDescent));
1403         assert(approxEqual(normalEq, linalgTrick, 1e-6, 1e-8), text(
1404             normalEq, linalgTrick));
1405     }
1406 
1407     // Test stuff that's got some lasso in it.  Values from R's Penalized
1408     // package.
1409     y = [1.0, 2.0, 3, 4, 5, 6, 7];
1410     x = [[8.0, 6, 7, 5, 3, 0, 9],
1411          [3.0, 6, 2, 4, 3, 6, 8],
1412          [3.0, 1, 4, 1, 5, 9, 2],
1413          [2.0, 7, 1, 8, 2, 8, 1]];
1414 
1415     assert(approxEqual(linearRegressPenalized(y, x, 1, 0),
1416         [4.16316, -0.3603197, 0.6308278, 0, -0.2633263]));
1417     assert(approxEqual(linearRegressPenalized(y, x, 1, 3),
1418         [2.519590, -0.09116883, 0.38067757, 0.13122413, -0.05637939]));
1419     assert(approxEqual(linearRegressPenalized(y, x, 2, 0.1),
1420         [1.247235, 0, 0.4440735, 0.2023602, 0]));
1421     assert(approxEqual(linearRegressPenalized(y, x, 5, 7),
1422         [3.453787, 0, 0.10968736, 0.01253992, 0]));
1423 }
1424 
1425 /**
1426 Computes a logistic regression using a maximum likelihood estimator
1427 and returns the beta coefficients.  This is a generalized linear model with
1428 the link function f(XB) = 1 / (1 + exp(XB)). This is generally used to model
1429 the probability that a binary Y variable is 1 given a set of X variables.
1430 
1431 For the purpose of this function, Y variables are interpreted as Booleans,
1432 regardless of their type.  X may be either a range of ranges or a tuple of
1433 ranges.  However, note that unlike in linearRegress, they are copied to an
1434 array if they are not random access ranges.  Note that each value is accessed
1435 several times, so if your range is a map to something expensive, you may
1436 want to evaluate it eagerly.
1437 
1438 If the last parameter passed in is a numeric value instead of a range,
1439 it is interpreted as a ridge parameter and ridge regression is performed.  This
1440 penalizes the L2 norm of the beta vector (in a scaled space) and results
1441 in more parsimonious models.  It limits the usefulness of inference techniques
1442 (p-values, confidence intervals), however, and is therefore not offered
1443 in logisticRegres().
1444 
1445 If no ridge parameter is passed, or equivalenty if the ridge parameter is
1446 zero, then ordinary maximum likelihood regression is performed.
1447 
1448 Note that, while this implementation of ridge regression was tested against
1449 the R Design Package implementation, it uses slightly different conventions
1450 that make the results not comparable without transformation.  dstats uses a
1451 biased estimate of the variance to scale the beta vector penalties, while
1452 Design uses an unbiased estimate.  Furthermore, Design penalizes by 1/2 of the
1453 L2 norm, whereas dstats penalizes by the L2 norm.  Therefore, if n is the
1454 sample size, and lambda is the penalty used with dstats, the proper penalty
1455 to use in Design to get the same results is 2 * (n - 1) * lambda / n.
1456 
1457 Also note that, as in linearRegress, repeat(1) can be used for the intercept
1458 term.
1459 
1460 Returns:  The beta coefficients for the regression model.
1461 
1462 References:
1463 
1464 http://en.wikipedia.org/wiki/Logistic_regression
1465 
1466 http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf
1467 
1468 S. Le Cessie and J. C. Van Houwelingen.  Ridge Estimators in Logistic
1469 Regression.  Journal of the Royal Statistical Society. Series C
1470 (Applied Statistics), Vol. 41, No. 1(1992), pp. 191-201
1471 
1472 Frank E Harrell Jr (2009). Design: Design Package. R package version 2.3-0.
1473 http://CRAN.R-project.org/package=Design
1474  */
1475 double[] logisticRegressBeta(T, U...)(T yIn, U xRidge) {
1476     static if(isFloatingPoint!(U[$ - 1]) || isIntegral!(U[$ - 1])) {
1477         alias xRidge[$ - 1] ridge;
1478         alias xRidge[0..$ - 1] xIn;
1479     } else {
1480         enum double ridge = 0.0;
1481         alias xRidge xIn;
1482     }
1483 
1484     return logisticRegressImpl(false, ridge, yIn, xIn).betas;
1485 }
1486 
1487 /**
1488 Plain old data struct to hold the results of a logistic regression.
1489 */
1490 struct LogisticRes {
1491     /**The coefficients, one for each range in X.  These will be in the order
1492      * that the X ranges were passed in.*/
1493     double[] betas;
1494 
1495     /**The standard error terms of the X ranges passed in.*/
1496     double[] stdErr;
1497 
1498     /**
1499     The Wald lower confidence bounds of the beta terms, at the confidence level
1500     specificied.  (Default 0.95).*/
1501     double[] lowerBound;
1502 
1503     /**
1504     The Wald upper confidence bounds of the beta terms, at the confidence level
1505     specificied.  (Default 0.95).*/
1506     double[] upperBound;
1507 
1508     /**
1509     The P-value for the alternative that the corresponding beta value is
1510     different from zero against the null that it is equal to zero.  These
1511     are calculated using the Wald Test.*/
1512     double[] p;
1513 
1514     /**
1515     The log likelihood for the null model.
1516     */
1517     double nullLogLikelihood;
1518 
1519     /**
1520     The log likelihood for the model fit.
1521     */
1522     double logLikelihood;
1523 
1524     /**
1525     Akaike Information Criterion, which is a complexity-penalized goodness-
1526     of-fit score, equal to 2 * k - 2 log(L) where L is the log likelihood and
1527     k is the number of parameters.
1528     */
1529     double aic() const pure nothrow @property @safe {
1530         return 2 * (betas.length - logLikelihood);
1531     }
1532 
1533     /**
1534     The P-value for the model as a whole, based on the likelihood ratio test.
1535     The null here is that the model has no predictive value, the alternative
1536     is that it does have predictive value.*/
1537     double overallP;
1538 
1539     // Just used internally.
1540     private static string arrStr(T)(T arr) {
1541         return text(arr)[1..$ - 1];
1542     }
1543 
1544     /**Print out the results in the default format.*/
1545     string toString() {
1546         return "Betas:  " ~ arrStr(betas) ~ "\nLower Conf. Int.:  " ~
1547             arrStr(lowerBound) ~ "\nUpper Conf. Int.:  " ~ arrStr(upperBound) ~
1548             "\nStd. Err:  " ~ arrStr(stdErr) ~ "\nP Values:  " ~ arrStr(p) ~
1549             "\nNull Log Likelihood:  " ~ text(nullLogLikelihood) ~
1550             "\nLog Likelihood:  " ~ text(logLikelihood) ~
1551             "\nAIC:  " ~ text(aic) ~
1552             "\nOverall P:  " ~ text(overallP);
1553     }
1554 }
1555 
1556 /**
1557 Similar to logisticRegressBeta, but returns a LogisticRes with useful stuff for
1558 statistical inference.  If the last element of input is a floating point
1559 number instead of a range, it is used to specify the confidence interval
1560 calculated.  Otherwise, the default of 0.95 is used.
1561 
1562 References:
1563 
1564 http://en.wikipedia.org/wiki/Wald_test
1565 http://en.wikipedia.org/wiki/Akaike_information_criterion
1566 */
1567 LogisticRes logisticRegress(T, V...)(T yIn, V input) {
1568     return logisticRegressImpl!(T, V)(true, 0, yIn, input);
1569 }
1570 
1571 unittest {
1572     // Values from R.  Confidence intervals from confint.default().
1573     // R doesn't automatically calculate likelihood ratio P-value, and reports
1574     // deviations instead of log likelihoods.  Deviations are just
1575     // -2 * likelihood.
1576     alias approxEqual ae;  // Save typing.
1577 
1578     // Start with the basics, with X as a ror.
1579     auto y1 =  [1,   0, 0, 0, 1, 0, 0];
1580     auto x1 = [[1.0, 1 ,1 ,1 ,1 ,1 ,1],
1581               [8.0, 6, 7, 5, 3, 0, 9]];
1582     auto res1 = logisticRegress(y1, x1);
1583     assert(ae(res1.betas[0], -0.98273));
1584     assert(ae(res1.betas[1], 0.01219));
1585     assert(ae(res1.stdErr[0], 1.80803));
1586     assert(ae(res1.stdErr[1], 0.29291));
1587     assert(ae(res1.p[0], 0.587));
1588     assert(ae(res1.p[1], 0.967));
1589     assert(ae(res1.aic, 12.374));
1590     assert(ae(res1.logLikelihood, -0.5 * 8.3758));
1591     assert(ae(res1.nullLogLikelihood, -0.5 * 8.3740));
1592     assert(ae(res1.lowerBound[0], -4.5264052));
1593     assert(ae(res1.lowerBound[1], -0.5618933));
1594     assert(ae(res1.upperBound[0], 2.560939));
1595     assert(ae(res1.upperBound[1], 0.586275));
1596 
1597     // Use tuple.
1598     auto y2   = [1,0,1,1,0,1,0,0,0,1,0,1];
1599     auto x2_1 = [3,1,4,1,5,9,2,6,5,3,5,8];
1600     auto x2_2 = [2,7,1,8,2,8,1,8,2,8,4,5];
1601     auto res2 = logisticRegress(y2, repeat(1), x2_1, x2_2);
1602     assert(ae(res2.betas[0], -1.1875));
1603     assert(ae(res2.betas[1], 0.1021));
1604     assert(ae(res2.betas[2], 0.1603));
1605     assert(ae(res2.stdErr[0], 1.5430));
1606     assert(ae(res2.stdErr[1], 0.2507));
1607     assert(ae(res2.stdErr[2], 0.2108));
1608     assert(ae(res2.p[0], 0.442));
1609     assert(ae(res2.p[1], 0.684));
1610     assert(ae(res2.p[2], 0.447));
1611     assert(ae(res2.aic, 21.81));
1612     assert(ae(res2.nullLogLikelihood, -0.5 * 16.636));
1613     assert(ae(res2.logLikelihood, -0.5 * 15.810));
1614     assert(ae(res2.lowerBound[0], -4.2116584));
1615     assert(ae(res2.lowerBound[1], -0.3892603));
1616     assert(ae(res2.lowerBound[2], -0.2528110));
1617     assert(ae(res2.upperBound[0], 1.8366823));
1618     assert(ae(res2.upperBound[1], 0.5934631));
1619     assert(ae(res2.upperBound[2], 0.5733693));
1620 
1621     auto x2Intercept = [1,1,1,1,1,1,1,1,1,1,1,1];
1622     auto res2a = logisticRegress(y2,
1623         filter!"a.length"([x2Intercept, x2_1, x2_2]));
1624     foreach(ti, elem; res2a.tupleof) {
1625         assert(ae(elem, res2.tupleof[ti]));
1626     }
1627 
1628     // Use a huge range of values to test numerical stability.
1629 
1630     // The filter is to make y3 a non-random access range.
1631     auto y3 = filter!"a < 2"([1,1,1,1,0,0,0,0]);
1632     auto x3_1 = filter!"a > 0"([1, 1e10, 2, 2e10, 3, 3e15, 4, 4e7]);
1633     auto x3_2 = [1e8, 1e6, 1e7, 1e5, 1e3, 1e0, 1e9, 1e11];
1634     auto x3_3 = [-5e12, 5e2, 6e5, 4e3, -999999, -666, -3e10, -2e10];
1635     auto res3 = logisticRegress(y3, repeat(1), x3_1, x3_2, x3_3, 0.99);
1636     assert(ae(res3.betas[0], 1.115e0));
1637     assert(ae(res3.betas[1], -4.674e-15));
1638     assert(ae(res3.betas[2], -7.026e-9));
1639     assert(ae(res3.betas[3], -2.109e-12));
1640     assert(ae(res3.stdErr[0], 1.158));
1641     assert(ae(res3.stdErr[1], 2.098e-13));
1642     assert(ae(res3.stdErr[2], 1.878e-8));
1643     assert(ae(res3.stdErr[3], 4.789e-11));
1644     assert(ae(res3.p[0], 0.336));
1645     assert(ae(res3.p[1], 0.982));
1646     assert(ae(res3.p[2], 0.708));
1647     assert(ae(res3.p[3], 0.965));
1648     assert(ae(res3.aic, 12.544));
1649     assert(ae(res3.nullLogLikelihood, -0.5 * 11.0904));
1650     assert(ae(res3.logLikelihood, -0.5 * 4.5442));
1651     // Not testing confidence intervals b/c they'd be so buried in numerical
1652     // fuzz.
1653 
1654 
1655     // Test with a just plain huge dataset that R chokes for several minutes
1656     // on.  If you think this unittest is slow, try getting the reference
1657     // values from R.
1658     auto y4 = chain(
1659                 take(cycle([0,0,0,0,1]), 500_000),
1660                 take(cycle([1,1,1,1,0]), 500_000));
1661     auto x4_1 = iota(0, 1_000_000);
1662     auto x4_2 = array(map!(compose!(exp, "a / 1_000_000.0"))(x4_1));
1663     auto x4_3 = take(cycle([1,2,3,4,5]), 1_000_000);
1664     auto x4_4 = take(cycle([8,6,7,5,3,0,9]), 1_000_000);
1665     auto res4 = logisticRegress(y4, repeat(1), x4_1, x4_2, x4_3, x4_4, 0.99);
1666     assert(ae(res4.betas[0], -1.574));
1667     assert(ae(res4.betas[1], 5.625e-6));
1668     assert(ae(res4.betas[2], -7.282e-1));
1669     assert(ae(res4.betas[3], -4.381e-6));
1670     assert(ae(res4.betas[4], -8.343e-6));
1671     assert(ae(res4.stdErr[0], 3.693e-2));
1672     assert(ae(res4.stdErr[1], 7.188e-8));
1673     assert(ae(res4.stdErr[2], 4.208e-2));
1674     assert(ae(res4.stdErr[3], 1.658e-3));
1675     assert(ae(res4.stdErr[4], 8.164e-4));
1676     assert(ae(res4.p[0], 0));
1677     assert(ae(res4.p[1], 0));
1678     assert(ae(res4.p[2], 0));
1679     assert(ae(res4.p[3], 0.998));
1680     assert(ae(res4.p[4], 0.992));
1681     assert(ae(res4.aic, 1089339));
1682     assert(ae(res4.nullLogLikelihood, -0.5 * 1386294));
1683     assert(ae(res4.logLikelihood, -0.5 * 1089329));
1684     assert(ae(res4.lowerBound[0], -1.668899));
1685     assert(ae(res4.lowerBound[1], 5.439787e-6));
1686     assert(ae(res4.lowerBound[2], -0.8366273));
1687     assert(ae(res4.lowerBound[3], -4.27406e-3));
1688     assert(ae(res4.lowerBound[4], -2.111240e-3));
1689     assert(ae(res4.upperBound[0], -1.478623));
1690     assert(ae(res4.upperBound[1], 5.810089e-6));
1691     assert(ae(res4.upperBound[2], -6.198418e-1));
1692     assert(ae(res4.upperBound[3], 4.265302e-3));
1693     assert(ae(res4.upperBound[4], 2.084554e-3));
1694 
1695     // Test ridge stuff.
1696     auto ridge2 = logisticRegressBeta(y2, repeat(1), x2_1, x2_2, 3);
1697     assert(ae(ridge2[0], -0.40279319));
1698     assert(ae(ridge2[1], 0.03575638));
1699     assert(ae(ridge2[2], 0.05313875));
1700 
1701     auto ridge2_2 = logisticRegressBeta(y2, repeat(1), x2_1, x2_2, 2);
1702     assert(ae(ridge2_2[0], -0.51411490));
1703     assert(ae(ridge2_2[1], 0.04536590));
1704     assert(ae(ridge2_2[2], 0.06809964));
1705 }
1706 
1707 /// The logistic function used in logistic regression.
1708 double logistic(double xb) pure nothrow @safe {
1709     return 1.0 / (1 + exp(-xb));
1710 }
1711 
1712 /**
1713 Performs lasso (L1) and/or ridge (L2) penalized logistic regression.  Due to the
1714 way the data is standardized, no intercept term should be included in x
1715 (unlike logisticRegress and logisticRegressBeta).  The intercept coefficient is
1716 implicitly included and returned in the first element of the returned array.
1717 Usage is otherwise identical.
1718 
1719 Note:  Setting lasso equal to zero is equivalent to performing ridge regression.
1720        This can also be done with logisticRegressBeta.  However, the
1721        logisticRegressBeta algorithm is optimized for memory efficiency and
1722        large samples.  This algorithm is optimized for large feature sets.
1723 
1724 Returns:  The beta coefficients for the regression model.
1725 
1726 References:
1727 
1728 Friedman J, et al Pathwise coordinate optimization. Ann. Appl. Stat.
1729 2007;2:302-332.
1730 
1731 Goeman, J. J., L1 penalized estimation in the Cox proportional hazards model.
1732 Biometrical Journal 52(1), 70{84.
1733 
1734 Eilers, P., Boer, J., Van Ommen, G., Van Houwelingen, H. 2001 Classification of
1735 microarray data with penalized logistic regression. Proceedings of SPIE.
1736 Progress in Biomedical Optics and Images vol. 4266, pp. 187-198
1737 
1738 Douglas M. Hawkins and Xiangrong Yin.  A faster algorithm for ridge regression
1739 of reduced rank data.  Computational Statistics & Data Analysis Volume 40,
1740 Issue 2, 28 August 2002, Pages 253-262
1741 
1742 http://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
1743 */
1744 double[] logisticRegressPenalized(Y, X...)
1745 (Y yIn, X xIn, double lasso, double ridge) {
1746     auto alloc = newRegionAllocator();
1747 
1748     static assert(!isInfinite!Y, "Can't do regression with infinite # of Y's.");
1749     static if(isRandomAccessRange!Y) {
1750         alias yIn y;
1751     } else {
1752         auto y = toBools(yIn);
1753     }
1754 
1755     static if(X.length == 1 && isRoR!X) {
1756         enum bool tupleMode = false;
1757         static if(isForwardRange!X) {
1758             auto x = toRandomAccessRoR(y.length, xIn, alloc);
1759         } else {
1760             auto x = toRandomAccessRoR(y.length, alloc.array(xIn), alloc);
1761         }
1762     } else {
1763         enum bool tupleMode = true;
1764         auto xx = toRandomAccessTuple(xIn, alloc);
1765         auto x = xx.expand;
1766     }
1767 
1768     auto betas = new double[x.length + 1];
1769     if(y.length >= x.length && lasso == 0) {
1770         // Add intercept term.
1771         static if(tupleMode) {
1772             doMLENewton(betas, (double[]).init, ridge, y, repeat(1), x);
1773         } else static if(is(x == double[][])) {
1774             auto xInt = alloc.uninitializedArray!(double[])(betas.length);
1775             xInt[1..$] = x[];
1776             xInt[0] = alloc.array(repeat(1.0, y.length));
1777             doMLENewton(betas, (double[]).init, ridge, y, xInt);
1778         } else {
1779             // No choice but to dup the whole thing.
1780             auto xInt = alloc.uninitializedArray!(double[][])(betas.length);
1781             xInt[0] = alloc.array(repeat(1.0, y.length));
1782 
1783             foreach(i, ref arr; xInt[1..$]) {
1784                 arr = alloc.array(
1785                     map!(to!double)(take(x[i], y.length))
1786                 );
1787             }
1788 
1789             doMLENewton(betas, (double[]).init, ridge, y, xInt);
1790         }
1791 
1792     } else {
1793         logisticRegressPenalizedImpl(betas, lasso, ridge, y, x);
1794     }
1795 
1796     return betas;
1797 }
1798 
1799 unittest {
1800     // Test ridge regression.  We have three impls for all kinds of diff.
1801     // scenarios.  See if they all agree.  Note that the ridiculously small but
1802     // nonzero lasso param is to force the use of the coord descent algo.
1803     auto y = new bool[12];
1804     auto x = new double[][16];
1805     foreach(ref elem; x) elem = new double[12];
1806     x[0][] = 1;
1807     auto gen = Random(31415);  // For random but repeatable results.
1808 
1809     foreach(iter; 0..1000) {
1810         foreach(col; x[1..$]) foreach(ref elem; col) elem = rNormal(0, 1, gen);
1811 
1812         // Nothing will converge if y is all true's or all false's.
1813         size_t trueCount;
1814         do {
1815             foreach(ref elem; y) elem = cast(bool) rBernoulli(0.5, gen);
1816             trueCount = count!"a"(y);
1817         } while(trueCount == 0 || trueCount == y.length);
1818 
1819         immutable ridge = uniform(0.1, 10.0, gen);
1820 
1821         auto normalEq = logisticRegressBeta(y, x, ridge);
1822         auto coordDescent = logisticRegressPenalized(
1823             y, x[1..$], double.min_normal, ridge);
1824         auto linalgTrick = logisticRegressPenalized(y, x[1..$], 0, ridge);
1825 
1826         // Every once in a blue moon coordinate descent doesn't converge that
1827         // well.  These small errors are of no practical significance, hence
1828         // the wide tolerance.  However, if the direct normal equations
1829         // and linalg trick don't agree extremely closely, then something's
1830         // fundamentally wrong.
1831         assert(approxEqual(normalEq, coordDescent, 0.02, 1e-4), text(
1832             normalEq, coordDescent));
1833         assert(approxEqual(linalgTrick, coordDescent, 0.02, 1e-4), text(
1834             linalgTrick, coordDescent));
1835         assert(approxEqual(normalEq, linalgTrick, 1e-6, 1e-8), text(
1836             normalEq, linalgTrick));
1837     }
1838 
1839     assert(approxEqual(logisticRegressBeta(y, x[0], x[1], x[2]),
1840         logisticRegressPenalized(y, x[1], x[2], 0, 0)));
1841     assert(approxEqual(logisticRegressBeta(y, [x[0], x[1], x[2]]),
1842         logisticRegressPenalized(y, [x[1], x[2]], 0, 0)));
1843     assert(approxEqual(logisticRegressBeta(y, [x[0], x[1], x[2]]),
1844         logisticRegressPenalized(y,
1845         [to!(float[])(x[1]), to!(float[])(x[2])], 0, 0)));
1846 
1847     // Make sure the adding intercept stuff is right for the Newton path.
1848     //assert(logisticRegressBeta(x[0], x[1], x[2]) ==
1849 
1850     // Test stuff that's got some lasso in it.  Values from R's Penalized
1851     // package.
1852     y = [1, 0, 0, 1, 1, 1, 0];
1853     x = [[8.0, 6, 7, 5, 3, 0, 9],
1854          [3.0, 6, 2, 4, 3, 6, 8],
1855          [3.0, 1, 4, 1, 5, 9, 2],
1856          [2.0, 7, 1, 8, 2, 8, 1]];
1857 
1858     // Values from R's Penalized package.  Note that it uses a convention for
1859     // the ridge parameter such that Penalized ridge = 2 * dstats ridge.
1860     assert(approxEqual(logisticRegressPenalized(y, x, 1, 0),
1861         [1.642080, -0.22086515, -0.02587546,  0.00000000, 0.00000000 ]));
1862     assert(approxEqual(logisticRegressPenalized(y, x, 1, 3),
1863         [0.5153373, -0.04278257, -0.00888014,  0.01316831,  0.00000000]));
1864     assert(approxEqual(logisticRegressPenalized(y, x, 2, 0.1),
1865         [0.2876821, 0, 0., 0., 0]));
1866     assert(approxEqual(logisticRegressPenalized(y, x, 1.2, 7),
1867         [0.367613 , -0.017227631, 0.000000000, 0.003875104, 0.000000000]));
1868 }
1869 
1870 /**
1871 This function performs loess regression.  Loess regression is a local
1872 regression procedure, where a prediction of the dependent (y) variable
1873 is made from an observation of the independent (x) variable by weighted
1874 least squares over x values in the neighborhood of the value being evaluated.
1875 
1876 In the future a separate function may be included to perform loess regression
1877 with multiple predictors.  However, one predictor is the much more common
1878 case and the multiple predictor case will require a much different API
1879 and implementation, so for now only one predictor is supported.
1880 
1881 Params:
1882 
1883 y      = Observations of the dependent variable.
1884 x      = Observations of the independent variable.
1885 span   = The fraction of x observations considered to be "in the neighborhood"
1886          when performing local regression to predict the y value for a new x.
1887          For example, if 8 observations are provided and span == 0.5,
1888          the 4 nearest neighbors will be used on evaluation.
1889 degree = The polynomial degree of the local regression.  Must be less than
1890          the number of neighbors (span * x.length).
1891 
1892 Returns:
1893 
1894 A Loess1D object.  This object can be used to make predictions based on
1895 the loess model.  The computations involved are done lazily, i.e. this
1896 function sets up the Loess1D instance and returns without computing
1897 any regression models.
1898 
1899 Examples:
1900 ---
1901 auto x = [1, 2, 3, 4, 5, 6, 7];
1902 auto y = [3, 6, 2, 4, 3, 6, 8];
1903 
1904 // Build the Loess1D object.
1905 auto model = loess1D(y, x, 0.5);
1906 
1907 // Compute the weights for robust regression.
1908 model.computeRobustWeights(2);
1909 
1910 // Predict the value of y when x == 5.5, using a robustness level of 2.
1911 auto prediction = model.predict(5.5, 2);
1912 ---
1913 
1914 References:
1915 
1916 Cleveland, W.S. (1979). "Robust Locally Weighted Regression and Smoothing
1917 Scatterplots". Journal of the American Statistical Association 74 (368):
1918 829-836.
1919 */
1920 Loess1D loess1D(RX, RY)(
1921     RY y,
1922     RX x,
1923     double span,
1924     int degree = 1
1925 ) {
1926     dstatsEnforce(span > 0 && span <= 1, format(
1927         "Span must be >0 and <= 1 for loess1D, not %s.", span
1928     ));
1929     dstatsEnforce(degree >= 0, "Degree must be >= 0 for loess1D.");
1930 
1931     auto ret = new Loess1D;
1932     ret._x = array(map!(to!double)(x));
1933     ret._y = array(map!(to!double)(y));
1934     qsort(ret._x, ret._y);
1935 
1936     ret.nNeighbors = to!int(ret.x.length * span);
1937     dstatsEnforce(ret.nNeighbors > degree, format(
1938         "Cannot do degree %s loess with a window of only %s points.  " ~
1939         "Increase the span parameter.", degree, ret.nNeighbors
1940     ));
1941 
1942     ret._degree = degree;
1943     auto xPoly = new double[][degree];
1944     if(degree > 0) xPoly[0] = ret._x;
1945 
1946     foreach(d; 2..degree + 1) {
1947         xPoly[d - 1] = ret._x.dup;
1948         xPoly[d - 1][] ^^= d;
1949     }
1950 
1951     ret.xPoly = xPoly;
1952     return ret;
1953 }
1954 
1955 unittest {
1956     auto x = [1, 2, 3, 4, 5, 6, 7, 8];
1957     auto y = [3, 2, 8, 2, 6, 9, 0, 1];
1958 
1959     auto loess1 = loess1D(y, x, 0.75, 1);
1960 
1961     // Values from R's lowess() function.  This gets slightly different
1962     // results than loess(), probably due to disagreements bout windowing
1963     // details.
1964     assert(approxEqual(loess1.predictions(0),
1965         [2.9193046, 3.6620295, 4.2229953, 5.2642335, 5.3433985, 4.4225636,
1966          2.7719778, 0.6643268]
1967     ));
1968 
1969     loess1 = loess1D(y, x, 0.5, 1);
1970     assert(approxEqual(loess1.predictions(0),
1971         [2.1615941, 4.0041736, 4.5642738, 4.8631052, 5.7136895, 5.5642738,
1972          2.8631052, -0.1977227]
1973     ));
1974 
1975     assert(approxEqual(loess1.predictions(2),
1976         [2.2079526, 3.9809030, 4.4752888, 4.8849727, 5.7260333, 5.4465225,
1977          2.8769120, -0.1116018]
1978     ));
1979 
1980     // Test 0th and 2nd order using R's loess() function since lowess() doesn't
1981     // support anything besides first degree.
1982     auto loess0 = loess1D(y, x, 0.5, 0);
1983     assert(approxEqual(loess0.predictions(0),
1984         [3.378961, 4.004174, 4.564274, 4.863105, 5.713689, 5.564274, 2.863105,
1985          1.845369]
1986     ));
1987 
1988     // Not testing the last point.  R's loess() consistently gets slightly
1989     // different answers for the last point than either this function or
1990     // R's lowess() for degree > 0.  (This function and R's lowess() agree
1991     // when this happens.)  It's not clear which is right but the differences
1992     // are small and not practically important.
1993     auto loess2 = loess1D(y, x, 0.75, 2);
1994     assert(approxEqual(loess2.predictions(0)[0..$ - 1],
1995         [2.4029984, 4.1021339, 4.8288941, 4.5523535, 6.0000000, 6.4476465,
1996          3.7669741]
1997     ));
1998 }
1999 
2000 /**
2001 This class is returned from the loess1D function and holds the state of a
2002 loess regression with one predictor variable.
2003 */
2004 final class Loess1D {
2005 private:
2006     double[] _x;
2007     double[][] xPoly;
2008     double[] _y;
2009     double[][] yHat;
2010     double[][] residualWeights;
2011 
2012     int _degree;
2013     int nNeighbors;
2014 
2015     size_t nearestNeighborX(double point) const {
2016         auto sortedX = assumeSorted(x);
2017         auto trisected = sortedX.trisect(point);
2018 
2019         if(trisected[1].length) return trisected[0].length;
2020         if(!trisected[0].length) return 0;
2021         if(!trisected[2].length) return trisected[0].length - 1;
2022 
2023         if(point - trisected[0][trisected[0].length - 1] <
2024            trisected[2][0] - point
2025         ) {
2026             return trisected[0].length - 1;
2027         }
2028 
2029         return trisected[0].length;
2030     }
2031 
2032     static void computexTWx(
2033         const double[][] xPoly,
2034         const(double)[] weights,
2035         ref DoubleMatrix covMatrix
2036     ) {
2037         foreach(i; 0..xPoly.length) foreach(j; 0..i + 1) {
2038             covMatrix[i + 1, j + 1] = covMatrix[j + 1, i + 1] = threeDot(
2039                 xPoly[i], weights, xPoly[j]
2040             );
2041         }
2042 
2043         // Handle intercept terms
2044         foreach(i; 1..covMatrix.rows) {
2045             covMatrix[0, i] = covMatrix[i, 0] = dotProduct(weights, xPoly[i - 1]);
2046         }
2047 
2048         covMatrix[0, 0] = sum(weights);
2049     }
2050 
2051     static void computexTWy(
2052         const(double)[][] xPoly,
2053         const(double)[] y,
2054         const(double)[] weights,
2055         double[] ans
2056     ) {
2057         foreach(i; 0..xPoly.length) {
2058             ans[i + 1] = threeDot(xPoly[i], weights, y);
2059         }
2060 
2061         // Intercept:
2062         ans[0] = dotProduct(weights, y);
2063     }
2064 
2065 public:
2066     /**
2067     Predict the value of y when x == point, using robustness iterations
2068     of the biweight procedure outlined in the reference to make the
2069     estimates more robust.
2070 
2071     Notes:
2072 
2073     This function is computationally intensive but may be called
2074     from multiple threads simultaneously.  When predicting a
2075     large number of points, a parallel foreach loop may be used.
2076 
2077     Before calling this function with robustness > 0,
2078     computeRobustWeights() must be called.  See this function for details.
2079 
2080     Returns:  The predicted y value.
2081     */
2082     double predict(double point, int robustness = 0) const {
2083         dstatsEnforce(cast(int) residualWeights.length >= robustness,
2084             "For robust loess1D estimates, computeRobustWeights() " ~
2085             "must be called with the proper robustness level before " ~
2086             "calling predict()."
2087         );
2088 
2089         auto alloc = newRegionAllocator();
2090         auto covMatrix = doubleMatrix(degree + 1, degree + 1, alloc);
2091         auto xTy = alloc.uninitializedArray!(double[])(degree + 1);
2092         auto betas = alloc.uninitializedArray!(double[])(degree + 1);
2093         auto xPolyNeighbors = alloc.array(xPoly);
2094 
2095         size_t firstNeighbor = nearestNeighborX(point);
2096         size_t lastNeighbor = firstNeighbor;
2097 
2098         while(lastNeighbor - firstNeighbor + 1 < nNeighbors) {
2099             immutable upperDiff = (lastNeighbor + 1 >= x.length) ?
2100                 double.infinity : (x[lastNeighbor + 1] - point);
2101             immutable lowerDiff = (firstNeighbor == 0) ?
2102                 double.infinity : (point - x[firstNeighbor - 1]);
2103 
2104             if(upperDiff < lowerDiff) {
2105                 lastNeighbor++;
2106             } else {
2107                 firstNeighbor--;
2108             }
2109         }
2110 
2111         foreach(j, col; xPoly) {
2112             xPolyNeighbors[j] = xPoly[j][firstNeighbor..lastNeighbor + 1];
2113         }
2114         auto yNeighbors = y[firstNeighbor..lastNeighbor + 1];
2115         immutable maxDist = computeMaxDist(x[firstNeighbor..lastNeighbor + 1], point);
2116         auto w = alloc.uninitializedArray!(double[])(yNeighbors.length);
2117         foreach(j, neighbor; x[firstNeighbor..lastNeighbor + 1]) {
2118             immutable diff = abs(point - neighbor);
2119             w[j] = max(0, (1 - (diff / maxDist) ^^ 3) ^^ 3);
2120             if(robustness > 0) {
2121                 w[j] *= residualWeights[robustness - 1][j + firstNeighbor];
2122             }
2123         }
2124 
2125         computexTWx(xPolyNeighbors, w, covMatrix);
2126         computexTWy(xPolyNeighbors, w, yNeighbors, xTy);
2127         choleskySolve(covMatrix, xTy, betas);
2128 
2129         double ret = 0;
2130         foreach(d; 0..degree + 1) {
2131             ret += betas[d] * point ^^ d;
2132         }
2133 
2134         return ret;
2135     }
2136 
2137     /**
2138     Compute the weights for robust loess, for all robustness levels <= the
2139     robustness parameter.  This computation is embarrassingly parallel, so if a
2140     TaskPool is provided it will be parallelized.
2141 
2142     This function must be called before calling predict() with a robustness
2143     value > 0.  computeRobustWeights() must be called with a robustness level
2144     >= the robustness level predict() is to be called with.  This is not
2145     handled implicitly because computeRobustWeights() is very computationally
2146     intensive and because it modifies the state of the Loess1D object, while
2147     predict() is const.  Forcing computeRobustWeights() to be called explicitly
2148     allows multiple instances of predict() to be evaluated in parallel.
2149     */
2150     void computeRobustWeights(int robustness, TaskPool pool = null) {
2151         dstatsEnforce(robustness >= 0, "Robustness cannot be <0.");
2152 
2153         if(cast(int) yHat.length < robustness) {
2154             computeRobustWeights(robustness - 1, pool);
2155         }
2156 
2157         if(yHat.length >= robustness + 1) {
2158             return;
2159         }
2160 
2161         if(robustness > 0) {
2162             auto resid = new double[y.length];
2163             residualWeights ~= resid;
2164             resid[] = y[] - yHat[$ - 1][];
2165 
2166             foreach(ref r; resid) r = abs(r);
2167             immutable sixMed = 6 * median(resid);
2168 
2169             foreach(ref r; resid) {
2170                 r = biweight(r / sixMed);
2171             }
2172         }
2173 
2174         yHat ~= new double[y.length];
2175 
2176         if(pool is null) {
2177             foreach(i, point; x) {
2178                 yHat[robustness][i] = predict(point, robustness);
2179             }
2180         } else {
2181             foreach(i, point; pool.parallel(x, 1)) {
2182                 yHat[robustness][i] = predict(point, robustness);
2183             }
2184         }
2185     }
2186 
2187     /**
2188     Obtain smoothed predictions of y at the values of x provided on creation
2189     of this object, for the given level of robustness.  Evaluating
2190     these is computationally expensive and may be parallelized by
2191     providing a TaskPool object.
2192     */
2193     const(double)[] predictions(int robustness, TaskPool pool = null) {
2194         dstatsEnforce(robustness >= 0, "Robustness cannot be <0.");
2195         computeRobustWeights(robustness, pool);
2196         return yHat[robustness];
2197     }
2198 
2199 const pure nothrow @property @safe:
2200 
2201     /// The polynomial degree for the local regression.
2202     int degree() {
2203         return _degree;
2204     }
2205 
2206     /// The x values provided on object creation.
2207     const(double)[] x() {
2208         return _x;
2209     }
2210 
2211     /// The y values provided on object creation.
2212     const(double)[] y() {
2213         return _y;
2214     }
2215 }
2216 
2217 private:
2218 double biweight(double x) pure nothrow @safe {
2219     if(abs(x) >= 1) return 0;
2220     return (1 - x ^^ 2) ^^ 2;
2221 }
2222 
2223 double computeMaxDist(const double[] stuff, double x) pure nothrow @safe {
2224     double ret = 0;
2225     foreach(elem; stuff) {
2226         ret = max(ret, abs(elem - x));
2227     }
2228 
2229     return ret;
2230 }
2231 
2232 double absMax(double a, double b) {
2233     return max(abs(a), abs(b));
2234 }
2235 
2236 LogisticRes logisticRegressImpl(T, V...)
2237 (bool inference, double ridge, T yIn, V input) {
2238     auto alloc = newRegionAllocator();
2239 
2240     static if(isFloatingPoint!(V[$ - 1])) {
2241         alias input[$ - 1] conf;
2242         alias V[0..$ - 1] U;
2243         alias input[0..$ - 1] xIn;
2244         enforceConfidence(conf);
2245     } else {
2246         alias V U;
2247         alias input xIn;
2248         enum conf = 0.95;
2249     }
2250 
2251     static assert(!isInfinite!T, "Can't do regression with infinite # of Y's.");
2252     static if(isRandomAccessRange!T) {
2253         alias yIn y;
2254     } else {
2255         auto y = toBools(yIn, alloc);
2256     }
2257 
2258     static if(U.length == 1 && isRoR!U) {
2259         static if(isForwardRange!U) {
2260             auto x = toRandomAccessRoR(y.length, xIn, alloc);
2261         } else {
2262             auto x = toRandomAccessRoR(y.length, alloc.array(xIn), alloc);
2263         }
2264     } else {
2265         auto xx = toRandomAccessTuple(xIn, alloc);
2266         auto x = xx.expand;
2267     }
2268 
2269     typeof(return) ret;
2270     ret.betas.length = x.length;
2271     if(inference) ret.stdErr.length = x.length;
2272     ret.logLikelihood = doMLENewton(ret.betas, ret.stdErr, ridge, y, x);
2273 
2274     if(!inference) return ret;
2275 
2276     static bool hasNaNs(R)(R range) {
2277         return !filter!isNaN(range).empty;
2278     }
2279 
2280     if(isNaN(ret.logLikelihood) || hasNaNs(ret.betas) || hasNaNs(ret.stdErr)) {
2281         // Then we didn't converge or our data was defective.
2282         return ret;
2283     }
2284 
2285     ret.nullLogLikelihood = .priorLikelihood(y);
2286     double lratio = ret.logLikelihood - ret.nullLogLikelihood;
2287 
2288     // Compensate for numerical fuzz.
2289     if(lratio < 0 && lratio > -1e-5) lratio = 0;
2290     if(lratio >= 0 && x.length >= 2) {
2291         ret.overallP = chiSquareCDFR(2 * lratio, x.length - 1);
2292     }
2293 
2294     ret.p.length = x.length;
2295     ret.lowerBound.length = x.length;
2296     ret.upperBound.length = x.length;
2297     immutable nDev = -invNormalCDF((1 - conf) / 2);
2298     foreach(i; 0..x.length) {
2299         ret.p[i] = 2 * normalCDF(-abs(ret.betas[i]) / ret.stdErr[i]);
2300         ret.lowerBound[i] = ret.betas[i] - nDev * ret.stdErr[i];
2301         ret.upperBound[i] = ret.betas[i] + nDev * ret.stdErr[i];
2302     }
2303 
2304     return ret;
2305 }
2306 
2307 private void logisticRegressPenalizedImpl(Y, X...)
2308 (double[] betas, double lasso, double ridge, Y y, X xIn) {
2309     static if(isRoR!(X[0]) && X.length == 1) {
2310         alias xIn[0] x;
2311     } else {
2312         alias xIn x;
2313     }
2314 
2315     auto alloc = newRegionAllocator();
2316     auto ps = alloc.uninitializedArray!(double[])(y.length);
2317     betas[] = 0;
2318     auto betasRaw = alloc.array(betas[1..$]);
2319 
2320     double oldLikelihood = -double.infinity;
2321     double oldPenalty2 = double.infinity;
2322     double oldPenalty1 = double.infinity;
2323     enum eps = 1e-6;
2324     enum maxIter = 1000;
2325 
2326     auto weights = alloc.uninitializedArray!(double[])(y.length);
2327     auto z = alloc.uninitializedArray!(double[])(y.length);
2328     auto xMeans = alloc.uninitializedArray!(double[])(x.length);
2329     auto xSds = alloc.uninitializedArray!(double[])(x.length);
2330     double zMean = 0;
2331     auto xCenterScale = alloc.uninitializedArray!(double[][])(x.length);
2332     foreach(ref col; xCenterScale) col = alloc.uninitializedArray!(double[])(y.length);
2333 
2334     // Puts x in xCenterScale, with weighted mean subtracted and weighted
2335     // biased stdev divided.  Also standardizes z similarly.
2336     //
2337     // Returns:  true if successful, false if weightSum is so small that the
2338     //           algorithm has converged for all practical purposes.
2339     bool doCenterScale() {
2340         immutable weightSum = sum(weights);
2341         if(weightSum < eps) return false;
2342 
2343         xMeans[] = 0;
2344         zMean = 0;
2345         xSds[] = 0;
2346 
2347         // Do copying.
2348         foreach(i, col; x) {
2349             copy(take(col, y.length), xCenterScale[i]);
2350         }
2351 
2352         // Compute weighted means.
2353         foreach(j, col; xCenterScale) {
2354             foreach(i, w; weights) {
2355                 xMeans[j] += w * col[i];
2356             }
2357         }
2358 
2359         foreach(i, w; weights) {
2360             zMean += w * z[i];
2361         }
2362 
2363         xMeans[] /= weightSum;
2364         zMean /= weightSum;
2365         z[] -= zMean;
2366         foreach(i, col; xCenterScale) col[] -= xMeans[i];
2367 
2368         // Compute biased stdevs.
2369         foreach(j, ref sd; xSds) {
2370             sd = sqrt(meanStdev(xCenterScale[j]).mse);
2371         }
2372 
2373         foreach(i, col; xCenterScale) col[] /= xSds[i];
2374         return true;
2375     }
2376 
2377     // Rescales the beta coefficients to undo the effects of standardizing.
2378     void rescaleBetas() {
2379         betas[0] = zMean;
2380         foreach(i, b; betasRaw) {
2381             betas[i + 1] = b / xSds[i];
2382             betas[0] -= betas[i + 1] * xMeans[i];
2383         }
2384     }
2385 
2386     foreach(iter; 0..maxIter) {
2387         evalPs(betas[0], ps, betas[1..$], x);
2388         immutable lh = logLikelihood(ps, y);
2389         immutable penalty2 = ridge * reduce!"a + b * b"(0.0, betas);
2390         immutable penalty1 = lasso * reduce!"a + (b < 0) ? -b : b"(0.0, betas);
2391 
2392         if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
2393         && (oldPenalty2 - penalty2) / (absMax(penalty2, oldPenalty2) + 0.1) < eps
2394         && (oldPenalty1 - penalty1) / (absMax(penalty1, oldPenalty1) + 0.1) < eps) {
2395             return;
2396         } else if(isNaN(lh) || isNaN(penalty2) || isNaN(penalty1)) {
2397             betas[] = double.nan;
2398             return;
2399         }
2400 
2401         oldPenalty2 = penalty2;
2402         oldPenalty1 = penalty1;
2403         oldLikelihood = lh;
2404 
2405         foreach(i, ref w; weights) {
2406             w = (ps[i] * (1 - ps[i]));
2407         }
2408 
2409         z[] = betas[0];
2410         foreach(i, col; x) {
2411             static if(is(typeof(col) : const(double)[])) {
2412                 z[] += col[] * betas[i + 1];
2413             } else {
2414                 foreach(j, ref elem; z) {
2415                     elem += col[j] * betas[i + 1];
2416                 }
2417             }
2418         }
2419 
2420         foreach(i, w; weights) {
2421             if(w == 0){
2422                 z[i] = 0;
2423             } else {
2424                 immutable double yi = (y[i] == 0) ? 0.0 : 1.0;
2425                 z[i] += (yi - ps[i]) / w;
2426             }
2427         }
2428 
2429         immutable centerScaleRes = doCenterScale();
2430 
2431         // If this is false then weightSum is so small that all probabilities
2432         // are for all practical purposes either 0 or 1.  We can declare
2433         // convergence and go home.
2434         if(!centerScaleRes) return;
2435 
2436         if(lasso > 0) {
2437             // Correct for different conventions in defining ridge params
2438             // so all functions get the same answer.
2439             immutable ridgeCorrected = ridge * 2.0;
2440             coordDescent(z, xCenterScale, betasRaw,
2441                 lasso, ridgeCorrected, weights);
2442         } else {
2443             // Correct for different conventions in defining ridge params
2444             // so all functions get the same answer.
2445             immutable ridgeCorrected = ridge * 2.0;
2446             ridgeLargeP(z, xCenterScale, ridgeCorrected, betasRaw, weights);
2447         }
2448 
2449         rescaleBetas();
2450     }
2451 
2452     immutable lh = logLikelihood(ps, y);
2453     immutable penalty2 = ridge * reduce!"a + b * b"(0.0, betas);
2454     immutable penalty1 = lasso * reduce!"a + (b < 0) ? -b : b"(0.0, betas);
2455 
2456     if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
2457     && (oldPenalty2 - penalty2) / (absMax(penalty2, oldPenalty2) + 0.1) < eps
2458     && (oldPenalty1 - penalty1) / (absMax(penalty1, oldPenalty1) + 0.1) < eps) {
2459         return;
2460     } else {
2461         // If we got here, we haven't converged.  Return NaNs instead of bogus
2462         // values.
2463         betas[] = double.nan;
2464     }
2465 }
2466 
2467 // Calculate the mean squared error of all ranges.  This is delicate, though,
2468 // because some may be infinite and we want to stop at the shortest range.
2469 double[] calculateMSEs(U...)(U xIn, RegionAllocator alloc) {
2470     static if(isRoR!(U[0]) && U.length == 1) {
2471         alias xIn[0] x;
2472     } else {
2473         alias xIn x;
2474     }
2475 
2476     size_t minLen = size_t.max;
2477     foreach(r; x) {
2478         static if(!isInfinite!(typeof(r))) {
2479             static assert(hasLength!(typeof(r)),
2480                 "Ranges passed to doMLENewton should be random access, meaning " ~
2481                 "either infinite or with length.");
2482 
2483             minLen = min(minLen, r.length);
2484         }
2485     }
2486 
2487     dstatsEnforce(minLen < size_t.max,
2488         "Can't do logistic regression if all of the ranges are infinite.");
2489 
2490     auto ret = alloc.uninitializedArray!(double[])(x.length);
2491     foreach(ti, range; x) {
2492         //Workaround for bug https://issues.dlang.org/show_bug.cgi?id=13151
2493         //can be removed once no more need for 2.066.* support
2494         static if(is(typeof(range.save) R == Take!T, T)) {
2495             auto saved = range.save;
2496             ret[ti] = meanStdev(R(saved.source, min(minLen, saved.maxLength))).mse;
2497         } else {
2498             ret[ti] = meanStdev(take(range.save, minLen)).mse;
2499         }
2500     }
2501 
2502     return ret;
2503 }
2504 
2505 double doMLENewton(T, U...)
2506 (double[] beta, double[] stdError, double ridge, T y, U xIn) {
2507     // This big, disgusting function uses the Newton-Raphson method as outlined
2508     // in http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf
2509     //
2510     // The matrix operations are kind of obfuscated because they're written
2511     // using very low-level primitives and with as little temp space as
2512     // possible used.
2513     static if(isRoR!(U[0]) && U.length == 1) {
2514         alias xIn[0] x;
2515     } else {
2516         alias xIn x;
2517     }
2518 
2519     auto alloc = newRegionAllocator();
2520 
2521     double[] mses;  // Used for ridge.
2522     if(ridge > 0) {
2523         mses = calculateMSEs(x, alloc);
2524     }
2525 
2526     beta[] = 0;
2527     if(stdError.length) stdError[] = double.nan;
2528 
2529     auto ps = alloc.uninitializedArray!(double[])(y.length);
2530 
2531     double getPenalty() {
2532         if(ridge == 0) return 0;
2533 
2534         double ret = 0;
2535         foreach(i, b; beta) {
2536             ret += ridge * mses[i] * b ^^ 2;
2537         }
2538 
2539         return ret;
2540     }
2541 
2542     enum eps = 1e-6;
2543     enum maxIter = 1000;
2544 
2545     auto oldLikelihood = -double.infinity;
2546     double oldPenalty = -double.infinity;
2547     auto firstDerivTerms = alloc.uninitializedArray!(double[])(beta.length);
2548 
2549     // matSaved saves mat for inverting to find std. errors, only if we
2550     // care about std. errors.
2551     auto mat = doubleMatrix(beta.length, beta.length, alloc);
2552     DoubleMatrix matSaved;
2553 
2554     if(stdError.length) {
2555         matSaved = doubleMatrix(beta.length, beta.length, alloc);
2556     }
2557 
2558     void saveMat() {
2559         foreach(i; 0..mat.rows) foreach(j; 0..mat.columns) {
2560             matSaved[i, j] = mat[i, j];
2561         }
2562     }
2563 
2564     void doStdErrs() {
2565         if(stdError.length) {
2566             // Here, we actually need to invert the information matrix.
2567             // We can use mat as scratch space since we don't need it
2568             // anymore.
2569             invert(matSaved, mat);
2570 
2571             foreach(i; 0..beta.length) {
2572                 stdError[i] = sqrt(mat[i, i]);
2573             }
2574         }
2575     }
2576 
2577     auto updates = alloc.uninitializedArray!(double[])(beta.length);
2578     foreach(iter; 0..maxIter) {
2579         evalPs(ps, beta, x);
2580         immutable lh = logLikelihood(ps, y);
2581         immutable penalty = getPenalty();
2582 
2583         if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
2584         && (oldPenalty - penalty) / (absMax(penalty, oldPenalty) + 0.1) < eps) {
2585             doStdErrs();
2586             return lh;
2587         } else if(isNaN(lh)) {
2588             beta[] = double.nan;
2589             return lh;
2590         }
2591 
2592         oldLikelihood = lh;
2593         oldPenalty = penalty;
2594 
2595         foreach(i; 0..mat.rows) foreach(j; 0..mat.columns) {
2596             mat[i, j] = 0;
2597         }
2598 
2599         // Calculate X' * V * X in the notation of our reference.  Since
2600         // V is a diagonal matrix of ps[] * (1.0 - ps[]), we only have one
2601         // dimension representing it.  Do this for the lower half, then
2602         // symmetrize the matrix.
2603         foreach(i, xi; x) foreach(j, xj; x[0..i + 1]) {
2604             foreach(k; 0..ps.length) {
2605                 mat[i, j] += (ps[k] * (1 - ps[k])) * xi[k] * xj[k];
2606             }
2607         }
2608 
2609         symmetrize(mat);
2610 
2611         if(stdError.length) {
2612             saveMat();
2613         }
2614 
2615         // Convert ps to ys - ps.
2616         foreach(pIndex, ref p; ps) {
2617             p = (y[pIndex] == 0) ? -p : (1 - p);
2618         }
2619 
2620         // Compute X'(y - p).
2621         foreach(ti, xRange; x) {
2622             //Workaround for bug https://issues.dlang.org/show_bug.cgi?id=13151
2623             //can be removed once no more need for 2.066.* support
2624             static if(is(typeof(xRange) R == Take!T, T)) {
2625                 firstDerivTerms[ti] = dotProduct(
2626                         R(xRange.source, min(ps.length, xRange.maxLength)),
2627                         ps);
2628             } else {
2629                 firstDerivTerms[ti] = dotProduct(take(xRange, ps.length), ps);
2630             }
2631         }
2632 
2633         // Add ridge penalties, if any.
2634         if(ridge > 0) {
2635             foreach(diagIndex, mse; mses) {
2636                 mat[diagIndex, diagIndex] += 2 * ridge * mse;
2637                 firstDerivTerms[diagIndex] -= beta[diagIndex] * 2 * ridge * mse;
2638             }
2639         }
2640 
2641         choleskySolve(mat, firstDerivTerms, updates);
2642         beta[] += updates[];
2643 
2644         debug(print) writeln("Iter:  ", iter);
2645     }
2646 
2647     immutable lh = logLikelihood(ps, y);
2648     immutable penalty = getPenalty();
2649     if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
2650     && (oldPenalty - penalty) / (absMax(penalty, oldPenalty) + 0.1) < eps) {
2651         doStdErrs();
2652         return lh;
2653     } else {
2654         // If we got here, we haven't converged.  Return NaNs instead of bogus
2655         // values.
2656         beta[] = double.nan;
2657         return double.nan;
2658     }
2659 }
2660 
2661 private double logLikelihood(Y)(double[] ps, Y y) {
2662     double sum = 0;
2663     size_t i = 0;
2664     foreach(yVal; y) {
2665         scope(exit) i++;
2666         if(yVal) {
2667             sum += log(ps[i]);
2668         } else {
2669             sum += log(1 - ps[i]);
2670         }
2671     }
2672     return sum;
2673 }
2674 
2675 void evalPs(X...)(double[] ps, double[] beta, X xIn) {
2676     evalPs(0, ps, beta, xIn);
2677 }
2678 
2679 void evalPs(X...)(double interceptTerm, double[] ps, double[] beta, X xIn) {
2680     static if(isRoR!(X[0]) && X.length == 1) {
2681         alias xIn[0] x;
2682     } else {
2683         alias xIn x;
2684     }
2685 
2686     assert(x.length == beta.length);
2687     ps[] = interceptTerm;
2688 
2689     foreach(i, range; x) {
2690         static if(hasLength!(typeof(range))) {
2691             assert(range.length == ps.length);
2692         }
2693 
2694         static if(is(typeof(range) == double[])) {
2695             // Take advantage of array ops.
2696             ps[] += range[0..ps.length] * beta[i];
2697         } else {
2698             immutable b = beta[i];
2699 
2700             size_t j = 0;
2701             foreach(elem; range) {
2702                 if(j >= ps.length) break;
2703                 ps[j++] += b * elem;
2704             }
2705         }
2706     }
2707 
2708     foreach(ref elem; ps) elem = logistic(elem);
2709 }
2710 
2711 template isRoR(T) {
2712     static if(!isInputRange!T) {
2713         enum isRoR = false;
2714     } else {
2715         enum isRoR = isInputRange!(typeof(T.init.front()));
2716     }
2717 }
2718 
2719 template isFloatMat(T) {
2720     static if(is(T : const(float[][])) ||
2721         is(T : const(real[][])) || is(T : const(double[][]))) {
2722         enum isFloatMat = true;
2723     } else {
2724         enum isFloatMat = false;
2725     }
2726 }
2727 
2728 template NonRandomToArray(T) {
2729     static if(isRandomAccessRange!T) {
2730         alias T NonRandomToArray;
2731     } else {
2732         alias Unqual!(ElementType!(T))[] NonRandomToArray;
2733     }
2734 }
2735 
2736 double priorLikelihood(Y)(Y y) {
2737     uint nTrue, n;
2738     foreach(elem; y) {
2739         n++;
2740         if(elem) nTrue++;
2741     }
2742 
2743     immutable p = cast(double) nTrue / n;
2744     return nTrue * log(p) + (n - nTrue) * log(1 - p);
2745 }
2746 
2747 bool[] toBools(R)(R range, RegionAllocator alloc) {
2748     return alloc.array(map!"(a) ? true : false"(range));
2749 }
2750 
2751 auto toRandomAccessRoR(T)(size_t len, T ror, RegionAllocator alloc) {
2752     static assert(isRoR!T);
2753     alias ElementType!T E;
2754     static if(isArray!T && isRandomAccessRange!E) {
2755         return ror;
2756     } else static if(!isArray!T && isRandomAccessRange!E) {
2757         // Shallow copy so we know it has cheap slicing and stuff,
2758         // even if it is random access.
2759         return alloc.array(ror);
2760     } else {
2761         alias ElementType!E EE;
2762         auto ret = alloc.uninitializedArray!(EE[])(walkLength(ror.save));
2763 
2764         foreach(ref col; ret) {
2765             scope(exit) ror.popFront();
2766             col = alloc.uninitializedArray!(EE[])(len);
2767 
2768             size_t i;
2769             foreach(elem; ror.front) {
2770                 col[i++] = elem;
2771             }
2772         }
2773 
2774         return ret;
2775     }
2776 }
2777 
2778 auto toRandomAccessTuple(T...)(T input, RegionAllocator alloc) {
2779     Tuple!(staticMap!(NonRandomToArray, T)) ret;
2780 
2781     foreach(ti, range; input) {
2782         static if(isRandomAccessRange!(typeof(range))) {
2783             ret.field[ti] = range;
2784         } else {
2785             ret.field[ti] = alloc.array(range);
2786         }
2787     }
2788 
2789     return ret;
2790 }
2791 
2792 // Borrowed and modified from Phobos's dotProduct() function,
2793 // Copyright Andrei Alexandrescu 2008 - 2009.  Originally licensed
2794 // under the Boost Software License 1.0.  (License text included
2795 // at the beginning of this file.)
2796 private double threeDot(
2797     const double[] x1,
2798     const double[] x2,
2799     const double[] x3
2800 ) in {
2801     assert(x1.length == x2.length);
2802     assert(x2.length == x3.length);
2803 } do {
2804     immutable n = x1.length;
2805     auto avec = x1.ptr, bvec = x2.ptr, cvec = x3.ptr;
2806     typeof(return) sum0 = 0, sum1 = 0;
2807 
2808     const all_endp = avec + n;
2809     const smallblock_endp = avec + (n & ~3);
2810     const bigblock_endp = avec + (n & ~15);
2811 
2812     for (; avec != bigblock_endp; avec += 16, bvec += 16, cvec += 16)
2813     {
2814         sum0 += avec[0] * bvec[0] * cvec[0];
2815         sum1 += avec[1] * bvec[1] * cvec[1];
2816         sum0 += avec[2] * bvec[2] * cvec[2];
2817         sum1 += avec[3] * bvec[3] * cvec[3];
2818         sum0 += avec[4] * bvec[4] * cvec[4];
2819         sum1 += avec[5] * bvec[5] * cvec[5];
2820         sum0 += avec[6] * bvec[6] * cvec[6];
2821         sum1 += avec[7] * bvec[7] * cvec[7];
2822         sum0 += avec[8] * bvec[8] * cvec[8];
2823         sum1 += avec[9] * bvec[9] * cvec[9];
2824         sum0 += avec[10] * bvec[10] * cvec[10];
2825         sum1 += avec[11] * bvec[11] * cvec[11];
2826         sum0 += avec[12] * bvec[12] * cvec[12];
2827         sum1 += avec[13] * bvec[13] * cvec[13];
2828         sum0 += avec[14] * bvec[14] * cvec[14];
2829         sum1 += avec[15] * bvec[15] * cvec[15];
2830     }
2831 
2832     for (; avec != smallblock_endp; avec += 4, bvec += 4, cvec += 4) {
2833         sum0 += avec[0] * bvec[0] * cvec[0];
2834         sum1 += avec[1] * bvec[1] * cvec[1];
2835         sum0 += avec[2] * bvec[2] * cvec[2];
2836         sum1 += avec[3] * bvec[3] * cvec[3];
2837     }
2838 
2839     sum0 += sum1;
2840 
2841     /* Do trailing portion in naive loop. */
2842     while (avec != all_endp)
2843     {
2844         sum0 += *avec * *bvec * *cvec;
2845         ++avec;
2846         ++bvec;
2847         ++cvec;
2848     }
2849 
2850     return sum0;
2851 }
2852 
2853 unittest {
2854     auto a = [1.0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21];
2855     auto b = a.dup;
2856     b[] *= 2;
2857     auto c = b.dup;
2858     c[] *= 3;
2859     immutable ans1 = threeDot(a, b, c);
2860 
2861     double ans2 = 0;
2862     foreach(i; 0..21) {
2863         ans2 += a[i] * b[i] * c[i];
2864     }
2865 
2866     assert(approxEqual(ans1, ans2));
2867 }