The OpenD Programming Language

1 module ggplotd.stat;
2 
3 version( unittest )
4 {
5     import dunit.toolkit;
6 }
7 
8 import std.traits : isCallable;
9 
10 /**
11  Create Aes based on a function
12 
13  Params:
14     func = Function
15     min  = x coordinate to start from
16     max  = x coordinate to end at
17     precision = Number of points to calculate
18 
19   Examples:
20   --------------
21     /// http://blackedder.github.io/ggplotd/images/function.png
22     import std.random : uniform;
23     import std.typecons : Tuple; import ggplotd.stat : statFunction; 
24     import ggplotd.ggplotd : GGPlotD; 
25     import ggplotd.geom : geomLine, geomPoint;
26     import ggplotd.range : mergeRange;
27 
28     auto f = (double x) { return x / (1 + x); };
29 
30     auto aes = statFunction(f, 0.0, 10);
31     auto gg = GGPlotD().put(geomLine(aes));
32 
33     // Generate some noisy points 
34     auto f2 = (double x) { return x / (1 + x) * uniform(0.75, 1.25); };
35     auto aes2 = f2.statFunction(0.0, 10, 25);
36 
37     // Show points in different colour
38     auto withColour = Tuple!(string, "colour")("aquamarine").mergeRange(aes2);
39     gg = gg.put(withColour.geomPoint);
40 
41     gg.save("function.png");
42   --------------
43 
44 */
45 auto statFunction( FUNC, T )( FUNC func, T min, 
46     T max, size_t precision = 50 ) if (isCallable!FUNC)
47 {
48     import std.array : array;
49     import std.algorithm : map;
50     import std.range : iota;
51 
52     import ggplotd.aes : Aes;
53     auto stepSize = (max-min)/(precision-1);
54     auto xs = [min];
55     if (stepSize > 0)
56         xs = iota( min, max, stepSize ).array ~ max;
57     auto ys = xs.map!((x) => func(x));
58 
59     return Aes!(typeof(xs), "x", typeof(ys), "y")( xs, ys );
60 }
61 
62 unittest
63 {
64     import std.array : array;
65     import std.algorithm : map;
66 
67     auto aes = statFunction( (double x) { return 2*x; }, 0.0, 1, 2 );
68     assertEqual( aes.map!("a.x").array, [0.0, 1.0] );
69     assertEqual( aes.map!("a.y").array, [0.0, 2.0] );
70 
71     aes = statFunction( (double x) { return 3*x; }, -1.0, 1, 3 );
72     assertEqual( aes.map!("a.x").array, [-1.0, 0.0, 1.0] );
73     assertEqual( aes.map!("a.y").array, [-3.0, 0.0, 3.0] );
74 }
75 
76 /+++++++++++
77 Binning
78 +++++++++++/
79 
80 private auto binID(T)( T value, T min, T max, T width )
81 {
82     import std.conv : to;
83     import std.math : floor;
84     assert( min != max, "Minimum value of bin should differ from maximum" );
85     assert( width != 0, "Bin width can't be 0" );
86 
87     import std.stdio : writeln;
88     if ( !(value <= max && value >= min) )
89         writeln( value, " ", min, " ", max );
90 
91     assert( value <= max && value >= min, "Value must be within given range" );
92     if (value==max) // Corner case for highest value
93         value -= 0.1*width; // Might not work correctly for integers
94     auto id = floor((value - min) / width);
95     return id.to!int;
96 }
97 
98 unittest
99 {
100     import std.algorithm : map;
101     import std.array : array;
102     assertEqual(
103         [0.1,0.3,0.6,0.2,0.4].map!((a) => a.binID( 0, 0.6, 0.2 )).array,
104         [0,1,2,1,2] 
105     );
106     // TODO: add tests for integers etc.
107 }
108 
109 private auto idToRange(T)( size_t value, T min, T width )
110 {
111     auto base = min + value*width;
112     return [ base, base + width ];
113 }
114 
115 // Multidimensional bin. Data either is range of range, with each subrange
116 // a set of data. TODO: should we treat one dimensional data differently?
117 private template bin(DATA)
118 {
119     struct Bin
120     {
121         double[][] range;
122         size_t count;
123     }
124 
125     auto bin(DATA data, double[] mins, double[] maxs, 
126         size_t[] noBins)
127     {
128         import std.range : zip;
129         import std.algorithm : filter, all, group, map;
130         import ggplotd.range : groupBy;
131         assert( noBins.all!((a) => a > 0), "noBins must be larger than 0" );
132 
133         auto widths = zip(mins, maxs, noBins).map!((t) => (t[1]-t[0])/(t[2]));
134 
135         auto binIDs = data
136             .filter!((sample)
137             { 
138                 return zip(sample, mins, maxs).
139                     all!( (args) => (args[0] >= args[1] && args[0] <= args[2]));
140             })
141             .map!((sample) 
142             {
143                 import std.array : array; // Needed for groupBy to work correctly
144                 return zip(sample, mins, maxs, widths)
145                     .map!( (args) => binID( args[0], args[1], args[2], args[3] ) ).array;
146             } );
147 
148         import std.typecons : tuple;
149 
150         return binIDs.groupBy!((a) => tuple(a)).values.map!((g) 
151                 {
152                     import std.array : array;
153                     Bin bin;
154                     bin.count = g.length;
155                     bin.range = zip( g[0], mins, widths )
156                         .map!((args) => idToRange( args[0], args[1], args[2] ) )
157                         .array;
158                     return bin;
159                 });
160     }
161 }
162 
163 unittest {
164     import std.algorithm : reduce;
165     auto bins = bin( [[0.0],[0.1],[0.2],[0.2],[0.01],[0.3]], [0.0], [0.3], [10] );
166     assertEqual( 6, reduce!((a,b) => a += b.count )( 0, bins ) );
167     auto bins2 = bin( [[0.0,100],[0.1,101],[0.2,109],[0.01,110],[0.3,103.1]], [0.0,100], [0.3,110], [10,10] );
168     assertEqual( 5, reduce!((a,b) => a += b.count )( 0, bins2 ) );
169 }
170 
171 private auto bin(DATA)(DATA data, double min, double max, 
172     size_t noBins)
173 {
174     import std.algorithm : map;
175     return bin( data.map!((d) => [d]), [min], [max], [noBins] );
176 }
177 
178 unittest {
179     import std.algorithm : reduce;
180     auto bins = bin( [0.0,0.1,0.2,0.2,0.01,0.3], 0.0, 0.3, 10 );
181     assertEqual( 6, reduce!((a,b) => a += b.count )( 0, bins ) );
182 
183     import std.algorithm : map;
184     import std.array : array;
185     import std.random : uniform;
186     import std.range : iota, walkLength;
187     auto xs = iota(0,10,1).map!((i)=>uniform(0,0.75)+uniform(0.25,1)).array;
188     auto binsR = bin( xs, 0.0, 2.0, 5 );
189     assertEqual( xs.walkLength, reduce!((a,b) => a += b.count )( 0, binsR ) );
190 
191     binsR = bin( xs, 0.0, 1.0, 5 );
192     assertLessThanOrEqual( binsR.walkLength, 5 );
193 }
194 
195 private template statHistND(int dim, AES)
196 {
197     import std.algorithm : map;
198     import ggplotd.aes : Aes;
199     import ggplotd.range : mergeRange;
200 
201     struct VolderMort(AESV)
202     {
203         private import std.range : ElementType;
204         private import std.typecons : Tuple;
205         private import ggplotd.aes : group;
206 
207         typeof(group(AESV.init)) grouped;
208         ElementType!(ElementType!(typeof(grouped))) defaults;
209         typeof(bin!(double[][])((double[][]).init, (double[]).init, 
210                     (double[]).init, (size_t[]).init))
211             binned;
212 
213         size_t[] _noBins;
214         double[] mins;
215         double[] maxs;
216 
217         this( AESV aes, size_t[] noBins )
218         {
219             import std.algorithm : min, max, reduce;
220             import std.array : array;
221             import std.conv : to;
222             import std.range : empty, popFront, front, take, walkLength;
223             import std.typecons : tuple;
224 
225             import ggplotd.algorithm : safeMin, safeMax;
226             import ggplotd.range : uniquer;
227 
228             static assert( dim == 1 || dim == 2, "Only dimension of 1 or 2 i supported" );
229 
230             _noBins = noBins;
231             static if (dim == 1) {
232                 if (_noBins[0] < 1)
233                     _noBins[0] = min(aes.map!((a) => a.x.to!double)
234                             .uniquer.take(30).walkLength,
235                             min(30,max(11, aes.walkLength/10)));
236                 auto seed = tuple(
237                         aes.front.x.to!double, aes.front.x.to!double );
238                 auto minmax = reduce!((a,b) => safeMin(a,b.x.to!double),
239                         (a,b) => safeMax(a,b.x.to!double))(seed,aes);
240                 mins = [minmax[0]];
241                 maxs = [minmax[1]];
242             }
243             else
244             {
245                 if (_noBins[0] < 1)
246                     _noBins[0] = min(aes.map!((a) => a.x.to!double)
247                             .uniquer.take(30).walkLength,
248                             min(30,max(11, aes.walkLength/25)));
249                 if (_noBins[1] < 1)
250                     _noBins[1] = min(aes.map!((a) => a.y.to!double)
251                             .uniquer.take(30).walkLength,
252                             min(30,max(11, aes.walkLength/25)));
253                 auto seed = tuple(
254                         aes.front.x.to!double, aes.front.x.to!double,
255                         aes.front.y.to!double, aes.front.y.to!double );
256                 auto minmax = reduce!(
257                             (a,b) => safeMin(a,b.x.to!double),
258                             (a,b) => safeMax(a,b.x.to!double),
259                             (a,b) => safeMin(a,b.y.to!double),
260                             (a,b) => safeMax(a,b.y.to!double)
261                         )(seed,aes);
262                 mins = [minmax[0],minmax[2]];
263                 maxs = [minmax[1],minmax[3]];
264              }
265 
266             grouped = group(aes);
267 
268             foreach( i; 0..mins.length ) {
269                 if (mins[i] == maxs[i]) {
270                     mins[i] -= 0.5;
271                     maxs[i] += 0.5;
272                 }
273             }
274 
275             defaults = grouped.front.front;
276             static if (dim == 1)
277               auto data = grouped.front.map!((t) => [t.x.to!double]); // Extract the x coordinates
278             else 
279               auto data = grouped.front.map!((t) => [t.x.to!double,t.y.to!double]); // Extract the x coordinates
280             binned = bin(
281                     data.array, 
282                     mins, maxs, 
283                     _noBins);
284             assert( !grouped.empty, "Groups should not be empty" );
285             grouped.popFront;
286         }
287 
288         @property bool empty() { 
289             import std.range : empty;
290             return (grouped.empty && binned.empty); }
291 
292         @property auto front()
293         {
294             import ggplotd.aes : merge;
295             import std.conv : to;
296             static if (dim == 1)
297             {
298                 auto w = binned.front.range[0][1]-binned.front.range[0][0];
299                 return defaults.merge(
300                         Tuple!(double, "x", double, "y", double, "width", double, "height")(
301                             binned.front.range[0][0] + 0.5*w, 0.5*binned.front.count, 
302                             w, binned.front.count) );
303             }
304             else
305             {
306                 // Returning polygons. In theory could also return rectangle, but their lines
307                 // will overlap which will make it look wrong. Ideally would be able to set 
308                 // linewidth to 0 for the rectangles and that would solve it.
309                 return [
310                     defaults.merge(
311                         Tuple!(double, "x", double, "y", double, "colour")(
312                             binned.front.range[0][0], binned.front.range[1][0],
313                             binned.front.count.to!double)),
314 
315                     defaults.merge(
316                         Tuple!(double, "x", double, "y", double, "colour")(
317                             binned.front.range[0][1], binned.front.range[1][0],
318                             binned.front.count.to!double)),
319 
320                     defaults.merge(
321                         Tuple!(double, "x", double, "y", double, "colour")(
322                             binned.front.range[0][1], binned.front.range[1][1],
323                             binned.front.count.to!double)),
324 
325                     defaults.merge(
326                         Tuple!(double, "x", double, "y", double, "colour")(
327                             binned.front.range[0][0], binned.front.range[1][1],
328                             binned.front.count.to!double)),
329 
330                     ];
331             }
332         }
333 
334         void popFront()
335         {
336             import std.array : array;
337             import std.conv : to;
338             import std.range : empty, front, popFront;
339             if (!binned.empty)
340                 binned.popFront;
341             if (binned.empty && !grouped.empty)
342             {
343                 defaults = grouped.front.front;
344                 static if (dim == 1) // Extract the coordinates
345                     auto data = grouped.front.map!((t) => [t.x.to!double]); 
346                 else 
347                     auto data = grouped.front.map!((t) => [t.x.to!double,t.y.to!double]);
348                 binned = bin(data.array, mins, maxs, _noBins);
349                 grouped.popFront;
350             }
351         }
352     }
353 
354     auto statHistND(AES aesRange, size_t[] noBins)
355     {
356         // Get maxs, mins and noBins
357         return VolderMort!(typeof(aesRange))( aesRange, noBins );
358     }
359 }
360 
361 /**
362  Create Aes that specifies the bins to draw an histogram 
363 
364  Params:
365     aesRaw = Data that the histogram will be based on 
366     noBins  = Optional number of bins. On a value of 0 the number of bins will be chosen automatically.
367 
368  Returns: Range that holds rectangles representing different bins
369 */
370 auto statHist(AES)(AES aesRaw, size_t noBins = 0)
371 {
372     return statHistND!(1,AES)( aesRaw, [noBins] );
373 }
374 
375 unittest
376 {
377     import std.algorithm : each, map, sort;
378     import std.array : array;
379     import std.random : uniform;
380     import std.range : iota, walkLength;
381 
382     import ggplotd.aes : Aes;
383 
384     auto xs = iota(0,100,1).map!((a) => uniform(0,6) ).array;
385     auto sh = statHist( Aes!(int[], "x")(xs) );
386     auto binXs = sh.map!((b) => b.x).array.sort().array;
387     assertEqual( binXs.length, 6 );
388 
389     // Test single point 
390     xs = [1];
391     sh = statHist( Aes!(int[], "x")(xs) );
392     assertEqual(sh.walkLength, 1);
393 }
394 
395 /**
396  Create Aes that specifies the bins to draw an histogram 
397 
398  Params:
399     aesRange = Data that the histogram will be based on 
400     noBinsX  = Optional number of bins for x axis. On a value of 0 the number of bins will be chosen automatically.
401     noBinsY  = Optional number of bins for y axis. On a value of 0 the number of bins will be chosen automatically.
402 
403  Returns: Range that holds rectangles representing different bins
404 */
405 auto statHist2D(AES)(AES aesRange, size_t noBinsX = 0, size_t noBinsY = 0)
406 {
407     return statHistND!(2,AES)( aesRange, [noBinsX, noBinsY]);
408 }
409 
410 /**
411 Calculate kernel density for given data
412 
413 Params:
414    aesRange = Data that the histogram will be based on 
415 
416 Returns: InputRange that holds x and y coordinates for the kernel
417 */
418 auto statDensity(AES)( AES aesRange )
419 {
420     import std.algorithm : joiner, map, min, max, reduce;
421     import std.conv : to;
422     import std.range : chain, front;
423     import std.typecons : tuple, Tuple;
424     import ggplotd.aes : Aes, group;
425     import ggplotd.range : mergeRange;
426     import ggplotd.algorithm : safeMin, safeMax;
427 
428     auto minmax = reduce!(
429             (a,b) => safeMin(a,b.x.to!double),
430             (a,b) => safeMax(a,b.x.to!double))(tuple(double.init,double.init), aesRange);
431     auto margin = (minmax[1] - minmax[0])/10.0;
432     minmax[0] -= margin;
433     minmax[1] += margin;
434 
435 
436 
437     return aesRange.group.map!((g) {
438         auto xs = g.map!((t) => t.x.to!double);
439 
440         // Calculate the kernel width (using scott thing in dstats)
441         // Initialize kernel with normal distribution.
442         import std.math : isFinite;
443         import dstats.kerneldensity : scottBandwidth, KernelDensity;
444         import dstats.distrib : normalPDF;
445         auto sigma = scottBandwidth(xs);
446         if (!isFinite(sigma) || sigma <= 0)
447             sigma = 0.5;
448         auto kernel = (double x) { return normalPDF(x, 0.0, sigma); };
449         auto density = KernelDensity.fromCallable(kernel, xs);
450 
451         // Use statFunction with the kernel to produce a line
452         // Also add points to close the path down to zero
453         auto coords = chain(
454                 [Tuple!(double, "x", double, "y")( minmax[0], 0.0 )],
455                 statFunction( density, minmax[0], minmax[1] ),
456                 [Tuple!(double, "x", double, "y")( minmax[1], 0.0 )]);
457 
458 
459         return g.front.mergeRange( coords );
460     }).joiner;
461 }
462 
463 ///
464 unittest
465 {
466     import std.stdio : writeln;
467     import std.algorithm : map;
468     import std.array : array;
469     import std.random : uniform;
470     import std.range : chain, iota, repeat, walkLength;
471 
472     import ggplotd.aes : Aes;
473 
474     auto xs = iota(0,100,1)
475         .map!((i)=>uniform(0,0.75)+uniform(0.25,1))
476         .array;
477 
478     auto dens = statDensity( Aes!( typeof(xs), "x")( xs ) );
479     auto dim = dens.walkLength;
480     assertGreaterThan( dim, 1 );
481 
482     // Test that grouping leads to longer (twice as long) result
483     auto cols = chain("a".repeat(50),"b".repeat(50) );
484     auto dens2 = statDensity( Aes!(typeof(cols), "colour", typeof(xs), "x")( cols, xs ) );
485     assertGreaterThan( dens2.walkLength, 1.9*dim );
486     assertLessThan( dens2.walkLength, 2.1*dim );
487 
488     // Test that colour is passed through (merged)
489     assertEqual( dens2.front.colour.length, 1 );
490 
491     // Test single point 
492     xs = [1];
493     dens = statDensity( Aes!( typeof(xs), "x")( xs ) );
494     assertEqual(dens.walkLength, 3);
495 }
496 
497 /**
498 Calculate kernel density for given x and y data
499 
500 Params:
501    aesRange = Data that the histogram will be based on 
502 
503 Returns: Range of ranges that holds polygons for the kernel
504 */
505 auto statDensity2D(AES)( AES aesRange )
506 {
507     import std.algorithm : joiner, map, min, max, reduce;
508     import std.array : array;
509     import std.conv : to;
510     import std.range : chain, front, iota, zip;
511     import std.meta : Erase;
512     import std.typecons : tuple, Tuple;
513     import ggplotd.aes : Aes, group, DefaultGroupFields;
514     import ggplotd.algorithm : safeMin, safeMax;
515     import ggplotd.range : mergeRange;
516 
517     auto minmax = reduce!(
518             (a,b) => safeMin(a,b.x.to!double),
519             (a,b) => safeMax(a,b.x.to!double),
520             (a,b) => safeMin(a,b.y.to!double),
521             (a,b) => safeMax(a,b.y.to!double),
522             )(tuple(double.init,double.init,double.init,double.init), aesRange);
523 
524     if (minmax[0] == minmax[1]) {
525         minmax[0] -= 0.5;
526         minmax[1] += 0.5;
527     }
528     if (minmax[2] == minmax[3]) {
529         minmax[2] -= 0.5;
530         minmax[3] += 0.5;
531     }
532 
533     return aesRange.group!(Erase!("colour",DefaultGroupFields)).map!((g) {
534         auto xs = g.map!((t) => t.x.to!double);
535         auto ys = g.map!((t) => t.y.to!double);
536 
537         import std.math : isFinite;
538         // Calculate the kernel width (using scott thing in dstats)
539         // Initialize kernel with normal distribution.
540         import dstats.kerneldensity : scottBandwidth, KernelDensity;
541         import dstats.distrib : normalPDF;
542         auto sigmaX = scottBandwidth(xs);
543         if (!isFinite(sigmaX) || sigmaX <= 0)
544             sigmaX = 1e-5;
545         auto sigmaY = scottBandwidth(ys);
546         if (!isFinite(sigmaY) || sigmaY <= 0)
547             sigmaY = 1e-5;
548 
549         auto kernel = (double x, double y) { return normalPDF(x, 0.0, sigmaX)*
550             normalPDF(y, 0.0, sigmaY); };
551         auto density = KernelDensity.fromCallable(kernel, xs, ys);
552 
553         auto marginX = (minmax[1] - minmax[0])/10.0;
554         auto marginY = (minmax[3] - minmax[2])/10.0;
555 
556         minmax[0] -= marginX;
557         minmax[1] += marginX;
558         minmax[2] -= marginY;
559         minmax[3] += marginY;
560 
561 
562         // Use statFunction with the kernel to produce a line
563         // Also add points to close the path down to zero
564         auto stepX = (minmax[1] - minmax[0])/25.0;
565         auto stepY = (minmax[3] - minmax[2])/25.0;
566         auto xCoords = iota( minmax[0], minmax[1], stepX ).array ~ minmax[1];
567         auto yCoords = iota( minmax[2], minmax[3], stepY ).array ~ minmax[3];
568         auto coords = zip(xCoords, xCoords[1..$]).map!( (xr) {
569                 return zip(yCoords, yCoords[1..$]).map!( (yr) {
570                         // Two polygons
571                         return [
572                         g.front.mergeRange(Aes!( double[], "x", double[], "y", double[], "colour" )
573                         ([xr[0], xr[0], xr[1]],
574                          [yr[0], yr[1], yr[1]],
575                          [density(xr[0],yr[0]),density(xr[0],yr[1]),density(xr[1],yr[1])])),
576                         g.front.mergeRange(
577                                 Aes!( double[], "x", double[], "y", double[], "colour" )
578                         ([xr[0], xr[1], xr[1]],
579                          [yr[0], yr[0], yr[1]],
580                          [density(xr[0],yr[0]),density(xr[1],yr[0]),density(xr[1],yr[1])]))
581                          ];
582                 }).joiner;
583         }).joiner;
584         return coords;
585         //return g;
586     }).joiner;
587 }
588 
589 ///
590 unittest
591 {
592     import std.algorithm : map;
593     import std.array : array;
594     import std.random : uniform;
595     import std.range : iota, walkLength;
596 
597     import ggplotd.aes : Aes;
598     auto xs = iota(0,500,1).map!((x) => uniform(0.0,5)+uniform(0.0,5))
599         .array;
600     auto ys = iota(0,500,1).map!((y) => uniform(1.0,1.5)+uniform(1.0,1.5))
601         .array;
602     auto aes = Aes!(typeof(xs), "x", typeof(ys), "y")( xs, ys);
603 
604     auto sD = statDensity2D( aes );
605     assertGreaterThan( sD.walkLength, 1150 );
606     assertLessThan( sD.walkLength, 1450 );
607     assertEqual( sD.front.walkLength, 3 );
608 
609     // One value
610     xs = [1];
611     ys = [2];
612     aes = Aes!(typeof(xs), "x", typeof(ys), "y")( xs, ys);
613 
614     sD = statDensity2D( aes );
615     assertGreaterThan(sD.walkLength, 0);
616 }