inteli.math source code

1 /**
2 * Transcendental bonus functions.
3 *
4 * Copyright: Copyright Guillaumr Piolat 2016-2020.
5 *            Copyright (C) 2007  Julien Pommier
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.math;
9 
10 /* Copyright (C) 2007  Julien Pommier
11 
12   This software is provided 'as-is', without any express or implied
13   warranty.  In no event will the authors be held liable for any damages
14   arising from the use of this software.
15 
16   Permission is granted to anyone to use this software for any purpose,
17   including commercial applications, and to alter it and redistribute it
18   freely, subject to the following restrictions:
19 
20   1. The origin of this software must not be misrepresented; you must not
21      claim that you wrote the original software. If you use this software
22      in a product, an acknowledgment in the product documentation would be
23      appreciated but is not required.
24   2. Altered source versions must be plainly marked as such, and must not be
25      misrepresented as being the original software.
26   3. This notice may not be removed or altered from any source distribution.
27 
28   (this is the zlib license)
29 */
30 import inteli.emmintrin;
31 import inteli.internals;
32 
33 nothrow @nogc:
34 
35 /// Natural `log` computed for a single 32-bit float.
36 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
37 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
38 // #BONUS
39 float _mm_log_ss(float v) pure @safe
40 {
41     __m128 r = _mm_log_ps(_mm_set1_ps(v));
42     return r.array[0];
43 }
44 
45 /// Natural logarithm computed for 4 simultaneous float.
46 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
47 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
48 // #BONUS
49 __m128 _mm_log_ps(__m128 x) pure @safe
50 {
51     static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000];
52     static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524];
53     static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2];
54     static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1];
55     static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1];
56     static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1];
57     static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1];
58     static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1];
59     static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1];
60     static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1];
61     static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1];
62     static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
63     static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
64 
65     /* the smallest non denormalized float number */
66     static immutable __m128i _psi_min_norm_pos  = [0x00800000,   0x00800000,   0x00800000, 0x00800000];
67 
68     __m128i emm0;
69     __m128 one = _ps_1;
70     __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
71     x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos);  /* cut off denormalized stuff */
72     emm0 = _mm_srli_epi32(cast(__m128i)x, 23);
73 
74     /* keep only the fractional part */
75     x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask);
76     x = _mm_or_ps(x, _ps_0p5);
77 
78     emm0 = _mm_sub_epi32(emm0, _pi32_0x7f);
79     __m128 e = _mm_cvtepi32_ps(emm0);
80     e += one;
81     __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
82     __m128 tmp = _mm_and_ps(x, mask);
83     x -= one;
84     e -= _mm_and_ps(one, mask);
85     x += tmp;
86     __m128 z = x * x;
87     __m128 y = _ps_cephes_log_p0;
88     y *= x;
89     y += _ps_cephes_log_p1;
90     y *= x;
91     y += _ps_cephes_log_p2;
92     y *= x;
93     y += _ps_cephes_log_p3;
94     y *= x;
95     y += _ps_cephes_log_p4;
96     y *= x;
97     y += _ps_cephes_log_p5;
98     y *= x;
99     y += _ps_cephes_log_p6;
100     y *= x;
101     y += _ps_cephes_log_p7;
102     y *= x;
103     y += _ps_cephes_log_p8;
104     y *= x;
105 
106     y = y * z;
107     tmp = e * _ps_cephes_log_q1;
108     y += tmp;
109     tmp = z * _ps_0p5;
110     y = y - tmp;
111     tmp = e * _ps_cephes_log_q2;
112     x += y;
113     x += tmp;
114     x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
115     return x;
116 }
117 
118 /// Natural `exp` computed for a single float.
119 /// This is an approximation, valid up to approximately -109dB of accuracy
120 /// IMPORTANT: NaN input not supported.
121 // #BONUS
122 float _mm_exp_ss(float v) pure @safe
123 {
124     __m128 r = _mm_exp_ps(_mm_set1_ps(v));
125     return r.array[0];
126 }
127 
128 /// Natural `exp` computed for 4 simultaneous float in `x`.
129 /// This is an approximation, valid up to approximately -109dB of accuracy
130 /// IMPORTANT: NaN input not supported.
131 // #BONUS
132 __m128 _mm_exp_ps(__m128 x) pure @safe
133 {
134     static immutable __m128 _ps_exp_hi         = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f];
135     static immutable __m128 _ps_exp_lo         = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f];
136     static immutable __m128 _ps_cephes_LOG2EF  = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341];
137     static immutable __m128 _ps_cephes_exp_C1  = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
138     static immutable __m128 _ps_cephes_exp_C2  = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
139     static immutable __m128 _ps_cephes_exp_p0  = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4];
140     static immutable __m128 _ps_cephes_exp_p1  = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3];
141     static immutable __m128 _ps_cephes_exp_p2  = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3];
142     static immutable __m128 _ps_cephes_exp_p3  = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2];
143     static immutable __m128 _ps_cephes_exp_p4  = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1];
144     static immutable __m128 _ps_cephes_exp_p5  = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1];
145 
146     __m128 tmp = _mm_setzero_ps(), fx;
147     __m128i emm0;
148     __m128 one = _ps_1;
149 
150     x = _mm_min_ps(x, _ps_exp_hi);
151     x = _mm_max_ps(x, _ps_exp_lo);
152 
153     /* express exp(x) as exp(g + n*log(2)) */
154     fx = x * _ps_cephes_LOG2EF;
155     fx += _ps_0p5;
156 
157     /* how to perform a floorf with SSE: just below */
158     emm0 = _mm_cvttps_epi32(fx);
159     tmp  = _mm_cvtepi32_ps(emm0);
160 
161     /* if greater, substract 1 */
162     __m128 mask = _mm_cmpgt_ps(tmp, fx);
163     mask = _mm_and_ps(mask, one);
164     fx = tmp - mask;
165 
166     tmp = fx * _ps_cephes_exp_C1;
167     __m128 z = fx * _ps_cephes_exp_C2;
168     x -= tmp;
169     x -= z;
170 
171     z = x * x;
172 
173     __m128 y = _ps_cephes_exp_p0;
174     y *= x;
175     y += _ps_cephes_exp_p1;
176     y *= x;
177     y += _ps_cephes_exp_p2;
178     y *= x;
179     y += _ps_cephes_exp_p3;
180     y *= x;
181     y += _ps_cephes_exp_p4;
182     y *= x;
183     y += _ps_cephes_exp_p5;
184     y *= z;
185     y += x;
186     y += one;
187 
188     /* build 2^n */
189     emm0 = _mm_cvttps_epi32(fx);
190 
191     emm0 = _mm_add_epi32(emm0, _pi32_0x7f);
192     emm0 = _mm_slli_epi32(emm0, 23);
193     __m128 pow2n = cast(__m128)emm0;
194     y *= pow2n;
195     return y;
196 }
197 
198 /// Computes `base^exponent` for a single 32-bit float.
199 /// This is an approximation, valid up to approximately -100dB of accuracy
200 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
201 // #BONUS
202 float _mm_pow_ss(float base, float exponent) pure @safe
203 {
204     __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent));
205     return r.array[0];
206 }
207 
208 /// Computes `base^exponent`, for 4 floats at once.
209 /// This is an approximation, valid up to approximately -100dB of accuracy
210 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
211 // #BONUS
212 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe
213 {
214     return _mm_exp_ps(exponents * _mm_log_ps(base));
215 }
216 
217 /// Computes `base^exponent`, for 4 floats at once.
218 /// This is an approximation, valid up to approximately -100dB of accuracy
219 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
220 // #BONUS
221 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe
222 {
223     return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base));
224 }
225 
226 unittest
227 {
228     import std.math;
229 
230     bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow
231     {
232         if (!isFinite(groundTruth))
233             return true; // no need to approximate where this is NaN or infinite
234 
235         if (groundTruth == 0) // the approximaton should produce zero too if needed
236         {
237             return approx == 0;
238         }
239 
240         if (approx == 0)
241         {
242             // If the approximation produces zero, the error should be below 140 dB
243             return ( abs(groundTruth) < 1e-7 );
244         }
245 
246         if ( ( abs(groundTruth / approx) - 1 ) >= epsilon)
247         {
248             import core.stdc.stdio;
249             debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon);
250             debug printf("ratio is %f\n", abs(groundTruth / approx) - 1);
251         }
252 
253         return ( abs(groundTruth / approx) - 1 ) < epsilon;
254     }
255 
256     // test _mm_log_ps
257     for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05)
258     {
259         foreach (exponent; -23..23)
260         {
261             double x = mantissa * 2.0 ^^ exponent;
262             double phobosValue = log(x);
263             __m128 v = _mm_log_ps(_mm_set1_ps(x));
264             foreach(i; 0..4)
265                 assert(approxEquals(phobosValue, v.array[i], 1.1e-6));
266         }
267     }
268 
269     // test _mm_exp_ps    
270     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
271     {
272         foreach (exponent; -23..23)
273         {
274             double x = mantissa * 2.0 ^^ exponent;
275 
276             // don't test too high numbers because they saturate FP precision pretty fast
277             if (x > 50) continue;
278 
279             double phobosValue = exp(x);
280             __m128 v = _mm_exp_ps(_mm_set1_ps(x));
281             foreach(i; 0..4)
282             {
283                 if (!approxEquals(phobosValue, v.array[i], 3.4e-6))
284                 {
285                     import core.stdc.stdio;
286                     printf("x = %f   truth = %f vs estimate = %fn", x, phobosValue, v.array[i]);
287                     assert(false);
288                 }
289             }
290         }
291     }
292 
293     // test than exp(-inf) is 0
294     {
295         __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity));
296         float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
297         assert(R.array == correct);
298     }
299 
300     // test log baheviour with NaN and infinities
301     // the only guarantee for now is that _mm_log_ps(negative) yield a NaN
302     {
303         __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan));
304       // DOESN'T PASS
305       //  assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity
306       // DOESN'T PASS
307       //  assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity
308         assert(isNaN(R.array[2])); // log(negative number) = NaN
309 
310         // DOESN'T PASS
311         //assert(isNaN(R[3])); // log(NaN) = NaN
312     }
313 
314 
315     // test _mm_pow_ps
316     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
317     {
318         foreach (exponent; -8..4)
319         {
320             double powExponent = mantissa * 2.0 ^^ exponent;
321 
322             for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1)
323             {
324                 foreach (exponent2; -4..4)
325                 {
326                     double powBase = mantissa2 * 2.0 ^^ exponent2;
327                     double phobosValue = pow(powBase, powExponent);
328                     float fPhobos = phobosValue;
329                     if (!isFinite(fPhobos)) continue;
330                      __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent));
331 
332                     foreach(i; 0..4)
333                     {
334                         if (!approxEquals(phobosValue, v.array[i], 1e-5))
335                         {
336                             printf("%g ^^ %g\n", powBase, powExponent);
337                             assert(false);
338                         }
339                     }
340                 }
341             }
342         }
343     }
344 }
345 
346 private:
347 
348 static immutable __m128 _ps_1   = [1.0f, 1.0f, 1.0f, 1.0f];
349 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f];
350 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];
The OpenD Programming Language