1 /** 2 * Transcendental bonus functions. 3 * 4 * Copyright: Copyright Guillaumr Piolat 2016-2020. 5 * Copyright (C) 2007 Julien Pommier 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.math; 9 10 /* Copyright (C) 2007 Julien Pommier 11 12 This software is provided 'as-is', without any express or implied 13 warranty. In no event will the authors be held liable for any damages 14 arising from the use of this software. 15 16 Permission is granted to anyone to use this software for any purpose, 17 including commercial applications, and to alter it and redistribute it 18 freely, subject to the following restrictions: 19 20 1. The origin of this software must not be misrepresented; you must not 21 claim that you wrote the original software. If you use this software 22 in a product, an acknowledgment in the product documentation would be 23 appreciated but is not required. 24 2. Altered source versions must be plainly marked as such, and must not be 25 misrepresented as being the original software. 26 3. This notice may not be removed or altered from any source distribution. 27 28 (this is the zlib license) 29 */ 30 import inteli.emmintrin; 31 import inteli.internals; 32 33 nothrow @nogc: 34 35 /// Natural `log` computed for a single 32-bit float. 36 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 37 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 38 // #BONUS 39 float _mm_log_ss(float v) pure @safe 40 { 41 __m128 r = _mm_log_ps(_mm_set1_ps(v)); 42 return r.array[0]; 43 } 44 45 /// Natural logarithm computed for 4 simultaneous float. 46 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 47 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 48 // #BONUS 49 __m128 _mm_log_ps(__m128 x) pure @safe 50 { 51 static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000]; 52 static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524]; 53 static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2]; 54 static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1]; 55 static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1]; 56 static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1]; 57 static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1]; 58 static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1]; 59 static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1]; 60 static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1]; 61 static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1]; 62 static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 63 static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 64 65 /* the smallest non denormalized float number */ 66 static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000]; 67 68 __m128i emm0; 69 __m128 one = _ps_1; 70 __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); 71 x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */ 72 emm0 = _mm_srli_epi32(cast(__m128i)x, 23); 73 74 /* keep only the fractional part */ 75 x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask); 76 x = _mm_or_ps(x, _ps_0p5); 77 78 emm0 = _mm_sub_epi32(emm0, _pi32_0x7f); 79 __m128 e = _mm_cvtepi32_ps(emm0); 80 e += one; 81 __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF); 82 __m128 tmp = _mm_and_ps(x, mask); 83 x -= one; 84 e -= _mm_and_ps(one, mask); 85 x += tmp; 86 __m128 z = x * x; 87 __m128 y = _ps_cephes_log_p0; 88 y *= x; 89 y += _ps_cephes_log_p1; 90 y *= x; 91 y += _ps_cephes_log_p2; 92 y *= x; 93 y += _ps_cephes_log_p3; 94 y *= x; 95 y += _ps_cephes_log_p4; 96 y *= x; 97 y += _ps_cephes_log_p5; 98 y *= x; 99 y += _ps_cephes_log_p6; 100 y *= x; 101 y += _ps_cephes_log_p7; 102 y *= x; 103 y += _ps_cephes_log_p8; 104 y *= x; 105 106 y = y * z; 107 tmp = e * _ps_cephes_log_q1; 108 y += tmp; 109 tmp = z * _ps_0p5; 110 y = y - tmp; 111 tmp = e * _ps_cephes_log_q2; 112 x += y; 113 x += tmp; 114 x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN 115 return x; 116 } 117 118 /// Natural `exp` computed for a single float. 119 /// This is an approximation, valid up to approximately -109dB of accuracy 120 /// IMPORTANT: NaN input not supported. 121 // #BONUS 122 float _mm_exp_ss(float v) pure @safe 123 { 124 __m128 r = _mm_exp_ps(_mm_set1_ps(v)); 125 return r.array[0]; 126 } 127 128 /// Natural `exp` computed for 4 simultaneous float in `x`. 129 /// This is an approximation, valid up to approximately -109dB of accuracy 130 /// IMPORTANT: NaN input not supported. 131 // #BONUS 132 __m128 _mm_exp_ps(__m128 x) pure @safe 133 { 134 static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f]; 135 static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f]; 136 static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341]; 137 static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 138 static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 139 static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4]; 140 static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3]; 141 static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3]; 142 static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2]; 143 static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1]; 144 static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1]; 145 146 __m128 tmp = _mm_setzero_ps(), fx; 147 __m128i emm0; 148 __m128 one = _ps_1; 149 150 x = _mm_min_ps(x, _ps_exp_hi); 151 x = _mm_max_ps(x, _ps_exp_lo); 152 153 /* express exp(x) as exp(g + n*log(2)) */ 154 fx = x * _ps_cephes_LOG2EF; 155 fx += _ps_0p5; 156 157 /* how to perform a floorf with SSE: just below */ 158 emm0 = _mm_cvttps_epi32(fx); 159 tmp = _mm_cvtepi32_ps(emm0); 160 161 /* if greater, substract 1 */ 162 __m128 mask = _mm_cmpgt_ps(tmp, fx); 163 mask = _mm_and_ps(mask, one); 164 fx = tmp - mask; 165 166 tmp = fx * _ps_cephes_exp_C1; 167 __m128 z = fx * _ps_cephes_exp_C2; 168 x -= tmp; 169 x -= z; 170 171 z = x * x; 172 173 __m128 y = _ps_cephes_exp_p0; 174 y *= x; 175 y += _ps_cephes_exp_p1; 176 y *= x; 177 y += _ps_cephes_exp_p2; 178 y *= x; 179 y += _ps_cephes_exp_p3; 180 y *= x; 181 y += _ps_cephes_exp_p4; 182 y *= x; 183 y += _ps_cephes_exp_p5; 184 y *= z; 185 y += x; 186 y += one; 187 188 /* build 2^n */ 189 emm0 = _mm_cvttps_epi32(fx); 190 191 emm0 = _mm_add_epi32(emm0, _pi32_0x7f); 192 emm0 = _mm_slli_epi32(emm0, 23); 193 __m128 pow2n = cast(__m128)emm0; 194 y *= pow2n; 195 return y; 196 } 197 198 /// Computes `base^exponent` for a single 32-bit float. 199 /// This is an approximation, valid up to approximately -100dB of accuracy 200 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 201 // #BONUS 202 float _mm_pow_ss(float base, float exponent) pure @safe 203 { 204 __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent)); 205 return r.array[0]; 206 } 207 208 /// Computes `base^exponent`, for 4 floats at once. 209 /// This is an approximation, valid up to approximately -100dB of accuracy 210 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 211 // #BONUS 212 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe 213 { 214 return _mm_exp_ps(exponents * _mm_log_ps(base)); 215 } 216 217 /// Computes `base^exponent`, for 4 floats at once. 218 /// This is an approximation, valid up to approximately -100dB of accuracy 219 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 220 // #BONUS 221 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe 222 { 223 return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base)); 224 } 225 226 unittest 227 { 228 import std.math; 229 230 bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow 231 { 232 if (!isFinite(groundTruth)) 233 return true; // no need to approximate where this is NaN or infinite 234 235 if (groundTruth == 0) // the approximaton should produce zero too if needed 236 { 237 return approx == 0; 238 } 239 240 if (approx == 0) 241 { 242 // If the approximation produces zero, the error should be below 140 dB 243 return ( abs(groundTruth) < 1e-7 ); 244 } 245 246 if ( ( abs(groundTruth / approx) - 1 ) >= epsilon) 247 { 248 import core.stdc.stdio; 249 debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon); 250 debug printf("ratio is %f\n", abs(groundTruth / approx) - 1); 251 } 252 253 return ( abs(groundTruth / approx) - 1 ) < epsilon; 254 } 255 256 // test _mm_log_ps 257 for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05) 258 { 259 foreach (exponent; -23..23) 260 { 261 double x = mantissa * 2.0 ^^ exponent; 262 double phobosValue = log(x); 263 __m128 v = _mm_log_ps(_mm_set1_ps(x)); 264 foreach(i; 0..4) 265 assert(approxEquals(phobosValue, v.array[i], 1.1e-6)); 266 } 267 } 268 269 // test _mm_exp_ps 270 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 271 { 272 foreach (exponent; -23..23) 273 { 274 double x = mantissa * 2.0 ^^ exponent; 275 276 // don't test too high numbers because they saturate FP precision pretty fast 277 if (x > 50) continue; 278 279 double phobosValue = exp(x); 280 __m128 v = _mm_exp_ps(_mm_set1_ps(x)); 281 foreach(i; 0..4) 282 { 283 if (!approxEquals(phobosValue, v.array[i], 3.4e-6)) 284 { 285 import core.stdc.stdio; 286 printf("x = %f truth = %f vs estimate = %fn", x, phobosValue, v.array[i]); 287 assert(false); 288 } 289 } 290 } 291 } 292 293 // test than exp(-inf) is 0 294 { 295 __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity)); 296 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 297 assert(R.array == correct); 298 } 299 300 // test log baheviour with NaN and infinities 301 // the only guarantee for now is that _mm_log_ps(negative) yield a NaN 302 { 303 __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan)); 304 // DOESN'T PASS 305 // assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity 306 // DOESN'T PASS 307 // assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity 308 assert(isNaN(R.array[2])); // log(negative number) = NaN 309 310 // DOESN'T PASS 311 //assert(isNaN(R[3])); // log(NaN) = NaN 312 } 313 314 315 // test _mm_pow_ps 316 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 317 { 318 foreach (exponent; -8..4) 319 { 320 double powExponent = mantissa * 2.0 ^^ exponent; 321 322 for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1) 323 { 324 foreach (exponent2; -4..4) 325 { 326 double powBase = mantissa2 * 2.0 ^^ exponent2; 327 double phobosValue = pow(powBase, powExponent); 328 float fPhobos = phobosValue; 329 if (!isFinite(fPhobos)) continue; 330 __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent)); 331 332 foreach(i; 0..4) 333 { 334 if (!approxEquals(phobosValue, v.array[i], 1e-5)) 335 { 336 printf("%g ^^ %g\n", powBase, powExponent); 337 assert(false); 338 } 339 } 340 } 341 } 342 } 343 } 344 } 345 346 private: 347 348 static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f]; 349 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f]; 350 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];