//Excised from deep in the bowels of the cuda distribution // Allan Struthers (Jun 29 2010) // Full file /usr/local/include/cuda/math_functions.h // is 3829 lines // Start line is 1287 // /* approximate 2*atanh(a/2) for |a| < 0.245 */ __device_func__(float __internal_atanhf_kernel(float a_1, float a_2)) { float a, a2, t; a = a_1 + a_2; a2 = a * a; t = 1.566305595598990E-001f/64.0f; t = t * a2 + 1.995081856004762E-001f/16.0f; t = t * a2 + 3.333382699617026E-001f/4.0f; t = t * a2; t = t * a + a_2; t = t + a_1; return t; } /* compute atan(r) in first octant, i.e. 0 <= r <= 1 * eps ~= 2.16e-7 */ __device_func__(float __internal_atanf_kernel(float a)) { float t4, t0, t1; t4 = a * a; t0 = - 5.674867153f; t0 = t4 * - 0.823362947f + t0; t0 = t0 * t4 - 6.565555096f; t0 = t0 * t4; t0 = t0 * a; t1 = t4 + 11.33538818f; t1 = t1 * t4 + 28.84246826f; t1 = t1 * t4 + 19.69667053f; t1 = 1.0f / t1; a = t0 * t1 + a; return a; } /* approximate tangent on -pi/4...+pi/4 */ __device_func__(float __internal_tan_kernel(float a)) { float a2, s, t; a2 = a * a; t = 4.114678393115178E-003f * a2 - 8.231194034909670E-001f; s = a2 - 2.469348886157666E+000f; s = 1.0f / s; t = t * s; t = t * a2; t = t * a + a; return t; } __device_func__(float __internal_accurate_logf(float a)) { float t; float z; float m; int ia, e; ia = __float_as_int(a); /* handle special cases */ if ((ia < 0x00800000) || (ia > 0x7f7fffff)) { return __logf(a); } /* log(a) = 2 * atanh((a-1)/(a+1)) */ m = __int_as_float((ia & 0x807fffff) | 0x3f800000); e = ((unsigned)ia >> 23) - 127; if (m > CUDART_SQRT_TWO_F) { m = m * 0.5f; e = e + 1; } t = m - 1.0f; z = m + 1.0f; z = t / z; z = -t * z; z = __internal_atanhf_kernel(t, z); z = (float)e * CUDART_LN2_F + z; return z; } __device_func__(float2 __internal_log_ep(float a)) { float2 res; int expo; float m; float log_hi, log_lo; float t_hi, t_lo; float f, g, u, v, q; #if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__) volatile float r, s, e; #else float r, s, e; #endif expo = (__float_as_int(a) >> 23) & 0xff; #if !defined(__CUDABE__) /* convert denormals to normals for computation of log(a) */ if (expo == 0) { a *= CUDART_TWO_TO_24_F; expo = (__float_as_int(a) >> 23) & 0xff; expo -= 24; } #endif expo -= 127; m = __int_as_float((__float_as_int(a) & 0x807fffff) | 0x3f800000); if (m > CUDART_SQRT_TWO_F) { m = m * 0.5f; expo = expo + 1; } /* compute log(m) with extended precision using an algorithm from P.T.P. * Tang, "Table Driven Implementation of the Logarithm Function", TOMS, * Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. */ f = m - 1.0f; g = m + 1.0f; g = 1.0f / g; u = 2.0f * f * g; v = u * u; q = 1.49356810919559350E-001f/64.0f; q = q * v + 1.99887797540072460E-001f/16.0f; q = q * v + 3.33333880955515580E-001f/4.0f; q = q * v; q = q * u; log_hi = __int_as_float(__float_as_int(u) & 0xfffff000); v = __int_as_float(__float_as_int(f) & 0xfffff000); u = 2.0f * (f - log_hi); f = f - v; u = u - log_hi * v; u = u - log_hi * f; u = g * u; /* compute log(m) = log_hi + u + q in double-single format*/ /* log += u; |log| > |u| */ r = log_hi + u; s = u - (r - log_hi); log_hi = r; log_lo = s; /* log += q; |log| > |q| */ r = log_hi + q; s = ((log_hi - r) + q) + log_lo; log_hi = e = r + s; log_lo = (r - e) + s; /* log(2)*expo in double-single format */ t_hi = expo * 0.6931457519f; /* multiplication is exact */ t_lo = expo * 1.4286067653e-6f; /* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */ r = t_hi + log_hi; s = (((t_hi - r) + log_hi) + log_lo) + t_lo; res.y = e = r + s; res.x = (r - e) + s; return res; } __device_func__(float __internal_accurate_log2f(float a)) { return CUDART_L2E_F * __internal_accurate_logf(a); } /* 160 bits of 2/PI for Payne-Hanek style argument reduction. */ static __constant__ unsigned int __cudart_i2opi_f [] = { 0x3c439041, 0xdb629599, 0xf534ddc0, 0xfc2757d1, 0x4e441529, 0xa2f9836e, }; /* reduce argument to trig function to -pi/4...+pi/4 */ __device_func__(float __internal_trig_reduction_kernel(float a, int *quadrant)) { float j; int q; if (__cuda_fabsf(a) > CUDART_TRIG_PLOSS_F) { /* Payne-Hanek style argument reduction. */ unsigned int ia = __float_as_int(a); unsigned int s = ia & 0x80000000; unsigned int result[7]; unsigned int phi, plo; unsigned int hi, lo; unsigned int e; int idx; e = ((ia >> 23) & 0xff) - 128; ia = (ia << 8) | 0x80000000; /* compute x * 2/pi */ idx = 4 - (e >> 5); hi = 0; #if defined(__CUDABE__) #pragma unroll 1 #endif /* __CUDABE__ */ for (q = 0; q < 6; q++) { plo = __cudart_i2opi_f[q] * ia; phi = __umulhi (__cudart_i2opi_f[q], ia); lo = hi + plo; hi = phi + (lo < plo); result[q] = lo; } result[q] = hi; e = e & 31; /* shift result such that hi:lo<63:62> are the least significant integer bits, and hi:lo<61:0> are the fractional bits of the result */ hi = result[idx+2]; lo = result[idx+1]; if (e) { q = 32 - e; hi = (hi << e) | (lo >> q); lo = (lo << e) | (result[idx] >> q); } q = hi >> 30; /* fraction */ hi = (hi << 2) | (lo >> 30); lo = (lo << 2); e = (hi + (lo > 0)) > 0x80000000; /* fraction >= 0.5 */ q += e; if (s) q = -q; if (e) { unsigned int t; hi = ~hi; lo = -(int)lo; t = (lo == 0); hi += t; s = s ^ 0x80000000; } *quadrant = q; /* normalize fraction */ e = 0; while ((int)hi > 0) { hi = (hi << 1) | (lo >> 31); lo = (lo << 1); e--; } lo = hi * 0xc90fdaa2; hi = __umulhi(hi, 0xc90fdaa2); if ((int)hi > 0) { hi = (hi << 1) | (lo >> 31); lo = (lo << 1); e--; } hi = hi + (lo > 0); ia = s | (((e + 126) << 23) + (hi >> 8) + ((hi << 24) >= 0x80000000)); return __int_as_float(ia); } q = __float2int_rn(a * CUDART_2_OVER_PI_F); j = (float)q; a = a - j * 1.5703125000000000e+000f; a = a - j * 4.8351287841796875e-004f; a = a - j * 3.1385570764541626e-007f; a = a - j * 6.0771005065061922e-011f; *quadrant = q; return a; } /* High quality implementation of expf(). A naive implementation, expf(x) = * exp2f (x * log2(e)), loses significant accuracy for large arguments, and * may return results with only 15 to 16 good bits (out of 24). The present * implementation limits the error to about 2 ulps across the entire argument * range. It does so by employing an extended precision representation for * ln(2) which is composited from ln2_hi = 0.6931457519f which provides the * most significant 16-bit of ln(2), and ln2_lo = 1.4286067653e-6f, which * provides the least significant 24 bits. */ __device_func__(float __internal_expf_kernel(float a, float scale)) { float j, z; j = __cuda_truncf(a * CUDART_L2E_F); z = a - j * 0.6931457519f; z = z - j * 1.4286067653e-6f; z = z * CUDART_L2E_F; z = __cuda_exp2f(z) * __cuda_exp2f(j + scale); return z; } __device_func__(float __internal_accurate_expf(float a)) { float z; z = __internal_expf_kernel(a, 0.0f); if (a < -105.0f) z = 0.0f; if (a > 105.0f) z = CUDART_INF_F; return z; } __device_func__(float __internal_accurate_exp10f(float a)) { float j, z; j = __cuda_truncf(a * CUDART_L2T_F); z = a - j * 3.0102920532226563e-001f; z = z - j * 7.9034171557301747e-007f; z = z * CUDART_L2T_F; z = __cuda_exp2f(z) * __cuda_exp2f(j); if (a < -46.0f) z = 0.0f; if (a > 46.0f) z = CUDART_INF_F; return z; } __device_func__(float __internal_lgammaf_pos(float a)) { float sum; float s, t; if (a == CUDART_INF_F) { return a; } if (a >= 3.0f) { if (a >= 7.8f) { /* Stirling approximation for a >= 8; coefficients from Hart et al, * "Computer Approximations", Wiley 1968. Approximation 5401 */ s = 1.0f / a; t = s * s; sum = 0.77783067e-3f; sum = sum * t - 0.2777655457e-2f; sum = sum * t + 0.83333273853e-1f; sum = sum * s + 0.918938533204672f; s = 0.5f * __internal_accurate_logf(a); t = a - 0.5f; s = s * t; t = s - a; s = __fadd_rn(s, sum); /* prevent FMAD merging */ t = t + s; return t; } else { a = a - 3.0f; s = - 7.488903254816711E+002f; s = s * a - 1.234974215949363E+004f; s = s * a - 4.106137688064877E+004f; s = s * a - 4.831066242492429E+004f; s = s * a - 1.430333998207429E+005f; t = a - 2.592509840117874E+002f; t = t * a - 1.077717972228532E+004f; t = t * a - 9.268505031444956E+004f; t = t * a - 2.063535768623558E+005f; t = s / t; t = t + a; return t; } } else if (a >= 1.5f) { a = a - 2.0f; t = + 4.959849168282574E-005f; t = t * a - 2.208948403848352E-004f; t = t * a + 5.413142447864599E-004f; t = t * a - 1.204516976842832E-003f; t = t * a + 2.884251838546602E-003f; t = t * a - 7.382757963931180E-003f; t = t * a + 2.058131963026755E-002f; t = t * a - 6.735248600734503E-002f; t = t * a + 3.224670187176319E-001f; t = t * a + 4.227843368636472E-001f; t = t * a; return t; } else if (a >= 0.7f) { a = 1.0f - a; t = + 4.588266515364258E-002f; t = t * a + 1.037396712740616E-001f; t = t * a + 1.228036339653591E-001f; t = t * a + 1.275242157462838E-001f; t = t * a + 1.432166835245778E-001f; t = t * a + 1.693435824224152E-001f; t = t * a + 2.074079329483975E-001f; t = t * a + 2.705875136435339E-001f; t = t * a + 4.006854436743395E-001f; t = t * a + 8.224669796332661E-001f; t = t * a + 5.772156651487230E-001f; t = t * a; return t; } else { t = + 3.587515669447039E-003f; t = t * a - 5.471285428060787E-003f; t = t * a - 4.462712795343244E-002f; t = t * a + 1.673177015593242E-001f; t = t * a - 4.213597883575600E-002f; t = t * a - 6.558672843439567E-001f; t = t * a + 5.772153712885004E-001f; t = t * a; t = t * a + a; return -__internal_accurate_logf(t); } } /* approximate sine on -pi/4...+pi/4 */ __device_func__(float __internal_sin_kernel(float x)) { float x2, z; x2 = x * x; z = - 1.95152959e-4f; z = z * x2 + 8.33216087e-3f; z = z * x2 - 1.66666546e-1f; z = z * x2; z = z * x + x; return z; } /* approximate cosine on -pi/4...+pi/4 */ __device_func__(float __internal_cos_kernel(float x)) { float x2, z; x2 = x * x; z = 2.44331571e-5f; z = z * x2 - 1.38873163e-3f; z = z * x2 + 4.16666457e-2f; z = z * x2 - 5.00000000e-1f; z = z * x2 + 1.00000000e+0f; return z; } __device_func__(float __internal_accurate_sinf(float a)) { float z; int i; if ((__cuda___isinff(a)) || (a == CUDART_ZERO_F)) { return __fmul_rn (a, CUDART_ZERO_F); } z = __internal_trig_reduction_kernel(a, &i); /* here, abs(z) <= pi/4, and i has the quadrant */ if (i & 1) { z = __internal_cos_kernel(z); } else { z = __internal_sin_kernel(z); } if (i & 2) { z = -z; } return z; } // Stop line is 1718