1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24 */
25/*
26 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27 * Use is subject to license terms.
28 */
29
30#include <sys/isa_defs.h>
31#include "libm_inlines.h"
32
33#ifdef _LITTLE_ENDIAN
34#define HI(x)	*(1+(int*)x)
35#define LO(x)	*(unsigned*)x
36#else
37#define HI(x)	*(int*)x
38#define LO(x)	*(1+(unsigned*)x)
39#endif
40
41#ifdef __RESTRICT
42#define restrict _Restrict
43#else
44#define restrict
45#endif
46
47/* double rsqrt(double x)
48 *
49 * Method :
50 *	1. Special cases:
51 *		for x = NaN				=> QNaN;
52 *		for x = +Inf				=> 0;
53 *		for x is negative, -Inf			=> QNaN + invalid;
54 *		for x = +0				=> +Inf + divide-by-zero;
55 *		for x = -0				=> -Inf + divide-by-zero.
56 *	2. Computes reciprocal square root from:
57 *		x = m * 2**n
58 *	Where:
59 *		m = [0.5, 2),
60 *		n = ((exponent + 1) & ~1).
61 *	Then:
62 *		rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
63 *	2. Computes 1/sqrt(m) from:
64 *		1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
65 *	Where:
66 *		m = m0 + dm,
67 *		m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
68 *		m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
69 *		m0 = 2.0              for m = [1.0+127/128, 2.0),         k = 128.
70 *	Then:
71 *		1/sqrt(m0) is looked up in a table,
72 *		1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
73 *		1/sqrt(1 + (1/m0)*dm) is computed using approximation:
74 *			1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
75 *						* z + a2) * z + a1) * z + a0
76 *			where z = [-1/128, 1/128].
77 *
78 * Accuracy:
79 *	The maximum relative error for the approximating
80 *	polynomial is 2**(-56.26).
81 *	Maximum error observed: less than 0.563 ulp after 1.500.000.000
82 *	results.
83 */
84
85extern double sqrt (double);
86extern const double __vlibm_TBL_rsqrt[];
87
88static void
89__vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
90
91#pragma no_inline(__vrsqrt_n)
92
93#define RETURN(ret)						\
94{								\
95	*py = (ret);						\
96	py += stridey;						\
97	if (n_n == 0)						\
98	{							\
99		spx = px; spy = py;				\
100		hx = HI(px);					\
101		continue;					\
102	}							\
103	n--;							\
104	break;							\
105}
106
107static const double
108	DONE = 1.0,
109	K1 = -5.00000000000005209867e-01,
110	K2 =  3.75000000000004884257e-01,
111	K3 = -3.12499999317136886551e-01,
112	K4 =  2.73437499359815081532e-01,
113	K5 = -2.46116125605037803130e-01,
114	K6 =  2.25606914648617522896e-01;
115
116void
117__vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
118{
119	double		*spx, *spy;
120	int		ax, lx, hx, n_n;
121	double		res;
122
123	while (n > 1)
124	{
125		n_n = 0;
126		spx = px;
127		spy = py;
128		hx = HI(px);
129		for (; n > 1 ; n--)
130		{
131			px += stridex;
132			if (hx >= 0x7ff00000)		/* X = NaN or Inf	*/
133			{
134				res = *(px - stridex);
135				RETURN (DONE / res)
136			}
137
138			py += stridey;
139
140			if (hx < 0x00100000)		/* X = denormal, zero or negative	*/
141			{
142				py -= stridey;
143				ax = hx & 0x7fffffff;
144				lx = LO((px - stridex));
145				res = *(px - stridex);
146
147				if ((ax | lx) == 0)	/* |X| = zero	*/
148				{
149					RETURN (DONE / res)
150				}
151				else if (hx >= 0)	/* X = denormal	*/
152				{
153					double		res_c0, dsqrt_exp0;
154					int		ind0, sqrt_exp0;
155					double		xx0, dexp_hi0, dexp_lo0;
156					int		hx0, resh0, res_ch0;
157
158					res = *(long long*)&res;
159
160					hx0 = HI(&res);
161					sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
162					ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
163
164					resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
165					res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
166					HI(&res) = resh0;
167					HI(&res_c0) = res_ch0;
168					LO(&res_c0) = 0;
169
170					dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
171					dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
172					xx0 = dexp_hi0 * dexp_hi0;
173					xx0 = (res - res_c0) * xx0;
174					res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
175
176					res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
177
178					HI(&dsqrt_exp0) = sqrt_exp0;
179					LO(&dsqrt_exp0) = 0;
180					res *= dsqrt_exp0;
181
182					RETURN (res)
183				}
184				else	/* X = negative	*/
185				{
186					RETURN (sqrt(res))
187				}
188			}
189			n_n++;
190			hx = HI(px);
191		}
192		if (n_n > 0)
193			__vrsqrt_n(n_n, spx, stridex, spy, stridey);
194	}
195	if (n > 0)
196	{
197		hx = HI(px);
198
199		if (hx >= 0x7ff00000)		/* X = NaN or Inf	*/
200		{
201			res = *px;
202			*py = DONE / res;
203		}
204		else if (hx < 0x00100000)	/* X = denormal, zero or negative	*/
205		{
206			ax = hx & 0x7fffffff;
207			lx = LO(px);
208			res = *px;
209
210			if ((ax | lx) == 0)	/* |X| = zero	*/
211			{
212				*py = DONE / res;
213			}
214			else if (hx >= 0)	/* X = denormal	*/
215			{
216				double		res_c0, dsqrt_exp0;
217				int		ind0, sqrt_exp0;
218				double		xx0, dexp_hi0, dexp_lo0;
219				int		hx0, resh0, res_ch0;
220
221				res = *(long long*)&res;
222
223				hx0 = HI(&res);
224				sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
225				ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
226
227				resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
228				res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
229				HI(&res) = resh0;
230				HI(&res_c0) = res_ch0;
231				LO(&res_c0) = 0;
232
233				dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
234				dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
235				xx0 = dexp_hi0 * dexp_hi0;
236				xx0 = (res - res_c0) * xx0;
237				res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
238
239				res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
240
241				HI(&dsqrt_exp0) = sqrt_exp0;
242				LO(&dsqrt_exp0) = 0;
243				res *= dsqrt_exp0;
244
245				*py = res;
246			}
247			else	/* X = negative	*/
248			{
249				*py = sqrt(res);
250			}
251		}
252		else
253		{
254			double		res_c0, dsqrt_exp0;
255			int		ind0, sqrt_exp0;
256			double		xx0, dexp_hi0, dexp_lo0;
257			int		resh0, res_ch0;
258
259			sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
260			ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
261
262			resh0 = (hx & 0x001fffff) | 0x3fe00000;
263			res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
264			HI(&res) = resh0;
265			LO(&res) = LO(px);
266			HI(&res_c0) = res_ch0;
267			LO(&res_c0) = 0;
268
269			dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
270			dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
271			xx0 = dexp_hi0 * dexp_hi0;
272			xx0 = (res - res_c0) * xx0;
273			res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
274
275			res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
276
277			HI(&dsqrt_exp0) = sqrt_exp0;
278			LO(&dsqrt_exp0) = 0;
279			res *= dsqrt_exp0;
280
281			*py = res;
282		}
283	}
284}
285
286static void
287__vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
288{
289	double		res0, res_c0, dsqrt_exp0;
290	double		res1, res_c1, dsqrt_exp1;
291	double		res2, res_c2, dsqrt_exp2;
292	int		ind0, sqrt_exp0;
293	int		ind1, sqrt_exp1;
294	int		ind2, sqrt_exp2;
295	double		xx0, dexp_hi0, dexp_lo0;
296	double		xx1, dexp_hi1, dexp_lo1;
297	double		xx2, dexp_hi2, dexp_lo2;
298	int		hx0, resh0, res_ch0;
299	int		hx1, resh1, res_ch1;
300	int		hx2, resh2, res_ch2;
301
302	LO(&dsqrt_exp0) = 0;
303	LO(&dsqrt_exp1) = 0;
304	LO(&dsqrt_exp2) = 0;
305	LO(&res_c0) = 0;
306	LO(&res_c1) = 0;
307	LO(&res_c2) = 0;
308
309	for(; n > 2 ; n -= 3)
310	{
311		hx0 = HI(px);
312		LO(&res0) = LO(px);
313		px += stridex;
314
315		hx1 = HI(px);
316		LO(&res1) = LO(px);
317		px += stridex;
318
319		hx2 = HI(px);
320		LO(&res2) = LO(px);
321		px += stridex;
322
323		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
324		sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
325		sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
326		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
327		ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
328		ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
329
330		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
331		resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
332		resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
333		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
334		res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
335		res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
336		HI(&res0) = resh0;
337		HI(&res1) = resh1;
338		HI(&res2) = resh2;
339		HI(&res_c0) = res_ch0;
340		HI(&res_c1) = res_ch1;
341		HI(&res_c2) = res_ch2;
342
343		dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
344		dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
345		dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
346		dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
347		dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
348		dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
349		xx0 = dexp_hi0 * dexp_hi0;
350		xx1 = dexp_hi1 * dexp_hi1;
351		xx2 = dexp_hi2 * dexp_hi2;
352		xx0 = (res0 - res_c0) * xx0;
353		xx1 = (res1 - res_c1) * xx1;
354		xx2 = (res2 - res_c2) * xx2;
355		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
356		res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
357		res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
358
359		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
360		res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
361		res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
362
363		HI(&dsqrt_exp0) = sqrt_exp0;
364		HI(&dsqrt_exp1) = sqrt_exp1;
365		HI(&dsqrt_exp2) = sqrt_exp2;
366		res0 *= dsqrt_exp0;
367		res1 *= dsqrt_exp1;
368		res2 *= dsqrt_exp2;
369
370		*py = res0;
371		py += stridey;
372
373		*py = res1;
374		py += stridey;
375
376		*py = res2;
377		py += stridey;
378	}
379
380	for(; n > 0 ; n--)
381	{
382		hx0 = HI(px);
383
384		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
385		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
386
387		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
388		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
389		HI(&res0) = resh0;
390		LO(&res0) = LO(px);
391		HI(&res_c0) = res_ch0;
392		LO(&res_c0) = 0;
393
394		px += stridex;
395
396		dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
397		dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
398		xx0 = dexp_hi0 * dexp_hi0;
399		xx0 = (res0 - res_c0) * xx0;
400		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
401
402		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
403
404		HI(&dsqrt_exp0) = sqrt_exp0;
405		LO(&dsqrt_exp0) = 0;
406		res0 *= dsqrt_exp0;
407
408		*py = res0;
409		py += stridey;
410	}
411}
412