1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24 */
25/*
26 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27 * Use is subject to license terms.
28 */
29
30#include <sys/isa_defs.h>
31#include "libm_inlines.h"
32
33#ifdef _LITTLE_ENDIAN
34#define HI(x)	*(1+(int*)x)
35#define LO(x)	*(unsigned*)x
36#else
37#define HI(x)	*(int*)x
38#define LO(x)	*(1+(unsigned*)x)
39#endif
40
41#ifdef __RESTRICT
42#define restrict _Restrict
43#else
44#define restrict
45#endif
46
47/* double rhypot(double x, double y)
48 *
49 * Method :
50 *	1. Special cases:
51 *		x or  y = Inf					=> 0
52 *		x or  y = NaN					=> QNaN
53 *		x and y = 0					=> Inf + divide-by-zero
54 *	2. Computes rhypot(x,y):
55 *		rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
56 *	Where:
57 *		m = 1/max(|x|,|y|)
58 *		xnm = x * m
59 *		ynm = y * m
60 *
61 *	Compute 1/(xnm * xnm + ynm * ynm) by simulating
62 *	muti-precision arithmetic.
63 *
64 * Accuracy:
65 *	Maximum error observed: less than 0.869 ulp after 1.000.000.000
66 *	results.
67 */
68
69extern double sqrt(double);
70extern double fabs(double);
71
72static const int __vlibm_TBL_rhypot[] = {
73/* i = [0,127]
74 * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
75 0x7fe00000,  0x7fdfc07f, 0x7fdf81f8,  0x7fdf4465,
76 0x7fdf07c1,  0x7fdecc07, 0x7fde9131,  0x7fde573a,
77 0x7fde1e1e,  0x7fdde5d6, 0x7fddae60,  0x7fdd77b6,
78 0x7fdd41d4,  0x7fdd0cb5, 0x7fdcd856,  0x7fdca4b3,
79 0x7fdc71c7,  0x7fdc3f8f, 0x7fdc0e07,  0x7fdbdd2b,
80 0x7fdbacf9,  0x7fdb7d6c, 0x7fdb4e81,  0x7fdb2036,
81 0x7fdaf286,  0x7fdac570, 0x7fda98ef,  0x7fda6d01,
82 0x7fda41a4,  0x7fda16d3, 0x7fd9ec8e,  0x7fd9c2d1,
83 0x7fd99999,  0x7fd970e4, 0x7fd948b0,  0x7fd920fb,
84 0x7fd8f9c1,  0x7fd8d301, 0x7fd8acb9,  0x7fd886e5,
85 0x7fd86186,  0x7fd83c97, 0x7fd81818,  0x7fd7f405,
86 0x7fd7d05f,  0x7fd7ad22, 0x7fd78a4c,  0x7fd767dc,
87 0x7fd745d1,  0x7fd72428, 0x7fd702e0,  0x7fd6e1f7,
88 0x7fd6c16c,  0x7fd6a13c, 0x7fd68168,  0x7fd661ec,
89 0x7fd642c8,  0x7fd623fa, 0x7fd60581,  0x7fd5e75b,
90 0x7fd5c988,  0x7fd5ac05, 0x7fd58ed2,  0x7fd571ed,
91 0x7fd55555,  0x7fd53909, 0x7fd51d07,  0x7fd50150,
92 0x7fd4e5e0,  0x7fd4cab8, 0x7fd4afd6,  0x7fd49539,
93 0x7fd47ae1,  0x7fd460cb, 0x7fd446f8,  0x7fd42d66,
94 0x7fd41414,  0x7fd3fb01, 0x7fd3e22c,  0x7fd3c995,
95 0x7fd3b13b,  0x7fd3991c, 0x7fd38138,  0x7fd3698d,
96 0x7fd3521c,  0x7fd33ae4, 0x7fd323e3,  0x7fd30d19,
97 0x7fd2f684,  0x7fd2e025, 0x7fd2c9fb,  0x7fd2b404,
98 0x7fd29e41,  0x7fd288b0, 0x7fd27350,  0x7fd25e22,
99 0x7fd24924,  0x7fd23456, 0x7fd21fb7,  0x7fd20b47,
100 0x7fd1f704,  0x7fd1e2ef, 0x7fd1cf06,  0x7fd1bb4a,
101 0x7fd1a7b9,  0x7fd19453, 0x7fd18118,  0x7fd16e06,
102 0x7fd15b1e,  0x7fd1485f, 0x7fd135c8,  0x7fd12358,
103 0x7fd11111,  0x7fd0fef0, 0x7fd0ecf5,  0x7fd0db20,
104 0x7fd0c971,  0x7fd0b7e6, 0x7fd0a681,  0x7fd0953f,
105 0x7fd08421,  0x7fd07326, 0x7fd0624d,  0x7fd05197,
106 0x7fd04104,  0x7fd03091, 0x7fd02040,  0x7fd01010,
107};
108
109static const unsigned long long LCONST[] = {
1100x3ff0000000000000ULL,	/* DONE = 1.0		*/
1110x4000000000000000ULL,	/* DTWO = 2.0		*/
1120x4230000000000000ULL,	/* D2ON36 = 2**36	*/
1130x7fd0000000000000ULL,	/* D2ON1022 = 2**1022	*/
1140x3cb0000000000000ULL,	/* D2ONM52 = 2**-52	*/
115};
116
117#define RET_SC(I)										\
118	px += stridex;										\
119	py += stridey;										\
120	pz += stridez;										\
121	if (--n <= 0)										\
122		break;										\
123	goto start##I;
124
125#define RETURN(I, ret)										\
126{												\
127	pz[0] = (ret);										\
128	RET_SC(I)										\
129}
130
131#define PREP(I)											\
132hx##I = HI(px);										\
133hy##I = HI(py);										\
134hx##I &= 0x7fffffff;										\
135hy##I &= 0x7fffffff;										\
136pz##I = pz;											\
137if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000)	/* |X| or |Y| = Inf,NaN */		\
138{												\
139	lx = LO(px);									\
140	ly = LO(py);									\
141	x = *px;										\
142	y = *py;										\
143	if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0;		/* |X| = Inf */		\
144	else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0;	/* |Y| = Inf */		\
145	else res0 = fabs(x) + fabs(y);								\
146												\
147	RETURN (I, res0)									\
148}												\
149x##I = *px;											\
150y##I = *py;											\
151diff0 = hy##I - hx##I;										\
152j0 = diff0 >> 31;										\
153if (hx##I < 0x00100000 && hy##I < 0x00100000)	/* |X| and |Y| = subnormal or zero */		\
154{												\
155	lx = LO(px);									\
156	ly = LO(py);									\
157	x = x##I;										\
158	y = y##I;										\
159												\
160	if ((hx##I | hy##I | lx | ly) == 0)	/* |X| and |Y| = 0 */				\
161		RETURN (I, DONE / 0.0)							\
162												\
163	x = fabs(x);										\
164	y = fabs(y);										\
165												\
166	x = *(long long*)&x;									\
167	y = *(long long*)&y;									\
168												\
169	x *= D2ONM52;										\
170	y *= D2ONM52;										\
171												\
172	x_hi0 = (x + D2ON36) - D2ON36;							\
173	y_hi0 = (y + D2ON36) - D2ON36;							\
174	x_lo0 = x - x_hi0;									\
175	y_lo0 = y - y_hi0;									\
176	res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);						\
177	res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0);					\
178												\
179	dres0 = res0_hi + res0_lo;								\
180												\
181	iarr0 = HI(&dres0);								\
182	iexp0 = iarr0 & 0xfff00000;								\
183												\
184	iarr0 = (iarr0 >> 11) & 0x1fc;								\
185	itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];					\
186	itbl0 -= iexp0;										\
187	HI(&dd0) = itbl0;						\
188	LO(&dd0) = 0;								\
189												\
190	dd0 = dd0 * (DTWO - dd0 * dres0);							\
191	dd0 = dd0 * (DTWO - dd0 * dres0);							\
192	dres0 = dd0 * (DTWO - dd0 * dres0);							\
193												\
194	HI(&res0) = HI(&dres0) & 0xffffff00;					\
195	LO(&res0) = 0;								\
196	res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;				\
197	res0 = sqrt (res0);									\
198												\
199	res0 = D2ON1022 * res0;									\
200	RETURN (I, res0)									\
201}												\
202j0 = hy##I - (diff0 & j0);									\
203j0 &= 0x7ff00000;										\
204HI(&scl##I) = 0x7ff00000 - j0;
205
206void
207__vrhypot(int n, double * restrict px, int stridex, double * restrict py,
208	int stridey, double * restrict pz, int stridez)
209{
210	int		i = 0;
211	double		x, y;
212	double		x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
213	double		x0, y0, res0, dd0;
214	double		res0_hi,res0_lo, dres0;
215	double		x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
216	double		x1 = 0.0L, y1 = 0.0L, res1, dd1;
217	double		res1_hi,res1_lo, dres1;
218	double		x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
219	double		x2, y2, res2, dd2;
220	double		res2_hi,res2_lo, dres2;
221
222	int		hx0, hy0, j0, diff0;
223	int		iarr0, iexp0, itbl0;
224	int		hx1, hy1;
225	int		iarr1, iexp1, itbl1;
226	int		hx2, hy2;
227	int		iarr2, iexp2, itbl2;
228
229	int		lx, ly;
230
231	double		DONE = ((double*)LCONST)[0];
232	double		DTWO = ((double*)LCONST)[1];
233	double		D2ON36 = ((double*)LCONST)[2];
234	double		D2ON1022 = ((double*)LCONST)[3];
235	double		D2ONM52 = ((double*)LCONST)[4];
236
237	double		*pz0, *pz1 = 0, *pz2;
238
239	do
240	{
241start0:
242		PREP(0)
243		px += stridex;
244		py += stridey;
245		pz += stridez;
246		i = 1;
247		if (--n <= 0)
248			break;
249
250start1:
251		PREP(1)
252		px += stridex;
253		py += stridey;
254		pz += stridez;
255		i = 2;
256		if (--n <= 0)
257			break;
258
259start2:
260		PREP(2)
261
262		x0 *= scl0;
263		y0 *= scl0;
264		x1 *= scl1;
265		y1 *= scl1;
266		x2 *= scl2;
267		y2 *= scl2;
268
269		x_hi0 = (x0 + D2ON36) - D2ON36;
270		y_hi0 = (y0 + D2ON36) - D2ON36;
271		x_hi1 = (x1 + D2ON36) - D2ON36;
272		y_hi1 = (y1 + D2ON36) - D2ON36;
273		x_hi2 = (x2 + D2ON36) - D2ON36;
274		y_hi2 = (y2 + D2ON36) - D2ON36;
275		x_lo0 = x0 - x_hi0;
276		y_lo0 = y0 - y_hi0;
277		x_lo1 = x1 - x_hi1;
278		y_lo1 = y1 - y_hi1;
279		x_lo2 = x2 - x_hi2;
280		y_lo2 = y2 - y_hi2;
281		res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
282		res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
283		res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
284		res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
285		res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
286		res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
287
288		dres0 = res0_hi + res0_lo;
289		dres1 = res1_hi + res1_lo;
290		dres2 = res2_hi + res2_lo;
291
292		iarr0 = HI(&dres0);
293		iarr1 = HI(&dres1);
294		iarr2 = HI(&dres2);
295		iexp0 = iarr0 & 0xfff00000;
296		iexp1 = iarr1 & 0xfff00000;
297		iexp2 = iarr2 & 0xfff00000;
298
299		iarr0 = (iarr0 >> 11) & 0x1fc;
300		iarr1 = (iarr1 >> 11) & 0x1fc;
301		iarr2 = (iarr2 >> 11) & 0x1fc;
302		itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
303		itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
304		itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
305		itbl0 -= iexp0;
306		itbl1 -= iexp1;
307		itbl2 -= iexp2;
308		HI(&dd0) = itbl0;
309		HI(&dd1) = itbl1;
310		HI(&dd2) = itbl2;
311		LO(&dd0) = 0;
312		LO(&dd1) = 0;
313		LO(&dd2) = 0;
314
315		dd0 = dd0 * (DTWO - dd0 * dres0);
316		dd1 = dd1 * (DTWO - dd1 * dres1);
317		dd2 = dd2 * (DTWO - dd2 * dres2);
318		dd0 = dd0 * (DTWO - dd0 * dres0);
319		dd1 = dd1 * (DTWO - dd1 * dres1);
320		dd2 = dd2 * (DTWO - dd2 * dres2);
321		dres0 = dd0 * (DTWO - dd0 * dres0);
322		dres1 = dd1 * (DTWO - dd1 * dres1);
323		dres2 = dd2 * (DTWO - dd2 * dres2);
324
325		HI(&res0) = HI(&dres0) & 0xffffff00;
326		HI(&res1) = HI(&dres1) & 0xffffff00;
327		HI(&res2) = HI(&dres2) & 0xffffff00;
328		LO(&res0) = 0;
329		LO(&res1) = 0;
330		LO(&res2) = 0;
331		res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
332		res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
333		res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
334		res0 = sqrt (res0);
335		res1 = sqrt (res1);
336		res2 = sqrt (res2);
337
338		res0 = scl0 * res0;
339		res1 = scl1 * res1;
340		res2 = scl2 * res2;
341
342		*pz0 = res0;
343		*pz1 = res1;
344		*pz2 = res2;
345
346		px += stridex;
347		py += stridey;
348		pz += stridez;
349		i = 0;
350
351	} while (--n > 0);
352
353	if (i > 0)
354	{
355		x0 *= scl0;
356		y0 *= scl0;
357
358		x_hi0 = (x0 + D2ON36) - D2ON36;
359		y_hi0 = (y0 + D2ON36) - D2ON36;
360		x_lo0 = x0 - x_hi0;
361		y_lo0 = y0 - y_hi0;
362		res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
363		res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
364
365		dres0 = res0_hi + res0_lo;
366
367		iarr0 = HI(&dres0);
368		iexp0 = iarr0 & 0xfff00000;
369
370		iarr0 = (iarr0 >> 11) & 0x1fc;
371		itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
372		itbl0 -= iexp0;
373		HI(&dd0) = itbl0;
374		LO(&dd0) = 0;
375
376		dd0 = dd0 * (DTWO - dd0 * dres0);
377		dd0 = dd0 * (DTWO - dd0 * dres0);
378		dres0 = dd0 * (DTWO - dd0 * dres0);
379
380		HI(&res0) = HI(&dres0) & 0xffffff00;
381		LO(&res0) = 0;
382		res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
383		res0 = sqrt (res0);
384
385		res0 = scl0 * res0;
386
387		*pz0 = res0;
388
389		if (i > 1)
390		{
391			x1 *= scl1;
392			y1 *= scl1;
393
394			x_hi1 = (x1 + D2ON36) - D2ON36;
395			y_hi1 = (y1 + D2ON36) - D2ON36;
396			x_lo1 = x1 - x_hi1;
397			y_lo1 = y1 - y_hi1;
398			res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
399			res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
400
401			dres1 = res1_hi + res1_lo;
402
403			iarr1 = HI(&dres1);
404			iexp1 = iarr1 & 0xfff00000;
405
406			iarr1 = (iarr1 >> 11) & 0x1fc;
407			itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
408			itbl1 -= iexp1;
409			HI(&dd1) = itbl1;
410			LO(&dd1) = 0;
411
412			dd1 = dd1 * (DTWO - dd1 * dres1);
413			dd1 = dd1 * (DTWO - dd1 * dres1);
414			dres1 = dd1 * (DTWO - dd1 * dres1);
415
416			HI(&res1) = HI(&dres1) & 0xffffff00;
417			LO(&res1) = 0;
418			res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
419			res1 = sqrt (res1);
420
421			res1 = scl1 * res1;
422
423			*pz1 = res1;
424		}
425	}
426}
427