xref: /illumos-gate/usr/src/lib/libmvec/common/__vrhypot.c (revision 25c28e83beb90e7c80452a7c818c5e6f73a07dc8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/isa_defs.h>
31 #include "libm_synonyms.h"
32 #include "libm_inlines.h"
33 
34 #ifdef _LITTLE_ENDIAN
35 #define HI(x)	*(1+(int*)x)
36 #define LO(x)	*(unsigned*)x
37 #else
38 #define HI(x)	*(int*)x
39 #define LO(x)	*(1+(unsigned*)x)
40 #endif
41 
42 #ifdef __RESTRICT
43 #define restrict _Restrict
44 #else
45 #define restrict
46 #endif
47 
48 /* double rhypot(double x, double y)
49  *
50  * Method :
51  *	1. Special cases:
52  *		x or  y = Inf					=> 0
53  *		x or  y = NaN					=> QNaN
54  *		x and y = 0					=> Inf + divide-by-zero
55  *	2. Computes rhypot(x,y):
56  *		rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
57  *	Where:
58  *		m = 1/max(|x|,|y|)
59  *		xnm = x * m
60  *		ynm = y * m
61  *
62  *	Compute 1/(xnm * xnm + ynm * ynm) by simulating
63  *	muti-precision arithmetic.
64  *
65  * Accuracy:
66  *	Maximum error observed: less than 0.869 ulp after 1.000.000.000
67  *	results.
68  */
69 
70 #define sqrt __sqrt
71 
72 extern double sqrt(double);
73 
74 extern double fabs(double);
75 
76 static const int __vlibm_TBL_rhypot[] = {
77 /* i = [0,127]
78  * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
79  0x7fe00000,  0x7fdfc07f, 0x7fdf81f8,  0x7fdf4465,
80  0x7fdf07c1,  0x7fdecc07, 0x7fde9131,  0x7fde573a,
81  0x7fde1e1e,  0x7fdde5d6, 0x7fddae60,  0x7fdd77b6,
82  0x7fdd41d4,  0x7fdd0cb5, 0x7fdcd856,  0x7fdca4b3,
83  0x7fdc71c7,  0x7fdc3f8f, 0x7fdc0e07,  0x7fdbdd2b,
84  0x7fdbacf9,  0x7fdb7d6c, 0x7fdb4e81,  0x7fdb2036,
85  0x7fdaf286,  0x7fdac570, 0x7fda98ef,  0x7fda6d01,
86  0x7fda41a4,  0x7fda16d3, 0x7fd9ec8e,  0x7fd9c2d1,
87  0x7fd99999,  0x7fd970e4, 0x7fd948b0,  0x7fd920fb,
88  0x7fd8f9c1,  0x7fd8d301, 0x7fd8acb9,  0x7fd886e5,
89  0x7fd86186,  0x7fd83c97, 0x7fd81818,  0x7fd7f405,
90  0x7fd7d05f,  0x7fd7ad22, 0x7fd78a4c,  0x7fd767dc,
91  0x7fd745d1,  0x7fd72428, 0x7fd702e0,  0x7fd6e1f7,
92  0x7fd6c16c,  0x7fd6a13c, 0x7fd68168,  0x7fd661ec,
93  0x7fd642c8,  0x7fd623fa, 0x7fd60581,  0x7fd5e75b,
94  0x7fd5c988,  0x7fd5ac05, 0x7fd58ed2,  0x7fd571ed,
95  0x7fd55555,  0x7fd53909, 0x7fd51d07,  0x7fd50150,
96  0x7fd4e5e0,  0x7fd4cab8, 0x7fd4afd6,  0x7fd49539,
97  0x7fd47ae1,  0x7fd460cb, 0x7fd446f8,  0x7fd42d66,
98  0x7fd41414,  0x7fd3fb01, 0x7fd3e22c,  0x7fd3c995,
99  0x7fd3b13b,  0x7fd3991c, 0x7fd38138,  0x7fd3698d,
100  0x7fd3521c,  0x7fd33ae4, 0x7fd323e3,  0x7fd30d19,
101  0x7fd2f684,  0x7fd2e025, 0x7fd2c9fb,  0x7fd2b404,
102  0x7fd29e41,  0x7fd288b0, 0x7fd27350,  0x7fd25e22,
103  0x7fd24924,  0x7fd23456, 0x7fd21fb7,  0x7fd20b47,
104  0x7fd1f704,  0x7fd1e2ef, 0x7fd1cf06,  0x7fd1bb4a,
105  0x7fd1a7b9,  0x7fd19453, 0x7fd18118,  0x7fd16e06,
106  0x7fd15b1e,  0x7fd1485f, 0x7fd135c8,  0x7fd12358,
107  0x7fd11111,  0x7fd0fef0, 0x7fd0ecf5,  0x7fd0db20,
108  0x7fd0c971,  0x7fd0b7e6, 0x7fd0a681,  0x7fd0953f,
109  0x7fd08421,  0x7fd07326, 0x7fd0624d,  0x7fd05197,
110  0x7fd04104,  0x7fd03091, 0x7fd02040,  0x7fd01010,
111 };
112 
113 static const unsigned long long LCONST[] = {
114 0x3ff0000000000000ULL,	/* DONE = 1.0		*/
115 0x4000000000000000ULL,	/* DTWO = 2.0		*/
116 0x4230000000000000ULL,	/* D2ON36 = 2**36	*/
117 0x7fd0000000000000ULL,	/* D2ON1022 = 2**1022	*/
118 0x3cb0000000000000ULL,	/* D2ONM52 = 2**-52	*/
119 };
120 
121 #define RET_SC(I)										\
122 	px += stridex;										\
123 	py += stridey;										\
124 	pz += stridez;										\
125 	if (--n <= 0)										\
126 		break;										\
127 	goto start##I;
128 
129 #define RETURN(I, ret)										\
130 {												\
131 	pz[0] = (ret);										\
132 	RET_SC(I)										\
133 }
134 
135 #define PREP(I)											\
136 hx##I = HI(px);										\
137 hy##I = HI(py);										\
138 hx##I &= 0x7fffffff;										\
139 hy##I &= 0x7fffffff;										\
140 pz##I = pz;											\
141 if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000)	/* |X| or |Y| = Inf,NaN */		\
142 {												\
143 	lx = LO(px);									\
144 	ly = LO(py);									\
145 	x = *px;										\
146 	y = *py;										\
147 	if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0;		/* |X| = Inf */		\
148 	else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0;	/* |Y| = Inf */		\
149 	else res0 = fabs(x) + fabs(y);								\
150 												\
151 	RETURN (I, res0)									\
152 }												\
153 x##I = *px;											\
154 y##I = *py;											\
155 diff0 = hy##I - hx##I;										\
156 j0 = diff0 >> 31;										\
157 if (hx##I < 0x00100000 && hy##I < 0x00100000)	/* |X| and |Y| = subnormal or zero */		\
158 {												\
159 	lx = LO(px);									\
160 	ly = LO(py);									\
161 	x = x##I;										\
162 	y = y##I;										\
163 												\
164 	if ((hx##I | hy##I | lx | ly) == 0)	/* |X| and |Y| = 0 */				\
165 		RETURN (I, DONE / 0.0)							\
166 												\
167 	x = fabs(x);										\
168 	y = fabs(y);										\
169 												\
170 	x = *(long long*)&x;									\
171 	y = *(long long*)&y;									\
172 												\
173 	x *= D2ONM52;										\
174 	y *= D2ONM52;										\
175 												\
176 	x_hi0 = (x + D2ON36) - D2ON36;							\
177 	y_hi0 = (y + D2ON36) - D2ON36;							\
178 	x_lo0 = x - x_hi0;									\
179 	y_lo0 = y - y_hi0;									\
180 	res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);						\
181 	res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0);					\
182 												\
183 	dres0 = res0_hi + res0_lo;								\
184 												\
185 	iarr0 = HI(&dres0);								\
186 	iexp0 = iarr0 & 0xfff00000;								\
187 												\
188 	iarr0 = (iarr0 >> 11) & 0x1fc;								\
189 	itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];					\
190 	itbl0 -= iexp0;										\
191 	HI(&dd0) = itbl0;						\
192 	LO(&dd0) = 0;								\
193 												\
194 	dd0 = dd0 * (DTWO - dd0 * dres0);							\
195 	dd0 = dd0 * (DTWO - dd0 * dres0);							\
196 	dres0 = dd0 * (DTWO - dd0 * dres0);							\
197 												\
198 	HI(&res0) = HI(&dres0) & 0xffffff00;					\
199 	LO(&res0) = 0;								\
200 	res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;				\
201 	res0 = sqrt (res0);									\
202 												\
203 	res0 = D2ON1022 * res0;									\
204 	RETURN (I, res0)									\
205 }												\
206 j0 = hy##I - (diff0 & j0);									\
207 j0 &= 0x7ff00000;										\
208 HI(&scl##I) = 0x7ff00000 - j0;
209 
210 void
211 __vrhypot(int n, double * restrict px, int stridex, double * restrict py,
212 	int stridey, double * restrict pz, int stridez)
213 {
214 	int		i = 0;
215 	double		x, y;
216 	double		x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
217 	double		x0, y0, res0, dd0;
218 	double		res0_hi,res0_lo, dres0;
219 	double		x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
220 	double		x1 = 0.0L, y1 = 0.0L, res1, dd1;
221 	double		res1_hi,res1_lo, dres1;
222 	double		x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
223 	double		x2, y2, res2, dd2;
224 	double		res2_hi,res2_lo, dres2;
225 
226 	int		hx0, hy0, j0, diff0;
227 	int		iarr0, iexp0, itbl0;
228 	int		hx1, hy1;
229 	int		iarr1, iexp1, itbl1;
230 	int		hx2, hy2;
231 	int		iarr2, iexp2, itbl2;
232 
233 	int		lx, ly;
234 
235 	double		DONE = ((double*)LCONST)[0];
236 	double		DTWO = ((double*)LCONST)[1];
237 	double		D2ON36 = ((double*)LCONST)[2];
238 	double		D2ON1022 = ((double*)LCONST)[3];
239 	double		D2ONM52 = ((double*)LCONST)[4];
240 
241 	double		*pz0, *pz1 = 0, *pz2;
242 
243 	do
244 	{
245 start0:
246 		PREP(0)
247 		px += stridex;
248 		py += stridey;
249 		pz += stridez;
250 		i = 1;
251 		if (--n <= 0)
252 			break;
253 
254 start1:
255 		PREP(1)
256 		px += stridex;
257 		py += stridey;
258 		pz += stridez;
259 		i = 2;
260 		if (--n <= 0)
261 			break;
262 
263 start2:
264 		PREP(2)
265 
266 		x0 *= scl0;
267 		y0 *= scl0;
268 		x1 *= scl1;
269 		y1 *= scl1;
270 		x2 *= scl2;
271 		y2 *= scl2;
272 
273 		x_hi0 = (x0 + D2ON36) - D2ON36;
274 		y_hi0 = (y0 + D2ON36) - D2ON36;
275 		x_hi1 = (x1 + D2ON36) - D2ON36;
276 		y_hi1 = (y1 + D2ON36) - D2ON36;
277 		x_hi2 = (x2 + D2ON36) - D2ON36;
278 		y_hi2 = (y2 + D2ON36) - D2ON36;
279 		x_lo0 = x0 - x_hi0;
280 		y_lo0 = y0 - y_hi0;
281 		x_lo1 = x1 - x_hi1;
282 		y_lo1 = y1 - y_hi1;
283 		x_lo2 = x2 - x_hi2;
284 		y_lo2 = y2 - y_hi2;
285 		res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
286 		res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
287 		res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
288 		res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
289 		res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
290 		res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
291 
292 		dres0 = res0_hi + res0_lo;
293 		dres1 = res1_hi + res1_lo;
294 		dres2 = res2_hi + res2_lo;
295 
296 		iarr0 = HI(&dres0);
297 		iarr1 = HI(&dres1);
298 		iarr2 = HI(&dres2);
299 		iexp0 = iarr0 & 0xfff00000;
300 		iexp1 = iarr1 & 0xfff00000;
301 		iexp2 = iarr2 & 0xfff00000;
302 
303 		iarr0 = (iarr0 >> 11) & 0x1fc;
304 		iarr1 = (iarr1 >> 11) & 0x1fc;
305 		iarr2 = (iarr2 >> 11) & 0x1fc;
306 		itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
307 		itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
308 		itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
309 		itbl0 -= iexp0;
310 		itbl1 -= iexp1;
311 		itbl2 -= iexp2;
312 		HI(&dd0) = itbl0;
313 		HI(&dd1) = itbl1;
314 		HI(&dd2) = itbl2;
315 		LO(&dd0) = 0;
316 		LO(&dd1) = 0;
317 		LO(&dd2) = 0;
318 
319 		dd0 = dd0 * (DTWO - dd0 * dres0);
320 		dd1 = dd1 * (DTWO - dd1 * dres1);
321 		dd2 = dd2 * (DTWO - dd2 * dres2);
322 		dd0 = dd0 * (DTWO - dd0 * dres0);
323 		dd1 = dd1 * (DTWO - dd1 * dres1);
324 		dd2 = dd2 * (DTWO - dd2 * dres2);
325 		dres0 = dd0 * (DTWO - dd0 * dres0);
326 		dres1 = dd1 * (DTWO - dd1 * dres1);
327 		dres2 = dd2 * (DTWO - dd2 * dres2);
328 
329 		HI(&res0) = HI(&dres0) & 0xffffff00;
330 		HI(&res1) = HI(&dres1) & 0xffffff00;
331 		HI(&res2) = HI(&dres2) & 0xffffff00;
332 		LO(&res0) = 0;
333 		LO(&res1) = 0;
334 		LO(&res2) = 0;
335 		res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
336 		res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
337 		res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
338 		res0 = sqrt (res0);
339 		res1 = sqrt (res1);
340 		res2 = sqrt (res2);
341 
342 		res0 = scl0 * res0;
343 		res1 = scl1 * res1;
344 		res2 = scl2 * res2;
345 
346 		*pz0 = res0;
347 		*pz1 = res1;
348 		*pz2 = res2;
349 
350 		px += stridex;
351 		py += stridey;
352 		pz += stridez;
353 		i = 0;
354 
355 	} while (--n > 0);
356 
357 	if (i > 0)
358 	{
359 		x0 *= scl0;
360 		y0 *= scl0;
361 
362 		x_hi0 = (x0 + D2ON36) - D2ON36;
363 		y_hi0 = (y0 + D2ON36) - D2ON36;
364 		x_lo0 = x0 - x_hi0;
365 		y_lo0 = y0 - y_hi0;
366 		res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
367 		res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
368 
369 		dres0 = res0_hi + res0_lo;
370 
371 		iarr0 = HI(&dres0);
372 		iexp0 = iarr0 & 0xfff00000;
373 
374 		iarr0 = (iarr0 >> 11) & 0x1fc;
375 		itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
376 		itbl0 -= iexp0;
377 		HI(&dd0) = itbl0;
378 		LO(&dd0) = 0;
379 
380 		dd0 = dd0 * (DTWO - dd0 * dres0);
381 		dd0 = dd0 * (DTWO - dd0 * dres0);
382 		dres0 = dd0 * (DTWO - dd0 * dres0);
383 
384 		HI(&res0) = HI(&dres0) & 0xffffff00;
385 		LO(&res0) = 0;
386 		res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
387 		res0 = sqrt (res0);
388 
389 		res0 = scl0 * res0;
390 
391 		*pz0 = res0;
392 
393 		if (i > 1)
394 		{
395 			x1 *= scl1;
396 			y1 *= scl1;
397 
398 			x_hi1 = (x1 + D2ON36) - D2ON36;
399 			y_hi1 = (y1 + D2ON36) - D2ON36;
400 			x_lo1 = x1 - x_hi1;
401 			y_lo1 = y1 - y_hi1;
402 			res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
403 			res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
404 
405 			dres1 = res1_hi + res1_lo;
406 
407 			iarr1 = HI(&dres1);
408 			iexp1 = iarr1 & 0xfff00000;
409 
410 			iarr1 = (iarr1 >> 11) & 0x1fc;
411 			itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
412 			itbl1 -= iexp1;
413 			HI(&dd1) = itbl1;
414 			LO(&dd1) = 0;
415 
416 			dd1 = dd1 * (DTWO - dd1 * dres1);
417 			dd1 = dd1 * (DTWO - dd1 * dres1);
418 			dres1 = dd1 * (DTWO - dd1 * dres1);
419 
420 			HI(&res1) = HI(&dres1) & 0xffffff00;
421 			LO(&res1) = 0;
422 			res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
423 			res1 = sqrt (res1);
424 
425 			res1 = scl1 * res1;
426 
427 			*pz1 = res1;
428 		}
429 	}
430 }
431 
432